summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--autoload.c17
-rw-r--r--stdlib/csv.tl98
-rw-r--r--tests/010/csv.tl166
-rw-r--r--txr.192
4 files changed, 373 insertions, 0 deletions
diff --git a/autoload.c b/autoload.c
index dd1ef410..251707aa 100644
--- a/autoload.c
+++ b/autoload.c
@@ -1020,6 +1020,22 @@ static val enum_instantiate(void)
return nil;
}
+static val csv_set_entries(val fun)
+{
+ val name[] = {
+ lit("get-csv"),
+ nil
+ };
+ autoload_set(al_fun, name, fun);
+ return nil;
+}
+
+static val csv_instantiate(void)
+{
+ load(scat2(stdlib_path, lit("csv")));
+ return nil;
+}
+
val autoload_reg(val (*instantiate)(void),
val (*set_entries)(val))
{
@@ -1092,6 +1108,7 @@ void autoload_init(void)
autoload_reg(csort_instantiate, csort_set_entries);
autoload_reg(glob_instantiate, glob_set_entries);
autoload_reg(enum_instantiate, enum_set_entries);
+ autoload_reg(csv_instantiate, csv_set_entries);
reg_fun(intern(lit("autoload-try-fun"), system_package), func_n1(autoload_try_fun));
}
diff --git a/stdlib/csv.tl b/stdlib/csv.tl
new file mode 100644
index 00000000..9b03f3ce
--- /dev/null
+++ b/stdlib/csv.tl
@@ -0,0 +1,98 @@
+;; Copyright 2025
+;; Kaz Kylheku <kaz@kylheku.com>
+;; Vancouver, Canada
+;; All rights reserved.
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; 1. Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;;
+;; 2. Redistributions in binary form must reproduce the above copyright notice,
+;; this list of conditions and the following disclaimer in the documentation
+;; and/or other materials provided with the distribution.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+;; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+;; LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+;; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+;; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+;; CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+;; ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;; POSSIBILITY OF SUCH DAMAGE.
+
+(defun get-csv (: (stream *stdin*))
+ (if (stringp stream)
+ (upd stream make-string-input-stream))
+ (enumlet (rfield qfield cr qcr quot)
+ (let ((record (vec))
+ (field (str 0))
+ (state rfield)
+ (done nil))
+ (while (not done)
+ (let ((ch (get-char stream)))
+ (caseql* state
+ (rfield (caseql* ch
+ (#\return (set state cr))
+ (#\newline (vec-push record field)
+ (set done t))
+ (#\, (vec-push record field)
+ (set field (str 0)))
+ (#\" (cond
+ ((empty field)
+ (set state qfield))
+ (t
+ (string-extend field ch))))
+ (nil (vec-push record field)
+ (set done t))
+ (t (string-extend field ch))))
+ (qfield (caseql* ch
+ (#\" (set state quot))
+ (#\return (set state qcr))
+ (nil (vec-push record field)
+ (set done t))
+ (t (string-extend field ch))))
+ (cr (caseql* ch
+ (#\newline (vec-push record field)
+ (set done t))
+ (#\return (string-extend field ch))
+ (#\, (string-extend field #\return)
+ (vec-push record field)
+ (set field (str 0)
+ state rfield))
+ (nil (string-extend field #\return)
+ (vec-push record field)
+ (set done t))
+ (t (string-extend field #\return)
+ (string-extend field ch)
+ (set state rfield))))
+ (qcr (caseql* ch
+ (#\newline (string-extend field ch)
+ (set state qfield))
+ (#\return (string-extend field ch))
+ (nil (string-extend field #\return)
+ (vec-push record field)
+ (set done t))
+ (#\" (string-extend field #\return)
+ (set state quot))
+ (t (string-extend field #\return)
+ (string-extend field ch)
+ (set state qfield))))
+ (quot (caseql* ch
+ (#\, (vec-push record field)
+ (set field (str 0)
+ state rfield))
+ (#\" (string-extend field ch)
+ (set state qfield))
+ (#\newline (vec-push record field)
+ (set done t))
+ (#\return (set state cr))
+ (nil (vec-push record field)
+ (set done t))
+ (t (string-extend field ch)
+ (set state rfield)))))))
+ record)))
diff --git a/tests/010/csv.tl b/tests/010/csv.tl
new file mode 100644
index 00000000..fffd8d0d
--- /dev/null
+++ b/tests/010/csv.tl
@@ -0,0 +1,166 @@
+(load "../common")
+
+(defmacro mtest-csv (. pairs)
+ (tree-bind (data expected) (flow pairs (tuples 2) transpose)
+ ^(mtest ,*(mappend (ret ^((get-csv ,(regsub "#" "\"" @1))
+ ,(mapcar (op regsub "#" "\"") @2)))
+ data expected))))
+
+(mtest-csv
+ "" #("")
+ "," #("" "")
+ ",," #("" "" ""))
+
+(mtest-csv
+ "\r" #("\r")
+ "\r\n" #("")
+ "\r\r" #("\r\r")
+ "\r\r\n" #("\r")
+ "\r\r" #("\r\r")
+ "\r\r\r" #("\r\r\r")
+ "\r\r\r\n" #("\r\r")
+ "\r,\r" #("\r" "\r")
+ "\r\n,\r" #("")
+ "\r\r\n,\r" #("\r")
+ ",\r\r" #("" "\r\r")
+ ",\r\n" #("" ""))
+
+(mtest-csv
+ "a\r" #("a\r")
+ "a\r\n" #("a")
+ "a\r\r" #("a\r\r")
+ "a\r\r\n" #("a\r")
+ "a\r\r" #("a\r\r")
+ "a\r\r\r" #("a\r\r\r")
+ "a\r\r\r\n" #("a\r\r")
+ "a\r,a\r" #("a\r" "a\r")
+ "a\r\n,a\r" #("a")
+ "a\r\r\n,a\r" #("a\r")
+ "a,a\r\r" #("a" "a\r\r")
+ "a,\r\r" #("a" "\r\r")
+ ",a\r\n" #("" "a"))
+
+(mtest-csv
+ "##" #("")
+ "#" #("")
+ "#,#" #(",")
+ "##,##" #("" "")
+ "####" #("#")
+ "##a#" #("a#")
+ "#\r#" #("\r")
+ "#\n#" #("\n")
+ "#\r\n#" #("\n")
+ "#\r\r\n#" #("\r\n")
+ "#\r\r\r\n#" #("\r\r\n")
+ "##\r\n#" #("")
+ "##\r\r\n#" #("\r")
+ "##\r\r\r\n#" #("\r\r"))
+
+(mtest-csv
+ "##\n" #("")
+ "#\n" #("\n")
+ "#,#\n" #(",")
+ "##,##\n" #("" "")
+ "####\n" #("#")
+ "##a#\n" #("a#")
+ "#\r#\n" #("\r")
+ "#\n#\n" #("\n")
+ "#\r\n#\n" #("\n")
+ "#\r\r\n#\n" #("\r\n")
+ "#\r\r\r\n#\n" #("\r\r\n")
+ "##\r\n#\n" #("")
+ "##\r\r\n#\n" #("\r")
+ "##\r\r\r\n#\n" #("\r\r"))
+
+(mtest-csv
+ "#\r##\r#" #("\r#\r")
+ "#\n##\n#" #("\n#\n")
+ "#\r\n##\r\n#" #("\n#\n"))
+
+(mtest-csv
+ "a#b" #("a#b")
+ "a##b" #("a##b")
+ "a###b" #("a###b"))
+
+(mtest-csv
+ "#a#b" #("ab")
+ "ab#cd#ef" #("ab#cd#ef"))
+
+(mtest-csv
+ "a," #("a" "")
+ ",a" #("" "a")
+ "a,b,c" #("a" "b" "c")
+ "#a#,b,c" #("a" "b" "c")
+ "a,#b#,c" #("a" "b" "c")
+ "a,b,#c#" #("a" "b" "c")
+ "#a#,b,#c#" #("a" "b" "c")
+ "#a#,#b#,#c#" #("a" "b" "c"))
+
+(mtest-csv
+ "a,\r\n" #("a" "")
+ ",a\r\n" #("" "a")
+ "a,b,c\r\n" #("a" "b" "c")
+ "#a#,b,c\r\n" #("a" "b" "c")
+ "a,#b#,c\r\n" #("a" "b" "c")
+ "a,b,#c#\r\n" #("a" "b" "c")
+ "#a#,b,#c#\r\n" #("a" "b" "c")
+ "#a#,#b#,#c#\r\n" #("a" "b" "c"))
+
+(mtest-csv
+ "a,\n" #("a" "")
+ ",a\n" #("" "a")
+ "a,b,c\n" #("a" "b" "c")
+ "#a#,b,c\n" #("a" "b" "c")
+ "a,#b#,c\n" #("a" "b" "c")
+ "a,b,#c#\n" #("a" "b" "c")
+ "#a#,b,#c#\n" #("a" "b" "c")
+ "#a#,#b#,#c#\n" #("a" "b" "c"))
+
+(mtest-csv
+ "a,\r" #("a" "\r")
+ ",a\r" #("" "a\r")
+ "a,b,c\r" #("a" "b" "c\r")
+ "#a#,b,c\r" #("a" "b" "c\r")
+ "a,#b#,c\r" #("a" "b" "c\r")
+ "a,b,#c#\r" #("a" "b" "c\r")
+ "#a#,b,#c#\r" #("a" "b" "c\r")
+ "#a#,#b#,#c#\r" #("a" "b" "c\r"))
+
+(mtest-csv
+ "a,\n" #("a" "")
+ ",a\n" #("" "a")
+ "a,b,c\n" #("a" "b" "c")
+ "#a#,b,c\n" #("a" "b" "c")
+ "a,#b#,c\n" #("a" "b" "c")
+ "a,b,#c#\n" #("a" "b" "c")
+ "#a#,b,#c#\n" #("a" "b" "c")
+ "#a#,#b#,#c#\n" #("a" "b" "c"))
+
+(mtest-csv
+ "a,\r" #("a" "\r")
+ ",a\r" #("" "a\r")
+ "a,b,c\r" #("a" "b" "c\r")
+ "#a#,b,c\r" #("a" "b" "c\r")
+ "a,#b#,c\r" #("a" "b" "c\r")
+ "a,b,#c#\r" #("a" "b" "c\r")
+ "#a#,b,#c#\r" #("a" "b" "c\r")
+ "#a#,#b#,#c#\r" #("a" "b" "c\r"))
+
+(mtest-csv
+ "aa,\r\n" #("aa" "")
+ ",aa\r\n" #("" "aa")
+ "aa,bb,cc\r\n" #("aa" "bb" "cc")
+ "#aa#,bb,cc\r\n" #("aa" "bb" "cc")
+ "aa,#bb#,cc\r\n" #("aa" "bb" "cc")
+ "aa,bb,#cc#\r\n" #("aa" "bb" "cc")
+ "#aa#,bb,#cc#\r\n" #("aa" "bb" "cc")
+ "#aa#,#bb#,#cc#\r\n" #("aa" "bb" "cc"))
+
+(mtest-csv
+ "#Hello, ##Bob##!" #("Hello, #Bob#!")
+ "#Hello, ##Bob##!\r" #("Hello, #Bob#!\r")
+ "#Hello, ##Bob##!\n" #("Hello, #Bob#!\n")
+ "#Hello, ##Bob##!\r\n" #("Hello, #Bob#!\n")
+ "#Hello, ##Bob##!#\r\n" #("Hello, #Bob#!")
+ "#Hello, ##Bob##!#\n" #("Hello, #Bob#!")
+ "#Hello, ##Bob##!#\r" #("Hello, #Bob#!\r"))
diff --git a/txr.1 b/txr.1
index a4bf6237..2d3090dd 100644
--- a/txr.1
+++ b/txr.1
@@ -85056,6 +85056,98 @@ If this variable is
.codn nil ,
then JSON numbers are all converted to floating point.
+.coNP Function @ get-csv
+.synb
+.mets (get-csv <> [ source ])
+.syne
+.desc
+The
+.code get-csv
+function reads a single record of CSV ("comma-separated values")
+data from the input
+.metn source ,
+returning a vector of strings.
+
+The
+.meta source
+must be a stream or a string.
+If it is omitted, then
+.code *stdin*
+is used.
+
+The CSV scanning is implemented in a way which is nearly compatible with RFC
+4180, with certain differences, as well as extensions of behavior.
+
+RFC 4180 specifies that the line separators in CSV are CR-LF pairs.
+The specification makes it unclear whether, when these separators occur
+in the data, they are retained in that two-character form or whether
+they may be mapped to a native newline representation.
+
+In contrast,
+.meta get-csv
+function recognizes two equivalent line breaks: CR-LF and LF.
+When a line break occurs in field data, it is represented as a single LF,
+which is the newline character in \*(TL: the character
+.code #\enewline
+denoted in strings by the escape sequence
+.codn \en .
+
+An isolated CR character in the CSV data (one not followed by
+LF) is considered an ordinary character and becomes a constituent
+character of a field; it is never treated as a line break.
+
+RFC 4180 specifies CSV as consisting of 7 bit characters only. The
+.code get-csv
+function extends the behavior by operating on Unicode characters,
+which are decoded from UTF-8 by the underlying stream implementation.
+
+RFC 4180 excludes control characters other than those encoding
+line breaks, and also excludes the character U+007F;
+.code get-csv
+treats control characters as literal field constituent characters.
+A NUL character occurring in the UTF-8 data is mapped by the \*(TX
+stream implementation to pseudo-null character, and
+.code get-csv
+then allows it as a field constituent.
+
+RFC 4180 neglects to specify behavior when the input deviates from
+the specified syntax. The
+.code get-csv
+function implements the following extensions of behavior for
+nonconforming input:
+
+When the closing quote of a double-quoted field is followed by trailing
+characters, these are added to the field. In other words, when a doubly quoted
+field is closed, then processing of additional characters continues int the
+same manner as for an unquoted field, allowing additional characters to be
+recognized and added to the field prior to the appearance of a comma or end of
+record.
+
+The RFC states that fields containing double quotes should be
+enclosed in double-quotes, with the constituent double-quotes being
+escaped. The
+.code get-csv
+function allows an unquoted field to contain double quote characters,
+which are treated as ordinary characters belonging to the field. In this
+situation, a sequence of two double quotes specifies two double quotes.
+
+The RFC states that the last field of a record must not be followed
+by a comma. Under the
+.code get-csv
+function, this situation is impossible. A trailing comma at the end
+of a record specifies an empty last field, which is not itself
+followed by a comma.
+
+The
+.code get-csv
+function does not recognize or diagnose any errors; it extracts the
+maximal prefix of the input source which constitutes a valid CSV record.
+Characters not belonging to the CSV record remain in the stream.
+Multiple calls to
+.code get-csv
+for the same input stream given as
+.meta source
+extract consecutive CSV records.
.SH* FOREIGN FUNCTION INTERFACE