diff options
author | Kaz Kylheku <kaz@kylheku.com> | 2025-01-21 07:24:17 -0800 |
---|---|---|
committer | Kaz Kylheku <kaz@kylheku.com> | 2025-01-21 07:24:17 -0800 |
commit | 6824735d879f42572778b40c323828f533b87248 (patch) | |
tree | b135841d76b1854936138e348355ae8f93001ee1 | |
parent | 519b05f5281572ef9f6f686844af25159ca5896e (diff) | |
download | txr-6824735d879f42572778b40c323828f533b87248.tar.gz txr-6824735d879f42572778b40c323828f533b87248.tar.bz2 txr-6824735d879f42572778b40c323828f533b87248.zip |
New function: get-csv.
* autloload.c (csv_set_entries, csv_instantiate): New
static funtions.
(autoload_init): Register autoload of stdlib/csv
module via new functions.
* stdlib/csv.tl: New file.
* tests/010/csv.tl: Likewise.
* txr.1: Documented.
-rw-r--r-- | autoload.c | 17 | ||||
-rw-r--r-- | stdlib/csv.tl | 98 | ||||
-rw-r--r-- | tests/010/csv.tl | 166 | ||||
-rw-r--r-- | txr.1 | 92 |
4 files changed, 373 insertions, 0 deletions
@@ -1020,6 +1020,22 @@ static val enum_instantiate(void) return nil; } +static val csv_set_entries(val fun) +{ + val name[] = { + lit("get-csv"), + nil + }; + autoload_set(al_fun, name, fun); + return nil; +} + +static val csv_instantiate(void) +{ + load(scat2(stdlib_path, lit("csv"))); + return nil; +} + val autoload_reg(val (*instantiate)(void), val (*set_entries)(val)) { @@ -1092,6 +1108,7 @@ void autoload_init(void) autoload_reg(csort_instantiate, csort_set_entries); autoload_reg(glob_instantiate, glob_set_entries); autoload_reg(enum_instantiate, enum_set_entries); + autoload_reg(csv_instantiate, csv_set_entries); reg_fun(intern(lit("autoload-try-fun"), system_package), func_n1(autoload_try_fun)); } diff --git a/stdlib/csv.tl b/stdlib/csv.tl new file mode 100644 index 00000000..9b03f3ce --- /dev/null +++ b/stdlib/csv.tl @@ -0,0 +1,98 @@ +;; Copyright 2025 +;; Kaz Kylheku <kaz@kylheku.com> +;; Vancouver, Canada +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; 1. Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; +;; 2. Redistributions in binary form must reproduce the above copyright notice, +;; this list of conditions and the following disclaimer in the documentation +;; and/or other materials provided with the distribution. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +;; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +;; LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +;; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +;; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +;; CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +;; ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +;; POSSIBILITY OF SUCH DAMAGE. + +(defun get-csv (: (stream *stdin*)) + (if (stringp stream) + (upd stream make-string-input-stream)) + (enumlet (rfield qfield cr qcr quot) + (let ((record (vec)) + (field (str 0)) + (state rfield) + (done nil)) + (while (not done) + (let ((ch (get-char stream))) + (caseql* state + (rfield (caseql* ch + (#\return (set state cr)) + (#\newline (vec-push record field) + (set done t)) + (#\, (vec-push record field) + (set field (str 0))) + (#\" (cond + ((empty field) + (set state qfield)) + (t + (string-extend field ch)))) + (nil (vec-push record field) + (set done t)) + (t (string-extend field ch)))) + (qfield (caseql* ch + (#\" (set state quot)) + (#\return (set state qcr)) + (nil (vec-push record field) + (set done t)) + (t (string-extend field ch)))) + (cr (caseql* ch + (#\newline (vec-push record field) + (set done t)) + (#\return (string-extend field ch)) + (#\, (string-extend field #\return) + (vec-push record field) + (set field (str 0) + state rfield)) + (nil (string-extend field #\return) + (vec-push record field) + (set done t)) + (t (string-extend field #\return) + (string-extend field ch) + (set state rfield)))) + (qcr (caseql* ch + (#\newline (string-extend field ch) + (set state qfield)) + (#\return (string-extend field ch)) + (nil (string-extend field #\return) + (vec-push record field) + (set done t)) + (#\" (string-extend field #\return) + (set state quot)) + (t (string-extend field #\return) + (string-extend field ch) + (set state qfield)))) + (quot (caseql* ch + (#\, (vec-push record field) + (set field (str 0) + state rfield)) + (#\" (string-extend field ch) + (set state qfield)) + (#\newline (vec-push record field) + (set done t)) + (#\return (set state cr)) + (nil (vec-push record field) + (set done t)) + (t (string-extend field ch) + (set state rfield))))))) + record))) diff --git a/tests/010/csv.tl b/tests/010/csv.tl new file mode 100644 index 00000000..fffd8d0d --- /dev/null +++ b/tests/010/csv.tl @@ -0,0 +1,166 @@ +(load "../common") + +(defmacro mtest-csv (. pairs) + (tree-bind (data expected) (flow pairs (tuples 2) transpose) + ^(mtest ,*(mappend (ret ^((get-csv ,(regsub "#" "\"" @1)) + ,(mapcar (op regsub "#" "\"") @2))) + data expected)))) + +(mtest-csv + "" #("") + "," #("" "") + ",," #("" "" "")) + +(mtest-csv + "\r" #("\r") + "\r\n" #("") + "\r\r" #("\r\r") + "\r\r\n" #("\r") + "\r\r" #("\r\r") + "\r\r\r" #("\r\r\r") + "\r\r\r\n" #("\r\r") + "\r,\r" #("\r" "\r") + "\r\n,\r" #("") + "\r\r\n,\r" #("\r") + ",\r\r" #("" "\r\r") + ",\r\n" #("" "")) + +(mtest-csv + "a\r" #("a\r") + "a\r\n" #("a") + "a\r\r" #("a\r\r") + "a\r\r\n" #("a\r") + "a\r\r" #("a\r\r") + "a\r\r\r" #("a\r\r\r") + "a\r\r\r\n" #("a\r\r") + "a\r,a\r" #("a\r" "a\r") + "a\r\n,a\r" #("a") + "a\r\r\n,a\r" #("a\r") + "a,a\r\r" #("a" "a\r\r") + "a,\r\r" #("a" "\r\r") + ",a\r\n" #("" "a")) + +(mtest-csv + "##" #("") + "#" #("") + "#,#" #(",") + "##,##" #("" "") + "####" #("#") + "##a#" #("a#") + "#\r#" #("\r") + "#\n#" #("\n") + "#\r\n#" #("\n") + "#\r\r\n#" #("\r\n") + "#\r\r\r\n#" #("\r\r\n") + "##\r\n#" #("") + "##\r\r\n#" #("\r") + "##\r\r\r\n#" #("\r\r")) + +(mtest-csv + "##\n" #("") + "#\n" #("\n") + "#,#\n" #(",") + "##,##\n" #("" "") + "####\n" #("#") + "##a#\n" #("a#") + "#\r#\n" #("\r") + "#\n#\n" #("\n") + "#\r\n#\n" #("\n") + "#\r\r\n#\n" #("\r\n") + "#\r\r\r\n#\n" #("\r\r\n") + "##\r\n#\n" #("") + "##\r\r\n#\n" #("\r") + "##\r\r\r\n#\n" #("\r\r")) + +(mtest-csv + "#\r##\r#" #("\r#\r") + "#\n##\n#" #("\n#\n") + "#\r\n##\r\n#" #("\n#\n")) + +(mtest-csv + "a#b" #("a#b") + "a##b" #("a##b") + "a###b" #("a###b")) + +(mtest-csv + "#a#b" #("ab") + "ab#cd#ef" #("ab#cd#ef")) + +(mtest-csv + "a," #("a" "") + ",a" #("" "a") + "a,b,c" #("a" "b" "c") + "#a#,b,c" #("a" "b" "c") + "a,#b#,c" #("a" "b" "c") + "a,b,#c#" #("a" "b" "c") + "#a#,b,#c#" #("a" "b" "c") + "#a#,#b#,#c#" #("a" "b" "c")) + +(mtest-csv + "a,\r\n" #("a" "") + ",a\r\n" #("" "a") + "a,b,c\r\n" #("a" "b" "c") + "#a#,b,c\r\n" #("a" "b" "c") + "a,#b#,c\r\n" #("a" "b" "c") + "a,b,#c#\r\n" #("a" "b" "c") + "#a#,b,#c#\r\n" #("a" "b" "c") + "#a#,#b#,#c#\r\n" #("a" "b" "c")) + +(mtest-csv + "a,\n" #("a" "") + ",a\n" #("" "a") + "a,b,c\n" #("a" "b" "c") + "#a#,b,c\n" #("a" "b" "c") + "a,#b#,c\n" #("a" "b" "c") + "a,b,#c#\n" #("a" "b" "c") + "#a#,b,#c#\n" #("a" "b" "c") + "#a#,#b#,#c#\n" #("a" "b" "c")) + +(mtest-csv + "a,\r" #("a" "\r") + ",a\r" #("" "a\r") + "a,b,c\r" #("a" "b" "c\r") + "#a#,b,c\r" #("a" "b" "c\r") + "a,#b#,c\r" #("a" "b" "c\r") + "a,b,#c#\r" #("a" "b" "c\r") + "#a#,b,#c#\r" #("a" "b" "c\r") + "#a#,#b#,#c#\r" #("a" "b" "c\r")) + +(mtest-csv + "a,\n" #("a" "") + ",a\n" #("" "a") + "a,b,c\n" #("a" "b" "c") + "#a#,b,c\n" #("a" "b" "c") + "a,#b#,c\n" #("a" "b" "c") + "a,b,#c#\n" #("a" "b" "c") + "#a#,b,#c#\n" #("a" "b" "c") + "#a#,#b#,#c#\n" #("a" "b" "c")) + +(mtest-csv + "a,\r" #("a" "\r") + ",a\r" #("" "a\r") + "a,b,c\r" #("a" "b" "c\r") + "#a#,b,c\r" #("a" "b" "c\r") + "a,#b#,c\r" #("a" "b" "c\r") + "a,b,#c#\r" #("a" "b" "c\r") + "#a#,b,#c#\r" #("a" "b" "c\r") + "#a#,#b#,#c#\r" #("a" "b" "c\r")) + +(mtest-csv + "aa,\r\n" #("aa" "") + ",aa\r\n" #("" "aa") + "aa,bb,cc\r\n" #("aa" "bb" "cc") + "#aa#,bb,cc\r\n" #("aa" "bb" "cc") + "aa,#bb#,cc\r\n" #("aa" "bb" "cc") + "aa,bb,#cc#\r\n" #("aa" "bb" "cc") + "#aa#,bb,#cc#\r\n" #("aa" "bb" "cc") + "#aa#,#bb#,#cc#\r\n" #("aa" "bb" "cc")) + +(mtest-csv + "#Hello, ##Bob##!" #("Hello, #Bob#!") + "#Hello, ##Bob##!\r" #("Hello, #Bob#!\r") + "#Hello, ##Bob##!\n" #("Hello, #Bob#!\n") + "#Hello, ##Bob##!\r\n" #("Hello, #Bob#!\n") + "#Hello, ##Bob##!#\r\n" #("Hello, #Bob#!") + "#Hello, ##Bob##!#\n" #("Hello, #Bob#!") + "#Hello, ##Bob##!#\r" #("Hello, #Bob#!\r")) @@ -85056,6 +85056,98 @@ If this variable is .codn nil , then JSON numbers are all converted to floating point. +.coNP Function @ get-csv +.synb +.mets (get-csv <> [ source ]) +.syne +.desc +The +.code get-csv +function reads a single record of CSV ("comma-separated values") +data from the input +.metn source , +returning a vector of strings. + +The +.meta source +must be a stream or a string. +If it is omitted, then +.code *stdin* +is used. + +The CSV scanning is implemented in a way which is nearly compatible with RFC +4180, with certain differences, as well as extensions of behavior. + +RFC 4180 specifies that the line separators in CSV are CR-LF pairs. +The specification makes it unclear whether, when these separators occur +in the data, they are retained in that two-character form or whether +they may be mapped to a native newline representation. + +In contrast, +.meta get-csv +function recognizes two equivalent line breaks: CR-LF and LF. +When a line break occurs in field data, it is represented as a single LF, +which is the newline character in \*(TL: the character +.code #\enewline +denoted in strings by the escape sequence +.codn \en . + +An isolated CR character in the CSV data (one not followed by +LF) is considered an ordinary character and becomes a constituent +character of a field; it is never treated as a line break. + +RFC 4180 specifies CSV as consisting of 7 bit characters only. The +.code get-csv +function extends the behavior by operating on Unicode characters, +which are decoded from UTF-8 by the underlying stream implementation. + +RFC 4180 excludes control characters other than those encoding +line breaks, and also excludes the character U+007F; +.code get-csv +treats control characters as literal field constituent characters. +A NUL character occurring in the UTF-8 data is mapped by the \*(TX +stream implementation to pseudo-null character, and +.code get-csv +then allows it as a field constituent. + +RFC 4180 neglects to specify behavior when the input deviates from +the specified syntax. The +.code get-csv +function implements the following extensions of behavior for +nonconforming input: + +When the closing quote of a double-quoted field is followed by trailing +characters, these are added to the field. In other words, when a doubly quoted +field is closed, then processing of additional characters continues int the +same manner as for an unquoted field, allowing additional characters to be +recognized and added to the field prior to the appearance of a comma or end of +record. + +The RFC states that fields containing double quotes should be +enclosed in double-quotes, with the constituent double-quotes being +escaped. The +.code get-csv +function allows an unquoted field to contain double quote characters, +which are treated as ordinary characters belonging to the field. In this +situation, a sequence of two double quotes specifies two double quotes. + +The RFC states that the last field of a record must not be followed +by a comma. Under the +.code get-csv +function, this situation is impossible. A trailing comma at the end +of a record specifies an empty last field, which is not itself +followed by a comma. + +The +.code get-csv +function does not recognize or diagnose any errors; it extracts the +maximal prefix of the input source which constitutes a valid CSV record. +Characters not belonging to the CSV record remain in the stream. +Multiple calls to +.code get-csv +for the same input stream given as +.meta source +extract consecutive CSV records. .SH* FOREIGN FUNCTION INTERFACE |