summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- stdlib/awk.tl 8
-rw-r--r-- tests/015/awk-basic.tl 18
-rw-r--r-- txr.1 59
3 files changed, 82 insertions, 3 deletions
diff --git a/stdlib/awk.tl b/stdlib/awk.tl
index 50121736..35bd436e 100644
--- a/stdlib/awk.tl
+++ b/stdlib/awk.tl
@@ -95,6 +95,14 @@
(if (< end l)
(add [self.rec end..:])))
self.nf i))))
+ ((eq self.fs :csv)
+ (when self.ft
+ (awk-error "both fs and ft set"))
+ (when self.kfs
+ (awk-error "fs set to :csv and ft set"))
+ (lambda (self)
+ (set self.fields (get-csv self.rec)
+ self.nf (length self.fields))))
(self.fs
(when self.ft
(awk-error "both fs and ft set"))
diff --git a/tests/015/awk-basic.tl b/tests/015/awk-basic.tl
index a411d495..caf6087f 100644
--- a/tests/015/awk-basic.tl
+++ b/tests/015/awk-basic.tl
@@ -39,3 +39,21 @@
(t (set f f)
(prn)))
"one,two,three\nfour,five,six\nseven,eight,nine,ten\n")
+
+(defvarl *d1* "a,b,c\r\n\r\nd,e,f\r\ng,h,i")
+
+(motest
+ (awk (:inputs (make-string-input-stream *d1*))
+ (:set fs :csv)
+ (t (prn nf)))
+ "3\n1\n3\n3\n"
+ (awk (:inputs (make-string-input-stream *d1*))
+ (:set fs :csv ofs "|")
+ (t (set f f)
+ (prn)))
+ "a|b|c\r\n\r\nd|e|f\r\ng|h|i\n"
+ (awk (:inputs (make-string-input-stream *d1*))
+ (:set fs :csv rs "\r\n" ofs "|")
+ (t (set f f)
+ (prn)))
+ "a|b|c\n\nd|e|f\ng|h|i\n")
diff --git a/txr.1 b/txr.1
index b5dcf076..6ca45263 100644
--- a/txr.1
+++ b/txr.1
@@ -71914,18 +71914,42 @@ and
each specify a string or regular expression which is used for each
record that is stored in the
.code rec
-variable into fields.
+variable into fields. Additionally,
+.code fs
+may be assigned the keyword symbol
+.code :csv
+to enable CSV-based separation of fields.
Both variables are initialized to
.codn nil ,
in which case a default behavior is in effect, described below.
-Use of these variable is mutually exclusive; it is an error for both of these
+Note that whenever the variable
+.code fw
+has a value other than
+.codn nil ,
+then the
+.code fs
+and
+.code ft
+variables are ignored; field splitting takes place according to the
+field widths indicated in that variable.
+
+Use of the
+.code fs
+and
+.code ft
+variables is mutually exclusive; it is an error for both of these
variables to simultaneously have a value other than
.codn nil .
The value stored in either variable must be
.codn nil ,
-a character string or a regular expression. If it contains a string or
+a character string, a regular expression or,
+in the case of
+.codn fs ,
+the keyword
+.codn :csv .
+If it contains a string or
regex, it is said to contain a pattern. A string value effectively behaves
as a fixed regular expression which matches the sequence of characters
in the string verbatim, without treating any of them as regex operators.
@@ -71975,6 +71999,35 @@ The tokenizing is performed using the
.code tok-str
function.
+When the value of
+.code fs
+is the keyword symbol
+.codn :csv ,
+the
+.code kfs
+variable is ignored. The
+.code awk
+macro then operates in CSV mode.
+Each record
+.code rec
+is split into fields via the
+.code get-csv
+function. Because that function returns a vector, the field list
+.code f
+is a vector in CSV mode.
+
+Note that in CSV mode,
+.code awk
+still controls the splitting of the input stream into records; the
+.code get-csv
+function is given a delimited record to split into fields.
+If the CSV input uses CR-LF line termination, then
+.code awk
+must be configured to recognize it via the
+.code rs
+variable; otherwise, the CR characters end up as constituents of
+the last field of each record.
+
If
.code fs
and