diff options
-rw-r--r-- | stdlib/awk.tl | 8 | ||||
-rw-r--r-- | tests/015/awk-basic.tl | 18 | ||||
-rw-r--r-- | txr.1 | 59 |
3 files changed, 82 insertions, 3 deletions
diff --git a/stdlib/awk.tl b/stdlib/awk.tl index 50121736..35bd436e 100644 --- a/stdlib/awk.tl +++ b/stdlib/awk.tl @@ -95,6 +95,14 @@ (if (< end l) (add [self.rec end..:]))) self.nf i)))) + ((eq self.fs :csv) + (when self.ft + (awk-error "both fs and ft set")) + (when self.kfs + (awk-error "fs set to :csv and ft set")) + (lambda (self) + (set self.fields (get-csv self.rec) + self.nf (length self.fields)))) (self.fs (when self.ft (awk-error "both fs and ft set")) diff --git a/tests/015/awk-basic.tl b/tests/015/awk-basic.tl index a411d495..caf6087f 100644 --- a/tests/015/awk-basic.tl +++ b/tests/015/awk-basic.tl @@ -39,3 +39,21 @@ (t (set f f) (prn))) "one,two,three\nfour,five,six\nseven,eight,nine,ten\n") + +(defvarl *d1* "a,b,c\r\n\r\nd,e,f\r\ng,h,i") + +(motest + (awk (:inputs (make-string-input-stream *d1*)) + (:set fs :csv) + (t (prn nf))) + "3\n1\n3\n3\n" + (awk (:inputs (make-string-input-stream *d1*)) + (:set fs :csv ofs "|") + (t (set f f) + (prn))) + "a|b|c\r\n\r\nd|e|f\r\ng|h|i\n" + (awk (:inputs (make-string-input-stream *d1*)) + (:set fs :csv rs "\r\n" ofs "|") + (t (set f f) + (prn))) + "a|b|c\n\nd|e|f\ng|h|i\n") @@ -71914,18 +71914,42 @@ and each specify a string or regular expression which is used for each record that is stored in the .code rec -variable into fields. +variable into fields. Additionally, +.code fs +may be assigned the keyword symbol +.code :csv +to enable CSV-based separation of fields. Both variables are initialized to .codn nil , in which case a default behavior is in effect, described below. -Use of these variable is mutually exclusive; it is an error for both of these +Note that whenever the variable +.code fw +has a value other than +.codn nil , +then the +.code fs +and +.code ft +variables are ignored; field splitting takes place according to the +field-widths indicated in that variable. 
+ +Use of the +.code fs +and +.code ft +variables is mutually exclusive; it is an error for both of these variables to simultaneously have a value other than .codn nil . The value stored in either variable must be .codn nil , -a character string or a regular expression. If it contains a string or +a character string, a regular expression or, +in the case of +.codn fs , +the keyword +.codn :csv . +If it contains a string or regex, it is said to contain a pattern. A string value effectively behaves as a fixed regular expression which matches the sequence of characters in the string verbatim, without treating any of them as regex operators. @@ -71975,6 +71999,35 @@ The tokenizing is performed using the .code tok-str function. +When the value of +.code fs +is the keyword symbol +.codn :csv , +the +.code kfs +variable is ignored. The +.code awk +macro then operates in CSV mode. +Each record +.code rec +is split into fields via the +.code get-csv +function. Because that function returns a vector, the field list +.code f +is a vector in CSV mode. + +Note that in CSV mode, +.code awk +still controls the splitting of the input stream into records; the +.code get-csv +function is given a delimited record to split into fields. +If the CSV input uses CR-LF line termination, then +.code awk +must be configured to recognize it via the +.code rs +variable; otherwise the CR characters end up as constituents of +the last field. + If .code fs and |