From b0bbe6e9dfd169f78b4908296d6edba52ed9a707 Mon Sep 17 00:00:00 2001 From: Kaz Kylheku Date: Sun, 25 Sep 2016 10:40:51 -0700 Subject: awk macro: proper fs semantics in paragraph mode. * share/txr/stdlib/awk.tl (sys:awk-state): New slots: par-mode, par-mode-fs, par-mode-prev-fs. (sys:awk-state rec-to-f): In paragraph mode, detect that fs has changed since the last call. In that case, take the user's fs and add to it a newline match. If it is a regex, take the source, add the syntax and recompile the regex. If it's a string, build regex around it and compile. (sys:awk-state loop): Maintain the par-mode variable in the state structure as the rs value triggers transitions into or out of paragraph mode. * txr.1: Updated documentation for rs. --- share/txr/stdlib/awk.tl | 40 +++++++++++++++++++++++++++------------- txr.1 | 29 ++++++++++++----------------- 2 files changed, 39 insertions(+), 30 deletions(-) diff --git a/share/txr/stdlib/awk.tl b/share/txr/stdlib/awk.tl index 3efd8cc6..e99f62d2 100644 --- a/share/txr/stdlib/awk.tl +++ b/share/txr/stdlib/awk.tl @@ -37,6 +37,7 @@ (rec-num 0) rec orig-rec fields nf rng-vec (rng-n 0) + par-mode par-mode-fs par-mode-prev-fs (:postinit (self) (if (plusp self.rng-n) (set self.rng-vec (vector self.rng-n))) @@ -61,8 +62,18 @@ (if (and (not self.kfs) (equal self.rec "")) (set self.fields nil self.nf 0) - (set self.fields (split-str self.rec self.fs self.kfs) - self.nf (length self.fields)))) + (let ((eff-fs (if self.par-mode + (if (equal self.fs self.par-mode-prev-fs) + self.par-mode-fs + (set self.par-mode-prev-fs self.fs + self.par-mode-fs + (regex-compile ^(or ,(if (regexp self.fs) + (regex-source self.fs) + self.fs) + "\n")))) + self.fs))) + (set self.fields (split-str self.rec eff-fs self.kfs) + self.nf (length self.fields))))) (self.ft (set self.fields (tok-str self.rec self.ft self.kfs) self.nf (length self.fields))) @@ -95,20 +106,23 @@ (set cached-rr (cond ((and (equal aws.rs "\n") (not aws.krs)) + (set aws.par-mode nil) 
(lambda () (get-line stin))) ((null aws.rs) - (let ((rin (record-adapter #/\n[ \n\t]*\n/)) - (flag t)) - (lambda () - (let ((r (get-line rin))) - (cond - (flag - (set flag nil) - (if (equal r "") - (get-line rin) - r)) - (t r)))))) + (set aws.par-mode t) + (let ((rin (record-adapter #/\n[ \n\t]*\n/)) + (flag t)) + (lambda () + (let ((r (get-line rin))) + (cond + (flag + (set flag nil) + (if (equal r "") + (get-line rin) + r)) + (t r)))))) (t + (set aws.par-mode nil) (let ((rin (record-adapter (if (regexp aws.rs) aws.rs (regex-compile aws.rs)) diff --git a/txr.1 b/txr.1 index 508b8c72..bea89447 100644 --- a/txr.1 +++ b/txr.1 @@ -38597,27 +38597,22 @@ or more blank lines (empty lines or lines containing only a mixture of tabs and spaces). This means that, effectively, the record-separating sequences match the regular expression .codn "/\en[ \en\et]*\en/" . -There is a difference between paragraph mode and simply using the above + +There are two differences between paragraph mode and simply using the above regular expression as .codn rs . -The difference is that if the first record which is read upon entering +The first difference is that if the first record which is read upon entering paragraph mode is empty (because the input begins with a match for the -separator regex), then that record is thrown away, and the next record -is read. - -Note that the POSIX Awk paragraph mode (which occurs when -.code RS -is blank) there is an additional difference: regardless of the value -of the field separator -.codn FS , -newline characters separate fields. This behavior is not implemented -in the -.code awk -macro. Since newlines are included as separators in under the default field -separation, the behaviors match in that case. Code using a custom +separator regex), then that record is thrown away, and the next record is read. 
+The second difference is that, if field separation based on the +.code fs +variable is in effect, then regardless of the value of +.codn fs , +newline characters separate fields. Therefore, the programmer-defined .code fs -must explicitly include a match for newline to obtain that as a field -separator. +doesn't have to include a match for newline. Moreover, if it is a simple +fixed string, it need not be converted to a regular expression which also +matches a newline. .coNP Variable @ krs .desc -- cgit v1.2.3