summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--lib.c27
-rw-r--r--tests/015/split.tl16
-rw-r--r--txr.169
3 files changed, 92 insertions, 20 deletions
diff --git a/lib.c b/lib.c
index 5f92b6c1..f77ba01a 100644
--- a/lib.c
+++ b/lib.c
@@ -3994,11 +3994,13 @@ val tok_str(val str, val tok_regex, val keep_sep)
{
list_collect_decl (out, iter);
val pos = zero;
+ val last_end = zero;
val slen = length(str);
+ int prev_empty = 1;
keep_sep = default_bool_arg(keep_sep);
- for (;;) {
+ if (opt_compat && opt_compat <= 155) for (;;) {
cons_bind (new_pos, len, search_regex(str, tok_regex, pos, nil));
if (len == zero && new_pos != slen)
@@ -4015,6 +4017,29 @@ val tok_str(val str, val tok_regex, val keep_sep)
pos = plus(new_pos, len);
iter = list_collect(iter, sub_str(str, new_pos, pos));
+ } else for (;;) {
+ cons_bind (new_pos, len, search_regex(str, tok_regex, pos, nil));
+
+ if (!len || (new_pos == slen && !prev_empty)) {
+ if (keep_sep)
+ iter = list_collect(iter, sub_str(str, last_end, t));
+ break;
+ }
+
+ if (len != zero || prev_empty) {
+ if (keep_sep)
+ iter = list_collect(iter, sub_str(str, last_end, new_pos));
+ last_end = plus(new_pos, len);
+ iter = list_collect(iter, sub_str(str, new_pos, last_end));
+ prev_empty = (len == zero);
+ } else {
+ prev_empty = 1;
+ }
+
+ pos = plus(new_pos, len);
+
+ if (len == zero)
+ pos = succ(pos);
}
return out;
diff --git a/tests/015/split.tl b/tests/015/split.tl
index 30a8e01c..ae77a642 100644
--- a/tests/015/split.tl
+++ b/tests/015/split.tl
@@ -123,34 +123,34 @@
(split-str "abcacabcac" #/ab?/ t) ("" "ab" "c" "a" "c" "ab" "c" "a" "c"))
(mtest
- (tok-str "" #//) nil
- (tok-str "a" #//) nil
+ (tok-str "" #//) ("")
+ (tok-str "a" #//) ("" "")
(tok-str "" #/a/) nil
(tok-str "a" #/a/) ("a"))
(mtest
- (tok-str "" #// t) ("")
- (tok-str "a" #// t) ("a")
+ (tok-str "" #// t) ("" "" "")
+ (tok-str "a" #// t) ("" "" "a" "" "")
(tok-str "" #/a/ t) ("")
(tok-str "a" #/a/ t) ("" "a" ""))
(mtest
- (tok-str "ab" #//) ("")
+ (tok-str "ab" #//) ("" "" "")
(tok-str "ab" #/a/) ("a")
(tok-str "ab" #/b/) ("b")
(tok-str "ab" #/ab/) ("ab")
(tok-str "ab" #/abc/) nil)
(mtest
- (tok-str "ab" #// t) ("a" "" "b")
+ (tok-str "ab" #// t) ("" "" "a" "" "b" "" "")
(tok-str "ab" #/a/ t) ("" "a" "b")
(tok-str "ab" #/b/ t) ("a" "b" "")
(tok-str "ab" #/ab/ t) ("" "ab" "")
(tok-str "ab" #/abc/ t) ("ab"))
(mtest
- (tok-str "abc" #//) ("" "")
- (tok-str "abc" #// t) ("a" "" "b" "" "c"))
+ (tok-str "abc" #//) ("" "" "" "")
+ (tok-str "abc" #// t) ("" "" "a" "" "b" "" "c" "" ""))
(mtest
(tok-str "abc" #/a/) ("a")
diff --git a/txr.1 b/txr.1
index 007feb36..d2d45722 100644
--- a/txr.1
+++ b/txr.1
@@ -19021,7 +19021,7 @@ into the resulting list, such that if the resulting
list is catenated, a string equivalent to the original
string will be produced.
-Note: To split a string into pieces of length one such that an empty string
+Note: to split a string into pieces of length one such that an empty string
produces
.code nil
rather than
@@ -19032,6 +19032,31 @@ use the
.cble
pattern.
+Note: the function call
+.code "(split-str s r t)"
+produces a list identical to that produced by
+.codn "(tok-str s r t)" ,
+for all values of
+.code r
+and
+.codn s ,
+provided that
+.code r
+does not match empty strings. If
+.code r
+matches empty strings, then the
+.code tok-str
+call returns extra elements compared to
+.codn split-str ,
+because
+.code tok-str
+allows empty matches to take place and extract empty tokens
+before the first character of the string, and after the
+last character, whereas
+.code split-str
+does not recognize empty separators at these outer limits
+of the string.
+
.coNP Function @ split-str-set
.synb
.mets (split-str-set < string << set )
@@ -19089,25 +19114,36 @@ matches an empty string, then an empty token is returned, and
the search for another token within
.meta string
resumes after advancing by one
-character position. So for instance,
+character position. However, if an empty match occurs immediately
+after a non-empty token, that empty match is not turned into
+a token.
+
+So for instance,
.cblk
(tok-str "abc" #/a?/)
.cble
-returns the
+returns
.cblk
-("a" "" "" "").
+("a" "" "").
.cble
After the token
.str "a"
is extracted from a non-empty match
-for the regex, the regex is considered to match three more times: before the
-.strn "b" ,
-between
-.str "b"
+for the regex, an empty match for the regex occurs just
+before the character
+.codn b .
+This match is discarded because it is an empty match which
+immediately follows the non-empty match. The character
+.code b
+is skipped. The next match is an empty match between the
+.code b
and
-.strn "c" ,
-and after the
-.strn "c" .
+.code c
+characters. This match causes an empty token to be
+extracted. The character
+.code c
+is skipped, and one more empty match occurs after that
+character, which is extracted as the second empty token.
If the
.meta keep-between
@@ -47785,6 +47821,17 @@ of these version values, the described behaviors are provided if
is given an argument which is equal or lower. For instance
.code "-C 103"
selects the behaviors described below for version 105, but not those for 102.
+.IP 155
+After version 155, the
+.code tok-str
+and
+.code tok-where
+functions changed semantics. Previously, under some conditions, these
+functions extracted an empty token immediately following a non-empty
+token. That behavior worked as designed and documented, but the design
+was flawed, creating a major difficulty in simple tokenizing tasks when
+tokens may be empty strings. Requesting compatibility with version 155
+or earlier restores the previous behavior.
.IP 154
After version 154, changes were introduced in the semantics of struct
literals. Previously, the syntax