summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKaz Kylheku <kaz@kylheku.com>2014-03-17 11:48:29 -0700
committerKaz Kylheku <kaz@kylheku.com>2014-03-17 11:48:29 -0700
commit6ac24f8203cd10d5442a02c220a1229b2b7d5513 (patch)
tree46186b93b835f7027470440b8eae00f091bb0182
parentb9f7be76a0d7fd986603ae24ff820547d0f78716 (diff)
downloadunix-cruft-6ac24f8203cd10d5442a02c220a1229b2b7d5513.tar.gz
unix-cruft-6ac24f8203cd10d5442a02c220a1229b2b7d5513.tar.bz2
unix-cruft-6ac24f8203cd10d5442a02c220a1229b2b7d5513.zip
Fix in {m,n} syntax. Improved driver code.
The issue is that the parser partially consumes broken {m,n} syntax. The eat_rep_notation function must backtrack fully, and its caller must detect it has done so.
-rw-r--r--ChangeLog30
-rw-r--r--awkreg.awk47
2 files changed, 66 insertions, 11 deletions
diff --git a/ChangeLog b/ChangeLog
new file mode 100644
index 0000000..7f4faff
--- /dev/null
+++ b/ChangeLog
@@ -0,0 +1,30 @@
+2014-03-17 Kaz Kylheku <kaz@kylheku.com>
+
+ Fix in {m,n} syntax.
+
+ The issue is that the parser partially consumes broken {m,n}
+ syntax. The eat_rep_notation function must backtrack fully,
+ and its caller must detect it has done so.
+
+ Improved driver code.
+
+ * awkreg.awk (dbg): New function, handy for debugging.
+ Prints its argument, and then returns it.
+ (match_and_eat_else): New function. Similar to match_and_eat,
+ except that if there is no match, it returns the new third
+ argument, rather than the rejected string itself.
+ This simplifies the expression of some backtracking logic.
+ (eat_rep_notation): If any of the notation doesn't match,
+ then backtrack all the way and return the original input unconsumed.
+ For this, a new local variable o ("original") retains the input.
+ (eat_factor): When matching a parenthesized regex, if matching closing
+ parens after the regex fails, then backtrack all the way,
+ returning the whole factor unconsumed.
+ (regex_check): New function: contains most of the logic of
+ the original is_regex. Basically it is like eat_regex, but with support
+ for ^ and $.
+ (is_regex): Basically now just reports whether regex_check fully
+ consumes the string or not.
+ (driver): The driver action is improved, showing the non-matching
+ suffix of the regex.
+
diff --git a/awkreg.awk b/awkreg.awk
index a0ea898..f966168 100644
--- a/awkreg.awk
+++ b/awkreg.awk
@@ -1,3 +1,9 @@
+# Debugging aid: prints x in the form "dbg: <x>" followed by a newline,
+# then returns x unchanged, so a call can be wrapped around an expression.
+function dbg(x)
+{
+ printf("dbg: <%s>\n", x)
+ return x
+}
+
function empty(s)
{
return s == ""
@@ -25,6 +31,13 @@ function match_and_eat(s, pfx)
return s
}
+# Variant of match_and_eat with an explicit fallback value.
+# When matches(s, pfx) succeeds, returns eat_chars(s, length(pfx));
+# otherwise returns e rather than the rejected string s. Passing the
+# caller's original input as e lets the caller backtrack in one step.
+function match_and_eat_else(s, pfx, e)
+{
+ if (matches(s, pfx))
+ return eat_chars(s, length(pfx))
+ return e
+}
+
function eat_rchar(c)
{
if (c ~ /^\\./)
@@ -102,12 +115,15 @@ function eat_bracket_exp(e,
}
}
-function eat_rep_notation(n)
+function eat_rep_notation(n,
+ # local
+ o)
{
+ o = n
n = eat_char(n)
if (n !~ /^[0-9]/)
- return n
+ return o
while (n ~ /^[0-9]/)
n = eat_char(n)
@@ -116,7 +132,7 @@ function eat_rep_notation(n)
return eat_char(n)
if (!matches(n, ","))
- return n
+ return o
n = eat_char(n)
@@ -124,18 +140,18 @@ function eat_rep_notation(n)
return eat_char(n)
if (n !~ /^[0-9]/)
- return n
+ return o
while (n ~ /^[0-9]/)
n = eat_char(n)
- return match_and_eat(n, "}")
+ return match_and_eat_else(n, "}", o)
}
function eat_factor(f)
{
if (matches(f, "("))
- return match_and_eat(eat_regex(eat_char(f)), ")")
+ return match_and_eat_else(eat_regex(eat_char(f)), ")", f)
if (matches(f, "["))
return eat_bracket_exp(f)
@@ -183,23 +199,32 @@ function eat_regex(r,
return eat_regex(r)
}
-
-function is_regex(r)
+# Like eat_regex, but additionally accepts a leading "^" and a trailing "$".
+# Returns whatever part of r was left unconsumed: the empty string when
+# nothing remains (including the case where only "$" was left over),
+# otherwise the remainder returned by eat_regex.
+function regex_check(r)
{
if (matches(r, "^"))
r = eat_char(r)
if (empty(r))
- return 1
+ return r
r = eat_regex(r)
if (r == "$")
r = ""
- return empty(r);
+ return r
+}
+
+# Reports whether regex_check(r) leaves nothing unconsumed: the result is
+# that of empty() applied to the remainder, i.e. true only when it is "".
+function is_regex(r)
+{
+ return empty(regex_check(r))
}
{
- printf("is_regex(%s)\n", is_regex($0) ? "yes" : "no")
+ # For each input line, print the is_regex verdict; when the line is
+ # rejected, also print the suffix that regex_check left unconsumed.
+ ok = is_regex($0)
+
+ printf("is_regex(\"%s\") = %d", $0, ok)
+ if (!ok)
+ printf(", junk = \"%s\"", regex_check($0))
+ printf("\n")
}