diff options
-rw-r--r-- | ChangeLog | 30 | ||||
-rw-r--r-- | awkreg.awk | 47 |
2 files changed, 66 insertions, 11 deletions
diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..7f4faff --- /dev/null +++ b/ChangeLog @@ -0,0 +1,30 @@ +2014-03-17 Kaz Kylheku <kaz@kylheku.com> + + Fix in {m,n} syntax. + + The issue is that the parser partially consumes broken {m,n} + syntax. The eat_rep_notation function must backtrack fully, + and its caller must detect it has done so. + + Improved driver code. + + * awkreg.awk (dbg): New function, handy for debugging. + Prints its argument, and then returns it. + (match_and_eat_else): New function. Similar to match_and_eat, + except that if there is no match, it returns the new third + argument, rather than the rejected string itself. + This simplifies the expression of some backtracking logic. + (eat_rep_notation): If any of the notation doesn't match, + then backtrack all the way and return the original input unconsumed. + For this, a new local variable o ("original") retains the input. + (eat_factor): When matching a parenthesized regex, if matching closing + parens after the regex fails, then backtrack all the way, + returning the whole factor unconsumed. + (regex_check): New function: contains most of the logic of + the original is_regex. Basically it is like eat_regex, but with support + for ^ and $. + (is_regex): Basically now just reports whether regex_check fully + consumes the string or not. + (driver): The driver action is improved, showing the non-matching + suffix of the regex. 
+ @@ -1,3 +1,9 @@ +function dbg(x) +{ + printf("dbg: <%s>\n", x) + return x +} + function empty(s) { return s == "" @@ -25,6 +31,13 @@ function match_and_eat(s, pfx) return s } +function match_and_eat_else(s, pfx, e) +{ + if (matches(s, pfx)) + return eat_chars(s, length(pfx)) + return e +} + function eat_rchar(c) { if (c ~ /^\\./) @@ -102,12 +115,15 @@ function eat_bracket_exp(e, } } -function eat_rep_notation(n) +function eat_rep_notation(n, + # local + o) { + o = n n = eat_char(n) if (n !~ /^[0-9]/) - return n + return o while (n ~ /^[0-9]/) n = eat_char(n) @@ -116,7 +132,7 @@ function eat_rep_notation(n) return eat_char(n) if (!matches(n, ",")) - return n + return o n = eat_char(n) @@ -124,18 +140,18 @@ function eat_rep_notation(n) return eat_char(n) if (n !~ /^[0-9]/) - return n + return o while (n ~ /^[0-9]/) n = eat_char(n) - return match_and_eat(n, "}") + return match_and_eat_else(n, "}", o) } function eat_factor(f) { if (matches(f, "(")) - return match_and_eat(eat_regex(eat_char(f)), ")") + return match_and_eat_else(eat_regex(eat_char(f)), ")", f) if (matches(f, "[")) return eat_bracket_exp(f) @@ -183,23 +199,32 @@ function eat_regex(r, return eat_regex(r) } - -function is_regex(r) +function regex_check(r) { if (matches(r, "^")) r = eat_char(r) if (empty(r)) - return 1 + return r r = eat_regex(r) if (r == "$") r = "" - return empty(r); + return r +} + +function is_regex(r) +{ + return empty(regex_check(r)) } { - printf("is_regex(%s)\n", is_regex($0) ? "yes" : "no") + ok = is_regex($0) + + printf("is_regex(\"%s\") = %d", $0, ir) + if (!ok) + printf(", junk = \"%s\"", regex_check($0)) + printf("\n") } |