regex: don't consume input past final match.

The read-until-match functions and the two others in the same family always read a character beyond the characters matched by the regex. This will cause blocking behavior in cases where a TTY or network socket has provided the a matching record delimiter already, using a trivial, fixed-length regex. Similar behavior is seen in GNU Awk also, with its RS (record separator); let's fix it in our world. We introduce a REGM_MATCH_DONE result code, which, like REGM_MATCH, indicates that the state machine is an acceptance state. Unlike REGM_MATCH it also indicates that no more transitions are possible. For instance, for a regex like #/ab|c/, the REGM_MATCH_DONE code will be indicated when the input "ab" is seen, or the input "c" is seen. Any additional characters will cause a mismatch. This indication makes it possible for the caller to avoid reading more characters from an input source. * regex.c (enum regm_reesult, regm_result_t): New REGM_MATCH_DONE enum member. (nfa_has_transitions): New macro. (nfa_closure, nfa_move_closure): New pointer-to-int parameter more. This is set to true only if one or more states in the output state have transitions. (nfa_run): Initialize new local variable more and pass to nfa_closure and nfa_move closure. Break out of the character feeding loop if more is zero. (regex_machine_reset): Pass more parameter to nfa_closure. (regex_machine_feed): Pass more parameter to nfa_move_closure. When returning REG_MATCH, if more is false, return REG_MATCH_DONE. In the derivatives implementation, we report REGM_MATCH_DONE when the derivative we have calculated is null. (search_regex, match_regex): Break loop on REGM_MATCH_DONE, and avoid feeding the null character in that case. (match_regex_right): Likewise, and also handle the REGM_MATCH_DONE case specially at the end. We need to check whether the match reached the end of the string (is anchored to the right). If not, we continue the search. (regex_prefix_match): Break loop on REGM_MATCH_DONE. (scan_until_common): If we hit REGM_MATCH_DONE, break out of the loop and proceed straight to the out_match block, indicating that no characters need to be pushed back from the stack.
author: Kaz Kylheku <kaz@kylheku.com> 2024-07-03 18:21:17 -0700
committer: Kaz Kylheku <kaz@kylheku.com> 2024-07-03 18:21:17 -0700
commit: 0ccf4483d8ed2ad5805116df5dba4a858e5c373a (patch)
tree: f8e2a595c4a52afb55f3025c767ae790de231357 /regex.c
parent: 77deceded0e5c9143e01d07a19eb219b3273151b (diff)
download: txr-0ccf4483d8ed2ad5805116df5dba4a858e5c373a.tar.gz
txr-0ccf4483d8ed2ad5805116df5dba4a858e5c373a.tar.bz2
txr-0ccf4483d8ed2ad5805116df5dba4a858e5c373a.zip
1 files changed, 61 insertions, 26 deletions
diff --git a/regex.c b/regex.c
index dc356325..eda096f3 100644
--- a/regex.c
+++ b/regex.c
@@ -81,6 +81,8 @@ typedef struct regex {
  * REGM_INCOMPLETE: no match at this character, but matching can continue.
  * REGM_FAIL: no more state transitions are possible.
  * REGM_MATCH: match (accept state) for this character.
+ * REGM_MATCH_DONE: match (accept state) for this character, and no more
+ * moves are possible.
  *
  * When the end of the input is encountered, or a REGM_FAIL,
  * then regex_machine_feed is called one more time with
@@ -88,6 +90,7 @@ typedef struct regex {
  * REGM_INCOMPLETE: there was a partial match for the input.
  * REGM_FAIL: none of the input matched.
  * REGM_MATCH: the input was completely matched
+ * REGM_MATCH_DONE: not returned.
  *
  * Note that a REGM_FAIL (no transitions) during the character feeding phase
  * can turn into REGM_INCOMPLETE (partial match) when the match is sealed with
@@ -96,7 +99,8 @@ typedef struct regex {
 typedef enum regm_result {
   REGM_INCOMPLETE,
   REGM_FAIL,
-  REGM_MATCH
+  REGM_MATCH,
+  REGM_MATCH_DONE
 } regm_result_t;
 
 typedef union regex_machine regex_machine_t;
@@ -230,6 +234,8 @@ union nfa_state {
 #define nfa_accept_state_p(s) ((s)->a.kind == nfa_accept)
 #define nfa_empty_state_p(s) ((s)->a.kind == nfa_accept || \
                               (s)->a.kind == nfa_empty)
+#define nfa_has_transitions(s) ((s)->a.kind != nfa_empty || \
+                                (s)->e.trans0 || (s)->e.trans1)
 
 struct nfa_machine {
   int is_nfa;           /* common member */
@@ -1272,10 +1278,11 @@ static nfa_t nfa_optimize(nfa_t nfa)
  * (Transitions that don't do not consume and match an input character).
  */
 static int nfa_closure(nfa_state_t **stack, nfa_state_t **set, int nin,
-                       int nstates, unsigned visited, int *accept)
+                       int nstates, unsigned visited, int *accept, int *more)
 {
   int i, nout = 0;
   int stackp = 0;
+  int moreset = 0;
 
   /* First, add all states in the input set to the closure,
      push them on the stack, and mark them as visited. */
@@ -1288,6 +1295,8 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **set, int nin,
     set[nout++] = s;
     if (nfa_accept_state_p(s))
       *accept = 1;
+    if (!moreset && nfa_has_transitions(s))
+      moreset = *more = 1;
   }
 
   while (stackp) {
@@ -1305,6 +1314,8 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **set, int nin,
         set[nout++] = e0;
         if (nfa_accept_state_p(e0))
           *accept = 1;
+        if (!moreset && nfa_has_transitions(e0))
+          *more = 1;
       }
 
       if (nfa_test_set_visited(e1, visited)) {
@@ -1312,6 +1323,8 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **set, int nin,
         set[nout++] = e1;
         if (nfa_accept_state_p(e1))
           *accept = 1;
+        if (!moreset && nfa_has_transitions(e1))
+          *more = 1;
       }
     }
   }
@@ -1327,9 +1340,9 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **set, int nin,
  */
 static int nfa_move_closure(nfa_state_t **stack, nfa_state_t **set, int nin,
                             int nstates, wchar_t ch,
-                            unsigned visited, int *accept)
+                            unsigned visited, int *accept, int *more)
 {
-  int i, nout, stackp;
+  int i, nout, stackp, moreset = 0;
 
   /*
    * Compute the move set from a given set of NFA states. The move
@@ -1371,6 +1384,8 @@ static int nfa_move_closure(nfa_state_t **stack, nfa_state_t **set, int nin,
         set[nout++] = mov;
         if (nfa_accept_state_p(mov))
           *accept = 1;
+        if (!moreset && nfa_has_transitions(mov))
+          *more = 1;
       }
     }
   }
@@ -1390,6 +1405,8 @@ static int nfa_move_closure(nfa_state_t **stack, nfa_state_t **set, int nin,
         set[nout++] = e0;
         if (nfa_accept_state_p(e0))
           *accept = 1;
+        if (!moreset && nfa_has_transitions(e0))
+          *more = 1;
       }
 
       if (nfa_test_set_visited(e1, visited)) {
@@ -1397,6 +1414,8 @@ static int nfa_move_closure(nfa_state_t **stack, nfa_state_t **set, int nin,
         set[nout++] = e1;
         if (nfa_accept_state_p(e1))
           *accept = 1;
+        if (!moreset && nfa_has_transitions(e0))
+          *more = 1;
       }
     }
   }
@@ -1434,13 +1453,13 @@ static cnum nfa_run(nfa_t nfa, int nstates, const wchar_t *str)
   nfa_state_t **set = coerce(nfa_state_t **, alloca(nstates * sizeof *set));
   nfa_state_t **stack = coerce(nfa_state_t **, alloca(nstates * sizeof *stack));
   int nclos = 0;
-  int accept = 0;
+  int accept = 0, more = 0;
 
   nfa_handle_wraparound(nfa.start, &visited);
 
   if (nfa.start) {
     set[0] = nfa.start;
-    nclos = nfa_closure(stack, set, 1, nstates, ++visited, &accept);
+    nclos = nfa_closure(stack, set, 1, nstates, ++visited, &accept, &more);
   }
 
   if (accept)
@@ -1449,17 +1468,17 @@ static cnum nfa_run(nfa_t nfa, int nstates, const wchar_t *str)
   for (; *ptr != 0; ptr++) {
     wchar_t ch = *ptr;
 
-    accept = 0;
+    accept = more = 0;
 
     nfa_handle_wraparound(nfa.start, &visited);
 
     nclos = nfa_move_closure(stack, set, nclos,
-                             nstates, ch, ++visited, &accept);
+                             nstates, ch, ++visited, &accept, &more);
 
     if (accept)
       last_accept_pos = ptr + 1;
 
-    if (nclos == 0) /* dead end; no match */
+    if (nclos == 0 || more == 0) /* dead end */
       break;
   }
 
@@ -2470,7 +2489,7 @@ static cnum regex_run(val compiled_regex, const wchar_t *str)
 
 static void regex_machine_reset(regex_machine_t *regm)
 {
-  int accept = 0;
+  int accept = 0, more = 0;
 
   regm->n.last_accept_pos = -1;
   regm->n.count = 0;
@@ -2484,7 +2503,7 @@ static void regex_machine_reset(regex_machine_t *regm)
       regm->n.set[0] = s;
       regm->n.nclos = nfa_closure(regm->n.stack, regm->n.set, 1,
                                   regm->n.nstates,
-                                  regm->n.visited, &accept);
+                                  regm->n.visited, &accept, &more);
       s->a.visited = regm->n.visited;
     } else {
       regm->n.nclos = 0;
@@ -2541,7 +2560,7 @@ static regm_result_t regex_machine_infer_init_state(regex_machine_t *regm)
 
 static regm_result_t regex_machine_feed(regex_machine_t *regm, wchar_t ch)
 {
-  int accept = 0;
+  int accept = 0, more = 0;
 
   if (regm->n.is_nfa) {
     nfa_handle_wraparound(regm->n.nfa.start, &regm->n.visited);
@@ -2552,14 +2571,14 @@ static regm_result_t regex_machine_feed(regex_machine_t *regm, wchar_t ch)
       regm->n.nclos = nfa_move_closure(regm->n.stack,
                                        regm->n.set, regm->n.nclos,
                                        regm->n.nstates, ch, ++regm->n.visited,
-                                       &accept);
+                                       &accept, &more);
 
       if (regm->n.nfa.start)
         regm->n.nfa.start->a.visited = regm->n.visited;
 
       if (accept) {
         regm->n.last_accept_pos = regm->n.count;
-        return REGM_MATCH;
+        return more ? REGM_MATCH : REGM_MATCH_DONE;
       }
 
       return (regm->n.nclos != 0) ? REGM_INCOMPLETE : REGM_FAIL;
@@ -2573,15 +2592,14 @@ static regm_result_t regex_machine_feed(regex_machine_t *regm, wchar_t ch)
 
       if ((accept = reg_nullable(regm->d.deriv))) {
         regm->d.last_accept_pos = regm->d.count;
-        return REGM_MATCH;
+        return regm->d.deriv ? REGM_MATCH : REGM_MATCH_DONE;
       }
 
       return (regm->d.deriv != t) ? REGM_INCOMPLETE : REGM_FAIL;
     }
   }
 
-  /* Reached if the null character is
-     consumed, or NFA/derivation hit a transition dead end. */
+  /* Reached if the null character is consumed. */
 
   if (regm->n.last_accept_pos == regm->n.count)
     return REGM_MATCH;
@@ -2645,13 +2663,18 @@ again:
         }
         break;
       }
+
+      if (last_res == REGM_MATCH_DONE)
+        break;
     }
 
-    last_res = regex_machine_feed(&regm, 0);
+    if (last_res != REGM_MATCH_DONE)
+      last_res = regex_machine_feed(&regm, 0);
 
     switch (last_res) {
     case REGM_INCOMPLETE:
     case REGM_MATCH:
+    case REGM_MATCH_DONE:
       retval = cons(pos, num(regex_machine_match_span(&regm)));
       regex_machine_cleanup(&regm);
       return retval;
@@ -2736,15 +2759,17 @@ val match_regex(val str, val reg, val pos)
 
   for (i = pos; length_str_gt(str, i); i = plus(i, one)) {
     last_res = regex_machine_feed(&regm, c_chr(chr_str(str, i)));
-    if (last_res == REGM_FAIL)
+    if (last_res == REGM_FAIL || last_res == REGM_MATCH_DONE)
       break;
   }
 
-  last_res = regex_machine_feed(&regm, 0);
+  if (last_res != REGM_MATCH_DONE)
+    last_res = regex_machine_feed(&regm, 0);
 
   switch (last_res) {
   case REGM_INCOMPLETE:
   case REGM_MATCH:
+  case REGM_MATCH_DONE:
     retval = plus(pos, num(regex_machine_match_span(&regm)));
     regex_machine_cleanup(&regm);
     return retval;
@@ -2809,23 +2834,28 @@ val match_regex_right(val str, val regex, val end)
 
   while (le(pos, end)) {
     regex_machine_t regm;
-    val i ;
+    val i;
     regm_result_t last_res = REGM_INCOMPLETE;
 
     regex_machine_init(self, &regm, regex);
 
     for (i = pos; lt(i, end); i = plus(i, one)) {
       last_res = regex_machine_feed(&regm, c_chr(chr_str(str, i)));
-      if (last_res == REGM_FAIL)
+      if (last_res == REGM_FAIL || last_res == REGM_MATCH_DONE)
         break;
     }
 
-    last_res = regex_machine_feed(&regm, 0);
+    if (last_res != REGM_MATCH_DONE)
+      last_res = regex_machine_feed(&regm, 0);
 
     switch (last_res) {
+    case REGM_MATCH_DONE:
+      if (!lt(plus(i, one), end)) {
     case REGM_MATCH:
-      regex_machine_cleanup(&regm);
-      return minus(end, pos);
+        regex_machine_cleanup(&regm);
+        return minus(end, pos);
+      }
+      /* fallthrough */
     case REGM_INCOMPLETE:
     case REGM_FAIL:
       regex_machine_cleanup(&regm);
@@ -2861,7 +2891,7 @@ val regex_prefix_match(val reg, val str, val pos)
 
   for (i = pos; length_str_gt(str, i); i = plus(i, one)) {
     last_res = regex_machine_feed(&regm, c_chr(chr_str(str, i)));
-    if (last_res == REGM_FAIL)
+    if (last_res == REGM_FAIL || last_res == REGM_MATCH_DONE)
       break;
   }
 
@@ -2870,6 +2900,7 @@ val regex_prefix_match(val reg, val str, val pos)
   switch (last_res) {
   case REGM_INCOMPLETE:
   case REGM_MATCH:
+  case REGM_MATCH_DONE:
     return t;
   default:
   case REGM_FAIL:
@@ -3173,6 +3204,7 @@ static val scan_until_common(val self, val regex, val stream_in,
           goto out_match;
         break;
       case REGM_MATCH:
+      case REGM_MATCH_DONE:
         goto out_match;
       }
       break;
@@ -3201,6 +3233,9 @@ static val scan_until_common(val self, val regex, val stream_in,
 
       regex_machine_reset(&regm);
       continue;
+    case REGM_MATCH_DONE:
+      match = stack;
+      goto out_match;
     case REGM_MATCH:
       push(ch, &stack);
       match = stack;
author	Kaz Kylheku <kaz@kylheku.com>	2024-07-03 18:21:17 -0700
committer	Kaz Kylheku <kaz@kylheku.com>	2024-07-03 18:21:17 -0700
commit	0ccf4483d8ed2ad5805116df5dba4a858e5c373a (patch)
tree	f8e2a595c4a52afb55f3025c767ae790de231357 /regex.c
parent	77deceded0e5c9143e01d07a19eb219b3273151b (diff)
download	txr-0ccf4483d8ed2ad5805116df5dba4a858e5c373a.tar.gz txr-0ccf4483d8ed2ad5805116df5dba4a858e5c373a.tar.bz2 txr-0ccf4483d8ed2ad5805116df5dba4a858e5c373a.zip