From 01504b7436608a7501bb06f4f1b965607ceb1345 Mon Sep 17 00:00:00 2001 From: Kaz Kylheku Date: Sun, 9 Mar 2014 00:02:03 -0800 Subject: Issue: match_regex and search_regex were continuing to feed characters to the regex machine even when there is no transition available. This was due to the broken return value protocol of regex_machine_feed. For instance for the regex / +/ (one or more spaces), after matching some spaces, it would report REGM_INCOMPLETE for additional non-space characters, never reporting REGM_FAIL. * regex.c (regm_result_t): Block comment added, documenting protocol. (regex_machine_feed): Return REGM_FAIL if there are no transitions for the given character, even a partial match has been recorded. This is a signal to stop feeding more characters. At that point, the function can be called with a null character to distinguish the three cases: fail, partial or full match. (search_regex): Now when the search loop gets a REGM_FAIL, it can no longer assume that nothing was matched and the search must restart at the next position. Upon the REGM_FAIL signal, it is necesary to seal the search by feeding in the 0 character. Only if that returns REGM_FAIL is it a no match situation. Otherwise it is actually a match! --- regex.c | 64 ++++++++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 20 deletions(-) (limited to 'regex.c') diff --git a/regex.c b/regex.c index 2e24338d..2137ca56 100644 --- a/regex.c +++ b/regex.c @@ -54,8 +54,32 @@ typedef struct nfa { nfa_state_t *accept; } nfa_t; +/* + * Result from regex_machine_feed. + * These values have two meanings, based on whether + * the matching is still open (characters are being fed) + * or finalized. + * + * When feeding characters: + * REGM_INCOMPLETE: no match at this character, but matching can continue. + * REGM_FAIL: no more state transitions are possible. + * REGM_MATCH: match (accept state) for this character. + * + * When the end of the input is encountered, or a REGM_FAIL, + * then regex_machine_feed is called one more time with + * the null character. It then reports: + * REGM_INCOMPLETE: there was a partial match for the input. + * REGM_FAIL: none of the input matched. + * REGM_MATCH: the input was completely matched + * + * Note that a REGM_FAIL (no transitions) during the character feeding phase + * can turn into REGM_INCOMPLETE (partial match) when the match is sealed with + * the null character signal! + */ typedef enum regm_result { - REGM_INCOMPLETE, REGM_FAIL, REGM_MATCH + REGM_INCOMPLETE, + REGM_FAIL, + REGM_MATCH } regm_result_t; typedef union regex_machine regex_machine_t; @@ -1097,7 +1121,7 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin, int i, nout = 0; int stackp = 0; - /* First, add all states in the input state to the closure, + /* First, add all states in the input set to the closure, push them on the stack, and mark them as visited. */ for (i = 0; i < nin; i++) { if (stackp >= NFA_SET_SIZE) @@ -1728,16 +1752,14 @@ static regm_result_t regex_machine_feed(regex_machine_t *regm, wchar_t ch) regm->n.nmove, regm->n.clos, regm->n.visited++, &accept); - if (accept) - regm->n.last_accept_pos = regm->n.count; - } + regm->n.nfa.start->a.visited = regm->n.visited; - regm->n.nfa.start->a.visited = regm->n.visited; + if (accept) { + regm->n.last_accept_pos = regm->n.count; + return REGM_MATCH; + } - if (ch && regm->n.nclos != 0) { - if (accept) - return REGM_MATCH; - return REGM_INCOMPLETE; + return (regm->n.nclos != 0) ? REGM_INCOMPLETE : REGM_FAIL; } } else { val accept = nil; @@ -1745,14 +1767,13 @@ static regm_result_t regex_machine_feed(regex_machine_t *regm, wchar_t ch) if (ch != 0) { regm->d.count++; regm->d.deriv = reg_derivative(regm->d.deriv, chr(ch)); - if ((accept = reg_nullable(regm->d.deriv))) + + if ((accept = reg_nullable(regm->d.deriv))) { regm->d.last_accept_pos = regm->d.count; - } + return REGM_MATCH; + } - if (ch && regm->d.deriv != t) { - if (accept) - return REGM_MATCH; - return REGM_INCOMPLETE; + return (regm->d.deriv != t) ? REGM_INCOMPLETE : REGM_FAIL; } } @@ -1766,7 +1787,6 @@ static regm_result_t regex_machine_feed(regex_machine_t *regm, wchar_t ch) return REGM_INCOMPLETE; } - val search_regex(val haystack, val needle_regex, val start, val from_end) { @@ -1798,9 +1818,13 @@ again: last_res = regex_machine_feed(®m, c_chr(chr_str(haystack, i))); if (last_res == REGM_FAIL) { - regex_machine_reset(®m); - pos = plus(pos, one); - goto again; + last_res = regex_machine_feed(®m, 0); + if (last_res == REGM_FAIL) { + regex_machine_reset(®m); + pos = plus(pos, one); + goto again; + } + break; } } -- cgit v1.2.3