diff options
-rw-r--r-- | ChangeLog | 50 | ||||
-rw-r--r-- | lib.c | 4 | ||||
-rw-r--r-- | lib.h | 2 | ||||
-rw-r--r-- | parser.l | 8 | ||||
-rw-r--r-- | parser.y | 8 | ||||
-rw-r--r-- | regex.c | 305 | ||||
-rw-r--r-- | txr.1 | 46 |
7 files changed, 372 insertions, 51 deletions
@@ -1,3 +1,53 @@ +2010-01-05 Kaz Kylheku <kkylheku@gmail.com> + + Implemented the regular expression ~ and & operators. + This turns out to be easy to do in NFA land. + The complement of an NFA has exactly the same number + and configuration of states and transitions, except + that the states have an inverted meaning; and furthermore, + failed character transitions are routed to an extra + state (which in this impelmentation is permanently + allocated and shared by all regexes). The regex & + is implemented trivially using DeMorgan's. + + Also, bugfix: regular expressions like A|B|C are allowed + now by the syntax, rather than constituting syntax error. + Previously, this would have been entered as (A|B)|C. + + * lib.c (comp_s, and_s): New symbol globals. + (obj_init): New symbols interned. + + * lib.h (comp_s, and_s): Declared. + + * parser.l (grammar): Provide new ~ and & tokens in REGEX state. + + * parser.y (regexpr): Constituents of '|' are regexprs, + rather than regbranches (see bugfix note above). + The '&' operator is added. + (regterm): The '~' operator is added. + + * regex.c (struct any_char_set, struct small_char_set, struct + displaced_char_set): refs field added. + (nfa_kind_t): New enum members nfa_super_accept, + nfa_reject, nfa_compl_empty, nfa_compl_wild, + nfa_compl_single, nfa_compl_set. + (nfa_super_accept_state): New static structure. + (nfa_is_accept_state): New inline function. + (char_set_create): Initialize reference count to 1. + (char_set_destroy): Decrement refcount, free if zero. + (char_set_clone): New function. + (nfa_state_empty_convert, nfa_state_merge): Handle nfa_reject state, + the complement of nfa_accept. + (nfa_compl_state, nfa_compl): New functions. + (nfa_compile_regex): Handle new operators. + (nfa_all_states, nfa_closure): Handle new state types. + (nfa_move): Handle new types according to special rules: + the new complemented states that have character transitions have a next + move to the super-accept state if they do not match the input + character. + + * txr.1: Documented new regex operators. + 2009-12-09 Kaz Kylheku <kkylheku@gmail.com> * parser.l (YYINPUT): Fix signed/unsigned comparison. @@ -52,7 +52,7 @@ val system_package, keyword_package, user_package; val null, t, cons_s, str_s, chr_s, num_s, sym_s, pkg_s, fun_s, vec_s; val stream_s, hash_s, lcons_s, lstr_s, cobj_s; val var_s, regex_s, set_s, cset_s, wild_s, oneplus_s; -val zeroplus_s, optional_s, compound_s, or_s, quasi_s; +val zeroplus_s, optional_s, compl_s, compound_s, or_s, and_s, quasi_s; val skip_s, trailer_s, block_s, next_s, freeform_s, fail_s, accept_s; val all_s, some_s, none_s, maybe_s, cases_s, collect_s, until_s, coll_s; val define_s, output_s, single_s, first_s, last_s, empty_s; @@ -1891,8 +1891,10 @@ static void obj_init(void) oneplus_s = intern(lit("1+"), user_package); zeroplus_s = intern(lit("0+"), user_package); optional_s = intern(lit("?"), user_package); + compl_s = intern(lit("~"), user_package); compound_s = intern(lit("compound"), user_package); or_s = intern(lit("or"), user_package); + and_s = intern(lit("and"), user_package); quasi_s = intern(lit("quasi"), system_package); skip_s = intern(lit("skip"), user_package); trailer_s = intern(lit("trailer"), user_package); @@ -208,7 +208,7 @@ extern val keyword_package, system_package, user_package; extern val null, t, cons_s, str_s, chr_s, num_s, sym_s, pkg_s, fun_s, vec_s; extern val stream_s, hash_s, lcons_s, lstr_s, cobj_s; extern val var_s, regex_s, set_s, cset_s, wild_s, oneplus_s; -extern val zeroplus_s, optional_s, compound_s, or_s, quasi_s; +extern val zeroplus_s, optional_s, compl_s, compound_s, or_s, and_s, quasi_s; extern val skip_s, trailer_s, block_s, next_s, freeform_s, fail_s, accept_s; extern val all_s, some_s, none_s, maybe_s, cases_s, collect_s, until_s, coll_s; extern val define_s, output_s, single_s, first_s, last_s, empty_s; @@ -444,10 +444,10 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} yyerror("newline in regex"); } -<REGEX>[.*?+^] { - yylval.chr = yytext[0]; - return yytext[0]; - } +<REGEX>[.*?+^~&] { + yylval.chr = yytext[0]; + return yytext[0]; + } <REGEX>[\[\]\-] { @@ -79,7 +79,8 @@ static val parsed_spec; %nonassoc '{' '}' '[' ']' '(' ')' %right IDENT TEXT NUMBER %left '|' '/' -%right '*' '?' '+' +%left '&' +%right '~' '*' '?' '+' %right '^' '.' '\\' REGCHAR LITCHAR %% @@ -454,7 +455,9 @@ regex : '/' regexpr '/' { $$ = $2; } ; regexpr : regbranch { $$ = $1; } - | regbranch '|' regbranch { $$ = list(list(or_s, $1, + | regexpr '|' regexpr { $$ = list(list(or_s, $1, + $3, nao), nao); } + | regexpr '&' regexpr { $$ = list(list(and_s, $1, $3, nao), nao); } ; @@ -471,6 +474,7 @@ regterm : '[' regclass ']' { $$ = cons(set_s, $2); } | regterm '*' { $$ = list(zeroplus_s, $1, nao); } | regterm '+' { $$ = list(oneplus_s, $1, nao); } | regterm '?' { $$ = list(optional_s, $1, nao); } + | '~' regterm { $$ = list(compl_s, $2, nao); } | REGCHAR { $$ = chr($1); } | '(' regexpr ')' { $$ = cons(compound_s, $2); } | '(' error { $$ = nil; @@ -71,17 +71,20 @@ typedef cset_L2_t *cset_L3_t[17]; struct any_char_set { unsigned type : 3; unsigned comp : 1; + unsigned refs : 12; }; struct small_char_set { unsigned type : 3; unsigned comp : 1; + unsigned refs : 12; cset_L0_t bitcell; }; struct displaced_char_set { unsigned type : 3; unsigned comp : 1; + unsigned refs : 12; cset_L0_t bitcell; wchar_t base; }; @@ -90,12 +93,14 @@ struct displaced_char_set { struct large_char_set { unsigned type : 3; unsigned comp : 1; + unsigned refs : 12; cset_L2_t dir; }; struct xlarge_char_set { unsigned type : 3; unsigned comp : 1; + unsigned refs : 12; cset_L3_t dir; }; @@ -109,15 +114,38 @@ typedef union char_set { #define NFA_SET_SIZE 512 +/* + * NFA node types, where each node represents a state, plus up to + * either two epsilon-transitions, or a single transition for a character, * + * character class or wild (match any character). The types starting + * with nfa_reject are involved in special representations needed + * for complement NFA's; these only occur if a regex contains the + * complement operator. + */ + typedef enum { - nfa_accept, nfa_empty, nfa_wild, nfa_single, nfa_set + nfa_accept, nfa_empty, nfa_wild, + nfa_single, nfa_set, nfa_super_accept, + nfa_reject, nfa_compl_empty, nfa_compl_wild, + nfa_compl_single, nfa_compl_set } nfa_kind_t; +/* + * Node used for types nfa_accept, nfa_reject, nfa_super_accept. + * + * NOTE: nfa_super_accept is globally instantiated; there is + * only one node of this type in the system. Transitions to it + * are implicit. Its visit flag is never used, so there is no + * reentrancy issue. + */ struct nfa_state_accept { nfa_kind_t kind; unsigned visited; }; +/* + * Node used for nfa_empty, nfa_compl_empty. + */ struct nfa_state_empty { nfa_kind_t kind; unsigned visited; @@ -125,13 +153,20 @@ struct nfa_state_empty { nfa_state_t *trans1; }; +/* + * Node used for nfa_single, nfa_wild, + * nfa_compl_single, nfa_compl_wild. + */ struct nfa_state_single { nfa_kind_t kind; unsigned visited; nfa_state_t *trans; - wchar_t ch; + wchar_t ch; /* set to 0 if nfa_state_wild or nfa_compl_wild. */ }; +/* + * Node used for nfa_set, nfa_compl_set. + */ struct nfa_state_set { nfa_kind_t kind; unsigned visited; @@ -155,6 +190,23 @@ struct nfa_machine { nfa_t nfa; }; +static nfa_state_t nfa_super_accept_state = { { nfa_super_accept } }; + +INLINE int nfa_is_accept_state(nfa_state_t *state) +{ + switch (state->a.kind) { + case nfa_accept: + case nfa_super_accept: + case nfa_compl_empty: + case nfa_compl_wild: + case nfa_compl_single: + case nfa_compl_set: + return 1; + default: + return 0; + } +} + static int L0_full(cset_L0_t *L0) { int i; @@ -423,6 +475,7 @@ static char_set_t *char_set_create(chset_type_t type, wchar_t base) char_set_t *cs = (char_set_t *) chk_malloc(sizeof *cs); *cs = blank; cs->any.type = type; + cs->any.refs = 1; if (type == CHSET_DISPLACED) cs->d.base = base; @@ -432,22 +485,30 @@ static char_set_t *char_set_create(chset_type_t type, wchar_t base) static void char_set_destroy(char_set_t *set) { - switch (set->any.type) { - case CHSET_DISPLACED: - case CHSET_SMALL: - free(set); - break; - case CHSET_LARGE: - L2_free(&set->l.dir); - free(set); - break; - case CHSET_XLARGE: - L3_free(&set->xl.dir); - free(set); - break; + if (--set->any.refs == 0) { + switch (set->any.type) { + case CHSET_DISPLACED: + case CHSET_SMALL: + free(set); + break; + case CHSET_LARGE: + L2_free(&set->l.dir); + free(set); + break; + case CHSET_XLARGE: + L3_free(&set->xl.dir); + free(set); + break; + } } } +static char_set_t *char_set_clone(char_set_t *set) +{ + set->any.refs++; + return set; +} + static void char_set_compl(char_set_t *set) { set->any.comp = 1; @@ -602,7 +663,7 @@ static nfa_state_t *nfa_state_set(nfa_state_t *t, char_set_t *cs) static void nfa_state_empty_convert(nfa_state_t *acc, nfa_state_t *t0, nfa_state_t *t1) { - assert (acc->a.kind == nfa_accept); + assert (acc->a.kind == nfa_accept || acc->a.kind == nfa_reject); acc->e.kind = nfa_empty; acc->e.trans0 = t0; acc->e.trans1 = t1; @@ -624,7 +685,7 @@ static void nfa_state_empty_convert(nfa_state_t *acc, nfa_state_t *t0, */ static void nfa_state_merge(nfa_state_t *acc, nfa_state_t *st) { - assert (acc->a.kind == nfa_accept); + assert (acc->a.kind == nfa_accept || acc->a.kind == nfa_reject); *acc = *st; } @@ -723,6 +784,136 @@ static nfa_t nfa_compile_set(val args, int comp) } } +static nfa_state_t *nfa_compl_state(nfa_state_t *in) +{ + nfa_state_t *st = (nfa_state_t *) chk_malloc(sizeof *st); + + st->a.visited = 0; + + switch (in->a.kind) { + case nfa_accept: + st->a.kind = nfa_reject; + break; + case nfa_empty: + st->a.kind = nfa_compl_empty; + st->e.trans0 = 0; + st->e.trans1 = 0; + break; + case nfa_wild: + st->a.kind = nfa_compl_wild; + st->o.trans = 0; + st->o.ch = 0; + break; + case nfa_single: + st->a.kind = nfa_compl_single; + st->o.trans = 0; + st->o.ch = in->o.ch; + break; + case nfa_set: + st->a.kind = nfa_compl_set; + st->s.trans = 0; + st->s.set = char_set_clone(in->s.set); + break; + case nfa_super_accept: + internal_error("NFA super accept state explicitly reachable in graph"); + case nfa_reject: + st->a.kind = nfa_accept; + break; + case nfa_compl_empty: + st->a.kind = nfa_empty; + st->e.trans0 = 0; + st->e.trans1 = 0; + break; + case nfa_compl_wild: + st->a.kind = nfa_wild; + st->o.trans = 0; + st->o.ch = 0; + break; + case nfa_compl_single: + st->a.kind = nfa_single; + st->o.trans = 0; + st->o.ch = in->o.ch; + break; + case nfa_compl_set: + st->a.kind = nfa_set; + st->s.trans = 0; + st->s.set = char_set_clone(in->s.set); + break; + } + + return st; +} + +static nfa_t nfa_compl(nfa_t in) +{ + nfa_state_t **stack = (nfa_state_t **) chk_malloc(NFA_SET_SIZE*sizeof *stack); + nfa_state_t **temp = (nfa_state_t **) chk_malloc(NFA_SET_SIZE*sizeof *temp); + unsigned visited = ++in.start->a.visited; + int i, num = 2; + + stack[0] = in.start; + stack[1] = in.accept; + + temp[0] = nfa_compl_state(stack[0]); + temp[1] = nfa_compl_state(stack[1]); + + for (i = 0; i < num; i++) { + nfa_state_t *s = stack[i]; + nfa_state_t *t = temp[i]; + + if (num >= NFA_SET_SIZE) + internal_error("NFA set size exceeded"); + + switch (s->a.kind) { + case nfa_accept: + case nfa_reject: + break; + case nfa_empty: + case nfa_compl_empty: + { + nfa_state_t *e0 = s->e.trans0; + nfa_state_t *e1 = s->e.trans1; + + if (e0 && e0->a.visited != visited) { + e0->a.visited = visited; + temp[num] = t->e.trans0 = nfa_compl_state(e0); + stack[num++] = e0; + } + if (e1 && e1->a.visited != visited) { + e1->a.visited = visited; + temp[num] = t->e.trans1 = nfa_compl_state(e1); + stack[num++] = e1; + } + } + break; + case nfa_wild: + case nfa_single: + case nfa_set: + case nfa_compl_wild: + case nfa_compl_single: + case nfa_compl_set: + if (s->o.trans->a.visited != visited) { + s->o.trans->a.visited = visited; + temp[num] = t->o.trans = nfa_compl_state(s->o.trans); + stack[num++] = s->o.trans; + } + break; + case nfa_super_accept: + internal_error("NFA super accept state explicitly reachable in graph"); + } + } + + if (num > NFA_SET_SIZE) + internal_error("NFA set size exceeded"); + + { + nfa_t ret = { temp[0], temp[1] }; + free(temp); + free(stack); + return ret; + } +} + /* * Input is the items from a regex form, * not including the regex symbol. @@ -785,6 +976,9 @@ nfa_t nfa_compile_regex(val items) nfa_t nfa_args = nfa_compile_regex(args); nfa_state_t *s = nfa_state_empty(nfa_args.start, nfa_args.accept); nfa = nfa_make(s, nfa_args.accept); + } else if (sym == compl_s) { + nfa_t nfa_args = nfa_compile_regex(args); + nfa = nfa_compl(nfa_args); } else if (sym == or_s) { /* Simple: make a new start and acceptance state, which form the ends of a spindle that goes through two branches. */ @@ -798,6 +992,19 @@ nfa_t nfa_compile_regex(val items) nfa_state_empty_convert(nfa_first.accept, acc, 0); nfa_state_empty_convert(nfa_second.accept, acc, 0); nfa = nfa_make(s, acc); + } else if (sym == and_s) { + /* We compile regex and using the De Morgan's equality: + R1&R2 == ~(~R1|~R2). + We could do this by transforming the source code, + and recursing into compiler, but instead, the approach taken + is NFA manipulation closely based on the or_s case above. */ + nfa_t nfa_first = nfa_compl(nfa_compile_regex(first(args))); + nfa_t nfa_second = nfa_compl(nfa_compile_regex(second(args))); + nfa_state_t *acc = nfa_state_accept(); + nfa_state_t *s = nfa_state_empty(nfa_first.start, nfa_second.start); + nfa_state_empty_convert(nfa_first.accept, acc, 0); + nfa_state_empty_convert(nfa_second.accept, acc, 0); + nfa = nfa_compl(nfa_make(s, acc)); } else { assert (0 && "internal error: bad operator in regex"); } @@ -817,45 +1024,52 @@ nfa_t nfa_compile_regex(val items) } } -static int nfa_all_states(nfa_state_t **inout, int num, unsigned visited) +static int nfa_all_states(nfa_state_t **stack, int num, unsigned visited) { int i; for (i = 0; i < num; i++) - inout[i]->a.visited = visited; + stack[i]->a.visited = visited; for (i = 0; i < num; i++) { - nfa_state_t *s = inout[i]; + nfa_state_t *s = stack[i]; if (num >= NFA_SET_SIZE) internal_error("NFA set size exceeded"); switch (s->a.kind) { case nfa_accept: + case nfa_reject: break; case nfa_empty: + case nfa_compl_empty: { nfa_state_t *e0 = s->e.trans0; nfa_state_t *e1 = s->e.trans1; if (e0 && e0->a.visited != visited) { e0->a.visited = visited; - inout[num++] = e0; + stack[num++] = e0; } if (e1 && e1->a.visited != visited) { e1->a.visited = visited; - inout[num++] = e1; + stack[num++] = e1; } } break; case nfa_wild: case nfa_single: case nfa_set: + case nfa_compl_wild: + case nfa_compl_single: + case nfa_compl_set: if (s->o.trans->a.visited != visited) { s->o.trans->a.visited = visited; - inout[num++] = s->o.trans; + stack[num++] = s->o.trans; } break; + case nfa_super_accept: + internal_error("NFA super accept state explicitly reachable in graph"); } } @@ -873,7 +1087,7 @@ void nfa_free(nfa_t nfa) all[0] = nfa.start; all[1] = nfa.accept; - nstates = nfa_all_states(all, 2, nfa.start->a.visited); + nstates = nfa_all_states(all, 2, nfa.start->a.visited + 1); for (i = 0; i < nstates; i++) nfa_state_free(all[i]); @@ -906,10 +1120,15 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin, for (i = 0; i < nin; i++) { if (stackp >= NFA_SET_SIZE) internal_error("NFA set size exceeded"); - in[i]->a.visited = visited; + /* We do not mark the super accept state visited, + and we will never visit it by following e-transitions, + because all transitions into the super-accept state + are character-consuming. */ + if (in[i] != &nfa_super_accept_state) + in[i]->a.visited = visited; stack[stackp++] = in[i]; out[nout++] = in[i]; - if (in[i]->a.kind == nfa_accept) + if (nfa_is_accept_state(in[i])) *accept = 1; } @@ -919,10 +1138,13 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin, if (nout >= NFA_SET_SIZE) internal_error("NFA set size exceeded"); - /* Only states of type nfa_empty are interesting. - Each such state at most two epsilon transitions. */ - - if (top->a.kind == nfa_empty) { + /* Only states of type nfa_empty or its complement are interesting. + Each such state at most two epsilon transitions. + Note above comment that epsilon transitions never lead + to the special super accept state; i.e. e0 and e1 + are never the super accept state and so we can write to their + their visited field. */ + if (top->a.kind == nfa_empty || top->a.kind == nfa_compl_empty) { nfa_state_t *e0 = top->e.trans0; nfa_state_t *e1 = top->e.trans1; @@ -930,7 +1152,7 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin, e0->a.visited = visited; stack[stackp++] = e0; out[nout++] = e0; - if (e0->a.kind == nfa_accept) + if (nfa_is_accept_state(e0)) *accept = 1; } @@ -938,7 +1160,7 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin, e1->a.visited = visited; stack[stackp++] = e1; out[nout++] = e1; - if (e1->a.kind == nfa_accept) + if (nfa_is_accept_state(e1)) *accept = 1; } } @@ -958,22 +1180,41 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin, static int nfa_move(nfa_state_t **in, int nin, nfa_state_t **out, wchar_t ch) { int i, nmove; + int super_added = 0; for (nmove = 0, i = 0; i < nin; i++) { nfa_state_t *s = in[i]; switch (s->a.kind) { case nfa_wild: + case nfa_compl_wild: /* Unconditional match; don't have to look at ch. */ break; case nfa_single: if (s->o.ch == ch) /* Character match. */ break; continue; /* no match */ + case nfa_compl_single: + if (s->o.ch == ch) /* Character match. */ + break; + goto add_super; /* no match means we can move to super accept state */ case nfa_set: if (char_set_contains(s->s.set, ch)) /* Set match. */ break; continue; /* no match */ + case nfa_compl_set: + if (char_set_contains(s->s.set, ch)) /* Set match. */ + break; + goto add_super; + case nfa_super_accept: + /* super always transitions to itself, for any character. */ + add_super: + if (!super_added) { + super_added = 1; + if (nmove >= NFA_SET_SIZE) + internal_error("NFA set size exceeded"); + out[nmove++] = &nfa_super_accept_state; + } default: /* Epsilon-transition and acceptance states have no character moves. */ continue; @@ -643,17 +643,21 @@ If RE is a regular expression, then so is (RE). The contents of parentheses denote one regular expression unit, so that for instance in (RE)*, the * operator applies to the entire parenthesized group. .IP (RE)? -optionally matches the preceding regular expression (RE). +optionally match the preceding regular expression (RE). .IP (RE)+ -matches the preceding expression one or more times. +match the preceding expression one or more times. .IP (RE)* -matches the preceding expression zero or more times. +match the preceding expression zero or more times. +.IP ~(RE) +match the complement of the preceding expression; i.e. match +those texts that (RE) does not match. .IP (RE1)(RE2) Two consecutive regular expressions denote catenation: the left expression must match, and then the right. - .IP (RE1)|(RE2) -matches either the expression RE1 or RE2. +match either the expression RE1 or RE2. +.IP (RE1)&(RE2) +match both the expression RE1 and RE2 simultaneously. .PP Any of the special characters, including the delimiting /, can be escaped with @@ -674,14 +678,34 @@ The postfix operators ?, + and * have the second highest precedence, and associate left to right, so that in A+?*, the * applies to A+?, and the ? applies to A+. -Catenation is on the next lower precedence rung, so that AB? means "match A, -and then optionally B" not "match A and B, as one optional unit". The latter -must be written (AB)? using parentheses to override precedence. +The unary complement operator has the next lower precedence, so +that ~A* means the ~(A*): "match the all text that is not matched by zero +or more repetitions of A", not "match zero or more times the text +not matched by A". + +Catenation is on the next lower precedence rung, so that AB? means A(B?), or +"match A, and then optionally B", not "match A and B, as one optional +unit". The latter must be written (AB)? using parentheses to override +precedence. + +The precedence of conjunction (&) is lower than catenation, but not as low as +that of disjunction, thus AB The disjunction operator | has the lowest precedence, lower than catenation. -Thus abc|def means "match abc, or match def". The meaning "match ab, -then c or d, then ef" must be expressed as ab(c|d)ef, or using -a character class: ab[cd]ef. +Thus ABC|DEF means "match ABC, or match DEF". The meaning "match AB, +then C or D, then EF" must be expressed as AB(C|D)EF, or using +a character class: AB[CD]EF. + +The regular expression + + ABC*&~DEF+|Z + +Is parsed as if it were grouped with parentheses like this: + + ((AB(C*))&(~(DE(F+))))|Z + +The main constituent is the | operator, whose left side is an & expression, +et cetera. In .b txr, |