summaryrefslogtreecommitdiffstats
path: root/regex.c
diff options
context:
space:
mode:
Diffstat (limited to 'regex.c')
-rw-r--r--regex.c305
1 files changed, 273 insertions, 32 deletions
diff --git a/regex.c b/regex.c
index ec1e656c..5cbe11a0 100644
--- a/regex.c
+++ b/regex.c
@@ -71,17 +71,20 @@ typedef cset_L2_t *cset_L3_t[17];
struct any_char_set {
unsigned type : 3;
unsigned comp : 1;
+ unsigned refs : 12;
};
struct small_char_set {
unsigned type : 3;
unsigned comp : 1;
+ unsigned refs : 12;
cset_L0_t bitcell;
};
struct displaced_char_set {
unsigned type : 3;
unsigned comp : 1;
+ unsigned refs : 12;
cset_L0_t bitcell;
wchar_t base;
};
@@ -90,12 +93,14 @@ struct displaced_char_set {
struct large_char_set {
unsigned type : 3;
unsigned comp : 1;
+ unsigned refs : 12;
cset_L2_t dir;
};
struct xlarge_char_set {
unsigned type : 3;
unsigned comp : 1;
+ unsigned refs : 12;
cset_L3_t dir;
};
@@ -109,15 +114,38 @@ typedef union char_set {
#define NFA_SET_SIZE 512
+/*
+ * NFA node types, where each node represents a state, plus up to
+ * either two epsilon-transitions, or a single transition for a character, *
+ * character class or wild (match any character). The types starting
+ * with nfa_reject are involved in special representations needed
+ * for complement NFA's; these only occur if a regex contains the
+ * complement operator.
+ */
+
typedef enum {
- nfa_accept, nfa_empty, nfa_wild, nfa_single, nfa_set
+ nfa_accept, nfa_empty, nfa_wild,
+ nfa_single, nfa_set, nfa_super_accept,
+ nfa_reject, nfa_compl_empty, nfa_compl_wild,
+ nfa_compl_single, nfa_compl_set
} nfa_kind_t;
+/*
+ * Node used for types nfa_accept, nfa_reject, nfa_super_accept.
+ *
+ * NOTE: nfa_super_accept is globally instantiated; there is
+ * only one node of this type in the system. Transitions to it
+ * are implicit. Its visit flag is never used, so there is no
+ * reentrancy issue.
+ */
struct nfa_state_accept {
nfa_kind_t kind;
unsigned visited;
};
+/*
+ * Node used for nfa_empty, nfa_compl_empty.
+ */
struct nfa_state_empty {
nfa_kind_t kind;
unsigned visited;
@@ -125,13 +153,20 @@ struct nfa_state_empty {
nfa_state_t *trans1;
};
+/*
+ * Node used for nfa_single, nfa_wild,
+ * nfa_compl_single, nfa_compl_wild.
+ */
struct nfa_state_single {
nfa_kind_t kind;
unsigned visited;
nfa_state_t *trans;
- wchar_t ch;
+ wchar_t ch; /* set to 0 if nfa_state_wild or nfa_compl_wild. */
};
+/*
+ * Node used for nfa_set, nfa_compl_set.
+ */
struct nfa_state_set {
nfa_kind_t kind;
unsigned visited;
@@ -155,6 +190,23 @@ struct nfa_machine {
nfa_t nfa;
};
+static nfa_state_t nfa_super_accept_state = { { nfa_super_accept } };
+
+INLINE int nfa_is_accept_state(nfa_state_t *state)
+{
+ switch (state->a.kind) {
+ case nfa_accept:
+ case nfa_super_accept:
+ case nfa_compl_empty:
+ case nfa_compl_wild:
+ case nfa_compl_single:
+ case nfa_compl_set:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
static int L0_full(cset_L0_t *L0)
{
int i;
@@ -423,6 +475,7 @@ static char_set_t *char_set_create(chset_type_t type, wchar_t base)
char_set_t *cs = (char_set_t *) chk_malloc(sizeof *cs);
*cs = blank;
cs->any.type = type;
+ cs->any.refs = 1;
if (type == CHSET_DISPLACED)
cs->d.base = base;
@@ -432,22 +485,30 @@ static char_set_t *char_set_create(chset_type_t type, wchar_t base)
static void char_set_destroy(char_set_t *set)
{
- switch (set->any.type) {
- case CHSET_DISPLACED:
- case CHSET_SMALL:
- free(set);
- break;
- case CHSET_LARGE:
- L2_free(&set->l.dir);
- free(set);
- break;
- case CHSET_XLARGE:
- L3_free(&set->xl.dir);
- free(set);
- break;
+ if (--set->any.refs == 0) {
+ switch (set->any.type) {
+ case CHSET_DISPLACED:
+ case CHSET_SMALL:
+ free(set);
+ break;
+ case CHSET_LARGE:
+ L2_free(&set->l.dir);
+ free(set);
+ break;
+ case CHSET_XLARGE:
+ L3_free(&set->xl.dir);
+ free(set);
+ break;
+ }
}
}
+static char_set_t *char_set_clone(char_set_t *set)
+{
+ set->any.refs++;
+ return set;
+}
+
static void char_set_compl(char_set_t *set)
{
set->any.comp = 1;
@@ -602,7 +663,7 @@ static nfa_state_t *nfa_state_set(nfa_state_t *t, char_set_t *cs)
static void nfa_state_empty_convert(nfa_state_t *acc, nfa_state_t *t0,
nfa_state_t *t1)
{
- assert (acc->a.kind == nfa_accept);
+ assert (acc->a.kind == nfa_accept || acc->a.kind == nfa_reject);
acc->e.kind = nfa_empty;
acc->e.trans0 = t0;
acc->e.trans1 = t1;
@@ -624,7 +685,7 @@ static void nfa_state_empty_convert(nfa_state_t *acc, nfa_state_t *t0,
*/
static void nfa_state_merge(nfa_state_t *acc, nfa_state_t *st)
{
- assert (acc->a.kind == nfa_accept);
+ assert (acc->a.kind == nfa_accept || acc->a.kind == nfa_reject);
*acc = *st;
}
@@ -723,6 +784,136 @@ static nfa_t nfa_compile_set(val args, int comp)
}
}
+static nfa_state_t *nfa_compl_state(nfa_state_t *in)
+{
+ nfa_state_t *st = (nfa_state_t *) chk_malloc(sizeof *st);
+
+ st->a.visited = 0;
+
+ switch (in->a.kind) {
+ case nfa_accept:
+ st->a.kind = nfa_reject;
+ break;
+ case nfa_empty:
+ st->a.kind = nfa_compl_empty;
+ st->e.trans0 = 0;
+ st->e.trans1 = 0;
+ break;
+ case nfa_wild:
+ st->a.kind = nfa_compl_wild;
+ st->o.trans = 0;
+ st->o.ch = 0;
+ break;
+ case nfa_single:
+ st->a.kind = nfa_compl_single;
+ st->o.trans = 0;
+ st->o.ch = in->o.ch;
+ break;
+ case nfa_set:
+ st->a.kind = nfa_compl_set;
+ st->s.trans = 0;
+ st->s.set = char_set_clone(in->s.set);
+ break;
+ case nfa_super_accept:
+ internal_error("NFA super accept state explicitly reachable in graph");
+ case nfa_reject:
+ st->a.kind = nfa_accept;
+ break;
+ case nfa_compl_empty:
+ st->a.kind = nfa_empty;
+ st->e.trans0 = 0;
+ st->e.trans1 = 0;
+ break;
+ case nfa_compl_wild:
+ st->a.kind = nfa_wild;
+ st->o.trans = 0;
+ st->o.ch = 0;
+ break;
+ case nfa_compl_single:
+ st->a.kind = nfa_single;
+ st->o.trans = 0;
+ st->o.ch = in->o.ch;
+ break;
+ case nfa_compl_set:
+ st->a.kind = nfa_set;
+ st->s.trans = 0;
+ st->s.set = char_set_clone(in->s.set);
+ break;
+ }
+
+ return st;
+}
+
+static nfa_t nfa_compl(nfa_t in)
+{
+ nfa_state_t **stack = (nfa_state_t **) chk_malloc(NFA_SET_SIZE*sizeof *stack);
+ nfa_state_t **temp = (nfa_state_t **) chk_malloc(NFA_SET_SIZE*sizeof *temp);
+ unsigned visited = ++in.start->a.visited;
+ int i, num = 2;
+
+ stack[0] = in.start;
+ stack[1] = in.accept;
+
+ temp[0] = nfa_compl_state(stack[0]);
+ temp[1] = nfa_compl_state(stack[1]);
+
+ for (i = 0; i < num; i++) {
+ nfa_state_t *s = stack[i];
+ nfa_state_t *t = temp[i];
+
+ if (num >= NFA_SET_SIZE)
+ internal_error("NFA set size exceeded");
+
+ switch (s->a.kind) {
+ case nfa_accept:
+ case nfa_reject:
+ break;
+ case nfa_empty:
+ case nfa_compl_empty:
+ {
+ nfa_state_t *e0 = s->e.trans0;
+ nfa_state_t *e1 = s->e.trans1;
+
+ if (e0 && e0->a.visited != visited) {
+ e0->a.visited = visited;
+ temp[num] = t->e.trans0 = nfa_compl_state(e0);
+ stack[num++] = e0;
+ }
+ if (e1 && e1->a.visited != visited) {
+ e1->a.visited = visited;
+ temp[num] = t->e.trans1 = nfa_compl_state(e1);
+ stack[num++] = e1;
+ }
+ }
+ break;
+ case nfa_wild:
+ case nfa_single:
+ case nfa_set:
+ case nfa_compl_wild:
+ case nfa_compl_single:
+ case nfa_compl_set:
+ if (s->o.trans->a.visited != visited) {
+ s->o.trans->a.visited = visited;
+ temp[num] = t->o.trans = nfa_compl_state(s->o.trans);
+ stack[num++] = s->o.trans;
+ }
+ break;
+ case nfa_super_accept:
+ internal_error("NFA super accept state explicitly reachable in graph");
+ }
+ }
+
+ if (num > NFA_SET_SIZE)
+ internal_error("NFA set size exceeded");
+
+ {
+ nfa_t ret = { temp[0], temp[1] };
+ free(temp);
+ free(stack);
+ return ret;
+ }
+}
+
/*
* Input is the items from a regex form,
* not including the regex symbol.
@@ -785,6 +976,9 @@ nfa_t nfa_compile_regex(val items)
nfa_t nfa_args = nfa_compile_regex(args);
nfa_state_t *s = nfa_state_empty(nfa_args.start, nfa_args.accept);
nfa = nfa_make(s, nfa_args.accept);
+ } else if (sym == compl_s) {
+ nfa_t nfa_args = nfa_compile_regex(args);
+ nfa = nfa_compl(nfa_args);
} else if (sym == or_s) {
/* Simple: make a new start and acceptance state, which form
the ends of a spindle that goes through two branches. */
@@ -798,6 +992,19 @@ nfa_t nfa_compile_regex(val items)
nfa_state_empty_convert(nfa_first.accept, acc, 0);
nfa_state_empty_convert(nfa_second.accept, acc, 0);
nfa = nfa_make(s, acc);
+ } else if (sym == and_s) {
+ /* We compile regex and using the De Morgan's equality:
+ R1&R2 == ~(~R1|~R2).
+ We could do this by transforming the source code,
+ and recursing into compiler, but instead, the approach taken
+ is NFA manipulation closely based on the or_s case above. */
+ nfa_t nfa_first = nfa_compl(nfa_compile_regex(first(args)));
+ nfa_t nfa_second = nfa_compl(nfa_compile_regex(second(args)));
+ nfa_state_t *acc = nfa_state_accept();
+ nfa_state_t *s = nfa_state_empty(nfa_first.start, nfa_second.start);
+ nfa_state_empty_convert(nfa_first.accept, acc, 0);
+ nfa_state_empty_convert(nfa_second.accept, acc, 0);
+ nfa = nfa_compl(nfa_make(s, acc));
} else {
assert (0 && "internal error: bad operator in regex");
}
@@ -817,45 +1024,52 @@ nfa_t nfa_compile_regex(val items)
}
}
-static int nfa_all_states(nfa_state_t **inout, int num, unsigned visited)
+static int nfa_all_states(nfa_state_t **stack, int num, unsigned visited)
{
int i;
for (i = 0; i < num; i++)
- inout[i]->a.visited = visited;
+ stack[i]->a.visited = visited;
for (i = 0; i < num; i++) {
- nfa_state_t *s = inout[i];
+ nfa_state_t *s = stack[i];
if (num >= NFA_SET_SIZE)
internal_error("NFA set size exceeded");
switch (s->a.kind) {
case nfa_accept:
+ case nfa_reject:
break;
case nfa_empty:
+ case nfa_compl_empty:
{
nfa_state_t *e0 = s->e.trans0;
nfa_state_t *e1 = s->e.trans1;
if (e0 && e0->a.visited != visited) {
e0->a.visited = visited;
- inout[num++] = e0;
+ stack[num++] = e0;
}
if (e1 && e1->a.visited != visited) {
e1->a.visited = visited;
- inout[num++] = e1;
+ stack[num++] = e1;
}
}
break;
case nfa_wild:
case nfa_single:
case nfa_set:
+ case nfa_compl_wild:
+ case nfa_compl_single:
+ case nfa_compl_set:
if (s->o.trans->a.visited != visited) {
s->o.trans->a.visited = visited;
- inout[num++] = s->o.trans;
+ stack[num++] = s->o.trans;
}
break;
+ case nfa_super_accept:
+ internal_error("NFA super accept state explicitly reachable in graph");
}
}
@@ -873,7 +1087,7 @@ void nfa_free(nfa_t nfa)
all[0] = nfa.start;
all[1] = nfa.accept;
- nstates = nfa_all_states(all, 2, nfa.start->a.visited);
+ nstates = nfa_all_states(all, 2, nfa.start->a.visited + 1);
for (i = 0; i < nstates; i++)
nfa_state_free(all[i]);
@@ -906,10 +1120,15 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
for (i = 0; i < nin; i++) {
if (stackp >= NFA_SET_SIZE)
internal_error("NFA set size exceeded");
- in[i]->a.visited = visited;
+ /* We do not mark the super accept state visited,
+ and we will never visit it by following e-transitions,
+ because all transitions into the super-accept state
+ are character-consuming. */
+ if (in[i] != &nfa_super_accept_state)
+ in[i]->a.visited = visited;
stack[stackp++] = in[i];
out[nout++] = in[i];
- if (in[i]->a.kind == nfa_accept)
+ if (nfa_is_accept_state(in[i]))
*accept = 1;
}
@@ -919,10 +1138,13 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
if (nout >= NFA_SET_SIZE)
internal_error("NFA set size exceeded");
- /* Only states of type nfa_empty are interesting.
- Each such state at most two epsilon transitions. */
-
- if (top->a.kind == nfa_empty) {
+ /* Only states of type nfa_empty or its complement are interesting.
+ Each such state at most two epsilon transitions.
+ Note above comment that epsilon transitions never lead
+ to the special super accept state; i.e. e0 and e1
+ are never the super accept state and so we can write to their
+ their visited field. */
+ if (top->a.kind == nfa_empty || top->a.kind == nfa_compl_empty) {
nfa_state_t *e0 = top->e.trans0;
nfa_state_t *e1 = top->e.trans1;
@@ -930,7 +1152,7 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
e0->a.visited = visited;
stack[stackp++] = e0;
out[nout++] = e0;
- if (e0->a.kind == nfa_accept)
+ if (nfa_is_accept_state(e0))
*accept = 1;
}
@@ -938,7 +1160,7 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
e1->a.visited = visited;
stack[stackp++] = e1;
out[nout++] = e1;
- if (e1->a.kind == nfa_accept)
+ if (nfa_is_accept_state(e1))
*accept = 1;
}
}
@@ -958,22 +1180,41 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
static int nfa_move(nfa_state_t **in, int nin, nfa_state_t **out, wchar_t ch)
{
int i, nmove;
+ int super_added = 0;
for (nmove = 0, i = 0; i < nin; i++) {
nfa_state_t *s = in[i];
switch (s->a.kind) {
case nfa_wild:
+ case nfa_compl_wild:
/* Unconditional match; don't have to look at ch. */
break;
case nfa_single:
if (s->o.ch == ch) /* Character match. */
break;
continue; /* no match */
+ case nfa_compl_single:
+ if (s->o.ch == ch) /* Character match. */
+ break;
+ goto add_super; /* no match means we can move to super accept state */
case nfa_set:
if (char_set_contains(s->s.set, ch)) /* Set match. */
break;
continue; /* no match */
+ case nfa_compl_set:
+ if (char_set_contains(s->s.set, ch)) /* Set match. */
+ break;
+ goto add_super;
+ case nfa_super_accept:
+ /* super always transitions to itself, for any character. */
+ add_super:
+ if (!super_added) {
+ super_added = 1;
+ if (nmove >= NFA_SET_SIZE)
+ internal_error("NFA set size exceeded");
+ out[nmove++] = &nfa_super_accept_state;
+ }
default:
/* Epsilon-transition and acceptance states have no character moves. */
continue;