summaryrefslogtreecommitdiffstats
path: root/regex.c
diff options
context:
space:
mode:
authorKaz Kylheku <kaz@kylheku.com>2015-09-15 00:32:23 -0700
committerKaz Kylheku <kaz@kylheku.com>2015-09-15 00:32:23 -0700
commit943b5baa2d54fc1cc90758429c8a69c288f70868 (patch)
treea47c713a890830f942703ecfe35ca1e7901c1e3b /regex.c
parent55207f2ce58d2ecde6a3274147d3162c987cb510 (diff)
downloadtxr-943b5baa2d54fc1cc90758429c8a69c288f70868.tar.gz
txr-943b5baa2d54fc1cc90758429c8a69c288f70868.tar.bz2
txr-943b5baa2d54fc1cc90758429c8a69c288f70868.zip
Remove limit on NFA state size and allocate tightly.
* regex.c (struct regex): New member, nstates. (NFA_SET_SIZE): Preprocessor symbol removed. (struct nfa_machine): New member, nstates. (nfa_all_states): Function removed. (nfa_map_states): New static function. (nfa_count_one, nfa_count_states, nfa_collect_one): New static functions. (nfa_free): Takes nstates argument. Calculate array of all states using nfa_map_states over nfa_collect_one rather than nfa_all_states. The array is tightly allocated. Also the spanning tree traversal needs just one root, nfa.start. It's not clear why nfa_all_states used nfa.start and nfa.accept as roots. (nfa_closure): Takes nstates parameter; array bounds checking performed tightly against nstates rather than NFA_SET_SIZE. (nfa_move): Check against NFA_SET_SIZE removed. (nfa_run): Take nstates argument. Allocate arrays tightly. Pass nstates to nfa_closure. (regex_destroy): Pass regex->nstates to nfa_free. (regex_compile): Initialize regex->nstates. (regex_run): Pass regex->nstates to nfa_run. (regex_machine_reset): Pass nstates to nfa_closure. (regex_machine_init): Initialize n.nstates member of regex machine. Allocate arrays tightly. (regex_machine_feed): Pass nstates to nfa_closure.
Diffstat (limited to 'regex.c')
-rw-r--r--regex.c125
1 files changed, 63 insertions, 62 deletions
diff --git a/regex.c b/regex.c
index fd404b8b..24d66a42 100644
--- a/regex.c
+++ b/regex.c
@@ -64,6 +64,7 @@ typedef struct regex {
struct nfa nfa;
val dv;
} r;
+ int nstates;
val source;
} regex_t;
@@ -186,8 +187,6 @@ typedef union char_set {
#endif
} char_set_t;
-#define NFA_SET_SIZE 512
-
typedef enum {
nfa_accept, nfa_empty, nfa_wild, nfa_single, nfa_set
} nfa_kind_t;
@@ -233,6 +232,7 @@ struct nfa_machine {
nfa_state_t **move, **clos, **stack;
int nmove, nclos;
nfa_t nfa;
+ int nstates;
};
struct dv_machine {
@@ -1048,63 +1048,62 @@ static nfa_t nfa_compile_regex(val exp)
}
}
-static int nfa_all_states(nfa_state_t **inout, int num, unsigned visited)
+static void nfa_map_states(nfa_state_t *s,
+ mem_t *ctx, void (*fun)(nfa_state_t *,
+ mem_t *ctx),
+ unsigned visited)
{
- int i;
-
- for (i = 0; i < num; i++)
- inout[i]->a.visited = visited;
-
- for (i = 0; i < num; i++) {
- nfa_state_t *s = inout[i];
-
- if (num >= NFA_SET_SIZE)
- internal_error("NFA set size exceeded");
+ if (s != 0 && s->a.visited != visited) {
+ s->a.visited = visited;
+ fun(s, ctx);
switch (s->a.kind) {
case nfa_accept:
break;
case nfa_empty:
- {
- nfa_state_t *e0 = s->e.trans0;
- nfa_state_t *e1 = s->e.trans1;
-
- if (e0 && e0->a.visited != visited) {
- e0->a.visited = visited;
- inout[num++] = e0;
- }
- if (e1 && e1->a.visited != visited) {
- e1->a.visited = visited;
- inout[num++] = e1;
- }
- }
+ nfa_map_states(s->e.trans0, ctx, fun, visited);
+ nfa_map_states(s->e.trans1, ctx, fun, visited);
break;
case nfa_wild:
case nfa_single:
case nfa_set:
- if (s->o.trans->a.visited != visited) {
- s->o.trans->a.visited = visited;
- inout[num++] = s->o.trans;
- }
+ nfa_map_states(s->o.trans, ctx, fun, visited);
break;
}
}
+}
- if (num > NFA_SET_SIZE)
- internal_error("NFA set size exceeded");
+static void nfa_count_one(nfa_state_t *s, mem_t *ctx)
+{
+ (void) s;
+ int *pcount = coerce(int *, ctx);
+ (*pcount)++;
+}
+
+static int nfa_count_states(nfa_state_t *s)
+{
+ int count = 0;
+ unsigned visited = s->a.visited + 1;
+ nfa_map_states(s, coerce(mem_t *, &count), nfa_count_one, visited);
+ return count;
+}
- return num;
+static void nfa_collect_one(nfa_state_t *s, mem_t *ctx)
+{
+ nfa_state_t ***ppel = coerce(nfa_state_t ***, ctx);
+ *(*ppel)++ = s;
}
-static void nfa_free(nfa_t nfa)
+static void nfa_free(nfa_t nfa, int nstates)
{
- nfa_state_t **all = coerce(nfa_state_t **, chk_malloc(NFA_SET_SIZE * sizeof *all));
- int nstates, i;
+ nfa_state_t **all = coerce(nfa_state_t **, chk_malloc(nstates * sizeof *all));
+ nfa_state_t **pelem = all, *s = nfa.start;
+ unsigned visited = s->a.visited + 1;
+ int i;
- all[0] = nfa.start;
- all[1] = nfa.accept;
+ nfa_map_states(s, coerce(mem_t *, &pelem), nfa_collect_one, visited);
- nstates = nfa_all_states(all, 2, nfa.start->a.visited + 1);
+ assert (pelem - all == nstates);
for (i = 0; i < nstates; i++)
nfa_state_free(all[i]);
@@ -1127,7 +1126,8 @@ static void nfa_free(nfa_t nfa)
* (Transitions that don't do not consume and match an input character).
*/
static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
- nfa_state_t **out, unsigned visited, int *accept)
+ nfa_state_t **out, int nstates,
+ unsigned visited, int *accept)
{
int i, nout = 0;
int stackp = 0;
@@ -1135,8 +1135,8 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
/* First, add all states in the input set to the closure,
push them on the stack, and mark them as visited. */
for (i = 0; i < nin; i++) {
- if (stackp >= NFA_SET_SIZE)
- internal_error("NFA set size exceeded");
+ bug_unless (stackp < nstates);
+
in[i]->a.visited = visited;
stack[stackp++] = in[i];
out[nout++] = in[i];
@@ -1147,9 +1147,6 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
while (stackp) {
nfa_state_t *top = stack[--stackp];
- if (nout >= NFA_SET_SIZE)
- internal_error("NFA set size exceeded");
-
/* Only states of type nfa_empty are interesting.
Each such state at most two epsilon transitions. */
@@ -1175,8 +1172,7 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
}
}
- if (nout > NFA_SET_SIZE)
- internal_error("NFA set size exceeded");
+ bug_unless (nout <= nstates);
return nout;
}
@@ -1215,8 +1211,6 @@ static int nfa_move(nfa_state_t **in, int nin, nfa_state_t **out, wchar_t ch)
pointer to the next state in the same position,
among a common set of leading struct members in the union. */
- if (nmove >= NFA_SET_SIZE)
- internal_error("NFA set size exceeded");
out[nmove++] = s->o.trans;
}
@@ -1244,19 +1238,20 @@ static int nfa_move(nfa_state_t **in, int nin, nfa_state_t **out, wchar_t ch)
* determines the match length (defaulting to zero
* if no acceptance states were encountered).
*/
-static cnum nfa_run(nfa_t nfa, const wchar_t *str)
+static cnum nfa_run(nfa_t nfa, int nstates, const wchar_t *str)
{
const wchar_t *last_accept_pos = 0, *ptr = str;
unsigned visited = nfa.start->a.visited + 1;
- nfa_state_t **move = coerce(nfa_state_t **, chk_malloc(NFA_SET_SIZE * sizeof *move));
- nfa_state_t **clos = coerce(nfa_state_t **, chk_malloc(NFA_SET_SIZE * sizeof *clos));
- nfa_state_t **stack = coerce(nfa_state_t **, chk_malloc(NFA_SET_SIZE * sizeof *stack));
+ nfa_state_t **move = coerce(nfa_state_t **, chk_malloc(nstates * sizeof *move));
+ nfa_state_t **clos = coerce(nfa_state_t **, chk_malloc(nstates * sizeof *clos));
+ nfa_state_t **stack = coerce(nfa_state_t **, chk_malloc(nstates * sizeof *stack));
int nmove = 1, nclos;
int accept = 0;
move[0] = nfa.start;
- nclos = nfa_closure(stack, move, nmove, clos, visited++, &accept);
+ nclos = nfa_closure(stack, move, nmove, clos,
+ nstates, visited++, &accept);
if (accept)
last_accept_pos = ptr;
@@ -1267,7 +1262,8 @@ static cnum nfa_run(nfa_t nfa, const wchar_t *str)
accept = 0;
nmove = nfa_move(clos, nclos, move, ch);
- nclos = nfa_closure(stack, move, nmove, clos, visited++, &accept);
+ nclos = nfa_closure(stack, move, nmove, clos,
+ nstates, visited++, &accept);
if (accept)
last_accept_pos = ptr + 1;
@@ -1294,7 +1290,7 @@ static void regex_destroy(val obj)
{
regex_t *regex = coerce(regex_t *, obj->co.handle);
if (regex->kind == REGEX_NFA)
- nfa_free(regex->r.nfa);
+ nfa_free(regex->r.nfa, regex->nstates);
free(regex);
obj->co.handle = 0;
}
@@ -1670,6 +1666,7 @@ val regex_compile(val regex_sexp, val error_stream)
val ret;
val dv = dv_compile_regex(regex_sexp);
regex->kind = REGEX_DV;
+ regex->nstates = 0;
regex->source = nil;
ret = cobj(coerce(mem_t *, regex), regex_s, &regex_obj_ops);
regex->r.dv = dv;
@@ -1682,6 +1679,7 @@ val regex_compile(val regex_sexp, val error_stream)
regex->source = nil;
ret = cobj(coerce(mem_t *, regex), regex_s, &regex_obj_ops);
regex->r.nfa = nfa_compile_regex(regex_sexp);
+ regex->nstates = nfa_count_states(regex->r.nfa.start);
regex->source = regex_sexp;
return ret;
}
@@ -1852,7 +1850,7 @@ static cnum regex_run(val compiled_regex, const wchar_t *str)
return if3(regex->kind == REGEX_DV,
dv_run(regex->r.dv, str),
- nfa_run(regex->r.nfa, str));
+ nfa_run(regex->r.nfa, regex->nstates, str));
}
/*
@@ -1874,7 +1872,8 @@ static void regex_machine_reset(regex_machine_t *regm)
regm->n.move[0] = regm->n.nfa.start;
regm->n.nclos = nfa_closure(regm->n.stack, regm->n.move, regm->n.nmove,
- regm->n.clos, regm->n.visited++, &accept);
+ regm->n.clos, regm->n.nstates,
+ regm->n.visited++, &accept);
} else {
regm->d.deriv = regm->d.regex;
accept = (reg_nullable(regm->d.regex) != nil);
@@ -1894,12 +1893,13 @@ static void regex_machine_init(regex_machine_t *regm, val reg)
} else {
regm->n.is_nfa = 1;
regm->n.nfa = regex->r.nfa;
+ regm->n.nstates = regex->nstates;
regm->n.move = coerce(nfa_state_t **,
- chk_malloc(NFA_SET_SIZE * sizeof *regm->n.move));
+ chk_malloc(regex->nstates * sizeof *regm->n.move));
regm->n.clos = coerce(nfa_state_t **,
- chk_malloc(NFA_SET_SIZE * sizeof *regm->n.clos));
+ chk_malloc(regex->nstates * sizeof *regm->n.clos));
regm->n.stack = coerce(nfa_state_t **,
- chk_malloc(NFA_SET_SIZE * sizeof *regm->n.stack));
+ chk_malloc(regex->nstates * sizeof *regm->n.stack));
}
regex_machine_reset(regm);
@@ -1930,7 +1930,8 @@ static regm_result_t regex_machine_feed(regex_machine_t *regm, wchar_t ch)
regm->n.nmove = nfa_move(regm->n.clos, regm->n.nclos, regm->n.move, ch);
regm->n.nclos = nfa_closure(regm->n.stack, regm->n.move,
regm->n.nmove, regm->n.clos,
- regm->n.visited++, &accept);
+ regm->n.nstates, regm->n.visited++,
+ &accept);
regm->n.nfa.start->a.visited = regm->n.visited;