summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog53
-rw-r--r--dep.mk2
-rw-r--r--lib.c5
-rw-r--r--lib.h3
-rw-r--r--match.c4
-rw-r--r--parser.y14
-rw-r--r--regex.c805
-rw-r--r--regex.h22
-rw-r--r--txr.h1
9 files changed, 627 insertions, 282 deletions
diff --git a/ChangeLog b/ChangeLog
index 25750514..28c8285c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,56 @@
+2010-01-13 Kaz Kylheku <kkylheku@gmail.com>
+
+ Impelement derivative-based regular expressions.
+
+ * lib.c (chset_s, compiled_regex_s): New symbol globals.
+ (obj_init): New symbols interned.
+
+ * lib.h (chest_s, compiled_regex_s): Declared.
+
+ * match.c (match_line, match_files): Use regexp predicate
+ function instead of typeof for detecting regex.
+
+ * parser.y (regexpr, regbranch, regterm): Minor syntactic refactoring.
+
+ * regex.h (union nfa_state, nfa_state_t, struct nfa, enum nfam_result,
+ nfa_machine_t, nfa_compile_regex, nfa_free, nfa_run,
+ nfa_machine_reset, nfa_machine_init, nfa_machine_cleanup,
+ nfa_machine_feed, nfa_machine_match_span, regex_nfa): Declarations
+ for internal material removed from header, some moved into regex.c.
+
+ * regex.c: Includes txr.h now to get declaration of new option global.
+ (union nfa_state, nfa_state_t, struct nfa,
+ nfa_compile_regex, nfa_free, nfa_run, regex_nfa): Declarations
+ moved from regex.h.
+ (enum nfam_result, nfa_machine_reset, nfa_machine_init,
+ nfa_machine_cleanup, nfa_machine_feed, nfa_machine_match_span):
+ Renamed from nfam_* and nfa_machine_* to regm_* and regex_machine_*.
+ Functions made static. Regex machine is now polymorphic: the
+ machine is instantiated based on whether the regex is NFA or
+ derivative type, and the behavior of the functions is type dependent.
+ (nfa_machine_t): Renamed to regex_machine_t, now typedef name for union
+ regex_machine.
+ (struct dv_machine, union regex_machine): New types.
+ (struct nfa_machine): New member is_nfa. A few members rearranged,
+ so that union common members are at the start of the structure.
+ (opt_derivative_regex): New global added.
+ (char_set_compile, char_set_cobj_destroy): New function.
+ (char_set_cobj_ops): New static structure.
+ (nfa_compile_set): Refactored to use char_set_compile; made static.
+ (nfa_compile_list): New function.
+ (nfa_compile_regex): Refactored to follow new syntax from parser.y;
+ made static.
+ (nfa_free, nfa_run, regex_nfa): Made static.
+ (dv_compile_regex, reg_nullable_list, reg_nullable,
+ reg_derivative_list, reg_derivative, dv_run): New functions.
+ (regex_compile): Can compile either kind of regex now.
+ (search_regex, match_regex): Decoupled from dependency on NFA
+ implementation.
+
+ * txr.h (opt_derivative_regex): Declared.
+
+ * dep.mk: Regenerated.
+
2010-01-06 Kaz Kylheku <kkylheku@gmail.com>
Remove incorrect implementation of extended
diff --git a/dep.mk b/dep.mk
index 355dc989..55764b17 100644
--- a/dep.mk
+++ b/dep.mk
@@ -3,7 +3,7 @@ lex.yy.o: config.h $(top_srcdir)/lib.h y.tab.h $(top_srcdir)/gc.h $(top_srcdir)/
y.tab.o: config.h $(top_srcdir)/lib.h $(top_srcdir)/regex.h $(top_srcdir)/utf8.h $(top_srcdir)/parser.h
match.o: config.h $(top_srcdir)/lib.h $(top_srcdir)/gc.h $(top_srcdir)/unwind.h $(top_srcdir)/regex.h $(top_srcdir)/stream.h $(top_srcdir)/parser.h $(top_srcdir)/txr.h $(top_srcdir)/utf8.h $(top_srcdir)/match.h
lib.o: config.h $(top_srcdir)/lib.h $(top_srcdir)/gc.h $(top_srcdir)/hash.h $(top_srcdir)/unwind.h $(top_srcdir)/stream.h $(top_srcdir)/utf8.h
-regex.o: config.h $(top_srcdir)/lib.h $(top_srcdir)/unwind.h $(top_srcdir)/regex.h
+regex.o: config.h $(top_srcdir)/lib.h $(top_srcdir)/unwind.h $(top_srcdir)/regex.h $(top_srcdir)/txr.h
gc.o: config.h $(top_srcdir)/lib.h $(top_srcdir)/stream.h $(top_srcdir)/hash.h $(top_srcdir)/txr.h $(top_srcdir)/gc.h
unwind.o: config.h $(top_srcdir)/lib.h $(top_srcdir)/gc.h $(top_srcdir)/stream.h $(top_srcdir)/txr.h $(top_srcdir)/unwind.h
stream.o: config.h $(top_srcdir)/lib.h $(top_srcdir)/gc.h $(top_srcdir)/unwind.h $(top_srcdir)/stream.h $(top_srcdir)/utf8.h
diff --git a/lib.c b/lib.c
index 55e961bf..b838e21b 100644
--- a/lib.c
+++ b/lib.c
@@ -51,7 +51,8 @@ val system_package, keyword_package, user_package;
val null, t, cons_s, str_s, chr_s, num_s, sym_s, pkg_s, fun_s, vec_s;
val stream_s, hash_s, lcons_s, lstr_s, cobj_s;
-val var_s, regex_s, set_s, cset_s, wild_s, oneplus_s;
+val var_s, regex_s, chset_s, set_s, cset_s, wild_s, oneplus_s;
+val compiled_regex_s;
val zeroplus_s, optional_s, compl_s, compound_s, or_s, and_s, quasi_s;
val skip_s, trailer_s, block_s, next_s, freeform_s, fail_s, accept_s;
val all_s, some_s, none_s, maybe_s, cases_s, collect_s, until_s, coll_s;
@@ -1887,6 +1888,8 @@ static void obj_init(void)
cobj_s = intern(lit("cobj"), user_package);
var_s = intern(lit("var"), system_package);
regex_s = intern(lit("regex"), system_package);
+ compiled_regex_s = intern(lit("compiled-regex"), system_package);
+ chset_s = intern(lit("chset"), system_package);
set_s = intern(lit("set"), user_package);
cset_s = intern(lit("cset"), user_package);
wild_s = intern(lit("wild"), user_package);
diff --git a/lib.h b/lib.h
index 4afaa812..6a925cff 100644
--- a/lib.h
+++ b/lib.h
@@ -207,7 +207,8 @@ INLINE wchar_t *litptr(val obj)
extern val keyword_package, system_package, user_package;
extern val null, t, cons_s, str_s, chr_s, num_s, sym_s, pkg_s, fun_s, vec_s;
extern val stream_s, hash_s, lcons_s, lstr_s, cobj_s;
-extern val var_s, regex_s, set_s, cset_s, wild_s, oneplus_s;
+extern val var_s, regex_s, chset_s, set_s, cset_s, wild_s, oneplus_s;
+extern val compiled_regex_s;
extern val zeroplus_s, optional_s, compl_s, compound_s, or_s, and_s, quasi_s;
extern val skip_s, trailer_s, block_s, next_s, freeform_s, fail_s, accept_s;
extern val all_s, some_s, none_s, maybe_s, cases_s, collect_s, until_s, coll_s;
diff --git a/match.c b/match.c
index bdf7714d..c9d45670 100644
--- a/match.c
+++ b/match.c
@@ -368,7 +368,7 @@ static val match_line(val bindings, val specline, val dataline,
LOG_MATCH("var delimiting string", find);
bindings = acons_new(bindings, sym, sub_str(dataline, pos, find));
pos = plus(find, length_str(pat));
- } else if (consp(pat) && typeof(first(pat)) == regex_s) {
+ } else if (consp(pat) && regexp(first(pat))) {
val find = search_regex(dataline, first(pat), pos, modifier);
val fpos = car(find);
val flen = cdr(find);
@@ -412,7 +412,7 @@ static val match_line(val bindings, val specline, val dataline,
sem_error(spec_lineno,
lit("variable followed by invalid element"), nao);
}
- } else if (typeof(directive) == regex_s) {
+ } else if (regexp(directive)) {
val past = match_regex(dataline, directive, pos);
if (nullp(past)) {
LOG_MISMATCH("regex");
diff --git a/parser.y b/parser.y
index cc0815c8..57692704 100644
--- a/parser.y
+++ b/parser.y
@@ -453,17 +453,17 @@ regex : '/' regexpr '/' { $$ = $2; }
yybadtoken(yychar, lit("regex")); }
;
-regexpr : regbranch { $$ = $1; }
- | regexpr '|' regexpr { $$ = list(list(or_s, $1,
- $3, nao), nao); }
- | regexpr '&' regexpr { $$ = list(list(and_s, $1,
- $3, nao), nao); }
+regexpr : regbranch { $$ = if3(cdr($1),
+ cons(compound_s, $1),
+ car($1)); }
+ | regexpr '|' regexpr { $$ = list(or_s, $1, $3, nao); }
+ | regexpr '&' regexpr { $$ = list(and_s, $1, $3, nao); }
+ | '~' regexpr { $$ = list(compl_s, $2, nao); }
| /* empty */ { $$ = nil; }
;
regbranch : regterm { $$ = cons($1, nil); }
| regterm regbranch { $$ = cons($1, $2); }
- | '~' regbranch { $$ = cons(cons(compl_s, $2), nil); }
;
regterm : '[' regclass ']' { $$ = cons(set_s, $2); }
@@ -476,7 +476,7 @@ regterm : '[' regclass ']' { $$ = cons(set_s, $2); }
| regterm '+' { $$ = list(oneplus_s, $1, nao); }
| regterm '?' { $$ = list(optional_s, $1, nao); }
| REGCHAR { $$ = chr($1); }
- | '(' regexpr ')' { $$ = cons(compound_s, $2); }
+ | '(' regexpr ')' { $$ = $2; }
| '(' error { $$ = nil;
yybadtoken(yychar,
lit("regex subexpression")); }
diff --git a/regex.c b/regex.c
index ec1e656c..147f03cb 100644
--- a/regex.c
+++ b/regex.c
@@ -36,6 +36,20 @@
#include "lib.h"
#include "unwind.h"
#include "regex.h"
+#include "txr.h"
+
+typedef union nfa_state nfa_state_t;
+
+typedef struct nfa {
+ nfa_state_t *start;
+ nfa_state_t *accept;
+} nfa_t;
+
+typedef enum regm_result {
+ REGM_INCOMPLETE, REGM_FAIL, REGM_MATCH
+} regm_result_t;
+
+typedef union regex_machine regex_machine_t;
typedef unsigned int bitcell_t;
@@ -147,14 +161,30 @@ union nfa_state {
};
struct nfa_machine {
- cnum last_accept_pos;
+ int is_nfa; /* common member */
+ cnum last_accept_pos; /* common member */
+ cnum count; /* common member */
unsigned visited;
nfa_state_t **move, **clos, **stack;
int nmove, nclos;
- cnum count;
nfa_t nfa;
};
+struct dv_machine {
+ int is_nfa; /* common member */
+ cnum last_accept_pos; /* common member */
+ cnum count; /* common member */
+ val deriv;
+ val regex;
+};
+
+union regex_machine {
+ struct nfa_machine n;
+ struct dv_machine d;
+};
+
+int opt_derivative_regex = 1;
+
static int L0_full(cset_L0_t *L0)
{
int i;
@@ -531,6 +561,91 @@ static int char_set_contains(char_set_t *set, wchar_t ch)
return set->any.comp ? !result : result;
}
+static char_set_t *char_set_compile(val args, val comp)
+{
+ val iter;
+ wchar_t min = WCHAR_MAX;
+ wchar_t max = 0;
+ chset_type_t cst;
+
+ for (iter = args; iter; iter = rest(iter)) {
+ val item = first(iter);
+
+ if (consp(item)) {
+ val from = car(item);
+ val to = cdr(item);
+
+ assert (typeof(from) == chr_s && typeof(to) == chr_s);
+
+ if (c_chr(from) < min)
+ min = c_chr(from);
+ if (c_chr(from) > max)
+ max = c_chr(from);
+
+ if (c_chr(to) < min)
+ min = c_chr(to);
+ if (c_chr(to) > max)
+ max = c_chr(to);
+ } else if (typeof(item) == chr_s) {
+ if (c_chr(item) < min)
+ min = c_chr(item);
+ if (c_chr(item) > max)
+ max = c_chr(item);
+ } else {
+ assert(0 && "bad regex set");
+ }
+ }
+
+ if (max < 0x100)
+ cst = CHSET_SMALL;
+ else if (max - min < 0x100)
+ cst = CHSET_DISPLACED;
+ else if (max < 0x10000)
+ cst = CHSET_LARGE;
+ else
+ cst = CHSET_XLARGE;
+
+ {
+ char_set_t *set = char_set_create(cst, min);
+
+ for (iter = args; iter; iter = rest(iter)) {
+ val item = first(iter);
+
+ if (consp(item)) {
+ val from = car(item);
+ val to = cdr(item);
+
+ assert (typeof(from) == chr_s && typeof(to) == chr_s);
+ char_set_add_range(set, c_chr(from), c_chr(to));
+ } else if (typeof(item) == chr_s) {
+ char_set_add(set, c_chr(item));
+ } else {
+ assert(0 && "bad regex set");
+ }
+ }
+
+ if (comp)
+ char_set_compl(set);
+
+ return set;
+ }
+}
+
+static void char_set_cobj_destroy(val chset)
+{
+ char_set_t *set = (char_set_t *) chset->co.handle;
+ char_set_destroy(set);
+ chset->co.handle = 0;
+}
+
+static struct cobj_ops char_set_obj_ops = {
+ cobj_equal_op,
+ cobj_print_op,
+ char_set_cobj_destroy,
+ cobj_mark_op,
+ cobj_hash_op
+};
+
static nfa_state_t *nfa_state_accept(void)
{
nfa_state_t *st = (nfa_state_t *) chk_malloc(sizeof *st);
@@ -651,75 +766,30 @@ static nfa_t nfa_combine(nfa_t pred, nfa_t succ)
return ret;
}
-static nfa_t nfa_compile_set(val args, int comp)
+static nfa_t nfa_compile_set(val args, val comp)
{
- val iter;
- wchar_t min = WCHAR_MAX;
- wchar_t max = 0;
- chset_type_t cst;
-
- for (iter = args; iter; iter = rest(iter)) {
- val item = first(iter);
-
- if (consp(item)) {
- val from = car(item);
- val to = cdr(item);
-
- assert (typeof(from) == chr_s && typeof(to) == chr_s);
-
- if (c_chr(from) < min)
- min = c_chr(from);
- if (c_chr(from) > max)
- max = c_chr(from);
-
- if (c_chr(to) < min)
- min = c_chr(to);
- if (c_chr(to) > max)
- max = c_chr(to);
- } else if (typeof(item) == chr_s) {
- if (c_chr(item) < min)
- min = c_chr(item);
- if (c_chr(item) > max)
- max = c_chr(item);
- } else {
- assert(0 && "bad regex set");
- }
- }
-
- if (max < 0x100)
- cst = CHSET_SMALL;
- else if (max - min < 0x100)
- cst = CHSET_DISPLACED;
- else if (max < 0x10000)
- cst = CHSET_LARGE;
- else
- cst = CHSET_XLARGE;
-
- {
- char_set_t *set = char_set_create(cst, min);
- nfa_state_t *acc = nfa_state_accept();
- nfa_state_t *s = nfa_state_set(acc, set);
- nfa_t ret = nfa_make(s, acc);
+ char_set_t *set = char_set_compile(args, comp);
+ nfa_state_t *acc = nfa_state_accept();
+ nfa_state_t *s = nfa_state_set(acc, set);
+ return nfa_make(s, acc);
+}
- for (iter = args; iter; iter = rest(iter)) {
- val item = first(iter);
+static nfa_t nfa_compile_regex(val regex);
- if (consp(item)) {
- val from = car(item);
- val to = cdr(item);
+/*
+ * Helper to nfa_compile_regex for compiling the argument list of
+ * a compound regex.
+ */
- assert (typeof(from) == chr_s && typeof(to) == chr_s);
- char_set_add_range(set, c_chr(from), c_chr(to));
- } else if (typeof(item) == chr_s) {
- char_set_add(set, c_chr(item));
- } else {
- assert(0 && "bad regex set");
- }
- }
+static nfa_t nfa_compile_list(val exp_list)
+{
+ nfa_t nfa_first = nfa_compile_regex(first(exp_list));
- if (comp)
- char_set_compl(set);
- return ret;
+ if (rest(exp_list)) {
+ nfa_t nfa_rest = nfa_compile_list(rest(exp_list));
+ return nfa_combine(nfa_first, nfa_rest);
+ } else {
+ return nfa_first;
}
}
@@ -728,92 +798,73 @@ static nfa_t nfa_compile_set(val args, int comp)
* not including the regex symbol.
* I.e. (rest '(regex ...)) not '(regex ...).
*/
-nfa_t nfa_compile_regex(val items)
+static nfa_t nfa_compile_regex(val exp)
{
- if (nullp(items)) {
+ if (nullp(exp)) {
nfa_state_t *acc = nfa_state_accept();
nfa_state_t *s = nfa_state_empty(acc, 0);
- nfa_t nfa = nfa_make(s, acc);
- return nfa;
+ return nfa_make(s, acc);
+ } else if (typeof(exp) == chr_s) {
+ nfa_state_t *acc = nfa_state_accept();
+ nfa_state_t *s = nfa_state_single(acc, c_chr(exp));
+ return nfa_make(s, acc);
+ } else if (exp == wild_s) {
+ nfa_state_t *acc = nfa_state_accept();
+ nfa_state_t *s = nfa_state_wild(acc);
+ return nfa_make(s, acc);
} else {
- val item = first(items), others = rest(items);
- nfa_t nfa;
-
- if (typeof(item) == chr_s) {
+ val sym = first(exp), args = rest(exp);
+
+ if (sym == set_s) {
+ return nfa_compile_set(args, nil);
+ } else if (sym == cset_s) {
+ return nfa_compile_set(args, t);
+ } else if (sym == compound_s) {
+ return nfa_compile_list(args);
+ } else if (sym == zeroplus_s) {
+ nfa_t nfa_arg = nfa_compile_regex(first(args));
nfa_state_t *acc = nfa_state_accept();
- nfa_state_t *s = nfa_state_single(acc, c_chr(item));
- nfa = nfa_make(s, acc);
- } else if (item == wild_s) {
+ /* New start state has empty transitions going through
+ the inner NFA, or skipping it right to the new acceptance state. */
+ nfa_state_t *s = nfa_state_empty(nfa_arg.start, acc);
+ /* Convert acceptance state of inner NFA to one which has
+ an empty transition back to the start state, and
+ an empty transition to the new acceptance state. */
+ nfa_state_empty_convert(nfa_arg.accept, nfa_arg.start, acc);
+ return nfa_make(s, acc);
+ } else if (sym == oneplus_s) {
+ /* One-plus case differs from zero-plus in that the new start state
+ does not have an empty transition to the acceptance state.
+ So the inner NFA must be traversed once. */
+ nfa_t nfa_arg = nfa_compile_regex(first(args));
nfa_state_t *acc = nfa_state_accept();
- nfa_state_t *s = nfa_state_wild(acc);
- nfa = nfa_make(s, acc);
- } else if (consp(item)) {
- val sym = first(item);
- val args = rest(item);
-
- if (sym == set_s) {
- nfa = nfa_compile_set(args, 0);
- } else if (sym == cset_s) {
- nfa = nfa_compile_set(args, 1);
- } else if (sym == compound_s) {
- nfa = nfa_compile_regex(args);
- } else if (sym == zeroplus_s) {
- nfa_t nfa_args = nfa_compile_regex(args);
- nfa_state_t *acc = nfa_state_accept();
- /* New start state has empty transitions going through
- the inner NFA, or skipping it right to the new acceptance state. */
- nfa_state_t *s = nfa_state_empty(nfa_args.start, acc);
- /* Convert acceptance state of inner NFA to one which has
- an empty transition back to the start state, and
- an empty transition to the new acceptance state. */
- nfa_state_empty_convert(nfa_args.accept, nfa_args.start, acc);
- nfa = nfa_make(s, acc);
- } else if (sym == oneplus_s) {
- /* One-plus case differs from zero-plus in that the new start state
- does not have an empty transition to the acceptance state.
- So the inner NFA must be traversed once. */
- nfa_t nfa_args = nfa_compile_regex(args);
- nfa_state_t *acc = nfa_state_accept();
- nfa_state_t *s = nfa_state_empty(nfa_args.start, 0); /* <-- diff */
- nfa_state_empty_convert(nfa_args.accept, nfa_args.start, acc);
- nfa = nfa_make(s, acc);
- } else if (sym == optional_s) {
- /* In this case, we can keep the acceptance state of the inner
- NFA as the acceptance state of the new NFA. We simply add
- a new start state which can short-circuit to it via an empty
- transition. */
- nfa_t nfa_args = nfa_compile_regex(args);
- nfa_state_t *s = nfa_state_empty(nfa_args.start, nfa_args.accept);
- nfa = nfa_make(s, nfa_args.accept);
- } else if (sym == or_s) {
- /* Simple: make a new start and acceptance state, which form
- the ends of a spindle that goes through two branches. */
- nfa_t nfa_first = nfa_compile_regex(first(args));
- nfa_t nfa_second = nfa_compile_regex(second(args));
- nfa_state_t *acc = nfa_state_accept();
- /* New state s has empty transitions into each inner NFA. */
- nfa_state_t *s = nfa_state_empty(nfa_first.start, nfa_second.start);
- /* Acceptance state of each inner NFA converted to empty
- transition to new combined acceptance state. */
- nfa_state_empty_convert(nfa_first.accept, acc, 0);
- nfa_state_empty_convert(nfa_second.accept, acc, 0);
- nfa = nfa_make(s, acc);
- } else {
- assert (0 && "internal error: bad operator in regex");
- }
+ nfa_state_t *s = nfa_state_empty(nfa_arg.start, 0); /* <-- diff */
+ nfa_state_empty_convert(nfa_arg.accept, nfa_arg.start, acc);
+ return nfa_make(s, acc);
+ } else if (sym == optional_s) {
+ /* In this case, we can keep the acceptance state of the inner
+ NFA as the acceptance state of the new NFA. We simply add
+ a new start state which can short-circuit to it via an empty
+ transition. */
+ nfa_t nfa_arg = nfa_compile_regex(first(args));
+ nfa_state_t *s = nfa_state_empty(nfa_arg.start, nfa_arg.accept);
+ return nfa_make(s, nfa_arg.accept);
+ } else if (sym == or_s) {
+ /* Simple: make a new start and acceptance state, which form
+ the ends of a spindle that goes through two branches. */
+ nfa_t nfa_first = nfa_compile_regex(first(args));
+ nfa_t nfa_second = nfa_compile_regex(second(args));
+ nfa_state_t *acc = nfa_state_accept();
+ /* New state s has empty transitions into each inner NFA. */
+ nfa_state_t *s = nfa_state_empty(nfa_first.start, nfa_second.start);
+ /* Acceptance state of each inner NFA converted to empty
+ transition to new combined acceptance state. */
+ nfa_state_empty_convert(nfa_first.accept, acc, 0);
+ nfa_state_empty_convert(nfa_second.accept, acc, 0);
+ return nfa_make(s, acc);
} else {
- assert (0 && "internal error: bad regex item");
+ internal_error("bad operator in regex");
}
-
- /* We made an NFA for the first item, but others follow.
- Compile the others to an NFA recursively, then
- stick it with this NFA. */
- if (others) {
- nfa_t nfa_others = nfa_compile_regex(others);
- nfa = nfa_combine(nfa, nfa_others);
- }
-
- return nfa;
}
}
@@ -865,7 +916,7 @@ static int nfa_all_states(nfa_state_t **inout, int num, unsigned visited)
return num;
}
-void nfa_free(nfa_t nfa)
+static void nfa_free(nfa_t nfa)
{
nfa_state_t **all = (nfa_state_t **) chk_malloc(NFA_SET_SIZE * sizeof *all);
int nstates, i;
@@ -1013,7 +1064,7 @@ static int nfa_move(nfa_state_t **in, int nin, nfa_state_t **out, wchar_t ch)
* determines the match length (defaulting to zero
* if no acceptance states were encountered).
*/
-cnum nfa_run(nfa_t nfa, const wchar_t *str)
+static cnum nfa_run(nfa_t nfa, const wchar_t *str)
{
const wchar_t *last_accept_pos = 0, *ptr = str;
unsigned visited = nfa.start->a.visited + 1;
@@ -1054,128 +1105,387 @@ cnum nfa_run(nfa_t nfa, const wchar_t *str)
return last_accept_pos ? last_accept_pos - str : -1;
}
-cnum nfa_machine_match_span(nfa_machine_t *nfam)
+static cnum regex_machine_match_span(regex_machine_t *regm)
{
- return nfam->last_accept_pos;
+ return regm->n.last_accept_pos;
}
-/*
- * NFA machine: represents the logic of the nfa_run function as state machine
- * object which can be fed one character at a time.
- */
-
-void nfa_machine_reset(nfa_machine_t *nfam)
+static void regex_destroy(val regex)
{
- int accept = 0;
+ nfa_t *pnfa = (nfa_t *) regex->co.handle;
+ nfa_free(*pnfa);
+ free(pnfa);
+ regex->co.handle = 0;
+}
- nfam->last_accept_pos = -1;
- nfam->visited = nfam->nfa.start->a.visited + 1;
- nfam->nmove = 1;
- nfam->count = 0;
+static struct cobj_ops regex_obj_ops = {
+ cobj_equal_op,
+ cobj_print_op,
+ regex_destroy,
+ cobj_mark_op,
+ cobj_hash_op
+};
- nfam->move[0] = nfam->nfa.start;
+/*
+ * ``Compile'' raw regular expression to a form that is
+ * easier to simulate by the derivative method.
+ * The only thing we currently do here is replace character set regexps with
+ * character set objects.
+ */
+static val dv_compile_regex(val exp)
+{
+ if (atom(exp)) {
+ return exp;
+ } else {
+ val sym = first(exp);
+ val args = rest(exp);
+
+ if (sym == set_s || sym == cset_s) {
+ char_set_t *set = char_set_compile(args, eq(sym, cset_s));
+ return cobj((mem_t *) set, chset_s, &char_set_obj_ops);
+ } else if (sym == compound_s) {
+ list_collect_decl (out, iter);
+ list_collect (iter, compound_s);
+ for (; args; args = cdr(args))
+ list_collect (iter, dv_compile_regex(first(args)));
+ return out;
+ } else if (sym == zeroplus_s || sym == oneplus_s ||
+ sym == optional_s || sym == compl_s) {
+ return cons(sym, cons(dv_compile_regex(first(args)), nil));
+ } else if (sym == or_s || sym == and_s) {
+ val xfirst = dv_compile_regex(first(args));
+ val xsecond = dv_compile_regex(second(args));
+ return cons(sym, cons(xfirst, cons(xsecond, nil)));
+ } else {
+ internal_error("bad operator in regex");
+ }
+ }
+}
- nfam->nclos = nfa_closure(nfam->stack, nfam->move, nfam->nmove,
- nfam->clos, nfam->visited++, &accept);
+static val reg_nullable(val);
- if (accept)
- nfam->last_accept_pos = nfam->count;
+/*
+ * Helper to reg_nullable for recursing over
+ * contents of a compound expression.
+ */
+static val reg_nullable_list(val exp_list)
+{
+ if (rest(exp_list)) {
+ return if2(reg_nullable(first(exp_list)) &&
+ reg_nullable_list(rest(exp_list)),
+ t);
+ } else {
+ return reg_nullable(first(exp_list));
+ }
}
-void nfa_machine_init(nfa_machine_t *nfam, nfa_t nfa)
+/*
+ * Determine whether the given regular expression is nullable: that is
+ * to say, can the regular expression match the empty string?
+ */
+static val reg_nullable(val exp)
{
- nfam->nfa = nfa;
- nfam->move = (nfa_state_t **) chk_malloc(NFA_SET_SIZE * sizeof *nfam->move);
- nfam->clos = (nfa_state_t **) chk_malloc(NFA_SET_SIZE * sizeof *nfam->clos);
- nfam->stack = (nfa_state_t **) chk_malloc(NFA_SET_SIZE * sizeof *nfam->stack);
- nfa_machine_reset(nfam);
+ if (exp == nil) {
+ return t;
+ } else if (atom(exp)) {
+ return nil;
+ } else {
+ val sym = first(exp), args = rest(exp);
+
+ if (sym == set_s || sym == cset_s) {
+ return nil;
+ } else if (sym == compound_s) {
+ return reg_nullable_list(args);
+ } else if (sym == oneplus_s || sym == compiled_regex_s) {
+ return reg_nullable(first(args));
+ } else if (sym == zeroplus_s || sym == optional_s) {
+ return t;
+ } else if (sym == compl_s) {
+ return if3(reg_nullable(first(args)), nil, t);
+ } else if (sym == or_s) {
+ return if2((reg_nullable(first(args)) || reg_nullable(second(args))), t);
+ } else if (sym == and_s) {
+ return if2((reg_nullable(first(args)) && reg_nullable(second(args))), t);
+ } else {
+ internal_error("bad operator in regex");
+ }
+ }
}
-void nfa_machine_cleanup(nfa_machine_t *nfam)
+static val reg_derivative(val, val);
+
+static val reg_derivative_list(val exp_list, val ch)
{
- free(nfam->stack);
- free(nfam->clos);
- free(nfam->move);
- nfam->stack = 0;
- nfam->clos = 0;
- nfam->move = 0;
- nfam->nfa.start = 0;
- nfam->nfa.accept = 0;
+ if (rest(exp_list)) {
+ if (reg_nullable(first(exp_list))) {
+ return list(or_s, cons(compound_s,
+ cons(reg_derivative(first(exp_list), ch),
+ rest(exp_list))),
+ reg_derivative_list(rest(exp_list), ch),
+ nao);
+ } else {
+ val d_first = reg_derivative(first(exp_list), ch);
+
+ if (d_first == t)
+ return t;
+ else if (d_first == nil)
+ return cons(compound_s, rest(exp_list));
+ else
+ return cons(compound_s,
+ cons(reg_derivative(d_first, ch), rest(exp_list)));
+ }
+ } else {
+ return reg_derivative(first(exp_list), ch);
+ }
}
-nfam_result_t nfa_machine_feed(nfa_machine_t *nfam, wchar_t ch)
+/*
+ * Determine derivative of regex with respect to character.
+ */
+static val reg_derivative(val exp, val ch)
{
- int accept = 0;
+ if (exp == nil || exp == t) {
+ return t;
+ } else if (chrp(exp)) {
+ return if3(eq(exp, ch), nil, t);
+ } else if (typeof(exp) == chset_s) {
+ char_set_t *set = (char_set_t *) exp->co.handle;
+ return if3(char_set_contains(set, c_chr(ch)), nil, t);
+ } else if (exp == wild_s) {
+ return nil;
+ } else {
+ val sym = first(exp);
+ val args = rest(exp);
+
+ if (sym == set_s || sym == cset_s) {
+ internal_error("uncompiled regex passed to reg_derivative");
+ } else if (sym == compiled_regex_s) {
+ return reg_derivative(first(args), ch);
+ } else if (sym == compound_s) {
+ return reg_derivative_list(args, ch);
+ } else if (sym == optional_s) {
+ return reg_derivative(first(args), ch);
+ } else if (sym == oneplus_s || sym == zeroplus_s) {
+ val arg = first(args);
+ val d_arg = reg_derivative(arg, ch);
+ if (d_arg == t)
+ return t;
+ if (d_arg == nil)
+ return cons(zeroplus_s, cons(arg, nil));
+ return cons(compound_s, cons(d_arg,
+ cons(cons(zeroplus_s,
+ cons(arg, nil)), nil)));
+ } else if (sym == compl_s) {
+ return cons(sym, cons(reg_derivative(first(args), ch), nil));
+ } else if (sym == or_s) {
+ val d_arg1 = reg_derivative(first(args), ch);
+ val d_arg2 = nil;
+
+ if (d_arg1 == nil)
+ return nil;
- if (ch != 0) {
- nfam->count++;
+ d_arg2 = reg_derivative(second(args), ch);
- nfam->nmove = nfa_move(nfam->clos, nfam->nclos, nfam->move, ch);
- nfam->nclos = nfa_closure(nfam->stack, nfam->move, nfam->nmove, nfam->clos,
- nfam->visited++, &accept);
+ if (d_arg2 == nil)
+ return nil;
- if (accept)
- nfam->last_accept_pos = nfam->count;
- }
+ if (d_arg1 == t)
+ return d_arg2;
- nfam->nfa.start->a.visited = nfam->visited;
+ if (d_arg2 == t)
+ return d_arg1;
- if (ch && nfam->nclos != 0) {
- if (accept)
- return NFAM_MATCH;
- return NFAM_INCOMPLETE;
- }
+ return cons(or_s, cons(d_arg1, cons(d_arg2, nil)));
+ } else if (sym == and_s) {
+ val d_arg1 = reg_derivative(first(args), ch);
+ val d_arg2 = nil;
- /* Reached if the null character is
- consumed, or NFA hit a transition dead end. */
+ if (d_arg1 == t)
+ return t;
+
+ d_arg2 = reg_derivative(second(args), ch);
- if (nfam->last_accept_pos == nfam->count)
- return NFAM_MATCH;
- if (nfam->last_accept_pos == -1)
- return NFAM_FAIL;
- return NFAM_INCOMPLETE;
+ if (d_arg2 == t)
+ return t;
+
+ return cons(and_s, cons(d_arg1, cons(d_arg2, nil)));
+ } else {
+ internal_error("bad operator in regex");
+ }
+ }
}
-static void regex_destroy(val regex)
+static cnum dv_run(val regex, const wchar_t *str)
{
- nfa_t *pnfa = (nfa_t *) regex->co.handle;
- nfa_free(*pnfa);
- free(pnfa);
- regex->co.handle = 0;
-}
+ const wchar_t *last_accept_pos = 0, *ptr = str;
-static struct cobj_ops regex_obj_ops = {
- cobj_equal_op,
- cobj_print_op,
- regex_destroy,
- cobj_mark_op,
- cobj_hash_op
-};
+ for (; *ptr != 0; ptr++) {
+ wchar_t ch = *ptr;
+ val nullable = reg_nullable(regex);
+ val deriv = reg_derivative(regex, chr(ch));
+
+ if (nullable)
+ last_accept_pos = ptr;
+
+ if (deriv == t)
+ return last_accept_pos ? last_accept_pos - str : -1;
+ }
+
+ if (reg_nullable(regex))
+ return ptr - str;
+ return last_accept_pos ? last_accept_pos - str : -1;
+}
val regex_compile(val regex_sexp)
{
- nfa_t *pnfa = (nfa_t *) chk_malloc(sizeof *pnfa);
- *pnfa = nfa_compile_regex(regex_sexp);
- return cobj((mem_t *) pnfa, regex_s, &regex_obj_ops);
+ if (opt_derivative_regex) {
+ return cons(compiled_regex_s, cons(dv_compile_regex(regex_sexp), nil));
+ } else {
+ nfa_t *pnfa = (nfa_t *) chk_malloc(sizeof *pnfa);
+ *pnfa = nfa_compile_regex(regex_sexp);
+ return cobj((mem_t *) pnfa, regex_s, &regex_obj_ops);
+ }
}
val regexp(val obj)
{
+ if (consp(obj))
+ return if2(eq(car(obj), compiled_regex_s), t);
+
return (is_ptr(obj) && obj->co.type == COBJ && obj->co.cls == regex_s)
? t : nil;
}
-nfa_t *regex_nfa(val reg)
+static nfa_t *regex_nfa(val reg)
{
assert (reg->co.type == COBJ && reg->co.cls == regex_s);
return (nfa_t *) reg->co.handle;
}
+static cnum regex_run(val compiled_regex, const wchar_t *str)
+{
+ if (consp(compiled_regex))
+ return dv_run(compiled_regex, str);
+ return nfa_run(*regex_nfa(compiled_regex), str);
+}
+
+/*
+ * Regex machine: represents the logic of the regex_run function as state
+ * machine object which can be fed one character at a time.
+ */
+
+static void regex_machine_reset(regex_machine_t *regm)
+{
+ int accept = 0;
+
+ regm->n.last_accept_pos = -1;
+ regm->n.count = 0;
+
+ if (regm->n.is_nfa) {
+ regm->n.visited = regm->n.nfa.start->a.visited + 1;
+ regm->n.nmove = 1;
+
+ regm->n.move[0] = regm->n.nfa.start;
+
+ regm->n.nclos = nfa_closure(regm->n.stack, regm->n.move, regm->n.nmove,
+ regm->n.clos, regm->n.visited++, &accept);
+ } else {
+ regm->d.deriv = regm->d.regex;
+ accept = (reg_nullable(regm->d.regex) != nil);
+ }
+
+ if (accept)
+ regm->n.last_accept_pos = regm->n.count;
+}
+
+static void regex_machine_init(regex_machine_t *regm, val regex)
+{
+ if (consp(regex)) {
+ regm->n.is_nfa = 0;
+ regm->d.regex = regex;
+ } else {
+ regm->n.is_nfa = 1;
+ regm->n.nfa = *regex_nfa(regex);
+ regm->n.move = (nfa_state_t **)
+ chk_malloc(NFA_SET_SIZE * sizeof *regm->n.move);
+ regm->n.clos = (nfa_state_t **)
+ chk_malloc(NFA_SET_SIZE * sizeof *regm->n.clos);
+ regm->n.stack = (nfa_state_t **)
+ chk_malloc(NFA_SET_SIZE * sizeof *regm->n.stack);
+ }
+
+ regex_machine_reset(regm);
+}
+
+static void regex_machine_cleanup(regex_machine_t *regm)
+{
+ if (regm->n.is_nfa) {
+ free(regm->n.stack);
+ free(regm->n.clos);
+ free(regm->n.move);
+ regm->n.stack = 0;
+ regm->n.clos = 0;
+ regm->n.move = 0;
+ regm->n.nfa.start = 0;
+ regm->n.nfa.accept = 0;
+ }
+}
+
+static regm_result_t regex_machine_feed(regex_machine_t *regm, wchar_t ch)
+{
+ int accept = 0;
+
+ if (regm->n.is_nfa) {
+ if (ch != 0) {
+ regm->n.count++;
+
+ regm->n.nmove = nfa_move(regm->n.clos, regm->n.nclos, regm->n.move, ch);
+ regm->n.nclos = nfa_closure(regm->n.stack, regm->n.move,
+ regm->n.nmove, regm->n.clos,
+ regm->n.visited++, &accept);
+
+ if (accept)
+ regm->n.last_accept_pos = regm->n.count;
+ }
+
+ regm->n.nfa.start->a.visited = regm->n.visited;
+
+ if (ch && regm->n.nclos != 0) {
+ if (accept)
+ return REGM_MATCH;
+ return REGM_INCOMPLETE;
+ }
+ } else {
+ val accept = nil;
+
+ if (ch != 0) {
+ regm->d.count++;
+ regm->d.deriv = reg_derivative(regm->d.deriv, chr(ch));
+ if ((accept = reg_nullable(regm->d.deriv)))
+ regm->d.last_accept_pos = regm->d.count;
+ }
+
+ if (ch && regm->d.deriv != t) {
+ if (accept)
+ return REGM_MATCH;
+ return REGM_INCOMPLETE;
+ }
+ }
+
+ /* Reached if the null character is
+ consumed, or NFA/derivation hit a transition dead end. */
+
+ if (regm->n.last_accept_pos == regm->n.count)
+ return REGM_MATCH;
+ if (regm->n.last_accept_pos == -1)
+ return REGM_FAIL;
+ return REGM_INCOMPLETE;
+}
+
+
val search_regex(val haystack, val needle_regex, val start,
val from_end)
{
- nfa_t *pnfa = regex_nfa(needle_regex);
-
if (length_str_lt(haystack, start)) {
return nil;
} else {
@@ -1185,38 +1495,38 @@ val search_regex(val haystack, val needle_regex, val start,
const wchar_t *h = c_str(haystack);
for (i = c_num(length_str(haystack)) - 1; i >= s; i--) {
- cnum span = nfa_run(*pnfa, h + i);
+ cnum span = regex_run(needle_regex, h + i);
if (span >= 0)
return cons(num(i), num(span));
}
} else {
- nfa_machine_t nfam;
+ regex_machine_t regm;
val i, pos = start, retval;
- nfam_result_t last_res = NFAM_INCOMPLETE;
+ regm_result_t last_res = REGM_INCOMPLETE;
- nfa_machine_init(&nfam, *pnfa);
+ regex_machine_init(&regm, needle_regex);
again:
for (i = pos; length_str_gt(haystack, i); i = plus(i, one)) {
- last_res = nfa_machine_feed(&nfam, c_chr(chr_str(haystack, i)));
+ last_res = regex_machine_feed(&regm, c_chr(chr_str(haystack, i)));
- if (last_res == NFAM_FAIL) {
- nfa_machine_reset(&nfam);
+ if (last_res == REGM_FAIL) {
+ regex_machine_reset(&regm);
pos = plus(pos, one);
goto again;
}
}
- last_res = nfa_machine_feed(&nfam, 0);
+ last_res = regex_machine_feed(&regm, 0);
switch (last_res) {
- case NFAM_INCOMPLETE:
- case NFAM_MATCH:
- retval = cons(pos, num(nfa_machine_match_span(&nfam)));
- nfa_machine_cleanup(&nfam);
+ case REGM_INCOMPLETE:
+ case REGM_MATCH:
+ retval = cons(pos, num(regex_machine_match_span(&regm)));
+ regex_machine_cleanup(&regm);
return retval;
- case NFAM_FAIL:
- nfa_machine_cleanup(&nfam);
+ case REGM_FAIL:
+ regex_machine_cleanup(&regm);
return nil;
}
}
@@ -1227,29 +1537,28 @@ again:
val match_regex(val str, val reg, val pos)
{
- nfa_machine_t nfam;
+ regex_machine_t regm;
val i, retval;
- nfam_result_t last_res = NFAM_INCOMPLETE;
- nfa_t *pnfa = regex_nfa(reg);
+ regm_result_t last_res = REGM_INCOMPLETE;
- nfa_machine_init(&nfam, *pnfa);
+ regex_machine_init(&regm, reg);
for (i = pos; length_str_gt(str, i); i = plus(i, one)) {
- last_res = nfa_machine_feed(&nfam, c_chr(chr_str(str, i)));
- if (last_res == NFAM_FAIL)
+ last_res = regex_machine_feed(&regm, c_chr(chr_str(str, i)));
+ if (last_res == REGM_FAIL)
break;
}
- last_res = nfa_machine_feed(&nfam, 0);
+ last_res = regex_machine_feed(&regm, 0);
switch (last_res) {
- case NFAM_INCOMPLETE:
- case NFAM_MATCH:
- retval = plus(pos, num(nfa_machine_match_span(&nfam)));
- nfa_machine_cleanup(&nfam);
+ case REGM_INCOMPLETE:
+ case REGM_MATCH:
+ retval = plus(pos, num(regex_machine_match_span(&regm)));
+ regex_machine_cleanup(&regm);
return retval;
- case NFAM_FAIL:
- nfa_machine_cleanup(&nfam);
+ case REGM_FAIL:
+ regex_machine_cleanup(&regm);
return nil;
}
diff --git a/regex.h b/regex.h
index 09bfa2d5..eb97955e 100644
--- a/regex.h
+++ b/regex.h
@@ -26,29 +26,7 @@
#include <limits.h>
-typedef union nfa_state nfa_state_t;
-
-typedef struct nfa {
- nfa_state_t *start;
- nfa_state_t *accept;
-} nfa_t;
-
-typedef enum nfam_result {
- NFAM_INCOMPLETE, NFAM_FAIL, NFAM_MATCH
-} nfam_result_t;
-
-typedef struct nfa_machine nfa_machine_t;
-
-nfa_t nfa_compile_regex(val regex);
-void nfa_free(nfa_t);
-cnum nfa_run(nfa_t nfa, const wchar_t *str);
-void nfa_machine_reset(nfa_machine_t *);
-void nfa_machine_init(nfa_machine_t *, nfa_t);
-void nfa_machine_cleanup(nfa_machine_t *);
-nfam_result_t nfa_machine_feed(nfa_machine_t *, wchar_t ch);
-cnum nfa_machine_match_span(nfa_machine_t *);
val regex_compile(val regex_sexp);
val regexp(val);
-nfa_t *regex_nfa(val);
val search_regex(val haystack, val needle_regex, val start_num, val from_end);
val match_regex(val str, val regex, val pos);
diff --git a/txr.h b/txr.h
index 36f29715..40cc2a39 100644
--- a/txr.h
+++ b/txr.h
@@ -31,6 +31,7 @@ extern int opt_gc_debug;
#ifdef HAVE_VALGRIND
extern int opt_vg_debug;
#endif
+extern int opt_derivative_regex;
extern const wchar_t *version;
extern const wchar_t *progname;
extern int output_produced;