diff options
author | Kaz Kylheku <kaz@kylheku.com> | 2015-09-27 22:02:44 -0700 |
---|---|---|
committer | Kaz Kylheku <kaz@kylheku.com> | 2015-09-27 22:02:44 -0700 |
commit | 7aebb0c779f9ec66da657f4196ed4115c15b359b (patch) | |
tree | a14bb2be29266810c7e2685ca12f2f02a98fb00d /regex.c | |
parent | 10a77c022921502a7136aa4f7769406a438596d8 (diff) | |
download | txr-7aebb0c779f9ec66da657f4196ed4115c15b359b.tar.gz txr-7aebb0c779f9ec66da657f4196ed4115c15b359b.tar.bz2 txr-7aebb0c779f9ec66da657f4196ed4115c15b359b.zip |
S-exp level regex optimization.
* regex.c (dv_compile_regex): Replaced by two functions,
reg_expand_nongreedy and reg_compile_csets.
(reg_expand_nongreedy, reg_compile_csets): New static
functions.
(reg_optimize): New static function.
(regex_compile): Expand nongreedy syntax in incoming regex,
and then optimize it before deciding whether to use NFA or
derivatives. If derivatives are used, compile the
character sets in the regex to character set objects.
(regex_init): Register some intrinsic functions for debugging,
sys:reg-expand-nongreedy and sys:reg-optimize.
Diffstat (limited to 'regex.c')
-rw-r--r-- | regex.c | 188 |
1 files changed, 156 insertions, 32 deletions
@@ -43,6 +43,7 @@ #include "unwind.h" #include "stream.h" #include "gc.h" +#include "eval.h" #include "regex.h" #include "txr.h" @@ -1338,53 +1339,32 @@ static struct cobj_ops regex_obj_ops = cobj_ops_init(eq, static val reg_nullable(val); -/* - * ``Compile'' raw regular expression to a form that is easier to simulate by - * the derivative method. Here we currently replace character set regexps with - * character set objects, and also transform the nongreedy syntax into the more - * complex expression it represents. - */ -static val dv_compile_regex(val exp) +static val reg_expand_nongreedy(val exp) { - if (exp == space_k) { - return cobj(coerce(mem_t *, space_cs), chset_s, &char_set_obj_ops); - } else if (exp == digit_k) { - return cobj(coerce(mem_t *, digit_cs), chset_s, &char_set_obj_ops); - } else if (exp == word_char_k) { - return cobj(coerce(mem_t *, word_cs), chset_s, &char_set_obj_ops); - } else if (exp == cspace_k) { - return cobj(coerce(mem_t *, cspace_cs), chset_s, &char_set_obj_ops); - } else if (exp == cdigit_k) { - return cobj(coerce(mem_t *, cdigit_cs), chset_s, &char_set_obj_ops); - } else if (exp == cword_char_k) { - return cobj(coerce(mem_t *, cword_cs), chset_s, &char_set_obj_ops); - } else if (symbolp(exp) || chrp(exp)) { + if (atom(exp)) { return exp; - } else if (stringp(exp)) { - return cons(compound_s, list_str(exp)); } else if (consp(exp)) { val sym = first(exp); val args = rest(exp); if (sym == set_s || sym == cset_s) { - char_set_t *set = char_set_compile(args, eq(sym, cset_s)); - return cobj(coerce(mem_t *, set), chset_s, &char_set_obj_ops); + return exp; } else if (sym == compound_s) { list_collect_decl (out, iter); iter = list_collect(iter, compound_s); for (; args; args = cdr(args)) - iter = list_collect(iter, dv_compile_regex(first(args))); + iter = list_collect(iter, reg_expand_nongreedy(first(args))); return out; } else if (sym == zeroplus_s || sym == oneplus_s || sym == optional_s || sym == compl_s) { - return cons(sym, cons(dv_compile_regex(first(args)), nil)); + return cons(sym, cons(reg_expand_nongreedy(first(args)), nil)); } else if (sym == or_s || sym == and_s) { - val xfirst = dv_compile_regex(first(args)); - val xsecond = dv_compile_regex(second(args)); + val xfirst = reg_expand_nongreedy(first(args)); + val xsecond = reg_expand_nongreedy(second(args)); return cons(sym, cons(xfirst, cons(xsecond, nil))); } else if (sym == nongreedy_s) { - val xfirst = dv_compile_regex(first(args)); - val xsecond = dv_compile_regex(second(args)); + val xfirst = reg_expand_nongreedy(first(args)); + val xsecond = reg_expand_nongreedy(second(args)); val zplus = cons(zeroplus_s, cons(xfirst, nil)); if (xsecond == nil) { @@ -1414,6 +1394,51 @@ static val dv_compile_regex(val exp) uw_throwf(error_s, lit("bad object in regex syntax: ~s"), exp, nao); } } +static val reg_compile_csets(val exp) +{ + if (exp == space_k) { + return cobj(coerce(mem_t *, space_cs), chset_s, &char_set_obj_ops); + } else if (exp == digit_k) { + return cobj(coerce(mem_t *, digit_cs), chset_s, &char_set_obj_ops); + } else if (exp == word_char_k) { + return cobj(coerce(mem_t *, word_cs), chset_s, &char_set_obj_ops); + } else if (exp == cspace_k) { + return cobj(coerce(mem_t *, cspace_cs), chset_s, &char_set_obj_ops); + } else if (exp == cdigit_k) { + return cobj(coerce(mem_t *, cdigit_cs), chset_s, &char_set_obj_ops); + } else if (exp == cword_char_k) { + return cobj(coerce(mem_t *, cword_cs), chset_s, &char_set_obj_ops); + } else if (symbolp(exp) || chrp(exp)) { + return exp; + } else if (stringp(exp)) { + return cons(compound_s, list_str(exp)); + } else if (consp(exp)) { + val sym = first(exp); + val args = rest(exp); + + if (sym == set_s || sym == cset_s) { + char_set_t *set = char_set_compile(args, eq(sym, cset_s)); + return cobj(coerce(mem_t *, set), chset_s, &char_set_obj_ops); + } else if (sym == compound_s) { + list_collect_decl (out, iter); + iter = list_collect(iter, compound_s); + for (; args; args = cdr(args)) + iter = list_collect(iter, reg_compile_csets(first(args))); + return out; + } else if (sym == zeroplus_s || sym == oneplus_s || + sym == optional_s || sym == compl_s || sym == nongreedy_s) { + return cons(sym, cons(reg_compile_csets(first(args)), nil)); + } else if (sym == or_s || sym == and_s) { + val xfirst = reg_compile_csets(first(args)); + val xsecond = reg_compile_csets(second(args)); + return cons(sym, cons(xfirst, cons(xsecond, nil))); + } else { + uw_throwf(error_s, lit("bad operator in regex syntax: ~s"), sym, nao); + } + } else { + uw_throwf(error_s, lit("bad object in regex syntax: ~s"), exp, nao); + } +} /* * Helper to reg_nullable for recursing over @@ -1699,6 +1724,98 @@ static cnum dv_run(val regex, const wchar_t *str) return last_accept_pos ? last_accept_pos - str : -1; } +static val reg_optimize(val exp) +{ + if (atom(exp)) { + return exp; + } else if (reg_matches_all(exp)) { + return cons(zeroplus_s, cons(wild_s, nil)); + } else { + val sym = first(exp); + val args = rest(exp); + + if (sym == set_s) { + return if3(rest(exp), exp, t); + } else if (sym == cset_s) { + return if3(rest(exp), exp, wild_s); + } else if (sym == compound_s) { + val xargs = mapcar(func_n1(reg_optimize), args); + while (rest(xargs)) { + if (reg_matches_all(first(xargs)) && + reg_matches_all(second(xargs))) + { + pop(&xargs); + continue; + } + break; + } + if (!xargs) + return nil; + if (!cdr(xargs)) + return car(xargs); + if (memq(t, xargs)) + return t; + return cons(sym, xargs); + } else if (sym == zeroplus_s) { + val arg = reg_optimize(first(args)); + if (consp(arg) && car(arg) == zeroplus_s) + return arg; + if (reg_matches_all(arg)) + return arg; + return cons(sym, cons(arg, nil)); + } else if (sym == oneplus_s) { + val arg = reg_optimize(first(args)); + if (reg_matches_all(arg)) + return cons(zeroplus_s, cons(wild_s, nil)); + if (consp(arg) && car(arg) == zeroplus_s) + return arg; + return cons(sym, cons(arg, nil)); + } else if (sym == optional_s) { + val arg = reg_optimize(first(args)); + if (reg_matches_all(arg)) + return arg; + return cons(sym, cons(arg, nil)); + } else if (sym == compl_s) { + val arg = reg_optimize(first(args)); + if (reg_matches_all(arg)) + return t; + if (arg == nil) + return cons(oneplus_s, cons(wild_s, nil)); + return cons(sym, cons(arg, nil)); + } else if (sym == or_s) { + val arg1 = reg_optimize(pop(&args)); + val arg2 = reg_optimize(pop(&args)); + + if (arg1 == t || reg_matches_all(arg2)) + return arg2; + + if (arg2 == t || reg_matches_all(arg1)) + return arg1; + + return cons(sym, cons(arg1, cons(arg2, nil))); + } else if (sym == and_s) { + val arg1 = reg_optimize(pop(&args)); + val arg2 = reg_optimize(pop(&args)); + + if (arg1 == t || arg2 == t) + return t; + + if (reg_matches_all(arg1)) { + if (reg_matches_all(arg2)) + return cons(zeroplus_s, cons(wild_s, nil)); + return arg2; + } + + if (reg_matches_all(arg2)) + return arg2; + + return cons(sym, cons(arg1, cons(arg2, nil))); + } else { + uw_throwf(error_s, lit("bad operator in regex syntax: ~s"), sym, nao); + } + } +} + static val regex_requires_dv(val exp) { if (atom(exp)) { @@ -1732,10 +1849,14 @@ val regex_compile(val regex_sexp, val error_stream) if (stringp(regex_sexp)) { regex_sexp = regex_parse(regex_sexp, default_bool_arg(error_stream)); return if2(regex_sexp, regex_compile(regex_sexp, error_stream)); - } else if (opt_derivative_regex || regex_requires_dv(regex_sexp)) { + } + + regex_sexp = reg_optimize(reg_expand_nongreedy(regex_sexp)); + + if (opt_derivative_regex || regex_requires_dv(regex_sexp)) { regex_t *regex = coerce(regex_t *, chk_malloc(sizeof *regex)); val ret; - val dv = dv_compile_regex(regex_sexp); + val dv = reg_compile_csets(regex_sexp); regex->kind = REGEX_DV; regex->nstates = 0; regex->source = nil; @@ -2312,5 +2433,8 @@ void regex_init(void) cdigit_k = intern(lit("cdigit"), keyword_package); cword_char_k = intern(lit("cword-char"), keyword_package); + reg_fun(intern(lit("reg-expand-nongreedy"), system_package), + func_n1(reg_expand_nongreedy)); + reg_fun(intern(lit("reg-optimize"), system_package), func_n1(reg_optimize)); init_special_char_sets(); } |