diff options
-rw-r--r-- | ChangeLog | 22 | ||||
-rw-r--r-- | lib.c | 7 | ||||
-rw-r--r-- | match.c | 12 | ||||
-rw-r--r-- | regex.c | 107 | ||||
-rw-r--r-- | regex.h | 7 |
5 files changed, 119 insertions, 36 deletions
@@ -1,5 +1,27 @@ 2009-10-20 Kaz Kylheku <kkylheku@gmail.com> + Working regex over lazy strings from freeform. + + Bugfixes. + + * lib.c (length_str): Fixed recursion to wrong length function. + (lazy_str_force): March down list properly. Update lazy + string's limit value. + + * match.c (match_line): Convert to lazy-string-aware style; i.e. + avoidance of triggering a force of the whole string. + (match_files): Bugfix in argument processing of freeform directive. + + * regex.h (nfam_result_t): New typedef. + (nfa_machine_reset): New function declaration. + (nfa_machine_feed): Updated declaration. + + * regex.c (nfa_machine_init): Refactor to use nfa_machine_reset. + (nfa_machine_feed): Return nfam_result_t rather than just int. + (search_regex, match_regex): Refactor to work with lazy strings well. + +2009-10-20 Kaz Kylheku <kkylheku@gmail.com> + Implement custom separator and limit in freeform. * lib.h (lazy_string): New struct member, opts. @@ -736,7 +736,7 @@ obj_t *length_str(obj_t *str) if (str->ls.type == LSTR) { lazy_str_force(str); - return length(str->ls.prefix); + return length_str(str->ls.prefix); } if (!str->st.len) @@ -1391,10 +1391,13 @@ obj_t *lazy_str_force(obj_t *lstr) } else while (gt(lim, zero) && lstr->ls.list) { lstr->ls.prefix = cat_str(list(lstr->ls.prefix, car(lstr->ls.list), nao), or2(car(lstr->ls.opts), string("\n"))); - lstr->ls.list = nil; + lstr->ls.list = cdr(lstr->ls.list); lim = minus(lim, one); } + if (lim) + *cdr_l(lstr->ls.opts) = lim; + return lstr->ls.prefix; } @@ -318,8 +318,7 @@ obj_t *match_line(obj_t *bindings, obj_t *specline, obj_t *dataline, } else if (nump(modifier)) { obj_t *past = plus(pos, modifier); - if (c_num(past) > c_num(length_str(dataline)) || - c_num(past) < c_num(pos)) + if (length_str_lt(dataline, past) || lt(past, pos)) { LOG_MISMATCH("fixed field size"); return nil; @@ -351,8 +350,7 @@ obj_t *match_line(obj_t *bindings, obj_t *specline, obj_t *dataline, pos = past; } else if (nump(modifier)) { obj_t *past = plus(pos, modifier); - if (c_num(past) > c_num(length_str(dataline)) || - c_num(past) < c_num(pos)) + if (length_str_lt(dataline, past) || lt(past, pos)) { LOG_MISMATCH("count based var"); return nil; @@ -466,12 +464,12 @@ obj_t *match_line(obj_t *bindings, obj_t *specline, obj_t *dataline, if (new_pos && !equal(new_pos, pos)) { pos = new_pos; - assert (c_num(pos) <= c_num(length_str(dataline))); + bug_unless (length_str_ge(dataline, pos)); } else { pos = plus(pos, one); } - if (c_num(pos) >= c_num(length_str(dataline))) + if (length_str_le(dataline, pos)) break; } @@ -984,7 +982,7 @@ repeat_spec_same_data: return nil; } } else if (sym == freeform) { - obj_t *args = rest(rest(first_spec)); + obj_t *args = rest(first_spec); obj_t *vals = mapcar(func_n1(cdr), mapcar(bind2other(func_n2(eval_form), bindings), args)); @@ -578,20 +578,17 @@ long nfa_machine_match_span(nfa_machine_t *nfam) * NFA machine: represents the logic of the nfa_run function as state machine * object which can be fed one character at a time. */ -void nfa_machine_init(nfa_machine_t *nfam, nfa_t nfa) + +void nfa_machine_reset(nfa_machine_t *nfam) { int accept = 0; - nfam->nfa = nfa; nfam->last_accept_pos = -1; - nfam->visited = nfa.start->a.visited + 1; - nfam->move = chk_malloc(NFA_SET_SIZE * sizeof *nfam->move); - nfam->clos = chk_malloc(NFA_SET_SIZE * sizeof *nfam->clos); - nfam->stack = chk_malloc(NFA_SET_SIZE * sizeof *nfam->stack); + nfam->visited = nfam->nfa.start->a.visited + 1; nfam->nmove = 1; nfam->count = 0; - nfam->move[0] = nfa.start; + nfam->move[0] = nfam->nfa.start; nfam->nclos = nfa_closure(nfam->stack, nfam->move, nfam->nmove, nfam->clos, nfam->visited++, &accept); @@ -600,6 +597,15 @@ void nfa_machine_init(nfa_machine_t *nfam, nfa_t nfa) nfam->last_accept_pos = nfam->count; } +void nfa_machine_init(nfa_machine_t *nfam, nfa_t nfa) +{ + nfam->nfa = nfa; + nfam->move = chk_malloc(NFA_SET_SIZE * sizeof *nfam->move); + nfam->clos = chk_malloc(NFA_SET_SIZE * sizeof *nfam->clos); + nfam->stack = chk_malloc(NFA_SET_SIZE * sizeof *nfam->stack); + nfa_machine_reset(nfam); +} + void nfa_machine_cleanup(nfa_machine_t *nfam) { free(nfam->stack); @@ -612,7 +618,7 @@ void nfa_machine_cleanup(nfa_machine_t *nfam) nfam->nfa.accept = 0; } -int nfa_machine_feed(nfa_machine_t *nfam, int ch) +nfam_result_t nfa_machine_feed(nfa_machine_t *nfam, int ch) { int accept = 0; @@ -680,26 +686,54 @@ nfa_t *regex_nfa(obj_t *reg) return (nfa_t *) reg->co.handle; } -obj_t *search_regex(obj_t *haystack, obj_t *needle_regex, obj_t *start_num, +obj_t *search_regex(obj_t *haystack, obj_t *needle_regex, obj_t *start, obj_t *from_end) { - const char *h = c_str(haystack); - long len = c_num(length_str(haystack)); - long start = c_num(start_num); nfa_t *pnfa = regex_nfa(needle_regex); - if (start > len) { + if (length_str_lt(haystack, start)) { return nil; } else { - long begin = from_end ? len : start; - long end = from_end ? start - 1 : len + 1; - int incr = from_end ? -1 : 1; - long i; - - for (i = begin; i != end; i += incr) { - long span = nfa_run(*pnfa, h + i); - if (span >= 0) - return cons(num(i), num(span)); + if (from_end) { + long i; + long s = c_num(start); + const char *h = c_str(haystack); + + for (i = c_num(length_str(haystack)) - 1; i >= s; i--) { + long span = nfa_run(*pnfa, h + i); + if (span >= 0) + return cons(num(i), num(span)); + } + } else { + nfa_machine_t nfam; + obj_t *i, *pos = start, *retval; + nfam_result_t last_res = NFAM_INCOMPLETE; + + nfa_machine_init(&nfam, *pnfa); + +again: + for (i = pos; length_str_gt(haystack, i); i = plus(i, one)) { + last_res = nfa_machine_feed(&nfam, c_chr(chr_str(haystack, i))); + + if (last_res == NFAM_FAIL) { + nfa_machine_reset(&nfam); + pos = plus(pos, one); + goto again; + } + } + + last_res = nfa_machine_feed(&nfam, 0); + + switch (last_res) { + case NFAM_INCOMPLETE: + case NFAM_MATCH: + retval = cons(pos, num(nfa_machine_match_span(&nfam))); + nfa_machine_cleanup(&nfam); + return retval; + case NFAM_FAIL: + nfa_machine_cleanup(&nfam); + return nil; + } } return nil; @@ -708,8 +742,31 @@ obj_t *search_regex(obj_t *haystack, obj_t *needle_regex, obj_t *start_num, obj_t *match_regex(obj_t *str, obj_t *reg, obj_t *pos) { + nfa_machine_t nfam; + obj_t *i, *retval; + nfam_result_t last_res = NFAM_INCOMPLETE; nfa_t *pnfa = regex_nfa(reg); - long cpos = c_num(pos); - long span = nfa_run(*pnfa, c_str(str) + cpos); - return span >= 0 ? num(span + cpos) : nil; + + nfa_machine_init(&nfam, *pnfa); + + for (i = pos; length_str_gt(str, i); i = plus(i, one)) { + last_res = nfa_machine_feed(&nfam, c_chr(chr_str(str, i))); + if (last_res == NFAM_FAIL) + break; + } + + last_res = nfa_machine_feed(&nfam, 0); + + switch (last_res) { + case NFAM_INCOMPLETE: + case NFAM_MATCH: + retval = plus(pos, num(nfa_machine_match_span(&nfam))); + nfa_machine_cleanup(&nfam); + return retval; + case NFAM_FAIL: + nfa_machine_cleanup(&nfam); + return nil; + } + + return nil; } @@ -95,7 +95,9 @@ typedef struct nfa { nfa_state_t *accept; } nfa_t; -enum nfam_result { NFAM_INCOMPLETE, NFAM_FAIL, NFAM_MATCH }; +typedef enum nfam_result { + NFAM_INCOMPLETE, NFAM_FAIL, NFAM_MATCH +} nfam_result_t; typedef struct nfa_machine { long last_accept_pos; @@ -109,9 +111,10 @@ typedef struct nfa_machine { nfa_t nfa_compile_regex(obj_t *regex); void nfa_free(nfa_t); long nfa_run(nfa_t nfa, const char *str); +void nfa_machine_reset(nfa_machine_t *); void nfa_machine_init(nfa_machine_t *, nfa_t); void nfa_machine_cleanup(nfa_machine_t *); -int nfa_machine_feed(nfa_machine_t *, int ch); +nfam_result_t nfa_machine_feed(nfa_machine_t *, int ch); long nfa_machine_match_span(nfa_machine_t *); obj_t *regex_compile(obj_t *regex_sexp); obj_t *regexp(obj_t *); |