diff options
Diffstat (limited to 'regex.c')
-rw-r--r-- | regex.c | 631 |
1 files changed, 631 insertions, 0 deletions
diff --git a/regex.c b/regex.c new file mode 100644 index 00000000..a48b3ff5 --- /dev/null +++ b/regex.c @@ -0,0 +1,631 @@ +/* Copyright 2009 + * Kaz Kylheku <kkylheku@gmail.com> + * Vancouver, Canada + * All rights reserved. + * + * BSD License: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <dirent.h> +#include "lib.h" +#include "regex.h" + +#define NFA_SET_SIZE 512 + +#define CHAR_SET_INDEX(CH) ((CH) / (sizeof (bitcell_t) * CHAR_BIT)) +#define CHAR_SET_BIT(CH) ((CH) % (sizeof (bitcell_t) * CHAR_BIT)) + +void char_set_clear(char_set_t *set) +{ + static const char_set_t blank = { { 0 } }; + *set = blank; +} + +void char_set_compl(char_set_t *set) +{ + int i; + for (i = 0; i < CHAR_SET_SIZE; i++) + set->bitcell[i] ^= BITCELL_ALL1; +} + +void char_set_add(char_set_t *set, int ch) +{ + set->bitcell[CHAR_SET_INDEX(ch)] |= (1 << CHAR_SET_BIT(ch)); +} + +void char_set_add_range(char_set_t *set, int ch0, int ch1) +{ + if (ch0 <= ch1) { + int i; + int bt0 = CHAR_SET_BIT(ch0); + int bc0 = CHAR_SET_INDEX(ch0); + bitcell_t mask0 = ~((BITCELL_LIT(1) << bt0) - 1); + int bt1 = CHAR_SET_BIT(ch1); + int bc1 = CHAR_SET_INDEX(ch1); + bitcell_t mask1 = ((BITCELL_LIT(1) << (bt1 + 1) % 32) - 1); + + switch (bc1 - bc0) { + case 0: + set->bitcell[bc0] |= (mask0 & mask1); + break; + default: + set->bitcell[bc0] |= mask0; + set->bitcell[bc1] |= mask1; + case 1: + for (i = bc0 + 1; i < bc1; i++) + set->bitcell[i] = BITCELL_ALL1; + break; + } + } +} + +int char_set_contains(char_set_t *set, int ch) +{ + return (set->bitcell[CHAR_SET_INDEX(ch)] & (1 << CHAR_SET_BIT(ch))) != 0; +} + +nfa_state_t *nfa_state_accept(void) +{ + nfa_state_t *st = (nfa_state_t *) chk_malloc(sizeof *st); + st->a.kind = nfa_accept; + st->a.visited = 0; + return st; +} + +nfa_state_t *nfa_state_empty(nfa_state_t *t0, nfa_state_t *t1) +{ + nfa_state_t *st = (nfa_state_t *) chk_malloc(sizeof *st); + st->e.kind = nfa_empty; + st->e.visited = 0; + st->e.trans0 = t0; + st->e.trans1 = t1; + return st; +} + +nfa_state_t *nfa_state_single(nfa_state_t *t, int ch) +{ + nfa_state_t *st = (nfa_state_t *) chk_malloc(sizeof *st); + st->o.kind = nfa_single; + st->o.visited = 0; + st->o.trans = t; + st->o.ch = ch; + return st; +} + +nfa_state_t *nfa_state_wild(nfa_state_t *t) +{ + nfa_state_t *st = (nfa_state_t *) chk_malloc(sizeof *st); + st->o.kind = nfa_wild; + st->o.visited = 0; + st->o.trans = t; + st->o.ch = 0; + return st; +} + +void nfa_state_free(nfa_state_t *st) +{ + if (st->a.kind == nfa_set) + free(st->s.set); + free(st); +} + +void nfa_state_shallow_free(nfa_state_t *st) +{ + free(st); +} + +nfa_state_t *nfa_state_set(nfa_state_t *t) +{ + nfa_state_t *st = (nfa_state_t *) chk_malloc(sizeof *st); + char_set_t *cs = (char_set_t *) chk_malloc(sizeof *cs); + char_set_clear(cs); + st->s.kind = nfa_set; + st->s.visited = 0; + st->s.trans = t; + st->s.set = cs; + return st; +} + +/* + * An acceptance state is converted to an empty transition + * state with specified transitions. It thereby loses + * its acceptance state status. This is used during + * compilation to hook new output paths into an inner NFA, + * either back to itself, or to a new state in the + * surrounding new NFA. + */ +void nfa_state_empty_convert(nfa_state_t *acc, nfa_state_t *t0, nfa_state_t *t1) +{ + assert (acc->a.kind == nfa_accept); + acc->e.kind = nfa_empty; + acc->e.trans0 = t0; + acc->e.trans1 = t1; +} + +/* + * Acceptance state takes on the kind of st, and all associated + * data. I.e. we merge the identity of accept, + * with the contents of st, such that the new state has + * all of the outgoing arrows of st, and + * all of the incoming arrows of acc. + * This is easily done with an assignment, provided + * that st doesn't have any incoming arrows. + * We ensure that start states don't have any incoming + * arrows in the compiler, by ensuring that repetition + * operators terminate their backwards arrows on an + * existing start state, and allocate a new start + * state in front of it. + */ +void nfa_state_merge(nfa_state_t *acc, nfa_state_t *st) +{ + assert (acc->a.kind == nfa_accept); + *acc = *st; +} + +nfa_t nfa_make(nfa_state_t *s, nfa_state_t *acc) +{ + nfa_t ret; + ret.start = s; + ret.accept = acc; + return ret; +} + +/* + * Combine two NFA's representing regexps that are catenated. + * The acceptance state of the predecessor is merged with the start state of + * the successor. + */ +nfa_t nfa_combine(nfa_t pred, nfa_t succ) +{ + nfa_t ret; + ret.start = pred.start; + ret.accept = succ.accept; + nfa_state_merge(pred.accept, succ.start); + nfa_state_shallow_free(succ.start); /* No longer needed. */ + return ret; +} + +nfa_t nfa_compile_set(obj_t *args, int compl) +{ + nfa_state_t *acc = nfa_state_accept(); + nfa_state_t *s = nfa_state_set(acc); + char_set_t *set = s->s.set; + nfa_t ret = nfa_make(s, acc); + + for (; args; args = rest(args)) { + obj_t *item = first(args); + + if (consp(item)) { + obj_t *from = car(item); + obj_t *to = cdr(item); + + assert (typeof(from) == chr_t && typeof(to) == chr_t); + char_set_add_range(set, c_chr(from), c_chr(to)); + } else if (typeof(item) == chr_t) { + char_set_add(set, c_chr(item)); + } else { + assert(0 && "bad regex set"); + } + } + + if (compl) + char_set_compl(set); + + return ret; +} + +/* + * Input is the items from a regex form, + * not including the regex symbol. + * I.e. (rest '(regex ...)) not '(regex ...). + */ +nfa_t nfa_compile_regex(obj_t *items) +{ + if (nullp(items)) { + nfa_state_t *acc = nfa_state_accept(); + nfa_state_t *s = nfa_state_empty(acc, 0); + nfa_t nfa = nfa_make(s, acc); + return nfa; + } else { + obj_t *item = first(items), *others = rest(items); + nfa_t nfa; + + if (typeof(item) == chr_t) { + nfa_state_t *acc = nfa_state_accept(); + nfa_state_t *s = nfa_state_single(acc, c_chr(item)); + nfa = nfa_make(s, acc); + } else if (item == wild) { + nfa_state_t *acc = nfa_state_accept(); + nfa_state_t *s = nfa_state_wild(acc); + nfa = nfa_make(s, acc); + } else if (consp(item)) { + obj_t *sym = first(item); + obj_t *args = rest(item); + + if (sym == set) { + nfa = nfa_compile_set(args, 0); + } else if (sym == cset) { + nfa = nfa_compile_set(args, 1); + } else if (sym == compound) { + nfa = nfa_compile_regex(args); + } else if (sym == zeroplus) { + nfa_t nfa_args = nfa_compile_regex(args); + nfa_state_t *acc = nfa_state_accept(); + /* New start state has empty transitions going through + the inner NFA, or skipping it right to the new acceptance state. */ + nfa_state_t *s = nfa_state_empty(nfa_args.start, acc); + /* Convert acceptance state of inner NFA to one which has + an empty transition back to the start state, and + an empty transition to the new acceptance state. */ + nfa_state_empty_convert(nfa_args.accept, nfa_args.start, acc); + nfa = nfa_make(s, acc); + } else if (sym == oneplus) { + /* One-plus case differs from zero-plus in that the new start state + does not have an empty transition to the acceptance state. + So the inner NFA must be traversed once. */ + nfa_t nfa_args = nfa_compile_regex(args); + nfa_state_t *acc = nfa_state_accept(); + nfa_state_t *s = nfa_state_empty(nfa_args.start, 0); /* <-- diff */ + nfa_state_empty_convert(nfa_args.accept, nfa_args.start, acc); + nfa = nfa_make(s, acc); + } else if (sym == optional) { + /* In this case, we can keep the acceptance state of the inner + NFA as the acceptance state of the new NFA. We simply add + a new start state which can short-circuit to it via an empty + transition. */ + nfa_t nfa_args = nfa_compile_regex(args); + nfa_state_t *s = nfa_state_empty(nfa_args.start, nfa_args.accept); + nfa = nfa_make(s, nfa_args.accept); + } else if (sym == or) { + /* Simple: make a new start and acceptance state, which form + the ends of a spindle that goes through two branches. */ + nfa_t nfa_first = nfa_compile_regex(first(args)); + nfa_t nfa_second = nfa_compile_regex(second(args)); + nfa_state_t *acc = nfa_state_accept(); + /* New state s has empty transitions into each inner NFA. */ + nfa_state_t *s = nfa_state_empty(nfa_first.start, nfa_second.start); + /* Acceptance state of each inner NFA converted to empty + transition to new combined acceptance state. */ + nfa_state_empty_convert(nfa_first.accept, acc, 0); + nfa_state_empty_convert(nfa_second.accept, acc, 0); + nfa = nfa_make(s, acc); + } else { + assert (0 && "internal error: bad operator in regex"); + } + } else { + assert (0 && "internal error: bad regex item"); + } + + /* We made an NFA for the first item, but others follow. + Compile the others to an NFA recursively, then + stick it with this NFA. */ + if (others) { + nfa_t nfa_others = nfa_compile_regex(others); + nfa = nfa_combine(nfa, nfa_others); + } + + return nfa; + } +} + +int nfa_all_states(nfa_state_t **inout, int num, int visited) +{ + int i; + + for (i = 0; i < num; i++) + inout[i]->a.visited = visited; + + for (i = 0; i < num; i++) { + nfa_state_t *s = inout[i]; + + if (num >= NFA_SET_SIZE) + abort(); + + switch (s->a.kind) { + case nfa_accept: + break; + case nfa_empty: + { + nfa_state_t *e0 = s->e.trans0; + nfa_state_t *e1 = s->e.trans1; + + if (e0 && e0->a.visited != visited) { + e0->a.visited = visited; + inout[num++] = e0; + } + if (e1 && e1->a.visited != visited) { + e1->a.visited = visited; + inout[num++] = e1; + } + } + break; + case nfa_wild: + case nfa_single: + case nfa_set: + if (s->o.trans->a.visited != visited) { + s->o.trans->a.visited = visited; + inout[num++] = s->o.trans; + } + break; + } + } + + if (num > NFA_SET_SIZE) + abort(); + + return num; +} + +void nfa_free(nfa_t nfa) +{ + nfa_state_t **all = chk_malloc(NFA_SET_SIZE * sizeof *all); + int nstates, i; + + all[0] = nfa.start; + all[1] = nfa.accept; + + nstates = nfa_all_states(all, 2, nfa.start->a.visited); + + for (i = 0; i < nstates; i++) + nfa_state_free(all[i]); + + free(all); +} + +/* + * Compute the epsilon-closure of the NFA states stored in the set in, whose + * size is given by nin. The results are stored in the set out, the size of + * which is returned. The stack parameter provides storage used by the + * algorithm, so it doesn't have to be allocated and freed repeatedly. + * The visited parameter is a stamp used for marking states which are added + * to the epsilon-closure set, so that sets are not added twice. + * If any of the states added to the closure are acceptance states, + * the accept parameter is used to store the flag 1. + * + * An epsilon-closure is the set of all input states, plus all additional + * states which are reachable from that set with empty (epsilon) transitions. + * (Transitions that don't do not consume and match an input character). + */ +int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin, + nfa_state_t **out, int visited, int *accept) +{ + int i, nout = 0; + int stackp = 0; + + /* First, add all states in the input state to the closure, + push them on the stack, and mark them as visited. */ + for (i = 0; i < nin; i++) { + if (stackp >= NFA_SET_SIZE) + abort(); + in[i]->a.visited = visited; + stack[stackp++] = in[i]; + out[nout++] = in[i]; + if (in[i]->a.kind == nfa_accept) + *accept = 1; + } + + while (stackp) { + nfa_state_t *top = stack[--stackp]; + + if (nout >= NFA_SET_SIZE) + abort(); + + /* Only states of type nfa_empty are interesting. + Each such state at most two epsilon transitions. */ + + if (top->a.kind == nfa_empty) { + nfa_state_t *e0 = top->e.trans0; + nfa_state_t *e1 = top->e.trans1; + + if (e0 && e0->a.visited != visited) { + e0->a.visited = visited; + stack[stackp++] = e0; + out[nout++] = e0; + if (e0->a.kind == nfa_accept) + *accept = 1; + } + + if (e1 && e1->a.visited != visited) { + e1->a.visited = visited; + stack[stackp++] = e1; + out[nout++] = e1; + if (e1->a.kind == nfa_accept) + *accept = 1; + } + } + } + + if (nout > NFA_SET_SIZE) + abort(); + + return nout; +} + +/* + * Compute the move set from a given set of NFA states. The move + * set is the set of states which are reachable from the set of + * input states on the consumpion of the input character given by ch. + */ +int nfa_move(nfa_state_t **in, int nin, nfa_state_t **out, int ch) +{ + int i, nmove; + + for (nmove = 0, i = 0; i < nin; i++) { + nfa_state_t *s = in[i]; + + switch (s->a.kind) { + case nfa_wild: + /* Unconditional match; don't have to look at ch. */ + break; + case nfa_single: + if (s->o.ch == ch) /* Character match. */ + break; + continue; /* no match */ + case nfa_set: + if (char_set_contains(s->s.set, ch)) /* Set match. */ + break; + continue; /* no match */ + default: + /* Epsilon-transition and acceptance states have no character moves. */ + continue; + } + + /* The state matches the character, so add it to the move set. + C trick: all character-transitioning state types have the + pointer to the next state in the same position, + among a common set of leading struct members in the union. */ + + if (nmove >= NFA_SET_SIZE) + abort(); + out[nmove++] = s->o.trans; + } + + return nmove; +} + +/* + * Match regex against the string in. The match is + * anchored to the front of the string; to search + * within the string, a .* must be added to the front + * of the regex. + * + * Returns the length of the prefix of the string + * which matches the regex. Or, if you will, + * the position of the first mismatching + * character. + * + * If the regex does not match at all, zero is + * returned. + * + * Matching stops when a state is reached from which + * there are no transitions on the next input character, + * or when the string runs out of characters. + * The most recently visited acceptance state then + * determines the match length (defaulting to zero + * if no acceptance states were encountered). + */ +long nfa_run(nfa_t nfa, const char *str) +{ + const char *last_accept_pos = 0, *ptr = str; + unsigned visited = nfa.start->a.visited + 1; + nfa_state_t **move = chk_malloc(NFA_SET_SIZE * sizeof *move); + nfa_state_t **clos = chk_malloc(NFA_SET_SIZE * sizeof *clos); + nfa_state_t **stack = chk_malloc(NFA_SET_SIZE * sizeof *stack); + int nmove = 1, nclos; + int accept = 0; + + move[0] = nfa.start; + + nclos = nfa_closure(stack, move, nmove, clos, visited++, &accept); + + if (accept) + last_accept_pos = ptr; + + for (; *ptr != 0; ptr++) { + int ch = *ptr; + + accept = 0; + + nmove = nfa_move(clos, nclos, move, ch); + nclos = nfa_closure(stack, move, nmove, clos, visited++, &accept); + + if (accept) + last_accept_pos = ptr + 1; + + if (nclos == 0) /* dead end; no match */ + break; + } + + nfa.start->a.visited = visited; + + free(stack); + free(clos); + free(move); + + return last_accept_pos ? last_accept_pos - str : -1; +} + +static obj_t *regex_equal(obj_t *self, obj_t *other) +{ + return self == other ? t : nil; /* eq equality only */ +} + +static void regex_destroy(obj_t *regex) +{ + nfa_t *pnfa = (nfa_t *) regex->co.handle; + nfa_free(*pnfa); + free(pnfa); + regex->co.handle = 0; +} + +static struct cobj_ops regex_obj_ops = { + regex_equal, cobj_print_op, regex_destroy +}; + +obj_t *regex_compile(obj_t *regex_sexp) +{ + nfa_t *pnfa = chk_malloc(sizeof *pnfa); + *pnfa = nfa_compile_regex(regex_sexp); + return cobj(pnfa, regex, ®ex_obj_ops); +} + +nfa_t *regex_nfa(obj_t *reg) +{ + assert (reg->co.type == COBJ && reg->co.cls == regex); + return (nfa_t *) reg->co.handle; +} + +obj_t *search_regex(obj_t *haystack, obj_t *needle_regex, obj_t *start_num, + obj_t *from_end) +{ + const char *h = c_str(haystack); + long len = c_num(length_str(haystack)); + long start = c_num(start_num); + nfa_t *pnfa = regex_nfa(needle_regex); + + if (start > len) { + return nil; + } else { + long begin = from_end ? len : start; + long end = from_end ? start - 1 : len + 1; + int incr = from_end ? -1 : 1; + long i; + + for (i = begin; i != end; i += incr) { + long span = nfa_run(*pnfa, h + i); + if (span >= 0) + return cons(num(i), num(span)); + } + + return nil; + } +} + +obj_t *match_regex(obj_t *str, obj_t *reg, obj_t *pos) +{ + nfa_t *pnfa = regex_nfa(reg); + long cpos = c_num(pos); + long span = nfa_run(*pnfa, c_str(str) + cpos); + return span >= 0 ? num(span + cpos) : nil; +} |