diff options
author | Kaz Kylheku <kaz@kylheku.com> | 2009-11-12 11:44:25 -0800 |
---|---|---|
committer | Kaz Kylheku <kaz@kylheku.com> | 2009-11-12 11:44:25 -0800 |
commit | ddb0601e8e26255b8b9b536a5e6a47b86c33b011 (patch) | |
tree | ab91583911596a3bf0dff90492a65baaf2d1513d | |
parent | afbf93478e0a04a12d11dc8933eaa2a779353cb3 (diff) | |
download | txr-ddb0601e8e26255b8b9b536a5e6a47b86c33b011.tar.gz txr-ddb0601e8e26255b8b9b536a5e6a47b86c33b011.tar.bz2 txr-ddb0601e8e26255b8b9b536a5e6a47b86c33b011.zip |
Regular expression module updated to do unicode character sets.
Most of the changes are in the area of representing sets.
Also, a bug was found in the compilation of regex character sets:
ranges straddling two adjacent blocks of 32 characters were
not being added to the character set. However, ranges falling
within a single 32 block, or spanning three or more such blocks,
worked properly. This bug is not tickled by common ranges
such as A-Z, or 0-9, which land within a 32 block.
-rw-r--r-- | ChangeLog | 54 | ||||
-rw-r--r-- | regex.c | 482 | ||||
-rw-r--r-- | regex.h | 69 |
3 files changed, 544 insertions, 61 deletions
@@ -1,3 +1,57 @@ +2009-11-12 Kaz Kylheku <kkylheku@gmail.com> + + Regular expression module updated to do unicode character sets. + Most of the changes are in the area of representing sets. + + Also, a bug was found in the compilation of regex character sets: + ranges straddling two adjacent blocks of 32 characters were + not being added to the character set. However, ranges falling + within a single 32 block, or spanning three or more such blocks, + worked properly. This bug is not tickled by common ranges + such as A-Z, or 0-9, which land within a 32 block. + + * regex.h (BITCELL_LIT): Macro removed. + (CHAR_SET_SIZE): Macro does not depend on UCHAR_MAX any more, + but hard-codes a set size of 256. UCHAR_MAX means nothing to us any + more since we are using wchar_t. The number 256 is simply an + arbitrarily chosen size for representing the small character + sets (or the leaves of the radix tree for representing large sets). + (chset_type_t): New enum typedef. + (cset_L0_t, cset_L1_t, cset_L2_t, cset_L3_t): New array typedefs. + (struct char_set): Replaced by union char_set. + (struct any_char_set, struct small_char_set, struct displaced_char_set, + struct large_char_set, struct xlarge_char_set): New struct types. + (char_set_clear): Declaration removed. + (char_set_create, char_set_destroy): Declared. + (char_set_add, char_set_add_range, char_set_contains, + nfa_state_single, nfa_state_set, nfa_machine_feed): Declarations + updated for wchar_t. + (struct nfa_state_single): member ch changed to wchar_t. + + * regex.c (char_set_clear): Function removed. + (CHAR_SET_L0, CHAR_SET_L1, CHAR_SET_L2, CHAR_SET_L3, CHAR_SET_L2_L0, + CHAR_SET_L2_HI, CHAR_SET_L1_L0, CHAR_SET_L1_HI, CHAR_SET_L0_L0, + CHAR_SET_L0_HI): New macros. + (L0_full, L0_fill_range, L0_contains, L1_full, L1_fill_range, + L1_contains, L1_free, L2_full, L2_fill_range, L2_contains, + L2_free, L3_fill_range, L3_contains, char_set_create, + char_set_destroy): New functions. + (char_set_compl): Works using a flag rather than by actually + computing a complemented set. Also, is no longer a toggle (and + was never used that way). + (char_set_add, char_set_add_range, char_set_contains): Polymorphic over + the different set types. + (nfa_state_single, nfa_move, nfa_run, nfa_machine_feed): Converted + to wchar_t. + (nfa_state_free): Use char_set_destroy to free set. + (nfa_state_set): Does not construct the set internally but + takes it as a parameter. + (nfa_compile_set): Rewritten to perform two passes over the + s-expression representing the list of characters and ranges + making up the set. The first pass determines what representation + will be used for the set. The second pass stuffs the characters and + ranges into the set. + 2009-11-11 Kaz Kylheku <kkylheku@gmail.com> * txr.c (main): call setlocale to set the LC_CTYPE to en_US.UTF-8, @@ -26,6 +26,8 @@ #include <stdio.h> #include <stdlib.h> +#include <string.h> +#include <wchar.h> #include <assert.h> #include <dirent.h> #include <setjmp.h> @@ -39,53 +41,394 @@ #define CHAR_SET_INDEX(CH) ((CH) / (sizeof (bitcell_t) * CHAR_BIT)) #define CHAR_SET_BIT(CH) ((CH) % (sizeof (bitcell_t) * CHAR_BIT)) -void char_set_clear(char_set_t *set) +#define CHAR_SET_L0(CH) ((CH) & 0xFF) +#define CHAR_SET_L1(CH) (((CH) >> 8) & 0xF) +#define CHAR_SET_L2(CH) (((CH) >> 12) & 0xF) +#define CHAR_SET_L3(CH) (((CH) >> 16) & 0x1F) + +#define CHAR_SET_L2_LO(CH) ((CH) & (~(wchar_t) 0xFFFF)) +#define CHAR_SET_L2_HI(CH) ((CH) | ((wchar_t) 0xFFFF)) + +#define CHAR_SET_L1_LO(CH) ((CH) & (~(wchar_t) 0xFFF)) +#define CHAR_SET_L1_HI(CH) ((CH) | ((wchar_t) 0xFFF)) + +#define CHAR_SET_L0_LO(CH) ((CH) & (~(wchar_t) 0xFF)) +#define CHAR_SET_L0_HI(CH) ((CH) | ((wchar_t) 0xFF)) + +static int L0_full(cset_L0_t *L0) { - static const char_set_t blank = { { 0 } }; - *set = blank; + int i; + + for (i = 0; i < CHAR_SET_SIZE; i++) + if ((*L0)[i] != ((bitcell_t) -1)) + return 0; + return 1; } -void char_set_compl(char_set_t *set) +void L0_fill_range(cset_L0_t *L0, wchar_t ch0, wchar_t ch1) { int i; - for (i = 0; i < CHAR_SET_SIZE; i++) - set->bitcell[i] ^= BITCELL_ALL1; + int bt0 = CHAR_SET_BIT(ch0); + int bc0 = CHAR_SET_INDEX(ch0); + bitcell_t mask0 = ~(((bitcell_t) 1 << bt0) - 1); + int bt1 = CHAR_SET_BIT(ch1); + int bc1 = CHAR_SET_INDEX(ch1); + bitcell_t mask1 = (((bitcell_t) 1 << (bt1 + 1) % 32) - 1); + + if (bc1 == bc0) { + (*L0)[bc0] |= (mask0 & mask1); + } else { + (*L0)[bc0] |= mask0; + (*L0)[bc1] |= mask1; + for (i = bc0 + 1; i < bc1; i++) + (*L0)[i] = ((bitcell_t) -1); + } } -void char_set_add(char_set_t *set, int ch) +int L0_contains(cset_L0_t *L0, wchar_t ch) { - set->bitcell[CHAR_SET_INDEX(ch)] |= (1 << CHAR_SET_BIT(ch)); + return ((*L0)[CHAR_SET_INDEX(ch)] & (1 << CHAR_SET_BIT(ch))) != 0; } -void char_set_add_range(char_set_t *set, int ch0, int ch1) +int L1_full(cset_L1_t *L1) { - if (ch0 <= ch1) { - int i; - int bt0 = CHAR_SET_BIT(ch0); - int bc0 = CHAR_SET_INDEX(ch0); - bitcell_t mask0 = ~((BITCELL_LIT(1) << bt0) - 1); - int bt1 = CHAR_SET_BIT(ch1); - int bc1 = CHAR_SET_INDEX(ch1); - bitcell_t mask1 = ((BITCELL_LIT(1) << (bt1 + 1) % 32) - 1); + int i; + for (i = 0; i < 16; i++) + if ((*L1)[i] != (cset_L0_t *) -1) + return 0; + return 1; +} - switch (bc1 - bc0) { - case 0: - set->bitcell[bc0] |= (mask0 & mask1); - break; - default: - set->bitcell[bc0] |= mask0; - set->bitcell[bc1] |= mask1; - case 1: - for (i = bc0 + 1; i < bc1; i++) - set->bitcell[i] = BITCELL_ALL1; - break; +void L1_fill_range(cset_L1_t *L1, wchar_t ch0, wchar_t ch1) +{ + int i1, i10, i11; + + i10 = CHAR_SET_L1(ch0); + i11 = CHAR_SET_L1(ch1); + + for (i1 = i10; i1 <= i11; i1++) { + wchar_t c0 = 0, c1 = 0; + cset_L0_t *L0; + + if (i1 > i10 && i1 < i11) { + free((*L1)[i1]); + (*L1)[i1] = (cset_L0_t *) -1; + continue; + } else if (i10 == i11) { + c0 = ch0; + c1 = ch1; + } else if (i1 == i10) { + c0 = ch0; + c1 = CHAR_SET_L0_HI(ch0); + } else if (i1 == i11) { + c0 = CHAR_SET_L0_LO(ch1); + c1 = ch1; + } + + if ((L0 = (*L1)[i1]) == (cset_L0_t *) -1) + continue; + + if (L0 == 0) { + static const cset_L0_t blank; + L0 = (*L1)[i1] = (cset_L0_t *) chk_malloc(sizeof *L0); + memcpy(L0, &blank, sizeof *L0); + } + + L0_fill_range(L0, CHAR_SET_L0(c0), CHAR_SET_L0(c1)); + + if (L0_full(L0)) { + free(L0); + (*L1)[i1] = (cset_L0_t *) -1; + } + } +} + +int L1_contains(cset_L1_t *L1, wchar_t ch) +{ + int i1 = CHAR_SET_L1(ch); + cset_L0_t *L0 = (*L1)[i1]; + + if (L0 == 0) + return 0; + else if (L0 == (cset_L0_t *) -1) + return 1; + else + return L0_contains(L0, CHAR_SET_L0(ch)); +} + + +void L1_free(cset_L1_t *L1) +{ + int i1; + + if (L1 == (cset_L1_t *) -1) + return; + + for (i1 = 0; i1 < 16; i1++) + if ((*L1)[i1] != (cset_L0_t *) -1) + free((*L1)[i1]); +} + +int L2_full(cset_L2_t *L2) +{ + int i; + for (i = 0; i < 16; i++) + if ((*L2)[i] != (cset_L1_t *) -1) + return 0; + return 1; +} + +void L2_fill_range(cset_L2_t *L2, wchar_t ch0, wchar_t ch1) +{ + int i2, i20, i21; + + i20 = CHAR_SET_L2(ch0); + i21 = CHAR_SET_L2(ch1); + + for (i2 = i20; i2 <= i21; i2++) { + wchar_t c0 = 0, c1 = 0; + cset_L1_t *L1; + + if (i2 > i20 && i2 < i21) { + free((*L2)[i2]); + (*L2)[i2] = (cset_L1_t *) -1; + continue; + } else if (i20 == i21) { + c0 = ch0; + c1 = ch1; + } else if (i2 == i20) { + c0 = ch0; + c1 = CHAR_SET_L1_HI(ch0); + } else if (i2 == i21) { + c0 = CHAR_SET_L1_LO(ch1); + c1 = ch1; + } + + if ((L1 = (*L2)[i2]) == (cset_L1_t *) -1) + continue; + + if (L1 == 0) { + static const cset_L1_t blank; + L1 = (*L2)[i2] = (cset_L1_t *) chk_malloc(sizeof *L1); + memcpy(L1, &blank, sizeof *L1); + } + + L1_fill_range(L1, c0, c1); + + if (L1_full(L1)) { + free(L1); + (*L2)[i2] = (cset_L1_t *) -1; } } } -int char_set_contains(char_set_t *set, int ch) +int L2_contains(cset_L2_t *L2, wchar_t ch) +{ + int i2 = CHAR_SET_L2(ch); + cset_L1_t *L1 = (*L2)[i2]; + + if (L1 == 0) + return 0; + else if (L1 == (cset_L1_t *) -1) + return 1; + else + return L1_contains(L1, ch); +} + +void L2_free(cset_L2_t *L2) { - return (set->bitcell[CHAR_SET_INDEX(ch)] & (1 << CHAR_SET_BIT(ch))) != 0; + int i2; + + for (i2 = 0; i2 < 16; i2++) { + cset_L1_t *L1 = (*L2)[i2]; + if (L1 != 0 && L1 != (cset_L1_t *) -1) { + L1_free((*L2)[i2]); + free((*L2)[i2]); + } + } +} + +void L3_fill_range(cset_L3_t *L3, wchar_t ch0, wchar_t ch1) +{ + int i3, i30, i31; + + i30 = CHAR_SET_L3(ch0); + i31 = CHAR_SET_L3(ch1); + + for (i3 = i30; i3 <= i31; i3++) { + wchar_t c0 = 0, c1 = 0; + cset_L2_t *L2; + + if (i3 > i30 && i3 < i31) { + free((*L3)[i3]); + (*L3)[i3] = (cset_L2_t *) -1; + continue; + } else if (i30 == i31) { + c0 = ch0; + c1 = ch1; + } else if (i3 == i30) { + c0 = ch0; + c1 = CHAR_SET_L2_HI(ch0); + } else if (i3 == i31) { + c0 = CHAR_SET_L2_LO(ch1); + c1 = ch1; + } + + if ((L2 = (*L3)[i3]) == (cset_L2_t *) -1) + continue; + + if (L2 == 0) { + static const cset_L2_t blank; + L2 = (*L3)[i3] = (cset_L2_t *) chk_malloc(sizeof *L2); + memcpy(L2, &blank, sizeof *L2); + } + + L2_fill_range(L2, c0, c1); + if (L2_full(L2)) { + free(L2); + (*L3)[i3] = (cset_L2_t *) -1; + } + } +} + +int L3_contains(cset_L3_t *L3, wchar_t ch) +{ + int i3 = CHAR_SET_L3(ch); + cset_L2_t *L2 = (*L3)[i3]; + + if (L2 == 0) + return 0; + else if (L2 == (cset_L2_t *) -1) + return 1; + else + return L2_contains(L2, ch); +} + +void L3_free(cset_L3_t *L3) +{ + int i3; + + for (i3 = 0; i3 < 17; i3++) { + cset_L2_t *L2 = (*L3)[i3]; + if (L2 != 0 && L2 != (cset_L2_t *) -1) { + L2_free((*L3)[i3]); + free((*L3)[i3]); + } + } +} + +char_set_t *char_set_create(chset_type_t type, wchar_t base) +{ + static const char_set_t blank; + char_set_t *cs = (char_set_t *) chk_malloc(sizeof *cs); + *cs = blank; + cs->any.type = type; + + if (type == CHSET_DISPLACED) + cs->d.base = base; + + return cs; +} + +void char_set_destroy(char_set_t *set) +{ + switch (set->any.type) { + case CHSET_DISPLACED: + case CHSET_SMALL: + free(set); + break; + case CHSET_LARGE: + L2_free(&set->l.dir); + free(set); + break; + case CHSET_XLARGE: + L3_free(&set->xl.dir); + free(set); + break; + } +} + +void char_set_compl(char_set_t *set) +{ + set->any.compl = 1; +} + +void char_set_add(char_set_t *set, wchar_t ch) +{ + switch (set->any.type) { + case CHSET_DISPLACED: + assert (ch >= set->d.base && ch < set->d.base + 256); + ch -= set->d.base; + /* fallthrough */ + case CHSET_SMALL: + assert (ch < 256); + set->s.bitcell[CHAR_SET_INDEX(ch)] |= (1 << CHAR_SET_BIT(ch)); + break; + case CHSET_LARGE: + assert (ch < 0x10000); + L2_fill_range(&set->l.dir, ch, ch); + break; + case CHSET_XLARGE: + assert (ch < 0x110000); + L3_fill_range(&set->xl.dir, ch, ch); + break; + } +} + +void char_set_add_range(char_set_t *set, wchar_t ch0, wchar_t ch1) +{ + if (ch0 >= ch1) + return; + + switch (set->any.type) { + case CHSET_DISPLACED: + assert (ch0 >= set->d.base && ch1 < set->d.base + 256); + ch0 -= set->d.base; + ch1 -= set->d.base; + /* fallthrough */ + case CHSET_SMALL: + assert (ch1 < 256); + L0_fill_range(&set->s.bitcell, ch0, ch1); + break; + case CHSET_LARGE: + assert (ch1 < 0x10000); + L2_fill_range(&set->l.dir, ch0, ch1); + break; + case CHSET_XLARGE: + assert (ch1 < 0x110000); + L3_fill_range(&set->xl.dir, ch0, ch1); + break; + } +} + +int char_set_contains(char_set_t *set, wchar_t ch) +{ + int result = 0; + + switch (set->any.type) { + case CHSET_DISPLACED: + if (ch < set->d.base) + break; + ch -= set->d.base; + /* fallthrough */ + case CHSET_SMALL: + if (ch >= 256) + break; + result = L0_contains(&set->s.bitcell, ch); + break; + case CHSET_LARGE: + if (ch >= 0x10000) + break; + result = L2_contains(&set->l.dir, ch); + break; + case CHSET_XLARGE: + if (ch >= 0x110000) + break; + result = L3_contains(&set->xl.dir, ch); + break; + } + + return set->any.compl ? !result : result; } nfa_state_t *nfa_state_accept(void) @@ -106,7 +449,7 @@ nfa_state_t *nfa_state_empty(nfa_state_t *t0, nfa_state_t *t1) return st; } -nfa_state_t *nfa_state_single(nfa_state_t *t, int ch) +nfa_state_t *nfa_state_single(nfa_state_t *t, wchar_t ch) { nfa_state_t *st = (nfa_state_t *) chk_malloc(sizeof *st); st->o.kind = nfa_single; @@ -129,7 +472,7 @@ nfa_state_t *nfa_state_wild(nfa_state_t *t) void nfa_state_free(nfa_state_t *st) { if (st->a.kind == nfa_set) - free(st->s.set); + char_set_destroy(st->s.set); free(st); } @@ -138,11 +481,9 @@ void nfa_state_shallow_free(nfa_state_t *st) free(st); } -nfa_state_t *nfa_state_set(nfa_state_t *t) +nfa_state_t *nfa_state_set(nfa_state_t *t, char_set_t *cs) { nfa_state_t *st = (nfa_state_t *) chk_malloc(sizeof *st); - char_set_t *cs = (char_set_t *) chk_malloc(sizeof *cs); - char_set_clear(cs); st->s.kind = nfa_set; st->s.visited = 0; st->s.trans = t; @@ -211,31 +552,74 @@ nfa_t nfa_combine(nfa_t pred, nfa_t succ) nfa_t nfa_compile_set(obj_t *args, int compl) { - nfa_state_t *acc = nfa_state_accept(); - nfa_state_t *s = nfa_state_set(acc); - char_set_t *set = s->s.set; - nfa_t ret = nfa_make(s, acc); + obj_t *iter; + wchar_t min = WCHAR_MAX; + wchar_t max = 0; + chset_type_t cst; - for (; args; args = rest(args)) { - obj_t *item = first(args); + for (iter = args; iter; iter = rest(iter)) { + obj_t *item = first(iter); if (consp(item)) { obj_t *from = car(item); obj_t *to = cdr(item); assert (typeof(from) == chr_t && typeof(to) == chr_t); - char_set_add_range(set, c_chr(from), c_chr(to)); + + if (c_chr(from) < min) + min = c_chr(from); + if (c_chr(from) > max) + max = c_chr(from); + + if (c_chr(to) < min) + min = c_chr(to); + if (c_chr(to) > max) + max = c_chr(to); } else if (typeof(item) == chr_t) { - char_set_add(set, c_chr(item)); + if (c_chr(item) < min) + min = c_chr(item); + if (c_chr(item) > max) + max = c_chr(item); } else { assert(0 && "bad regex set"); } } - if (compl) - char_set_compl(set); + if (max < 0x100) + cst = CHSET_SMALL; + else if (max - min < 0x100) + cst = CHSET_DISPLACED; + else if (max < 0x10000) + cst = CHSET_LARGE; + else + cst = CHSET_XLARGE; + + { + char_set_t *set = char_set_create(cst, min); + nfa_state_t *acc = nfa_state_accept(); + nfa_state_t *s = nfa_state_set(acc, set); + nfa_t ret = nfa_make(s, acc); + + for (iter = args; iter; iter = rest(iter)) { + obj_t *item = first(iter); - return ret; + if (consp(item)) { + obj_t *from = car(item); + obj_t *to = cdr(item); + + assert (typeof(from) == chr_t && typeof(to) == chr_t); + char_set_add_range(set, c_chr(from), c_chr(to)); + } else if (typeof(item) == chr_t) { + char_set_add(set, c_chr(item)); + } else { + assert(0 && "bad regex set"); + } + } + + if (compl) + char_set_compl(set); + return ret; + } } /* @@ -470,7 +854,7 @@ int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin, * set is the set of states which are reachable from the set of * input states on the consumpion of the input character given by ch. */ -int nfa_move(nfa_state_t **in, int nin, nfa_state_t **out, int ch) +int nfa_move(nfa_state_t **in, int nin, nfa_state_t **out, wchar_t ch) { int i, nmove; @@ -546,7 +930,7 @@ long nfa_run(nfa_t nfa, const wchar_t *str) last_accept_pos = ptr; for (; *ptr != 0; ptr++) { - int ch = *ptr; + wchar_t ch = *ptr; accept = 0; @@ -618,7 +1002,7 @@ void nfa_machine_cleanup(nfa_machine_t *nfam) nfam->nfa.accept = 0; } -nfam_result_t nfa_machine_feed(nfa_machine_t *nfam, int ch) +nfam_result_t nfa_machine_feed(nfa_machine_t *nfam, wchar_t ch) { int accept = 0; @@ -27,20 +27,65 @@ #include <limits.h> typedef unsigned int bitcell_t; + #define BITCELL_ALL1 UINT_MAX -#define BITCELL_LIT(NUMTOKEN) NUMTOKEN ## U +#define CHAR_SET_SIZE (256 / (sizeof (bitcell_t) * CHAR_BIT)) + +typedef enum { + CHSET_SMALL, CHSET_DISPLACED, CHSET_LARGE, CHSET_XLARGE +} chset_type_t; + +typedef bitcell_t cset_L0_t[CHAR_SET_SIZE]; +typedef cset_L0_t *cset_L1_t[16]; +typedef cset_L1_t *cset_L2_t[16]; +typedef cset_L2_t *cset_L3_t[17]; -#define CHAR_SET_SIZE ((UCHAR_MAX + 1) / (sizeof (bitcell_t) * CHAR_BIT)) +struct any_char_set { + chset_type_t type : 4; + int compl : 2; +}; + +struct small_char_set { + chset_type_t type : 4; + int compl : 2; + cset_L0_t bitcell; +}; -typedef struct char_set { - bitcell_t bitcell[CHAR_SET_SIZE]; +struct displaced_char_set { + chset_type_t type : 4; + int compl : 2; + cset_L0_t bitcell; + wchar_t base; +}; + + +struct large_char_set { + chset_type_t type : 4; + int inv : 2; + cset_L2_t dir; +}; + +struct xlarge_char_set { + chset_type_t type : 4; + int inv : 2; + cset_L3_t dir; +}; + +typedef union char_set { + struct any_char_set any; + struct small_char_set s; + struct displaced_char_set d; + struct large_char_set l; + struct xlarge_char_set xl; } char_set_t; -void char_set_clear(char_set_t *); +char_set_t *char_set_create(chset_type_t, wchar_t); +void char_set_destroy(char_set_t *); + void char_set_compl(char_set_t *); -void char_set_add(char_set_t *, int); -void char_set_add_range(char_set_t *, int, int); /* inclusive */ -int char_set_contains(char_set_t *, int); +void char_set_add(char_set_t *, wchar_t); +void char_set_add_range(char_set_t *, wchar_t, wchar_t); /* inclusive */ +int char_set_contains(char_set_t *, wchar_t); typedef enum { nfa_accept, nfa_empty, nfa_wild, nfa_single, nfa_set @@ -64,7 +109,7 @@ struct nfa_state_single { nfa_kind_t kind; unsigned visited; nfa_state_t *trans; - int ch; + wchar_t ch; }; struct nfa_state_set { @@ -83,9 +128,9 @@ union nfa_state { nfa_state_t *nfa_state_accept(void); nfa_state_t *nfa_state_empty(nfa_state_t *, nfa_state_t *); -nfa_state_t *nfa_state_single(nfa_state_t *, int ch); +nfa_state_t *nfa_state_single(nfa_state_t *, wchar_t ch); nfa_state_t *nfa_state_wild(nfa_state_t *); -nfa_state_t *nfa_state_set(nfa_state_t *); +nfa_state_t *nfa_state_set(nfa_state_t *, char_set_t *); void nfa_state_free(nfa_state_t *st); void nfa_state_shallow_free(nfa_state_t *st); void nfa_state_merge(nfa_state_t *accept, nfa_state_t *); @@ -114,7 +159,7 @@ long nfa_run(nfa_t nfa, const wchar_t *str); void nfa_machine_reset(nfa_machine_t *); void nfa_machine_init(nfa_machine_t *, nfa_t); void nfa_machine_cleanup(nfa_machine_t *); -nfam_result_t nfa_machine_feed(nfa_machine_t *, int ch); +nfam_result_t nfa_machine_feed(nfa_machine_t *, wchar_t ch); long nfa_machine_match_span(nfa_machine_t *); obj_t *regex_compile(obj_t *regex_sexp); obj_t *regexp(obj_t *); |