diff options
author | Kaz Kylheku <kaz@kylheku.com> | 2009-11-12 11:44:25 -0800 |
---|---|---|
committer | Kaz Kylheku <kaz@kylheku.com> | 2009-11-12 11:44:25 -0800 |
commit | ddb0601e8e26255b8b9b536a5e6a47b86c33b011 (patch) | |
tree | ab91583911596a3bf0dff90492a65baaf2d1513d /regex.h | |
parent | afbf93478e0a04a12d11dc8933eaa2a779353cb3 (diff) | |
download | txr-ddb0601e8e26255b8b9b536a5e6a47b86c33b011.tar.gz txr-ddb0601e8e26255b8b9b536a5e6a47b86c33b011.tar.bz2 txr-ddb0601e8e26255b8b9b536a5e6a47b86c33b011.zip |
Regular expression module updated to support Unicode character sets.
Most of the changes are in the area of representing sets.
Also, a bug was found in the compilation of regex character sets:
ranges straddling two adjacent blocks of 32 characters were
not being added to the character set. However, ranges falling
within a single 32-character block, or spanning three or more such
blocks, worked properly. This bug is not tickled by common ranges
such as A-Z or 0-9, each of which lands within a single 32-character block.
Diffstat (limited to 'regex.h')
-rw-r--r-- | regex.h | 69 |
1 files changed, 57 insertions, 12 deletions
@@ -27,20 +27,65 @@ #include <limits.h> typedef unsigned int bitcell_t; + #define BITCELL_ALL1 UINT_MAX -#define BITCELL_LIT(NUMTOKEN) NUMTOKEN ## U +#define CHAR_SET_SIZE (256 / (sizeof (bitcell_t) * CHAR_BIT)) + +typedef enum { + CHSET_SMALL, CHSET_DISPLACED, CHSET_LARGE, CHSET_XLARGE +} chset_type_t; + +typedef bitcell_t cset_L0_t[CHAR_SET_SIZE]; +typedef cset_L0_t *cset_L1_t[16]; +typedef cset_L1_t *cset_L2_t[16]; +typedef cset_L2_t *cset_L3_t[17]; -#define CHAR_SET_SIZE ((UCHAR_MAX + 1) / (sizeof (bitcell_t) * CHAR_BIT)) +struct any_char_set { + chset_type_t type : 4; + int compl : 2; +}; + +struct small_char_set { + chset_type_t type : 4; + int compl : 2; + cset_L0_t bitcell; +}; -typedef struct char_set { - bitcell_t bitcell[CHAR_SET_SIZE]; +struct displaced_char_set { + chset_type_t type : 4; + int compl : 2; + cset_L0_t bitcell; + wchar_t base; +}; + + +struct large_char_set { + chset_type_t type : 4; + int inv : 2; + cset_L2_t dir; +}; + +struct xlarge_char_set { + chset_type_t type : 4; + int inv : 2; + cset_L3_t dir; +}; + +typedef union char_set { + struct any_char_set any; + struct small_char_set s; + struct displaced_char_set d; + struct large_char_set l; + struct xlarge_char_set xl; } char_set_t; -void char_set_clear(char_set_t *); +char_set_t *char_set_create(chset_type_t, wchar_t); +void char_set_destroy(char_set_t *); + void char_set_compl(char_set_t *); -void char_set_add(char_set_t *, int); -void char_set_add_range(char_set_t *, int, int); /* inclusive */ -int char_set_contains(char_set_t *, int); +void char_set_add(char_set_t *, wchar_t); +void char_set_add_range(char_set_t *, wchar_t, wchar_t); /* inclusive */ +int char_set_contains(char_set_t *, wchar_t); typedef enum { nfa_accept, nfa_empty, nfa_wild, nfa_single, nfa_set @@ -64,7 +109,7 @@ struct nfa_state_single { nfa_kind_t kind; unsigned visited; nfa_state_t *trans; - int ch; + wchar_t ch; }; struct nfa_state_set { @@ -83,9 +128,9 @@ union nfa_state { nfa_state_t *nfa_state_accept(void); 
nfa_state_t *nfa_state_empty(nfa_state_t *, nfa_state_t *); -nfa_state_t *nfa_state_single(nfa_state_t *, int ch); +nfa_state_t *nfa_state_single(nfa_state_t *, wchar_t ch); nfa_state_t *nfa_state_wild(nfa_state_t *); -nfa_state_t *nfa_state_set(nfa_state_t *); +nfa_state_t *nfa_state_set(nfa_state_t *, char_set_t *); void nfa_state_free(nfa_state_t *st); void nfa_state_shallow_free(nfa_state_t *st); void nfa_state_merge(nfa_state_t *accept, nfa_state_t *); @@ -114,7 +159,7 @@ long nfa_run(nfa_t nfa, const wchar_t *str); void nfa_machine_reset(nfa_machine_t *); void nfa_machine_init(nfa_machine_t *, nfa_t); void nfa_machine_cleanup(nfa_machine_t *); -nfam_result_t nfa_machine_feed(nfa_machine_t *, int ch); +nfam_result_t nfa_machine_feed(nfa_machine_t *, wchar_t ch); long nfa_machine_match_span(nfa_machine_t *); obj_t *regex_compile(obj_t *regex_sexp); obj_t *regexp(obj_t *); |