summaryrefslogtreecommitdiffstats
path: root/regex.h
diff options
context:
space:
mode:
authorKaz Kylheku <kaz@kylheku.com>2009-11-12 11:44:25 -0800
committerKaz Kylheku <kaz@kylheku.com>2009-11-12 11:44:25 -0800
commitddb0601e8e26255b8b9b536a5e6a47b86c33b011 (patch)
treeab91583911596a3bf0dff90492a65baaf2d1513d /regex.h
parentafbf93478e0a04a12d11dc8933eaa2a779353cb3 (diff)
downloadtxr-ddb0601e8e26255b8b9b536a5e6a47b86c33b011.tar.gz
txr-ddb0601e8e26255b8b9b536a5e6a47b86c33b011.tar.bz2
txr-ddb0601e8e26255b8b9b536a5e6a47b86c33b011.zip
Regular expression module updated to support Unicode character sets.
Most of the changes are in the area of representing sets. Also, a bug was found in the compilation of regex character sets: ranges straddling two adjacent blocks of 32 characters were not being added to the character set. However, ranges falling within a single block of 32 characters, or spanning three or more such blocks, worked properly. This bug is not triggered by common ranges such as A-Z or 0-9, each of which lands within a single 32-character block.
Diffstat (limited to 'regex.h')
-rw-r--r--regex.h69
1 files changed, 57 insertions, 12 deletions
diff --git a/regex.h b/regex.h
index 8deabb01..19d19a7d 100644
--- a/regex.h
+++ b/regex.h
@@ -27,20 +27,65 @@
#include <limits.h>
typedef unsigned int bitcell_t;
+
#define BITCELL_ALL1 UINT_MAX
-#define BITCELL_LIT(NUMTOKEN) NUMTOKEN ## U
+#define CHAR_SET_SIZE (256 / (sizeof (bitcell_t) * CHAR_BIT))
+
+typedef enum {
+ CHSET_SMALL, CHSET_DISPLACED, CHSET_LARGE, CHSET_XLARGE
+} chset_type_t;
+
+typedef bitcell_t cset_L0_t[CHAR_SET_SIZE];
+typedef cset_L0_t *cset_L1_t[16];
+typedef cset_L1_t *cset_L2_t[16];
+typedef cset_L2_t *cset_L3_t[17];
-#define CHAR_SET_SIZE ((UCHAR_MAX + 1) / (sizeof (bitcell_t) * CHAR_BIT))
+struct any_char_set {
+ chset_type_t type : 4;
+ int compl : 2;
+};
+
+struct small_char_set {
+ chset_type_t type : 4;
+ int compl : 2;
+ cset_L0_t bitcell;
+};
-typedef struct char_set {
- bitcell_t bitcell[CHAR_SET_SIZE];
+struct displaced_char_set {
+ chset_type_t type : 4;
+ int compl : 2;
+ cset_L0_t bitcell;
+ wchar_t base;
+};
+
+
+struct large_char_set {
+ chset_type_t type : 4;
+ int inv : 2;
+ cset_L2_t dir;
+};
+
+struct xlarge_char_set {
+ chset_type_t type : 4;
+ int inv : 2;
+ cset_L3_t dir;
+};
+
+typedef union char_set {
+ struct any_char_set any;
+ struct small_char_set s;
+ struct displaced_char_set d;
+ struct large_char_set l;
+ struct xlarge_char_set xl;
} char_set_t;
-void char_set_clear(char_set_t *);
+char_set_t *char_set_create(chset_type_t, wchar_t);
+void char_set_destroy(char_set_t *);
+
void char_set_compl(char_set_t *);
-void char_set_add(char_set_t *, int);
-void char_set_add_range(char_set_t *, int, int); /* inclusive */
-int char_set_contains(char_set_t *, int);
+void char_set_add(char_set_t *, wchar_t);
+void char_set_add_range(char_set_t *, wchar_t, wchar_t); /* inclusive */
+int char_set_contains(char_set_t *, wchar_t);
typedef enum {
nfa_accept, nfa_empty, nfa_wild, nfa_single, nfa_set
@@ -64,7 +109,7 @@ struct nfa_state_single {
nfa_kind_t kind;
unsigned visited;
nfa_state_t *trans;
- int ch;
+ wchar_t ch;
};
struct nfa_state_set {
@@ -83,9 +128,9 @@ union nfa_state {
nfa_state_t *nfa_state_accept(void);
nfa_state_t *nfa_state_empty(nfa_state_t *, nfa_state_t *);
-nfa_state_t *nfa_state_single(nfa_state_t *, int ch);
+nfa_state_t *nfa_state_single(nfa_state_t *, wchar_t ch);
nfa_state_t *nfa_state_wild(nfa_state_t *);
-nfa_state_t *nfa_state_set(nfa_state_t *);
+nfa_state_t *nfa_state_set(nfa_state_t *, char_set_t *);
void nfa_state_free(nfa_state_t *st);
void nfa_state_shallow_free(nfa_state_t *st);
void nfa_state_merge(nfa_state_t *accept, nfa_state_t *);
@@ -114,7 +159,7 @@ long nfa_run(nfa_t nfa, const wchar_t *str);
void nfa_machine_reset(nfa_machine_t *);
void nfa_machine_init(nfa_machine_t *, nfa_t);
void nfa_machine_cleanup(nfa_machine_t *);
-nfam_result_t nfa_machine_feed(nfa_machine_t *, int ch);
+nfam_result_t nfa_machine_feed(nfa_machine_t *, wchar_t ch);
long nfa_machine_match_span(nfa_machine_t *);
obj_t *regex_compile(obj_t *regex_sexp);
obj_t *regexp(obj_t *);