Regular expression module updated to support Unicode character sets.

Most of the changes are in the area of representing sets. Also, a bug was found in the compilation of regex character sets: ranges straddling two adjacent blocks of 32 characters were not being added to the character set. However, ranges falling within a single 32-character block, or spanning three or more such blocks, worked properly. This bug is not triggered by common ranges such as A-Z or 0-9, each of which lands within a single 32-character block.
author: Kaz Kylheku <kaz@kylheku.com> 2009-11-12 11:44:25 -0800
committer: Kaz Kylheku <kaz@kylheku.com> 2009-11-12 11:44:25 -0800
commit: ddb0601e8e26255b8b9b536a5e6a47b86c33b011 (patch)
tree: ab91583911596a3bf0dff90492a65baaf2d1513d /regex.h
parent: afbf93478e0a04a12d11dc8933eaa2a779353cb3 (diff)
download: txr-ddb0601e8e26255b8b9b536a5e6a47b86c33b011.tar.gz
txr-ddb0601e8e26255b8b9b536a5e6a47b86c33b011.tar.bz2
txr-ddb0601e8e26255b8b9b536a5e6a47b86c33b011.zip
1 files changed, 57 insertions, 12 deletions
diff --git a/regex.h b/regex.h
index 8deabb01..19d19a7d 100644
--- a/regex.h
+++ b/regex.h
@@ -27,20 +27,65 @@
 #include <limits.h>
 
 typedef unsigned int bitcell_t;
+
 #define BITCELL_ALL1 UINT_MAX
-#define BITCELL_LIT(NUMTOKEN) NUMTOKEN ## U
+#define CHAR_SET_SIZE (256 / (sizeof (bitcell_t) * CHAR_BIT))
+
+typedef enum {
+  CHSET_SMALL, CHSET_DISPLACED, CHSET_LARGE, CHSET_XLARGE
+} chset_type_t;
+
+typedef bitcell_t cset_L0_t[CHAR_SET_SIZE];
+typedef cset_L0_t *cset_L1_t[16];
+typedef cset_L1_t *cset_L2_t[16];
+typedef cset_L2_t *cset_L3_t[17];
 
-#define CHAR_SET_SIZE ((UCHAR_MAX + 1) / (sizeof (bitcell_t) * CHAR_BIT))
+struct any_char_set {
+  chset_type_t type : 4;
+  int compl : 2;
+};
+
+struct small_char_set {
+  chset_type_t type : 4;
+  int compl : 2;
+  cset_L0_t bitcell;
+};
 
-typedef struct char_set {
-  bitcell_t bitcell[CHAR_SET_SIZE];
+struct displaced_char_set {
+  chset_type_t type : 4;
+  int compl : 2;
+  cset_L0_t bitcell;
+  wchar_t base;
+};
+
+
+struct large_char_set {
+  chset_type_t type : 4;
+  int inv : 2;
+  cset_L2_t dir;
+};
+
+struct xlarge_char_set {
+  chset_type_t type : 4;
+  int inv : 2;
+  cset_L3_t dir;
+};
+
+typedef union char_set {
+  struct any_char_set any;
+  struct small_char_set s;
+  struct displaced_char_set d;
+  struct large_char_set l;
+  struct xlarge_char_set xl;
 } char_set_t;
 
-void char_set_clear(char_set_t *);
+char_set_t *char_set_create(chset_type_t, wchar_t);
+void char_set_destroy(char_set_t *);
+
 void char_set_compl(char_set_t *);
-void char_set_add(char_set_t *, int);
-void char_set_add_range(char_set_t *, int, int); /* inclusive */
-int char_set_contains(char_set_t *, int);
+void char_set_add(char_set_t *, wchar_t);
+void char_set_add_range(char_set_t *, wchar_t, wchar_t); /* inclusive */
+int char_set_contains(char_set_t *, wchar_t);
 
 typedef enum {
   nfa_accept, nfa_empty, nfa_wild, nfa_single, nfa_set
@@ -64,7 +109,7 @@ struct nfa_state_single {
   nfa_kind_t kind;
   unsigned visited;
   nfa_state_t *trans;
-  int ch;
+  wchar_t ch;
 };
 
 struct nfa_state_set {
@@ -83,9 +128,9 @@ union nfa_state {
 
 nfa_state_t *nfa_state_accept(void);
 nfa_state_t *nfa_state_empty(nfa_state_t *, nfa_state_t *);
-nfa_state_t *nfa_state_single(nfa_state_t *, int ch);
+nfa_state_t *nfa_state_single(nfa_state_t *, wchar_t ch);
 nfa_state_t *nfa_state_wild(nfa_state_t *);
-nfa_state_t *nfa_state_set(nfa_state_t *);
+nfa_state_t *nfa_state_set(nfa_state_t *, char_set_t *);
 void nfa_state_free(nfa_state_t *st);
 void nfa_state_shallow_free(nfa_state_t *st);
 void nfa_state_merge(nfa_state_t *accept, nfa_state_t *);
@@ -114,7 +159,7 @@ long nfa_run(nfa_t nfa, const wchar_t *str);
 void nfa_machine_reset(nfa_machine_t *);
 void nfa_machine_init(nfa_machine_t *, nfa_t);
 void nfa_machine_cleanup(nfa_machine_t *);
-nfam_result_t nfa_machine_feed(nfa_machine_t *, int ch);
+nfam_result_t nfa_machine_feed(nfa_machine_t *, wchar_t ch);
 long nfa_machine_match_span(nfa_machine_t *);
 obj_t *regex_compile(obj_t *regex_sexp);
 obj_t *regexp(obj_t *);
author	Kaz Kylheku <kaz@kylheku.com>	2009-11-12 11:44:25 -0800
committer	Kaz Kylheku <kaz@kylheku.com>	2009-11-12 11:44:25 -0800
commit	ddb0601e8e26255b8b9b536a5e6a47b86c33b011 (patch)
tree	ab91583911596a3bf0dff90492a65baaf2d1513d /regex.h
parent	afbf93478e0a04a12d11dc8933eaa2a779353cb3 (diff)
download	txr-ddb0601e8e26255b8b9b536a5e6a47b86c33b011.tar.gz txr-ddb0601e8e26255b8b9b536a5e6a47b86c33b011.tar.bz2 txr-ddb0601e8e26255b8b9b536a5e6a47b86c33b011.zip