diff options
-rw-r--r-- | ChangeLog | 35 | ||||
-rw-r--r-- | lib.c | 1 | ||||
-rw-r--r-- | parser.l | 5 | ||||
-rw-r--r-- | parser.y | 43 | ||||
-rw-r--r-- | regex.c | 107 | ||||
-rw-r--r-- | regex.h | 6 |
6 files changed, 188 insertions, 9 deletions
@@ -1,3 +1,38 @@ +2012-04-19 Kaz Kylheku <kaz@kylheku.com> + + First cut at implementing \s, \d, \w, \S, \D and \W regex tokens. + + * lib.c (init): Call regex_init. + + * parser.l: return new REGTOKEN kind. + + * parser.y (REGTOKEN): New token type. + (REGTERM): Translate REGTERM to keyword. + (regclass): Restructured to handle inherited nodes as lists. + (regclassterm): Produce $$ as list. Add handling for REGTOKEN + occurring inside character class by expanding it. This might not + be the best approach. + (yybadtoken): Handle REGTOKEN in switch. + + * regex.c (struct any_char_set, struct small_char_set, + struct displaced_char_set, struct large_char_set, + struct xlarge_char_set): New bitfield member, stat. + (char_set_create): New parameter for indicating static char set. + (char_set_destroy): Do not free a static char set. + (char_set_compile): Pass zero to new parameter of char_set_create. + (spaces): New static array. + (space_cs, digit_cs, word_cs, cspace_cs, cdigit_cs, cword_cs): New + static pointers to char_set_t. + (init_special_char_sets, nfa_compile_given_set): New static function. + (nfa_compile_regex, dv_compile_regex): Handle new character set token + keywords. + (space_k, digit_k, word_char_k, cspace_k, cdigit_k, cword_char_k, + regex_space_chars): New variables. + (regex_init): New function. + + * regex.h (space_k, digit_k, word_char_k, cspace_k, cdigit_k, + cword_char_k, regex_space_chars, regex_init): Declared. + 2012-04-15 Kaz Kylheku <kaz@kylheku.com> * eval.c (eval_init): New intrinsic functions remq*, remql*, @@ -4622,6 +4622,7 @@ void init(const wchar_t *pn, mem_t *(*oom)(mem_t *, size_t), eval_init(); filter_init(); hash_init(); + regex_init(); gc_state(gc_save); } @@ -574,6 +574,11 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} return REGCHAR; } +<REGEX>[\\][sSdDwW] { + yylval.chr = yytext[1]; + return REGTOKEN; +} + <REGEX>{WS}[\\]\n{WS} { lineno++; } @@ -76,7 +76,7 @@ static val parsed_spec; %token <val> NUMBER METANUM -%token <chr> REGCHAR LITCHAR +%token <chr> REGCHAR REGTOKEN LITCHAR %token <chr> METAPAR METABKT SPLICE %type <val> spec clauses clauses_opt clause @@ -105,7 +105,7 @@ static val parsed_spec; %left '|' '/' %left '&' %right '~' '*' '?' '+' '%' -%right '.' REGCHAR LITCHAR +%right '.' REGCHAR REGTOKEN LITCHAR %right DOTDOT %% @@ -796,18 +796,48 @@ regterm : regterm '*' { $$ = list(zeroplus_s, $1, nao); } | ']' { $$ = chr(']'); } | '-' { $$ = chr('-'); } | REGCHAR { $$ = chr($1); } + | REGTOKEN { switch ($1) + { case 's': + $$ = space_k; break; + case 'S': + $$ = cspace_k; break; + case 'd': + $$ = digit_k; break; + case 'D': + $$ = cdigit_k; break; + case 'w': + $$ = word_char_k; break; + case 'W': + $$ = cword_char_k; break; }} | '(' regexpr ')' { $$ = $2; } | '(' error { $$ = nil; yybadtoken(yychar, lit("regex subexpression")); } ; -regclass : regclassterm { $$ = cons($1, nil); } - | regclassterm regclass { $$ = cons($1, $2); } +regclass : regclassterm { $$ = $1; } + | regclassterm regclass { $$ = nappend2($1, $2); } ; -regclassterm : regrange { $$ = $1; } - | regchar { $$ = chr($1); } +regclassterm : regrange { $$ = cons($1, nil); } + | regchar { $$ = cons(chr($1), nil); } + | REGTOKEN { switch ($1) + { case 's': + $$ = regex_space_chars; + break; + case 'd': + $$ = cons(cons(chr('0'), chr('9')), nil); + break; + case 'w': + $$ = list(cons(chr('A'), chr('Z')), + cons(chr('a'), chr('z')), + chr('_'), nao); + break; + default: + yyerrorf(lit("complemented token " + "\\~a not allowed " + "in regex character class"), + chr($1), nao); } } ; regrange : regchar '-' regchar { $$ = cons(chr($1), chr($3)); } @@ -1130,6 +1160,7 @@ void yybadtoken(int tok, val context) case FINALLY: problem = lit("\"finally\""); break; case NUMBER: problem = lit("number"); break; case REGCHAR: problem = lit("regular expression character"); break; + case REGTOKEN: problem = lit("regular expression token"); break; case LITCHAR: problem = lit("string literal character"); break; case METAPAR: problem = lit("@("); break; case METABKT: problem = lit("@["); break; @@ -38,6 +38,7 @@ #include "unwind.h" #include "regex.h" #include "txr.h" +#include "gc.h" #if WCHAR_MAX > 65535 #define FULL_UNICODE @@ -99,17 +100,20 @@ typedef cset_L2_t *cset_L3_t[17]; struct any_char_set { unsigned type : 3; unsigned comp : 1; + unsigned stat : 1; }; struct small_char_set { unsigned type : 3; unsigned comp : 1; + unsigned stat : 1; cset_L0_t bitcell; }; struct displaced_char_set { unsigned type : 3; unsigned comp : 1; + unsigned stat : 1; cset_L0_t bitcell; wchar_t base; }; @@ -118,6 +122,7 @@ struct displaced_char_set { struct large_char_set { unsigned type : 3; unsigned comp : 1; + unsigned stat : 1; cset_L2_t dir; }; @@ -125,6 +130,7 @@ struct large_char_set { struct xlarge_char_set { unsigned type : 3; unsigned comp : 1; + unsigned stat : 1; cset_L3_t dir; }; #endif @@ -472,12 +478,13 @@ static void L3_free(cset_L3_t *L3) #endif -static char_set_t *char_set_create(chset_type_t type, wchar_t base) +static char_set_t *char_set_create(chset_type_t type, wchar_t base, unsigned st) { static char_set_t blank; char_set_t *cs = (char_set_t *) chk_malloc(sizeof *cs); *cs = blank; cs->any.type = type; + cs->any.stat = st; if (type == CHSET_DISPLACED) cs->d.base = base; @@ -487,6 +494,9 @@ static char_set_t *char_set_create(chset_type_t type, wchar_t base) static void char_set_destroy(char_set_t *set) { + if (set->any.stat) + return; + switch (set->any.type) { case CHSET_DISPLACED: case CHSET_SMALL: @@ -644,7 +654,7 @@ static char_set_t *char_set_compile(val args, val comp) { - char_set_t *set = char_set_create(cst, min); + char_set_t *set = char_set_create(cst, min, 0); for (iter = args; iter; iter = rest(iter)) { val item = first(iter); @@ -669,6 +679,48 @@ static char_set_t *char_set_compile(val args, val comp) } } +wchar_t spaces[] = { + 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x0020, 0x00a0, 0x1680, 0x180e, + 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, + 0x2009, 0x200a, 0x2028, 0x2029, 0x205f, 0x3000, 0 +}; + +static char_set_t *space_cs, *digit_cs, *word_cs; +static char_set_t *cspace_cs, *cdigit_cs, *cword_cs; + +static void init_special_char_sets(void) +{ + int i; + + space_cs = char_set_create(CHSET_LARGE, 0, 1); + cspace_cs = char_set_create(CHSET_LARGE, 0, 1); + digit_cs = char_set_create(CHSET_SMALL, 0, 1); + cdigit_cs = char_set_create(CHSET_SMALL, 0, 1); + word_cs = char_set_create(CHSET_SMALL, 0, 1); + cword_cs = char_set_create(CHSET_SMALL, 0, 1); + + char_set_compl(cspace_cs); + char_set_compl(cdigit_cs); + char_set_compl(cword_cs); + + for (i = 0; spaces[i] != 0; i++) { + wchar_t sp = spaces[i]; + char_set_add(space_cs, sp); + char_set_add(cspace_cs, sp); + push(chr(sp), ®ex_space_chars); + } + + char_set_add_range(digit_cs, '0', '9'); + char_set_add_range(cdigit_cs, '0', '9'); + + char_set_add_range(word_cs, 'A', 'Z'); + char_set_add_range(cword_cs, 'A', 'Z'); + char_set_add_range(word_cs, 'a', 'a'); + char_set_add_range(cword_cs, 'a', 'a'); + char_set_add(word_cs, '_'); + char_set_add(cword_cs, '_'); +} + static void char_set_cobj_destroy(val chset) { char_set_t *set = (char_set_t *) chset->co.handle; @@ -812,6 +864,13 @@ static nfa_t nfa_compile_set(val args, val comp) return nfa_make(s, acc); } +static nfa_t nfa_compile_given_set(char_set_t *set) +{ + nfa_state_t *acc = nfa_state_accept(); + nfa_state_t *s = nfa_state_set(acc, set); + return nfa_make(s, acc); +} + static nfa_t nfa_compile_regex(val regex); /* @@ -852,6 +911,18 @@ static nfa_t nfa_compile_regex(val exp) nfa_state_t *acc = nfa_state_accept(); nfa_state_t *s = nfa_state_wild(acc); return nfa_make(s, acc); + } else if (exp == space_k) { + return nfa_compile_given_set(space_cs); + } else if (exp == digit_k) { + return nfa_compile_given_set(digit_cs); + } else if (exp == word_char_k) { + return nfa_compile_given_set(word_cs); + } else if (exp == cspace_k) { + return nfa_compile_given_set(cspace_cs); + } else if (exp == cdigit_k) { + return nfa_compile_given_set(cdigit_cs); + } else if (exp == cword_char_k) { + return nfa_compile_given_set(cword_cs); } else if (consp(exp)) { val sym = first(exp), args = rest(exp); @@ -1178,7 +1249,19 @@ static val reg_nullable(val); */ static val dv_compile_regex(val exp) { - if (symbolp(exp) || chrp(exp)) { + if (exp == space_k) { + return cobj((mem_t *) space_cs, chset_s, &char_set_obj_ops); + } else if (exp == digit_k) { + return cobj((mem_t *) digit_cs, chset_s, &char_set_obj_ops); + } else if (exp == word_char_k) { + return cobj((mem_t *) word_cs, chset_s, &char_set_obj_ops); + } else if (exp == cspace_k) { + return cobj((mem_t *) cspace_cs, chset_s, &char_set_obj_ops); + } else if (exp == cdigit_k) { + return cobj((mem_t *) cdigit_cs, chset_s, &char_set_obj_ops); + } else if (exp == cword_char_k) { + return cobj((mem_t *) cword_cs, chset_s, &char_set_obj_ops); + } else if (symbolp(exp) || chrp(exp)) { return exp; } else if (stringp(exp)) { return cons(compound_s, list_str(exp)); @@ -1766,3 +1849,21 @@ val regsub(val regex, val repl, val str) return cat_str(out, nil); } + +val space_k, digit_k, word_char_k; +val cspace_k, cdigit_k, cword_char_k; +val regex_space_chars; + +void regex_init(void) +{ + space_k = intern(lit("space"), keyword_package); + digit_k = intern(lit("digit"), keyword_package); + word_char_k = intern(lit("word-char"), keyword_package); + cspace_k = intern(lit("cspace"), keyword_package); + cdigit_k = intern(lit("cdigit"), keyword_package); + cword_char_k = intern(lit("cword-char"), keyword_package); + + prot1(®ex_space_chars); + + init_special_char_sets(); +} @@ -24,8 +24,14 @@ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. */ +extern val space_k, digit_k, word_char_k; +extern val cspace_k, cdigit_k, cword_char_k; +extern val regex_space_chars; + val regex_compile(val regex_sexp); val regexp(val); val search_regex(val haystack, val needle_regex, val start_num, val from_end); val match_regex(val str, val regex, val pos); val regsub(val regex, val repl, val str); + +void regex_init(void); |