diff options
author | Kaz Kylheku <kaz@kylheku.com> | 2012-04-20 00:47:46 -0700 |
---|---|---|
committer | Kaz Kylheku <kaz@kylheku.com> | 2012-04-20 00:47:46 -0700 |
commit | c3c43fd39c715827de5cc74846ba977c5b1d2181 (patch) | |
tree | 2058612d3a3623851b346e7068e648ecdc8bc4a6 | |
parent | 2614361741b85a71b06fc4d70538e0b5d8fb660b (diff) | |
download | txr-c3c43fd39c715827de5cc74846ba977c5b1d2181.tar.gz txr-c3c43fd39c715827de5cc74846ba977c5b1d2181.tar.bz2 txr-c3c43fd39c715827de5cc74846ba977c5b1d2181.zip |
* parser.y (regtoken): New nonterminal symbol.
(regterm): REGTOKEN production factored out to regtoken.
(regclass): Reverted prior commit's changes.
(regclassterm): Reverted prior commit, removing REGTOKEN
production for character classes, and introduced a regtoken
production. So now the keyword symbols are part of the
character class abstract syntax.
(regtoken): New production rule.
* regex.c (regex_space_chars): Converted to internal linkage.
(char_set_compile): Handle token keywords in character class
abstract syntax.
* regex.h (regex_space_chars): External declaration removed.
-rw-r--r-- | ChangeLog | 17 | ||||
-rw-r--r-- | parser.y | 56 | ||||
-rw-r--r-- | regex.c | 31 | ||||
-rw-r--r-- | regex.h | 1 |
4 files changed, 68 insertions, 37 deletions
@@ -1,3 +1,20 @@ +2012-04-20 Kaz Kylheku <kaz@kylheku.com> + + * parser.y (regtoken): New nonterminal symbol. + (regterm): REGTOKEN production factored out to regtoken. + (regclass): Reverted prior commmit's changes. + (regclassterm): Reverted prior commit, removing REGTOKEN + production for character classes, and introduced a regtoken + production. So now the keyword symbols are part of the + character class abstract syntax. + (regtoken): New production rule. + + * regex.c (regex_space_chars): Converted to internal linkage. + (char_set_compile): Handle token keywords in character class + abstract syntax. + + * regex.h (regex_space_chars): External declaration removed. + 2012-04-19 Kaz Kylheku <kaz@kylheku.com> First cut at implementing \s, \d, \w, \S, \D and \W regex tokens. @@ -90,7 +90,7 @@ static val parsed_spec; %type <val> repeat_clause repeat_parts_opt o_line %type <val> o_elems_opt o_elems o_elem o_var rep_elem rep_parts_opt %type <val> regex lisp_regex regexpr regbranch -%type <val> regterm regclass regclassterm regrange +%type <val> regterm regtoken regclass regclassterm regrange %type <val> strlit chrlit quasilit quasi_items quasi_item litchars %type <chr> regchar %type <lineno> '(' '[' @@ -796,48 +796,20 @@ regterm : regterm '*' { $$ = list(zeroplus_s, $1, nao); } | ']' { $$ = chr(']'); } | '-' { $$ = chr('-'); } | REGCHAR { $$ = chr($1); } - | REGTOKEN { switch ($1) - { case 's': - $$ = space_k; break; - case 'S': - $$ = cspace_k; break; - case 'd': - $$ = digit_k; break; - case 'D': - $$ = cdigit_k; break; - case 'w': - $$ = word_char_k; break; - case 'W': - $$ = cword_char_k; break; }} + | regtoken { $$ = $1; } | '(' regexpr ')' { $$ = $2; } | '(' error { $$ = nil; yybadtoken(yychar, lit("regex subexpression")); } ; -regclass : regclassterm { $$ = $1; } - | regclassterm regclass { $$ = nappend2($1, $2); } +regclass : regclassterm { $$ = cons($1, nil); } + | regclassterm regclass { $$ = cons($1, $2); } ; -regclassterm : regrange { $$ = cons($1, nil); 
} - | regchar { $$ = cons(chr($1), nil); } - | REGTOKEN { switch ($1) - { case 's': - $$ = regex_space_chars; - break; - case 'd': - $$ = cons(cons(chr('0'), chr('9')), nil); - break; - case 'w': - $$ = list(cons(chr('A'), chr('Z')), - cons(chr('a'), chr('z')), - chr('_'), nao); - break; - default: - yyerrorf(lit("complemented token " - "\\~a not allowed " - "in regex character class"), - chr($1), nao); } } +regclassterm : regrange { $$ = $1; } + | regchar { $$ = chr($1); } + | regtoken { $$ = $1; } ; regrange : regchar '-' regchar { $$ = cons(chr($1), chr($3)); } @@ -856,6 +828,20 @@ regchar : '?' { $$ = '?'; } | REGCHAR { $$ = $1; } ; +regtoken : REGTOKEN { switch ($1) + { case 's': + $$ = space_k; break; + case 'S': + $$ = cspace_k; break; + case 'd': + $$ = digit_k; break; + case 'D': + $$ = cdigit_k; break; + case 'w': + $$ = word_char_k; break; + case 'W': + $$ = cword_char_k; break; }} + newl : '\n' | error '\n' { yyerror("newline expected after directive"); yyerrok; } @@ -209,6 +209,8 @@ union regex_machine { int opt_derivative_regex = 0; +static val regex_space_chars; + static int L0_full(cset_L0_t *L0) { int i; @@ -634,6 +636,24 @@ static char_set_t *char_set_compile(val args, val comp) min = c_chr(item); if (c_chr(item) > max) max = c_chr(item); + } else if (item == space_k) { + if (max < 0x3000) + max = 0x3000; + if (min > 0x9) + min = 0x9; + } else if (item == digit_k) { + if (max < '9') + max = 9; + if (min > '0') + min = 0; + } else if (item == word_char_k) { + if (min > 'A') + min = 'A'; + if (max < 'z') + max = 'z'; + } else if (item == cspace_k || item == cdigit_k || item == cword_char_k) { + uw_throwf(error_s, lit("bad object in character class syntax: ~s"), + item, nao); } else { assert(0 && "bad regex set"); } @@ -667,6 +687,16 @@ static char_set_t *char_set_compile(val args, val comp) char_set_add_range(set, c_chr(from), c_chr(to)); } else if (typeof(item) == chr_s) { char_set_add(set, c_chr(item)); + } else if (item == space_k) { + val iter; 
+ for (iter = regex_space_chars; iter; iter = cdr(iter)) + char_set_add(set, c_chr(car(iter))); + } else if (item == digit_k) { + char_set_add_range(set, '0', '9'); + } else if (item == word_char_k) { + char_set_add_range(set, 'A', 'Z'); + char_set_add_range(set, 'a', 'z'); + char_set_add(set, '_'); } else { assert(0 && "bad regex set"); } @@ -1852,7 +1882,6 @@ val regsub(val regex, val repl, val str) val space_k, digit_k, word_char_k; val cspace_k, cdigit_k, cword_char_k; -val regex_space_chars; void regex_init(void) { @@ -26,7 +26,6 @@ extern val space_k, digit_k, word_char_k; extern val cspace_k, cdigit_k, cword_char_k; -extern val regex_space_chars; val regex_compile(val regex_sexp); val regexp(val); |