summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKaz Kylheku <kaz@kylheku.com>2012-04-20 00:47:46 -0700
committerKaz Kylheku <kaz@kylheku.com>2012-04-20 00:47:46 -0700
commitc3c43fd39c715827de5cc74846ba977c5b1d2181 (patch)
tree2058612d3a3623851b346e7068e648ecdc8bc4a6
parent2614361741b85a71b06fc4d70538e0b5d8fb660b (diff)
downloadtxr-c3c43fd39c715827de5cc74846ba977c5b1d2181.tar.gz
txr-c3c43fd39c715827de5cc74846ba977c5b1d2181.tar.bz2
txr-c3c43fd39c715827de5cc74846ba977c5b1d2181.zip
* parser.y (regtoken): New nonterminal symbol.
(regterm): REGTOKEN production factored out to regtoken. (regclass): Reverted prior commit's changes. (regclassterm): Reverted prior commit, removing REGTOKEN production for character classes, and introduced a regtoken production. So now the keyword symbols are part of the character class abstract syntax. (regtoken): New production rule. * regex.c (regex_space_chars): Converted to internal linkage. (char_set_compile): Handle token keywords in character class abstract syntax. * regex.h (regex_space_chars): External declaration removed.
-rw-r--r--ChangeLog17
-rw-r--r--parser.y56
-rw-r--r--regex.c31
-rw-r--r--regex.h1
4 files changed, 68 insertions, 37 deletions
diff --git a/ChangeLog b/ChangeLog
index 75b111d3..4497be3b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,20 @@
+2012-04-20 Kaz Kylheku <kaz@kylheku.com>
+
+ * parser.y (regtoken): New nonterminal symbol.
+ (regterm): REGTOKEN production factored out to regtoken.
+ (regclass): Reverted prior commit's changes.
+ (regclassterm): Reverted prior commit, removing REGTOKEN
+ production for character classes, and introduced a regtoken
+ production. So now the keyword symbols are part of the
+ character class abstract syntax.
+ (regtoken): New production rule.
+
+ * regex.c (regex_space_chars): Converted to internal linkage.
+ (char_set_compile): Handle token keywords in character class
+ abstract syntax.
+
+ * regex.h (regex_space_chars): External declaration removed.
+
2012-04-19 Kaz Kylheku <kaz@kylheku.com>
First cut at implementing \s, \d, \w, \S, \D and \W regex tokens.
diff --git a/parser.y b/parser.y
index c57bc901..46a0052b 100644
--- a/parser.y
+++ b/parser.y
@@ -90,7 +90,7 @@ static val parsed_spec;
%type <val> repeat_clause repeat_parts_opt o_line
%type <val> o_elems_opt o_elems o_elem o_var rep_elem rep_parts_opt
%type <val> regex lisp_regex regexpr regbranch
-%type <val> regterm regclass regclassterm regrange
+%type <val> regterm regtoken regclass regclassterm regrange
%type <val> strlit chrlit quasilit quasi_items quasi_item litchars
%type <chr> regchar
%type <lineno> '(' '['
@@ -796,48 +796,20 @@ regterm : regterm '*' { $$ = list(zeroplus_s, $1, nao); }
| ']' { $$ = chr(']'); }
| '-' { $$ = chr('-'); }
| REGCHAR { $$ = chr($1); }
- | REGTOKEN { switch ($1)
- { case 's':
- $$ = space_k; break;
- case 'S':
- $$ = cspace_k; break;
- case 'd':
- $$ = digit_k; break;
- case 'D':
- $$ = cdigit_k; break;
- case 'w':
- $$ = word_char_k; break;
- case 'W':
- $$ = cword_char_k; break; }}
+ | regtoken { $$ = $1; }
| '(' regexpr ')' { $$ = $2; }
| '(' error { $$ = nil;
yybadtoken(yychar,
lit("regex subexpression")); }
;
-regclass : regclassterm { $$ = $1; }
- | regclassterm regclass { $$ = nappend2($1, $2); }
+regclass : regclassterm { $$ = cons($1, nil); }
+ | regclassterm regclass { $$ = cons($1, $2); }
;
-regclassterm : regrange { $$ = cons($1, nil); }
- | regchar { $$ = cons(chr($1), nil); }
- | REGTOKEN { switch ($1)
- { case 's':
- $$ = regex_space_chars;
- break;
- case 'd':
- $$ = cons(cons(chr('0'), chr('9')), nil);
- break;
- case 'w':
- $$ = list(cons(chr('A'), chr('Z')),
- cons(chr('a'), chr('z')),
- chr('_'), nao);
- break;
- default:
- yyerrorf(lit("complemented token "
- "\\~a not allowed "
- "in regex character class"),
- chr($1), nao); } }
+regclassterm : regrange { $$ = $1; }
+ | regchar { $$ = chr($1); }
+ | regtoken { $$ = $1; }
;
regrange : regchar '-' regchar { $$ = cons(chr($1), chr($3)); }
@@ -856,6 +828,20 @@ regchar : '?' { $$ = '?'; }
| REGCHAR { $$ = $1; }
;
+regtoken : REGTOKEN { switch ($1)
+ { case 's':
+ $$ = space_k; break;
+ case 'S':
+ $$ = cspace_k; break;
+ case 'd':
+ $$ = digit_k; break;
+ case 'D':
+ $$ = cdigit_k; break;
+ case 'w':
+ $$ = word_char_k; break;
+ case 'W':
+ $$ = cword_char_k; break; }}
+
newl : '\n'
| error '\n' { yyerror("newline expected after directive");
yyerrok; }
diff --git a/regex.c b/regex.c
index 9670b901..77989a98 100644
--- a/regex.c
+++ b/regex.c
@@ -209,6 +209,8 @@ union regex_machine {
int opt_derivative_regex = 0;
+static val regex_space_chars;
+
static int L0_full(cset_L0_t *L0)
{
int i;
@@ -634,6 +636,24 @@ static char_set_t *char_set_compile(val args, val comp)
min = c_chr(item);
if (c_chr(item) > max)
max = c_chr(item);
+ } else if (item == space_k) {
+ if (max < 0x3000)
+ max = 0x3000;
+ if (min > 0x9)
+ min = 0x9;
+ } else if (item == digit_k) {
+ if (max < '9')
+ max = 9;
+ if (min > '0')
+ min = 0;
+ } else if (item == word_char_k) {
+ if (min > 'A')
+ min = 'A';
+ if (max < 'z')
+ max = 'z';
+ } else if (item == cspace_k || item == cdigit_k || item == cword_char_k) {
+ uw_throwf(error_s, lit("bad object in character class syntax: ~s"),
+ item, nao);
} else {
assert(0 && "bad regex set");
}
@@ -667,6 +687,16 @@ static char_set_t *char_set_compile(val args, val comp)
char_set_add_range(set, c_chr(from), c_chr(to));
} else if (typeof(item) == chr_s) {
char_set_add(set, c_chr(item));
+ } else if (item == space_k) {
+ val iter;
+ for (iter = regex_space_chars; iter; iter = cdr(iter))
+ char_set_add(set, c_chr(car(iter)));
+ } else if (item == digit_k) {
+ char_set_add_range(set, '0', '9');
+ } else if (item == word_char_k) {
+ char_set_add_range(set, 'A', 'Z');
+ char_set_add_range(set, 'a', 'z');
+ char_set_add(set, '_');
} else {
assert(0 && "bad regex set");
}
@@ -1852,7 +1882,6 @@ val regsub(val regex, val repl, val str)
val space_k, digit_k, word_char_k;
val cspace_k, cdigit_k, cword_char_k;
-val regex_space_chars;
void regex_init(void)
{
diff --git a/regex.h b/regex.h
index 4d137a0f..962a2846 100644
--- a/regex.h
+++ b/regex.h
@@ -26,7 +26,6 @@
extern val space_k, digit_k, word_char_k;
extern val cspace_k, cdigit_k, cword_char_k;
-extern val regex_space_chars;
val regex_compile(val regex_sexp);
val regexp(val);