summary | refs | log | tree | commit | diff | stats
path: root/parser.l
diff options
context:
space:
mode:
author: Kaz Kylheku <kaz@kylheku.com> 2021-04-08 21:25:58 -0700
committer: Kaz Kylheku <kaz@kylheku.com> 2021-04-08 21:25:58 -0700
commit: 60cd468d75eb0fef11a26238ddc3588b7c7b2f15 (patch)
tree: c08ecfbf6929101a4e077a8a39db4dff96d47616 /parser.l
parent: 9218460118760c2e4f3910b6c7ee73e0e644a401 (diff)
download: txr-60cd468d75eb0fef11a26238ddc3588b7c7b2f15.tar.gz
txr-60cd468d75eb0fef11a26238ddc3588b7c7b2f15.tar.bz2
txr-60cd468d75eb0fef11a26238ddc3588b7c7b2f15.zip
parser: allow non-UTF-8 bytes in literals and regexes.
* parser.l (grammar): Just like we do in SREGEX, allow an arbitrary byte in REGEX, mapping it to the DCxx range. Do the same inside string literals of all types.
* lex.yy.c.shipped: Updated.
* tests/012/parse.tl: New tests.
Diffstat (limited to 'parser.l')
-rw-r--r-- parser.l 14
1 files changed, 3 insertions, 11 deletions
diff --git a/parser.l b/parser.l
index d7e53c49..98cdf344 100644
--- a/parser.l
+++ b/parser.l
@@ -945,18 +945,11 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
return REGCHAR;
}
-<SREGEX>. {
- /* Allow non-UTF-8 byte for regexes scanned from string */
+<SREGEX,REGEX>. {
yylval->chr = convert(unsigned char, yytext[0]) + 0xDC00;
return REGCHAR;
}
-<REGEX>. {
- yyerrprepf(yyg, lit("non-UTF-8 byte in regex: '\\x~02x'"),
- num(convert(unsigned char, yytext[0])), nao);
- return ERRTOK;
-}
-
<INITIAL>[ ]+ {
yylval->lexeme = utf8_dup_from(yytext);
return SPACE;
@@ -1128,9 +1121,8 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
}
<STRLIT,CHRLIT,QSILIT,WLIT,QWLIT>. {
- yyerrprepf(yyg, lit("non-UTF-8 byte in literal: '\\x~02x'"),
- num(convert(unsigned char, yytext[0])), nao);
- return ERRTOK;
+ yylval->chr = convert(unsigned char, yytext[0]) + 0xDC00;
+ return LITCHAR;
}
%%