summaryrefslogtreecommitdiffstats
path: root/parser.l
diff options
context:
space:
mode:
authorKaz Kylheku <kaz@kylheku.com>2021-04-08 17:49:39 -0700
committerKaz Kylheku <kaz@kylheku.com>2021-04-08 17:49:39 -0700
commit4b088c75d89e8cbcdc07dec40036fd33995946d3 (patch)
treef749fee637ea544c3f404a14e7294099968a4dbb /parser.l
parentcea5c956486b8acae4bf5a23f0148d6b85d9acd3 (diff)
downloadtxr-4b088c75d89e8cbcdc07dec40036fd33995946d3.tar.gz
txr-4b088c75d89e8cbcdc07dec40036fd33995946d3.tar.bz2
txr-4b088c75d89e8cbcdc07dec40036fd33995946d3.zip
parser: allow funny UTF-8 in regexes and literals.
The main idea in this commit is to change a behavior of the lexer, and take advantage of it in the parser. Currently, the lexer recognizes a {UANYN} pattern in two places. That pattern matches a UTF-8 character. The lexeme is passed to the decoder, which is expected to produce exactly one wide character. If the UTF-8 is bad (for instance, a code in the surrogate pair range U+DCxx) then the decoder will produce multiple characters. In that case, these rules return ERRTOK instead of a LITCHAR or REGCHAR. The idea is: why don't we just return those characters as a TEXT token? Then we can just incorporate that into the literal or regex. * parser.l (grammar): If a UANYN lexeme decodes to multiple characters instead of the expected one, then produce a TEXT token instead of complaining about invalid UTF-8 bytes. * parser.y (regterm): Recognize a TEXT item as a regterm, converting its string value to a compound node in the regex AST, so it will be correctly treated as a fixed pattern. (chrlit): If a hash-backslash is followed by a TEXT token, which can happen now, that is invalid; we diagnose that as invalid UTF-8. (quasi_item): Remove TEXT rule, because the litchars constituent now generates TEXT. (litchars, restlitchar): Recognize TEXT item, similarly to regterm. * tests/012/parse.tl: New file. * tests/012/parse.expected: Likewise.
Diffstat (limited to 'parser.l')
-rw-r--r--parser.l10
1 files changed, 4 insertions, 6 deletions
diff --git a/parser.l b/parser.l
index 778c632a..d7e53c49 100644
--- a/parser.l
+++ b/parser.l
@@ -938,9 +938,8 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
<REGEX,SREGEX>{UANYN} {
wchar_t wchr[8];
if (utf8_from_buf(wchr, coerce(unsigned char *, yytext), yyleng) != 2) {
- yyerrprepf(yyg, lit("non-UTF-8 byte in regex: '\\x~02x'"),
- num(convert(unsigned char, yytext[0])), nao);
- return ERRTOK;
+ yylval->lexeme = chk_strdup(wchr);
+ return TEXT;
}
yylval->chr = wchr[0];
return REGCHAR;
@@ -1100,9 +1099,8 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
<STRLIT,CHRLIT,QSILIT,WLIT,QWLIT>{UANYN} {
wchar_t wchr[8];
if (utf8_from_buf(wchr, coerce(unsigned char *, yytext), yyleng) != 2) {
- yyerrprepf(yyg, lit("non-UTF-8 byte in literal: '\\x~02x'"),
- num(convert(unsigned char, yytext[0])), nao);
- return ERRTOK;
+ yylval->lexeme = chk_strdup(wchr);
+ return TEXT;
}
yylval->chr = wchr[0];
return LITCHAR;