author     Kaz Kylheku <kaz@kylheku.com>   2021-04-08 17:49:39 -0700
committer  Kaz Kylheku <kaz@kylheku.com>   2021-04-08 17:49:39 -0700
commit     4b088c75d89e8cbcdc07dec40036fd33995946d3 (patch)
tree       f749fee637ea544c3f404a14e7294099968a4dbb
parent     cea5c956486b8acae4bf5a23f0148d6b85d9acd3 (diff)
parser: allow funny UTF-8 in regexes and literals.
The main idea in this commit is to change a behavior of the
lexer, and take advantage of it in the parser. Currently, the
lexer recognizes a {UANYN} pattern in two places. That
pattern matches a UTF-8 character. The lexeme is passed to
the decoder, which is expected to produce exactly one wide
character. If the UTF-8 is bad (for instance, a code in the
surrogate pair range U+DCxx) then the decoder will produce
multiple characters. In that case, these rules return ERRTOK
instead of a LITCHAR or REGCHAR. The idea is: why don't we
just return those characters as a TEXT token? Then we can
just incorporate that into the literal or regex.
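As background, the fallback described here is the usual surrogate-escape convention: each byte of an invalid sequence is represented as U+DC00 plus the byte value, so one bad multi-byte sequence decodes to several wide characters. The sketch below is a minimal standalone illustration of that mapping only; escape_bad_bytes is a hypothetical helper and is not TXR's utf8_from_buf, whose actual use appears in the diff further down.

```c
#include <stdio.h>
#include <wchar.h>

/* Surrogate-escape fallback: each raw byte b of an invalid UTF-8
   sequence becomes the wide character U+DC00 + b, so no data is lost
   and the result cannot be confused with validly decoded text.
   (Illustrative stand-in, not TXR's real decoder.) */
static size_t escape_bad_bytes(wchar_t *out, const unsigned char *in, size_t len)
{
  size_t i;
  for (i = 0; i < len; i++)
    out[i] = 0xDC00 + in[i];
  out[len] = 0;
  return len;
}

int main(void)
{
  /* three bytes that encode a lone surrogate: invalid UTF-8 */
  const unsigned char bad[] = { 0xED, 0xB0, 0x81 };
  wchar_t wchr[8];
  size_t n = escape_bad_bytes(wchr, bad, sizeof bad);
  size_t i;

  /* prints "dced dcb0 dc81": one bad lexeme, multiple wide chars,
     which is exactly the case where the lexer used to return ERRTOK */
  for (i = 0; i < n; i++)
    printf("%04lx ", (unsigned long)wchr[i]);
  putchar('\n');
  return 0;
}
```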
* parser.l (grammar): If a UANYN lexeme decodes to multiple
characters instead of the expected one, then produce a
TEXT token instead of complaining about invalid UTF-8 bytes.
* parser.y (regterm): Recognize a TEXT item as a regterm,
converting its string value to a compound node in the regex
AST, so it will be correctly treated as a fixed pattern.
(chrlit): If a hash-backslash is followed by a TEXT token,
which can happen now, that is invalid; we diagnose that
as invalid UTF-8.
(quasi_item): Remove the TEXT rule, because the litchars
constituent now generates TEXT.
(litchars, restlitchar): Recognize a TEXT item, similarly to
regterm.
* tests/012/parse.tl: New file.
* tests/012/parse.expected: Likewise.
-rw-r--r--  parser.l                 | 10
-rw-r--r--  parser.y                 | 10
-rw-r--r--  tests/012/parse.expected |  0
-rw-r--r--  tests/012/parse.tl       |  7
4 files changed, 20 insertions, 7 deletions
diff --git a/parser.l b/parser.l
--- a/parser.l
+++ b/parser.l
@@ -938,9 +938,8 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
 <REGEX,SREGEX>{UANYN} {
   wchar_t wchr[8];
   if (utf8_from_buf(wchr, coerce(unsigned char *, yytext), yyleng) != 2) {
-    yyerrprepf(yyg, lit("non-UTF-8 byte in regex: '\\x~02x'"),
-               num(convert(unsigned char, yytext[0])), nao);
-    return ERRTOK;
+    yylval->lexeme = chk_strdup(wchr);
+    return TEXT;
   }
   yylval->chr = wchr[0];
   return REGCHAR;
@@ -1100,9 +1099,8 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
 <STRLIT,CHRLIT,QSILIT,WLIT,QWLIT>{UANYN} {
   wchar_t wchr[8];
   if (utf8_from_buf(wchr, coerce(unsigned char *, yytext), yyleng) != 2) {
-    yyerrprepf(yyg, lit("non-UTF-8 byte in literal: '\\x~02x'"),
-               num(convert(unsigned char, yytext[0])), nao);
-    return ERRTOK;
+    yylval->lexeme = chk_strdup(wchr);
+    return TEXT;
   }
   yylval->chr = wchr[0];
   return LITCHAR;
diff --git a/parser.y b/parser.y
--- a/parser.y
+++ b/parser.y
@@ -1187,6 +1187,7 @@ regterm : regterm '*' { $$ = list(zeroplus_s, $1, nao); }
         | '-' { $$ = chr('-'); }
         | REGCHAR { $$ = chr($1); }
         | regtoken { $$ = $1; }
+        | TEXT { $$ = list(compound_s, string_own($1), nao); }
         | '(' regexpr ')' { $$ = $2; }
         | '(' error { $$ = nil;
                       yybadtok(yychar, lit("regex subexpression")); }
@@ -1258,6 +1259,10 @@ chrlit : HASH_BACKSLASH SYMTOK { wchar_t ch;
                                  $$ = chr(ch); }
        | HASH_BACKSLASH LITCHAR { $$ = chr($2);
                                   end_of_char(scnr); }
+       | HASH_BACKSLASH TEXT { free($2);
+                               yyerrorf(scnr,
+                                        lit("invalid UTF-8 used as character name"),
+                                        nao); }
        | HASH_BACKSLASH error { $$ = nil;
                                 yybadtok(yychar, lit("character literal")); }
@@ -1278,7 +1283,6 @@ quasi_items : quasi_item { $$ = cons($1, nil);
             ;

 quasi_item : litchars { $$ = $1; }
-           | TEXT { $$ = string_own($1); }
            | q_var { $$ = $1; }
            | METANUM { $$ = cons(var_s, cons($1, nil));
                        rl($$, num(parser->lineno)); }
@@ -1292,10 +1296,14 @@ quasi_item : litchars { $$ = $1; }
 litchars : LITCHAR { $$ = mkstring(one, chr($1)); }
          | LITCHAR restlitchar { val ch = mkstring(one, chr($1));
                                  $$ = string_extend(ch, $2); }
+         | TEXT { $$ = string_own($1); }
+         | TEXT restlitchar { $$ = string_extend(string_own($1), $2); }
          ;

 restlitchar : LITCHAR { $$ = mkstring(one, chr($1)); }
             | restlitchar LITCHAR { $$ = string_extend($1, chr($2)); }
+            | TEXT { $$ = string_own($1); }
+            | restlitchar TEXT { $$ = string_extend($1, string_own($2)); }
             ;

 wordslit : '"' { $$ = nil; }
diff --git a/tests/012/parse.expected b/tests/012/parse.expected
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/tests/012/parse.expected
diff --git a/tests/012/parse.tl b/tests/012/parse.tl
new file mode 100644
index 00000000..8e3e7afc
--- /dev/null
+++ b/tests/012/parse.tl
@@ -0,0 +1,7 @@
+(load "../common")
+
+(test (read `"@(str-buf #b'EDB081')"`)
+      "\xDCED\xDCB0\xDC81")
+
+(test (regex-parse (str-buf #b'EDB081'))
+      (compound "\xDCED\xDCB0\xDC81"))
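A note on the test data, assuming #b'EDB081' denotes the byte sequence ED B0 81: decoded naively as a three-byte UTF-8 sequence, those bytes yield U+DC01, a lone low surrogate, so a strict decoder rejects them; the byte-wise fallback then produces U+DCED, U+DCB0, U+DC81, which is the "\xDCED\xDCB0\xDC81" string the tests expect. The standalone check below only verifies that arithmetic; it is not code from TXR.

```c
#include <assert.h>
#include <stdio.h>

int main(void)
{
  unsigned char b[] = { 0xED, 0xB0, 0x81 };

  /* naive 3-byte decode: 1110xxxx 10yyyyyy 10zzzzzz */
  unsigned cp = ((b[0] & 0x0Fu) << 12) | ((b[1] & 0x3Fu) << 6) | (b[2] & 0x3Fu);
  assert(cp == 0xDC01);          /* falls in the surrogate range D800-DFFF */

  /* byte-wise fallback gives the characters the tests expect */
  assert(0xDC00 + b[0] == 0xDCED);
  assert(0xDC00 + b[1] == 0xDCB0);
  assert(0xDC00 + b[2] == 0xDC81);

  printf("U+%04X is a surrogate; escaped bytes: DCED DCB0 DC81\n", cp);
  return 0;
}
```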