author     Kaz Kylheku <kaz@kylheku.com>   2021-04-08 17:49:39 -0700
committer  Kaz Kylheku <kaz@kylheku.com>   2021-04-08 17:49:39 -0700
commit     4b088c75d89e8cbcdc07dec40036fd33995946d3 (patch)
tree       f749fee637ea544c3f404a14e7294099968a4dbb
parent     cea5c956486b8acae4bf5a23f0148d6b85d9acd3 (diff)
parser: allow funny UTF-8 in regexes and literals.
The main idea in this commit is to change a behavior of the
lexer, and take advantage of it in the parser. Currently, the
lexer recognizes a {UANYN} pattern in two places. That
pattern matches a UTF-8 character. The lexeme is passed to
the decoder, which is expected to produce exactly one wide
character. If the UTF-8 is bad (for instance, a code in the
surrogate pair range U+DCxx) then the decoder will produce
multiple characters. In that case, these rules return ERRTOK
instead of a LITCHAR or REGCHAR. The idea is: why don't we
just return those characters as a TEXT token? Then we can
just incorporate that into the literal or regex.
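As background, the fallback described here is the usual surrogate-escape convention: each byte of an invalid sequence is represented as U+DC00 plus the byte value, so one bad multi-byte sequence decodes to several wide characters. The sketch below is a minimal standalone illustration of that mapping only; escape_bad_bytes is a hypothetical helper and is not TXR's utf8_from_buf, whose actual use appears in the diff further down.

```c
#include <stdio.h>
#include <wchar.h>

/* Surrogate-escape fallback: each raw byte b of an invalid UTF-8
   sequence becomes the wide character U+DC00 + b, so no data is lost
   and the result cannot be confused with validly decoded text.
   (Illustrative stand-in, not TXR's real decoder.) */
static size_t escape_bad_bytes(wchar_t *out, const unsigned char *in, size_t len)
{
  size_t i;
  for (i = 0; i < len; i++)
    out[i] = 0xDC00 + in[i];
  out[len] = 0;
  return len;
}

int main(void)
{
  /* three bytes that encode a lone surrogate: invalid UTF-8 */
  const unsigned char bad[] = { 0xED, 0xB0, 0x81 };
  wchar_t wchr[8];
  size_t n = escape_bad_bytes(wchr, bad, sizeof bad);
  size_t i;

  /* prints "dced dcb0 dc81": one bad lexeme, multiple wide chars,
     which is exactly the case where the lexer used to return ERRTOK */
  for (i = 0; i < n; i++)
    printf("%04lx ", (unsigned long)wchr[i]);
  putchar('\n');
  return 0;
}
```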
* parser.l (grammar): If a UANYN lexeme decodes to multiple
characters instead of the expected one, then produce a
TEXT token instead of complaining about invalid UTF-8 bytes.
* parser.y (regterm): Recognize a TEXT item as a regterm,
converting its string value to a compound node in the regex
AST, so it will be correctly treated as a fixed pattern.
(chrlit): If a hash-backslash is followed by a TEXT token,
which can happen now, that is invalid; we diagnose that
as invalid UTF-8.
(quasi_item): Remove the TEXT rule, because the litchars
constituent now generates TEXT.
(litchars, restlitchar): Recognize a TEXT item, similarly to
regterm.
* tests/012/parse.tl: New file.
* tests/012/parse.expected: Likewise.
-rw-r--r--  parser.l                 | 10
-rw-r--r--  parser.y                 | 10
-rw-r--r--  tests/012/parse.expected |  0
-rw-r--r--  tests/012/parse.tl       |  7
4 files changed, 20 insertions, 7 deletions
diff --git a/parser.l b/parser.l
--- a/parser.l
+++ b/parser.l
@@ -938,9 +938,8 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
 <REGEX,SREGEX>{UANYN} {
   wchar_t wchr[8];
   if (utf8_from_buf(wchr, coerce(unsigned char *, yytext), yyleng) != 2) {
-    yyerrprepf(yyg, lit("non-UTF-8 byte in regex: '\\x~02x'"),
-               num(convert(unsigned char, yytext[0])), nao);
-    return ERRTOK;
+    yylval->lexeme = chk_strdup(wchr);
+    return TEXT;
   }
   yylval->chr = wchr[0];
   return REGCHAR;
@@ -1100,9 +1099,8 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
 <STRLIT,CHRLIT,QSILIT,WLIT,QWLIT>{UANYN} {
   wchar_t wchr[8];
   if (utf8_from_buf(wchr, coerce(unsigned char *, yytext), yyleng) != 2) {
-    yyerrprepf(yyg, lit("non-UTF-8 byte in literal: '\\x~02x'"),
-               num(convert(unsigned char, yytext[0])), nao);
-    return ERRTOK;
+    yylval->lexeme = chk_strdup(wchr);
+    return TEXT;
   }
   yylval->chr = wchr[0];
   return LITCHAR;
diff --git a/parser.y b/parser.y
--- a/parser.y
+++ b/parser.y
@@ -1187,6 +1187,7 @@ regterm : regterm '*' { $$ = list(zeroplus_s, $1, nao); }
         | '-' { $$ = chr('-'); }
         | REGCHAR { $$ = chr($1); }
         | regtoken { $$ = $1; }
+        | TEXT { $$ = list(compound_s, string_own($1), nao); }
         | '(' regexpr ')' { $$ = $2; }
         | '(' error { $$ = nil;
                       yybadtok(yychar, lit("regex subexpression")); }
@@ -1258,6 +1259,10 @@ chrlit : HASH_BACKSLASH SYMTOK { wchar_t ch;
                                  $$ = chr(ch); }
        | HASH_BACKSLASH LITCHAR { $$ = chr($2);
                                   end_of_char(scnr); }
+       | HASH_BACKSLASH TEXT { free($2);
+                               yyerrorf(scnr,
+                                        lit("invalid UTF-8 used as character name"),
+                                        nao); }
        | HASH_BACKSLASH error { $$ = nil;
                                 yybadtok(yychar, lit("character literal")); }
@@ -1278,7 +1283,6 @@ quasi_items : quasi_item { $$ = cons($1, nil);
             ;

 quasi_item : litchars { $$ = $1; }
-           | TEXT { $$ = string_own($1); }
            | q_var { $$ = $1; }
            | METANUM { $$ = cons(var_s, cons($1, nil));
                        rl($$, num(parser->lineno)); }
@@ -1292,10 +1296,14 @@ quasi_item : litchars { $$ = $1; }
 litchars : LITCHAR { $$ = mkstring(one, chr($1)); }
          | LITCHAR restlitchar { val ch = mkstring(one, chr($1));
                                  $$ = string_extend(ch, $2); }
+         | TEXT { $$ = string_own($1); }
+         | TEXT restlitchar { $$ = string_extend(string_own($1), $2); }
          ;

 restlitchar : LITCHAR { $$ = mkstring(one, chr($1)); }
             | restlitchar LITCHAR { $$ = string_extend($1, chr($2)); }
+            | TEXT { $$ = string_own($1); }
+            | restlitchar TEXT { $$ = string_extend($1, string_own($2)); }
             ;

 wordslit : '"' { $$ = nil; }
diff --git a/tests/012/parse.expected b/tests/012/parse.expected
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/tests/012/parse.expected
diff --git a/tests/012/parse.tl b/tests/012/parse.tl
new file mode 100644
index 00000000..8e3e7afc
--- /dev/null
+++ b/tests/012/parse.tl
@@ -0,0 +1,7 @@
+(load "../common")
+
+(test (read `"@(str-buf #b'EDB081')"`)
+      "\xDCED\xDCB0\xDC81")
+
+(test (regex-parse (str-buf #b'EDB081'))
+      (compound "\xDCED\xDCB0\xDC81"))
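A note on the test data, assuming #b'EDB081' denotes the byte sequence ED B0 81: decoded naively as a three-byte UTF-8 sequence, those bytes yield U+DC01, a lone low surrogate, so a strict decoder rejects them; the byte-wise fallback then produces U+DCED, U+DCB0, U+DC81, which is the "\xDCED\xDCB0\xDC81" string the tests expect. The standalone check below only verifies that arithmetic; it is not code from TXR.

```c
#include <assert.h>
#include <stdio.h>

int main(void)
{
  unsigned char b[] = { 0xED, 0xB0, 0x81 };

  /* naive 3-byte decode: 1110xxxx 10yyyyyy 10zzzzzz */
  unsigned cp = ((b[0] & 0x0Fu) << 12) | ((b[1] & 0x3Fu) << 6) | (b[2] & 0x3Fu);
  assert(cp == 0xDC01);          /* falls in the surrogate range D800-DFFF */

  /* byte-wise fallback gives the characters the tests expect */
  assert(0xDC00 + b[0] == 0xDCED);
  assert(0xDC00 + b[1] == 0xDCB0);
  assert(0xDC00 + b[2] == 0xDC81);

  printf("U+%04X is a surrogate; escaped bytes: DCED DCB0 DC81\n", cp);
  return 0;
}
```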