From 8bc5fc7a77eb1a6707f3c742235ab38ca210f55e Mon Sep 17 00:00:00 2001
From: Kaz Kylheku <kaz@kylheku.com>
Date: Fri, 28 May 2021 06:52:26 -0700
Subject: json: handling for bad UTF-8 bytes, NUL and \u0000.

* parser.l <JLIT>: Convert \u0000 sequence to U+DC00
code point, the pseudo-null. Also include JLIT
in the rule for catching bad bytes that are not
matched by {UANYN}.

* txr.1: Document this treatment as extensions to JSON.

* lex.yy.c.shipped: Updated.
---
 parser.l | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'parser.l')

diff --git a/parser.l b/parser.l
index fab8dd9c..86472c03 100644
--- a/parser.l
+++ b/parser.l
@@ -1121,7 +1121,8 @@ NJPUNC  [^(){},:\[\]"~*^ \t\n]
 }
 
 <JLIT>[\\]u{HEX}{4} {
-  yylval->chr = num_esc(yyg, yytext + 1);
+  wchar_t ch = num_esc(yyg, yytext + 1);
+  yylval->chr = if3(ch, ch, 0xDC00);
   return LITCHAR;
 }
 
@@ -1171,7 +1172,7 @@ NJPUNC  [^(){},:\[\]"~*^ \t\n]
            chr(yytext[0]), nao);
 }
 
-<STRLIT,CHRLIT,QSILIT,WLIT,QWLIT>. {
+<STRLIT,CHRLIT,QSILIT,WLIT,QWLIT,JLIT>. {
   yylval->chr = convert(unsigned char, yytext[0]) + 0xDC00;
   return LITCHAR;
 }
-- 
cgit v1.2.3