summaryrefslogtreecommitdiffstats
path: root/parser.l
diff options
context:
space:
mode:
authorKaz Kylheku <kaz@kylheku.com>2021-05-31 20:21:47 -0700
committerKaz Kylheku <kaz@kylheku.com>2021-05-31 20:21:47 -0700
commit63dabb4ea67cde0971474c4278d605394af0d1b3 (patch)
tree17730286fb5e59475a4ffc0c433e90019bab581a /parser.l
parent59beb217e76f3518ad35ea1d51b36452cf5723fd (diff)
downloadtxr-63dabb4ea67cde0971474c4278d605394af0d1b3.tar.gz
txr-63dabb4ea67cde0971474c4278d605394af0d1b3.tar.bz2
txr-63dabb4ea67cde0971474c4278d605394af0d1b3.zip
json: fix unquote parsing issue in quasiquotes.
The big comment I added above end_of_json_unquote summarizes the issue. This issue has been uncovered by some test cases in a JSON test suite, not yet committed. * parser.l <JMARKER>: New start condition. Used as a reliable marker in the start condition stack, based on which end_of_json_unquote can intelligently fix-up the stack. (JSON): In the transitions to the quasiquote scanning NESTED state, push the JMARKER start condition before NESTED. (JMARKER): The lexer should never read input in the JMARKER state. If we don't put in a rule to catch this, if that ever happens, the lexer will just copy the source code to standard output. I ran into this during debugging. (end_of_json_unquote): Rewrite the start condition stack intelligently based on what the Lisp lookahead token has done to it, so parsing can smoothly continue. * lex.yy.c.shipped: Regenerated.
Diffstat (limited to 'parser.l')
-rw-r--r--parser.l64
1 files changed, 60 insertions, 4 deletions
diff --git a/parser.l b/parser.l
index c180c919..ce3a5573 100644
--- a/parser.l
+++ b/parser.l
@@ -285,7 +285,7 @@ NJPUNC [^(){},:\[\]"~*^ \t\n]
%x SPECIAL BRACED NESTED REGEX SREGEX STRLIT CHRLIT
%x QSILIT QSPECIAL WLIT QWLIT BUFLIT
-%x JSON JLIT
+%x JSON JLIT JMARKER
%%
@@ -1227,11 +1227,13 @@ NJPUNC [^(){},:\[\]"~*^ \t\n]
}
<JSON>~[*] {
+ yy_push_state(JMARKER, yyscanner);
yy_push_state(NESTED, yyscanner);
return JSPLICE;
}
<JSON>~ {
+ yy_push_state(JMARKER, yyscanner);
yy_push_state(NESTED, yyscanner);
return yytext[0];
}
@@ -1257,6 +1259,10 @@ NJPUNC [^(){},:\[\]"~*^ \t\n]
return yytext[0];
}
+<JMARKER>. {
+ internal_error("scanner processed input JMARKER state");
+}
+
%%
static int directive_tok(scanner_t *yyscanner, int tok, int state)
@@ -1343,13 +1349,63 @@ void end_of_json(scanner_t *yyg)
yy_pop_state(yyg);
}
+/* The complexity here is necessary because TXR Lisp parsing looks ahead
+ * by one token. (The reason for *that* is the support of a.b.c referencing dot
+ * syntax in TXR Lisp.)
+ *
+ * Consider these two different cases:
+ *
+ * ^#J[,~(+ 2.0 2.0)]
+ * ^#J[,~(+ 2.0 2.0) #J42]
+ *
+ * This end_of_json_unquote function gets called when the (+ 2.0 2.0)
+ * has been parsed, but the Yacc-generated parser has shifted one token
+ * ahead. It has read the ] token in the one case or the #J token in
+ * the other. These tokens have totally different effects on the Lex
+ * start condition. When the lexer reads the ] token, it pops off a NESTED
+ * state, whereas the #J token wants to push on a new JSON state.
+ * By the time end_of_json_unquote has been called, this has already happened.
+ *
+ * To deal with this, we use the dummy JMARKER start state which serves as a
+ * kind of parenthesis inside the start condition stack. Before scanning Lisp
+ * unquote within JSON, we push JMARKER state first, then the NESTED state.
+ *
+ * If the lookahead token is like ], and pops off a state, it will pop off
+ * our NESTED state, so we are left at the JMARKER state. If the lookahead
+ * token is something else like #J (HASH_J), then it will push a new
+ * state like JSON on top, and we have JMARKER NESTED JSON.
+ *
+ * So what we are doing here is popping off everything until we get down
+ * to the JMARKER state, and putting it into our little save area.
+ *
+ * Then we lose the JMARKER state.
+ *
+ * If the save area is empty, it means that the lookahead token consumed
+ * our NESTED state, and so we are done.
+ *
+ * If the save area is not empty, it means the lookahead put something
+ * extra over our NESTED state. We drop that state from our save area,
+ * and restore the rest of the save area back into the stack.
+ * Effectively, we are deleting the unquote-related states from the
+ * interior of the start condition stack, not to disturb new material
+ * initiated by the lookahead token.
+ */
void end_of_json_unquote(scanner_t *yyg)
{
- if (YYSTATE == NESTED)
+ int stacksave[8];
+ int top = 0;
+
+ while (YYSTATE != JMARKER) {
+ stacksave[top++] = YYSTATE;
yy_pop_state(yyg);
+ }
- if (YYSTATE != JSON)
- internal_error("end_of_json_unquote called in wrong scanner state");
+ yy_pop_state(yyg);
+
+ if (top-- > 0) {
+ while (top > 0)
+ yy_push_state(stacksave[--top], yyg);
+ }
}
val source_loc(val form)