/* Copyright 2009-2024 * Kaz Kylheku * Vancouver, Canada * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ %{ #include #include #include #include #include #include #include #include "config.h" #include "lib.h" #include "gc.h" #include "stream.h" #include "utf8.h" #include "signal.h" #include "unwind.h" #include "hash.h" #include "y.tab.h" #include "parser.h" #include "txr.h" #define YY_INPUT(buf, result, max_size) \ do { \ val self = lit("parser"); \ val n = get_bytes(self, yyextra->stream, \ coerce(mem_t *, buf), max_size); \ result = c_num(n, self); \ } while (0) #define YY_DECL \ static int yylex_impl(YYSTYPE *yylval_param, yyscan_t yyscanner) #define YY_FATAL_ERROR(msg) lex_irrecoverable_error(msg) int opt_loglevel = 1; /* 0 - quiet; 1 - normal; 2 - verbose */ val form_to_ln_hash; static int directive_tok(scanner_t *yyg, int tok, int state); #define FLEX_NUM_VERSION 10000*YY_FLEX_MAJOR_VERSION + \ 100*YY_FLEX_MINOR_VERSION + \ YY_FLEX_SUBMINOR_VERSION #if FLEX_NUM_VERSION < 20509 int yylex_destroy(void) { return 0; } #endif /* Missing prototypes not generated by flex. */ int yyget_column(yyscan_t); void yyset_column (int column_no, yyscan_t yyscanner); /* The following function is all that libflex provides. By providing it here, we eliminate the need to link libflex. */ #define YY_SKIP_YYWRAP INLINE int yywrap(yyscan_t scanner) { (void) scanner; return 1; } void yyerror(scanner_t *scanner, parser_t *parser, const char *s) { yyerrorf(scanner, lit("~a"), string_utf8(s), nao); if (parser->prepared_msg) { yyerrorf(scanner, lit("~a"), parser->prepared_msg, nao); parser->prepared_msg = nil; } } void yyerrorf(scanner_t *scanner, val fmt, ...) { parser_t *parser = yyget_extra(scanner); if (opt_loglevel >= 1) { va_list vl; va_start (vl, fmt); if (opt_compat && opt_compat <= 114) format(std_error, lit("~a: (~a:~d): "), prog_string, parser->name, num(parser->lineno), nao); else format(std_error, lit("~a:~d: "), parser->name, num(parser->lineno), nao); vformat(std_error, fmt, vl); put_char(chr('\n'), std_error); va_end (vl); } parser->errors++; } static void yyerrprepf(scanner_t *scanner, val fmt, ...) { parser_t *parser = yyget_extra(scanner); if (opt_loglevel >= 1) { va_list vl; va_start (vl, fmt); set(mkloc(parser->prepared_msg, parser->parser), vformat_to_string(fmt, vl)); va_end (vl); } } static void lex_irrecoverable_error(const char *msg8) { val msg = string_utf8(msg8); uw_throwf(error_s, lit("error in parser: ~a"), msg, nao); } static void out_of_range_float(scanner_t *scanner, const char *tok) { yyerrorf(scanner, lit("out-of-range floating-point literal: ~a"), string_utf8(tok), nao); } static wchar_t char_esc(int letter) { switch (letter) { case ' ': return L' '; case 'a': return L'\a'; case 'b': return L'\b'; case 't': return L'\t'; case 'n': return L'\n'; case 'v': return L'\v'; case 'f': return L'\f'; case 'r': return L'\r'; case 'e': return 27; case '"': return L'"'; case '\'': return L'\''; case '`': return L'`'; case '/': return L'/'; case '@': return L'@'; case '\\': return L'\\'; } internal_error("unhandled escape character"); } static wchar_t num_esc(scanner_t *scn, char *num) { long val = 0; if (num[0] == 'x' || num[0] == 'u') { if (strlen(num) > 7) yyerror(scn, yyget_extra(scn), "too many digits in hex character escape"); else val = strtol(num + 1, 0, 16); } else { if (num[0] == 'o') num++; if (strlen(num) > 8) yyerror(scn, yyget_extra(scn), "too many digits in octal character escape"); else val = strtol(num, 0, 8); } if (val < 0 || val > 0x10FFFF || convert(wchar_t, val) != val) { yyerror(scn, yyget_extra(scn), "numeric character escape out of range"); val = 0; } return val; } static wchar_t *unicode_ident(scanner_t *scn, const char *lex) { wchar_t *wlex = utf8_dup_from(lex), *ptr = wlex, wch; while ((wch = *ptr++)) { if (wch < 0x1680 || (wch >= 0x3000 && wch < 0xdc00)) continue; if ((wch >= 0xdc00 && wch <= 0xdcff) || (wch >= 0xd800 && wch <= 0xdbff) || #if FULL_UNICODE (wch >= 0xf0000 && wch <= 0xffffd) || (wch >= 0x100000 && wch <= 0x10fffd) || #endif (wch >= 0xe000 && wch <= 0xf8ff) || (wch == 0xfffe) || (wch == 0xffff)) { yyerror(scn, yyget_extra(scn), "disallowed Unicode character in identifier"); break; } switch (wch) { case 0x1680: case 0x180e: case 0x2000: case 0x2001: case 0x2002: case 0x2003: case 0x2004: case 0x2005: case 0x2006: case 0x2007: case 0x2008: case 0x2009: case 0x200a: case 0x2028: case 0x2029: case 0x205f: case 0x3000: yyerror(scn, yyget_extra(scn), "Unicode space occurs in identifier"); break; default: continue; } break; } return wlex; } static char *remove_char(char *str, int c) { char *dst = str, *src = str; while (*src) { int ch = *src++; if (ch != c) *dst++ = ch; } *dst = 0; return str; } %} %option stack noinput reentrant bison-bridge extra-type="parser_t *" %option never-interactive TOK [a-zA-Z0-9_]+ SGN [+\-] EXP [eE][+\-]?[0-9]+ DIG [0-9] DIG19 [1-9] DIGSEP {DIG}({DIG}|,)*{DIG}|{DIG} XDIG [0-9A-Fa-f] XDIGSEP {XDIG}({XDIG}|,)*{XDIG}|{XDIG} NUM {SGN}?{DIG}+ NUMSEP {SGN}?{DIGSEP} FLO {SGN}?({DIG}*[.]{DIG}+{EXP}?|{DIG}+[.]?{EXP}) FLOSEP {SGN}?({DIGSEP}*[.]{DIGSEP}+{EXP}?|{DIGSEP}+[.]?{EXP}) FLODOT {SGN}?{DIGSEP}+[.] DOTFLO [.]{DIG}+ XNUM #x{SGN}?{XDIG}+ XNUMSEP #x{SGN}?{XDIGSEP} ODIG [0-7] ODIGSEP {ODIG}({ODIG}|,)*{ODIG}|{ODIG} BDIG [01] BDIGSEP {BDIG}({BDIG}|,)*{BDIG}|{BDIG} ONUM #o{SGN}?{ODIG}+ ONUMSEP #o{SGN}?{ODIGSEP} BNUM #b{SGN}?{BDIG}+ BNUMSEP #b{SGN}?{BDIGSEP} BSCHR ([a-zA-Z0-9!$%&*+\-<=>?\\_~]|{UONLY}) NSCHR ([a-zA-Z0-9!$%&*+\-<=>?\\_~/]|{UONLY}) ID_END [^a-zA-Z0-9!$%&*+\-<=>?\\_~/] EXTRA [#^] BT0 {BSCHR}({BSCHR}|{EXTRA})* BT1 @{BT0}+ BT2 ({BSCHR}|{EXTRA})+ BTREG ({BT0}|{BT1})?:{BT2}?|({BT0}|{BT1})(:{BT2})? BTKWUN @?#?:{BT2}? BTOK {BTREG}|{BTKWUN} NT0 {NSCHR}({NSCHR}|{EXTRA})* NT1 @{NT0}+ NT2 ({NSCHR}|{EXTRA})+ NTREG ({NT0}|{NT1})?:{NT2}?|({NT0}|{NT1})(:{NT2})? NTKWUN @?#?:{NT2}? NTOK {NTREG}|{NTKWUN} WS [\t ]* REQWS [\t ]+ NL (\n|\r|\r\n) HEX [0-9A-Fa-f] OCT [0-7] REGOP [/()|.*?+~&%\[\]\-] ASC [\x00-\x7f] ASCN [\x00-\t\v-\x7f] U [\x80-\xbf] U2 [\xc2-\xdf] U3 [\xe0-\xef] U4 [\xf0-\xf4] UANY {ASC}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} UANYN {ASCN}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} JNUM -?(0|{DIG19}{DIG}*)([.]{DIG}+)?{EXP}? JPUNC [(){},:\[\]"~*^] NJPUNC [^(){},:\[\]"~*^ \t\r\n] %x SPECIAL BRACED NESTED REGEX SREGEX STRLIT CHRLIT %x QSILIT QSPECIAL WLIT QWLIT BUFLIT %x JSON JLIT JMARKER %% {NUM} { wchar_t *wtxt = utf8_dup_from(yytext); if (yy_top_state(yyscanner) == INITIAL || yy_top_state(yyscanner) == QSILIT || yy_top_state(yyscanner) == QWLIT) yy_pop_state(yyscanner); yylval->val = int_str_wc(wtxt, num(10)); free(wtxt); return NUMBER; } {NUMSEP} { wchar_t *wtxt = utf8_dup_from(remove_char(yytext, ',')); if (yy_top_state(yyscanner) == INITIAL || yy_top_state(yyscanner) == QSILIT || yy_top_state(yyscanner) == QWLIT) yy_pop_state(yyscanner); yylval->val = int_str_wc(wtxt, num(10)); free(wtxt); return NUMBER; } {XNUMSEP}|{ONUMSEP}|{BNUMSEP} { wchar_t *wtxt = utf8_dup_from(remove_char(yytext + 2, ',')); int base; switch (yytext[1]) { case 'x': base = 16; break; case 'o': base = 8; break; case 'b': default: base = 2; break; } if (yy_top_state(yyscanner) == INITIAL || yy_top_state(yyscanner) == QSILIT || yy_top_state(yyscanner) == QWLIT) yy_pop_state(yyscanner); yylval->val = int_str_wc(wtxt, num_fast(base)); free(wtxt); return NUMBER; } ({BNUMSEP}|{ONUMSEP}|{XNUMSEP}){TOK} { int base = 0; val str = string_own(utf8_dup_from(yytext + 2)); switch (yytext[1]) { case 'x': base = 16; break; case 'o': base = 8; break; case 'b': default: base = 2; break; } yyerrorf(yyg, lit("trailing junk in numeric literal: ~a~a~a"), chr(yytext[0]), chr(yytext[1]), str, nao); if (yy_top_state(yyscanner) == INITIAL || yy_top_state(yyscanner) == QSILIT || yy_top_state(yyscanner) == QWLIT) yy_pop_state(yyscanner); yylval->val = int_str(str, num_fast(base)); return NUMBER; } {WS}{FLO} { if (yy_top_state(yyscanner) == INITIAL || yy_top_state(yyscanner) == QSILIT || yy_top_state(yyscanner) == QWLIT) yy_pop_state(yyscanner); if ((yylval->val = flo_str_utf8(yytext)) == nil) out_of_range_float(yyg, yytext); return NUMBER; } {WS}{FLOSEP} { if (yy_top_state(yyscanner) == INITIAL || yy_top_state(yyscanner) == QSILIT || yy_top_state(yyscanner) == QWLIT) yy_pop_state(yyscanner); remove_char(yytext, ','); if ((yylval->val = flo_str_utf8(yytext)) == nil) out_of_range_float(yyg, yytext); return NUMBER; } {WS}({FLOSEP}[.]?|{FLODOT}){TOK} | {WS}({FLOSEP}[.]?|{FLODOT}){BTOK} | {WS}({FLOSEP}[.]?|{FLODOT}){NTOK} { val str = string_utf8(yytext); yyerrorf(yyg, lit("trailing junk in floating-point literal: ~a"), str, nao); if (yy_top_state(yyscanner) == INITIAL || yy_top_state(yyscanner) == QSILIT || yy_top_state(yyscanner) == QWLIT) yy_pop_state(yyscanner); if ((yylval->val = flo_str_utf8(yytext)) == nil) out_of_range_float(yyg, yytext); return NUMBER; } {FLODOT}/[^.] { if (yy_top_state(yyscanner) == INITIAL || yy_top_state(yyscanner) == QSILIT || yy_top_state(yyscanner) == QWLIT) yy_pop_state(yyscanner); if ((yylval->val = flo_str_utf8(yytext)) == nil) out_of_range_float(yyg, yytext); return NUMBER; } @{NUM} | {NUM} { wchar_t *wtxt = utf8_dup_from(yytext + 1); if (yy_top_state(yyscanner) == INITIAL || yy_top_state(yyscanner) == QSILIT || yy_top_state(yyscanner) == QWLIT) yy_pop_state(yyscanner); yylval->val = int_str_wc(wtxt, num(10)); free(wtxt); return METANUM; } @{XNUM} | {XNUM} { wchar_t *wtxt = utf8_dup_from(yytext + 3); if (yy_top_state(yyscanner) == INITIAL || yy_top_state(yyscanner) == QSILIT || yy_top_state(yyscanner) == QWLIT) yy_pop_state(yyscanner); yylval->val = int_str_wc(wtxt, num(16)); free(wtxt); return METANUM; } @{ONUM} | {ONUM} { wchar_t *wtxt = utf8_dup_from(yytext + 3); if (yy_top_state(yyscanner) == INITIAL || yy_top_state(yyscanner) == QSILIT || yy_top_state(yyscanner) == QWLIT) yy_pop_state(yyscanner); yylval->val = int_str_wc(wtxt, num(8)); free(wtxt); return METANUM; } @{BNUM} { wchar_t *wtxt = utf8_dup_from(yytext + 3); if (yy_top_state(yyscanner) == INITIAL || yy_top_state(yyscanner) == QSILIT || yy_top_state(yyscanner) == QWLIT) yy_pop_state(yyscanner); yylval->val = int_str_wc(wtxt, num(2)); free(wtxt); return METANUM; } {TOK}/{DOTFLO} | {BTOK}/{DOTFLO} | {NTOK}/{DOTFLO} { yyerrorf(yyg, lit("cramped floating-point literal: " "space needed between ~a and dot."), string_own(utf8_dup_from(yytext)), nao); if (yy_top_state(yyscanner) == INITIAL || yy_top_state(yyscanner) == QSILIT || yy_top_state(yyscanner) == QWLIT) yy_pop_state(yyscanner); yylval->lexeme = unicode_ident(yyg, yytext); return SYMTOK; } {TOK} | {BTOK} | {NTOK} { if (yy_top_state(yyscanner) == INITIAL || yy_top_state(yyscanner) == QSILIT || yy_top_state(yyscanner) == QWLIT) yy_pop_state(yyscanner); yylval->lexeme = unicode_ident(yyg, yytext); return SYMTOK; } {BTOK}{BTOK} | {NTOK}{NTOK} { if (yy_top_state(yyscanner) == INITIAL || yy_top_state(yyscanner) == QSILIT || yy_top_state(yyscanner) == QWLIT) yy_pop_state(yyscanner); yyerrorf(yyg, lit("bad token: ~a"), string_own(utf8_dup_from(yytext)), nao); yylval->lexeme = unicode_ident(yyg, yytext); return SYMTOK; } \({WS}({NT0}?:)?all{WS}\) { return directive_tok(yyg, ALL, 0); } \({WS}({NT0}?:)?some/{ID_END} { return directive_tok(yyg, SOME, NESTED); } \({WS}({NT0}?:)?none{WS}\) { return directive_tok(yyg, NONE, 0); } \({WS}({NT0}?:)?maybe{WS}\) { return directive_tok(yyg, MAYBE, 0); } \({WS}({NT0}?:)?cases{WS}\) { return directive_tok(yyg, CASES, 0); } \({WS}({NT0}?:)?block/{ID_END} { return directive_tok(yyg, BLOCK, NESTED); } \({WS}({NT0}?:)?choose/{ID_END} { return directive_tok(yyg, CHOOSE, NESTED); } \({WS}({NT0}?:)?gather/{ID_END} { return directive_tok(yyg, GATHER, NESTED); } \({WS}({NT0}?:)?and{WS}\) { return directive_tok(yyg, AND, 0); } \({WS}({NT0}?:)?or{WS}\) { return directive_tok(yyg, OR, 0); } \({WS}({NT0}?:)?end{WS}\) { return directive_tok(yyg, END, 0); } \({WS}({NT0}?:)?collect/{ID_END} { return directive_tok(yyg, COLLECT, NESTED); } \({WS}({NT0}?:)?coll/{ID_END} { return directive_tok(yyg, COLL, NESTED); } \({WS}({NT0}?:)?until/{ID_END} { return directive_tok(yyg, UNTIL, NESTED); } \({WS}({NT0}?:)?output/{ID_END} { return directive_tok(yyg, OUTPUT, NESTED); } \({WS}({NT0}?:)?repeat/{ID_END} { return directive_tok(yyg, REPEAT, NESTED); } \({WS}({NT0}?:)?push/{ID_END} { return directive_tok(yyg, PUSH, NESTED); } \({WS}({NT0}?:)?rep/{ID_END} { return directive_tok(yyg, REP, NESTED); } \({WS}({NT0}?:)?single{WS}\) { return directive_tok(yyg, SINGLE, 0); } \({WS}({NT0}?:)?first{WS}\) { return directive_tok(yyg, FIRST, 0); } \({WS}({NT0}?:)?last/{ID_END} { return directive_tok(yyg, LAST, NESTED); } \({WS}({NT0}?:)?empty{WS}\) { return directive_tok(yyg, EMPTY, 0); } \({WS}({NT0}?:)?mod/{ID_END} { return directive_tok(yyg, MOD, NESTED); } \({WS}({NT0}?:)?modlast/{ID_END} { return directive_tok(yyg, MODLAST, NESTED); } \({WS}({NT0}?:)?define/{ID_END} { return directive_tok(yyg, DEFINE, NESTED); } \({WS}({NT0}?:)?try{WS}\) { return directive_tok(yyg, TRY, 0); } \({WS}({NT0}?:)?catch/{ID_END} { return directive_tok(yyg, CATCH, NESTED); } \({WS}({NT0}?:)?finally{WS}\) { return directive_tok(yyg, FINALLY, 0); } \({WS}({NT0}?:)?if/{ID_END} { return directive_tok(yyg, IF, NESTED); } \({WS}({NT0}?:)?elif/{ID_END} { return directive_tok(yyg, ELIF, NESTED); } \({WS}({NT0}?:)?else{WS}\) { return directive_tok(yyg, ELSE, 0); } [{] { yy_push_state(BRACED, yyscanner); yylval->lineno = yyextra->lineno; return yytext[0]; } [(\[] { yy_push_state(NESTED, yyscanner); yylval->lineno = yyextra->lineno; return yytext[0]; } @ { yylval->lineno = yyextra->lineno; return (opt_compat && opt_compat <= 248) ? OLD_AT : '@'; } ,[*] { yylval->chr = '*'; return SPLICE; } [,'^] { yylval->chr = yytext[0]; return yytext[0]; } [}] { yy_pop_state(yyscanner); if (yy_top_state(yyscanner) == INITIAL || yy_top_state(yyscanner) == QSILIT || yy_top_state(yyscanner) == QWLIT) yy_pop_state(yyscanner); return yytext[0]; } [)\]]/{DOTFLO} { yyerrorf(yyg, lit("cramped floating-point literal: " "space or 0 needed between ~a and dot."), string_own(utf8_dup_from(yytext)), nao); yy_pop_state(yyscanner); if (yy_top_state(yyscanner) == INITIAL || yy_top_state(yyscanner) == QSILIT || yy_top_state(yyscanner) == QWLIT) yy_pop_state(yyscanner); return yytext[0]; } [)\]}] { yy_pop_state(yyscanner); if (yy_top_state(yyscanner) == INITIAL || yy_top_state(yyscanner) == QSILIT || yy_top_state(yyscanner) == QWLIT) yy_pop_state(yyscanner); return yytext[0]; } {WS} { /* Eat whitespace in directive */ } \" { yy_push_state(STRLIT, yyscanner); return '"'; } #\\ { yy_push_state(CHRLIT, yyscanner); yylval->lineno = yyextra->lineno; return HASH_BACKSLASH; } #b' { yy_push_state(BUFLIT, yyscanner); yylval->lineno = yyextra->lineno; return HASH_B_QUOTE; } #[/] { yy_push_state(REGEX, yyscanner); yylval->lineno = yyextra->lineno; return HASH_SLASH; } ` { yy_push_state(QSILIT, yyscanner); return '`'; } #\" { yy_push_state(WLIT, yyscanner); yylval->lineno = yyextra->lineno; return WORDS; } #\*\" { yy_push_state(WLIT, yyscanner); yylval->lineno = yyextra->lineno; return WSPLICE; } #\` { yy_push_state(QWLIT, yyscanner); yylval->lineno = yyextra->lineno; return QWORDS; } #\*\` { yy_push_state(QWLIT, yyscanner); yylval->lineno = yyextra->lineno; return QWSPLICE; } # { return '#'; } #H { yylval->lineno = yyextra->lineno; return HASH_H; } #S { yylval->lineno = yyextra->lineno; return HASH_S; } #R { yylval->lineno = yyextra->lineno; return HASH_R; } #N { yylval->lineno = yyextra->lineno; return HASH_N; } #T { yylval->lineno = yyextra->lineno; return HASH_T; } #J { yylval->lineno = yyextra->lineno; yy_push_state(JSON, yyscanner); return HASH_J; } #; { yylval->lineno = yyextra->lineno; return HASH_SEMI; } #{DIG}+= { wchar_t *wtxt = utf8_dup_from(yytext + 1); yylval->val = int_str_wc(wtxt, num(10)); free(wtxt); return HASH_N_EQUALS; } #{DIG}+# { wchar_t *wtxt = utf8_dup_from(yytext + 1); yylval->val = int_str_wc(wtxt, num(10)); free(wtxt); return HASH_N_HASH; } {WS}\.\. { yylval->lineno = yyextra->lineno; return (opt_compat && opt_compat <= 185) ? OLD_DOTDOT : DOTDOT; } @ { yy_pop_state(yyscanner); yylval->lexeme = chk_strdup(L"@"); return TEXT; } {NL} { yyextra->lineno++; } [/] { yy_push_state(REGEX, yyscanner); return '/'; } {REQWS}\.{REQWS} { yylval->chr = '.'; return CONSDOT; } \.{REQWS} { yylval->chr = '.'; return LAMBDOT; } {REQWS}\. { yylval->chr = '.'; return UREFDOT; } \. { yylval->chr = '.'; return '.'; } \.\? { yylval->chr = '.'; return OREFDOT; } {REQWS}\.\? { yylval->chr = '.'; return UOREFDOT; } [\\]{NL}{WS} { if (YYSTATE == SPECIAL) yy_pop_state(yyscanner); /* @\ continuation */ yyextra->lineno++; } [\\][abtnvfre ] { wchar_t lexeme[2]; lexeme[0] = char_esc(yytext[1]); lexeme[1] = 0; yylval->lexeme = chk_strdup(lexeme); yy_pop_state(yyscanner); return TEXT; } [\\](x{HEX}+|{OCT}+);? { wchar_t lexeme[2]; lexeme[0] = num_esc(yyg, yytext + 1); lexeme[1] = 0; yylval->lexeme = chk_strdup(lexeme); { char lastchar = yytext[yyleng-1]; if (lastchar == ';' && opt_compat && opt_compat <= 109) unput(lastchar); } yy_pop_state(yyscanner); return TEXT; } [\\]x { yyerrorf(yyg, lit("\\x escape without digits"), nao); } [\\]. { yyerrorf(yyg, lit("unrecognized escape \\~a"), chr(yytext[1]), nao); } [;][^\n\r]* { /* comment */ } {UANYN} { val ch = chr_str(string_utf8(yytext), zero); if (chr_isspace(ch)) yyerrprepf(yyg, lit("unexpected whitespace character #\\x~,02x"), ch, nao); else if (chr_isunisp(ch)) yyerrprepf(yyg, lit("unexpected Unicode space character #\\x~,02x"), ch, nao); else if (chr_iscntrl(ch)) yyerrprepf(yyg, lit("unexpected control character #\\x~,02x"), ch, nao); else yyerrprepf(yyg, lit("unexpected character #\\~a"), ch, nao); return ERRTOK; } . { yyerrprepf(yyg, lit("non-UTF-8 byte #x~02x in directive"), num(convert(unsigned char, yytext[0])), nao); return ERRTOK; } [/] { yylval->chr = '/'; return (YYSTATE == SREGEX) ? REGCHAR : '/'; } [\\][abtnvfre\\ ] { yylval->chr = char_esc(yytext[1]); return REGCHAR; } [\\](x{HEX}+|{OCT}+);? { yylval->chr = num_esc(yyg, yytext + 1); return REGCHAR; } [\\][sSdDwW] { yylval->chr = yytext[1]; return REGTOKEN; } {WS}[\\]{NL}{WS} { yyextra->lineno++; } {NL} { yyextra->lineno++; yyerrprepf(yyg, lit("newline in regex"), nao); return ERRTOK; } {NL} { yyextra->lineno++; yylval->chr = yytext[0]; return REGCHAR; } {REGOP} { yylval->chr = yytext[0]; return yytext[0]; } [\\]{REGOP} { yylval->chr = yytext[1]; return REGCHAR; } [\\]. { if (opt_compat && opt_compat <= 105) { yylval->chr = yytext[1]; return REGCHAR; } if (yytext[1] == 'x') yyerrprepf(yyg, lit("\\x escape without digits in regex"), nao); else yyerrprepf(yyg, lit("unrecognized escape in regex"), nao); return ERRTOK; } [\\] { yyerrprepf(yyg, lit("dangling backslash in regex"), nao); return ERRTOK; } {UANYN} { wchar_t wchr[8]; if (utf8_from_buf(wchr, coerce(unsigned char *, yytext), yyleng) != 2) { yylval->lexeme = chk_strdup(wchr); return TEXT; } yylval->chr = wchr[0]; return REGCHAR; } . { yylval->chr = convert(unsigned char, yytext[0]) + 0xDC00; return REGCHAR; } [ ]+ { yylval->lexeme = utf8_dup_from(yytext); return SPACE; } ({UONLY}|[^@\n\r ])+ { yylval->lexeme = utf8_dup_from(yytext); return TEXT; } {NL} { yyextra->lineno++; return '\n'; } @{WS}\* { yy_push_state(SPECIAL, yyscanner); return '*'; } @ { yy_push_state(SPECIAL, yyscanner); } ^@{WS}*[#;].*{NL} { /* eat whole line comment */ yyextra->lineno++; } @{WS}*[#;].* { /* comment to end of line */ } \" { yy_pop_state(yyscanner); return yytext[0]; } \` { yy_pop_state(yyscanner); return yytext[0]; } [\\][abtnvfre "`'\\ ] { yylval->chr = char_esc(yytext[1]); return LITCHAR; } [\\]@ { yylval->chr = char_esc(yytext[1]); return LITCHAR; } {WS}[\\]{NL}{WS} { yyextra->lineno++; } {WS}[\\]{NL}{WS} { yyextra->lineno++; if (!opt_compat || opt_compat > 109) return ' '; } [\\](x{HEX}+|{OCT}+);? { yylval->chr = num_esc(yyg, yytext+1); return LITCHAR; } [\\]x { yyerrorf(yyg, lit("\\x escape without digits"), nao); } [\\]. { yyerrorf(yyg, lit("unrecognized escape: \\~a"), chr(yytext[1]), nao); } (x{HEX}+|o{OCT}+) { yylval->chr = num_esc(yyg, yytext); return LITCHAR; } {TOK} { yylval->lexeme = utf8_dup_from(yytext); return SYMTOK; } [^ \t\n\r] { yylval->lexeme = utf8_dup_from(yytext); return SYMTOK; /* hack */ } {NL} { yyerrprepf(yyg, lit("newline in string literal"), nao); yyextra->lineno++; yylval->chr = yytext[0]; return ERRTOK; } {NL} { yyerrprepf(yyg, lit("newline in character literal"), nao); yyextra->lineno++; yylval->chr = yytext[0]; return ERRTOK; } {NL} { yyerrprepf(yyg, lit("newline in string quasiliteral"), nao); yyextra->lineno++; yylval->chr = yytext[0]; return ERRTOK; } {NL} { yyextra->lineno++; if (opt_compat && opt_compat <= 109) return ' '; yyerrprepf(yyg, lit("newline in word list literal"), nao); yylval->chr = yytext[0]; return ERRTOK; } @/([[({'^,@]|{TOK}) { yy_push_state(QSPECIAL, yyscanner); return yytext[0]; } @ { yyerrprepf(yyg, lit("malformed @ expression in quasiliteral"), nao); return ERRTOK; } {WS} { return ' '; } \" { yy_pop_state(yyscanner); return yytext[0]; } [\\][bfnrt"\\/] { yylval->chr = char_esc(yytext[1]); return LITCHAR; } [\\]u[Dd][8-9A-Fa-f]{HEX}{2}[\\]u[Dd][C-Fc-f]{HEX}{2} { wchar_t ch0, ch1; yytext[6] = 0; ch0 = num_esc(yyg, yytext + 1); ch1 = num_esc(yyg, yytext + 7); yylval->chr = ((ch0 - 0xD800) << 10 | (ch1 - 0xDC00)) + 0x10000; return LITCHAR; } [\\]u{HEX}{4} { wchar_t ch = num_esc(yyg, yytext + 1); yylval->chr = if3(ch, ch, 0xDC00); return LITCHAR; } [\\]u { yyerrorf(yyg, lit("JSON \\u escape needs four digits"), nao); } [\\]. { yyerrorf(yyg, lit("unrecognized JSON escape: \\~a"), chr(yytext[1]), nao); } {NL} { yyerrprepf(yyg, lit("newline in JSON string"), nao); yyextra->lineno++; yylval->chr = yytext[0]; return ERRTOK; } {UANYN} { wchar_t wchr[8]; if (utf8_from_buf(wchr, coerce(unsigned char *, yytext), yyleng) != 2) { yylval->lexeme = chk_strdup(wchr); return TEXT; } yylval->chr = wchr[0]; return LITCHAR; } {HEX} { yylval->chr = strtol(yytext, 0, 16); return LITCHAR; } ' { return '\''; } {WS} { } {NL} { yyextra->lineno++; } . { yyerrorf(yyg, lit("bad character ~s in buffer literal"), chr(yytext[0]), nao); } . { yylval->chr = convert(unsigned char, yytext[0]) + 0xDC00; return LITCHAR; } {JNUM} { if (yyextra->read_json_int && !strpbrk(yytext, ".eE")) { wchar_t *wtxt = utf8_dup_from(yytext); yylval->val = int_str_wc(wtxt, num(10)); free(wtxt); } else if ((yylval->val = flo_str_utf8(yytext)) == nil) { out_of_range_float(yyg, yytext); } return NUMBER; } true/({JPUNC}|[ \t\n]) { yylval->val = t; return JSKW; } false/({JPUNC}|[ \t\n]) { yylval->val = nil; return JSKW; } null/({JPUNC}|[ \t\n]) { yylval->val = null_s; return JSKW; } {NJPUNC}+ { if (strcmp("true", yytext) == 0) { yylval->val = t; return JSKW; } if (strcmp("false", yytext) == 0) { yylval->val = nil; return JSKW; } if (strcmp("null", yytext) == 0) { yylval->val = null_s; return JSKW; } { val str = string_own(utf8_dup_from(yytext)); yyerrorf(yyg, lit("unrecognized JSON syntax: ~a"), str, nao); } } \" { yy_push_state(JLIT, yyscanner); return yytext[0]; } ~[*] { yy_push_state(JMARKER, yyscanner); yy_push_state(NESTED, yyscanner); return JSPLICE; } ~ { yy_push_state(JMARKER, yyscanner); yy_push_state(NESTED, yyscanner); return yytext[0]; } {JPUNC} { return yytext[0]; } {NL} { yyextra->lineno++; } {WS} { } . { yyerrorf(yyg, lit("bad character ~s in JSON literal"), chr(yytext[0]), nao); } . { internal_error("scanner processed input JMARKER state"); } %% static int directive_tok(scanner_t *yyscanner, int tok, int state) { struct yyguts_t *yyg = convert(struct yyguts_t *, yyscanner); char *pstart = yytext + 1 + strspn(yytext + 1, " \t"); char *pcolon = strchr(pstart, ':'); char *pend = pstart + strspn(pstart, ":-abcdefghijklmnopqrstuvwxyz"); *pend = 0; if (pcolon != 0) { val pkgname = string_utf8((*pcolon = 0, pstart)); val package = if3(pstart[0], find_package(pkgname), keyword_package); if (!package) { yyerrprepf(yyg, lit("package ~a not found"), pkgname, nao); tok = ERRTOK; } if (package != user_package && package != keyword_package) { val sym = string_utf8(pcolon + 1); yyerrprepf(yyg, lit("~a:~a: original usr package expected, not ~a"), pkgname, sym, pkgname, nao); tok = ERRTOK; } } else { val symname = string_utf8(pstart); val sym = intern_fallback(symname, cur_package); val package = symbol_package(sym); if (package != user_package && package != keyword_package) { yyerrprepf(yyg, lit("~a: this is ~a:~a, not usr:~a"), symname, package_name(package), symname, symname, nao); tok = ERRTOK; } } if (state != 0) yy_push_state(state, yyscanner); else yy_pop_state(yyscanner); yylval->lineno = yyextra->lineno; return tok; } void end_of_regex(scanner_t *yyg) { if (YYSTATE != REGEX && YYSTATE != SREGEX) internal_error("end_of_regex called in wrong scanner state"); yy_pop_state(yyg); if (YYSTATE != INITIAL) { if (yy_top_state(yyg) == INITIAL || yy_top_state(yyg) == QSILIT || yy_top_state(yyg) == QWLIT) yy_pop_state(yyg); } } void end_of_char(scanner_t *yyg) { if (YYSTATE != CHRLIT) internal_error("end_of_char called in wrong scanner state"); yy_pop_state(yyg); } void end_of_buflit(scanner_t *yyg) { if (YYSTATE != BUFLIT) internal_error("end_of_buflit called in wrong scanner state"); yy_pop_state(yyg); } void end_of_json(scanner_t *yyg) { if (YYSTATE == JLIT) yy_pop_state(yyg); if (YYSTATE != JSON) internal_error("end_of_json called in wrong scanner state"); yy_pop_state(yyg); } /* The complexity here is necessary because TXR Lisp parsing looks ahead * by one token. (The reason for *that* is the support of a.b.c referencing dot * syntax in TXR Lisp.) * * Consider these two different cases: * * ^#J[,~(+ 2.0 2.0)] * ^#J[,~(+ 2.0 2.0) #J42] * * This end_of_json_unquote function gets called when the (+ 2.0 2.0) * has been parsed, but the Yacc-generated parser has shifted one tokan * ahead. It has read the ] token in the one case or the #J token in * the other. These tokens have totally different effects on the Lex * start condition. When the lexer reads the ] token, it pops off a NESTED * state, whereas the #J token wants to push on a new JSON state. * By the time end_of_json_unquote has been called, this has already happened. * * To deal with this, we use the dummy JMARKER start state which serves as a * kind of parenthesis inside the start condition stack. BHefore scanning Lisp * unquote within JSON, we push JMARKER state first, then the NESTED state. * * If the lookahead token is like ], and pops off a state, it will pop off * our NESTED state, so we are left at the JMARKER state. If the lookahead * token is something else like #J (HASH_J), then it will push a new * state like JSON on top, and we have JMARKER NESTED JSON. * * So what we are doing here is popping off everything until we get down * to the JMARKER state, and putting it into our little save area. * * Then we lose the JMARKER state. * * If the save area is empty, it means that the lookahead token consumed * our NESTED state, and so we are done. * * If the save area is not empty, it means the lookahead put something * extra over our NESTED state. We drop that state from our save area, * and restore the rest of the save area back into the stack. * Effectively, we are deleting the unquote-related states from the * interior of the start condition stack, not to disturb new material * initiated by the lookahead token. */ void end_of_json_unquote(scanner_t *yyg) { int stacksave[8]; int top = 0; while (YYSTATE != JMARKER) { stacksave[top++] = YYSTATE; yy_pop_state(yyg); } yy_pop_state(yyg); if (top-- > 0) { while (top > 0) yy_push_state(stacksave[--top], yyg); } } val source_loc(val form) { return gethash(form_to_ln_hash, form); } val source_loc_str(val form, val alt) { cons_bind (line, file, gethash(form_to_ln_hash, form)); if (missingp(alt)) alt = lit("source location n/a"); return if3(line, format(nil, lit("~a:~d"), file, line, nao), alt); } int yylex(YYSTYPE *yylval_param, yyscan_t yyscanner) { struct yyguts_t * yyg = convert(struct yyguts_t *, yyscanner); int yy_char; if (yyextra->tok_idx > 0) { struct yy_token *tok = &yyextra->tok_pushback[--yyextra->tok_idx]; yyextra->recent_tok = *tok; *yylval_param = tok->yy_lval; if (tok->yy_lex_state && tok->yy_lex_state != YYSTATE) yy_push_state(tok->yy_lex_state, yyg); return tok->yy_char; } yy_char = yyextra->recent_tok.yy_char = yylex_impl(yylval_param, yyscanner); yyextra->recent_tok.yy_lval = *yylval_param; yyextra->recent_tok.yy_lex_state = YYSTATE; return yy_char; } void prime_scanner(scanner_t *yyg, enum prime_parser prim) { while (YYSTATE != INITIAL) yy_pop_state(yyg); switch (prim) { case prime_lisp: case prime_interactive: yy_push_state(SPECIAL, yyg); yy_push_state(NESTED, yyg); yy_push_state(NESTED, yyg); break; case prime_regex: yy_push_state(SREGEX, yyg); break; case prime_json: yy_push_state(JSON, yyg); break; } } void scrub_scanner(scanner_t *yyg, int yy_char, wchar_t *lexeme) { struct yy_token *rtok = &yyextra->recent_tok; if (rtok->yy_char == yy_char && rtok->yy_lval.lexeme == lexeme) { rtok->yy_char = 0; rtok->yy_lval.lexeme = 0; } } void parser_l_init(void) { prot1(&form_to_ln_hash); form_to_ln_hash = make_eq_hash(hash_weak_keys); (void) &yy_fatal_error; /* suppress unused function warning */ }