diff options
-rw-r--r-- | ChangeLog | 24 | ||||
-rw-r--r-- | parser.l | 954 | ||||
-rw-r--r-- | parser.y | 36 | ||||
-rw-r--r-- | txr.1 | 6 |
4 files changed, 547 insertions, 473 deletions
@@ -1,3 +1,27 @@ +2012-02-26 Kaz Kylheku <kaz@kylheku.com> + + Bug #35625 + + * parser.l (BSCHR, BSYM, BTOK): New lexical definitions. + (BRACED): New state. + (grammar): Refactored so that braced variables are now handled + in the BRACED state, allowing for lexical differences between + braced variables and Lisp. This allows us to have + the /regex/ syntax in braces, but /regex/ is just a symbol + in the Lisp. The new #/ token is recognized and returned + as HASH_SLASH. All rules reformatted to a more easily + maintainable convention. + + * parser.y (HASH_SLASH): New token. + (modifiers, lisp_regex): New nonterminals. + (var): Grammar changed to use modifiers nonterminal instead of exprs. + (var_op): Rule moved closer to var. + (expr): Produces lisp_regex rather than regex. + (yybadtoken): Handle HASH_SLASH in the switch statement. + Bugfix: HASH_BACKSLASH was not handled. + + * txr.1: Documented #/regex/ syntax. + 2012-02-25 Kaz Kylheku <kaz@kylheku.com> * arith.c: Updated copyright year. 
@@ -150,10 +150,13 @@ static wchar_t num_esc(char *num) SYM [a-zA-Z0-9_]+ NUM [+\-]?[0-9]+ -NSCHR [a-zA-Z0-9!$%&*+\-<=>?\\^_~] +BSCHR [a-zA-Z0-9!$%&*+\-<=>?\\^_~] +BSYM {BSCHR}({BSCHR}|#)* +NSCHR [a-zA-Z0-9!$%&*+\-<=>?\\^_~/] NSYM {NSCHR}({NSCHR}|#)* TOK :?{SYM} ATNUM @{NUM} +BTOK [:@]?{BSYM} NTOK [:@]?{NSYM} ID_END [^a-zA-Z0-9_] WS [\t ]* @@ -171,497 +174,520 @@ UANY {ASC}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} UANYN {ASCN}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} -%x SPECIAL NESTED REGEX STRLIT CHRLIT QSILIT +%x SPECIAL BRACED NESTED REGEX STRLIT CHRLIT QSILIT %% -<SPECIAL,NESTED>{NUM} { - val str = string_own(utf8_dup_from(yytext)); - - if (yy_top_state() == INITIAL - || yy_top_state() == QSILIT) - yy_pop_state(); - - yylval.num = int_str(str, num(10)); - return NUMBER; - } - -<NESTED,QSILIT>{ATNUM} { - val str = string_own(utf8_dup_from(yytext + 1)); - - if (yy_top_state() == INITIAL - || yy_top_state() == QSILIT) - yy_pop_state(); - yylval.num = int_str(str, num(10)); - return METANUM; - } - -<SPECIAL>{TOK} | -<NESTED>{NTOK} { - if (yy_top_state() == INITIAL - || yy_top_state() == QSILIT) - yy_pop_state(); - - switch (yytext[0]) { - case ':': - yylval.lexeme = utf8_dup_from(yytext + 1); - return KEYWORD; - case '@': - yylval.lexeme = utf8_dup_from(yytext + 1); - return METAVAR; - default: - yylval.lexeme = utf8_dup_from(yytext); - return IDENT; - } - } -<NESTED>: { - if (yy_top_state() == INITIAL - || yy_top_state() == QSILIT) - yy_pop_state(); - yylval.lexeme = utf8_dup_from(""); - return KEYWORD; - } - -<SPECIAL>\({WS}all{WS}\) { - yy_pop_state(); - yylval.lineno = lineno; - return ALL; - } - -<SPECIAL>\({WS}some/{ID_END} { - yy_push_state(NESTED); - yylval.lineno = lineno; - return SOME; - } - -<SPECIAL>\({WS}none{WS}\) { - yy_pop_state(); - yylval.lineno = lineno; - return NONE; - } - -<SPECIAL>\({WS}maybe{WS}\) { - yy_pop_state(); - yylval.lineno = lineno; - return MAYBE; - } - -<SPECIAL>\({WS}cases{WS}\) { - yy_pop_state(); - 
yylval.lineno = lineno; - return CASES; - } - -<SPECIAL>\({WS}choose/{ID_END} { - yy_push_state(NESTED); - yylval.lineno = lineno; - return CHOOSE; - } - -<SPECIAL>\({WS}gather/{ID_END} { - yy_push_state(NESTED); - yylval.lineno = lineno; - return GATHER; - } - -<SPECIAL>\({WS}and{WS}\) { - yy_pop_state(); - yylval.lineno = lineno; - return AND; - } - -<SPECIAL>\({WS}or{WS}\) { - yy_pop_state(); - yylval.lineno = lineno; - return OR; - } - -<SPECIAL>\({WS}end{WS}\) { - yy_pop_state(); - yylval.lineno = lineno; - return END; - } +<SPECIAL,NESTED,BRACED>{NUM} { + val str = string_own(utf8_dup_from(yytext)); + + if (yy_top_state() == INITIAL + || yy_top_state() == QSILIT) + yy_pop_state(); + + yylval.num = int_str(str, num(10)); + return NUMBER; +} + +<NESTED,QSILIT>{ATNUM} { + val str = string_own(utf8_dup_from(yytext + 1)); + + if (yy_top_state() == INITIAL + || yy_top_state() == QSILIT) + yy_pop_state(); + yylval.num = int_str(str, num(10)); + return METANUM; +} + +<SPECIAL>{TOK} | +<BRACED>{BTOK} | +<NESTED>{NTOK} { + if (yy_top_state() == INITIAL + || yy_top_state() == QSILIT) + yy_pop_state(); + + switch (yytext[0]) { + case ':': + yylval.lexeme = utf8_dup_from(yytext + 1); + return KEYWORD; + case '@': + yylval.lexeme = utf8_dup_from(yytext + 1); + return METAVAR; + default: + yylval.lexeme = utf8_dup_from(yytext); + return IDENT; + } +} + +<BRACED,NESTED>: { + if (yy_top_state() == INITIAL + || yy_top_state() == QSILIT) + yy_pop_state(); + yylval.lexeme = utf8_dup_from(""); + return KEYWORD; +} + +<SPECIAL>\({WS}all{WS}\) { + yy_pop_state(); + yylval.lineno = lineno; + return ALL; +} + +<SPECIAL>\({WS}some/{ID_END} { + yy_push_state(NESTED); + yylval.lineno = lineno; + return SOME; +} + +<SPECIAL>\({WS}none{WS}\) { + yy_pop_state(); + yylval.lineno = lineno; + return NONE; +} + +<SPECIAL>\({WS}maybe{WS}\) { + yy_pop_state(); + yylval.lineno = lineno; + return MAYBE; +} + +<SPECIAL>\({WS}cases{WS}\) { + yy_pop_state(); + yylval.lineno = lineno; + return CASES; 
+} + +<SPECIAL>\({WS}choose/{ID_END} { + yy_push_state(NESTED); + yylval.lineno = lineno; + return CHOOSE; +} + +<SPECIAL>\({WS}gather/{ID_END} { + yy_push_state(NESTED); + yylval.lineno = lineno; + return GATHER; +} + +<SPECIAL>\({WS}and{WS}\) { + yy_pop_state(); + yylval.lineno = lineno; + return AND; +} + +<SPECIAL>\({WS}or{WS}\) { + yy_pop_state(); + yylval.lineno = lineno; + return OR; +} + +<SPECIAL>\({WS}end{WS}\) { + yy_pop_state(); + yylval.lineno = lineno; + return END; +} <SPECIAL>\({WS}collect/{ID_END} { - yy_push_state(NESTED); - yylval.lineno = lineno; - return COLLECT; - } - -<SPECIAL>\({WS}coll/{ID_END} { - yy_push_state(NESTED); - yylval.lineno = lineno; - return COLL; - } - -<SPECIAL>\({WS}until{WS}\) { - yy_pop_state(); - yylval.lineno = lineno; - return UNTIL; - } + yy_push_state(NESTED); + yylval.lineno = lineno; + return COLLECT; +} + +<SPECIAL>\({WS}coll/{ID_END} { + yy_push_state(NESTED); + yylval.lineno = lineno; + return COLL; +} + +<SPECIAL>\({WS}until{WS}\) { + yy_pop_state(); + yylval.lineno = lineno; + return UNTIL; +} <SPECIAL>\({WS}output/{ID_END} { - yy_push_state(NESTED); - yylval.lineno = lineno; - return OUTPUT; - } + yy_push_state(NESTED); + yylval.lineno = lineno; + return OUTPUT; +} <SPECIAL>\({WS}repeat/{ID_END} { - yy_push_state(NESTED); - yylval.lineno = lineno; - return REPEAT; - } - - -<SPECIAL>\({WS}rep/{ID_END} { - yy_push_state(NESTED); - yylval.lineno = lineno; - return REP; - } - -<SPECIAL>\({WS}single{WS}\) { - yy_pop_state(); - yylval.lineno = lineno; - return SINGLE; - } - -<SPECIAL>\({WS}first{WS}\) { - yy_pop_state(); - yylval.lineno = lineno; - return FIRST; - } - -<SPECIAL>\({WS}last{WS}\) { - yy_pop_state(); - yylval.lineno = lineno; - return LAST; - } - -<SPECIAL>\({WS}empty{WS}\) { - yy_pop_state(); - yylval.lineno = lineno; - return EMPTY; - } - -<SPECIAL>\({WS}mod/{ID_END} { - yy_push_state(NESTED); - yylval.lineno = lineno; - return MOD; - } + yy_push_state(NESTED); + yylval.lineno = lineno; + return 
REPEAT; +} + + +<SPECIAL>\({WS}rep/{ID_END} { + yy_push_state(NESTED); + yylval.lineno = lineno; + return REP; +} + +<SPECIAL>\({WS}single{WS}\) { + yy_pop_state(); + yylval.lineno = lineno; + return SINGLE; +} + +<SPECIAL>\({WS}first{WS}\) { + yy_pop_state(); + yylval.lineno = lineno; + return FIRST; +} + +<SPECIAL>\({WS}last{WS}\) { + yy_pop_state(); + yylval.lineno = lineno; + return LAST; +} + +<SPECIAL>\({WS}empty{WS}\) { + yy_pop_state(); + yylval.lineno = lineno; + return EMPTY; +} + +<SPECIAL>\({WS}mod/{ID_END} { + yy_push_state(NESTED); + yylval.lineno = lineno; + return MOD; +} <SPECIAL>\({WS}modlast/{ID_END} { - yy_push_state(NESTED); - yylval.lineno = lineno; - return MODLAST; - } - -<SPECIAL>\({WS}define/{ID_END} { - yy_push_state(NESTED); - yylval.lineno = lineno; - return DEFINE; - } - -<SPECIAL>\({WS}try{WS}\) { - yy_pop_state(); - yylval.lineno = lineno; - return TRY; - } - -<SPECIAL>\({WS}catch/{ID_END} { - yy_push_state(NESTED); - yylval.lineno = lineno; - return CATCH; - } - -<SPECIAL>\({WS}finally{WS}\) { - yy_pop_state(); - yylval.lineno = lineno; - return FINALLY; - } - -<NESTED>@[\(\[] | -<SPECIAL,NESTED>[{(\[] { - yy_push_state(NESTED); - if (yytext[0] == '@') { - yylval.chr = yytext[1]; - return yytext[1] == '(' ? METAPAR : METABKT; - } - yylval.lineno = lineno; - return yytext[0]; - } - -<SPECIAL,NESTED>,[*] { - yylval.chr = '*'; - return SPLICE; - } - -<SPECIAL,NESTED>[,'] { - yylval.chr = yytext[0]; - return yytext[0]; - } - -<SPECIAL,NESTED>[})\]] { - yy_pop_state(); - if (yy_top_state() == INITIAL - || yy_top_state() == QSILIT) - yy_pop_state(); - return yytext[0]; - } - -<SPECIAL,NESTED>{WS} { /* Eat whitespace in directive */ } - -<SPECIAL,NESTED>\" { - yy_push_state(STRLIT); - return '"'; - } - -<SPECIAL,NESTED>#\\ { - yy_push_state(CHRLIT); - return HASH_BACKSLASH; - } - -<SPECIAL,NESTED>` { - yy_push_state(QSILIT); - return '`'; - } - -<NESTED># { - return '#'; - } - -<NESTED>\.\. 
{ - yylval.lineno = lineno; - return DOTDOT; - } - -<SPECIAL>@ { - yy_pop_state(); - yylval.lexeme = chk_strdup(L"@"); - return TEXT; - } - -<SPECIAL,NESTED>\n { - lineno++; - } - -<SPECIAL,NESTED>[/] { - yy_push_state(REGEX); - return '/'; - } - -<SPECIAL,NESTED>\. { - yylval.chr = '.'; - return '.'; - } - -<SPECIAL,NESTED>[\\]\n{WS} { - yy_pop_state(); - lineno++; - } + yy_push_state(NESTED); + yylval.lineno = lineno; + return MODLAST; +} + +<SPECIAL>\({WS}define/{ID_END} { + yy_push_state(NESTED); + yylval.lineno = lineno; + return DEFINE; +} + +<SPECIAL>\({WS}try{WS}\) { + yy_pop_state(); + yylval.lineno = lineno; + return TRY; +} + +<SPECIAL>\({WS}catch/{ID_END} { + yy_push_state(NESTED); + yylval.lineno = lineno; + return CATCH; +} + +<SPECIAL>\({WS}finally{WS}\) { + yy_pop_state(); + yylval.lineno = lineno; + return FINALLY; +} + +<SPECIAL>[{] { + yy_push_state(BRACED); + yylval.lineno = lineno; + return yytext[0]; +} + +<SPECIAL>[(\[] | +<NESTED,BRACED>@?[(\[] { + yy_push_state(NESTED); + if (yytext[0] == '@') { + yylval.chr = yytext[1]; + return yytext[1] == '(' ? 
METAPAR : METABKT; + } + yylval.lineno = lineno; + return yytext[0]; +} + +<NESTED>,[*] { + yylval.chr = '*'; + return SPLICE; +} + +<NESTED>[,'] { + yylval.chr = yytext[0]; + return yytext[0]; +} + +<BRACED>[}] { + yy_pop_state(); + if (yy_top_state() == INITIAL + || yy_top_state() == QSILIT) + yy_pop_state(); + return yytext[0]; +} + +<SPECIAL,NESTED>[)\]] { + yy_pop_state(); + if (yy_top_state() == INITIAL + || yy_top_state() == QSILIT) + yy_pop_state(); + return yytext[0]; +} + +<SPECIAL,NESTED,BRACED>{WS} { + /* Eat whitespace in directive */ +} + +<SPECIAL,NESTED,BRACED>\" { + yy_push_state(STRLIT); + return '"'; +} + +<SPECIAL,NESTED,BRACED>#\\ { + yy_push_state(CHRLIT); + return HASH_BACKSLASH; +} + +<SPECIAL,NESTED,BRACED>#[/] { + yy_push_state(REGEX); + return HASH_SLASH; +} + +<SPECIAL,NESTED,BRACED>` { + yy_push_state(QSILIT); + return '`'; +} + +<NESTED,BRACED># { + return '#'; +} + +<NESTED>\.\. { + yylval.lineno = lineno; + return DOTDOT; +} + +<SPECIAL>@ { + yy_pop_state(); + yylval.lexeme = chk_strdup(L"@"); + return TEXT; +} + +<SPECIAL,NESTED,BRACED>\n { + lineno++; +} + +<SPECIAL,BRACED>[/] { + yy_push_state(REGEX); + return '/'; +} + +<SPECIAL,NESTED>\. { + yylval.chr = '.'; + return '.'; +} + +<SPECIAL,NESTED,BRACED>[\\]\n{WS} { + yy_pop_state(); + lineno++; +} <SPECIAL>[\\][abtnvfre ] { - wchar_t lexeme[2]; - lexeme[0] = char_esc(yytext[1]); - lexeme[1] = 0; - yylval.lexeme = chk_strdup(lexeme); - yy_pop_state(); - return TEXT; - } - -<SPECIAL>[\\](x{HEX}+|{OCT}+) { - wchar_t lexeme[2]; - lexeme[0] = num_esc(yytext + 1); - lexeme[1] = 0; - yylval.lexeme = chk_strdup(lexeme); - yy_pop_state(); - return TEXT; - } - -<SPECIAL,NESTED>[;].* { - /* comment */ - } -<SPECIAL,NESTED>{UANYN} { - yyerrprepf(lit("bad character in directive: '~a'"), - string_utf8(yytext), nao); - return ERRTOK; - } - -<SPECIAL,NESTED>. 
{ - yyerrprepf(lit("non-UTF-8 byte in directive: " - "'\\x~02x'"), - num((unsigned char) yytext[0]), nao); - return ERRTOK; - } - -<REGEX>[/] { - yylval.chr = '/'; - return '/'; - } - - -<REGEX>[\\][abtnvfre\\ ] { - yylval.chr = char_esc(yytext[1]); - return REGCHAR; - } - -<REGEX>[\\](x{HEX}+|{OCT}+);? { - yylval.chr = num_esc(yytext + 1); - return REGCHAR; - } - -<REGEX>{WS}[\\]\n{WS} { - lineno++; - } - -<REGEX>\n { - lineno++; - yyerrprepf(lit("newline in regex"), nao); - return ERRTOK; - } - -<REGEX>[.*?+~&%] { - yylval.chr = yytext[0]; - return yytext[0]; - } + wchar_t lexeme[2]; + lexeme[0] = char_esc(yytext[1]); + lexeme[1] = 0; + yylval.lexeme = chk_strdup(lexeme); + yy_pop_state(); + return TEXT; +} + +<SPECIAL>[\\](x{HEX}+|{OCT}+) { + wchar_t lexeme[2]; + lexeme[0] = num_esc(yytext + 1); + lexeme[1] = 0; + yylval.lexeme = chk_strdup(lexeme); + yy_pop_state(); + return TEXT; +} + +<SPECIAL,NESTED,BRACED>[;].* { + /* comment */ +} +<SPECIAL,NESTED,BRACED>{UANYN} { + yyerrprepf(lit("bad character in directive: '~a'"), + string_utf8(yytext), nao); + return ERRTOK; +} + +<SPECIAL,NESTED,BRACED>. { + yyerrprepf(lit("non-UTF-8 byte in directive: " + "'\\x~02x'"), + num((unsigned char) yytext[0]), nao); + return ERRTOK; +} + +<REGEX>[/] { + yylval.chr = '/'; + return '/'; +} + + +<REGEX>[\\][abtnvfre\\ ] { + yylval.chr = char_esc(yytext[1]); + return REGCHAR; +} + +<REGEX>[\\](x{HEX}+|{OCT}+);? { + yylval.chr = num_esc(yytext + 1); + return REGCHAR; +} + +<REGEX>{WS}[\\]\n{WS} { + lineno++; +} + +<REGEX>\n { + lineno++; + yyerrprepf(lit("newline in regex"), nao); + return ERRTOK; +} + +<REGEX>[.*?+~&%] { + yylval.chr = yytext[0]; + return yytext[0]; +} <REGEX>[\[\]\-] { - yylval.chr = yytext[0]; - return yytext[0]; - } + yylval.chr = yytext[0]; + return yytext[0]; +} -<REGEX>[()|] { - yylval.chr = yytext[0]; - return yytext[0]; - } +<REGEX>[()|] { + yylval.chr = yytext[0]; + return yytext[0]; +} -<REGEX>[\\]. 
{ - yylval.chr = yytext[1]; - return REGCHAR; - } +<REGEX>[\\]. { + yylval.chr = yytext[1]; + return REGCHAR; +} <REGEX>{UANYN} { - wchar_t buf[8]; - utf8_from(buf, yytext); - yylval.chr = buf[0]; - return REGCHAR; - } - -<REGEX>. { - yyerrprepf(lit("non-UTF-8 byte in regex: '\\x~02x'"), - num((unsigned char) yytext[0]), nao); - return ERRTOK; - } - -<INITIAL>[ ]+ { - yylval.lexeme = utf8_dup_from(yytext); - return SPACE; - } - -<INITIAL>({UONLY}|[^@\n ])+ { - yylval.lexeme = utf8_dup_from(yytext); - return TEXT; - } - -<INITIAL>\n { - lineno++; - return '\n'; - } - -<INITIAL>@{WS}\* { - yy_push_state(SPECIAL); - return '*'; - } - -<INITIAL>@ { - yy_push_state(SPECIAL); - } - -<INITIAL>^@[#;].*\n { - /* eat whole line comment */ - lineno++; - } - -<INITIAL>@[#;].* { - /* comment to end of line */ - } - -<STRLIT>\" { - yy_pop_state(); - return yytext[0]; - } - -<QSILIT>` { - yy_pop_state(); - return yytext[0]; - } - -<STRLIT,QSILIT>[\\][abtnvfre"`'\\] { - yylval.chr = char_esc(yytext[1]); - return LITCHAR; - } - -<STRLIT,QSILIT>{WS}[\\]\n{WS} { - lineno++; - } + wchar_t buf[8]; + utf8_from(buf, yytext); + yylval.chr = buf[0]; + return REGCHAR; +} + +<REGEX>. 
{ + yyerrprepf(lit("non-UTF-8 byte in regex: '\\x~02x'"), + num((unsigned char) yytext[0]), nao); + return ERRTOK; +} + +<INITIAL>[ ]+ { + yylval.lexeme = utf8_dup_from(yytext); + return SPACE; +} + +<INITIAL>({UONLY}|[^@\n ])+ { + yylval.lexeme = utf8_dup_from(yytext); + return TEXT; +} + +<INITIAL>\n { + lineno++; + return '\n'; +} + +<INITIAL>@{WS}\* { + yy_push_state(SPECIAL); + return '*'; +} + +<INITIAL>@ { + yy_push_state(SPECIAL); +} + +<INITIAL>^@[#;].*\n { + /* eat whole line comment */ + lineno++; +} + +<INITIAL>@[#;].* { + /* comment to end of line */ +} + +<STRLIT>\" { + yy_pop_state(); + return yytext[0]; +} + +<QSILIT>` { + yy_pop_state(); + return yytext[0]; +} + +<STRLIT,QSILIT>[\\][abtnvfre"`'\\] { + yylval.chr = char_esc(yytext[1]); + return LITCHAR; +} + +<STRLIT,QSILIT>{WS}[\\]\n{WS} { + lineno++; +} <STRLIT,QSILIT>[\\](x{HEX}+|{OCT}+);? { - yylval.chr = num_esc(yytext+1); - return LITCHAR; - } + yylval.chr = num_esc(yytext+1); + return LITCHAR; +} <CHRLIT>(x{HEX}+|o{OCT}+) { - yylval.chr = num_esc(yytext); - return LITCHAR; - } - -<CHRLIT>{SYM} { - yylval.lexeme = utf8_dup_from(yytext); - return IDENT; - } - -<CHRLIT>[^ \t\n] { - yylval.lexeme = utf8_dup_from(yytext); - return IDENT; /* hack */ - } - -<STRLIT>\n { - yyerrprepf(lit("newline in string literal"), nao); - lineno++; - yylval.chr = yytext[0]; - return ERRTOK; - } - -<CHRLIT>\n { - yyerrprepf(lit("newline in character literal"), nao); - lineno++; - yylval.chr = yytext[0]; - return ERRTOK; - } - -<QSILIT>\n { - yyerrprepf(lit("newline in string quasiliteral"), nao); - lineno++; - yylval.chr = yytext[0]; - return ERRTOK; - } - -<QSILIT>@ { - yy_push_state(SPECIAL); - } + yylval.chr = num_esc(yytext); + return LITCHAR; +} + +<CHRLIT>{SYM} { + yylval.lexeme = utf8_dup_from(yytext); + return IDENT; +} + +<CHRLIT>[^ \t\n] { + yylval.lexeme = utf8_dup_from(yytext); + return IDENT; /* hack */ +} + +<STRLIT>\n { + yyerrprepf(lit("newline in string literal"), nao); + lineno++; + yylval.chr = 
yytext[0]; + return ERRTOK; +} + +<CHRLIT>\n { + yyerrprepf(lit("newline in character literal"), nao); + lineno++; + yylval.chr = yytext[0]; + return ERRTOK; +} + +<QSILIT>\n { + yyerrprepf(lit("newline in string quasiliteral"), nao); + lineno++; + yylval.chr = yytext[0]; + return ERRTOK; +} + +<QSILIT>@ { + yy_push_state(SPECIAL); +} <STRLIT,CHRLIT,QSILIT>{UANYN} { - wchar_t buf[8]; - utf8_from(buf, yytext); - yylval.chr = buf[0]; - return LITCHAR; - } + wchar_t buf[8]; + utf8_from(buf, yytext); + yylval.chr = buf[0]; + return LITCHAR; +} <STRLIT,CHRLIT,QSILIT>. { - yyerrprepf(lit("non-UTF-8 byte in literal: '\\x~02x'"), - num((unsigned char) yytext[0]), nao); - return ERRTOK; - } + yyerrprepf(lit("non-UTF-8 byte in literal: '\\x~02x'"), + num((unsigned char) yytext[0]), nao); + return ERRTOK; +} %% @@ -72,7 +72,7 @@ static val parsed_spec; %token <lineno> UNTIL COLL OUTPUT REPEAT REP SINGLE FIRST LAST EMPTY %token <lineno> MOD MODLAST DEFINE TRY CATCH FINALLY %token <lineno> ERRTOK /* deliberately not used in grammar */ -%token <lineno> HASH_BACKSLASH DOTDOT +%token <lineno> HASH_BACKSLASH HASH_SLASH DOTDOT %token <val> NUMBER METANUM @@ -85,11 +85,11 @@ static val parsed_spec; %type <val> clause_parts additional_parts gather_parts additional_gather_parts %type <val> output_clause define_clause try_clause catch_clauses_opt %type <val> line elems_opt elems clause_parts_h additional_parts_h -%type <val> text texts elem var var_op meta_expr vector +%type <val> text texts elem var var_op modifiers meta_expr vector %type <val> list exprs exprs_opt expr out_clauses out_clauses_opt out_clause %type <val> repeat_clause repeat_parts_opt o_line %type <val> o_elems_opt o_elems_opt2 o_elems o_elem o_var rep_elem rep_parts_opt -%type <val> regex regexpr regbranch +%type <val> regex lisp_regex regexpr regbranch %type <val> regterm regclass regclassterm regrange %type <val> strlit chrlit quasilit quasi_items quasi_item litchars %type <chr> regchar @@ -607,9 +607,10 @@ var : 
IDENT { $$ = list(var_s, intern(string_own($1), nil), nao); } | '{' IDENT '}' elem { $$ = list(var_s, intern(string_own($2), nil), $4, nao); } - | '{' IDENT exprs '}' { $$ = list(var_s, intern(string_own($2), nil), + | '{' IDENT modifiers '}' { $$ = list(var_s, intern(string_own($2), nil), nil, $3, nao); } - | '{' IDENT exprs '}' elem { $$ = list(var_s, intern(string_own($2), nil), + | '{' IDENT modifiers '}' elem + { $$ = list(var_s, intern(string_own($2), nil), $5, $3, nao); } | var_op IDENT { $$ = list(var_s, intern(string_own($2), nil), nil, $1, nao); } @@ -632,6 +633,16 @@ var : IDENT { $$ = list(var_s, intern(string_own($1), nil), yybadtoken(yychar, lit("variable spec")); } ; +var_op : '*' { $$ = list(t, nao); } + ; + +modifiers : NUMBER { $$ = cons($1, nil); } + | regex { $$ = cons(cons(regex_compile(rest($1)), + rest($1)), nil); + rlcp($$, $1); } + | list { $$ = cons($1, nil); } + ; + o_var : IDENT { $$ = list(var_s, intern(string_own($1), nil), nao); } | IDENT o_elem { $$ = list(var_s, intern(string_own($1), nil), @@ -644,9 +655,6 @@ o_var : IDENT { $$ = list(var_s, intern(string_own($1), nil), yybadtoken(yychar, lit("variable spec")); } ; -var_op : '*' { $$ = list(t, nao); } - ; - vector : '#' list { $$ = rlcp(vector_list($2), $2); } ; @@ -705,7 +713,7 @@ expr : IDENT { $$ = rl(intern(string_own($1), nil), | list { $$ = $1; } | vector { $$ = $1; } | meta_expr { $$ = $1; } - | regex { $$ = cons(regex_compile(rest($1)), + | lisp_regex { $$ = cons(regex_compile(rest($1)), rest($1)); rlcp($$, $1); } | chrlit { $$ = rl($1, num(lineno)); } @@ -721,6 +729,14 @@ regex : '/' regexpr '/' { $$ = cons(regex_s, $2); end_of_regex(); end_of_regex(); } ; +lisp_regex : HASH_SLASH regexpr '/' + { $$ = cons(regex_s, $2); end_of_regex(); + rl($$, num(lineno)); } + | HASH_SLASH error { $$ = nil; + yybadtoken(yychar, lit("regex")); + end_of_regex(); } + ; + regexpr : regbranch { $$ = if3(cdr($1), cons(compound_s, $1), car($1)); } @@ -1071,6 +1087,8 @@ void yybadtoken(int tok, 
val context) case METAPAR: problem = lit("@("); break; case METABKT: problem = lit("@["); break; case DOTDOT: problem = lit(".."); break; + case HASH_BACKSLASH: problem = lit("#\\"); break; + case HASH_SLASH: problem = lit("#/"); break; } if (problem != 0) @@ -4499,6 +4499,12 @@ according to a modified namespace lookup rule. More details are given in the documentation for the dwim operator. +.SS Regular Expressions + +In TXR Lisp, the / character can occur in symbol names, and the / token +is a symbol. Therefore the /regex/ syntax is absent, replaced with the +#/regex/ syntax. + .SS Lisp Operators When the first element of a compound expression is an operator symbol, |