From 6742e3e96b3387bbea484c7278305cab1bd5397e Mon Sep 17 00:00:00 2001 From: Kaz Kylheku Date: Sat, 15 Aug 2015 08:41:30 -0700 Subject: Allow slashes in regex passed to regex-parse. * parser.l (SREGEX): New start state, for stand-alone regex parsing. (grammar): All REGEX state rules are active in the SREGEX state also. The rule for the / character returns a REGCHAR if in the SREGEX state, so it is treated as an ordinary character. * txr.1: Updated regex-parse documentation about the treatment of the slash. Also added notes about double escaping when a string literal is passed to regex-parse. --- parser.l | 31 +++++++++++++++---------------- txr.1 | 35 ++++++++++++++++++++++++++++++++--- 2 files changed, 47 insertions(+), 19 deletions(-) diff --git a/parser.l b/parser.l index 66a51cfc..af838a63 100644 --- a/parser.l +++ b/parser.l @@ -218,7 +218,7 @@ UANY {ASC}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} UANYN {ASCN}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} -%x SPECIAL BRACED NESTED REGEX STRLIT CHRLIT QSILIT QSPECIAL WLIT QWLIT +%x SPECIAL BRACED NESTED REGEX SREGEX STRLIT CHRLIT QSILIT QSPECIAL WLIT QWLIT %% @@ -765,48 +765,47 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} return ERRTOK; } -<REGEX>[/] { +<REGEX,SREGEX>[/] { yylval->chr = '/'; - return '/'; + return (YYSTATE == SREGEX) ? REGCHAR : '/'; } - -<REGEX>[\\][abtnvfre\\ ] { +<REGEX,SREGEX>[\\][abtnvfre\\ ] { yylval->chr = char_esc(yytext[1]); return REGCHAR; } -<REGEX>[\\](x{HEX}+|{OCT}+);? { +<REGEX,SREGEX>[\\](x{HEX}+|{OCT}+);? { yylval->chr = num_esc(yyg, yytext + 1); return REGCHAR; } -<REGEX>[\\][sSdDwW] { +<REGEX,SREGEX>[\\][sSdDwW] { yylval->chr = yytext[1]; return REGTOKEN; } -<REGEX>{WS}[\\]\n{WS} { +<REGEX,SREGEX>{WS}[\\]\n{WS} { yyextra->lineno++; } -<REGEX>\n { +<REGEX,SREGEX>\n { yyextra->lineno++; yyerrprepf(yyg, lit("newline in regex"), nao); return ERRTOK; } -<REGEX>{REGOP} { +<REGEX,SREGEX>{REGOP} { yylval->chr = yytext[0]; return yytext[0]; } -<REGEX>[\\]{REGOP} { +<REGEX,SREGEX>[\\]{REGOP} { yylval->chr = yytext[1]; return REGCHAR; } -<REGEX>[\\]. +<REGEX,SREGEX>[\\]. 
{ if (opt_compat && opt_compat <= 105) { yylval->chr = yytext[1]; return REGCHAR; @@ -816,12 +815,12 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} return ERRTOK; } -<REGEX>[\\] { +<REGEX,SREGEX>[\\] { yyerrprepf(yyg, lit("dangling backslash in regex"), nao); return ERRTOK; } -<REGEX>{UANYN} { +<REGEX,SREGEX>{UANYN} { wchar_t buf[8]; utf8_from(buf, yytext); yylval->chr = buf[0]; @@ -986,7 +985,7 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} void end_of_regex(scanner_t *yyg) { - if (YYSTATE != REGEX) + if (YYSTATE != REGEX && YYSTATE != SREGEX) internal_error("end_of_regex called in wrong scanner state"); yy_pop_state(yyg); @@ -1050,7 +1049,7 @@ void prime_scanner(scanner_t *yyg, enum prime_parser prim) yy_push_state(NESTED, yyg); break; case prime_regex: - yy_push_state(REGEX, yyg); + yy_push_state(SREGEX, yyg); break; } } diff --git a/txr.1 b/txr.1 index 154a5277..e7614a6b 100644 --- a/txr.1 +++ b/txr.1 @@ -24836,9 +24836,9 @@ stream. .desc The .code regex-parse -function parses a character string which contains a regular expression -(without any surrounding / characters) and turns it into a Lisp data structure -(the abstract syntax tree representation of the regular expression). +function parses a character string which contains a regular expression and +turns it into a Lisp data structure (the abstract syntax tree representation of +the regular expression). The regular expression syntax .code #/RE/ @@ -24871,6 +24871,35 @@ value, that structure is then something which is suitable as input to .codn regex-compile . +There is a small difference in the syntax accepted by +.code regex-parse +and the syntax of regular expression literals. Any +.code / +(slash) characters occurring in any position within +.meta string +are treated as ordinary characters, not as regular expression delimiters. +The call +.code (regex-parse "/a/") +produces a regular expression which matches three characters: a slash, +followed by the letter "a", followed by another slash. Note that the +slashes are not escaped. 
+ +Note: if a +.code regex-parse +call is written using a string literal as the +.meta string +argument, then any backslashes which are to be processed +by the regular expression must be doubled up; otherwise they belong +to the string literal: + +.cblk + (regex-parse "\e*") ;; error, invalid string literal escape + (regex-parse "\e\e*") ;; correct: the regex \e* matches a literal * +.cble + +The double backslash in the string literal produces a single backslash +in the resulting string object that is processed by +.codn regex-parse . + .SS* Hashing Library .coNP Functions @, make-hash and @ hash .synb -- cgit v1.2.3