From e68b978b40c53b7fef21056c2d1c1dff0b6bb729 Mon Sep 17 00:00:00 2001 From: Kaz Kylheku Date: Sun, 19 Apr 2015 10:34:28 -0700 Subject: Do not allow unrecognized escapes in regex. * parser.l (REGOP): New regex alias for matching all regex special characters. (grammar): Several rules for regex special characters merged together. New rule introduced to match a special character after a backslash, making it literal. The old rule which makes literal any character after a backslash now throws an error, unless version 105 compatibility is selected. * txr.1: Documented this behavior change. --- ChangeLog | 14 ++++++++++++++ parser.l | 25 ++++++++++++++----------- txr.1 | 23 ++++++++++++++++++----- 3 files changed, 46 insertions(+), 16 deletions(-) diff --git a/ChangeLog b/ChangeLog index 12d4babb..60472e7c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,17 @@ +2015-04-19 Kaz Kylheku + + Do not allow unrecognized escapes in regex. + + * parser.l (REGOP): New regex alias for matching all regex + special characters. + (grammar): Several rules for regex special characters merged + together. New rule introduced to match a special character + after a backslash, making it literal. The old rule which makes + literal any character after a backslash now throws an error, + unless version 105 compatibility is selected. + + * txr.1: Documented this behavior change. + 2015-04-19 Kaz Kylheku Improvement in error reporting. 
diff --git a/parser.l b/parser.l index 70c21d79..9036278f 100644 --- a/parser.l +++ b/parser.l @@ -49,6 +49,7 @@ #include "hash.h" #include "parser.h" #include "eval.h" +#include "txr.h" #include "y.tab.h" #define YY_INPUT(buf, result, max_size) \ @@ -198,6 +199,8 @@ WS [\t ]* HEX [0-9A-Fa-f] OCT [0-7] +REGOP [/()|.*?+~&%\[\]\-] + ASC [\x00-\x7f] ASCN [\x00-\t\v-\x7f] U [\x80-\xbf] @@ -741,24 +744,24 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} return ERRTOK; } -[.*?+~&%] { +{REGOP} { yylval->chr = yytext[0]; return yytext[0]; } -[\[\]\-] { - yylval->chr = yytext[0]; - return yytext[0]; -} - -[()|] { - yylval->chr = yytext[0]; - return yytext[0]; +[\\]{REGOP} { + yylval->chr = yytext[1]; + return REGCHAR; } [\\]. { - yylval->chr = yytext[1]; - return REGCHAR; + if (opt_compat && opt_compat <= 105) { + yylval->chr = yytext[1]; + return REGCHAR; + } + + yyerrprepf(yyg, lit("unrecognized escape in regex"), nao); + return ERRTOK; } [\\] { diff --git a/txr.1 b/txr.1 index 81ccb76c..6884e0ed 100644 --- a/txr.1 +++ b/txr.1 @@ -2052,14 +2052,13 @@ and the set matched by This operator is called intersection, logical and, or conjunction. .PP -Any escaped character which does not fall into the above escaping conventions, -or any unescaped character which is not a regular expression operator, denotes -one-position match of that character itself. +Any character which is not a regular expression operator, a backslash escape, +or the slash delimiter, denotes one-position match of that character itself. Any of the special characters, including the delimiting .codn / , -can be escaped with a backslash to suppress its meaning and denote the -character itself. +and the backslash, can be escaped with a backslash to suppress its +meaning and denote the character itself. 
Furthermore, all of the same escapes as are described in the section Special Characters in Text above are supported - the difference is that in regular @@ -2072,6 +2071,12 @@ rather than Octal and hex character escapes can be optionally terminated by a semicolon, which is useful if the following characters are octal or hex digits not intended to be part of the escape. + +Only the above escapes are supported. Unlike in some other regular expression +implementations, if a backslash appears before a character which isn't a regex +special character or one of the supported escape sequences, it is an error. +This wasn't true of historic versions of \*(TX. See the Compatibility section. + .IP "Precedence table, highest to lowest:" .TS tab(!); @@ -27924,6 +27929,14 @@ function automatically marks a stream open on a TTY devices as a real-time strea .code isatty function). +Also allows unrecognized backslash escape sequences in regular +expression syntax to simply denote the escaped character literally, +as was historically the case prior to \*(TX 106, so that +.code \ez +for instance denotes +.codn z . +As of \*(TX 106, these are diagnosed as errors. + .IP 102 Up to \*(TX 102, the .code get-string -- cgit v1.2.3