From 8ad6bb9114f0f8ce2d86e4290f8ac86ed108dc16 Mon Sep 17 00:00:00 2001
From: Kaz Kylheku <kaz@kylheku.com>
Date: Mon, 18 Jan 2010 13:42:09 -0800
Subject: Adjust semantics of non-greedy operator R%S, to avoid the broken case
 whereby R%S matches nothing at all when S is not empty but equivalent to
 empty, or more generally when S is nullable. A much nicer definition is ``the
 intersection of R* and the set of all strings that do not contain a non-empty
 substring that matches S, followed by S''.

---
 ChangeLog | 15 +++++++++++++++
 regex.c   | 12 +++++++++---
 txr.1     | 35 +++++++++++++++++------------------
 3 files changed, 41 insertions(+), 21 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index dcdd303b..783a1d49 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,18 @@
+2010-01-18  Kaz Kylheku  <kkylheku@gmail.com>
+
+	Adjust semantics of non-greedy operator R%S, to avoid the broken
+	case whereby R%S matches nothing at all when S is not empty
+	but equivalent to empty, or more generally when S is nullable.
+	A much nicer definition is ``the intersection of R* and
+	the set of all strings that do not contain a non-empty substring
+	that matches S, followed by S''.
+
+	* regex.c (dv_compile_regex): Adjust syntactic sugar for the %
+	operator, taking advantage of the reg_nullable function to keep the
+	simpler syntactic sugar for cases where S is not nullable.
+
+	* txr.1: Document accordingly.
+
 2010-01-17  Kaz Kylheku  <kkylheku@gmail.com>
 
 	* parser.y (regterm, regclass): Relocate handling
diff --git a/regex.c b/regex.c
index 5107c5ec..174fb0d9 100644
--- a/regex.c
+++ b/regex.c
@@ -1126,6 +1126,8 @@ static struct cobj_ops regex_obj_ops = {
   cobj_hash_op
 };
 
+static val reg_nullable(val);
+
 /*
  * ``Compile'' raw regular expression to a form that is
  * easier to simulate by the derivative method.  
@@ -1165,12 +1167,18 @@ static val dv_compile_regex(val exp)
         return zplus;
       } else {
         val any = list(zeroplus_s, wild_s, nao);
+        val notempty = list(oneplus_s, wild_s, nao);
 
         return list(compound_s, 
                     list(and_s, 
                          zplus,
                          list(compl_s, 
-                              list(compound_s, any, xsecond, any, nao), 
+                              list(compound_s, 
+                                   any, 
+                                   if3(reg_nullable(xsecond),
+                                       list(and_s, xsecond, notempty, nao),
+                                       xsecond),
+                                   any, nao), 
                               nao), 
                          nao),
                     xsecond, nao);
@@ -1181,8 +1189,6 @@ static val dv_compile_regex(val exp)
   }
 }
 
-static val reg_nullable(val);
-
 /*
  * Helper to reg_nullable for recursing over
  * contents of a compound expression.
diff --git a/txr.1 b/txr.1
index 75e01403..9cfab323 100644
--- a/txr.1
+++ b/txr.1
@@ -689,9 +689,9 @@ match R1 zero or more times, then match R2. If this match can occur in
 more than one way, then it occurs such that R1 is matched the fewest
 number of times; which is opposite from the behavior of R1*R2.
 In other words, repetitions of R1 terminate at the earliest
-point in the text where a match for R2 occurs.  Favoring shorter matches, % is
-termed a non-greedy operator.  Note that R2 may be an empty regular expression,
-which is a special case that is equivalent to R1*.
+point in the text where a non-empty match for R2 occurs.  Favoring shorter
+matches, % is termed a non-greedy operator.  If R2 matches the empty
+string, then R1%R2 is equivalent to R1*.
 .IP ~R
 match the complement of the following expression R; i.e. match
 those texts that R does not match. This operator is called complement,
@@ -3028,15 +3028,15 @@ which is in turn based on intersection and complement. The uninteresting case
 context, the non-greedy operator matches R as far as possible, possibly to the
 end of the input, exactly like the greedy Kleene.  The interesting case (R%T)
 is defined as a "syntactic sugar" for the equivalent expression
-((R*)&(~.*T.*))T   which means: match the longest string which is matched by
-R*, but which does not contain T; then, match T.  This is a useful and
-expressive notation. With it, we can write the regular expression for matching
-C language comments simply like this: [/][*].%[*][/] (match the opening
-sequence /*, then match a sequence of zero or more characters non-greedily, and
-then the closing sequence */.  With the non-greedy operator, we don't have to
-think about the interior of the comment as set of strings which excludes */.
-Though the non-greedy operator appears expressive, its apparent
-simplicity may be deceptive.  It looks as if it works "magically"
+((R*)&(~.*(T&.+).*))T   which means: match the longest string which is matched
+by R*, but which does not contain a non-empty match for T; then, match T.  This
+is a useful and expressive notation. With it, we can write the regular
+expression for matching C language comments simply like this: [/][*].%[*][/]
+(match the opening sequence /*, then match a sequence of zero or more
+characters non-greedily, and then the closing sequence */.  With the non-greedy
+operator, we don't have to think about the interior of the comment as set of
+strings which excludes */.  Though the non-greedy operator appears expressive,
+its apparent simplicity may be deceptive.  It looks as if it works "magically"
 by itself; "somehow" this .% "knows" only to consume enough characters so that
 it doesn't swallow an occurrence of the trailing context. Care must be taken
 that the trailing context passed to the operator really is the correct text
@@ -3045,13 +3045,12 @@ expression .%abc.  If you intend the trailing context to be merely a, you must
 be careful to write (.%a)bc. Otherwise the trailing context is abc, and this
 means that the .% match will consume the longest string that does not contain
 "abc", when in fact what was intended was to consume the longest string that
-does not contain a. The change in behavior of the % operator upon modifying the 
+does not contain a. The change in behavior of the % operator upon modifying the
 trailing context is not as intuitive as that of the * operator, because the
-trailing context is deeply involved in its logic.  For
-single-character trailing contexts, it may be a good idea to use a complemented
-character class instead. That is to say, rather than (.%a)bc, consider
-[^a]*bc. The set of strings which don't contain the character a is adequately
-expressed by [^a]*.
+trailing context is deeply involved in its logic.  For single-character
+trailing contexts, it may be a good idea to use a complemented character class
+instead. That is to say, rather than (.%a)bc, consider [^a]*bc. The set of
+strings which don't contain the character a is adequately expressed by [^a]*.
 
 .SH NOTES ON FALSE
 
-- 
cgit v1.2.3