7 files changed, 372 insertions, 51 deletions
diff --git a/ChangeLog b/ChangeLog
index 328f6452..3db0d797 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,53 @@
+2010-01-05  Kaz Kylheku  <kkylheku@gmail.com>
+
+	Implemented the regular expression ~ and & operators.
+	This turns out to be easy to do in NFA land.
+	The complement of an NFA has exactly the same number
+	and configuration of states and transitions, except
+	that the states have an inverted meaning; and furthermore,
+	failed character transitions are routed to an extra
+	state (which in this impelmentation is permanently
+	allocated and shared by all regexes). The regex &
+	is implemented trivially using DeMorgan's.
+
+	Also, bugfix: regular expressions like A|B|C are allowed
+	now by the syntax, rather than constituting syntax error.
+	Previously, this would have been entered as (A|B)|C.
+
+	* lib.c (comp_s, and_s): New symbol globals.
+	(obj_init): New symbols interned.
+
+	* lib.h (comp_s, and_s): Declared.
+
+	* parser.l (grammar): Provide new ~ and & tokens in REGEX state.
+
+	* parser.y (regexpr): Constituents of '|' are regexprs,
+	rather than regbranches (see bugfix note above).
+	The '&' operator is added.
+	(regterm): The '~' operator is added.
+
+	* regex.c (struct any_char_set, struct small_char_set, struct
+	displaced_char_set): refs field added.
+	(nfa_kind_t): New enum members nfa_super_accept,
+	nfa_reject, nfa_compl_empty, nfa_compl_wild,
+	nfa_compl_single, nfa_compl_set.
+	(nfa_super_accept_state): New static structure.
+	(nfa_is_accept_state): New inline function.
+	(char_set_create): Initialize reference count to 1.
+	(char_set_destroy): Decrement refcount, free if zero.
+	(char_set_clone): New function.
+	(nfa_state_empty_convert, nfa_state_merge): Handle nfa_reject state,
+	the complement of nfa_accept.
+	(nfa_compl_state, nfa_compl): New functions.
+	(nfa_compile_regex): Handle new operators.
+	(nfa_all_states, nfa_closure): Handle new state types.
+	(nfa_move): Handle new types according to special rules:
+	the new complemented states that have character transitions have a next
+	move to the super-accept state if they do not match the input
+	character.
+	
+	* txr.1: Documented new regex operators.
+
 2009-12-09  Kaz Kylheku  <kkylheku@gmail.com>
 
 	* parser.l (YYINPUT): Fix signed/unsigned comparison.
diff --git a/lib.c b/lib.c
index 8094f678..b41a7cc5 100644
--- a/lib.c
+++ b/lib.c
@@ -52,7 +52,7 @@ val system_package, keyword_package, user_package;
 val null, t, cons_s, str_s, chr_s, num_s, sym_s, pkg_s, fun_s, vec_s;
 val stream_s, hash_s, lcons_s, lstr_s, cobj_s;
 val var_s, regex_s, set_s, cset_s, wild_s, oneplus_s;
-val zeroplus_s, optional_s, compound_s, or_s, quasi_s;
+val zeroplus_s, optional_s, compl_s, compound_s, or_s, and_s, quasi_s;
 val skip_s, trailer_s, block_s, next_s, freeform_s, fail_s, accept_s;
 val all_s, some_s, none_s, maybe_s, cases_s, collect_s, until_s, coll_s;
 val define_s, output_s, single_s, first_s, last_s, empty_s;
@@ -1891,8 +1891,10 @@ static void obj_init(void)
   oneplus_s = intern(lit("1+"), user_package);
   zeroplus_s = intern(lit("0+"), user_package);
   optional_s = intern(lit("?"), user_package);
+  compl_s = intern(lit("~"), user_package);
   compound_s = intern(lit("compound"), user_package);
   or_s = intern(lit("or"), user_package);
+  and_s = intern(lit("and"), user_package);
   quasi_s = intern(lit("quasi"), system_package);
   skip_s = intern(lit("skip"), user_package);
   trailer_s = intern(lit("trailer"), user_package);
diff --git a/lib.h b/lib.h
index 6188fc83..b50c1075 100644
--- a/lib.h
+++ b/lib.h
@@ -208,7 +208,7 @@ extern val keyword_package, system_package, user_package;
 extern val null, t, cons_s, str_s, chr_s, num_s, sym_s, pkg_s, fun_s, vec_s;
 extern val stream_s, hash_s, lcons_s, lstr_s, cobj_s;
 extern val var_s, regex_s, set_s, cset_s, wild_s, oneplus_s;
-extern val zeroplus_s, optional_s, compound_s, or_s, quasi_s;
+extern val zeroplus_s, optional_s, compl_s, compound_s, or_s, and_s, quasi_s;
 extern val skip_s, trailer_s, block_s, next_s, freeform_s, fail_s, accept_s;
 extern val all_s, some_s, none_s, maybe_s, cases_s, collect_s, until_s, coll_s;
 extern val define_s, output_s, single_s, first_s, last_s, empty_s;
diff --git a/parser.l b/parser.l
index dcb45ea9..e0c0f2d5 100644
--- a/parser.l
+++ b/parser.l
@@ -444,10 +444,10 @@ UONLY   {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
                   yyerror("newline in regex");
                 }
 
-<REGEX>[.*?+^]  {
-                  yylval.chr = yytext[0];
-                  return yytext[0];
-                }
+<REGEX>[.*?+^~&]        {
+                          yylval.chr = yytext[0];
+                          return yytext[0];
+                        }
 
 
 <REGEX>[\[\]\-] {
diff --git a/parser.y b/parser.y
index abac6c00..51386217 100644
--- a/parser.y
+++ b/parser.y
@@ -79,7 +79,8 @@ static val parsed_spec;
 %nonassoc '{' '}' '[' ']' '(' ')'
 %right IDENT TEXT NUMBER
 %left '|' '/'
-%right '*' '?' '+'
+%left '&'
+%right '~' '*' '?' '+'
 %right '^' '.' '\\' REGCHAR LITCHAR
 
 %%
@@ -454,7 +455,9 @@ regex : '/' regexpr '/'         { $$ = $2; }
       ;
 
 regexpr : regbranch                     { $$ = $1; }
-        | regbranch '|' regbranch       { $$ = list(list(or_s, $1,
+        | regexpr '|' regexpr           { $$ = list(list(or_s, $1,
+                                                         $3, nao), nao); }
+        | regexpr '&' regexpr           { $$ = list(list(and_s, $1,
                                                          $3, nao), nao); }
         ;
 
@@ -471,6 +474,7 @@ regterm : '[' regclass ']'      { $$ = cons(set_s, $2); }
         | regterm '*'           { $$ = list(zeroplus_s, $1, nao); }
         | regterm '+'           { $$ = list(oneplus_s, $1, nao); }
         | regterm '?'           { $$ = list(optional_s, $1, nao); }
+        | '~' regterm           { $$ = list(compl_s, $2, nao); }
         | REGCHAR               { $$ = chr($1); }
         | '(' regexpr ')'       { $$ = cons(compound_s, $2); }
         | '(' error             { $$ = nil;
diff --git a/regex.c b/regex.c
index ec1e656c..5cbe11a0 100644
--- a/regex.c
+++ b/regex.c
@@ -71,17 +71,20 @@ typedef cset_L2_t *cset_L3_t[17];
 struct any_char_set {
   unsigned type : 3;
   unsigned comp : 1;
+  unsigned refs : 12;
 };
 
 struct small_char_set {
   unsigned type : 3;
   unsigned comp : 1;
+  unsigned refs : 12;
   cset_L0_t bitcell;
 };
 
 struct displaced_char_set {
   unsigned type : 3;
   unsigned comp : 1;
+  unsigned refs : 12;
   cset_L0_t bitcell;
   wchar_t base;
 };
@@ -90,12 +93,14 @@ struct displaced_char_set {
 struct large_char_set {
   unsigned type : 3;
   unsigned comp : 1;
+  unsigned refs : 12;
   cset_L2_t dir;
 };
 
 struct xlarge_char_set {
   unsigned type : 3;
   unsigned comp : 1;
+  unsigned refs : 12;
   cset_L3_t dir;
 };
 
@@ -109,15 +114,38 @@ typedef union char_set {
 
 #define NFA_SET_SIZE 512
 
+/*
+ * NFA node types, where each node represents a state, plus up to
+ * either two epsilon-transitions, or a single transition for a character, *
+ * character class or wild (match any character). The types starting
+ * with nfa_reject are involved in special representations needed
+ * for complement NFA's; these only occur if a regex contains the
+ * complement operator.
+ */
+
 typedef enum {
-  nfa_accept, nfa_empty, nfa_wild, nfa_single, nfa_set
+  nfa_accept, nfa_empty, nfa_wild,
+  nfa_single, nfa_set, nfa_super_accept,
+  nfa_reject, nfa_compl_empty, nfa_compl_wild,
+  nfa_compl_single, nfa_compl_set
 } nfa_kind_t;
 
+/*
+ * Node used for types nfa_accept, nfa_reject, nfa_super_accept.
+ *
+ * NOTE: nfa_super_accept is globally instantiated; there is
+ * only one node of this type in the system. Transitions to it
+ * are implicit. Its visit flag is never used, so there is no
+ * reentrancy issue.
+ */
 struct nfa_state_accept {
   nfa_kind_t kind;
   unsigned visited;
 };
 
+/*
+ * Node used for nfa_empty, nfa_compl_empty.
+ */
 struct nfa_state_empty {
   nfa_kind_t kind;
   unsigned visited;
@@ -125,13 +153,20 @@ struct nfa_state_empty {
   nfa_state_t *trans1;
 };
 
+/*
+ * Node used for nfa_single, nfa_wild,
+ * nfa_compl_single, nfa_compl_wild.
+ */
 struct nfa_state_single {
   nfa_kind_t kind;
   unsigned visited;
   nfa_state_t *trans;
-  wchar_t ch;
+  wchar_t ch;        /* set to 0 if nfa_state_wild or nfa_compl_wild. */
 };
 
+/*
+ * Node used for nfa_set, nfa_compl_set.
+ */
 struct nfa_state_set {
   nfa_kind_t kind;
   unsigned visited;
@@ -155,6 +190,23 @@ struct nfa_machine {
   nfa_t nfa;
 };
 
+static nfa_state_t nfa_super_accept_state = { { nfa_super_accept } };
+
+INLINE int nfa_is_accept_state(nfa_state_t *state)
+{
+  switch (state->a.kind) {
+  case nfa_accept:
+  case nfa_super_accept:
+  case nfa_compl_empty:
+  case nfa_compl_wild:
+  case nfa_compl_single:
+  case nfa_compl_set:
+    return 1;
+  default:
+    return 0;
+  }
+}
+
 static int L0_full(cset_L0_t *L0)
 {
   int i;
@@ -423,6 +475,7 @@ static char_set_t *char_set_create(chset_type_t type, wchar_t base)
   char_set_t *cs = (char_set_t *) chk_malloc(sizeof *cs);
   *cs = blank;
   cs->any.type = type;
+  cs->any.refs = 1;
 
   if (type == CHSET_DISPLACED)
     cs->d.base = base;
@@ -432,22 +485,30 @@ static char_set_t *char_set_create(chset_type_t type, wchar_t base)
 
 static void char_set_destroy(char_set_t *set)
 {
-  switch (set->any.type) {
-  case CHSET_DISPLACED:
-  case CHSET_SMALL:
-    free(set);
-    break;
-  case CHSET_LARGE:
-    L2_free(&set->l.dir);
-    free(set);
-    break;
-  case CHSET_XLARGE:
-    L3_free(&set->xl.dir);
-    free(set);
-    break;
+  if (--set->any.refs == 0) {
+    switch (set->any.type) {
+    case CHSET_DISPLACED:
+    case CHSET_SMALL:
+      free(set);
+      break;
+    case CHSET_LARGE:
+      L2_free(&set->l.dir);
+      free(set);
+      break;
+    case CHSET_XLARGE:
+      L3_free(&set->xl.dir);
+      free(set);
+      break;
+    }
   }
 }
 
+static char_set_t *char_set_clone(char_set_t *set)
+{
+  set->any.refs++;
+  return set;
+}
+
 static void char_set_compl(char_set_t *set)
 {
   set->any.comp = 1;
@@ -602,7 +663,7 @@ static nfa_state_t *nfa_state_set(nfa_state_t *t, char_set_t *cs)
 static void nfa_state_empty_convert(nfa_state_t *acc, nfa_state_t *t0,
                                     nfa_state_t *t1)
 {
-  assert (acc->a.kind == nfa_accept);
+  assert (acc->a.kind == nfa_accept || acc->a.kind == nfa_reject);
   acc->e.kind = nfa_empty;
   acc->e.trans0 = t0;
   acc->e.trans1 = t1;
@@ -624,7 +685,7 @@ static void nfa_state_empty_convert(nfa_state_t *acc, nfa_state_t *t0,
  */
 static void nfa_state_merge(nfa_state_t *acc, nfa_state_t *st)
 {
-  assert (acc->a.kind == nfa_accept);
+  assert (acc->a.kind == nfa_accept || acc->a.kind == nfa_reject);
   *acc = *st;
 }
 
@@ -723,6 +784,136 @@ static nfa_t nfa_compile_set(val args, int comp)
   }
 }
 
+static nfa_state_t *nfa_compl_state(nfa_state_t *in)
+{
+  nfa_state_t *st = (nfa_state_t *) chk_malloc(sizeof *st);
+
+  st->a.visited = 0;
+
+  switch (in->a.kind) {
+  case nfa_accept:
+    st->a.kind = nfa_reject;
+    break;
+  case nfa_empty:
+    st->a.kind = nfa_compl_empty;
+    st->e.trans0 = 0;
+    st->e.trans1 = 0;
+    break;
+  case nfa_wild:
+    st->a.kind = nfa_compl_wild;
+    st->o.trans = 0;
+    st->o.ch = 0;
+    break;
+  case nfa_single:
+    st->a.kind = nfa_compl_single;
+    st->o.trans = 0;
+    st->o.ch = in->o.ch;
+    break;
+  case nfa_set:
+    st->a.kind = nfa_compl_set;
+    st->s.trans = 0;
+    st->s.set = char_set_clone(in->s.set);
+    break;
+  case nfa_super_accept:
+    internal_error("NFA super accept state explicitly reachable in graph");
+  case nfa_reject:
+    st->a.kind = nfa_accept;
+    break;
+  case nfa_compl_empty:
+    st->a.kind = nfa_empty;
+    st->e.trans0 = 0;
+    st->e.trans1 = 0;
+    break;
+  case nfa_compl_wild:
+    st->a.kind = nfa_wild;
+    st->o.trans = 0;
+    st->o.ch = 0;
+    break;
+  case nfa_compl_single:
+    st->a.kind = nfa_single;
+    st->o.trans = 0;
+    st->o.ch = in->o.ch;
+    break;
+  case nfa_compl_set:
+    st->a.kind = nfa_set;
+    st->s.trans = 0;
+    st->s.set = char_set_clone(in->s.set);
+    break;
+  }
+
+  return st;
+}
+
+static nfa_t nfa_compl(nfa_t in)
+{
+  nfa_state_t **stack = (nfa_state_t **) chk_malloc(NFA_SET_SIZE*sizeof *stack);
+  nfa_state_t **temp = (nfa_state_t **) chk_malloc(NFA_SET_SIZE*sizeof *temp);
+  unsigned visited = ++in.start->a.visited;
+  int i, num = 2;
+
+  stack[0] = in.start;
+  stack[1] = in.accept;
+
+  temp[0] = nfa_compl_state(stack[0]);
+  temp[1] = nfa_compl_state(stack[1]);
+
+  for (i = 0; i < num; i++) {
+    nfa_state_t *s = stack[i];
+    nfa_state_t *t = temp[i];
+
+    if (num >= NFA_SET_SIZE)
+      internal_error("NFA set size exceeded");
+
+    switch (s->a.kind) {
+    case nfa_accept:
+    case nfa_reject:
+      break;
+    case nfa_empty:
+    case nfa_compl_empty:
+      {
+        nfa_state_t *e0 = s->e.trans0;
+        nfa_state_t *e1 = s->e.trans1;
+
+        if (e0 && e0->a.visited != visited) {
+          e0->a.visited = visited;
+          temp[num] = t->e.trans0 = nfa_compl_state(e0);
+          stack[num++] = e0;
+        }
+        if (e1 && e1->a.visited != visited) {
+          e1->a.visited = visited;
+          temp[num] = t->e.trans1 = nfa_compl_state(e1);
+          stack[num++] = e1;
+        }
+      }
+      break;
+    case nfa_wild:
+    case nfa_single:
+    case nfa_set:
+    case nfa_compl_wild:
+    case nfa_compl_single:
+    case nfa_compl_set:
+      if (s->o.trans->a.visited != visited) {
+        s->o.trans->a.visited = visited;
+        temp[num] = t->o.trans = nfa_compl_state(s->o.trans);
+        stack[num++] = s->o.trans;
+      }
+      break;
+    case nfa_super_accept:
+      internal_error("NFA super accept state explicitly reachable in graph");
+    }
+  }
+
+  if (num > NFA_SET_SIZE)
+    internal_error("NFA set size exceeded");
+
+  {
+    nfa_t ret = { temp[0], temp[1] };
+    free(temp);
+    free(stack);
+    return ret;
+  }
+}
+
 /*
  * Input is the items from a regex form,
  * not including the regex symbol.
@@ -785,6 +976,9 @@ nfa_t nfa_compile_regex(val items)
         nfa_t nfa_args = nfa_compile_regex(args);
         nfa_state_t *s = nfa_state_empty(nfa_args.start, nfa_args.accept);
         nfa = nfa_make(s, nfa_args.accept);
+      } else if (sym == compl_s) {
+        nfa_t nfa_args = nfa_compile_regex(args);
+        nfa = nfa_compl(nfa_args);
       } else if (sym == or_s) {
         /* Simple: make a new start and acceptance state, which form
            the ends of a spindle that goes through two branches. */
@@ -798,6 +992,19 @@ nfa_t nfa_compile_regex(val items)
         nfa_state_empty_convert(nfa_first.accept, acc, 0);
         nfa_state_empty_convert(nfa_second.accept, acc, 0);
         nfa = nfa_make(s, acc);
+      } else if (sym == and_s) {
+        /* We compile regex and using the De Morgan's equality:
+           R1&R2 == ~(~R1|~R2).
+           We could do this by transforming the source code,
+           and recursing into compiler, but instead, the approach taken
+           is NFA manipulation closely based on the or_s case above. */
+        nfa_t nfa_first = nfa_compl(nfa_compile_regex(first(args)));
+        nfa_t nfa_second = nfa_compl(nfa_compile_regex(second(args)));
+        nfa_state_t *acc = nfa_state_accept();
+        nfa_state_t *s = nfa_state_empty(nfa_first.start, nfa_second.start);
+        nfa_state_empty_convert(nfa_first.accept, acc, 0);
+        nfa_state_empty_convert(nfa_second.accept, acc, 0);
+        nfa = nfa_compl(nfa_make(s, acc));
       } else {
         assert (0 && "internal error: bad operator in regex");
       }
@@ -817,45 +1024,52 @@ nfa_t nfa_compile_regex(val items)
   }
 }
 
-static int nfa_all_states(nfa_state_t **inout, int num, unsigned visited)
+static int nfa_all_states(nfa_state_t **stack, int num, unsigned visited)
 {
   int i;
 
   for (i = 0; i < num; i++)
-    inout[i]->a.visited = visited;
+    stack[i]->a.visited = visited;
 
   for (i = 0; i < num; i++) {
-    nfa_state_t *s = inout[i];
+    nfa_state_t *s = stack[i];
 
     if (num >= NFA_SET_SIZE)
       internal_error("NFA set size exceeded");
 
     switch (s->a.kind) {
     case nfa_accept:
+    case nfa_reject:
       break;
     case nfa_empty:
+    case nfa_compl_empty:
       {
         nfa_state_t *e0 = s->e.trans0;
         nfa_state_t *e1 = s->e.trans1;
 
         if (e0 && e0->a.visited != visited) {
           e0->a.visited = visited;
-          inout[num++] = e0;
+          stack[num++] = e0;
         }
         if (e1 && e1->a.visited != visited) {
           e1->a.visited = visited;
-          inout[num++] = e1;
+          stack[num++] = e1;
         }
       }
       break;
     case nfa_wild:
     case nfa_single:
     case nfa_set:
+    case nfa_compl_wild:
+    case nfa_compl_single:
+    case nfa_compl_set:
       if (s->o.trans->a.visited != visited) {
         s->o.trans->a.visited = visited;
-        inout[num++] = s->o.trans;
+        stack[num++] = s->o.trans;
       }
       break;
+    case nfa_super_accept:
+      internal_error("NFA super accept state explicitly reachable in graph");
     }
   }
 
@@ -873,7 +1087,7 @@ void nfa_free(nfa_t nfa)
   all[0] = nfa.start;
   all[1] = nfa.accept;
 
-  nstates = nfa_all_states(all, 2, nfa.start->a.visited);
+  nstates = nfa_all_states(all, 2, nfa.start->a.visited + 1);
 
   for (i = 0; i < nstates; i++)
     nfa_state_free(all[i]);
@@ -906,10 +1120,15 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
   for (i = 0; i < nin; i++) {
     if (stackp >= NFA_SET_SIZE)
       internal_error("NFA set size exceeded");
-    in[i]->a.visited = visited;
+    /* We do not mark the super accept state visited,
+       and we will never visit it by following e-transitions,
+       because all transitions into the super-accept state
+       are character-consuming. */
+    if (in[i] != &nfa_super_accept_state)
+       in[i]->a.visited = visited;
     stack[stackp++] = in[i];
     out[nout++] = in[i];
-    if (in[i]->a.kind == nfa_accept)
+    if (nfa_is_accept_state(in[i]))
       *accept = 1;
   }
 
@@ -919,10 +1138,13 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
     if (nout >= NFA_SET_SIZE)
       internal_error("NFA set size exceeded");
 
-    /* Only states of type nfa_empty are interesting.
-       Each such state at most two epsilon transitions. */
-
-    if (top->a.kind == nfa_empty) {
+    /* Only states of type nfa_empty or its complement are interesting.
+       Each such state at most two epsilon transitions.
+       Note above comment that epsilon transitions never lead
+       to the special super accept state; i.e. e0 and e1
+       are never the super accept state and so we can write to their
+       their visited field. */
+    if (top->a.kind == nfa_empty || top->a.kind == nfa_compl_empty) {
       nfa_state_t *e0 = top->e.trans0;
       nfa_state_t *e1 = top->e.trans1;
 
@@ -930,7 +1152,7 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
         e0->a.visited = visited;
         stack[stackp++] = e0;
         out[nout++] = e0;
-        if (e0->a.kind == nfa_accept)
+        if (nfa_is_accept_state(e0))
           *accept = 1;
       }
 
@@ -938,7 +1160,7 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
         e1->a.visited = visited;
         stack[stackp++] = e1;
         out[nout++] = e1;
-        if (e1->a.kind == nfa_accept)
+        if (nfa_is_accept_state(e1))
           *accept = 1;
       }
     }
@@ -958,22 +1180,41 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
 static int nfa_move(nfa_state_t **in, int nin, nfa_state_t **out, wchar_t ch)
 {
   int i, nmove;
+  int super_added = 0;
 
   for (nmove = 0, i = 0; i < nin; i++) {
     nfa_state_t *s = in[i];
 
     switch (s->a.kind) {
     case nfa_wild:
+    case nfa_compl_wild:
       /* Unconditional match; don't have to look at ch. */
       break;
     case nfa_single:
       if (s->o.ch == ch) /* Character match. */
         break;
       continue; /* no match */
+    case nfa_compl_single:
+      if (s->o.ch == ch) /* Character match. */
+        break;
+      goto add_super; /* no match means we can move to super accept state */
     case nfa_set:
       if (char_set_contains(s->s.set, ch)) /* Set match. */
         break;
       continue; /* no match */
+    case nfa_compl_set:
+      if (char_set_contains(s->s.set, ch)) /* Set match. */
+        break;
+      goto add_super;
+    case nfa_super_accept:
+      /* super always transitions to itself, for any character. */
+    add_super:
+      if (!super_added) {
+        super_added = 1;
+        if (nmove >= NFA_SET_SIZE)
+          internal_error("NFA set size exceeded");
+        out[nmove++] = &nfa_super_accept_state;
+      }
     default:
       /* Epsilon-transition and acceptance states have no character moves. */
       continue;
diff --git a/txr.1 b/txr.1
index 03434d0e..25f8ac51 100644
--- a/txr.1
+++ b/txr.1
@@ -643,17 +643,21 @@ If RE is a regular expression, then so is (RE).
 The contents of parentheses denote one regular expression unit, so that for
 instance in (RE)*, the * operator applies to the entire parenthesized group.
 .IP (RE)?
-optionally matches the preceding regular expression (RE).
+optionally match the preceding regular expression (RE).
 .IP (RE)+
-matches the preceding expression one or more times.
+match the preceding expression one or more times.
 .IP (RE)*
-matches the preceding expression zero or more times.
+match the preceding expression zero or more times.
+.IP ~(RE)
+match the complement of the preceding expression; i.e. match
+those texts that (RE) does not match.
 .IP (RE1)(RE2)
 Two consecutive regular expressions denote catenation:
 the left expression must match, and then the right.
-
 .IP (RE1)|(RE2)
-matches either the expression RE1 or RE2.
+match either the expression RE1 or RE2.
+.IP (RE1)&(RE2)
+match both the expression RE1 and RE2 simultaneously.
 
 .PP
 Any of the special characters, including the delimiting /,  can be escaped with
@@ -674,14 +678,34 @@ The postfix operators ?, + and * have the second highest precedence, and
 associate left to right, so that in A+?*, the * applies to A+?, and the ?
 applies to A+.
 
-Catenation is on the next lower precedence rung, so that AB? means "match A,
-and then optionally B" not "match A and B, as one optional unit".  The latter
-must be written (AB)?  using parentheses to override precedence.
+The unary complement operator has the next lower precedence, so
+that ~A* means the ~(A*): "match the all text that is not matched by zero
+or more repetitions of A", not "match zero or more times the text
+not matched by A".
+
+Catenation is on the next lower precedence rung, so that AB? means A(B?), or
+"match A, and then optionally B", not "match A and B, as one optional
+unit".  The latter must be written (AB)?  using parentheses to override
+precedence.
+
+The precedence of conjunction (&) is lower than catenation, but not as low as
+that of disjunction, thus AB
 
 The disjunction operator | has the lowest precedence, lower than catenation.
-Thus abc|def means "match abc, or match def". The meaning "match ab,
-then c or d, then ef" must be expressed as ab(c|d)ef, or using
-a character class: ab[cd]ef.
+Thus ABC|DEF means "match ABC, or match DEF". The meaning "match AB,
+then C or D, then EF" must be expressed as AB(C|D)EF, or using
+a character class: AB[CD]EF.
+
+The regular expression
+
+  ABC*&~DEF+|Z
+
+Is parsed as if it were grouped with parentheses like this:
+
+  ((AB(C*))&(~(DE(F+))))|Z
+
+The main constituent is the | operator, whose left side is an & expression,
+et cetera.
 
 In
 .b txr,