summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog50
-rw-r--r--lib.c4
-rw-r--r--lib.h2
-rw-r--r--parser.l8
-rw-r--r--parser.y8
-rw-r--r--regex.c305
-rw-r--r--txr.146
7 files changed, 372 insertions, 51 deletions
diff --git a/ChangeLog b/ChangeLog
index 328f6452..3db0d797 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,53 @@
+2010-01-05 Kaz Kylheku <kkylheku@gmail.com>
+
+ Implemented the regular expression ~ and & operators.
+ This turns out to be easy to do in NFA land.
+ The complement of an NFA has exactly the same number
+ and configuration of states and transitions, except
+ that the states have an inverted meaning; and furthermore,
+ failed character transitions are routed to an extra
+ state (which in this impelmentation is permanently
+ allocated and shared by all regexes). The regex &
+ is implemented trivially using DeMorgan's.
+
+ Also, bugfix: regular expressions like A|B|C are allowed
+ now by the syntax, rather than constituting syntax error.
+ Previously, this would have been entered as (A|B)|C.
+
+ * lib.c (comp_s, and_s): New symbol globals.
+ (obj_init): New symbols interned.
+
+ * lib.h (comp_s, and_s): Declared.
+
+ * parser.l (grammar): Provide new ~ and & tokens in REGEX state.
+
+ * parser.y (regexpr): Constituents of '|' are regexprs,
+ rather than regbranches (see bugfix note above).
+ The '&' operator is added.
+ (regterm): The '~' operator is added.
+
+ * regex.c (struct any_char_set, struct small_char_set, struct
+ displaced_char_set): refs field added.
+ (nfa_kind_t): New enum members nfa_super_accept,
+ nfa_reject, nfa_compl_empty, nfa_compl_wild,
+ nfa_compl_single, nfa_compl_set.
+ (nfa_super_accept_state): New static structure.
+ (nfa_is_accept_state): New inline function.
+ (char_set_create): Initialize reference count to 1.
+ (char_set_destroy): Decrement refcount, free if zero.
+ (char_set_clone): New function.
+ (nfa_state_empty_convert, nfa_state_merge): Handle nfa_reject state,
+ the complement of nfa_accept.
+ (nfa_compl_state, nfa_compl): New functions.
+ (nfa_compile_regex): Handle new operators.
+ (nfa_all_states, nfa_closure): Handle new state types.
+ (nfa_move): Handle new types according to special rules:
+ the new complemented states that have character transitions have a next
+ move to the super-accept state if they do not match the input
+ character.
+
+ * txr.1: Documented new regex operators.
+
2009-12-09 Kaz Kylheku <kkylheku@gmail.com>
* parser.l (YYINPUT): Fix signed/unsigned comparison.
diff --git a/lib.c b/lib.c
index 8094f678..b41a7cc5 100644
--- a/lib.c
+++ b/lib.c
@@ -52,7 +52,7 @@ val system_package, keyword_package, user_package;
val null, t, cons_s, str_s, chr_s, num_s, sym_s, pkg_s, fun_s, vec_s;
val stream_s, hash_s, lcons_s, lstr_s, cobj_s;
val var_s, regex_s, set_s, cset_s, wild_s, oneplus_s;
-val zeroplus_s, optional_s, compound_s, or_s, quasi_s;
+val zeroplus_s, optional_s, compl_s, compound_s, or_s, and_s, quasi_s;
val skip_s, trailer_s, block_s, next_s, freeform_s, fail_s, accept_s;
val all_s, some_s, none_s, maybe_s, cases_s, collect_s, until_s, coll_s;
val define_s, output_s, single_s, first_s, last_s, empty_s;
@@ -1891,8 +1891,10 @@ static void obj_init(void)
oneplus_s = intern(lit("1+"), user_package);
zeroplus_s = intern(lit("0+"), user_package);
optional_s = intern(lit("?"), user_package);
+ compl_s = intern(lit("~"), user_package);
compound_s = intern(lit("compound"), user_package);
or_s = intern(lit("or"), user_package);
+ and_s = intern(lit("and"), user_package);
quasi_s = intern(lit("quasi"), system_package);
skip_s = intern(lit("skip"), user_package);
trailer_s = intern(lit("trailer"), user_package);
diff --git a/lib.h b/lib.h
index 6188fc83..b50c1075 100644
--- a/lib.h
+++ b/lib.h
@@ -208,7 +208,7 @@ extern val keyword_package, system_package, user_package;
extern val null, t, cons_s, str_s, chr_s, num_s, sym_s, pkg_s, fun_s, vec_s;
extern val stream_s, hash_s, lcons_s, lstr_s, cobj_s;
extern val var_s, regex_s, set_s, cset_s, wild_s, oneplus_s;
-extern val zeroplus_s, optional_s, compound_s, or_s, quasi_s;
+extern val zeroplus_s, optional_s, compl_s, compound_s, or_s, and_s, quasi_s;
extern val skip_s, trailer_s, block_s, next_s, freeform_s, fail_s, accept_s;
extern val all_s, some_s, none_s, maybe_s, cases_s, collect_s, until_s, coll_s;
extern val define_s, output_s, single_s, first_s, last_s, empty_s;
diff --git a/parser.l b/parser.l
index dcb45ea9..e0c0f2d5 100644
--- a/parser.l
+++ b/parser.l
@@ -444,10 +444,10 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
yyerror("newline in regex");
}
-<REGEX>[.*?+^] {
- yylval.chr = yytext[0];
- return yytext[0];
- }
+<REGEX>[.*?+^~&] {
+ yylval.chr = yytext[0];
+ return yytext[0];
+ }
<REGEX>[\[\]\-] {
diff --git a/parser.y b/parser.y
index abac6c00..51386217 100644
--- a/parser.y
+++ b/parser.y
@@ -79,7 +79,8 @@ static val parsed_spec;
%nonassoc '{' '}' '[' ']' '(' ')'
%right IDENT TEXT NUMBER
%left '|' '/'
-%right '*' '?' '+'
+%left '&'
+%right '~' '*' '?' '+'
%right '^' '.' '\\' REGCHAR LITCHAR
%%
@@ -454,7 +455,9 @@ regex : '/' regexpr '/' { $$ = $2; }
;
regexpr : regbranch { $$ = $1; }
- | regbranch '|' regbranch { $$ = list(list(or_s, $1,
+ | regexpr '|' regexpr { $$ = list(list(or_s, $1,
+ $3, nao), nao); }
+ | regexpr '&' regexpr { $$ = list(list(and_s, $1,
$3, nao), nao); }
;
@@ -471,6 +474,7 @@ regterm : '[' regclass ']' { $$ = cons(set_s, $2); }
| regterm '*' { $$ = list(zeroplus_s, $1, nao); }
| regterm '+' { $$ = list(oneplus_s, $1, nao); }
| regterm '?' { $$ = list(optional_s, $1, nao); }
+ | '~' regterm { $$ = list(compl_s, $2, nao); }
| REGCHAR { $$ = chr($1); }
| '(' regexpr ')' { $$ = cons(compound_s, $2); }
| '(' error { $$ = nil;
diff --git a/regex.c b/regex.c
index ec1e656c..5cbe11a0 100644
--- a/regex.c
+++ b/regex.c
@@ -71,17 +71,20 @@ typedef cset_L2_t *cset_L3_t[17];
struct any_char_set {
unsigned type : 3;
unsigned comp : 1;
+ unsigned refs : 12;
};
struct small_char_set {
unsigned type : 3;
unsigned comp : 1;
+ unsigned refs : 12;
cset_L0_t bitcell;
};
struct displaced_char_set {
unsigned type : 3;
unsigned comp : 1;
+ unsigned refs : 12;
cset_L0_t bitcell;
wchar_t base;
};
@@ -90,12 +93,14 @@ struct displaced_char_set {
struct large_char_set {
unsigned type : 3;
unsigned comp : 1;
+ unsigned refs : 12;
cset_L2_t dir;
};
struct xlarge_char_set {
unsigned type : 3;
unsigned comp : 1;
+ unsigned refs : 12;
cset_L3_t dir;
};
@@ -109,15 +114,38 @@ typedef union char_set {
#define NFA_SET_SIZE 512
+/*
+ * NFA node types, where each node represents a state, plus up to
+ * either two epsilon-transitions, or a single transition for a character, *
+ * character class or wild (match any character). The types starting
+ * with nfa_reject are involved in special representations needed
+ * for complement NFA's; these only occur if a regex contains the
+ * complement operator.
+ */
+
typedef enum {
- nfa_accept, nfa_empty, nfa_wild, nfa_single, nfa_set
+ nfa_accept, nfa_empty, nfa_wild,
+ nfa_single, nfa_set, nfa_super_accept,
+ nfa_reject, nfa_compl_empty, nfa_compl_wild,
+ nfa_compl_single, nfa_compl_set
} nfa_kind_t;
+/*
+ * Node used for types nfa_accept, nfa_reject, nfa_super_accept.
+ *
+ * NOTE: nfa_super_accept is globally instantiated; there is
+ * only one node of this type in the system. Transitions to it
+ * are implicit. Its visit flag is never used, so there is no
+ * reentrancy issue.
+ */
struct nfa_state_accept {
nfa_kind_t kind;
unsigned visited;
};
+/*
+ * Node used for nfa_empty, nfa_compl_empty.
+ */
struct nfa_state_empty {
nfa_kind_t kind;
unsigned visited;
@@ -125,13 +153,20 @@ struct nfa_state_empty {
nfa_state_t *trans1;
};
+/*
+ * Node used for nfa_single, nfa_wild,
+ * nfa_compl_single, nfa_compl_wild.
+ */
struct nfa_state_single {
nfa_kind_t kind;
unsigned visited;
nfa_state_t *trans;
- wchar_t ch;
+ wchar_t ch; /* set to 0 if nfa_state_wild or nfa_compl_wild. */
};
+/*
+ * Node used for nfa_set, nfa_compl_set.
+ */
struct nfa_state_set {
nfa_kind_t kind;
unsigned visited;
@@ -155,6 +190,23 @@ struct nfa_machine {
nfa_t nfa;
};
+static nfa_state_t nfa_super_accept_state = { { nfa_super_accept } };
+
+INLINE int nfa_is_accept_state(nfa_state_t *state)
+{
+ switch (state->a.kind) {
+ case nfa_accept:
+ case nfa_super_accept:
+ case nfa_compl_empty:
+ case nfa_compl_wild:
+ case nfa_compl_single:
+ case nfa_compl_set:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
static int L0_full(cset_L0_t *L0)
{
int i;
@@ -423,6 +475,7 @@ static char_set_t *char_set_create(chset_type_t type, wchar_t base)
char_set_t *cs = (char_set_t *) chk_malloc(sizeof *cs);
*cs = blank;
cs->any.type = type;
+ cs->any.refs = 1;
if (type == CHSET_DISPLACED)
cs->d.base = base;
@@ -432,22 +485,30 @@ static char_set_t *char_set_create(chset_type_t type, wchar_t base)
static void char_set_destroy(char_set_t *set)
{
- switch (set->any.type) {
- case CHSET_DISPLACED:
- case CHSET_SMALL:
- free(set);
- break;
- case CHSET_LARGE:
- L2_free(&set->l.dir);
- free(set);
- break;
- case CHSET_XLARGE:
- L3_free(&set->xl.dir);
- free(set);
- break;
+ if (--set->any.refs == 0) {
+ switch (set->any.type) {
+ case CHSET_DISPLACED:
+ case CHSET_SMALL:
+ free(set);
+ break;
+ case CHSET_LARGE:
+ L2_free(&set->l.dir);
+ free(set);
+ break;
+ case CHSET_XLARGE:
+ L3_free(&set->xl.dir);
+ free(set);
+ break;
+ }
}
}
+static char_set_t *char_set_clone(char_set_t *set)
+{
+ set->any.refs++;
+ return set;
+}
+
static void char_set_compl(char_set_t *set)
{
set->any.comp = 1;
@@ -602,7 +663,7 @@ static nfa_state_t *nfa_state_set(nfa_state_t *t, char_set_t *cs)
static void nfa_state_empty_convert(nfa_state_t *acc, nfa_state_t *t0,
nfa_state_t *t1)
{
- assert (acc->a.kind == nfa_accept);
+ assert (acc->a.kind == nfa_accept || acc->a.kind == nfa_reject);
acc->e.kind = nfa_empty;
acc->e.trans0 = t0;
acc->e.trans1 = t1;
@@ -624,7 +685,7 @@ static void nfa_state_empty_convert(nfa_state_t *acc, nfa_state_t *t0,
*/
static void nfa_state_merge(nfa_state_t *acc, nfa_state_t *st)
{
- assert (acc->a.kind == nfa_accept);
+ assert (acc->a.kind == nfa_accept || acc->a.kind == nfa_reject);
*acc = *st;
}
@@ -723,6 +784,136 @@ static nfa_t nfa_compile_set(val args, int comp)
}
}
+static nfa_state_t *nfa_compl_state(nfa_state_t *in)
+{
+ nfa_state_t *st = (nfa_state_t *) chk_malloc(sizeof *st);
+
+ st->a.visited = 0;
+
+ switch (in->a.kind) {
+ case nfa_accept:
+ st->a.kind = nfa_reject;
+ break;
+ case nfa_empty:
+ st->a.kind = nfa_compl_empty;
+ st->e.trans0 = 0;
+ st->e.trans1 = 0;
+ break;
+ case nfa_wild:
+ st->a.kind = nfa_compl_wild;
+ st->o.trans = 0;
+ st->o.ch = 0;
+ break;
+ case nfa_single:
+ st->a.kind = nfa_compl_single;
+ st->o.trans = 0;
+ st->o.ch = in->o.ch;
+ break;
+ case nfa_set:
+ st->a.kind = nfa_compl_set;
+ st->s.trans = 0;
+ st->s.set = char_set_clone(in->s.set);
+ break;
+ case nfa_super_accept:
+ internal_error("NFA super accept state explicitly reachable in graph");
+ case nfa_reject:
+ st->a.kind = nfa_accept;
+ break;
+ case nfa_compl_empty:
+ st->a.kind = nfa_empty;
+ st->e.trans0 = 0;
+ st->e.trans1 = 0;
+ break;
+ case nfa_compl_wild:
+ st->a.kind = nfa_wild;
+ st->o.trans = 0;
+ st->o.ch = 0;
+ break;
+ case nfa_compl_single:
+ st->a.kind = nfa_single;
+ st->o.trans = 0;
+ st->o.ch = in->o.ch;
+ break;
+ case nfa_compl_set:
+ st->a.kind = nfa_set;
+ st->s.trans = 0;
+ st->s.set = char_set_clone(in->s.set);
+ break;
+ }
+
+ return st;
+}
+
+static nfa_t nfa_compl(nfa_t in)
+{
+ nfa_state_t **stack = (nfa_state_t **) chk_malloc(NFA_SET_SIZE*sizeof *stack);
+ nfa_state_t **temp = (nfa_state_t **) chk_malloc(NFA_SET_SIZE*sizeof *temp);
+ unsigned visited = ++in.start->a.visited;
+ int i, num = 2;
+
+ stack[0] = in.start;
+ stack[1] = in.accept;
+
+ temp[0] = nfa_compl_state(stack[0]);
+ temp[1] = nfa_compl_state(stack[1]);
+
+ for (i = 0; i < num; i++) {
+ nfa_state_t *s = stack[i];
+ nfa_state_t *t = temp[i];
+
+ if (num >= NFA_SET_SIZE)
+ internal_error("NFA set size exceeded");
+
+ switch (s->a.kind) {
+ case nfa_accept:
+ case nfa_reject:
+ break;
+ case nfa_empty:
+ case nfa_compl_empty:
+ {
+ nfa_state_t *e0 = s->e.trans0;
+ nfa_state_t *e1 = s->e.trans1;
+
+ if (e0 && e0->a.visited != visited) {
+ e0->a.visited = visited;
+ temp[num] = t->e.trans0 = nfa_compl_state(e0);
+ stack[num++] = e0;
+ }
+ if (e1 && e1->a.visited != visited) {
+ e1->a.visited = visited;
+ temp[num] = t->e.trans1 = nfa_compl_state(e1);
+ stack[num++] = e1;
+ }
+ }
+ break;
+ case nfa_wild:
+ case nfa_single:
+ case nfa_set:
+ case nfa_compl_wild:
+ case nfa_compl_single:
+ case nfa_compl_set:
+ if (s->o.trans->a.visited != visited) {
+ s->o.trans->a.visited = visited;
+ temp[num] = t->o.trans = nfa_compl_state(s->o.trans);
+ stack[num++] = s->o.trans;
+ }
+ break;
+ case nfa_super_accept:
+ internal_error("NFA super accept state explicitly reachable in graph");
+ }
+ }
+
+ if (num > NFA_SET_SIZE)
+ internal_error("NFA set size exceeded");
+
+ {
+ nfa_t ret = { temp[0], temp[1] };
+ free(temp);
+ free(stack);
+ return ret;
+ }
+}
+
/*
* Input is the items from a regex form,
* not including the regex symbol.
@@ -785,6 +976,9 @@ nfa_t nfa_compile_regex(val items)
nfa_t nfa_args = nfa_compile_regex(args);
nfa_state_t *s = nfa_state_empty(nfa_args.start, nfa_args.accept);
nfa = nfa_make(s, nfa_args.accept);
+ } else if (sym == compl_s) {
+ nfa_t nfa_args = nfa_compile_regex(args);
+ nfa = nfa_compl(nfa_args);
} else if (sym == or_s) {
/* Simple: make a new start and acceptance state, which form
the ends of a spindle that goes through two branches. */
@@ -798,6 +992,19 @@ nfa_t nfa_compile_regex(val items)
nfa_state_empty_convert(nfa_first.accept, acc, 0);
nfa_state_empty_convert(nfa_second.accept, acc, 0);
nfa = nfa_make(s, acc);
+ } else if (sym == and_s) {
+ /* We compile regex and using the De Morgan's equality:
+ R1&R2 == ~(~R1|~R2).
+ We could do this by transforming the source code,
+ and recursing into compiler, but instead, the approach taken
+ is NFA manipulation closely based on the or_s case above. */
+ nfa_t nfa_first = nfa_compl(nfa_compile_regex(first(args)));
+ nfa_t nfa_second = nfa_compl(nfa_compile_regex(second(args)));
+ nfa_state_t *acc = nfa_state_accept();
+ nfa_state_t *s = nfa_state_empty(nfa_first.start, nfa_second.start);
+ nfa_state_empty_convert(nfa_first.accept, acc, 0);
+ nfa_state_empty_convert(nfa_second.accept, acc, 0);
+ nfa = nfa_compl(nfa_make(s, acc));
} else {
assert (0 && "internal error: bad operator in regex");
}
@@ -817,45 +1024,52 @@ nfa_t nfa_compile_regex(val items)
}
}
-static int nfa_all_states(nfa_state_t **inout, int num, unsigned visited)
+static int nfa_all_states(nfa_state_t **stack, int num, unsigned visited)
{
int i;
for (i = 0; i < num; i++)
- inout[i]->a.visited = visited;
+ stack[i]->a.visited = visited;
for (i = 0; i < num; i++) {
- nfa_state_t *s = inout[i];
+ nfa_state_t *s = stack[i];
if (num >= NFA_SET_SIZE)
internal_error("NFA set size exceeded");
switch (s->a.kind) {
case nfa_accept:
+ case nfa_reject:
break;
case nfa_empty:
+ case nfa_compl_empty:
{
nfa_state_t *e0 = s->e.trans0;
nfa_state_t *e1 = s->e.trans1;
if (e0 && e0->a.visited != visited) {
e0->a.visited = visited;
- inout[num++] = e0;
+ stack[num++] = e0;
}
if (e1 && e1->a.visited != visited) {
e1->a.visited = visited;
- inout[num++] = e1;
+ stack[num++] = e1;
}
}
break;
case nfa_wild:
case nfa_single:
case nfa_set:
+ case nfa_compl_wild:
+ case nfa_compl_single:
+ case nfa_compl_set:
if (s->o.trans->a.visited != visited) {
s->o.trans->a.visited = visited;
- inout[num++] = s->o.trans;
+ stack[num++] = s->o.trans;
}
break;
+ case nfa_super_accept:
+ internal_error("NFA super accept state explicitly reachable in graph");
}
}
@@ -873,7 +1087,7 @@ void nfa_free(nfa_t nfa)
all[0] = nfa.start;
all[1] = nfa.accept;
- nstates = nfa_all_states(all, 2, nfa.start->a.visited);
+ nstates = nfa_all_states(all, 2, nfa.start->a.visited + 1);
for (i = 0; i < nstates; i++)
nfa_state_free(all[i]);
@@ -906,10 +1120,15 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
for (i = 0; i < nin; i++) {
if (stackp >= NFA_SET_SIZE)
internal_error("NFA set size exceeded");
- in[i]->a.visited = visited;
+ /* We do not mark the super accept state visited,
+ and we will never visit it by following e-transitions,
+ because all transitions into the super-accept state
+ are character-consuming. */
+ if (in[i] != &nfa_super_accept_state)
+ in[i]->a.visited = visited;
stack[stackp++] = in[i];
out[nout++] = in[i];
- if (in[i]->a.kind == nfa_accept)
+ if (nfa_is_accept_state(in[i]))
*accept = 1;
}
@@ -919,10 +1138,13 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
if (nout >= NFA_SET_SIZE)
internal_error("NFA set size exceeded");
- /* Only states of type nfa_empty are interesting.
- Each such state at most two epsilon transitions. */
-
- if (top->a.kind == nfa_empty) {
+ /* Only states of type nfa_empty or its complement are interesting.
+ Each such state at most two epsilon transitions.
+ Note above comment that epsilon transitions never lead
+ to the special super accept state; i.e. e0 and e1
+ are never the super accept state and so we can write to their
+ their visited field. */
+ if (top->a.kind == nfa_empty || top->a.kind == nfa_compl_empty) {
nfa_state_t *e0 = top->e.trans0;
nfa_state_t *e1 = top->e.trans1;
@@ -930,7 +1152,7 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
e0->a.visited = visited;
stack[stackp++] = e0;
out[nout++] = e0;
- if (e0->a.kind == nfa_accept)
+ if (nfa_is_accept_state(e0))
*accept = 1;
}
@@ -938,7 +1160,7 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
e1->a.visited = visited;
stack[stackp++] = e1;
out[nout++] = e1;
- if (e1->a.kind == nfa_accept)
+ if (nfa_is_accept_state(e1))
*accept = 1;
}
}
@@ -958,22 +1180,41 @@ static int nfa_closure(nfa_state_t **stack, nfa_state_t **in, int nin,
static int nfa_move(nfa_state_t **in, int nin, nfa_state_t **out, wchar_t ch)
{
int i, nmove;
+ int super_added = 0;
for (nmove = 0, i = 0; i < nin; i++) {
nfa_state_t *s = in[i];
switch (s->a.kind) {
case nfa_wild:
+ case nfa_compl_wild:
/* Unconditional match; don't have to look at ch. */
break;
case nfa_single:
if (s->o.ch == ch) /* Character match. */
break;
continue; /* no match */
+ case nfa_compl_single:
+ if (s->o.ch == ch) /* Character match. */
+ break;
+ goto add_super; /* no match means we can move to super accept state */
case nfa_set:
if (char_set_contains(s->s.set, ch)) /* Set match. */
break;
continue; /* no match */
+ case nfa_compl_set:
+ if (char_set_contains(s->s.set, ch)) /* Set match. */
+ break;
+ goto add_super;
+ case nfa_super_accept:
+ /* super always transitions to itself, for any character. */
+ add_super:
+ if (!super_added) {
+ super_added = 1;
+ if (nmove >= NFA_SET_SIZE)
+ internal_error("NFA set size exceeded");
+ out[nmove++] = &nfa_super_accept_state;
+ }
default:
/* Epsilon-transition and acceptance states have no character moves. */
continue;
diff --git a/txr.1 b/txr.1
index 03434d0e..25f8ac51 100644
--- a/txr.1
+++ b/txr.1
@@ -643,17 +643,21 @@ If RE is a regular expression, then so is (RE).
The contents of parentheses denote one regular expression unit, so that for
instance in (RE)*, the * operator applies to the entire parenthesized group.
.IP (RE)?
-optionally matches the preceding regular expression (RE).
+optionally match the preceding regular expression (RE).
.IP (RE)+
-matches the preceding expression one or more times.
+match the preceding expression one or more times.
.IP (RE)*
-matches the preceding expression zero or more times.
+match the preceding expression zero or more times.
+.IP ~(RE)
+match the complement of the preceding expression; i.e. match
+those texts that (RE) does not match.
.IP (RE1)(RE2)
Two consecutive regular expressions denote catenation:
the left expression must match, and then the right.
-
.IP (RE1)|(RE2)
-matches either the expression RE1 or RE2.
+match either the expression RE1 or RE2.
+.IP (RE1)&(RE2)
+match both the expression RE1 and RE2 simultaneously.
.PP
Any of the special characters, including the delimiting /, can be escaped with
@@ -674,14 +678,34 @@ The postfix operators ?, + and * have the second highest precedence, and
associate left to right, so that in A+?*, the * applies to A+?, and the ?
applies to A+.
-Catenation is on the next lower precedence rung, so that AB? means "match A,
-and then optionally B" not "match A and B, as one optional unit". The latter
-must be written (AB)? using parentheses to override precedence.
+The unary complement operator has the next lower precedence, so
+that ~A* means the ~(A*): "match the all text that is not matched by zero
+or more repetitions of A", not "match zero or more times the text
+not matched by A".
+
+Catenation is on the next lower precedence rung, so that AB? means A(B?), or
+"match A, and then optionally B", not "match A and B, as one optional
+unit". The latter must be written (AB)? using parentheses to override
+precedence.
+
+The precedence of conjunction (&) is lower than catenation, but not as low as
+that of disjunction, thus AB
The disjunction operator | has the lowest precedence, lower than catenation.
-Thus abc|def means "match abc, or match def". The meaning "match ab,
-then c or d, then ef" must be expressed as ab(c|d)ef, or using
-a character class: ab[cd]ef.
+Thus ABC|DEF means "match ABC, or match DEF". The meaning "match AB,
+then C or D, then EF" must be expressed as AB(C|D)EF, or using
+a character class: AB[CD]EF.
+
+The regular expression
+
+ ABC*&~DEF+|Z
+
+Is parsed as if it were grouped with parentheses like this:
+
+ ((AB(C*))&(~(DE(F+))))|Z
+
+The main constituent is the | operator, whose left side is an & expression,
+et cetera.
In
.b txr,