summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog59
-rw-r--r--gc.c30
-rw-r--r--lib.c449
-rw-r--r--lib.h34
-rw-r--r--match.c35
-rw-r--r--parser.y37
-rw-r--r--regex.c76
-rw-r--r--regex.h21
-rw-r--r--stream.c6
-rw-r--r--txr.c20
10 files changed, 592 insertions, 175 deletions
diff --git a/ChangeLog b/ChangeLog
index 9fa42564..0f4b2de1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,62 @@
+2009-10-20 Kaz Kylheku <kkylheku@gmail.com>
+
+ Start of implementation for freestyle matching.
+
+ Lazy strings implemented, incompletely.
+
+ Changed string function to implicitly strdup; non-strdup
+ version changed to string_own. Fixed wrong uses of strdup
+ rather than chk_strdup.
+
+ Functions added to regex module to provide regex matching
+ as a state machine to which characters are fed.
+
+ * lib.h (type_t): New enum member LSTR, for lazy strings.
+ (lstr_t, freestyle, type_check3, string_own): Declared.
+ (string): Parameter changed to const char *.
+ (lazy_stringp, split_str, lazy_str, lazy_str_force_upto,
+ lazy_str_force, lazy_str_get_trailing_list, length_str_gt,
+ length_str_ge, length_str_lt, length_str_le): Declared.
+
+ * lib.c (lstr_t, freestyle): New symbol globals.
+ (code2type, obj_print, obj_pprint, equal): Extended to handle LSTR.
+ (type_check3): New function.
+ (string_own): New function; does what string used to do.
+ (string): Duplicates the string with strdup, so callers don't have to.
+ (mkstring, copy_str, trim_str): Use string_own.
+ (stringp): A lazy string is a kind of string.
+ (lazy_stringp): New function.
+ (length_str, c_str, search_str, sub_str, chr_str,
+ chr_str_set): Handle lazy strings.
+ (split_str): New function.
+ (lazy_str, lazy_str_force_upto, lazy_str_force,
+ lazy_str_get_trailing_list, length_str_gt, length_str_ge,
+ length_str_lt, length_str_le): New functions.
+ (obj_init): New symbols interned. Eliminated strdup calls.
+
+ * gc.c (finalize, mark_obj): Changed to handle LSTR type.
+ Eliminated default case from switch so we get a gcc
+ diagnostic if a case is not handled.
+
+ * match.c (match_files): Eliminated strdup calls.
+ Added freeform directive.
+
+ * parser.y (grammar): Changed string calls to string_own.
+
+ * stream.c (stdio_get_line, get_string_from_stream):
+ Changed string calls to string_own.
+ (dir_get_line): Eliminated chk_strdup call.
+
+ * txr.c (remove_hash_bang_line, main): Eliminated strdup calls.
+
+ * regex.h (nfam_result): New union.
+ (nfa_machine, nfa_machine_t): New struct and typedef.
+ (nfa_machine_init, nfa_machine_cleanup, nfa_machine_feed,
+ nfa_machine_match_span): New functions declared.
+
+ * regex.c (nfa_machine_init, nfa_machine_cleanup, nfa_machine_feed,
+ nfa_machine_match_span): New functions defined.
+
2009-10-18 Kaz Kylheku <kkylheku@gmail.com>
Trivial change allows regexps to be bound to variables,
diff --git a/gc.c b/gc.c
index 32411a62..b61fd90b 100644
--- a/gc.c
+++ b/gc.c
@@ -147,33 +147,34 @@ static void finalize(obj_t *obj)
{
switch (obj->t.type) {
case CONS:
- break;
+ return;
case STR:
if (!opt_gc_debug) {
free(obj->st.str);
obj->st.str = 0;
}
- break;
+ return;
case CHR:
case NUM:
case SYM:
case FUN:
- break;
+ return;
case VEC:
if (!opt_gc_debug) {
free(obj->v.vec-2);
obj->v.vec = 0;
}
- break;
+ return;
case LCONS:
- break;
+ case LSTR:
+ return;
case COBJ:
if (obj->co.ops->destroy)
obj->co.ops->destroy(obj);
- break;
- default:
- assert (0 && "corrupt type field");
+ return;
}
+
+ assert (0 && "corrupt type field");
}
static void mark_obj(obj_t *obj)
@@ -208,7 +209,7 @@ tail_call:
mark_obj_tail(obj->st.len);
case CHR:
case NUM:
- break;
+ return;
case SYM:
mark_obj(obj->s.name);
mark_obj_tail(obj->s.val);
@@ -216,7 +217,7 @@ tail_call:
mark_obj(obj->f.env);
if (obj->f.functype == FINTERP)
mark_obj_tail(obj->f.f.interp_fun);
- break;
+ return;
case VEC:
{
obj_t *alloc_size = obj->v.vec[-2];
@@ -229,18 +230,21 @@ tail_call:
for (i = 0; i < fp; i++)
mark_obj(obj->v.vec[i]);
}
- break;
+ return;
case LCONS:
mark_obj(obj->lc.func);
mark_obj(obj->lc.car);
mark_obj_tail(obj->lc.cdr);
+ case LSTR:
+ mark_obj(obj->ls.prefix);
+ mark_obj_tail(obj->ls.list);
case COBJ:
if (obj->co.ops->mark)
obj->co.ops->mark(obj);
mark_obj_tail(obj->co.cls);
- default:
- assert (0 && "corrupt type field");
}
+
+ assert (0 && "corrupt type field");
}
static int in_heap(obj_t *ptr)
diff --git a/lib.c b/lib.c
index 4953b644..c5a066d1 100644
--- a/lib.c
+++ b/lib.c
@@ -44,9 +44,10 @@
obj_t *interned_syms;
obj_t *null, *t, *cons_t, *str_t, *chr_t, *num_t, *sym_t, *fun_t, *vec_t;
-obj_t *stream_t, *lcons_t, *cobj_t, *var, *regex, *set, *cset, *wild, *oneplus;
+obj_t *stream_t, *lcons_t, *lstr_t, *cobj_t;
+obj_t *var, *regex, *set, *cset, *wild, *oneplus;
obj_t *zeroplus, *optional, *compound, *or, *quasi;
-obj_t *skip, *trailer, *block, *next, *fail, *accept;
+obj_t *skip, *trailer, *block, *next, *freeform, *fail, *accept;
obj_t *all, *some, *none, *maybe, *cases, *collect, *until, *coll;
obj_t *define, *output, *single, *frst, *lst, *empty, *repeat, *rep;
obj_t *flattn, *forget, *local, *mrge, *bind, *cat, *args;
@@ -83,7 +84,7 @@ static obj_t *equal_tramp(obj_t *env, obj_t *, obj_t *);
static obj_t *code2type(int code)
{
- switch (code) {
+ switch ((type_t) code) {
case CONS: return cons_t;
case STR: return str_t;
case CHR: return chr_t;
@@ -92,6 +93,7 @@ static obj_t *code2type(int code)
case FUN: return fun_t;
case VEC: return vec_t;
case LCONS: return lcons_t;
+ case LSTR: return lstr_t;
case COBJ: return cobj_t;
}
return nil;
@@ -126,6 +128,14 @@ obj_t *type_check2(obj_t *obj, int t1, int t2)
return t;
}
+obj_t *type_check3(obj_t *obj, int t1, int t2, int t3)
+{
+ if (!obj || (obj->t.type != t1 && obj->t.type != t2 && obj->t.type != t3))
+ type_mismatch("~s is not of type ~s, ~s nor ~s", obj,
+ code2type(t1), code2type(t2), code2type(t3), nao);
+ return t;
+}
+
obj_t *car(obj_t *cons)
{
if (cons == nil)
@@ -451,6 +461,14 @@ obj_t *equal(obj_t *left, obj_t *right)
return t;
}
return nil;
+ case LSTR:
+ if (right->t.type == STR || right->t.type == LSTR) {
+ lazy_str_force(left);
+ if (right->t.type == LSTR)
+ lazy_str_force(right);
+ return equal(left->ls.prefix, right->ls.prefix);
+ }
+ return nil;
case COBJ:
if (right->t.type == COBJ)
return left->co.ops->equal(left, right);
@@ -655,7 +673,7 @@ obj_t *min2(obj_t *anum, obj_t *bnum)
return c_num(anum) < c_num(bnum) ? anum : bnum;
}
-obj_t *string(char *str)
+obj_t *string_own(char *str)
{
obj_t *obj = make_obj();
obj->st.type = STR;
@@ -664,10 +682,19 @@ obj_t *string(char *str)
return obj;
}
+obj_t *string(const char *str)
+{
+ obj_t *obj = make_obj();
+ obj->st.type = STR;
+ obj->st.str = chk_strdup(str);
+ obj->st.len = nil;
+ return obj;
+}
+
obj_t *mkstring(obj_t *len, obj_t *ch)
{
char *str = chk_malloc(c_num(len) + 1);
- obj_t *s = string(str);
+ obj_t *s = string_own(str);
memset(str, c_chr(ch), c_num(len));
str[c_num(len)] = 0;
s->st.len = len;
@@ -677,7 +704,7 @@ obj_t *mkstring(obj_t *len, obj_t *ch)
obj_t *mkustring(obj_t *len)
{
char *str = chk_malloc(c_num(len) + 1);
- obj_t *s = string(str);
+ obj_t *s = string_own(str);
s->st.len = len;
return s;
}
@@ -690,17 +717,28 @@ obj_t *init_str(obj_t *str, const char *data)
obj_t *copy_str(obj_t *str)
{
- return string(strdup(c_str(str)));
+ return string(c_str(str));
}
obj_t *stringp(obj_t *str)
{
- return (str && str->st.type == STR) ? t : nil;
+ return (str && (str->st.type == STR || str->st.type == LSTR)) ? t : nil;
+}
+
+obj_t *lazy_stringp(obj_t *str)
+{
+ return (str && (str->st.type == LSTR)) ? t : nil;
}
obj_t *length_str(obj_t *str)
{
- type_check (str, STR);
+ type_check2 (str, STR, LSTR);
+
+ if (str->ls.type == LSTR) {
+ lazy_str_force(str);
+ return length(str->ls.prefix);
+ }
+
if (!str->st.len)
str->st.len = num(strlen(str->st.str));
return str->st.len;
@@ -708,13 +746,16 @@ obj_t *length_str(obj_t *str)
const char *c_str(obj_t *obj)
{
- type_check2(obj, STR, SYM);
+ type_check3(obj, STR, SYM, LSTR);
switch (obj->t.type) {
case STR:
return obj->st.str;
case SYM:
return c_str(symbol_name(obj));
+ case LSTR:
+ lazy_str_force(obj);
+ return c_str(obj->ls.prefix);
default:
abort();
}
@@ -723,19 +764,38 @@ const char *c_str(obj_t *obj)
obj_t *search_str(obj_t *haystack, obj_t *needle, obj_t *start_num,
obj_t *from_end)
{
- const char *h = c_str(haystack);
- long len = c_num(length_str(haystack));
- long start = c_num(start_num);
-
- if (start > len) {
+ if (length_str_lt(haystack, start_num)) {
return nil;
} else {
- const char *n = c_str(needle), *good = 0, *pos, *from = h + start;
+ obj_t *h_is_lazy = lazy_stringp(haystack);
+ long start = c_num(start_num);
+ long good = -1, pos = -1;
+ const char *n = c_str(needle), *h;
- do {
- pos = strstr(from, n);
- } while (pos && (good = pos) && from_end && *(from = pos + 1));
- return (good == 0) ? nil : num(good - h);
+ if (!h_is_lazy) {
+ do {
+ const char *f;
+ h = c_str(haystack);
+
+ f = strstr(h + start, n);
+
+ if (f)
+ pos = f - h;
+ else
+ pos = -1;
+ } while (pos != -1 && (good = pos) != -1 && from_end && h[++start]);
+ } else {
+ size_t ln = c_num(length_str(needle));
+
+ do {
+ lazy_str_force_upto(haystack, plus(num(start + 1), length_str(needle)));
+ h = c_str(haystack->ls.prefix);
+
+ if (!strncmp(h + start, n, ln))
+ good = start;
+ } while (h[++start] && (from_end || good == -1));
+ }
+ return (good == -1) ? nil : num(good);
}
}
@@ -758,30 +818,27 @@ obj_t *search_str_tree(obj_t *haystack, obj_t *tree, obj_t *start_num,
return nil;
}
-obj_t *sub_str(obj_t *str_in, obj_t *from_num, obj_t *to_num)
+obj_t *sub_str(obj_t *str_in, obj_t *from, obj_t *to)
{
- const char *str = c_str(str_in);
- size_t len = c_num(length_str(str_in));
- long from = from_num ? c_num(from_num) : 0;
- long to = to_num ? c_num(to_num) : len;
+ if (from == nil || lt(from, zero))
+ from = zero;
+ if (to == nil)
+ to = length_str(str_in);
+ else if (lt(to, zero))
+ to = zero;
+ if (length_str_lt(str_in, from))
+ from = length_str(str_in);
+ if (length_str_lt(str_in, to))
+ to = length_str(str_in);
- if (to < 0)
- to = 0;
- if (from < 0)
- from = 0;
- if (from > len)
- from = len;
- if (to > len)
- to = len;
-
- if (from >= to) {
+ if (ge(from, to)) {
return null_string;
} else {
- size_t size = to - from + 1;
+ size_t size = c_num(to) - c_num(from) + 1;
char *sub = chk_malloc(size);
- strncpy(sub, str + from, size);
+ strncpy(sub, c_str(str_in) + c_num(from), size);
sub[size-1] = 0;
- return string(sub);
+ return string_own(sub);
}
}
@@ -833,7 +890,27 @@ obj_t *cat_str(obj_t *list, obj_t *sep)
}
*ptr = 0;
- return string(str);
+ return string_own(str);
+}
+
+obj_t *split_str(obj_t *str, obj_t *sep)
+{
+ const char *cstr = c_str(str);
+ const char *csep = c_str(sep);
+ list_collect_decl (out, iter);
+
+ for (;;) {
+ size_t span = strcspn(cstr, csep);
+ obj_t *piece = mkustring(num(span));
+ init_str(piece, cstr);
+ list_collect (iter, piece);
+ cstr += span;
+ if (!*cstr)
+ break;
+ cstr++;
+ }
+
+ return out;
}
obj_t *trim_str(obj_t *str)
@@ -854,7 +931,7 @@ obj_t *trim_str(obj_t *str)
char *new = chk_malloc(len + 1);
memcpy(new, start, len);
new[len] = 0;
- return string(new);
+ return string_own(new);
}
}
@@ -885,24 +962,26 @@ int c_chr(obj_t *chr)
obj_t *chr_str(obj_t *str, obj_t *index)
{
- long l = c_num(length_str(str));
- long i = c_num(index);
- const char *s = c_str(str);
-
- bug_unless (i < l);
+ bug_unless (length_str_gt(str, index));
- return chr(s[i]);
+ if (lazy_stringp(str)) {
+ lazy_str_force_upto(str, index);
+ return chr(c_str(str->ls.prefix)[c_num(index)]);
+ } else {
+ return chr(c_str(str)[c_num(index)]);
+ }
}
obj_t *chr_str_set(obj_t *str, obj_t *index, obj_t *chr)
{
- long l = c_num(length_str(str));
- long i = c_num(index);
- char *s = str->st.str;
-
- bug_unless (i < l);
+ bug_unless (length_str_gt(str, index));
- s[i] = c_chr(chr);
+ if (lazy_stringp(str)) {
+ lazy_str_force_upto(str, index);
+ str->ls.prefix->st.str[c_num(index)] = c_chr(chr);
+ } else {
+ str->st.str[c_num(index)] = c_chr(chr);
+ }
return chr;
}
@@ -1277,6 +1356,124 @@ obj_t *lazy_stream_cons(obj_t *stream)
lazy_stream_func));
}
+obj_t *lazy_str(obj_t *list)
+{
+ obj_t *obj = make_obj();
+ obj->ls.type = LSTR;
+
+ if (nullp(list)) {
+ obj->ls.prefix = null_string;
+ obj->ls.list = nil;
+ } else {
+ obj->ls.prefix = first(list);
+ obj->ls.list = rest(list);
+ }
+
+ return obj;
+}
+
+obj_t *lazy_str_force(obj_t *lstr)
+{
+ type_check(lstr, LSTR);
+
+ if (lstr->ls.list) {
+ lstr->ls.prefix = cat_str(cons(lstr->ls.prefix, lstr->ls.list),
+ string("\n"));
+ lstr->ls.list = nil;
+ }
+
+ return lstr->ls.prefix;
+}
+
+obj_t *lazy_str_force_upto(obj_t *lstr, obj_t *index)
+{
+ type_check(lstr, LSTR);
+
+ while (gt(index, length_str(lstr->ls.prefix)) && lstr->ls.list) {
+ obj_t *next = pop(&lstr->ls.list);
+ lstr->ls.prefix = cat_str(cons(lstr->ls.prefix, cons(next, nil)),
+ string("\n"));
+ }
+
+ return lt(index, length_str(lstr->ls.prefix));
+}
+
+obj_t *length_str_gt(obj_t *str, obj_t *len)
+{
+ type_check2 (str, STR, LSTR);
+
+ switch (str->t.type) {
+ case STR:
+ return gt(length_str(str), len);
+ case LSTR:
+ lazy_str_force_upto(str, len);
+ return gt(length_str(str->ls.prefix), len);
+ default:
+ internal_error("unexpected type value");
+ }
+}
+
+obj_t *length_str_ge(obj_t *str, obj_t *len)
+{
+ type_check2 (str, STR, LSTR);
+
+ switch (str->t.type) {
+ case STR:
+ return ge(length_str(str), len);
+ case LSTR:
+ lazy_str_force_upto(str, len);
+ return ge(length_str(str->ls.prefix), len);
+ default:
+ internal_error("unexpected type value");
+ }
+}
+
+obj_t *length_str_lt(obj_t *str, obj_t *len)
+{
+ type_check2 (str, STR, LSTR);
+
+ switch (str->t.type) {
+ case STR:
+ return lt(length_str(str), len);
+ case LSTR:
+ lazy_str_force_upto(str, len);
+ return lt(length_str(str->ls.prefix), len);
+ default:
+ internal_error("unexpected type value");
+ }
+}
+
+obj_t *length_str_le(obj_t *str, obj_t *len)
+{
+ type_check2 (str, STR, LSTR);
+
+ switch (str->t.type) {
+ case STR:
+ return le(length_str(str), len);
+ case LSTR:
+ lazy_str_force_upto(str, len);
+ return le(length_str(str->ls.prefix), len);
+ default:
+ internal_error("unexpected type value");
+ }
+}
+
+obj_t *lazy_str_get_trailing_list(obj_t *lstr, obj_t *index)
+{
+ type_check(lstr, LSTR);
+
+ /* Force lazy string up through the index position */
+ if (ge(index, length_str(lstr->ls.prefix)))
+ lazy_str_force_upto(lstr, index);
+
+ {
+ obj_t *split_suffix = split_str(sub_str(lstr->ls.prefix, index, nil),
+ string("\n"));
+
+ return nappend2(split_suffix, lstr->ls.list);
+ }
+}
+
obj_t *cobj(void *handle, obj_t *cls_sym, struct cobj_ops *ops)
{
obj_t *obj = make_obj();
@@ -1463,73 +1660,75 @@ static void obj_init(void)
&null_list, &equal_f,
&identity_f, &prog_string, 0);
- nil_string = string(strdup("nil"));
-
- null = intern(string(strdup("null")));
- t = intern(string(strdup("t")));
- cons_t = intern(string(strdup("cons")));
- str_t = intern(string(strdup("str")));
- chr_t = intern(string(strdup("chr")));
- num_t = intern(string(strdup("num")));
- sym_t = intern(string(strdup("sym")));
- fun_t = intern(string(strdup("fun")));
- vec_t = intern(string(strdup("vec")));
- stream_t = intern(string(strdup("stream")));
- lcons_t = intern(string(strdup("lcons")));
- cobj_t = intern(string(strdup("cobj")));
- var = intern(string(strdup("$var")));
- regex = intern(string(strdup("$regex")));
- set = intern(string(strdup("set")));
- cset = intern(string(strdup("cset")));
- wild = intern(string(strdup("wild")));
- oneplus = intern(string(strdup("1+")));
- zeroplus = intern(string(strdup("0+")));
- optional = intern(string(strdup("?")));
- compound = intern(string(strdup("compound")));
- or = intern(string(strdup("or")));
- quasi = intern(string(strdup("$quasi")));
- skip = intern(string(strdup("skip")));
- trailer = intern(string(strdup("trailer")));
- block = intern(string(strdup("block")));
- next = intern(string(strdup("next")));
- fail = intern(string(strdup("fail")));
- accept = intern(string(strdup("accept")));
- all = intern(string(strdup("all")));
- some = intern(string(strdup("some")));
- none = intern(string(strdup("none")));
- maybe = intern(string(strdup("maybe")));
- cases = intern(string(strdup("cases")));
- collect = intern(string(strdup("collect")));
- until = intern(string(strdup("until")));
- coll = intern(string(strdup("coll")));
- define = intern(string(strdup("define")));
- output = intern(string(strdup("output")));
- single = intern(string(strdup("single")));
- frst = intern(string(strdup("first")));
- lst = intern(string(strdup("last")));
- empty = intern(string(strdup("empty")));
- repeat = intern(string(strdup("repeat")));
- rep = intern(string(strdup("rep")));
- flattn = intern(string(strdup("flatten")));
- forget = intern(string(strdup("forget")));
- local = intern(string(strdup("local")));
- mrge = intern(string(strdup("merge")));
- bind = intern(string(strdup("bind")));
- cat = intern(string(strdup("cat")));
- args = intern(string(strdup("args")));
- try = intern(string(strdup("try")));
- catch = intern(string(strdup("catch")));
- finally = intern(string(strdup("finally")));
- nothrow = intern(string(strdup("nothrow")));
- throw = intern(string(strdup("throw")));
- defex = intern(string(strdup("defex")));
- error = intern(string(strdup("error")));
- type_error = intern(string(strdup("type_error")));
- internal_err = intern(string(strdup("internal_error")));
- numeric_err = intern(string(strdup("numeric_error")));
- range_err = intern(string(strdup("range_error")));
- query_error = intern(string(strdup("query_error")));
- file_error = intern(string(strdup("file_error")));
+ nil_string = string("nil");
+
+ null = intern(string("null"));
+ t = intern(string("t"));
+ cons_t = intern(string("cons"));
+ str_t = intern(string("str"));
+ chr_t = intern(string("chr"));
+ num_t = intern(string("num"));
+ sym_t = intern(string("sym"));
+ fun_t = intern(string("fun"));
+ vec_t = intern(string("vec"));
+ stream_t = intern(string("stream"));
+ lcons_t = intern(string("lcons"));
+ lstr_t = intern(string("lstr"));
+ cobj_t = intern(string("cobj"));
+ var = intern(string("$var"));
+ regex = intern(string("$regex"));
+ set = intern(string("set"));
+ cset = intern(string("cset"));
+ wild = intern(string("wild"));
+ oneplus = intern(string("1+"));
+ zeroplus = intern(string("0+"));
+ optional = intern(string("?"));
+ compound = intern(string("compound"));
+ or = intern(string("or"));
+ quasi = intern(string("$quasi"));
+ skip = intern(string("skip"));
+ trailer = intern(string("trailer"));
+ block = intern(string("block"));
+ next = intern(string("next"));
+ freeform = intern(string("freeform"));
+ fail = intern(string("fail"));
+ accept = intern(string("accept"));
+ all = intern(string("all"));
+ some = intern(string("some"));
+ none = intern(string("none"));
+ maybe = intern(string("maybe"));
+ cases = intern(string("cases"));
+ collect = intern(string("collect"));
+ until = intern(string("until"));
+ coll = intern(string("coll"));
+ define = intern(string("define"));
+ output = intern(string("output"));
+ single = intern(string("single"));
+ frst = intern(string("first"));
+ lst = intern(string("last"));
+ empty = intern(string("empty"));
+ repeat = intern(string("repeat"));
+ rep = intern(string("rep"));
+ flattn = intern(string("flatten"));
+ forget = intern(string("forget"));
+ local = intern(string("local"));
+ mrge = intern(string("merge"));
+ bind = intern(string("bind"));
+ cat = intern(string("cat"));
+ args = intern(string("args"));
+ try = intern(string("try"));
+ catch = intern(string("catch"));
+ finally = intern(string("finally"));
+ nothrow = intern(string("nothrow"));
+ throw = intern(string("throw"));
+ defex = intern(string("defex"));
+ error = intern(string("error"));
+ type_error = intern(string("type_error"));
+ internal_err = intern(string("internal_error"));
+ numeric_err = intern(string("numeric_error"));
+ range_err = intern(string("range_error"));
+ query_error = intern(string("query_error"));
+ file_error = intern(string("file_error"));
interned_syms = cons(nil, interned_syms);
@@ -1540,13 +1739,13 @@ static void obj_init(void)
maxint = num(LONG_MAX);
minint = num(LONG_MIN);
- null_string = string(strdup(""));
+ null_string = string("");
null_list = cons(nil, nil);
equal_f = func_f2(nil, equal_tramp);
identity_f = func_f1(nil, identity_tramp);
- prog_string = string(strdup(progname));
+ prog_string = string(progname);
}
void obj_print(obj_t *obj, obj_t *out)
@@ -1648,6 +1847,10 @@ void obj_print(obj_t *obj, obj_t *out)
put_cchar(out, ')');
}
return;
+ case LSTR:
+ obj_print(obj->ls.prefix, out);
+ put_cstring(out, "#<... lazy string>");
+ return;
case COBJ:
obj->co.ops->print(obj, out);
return;
@@ -1710,6 +1913,10 @@ void obj_pprint(obj_t *obj, obj_t *out)
put_cchar(out, ')');
}
return;
+ case LSTR:
+ obj_pprint(obj->ls.prefix, out);
+ put_cstring(out, "...");
+ return;
case COBJ:
obj->co.ops->print(obj, out);
return;
diff --git a/lib.h b/lib.h
index 703e618f..e1282ed1 100644
--- a/lib.h
+++ b/lib.h
@@ -25,7 +25,7 @@
*/
typedef enum type {
- CONS = 1, STR, CHR, NUM, SYM, FUN, VEC, LCONS, COBJ
+ CONS = 1, STR, CHR, NUM, SYM, FUN, VEC, LCONS, LSTR, COBJ
} type_t;
typedef enum functype
@@ -113,6 +113,16 @@ struct lazy_cons {
obj_t *func; /* when nil, car and cdr are valid */
};
+/*
+ * Lazy string: virtual string which dynamically grows as a catentation
+ * of a list of strings.
+ */
+struct lazy_string {
+ type_t type;
+ obj_t *prefix; /* actual string part */
+ obj_t *list; /* remaining list */
+};
+
struct cobj {
type_t type;
void *handle;
@@ -137,15 +147,17 @@ union obj {
struct func f;
struct vec v;
struct lazy_cons lc;
+ struct lazy_string ls;
struct cobj co;
};
extern obj_t *interned_syms;
extern obj_t *t, *cons_t, *str_t, *chr_t, *num_t, *sym_t, *fun_t, *vec_t;
-extern obj_t *stream_t, *lcons_t, *var, *regex, *set, *cset, *wild, *oneplus;
+extern obj_t *stream_t, *lcons_t, *lstr_t, *cobj_t;
+extern obj_t *var, *regex, *set, *cset, *wild, *oneplus;
extern obj_t *zeroplus, *optional, *compound, *or, *quasi;
-extern obj_t *skip, *trailer, *block, *next, *fail, *accept;
+extern obj_t *skip, *trailer, *block, *next, *freeform, *fail, *accept;
extern obj_t *all, *some, *none, *maybe, *cases, *collect, *until, *coll;
extern obj_t *define, *output, *single, *frst, *lst, *empty, *repeat, *rep;
extern obj_t *flattn, *forget, *local, *mrge, *bind, *cat, *args;
@@ -169,6 +181,7 @@ obj_t *identity(obj_t *obj);
obj_t *typeof(obj_t *obj);
obj_t *type_check(obj_t *obj, int);
obj_t *type_check2(obj_t *obj, int, int);
+obj_t *type_check3(obj_t *obj, int, int, int);
obj_t *car(obj_t *cons);
obj_t *cdr(obj_t *cons);
obj_t **car_l(obj_t *cons);
@@ -221,12 +234,14 @@ obj_t *le(obj_t *anum, obj_t *bnum);
obj_t *numeq(obj_t *anum, obj_t *bnum);
obj_t *max2(obj_t *anum, obj_t *bnum);
obj_t *min2(obj_t *anum, obj_t *bnum);
-obj_t *string(char *str);
+obj_t *string_own(char *str);
+obj_t *string(const char *str);
obj_t *mkstring(obj_t *len, obj_t *ch);
obj_t *mkustring(obj_t *len); /* must initialize immediately with init_str! */
obj_t *init_str(obj_t *str, const char *);
obj_t *copy_str(obj_t *str);
obj_t *stringp(obj_t *str);
+obj_t *lazy_stringp(obj_t *str);
obj_t *length_str(obj_t *str);
const char *c_str(obj_t *str);
obj_t *search_str(obj_t *haystack, obj_t *needle, obj_t *start_num,
@@ -235,10 +250,11 @@ obj_t *search_str_tree(obj_t *haystack, obj_t *tree, obj_t *start_num,
obj_t *from_end);
obj_t *sub_str(obj_t *str_in, obj_t *from_num, obj_t *to_num);
obj_t *cat_str(obj_t *list, obj_t *sep);
+obj_t *split_str(obj_t *str, obj_t *sep);
obj_t *trim_str(obj_t *str);
obj_t *string_lt(obj_t *astr, obj_t *bstr);
obj_t *chr(int ch);
-obj_t *chrp(obj_t *str);
+obj_t *chrp(obj_t *chr);
int c_chr(obj_t *chr);
obj_t *chr_str(obj_t *str, obj_t *index);
obj_t *chr_str_set(obj_t *str, obj_t *index, obj_t *chr);
@@ -272,6 +288,14 @@ obj_t *vec_set_fill(obj_t *vec, obj_t *fill);
obj_t **vecref_l(obj_t *vec, obj_t *ind);
obj_t *vec_push(obj_t *vec, obj_t *item);
obj_t *lazy_stream_cons(obj_t *stream);
+obj_t *lazy_str(obj_t *list);
+obj_t *lazy_str_force_upto(obj_t *lstr, obj_t *index);
+obj_t *lazy_str_force(obj_t *lstr);
+obj_t *lazy_str_get_trailing_list(obj_t *lstr, obj_t *index);
+obj_t *length_str_gt(obj_t *str, obj_t *len);
+obj_t *length_str_ge(obj_t *str, obj_t *len);
+obj_t *length_str_lt(obj_t *str, obj_t *len);
+obj_t *length_str_le(obj_t *str, obj_t *len);
obj_t *cobj(void *handle, obj_t *cls_sym, struct cobj_ops *ops);
void cobj_print_op(obj_t *, obj_t *); /* Default function for struct cobj_ops */
obj_t *assoc(obj_t *list, obj_t *key);
diff --git a/match.c b/match.c
index d065cc1d..e90d3d6f 100644
--- a/match.c
+++ b/match.c
@@ -902,7 +902,7 @@ obj_t *match_files(obj_t *spec, obj_t *files,
return nil;
} else if (errno != 0)
file_err(nil, "could not open ~a (error ~a/~a)", name,
- num(errno), string(strdup(strerror(errno))), nao);
+ num(errno), string(strerror(errno)), nao);
else
file_err(nil, "could not open ~a", name, nao);
return nil;
@@ -983,6 +983,33 @@ repeat_spec_same_data:
return cons(new_bindings, cons(data, num(data_lineno)));
return nil;
}
+ } else if (sym == freeform) {
+ if ((spec = rest(spec)) == nil) {
+ sem_error(spec_linenum,
+ "freeform must be followed by a query line", nao);
+ } else {
+ obj_t *ff_specline = rest(first(spec));
+ obj_t *ff_dataline = lazy_str(data);
+
+ cons_bind (new_bindings, success,
+ match_line(bindings, ff_specline, ff_dataline, zero,
+ spec_linenum, num(data_lineno), first(files)));
+
+ if (!success) {
+ debuglf(spec_linenum, "freeform match failure", nao);
+ return nil;
+ }
+
+ if (nump(success))
+ data = lazy_str_get_trailing_list(ff_dataline, success);
+
+ bindings = new_bindings;
+ }
+
+ if ((spec = rest(spec)) == nil)
+ break;
+
+ goto repeat_spec_same_data;
} else if (sym == block) {
obj_t *name = first(rest(first_spec));
if (rest(specline))
@@ -1027,7 +1054,7 @@ repeat_spec_same_data:
if (eq(first(source), nothrow))
push(nil, &source);
else if (eq(first(source), args)) {
- obj_t *input_name = string(strdup("args"));
+ obj_t *input_name = string("args");
cons_bind (new_bindings, success,
match_files(spec, cons(input_name, files),
bindings, files, one));
@@ -1378,7 +1405,7 @@ repeat_spec_same_data:
form, nao);
nt = eq(second(new_style_dest), nothrow);
- dest = or2(cdr(val), string(strdup("-")));
+ dest = or2(cdr(val), string("-"));
}
}
@@ -1393,7 +1420,7 @@ repeat_spec_same_data:
return nil;
} else if (errno != 0) {
file_err(nil, "could not open ~a (error ~a/~a)", dest,
- num(errno), string(strdup(strerror(errno))), nao);
+ num(errno), string(strerror(errno)), nao);
} else {
file_err(nil, "could not open ~a", dest, nao);
}
diff --git a/parser.y b/parser.y
index 405e63d1..7a9b11b0 100644
--- a/parser.y
+++ b/parser.y
@@ -183,7 +183,7 @@ elems : elem { $$ = cons($1, nil); }
yyerror("rep outside of output"); }
;
-elem : TEXT { $$ = string($1); }
+elem : TEXT { $$ = string_own($1); }
| var { $$ = $1; }
| list { $$ = $1; }
| regex { $$ = cons(regex_compile($1), $1); }
@@ -347,7 +347,7 @@ o_elems : o_elem { $$ = cons($1, nil); }
| o_elem o_elems { $$ = cons($1, $2); }
;
-o_elem : TEXT { $$ = string($1); }
+o_elem : TEXT { $$ = string_own($1); }
| var { $$ = $1; }
| rep_elem { $$ = $1; }
;
@@ -372,23 +372,32 @@ rep_parts_opt : SINGLE o_elems_opt2
/* This sucks, but factoring '*' into a nonterminal
* that generates an empty phrase causes reduce/reduce conflicts.
*/
-var : IDENT { $$ = list(var, intern(string($1)), nao); }
- | IDENT elem { $$ = list(var, intern(string($1)), $2, nao); }
- | '{' IDENT '}' { $$ = list(var, intern(string($2)), nao); }
- | '{' IDENT '}' elem { $$ = list(var, intern(string($2)), $4, nao); }
- | '{' IDENT regex '}' { $$ = list(var, intern(string($2)),
+var : IDENT { $$ = list(var, intern(string_own($1)),
+ nao); }
+ | IDENT elem { $$ = list(var, intern(string_own($1)),
+ $2, nao); }
+ | '{' IDENT '}' { $$ = list(var, intern(string_own($2)),
+ nao); }
+ | '{' IDENT '}' elem { $$ = list(var, intern(string_own($2)),
+ $4, nao); }
+ | '{' IDENT regex '}' { $$ = list(var, intern(string_own($2)),
nil, cons(regex_compile($3), $3),
nao); }
- | '{' IDENT NUMBER '}' { $$ = list(var, intern(string($2)),
+ | '{' IDENT NUMBER '}' { $$ = list(var, intern(string_own($2)),
nil, num($3), nao); }
- | var_op IDENT { $$ = list(var, intern(string($2)),
+ | var_op IDENT { $$ = list(var, intern(string_own($2)),
nil, $1, nao); }
- | var_op IDENT elem { $$ = list(var, intern(string($2)),
+ | var_op IDENT elem { $$ = list(var, intern(string_own($2)),
$3, $1, nao); }
- | var_op '{' IDENT '}' { $$ = list(var, intern(string($3)),
+ | var_op '{' IDENT '}' { $$ = list(var, intern(string_own($3)),
nil, $1, nao); }
- | var_op '{' IDENT '}' elem { $$ = list(var, intern(string($3)),
+ | var_op '{' IDENT '}' elem { $$ = list(var, intern(string_own($3)),
$5, $1, nao); }
+ | var_op '{' IDENT regex '}' { yyerror("longest match "
+ "not useable with regex"); }
+ | var_op '{' IDENT NUMBER '}' { yyerror("longest match "
+ "not useable with "
+ "fixed width match"); }
| IDENT error { $$ = nil;
yybadtoken(yychar, "variable spec"); }
| var_op error { $$ = nil;
@@ -409,7 +418,7 @@ exprs : expr { $$ = cons($1, nil); }
| expr '.' expr { $$ = cons($1, $3); }
;
-expr : IDENT { $$ = intern(string($1)); }
+expr : IDENT { $$ = intern(string_own($1)); }
| NUMBER { $$ = num($1); }
| list { $$ = $1; }
| regex { $$ = cons(regex_compile($1), $1); }
@@ -502,7 +511,7 @@ quasi_items : quasi_item { $$ = cons($1, nil); }
;
quasi_item : litchars { $$ = lit_char_helper($1); }
- | TEXT { $$ = string($1); }
+ | TEXT { $$ = string_own($1); }
| var { $$ = $1; }
| list { $$ = $1; }
;
diff --git a/regex.c b/regex.c
index 926ae4fd..81e16be7 100644
--- a/regex.c
+++ b/regex.c
@@ -569,6 +569,82 @@ long nfa_run(nfa_t nfa, const char *str)
return last_accept_pos ? last_accept_pos - str : -1;
}
+long nfa_machine_match_span(nfa_machine_t *nfam)
+{
+ return nfam->last_accept_pos;
+}
+
+/*
+ * NFA machine: represents the logic of the nfa_run function as state machine
+ * object which can be fed one character at a time.
+ */
+void nfa_machine_init(nfa_machine_t *nfam, nfa_t nfa)
+{
+ int accept = 0;
+
+ nfam->nfa = nfa;
+ nfam->last_accept_pos = -1;
+ nfam->visited = nfa.start->a.visited + 1;
+ nfam->move = chk_malloc(NFA_SET_SIZE * sizeof *nfam->move);
+ nfam->clos = chk_malloc(NFA_SET_SIZE * sizeof *nfam->clos);
+ nfam->stack = chk_malloc(NFA_SET_SIZE * sizeof *nfam->stack);
+ nfam->nmove = 1;
+ nfam->count = 0;
+
+ nfam->move[0] = nfa.start;
+
+ nfam->nclos = nfa_closure(nfam->stack, nfam->move, nfam->nmove,
+ nfam->clos, nfam->visited++, &accept);
+
+ if (accept)
+ nfam->last_accept_pos = nfam->count;
+}
+
+void nfa_machine_cleanup(nfa_machine_t *nfam)
+{
+ free(nfam->stack);
+ free(nfam->clos);
+ free(nfam->move);
+ nfam->stack = 0;
+ nfam->clos = 0;
+ nfam->move = 0;
+ nfam->nfa.start = 0;
+ nfam->nfa.accept = 0;
+}
+
+int nfa_machine_feed(nfa_machine_t *nfam, int ch)
+{
+ int accept = 0;
+
+ if (ch != 0) {
+ nfam->count++;
+
+ nfam->nmove = nfa_move(nfam->clos, nfam->nclos, nfam->move, ch);
+ nfam->nclos = nfa_closure(nfam->stack, nfam->move, nfam->nmove, nfam->clos,
+ nfam->visited++, &accept);
+
+ if (accept)
+ nfam->last_accept_pos = nfam->count;
+ }
+
+ nfam->nfa.start->a.visited = nfam->visited;
+
+ if (ch && nfam->nclos != 0) {
+ if (accept)
+ return NFAM_MATCH;
+ return NFAM_INCOMPLETE;
+ }
+
+ /* Reached if the null character is
+ consumed, or NFA hit a transition dead end. */
+
+ if (nfam->last_accept_pos == nfam->count)
+ return NFAM_MATCH;
+ if (nfam->last_accept_pos == -1)
+ return NFAM_FAIL;
+ return NFAM_INCOMPLETE;
+}
+
static obj_t *regex_equal(obj_t *self, obj_t *other)
{
return self == other ? t : nil; /* eq equality only */
diff --git a/regex.h b/regex.h
index 5f2d5021..973ab501 100644
--- a/regex.h
+++ b/regex.h
@@ -90,16 +90,29 @@ void nfa_state_free(nfa_state_t *st);
void nfa_state_shallow_free(nfa_state_t *st);
void nfa_state_merge(nfa_state_t *accept, nfa_state_t *);
-typedef struct nfa nfa_t;
-
-struct nfa {
+typedef struct nfa {
nfa_state_t *start;
nfa_state_t *accept;
-};
+} nfa_t;
+
+enum nfam_result { NFAM_INCOMPLETE, NFAM_FAIL, NFAM_MATCH };
+
+typedef struct nfa_machine {
+ long last_accept_pos;
+ unsigned visited;
+ nfa_state_t **move, **clos, **stack;
+ int nmove, nclos;
+ long count;
+ nfa_t nfa;
+} nfa_machine_t;
nfa_t nfa_compile_regex(obj_t *regex);
void nfa_free(nfa_t);
long nfa_run(nfa_t nfa, const char *str);
+void nfa_machine_init(nfa_machine_t *, nfa_t);
+void nfa_machine_cleanup(nfa_machine_t *);
+int nfa_machine_feed(nfa_machine_t *, int ch);
+long nfa_machine_match_span(nfa_machine_t *);
obj_t *regex_compile(obj_t *regex_sexp);
obj_t *regexp(obj_t *);
nfa_t *regex_nfa(obj_t *);
diff --git a/stream.c b/stream.c
index f91ae753..90932b0e 100644
--- a/stream.c
+++ b/stream.c
@@ -152,7 +152,7 @@ static obj_t *stdio_get_line(obj_t *stream)
char *line = snarf_line((FILE *) stream->co.handle);
if (!line)
return nil;
- return string(line);
+ return string_own(line);
}
}
@@ -407,7 +407,7 @@ static obj_t *dir_get_line(obj_t *stream)
return nil;
if (!strcmp(e->d_name, ".") || !strcmp(e->d_name, ".."))
continue;
- return string(chk_strdup(e->d_name));
+ return string(e->d_name);
}
}
}
@@ -478,7 +478,7 @@ obj_t *get_string_from_stream(obj_t *stream)
return out;
so->buf = chk_realloc(so->buf, so->fill + 1);
- out = string(so->buf);
+ out = string_own(so->buf);
free(so);
return out;
} else if (stream->co.ops == &string_in_ops.cobj_ops) {
diff --git a/txr.c b/txr.c
index e12c42bf..56032e6f 100644
--- a/txr.c
+++ b/txr.c
@@ -119,7 +119,7 @@ obj_t *remove_hash_bang_line(obj_t *spec)
return spec;
{
- obj_t *shbang = string(strdup("#!"));
+ obj_t *shbang = string("#!");
obj_t *firstline = first(spec);
obj_t *items = rest(firstline);
@@ -183,7 +183,7 @@ int main(int argc, char **argv)
val[piece] = 0;
- list = cons(string(strdup(val)), list);
+ list = cons(string(val), list);
if (!comma_p)
break;
@@ -192,15 +192,13 @@ int main(int argc, char **argv)
}
list = nreverse(list);
- bindings = cons(cons(intern(string(strdup(var))), list), bindings);
+ bindings = cons(cons(intern(string(var)), list), bindings);
} else if (equals) {
char *val = equals + 1;
*equals = 0;
- bindings = cons(cons(intern(string(strdup(var))),
- string(strdup(val))), bindings);
+ bindings = cons(cons(intern(string(var)), string(val)), bindings);
} else {
- bindings = cons(cons(intern(string(strdup(var))),
- null_string), bindings);
+ bindings = cons(cons(intern(string(var)), null_string), bindings);
}
argc--, argv++;
@@ -242,10 +240,10 @@ int main(int argc, char **argv)
opt_arraydims = val;
break;
case 'c':
- specstring = string(strdup(*argv));
+ specstring = string(*argv);
break;
case 'f':
- spec_file_str = string(strdup(*argv));
+ spec_file_str = string(*argv);
break;
}
@@ -298,7 +296,7 @@ int main(int argc, char **argv)
if (specstring) {
spec_file = "cmdline";
- spec_file_str = string(strdup(spec_file));
+ spec_file_str = string(spec_file);
yyin_stream = make_string_input_stream(specstring);
} else if (spec_file_str) {
if (strcmp(c_str(spec_file_str), "-") != 0) {
@@ -325,7 +323,7 @@ int main(int argc, char **argv)
spec_file = "stdin";
}
argc--, argv++;
- spec_file_str = string(strdup(spec_file));
+ spec_file_str = string(spec_file);
}