diff options
-rw-r--r-- | ChangeLog | 59 | ||||
-rw-r--r-- | gc.c | 30 | ||||
-rw-r--r-- | lib.c | 449 | ||||
-rw-r--r-- | lib.h | 34 | ||||
-rw-r--r-- | match.c | 35 | ||||
-rw-r--r-- | parser.y | 37 | ||||
-rw-r--r-- | regex.c | 76 | ||||
-rw-r--r-- | regex.h | 21 | ||||
-rw-r--r-- | stream.c | 6 | ||||
-rw-r--r-- | txr.c | 20 |
10 files changed, 592 insertions, 175 deletions
@@ -1,3 +1,62 @@ +2009-10-20 Kaz Kylheku <kkylheku@gmail.com> + + Start of implementation for freestyle matching. + + Lazy strings implemented, incompletely. + + Changed string function to implicitly strdup; non-strdup + version changed to string_own. Fixed wrong uses of strdup + rather than chk_strdup. + + Functions added to regex module to provide regex matching + as a state machine to which characters are fed. + + * lib.h (type_t): New enum member LSTR, for lazy strings. + (lstr_t, freestyle, type_check3, string_own): Declared. + (string): Parameter changed to const char *. + (lazy_stringp, split_str, lazy_str, lazy_str_force_upto, + lazy_str_force, lazy_str_get_trailing_list, length_str_gt, + length_str_ge, length_str_lt, length_str_le): Declared. + + * lib.c (lstr_t, freestyle): New symbol globals. + (code2type, obj_print, obj_pprint, equal): Extended to handle LSTR. + (type_check3): New function. + (string_own): New function; does what string used to do. + (string): Duplicates the string with strdup, so callers don't have to. + (mkstring, copy_str, trim_str): Use string_own. + (stringp): A lazy string is a kind of string. + (lazy_stringp): New function. + (length_str, c_str, search_str, sub_str, chr_str, + chr_str_set): Handle lazy strings. + (split_str): New function. + (lazy_str, lazy_str_force_upto, lazy_str_force, + lazy_str_get_trailing_list, length_str_gt, length_str_ge, + length_str_lt, length_str_le): New functions. + (obj_init): New symbols interned. Eliminated strdup calls. + + * gc.c (finalize, mark_obj): Changed to handle LSTR type. + Eliminated default case from switch so we get a gcc + diagnostic if a case is not handled. + + * match.c (match_files): Eliminated strdup calls. + Added freeform directive. + + * parser.y (grammar): Changed string calls to string_own. + + * stream.c (stdio_get_line, get_string_from_stream): + Changed string calls to string_own. + (dir_get_line): Eliminated chk_strdup call. + + * txr.c (remove_hash_bang_line, main): Eliminated strdup calls. + + * regex.h (nfam_result): New union. + (nfa_machine, nfa_machine_t): New struct and typedef. + (nfa_machine_init, nfa_machine_cleanup, nfa_machine_feed, + nfa_machine_match_span): New functions declared. + + * regex.c (nfa_machine_init, nfa_machine_cleanup, nfa_machine_feed, + nfa_machine_match_span): New functions defined. + 2009-10-18 Kaz Kylheku <kkylheku@gmail.com> Trivial change allows regexps to be bound to variables, @@ -147,33 +147,34 @@ static void finalize(obj_t *obj) { switch (obj->t.type) { case CONS: - break; + return; case STR: if (!opt_gc_debug) { free(obj->st.str); obj->st.str = 0; } - break; + return; case CHR: case NUM: case SYM: case FUN: - break; + return; case VEC: if (!opt_gc_debug) { free(obj->v.vec-2); obj->v.vec = 0; } - break; + return; case LCONS: - break; + case LSTR: + return; case COBJ: if (obj->co.ops->destroy) obj->co.ops->destroy(obj); - break; - default: - assert (0 && "corrupt type field"); + return; } + + assert (0 && "corrupt type field"); } static void mark_obj(obj_t *obj) @@ -208,7 +209,7 @@ tail_call: mark_obj_tail(obj->st.len); case CHR: case NUM: - break; + return; case SYM: mark_obj(obj->s.name); mark_obj_tail(obj->s.val); @@ -216,7 +217,7 @@ tail_call: mark_obj(obj->f.env); if (obj->f.functype == FINTERP) mark_obj_tail(obj->f.f.interp_fun); - break; + return; case VEC: { obj_t *alloc_size = obj->v.vec[-2]; @@ -229,18 +230,21 @@ tail_call: for (i = 0; i < fp; i++) mark_obj(obj->v.vec[i]); } - break; + return; case LCONS: mark_obj(obj->lc.func); mark_obj(obj->lc.car); mark_obj_tail(obj->lc.cdr); + case LSTR: + mark_obj(obj->ls.prefix); + mark_obj_tail(obj->ls.list); case COBJ: if (obj->co.ops->mark) obj->co.ops->mark(obj); mark_obj_tail(obj->co.cls); - default: - assert (0 && "corrupt type field"); } + + assert (0 && "corrupt type field"); } static int in_heap(obj_t *ptr) @@ -44,9 +44,10 @@ obj_t *interned_syms; obj_t *null, *t, *cons_t, *str_t, *chr_t, *num_t, *sym_t, *fun_t, *vec_t; -obj_t *stream_t, *lcons_t, *cobj_t, *var, *regex, *set, *cset, *wild, *oneplus; +obj_t *stream_t, *lcons_t, *lstr_t, *cobj_t; +obj_t *var, *regex, *set, *cset, *wild, *oneplus; obj_t *zeroplus, *optional, *compound, *or, *quasi; -obj_t *skip, *trailer, *block, *next, *fail, *accept; +obj_t *skip, *trailer, *block, *next, *freeform, *fail, *accept; obj_t *all, *some, *none, *maybe, *cases, *collect, *until, *coll; obj_t *define, *output, *single, *frst, *lst, *empty, *repeat, *rep; obj_t *flattn, *forget, *local, *mrge, *bind, *cat, *args; @@ -83,7 +84,7 @@ static obj_t *equal_tramp(obj_t *env, obj_t *, obj_t *); static obj_t *code2type(int code) { - switch (code) { + switch ((type_t) code) { case CONS: return cons_t; case STR: return str_t; case CHR: return chr_t; @@ -92,6 +93,7 @@ static obj_t *code2type(int code) case FUN: return fun_t; case VEC: return vec_t; case LCONS: return lcons_t; + case LSTR: return lstr_t; case COBJ: return cobj_t; } return nil; @@ -126,6 +128,14 @@ obj_t *type_check2(obj_t *obj, int t1, int t2) return t; } +obj_t *type_check3(obj_t *obj, int t1, int t2, int t3) +{ + if (!obj || (obj->t.type != t1 && obj->t.type != t2 && obj->t.type != t3)) + type_mismatch("~s is not of type ~s, ~s nor ~s", obj, + code2type(t1), code2type(t2), code2type(t3), nao); + return t; +} + obj_t *car(obj_t *cons) { if (cons == nil) @@ -451,6 +461,14 @@ obj_t *equal(obj_t *left, obj_t *right) return t; } return nil; + case LSTR: + if (right->t.type == STR || right->t.type == LSTR) { + lazy_str_force(left); + if (right->t.type == LSTR) + lazy_str_force(right); + return equal(left->ls.prefix, right->ls.prefix); + } + return nil; case COBJ: if (right->t.type == COBJ) return left->co.ops->equal(left, right); @@ -655,7 +673,7 @@ obj_t *min2(obj_t *anum, obj_t *bnum) return c_num(anum) < c_num(bnum) ? anum : bnum; } -obj_t *string(char *str) +obj_t *string_own(char *str) { obj_t *obj = make_obj(); obj->st.type = STR; @@ -664,10 +682,19 @@ obj_t *string(char *str) return obj; } +obj_t *string(const char *str) +{ + obj_t *obj = make_obj(); + obj->st.type = STR; + obj->st.str = chk_strdup(str); + obj->st.len = nil; + return obj; +} + obj_t *mkstring(obj_t *len, obj_t *ch) { char *str = chk_malloc(c_num(len) + 1); - obj_t *s = string(str); + obj_t *s = string_own(str); memset(str, c_chr(ch), c_num(len)); str[c_num(len)] = 0; s->st.len = len; @@ -677,7 +704,7 @@ obj_t *mkstring(obj_t *len, obj_t *ch) obj_t *mkustring(obj_t *len) { char *str = chk_malloc(c_num(len) + 1); - obj_t *s = string(str); + obj_t *s = string_own(str); s->st.len = len; return s; } @@ -690,17 +717,28 @@ obj_t *init_str(obj_t *str, const char *data) obj_t *copy_str(obj_t *str) { - return string(strdup(c_str(str))); + return string(c_str(str)); } obj_t *stringp(obj_t *str) { - return (str && str->st.type == STR) ? t : nil; + return (str && (str->st.type == STR || str->st.type == LSTR)) ? t : nil; +} + +obj_t *lazy_stringp(obj_t *str) +{ + return (str && (str->st.type == LSTR)) ? t : nil; } obj_t *length_str(obj_t *str) { - type_check (str, STR); + type_check2 (str, STR, LSTR); + + if (str->ls.type == LSTR) { + lazy_str_force(str); + return length(str->ls.prefix); + } + if (!str->st.len) str->st.len = num(strlen(str->st.str)); return str->st.len; @@ -708,13 +746,16 @@ obj_t *length_str(obj_t *str) const char *c_str(obj_t *obj) { - type_check2(obj, STR, SYM); + type_check3(obj, STR, SYM, LSTR); switch (obj->t.type) { case STR: return obj->st.str; case SYM: return c_str(symbol_name(obj)); + case LSTR: + lazy_str_force(obj); + return c_str(obj->ls.prefix); default: abort(); } @@ -723,19 +764,38 @@ const char *c_str(obj_t *obj) obj_t *search_str(obj_t *haystack, obj_t *needle, obj_t *start_num, obj_t *from_end) { - const char *h = c_str(haystack); - long len = c_num(length_str(haystack)); - long start = c_num(start_num); - - if (start > len) { + if (length_str_lt(haystack, start_num)) { return nil; } else { - const char *n = c_str(needle), *good = 0, *pos, *from = h + start; + obj_t *h_is_lazy = lazy_stringp(haystack); + long start = c_num(start_num); + long good = -1, pos = -1; + const char *n = c_str(needle), *h; - do { - pos = strstr(from, n); - } while (pos && (good = pos) && from_end && *(from = pos + 1)); - return (good == 0) ? nil : num(good - h); + if (!h_is_lazy) { + do { + const char *f; + h = c_str(haystack); + + f = strstr(h + start, n); + + if (f) + pos = f - h; + else + pos = -1; + } while (pos != -1 && (good = pos) != -1 && from_end && h[++start]); + } else { + size_t ln = c_num(length_str(needle)); + + do { + lazy_str_force_upto(haystack, plus(num(start + 1), length_str(needle))); + h = c_str(haystack->ls.prefix); + + if (!strncmp(h + start, n, ln)) + good = start; + } while (h[++start] && (from_end || good == -1)); + } + return (good == -1) ? nil : num(good); } } @@ -758,30 +818,27 @@ obj_t *search_str_tree(obj_t *haystack, obj_t *tree, obj_t *start_num, return nil; } -obj_t *sub_str(obj_t *str_in, obj_t *from_num, obj_t *to_num) +obj_t *sub_str(obj_t *str_in, obj_t *from, obj_t *to) { - const char *str = c_str(str_in); - size_t len = c_num(length_str(str_in)); - long from = from_num ? c_num(from_num) : 0; - long to = to_num ? c_num(to_num) : len; + if (from == nil || lt(from, zero)) + from = zero; + if (to == nil) + to = length_str(str_in); + else if (lt(to, zero)) + to = zero; + if (length_str_lt(str_in, from)) + from = length_str(str_in); + if (length_str_lt(str_in, to)) + to = length_str(str_in); - if (to < 0) - to = 0; - if (from < 0) - from = 0; - if (from > len) - from = len; - if (to > len) - to = len; - - if (from >= to) { + if (ge(from, to)) { return null_string; } else { - size_t size = to - from + 1; + size_t size = c_num(to) - c_num(from) + 1; char *sub = chk_malloc(size); - strncpy(sub, str + from, size); + strncpy(sub, c_str(str_in) + c_num(from), size); sub[size-1] = 0; - return string(sub); + return string_own(sub); } } @@ -833,7 +890,27 @@ obj_t *cat_str(obj_t *list, obj_t *sep) } *ptr = 0; - return string(str); + return string_own(str); +} + +obj_t *split_str(obj_t *str, obj_t *sep) +{ + const char *cstr = c_str(str); + const char *csep = c_str(sep); + list_collect_decl (out, iter); + + for (;;) { + size_t span = strcspn(cstr, csep); + obj_t *piece = mkustring(num(span)); + init_str(piece, cstr); + list_collect (iter, piece); + cstr += span; + if (!*cstr) + break; + cstr++; + } + + return out; } obj_t *trim_str(obj_t *str) @@ -854,7 +931,7 @@ obj_t *trim_str(obj_t *str) char *new = chk_malloc(len + 1); memcpy(new, start, len); new[len] = 0; - return string(new); + return string_own(new); } } @@ -885,24 +962,26 @@ int c_chr(obj_t *chr) obj_t *chr_str(obj_t *str, obj_t *index) { - long l = c_num(length_str(str)); - long i = c_num(index); - const char *s = c_str(str); - - bug_unless (i < l); + bug_unless (length_str_gt(str, index)); - return chr(s[i]); + if (lazy_stringp(str)) { + lazy_str_force_upto(str, index); + return chr(c_str(str->ls.prefix)[c_num(index)]); + } else { + return chr(c_str(str)[c_num(index)]); + } } obj_t *chr_str_set(obj_t *str, obj_t *index, obj_t *chr) { - long l = c_num(length_str(str)); - long i = c_num(index); - char *s = str->st.str; - - bug_unless (i < l); + bug_unless (length_str_gt(str, index)); - s[i] = c_chr(chr); + if (lazy_stringp(str)) { + lazy_str_force_upto(str, index); + str->ls.prefix->st.str[c_num(index)] = c_chr(chr); + } else { + str->st.str[c_num(index)] = c_chr(chr); + } return chr; } @@ -1277,6 +1356,124 @@ obj_t *lazy_stream_cons(obj_t *stream) lazy_stream_func)); } +obj_t *lazy_str(obj_t *list) +{ + obj_t *obj = make_obj(); + obj->ls.type = LSTR; + + if (nullp(list)) { + obj->ls.prefix = null_string; + obj->ls.list = nil; + } else { + obj->ls.prefix = first(list); + obj->ls.list = rest(list); + } + + return obj; +} + +obj_t *lazy_str_force(obj_t *lstr) +{ + type_check(lstr, LSTR); + + if (lstr->ls.list) { + lstr->ls.prefix = cat_str(cons(lstr->ls.prefix, lstr->ls.list), + string("\n")); + lstr->ls.list = nil; + } + + return lstr->ls.prefix; +} + +obj_t *lazy_str_force_upto(obj_t *lstr, obj_t *index) +{ + type_check(lstr, LSTR); + + while (gt(index, length_str(lstr->ls.prefix)) && lstr->ls.list) { + obj_t *next = pop(&lstr->ls.list); + lstr->ls.prefix = cat_str(cons(lstr->ls.prefix, cons(next, nil)), + string("\n")); + } + + return lt(index, length_str(lstr->ls.prefix)); +} + +obj_t *length_str_gt(obj_t *str, obj_t *len) +{ + type_check2 (str, STR, LSTR); + + switch (str->t.type) { + case STR: + return gt(length_str(str), len); + case LSTR: + lazy_str_force_upto(str, len); + return gt(length_str(str->ls.prefix), len); + default: + internal_error("unexpected type value"); + } +} + +obj_t *length_str_ge(obj_t *str, obj_t *len) +{ + type_check2 (str, STR, LSTR); + + switch (str->t.type) { + case STR: + return ge(length_str(str), len); + case LSTR: + lazy_str_force_upto(str, len); + return ge(length_str(str->ls.prefix), len); + default: + internal_error("unexpected type value"); + } +} + +obj_t *length_str_lt(obj_t *str, obj_t *len) +{ + type_check2 (str, STR, LSTR); + + switch (str->t.type) { + case STR: + return lt(length_str(str), len); + case LSTR: + lazy_str_force_upto(str, len); + return lt(length_str(str->ls.prefix), len); + default: + internal_error("unexpected type value"); + } +} + +obj_t *length_str_le(obj_t *str, obj_t *len) +{ + type_check2 (str, STR, LSTR); + + switch (str->t.type) { + case STR: + return le(length_str(str), len); + case LSTR: + lazy_str_force_upto(str, len); + return le(length_str(str->ls.prefix), len); + default: + internal_error("unexpected type value"); + } +} + +obj_t *lazy_str_get_trailing_list(obj_t *lstr, obj_t *index) +{ + type_check(lstr, LSTR); + + /* Force lazy string up through the index position */ + if (ge(index, length_str(lstr->ls.prefix))) + lazy_str_force_upto(lstr, index); + + { + obj_t *split_suffix = split_str(sub_str(lstr->ls.prefix, index, nil), + string("\n")); + + return nappend2(split_suffix, lstr->ls.list); + } +} + obj_t *cobj(void *handle, obj_t *cls_sym, struct cobj_ops *ops) { obj_t *obj = make_obj(); @@ -1463,73 +1660,75 @@ static void obj_init(void) &null_list, &equal_f, &identity_f, &prog_string, 0); - nil_string = string(strdup("nil")); - - null = intern(string(strdup("null"))); - t = intern(string(strdup("t"))); - cons_t = intern(string(strdup("cons"))); - str_t = intern(string(strdup("str"))); - chr_t = intern(string(strdup("chr"))); - num_t = intern(string(strdup("num"))); - sym_t = intern(string(strdup("sym"))); - fun_t = intern(string(strdup("fun"))); - vec_t = intern(string(strdup("vec"))); - stream_t = intern(string(strdup("stream"))); - lcons_t = intern(string(strdup("lcons"))); - cobj_t = intern(string(strdup("cobj"))); - var = intern(string(strdup("$var"))); - regex = intern(string(strdup("$regex"))); - set = intern(string(strdup("set"))); - cset = intern(string(strdup("cset"))); - wild = intern(string(strdup("wild"))); - oneplus = intern(string(strdup("1+"))); - zeroplus = intern(string(strdup("0+"))); - optional = intern(string(strdup("?"))); - compound = intern(string(strdup("compound"))); - or = intern(string(strdup("or"))); - quasi = intern(string(strdup("$quasi"))); - skip = intern(string(strdup("skip"))); - trailer = intern(string(strdup("trailer"))); - block = intern(string(strdup("block"))); - next = intern(string(strdup("next"))); - fail = intern(string(strdup("fail"))); - accept = intern(string(strdup("accept"))); - all = intern(string(strdup("all"))); - some = intern(string(strdup("some"))); - none = intern(string(strdup("none"))); - maybe = intern(string(strdup("maybe"))); - cases = intern(string(strdup("cases"))); - collect = intern(string(strdup("collect"))); - until = intern(string(strdup("until"))); - coll = intern(string(strdup("coll"))); - define = intern(string(strdup("define"))); - output = intern(string(strdup("output"))); - single = intern(string(strdup("single"))); - frst = intern(string(strdup("first"))); - lst = intern(string(strdup("last"))); - empty = intern(string(strdup("empty"))); - repeat = intern(string(strdup("repeat"))); - rep = intern(string(strdup("rep"))); - flattn = intern(string(strdup("flatten"))); - forget = intern(string(strdup("forget"))); - local = intern(string(strdup("local"))); - mrge = intern(string(strdup("merge"))); - bind = intern(string(strdup("bind"))); - cat = intern(string(strdup("cat"))); - args = intern(string(strdup("args"))); - try = intern(string(strdup("try"))); - catch = intern(string(strdup("catch"))); - finally = intern(string(strdup("finally"))); - nothrow = intern(string(strdup("nothrow"))); - throw = intern(string(strdup("throw"))); - defex = intern(string(strdup("defex"))); - error = intern(string(strdup("error"))); - type_error = intern(string(strdup("type_error"))); - internal_err = intern(string(strdup("internal_error"))); - numeric_err = intern(string(strdup("numeric_error"))); - range_err = intern(string(strdup("range_error"))); - query_error = intern(string(strdup("query_error"))); - file_error = intern(string(strdup("file_error"))); + nil_string = string("nil"); + + null = intern(string("null")); + t = intern(string("t")); + cons_t = intern(string("cons")); + str_t = intern(string("str")); + chr_t = intern(string("chr")); + num_t = intern(string("num")); + sym_t = intern(string("sym")); + fun_t = intern(string("fun")); + vec_t = intern(string("vec")); + stream_t = intern(string("stream")); + lcons_t = intern(string("lcons")); + lstr_t = intern(string("lstr")); + cobj_t = intern(string("cobj")); + var = intern(string("$var")); + regex = intern(string("$regex")); + set = intern(string("set")); + cset = intern(string("cset")); + wild = intern(string("wild")); + oneplus = intern(string("1+")); + zeroplus = intern(string("0+")); + optional = intern(string("?")); + compound = intern(string("compound")); + or = intern(string("or")); + quasi = intern(string("$quasi")); + skip = intern(string("skip")); + trailer = intern(string("trailer")); + block = intern(string("block")); + next = intern(string("next")); + freeform = intern(string("freeform")); + fail = intern(string("fail")); + accept = intern(string("accept")); + all = intern(string("all")); + some = intern(string("some")); + none = intern(string("none")); + maybe = intern(string("maybe")); + cases = intern(string("cases")); + collect = intern(string("collect")); + until = intern(string("until")); + coll = intern(string("coll")); + define = intern(string("define")); + output = intern(string("output")); + single = intern(string("single")); + frst = intern(string("first")); + lst = intern(string("last")); + empty = intern(string("empty")); + repeat = intern(string("repeat")); + rep = intern(string("rep")); + flattn = intern(string("flatten")); + forget = intern(string("forget")); + local = intern(string("local")); + mrge = intern(string("merge")); + bind = intern(string("bind")); + cat = intern(string("cat")); + args = intern(string("args")); + try = intern(string("try")); + catch = intern(string("catch")); + finally = intern(string("finally")); + nothrow = intern(string("nothrow")); + throw = intern(string("throw")); + defex = intern(string("defex")); + error = intern(string("error")); + type_error = intern(string("type_error")); + internal_err = intern(string("internal_error")); + numeric_err = intern(string("numeric_error")); + range_err = intern(string("range_error")); + query_error = intern(string("query_error")); + file_error = intern(string("file_error")); interned_syms = cons(nil, interned_syms); @@ -1540,13 +1739,13 @@ static void obj_init(void) maxint = num(LONG_MAX); minint = num(LONG_MIN); - null_string = string(strdup("")); + null_string = string(""); null_list = cons(nil, nil); equal_f = func_f2(nil, equal_tramp); identity_f = func_f1(nil, identity_tramp); - prog_string = string(strdup(progname)); + prog_string = string(progname); } void obj_print(obj_t *obj, obj_t *out) @@ -1648,6 +1847,10 @@ void obj_print(obj_t *obj, obj_t *out) put_cchar(out, ')'); } return; + case LSTR: + obj_print(obj->ls.prefix, out); + put_cstring(out, "#<... lazy string>"); + return; case COBJ: obj->co.ops->print(obj, out); return; @@ -1710,6 +1913,10 @@ void obj_pprint(obj_t *obj, obj_t *out) put_cchar(out, ')'); } return; + case LSTR: + obj_pprint(obj->ls.prefix, out); + put_cstring(out, "..."); + return; case COBJ: obj->co.ops->print(obj, out); return; @@ -25,7 +25,7 @@ */ typedef enum type { - CONS = 1, STR, CHR, NUM, SYM, FUN, VEC, LCONS, COBJ + CONS = 1, STR, CHR, NUM, SYM, FUN, VEC, LCONS, LSTR, COBJ } type_t; typedef enum functype @@ -113,6 +113,16 @@ struct lazy_cons { obj_t *func; /* when nil, car and cdr are valid */ }; +/* + * Lazy string: virtual string which dynamically grows as a catentation + * of a list of strings. + */ +struct lazy_string { + type_t type; + obj_t *prefix; /* actual string part */ + obj_t *list; /* remaining list */ +}; + struct cobj { type_t type; void *handle; @@ -137,15 +147,17 @@ union obj { struct func f; struct vec v; struct lazy_cons lc; + struct lazy_string ls; struct cobj co; }; extern obj_t *interned_syms; extern obj_t *t, *cons_t, *str_t, *chr_t, *num_t, *sym_t, *fun_t, *vec_t; -extern obj_t *stream_t, *lcons_t, *var, *regex, *set, *cset, *wild, *oneplus; +extern obj_t *stream_t, *lcons_t, *lstr_t, *cobj_t; +extern obj_t *var, *regex, *set, *cset, *wild, *oneplus; extern obj_t *zeroplus, *optional, *compound, *or, *quasi; -extern obj_t *skip, *trailer, *block, *next, *fail, *accept; +extern obj_t *skip, *trailer, *block, *next, *freeform, *fail, *accept; extern obj_t *all, *some, *none, *maybe, *cases, *collect, *until, *coll; extern obj_t *define, *output, *single, *frst, *lst, *empty, *repeat, *rep; extern obj_t *flattn, *forget, *local, *mrge, *bind, *cat, *args; @@ -169,6 +181,7 @@ obj_t *identity(obj_t *obj); obj_t *typeof(obj_t *obj); obj_t *type_check(obj_t *obj, int); obj_t *type_check2(obj_t *obj, int, int); +obj_t *type_check3(obj_t *obj, int, int, int); obj_t *car(obj_t *cons); obj_t *cdr(obj_t *cons); obj_t **car_l(obj_t *cons); @@ -221,12 +234,14 @@ obj_t *le(obj_t *anum, obj_t *bnum); obj_t *numeq(obj_t *anum, obj_t *bnum); obj_t *max2(obj_t *anum, obj_t *bnum); obj_t *min2(obj_t *anum, obj_t *bnum); -obj_t *string(char *str); +obj_t *string_own(char *str); +obj_t *string(const char *str); obj_t *mkstring(obj_t *len, obj_t *ch); obj_t *mkustring(obj_t *len); /* must initialize immediately with init_str! */ obj_t *init_str(obj_t *str, const char *); obj_t *copy_str(obj_t *str); obj_t *stringp(obj_t *str); +obj_t *lazy_stringp(obj_t *str); obj_t *length_str(obj_t *str); const char *c_str(obj_t *str); obj_t *search_str(obj_t *haystack, obj_t *needle, obj_t *start_num, @@ -235,10 +250,11 @@ obj_t *search_str_tree(obj_t *haystack, obj_t *tree, obj_t *start_num, obj_t *from_end); obj_t *sub_str(obj_t *str_in, obj_t *from_num, obj_t *to_num); obj_t *cat_str(obj_t *list, obj_t *sep); +obj_t *split_str(obj_t *str, obj_t *sep); obj_t *trim_str(obj_t *str); obj_t *string_lt(obj_t *astr, obj_t *bstr); obj_t *chr(int ch); -obj_t *chrp(obj_t *str); +obj_t *chrp(obj_t *chr); int c_chr(obj_t *chr); obj_t *chr_str(obj_t *str, obj_t *index); obj_t *chr_str_set(obj_t *str, obj_t *index, obj_t *chr); @@ -272,6 +288,14 @@ obj_t *vec_set_fill(obj_t *vec, obj_t *fill); obj_t **vecref_l(obj_t *vec, obj_t *ind); obj_t *vec_push(obj_t *vec, obj_t *item); obj_t *lazy_stream_cons(obj_t *stream); +obj_t *lazy_str(obj_t *list); +obj_t *lazy_str_force_upto(obj_t *lstr, obj_t *index); +obj_t *lazy_str_force(obj_t *lstr); +obj_t *lazy_str_get_trailing_list(obj_t *lstr, obj_t *index); +obj_t *length_str_gt(obj_t *str, obj_t *len); +obj_t *length_str_ge(obj_t *str, obj_t *len); +obj_t *length_str_lt(obj_t *str, obj_t *len); +obj_t *length_str_le(obj_t *str, obj_t *len); obj_t *cobj(void *handle, obj_t *cls_sym, struct cobj_ops *ops); void cobj_print_op(obj_t *, obj_t *); /* Default function for struct cobj_ops */ obj_t *assoc(obj_t *list, obj_t *key); @@ -902,7 +902,7 @@ obj_t *match_files(obj_t *spec, obj_t *files, return nil; } else if (errno != 0) file_err(nil, "could not open ~a (error ~a/~a)", name, - num(errno), string(strdup(strerror(errno))), nao); + num(errno), string(strerror(errno)), nao); else file_err(nil, "could not open ~a", name, nao); return nil; @@ -983,6 +983,33 @@ repeat_spec_same_data: return cons(new_bindings, cons(data, num(data_lineno))); return nil; } + } else if (sym == freeform) { + if ((spec = rest(spec)) == nil) { + sem_error(spec_linenum, + "freeform must be followed by a query line", nao); + } else { + obj_t *ff_specline = rest(first(spec)); + obj_t *ff_dataline = lazy_str(data); + + cons_bind (new_bindings, success, + match_line(bindings, ff_specline, ff_dataline, zero, + spec_linenum, num(data_lineno), first(files))); + + if (!success) { + debuglf(spec_linenum, "freeform match failure", nao); + return nil; + } + + if (nump(success)) + data = lazy_str_get_trailing_list(ff_dataline, success); + + bindings = new_bindings; + } + + if ((spec = rest(spec)) == nil) + break; + + goto repeat_spec_same_data; } else if (sym == block) { obj_t *name = first(rest(first_spec)); if (rest(specline)) @@ -1027,7 +1054,7 @@ repeat_spec_same_data: if (eq(first(source), nothrow)) push(nil, &source); else if (eq(first(source), args)) { - obj_t *input_name = string(strdup("args")); + obj_t *input_name = string("args"); cons_bind (new_bindings, success, match_files(spec, cons(input_name, files), bindings, files, one)); @@ -1378,7 +1405,7 @@ repeat_spec_same_data: form, nao); nt = eq(second(new_style_dest), nothrow); - dest = or2(cdr(val), string(strdup("-"))); + dest = or2(cdr(val), string("-")); } } @@ -1393,7 +1420,7 @@ repeat_spec_same_data: return nil; } else if (errno != 0) { file_err(nil, "could not open ~a (error ~a/~a)", dest, - num(errno), string(strdup(strerror(errno))), nao); + num(errno), string(strerror(errno)), nao); } else { file_err(nil, "could not open ~a", dest, nao); } @@ -183,7 +183,7 @@ elems : elem { $$ = cons($1, nil); } yyerror("rep outside of output"); } ; -elem : TEXT { $$ = string($1); } +elem : TEXT { $$ = string_own($1); } | var { $$ = $1; } | list { $$ = $1; } | regex { $$ = cons(regex_compile($1), $1); } @@ -347,7 +347,7 @@ o_elems : o_elem { $$ = cons($1, nil); } | o_elem o_elems { $$ = cons($1, $2); } ; -o_elem : TEXT { $$ = string($1); } +o_elem : TEXT { $$ = string_own($1); } | var { $$ = $1; } | rep_elem { $$ = $1; } ; @@ -372,23 +372,32 @@ rep_parts_opt : SINGLE o_elems_opt2 /* This sucks, but factoring '*' into a nonterminal * that generates an empty phrase causes reduce/reduce conflicts. */ -var : IDENT { $$ = list(var, intern(string($1)), nao); } - | IDENT elem { $$ = list(var, intern(string($1)), $2, nao); } - | '{' IDENT '}' { $$ = list(var, intern(string($2)), nao); } - | '{' IDENT '}' elem { $$ = list(var, intern(string($2)), $4, nao); } - | '{' IDENT regex '}' { $$ = list(var, intern(string($2)), +var : IDENT { $$ = list(var, intern(string_own($1)), + nao); } + | IDENT elem { $$ = list(var, intern(string_own($1)), + $2, nao); } + | '{' IDENT '}' { $$ = list(var, intern(string_own($2)), + nao); } + | '{' IDENT '}' elem { $$ = list(var, intern(string_own($2)), + $4, nao); } + | '{' IDENT regex '}' { $$ = list(var, intern(string_own($2)), nil, cons(regex_compile($3), $3), nao); } - | '{' IDENT NUMBER '}' { $$ = list(var, intern(string($2)), + | '{' IDENT NUMBER '}' { $$ = list(var, intern(string_own($2)), nil, num($3), nao); } - | var_op IDENT { $$ = list(var, intern(string($2)), + | var_op IDENT { $$ = list(var, intern(string_own($2)), nil, $1, nao); } - | var_op IDENT elem { $$ = list(var, intern(string($2)), + | var_op IDENT elem { $$ = list(var, intern(string_own($2)), $3, $1, nao); } - | var_op '{' IDENT '}' { $$ = list(var, intern(string($3)), + | var_op '{' IDENT '}' { $$ = list(var, intern(string_own($3)), nil, $1, nao); } - | var_op '{' IDENT '}' elem { $$ = list(var, intern(string($3)), + | var_op '{' IDENT '}' elem { $$ = list(var, intern(string_own($3)), $5, $1, nao); } + | var_op '{' IDENT regex '}' { yyerror("longest match " + "not useable with regex"); } + | var_op '{' IDENT NUMBER '}' { yyerror("longest match " + "not useable with " + "fixed width match"); } | IDENT error { $$ = nil; yybadtoken(yychar, "variable spec"); } | var_op error { $$ = nil; @@ -409,7 +418,7 @@ exprs : expr { $$ = cons($1, nil); } | expr '.' expr { $$ = cons($1, $3); } ; -expr : IDENT { $$ = intern(string($1)); } +expr : IDENT { $$ = intern(string_own($1)); } | NUMBER { $$ = num($1); } | list { $$ = $1; } | regex { $$ = cons(regex_compile($1), $1); } @@ -502,7 +511,7 @@ quasi_items : quasi_item { $$ = cons($1, nil); } ; quasi_item : litchars { $$ = lit_char_helper($1); } - | TEXT { $$ = string($1); } + | TEXT { $$ = string_own($1); } | var { $$ = $1; } | list { $$ = $1; } ; @@ -569,6 +569,82 @@ long nfa_run(nfa_t nfa, const char *str) return last_accept_pos ? last_accept_pos - str : -1; } +long nfa_machine_match_span(nfa_machine_t *nfam) +{ + return nfam->last_accept_pos; +} + +/* + * NFA machine: represents the logic of the nfa_run function as state machine + * object which can be fed one character at a time. + */ +void nfa_machine_init(nfa_machine_t *nfam, nfa_t nfa) +{ + int accept = 0; + + nfam->nfa = nfa; + nfam->last_accept_pos = -1; + nfam->visited = nfa.start->a.visited + 1; + nfam->move = chk_malloc(NFA_SET_SIZE * sizeof *nfam->move); + nfam->clos = chk_malloc(NFA_SET_SIZE * sizeof *nfam->clos); + nfam->stack = chk_malloc(NFA_SET_SIZE * sizeof *nfam->stack); + nfam->nmove = 1; + nfam->count = 0; + + nfam->move[0] = nfa.start; + + nfam->nclos = nfa_closure(nfam->stack, nfam->move, nfam->nmove, + nfam->clos, nfam->visited++, &accept); + + if (accept) + nfam->last_accept_pos = nfam->count; +} + +void nfa_machine_cleanup(nfa_machine_t *nfam) +{ + free(nfam->stack); + free(nfam->clos); + free(nfam->move); + nfam->stack = 0; + nfam->clos = 0; + nfam->move = 0; + nfam->nfa.start = 0; + nfam->nfa.accept = 0; +} + +int nfa_machine_feed(nfa_machine_t *nfam, int ch) +{ + int accept = 0; + + if (ch != 0) { + nfam->count++; + + nfam->nmove = nfa_move(nfam->clos, nfam->nclos, nfam->move, ch); + nfam->nclos = nfa_closure(nfam->stack, nfam->move, nfam->nmove, nfam->clos, + nfam->visited++, &accept); + + if (accept) + nfam->last_accept_pos = nfam->count; + } + + nfam->nfa.start->a.visited = nfam->visited; + + if (ch && nfam->nclos != 0) { + if (accept) + return NFAM_MATCH; + return NFAM_INCOMPLETE; + } + + /* Reached if the null character is + consumed, or NFA hit a transition dead end. */ + + if (nfam->last_accept_pos == nfam->count) + return NFAM_MATCH; + if (nfam->last_accept_pos == -1) + return NFAM_FAIL; + return NFAM_INCOMPLETE; +} + static obj_t *regex_equal(obj_t *self, obj_t *other) { return self == other ? t : nil; /* eq equality only */ @@ -90,16 +90,29 @@ void nfa_state_free(nfa_state_t *st); void nfa_state_shallow_free(nfa_state_t *st); void nfa_state_merge(nfa_state_t *accept, nfa_state_t *); -typedef struct nfa nfa_t; - -struct nfa { +typedef struct nfa { nfa_state_t *start; nfa_state_t *accept; -}; +} nfa_t; + +enum nfam_result { NFAM_INCOMPLETE, NFAM_FAIL, NFAM_MATCH }; + +typedef struct nfa_machine { + long last_accept_pos; + unsigned visited; + nfa_state_t **move, **clos, **stack; + int nmove, nclos; + long count; + nfa_t nfa; +} nfa_machine_t; nfa_t nfa_compile_regex(obj_t *regex); void nfa_free(nfa_t); long nfa_run(nfa_t nfa, const char *str); +void nfa_machine_init(nfa_machine_t *, nfa_t); +void nfa_machine_cleanup(nfa_machine_t *); +int nfa_machine_feed(nfa_machine_t *, int ch); +long nfa_machine_match_span(nfa_machine_t *); obj_t *regex_compile(obj_t *regex_sexp); obj_t *regexp(obj_t *); nfa_t *regex_nfa(obj_t *); @@ -152,7 +152,7 @@ static obj_t *stdio_get_line(obj_t *stream) char *line = snarf_line((FILE *) stream->co.handle); if (!line) return nil; - return string(line); + return string_own(line); } } @@ -407,7 +407,7 @@ static obj_t *dir_get_line(obj_t *stream) return nil; if (!strcmp(e->d_name, ".") || !strcmp(e->d_name, "..")) continue; - return string(chk_strdup(e->d_name)); + return string(e->d_name); } } } @@ -478,7 +478,7 @@ obj_t *get_string_from_stream(obj_t *stream) return out; so->buf = chk_realloc(so->buf, so->fill + 1); - out = string(so->buf); + out = string_own(so->buf); free(so); return out; } else if (stream->co.ops == &string_in_ops.cobj_ops) { @@ -119,7 +119,7 @@ obj_t *remove_hash_bang_line(obj_t *spec) return spec; { - obj_t *shbang = string(strdup("#!")); + obj_t *shbang = string("#!"); obj_t *firstline = first(spec); obj_t *items = rest(firstline); @@ -183,7 +183,7 @@ int main(int argc, char **argv) val[piece] = 0; - list = cons(string(strdup(val)), list); + list = cons(string(val), list); if (!comma_p) break; @@ -192,15 +192,13 @@ int main(int argc, char **argv) } list = nreverse(list); - bindings = cons(cons(intern(string(strdup(var))), list), bindings); + bindings = cons(cons(intern(string(var)), list), bindings); } else if (equals) { char *val = equals + 1; *equals = 0; - bindings = cons(cons(intern(string(strdup(var))), - string(strdup(val))), bindings); + bindings = cons(cons(intern(string(var)), string(val)), bindings); } else { - bindings = cons(cons(intern(string(strdup(var))), - null_string), bindings); + bindings = cons(cons(intern(string(var)), null_string), bindings); } argc--, argv++; @@ -242,10 +240,10 @@ int main(int argc, char **argv) opt_arraydims = val; break; case 'c': - specstring = string(strdup(*argv)); + specstring = string(*argv); break; case 'f': - spec_file_str = string(strdup(*argv)); + spec_file_str = string(*argv); break; } @@ -298,7 +296,7 @@ int main(int argc, char **argv) if (specstring) { spec_file = "cmdline"; - spec_file_str = string(strdup(spec_file)); + spec_file_str = string(spec_file); yyin_stream = make_string_input_stream(specstring); } else if (spec_file_str) { if (strcmp(c_str(spec_file_str), "-") != 0) { @@ -325,7 +323,7 @@ int main(int argc, char **argv) spec_file = "stdin"; } argc--, argv++; - spec_file_str = string(strdup(spec_file)); + spec_file_str = string(spec_file); } |