/* Copyright 2009 * Kaz Kylheku * Vancouver, Canada * All rights reserved. * * BSD License: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. */ %{ #include #include #include #include #include #include #include #include #include #include "lib.h" #include "gc.h" #include "unwind.h" #include "regex.h" #include "extract.h" int yylex(void); void yyerror(const char *); obj_t *repeat_rep_helper(obj_t *sym, obj_t *main, obj_t *parts); static obj_t *parsed_spec; static int output_produced; %} %union { char *lexeme; union obj *obj; char chr; long num; } %token TEXT IDENT ALL SOME NONE MAYBE AND OR END COLLECT UNTIL COLL %token OUTPUT REPEAT REP SINGLE FIRST LAST EMPTY %token NUMBER %token REGCHAR %type spec clauses clause all_clause some_clause none_clause maybe_clause %type collect_clause clause_parts additional_parts output_clause %type line elems_opt elems elem var var_op list exprs expr %type out_clauses out_clauses_opt out_clause %type repeat_clause repeat_parts_opt o_line %type o_elems_opt o_elems_opt2 o_elems o_elem rep_elem rep_parts_opt %type regex regexpr regbranch %type regterm regclass regclassterm regrange %type regchar %nonassoc ALL SOME NONE MAYBE AND OR END COLLECT UNTIL COLL %nonassoc OUTPUT REPEAT REP FIRST LAST EMPTY %nonassoc '{' '}' '[' ']' '(' ')' %right IDENT TEXT NUMBER %left '|' '/' %right '*' '?' '+' %right '^' '.' '\\' REGCHAR %% spec : clauses { parsed_spec = $1; } | { parsed_spec = nil; } | error { parsed_spec = nil; yybadtoken(yychar, 0); } ; clauses : clause { $$ = cons($1, nil); } | clause clauses { $$ = cons($1, $2); } ; clause : all_clause { $$ = list(num(lineno - 1), $1, nao); } | some_clause { $$ = list(num(lineno - 1), $1, nao); } | none_clause { $$ = list(num(lineno - 1), $1, nao); } | maybe_clause { $$ = list(num(lineno - 1), $1, nao); } | collect_clause { $$ = list(num(lineno - 1), $1, nao); } | output_clause { $$ = list(num(lineno - 1), $1, nao); } | line { $$ = $1; } | repeat_clause { $$ = nil; yyerror("repeat outside of output"); } ; all_clause : ALL newl clause_parts { $$ = cons(all, $3); } | ALL newl error { $$ = nil; yybadtoken(yychar, "all clause"); } | ALL newl END { $$ = nil; yyerror("empty all clause"); } ; some_clause : SOME newl clause_parts { $$ = cons(some, $3); } | SOME newl error { $$ = nil; yybadtoken(yychar, "some clause"); } | SOME newl END { $$ = nil; yyerror("empty some clause"); } ; none_clause : NONE newl clause_parts { $$ = cons(none, $3); } | NONE newl error { $$ = nil; yybadtoken(yychar, "none clause"); } | NONE newl END { $$ = nil; yyerror("empty none clause"); } ; maybe_clause : MAYBE newl clause_parts { $$ = cons(maybe, $3); } | MAYBE newl error { $$ = nil; yybadtoken(yychar, "maybe clause"); } | MAYBE newl END { $$ = nil; yyerror("empty maybe clause"); } ; collect_clause : COLLECT newl clauses END newl { $$ = list(collect, $3, nao); } | COLLECT newl clauses UNTIL newl clauses END newl { $$ = list(collect, $3, $6, nao); } | COLLECT newl error { $$ = nil; if (yychar == UNTIL || yychar == END) yyerror("empty collect"); else yybadtoken(yychar, "collect clause"); } ; clause_parts : clauses additional_parts { $$ = cons($1, $2); } ; additional_parts : END newl { $$ = nil; } | AND newl clauses additional_parts { $$ = cons($3, $4); } | OR newl clauses additional_parts { $$ = cons($3, $4); } ; line : elems_opt '\n' { $$ = $1; } ; elems_opt : elems { $$ = cons(num(lineno - 1), $1); } | { $$ = nil; } ; elems : elem { $$ = cons($1, nil); } | elem elems { $$ = cons($1, $2); } | rep_elem { $$ = nil; yyerror("rep outside of output"); } ; elem : TEXT { $$ = string($1); } | var { $$ = $1; } | list { $$ = $1; } | regex { $$ = cons(regex_compile($1), $1); } | COLL elems END { $$ = list(coll, $2, nao); } | COLL elems UNTIL elems END { $$ = list(coll, $2, $4, nao); } | COLL error { $$ = nil; yybadtoken(yychar, "coll clause"); } ; output_clause : OUTPUT o_elems '\n' out_clauses END newl { $$ = list(output, $4, $2, nao); } | OUTPUT newl out_clauses END newl { $$ = list(output, $3, nao); } | OUTPUT o_elems '\n' error { $$ = nil; yybadtoken(yychar, "output clause"); } | OUTPUT newl error { $$ = nil; yybadtoken(yychar, "output clause"); } ; out_clauses : out_clause { $$ = cons($1, nil); } | out_clause out_clauses { $$ = cons($1, $2); } ; out_clause : repeat_clause { $$ = list(num(lineno - 1), $1, nao); } | o_line { $$ = $1; } | all_clause { $$ = nil; yyerror("match clause in output"); } | some_clause { $$ = nil; yyerror("match clause in output"); } | none_clause { $$ = nil; yyerror("match clause in output"); } | maybe_clause { $$ = nil; yyerror("match clause in output"); } | collect_clause { $$ = nil; yyerror("match clause in output"); } | output_clause { $$ = nil; yyerror("match clause in output"); } ; repeat_clause : REPEAT newl out_clauses repeat_parts_opt END newl { $$ = repeat_rep_helper(repeat, $3, $4); } | REPEAT newl error { $$ = nil; yybadtoken(yychar, "repeat clause"); } ; repeat_parts_opt : SINGLE newl out_clauses_opt repeat_parts_opt { $$ = cons(cons(single, $3), $4); } | FIRST newl out_clauses_opt repeat_parts_opt { $$ = cons(cons(frst, $3), $4); } | LAST newl out_clauses_opt repeat_parts_opt { $$ = cons(cons(lst, $3), $4); } | EMPTY newl out_clauses_opt repeat_parts_opt { $$ = cons(cons(empty, $3), $4); } | /* empty */ { $$ = nil; } ; out_clauses_opt : out_clauses { $$ = $1; } | /* empty */ { $$ = null_list; } o_line : o_elems_opt '\n' { $$ = $1; } ; o_elems_opt : o_elems { $$ = cons(num(lineno - 1), $1); } | { $$ = nil; } ; o_elems_opt2 : o_elems { $$ = $1; } | { $$ = null_list; } ; o_elems : o_elem { $$ = cons($1, nil); } | o_elem o_elems { $$ = cons($1, $2); } ; o_elem : TEXT { $$ = string($1); } | var { $$ = $1; } | rep_elem { $$ = $1; } ; rep_elem : REP o_elems rep_parts_opt END { $$ = repeat_rep_helper(rep, $2, $3); } | REP error { $$ = nil; yybadtoken(yychar, "rep clause"); } ; rep_parts_opt : SINGLE o_elems_opt2 rep_parts_opt { $$ = cons(cons(single, $2), $3); } | FIRST o_elems_opt2 rep_parts_opt { $$ = cons(cons(frst, $2), $3); } | LAST o_elems_opt2 rep_parts_opt { $$ = cons(cons(lst, $2), $3); } | EMPTY o_elems_opt2 rep_parts_opt { $$ = cons(cons(empty, $2), $3); } | /* empty */ { $$ = nil; } ; /* This sucks, but factoring '*' into a nonterminal * that generates an empty phrase causes reduce/reduce conflicts. */ var : IDENT { $$ = list(var, intern(string($1)), nao); } | IDENT elem { $$ = list(var, intern(string($1)), $2, nao); } | '{' IDENT '}' { $$ = list(var, intern(string($2)), nao); } | '{' IDENT '}' elem { $$ = list(var, intern(string($2)), $4, nao); } | '{' IDENT regex '}' { $$ = list(var, intern(string($2)), nil, cons(regex_compile($3), $3), nao); } | '{' IDENT NUMBER '}' { $$ = list(var, intern(string($2)), nil, num($3), nao); } | var_op IDENT { $$ = list(var, intern(string($2)), nil, $1, nao); } | var_op IDENT elem { $$ = list(var, intern(string($2)), $3, $1, nao); } | var_op '{' IDENT '}' { $$ = list(var, intern(string($3)), nil, $1, nao); } | var_op '{' IDENT '}' elem { $$ = list(var, intern(string($3)), $5, $1, nao); } | IDENT error { $$ = nil; yybadtoken(yychar, "variable spec"); } | var_op error { $$ = nil; yybadtoken(yychar, "variable spec"); } ; var_op : '*' { $$ = t; } ; list : '(' exprs ')' { $$ = $2; } | '(' ')' { $$ = nil; } | '(' error { $$ = nil; yybadtoken(yychar, "list expression"); } ; exprs : expr { $$ = cons($1, nil); } | expr exprs { $$ = cons($1, $2); } | expr '.' expr { $$ = cons($1, $3); } ; expr : IDENT { $$ = intern(string($1)); } | NUMBER { $$ = num($1); } | list { $$ = $1; } | regex { $$ = cons(regex_compile($1), $1); } ; regex : '/' regexpr '/' { $$ = $2; } | '/' '/' { $$ = nil; } | '/' error { $$ = nil; yybadtoken(yychar, "regex"); } ; regexpr : regbranch { $$ = $1; } | regbranch '|' regbranch { $$ = list(list(or, $1, $3, nao), nao); } ; regbranch : regterm { $$ = cons($1, nil); } | regterm regbranch { $$ = cons($1, $2); } ; regterm : '[' regclass ']' { $$ = cons(set, $2); } | '[' '^' regclass ']' { $$ = cons(cset, $3); } | '.' { $$ = wild; } | '^' { $$ = chr('^'); } | ']' { $$ = chr(']'); } | '-' { $$ = chr('-'); } | regterm '*' { $$ = list(zeroplus, $1, nao); } | regterm '+' { $$ = list(oneplus, $1, nao); } | regterm '?' { $$ = list(optional, $1, nao); } | REGCHAR { $$ = chr($1); } | '(' regexpr ')' { $$ = cons(compound, $2); } | '(' error { $$ = nil; yybadtoken(yychar, "regex subexpression"); } | '[' error { $$ = nil; yybadtoken(yychar, "regex character class"); } ; regclass : regclassterm { $$ = cons($1, nil); } | regclassterm regclass { $$ = cons($1, $2); } ; regclassterm : regrange { $$ = $1; } | regchar { $$ = chr($1); } ; regrange : regchar '-' regchar { $$ = cons(chr($1), chr($3)); } regchar : '?' { $$ = '?'; } | '.' { $$ = '.'; } | '*' { $$ = '*'; } | '+' { $$ = '+'; } | '(' { $$ = '('; } | ')' { $$ = ')'; } | '^' { $$ = '^'; } | '|' { $$ = '|'; } | REGCHAR { $$ = $1; } ; newl : '\n' | error '\n' { yyerror("newline expected after directive"); yyerrok; } ; %% obj_t *repeat_rep_helper(obj_t *sym, obj_t *main, obj_t *parts) { obj_t *single_parts = nil; obj_t *first_parts = nil; obj_t *last_parts = nil; obj_t *empty_parts = nil; obj_t *iter; for (iter = parts; iter != nil; iter = cdr(iter)) { obj_t *part = car(iter); obj_t *sym = car(part); obj_t *clauses = cdr(part); if (sym == single) single_parts = nappend2(single_parts, clauses); else if (sym == frst) first_parts = nappend2(first_parts, clauses); else if (sym == lst) last_parts = nappend2(last_parts, clauses); else if (sym == empty) empty_parts = nappend2(empty_parts, clauses); else abort(); } return list(sym, main, single_parts, first_parts, last_parts, empty_parts, nao); } obj_t *get_spec(void) { return parsed_spec; } void dump_shell_string(const char *str) { int ch; putchar('"'); while ((ch = *str++) != 0) { switch (ch) { case '"': case '`': case '$': case '\\': case '\n': putchar('\\'); /* fallthrough */ default: putchar(ch); } } putchar('"'); } void dump_var(const char *name, char *pfx1, size_t len1, char *pfx2, size_t len2, obj_t *value, int level) { if (len1 >= 112 || len2 >= 112) abort(); if (stringp(value)) { fputs(name, stdout); fputs(pfx1, stdout); fputs(pfx2, stdout); putchar('='); dump_shell_string(c_str(value)); putchar('\n'); } else { obj_t *iter; int i; size_t add1 = 0, add2 = 0; for (i = 0, iter = value; iter; iter = cdr(iter), i++) { if (level < opt_arraydims) { add2 = sprintf(pfx2 + len2, "[%d]", i); add1 = 0; } else { add1 = sprintf(pfx1 + len1, "_%d", i); add2 = 0; } dump_var(name, pfx1, len1 + add1, pfx2, len2 + add2, car(iter), level + 1); } } } void dump_bindings(obj_t *bindings) { if (opt_loglevel >= 2) { fputs("raw_bindings:\n", stderr); dump(bindings, stderr); } while (bindings) { char pfx1[128], pfx2[128]; obj_t *var = car(car(bindings)); obj_t *value = cdr(car(bindings)); const char *name = c_str(symbol_name(var)); *pfx1 = 0; *pfx2 = 0; dump_var(name, pfx1, 0, pfx2, 0, value, 0); bindings = cdr(bindings); } } obj_t *depth(obj_t *obj) { obj_t *dep = zero; if (obj == nil) return one; if (atom(obj)) return zero; while (obj) { dep = max2(dep, depth(first(obj))); obj = rest(obj); } return plus(dep, one); } obj_t *weird_merge(obj_t *left, obj_t *right) { obj_t *left_depth = depth(left); obj_t *right_depth = depth(right); while (lt(left_depth, right_depth) || zerop(left_depth)) { left = cons(left, nil); left_depth = plus(left_depth, one); } while (lt(right_depth, left_depth) || zerop(right_depth)) { right = cons(right, nil); right_depth = plus(right_depth, one); } return append2(left, right); } obj_t *map_leaf_lists(obj_t *func, obj_t *list) { if (atom(list)) return list; if (none_satisfy(list, func_n1(listp), nil)) return funcall1(func, list); return mapcar(bind2(func_n2(map_leaf_lists), func), list); } obj_t *dest_bind(obj_t *bindings, obj_t *pattern, obj_t *value) { if (nullp(pattern)) return bindings; if (symbolp(pattern)) { obj_t *existing = assoc(bindings, pattern); if (existing) { if (tree_find(value, cdr(existing))) return bindings; if (tree_find(cdr(existing), value)) return bindings; yyerrorf(2, "bind variable mismatch: %s", c_str(symbol_name(pattern))); return t; } return cons(cons(pattern, value), bindings); } if (consp(pattern)) { obj_t *piter = pattern, *viter = value; while (consp(piter) && consp(viter)) { bindings = dest_bind(bindings, car(piter), car(viter)); if (bindings == t) return t; piter = cdr(piter); viter = cdr(viter); } while (consp(piter) && consp(viter)); if (symbolp(piter)) { bindings = dest_bind(bindings, piter, viter); if (bindings == t) return t; } } return bindings; } obj_t *match_line(obj_t *bindings, obj_t *specline, obj_t *dataline, obj_t *pos, obj_t *spec_lineno, obj_t *data_lineno, obj_t *file) { #define LOG_MISMATCH(KIND) \ yyerrorlf(2, c_num(spec_lineno), \ "%s mismatch, position %ld (%s:%ld)", (KIND), c_num(pos), \ c_str(file), c_num(data_lineno)); \ yyerrorlf(2, c_num(spec_lineno), " %s", c_str(dataline)); \ if (c_num(pos) < 77) \ yyerrorlf(2, c_num(spec_lineno), " %*s^", (int) c_num(pos), "") #define LOG_MATCH(KIND, EXTENT) \ yyerrorlf(2, c_num(spec_lineno), \ "%s matched, position %ld-%ld (%s:%ld)", (KIND), \ c_num(pos), c_num(EXTENT), c_str(file), \ c_num(data_lineno)); \ yyerrorlf(2, c_num(spec_lineno), " %s", c_str(dataline)); \ if (c_num(EXTENT) < 77) \ yyerrorlf(2, c_num(spec_lineno), " %*s%-*s^", (int) c_num(pos), \ "", (int) (c_num(EXTENT) - c_num(pos)), "^") for (;;) { obj_t *elem; if (specline == nil) break; elem = first(specline); switch (elem ? elem->t.type : 0) { case CONS: /* directive */ { obj_t *directive = first(elem); if (directive == var) { obj_t *sym = second(elem); obj_t *pat = third(elem); obj_t *modifier = fourth(elem); obj_t *pair = assoc(bindings, sym); /* var exists already? */ if (pair) { /* If the variable already has a binding, we replace it with its value, and treat it as a string match. The spec looks like ((var ) ...) and it must be transformed into ( ...) */ if (pat) { specline = cons(cdr(pair), cons(pat, rest(specline))); } else if (nump(modifier)) { obj_t *past = plus(pos, modifier); if (c_num(past) > c_num(length_str(dataline)) || c_num(past) < c_num(pos)) { LOG_MISMATCH("fixed field size"); return nil; } if (!tree_find(trim_str(sub_str(dataline, pos, past)), cdr(pair))) { LOG_MISMATCH("fixed field contents"); return nil; } LOG_MATCH("fixed field", past); pos = past; specline = cdr(specline); } else { specline = cons(cdr(pair), rest(specline)); } continue; } else if (pat == nil) { /* match to end of line or with regex */ if (consp(modifier)) { obj_t *past = match_regex(dataline, car(modifier), pos); if (nullp(past)) { LOG_MISMATCH("var positive regex"); return nil; } LOG_MATCH("var positive regex", past); bindings = acons_new(bindings, sym, sub_str(dataline, pos, past)); pos = past; } else if (nump(modifier)) { obj_t *past = plus(pos, modifier); if (c_num(past) > c_num(length_str(dataline)) || c_num(past) < c_num(pos)) { LOG_MISMATCH("count based var"); return nil; } LOG_MATCH("count based var", past); bindings = acons_new(bindings, sym, trim_str(sub_str(dataline, pos, past))); pos = past; } else { bindings = acons_new(bindings, sym, sub_str(dataline, pos, nil)); pos = length_str(dataline); } } else if (pat->t.type == STR) { obj_t *find = search_str(dataline, pat, pos, modifier); if (!find) { LOG_MISMATCH("var delimiting string"); return nil; } LOG_MATCH("var delimiting string", find); bindings = acons_new(bindings, sym, sub_str(dataline, pos, find)); pos = plus(find, length_str(pat)); } else if (consp(pat) && typeof(first(pat)) == regex) { obj_t *find = search_regex(dataline, first(pat), pos, modifier); obj_t *fpos = car(find); obj_t *flen = cdr(find); if (!find) { LOG_MISMATCH("var delimiting regex"); return nil; } LOG_MATCH("var delimiting regex", fpos); bindings = acons_new(bindings, sym, sub_str(dataline, pos, fpos)); pos = plus(fpos, flen); } else if (consp(pat) && first(pat) == var) { /* Unbound var followed by var: the following one must be bound. */ obj_t *second_sym = second(pat); obj_t *next_pat = third(pat); obj_t *pair = assoc(bindings, second_sym); /* var exists already? */ if (!pair) { yyerrorlf(1, c_num(spec_lineno), "consecutive unbound variables"); return nil; } /* Re-generate a new spec with an edited version of the element we just processed, and repeat. */ { obj_t *new_elem = list(var, sym, cdr(pair), modifier, nao); if (next_pat) specline = cons(new_elem, cons(next_pat, rest(specline))); else specline = cons(new_elem, rest(specline)); } continue; } else if (consp(pat) && (consp(first(pat)) || stringp(first(pat)))) { cons_bind (find, len, search_str(dataline, pat, pos, modifier)); if (!find) { LOG_MISMATCH("string"); return nil; } bindings = acons_new(bindings, sym, sub_str(dataline, pos, find)); pos = plus(find, len); } else { yyerrorlf(0, c_num(spec_lineno), "variable followed by invalid element"); return nil; } } else if (typeof(directive) == regex) { obj_t *past = match_regex(dataline, directive, pos); if (nullp(past)) { LOG_MISMATCH("regex"); return nil; } LOG_MATCH("regex", past); pos = past; } else if (directive == coll) { obj_t *coll_specline = second(elem); obj_t *until_specline = third(elem); obj_t *bindings_coll = nil; obj_t *iter; for (;;) { cons_bind (new_bindings, new_pos, match_line(bindings, coll_specline, dataline, pos, spec_lineno, data_lineno, file)); if (until_specline) { cons_bind (until_bindings, until_pos, match_line(bindings, until_specline, dataline, pos, spec_lineno, data_lineno, file)); if (until_pos) { (void) until_bindings; LOG_MATCH("until", until_pos); break; } else { LOG_MISMATCH("until"); } } if (new_pos) { LOG_MATCH("coll", new_pos); for (iter = new_bindings; iter && iter != bindings; iter = cdr(iter)) { obj_t *binding = car(iter); obj_t *existing = assoc(bindings_coll, car(binding)); bindings_coll = acons_new(bindings_coll, car(binding), cons(cdr(binding), cdr(existing))); } } if (new_pos && !equal(new_pos, pos)) { pos = new_pos; assert (c_num(pos) <= c_num(length_str(dataline))); } else { pos = plus(pos, one); } if (c_num(pos) >= c_num(length_str(dataline))) break; } if (!bindings_coll) yyerrorlf(2, c_num(spec_lineno), "nothing was collected"); for (iter = bindings_coll; iter; iter = cdr(iter)) { obj_t *pair = car(iter); obj_t *rev = cons(car(pair), nreverse(cdr(pair))); bindings = cons(rev, bindings); } } else if (consp(directive) || stringp(directive)) { cons_bind (find, len, search_str_tree(dataline, elem, pos, nil)); obj_t *newpos; if (find == nil || !equal(find, pos)) { LOG_MISMATCH("string tree"); return nil; } newpos = plus(find, len); LOG_MATCH("string tree", newpos); pos = newpos; } else { yyerrorlf(0, c_num(spec_lineno), "unknown directive: %s", c_str(symbol_name(directive))); } } break; case STR: { obj_t *find = search_str(dataline, elem, pos, nil); obj_t *newpos; if (find == nil || !equal(find, pos)) { LOG_MISMATCH("string"); return nil; } newpos = plus(find, length_str(elem)); LOG_MATCH("string", newpos); pos = newpos; break; } default: yyerrorlf(0, c_num(spec_lineno), "unsupported object in spec"); } specline = cdr(specline); } return cons(bindings, pos); } obj_t *format_field(obj_t *string_or_list, obj_t *spec) { if (!stringp(string_or_list)) return string_or_list; { obj_t *right = lt(spec, zero); obj_t *width = if3(lt(spec, zero), neg(spec), spec); obj_t *diff = minus(width, length_str(string_or_list)); if (le(diff, zero)) return string_or_list; if (ge(length_str(string_or_list), width)) return string_or_list; { obj_t *padding = mkstring(diff, chr(' ')); return if3(right, cat_str(list(padding, string_or_list, nao), nil), cat_str(list(string_or_list, padding, nao), nil)); } } } obj_t *subst_vars(obj_t *spec, obj_t *bindings) { list_collect_decl(out, iter); while (spec) { obj_t *elem = first(spec); if (consp(elem) && first(elem) == var) { obj_t *sym = second(elem); obj_t *pat = third(elem); obj_t *modifier = fourth(elem); obj_t *pair = assoc(bindings, sym); if (pair) { if (pat) spec = cons(cdr(pair), cons(pat, rest(spec))); else if (nump(modifier)) spec = cons(format_field(cdr(pair), modifier), rest(spec)); else spec = cons(cdr(pair), rest(spec)); continue; } } list_collect(iter, elem); spec = cdr(spec); } return out; } typedef struct fpip { FILE *f; DIR *d; enum { fpip_fclose, fpip_pclose, fpip_closedir } close; } fpip_t; fpip_t complex_open(obj_t *name, obj_t *output) { fpip_t ret = { 0 }; const char *namestr = c_str(name); long len = c_num(length_str(name)); if (len == 0) return ret; if (!strcmp(namestr, "-")) { ret.close = fpip_fclose; ret.f = output ? stdout : stdin; output_produced = output ? 1 : 0; } else if (namestr[0] == '!') { ret.close = fpip_pclose; ret.f = popen(namestr+1, output ? "w" : "r"); } else if (namestr[0] == '$') { if (output) return ret; ret.close = fpip_closedir; ret.d = opendir(namestr+1); } else { ret.close = fpip_fclose; ret.f = fopen(namestr, output ? "w" : "r"); } return ret; } int complex_open_failed(fpip_t fp) { return fp.f == 0 && fp.d == 0; } void complex_close(fpip_t fp) { if (fp.f == 0) return; switch (fp.close) { case fpip_fclose: if (fp.f != stdin && fp.f != stdout) fclose(fp.f); return; case fpip_pclose: pclose(fp.f); return; case fpip_closedir: closedir(fp.d); return; } abort(); } obj_t *complex_snarf(fpip_t fp, obj_t *name) { switch (fp.close) { case fpip_fclose: return lazy_stream_cons(stdio_line_stream(fp.f, name)); case fpip_pclose: return lazy_stream_cons(pipe_line_stream(fp.f, name)); case fpip_closedir: return lazy_stream_cons(dirent_stream(fp.d, name)); } abort(); } obj_t *robust_length(obj_t *obj) { if (obj == nil) return zero; if (atom(obj)) return negone; return length(obj); } obj_t *bind_car(obj_t *bind_cons) { return if3(consp(cdr(bind_cons)), cons(car(bind_cons), car(cdr(bind_cons))), bind_cons); } obj_t *bind_cdr(obj_t *bind_cons) { return if3(consp(cdr(bind_cons)), cons(car(bind_cons), cdr(cdr(bind_cons))), bind_cons); } obj_t *extract_vars(obj_t *output_spec) { list_collect_decl (vars, tai); if (consp(output_spec)) { if (first(output_spec) == var) { list_collect (tai, second(output_spec)); } else { for (; output_spec; output_spec = cdr(output_spec)) list_collect_nconc(tai, extract_vars(car(output_spec))); } } return vars; } obj_t *extract_bindings(obj_t *bindings, obj_t *output_spec) { list_collect_decl (bindings_out, tail); obj_t *var_list = extract_vars(output_spec); for (; bindings; bindings = cdr(bindings)) if (memq(car(car(bindings)), var_list)) list_collect(tail, car(bindings)); return bindings_out; } void do_output_line(obj_t *bindings, obj_t *specline, obj_t *spec_lineno, FILE *out) { for (; specline; specline = rest(specline)) { obj_t *elem = first(specline); switch (elem ? elem->t.type : 0) { case CONS: { obj_t *directive = first(elem); if (directive == var) { obj_t *str = cat_str(subst_vars(cons(elem, nil), bindings), nil); if (str == nil) { yyerrorlf(1, c_num(spec_lineno), "bad substitution: %s", c_str(symbol_name(second(elem)))); continue; } fputs(c_str(str), out); } else if (directive == rep) { obj_t *main_clauses = second(elem); obj_t *single_clauses = third(elem); obj_t *first_clauses = fourth(elem); obj_t *last_clauses = fifth(elem); obj_t *empty_clauses = sixth(elem); obj_t *bind_cp = extract_bindings(bindings, elem); obj_t *max_depth = reduce_left(func_n2(max2), bind_cp, zero, chain(list(func_n1(cdr), func_n1(robust_length), nao))); if (equal(max_depth, zero) && empty_clauses) { do_output_line(bindings, empty_clauses, spec_lineno, out); } else if (equal(max_depth, one) && single_clauses) { obj_t *bind_a = mapcar(func_n1(bind_car), bind_cp); do_output_line(bind_a, single_clauses, spec_lineno, out); } else if (!zerop(max_depth)) { long i; for (i = 0; i < c_num(max_depth); i++) { obj_t *bind_a = mapcar(func_n1(bind_car), bind_cp); obj_t *bind_d = mapcar(func_n1(bind_cdr), bind_cp); if (i == 0 && first_clauses) { do_output_line(bind_a, first_clauses, spec_lineno, out); } else if (i == c_num(max_depth) - 1 && last_clauses) { do_output_line(bind_a, last_clauses, spec_lineno, out); } else { do_output_line(bind_a, main_clauses, spec_lineno, out); } bind_cp = bind_d; } } } else { yyerrorlf(0, c_num(spec_lineno), "unknown directive: %s", c_str(symbol_name(directive))); } } break; case STR: fputs(c_str(elem), out); break; case 0: break; default: yyerrorlf(0, c_num(spec_lineno), "unsupported object in output spec"); } } } void do_output(obj_t *bindings, obj_t *specs, FILE *out) { if (equal(specs, null_list)) return; for (; specs; specs = cdr(specs)) { cons_bind (spec_lineno, specline, first(specs)); obj_t *first_elem = first(specline); if (consp(first_elem)) { obj_t *sym = first(first_elem); if (sym == repeat) { obj_t *main_clauses = second(first_elem); obj_t *single_clauses = third(first_elem); obj_t *first_clauses = fourth(first_elem); obj_t *last_clauses = fifth(first_elem); obj_t *empty_clauses = sixth(first_elem); obj_t *bind_cp = extract_bindings(bindings, first_elem); obj_t *max_depth = reduce_left(func_n2(max2), bind_cp, zero, chain(list(func_n1(cdr), func_n1(robust_length), nao))); if (equal(max_depth, zero) && empty_clauses) { do_output(bind_cp, empty_clauses, out); } else if (equal(max_depth, one) && single_clauses) { obj_t *bind_a = mapcar(func_n1(bind_car), bind_cp); do_output(bind_a, single_clauses, out); } else if (!zerop(max_depth)) { long i; for (i = 0; i < c_num(max_depth); i++) { obj_t *bind_a = mapcar(func_n1(bind_car), bind_cp); obj_t *bind_d = mapcar(func_n1(bind_cdr), bind_cp); if (i == 0 && first_clauses) { do_output(bind_a, first_clauses, out); } else if (i == c_num(max_depth) - 1 && last_clauses) { do_output(bind_a, last_clauses, out); } else { do_output(bind_a, main_clauses, out); } bind_cp = bind_d; } } continue; } } do_output_line(bindings, specline, spec_lineno, out); putc('\n', out); } } obj_t *match_files(obj_t *spec, obj_t *files, obj_t *bindings, obj_t *first_file_parsed, obj_t *data_linenum) { obj_t *data = nil; long data_lineno = 0; if (first_file_parsed) { data = first_file_parsed; data_lineno = c_num(data_linenum); first_file_parsed = nil; } else if (files) { obj_t *name = first(files); fpip_t fp = (errno = 0, complex_open(name, nil)); yyerrorf(2, "opening data source %s", c_str(name)); if (complex_open_failed(fp)) { if (errno != 0) yyerrorf(2, "could not open %s: %s", c_str(name), strerror(errno)); else yyerrorf(2, "could not open %s", c_str(name)); return nil; } if ((data = complex_snarf(fp, name)) != nil) data_lineno = 1; } for (; spec; spec = rest(spec), data = rest(data), data_lineno++) repeat_spec_same_data: { obj_t *specline = rest(first(spec)); obj_t *dataline = first(data); obj_t *spec_linenum = first(first(spec)); obj_t *first_spec = first(specline); long spec_lineno = spec_linenum ? c_num(spec_linenum) : 0; if (consp(first_spec)) { obj_t *sym = first(first_spec); if (sym == skip) { obj_t *max = first(rest(first_spec)); long cmax = nump(max) ? c_num(max) : 0; long reps = 0; if (rest(specline)) yyerrorlf(1, spec_lineno, "material after skip directive ignored"); if ((spec = rest(spec)) == nil) break; { uw_block_begin(nil, result); while (dataline && (!max || reps++ < cmax)) { cons_bind (new_bindings, success, match_files(spec, files, bindings, data, num(data_lineno))); if (success) { yyerrorlf(2, spec_lineno, "skip matched %s:%ld", c_str(first(files)), data_lineno); result = cons(new_bindings, cons(data, num(data_lineno))); break; } yyerrorlf(2, spec_lineno, "skip didn't match %s:%ld", c_str(first(files)), data_lineno); data = rest(data); data_lineno++; dataline = first(data); } uw_block_end; if (result) return result; } yyerrorlf(2, spec_lineno, "skip failed"); return nil; } else if (sym == block) { obj_t *name = first(rest(first_spec)); if (rest(specline)) yyerrorlf(1, spec_lineno, "material after block directive ignored"); if ((spec = rest(spec)) == nil) break; uw_block_begin(name, result); result = match_files(spec, files, bindings, data, num(data_lineno)); uw_block_end; return result; } else if (sym == fail || sym == accept) { obj_t *target = first(rest(first_spec)); if (rest(specline)) yyerrorlf(1, spec_lineno, "material after %s ignored", c_str(symbol_name(sym))); uw_block_return(target, if2(sym == accept, cons(bindings, if3(data, cons(data, num(data_lineno)), t)))); if (target) yyerrorlf(1, spec_lineno, "%s: no block named %s in scope", c_str(symbol_name(sym)), c_str(symbol_name(target))); else yyerrorlf(1, spec_lineno, "%s: not anonymous block in scope", c_str(symbol_name(sym))); return nil; } else if (sym == next) { if (rest(first_spec)) yyerrorlf(0, spec_lineno, "next takes no args"); if ((spec = rest(spec)) == nil) break; if (rest(specline)) { obj_t *sub = subst_vars(rest(specline), bindings); obj_t *str = cat_str(sub, nil); if (str == nil) { yyerrorlf(2, spec_lineno, "bad substitution in next file spec"); continue; } files = cons(str, files); } else { files = rest(files); } /* We recursively process the file list, but the new data position we return to the caller must be in the original file we we were called with. Hence, we can't make a straight tail call here. */ { cons_bind (new_bindings, success, match_files(spec, files, bindings, nil, nil)); if (success) return cons(new_bindings, if3(data, cons(data, num(data_lineno)), t)); return nil; } } else if (sym == some || sym == all || sym == none || sym == maybe) { obj_t *specs; obj_t *all_match = t; obj_t *some_match = nil; obj_t *max_line = zero; obj_t *max_data = nil; for (specs = rest(first_spec); specs != nil; specs = rest(specs)) { obj_t *nested_spec = first(specs); obj_t *data_linenum = num(data_lineno); cons_bind (new_bindings, success, match_files(nested_spec, files, bindings, data, data_linenum)); if (success) { bindings = new_bindings; some_match = t; if (success == t) { max_data = t; } else if (consp(success) && max_data != t) { cons_bind (new_data, new_line, success); if (gt(new_line, max_line)) { max_line = new_line; max_data = new_data; } } } else { all_match = nil; } } if (sym == all && !all_match) { yyerrorlf(2, spec_lineno, "all: some clauses didn't match"); return nil; } if (sym == some && !some_match) { yyerrorlf(2, spec_lineno, "some: no clauses matched"); return nil; } if (sym == none && some_match) { yyerrorlf(2, spec_lineno, "none: some clauses matched"); return nil; } /* No check for maybe, since it always succeeds. */ if (consp(max_data)) { data_lineno = c_num(max_line); data = max_data; } else if (max_data == t) { data = nil; } if ((spec = rest(spec)) == nil) break; goto repeat_spec_same_data; } else if (sym == collect) { obj_t *coll_spec = second(first_spec); obj_t *until_spec = third(first_spec); obj_t *bindings_coll = nil; obj_t *iter; uw_block_begin(nil, result); result = t; while (data) { cons_bind (new_bindings, success, match_files(coll_spec, files, bindings, data, num(data_lineno))); /* Until clause sees un-collated bindings from collect. */ if (until_spec) { cons_bind (discarded_bindings, success, match_files(until_spec, files, new_bindings, data, num(data_lineno))); if (success) { (void) discarded_bindings; break; } } if (success) { yyerrorlf(2, spec_lineno, "collect matched %s:%ld", c_str(first(files)), data_lineno); for (iter = new_bindings; iter && iter != bindings; iter = cdr(iter)) { obj_t *binding = car(iter); obj_t *existing = assoc(bindings_coll, car(binding)); bindings_coll = acons_new(bindings_coll, car(binding), cons(cdr(binding), cdr(existing))); } } if (success) { if (consp(success)) { yyerrorlf(2, spec_lineno, "collect advancing from line %ld to %ld", data_lineno, c_num(cdr(success))); data = car(success); data_lineno = c_num(cdr(success)); } else { yyerrorlf(2, spec_lineno, "collect consumed entire file"); data = nil; break; } } else { data = rest(data); data_lineno++; } } uw_block_end; if (!result) { yyerrorlf(2, spec_lineno, "collect explicitly failed"); return nil; } if (!bindings_coll) yyerrorlf(2, spec_lineno, "nothing was collected"); for (iter = bindings_coll; iter; iter = cdr(iter)) { obj_t *pair = car(iter); obj_t *rev = cons(car(pair), nreverse(cdr(pair))); bindings = cons(rev, bindings); } if ((spec = rest(spec)) == nil) break; goto repeat_spec_same_data; } else if (sym == flattn) { obj_t *iter; for (iter = rest(first_spec); iter; iter = rest(iter)) { obj_t *sym = first(iter); if (!symbolp(sym)) { yyerrorlf(1, spec_lineno, "non-symbol in flatten directive"); continue; } else { obj_t *existing = assoc(bindings, sym); if (existing) *cdr_l(existing) = flatten(cdr(existing)); } } if ((spec = rest(spec)) == nil) break; goto repeat_spec_same_data; } else if (sym == forget) { bindings = alist_remove(bindings, rest(first_spec)); if ((spec = rest(spec)) == nil) break; goto repeat_spec_same_data; } else if (sym == mrge) { obj_t *target = first(rest(first_spec)); obj_t *args = rest(rest(first_spec)); obj_t *exists = assoc(bindings, target); obj_t *merged = nil; if (!target || !symbolp(target)) yyerrorlf(1, spec_lineno, "bad merge directive"); if (exists) yyerrorlf(1, spec_lineno, "merge: symbol %s already bound", c_str(symbol_name(target))); for (; args; args = rest(args)) { obj_t *other_sym = first(args); if (other_sym) { obj_t *other_lookup = assoc(bindings, other_sym); if (!symbolp(other_sym)) yyerrorlf(1, spec_lineno, "non-symbol in merge directive"); else if (!other_lookup) yyerrorlf(1, spec_lineno, "merge: nonexistent symbol %s", c_str(symbol_name(sym))); if (merged) merged = weird_merge(merged, cdr(other_lookup)); else merged = cdr(other_lookup); } } bindings = acons_new(bindings, target, merged); if ((spec = rest(spec)) == nil) break; goto repeat_spec_same_data; } else if (sym == bind) { obj_t *args = rest(first_spec); obj_t *pattern = first(args); obj_t *var = second(args); obj_t *lookup = assoc(bindings, var); if (!var || !symbolp(var)) yyerrorlf(1, spec_lineno, "bind: bad variable spec"); else if (!lookup) yyerrorlf(1, spec_lineno, "bind: unbound source variable"); bindings = dest_bind(bindings, pattern, cdr(lookup)); if (bindings == t) return nil; if ((spec = rest(spec)) == nil) break; goto repeat_spec_same_data; } else if (sym == cat) { obj_t *iter; for (iter = rest(first_spec); iter; iter = rest(iter)) { obj_t *sym = first(iter); if (!symbolp(sym)) { yyerrorlf(1, spec_lineno, "non-symbol in cat directive"); continue; } else { obj_t *existing = assoc(bindings, sym); obj_t *sep = nil; if (rest(specline)) { obj_t *sub = subst_vars(rest(specline), bindings); sep = cat_str(sub, nil); } if (existing) *cdr_l(existing) = cat_str(flatten(cdr(existing)), sep); } } if ((spec = rest(spec)) == nil) break; goto repeat_spec_same_data; } else if (sym == output) { obj_t *specs = second(first_spec); obj_t *dest_opt = third(first_spec); obj_t *dest = dest_opt ? cat_str(subst_vars(dest_opt, bindings), nil) : string(chk_strdup("-")); fpip_t fp = (errno = 0, complex_open(dest, t)); yyerrorf(2, "opening data sink %s", c_str(dest)); if (complex_open_failed(fp)) { if (errno != 0) yyerrorf(2, "could not open %s: %s", c_str(dest), strerror(errno)); else yyerrorf(2, "could not open %s", c_str(dest)); } else { do_output(bindings, specs, fp.f); complex_close(fp); } if ((spec = rest(spec)) == nil) break; goto repeat_spec_same_data; } } if (dataline == nil) return nil; { cons_bind (new_bindings, success, match_line(bindings, specline, dataline, zero, spec_linenum, num(data_lineno), first(files))); if (nump(success) && c_num(success) < c_num(length_str(dataline))) { yyerrorf(2, "spec only matches line to position %ld: %s", c_num(success), c_str(dataline)); return nil; } if (!success) return nil; bindings = new_bindings; } } return cons(bindings, if3(data, cons(data, num(data_lineno)), t)); } int extract(obj_t *spec, obj_t *files, obj_t *predefined_bindings) { cons_bind (bindings, success, match_files(spec, files, predefined_bindings, nil, nil)); if (!output_produced) { if (!opt_nobindings) { if (bindings) { bindings = nreverse(bindings); dump_bindings(bindings); } } if (!success) puts("false"); } return success ? 0 : EXIT_FAILURE; }