diff options
Diffstat (limited to 'extract.y')
-rw-r--r-- | extract.y | 1620 |
1 files changed, 1620 insertions, 0 deletions
diff --git a/extract.y b/extract.y new file mode 100644 index 00000000..594b341e --- /dev/null +++ b/extract.y @@ -0,0 +1,1620 @@ +/* Copyright 2009 + * Kaz Kylheku <kkylheku@gmail.com> + * Vancouver, Canada + * All rights reserved. + * + * BSD License: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ + +%{ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include <assert.h> +#include <limits.h> +#include <errno.h> +#include <dirent.h> +#include <setjmp.h> +#include "lib.h" +#include "gc.h" +#include "unwind.h" +#include "regex.h" +#include "extract.h" + +int yylex(void); +void yyerror(const char *); + +obj_t *repeat_rep_helper(obj_t *sym, obj_t *main, obj_t *parts); + +static obj_t *parsed_spec; +static int output_produced; + +%} + +%union { + char *lexeme; + union obj *obj; + char chr; + long num; +} + +%token <lexeme> TEXT IDENT ALL SOME NONE MAYBE AND OR END COLLECT UNTIL COLL +%token <lexeme> OUTPUT REPEAT REP SINGLE FIRST LAST EMPTY +%token <num> NUMBER +%token <chr> REGCHAR + +%type <obj> spec clauses clause all_clause some_clause none_clause maybe_clause +%type <obj> collect_clause clause_parts additional_parts output_clause +%type <obj> line elems_opt elems elem var var_op list exprs expr +%type <obj> out_clauses out_clauses_opt out_clause +%type <obj> repeat_clause repeat_parts_opt o_line +%type <obj> o_elems_opt o_elems_opt2 o_elems o_elem rep_elem rep_parts_opt +%type <obj> regex regexpr regbranch +%type <obj> regterm regclass regclassterm regrange +%type <chr> regchar + +%nonassoc ALL SOME NONE MAYBE AND OR END COLLECT UNTIL COLL +%nonassoc OUTPUT REPEAT REP FIRST LAST EMPTY +%nonassoc '{' '}' '[' ']' '(' ')' +%right IDENT TEXT NUMBER +%left '|' '/' +%right '*' '?' '+' +%right '^' '.' '\\' REGCHAR + +%% + +spec : clauses { parsed_spec = $1; } + | { parsed_spec = nil; } + | error { parsed_spec = nil; + yybadtoken(yychar, 0); } + ; + +clauses : clause { $$ = cons($1, nil); } + | clause clauses { $$ = cons($1, $2); } + ; + +clause : all_clause { $$ = list(num(lineno - 1), $1, nao); } + | some_clause { $$ = list(num(lineno - 1), $1, nao); } + | none_clause { $$ = list(num(lineno - 1), $1, nao); } + | maybe_clause { $$ = list(num(lineno - 1), $1, nao); } + | collect_clause { $$ = list(num(lineno - 1), $1, nao); } + | output_clause { $$ = list(num(lineno - 1), $1, nao); } + | line { $$ = $1; } + | repeat_clause { $$ = nil; + yyerror("repeat outside of output"); } + ; + +all_clause : ALL newl clause_parts { $$ = cons(all, $3); } + | ALL newl error { $$ = nil; + yybadtoken(yychar, + "all clause"); } + | ALL newl END { $$ = nil; + yyerror("empty all clause"); } + + ; + +some_clause : SOME newl clause_parts { $$ = cons(some, $3); } + | SOME newl error { $$ = nil; + yybadtoken(yychar, + "some clause"); } + | SOME newl END { $$ = nil; + yyerror("empty some clause"); } + ; + +none_clause : NONE newl clause_parts { $$ = cons(none, $3); } + | NONE newl error { $$ = nil; + yybadtoken(yychar, + "none clause"); } + | NONE newl END { $$ = nil; + yyerror("empty none clause"); } + ; + +maybe_clause : MAYBE newl clause_parts { $$ = cons(maybe, $3); } + | MAYBE newl error { $$ = nil; + yybadtoken(yychar, + "maybe clause"); } + | MAYBE newl END { $$ = nil; + yyerror("empty maybe clause"); } + ; + +collect_clause : COLLECT newl clauses END newl { $$ = list(collect, $3, nao); } + | COLLECT newl clauses + UNTIL newl clauses END newl { $$ = list(collect, $3, + $6, nao); } + | COLLECT newl error { $$ = nil; + if (yychar == UNTIL || yychar == END) + yyerror("empty collect"); + else + yybadtoken(yychar, + "collect clause"); } + ; + +clause_parts : clauses additional_parts { $$ = cons($1, $2); } + ; + +additional_parts : END newl { $$ = nil; } + | AND newl clauses additional_parts { $$ = cons($3, $4); } + | OR newl clauses additional_parts { $$ = cons($3, $4); } + ; + +line : elems_opt '\n' { $$ = $1; } + ; + +elems_opt : elems { $$ = cons(num(lineno - 1), $1); } + | { $$ = nil; } + ; + +elems : elem { $$ = cons($1, nil); } + | elem elems { $$ = cons($1, $2); } + | rep_elem { $$ = nil; + yyerror("rep outside of output"); } + ; + +elem : TEXT { $$ = string($1); } + | var { $$ = $1; } + | list { $$ = $1; } + | regex { $$ = cons(regex_compile($1), $1); } + | COLL elems END { $$ = list(coll, $2, nao); } + | COLL elems + UNTIL elems END { $$ = list(coll, $2, $4, nao); } + | COLL error { $$ = nil; + yybadtoken(yychar, "coll clause"); } + ; + +output_clause : OUTPUT o_elems '\n' + out_clauses + END newl { $$ = list(output, $4, $2, nao); } + | OUTPUT newl + out_clauses + END newl { $$ = list(output, $3, nao); } + | OUTPUT o_elems '\n' + error { $$ = nil; + yybadtoken(yychar, "output clause"); } + | OUTPUT newl + error { $$ = nil; + yybadtoken(yychar, "output clause"); } + ; + +out_clauses : out_clause { $$ = cons($1, nil); } + | out_clause out_clauses { $$ = cons($1, $2); } + ; + +out_clause : repeat_clause { $$ = list(num(lineno - 1), $1, nao); } + | o_line { $$ = $1; } + | all_clause { $$ = nil; + yyerror("match clause in output"); } + | some_clause { $$ = nil; + yyerror("match clause in output"); } + | none_clause { $$ = nil; + yyerror("match clause in output"); } + | maybe_clause { $$ = nil; + yyerror("match clause in output"); } + | collect_clause { $$ = nil; + yyerror("match clause in output"); } + | output_clause { $$ = nil; + yyerror("match clause in output"); } + ; + +repeat_clause : REPEAT newl + out_clauses + repeat_parts_opt + END newl { $$ = repeat_rep_helper(repeat, $3, $4); } + | REPEAT newl + error { $$ = nil; + yybadtoken(yychar, "repeat clause"); } + ; + +repeat_parts_opt : SINGLE newl + out_clauses_opt + repeat_parts_opt { $$ = cons(cons(single, $3), $4); } + | FIRST newl + out_clauses_opt + repeat_parts_opt { $$ = cons(cons(frst, $3), $4); } + | LAST newl + out_clauses_opt + repeat_parts_opt { $$ = cons(cons(lst, $3), $4); } + | EMPTY newl + out_clauses_opt + repeat_parts_opt { $$ = cons(cons(empty, $3), $4); } + | /* empty */ { $$ = nil; } + ; + + +out_clauses_opt : out_clauses { $$ = $1; } + | /* empty */ { $$ = null_list; } + +o_line : o_elems_opt '\n' { $$ = $1; } + ; + +o_elems_opt : o_elems { $$ = cons(num(lineno - 1), $1); } + | { $$ = nil; } + ; + +o_elems_opt2 : o_elems { $$ = $1; } + | { $$ = null_list; } + ; + +o_elems : o_elem { $$ = cons($1, nil); } + | o_elem o_elems { $$ = cons($1, $2); } + ; + +o_elem : TEXT { $$ = string($1); } + | var { $$ = $1; } + | rep_elem { $$ = $1; } + ; + +rep_elem : REP o_elems + rep_parts_opt END { $$ = repeat_rep_helper(rep, $2, $3); } + | REP error { $$ = nil; yybadtoken(yychar, "rep clause"); } + ; + +rep_parts_opt : SINGLE o_elems_opt2 + rep_parts_opt { $$ = cons(cons(single, $2), $3); } + | FIRST o_elems_opt2 + rep_parts_opt { $$ = cons(cons(frst, $2), $3); } + | LAST o_elems_opt2 + rep_parts_opt { $$ = cons(cons(lst, $2), $3); } + | EMPTY o_elems_opt2 + rep_parts_opt { $$ = cons(cons(empty, $2), $3); } + | /* empty */ { $$ = nil; } + ; + + +/* This sucks, but factoring '*' into a nonterminal + * that generates an empty phrase causes reduce/reduce conflicts. + */ +var : IDENT { $$ = list(var, intern(string($1)), nao); } + | IDENT elem { $$ = list(var, intern(string($1)), $2, nao); } + | '{' IDENT '}' { $$ = list(var, intern(string($2)), nao); } + | '{' IDENT '}' elem { $$ = list(var, intern(string($2)), $4, nao); } + | '{' IDENT regex '}' { $$ = list(var, intern(string($2)), + nil, cons(regex_compile($3), $3), + nao); } + | '{' IDENT NUMBER '}' { $$ = list(var, intern(string($2)), + nil, num($3), nao); } + | var_op IDENT { $$ = list(var, intern(string($2)), + nil, $1, nao); } + | var_op IDENT elem { $$ = list(var, intern(string($2)), + $3, $1, nao); } + | var_op '{' IDENT '}' { $$ = list(var, intern(string($3)), + nil, $1, nao); } + | var_op '{' IDENT '}' elem { $$ = list(var, intern(string($3)), + $5, $1, nao); } + | IDENT error { $$ = nil; + yybadtoken(yychar, "variable spec"); } + | var_op error { $$ = nil; + yybadtoken(yychar, "variable spec"); } + ; + +var_op : '*' { $$ = t; } + ; + +list : '(' exprs ')' { $$ = $2; } + | '(' ')' { $$ = nil; } + | '(' error { $$ = nil; + yybadtoken(yychar, "list expression"); } + ; + +exprs : expr { $$ = cons($1, nil); } + | expr exprs { $$ = cons($1, $2); } + | expr '.' expr { $$ = cons($1, $3); } + ; + +expr : IDENT { $$ = intern(string($1)); } + | NUMBER { $$ = num($1); } + | list { $$ = $1; } + | regex { $$ = cons(regex_compile($1), $1); } + ; + +regex : '/' regexpr '/' { $$ = $2; } + | '/' '/' { $$ = nil; } + | '/' error { $$ = nil; + yybadtoken(yychar, "regex"); } + ; + +regexpr : regbranch { $$ = $1; } + | regbranch '|' regbranch { $$ = list(list(or, $1, + $3, nao), nao); } + ; + +regbranch : regterm { $$ = cons($1, nil); } + | regterm regbranch { $$ = cons($1, $2); } + ; + +regterm : '[' regclass ']' { $$ = cons(set, $2); } + | '[' '^' regclass ']' { $$ = cons(cset, $3); } + | '.' { $$ = wild; } + | '^' { $$ = chr('^'); } + | ']' { $$ = chr(']'); } + | '-' { $$ = chr('-'); } + | regterm '*' { $$ = list(zeroplus, $1, nao); } + | regterm '+' { $$ = list(oneplus, $1, nao); } + | regterm '?' { $$ = list(optional, $1, nao); } + | REGCHAR { $$ = chr($1); } + | '(' regexpr ')' { $$ = cons(compound, $2); } + | '(' error { $$ = nil; + yybadtoken(yychar, "regex subexpression"); } + | '[' error { $$ = nil; + yybadtoken(yychar, "regex character class"); } + ; + +regclass : regclassterm { $$ = cons($1, nil); } + | regclassterm regclass { $$ = cons($1, $2); } + ; + +regclassterm : regrange { $$ = $1; } + | regchar { $$ = chr($1); } + ; + +regrange : regchar '-' regchar { $$ = cons(chr($1), chr($3)); } + +regchar : '?' { $$ = '?'; } + | '.' { $$ = '.'; } + | '*' { $$ = '*'; } + | '+' { $$ = '+'; } + | '(' { $$ = '('; } + | ')' { $$ = ')'; } + | '^' { $$ = '^'; } + | '|' { $$ = '|'; } + | REGCHAR { $$ = $1; } + ; + +newl : '\n' + | error '\n' { yyerror("newline expected after directive"); + yyerrok; } + ; + +%% + +obj_t *repeat_rep_helper(obj_t *sym, obj_t *main, obj_t *parts) +{ + obj_t *single_parts = nil; + obj_t *first_parts = nil; + obj_t *last_parts = nil; + obj_t *empty_parts = nil; + obj_t *iter; + + for (iter = parts; iter != nil; iter = cdr(iter)) { + obj_t *part = car(iter); + obj_t *sym = car(part); + obj_t *clauses = cdr(part); + + if (sym == single) + single_parts = nappend2(single_parts, clauses); + else if (sym == frst) + first_parts = nappend2(first_parts, clauses); + else if (sym == lst) + last_parts = nappend2(last_parts, clauses); + else if (sym == empty) + empty_parts = nappend2(empty_parts, clauses); + else + abort(); + } + + return list(sym, main, single_parts, first_parts, + last_parts, empty_parts, nao); +} + +obj_t *get_spec(void) +{ + return parsed_spec; +} + +void dump_shell_string(const char *str) +{ + int ch; + + putchar('"'); + while ((ch = *str++) != 0) { + switch (ch) { + case '"': case '`': case '$': case '\\': case '\n': + putchar('\\'); + /* fallthrough */ + default: + putchar(ch); + } + } + putchar('"'); +} + +void dump_var(const char *name, char *pfx1, size_t len1, + char *pfx2, size_t len2, obj_t *value, int level) +{ + if (len1 >= 112 || len2 >= 112) + abort(); + + if (stringp(value)) { + fputs(name, stdout); + fputs(pfx1, stdout); + fputs(pfx2, stdout); + putchar('='); + dump_shell_string(c_str(value)); + putchar('\n'); + } else { + obj_t *iter; + int i; + size_t add1 = 0, add2 = 0; + + for (i = 0, iter = value; iter; iter = cdr(iter), i++) { + if (level < opt_arraydims) { + add2 = sprintf(pfx2 + len2, "[%d]", i); + add1 = 0; + } else { + add1 = sprintf(pfx1 + len1, "_%d", i); + add2 = 0; + } + + dump_var(name, pfx1, len1 + add1, pfx2, len2 + add2, car(iter), level + 1); + } + } +} + +void dump_bindings(obj_t *bindings) +{ + if (opt_loglevel >= 2) { + fputs("raw_bindings:\n", stderr); + dump(bindings, stderr); + } + + while (bindings) { + char pfx1[128], pfx2[128]; + obj_t *var = car(car(bindings)); + obj_t *value = cdr(car(bindings)); + const char *name = c_str(symbol_name(var)); + *pfx1 = 0; *pfx2 = 0; + dump_var(name, pfx1, 0, pfx2, 0, value, 0); + bindings = cdr(bindings); + } +} + +obj_t *depth(obj_t *obj) +{ + obj_t *dep = zero; + + if (obj == nil) + return one; + + if (atom(obj)) + return zero; + + while (obj) { + dep = max2(dep, depth(first(obj))); + obj = rest(obj); + } + + return plus(dep, one); +} + +obj_t *merge(obj_t *left, obj_t *right) +{ + obj_t *left_depth = depth(left); + obj_t *right_depth = depth(right); + + while (lt(left_depth, right_depth) || zerop(left_depth)) { + left = cons(left, nil); + left_depth = plus(left_depth, one); + } + + while (lt(right_depth, left_depth) || zerop(right_depth)) { + right = cons(right, nil); + right_depth = plus(right_depth, one); + } + + return append2(left, right); +} + +obj_t *dest_bind(obj_t *bindings, obj_t *pattern, obj_t *value) +{ + if (nullp(pattern)) + return bindings; + + if (symbolp(pattern)) { + obj_t *existing = assoc(bindings, pattern); + if (existing) { + if (tree_find(value, cdr(existing))) + return bindings; + if (tree_find(cdr(existing), value)) + return bindings; + yyerrorf(2, "bind variable mismatch: %s", c_str(symbol_name(pattern))); + return t; + } + return cons(cons(pattern, value), bindings); + } + + if (consp(pattern)) { + obj_t *piter = pattern, *viter = value; + + while (consp(piter) && consp(viter)) + { + bindings = dest_bind(bindings, car(piter), car(viter)); + if (bindings == t) + return t; + piter = cdr(piter); + viter = cdr(viter); + } while (consp(piter) && consp(viter)); + + if (symbolp(piter)) { + bindings = dest_bind(bindings, piter, viter); + if (bindings == t) + return t; + } + } + + return bindings; +} + +obj_t *match_line(obj_t *bindings, obj_t *specline, obj_t *dataline, + obj_t *pos, obj_t *spec_lineno, obj_t *data_lineno, + obj_t *file) +{ +#define LOG_MISMATCH(KIND) \ + yyerrorlf(2, c_num(spec_lineno), \ + "%s mismatch, position %ld (%s:%ld)", (KIND), c_num(pos), \ + c_str(file), c_num(data_lineno)); \ + yyerrorlf(2, c_num(spec_lineno), " %s", c_str(dataline)); \ + if (c_num(pos) < 77) \ + yyerrorlf(2, c_num(spec_lineno), " %*s^", (int) c_num(pos), "") + +#define LOG_MATCH(KIND, EXTENT) \ + yyerrorlf(2, c_num(spec_lineno), \ + "%s matched, position %ld-%ld (%s:%ld)", (KIND), \ + c_num(pos), c_num(EXTENT), c_str(file), \ + c_num(data_lineno)); \ + yyerrorlf(2, c_num(spec_lineno), " %s", c_str(dataline)); \ + if (c_num(EXTENT) < 77) \ + yyerrorlf(2, c_num(spec_lineno), " %*s%-*s^", (int) c_num(pos), \ + "", (int) (c_num(EXTENT) - c_num(pos)), "^") + for (;;) { + obj_t *elem; + + if (specline == nil) + break; + + elem = first(specline); + + switch (elem ? elem->t.type : 0) { + case CONS: /* directive */ + { + obj_t *directive = first(elem); + + if (directive == var) { + obj_t *sym = second(elem); + obj_t *pat = third(elem); + obj_t *modifier = fourth(elem); + obj_t *pair = assoc(bindings, sym); /* var exists already? */ + + if (pair) { + /* If the variable already has a binding, we replace + it with its value, and treat it as a string match. + The spec looks like ((var <sym> <pat>) ...) + and it must be transformed into + (<sym-substituted> <pat> ...) */ + if (pat) { + specline = cons(cdr(pair), cons(pat, rest(specline))); + } else if (nump(modifier)) { + obj_t *past = plus(pos, modifier); + + if (c_num(past) > c_num(length_str(dataline)) || + c_num(past) < c_num(pos)) + { + LOG_MISMATCH("fixed field size"); + return nil; + } + + if (!tree_find(trim_str(sub_str(dataline, pos, past)), + cdr(pair))) + { + LOG_MISMATCH("fixed field contents"); + return nil; + } + + LOG_MATCH("fixed field", past); + pos = past; + specline = cdr(specline); + } else { + specline = cons(cdr(pair), rest(specline)); + } + continue; + } else if (pat == nil) { /* match to end of line or with regex */ + if (consp(modifier)) { + obj_t *past = match_regex(dataline, car(modifier), pos); + if (nullp(past)) { + LOG_MISMATCH("var positive regex"); + return nil; + } + LOG_MATCH("var positive regex", past); + bindings = acons_new(bindings, sym, sub_str(dataline, pos, past)); + pos = past; + } else if (nump(modifier)) { + obj_t *past = plus(pos, modifier); + if (c_num(past) > c_num(length_str(dataline)) || + c_num(past) < c_num(pos)) + { + LOG_MISMATCH("count based var"); + return nil; + } + LOG_MATCH("count based var", past); + bindings = acons_new(bindings, sym, trim_str(sub_str(dataline, pos, past))); + pos = past; + } else { + bindings = acons_new(bindings, sym, sub_str(dataline, pos, nil)); + pos = length_str(dataline); + } + } else if (pat->t.type == STR) { + obj_t *find = search_str(dataline, pat, pos, modifier); + if (!find) { + LOG_MISMATCH("var delimiting string"); + return nil; + } + LOG_MATCH("var delimiting string", find); + bindings = acons_new(bindings, sym, sub_str(dataline, pos, find)); + pos = plus(find, length_str(pat)); + } else if (consp(pat) && typeof(first(pat)) == regex) { + obj_t *find = search_regex(dataline, first(pat), pos, modifier); + obj_t *fpos = car(find); + obj_t *flen = cdr(find); + if (!find) { + LOG_MISMATCH("var delimiting regex"); + return nil; + } + LOG_MATCH("var delimiting regex", fpos); + bindings = acons_new(bindings, sym, sub_str(dataline, pos, fpos)); + pos = plus(fpos, flen); + } else if (consp(pat) && first(pat) == var) { + /* Unbound var followed by var: the following one must be bound. */ + obj_t *second_sym = second(pat); + obj_t *next_pat = third(pat); + obj_t *pair = assoc(bindings, second_sym); /* var exists already? */ + + if (!pair) { + yyerrorlf(1, c_num(spec_lineno), "consecutive unbound variables"); + return nil; + } + + /* Re-generate a new spec with an edited version of + the element we just processed, and repeat. */ + { + obj_t *new_elem = list(var, sym, cdr(pair), modifier, nao); + + if (next_pat) + specline = cons(new_elem, cons(next_pat, rest(specline))); + else + specline = cons(new_elem, rest(specline)); + } + + continue; + } else if (consp(pat) && (consp(first(pat)) || stringp(first(pat)))) { + cons_bind (find, len, search_str(dataline, pat, pos, modifier)); + if (!find) { + LOG_MISMATCH("string"); + return nil; + } + bindings = acons_new(bindings, sym, sub_str(dataline, pos, find)); + pos = plus(find, len); + } else { + yyerrorlf(0, c_num(spec_lineno), "variable followed by invalid element"); + return nil; + } + } else if (typeof(directive) == regex) { + obj_t *past = match_regex(dataline, directive, pos); + if (nullp(past)) { + LOG_MISMATCH("regex"); + return nil; + } + LOG_MATCH("regex", past); + pos = past; + } else if (directive == coll) { + obj_t *coll_specline = second(elem); + obj_t *until_specline = third(elem); + obj_t *bindings_coll = nil; + obj_t *iter; + + for (;;) { + cons_bind (new_bindings, new_pos, + match_line(bindings, coll_specline, dataline, pos, + spec_lineno, data_lineno, file)); + + if (new_pos) { + LOG_MATCH("coll", new_pos); + + for (iter = new_bindings; iter && iter != bindings; + iter = cdr(iter)) + { + obj_t *binding = car(iter); + obj_t *existing = assoc(bindings_coll, car(binding)); + + bindings_coll = acons_new(bindings_coll, car(binding), + cons(cdr(binding), cdr(existing))); + } + } + + if (until_specline) { + cons_bind (until_bindings, until_pos, + match_line(bindings, until_specline, dataline, pos, + spec_lineno, data_lineno, file)); + + (void) until_bindings; + if (until_pos) { + /* The until specline matched. Special behavior: + We throw away its bindings, and run it again. + We run it again by incorporating it into the + surrouding specline, just behind the collect + item, which will be popped off. */ + LOG_MATCH("until", until_pos); + (void) new_bindings; + specline = cons(first(specline), + append2(until_specline, rest(specline))); + break; + } + LOG_MISMATCH("until"); + } + + if (new_pos && !equal(new_pos, pos)) { + pos = new_pos; + assert (c_num(pos) <= c_num(length_str(dataline))); + } else { + pos = plus(pos, one); + } + + if (c_num(pos) >= c_num(length_str(dataline))) + break; + } + + + if (!bindings_coll) + yyerrorlf(2, c_num(spec_lineno), "nothing was collected"); + + for (iter = bindings_coll; iter; iter = cdr(iter)) { + obj_t *pair = car(iter); + obj_t *rev = cons(car(pair), nreverse(cdr(pair))); + bindings = cons(rev, bindings); + } + } else if (consp(directive) || stringp(directive)) { + cons_bind (find, len, search_str_tree(dataline, elem, pos, nil)); + obj_t *newpos; + + if (find == nil || !equal(find, pos)) { + LOG_MISMATCH("string tree"); + return nil; + } + + newpos = plus(find, len); + LOG_MATCH("string tree", newpos); + pos = newpos; + } else { + yyerrorlf(0, c_num(spec_lineno), "unknown directive: %s", + c_str(symbol_name(directive))); + } + } + break; + case STR: + { + obj_t *find = search_str(dataline, elem, pos, nil); + obj_t *newpos; + if (find == nil || !equal(find, pos)) { + LOG_MISMATCH("string"); + return nil; + } + newpos = plus(find, length_str(elem)); + LOG_MATCH("string", newpos); + pos = newpos; + break; + } + default: + yyerrorlf(0, c_num(spec_lineno), "unsupported object in spec"); + } + + specline = cdr(specline); + } + + return cons(bindings, pos); +} + +obj_t *format_field(obj_t *string_or_list, obj_t *spec) +{ + if (!stringp(string_or_list)) + return string_or_list; + + { + obj_t *right = lt(spec, zero); + obj_t *width = if3(lt(spec, zero), neg(spec), spec); + obj_t *diff = minus(width, length_str(string_or_list)); + + if (le(diff, zero)) + return string_or_list; + + if (ge(length_str(string_or_list), width)) + return string_or_list; + + { + obj_t *padding = mkstring(diff, chr(' ')); + + return if3(right, + cat_str(list(padding, string_or_list, nao), nil), + cat_str(list(string_or_list, padding, nao), nil)); + } + } +} + +obj_t *subst_vars(obj_t *spec, obj_t *bindings) +{ + list_collect_decl(out, iter); + + while (spec) { + obj_t *elem = first(spec); + + if (consp(elem) && first(elem) == var) { + obj_t *sym = second(elem); + obj_t *pat = third(elem); + obj_t *modifier = fourth(elem); + obj_t *pair = assoc(bindings, sym); + + if (pair) { + if (pat) + spec = cons(cdr(pair), cons(pat, rest(spec))); + else if (nump(modifier)) + spec = cons(format_field(cdr(pair), modifier), rest(spec)); + else + spec = cons(cdr(pair), rest(spec)); + continue; + } + } + + list_collect(iter, elem); + spec = cdr(spec); + } + + return out; +} + +typedef struct fpip { + FILE *f; + DIR *d; + enum { fpip_fclose, fpip_pclose, fpip_closedir } close; +} fpip_t; + +fpip_t complex_open(obj_t *name, obj_t *output) +{ + fpip_t ret = { 0 }; + + const char *namestr = c_str(name); + long len = c_num(length_str(name)); + + if (len == 0) + return ret; + + if (!strcmp(namestr, "-")) { + ret.close = fpip_fclose; + ret.f = output ? stdout : stdin; + output_produced = output ? 1 : 0; + } else if (namestr[0] == '!') { + ret.close = fpip_pclose; + ret.f = popen(namestr+1, output ? "w" : "r"); + } else if (namestr[0] == '$') { + if (output) + return ret; + ret.close = fpip_closedir; + ret.d = opendir(namestr+1); + } else { + ret.close = fpip_fclose; + ret.f = fopen(namestr, output ? "w" : "r"); + } + + return ret; +} + +int complex_open_failed(fpip_t fp) +{ + return fp.f == 0 && fp.d == 0; +} + +void complex_close(fpip_t fp) +{ + if (fp.f == 0) + return; + switch (fp.close) { + case fpip_fclose: + if (fp.f != stdin && fp.f != stdout) + fclose(fp.f); + return; + case fpip_pclose: + pclose(fp.f); + return; + case fpip_closedir: + closedir(fp.d); + return; + } + + abort(); +} + +obj_t *complex_snarf(fpip_t fp, obj_t *name) +{ + switch (fp.close) { + case fpip_fclose: + return lazy_stream_cons(stdio_line_stream(fp.f, name)); + case fpip_pclose: + return lazy_stream_cons(pipe_line_stream(fp.f, name)); + case fpip_closedir: + return lazy_stream_cons(dirent_stream(fp.d, name)); + } + + abort(); +} + +obj_t *robust_length(obj_t *obj) +{ + if (obj == nil) + return zero; + if (atom(obj)) + return negone; + return length(obj); +} + +obj_t *bind_car(obj_t *bind_cons) +{ + return if3(consp(cdr(bind_cons)), + cons(car(bind_cons), car(cdr(bind_cons))), + bind_cons); +} + +obj_t *bind_cdr(obj_t *bind_cons) +{ + return if3(consp(cdr(bind_cons)), + cons(car(bind_cons), cdr(cdr(bind_cons))), + bind_cons); +} + +obj_t *extract_vars(obj_t *output_spec) +{ + list_collect_decl (vars, tai); + + if (consp(output_spec)) { + if (first(output_spec) == var) { + list_collect (tai, second(output_spec)); + } else { + for (; output_spec; output_spec = cdr(output_spec)) + list_collect_nconc(tai, extract_vars(car(output_spec))); + } + } + + return vars; +} + +obj_t *extract_bindings(obj_t *bindings, obj_t *output_spec) +{ + list_collect_decl (bindings_out, tail); + obj_t *var_list = extract_vars(output_spec); + + for (; bindings; bindings = cdr(bindings)) + if (memq(car(car(bindings)), var_list)) + list_collect(tail, car(bindings)); + + return bindings_out; +} + +void do_output_line(obj_t *bindings, obj_t *specline, + obj_t *spec_lineno, FILE *out) +{ + for (; specline; specline = rest(specline)) { + obj_t *elem = first(specline); + + switch (elem ? elem->t.type : 0) { + case CONS: + { + obj_t *directive = first(elem); + + if (directive == var) { + obj_t *str = cat_str(subst_vars(cons(elem, nil), bindings), nil); + if (str == nil) { + yyerrorlf(1, c_num(spec_lineno), "bad substitution: %s", + c_str(symbol_name(second(elem)))); + continue; + } + fputs(c_str(str), out); + } else if (directive == rep) { + obj_t *main_clauses = second(elem); + obj_t *single_clauses = third(elem); + obj_t *first_clauses = fourth(elem); + obj_t *last_clauses = fifth(elem); + obj_t *empty_clauses = sixth(elem); + obj_t *bind_cp = extract_bindings(bindings, elem); + obj_t *max_depth = reduce_left(func_n2(max2), + bind_cp, zero, + chain(list(func_n1(cdr), + func_n1(robust_length), + nao))); + + if (equal(max_depth, zero) && empty_clauses) { + do_output_line(bindings, empty_clauses, spec_lineno, out); + } else if (equal(max_depth, one) && single_clauses) { + obj_t *bind_a = mapcar(func_n1(bind_car), bind_cp); + do_output_line(bind_a, single_clauses, spec_lineno, out); + } else if (!zerop(max_depth)) { + long i; + + for (i = 0; i < c_num(max_depth); i++) { + obj_t *bind_a = mapcar(func_n1(bind_car), bind_cp); + obj_t *bind_d = mapcar(func_n1(bind_cdr), bind_cp); + + if (i == 0 && first_clauses) { + do_output_line(bind_a, first_clauses, spec_lineno, out); + } else if (i == c_num(max_depth) - 1 && last_clauses) { + do_output_line(bind_a, last_clauses, spec_lineno, out); + } else { + do_output_line(bind_a, main_clauses, spec_lineno, out); + } + + bind_cp = bind_d; + } + } + + } else { + yyerrorlf(0, c_num(spec_lineno), "unknown directive: %s", + c_str(symbol_name(directive))); + } + } + break; + case STR: + fputs(c_str(elem), out); + break; + case 0: + break; + default: + yyerrorlf(0, c_num(spec_lineno), "unsupported object in output spec"); + } + } +} + +void do_output(obj_t *bindings, obj_t *specs, FILE *out) +{ + if (equal(specs, null_list)) + return; + + for (; specs; specs = cdr(specs)) { + cons_bind (spec_lineno, specline, first(specs)); + obj_t *first_elem = first(specline); + + if (consp(first_elem)) { + obj_t *sym = first(first_elem); + + if (sym == repeat) { + obj_t *main_clauses = second(first_elem); + obj_t *single_clauses = third(first_elem); + obj_t *first_clauses = fourth(first_elem); + obj_t *last_clauses = fifth(first_elem); + obj_t *empty_clauses = sixth(first_elem); + obj_t *bind_cp = extract_bindings(bindings, first_elem); + obj_t *max_depth = reduce_left(func_n2(max2), + bind_cp, zero, + chain(list(func_n1(cdr), + func_n1(robust_length), + nao))); + + if (equal(max_depth, zero) && empty_clauses) { + do_output(bind_cp, empty_clauses, out); + } else if (equal(max_depth, one) && single_clauses) { + obj_t *bind_a = mapcar(func_n1(bind_car), bind_cp); + do_output(bind_a, single_clauses, out); + } else if (!zerop(max_depth)) { + long i; + + for (i = 0; i < c_num(max_depth); i++) { + obj_t *bind_a = mapcar(func_n1(bind_car), bind_cp); + obj_t *bind_d = mapcar(func_n1(bind_cdr), bind_cp); + + if (i == 0 && first_clauses) { + do_output(bind_a, first_clauses, out); + } else if (i == c_num(max_depth) - 1 && last_clauses) { + do_output(bind_a, last_clauses, out); + } else { + do_output(bind_a, main_clauses, out); + } + + bind_cp = bind_d; + } + } + continue; + } + } + + do_output_line(bindings, specline, spec_lineno, out); + putc('\n', out); + } +} + +obj_t *match_files(obj_t *spec, obj_t *files, + obj_t *bindings, obj_t *first_file_parsed, + obj_t *data_linenum) +{ + obj_t *data = nil; + long data_lineno = 0; + + if (first_file_parsed) { + data = first_file_parsed; + data_lineno = c_num(data_linenum); + first_file_parsed = nil; + } else if (files) { + obj_t *name = first(files); + fpip_t fp = (errno = 0, complex_open(name, nil)); + + yyerrorf(2, "opening data source %s", c_str(name)); + + if (complex_open_failed(fp)) { + if (errno != 0) + yyerrorf(2, "could not open %s: %s", c_str(name), strerror(errno)); + else + yyerrorf(2, "could not open %s", c_str(name)); + return nil; + } + + if ((data = complex_snarf(fp, name)) != nil) + data_lineno = 1; + } + + for (; spec; spec = rest(spec), data = rest(data), data_lineno++) +repeat_spec_same_data: + { + obj_t *specline = rest(first(spec)); + obj_t *dataline = first(data); + obj_t *spec_linenum = first(first(spec)); + obj_t *first_spec = first(specline); + long spec_lineno = spec_linenum ? c_num(spec_linenum) : 0; + + if (consp(first_spec)) { + obj_t *sym = first(first_spec); + + if (sym == skip) { + obj_t *max = first(rest(first_spec)); + long cmax = nump(max) ? c_num(max) : 0; + long reps = 0; + + if (rest(specline)) + yyerrorlf(1, spec_lineno, "material after skip directive ignored"); + + if ((spec = rest(spec)) == nil) + break; + + { + uw_block_begin(nil, result); + + while (dataline && (!max || reps++ < cmax)) { + cons_bind (new_bindings, success, + match_files(spec, files, bindings, + data, num(data_lineno))); + + if (success) { + yyerrorlf(2, spec_lineno, "skip matched %s:%ld", + c_str(first(files)), data_lineno); + result = cons(new_bindings, cons(data, num(data_lineno))); + break; + } + + yyerrorlf(2, spec_lineno, "skip didn't match %s:%ld", + c_str(first(files)), data_lineno); + data = rest(data); + data_lineno++; + dataline = first(data); + } + + uw_block_end; + + if (result) + return result; + } + + yyerrorlf(2, spec_lineno, "skip failed"); + return nil; + } else if (sym == block) { + obj_t *name = first(rest(first_spec)); + if (rest(specline)) + yyerrorlf(1, spec_lineno, "material after block directive ignored"); + if ((spec = rest(spec)) == nil) + break; + uw_block_begin(name, result); + result = match_files(spec, files, bindings, data, num(data_lineno)); + uw_block_end; + return result; + } else if (sym == fail || sym == accept) { + obj_t *target = first(rest(first_spec)); + + if (rest(specline)) + yyerrorlf(1, spec_lineno, "material after %s ignored", + c_str(symbol_name(sym))); + + uw_block_return(target, + if2(sym == accept, + cons(bindings, + if3(data, cons(data, num(data_lineno)), t)))); + if (target) + yyerrorlf(1, spec_lineno, "%s: no block named %s in scope", + c_str(symbol_name(sym)), c_str(symbol_name(target))); + else + yyerrorlf(1, spec_lineno, "%s: not anonymous block in scope", + c_str(symbol_name(sym))); + + return nil; + } else if (sym == next) { + if (rest(first_spec)) + yyerrorlf(0, spec_lineno, "next takes no args"); + + if ((spec = rest(spec)) == nil) + break; + + if (rest(specline)) { + obj_t *sub = subst_vars(rest(specline), bindings); + obj_t *str = cat_str(sub, nil); + if (str == nil) { + yyerrorlf(2, spec_lineno, "bad substitution in next file spec"); + continue; + } + files = cons(str, files); + } else { + files = rest(files); + } + + /* We recursively process the file list, but the new + data position we return to the caller must be in the + original file we we were called with. Hence, we can't + make a straight tail call here. */ + { + cons_bind (new_bindings, success, + match_files(spec, files, bindings, nil, nil)); + if (success) + return cons(new_bindings, + if3(data, cons(data, num(data_lineno)), t)); + return nil; + } + } else if (sym == some || sym == all || sym == none || sym == maybe) { + obj_t *specs; + obj_t *all_match = t; + obj_t *some_match = nil; + obj_t *max_line = zero; + obj_t *max_data = nil; + + for (specs = rest(first_spec); specs != nil; specs = rest(specs)) + { + obj_t *nested_spec = first(specs); + obj_t *data_linenum = num(data_lineno); + + cons_bind (new_bindings, success, + match_files(nested_spec, files, bindings, + data, data_linenum)); + + if (success) { + bindings = new_bindings; + some_match = t; + + if (success == t) { + max_data = t; + } else if (consp(success) && max_data != t) { + cons_bind (new_data, new_line, success); + if (gt(new_line, max_line)) { + max_line = new_line; + max_data = new_data; + } + } + } else { + all_match = nil; + } + } + + if (sym == all && !all_match) { + yyerrorlf(2, spec_lineno, "all: some clauses didn't match"); + return nil; + } + + if (sym == some && !some_match) { + yyerrorlf(2, spec_lineno, "some: no clauses matched"); + return nil; + } + + if (sym == none && some_match) { + yyerrorlf(2, spec_lineno, "none: some clauses matched"); + return nil; + } + + /* No check for maybe, since it always succeeds. */ + + if (consp(max_data)) { + data_lineno = c_num(max_line); + data = max_data; + } else if (max_data == t) { + data = nil; + } + + if ((spec = rest(spec)) == nil) + break; + + goto repeat_spec_same_data; + } else if (sym == collect) { + obj_t *coll_spec = second(first_spec); + obj_t *until_spec = third(first_spec); + obj_t *bindings_coll = nil; + obj_t *iter; + + uw_block_begin(nil, result); + + result = t; + + while (data) { + cons_bind (new_bindings, success, + match_files(coll_spec, files, bindings, + data, num(data_lineno))); + + if (success) { + yyerrorlf(2, spec_lineno, "collect matched %s:%ld", + c_str(first(files)), data_lineno); + + for (iter = new_bindings; iter && iter != bindings; + iter = cdr(iter)) + { + obj_t *binding = car(iter); + obj_t *existing = assoc(bindings_coll, car(binding)); + + bindings_coll = acons_new(bindings_coll, car(binding), + cons(cdr(binding), cdr(existing))); + } + } + + /* Until clause sees un-collated bindings from collect. */ + if (until_spec) + { + cons_bind (discarded_bindings, success, + match_files(until_spec, files, new_bindings, + data, num(data_lineno))); + + if (success) { + /* The until spec matched. Special behavior: + We throw away its bindings, and run it again. + We run it again by incorporating it into the + surrouding spec, just behind the topmost one. + When we bail out of this loop, the first(spec) + will be popped, exposing the until_spec, + and then the main loop is repeated. */ + (void) discarded_bindings; + spec = cons(first(spec), append2(until_spec, rest(spec))); + break; + } + } + + if (success) { + if (consp(success)) { + yyerrorlf(2, spec_lineno, + "collect advancing from line %ld to %ld", + data_lineno, c_num(cdr(success))); + data = car(success); + data_lineno = c_num(cdr(success)); + } else { + yyerrorlf(2, spec_lineno, "collect consumed entire file"); + data = nil; + break; + } + } else { + data = rest(data); + data_lineno++; + } + } + + uw_block_end; + + if (!result) { + yyerrorlf(2, spec_lineno, "collect explicitly failed"); + return nil; + } + + if (!bindings_coll) + yyerrorlf(2, spec_lineno, "nothing was collected"); + + for (iter = bindings_coll; iter; iter = cdr(iter)) { + obj_t *pair = car(iter); + obj_t *rev = cons(car(pair), nreverse(cdr(pair))); + bindings = cons(rev, bindings); + } + + if ((spec = rest(spec)) == nil) + break; + + goto repeat_spec_same_data; + } else if (sym == flattn) { + obj_t *iter; + + for (iter = rest(first_spec); iter; iter = rest(iter)) { + obj_t *sym = first(iter); + + if (!symbolp(sym)) { + yyerrorlf(1, spec_lineno, "non-symbol in flatten directive"); + continue; + } else { + obj_t *existing = assoc(bindings, sym); + + if (existing) + *cdr_l(existing) = flatten(cdr(existing)); + } + } + + if ((spec = rest(spec)) == nil) + break; + + goto repeat_spec_same_data; + } else if (sym == forget) { + bindings = alist_remove(bindings, rest(first_spec)); + + if ((spec = rest(spec)) == nil) + break; + + goto repeat_spec_same_data; + } else if (sym == mrge) { + obj_t *target = first(rest(first_spec)); + obj_t *args = rest(rest(first_spec)); + obj_t *exists = assoc(bindings, target); + obj_t *merged = nil; + + if (!target || !symbolp(target)) + yyerrorlf(1, spec_lineno, "bad merge directive"); + + if (exists) + yyerrorlf(1, spec_lineno, "merge: symbol %s already bound", + c_str(symbol_name(target))); + + for (; args; args = rest(args)) { + obj_t *other_sym = first(args); + + if (other_sym) { + obj_t *other_lookup = assoc(bindings, other_sym); + + if (!symbolp(other_sym)) + yyerrorlf(1, spec_lineno, "non-symbol in merge directive"); + else if (!other_lookup) + yyerrorlf(1, spec_lineno, "merge: nonexistent symbol %s", + c_str(symbol_name(sym))); + + if (merged) + merged = merge(merged, cdr(other_lookup)); + else + merged = cdr(other_lookup); + } + } + + bindings = acons_new(bindings, target, merged); + + if ((spec = rest(spec)) == nil) + break; + + goto repeat_spec_same_data; + } else if (sym == bind) { + obj_t *args = rest(first_spec); + obj_t *pattern = first(args); + obj_t *var = second(args); + obj_t *lookup = assoc(bindings, var); + + if (!var || !symbolp(var)) + yyerrorlf(1, spec_lineno, "bind: bad variable spec"); + else if (!lookup) + yyerrorlf(1, spec_lineno, "bind: unbound source variable"); + + bindings = dest_bind(bindings, pattern, cdr(lookup)); + + if (bindings == t) + return nil; + + if ((spec = rest(spec)) == nil) + break; + + goto repeat_spec_same_data; + } else if (sym == cat) { + obj_t *iter; + + for (iter = rest(first_spec); iter; iter = rest(iter)) { + obj_t *sym = first(iter); + + if (!symbolp(sym)) { + yyerrorlf(1, spec_lineno, "non-symbol in cat directive"); + continue; + } else { + obj_t *existing = assoc(bindings, sym); + obj_t *sep = nil; + + if (rest(specline)) { + obj_t *sub = subst_vars(rest(specline), bindings); + sep = cat_str(sub, nil); + } + + if (existing) + *cdr_l(existing) = cat_str(flatten(cdr(existing)), sep); + } + } + + if ((spec = rest(spec)) == nil) + break; + + goto repeat_spec_same_data; + } else if (sym == output) { + obj_t *specs = second(first_spec); + obj_t *dest_opt = third(first_spec); + obj_t *dest = dest_opt ? cat_str(subst_vars(dest_opt, bindings), nil) + : string(chk_strdup("-")); + fpip_t fp = (errno = 0, complex_open(dest, t)); + + yyerrorf(2, "opening data sink %s", c_str(dest)); + + if (complex_open_failed(fp)) { + if (errno != 0) + yyerrorf(2, "could not open %s: %s", c_str(dest), strerror(errno)); + else + yyerrorf(2, "could not open %s", c_str(dest)); + } else { + do_output(bindings, specs, fp.f); + complex_close(fp); + } + + if ((spec = rest(spec)) == nil) + break; + + goto repeat_spec_same_data; + } + } + + if (dataline == nil) + return nil; + + { + cons_bind (new_bindings, success, + match_line(bindings, specline, dataline, zero, + spec_linenum, num(data_lineno), first(files))); + + if (nump(success) && c_num(success) < c_num(length_str(dataline))) { + yyerrorf(2, "spec only matches line to position %ld: %s", + c_num(success), c_str(dataline)); + return nil; + } + + if (!success) + return nil; + + bindings = new_bindings; + } + } + + return cons(bindings, if3(data, cons(data, num(data_lineno)), t)); +} + +int extract(obj_t *spec, obj_t *files, obj_t *predefined_bindings) +{ + cons_bind (bindings, success, match_files(spec, files, predefined_bindings, + nil, nil)); + + if (!output_produced) { + if (!opt_nobindings) { + if (bindings) { + bindings = nreverse(bindings); + dump_bindings(bindings); + } + } + + if (!success) + puts("false"); + } + + return success ? 0 : EXIT_FAILURE; +} |