diff options
author | Kaz Kylheku <kaz@kylheku.com> | 2013-10-05 21:26:09 -0700 |
---|---|---|
committer | Kaz Kylheku <kaz@kylheku.com> | 2013-10-05 21:26:09 -0700 |
commit | e39dea5833abe29b7f6b9ba5d55f93b553a7cded (patch) | |
tree | 5e87f46369e075e8a47c8a5b97958e6cf934db63 | |
parent | e022ebd1f2b414837b60f434e6db26e2c999207a (diff) | |
download | hc-e39dea5833abe29b7f6b9ba5d55f93b553a7cded.tar.gz hc-e39dea5833abe29b7f6b9ba5d55f93b553a7cded.tar.bz2 hc-e39dea5833abe29b7f6b9ba5d55f93b553a7cded.zip |
Attribute filtering implemented.
-rw-r--r-- | Makefile | 8 | ||||
-rw-r--r-- | hc.c | 83 | ||||
-rw-r--r-- | hc.h | 4 | ||||
-rw-r--r-- | hc.l | 15 |
4 files changed, 98 insertions, 12 deletions
@@ -1,11 +1,15 @@ CFLAGS := -g -Wall -W -ansi -D_XOPEN_SOURCE=500 $(EXTRA_CFLAGS) +.SUFFIXES: + hc: lex.yy.o hc.o - $(CC) $(CFLAGS) $(OUR_CFLAGS) $^ -o $@ -lfl + $(CC) $(CFLAGS) $^ -o $@ -lfl lex.yy.o: lex.yy.c hc.h + $(CC) $(CFLAGS) $< -c hc.o: hc.c hc.h wl.h + $(CC) $(CFLAGS) $< -c lex.yy.c: hc.l hc.h $(LEX) -i -8 hc.l @@ -14,4 +18,4 @@ wl.h: wl.txr wl txr wl.txr > $@ clean: - -rm hc lex.yy.o lex.yy.c + -rm hc hc.o lex.yy.o lex.yy.c @@ -4,7 +4,6 @@ #include "hc.h" #include "wl.h" - static allowed_el_t *allowed_el[tok_max]; static const token_t blank; @@ -18,7 +17,7 @@ static void bail() static token_t mktok(toktype_t type, char *text) { - token_t tok = { 0, 0, 0, 0 }; + token_t tok = { 0, 0, 0, 0, 0 }; tok.type = type; tok.lexeme = strdup(text); return tok; @@ -38,7 +37,12 @@ static token_t gettok(void) { if (null(pushback)) { int type = yylex(); - return mktok(type, yytext); + token_t tok = mktok(type, yytext); + if (type >= tok_el_unknown && type < tok_at_unknown) + tok.is_el = 1; + if (type >= tok_at_unknown && type < tok_max) + tok.is_at = 1; + return tok; } else { token_t tok = pushback; pushback = blank; @@ -102,6 +106,77 @@ static token_t printuntil(int type) return tok; } +static int allowed_attr(token_t el, token_t at) +{ + allowed_el_t *ael = allowed_el[el.type]; + int i; + + if (!ael || !ael->attr) + return 0; + + for (i = 0; ael->attr[i] != tok_eof; i++) + if (ael->attr[i] == at.type) + return 1; + + return 0; +} + +static void parse_attr(token_t el) +{ + for (;;) { + token_t ws0 = optmatch(tok_wsp); + token_t end = optmatch('/'); + token_t close = optmatch('>'); + + if (!null(end) && null(close)) + bail(); + + if (!null(close)) { + deltok(ws0); + deltok(printtok(end)); + deltok(printtok(close)); + break; + } + + if (null(ws0)) { + bail(); + } else { + token_t at = gettok(); + token_t equal = optmatch('='); + int allowed = allowed_attr(el, at); + + if (!at.is_at) + bail(); + + if (allowed) { + printtok(ws0); + printtok(at); + } + + if (!null(equal)) { + token_t val = gettok(); + + if (!val.is_el && !val.is_at && + val.type != tok_text && val.type != tok_wsp) + bail(); + + if (allowed) { + printtok(equal); + printtok(val); + } + deltok(val); + } + + deltok(equal); + deltok(at); + } + + deltok(ws0); + deltok(end); + deltok(close); + } +} + static void parse_element(token_t in) { token_t end = optmatch('/'); @@ -126,7 +201,7 @@ static void parse_element(token_t in) printtok(in); printtok(end); printtok(name); - deltok(printuntil('>')); + parse_attr(name); } else { deltok(lookfor('>')); } @@ -3,6 +3,7 @@ typedef enum { tok_eof = 0, tok_doctype = 256, tok_text, + tok_wsp, tok_el_unknown, tok_el_a, tok_el_abbr, @@ -219,7 +220,8 @@ typedef enum { typedef struct { int type; - int is_tag; + int is_el; + int is_at; int is_close; char *lexeme; } token_t; @@ -10,10 +10,7 @@ wsp [ \t\n\r\v\t] notwsp [^ \t\n\r\v\t] -ctrl [\x0-\x1f] -notctrl [^\x0-\x1f] -special ["'<>/=&] -notspecial [^"'<>/=&] +notspecial [^"'<>/=& \t\n\r\v\t] elname [A-Za-z0-9]+ attrname [^"'<>/=&\x0-\x1f\t\n\r\v\t ] endnm [^A-Za-z_\-0-9] @@ -23,7 +20,9 @@ endnm [^A-Za-z_\-0-9] [<] { BEGIN(ELM); return '<'; } +{wsp}+ { return tok_wsp; } {notspecial}+ { return tok_text; } +"<!--".*"-->" { return tok_text; } <ELM>a/{endnm} { BEGIN(ATT); return tok_el_a; } <ELM>abbr/{endnm} { BEGIN(ATT); return tok_el_abbr; } <ELM>acronym/{endnm} { BEGIN(ATT); return tok_el_acronym; } @@ -114,6 +113,8 @@ endnm [^A-Za-z_\-0-9] <ELM>ul/{endnm} { BEGIN(ATT); return tok_el_ul; } <ELM>var/{endnm} { BEGIN(ATT); return tok_el_var; } <ELM>{elname} { BEGIN(ATT); return tok_el_unknown; } +<ELM>{wsp}+ { return tok_wsp; } +<ELM>{notspecial}+ { return tok_text; } <ELM>. { return yytext[0]; } <ATT>accept/{endnm} { return tok_at_accept; } @@ -235,9 +236,13 @@ endnm [^A-Za-z_\-0-9] <ATT>vlink/{endnm} { return tok_at_vlink; } <ATT>vspace/{endnm} { return tok_at_vspace; } <ATT>width/{endnm} { return tok_at_width; } -<ATT>{attrname} { return tok_at_unknown; } +<ATT>{attrname}+ { return tok_at_unknown; } <ATT>[>] { BEGIN(INITIAL); return yytext[0]; } +<ATT>{wsp}+ { return tok_wsp; } +<ATT>{notspecial}+ { return tok_text; } +<ATT>\"[^\"]*\" { return tok_text; } +<ATT>'[^']*' { return tok_text; } <ATT>. { return yytext[0]; } %% |