summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Kaz Kylheku <kaz@kylheku.com> 2013-10-05 21:26:09 -0700
committer: Kaz Kylheku <kaz@kylheku.com> 2013-10-05 21:26:09 -0700
commit: e39dea5833abe29b7f6b9ba5d55f93b553a7cded (patch)
tree: 5e87f46369e075e8a47c8a5b97958e6cf934db63
parent: e022ebd1f2b414837b60f434e6db26e2c999207a (diff)
download: hc-e39dea5833abe29b7f6b9ba5d55f93b553a7cded.tar.gz
          hc-e39dea5833abe29b7f6b9ba5d55f93b553a7cded.tar.bz2
          hc-e39dea5833abe29b7f6b9ba5d55f93b553a7cded.zip
Attribute filtering implemented.
-rw-r--r--  Makefile   8
-rw-r--r--  hc.c      83
-rw-r--r--  hc.h       4
-rw-r--r--  hc.l      15
4 files changed, 98 insertions, 12 deletions
diff --git a/Makefile b/Makefile
index 9a1cda8..f9e1cc5 100644
--- a/Makefile
+++ b/Makefile
@@ -1,11 +1,15 @@
CFLAGS := -g -Wall -W -ansi -D_XOPEN_SOURCE=500 $(EXTRA_CFLAGS)
+.SUFFIXES:
+
hc: lex.yy.o hc.o
- $(CC) $(CFLAGS) $(OUR_CFLAGS) $^ -o $@ -lfl
+ $(CC) $(CFLAGS) $^ -o $@ -lfl
lex.yy.o: lex.yy.c hc.h
+ $(CC) $(CFLAGS) $< -c
hc.o: hc.c hc.h wl.h
+ $(CC) $(CFLAGS) $< -c
lex.yy.c: hc.l hc.h
$(LEX) -i -8 hc.l
@@ -14,4 +18,4 @@ wl.h: wl.txr wl
txr wl.txr > $@
clean:
- -rm hc lex.yy.o lex.yy.c
+ -rm hc hc.o lex.yy.o lex.yy.c
diff --git a/hc.c b/hc.c
index 3efb28a..1d85315 100644
--- a/hc.c
+++ b/hc.c
@@ -4,7 +4,6 @@
#include "hc.h"
#include "wl.h"
-
static allowed_el_t *allowed_el[tok_max];
static const token_t blank;
@@ -18,7 +17,7 @@ static void bail()
static token_t mktok(toktype_t type, char *text)
{
- token_t tok = { 0, 0, 0, 0 };
+ token_t tok = { 0, 0, 0, 0, 0 };
tok.type = type;
tok.lexeme = strdup(text);
return tok;
@@ -38,7 +37,12 @@ static token_t gettok(void)
{
if (null(pushback)) {
int type = yylex();
- return mktok(type, yytext);
+ token_t tok = mktok(type, yytext);
+ if (type >= tok_el_unknown && type < tok_at_unknown)
+ tok.is_el = 1;
+ if (type >= tok_at_unknown && type < tok_max)
+ tok.is_at = 1;
+ return tok;
} else {
token_t tok = pushback;
pushback = blank;
@@ -102,6 +106,77 @@ static token_t printuntil(int type)
return tok;
}
+static int allowed_attr(token_t el, token_t at) /* 1 if at.type appears in the attribute list of allowed_el[el.type], else 0 */
+{
+ allowed_el_t *ael = allowed_el[el.type];
+ int i;
+ /* No table entry for this element type, or an entry with no attribute list: report not allowed. */
+ if (!ael || !ael->attr)
+ return 0;
+ /* Scan the list up to its tok_eof terminator, comparing each entry with at.type. */
+ for (i = 0; ael->attr[i] != tok_eof; i++)
+ if (ael->attr[i] == at.type)
+ return 1;
+ /* Terminator reached without a match. */
+ return 0;
+}
+
+static void parse_attr(token_t el) /* process the tokens after an element name up to the closing '>', printing only attributes that allowed_attr() accepts for el */
+{
+ for (;;) {
+ token_t ws0 = optmatch(tok_wsp);
+ token_t end = optmatch('/');
+ token_t close = optmatch('>');
+ /* A '/' that is not immediately followed by '>' is rejected. */
+ if (!null(end) && null(close))
+ bail();
+ /* '>' was matched: ws0 goes to deltok without being printed; end and close go through printtok and then deltok; the loop ends. */
+ if (!null(close)) {
+ deltok(ws0);
+ deltok(printtok(end));
+ deltok(printtok(close));
+ break;
+ }
+ /* Not at the end of the tag: the next attribute must have been preceded by a tok_wsp token. */
+ if (null(ws0)) {
+ bail();
+ } else {
+ token_t at = gettok();
+ token_t equal = optmatch('=');
+ int allowed = allowed_attr(el, at);
+ /* Reject any token that gettok() did not flag as an attribute name; note that allowed was already computed above. */
+ if (!at.is_at)
+ bail();
+ /* The leading whitespace and the attribute name are printed only when the attribute is allowed. */
+ if (allowed) {
+ printtok(ws0);
+ printtok(at);
+ }
+ /* An '=' was matched, so the following token is taken as the attribute value. */
+ if (!null(equal)) {
+ token_t val = gettok();
+ /* The value must be flagged is_el or is_at, or have type tok_text or tok_wsp. */
+ if (!val.is_el && !val.is_at &&
+ val.type != tok_text && val.type != tok_wsp)
+ bail();
+ /* '=' and the value are printed only for an allowed attribute; the value goes to deltok either way. */
+ if (allowed) {
+ printtok(equal);
+ printtok(val);
+ }
+ deltok(val);
+ }
+ /* Done with this attribute's '=' and name tokens. */
+ deltok(equal);
+ deltok(at);
+ }
+ /* Hand this iteration's ws0, end and close tokens to deltok before looping again. */
+ deltok(ws0);
+ deltok(end);
+ deltok(close);
+ }
+}
+
static void parse_element(token_t in)
{
token_t end = optmatch('/');
@@ -126,7 +201,7 @@ static void parse_element(token_t in)
printtok(in);
printtok(end);
printtok(name);
- deltok(printuntil('>'));
+ parse_attr(name);
} else {
deltok(lookfor('>'));
}
diff --git a/hc.h b/hc.h
index 60e9dea..7a9eecf 100644
--- a/hc.h
+++ b/hc.h
@@ -3,6 +3,7 @@ typedef enum {
tok_eof = 0,
tok_doctype = 256,
tok_text,
+ tok_wsp,
tok_el_unknown,
tok_el_a,
tok_el_abbr,
@@ -219,7 +220,8 @@ typedef enum {
typedef struct {
int type;
- int is_tag;
+ int is_el;
+ int is_at;
int is_close;
char *lexeme;
} token_t;
diff --git a/hc.l b/hc.l
index 8d16781..56fd88e 100644
--- a/hc.l
+++ b/hc.l
@@ -10,10 +10,7 @@
wsp [ \t\n\r\v\t]
notwsp [^ \t\n\r\v\t]
-ctrl [\x0-\x1f]
-notctrl [^\x0-\x1f]
-special ["'<>/=&]
-notspecial [^"'<>/=&]
+notspecial [^"'<>/=& \t\n\r\v\t]
elname [A-Za-z0-9]+
attrname [^"'<>/=&\x0-\x1f\t\n\r\v\t ]
endnm [^A-Za-z_\-0-9]
@@ -23,7 +20,9 @@ endnm [^A-Za-z_\-0-9]
[<] { BEGIN(ELM);
return '<'; }
+{wsp}+ { return tok_wsp; }
{notspecial}+ { return tok_text; }
+"<!--".*"-->" { return tok_text; }
<ELM>a/{endnm} { BEGIN(ATT); return tok_el_a; }
<ELM>abbr/{endnm} { BEGIN(ATT); return tok_el_abbr; }
<ELM>acronym/{endnm} { BEGIN(ATT); return tok_el_acronym; }
@@ -114,6 +113,8 @@ endnm [^A-Za-z_\-0-9]
<ELM>ul/{endnm} { BEGIN(ATT); return tok_el_ul; }
<ELM>var/{endnm} { BEGIN(ATT); return tok_el_var; }
<ELM>{elname} { BEGIN(ATT); return tok_el_unknown; }
+<ELM>{wsp}+ { return tok_wsp; }
+<ELM>{notspecial}+ { return tok_text; }
<ELM>. { return yytext[0]; }
<ATT>accept/{endnm} { return tok_at_accept; }
@@ -235,9 +236,13 @@ endnm [^A-Za-z_\-0-9]
<ATT>vlink/{endnm} { return tok_at_vlink; }
<ATT>vspace/{endnm} { return tok_at_vspace; }
<ATT>width/{endnm} { return tok_at_width; }
-<ATT>{attrname} { return tok_at_unknown; }
+<ATT>{attrname}+ { return tok_at_unknown; }
<ATT>[>] { BEGIN(INITIAL); return yytext[0]; }
+<ATT>{wsp}+ { return tok_wsp; }
+<ATT>{notspecial}+ { return tok_text; }
+<ATT>\"[^\"]*\" { return tok_text; }
+<ATT>'[^']*' { return tok_text; }
<ATT>. { return yytext[0]; }
%%