diff options
author | Kaz Kylheku <kaz@kylheku.com> | 2009-11-11 08:54:21 -0800 |
---|---|---|
committer | Kaz Kylheku <kaz@kylheku.com> | 2009-11-11 08:54:21 -0800 |
commit | d59d8950ec58702821ec618b92dfb2490ae0bf31 (patch) | |
tree | e27e2914d563171ad56c2f7ae30c7c49343df06c /parser.l | |
parent | 2f62f352f603b837a5cf032c257531052530c410 (diff) | |
download | txr-d59d8950ec58702821ec618b92dfb2490ae0bf31.tar.gz txr-d59d8950ec58702821ec618b92dfb2490ae0bf31.tar.bz2 txr-d59d8950ec58702821ec618b92dfb2490ae0bf31.zip |
Big conversion to wide characters and UTF-8 support.
This is incomplete. There are too many dependencies on
wide-character support from the C stream I/O library,
and on the implicit use of some encoding which may not be UTF-8.
The regex code does not handle wide characters properly.
Character type is still int in some places, rather than wchar_t.
The test suite passes, though.
Diffstat (limited to 'parser.l')
-rw-r--r-- | parser.l | 84 |
1 files changed, 50 insertions, 34 deletions
@@ -33,10 +33,12 @@ #include <limits.h> #include <errno.h> #include <dirent.h> +#include <wchar.h> #include "y.tab.h" #include "lib.h" #include "gc.h" #include "stream.h" +#include "utf8.h" #include "parser.h" #define YY_NO_UNPUT @@ -73,7 +75,7 @@ void yyerrorf(const char *s, ...) if (opt_loglevel >= 1) { va_list vl; va_start (vl, s); - fprintf(stderr, "%s: (%s:%ld): ", progname, spec_file, lineno); + fprintf(stderr, "%ls: (%ls:%ld): ", progname, spec_file, lineno); vfprintf(stderr, s, vl); putc('\n', stderr); va_end (vl); @@ -127,33 +129,33 @@ void yybadtoken(int tok, const char *context) yyerrorf("unexpected end of input"); } -static int char_esc(int letter) +static wchar_t char_esc(int letter) { switch (letter) { - case 'a': return '\a'; - case 'b': return '\b'; - case 't': return '\t'; - case 'n': return '\n'; - case 'v': return '\v'; - case 'f': return '\f'; - case 'r': return '\r'; + case 'a': return L'\a'; + case 'b': return L'\b'; + case 't': return L'\t'; + case 'n': return L'\n'; + case 'v': return L'\v'; + case 'f': return L'\f'; + case 'r': return L'\r'; case 'e': return 27; - case '"': return '"'; - case '\'': return '\''; - case '`': return '`'; + case '"': return L'"'; + case '\'': return L'\''; + case '`': return L'`'; } abort(); } -static int num_esc(char *num) +static wchar_t num_esc(char *num) { if (num[0] == 'x') { - if (strlen(num) > 3) + if (strlen(num) > 7) yyerror("too many digits in hex character escape"); return strtol(num + 1, 0, 16); } else { - if (strlen(num) > 3) + if (strlen(num) > 8) yyerror("too many digits in octal character escape"); return strtol(num, 0, 8); } @@ -170,6 +172,17 @@ WS [\t ]* HEX [0-9A-Fa-f] OCT [0-7] +ASC [\x00-\x7f] +ASCN [\x00-\t\v-\x7f] +U [\x80-\xbf] +U2 [\xc2-\xdf] +U3 [\xe0-\xef] +U4 [\xf0-\xf4] + +UANY {ASC}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} +UANYN {ASCN}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} +UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} + %x SPECIAL NESTED REGEX REGCLASS STRLIT CHRLIT QSILIT %% @@ -188,7 +201,7 @@ OCT 
[0-7] if (*errp != 0) { /* not a number */ - yylval.lexeme = strdup(yytext); + yylval.lexeme = utf8_dup_from(yytext); return IDENT; } @@ -346,7 +359,7 @@ OCT [0-7] <SPECIAL>@ { yy_pop_state(); - yylval.lexeme = strdup("@"); + yylval.lexeme = wcsdup(L"@"); return TEXT; } @@ -365,26 +378,25 @@ OCT [0-7] } <SPECIAL>[\\][abtnvfre] { - char lexeme[2]; + wchar_t lexeme[2]; lexeme[0] = char_esc(yytext[1]); lexeme[1] = 0; - yylval.lexeme = strdup(lexeme); + yylval.lexeme = wcsdup(lexeme); yy_pop_state(); return TEXT; } <SPECIAL>[\\](x{HEX}+|{OCT}+) { - char lexeme[2]; + wchar_t lexeme[2]; lexeme[0] = num_esc(yytext + 1); lexeme[1] = 0; - yylval.lexeme = strdup(lexeme); + yylval.lexeme = wcsdup(lexeme); yy_pop_state(); return TEXT; } -<SPECIAL,NESTED>. { - yyerrorf("bad character in directive: '%c'", - yytext[0]); +<SPECIAL,NESTED>{UANYN} { + yyerrorf("bad character in directive: '%s'", yytext); } <REGEX>[/] { @@ -433,15 +445,17 @@ OCT [0-7] return REGCHAR; } -<REGEX>. { - yylval.chr = yytext[0]; +<REGEX>{UANYN} { + wchar_t buf[8]; + utf8_from(buf, yytext); + yylval.chr = buf[0]; return REGCHAR; } -<INITIAL>[^@\n]+ { - yylval.lexeme = strdup(yytext); - return TEXT; - } +<INITIAL>({UONLY}|[^@\n])+ { + yylval.lexeme = utf8_dup_from(yytext); + return TEXT; + } <INITIAL>\n { lineno++; @@ -515,9 +529,11 @@ OCT [0-7] yy_push_state(SPECIAL); } -<STRLIT,CHRLIT,QSILIT>. { - yylval.chr = yytext[0]; - return LITCHAR; - } +<STRLIT,CHRLIT,QSILIT>{UANYN} { + wchar_t buf[8]; + utf8_from(buf, yytext); + yylval.chr = buf[0]; + return LITCHAR; + } %% |