summary refs log tree commit diff stats
path: root/parser.l
diff options
context:
space:
mode:
author Kaz Kylheku <kaz@kylheku.com> 2009-11-11 08:54:21 -0800
committer Kaz Kylheku <kaz@kylheku.com> 2009-11-11 08:54:21 -0800
commit d59d8950ec58702821ec618b92dfb2490ae0bf31 (patch)
tree e27e2914d563171ad56c2f7ae30c7c49343df06c /parser.l
parent 2f62f352f603b837a5cf032c257531052530c410 (diff)
download txr-d59d8950ec58702821ec618b92dfb2490ae0bf31.tar.gz
txr-d59d8950ec58702821ec618b92dfb2490ae0bf31.tar.bz2
txr-d59d8950ec58702821ec618b92dfb2490ae0bf31.zip
Big conversion to wide characters and UTF-8 support.
This is incomplete. There are too many dependencies on wide character support from the C stream I/O library, and implicit use of some encoding which may not be UTF-8. The regex code does not handle wide characters properly. Character type is still int in some places, rather than wchar_t. Test suite passes though.
Diffstat (limited to 'parser.l')
-rw-r--r-- parser.l 84
1 files changed, 50 insertions, 34 deletions
diff --git a/parser.l b/parser.l
index d35c23ad..5919f929 100644
--- a/parser.l
+++ b/parser.l
@@ -33,10 +33,12 @@
#include <limits.h>
#include <errno.h>
#include <dirent.h>
+#include <wchar.h>
#include "y.tab.h"
#include "lib.h"
#include "gc.h"
#include "stream.h"
+#include "utf8.h"
#include "parser.h"
#define YY_NO_UNPUT
@@ -73,7 +75,7 @@ void yyerrorf(const char *s, ...)
if (opt_loglevel >= 1) {
va_list vl;
va_start (vl, s);
- fprintf(stderr, "%s: (%s:%ld): ", progname, spec_file, lineno);
+ fprintf(stderr, "%ls: (%ls:%ld): ", progname, spec_file, lineno);
vfprintf(stderr, s, vl);
putc('\n', stderr);
va_end (vl);
@@ -127,33 +129,33 @@ void yybadtoken(int tok, const char *context)
yyerrorf("unexpected end of input");
}
-static int char_esc(int letter)
+static wchar_t char_esc(int letter)
{
switch (letter) {
- case 'a': return '\a';
- case 'b': return '\b';
- case 't': return '\t';
- case 'n': return '\n';
- case 'v': return '\v';
- case 'f': return '\f';
- case 'r': return '\r';
+ case 'a': return L'\a';
+ case 'b': return L'\b';
+ case 't': return L'\t';
+ case 'n': return L'\n';
+ case 'v': return L'\v';
+ case 'f': return L'\f';
+ case 'r': return L'\r';
case 'e': return 27;
- case '"': return '"';
- case '\'': return '\'';
- case '`': return '`';
+ case '"': return L'"';
+ case '\'': return L'\'';
+ case '`': return L'`';
}
abort();
}
-static int num_esc(char *num)
+static wchar_t num_esc(char *num)
{
if (num[0] == 'x') {
- if (strlen(num) > 3)
+ if (strlen(num) > 7)
yyerror("too many digits in hex character escape");
return strtol(num + 1, 0, 16);
} else {
- if (strlen(num) > 3)
+ if (strlen(num) > 8)
yyerror("too many digits in octal character escape");
return strtol(num, 0, 8);
}
@@ -170,6 +172,17 @@ WS [\t ]*
HEX [0-9A-Fa-f]
OCT [0-7]
+ASC [\x00-\x7f]
+ASCN [\x00-\t\v-\x7f]
+U [\x80-\xbf]
+U2 [\xc2-\xdf]
+U3 [\xe0-\xef]
+U4 [\xf0-\xf4]
+
+UANY {ASC}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
+UANYN {ASCN}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
+UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
+
%x SPECIAL NESTED REGEX REGCLASS STRLIT CHRLIT QSILIT
%%
@@ -188,7 +201,7 @@ OCT [0-7]
if (*errp != 0) {
/* not a number */
- yylval.lexeme = strdup(yytext);
+ yylval.lexeme = utf8_dup_from(yytext);
return IDENT;
}
@@ -346,7 +359,7 @@ OCT [0-7]
<SPECIAL>@ {
yy_pop_state();
- yylval.lexeme = strdup("@");
+ yylval.lexeme = wcsdup(L"@");
return TEXT;
}
@@ -365,26 +378,25 @@ OCT [0-7]
}
<SPECIAL>[\\][abtnvfre] {
- char lexeme[2];
+ wchar_t lexeme[2];
lexeme[0] = char_esc(yytext[1]);
lexeme[1] = 0;
- yylval.lexeme = strdup(lexeme);
+ yylval.lexeme = wcsdup(lexeme);
yy_pop_state();
return TEXT;
}
<SPECIAL>[\\](x{HEX}+|{OCT}+) {
- char lexeme[2];
+ wchar_t lexeme[2];
lexeme[0] = num_esc(yytext + 1);
lexeme[1] = 0;
- yylval.lexeme = strdup(lexeme);
+ yylval.lexeme = wcsdup(lexeme);
yy_pop_state();
return TEXT;
}
-<SPECIAL,NESTED>. {
- yyerrorf("bad character in directive: '%c'",
- yytext[0]);
+<SPECIAL,NESTED>{UANYN} {
+ yyerrorf("bad character in directive: '%s'", yytext);
}
<REGEX>[/] {
@@ -433,15 +445,17 @@ OCT [0-7]
return REGCHAR;
}
-<REGEX>. {
- yylval.chr = yytext[0];
+<REGEX>{UANYN} {
+ wchar_t buf[8];
+ utf8_from(buf, yytext);
+ yylval.chr = buf[0];
return REGCHAR;
}
-<INITIAL>[^@\n]+ {
- yylval.lexeme = strdup(yytext);
- return TEXT;
- }
+<INITIAL>({UONLY}|[^@\n])+ {
+ yylval.lexeme = utf8_dup_from(yytext);
+ return TEXT;
+ }
<INITIAL>\n {
lineno++;
@@ -515,9 +529,11 @@ OCT [0-7]
yy_push_state(SPECIAL);
}
-<STRLIT,CHRLIT,QSILIT>. {
- yylval.chr = yytext[0];
- return LITCHAR;
- }
+<STRLIT,CHRLIT,QSILIT>{UANYN} {
+ wchar_t buf[8];
+ utf8_from(buf, yytext);
+ yylval.chr = buf[0];
+ return LITCHAR;
+ }
%%