summaryrefslogtreecommitdiffstats
path: root/parser.l
diff options
context:
space:
mode:
author: Kaz Kylheku <kaz@kylheku.com> 2018-05-11 06:55:25 -0700
committer: Kaz Kylheku <kaz@kylheku.com> 2018-05-11 06:55:25 -0700
commit: 78e12d9c43b606f7402100a7c3b3367057d103d9 (patch)
tree: 486f65f1faab122d002be6fd3291ce528f2e661b /parser.l
parent: 5bb5391fd3ad9874fdb266dd5b6d57f084626d13 (diff)
download: txr-78e12d9c43b606f7402100a7c3b3367057d103d9.tar.gz
txr-78e12d9c43b606f7402100a7c3b3367057d103d9.tar.bz2
txr-78e12d9c43b606f7402100a7c3b3367057d103d9.zip
Allow Unicode characters in identifiers.
* parser.l (unicode_ident): New static function. (BSCHR, NSCHR): Include UONLY match. (grammar): Use unicode_ident function to validate tokens obtained from BTOK and NTOK. * txr.1: Documented changing definition of bident and lident.
Diffstat (limited to 'parser.l')
-rw-r--r--parser.l51
1 files changed, 46 insertions, 5 deletions
diff --git a/parser.l b/parser.l
index 5fd70a51..774ade24 100644
--- a/parser.l
+++ b/parser.l
@@ -186,6 +186,47 @@ static wchar_t num_esc(scanner_t *scn, char *num)
return val;
}
+/*
+ * Convert the UTF-8 lexeme lex to a wide string via utf8_dup_from and
+ * check every character against the code points that are not permitted
+ * in identifiers. At the first offending character an error is reported
+ * through yyerror and the scan stops. The converted wide string is
+ * returned in every case, including after an error has been reported.
+ */
+static wchar_t *unicode_ident(scanner_t *scn, const char *lex)
+{
+  wchar_t *wlex = utf8_dup_from(lex), *ptr = wlex, wch;
+
+  while ((wch = *ptr++)) {
+    /* Fast path for ranges that contain no rejected code point. The upper
+       range has to begin after U+3000, which is rejected as a space below,
+       and end before the surrogates at U+D800; otherwise those two checks
+       can never be reached. */
+    if (wch < 0x1680 || (wch > 0x3000 && wch < 0xd800))
+      continue;
+
+    /* Surrogate code points, private use areas and the noncharacters
+       U+FFFE and U+FFFF. The plane 15 and 16 ranges are only tested when
+       FULL_UNICODE is enabled. */
+    if ((wch >= 0xdc00 && wch <= 0xdcff) ||
+        (wch >= 0xd800 && wch <= 0xdbff) ||
+#if FULL_UNICODE
+        (wch >= 0xf0000 && wch <= 0xffffd) ||
+        (wch >= 0x100000 && wch <= 0x10fffd) ||
+#endif
+        (wch >= 0xe000 && wch <= 0xf8ff) ||
+        (wch == 0xfffe) ||
+        (wch == 0xffff))
+    {
+      yyerror(scn, yyget_extra(scn),
+              "disallowed Unicode character in identifier");
+      break;
+    }
+
+    /* Individual space and separator code points in U+1680..U+3000. */
+    switch (wch) {
+    case 0x1680: case 0x180e: case 0x2000: case 0x2001: case 0x2002:
+    case 0x2003: case 0x2004: case 0x2005: case 0x2006: case 0x2007:
+    case 0x2008: case 0x2009: case 0x200a: case 0x2028: case 0x2029:
+    case 0x205f: case 0x3000:
+      yyerror(scn, yyget_extra(scn),
+              "Unicode space occurs in identifier");
+      break;
+    default:
+      continue;
+    }
+
+    /* Reached only after a space was reported: stop at the first error. */
+    break;
+  }
+
+  return wlex;
+}
+
%}
%option stack noinput reentrant bison-bridge extra-type="parser_t *"
@@ -202,8 +243,8 @@ DOTFLO [.]{DIG}+
XNUM #x{SGN}?{XDIG}+
ONUM #o{SGN}?[0-7]+
BNUM #b{SGN}?[0-1]+
-BSCHR [a-zA-Z0-9!$%&*+\-<=>?\\_~]
-NSCHR [a-zA-Z0-9!$%&*+\-<=>?\\_~/]
+BSCHR ([a-zA-Z0-9!$%&*+\-<=>?\\_~]|{UONLY})
+NSCHR ([a-zA-Z0-9!$%&*+\-<=>?\\_~/]|{UONLY})
ID_END [^a-zA-Z0-9!$%&*+\-<=>?\\_~/]
EXTRA [#^]
BT0 {BSCHR}({BSCHR}|{EXTRA})*
@@ -395,7 +436,7 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
|| yy_top_state(yyscanner) == QWLIT)
yy_pop_state(yyscanner);
- yylval->lexeme = utf8_dup_from(yytext);
+ yylval->lexeme = unicode_ident(yyscanner, yytext);
return SYMTOK;
}
@@ -408,7 +449,7 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
|| yy_top_state(yyscanner) == QWLIT)
yy_pop_state(yyscanner);
- yylval->lexeme = utf8_dup_from(yytext);
+ yylval->lexeme = unicode_ident(yyscanner, yytext);
return SYMTOK;
}
@@ -422,7 +463,7 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
yyerrorf(yyg, lit("bad token: ~a"),
string_own(utf8_dup_from(yytext)),
nao);
- yylval->lexeme = utf8_dup_from(yytext);
+ yylval->lexeme = unicode_ident(yyscanner, yytext);
return SYMTOK;
}