diff options
-rw-r--r-- | ChangeLog | 26 | ||||
-rw-r--r-- | regex.c | 39 | ||||
-rw-r--r-- | utf8.c | 22 |
3 files changed, 86 insertions, 1 deletions
@@ -1,3 +1,29 @@ +2011-10-10 Kaz Kylheku <kaz@kylheku.com> + + Improved support for broken unicode. + Regex support for extra-large character sets not compiled in + if wchar_t is not wide enough for it. + The utf-8 decoder properly throws exceptions when encountering characters + that it cannot represent, instead of silently ignoring the + situation and continuing with incorrectly computed data. + + * regex.c (FULL_UNICODE): New macro. + (CHAR_SET_L3, CHAR_SET_L2_LO, CHAR_SET_L2_HI): Only defined + if full unicode is available. + (CHSET_XLARGE, cset_L3_t, struct xlarge_char_set, + L2_full, L3_fill_range, L3_contains): Ditto. + (union char_set): Member xl present only under FULL_UNICODE. + (char_set_destroy, char_set_add, char_set_add_range, + char_set_contains): CHSET_XLARGE cases only available on + FULL_UNICODE. + (char_set_compile): Default cst variable to CHSET_LARGE. + + * utf8.c (FULL_UNICODE): New macro. + (conversion_error): New function. + (utf8_from_uc): Throw error if not FULL_UNICODE and character is + outside the BMP. + (utf8_decode): Likewise. + 2011-10-09 Kaz Kylheku <kaz@kylheku.com> * HACKING: Documented portability hacks for narrow wchar_t. 
@@ -38,6 +38,10 @@ #include "regex.h" #include "txr.h" +#if WCHAR_MAX > 65535 +#define FULL_UNICODE +#endif + typedef union nfa_state nfa_state_t; typedef struct nfa { @@ -62,10 +66,14 @@ typedef unsigned int bitcell_t; #define CHAR_SET_L0(CH) ((CH) & 0xFF) #define CHAR_SET_L1(CH) (((CH) >> 8) & 0xF) #define CHAR_SET_L2(CH) (((CH) >> 12) & 0xF) +#ifdef FULL_UNICODE #define CHAR_SET_L3(CH) (((CH) >> 16) & 0x1F) +#endif +#ifdef FULL_UNICODE #define CHAR_SET_L2_LO(CH) ((CH) & (~(wchar_t) 0xFFFF)) #define CHAR_SET_L2_HI(CH) ((CH) | ((wchar_t) 0xFFFF)) +#endif #define CHAR_SET_L1_LO(CH) ((CH) & (~(wchar_t) 0xFFF)) #define CHAR_SET_L1_HI(CH) ((CH) | ((wchar_t) 0xFFF)) @@ -74,13 +82,18 @@ typedef unsigned int bitcell_t; #define CHAR_SET_L0_HI(CH) ((CH) | ((wchar_t) 0xFF)) typedef enum { - CHSET_SMALL, CHSET_DISPLACED, CHSET_LARGE, CHSET_XLARGE + CHSET_SMALL, CHSET_DISPLACED, CHSET_LARGE, +#ifdef FULL_UNICODE + CHSET_XLARGE +#endif } chset_type_t; typedef bitcell_t cset_L0_t[CHAR_SET_SIZE]; typedef cset_L0_t *cset_L1_t[16]; typedef cset_L1_t *cset_L2_t[16]; +#ifdef FULL_UNICODE typedef cset_L2_t *cset_L3_t[17]; +#endif struct any_char_set { unsigned type : 3; @@ -107,18 +120,22 @@ struct large_char_set { cset_L2_t dir; }; +#ifdef FULL_UNICODE struct xlarge_char_set { unsigned type : 3; unsigned comp : 1; cset_L3_t dir; }; +#endif typedef union char_set { struct any_char_set any; struct small_char_set s; struct displaced_char_set d; struct large_char_set l; +#ifdef FULL_UNICODE struct xlarge_char_set xl; +#endif } char_set_t; #define NFA_SET_SIZE 512 @@ -299,6 +316,7 @@ static void L1_free(cset_L1_t *L1) free((*L1)[i1]); } +#ifdef FULL_UNICODE static int L2_full(cset_L2_t *L2) { int i; @@ -307,6 +325,7 @@ static int L2_full(cset_L2_t *L2) return 0; return 1; } +#endif static void L2_fill_range(cset_L2_t *L2, wchar_t ch0, wchar_t ch1) { @@ -378,6 +397,8 @@ static void L2_free(cset_L2_t *L2) } } +#ifdef FULL_UNICODE + static void L3_fill_range(cset_L3_t *L3, wchar_t ch0, 
wchar_t ch1) { int i3, i30, i31; @@ -421,6 +442,7 @@ static void L3_fill_range(cset_L3_t *L3, wchar_t ch0, wchar_t ch1) } } + static int L3_contains(cset_L3_t *L3, wchar_t ch) { int i3 = CHAR_SET_L3(ch); @@ -447,6 +469,8 @@ static void L3_free(cset_L3_t *L3) } } +#endif + static char_set_t *char_set_create(chset_type_t type, wchar_t base) { static char_set_t blank; @@ -471,10 +495,12 @@ static void char_set_destroy(char_set_t *set) L2_free(&set->l.dir); free(set); break; +#ifdef FULL_UNICODE case CHSET_XLARGE: L3_free(&set->xl.dir); free(set); break; +#endif } } @@ -498,10 +524,12 @@ static void char_set_add(char_set_t *set, wchar_t ch) assert (ch < 0x10000); L2_fill_range(&set->l.dir, ch, ch); break; +#ifdef FULL_UNICODE case CHSET_XLARGE: assert (ch < 0x110000); L3_fill_range(&set->xl.dir, ch, ch); break; +#endif } } @@ -524,10 +552,12 @@ static void char_set_add_range(char_set_t *set, wchar_t ch0, wchar_t ch1) assert (ch1 < 0x10000); L2_fill_range(&set->l.dir, ch0, ch1); break; +#ifdef FULL_UNICODE case CHSET_XLARGE: assert (ch1 < 0x110000); L3_fill_range(&set->xl.dir, ch0, ch1); break; +#endif } } @@ -551,11 +581,13 @@ static int char_set_contains(char_set_t *set, wchar_t ch) break; result = L2_contains(&set->l.dir, ch); break; +#ifdef FULL_UNICODE case CHSET_XLARGE: if (ch >= 0x110000) break; result = L3_contains(&set->xl.dir, ch); break; +#endif } return set->any.comp ? 
!result : result; @@ -603,7 +635,12 @@ static char_set_t *char_set_compile(val args, val comp) else if (max < 0x10000) cst = CHSET_LARGE; else +#ifdef FULL_UNICODE cst = CHSET_XLARGE; +#else + cst = CHSET_LARGE; +#endif + { char_set_t *set = char_set_create(cst, min); @@ -28,10 +28,24 @@ #include <stdio.h> #include <stdlib.h> #include <wchar.h> +#include <setjmp.h> #include "config.h" #include "lib.h" +#include "unwind.h" #include "utf8.h" +#if WCHAR_MAX > 65535 +#define FULL_UNICODE +#endif + +#ifndef FULL_UNICODE +static void conversion_error(void) +{ + uw_throw(range_error_s, + lit("encountered utf-8 character that needs full unicode support")); +} +#endif + size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src) { size_t nchar = 1; @@ -66,8 +80,12 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src) state = utf8_more2; wch = (ch & 0xf); } else if (ch >= 0xf0 && ch < 0xf5) { +#ifdef FULL_UNICODE state = utf8_more3; wch = (ch & 0x7); +#else + conversion_error(); +#endif } else { if (wdst) *wdst++ = 0xdc00 | ch; @@ -249,8 +267,12 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx) ud->state = utf8_more2; ud->wch = (ch & 0xf); } else if (ch >= 0xf0 && ch < 0xf5) { +#ifdef FULL_UNICODE ud->state = utf8_more3; ud->wch = (ch & 0x7); +#else + conversion_error(); +#endif } else { ud->back = ud->tail; return 0xdc00 | ch; |