summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog26
-rw-r--r--regex.c39
-rw-r--r--utf8.c22
3 files changed, 86 insertions, 1 deletions
diff --git a/ChangeLog b/ChangeLog
index 3f691d57..c9f9119a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,29 @@
+2011-10-10 Kaz Kylheku <kaz@kylheku.com>
+
+ Improved support for broken unicode.
+ Regex support for extra-large character sets not compiled in
+ if wchar_t is not wide enough for it.
+ The utf-8 decoder now properly throws exceptions when encountering characters
+ that it cannot represent, instead of silently ignoring the
+ situation and continuing with incorrectly computed data.
+
+ * regex.c (FULL_UNICODE): New macro.
+ (CHAR_SET_L3, CHAR_SET_L2_LO, CHAR_SET_L2_HI): Only defined
+ if full unicode is available.
+ (CHSET_XLARGE, cset_L3_t, struct xlarge_char_set,
+ L2_full, L3_fill_range, L3_contains): Ditto.
+ (union char_set): Member xl present only under FULL_UNICODE.
+ (char_set_destroy, char_set_add, char_set_add_range,
+ char_set_contains): CHSET_XLARGE cases only available on
+ FULL_UNICODE.
+ (char_set_compile): Default cst variable to CHSET_LARGE.
+
+ * utf8.c (FULL_UNICODE): New macro.
+ (conversion_error): New function.
+ (utf8_from_uc): Throw error if not FULL_UNICODE and character is
+ outside the BMP.
+ (utf8_decode): Likewise.
+
2011-10-09 Kaz Kylheku <kaz@kylheku.com>
* HACKING: Documented portability hacks for narrow wchar_t.
diff --git a/regex.c b/regex.c
index 3634552c..a50f9007 100644
--- a/regex.c
+++ b/regex.c
@@ -38,6 +38,10 @@
#include "regex.h"
#include "txr.h"
+#if WCHAR_MAX > 65535
+#define FULL_UNICODE
+#endif
+
typedef union nfa_state nfa_state_t;
typedef struct nfa {
@@ -62,10 +66,14 @@ typedef unsigned int bitcell_t;
#define CHAR_SET_L0(CH) ((CH) & 0xFF)
#define CHAR_SET_L1(CH) (((CH) >> 8) & 0xF)
#define CHAR_SET_L2(CH) (((CH) >> 12) & 0xF)
+#ifdef FULL_UNICODE
#define CHAR_SET_L3(CH) (((CH) >> 16) & 0x1F)
+#endif
+#ifdef FULL_UNICODE
#define CHAR_SET_L2_LO(CH) ((CH) & (~(wchar_t) 0xFFFF))
#define CHAR_SET_L2_HI(CH) ((CH) | ((wchar_t) 0xFFFF))
+#endif
#define CHAR_SET_L1_LO(CH) ((CH) & (~(wchar_t) 0xFFF))
#define CHAR_SET_L1_HI(CH) ((CH) | ((wchar_t) 0xFFF))
@@ -74,13 +82,18 @@ typedef unsigned int bitcell_t;
#define CHAR_SET_L0_HI(CH) ((CH) | ((wchar_t) 0xFF))
typedef enum {
- CHSET_SMALL, CHSET_DISPLACED, CHSET_LARGE, CHSET_XLARGE
+ CHSET_SMALL, CHSET_DISPLACED, CHSET_LARGE,
+#ifdef FULL_UNICODE
+ CHSET_XLARGE
+#endif
} chset_type_t;
typedef bitcell_t cset_L0_t[CHAR_SET_SIZE];
typedef cset_L0_t *cset_L1_t[16];
typedef cset_L1_t *cset_L2_t[16];
+#ifdef FULL_UNICODE
typedef cset_L2_t *cset_L3_t[17];
+#endif
struct any_char_set {
unsigned type : 3;
@@ -107,18 +120,22 @@ struct large_char_set {
cset_L2_t dir;
};
+#ifdef FULL_UNICODE
struct xlarge_char_set {
unsigned type : 3;
unsigned comp : 1;
cset_L3_t dir;
};
+#endif
typedef union char_set {
struct any_char_set any;
struct small_char_set s;
struct displaced_char_set d;
struct large_char_set l;
+#ifdef FULL_UNICODE
struct xlarge_char_set xl;
+#endif
} char_set_t;
#define NFA_SET_SIZE 512
@@ -299,6 +316,7 @@ static void L1_free(cset_L1_t *L1)
free((*L1)[i1]);
}
+#ifdef FULL_UNICODE
static int L2_full(cset_L2_t *L2)
{
int i;
@@ -307,6 +325,7 @@ static int L2_full(cset_L2_t *L2)
return 0;
return 1;
}
+#endif
static void L2_fill_range(cset_L2_t *L2, wchar_t ch0, wchar_t ch1)
{
@@ -378,6 +397,8 @@ static void L2_free(cset_L2_t *L2)
}
}
+#ifdef FULL_UNICODE
+
static void L3_fill_range(cset_L3_t *L3, wchar_t ch0, wchar_t ch1)
{
int i3, i30, i31;
@@ -421,6 +442,7 @@ static void L3_fill_range(cset_L3_t *L3, wchar_t ch0, wchar_t ch1)
}
}
+
static int L3_contains(cset_L3_t *L3, wchar_t ch)
{
int i3 = CHAR_SET_L3(ch);
@@ -447,6 +469,8 @@ static void L3_free(cset_L3_t *L3)
}
}
+#endif
+
static char_set_t *char_set_create(chset_type_t type, wchar_t base)
{
static char_set_t blank;
@@ -471,10 +495,12 @@ static void char_set_destroy(char_set_t *set)
L2_free(&set->l.dir);
free(set);
break;
+#ifdef FULL_UNICODE
case CHSET_XLARGE:
L3_free(&set->xl.dir);
free(set);
break;
+#endif
}
}
@@ -498,10 +524,12 @@ static void char_set_add(char_set_t *set, wchar_t ch)
assert (ch < 0x10000);
L2_fill_range(&set->l.dir, ch, ch);
break;
+#ifdef FULL_UNICODE
case CHSET_XLARGE:
assert (ch < 0x110000);
L3_fill_range(&set->xl.dir, ch, ch);
break;
+#endif
}
}
@@ -524,10 +552,12 @@ static void char_set_add_range(char_set_t *set, wchar_t ch0, wchar_t ch1)
assert (ch1 < 0x10000);
L2_fill_range(&set->l.dir, ch0, ch1);
break;
+#ifdef FULL_UNICODE
case CHSET_XLARGE:
assert (ch1 < 0x110000);
L3_fill_range(&set->xl.dir, ch0, ch1);
break;
+#endif
}
}
@@ -551,11 +581,13 @@ static int char_set_contains(char_set_t *set, wchar_t ch)
break;
result = L2_contains(&set->l.dir, ch);
break;
+#ifdef FULL_UNICODE
case CHSET_XLARGE:
if (ch >= 0x110000)
break;
result = L3_contains(&set->xl.dir, ch);
break;
+#endif
}
return set->any.comp ? !result : result;
@@ -603,7 +635,12 @@ static char_set_t *char_set_compile(val args, val comp)
else if (max < 0x10000)
cst = CHSET_LARGE;
else
+#ifdef FULL_UNICODE
cst = CHSET_XLARGE;
+#else
+ cst = CHSET_LARGE;
+#endif
+
{
char_set_t *set = char_set_create(cst, min);
diff --git a/utf8.c b/utf8.c
index 66ca774e..f2821f72 100644
--- a/utf8.c
+++ b/utf8.c
@@ -28,10 +28,24 @@
#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
+#include <setjmp.h>
#include "config.h"
#include "lib.h"
+#include "unwind.h"
#include "utf8.h"
+#if WCHAR_MAX > 65535
+#define FULL_UNICODE
+#endif
+
+#ifndef FULL_UNICODE
+static void conversion_error(void)
+{
+ uw_throw(range_error_s,
+ lit("encountered utf-8 character that needs full unicode support"));
+}
+#endif
+
size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
{
size_t nchar = 1;
@@ -66,8 +80,12 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
state = utf8_more2;
wch = (ch & 0xf);
} else if (ch >= 0xf0 && ch < 0xf5) {
+#ifdef FULL_UNICODE
state = utf8_more3;
wch = (ch & 0x7);
+#else
+ conversion_error();
+#endif
} else {
if (wdst)
*wdst++ = 0xdc00 | ch;
@@ -249,8 +267,12 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx)
ud->state = utf8_more2;
ud->wch = (ch & 0xf);
} else if (ch >= 0xf0 && ch < 0xf5) {
+#ifdef FULL_UNICODE
ud->state = utf8_more3;
ud->wch = (ch & 0x7);
+#else
+ conversion_error();
+#endif
} else {
ud->back = ud->tail;
return 0xdc00 | ch;