3 files changed, 86 insertions, 1 deletions
diff --git a/ChangeLog b/ChangeLog
index 3f691d57..c9f9119a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,29 @@
+2011-10-10  Kaz Kylheku  <kaz@kylheku.com>
+
+	Improved support for broken unicode.
+	Regex support for extra-large character sets not compiled in
+	if wchar_t is not wide enough for it.
+	The utf-8 decoder now properly throws exceptions when it encounters
+	characters that it cannot represent, instead of silently ignoring
+	the situation and continuing with incorrectly computed data.
+
+	* regex.c (FULL_UNICODE): New macro.
+	(CHAR_SET_L3, CHAR_SET_L2_LO, CHAR_SET_L2_HI): Only defined
+	if full unicode is available.
+	(CHSET_XLARGE, cset_L3_t, struct xlarge_char_set,
+	L2_full, L3_fill_range, L3_contains, L3_free): Ditto.
+	(union char_set): Member xl present only under FULL_UNICODE.
+	(char_set_destroy, char_set_add, char_set_add_range,
+	char_set_contains): CHSET_XLARGE cases only available on
+	FULL_UNICODE.
+	(char_set_compile): Fall back on CHSET_LARGE if not FULL_UNICODE.
+
+	* utf8.c (FULL_UNICODE): New macro.
+	(conversion_error): New function.
+	(utf8_from_uc): Throw error if not FULL_UNICODE and character is
+	outside the BMP.
+	(utf8_decode): Likewise.
+
 2011-10-09  Kaz Kylheku  <kaz@kylheku.com>
 
 	* HACKING: Documented portability hacks for narrow wchar_t.
diff --git a/regex.c b/regex.c
index 3634552c..a50f9007 100644
--- a/regex.c
+++ b/regex.c
@@ -38,6 +38,10 @@
 #include "regex.h"
 #include "txr.h"
 
+#if WCHAR_MAX > 65535
+#define FULL_UNICODE
+#endif
+
 typedef union nfa_state nfa_state_t;
 
 typedef struct nfa {
@@ -62,10 +66,14 @@ typedef unsigned int bitcell_t;
 #define CHAR_SET_L0(CH) ((CH) & 0xFF)
 #define CHAR_SET_L1(CH) (((CH) >> 8) & 0xF)
 #define CHAR_SET_L2(CH) (((CH) >> 12) & 0xF)
+#ifdef FULL_UNICODE
 #define CHAR_SET_L3(CH) (((CH) >> 16) & 0x1F)
+#endif
 
+#ifdef FULL_UNICODE
 #define CHAR_SET_L2_LO(CH) ((CH) & (~(wchar_t) 0xFFFF))
 #define CHAR_SET_L2_HI(CH) ((CH) | ((wchar_t) 0xFFFF))
+#endif
 
 #define CHAR_SET_L1_LO(CH) ((CH) & (~(wchar_t) 0xFFF))
 #define CHAR_SET_L1_HI(CH) ((CH) | ((wchar_t) 0xFFF))
@@ -74,13 +82,18 @@ typedef unsigned int bitcell_t;
 #define CHAR_SET_L0_HI(CH) ((CH) | ((wchar_t) 0xFF))
 
 typedef enum {
-  CHSET_SMALL, CHSET_DISPLACED, CHSET_LARGE, CHSET_XLARGE
+  CHSET_SMALL, CHSET_DISPLACED, CHSET_LARGE, 
+#ifdef FULL_UNICODE
+  CHSET_XLARGE
+#endif
 } chset_type_t;
 
 typedef bitcell_t cset_L0_t[CHAR_SET_SIZE];
 typedef cset_L0_t *cset_L1_t[16];
 typedef cset_L1_t *cset_L2_t[16];
+#ifdef FULL_UNICODE
 typedef cset_L2_t *cset_L3_t[17];
+#endif
 
 struct any_char_set {
   unsigned type : 3;
@@ -107,18 +120,22 @@ struct large_char_set {
   cset_L2_t dir;
 };
 
+#ifdef FULL_UNICODE
 struct xlarge_char_set {
   unsigned type : 3;
   unsigned comp : 1;
   cset_L3_t dir;
 };
+#endif
 
 typedef union char_set {
   struct any_char_set any;
   struct small_char_set s;
   struct displaced_char_set d;
   struct large_char_set l;
+#ifdef FULL_UNICODE
   struct xlarge_char_set xl;
+#endif
 } char_set_t;
 
 #define NFA_SET_SIZE 512
@@ -299,6 +316,7 @@ static void L1_free(cset_L1_t *L1)
       free((*L1)[i1]);
 }
 
+#ifdef FULL_UNICODE
 static int L2_full(cset_L2_t *L2)
 {
   int i;
@@ -307,6 +325,7 @@ static int L2_full(cset_L2_t *L2)
       return 0;
   return 1;
 }
+#endif
 
 static void L2_fill_range(cset_L2_t *L2, wchar_t ch0, wchar_t ch1)
 {
@@ -378,6 +397,8 @@ static void L2_free(cset_L2_t *L2)
   }
 }
 
+#ifdef FULL_UNICODE
+
 static void L3_fill_range(cset_L3_t *L3, wchar_t ch0, wchar_t ch1)
 {
   int i3, i30, i31;
@@ -421,6 +442,7 @@ static void L3_fill_range(cset_L3_t *L3, wchar_t ch0, wchar_t ch1)
   }
 }
 
+
 static int L3_contains(cset_L3_t *L3, wchar_t ch)
 {
   int i3 = CHAR_SET_L3(ch);
@@ -447,6 +469,8 @@ static void L3_free(cset_L3_t *L3)
   }
 }
 
+#endif
+
 static char_set_t *char_set_create(chset_type_t type, wchar_t base)
 {
   static char_set_t blank;
@@ -471,10 +495,12 @@ static void char_set_destroy(char_set_t *set)
     L2_free(&set->l.dir);
     free(set);
     break;
+#ifdef FULL_UNICODE
   case CHSET_XLARGE:
     L3_free(&set->xl.dir);
     free(set);
     break;
+#endif
   }
 }
 
@@ -498,10 +524,12 @@ static void char_set_add(char_set_t *set, wchar_t ch)
     assert (ch < 0x10000);
     L2_fill_range(&set->l.dir, ch, ch);
     break;
+#ifdef FULL_UNICODE
   case CHSET_XLARGE:
     assert (ch < 0x110000);
     L3_fill_range(&set->xl.dir, ch, ch);
     break;
+#endif
   }
 }
 
@@ -524,10 +552,12 @@ static void char_set_add_range(char_set_t *set, wchar_t ch0, wchar_t ch1)
     assert (ch1 < 0x10000);
     L2_fill_range(&set->l.dir, ch0, ch1);
     break;
+#ifdef FULL_UNICODE
   case CHSET_XLARGE:
     assert (ch1 < 0x110000);
     L3_fill_range(&set->xl.dir, ch0, ch1);
     break;
+#endif
   }
 }
 
@@ -551,11 +581,13 @@ static int char_set_contains(char_set_t *set, wchar_t ch)
       break;
     result = L2_contains(&set->l.dir, ch);
     break;
+#ifdef FULL_UNICODE
   case CHSET_XLARGE:
     if (ch >= 0x110000)
       break;
     result = L3_contains(&set->xl.dir, ch);
     break;
+#endif
   }
 
   return set->any.comp ? !result : result;
@@ -603,7 +635,12 @@ static char_set_t *char_set_compile(val args, val comp)
   else if (max < 0x10000)
     cst = CHSET_LARGE;
   else
+#ifdef FULL_UNICODE
     cst = CHSET_XLARGE;
+#else
+    cst = CHSET_LARGE;
+#endif
+
 
   {
     char_set_t *set = char_set_create(cst, min);
diff --git a/utf8.c b/utf8.c
index 66ca774e..f2821f72 100644
--- a/utf8.c
+++ b/utf8.c
@@ -28,10 +28,24 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <wchar.h>
+#include <setjmp.h>
 #include "config.h"
 #include "lib.h"
+#include "unwind.h"
 #include "utf8.h"
 
+#if WCHAR_MAX > 65535
+#define FULL_UNICODE
+#endif
+
+#ifndef FULL_UNICODE
+static void conversion_error(void)
+{
+  uw_throw(range_error_s, 
+	   lit("encountered utf-8 character that needs full unicode support"));
+}
+#endif
+
 size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
 {
   size_t nchar = 1;
@@ -66,8 +80,12 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
         state = utf8_more2;
         wch = (ch & 0xf);
       } else if (ch >= 0xf0 && ch < 0xf5) {
+#ifdef FULL_UNICODE
         state = utf8_more3;
         wch = (ch & 0x7);
+#else
+	conversion_error();
+#endif
       } else {
         if (wdst)
           *wdst++ = 0xdc00 | ch;
@@ -249,8 +267,12 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx)
         ud->state = utf8_more2;
         ud->wch = (ch & 0xf);
       } else if (ch >= 0xf0 && ch < 0xf5) {
+#ifdef FULL_UNICODE
         ud->state = utf8_more3;
         ud->wch = (ch & 0x7);
+#else
+	conversion_error();
+#endif
       } else {
         ud->back = ud->tail;
         return 0xdc00 | ch;