Improved support for broken unicode.

Regex support for extra-large character sets is not compiled in if wchar_t is not wide enough for it. The utf-8 decoding code now properly throws exceptions when encountering characters that it cannot represent, instead of silently ignoring the situation and continuing with incorrectly computed data. * regex.c (FULL_UNICODE): New macro. (CHAR_SET_L3, CHAR_SET_L2_LO, CHAR_SET_L2_HI): Only defined if full unicode is available. (CHSET_XLARGE, cset_L3_t, struct xlarge_char_set, L2_full, L3_fill_range, L3_contains): Ditto. (union char_set): Member x1 present only under FULL_UNICODE. (char_set_destroy, char_set_add, char_set_add_range, char_set_contains): CHSET_XLARGE cases only available on FULL_UNICODE. (char_set_compile): Default cst variable to CHSET_LARGE. * utf8.c (FULL_UNICODE): New macro. (conversion_error): New function. (utf8_from_uc): Throw error if not FULL_UNICODE and character is outside the BMP. (utf8_decode): Likewise.
author: Kaz Kylheku <kaz@kylheku.com> 2011-10-10 13:28:14 -0700
committer: Kaz Kylheku <kaz@kylheku.com> 2011-10-10 13:28:14 -0700
commit: 9a2598f5364c05b3322dd02f3c1a59e056b19f64 (patch)
tree: ef58791c7ec2a55b16c4b7110d16b7d3b00e30d5 /utf8.c
parent: 1238530e0014dd1d53d3368574b107ff3050f329 (diff)
download: txr-9a2598f5364c05b3322dd02f3c1a59e056b19f64.tar.gz
txr-9a2598f5364c05b3322dd02f3c1a59e056b19f64.tar.bz2
txr-9a2598f5364c05b3322dd02f3c1a59e056b19f64.zip
1 files changed, 22 insertions, 0 deletions
diff --git a/utf8.c b/utf8.c
index 66ca774e..f2821f72 100644
--- a/utf8.c
+++ b/utf8.c
@@ -28,10 +28,24 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <wchar.h>
+#include <setjmp.h>
 #include "config.h"
 #include "lib.h"
+#include "unwind.h"
 #include "utf8.h"
 
+#if WCHAR_MAX > 65535
+#define FULL_UNICODE
+#endif
+
+#ifndef FULL_UNICODE
+static void conversion_error(void)
+{
+  uw_throw(range_error_s, 
+	   lit("encountered utf-8 character that needs full unicode support"));
+}
+#endif
+
 size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
 {
   size_t nchar = 1;
@@ -66,8 +80,12 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
         state = utf8_more2;
         wch = (ch & 0xf);
       } else if (ch >= 0xf0 && ch < 0xf5) {
+#ifdef FULL_UNICODE
         state = utf8_more3;
         wch = (ch & 0x7);
+#else
+	conversion_error();
+#endif
       } else {
         if (wdst)
           *wdst++ = 0xdc00 | ch;
@@ -249,8 +267,12 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx)
         ud->state = utf8_more2;
         ud->wch = (ch & 0xf);
       } else if (ch >= 0xf0 && ch < 0xf5) {
+#ifdef FULL_UNICODE
         ud->state = utf8_more3;
         ud->wch = (ch & 0x7);
+#else
+	conversion_error();
+#endif
       } else {
         ud->back = ud->tail;
         return 0xdc00 | ch;
author	Kaz Kylheku <kaz@kylheku.com>	2011-10-10 13:28:14 -0700
committer	Kaz Kylheku <kaz@kylheku.com>	2011-10-10 13:28:14 -0700
commit	9a2598f5364c05b3322dd02f3c1a59e056b19f64 (patch)
tree	ef58791c7ec2a55b16c4b7110d16b7d3b00e30d5 /utf8.c
parent	1238530e0014dd1d53d3368574b107ff3050f329 (diff)
download	txr-9a2598f5364c05b3322dd02f3c1a59e056b19f64.tar.gz txr-9a2598f5364c05b3322dd02f3c1a59e056b19f64.tar.bz2 txr-9a2598f5364c05b3322dd02f3c1a59e056b19f64.zip