diff options
author | Kaz Kylheku <kaz@kylheku.com> | 2011-10-10 13:28:14 -0700 |
---|---|---|
committer | Kaz Kylheku <kaz@kylheku.com> | 2011-10-10 13:28:14 -0700 |
commit | 9a2598f5364c05b3322dd02f3c1a59e056b19f64 (patch) | |
tree | ef58791c7ec2a55b16c4b7110d16b7d3b00e30d5 /utf8.c | |
parent | 1238530e0014dd1d53d3368574b107ff3050f329 (diff) | |
download | txr-9a2598f5364c05b3322dd02f3c1a59e056b19f64.tar.gz txr-9a2598f5364c05b3322dd02f3c1a59e056b19f64.tar.bz2 txr-9a2598f5364c05b3322dd02f3c1a59e056b19f64.zip |
Improved support for broken unicode.
Regex support for extra-large character sets not compiled in
if wchar_t is not wide enough for it.
The utf-8 decoder now properly throws exceptions when encountering characters
that it cannot represent, instead of silently ignoring the
situation and continuing with incorrectly computed data.
* regex.c (FULL_UNICODE): New macro.
(CHAR_SET_L3, CHAR_SET_L2_LO, CHAR_SET_L2_HI): Only defined
if full unicode is available.
(CHSET_XLARGE, cset_L3_t, struct xlarge_char_set,
L2_full, L3_fill_range, L3_contains): Ditto.
(union char_set): Member x1 present only under FULL_UNICODE.
(char_set_destroy, char_set_add, char_set_add_range,
char_set_contains): CHSET_XLARGE cases only available on
FULL_UNICODE.
(char_set_compile): Default cst variable to CHSET_LARGE.
* utf8.c (FULL_UNICODE): New macro.
(conversion_error): New function.
(utf8_from_uc): Throw error if not FULL_UNICODE and character is
outside the BMP.
(utf8_decode): Likewise.
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 22 |
1 files changed, 22 insertions, 0 deletions
@@ -28,10 +28,24 @@ #include <stdio.h> #include <stdlib.h> #include <wchar.h> +#include <setjmp.h> #include "config.h" #include "lib.h" +#include "unwind.h" #include "utf8.h" +#if WCHAR_MAX > 65535 +#define FULL_UNICODE +#endif + +#ifndef FULL_UNICODE +static void conversion_error(void) +{ + uw_throw(range_error_s, + lit("encountered utf-8 character that needs full unicode support")); +} +#endif + size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src) { size_t nchar = 1; @@ -66,8 +80,12 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src) state = utf8_more2; wch = (ch & 0xf); } else if (ch >= 0xf0 && ch < 0xf5) { +#ifdef FULL_UNICODE state = utf8_more3; wch = (ch & 0x7); +#else + conversion_error(); +#endif } else { if (wdst) *wdst++ = 0xdc00 | ch; @@ -249,8 +267,12 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx) ud->state = utf8_more2; ud->wch = (ch & 0xf); } else if (ch >= 0xf0 && ch < 0xf5) { +#ifdef FULL_UNICODE ud->state = utf8_more3; ud->wch = (ch & 0x7); +#else + conversion_error(); +#endif } else { ud->back = ud->tail; return 0xdc00 | ch; |