summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKaz Kylheku <kaz@kylheku.com>2021-04-20 07:45:30 -0700
committerKaz Kylheku <kaz@kylheku.com>2021-04-20 07:45:30 -0700
commit2db8b0497c7cc13b44210fb06b74d45fefccefc3 (patch)
tree224f5617e347fa39b371eb7716567187b34b8480
parent901440d00b54e747cc68da70df3a0a1eaef258fc (diff)
downloadtxr-2db8b0497c7cc13b44210fb06b74d45fefccefc3.tar.gz
txr-2db8b0497c7cc13b44210fb06b74d45fefccefc3.tar.bz2
txr-2db8b0497c7cc13b44210fb06b74d45fefccefc3.zip
utf8: decode: reduce strictness of full unicode check.
* utf8.c (utf8_from_buf, utf8_decode): On 16-bit wchar_t, we don't have to throw on every value in the range 0xF0-0xFF. Only the values 0xF0 through 0xF4 are potential UTF-8 bytes, so we only need to error out on those. 0xF5 through 0xFF are invalid bytes, which we can map into the 0xDCNN range.
-rw-r--r--utf8.c8
1 files changed, 4 insertions, 4 deletions
diff --git a/utf8.c b/utf8.c
index c23eefce..0d484f4f 100644
--- a/utf8.c
+++ b/utf8.c
@@ -84,16 +84,16 @@ size_t utf8_from_buf(wchar_t *wdst, const unsigned char *src, size_t nbytes)
wch_min = 0x800;
break;
case 0xF:
-#ifdef FULL_UNICODE
if (ch < 0xF5) {
+#ifdef FULL_UNICODE
state = utf8_more3;
wch = (ch & 0x7);
wch_min = 0x10000;
break;
- }
#else
conversion_error();
#endif
+ }
/* fallthrough */
default:
if (wdst)
@@ -317,16 +317,16 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx)
ud->wch_min = 0x800;
break;
case 0xF:
-#ifdef FULL_UNICODE
if (ch < 0xF5) {
+#ifdef FULL_UNICODE
ud->state = utf8_more3;
ud->wch = (ch & 0x7);
ud->wch_min = 0x10000;
break;
- }
#else
conversion_error();
#endif
+ }
/* fallthrough */
default:
ud->back = ud->tail;