From 3b64319b10196425401d4d71f7ee1273e3bffe32 Mon Sep 17 00:00:00 2001
From: Kaz Kylheku
Date: Sat, 15 Feb 2014 00:19:15 -0800
Subject: A trivial change in the UTF-8 decoder allows TXR to handle null bytes in text.

* utf8.h (UTF8_ADMIT_NUL): New preprocessor symbol.
(utf8_decoder): New member, flags.

* utf8.c (utf8_decoder_init): Initialize flags to 0.
(utf8_decode): If a null byte is encountered in the input, then convert it to 0xDC00, rather than keeping it as zero, unless flags contains UTF8_ADMIT_NUL.

* txr.1: Document handling of null bytes.
---
 utf8.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'utf8.c')

diff --git a/utf8.c b/utf8.c
index 26e5795d..e3ef3e7a 100644
--- a/utf8.c
+++ b/utf8.c
@@ -260,6 +260,7 @@ int utf8_encode(wchar_t wch, int (*put)(int ch, mem_t *ctx), mem_t *ctx)
 void utf8_decoder_init(utf8_decoder_t *ud)
 {
   ud->state = utf8_init;
+  ud->flags = 0;
   ud->wch = 0;
   ud->head = ud->tail = ud->back = 0;
 }
@@ -295,6 +296,8 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx)
       case 0x0: case 0x1: case 0x2: case 0x3:
       case 0x4: case 0x5: case 0x6: case 0x7:
         ud->back = ud->tail;
+        if (ch == 0 && (ud->flags & UTF8_ADMIT_NUL) == 0)
+          return 0xDC00;
         return ch;
       case 0xC: case 0xD:
         ud->state = utf8_more1;
-- 
cgit v1.2.3