diff options
author | Kaz Kylheku <kaz@kylheku.com> | 2012-02-02 21:36:24 -0800 |
---|---|---|
committer | Kaz Kylheku <kaz@kylheku.com> | 2012-02-02 21:36:24 -0800 |
commit | 905b074cea7303553777e169529efc8aeccdc35a (patch) | |
tree | 7c46bc7444b03f5e74b82e53923349ada91663b0 | |
parent | 928fb0df45be6cb12a63e9d1d43504be1a595f7d (diff) | |
download | txr-905b074cea7303553777e169529efc8aeccdc35a.tar.gz txr-905b074cea7303553777e169529efc8aeccdc35a.tar.bz2 txr-905b074cea7303553777e169529efc8aeccdc35a.zip |
* utf8.c (utf8_to_uc, utf8_encode): Do not encode surrogate code
points (U+DC00 to U+DCFF) as multi-byte UTF8 sequences. We use
that range for invalid bytes on input, so on output the best thing
to do is to reproduce the original bytes. E.g., the code U+DCA0
will produce the byte A0.
-rw-r--r-- | ChangeLog | 8 | ||||
-rw-r--r-- | utf8.c | 26 |
2 files changed, 26 insertions, 8 deletions
@@ -1,5 +1,13 @@ 2012-02-02 Kaz Kylheku <kaz@kylheku.com> + * utf8.c (utf8_to_uc, utf8_encode): Do not encode surrogate code + points (U+DC00 to U+DCFF) as multi-byte UTF8 sequences. We use + that range for invalid bytes on input, so on output the best thing + to do is to reproduce the original bytes. E.g the code U+DCA0 + will produce the byte A0. + +2012-02-02 Kaz Kylheku <kaz@kylheku.com> + * txr.1: UTF-8 handling clarified. 2012-02-02 Kaz Kylheku <kaz@kylheku.com> @@ -152,11 +152,17 @@ size_t utf8_to_uc(unsigned char *dst, const wchar_t *wsrc) *dst++ = 0x80 | (wch & 0x3F); } } else if (wch < 0x10000) { - nbyte += 3; - if (dst) { - *dst++ = 0xE0 | (wch >> 12); - *dst++ = 0x80 | ((wch >> 6) & 0x3F); - *dst++ = 0x80 | (wch & 0x3F); + if ((wch & 0xFF00) == 0xDC00) { + nbyte += 1; + if (dst) + *dst++ = (wch & 0xff); + } else { + nbyte += 3; + if (dst) { + *dst++ = 0xE0 | (wch >> 12); + *dst++ = 0x80 | ((wch >> 6) & 0x3F); + *dst++ = 0x80 | (wch & 0x3F); + } } } else if (wch < 0x110000) { nbyte += 4; @@ -219,9 +225,13 @@ int utf8_encode(wchar_t wch, int (*put)(int ch, mem_t *ctx), mem_t *ctx) return put(0xC0 | (wch >> 6), ctx) && put(0x80 | (wch & 0x3F), ctx); } else if (wch < 0x10000) { - return put(0xE0 | (wch >> 12), ctx) && - put(0x80 | ((wch >> 6) & 0x3F), ctx) && - put(0x80 | (wch & 0x3F), ctx); + if ((wch & 0xFF00) == 0xDC00) { + return put(wch & 0xFF, ctx); + } else { + return put(0xE0 | (wch >> 12), ctx) && + put(0x80 | ((wch >> 6) & 0x3F), ctx) && + put(0x80 | (wch & 0x3F), ctx); + } } else if (wch < 0x110000) { return put(0xF0 | (wch >> 18), ctx) && put(0x80 | ((wch >> 12) & 0x3F), ctx) && |