summaryrefslogtreecommitdiffstats
path: root/utf8.c
diff options
context:
space:
mode:
authorKaz Kylheku <kaz@kylheku.com>2012-02-02 21:36:24 -0800
committerKaz Kylheku <kaz@kylheku.com>2012-02-02 21:36:24 -0800
commit905b074cea7303553777e169529efc8aeccdc35a (patch)
tree7c46bc7444b03f5e74b82e53923349ada91663b0 /utf8.c
parent928fb0df45be6cb12a63e9d1d43504be1a595f7d (diff)
downloadtxr-905b074cea7303553777e169529efc8aeccdc35a.tar.gz
txr-905b074cea7303553777e169529efc8aeccdc35a.tar.bz2
txr-905b074cea7303553777e169529efc8aeccdc35a.zip
* utf8.c (utf8_to_uc, utf8_encode): Do not encode the low-surrogate code
points U+DC00 to U+DCFF as multi-byte UTF-8 sequences. We use that range to represent invalid bytes on input, so on output the best thing to do is to reproduce the original bytes. E.g., the code U+DCA0 will produce the byte A0.
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c26
1 files changed, 18 insertions, 8 deletions
diff --git a/utf8.c b/utf8.c
index 1ca8f7b5..fcc4dc98 100644
--- a/utf8.c
+++ b/utf8.c
@@ -152,11 +152,17 @@ size_t utf8_to_uc(unsigned char *dst, const wchar_t *wsrc)
*dst++ = 0x80 | (wch & 0x3F);
}
} else if (wch < 0x10000) {
- nbyte += 3;
- if (dst) {
- *dst++ = 0xE0 | (wch >> 12);
- *dst++ = 0x80 | ((wch >> 6) & 0x3F);
- *dst++ = 0x80 | (wch & 0x3F);
+ if ((wch & 0xFF00) == 0xDC00) {
+ nbyte += 1;
+ if (dst)
+ *dst++ = (wch & 0xff);
+ } else {
+ nbyte += 3;
+ if (dst) {
+ *dst++ = 0xE0 | (wch >> 12);
+ *dst++ = 0x80 | ((wch >> 6) & 0x3F);
+ *dst++ = 0x80 | (wch & 0x3F);
+ }
}
} else if (wch < 0x110000) {
nbyte += 4;
@@ -219,9 +225,13 @@ int utf8_encode(wchar_t wch, int (*put)(int ch, mem_t *ctx), mem_t *ctx)
return put(0xC0 | (wch >> 6), ctx) &&
put(0x80 | (wch & 0x3F), ctx);
} else if (wch < 0x10000) {
- return put(0xE0 | (wch >> 12), ctx) &&
- put(0x80 | ((wch >> 6) & 0x3F), ctx) &&
- put(0x80 | (wch & 0x3F), ctx);
+ if ((wch & 0xFF00) == 0xDC00) {
+ return put(wch & 0xFF, ctx);
+ } else {
+ return put(0xE0 | (wch >> 12), ctx) &&
+ put(0x80 | ((wch >> 6) & 0x3F), ctx) &&
+ put(0x80 | (wch & 0x3F), ctx);
+ }
} else if (wch < 0x110000) {
return put(0xF0 | (wch >> 18), ctx) &&
put(0x80 | ((wch >> 12) & 0x3F), ctx) &&