summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKaz Kylheku <kaz@kylheku.com>2012-02-02 21:36:24 -0800
committerKaz Kylheku <kaz@kylheku.com>2012-02-02 21:36:24 -0800
commit905b074cea7303553777e169529efc8aeccdc35a (patch)
tree7c46bc7444b03f5e74b82e53923349ada91663b0
parent928fb0df45be6cb12a63e9d1d43504be1a595f7d (diff)
downloadtxr-905b074cea7303553777e169529efc8aeccdc35a.tar.gz
txr-905b074cea7303553777e169529efc8aeccdc35a.tar.bz2
txr-905b074cea7303553777e169529efc8aeccdc35a.zip
* utf8.c (utf8_to_uc, utf8_encode): Do not encode surrogate code
points (U+DC00 to U+DCFF) as multi-byte UTF-8 sequences. We use that range for invalid bytes on input, so on output the best thing to do is to reproduce the original bytes. E.g., the code U+DCA0 will produce the byte A0.
-rw-r--r--ChangeLog8
-rw-r--r--utf8.c26
2 files changed, 26 insertions, 8 deletions
diff --git a/ChangeLog b/ChangeLog
index 608d20e8..5c7ecd02 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,13 @@
2012-02-02 Kaz Kylheku <kaz@kylheku.com>
+ * utf8.c (utf8_to_uc, utf8_encode): Do not encode surrogate code
+ points (U+DC00 to U+DCFF) as multi-byte UTF-8 sequences. We use
+ that range for invalid bytes on input, so on output the best thing
+ to do is to reproduce the original bytes. E.g., the code U+DCA0
+ will produce the byte A0.
+
+2012-02-02 Kaz Kylheku <kaz@kylheku.com>
+
* txr.1: UTF-8 handling clarified.
2012-02-02 Kaz Kylheku <kaz@kylheku.com>
diff --git a/utf8.c b/utf8.c
index 1ca8f7b5..fcc4dc98 100644
--- a/utf8.c
+++ b/utf8.c
@@ -152,11 +152,17 @@ size_t utf8_to_uc(unsigned char *dst, const wchar_t *wsrc)
*dst++ = 0x80 | (wch & 0x3F);
}
} else if (wch < 0x10000) {
- nbyte += 3;
- if (dst) {
- *dst++ = 0xE0 | (wch >> 12);
- *dst++ = 0x80 | ((wch >> 6) & 0x3F);
- *dst++ = 0x80 | (wch & 0x3F);
+ if ((wch & 0xFF00) == 0xDC00) {
+ nbyte += 1;
+ if (dst)
+ *dst++ = (wch & 0xff);
+ } else {
+ nbyte += 3;
+ if (dst) {
+ *dst++ = 0xE0 | (wch >> 12);
+ *dst++ = 0x80 | ((wch >> 6) & 0x3F);
+ *dst++ = 0x80 | (wch & 0x3F);
+ }
}
} else if (wch < 0x110000) {
nbyte += 4;
@@ -219,9 +225,13 @@ int utf8_encode(wchar_t wch, int (*put)(int ch, mem_t *ctx), mem_t *ctx)
return put(0xC0 | (wch >> 6), ctx) &&
put(0x80 | (wch & 0x3F), ctx);
} else if (wch < 0x10000) {
- return put(0xE0 | (wch >> 12), ctx) &&
- put(0x80 | ((wch >> 6) & 0x3F), ctx) &&
- put(0x80 | (wch & 0x3F), ctx);
+ if ((wch & 0xFF00) == 0xDC00) {
+ return put(wch & 0xFF, ctx);
+ } else {
+ return put(0xE0 | (wch >> 12), ctx) &&
+ put(0x80 | ((wch >> 6) & 0x3F), ctx) &&
+ put(0x80 | (wch & 0x3F), ctx);
+ }
} else if (wch < 0x110000) {
return put(0xF0 | (wch >> 18), ctx) &&
put(0x80 | ((wch >> 12) & 0x3F), ctx) &&