* utf8.c (utf8_to_uc, utf8_encode): Do not encode surrogate code

points (U+DC00 to U+DCFF) as multi-byte UTF-8 sequences. We use that range to represent invalid bytes on input, so on output the best thing to do is to reproduce the original bytes. E.g., the code U+DCA0 will produce the byte 0xA0.
author: Kaz Kylheku <kaz@kylheku.com> 2012-02-02 21:36:24 -0800
committer: Kaz Kylheku <kaz@kylheku.com> 2012-02-02 21:36:24 -0800
commit: 905b074cea7303553777e169529efc8aeccdc35a (patch)
tree: 7c46bc7444b03f5e74b82e53923349ada91663b0
parent: 928fb0df45be6cb12a63e9d1d43504be1a595f7d (diff)
download: txr-905b074cea7303553777e169529efc8aeccdc35a.tar.gz
txr-905b074cea7303553777e169529efc8aeccdc35a.tar.bz2
txr-905b074cea7303553777e169529efc8aeccdc35a.zip
2 files changed, 26 insertions, 8 deletions
diff --git a/ChangeLog b/ChangeLog
index 608d20e8..5c7ecd02 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,13 @@
 2012-02-02  Kaz Kylheku  <kaz@kylheku.com>
 
+	* utf8.c (utf8_to_uc, utf8_encode): Do not encode surrogate code
+	points (U+DC00 to U+DCFF) as multi-byte UTF8 sequences.  We use
+	that range for invalid bytes on input, so on output the best thing
+	to do is to reproduce the original bytes. E.g the code U+DCA0
+	will produce the byte A0.
+
+2012-02-02  Kaz Kylheku  <kaz@kylheku.com>
+
 	* txr.1: UTF-8 handling clarified.
 
 2012-02-02  Kaz Kylheku  <kaz@kylheku.com>
diff --git a/utf8.c b/utf8.c
index 1ca8f7b5..fcc4dc98 100644
--- a/utf8.c
+++ b/utf8.c
@@ -152,11 +152,17 @@ size_t utf8_to_uc(unsigned char *dst, const wchar_t *wsrc)
         *dst++ = 0x80 | (wch & 0x3F);
       }
     } else if (wch < 0x10000) {
-      nbyte += 3;
-      if (dst) {
-        *dst++ = 0xE0 | (wch >> 12);
-        *dst++ = 0x80 | ((wch >> 6) & 0x3F);
-        *dst++ = 0x80 | (wch & 0x3F);
+      if ((wch & 0xFF00) == 0xDC00) {
+	nbyte += 1;
+	if (dst)
+	  *dst++ = (wch & 0xff);
+      } else {
+	nbyte += 3;
+	if (dst) {
+	  *dst++ = 0xE0 | (wch >> 12);
+	  *dst++ = 0x80 | ((wch >> 6) & 0x3F);
+	  *dst++ = 0x80 | (wch & 0x3F);
+	}
       }
     } else if (wch < 0x110000) {
       nbyte += 4;
@@ -219,9 +225,13 @@ int utf8_encode(wchar_t wch, int (*put)(int ch, mem_t *ctx), mem_t *ctx)
     return put(0xC0 | (wch >> 6), ctx) &&
            put(0x80 | (wch & 0x3F), ctx);
   } else if (wch < 0x10000) {
-    return put(0xE0 | (wch >> 12), ctx) &&
-           put(0x80 | ((wch >> 6) & 0x3F), ctx) &&
-           put(0x80 | (wch & 0x3F), ctx);
+    if ((wch & 0xFF00) == 0xDC00) {
+      return put(wch & 0xFF, ctx);
+    } else {
+      return put(0xE0 | (wch >> 12), ctx) &&
+	     put(0x80 | ((wch >> 6) & 0x3F), ctx) &&
+	     put(0x80 | (wch & 0x3F), ctx);
+    }
   } else if (wch < 0x110000) {
     return put(0xF0 | (wch >> 18), ctx) &&
            put(0x80 | ((wch >> 12) & 0x3F), ctx) &&
author	Kaz Kylheku <kaz@kylheku.com>	2012-02-02 21:36:24 -0800
committer	Kaz Kylheku <kaz@kylheku.com>	2012-02-02 21:36:24 -0800
commit	905b074cea7303553777e169529efc8aeccdc35a (patch)
tree	7c46bc7444b03f5e74b82e53923349ada91663b0
parent	928fb0df45be6cb12a63e9d1d43504be1a595f7d (diff)
download	txr-905b074cea7303553777e169529efc8aeccdc35a.tar.gz txr-905b074cea7303553777e169529efc8aeccdc35a.tar.bz2 txr-905b074cea7303553777e169529efc8aeccdc35a.zip