* utf8.c (utf8_from_uc, utf8_decode): Impose a minimum value on the

decoded character based on which UTF-8 case it is from. This rejects overlong forms. * utf8.h (struct utf8_decoder): New member, wch_min.
author: Kaz Kylheku <kaz@kylheku.com> 2012-02-02 16:35:32 -0800
committer: Kaz Kylheku <kaz@kylheku.com> 2012-02-02 16:35:32 -0800
commit: 97a34f6e5b04d4ce2eb3ee63f42d1375f4939de3 (patch)
tree: 4d2ed48b418f06c47a73a67a82895fcabef62ef8 /utf8.c
parent: c4dd3aaa204cd70e03d4cec3531c64be9a24f649 (diff)
download: txr-97a34f6e5b04d4ce2eb3ee63f42d1375f4939de3.tar.gz
txr-97a34f6e5b04d4ce2eb3ee63f42d1375f4939de3.tar.bz2
txr-97a34f6e5b04d4ce2eb3ee63f42d1375f4939de3.zip
1 files changed, 25 insertions, 7 deletions
diff --git a/utf8.c b/utf8.c
index f2821f72..1ca8f7b5 100644
--- a/utf8.c
+++ b/utf8.c
@@ -51,7 +51,7 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
   size_t nchar = 1;
   enum utf8_state state = utf8_init;
   const unsigned char *backtrack = 0;
-  wchar_t wch = 0;
+  wchar_t wch = 0, wch_min = 0;
 
   for (;;) {
     int ch = *src++;
@@ -76,13 +76,16 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
       } else if (ch >= 0xc2 && ch <= 0xe0) {
         state = utf8_more1;
         wch = (ch & 0x1f);
+	wch_min = 0x80;
       } else if (ch >= 0xe0 && ch <= 0xef) {
         state = utf8_more2;
         wch = (ch & 0xf);
+	wch_min = 0x800;
       } else if (ch >= 0xf0 && ch < 0xf5) {
 #ifdef FULL_UNICODE
         state = utf8_more3;
         wch = (ch & 0x7);
+	wch_min = 0x10000;
 #else
 	conversion_error();
 #endif
@@ -101,9 +104,15 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
         wch |= (ch & 0x3f);
         state = (enum utf8_state) (state - 1);
         if (state == utf8_init) {
-          if (wdst)
-            *wdst++ = wch;
-          nchar++;
+	  if (wch < wch_min) {
+	    src = backtrack;
+	    if (wdst)
+	      *wdst++ = 0xdc00 | *src;
+	  } else {
+	    if (wdst)
+	      *wdst++ = wch;
+	  }
+	  nchar++;
         }
       } else {
         src = backtrack;
@@ -260,16 +269,19 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx)
       if (ch < 0x80) {
         ud->back = ud->tail;
         return ch;
-      } else if (ch >= 0xc2 && ch <= 0xe0) {
+      } else if (ch >= 0xc0 && ch <= 0xe0) {
         ud->state = utf8_more1;
         ud->wch = (ch & 0x1f);
+	ud->wch_min = 0x80;
       } else if (ch >= 0xe0 && ch <= 0xef) {
         ud->state = utf8_more2;
         ud->wch = (ch & 0xf);
+	ud->wch_min = 0x800;
       } else if (ch >= 0xf0 && ch < 0xf5) {
 #ifdef FULL_UNICODE
         ud->state = utf8_more3;
         ud->wch = (ch & 0x7);
+	ud->wch_min = 0x100000;
 #else
 	conversion_error();
 #endif
@@ -286,8 +298,14 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx)
         ud->wch |= (ch & 0x3f);
         ud->state = (enum utf8_state) (ud->state - 1);
         if (ud->state == utf8_init) {
-          ud->back = ud->tail;
-          return ud->wch;
+	  if (ud->wch < ud->wch_min) {
+	    wchar_t wch = 0xdc00 | ud->buf[ud->back];
+	    ud->tail = ud->back = (ud->back + 1) % 8;
+	    return wch;
+	  } else {
+	    ud->back = ud->tail;
+	    return ud->wch;
+	  }
         }
       } else {
         wchar_t wch = 0xdc00 | ud->buf[ud->back];
author	Kaz Kylheku <kaz@kylheku.com>	2012-02-02 16:35:32 -0800
committer	Kaz Kylheku <kaz@kylheku.com>	2012-02-02 16:35:32 -0800
commit	97a34f6e5b04d4ce2eb3ee63f42d1375f4939de3 (patch)
tree	4d2ed48b418f06c47a73a67a82895fcabef62ef8 /utf8.c
parent	c4dd3aaa204cd70e03d4cec3531c64be9a24f649 (diff)
download	txr-97a34f6e5b04d4ce2eb3ee63f42d1375f4939de3.tar.gz txr-97a34f6e5b04d4ce2eb3ee63f42d1375f4939de3.tar.bz2 txr-97a34f6e5b04d4ce2eb3ee63f42d1375f4939de3.zip