3 files changed, 38 insertions, 8 deletions
diff --git a/ChangeLog b/ChangeLog
index b4f24928..a0398638 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,17 @@
 2012-02-02  Kaz Kylheku  <kaz@kylheku.com>
 
+	* utf8.c (utf8_from_uc, utf8_decode): Impose a minimum value on the
+	decoded character based on which UTF-8 case it is from.  This rejects
+	overlong forms.
+
+	* utf8.h (struct utf8_decoder): New member, wch_min.
+
+2012-02-02  Kaz Kylheku  <kaz@kylheku.com>
+
+	* txr.1: Document that TXR accepts UTF-8 overlong forms.
+
+2012-02-02  Kaz Kylheku  <kaz@kylheku.com>
+
 	* txr.vim: Move error match before other cases and simplify.
 	Comment are colorized properly again.
 
diff --git a/utf8.c b/utf8.c
index f2821f72..1ca8f7b5 100644
--- a/utf8.c
+++ b/utf8.c
@@ -51,7 +51,7 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
   size_t nchar = 1;
   enum utf8_state state = utf8_init;
   const unsigned char *backtrack = 0;
-  wchar_t wch = 0;
+  wchar_t wch = 0, wch_min = 0;
 
   for (;;) {
     int ch = *src++;
@@ -76,13 +76,16 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
       } else if (ch >= 0xc2 && ch <= 0xe0) {
         state = utf8_more1;
         wch = (ch & 0x1f);
+	wch_min = 0x80;
       } else if (ch >= 0xe0 && ch <= 0xef) {
         state = utf8_more2;
         wch = (ch & 0xf);
+	wch_min = 0x800;
       } else if (ch >= 0xf0 && ch < 0xf5) {
 #ifdef FULL_UNICODE
         state = utf8_more3;
         wch = (ch & 0x7);
+	wch_min = 0x10000;
 #else
 	conversion_error();
 #endif
@@ -101,9 +104,15 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
         wch |= (ch & 0x3f);
         state = (enum utf8_state) (state - 1);
         if (state == utf8_init) {
-          if (wdst)
-            *wdst++ = wch;
-          nchar++;
+	  if (wch < wch_min) {
+	    src = backtrack;
+	    if (wdst)
+	      *wdst++ = 0xdc00 | *src;
+	  } else {
+	    if (wdst)
+	      *wdst++ = wch;
+	  }
+	  nchar++;
         }
       } else {
         src = backtrack;
@@ -260,16 +269,19 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx)
       if (ch < 0x80) {
         ud->back = ud->tail;
         return ch;
-      } else if (ch >= 0xc2 && ch <= 0xe0) {
+      } else if (ch >= 0xc0 && ch <= 0xdf) { /* two-byte leads; 0xe0 is a three-byte lead */
         ud->state = utf8_more1;
         ud->wch = (ch & 0x1f);
+	ud->wch_min = 0x80; /* anything lower fits in one byte */
       } else if (ch >= 0xe0 && ch <= 0xef) {
         ud->state = utf8_more2;
         ud->wch = (ch & 0xf);
+	ud->wch_min = 0x800; /* anything lower fits in two bytes */
       } else if (ch >= 0xf0 && ch < 0xf5) {
 #ifdef FULL_UNICODE
         ud->state = utf8_more3;
         ud->wch = (ch & 0x7);
+	ud->wch_min = 0x10000; /* anything lower fits in three bytes */
 #else
 	conversion_error();
 #endif
@@ -286,8 +298,14 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx)
         ud->wch |= (ch & 0x3f);
         ud->state = (enum utf8_state) (ud->state - 1);
         if (ud->state == utf8_init) {
-          ud->back = ud->tail;
-          return ud->wch;
+	  if (ud->wch < ud->wch_min) {
+	    wchar_t wch = 0xdc00 | ud->buf[ud->back];
+	    ud->tail = ud->back = (ud->back + 1) % 8;
+	    return wch;
+	  } else {
+	    ud->back = ud->tail;
+	    return ud->wch;
+	  }
         }
       } else {
         wchar_t wch = 0xdc00 | ud->buf[ud->back];
diff --git a/utf8.h b/utf8.h
index 2f1365e6..fbb993f1 100644
--- a/utf8.h
+++ b/utf8.h
@@ -37,7 +37,7 @@ enum utf8_state { utf8_init, utf8_more1, utf8_more2, utf8_more3 };
 
 typedef struct utf8_decoder {
   enum utf8_state state;
-  wchar_t wch;
+  wchar_t wch, wch_min; /* wch: character being assembled; wch_min: lowest value accepted for the current sequence length (lower means overlong) */
   int head, tail, back;
   int buf[8];
 } utf8_decoder_t;