diff options
-rw-r--r-- | ChangeLog | 12 | ||||
-rw-r--r-- | utf8.c | 32 | ||||
-rw-r--r-- | utf8.h | 2 |
3 files changed, 38 insertions, 8 deletions
@@ -1,5 +1,17 @@ 2012-02-02 Kaz Kylheku <kaz@kylheku.com> + * utf8.c (utf8_from_uc, utf8_decode): Impose a minimum value on the + decoded character based on which UTF-8 case it is from. This rejects + overlong forms. + + * utf8.h (struct utf8_decoder): New member, wch_min. + +2012-02-02 Kaz Kylheku <kaz@kylheku.com> + + * txr.1: Document that TXR accepts UTF-8 overlong forms. + +2012-02-02 Kaz Kylheku <kaz@kylheku.com> + * txr.vim: Move error match before other cases and simplify. Comments are colorized properly again. @@ -51,7 +51,7 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src) size_t nchar = 1; enum utf8_state state = utf8_init; const unsigned char *backtrack = 0; - wchar_t wch = 0; + wchar_t wch = 0, wch_min = 0; for (;;) { int ch = *src++; @@ -76,13 +76,16 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src) } else if (ch >= 0xc2 && ch <= 0xe0) { state = utf8_more1; wch = (ch & 0x1f); + wch_min = 0x80; } else if (ch >= 0xe0 && ch <= 0xef) { state = utf8_more2; wch = (ch & 0xf); + wch_min = 0x800; } else if (ch >= 0xf0 && ch < 0xf5) { #ifdef FULL_UNICODE state = utf8_more3; wch = (ch & 0x7); + wch_min = 0x10000; #else conversion_error(); #endif @@ -101,9 +104,15 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src) wch |= (ch & 0x3f); state = (enum utf8_state) (state - 1); if (state == utf8_init) { - if (wdst) - *wdst++ = wch; - nchar++; + if (wch < wch_min) { + src = backtrack; + if (wdst) + *wdst++ = 0xdc00 | *src; + } else { + if (wdst) + *wdst++ = wch; + } + nchar++; } } else { src = backtrack; @@ -260,16 +269,19 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx) if (ch < 0x80) { ud->back = ud->tail; return ch; - } else if (ch >= 0xc2 && ch <= 0xe0) { + } else if (ch >= 0xc0 && ch <= 0xe0) { ud->state = utf8_more1; ud->wch = (ch & 0x1f); + ud->wch_min = 0x80; } else if (ch >= 0xe0 && ch <= 0xef) { ud->state = utf8_more2; ud->wch = (ch & 0xf); + ud->wch_min = 0x800; } else if (ch >= 0xf0 && ch < 
0xf5) { #ifdef FULL_UNICODE ud->state = utf8_more3; ud->wch = (ch & 0x7); + ud->wch_min = 0x100000; #else conversion_error(); #endif @@ -286,8 +298,14 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx) ud->wch |= (ch & 0x3f); ud->state = (enum utf8_state) (ud->state - 1); if (ud->state == utf8_init) { - ud->back = ud->tail; - return ud->wch; + if (ud->wch < ud->wch_min) { + wchar_t wch = 0xdc00 | ud->buf[ud->back]; + ud->tail = ud->back = (ud->back + 1) % 8; + return wch; + } else { + ud->back = ud->tail; + return ud->wch; + } } } else { wchar_t wch = 0xdc00 | ud->buf[ud->back]; @@ -37,7 +37,7 @@ enum utf8_state { utf8_init, utf8_more1, utf8_more2, utf8_more3 }; typedef struct utf8_decoder { enum utf8_state state; - wchar_t wch; + wchar_t wch, wch_min; int head, tail, back; int buf[8]; } utf8_decoder_t; |