diff options
author | Kaz Kylheku <kaz@kylheku.com> | 2012-02-02 22:54:17 -0800 |
---|---|---|
committer | Kaz Kylheku <kaz@kylheku.com> | 2012-02-02 22:54:17 -0800 |
commit | a7d3edcff56ee0faa8355ceaea7bc23c2f2e2aa7 (patch) | |
tree | 1696c8ed5bd8d5da2bda03f9a0a78507139bdfa0 /utf8.c | |
parent | 905b074cea7303553777e169529efc8aeccdc35a (diff) | |
download | txr-a7d3edcff56ee0faa8355ceaea7bc23c2f2e2aa7.tar.gz txr-a7d3edcff56ee0faa8355ceaea7bc23c2f2e2aa7.tar.bz2 txr-a7d3edcff56ee0faa8355ceaea7bc23c2f2e2aa7.zip |
* utf8.c (utf8_from_uc, utf8_decode): Use upper case for hex constants.
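If a well-formed byte sequence decodes to a code point in the U+DCxx range,
treat that sequence as invalid. We use U+DCxx internally to represent an
undecodable input byte xx, so this way we can't be fooled by an attacker
into accepting some U+DCxx which on output we will then convert to byte xx.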
(utf8_to_uc): Use upper case for hex constants.
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 54 |
1 files changed, 29 insertions, 25 deletions
@@ -61,7 +61,7 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src) break; src = backtrack; if (wdst) - *wdst++ = 0xdc00 | *src; + *wdst++ = 0xDC00 | *src; nchar++; state = utf8_init; continue; @@ -73,15 +73,15 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src) if (wdst) *wdst++ = ch; nchar++; - } else if (ch >= 0xc2 && ch <= 0xe0) { + } else if (ch >= 0xC2 && ch <= 0xE0) { state = utf8_more1; - wch = (ch & 0x1f); + wch = (ch & 0x1F); wch_min = 0x80; - } else if (ch >= 0xe0 && ch <= 0xef) { + } else if (ch >= 0xE0 && ch <= 0xEF) { state = utf8_more2; - wch = (ch & 0xf); + wch = (ch & 0xF); wch_min = 0x800; - } else if (ch >= 0xf0 && ch < 0xf5) { + } else if (ch >= 0xF0 && ch < 0xF5) { #ifdef FULL_UNICODE state = utf8_more3; wch = (ch & 0x7); @@ -91,7 +91,7 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src) #endif } else { if (wdst) - *wdst++ = 0xdc00 | ch; + *wdst++ = 0xDC00 | ch; nchar++; } backtrack = src; @@ -99,15 +99,17 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src) case utf8_more1: case utf8_more2: case utf8_more3: - if (ch >= 0x80 && ch < 0xc0) { + if (ch >= 0x80 && ch < 0xC0) { wch <<= 6; - wch |= (ch & 0x3f); + wch |= (ch & 0x3F); state = (enum utf8_state) (state - 1); if (state == utf8_init) { - if (wch < wch_min) { + if (wch < wch_min && + (wch <= 0xFFFF && (wch & 0xFF00) == 0xDC00)) + { src = backtrack; if (wdst) - *wdst++ = 0xdc00 | *src; + *wdst++ = 0xDC00 | *src; } else { if (wdst) *wdst++ = wch; @@ -117,7 +119,7 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src) } else { src = backtrack; if (wdst) - *wdst++ = 0xdc00 | *src; + *wdst++ = 0xDC00 | *src; nchar++; state = utf8_init; } @@ -155,7 +157,7 @@ size_t utf8_to_uc(unsigned char *dst, const wchar_t *wsrc) if ((wch & 0xFF00) == 0xDC00) { nbyte += 1; if (dst) - *dst++ = (wch & 0xff); + *dst++ = (wch & 0xFF); } else { nbyte += 3; if (dst) { @@ -267,7 +269,7 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx) 
if (ud->state == utf8_init) { return WEOF; } else { - wchar_t wch = 0xdc00 | ud->buf[ud->back]; + wchar_t wch = 0xDC00 | ud->buf[ud->back]; ud->tail = ud->back = (ud->back + 1) % 8; ud->state = utf8_init; return wch; @@ -279,15 +281,15 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx) if (ch < 0x80) { ud->back = ud->tail; return ch; - } else if (ch >= 0xc0 && ch <= 0xe0) { + } else if (ch >= 0xC0 && ch <= 0xE0) { ud->state = utf8_more1; - ud->wch = (ch & 0x1f); + ud->wch = (ch & 0x1F); ud->wch_min = 0x80; - } else if (ch >= 0xe0 && ch <= 0xef) { + } else if (ch >= 0xE0 && ch <= 0xEF) { ud->state = utf8_more2; - ud->wch = (ch & 0xf); + ud->wch = (ch & 0xF); ud->wch_min = 0x800; - } else if (ch >= 0xf0 && ch < 0xf5) { + } else if (ch >= 0xF0 && ch < 0xF5) { #ifdef FULL_UNICODE ud->state = utf8_more3; ud->wch = (ch & 0x7); @@ -297,19 +299,21 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx) #endif } else { ud->back = ud->tail; - return 0xdc00 | ch; + return 0xDC00 | ch; } break; case utf8_more1: case utf8_more2: case utf8_more3: - if (ch >= 0x80 && ch < 0xc0) { + if (ch >= 0x80 && ch < 0xC0) { ud->wch <<= 6; - ud->wch |= (ch & 0x3f); + ud->wch |= (ch & 0x3F); ud->state = (enum utf8_state) (ud->state - 1); if (ud->state == utf8_init) { - if (ud->wch < ud->wch_min) { - wchar_t wch = 0xdc00 | ud->buf[ud->back]; + if (ud->wch < ud->wch_min || + (ud->wch <= 0xFFFF && (ud->wch & 0xFF00) == 0xDC00)) + { + wchar_t wch = 0xDC00 | ud->buf[ud->back]; ud->tail = ud->back = (ud->back + 1) % 8; return wch; } else { @@ -318,7 +322,7 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx) } } } else { - wchar_t wch = 0xdc00 | ud->buf[ud->back]; + wchar_t wch = 0xDC00 | ud->buf[ud->back]; ud->tail = ud->back = (ud->back + 1) % 8; ud->state = utf8_init; return wch; |