summaryrefslogtreecommitdiffstats
path: root/utf8.c
diff options
context:
space:
mode:
authorKaz Kylheku <kaz@kylheku.com>2012-02-02 16:35:32 -0800
committerKaz Kylheku <kaz@kylheku.com>2012-02-02 16:35:32 -0800
commit97a34f6e5b04d4ce2eb3ee63f42d1375f4939de3 (patch)
tree4d2ed48b418f06c47a73a67a82895fcabef62ef8 /utf8.c
parentc4dd3aaa204cd70e03d4cec3531c64be9a24f649 (diff)
downloadtxr-97a34f6e5b04d4ce2eb3ee63f42d1375f4939de3.tar.gz
txr-97a34f6e5b04d4ce2eb3ee63f42d1375f4939de3.tar.bz2
txr-97a34f6e5b04d4ce2eb3ee63f42d1375f4939de3.zip
* utf8.c (utf8_from_uc, utf8_decode): Impose a minimum value on the
decoded character based on which UTF-8 case it is from. This rejects overlong forms.
* utf8.h (struct utf8_decoder): New member, wch_min.
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c32
1 files changed, 25 insertions, 7 deletions
diff --git a/utf8.c b/utf8.c
index f2821f72..1ca8f7b5 100644
--- a/utf8.c
+++ b/utf8.c
@@ -51,7 +51,7 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
size_t nchar = 1;
enum utf8_state state = utf8_init;
const unsigned char *backtrack = 0;
- wchar_t wch = 0;
+ wchar_t wch = 0, wch_min = 0;
for (;;) {
int ch = *src++;
@@ -76,13 +76,16 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
} else if (ch >= 0xc2 && ch <= 0xe0) {
state = utf8_more1;
wch = (ch & 0x1f);
+ wch_min = 0x80;
} else if (ch >= 0xe0 && ch <= 0xef) {
state = utf8_more2;
wch = (ch & 0xf);
+ wch_min = 0x800;
} else if (ch >= 0xf0 && ch < 0xf5) {
#ifdef FULL_UNICODE
state = utf8_more3;
wch = (ch & 0x7);
+ wch_min = 0x10000;
#else
conversion_error();
#endif
@@ -101,9 +104,15 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
wch |= (ch & 0x3f);
state = (enum utf8_state) (state - 1);
if (state == utf8_init) {
- if (wdst)
- *wdst++ = wch;
- nchar++;
+ if (wch < wch_min) {
+ src = backtrack;
+ if (wdst)
+ *wdst++ = 0xdc00 | *src;
+ } else {
+ if (wdst)
+ *wdst++ = wch;
+ }
+ nchar++;
}
} else {
src = backtrack;
@@ -260,16 +269,19 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx)
if (ch < 0x80) {
ud->back = ud->tail;
return ch;
- } else if (ch >= 0xc2 && ch <= 0xe0) {
+ } else if (ch >= 0xc0 && ch <= 0xe0) {
ud->state = utf8_more1;
ud->wch = (ch & 0x1f);
+ ud->wch_min = 0x80;
} else if (ch >= 0xe0 && ch <= 0xef) {
ud->state = utf8_more2;
ud->wch = (ch & 0xf);
+ ud->wch_min = 0x800;
} else if (ch >= 0xf0 && ch < 0xf5) {
#ifdef FULL_UNICODE
ud->state = utf8_more3;
ud->wch = (ch & 0x7);
+ ud->wch_min = 0x100000;
#else
conversion_error();
#endif
@@ -286,8 +298,14 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx)
ud->wch |= (ch & 0x3f);
ud->state = (enum utf8_state) (ud->state - 1);
if (ud->state == utf8_init) {
- ud->back = ud->tail;
- return ud->wch;
+ if (ud->wch < ud->wch_min) {
+ wchar_t wch = 0xdc00 | ud->buf[ud->back];
+ ud->tail = ud->back = (ud->back + 1) % 8;
+ return wch;
+ } else {
+ ud->back = ud->tail;
+ return ud->wch;
+ }
}
} else {
wchar_t wch = 0xdc00 | ud->buf[ud->back];