From 8367c03ef07473cff4f1b6f0645e1ce9ae17c94c Mon Sep 17 00:00:00 2001 From: Kaz Kylheku Date: Thu, 12 Nov 2009 22:34:25 -0800 Subject: Fixed broken utf8_from. Added utf8_encode, utf8_decoder_init, utf8_decode. --- ChangeLog | 14 ++++++ utf8.c | 144 +++++++++++++++++++++++++++++++++++++++++++++++++++++--------- utf8.h | 14 ++++++ 3 files changed, 151 insertions(+), 21 deletions(-) diff --git a/ChangeLog b/ChangeLog index 82ee1edf..d729a960 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,17 @@ +2009-11-12 Kaz Kylheku + + * utf8.c (utf8_from): Fix total breakage. + Was writing out incomplete wide characters on internal + state transtions while traversing a single multi-byte character. + Also, improved handling of bad bytes close to EOF: if EOF + occurs in a multi-byte character, it will backtrack, + and skip one bad byte, etc. + (utf8_encode, utf8_decoder_init, utf8_decode): New functions. + + * utf8.h (enum utf8_state): New enum. + (struct utf8_decoder, utf8_decoder_t): New struct. + (utf8_encode, utf8_decoder_init, utf8_decode): Declared. + 2009-11-12 Kaz Kylheku Documenting extended characters in man page. diff --git a/utf8.c b/utf8.c index a3a23e8e..ca2e9016 100644 --- a/utf8.c +++ b/utf8.c @@ -27,31 +27,45 @@ #include #include #include +#include #include "lib.h" +#include "utf8.h" size_t utf8_from(wchar_t *wdst, const unsigned char *src) { size_t nchar = 1; - enum { init, more1, more2, more3 } state; + enum utf8_state state = utf8_init; const char *backtrack = 0; - int ch; wchar_t wch = 0; - for (state = init; (ch = *src); src++) { + for (;;) { + int ch = *src++; + + if (ch == 0) { + if (state == utf8_init) + break; + src = backtrack; + if (wdst) + *wdst++ = 0xdc00 | *src; + nchar++; + state = utf8_init; + continue; + } + switch (state) { - case init: + case utf8_init: if (ch < 0x80) { if (wdst) *wdst++ = ch; nchar++; } else if (ch >= 0xc2 && ch <= 0xe0) { - state = more1; + state = utf8_more1; wch = (ch & 0x1f); } else if (ch >= 0xe0 && ch <= 0xef) { - state = more2; + state = utf8_more2; wch = (ch & 0xf); } else if (ch >= 0xf0 && ch < 0xf5) { - state = more3; + state = utf8_more3; wch = (ch & 0x7); } else { if (wdst) @@ -60,33 +74,28 @@ size_t utf8_from(wchar_t *wdst, const unsigned char *src) } backtrack = src; break; - case more1: - case more2: - case more3: + case utf8_more1: + case utf8_more2: + case utf8_more3: if (ch >= 0x80 && ch < 0xc0) { wch <<= 6; wch |= (ch & 0x3f); - if (wdst) - *wdst++ = wch; - nchar++; - state--; + if (--state == utf8_init) { + if (wdst) + *wdst++ = wch; + nchar++; + } } else { src = backtrack; if (wdst) *wdst++ = 0xdc00 | *src; nchar++; - state = init; + state = utf8_init; } break; } } - if (state != init) { - if (wdst) - *wdst++ = 0xdc00 | *backtrack; - nchar++; - } - if (wdst) *wdst++ = 0; return nchar; @@ -147,6 +156,99 @@ unsigned char *utf8_dup_to(const wchar_t *wstr) return str; } +int utf8_encode(wchar_t wch, int (*put)(int ch, void *ctx), void *ctx) +{ + if (wch < 0x80) { + return put(wch, ctx); + } else if (wch < 0x800) { + return put(0xC0 | (wch >> 6), ctx) && + put(0x80 | (wch & 0x3F), ctx); + } else if (wch < 0x10000) { + return put(0xE0 | (wch >> 12), ctx) && + put(0x80 | ((wch >> 6) & 0x3F), ctx) && + put(0x80 | (wch & 0x3F), ctx); + } else if (wch < 0x110000) { + return put(0xF0 | (wch >> 18), ctx) && + put(0x80 | ((wch >> 12) & 0x3F), ctx) && + put(0x80 | ((wch >> 6) & 0x3F), ctx) && + put(0x80 | (wch & 0x3F), ctx); + } + + return 0; +} + +void utf8_decoder_init(utf8_decoder_t *ud) +{ + ud->state = utf8_init; + ud->wch = 0; + ud->head = ud->tail = ud->back = 0; +} + +wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(void *ctx), void *ctx) +{ + for (;;) { + int ch; + + if (ud->tail != ud->head) { + ch = ud->buf[ud->tail]; + ud->tail = (ud->tail + 1) % 8; + } else { + ch = get(ctx); + ud->buf[ud->head] = ch; + ud->head = ud->tail = (ud->head + 1) % 8; + } + + if (ch == EOF) { + if (ud->state == utf8_init) { + return WEOF; + } else { + wchar_t wch = 0xdc00 | ud->buf[ud->back]; + ud->tail = ud->back = (ud->back + 1) % 8; + ud->state = utf8_init; + return wch; + } + } + + switch (ud->state) { + case utf8_init: + if (ch < 0x80) { + ud->back = ud->tail; + return ch; + } else if (ch >= 0xc2 && ch <= 0xe0) { + ud->state = utf8_more1; + ud->wch = (ch & 0x1f); + } else if (ch >= 0xe0 && ch <= 0xef) { + ud->state = utf8_more2; + ud->wch = (ch & 0xf); + } else if (ch >= 0xf0 && ch < 0xf5) { + ud->state = utf8_more3; + ud->wch = (ch & 0x7); + } else { + ud->back = ud->tail; + return 0xdc00 | ch; + } + break; + case utf8_more1: + case utf8_more2: + case utf8_more3: + if (ch >= 0x80 && ch < 0xc0) { + ud->wch <<= 6; + ud->wch |= (ch & 0x3f); + if (--ud->state == utf8_init) { + ud->back = ud->tail; + return ud->wch; + } + } else { + wchar_t wch = 0xdc00 | ud->buf[ud->back]; + ud->tail = ud->back = (ud->back + 1) % 8; + ud->state = utf8_init; + return wch; + } + break; + } + } +} + FILE *w_fopen(const wchar_t *wname, const wchar_t *wmode) { char *name = (char *) utf8_dup_to(wname); diff --git a/utf8.h b/utf8.h index 28e67fe2..542a84fa 100644 --- a/utf8.h +++ b/utf8.h @@ -28,5 +28,19 @@ size_t utf8_from(wchar_t *, const unsigned char *); size_t utf8_to(unsigned char *, const wchar_t *); wchar_t *utf8_dup_from(const unsigned char *); unsigned char *utf8_dup_to(const wchar_t *); + +enum utf8_state { utf8_init, utf8_more1, utf8_more2, utf8_more3 }; + +typedef struct utf8_decoder { + enum utf8_state state; + wchar_t wch; + int head, tail, back; + int buf[8]; +} utf8_decoder_t; + +int utf8_encode(wchar_t, int (*put)(int ch, void *ctx), void *ctx); +void utf8_decoder_init(utf8_decoder_t *); +wint_t utf8_decode(utf8_decoder_t *,int (*get)(void *ctx), void *ctx); + FILE *w_fopen(const wchar_t *, const wchar_t *); FILE *w_popen(const wchar_t *, const wchar_t *); -- cgit v1.2.3