/* Copyright 2011 * Kaz Kylheku * Vancouver, Canada * All rights reserved. * * BSD License: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. */ #include #include #include #include #include #include "config.h" #include "lib.h" #include "unwind.h" #include "utf8.h" #if WCHAR_MAX > 65535 #define FULL_UNICODE #endif #ifndef FULL_UNICODE static void conversion_error(void) { uw_throw(range_error_s, lit("encountered utf-8 character that needs full unicode support")); } #endif size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src) { size_t nchar = 1; enum utf8_state state = utf8_init; const unsigned char *backtrack = 0; wchar_t wch = 0; for (;;) { int ch = *src++; if (ch == 0) { if (state == utf8_init) break; src = backtrack; if (wdst) *wdst++ = 0xdc00 | *src; nchar++; state = utf8_init; continue; } switch (state) { case utf8_init: if (ch < 0x80) { if (wdst) *wdst++ = ch; nchar++; } else if (ch >= 0xc2 && ch <= 0xe0) { state = utf8_more1; wch = (ch & 0x1f); } else if (ch >= 0xe0 && ch <= 0xef) { state = utf8_more2; wch = (ch & 0xf); } else if (ch >= 0xf0 && ch < 0xf5) { #ifdef FULL_UNICODE state = utf8_more3; wch = (ch & 0x7); #else conversion_error(); #endif } else { if (wdst) *wdst++ = 0xdc00 | ch; nchar++; } backtrack = src; break; case utf8_more1: case utf8_more2: case utf8_more3: if (ch >= 0x80 && ch < 0xc0) { wch <<= 6; wch |= (ch & 0x3f); state = (enum utf8_state) (state - 1); if (state == utf8_init) { if (wdst) *wdst++ = wch; nchar++; } } else { src = backtrack; if (wdst) *wdst++ = 0xdc00 | *src; nchar++; state = utf8_init; } break; } } if (wdst) *wdst++ = 0; return nchar; } size_t utf8_from(wchar_t *wdst, const char *src) { return utf8_from_uc(wdst, (const unsigned char *) src); } size_t utf8_to_uc(unsigned char *dst, const wchar_t *wsrc) { size_t nbyte = 1; wchar_t wch; while ((wch = *wsrc++)) { if (wch < 0x80) { nbyte += 1; if (dst) *dst++ = wch; } else if (wch < 0x800) { nbyte += 2; if (dst) { *dst++ = 0xC0 | (wch >> 6); *dst++ = 0x80 | (wch & 0x3F); } } else if (wch < 0x10000) { nbyte += 3; if (dst) { *dst++ = 0xE0 | (wch >> 12); *dst++ = 0x80 | ((wch >> 6) & 0x3F); *dst++ = 0x80 | (wch & 0x3F); } } else if (wch < 0x110000) { nbyte += 4; if (dst) { *dst++ = 0xF0 | (wch >> 18); *dst++ = 0x80 | ((wch >> 12) & 0x3F); *dst++ = 0x80 | ((wch >> 6) & 0x3F); *dst++ = 0x80 | (wch & 0x3F); } } } if (dst) *dst++ = 0; return nbyte; } size_t utf8_to(char *dst, const wchar_t *wsrc) { return utf8_to_uc((unsigned char *) dst, wsrc); } wchar_t *utf8_dup_from_uc(const unsigned char *str) { size_t nchar = utf8_from_uc(0, str); wchar_t *wstr = (wchar_t *) chk_malloc(nchar * sizeof *wstr); utf8_from_uc(wstr, str); return wstr; } wchar_t *utf8_dup_from(const char *str) { size_t nchar = utf8_from(0, str); wchar_t *wstr = (wchar_t *) chk_malloc(nchar * sizeof *wstr); utf8_from(wstr, str); return wstr; } unsigned char *utf8_dup_to_uc(const wchar_t *wstr) { size_t nbyte = utf8_to_uc(0, wstr); unsigned char *str = chk_malloc(nbyte); utf8_to_uc(str, wstr); return str; } char *utf8_dup_to(const wchar_t *wstr) { size_t nbyte = utf8_to(0, wstr); char *str = (char *) chk_malloc(nbyte); utf8_to(str, wstr); return str; } int utf8_encode(wchar_t wch, int (*put)(int ch, mem_t *ctx), mem_t *ctx) { if (wch < 0x80) { return put(wch, ctx); } else if (wch < 0x800) { return put(0xC0 | (wch >> 6), ctx) && put(0x80 | (wch & 0x3F), ctx); } else if (wch < 0x10000) { return put(0xE0 | (wch >> 12), ctx) && put(0x80 | ((wch >> 6) & 0x3F), ctx) && put(0x80 | (wch & 0x3F), ctx); } else if (wch < 0x110000) { return put(0xF0 | (wch >> 18), ctx) && put(0x80 | ((wch >> 12) & 0x3F), ctx) && put(0x80 | ((wch >> 6) & 0x3F), ctx) && put(0x80 | (wch & 0x3F), ctx); } return 0; } void utf8_decoder_init(utf8_decoder_t *ud) { ud->state = utf8_init; ud->wch = 0; ud->head = ud->tail = ud->back = 0; } wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx) { for (;;) { int ch; if (ud->tail != ud->head) { ch = ud->buf[ud->tail]; ud->tail = (ud->tail + 1) % 8; } else { ch = get(ctx); ud->buf[ud->head] = ch; ud->head = ud->tail = (ud->head + 1) % 8; } if (ch == EOF) { if (ud->state == utf8_init) { return WEOF; } else { wchar_t wch = 0xdc00 | ud->buf[ud->back]; ud->tail = ud->back = (ud->back + 1) % 8; ud->state = utf8_init; return wch; } } switch (ud->state) { case utf8_init: if (ch < 0x80) { ud->back = ud->tail; return ch; } else if (ch >= 0xc2 && ch <= 0xe0) { ud->state = utf8_more1; ud->wch = (ch & 0x1f); } else if (ch >= 0xe0 && ch <= 0xef) { ud->state = utf8_more2; ud->wch = (ch & 0xf); } else if (ch >= 0xf0 && ch < 0xf5) { #ifdef FULL_UNICODE ud->state = utf8_more3; ud->wch = (ch & 0x7); #else conversion_error(); #endif } else { ud->back = ud->tail; return 0xdc00 | ch; } break; case utf8_more1: case utf8_more2: case utf8_more3: if (ch >= 0x80 && ch < 0xc0) { ud->wch <<= 6; ud->wch |= (ch & 0x3f); ud->state = (enum utf8_state) (ud->state - 1); if (ud->state == utf8_init) { ud->back = ud->tail; return ud->wch; } } else { wchar_t wch = 0xdc00 | ud->buf[ud->back]; ud->tail = ud->back = (ud->back + 1) % 8; ud->state = utf8_init; return wch; } break; } } } FILE *w_fopen(const wchar_t *wname, const wchar_t *wmode) { char *name = (char *) utf8_dup_to(wname); char *mode = (char *) utf8_dup_to(wmode); FILE *f = fopen(name, mode); free(name); free(mode); return f; } FILE *w_popen(const wchar_t *wcmd, const wchar_t *wmode) { char *cmd = (char *) utf8_dup_to(wcmd); char *mode = (char *) utf8_dup_to(wmode); FILE *f = popen(cmd, mode); free(cmd); free(mode); return f; }