diff options
author | Kaz Kylheku <kaz@kylheku.com> | 2009-11-11 08:54:21 -0800 |
---|---|---|
committer | Kaz Kylheku <kaz@kylheku.com> | 2009-11-11 08:54:21 -0800 |
commit | d59d8950ec58702821ec618b92dfb2490ae0bf31 (patch) | |
tree | e27e2914d563171ad56c2f7ae30c7c49343df06c | |
parent | 2f62f352f603b837a5cf032c257531052530c410 (diff) | |
download | txr-d59d8950ec58702821ec618b92dfb2490ae0bf31.tar.gz txr-d59d8950ec58702821ec618b92dfb2490ae0bf31.tar.bz2 txr-d59d8950ec58702821ec618b92dfb2490ae0bf31.zip |
Big conversion to wide characters and UTF-8 support.
This is incomplete. There are too many dependencies on
wide character support from the C stream I/O library,
and implicit use of some encoding which may not be UTF-8.
The regex code does not handle wide characters properly.
Character type is still int in some places, rather than wchar_t.
Test suite passes though.
-rw-r--r-- | ChangeLog | 68 | ||||
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | gc.c | 2 | ||||
-rw-r--r-- | hash.c | 2 | ||||
-rw-r--r-- | lib.c | 328 | ||||
-rw-r--r-- | lib.h | 17 | ||||
-rw-r--r-- | match.c | 233 | ||||
-rw-r--r-- | parser.h | 2 | ||||
-rw-r--r-- | parser.l | 84 | ||||
-rw-r--r-- | parser.y | 5 | ||||
-rw-r--r-- | regex.c | 6 | ||||
-rw-r--r-- | regex.h | 2 | ||||
-rw-r--r-- | stream.c | 128 | ||||
-rw-r--r-- | stream.h | 6 | ||||
-rw-r--r-- | txr.c | 67 | ||||
-rw-r--r-- | txr.h | 4 | ||||
-rw-r--r-- | unwind.c | 20 | ||||
-rw-r--r-- | unwind.h | 6 | ||||
-rw-r--r-- | utf8.c | 168 | ||||
-rw-r--r-- | utf8.h | 32 |
20 files changed, 750 insertions, 432 deletions
@@ -1,3 +1,71 @@ +2009-11-11 Kaz Kylheku <kkylheku@gmail.com> + + Big conversion to wide characters and UTF-8 support. + This is incomplete. There are too many dependencies on + wide character support from the C stream I/O library. + The regex code does not handle wide characters properly. + Character type is still int in some places, rather than wchar_t. + Test suite passes though. + + * hash.c (hash_str): Converted to wchar_t. + + * lib.c (progname, type_check, type_check2, type_check3, + car, cdr, car_l, cdr_l, equal, chk_strdup, string_own, + string, mkstring, mkustring, init_str, length_str, + c_str, search_str, sub_str, cat_str, split_str, trim_str, + chrp, apply, lazy_str, lazy_str_get_trailing_list, + cobj, obj_init, obj_print, obj_pprint, init): Converted to wchar_t. + (vector): Cast of chk_malloc return value added. + (string_utf8): New function. + + * lib.h (struct string): Member str changed to wchar_t *. + (progname, chk_strdup, string_own, string, init_str, + c_str, init): Declarations updated. + (string_utf8): Declared. + + * match.c (debugf, debuglf, sem_error, file_err, dump_shell_string, + dump_var, dump_bindings, dest_bind, match_line, do_output_line, + do_output, match_files): Converted to wchar_t. + + * parser.h (spec_file): Declaration updated. + + * parser.l (yy_errorf, char_esc, num_esc): Converted to wchar_t. + (ASC, ASCN, U, U2, U3, U4, UANY, UNANN, UONLY): New named + regexes, used for lexing utf-8. + (grammar): Converted to wchar_t and utf-8 handling. + + * parser.y (%union/yystype): lexeme member changed to wchar_t *, + chr member changed to wchar_t. + + * regex.c (nfa_run): Input string is wchar_t *. + (search_regex): String from haystack is wchar_t *. + + * regex.h (nfa_run): Declaration updated. + + * stream.c (struct strm_ops, common_vformat, stdio_stream_print, + stdio_maybe_read_error, stdio_maybe_write_error, stdio_put_string, + stdio_put_char, snarf_line, stdio_get_line, stdio_close, pipe_close, + struct string_output, string_out_put_string, string_out_put_char, + string_out_vcformat, dir_get_line, make_string_output_stream, + get_string_from-stream, make_dir_stream, get_line, get_char, + vformat, vcformat, format, cformat, put_string, put_cstring, + put_char, put_cchar, stream_init): Converted to wchar_t. + + * stream.h (vformat, format, put_cstring): Declarations updated. + + * txr.c (version, progname, spec_file, oom_realloc_handler, + help, hint, remove_hash_bang_line, main, txr_main): Converted + to wchar_t. + + * txr.h (version, progname): Declarations updated. + + * unwind.c (uw_throw, uw_throwf, uw_errorf, type_mismatch, + uw_register_subtype): Converted to wchar_t. + + * unwind.h (uw_throwf, uw_errorf, type_mismatch): Declarations updated. + + * utf8.c, utf8.h: New files. + 2009-11-10 Kaz Kylheku <kkylheku@gmail.com> hash.c (hash_grow): Rewritten to avoid resizing the vector @@ -31,7 +31,7 @@ CFLAGS := -I$(top_srcdir) $(LANG_FLAGS) $(DIAG_FLAGS) $(OPT_FLAGS) $(DBG_FLAGS) OBJS := txr.o lex.yy.o y.tab.o match.o lib.o regex.o gc.o unwind.o stream.o -OBJS += hash.o +OBJS += hash.o utf8.o PROG := ./txr @@ -321,7 +321,7 @@ static void sweep(void) continue; if (0 && dbg) { - fprintf(stderr, "%s: finalizing: ", progname); + fprintf(stderr, "%ls: finalizing: ", progname); obj_print(block, std_error); putc('\n', stderr); } @@ -64,7 +64,7 @@ static struct hash *reachable_weak_hashes; * We don't reduce the final result modulo a small prime, but leave it * as it is; let the hashing routines do their own reduction. */ -static long hash_c_str(const char *str) +static long hash_c_str(const wchar_t *str) { unsigned long h = 0; while (*str) { @@ -27,16 +27,18 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <ctype.h> +#include <wctype.h> #include <assert.h> #include <limits.h> #include <stdarg.h> #include <dirent.h> #include <setjmp.h> +#include <wchar.h> #include "lib.h" #include "gc.h" #include "unwind.h" #include "stream.h" +#include "utf8.h" #define max(a, b) ((a) > (b) ? (a) : (b)) #define min(a, b) ((a) < (b) ? (a) : (b)) @@ -63,7 +65,7 @@ obj_t *null_list; obj_t *identity_f; obj_t *equal_f; -const char *progname; +const wchar_t *progname; obj_t *prog_string; void *(*oom_realloc)(void *, size_t); @@ -125,14 +127,14 @@ obj_t *typeof(obj_t *obj) obj_t *type_check(obj_t *obj, int type) { if (!is_ptr(obj) || obj->t.type != type) - type_mismatch("~s is not of type ~s", obj, code2type(type), nao); + type_mismatch(L"~s is not of type ~s", obj, code2type(type), nao); return t; } obj_t *type_check2(obj_t *obj, int t1, int t2) { if (!is_ptr(obj) || (obj->t.type != t1 && obj->t.type != t2)) - type_mismatch("~s is not of type ~s or ~s", obj, + type_mismatch(L"~s is not of type ~s or ~s", obj, code2type(t1), code2type(t2), nao); return t; } @@ -141,7 +143,7 @@ obj_t *type_check3(obj_t *obj, int t1, int t2, int t3) { if (!is_ptr(obj) || (obj->t.type != t1 && obj->t.type != t2 && obj->t.type != t3)) - type_mismatch("~s is not of type ~s, ~s nor ~s", obj, + type_mismatch(L"~s is not of type ~s, ~s nor ~s", obj, code2type(t1), code2type(t2), code2type(t3), nao); return t; } @@ -162,7 +164,7 @@ obj_t *car(obj_t *cons) return cons->lc.car; } default: - type_mismatch("~s is not a cons", cons, nao); + type_mismatch(L"~s is not a cons", cons, nao); } } @@ -182,7 +184,7 @@ obj_t *cdr(obj_t *cons) return cons->lc.cdr; } default: - type_mismatch("~s is not a cons", cons, nao); + type_mismatch(L"~s is not a cons", cons, nao); } } @@ -195,7 +197,7 @@ obj_t **car_l(obj_t *cons) funcall1(cons->lc.func, cons); return &cons->lc.car; default: - type_mismatch("~s is not a cons", cons, nao); + type_mismatch(L"~s is not a cons", cons, nao); } } @@ -208,7 +210,7 @@ obj_t **cdr_l(obj_t *cons) funcall1(cons->lc.func, cons); return &cons->lc.cdr; default: - type_mismatch("~s is not a cons", cons, nao); + type_mismatch(L"~s is not a cons", cons, nao); } } @@ -429,7 +431,7 @@ obj_t *equal(obj_t *left, obj_t *right) return nil; case STR: if (type(right) == STR) - return strcmp(left->st.str, right->st.str) == 0 ? t : nil; + return wcscmp(left->st.str, right->st.str) == 0 ? t : nil; if (type(right) == LSTR) { lazy_str_force(right); return equal(left, right->ls.prefix); @@ -508,11 +510,11 @@ void *chk_realloc(void *old, size_t size) return newptr; } -void *chk_strdup(const char *str) +void *chk_strdup(const wchar_t *str) { - size_t size = strlen(str) + 1; - char *copy = chk_malloc(size); - memcpy(copy, str, size); + size_t nchar = wcslen(str) + 1; + wchar_t *copy = (wchar_t *) chk_malloc(nchar * sizeof *copy); + wmemcpy(copy, str, nchar); return copy; } @@ -603,7 +605,7 @@ obj_t *num(long val) long c_num(obj_t *num) { if (!is_num(num)) - type_mismatch("~s is not a number", num, nao); + type_mismatch(L"~s is not a number", num, nao); return ((long) num) >> TAG_SHIFT; } @@ -681,7 +683,7 @@ obj_t *min2(obj_t *anum, obj_t *bnum) return c_num(anum) < c_num(bnum) ? anum : bnum; } -obj_t *string_own(char *str) +obj_t *string_own(wchar_t *str) { obj_t *obj = make_obj(); obj->st.type = STR; @@ -690,7 +692,7 @@ obj_t *string_own(char *str) return obj; } -obj_t *string(const char *str) +obj_t *string(const wchar_t *str) { obj_t *obj = make_obj(); obj->st.type = STR; @@ -699,27 +701,36 @@ obj_t *string(const char *str) return obj; } +obj_t *string_utf8(const unsigned char *str) +{ + obj_t *obj = make_obj(); + obj->st.type = STR; + obj->st.str = utf8_dup_from(str); + obj->st.len = nil; + return obj; +} + obj_t *mkstring(obj_t *len, obj_t *ch) { - char *str = chk_malloc(c_num(len) + 1); + size_t nchar = c_num(len) + 1; + wchar_t *str = (wchar_t *) chk_malloc(nchar * sizeof *str); obj_t *s = string_own(str); - memset(str, c_chr(ch), c_num(len)); - str[c_num(len)] = 0; + wmemset(str, c_chr(ch), nchar); s->st.len = len; return s; } obj_t *mkustring(obj_t *len) { - char *str = chk_malloc(c_num(len) + 1); + wchar_t *str = (wchar_t *) chk_malloc((c_num(len) + 1) * sizeof *str); obj_t *s = string_own(str); s->st.len = len; return s; } -obj_t *init_str(obj_t *str, const char *data) +obj_t *init_str(obj_t *str, const wchar_t *data) { - memcpy(str->st.str, data, c_num(str->st.len)); + wmemcpy(str->st.str, data, c_num(str->st.len) + 1); return str; } @@ -753,11 +764,11 @@ obj_t *length_str(obj_t *str) } if (!str->st.len) - str->st.len = num(strlen(str->st.str)); + str->st.len = num(wcslen(str->st.str)); return str->st.len; } -const char *c_str(obj_t *obj) +const wchar_t *c_str(obj_t *obj) { type_check3(obj, STR, SYM, LSTR); @@ -783,14 +794,14 @@ obj_t *search_str(obj_t *haystack, obj_t *needle, obj_t *start_num, obj_t *h_is_lazy = lazy_stringp(haystack); long start = c_num(start_num); long good = -1, pos = -1; - const char *n = c_str(needle), *h; + const wchar_t *n = c_str(needle), *h; if (!h_is_lazy) { do { - const char *f; + const wchar_t *f; h = c_str(haystack); - f = strstr(h + start, n); + f = wcsstr(h + start, n); if (f) pos = f - h; @@ -804,7 +815,7 @@ obj_t *search_str(obj_t *haystack, obj_t *needle, obj_t *start_num, lazy_str_force_upto(haystack, plus(num(start + 1), length_str(needle))); h = c_str(haystack->ls.prefix); - if (!strncmp(h + start, n, ln)) + if (!wcsncmp(h + start, n, ln)) good = start; } while (h[++start] && (from_end || good == -1)); } @@ -847,11 +858,12 @@ obj_t *sub_str(obj_t *str_in, obj_t *from, obj_t *to) if (ge(from, to)) { return null_string; } else { - size_t size = c_num(to) - c_num(from) + 1; - char *sub = chk_malloc(size); - const char *str = c_str(lazy_stringp(str_in) ? str_in->ls.prefix : str_in); - strncpy(sub, str + c_num(from), size); - sub[size-1] = 0; + size_t nchar = c_num(to) - c_num(from) + 1; + wchar_t *sub = (wchar_t *) chk_malloc(nchar * sizeof (wchar_t)); + const wchar_t *str = c_str(lazy_stringp(str_in) + ? str_in->ls.prefix : str_in); + wcsncpy(sub, str + c_num(from), nchar); + sub[nchar-1] = 0; return string_own(sub); } } @@ -860,7 +872,7 @@ obj_t *cat_str(obj_t *list, obj_t *sep) { long total = 0; obj_t *iter; - char *str, *ptr; + wchar_t *str, *ptr; long len_sep = sep ? c_num(length_str(sep)) : 0; for (iter = list; iter != nil; iter = cdr(iter)) { @@ -882,7 +894,7 @@ obj_t *cat_str(obj_t *list, obj_t *sep) return nil; } - str = chk_malloc(total + 1); + str = (wchar_t *) chk_malloc((total + 1) * sizeof *str); for (ptr = str, iter = list; iter != nil; iter = cdr(iter)) { obj_t *item = car(iter); @@ -891,14 +903,14 @@ obj_t *cat_str(obj_t *list, obj_t *sep) continue; if (stringp(item)) { len = c_num(length_str(item)); - memcpy(ptr, c_str(item), len); + wmemcpy(ptr, c_str(item), len); ptr += len; } else { *ptr++ = c_chr(item); } if (len_sep && cdr(iter)) { - memcpy(ptr, c_str(sep), len_sep); + wmemcpy(ptr, c_str(sep), len_sep); ptr += len_sep; } } @@ -909,12 +921,12 @@ obj_t *cat_str(obj_t *list, obj_t *sep) obj_t *split_str(obj_t *str, obj_t *sep) { - const char *cstr = c_str(str); - const char *csep = c_str(sep); + const wchar_t *cstr = c_str(str); + const wchar_t *csep = c_str(sep); list_collect_decl (out, iter); for (;;) { - size_t span = strcspn(cstr, csep); + size_t span = wcscspn(cstr, csep); obj_t *piece = mkustring(num(span)); init_str(piece, cstr); list_collect (iter, piece); @@ -929,21 +941,21 @@ obj_t *split_str(obj_t *str, obj_t *sep) obj_t *trim_str(obj_t *str) { - const char *start = c_str(str); - const char *end = start + c_num(length_str(str)); + const wchar_t *start = c_str(str); + const wchar_t *end = start + c_num(length_str(str)); - while (start[0] && isspace(start[0])) + while (start[0] && iswspace(start[0])) start++; - while (end > start && isspace(end[-1])) + while (end > start && iswspace(end[-1])) end--; if (end == start) { return null_string; } else { size_t len = end - start; - char *new = chk_malloc(len + 1); - memcpy(new, start, len); + wchar_t *new = (wchar_t *) chk_malloc((len + 1) * sizeof *new); + wmemcpy(new, start, len); new[len] = 0; return string_own(new); } @@ -951,7 +963,7 @@ obj_t *trim_str(obj_t *str) obj_t *string_lt(obj_t *astr, obj_t *bstr) { - int cmp = strcmp(c_str(astr), c_str(bstr)); + int cmp = wcscmp(c_str(astr), c_str(bstr)); return cmp == -1 ? t : nil; } @@ -969,7 +981,7 @@ obj_t *chrp(obj_t *chr) int c_chr(obj_t *chr) { if (!is_chr(chr)) - type_mismatch("~s is not a character", chr, nao); + type_mismatch(L"~s is not a character", chr, nao); return ((int) chr) >> TAG_SHIFT; } @@ -1141,7 +1153,7 @@ obj_t *apply(obj_t *fun, obj_t *arglist) type_check (fun, FUN); type_assert (listp(arglist), - ("apply arglist ~s is not a list", arglist, nao)); + (L"apply arglist ~s is not a list", arglist, nao)); *p++ = car(arglist); arglist = cdr(arglist); *p++ = car(arglist); arglist = cdr(arglist); @@ -1267,7 +1279,7 @@ obj_t *vector(obj_t *alloc) { long alloc_plus = c_num(alloc) + 2; obj_t *vec = make_obj(); - obj_t **v = chk_malloc(alloc_plus * sizeof *v); + obj_t **v = (obj_t **) chk_malloc(alloc_plus * sizeof *v); vec->v.type = VEC; vec->v.vec = v + 2; v[0] = alloc; @@ -1374,7 +1386,7 @@ obj_t *lazy_str(obj_t *lst, obj_t *term, obj_t *limit) obj_t *obj = make_obj(); obj->ls.type = LSTR; - term = or2(term, string("\n")); + term = or2(term, string(L"\n")); if (nullp(lst)) { obj->ls.prefix = null_string; @@ -1501,7 +1513,7 @@ obj_t *lazy_str_get_trailing_list(obj_t *lstr, obj_t *index) { obj_t *split_suffix = split_str(sub_str(lstr->ls.prefix, index, nil), - or2(car(lstr->ls.opts), string("\n"))); + or2(car(lstr->ls.opts), string(L"\n"))); return nappend2(split_suffix, lstr->ls.list); } @@ -1519,7 +1531,7 @@ obj_t *cobj(void *handle, obj_t *cls_sym, struct cobj_ops *ops) void cobj_print_op(obj_t *obj, obj_t *out) { - put_cstring(out, "#<"); + put_cstring(out, L"#<"); obj_print(obj->co.cls, out); cformat(out, ": %p>", obj->co.handle); } @@ -1707,77 +1719,77 @@ static void obj_init(void) &identity_f, &prog_string, (obj_t **) 0); - nil_string = string("nil"); - - null = intern(string("null")); - t = intern(string("t")); - cons_t = intern(string("cons")); - str_t = intern(string("str")); - chr_t = intern(string("chr")); - num_t = intern(string("num")); - sym_t = intern(string("sym")); - fun_t = intern(string("fun")); - vec_t = intern(string("vec")); - stream_t = intern(string("stream")); - hash_t = intern(string("hash")); - lcons_t = intern(string("lcons")); - lstr_t = intern(string("lstr")); - cobj_t = intern(string("cobj")); - var = intern(string("$var")); - regex = intern(string("$regex")); - set = intern(string("set")); - cset = intern(string("cset")); - wild = intern(string("wild")); - oneplus = intern(string("1+")); - zeroplus = intern(string("0+")); - optional = intern(string("?")); - compound = intern(string("compound")); - or = intern(string("or")); - quasi = intern(string("$quasi")); - skip = intern(string("skip")); - trailer = intern(string("trailer")); - block = intern(string("block")); - next = intern(string("next")); - freeform = intern(string("freeform")); - fail = intern(string("fail")); - accept = intern(string("accept")); - all = intern(string("all")); - some = intern(string("some")); - none = intern(string("none")); - maybe = intern(string("maybe")); - cases = intern(string("cases")); - collect = intern(string("collect")); - until = intern(string("until")); - coll = intern(string("coll")); - define = intern(string("define")); - output = intern(string("output")); - single = intern(string("single")); - frst = intern(string("first")); - lst = intern(string("last")); - empty = intern(string("empty")); - repeat = intern(string("repeat")); - rep = intern(string("rep")); - flattn = intern(string("flatten")); - forget = intern(string("forget")); - local = intern(string("local")); - mrge = intern(string("merge")); - bind = intern(string("bind")); - cat = intern(string("cat")); - args = intern(string("args")); - try = intern(string("try")); - catch = intern(string("catch")); - finally = intern(string("finally")); - nothrow = intern(string("nothrow")); - throw = intern(string("throw")); - defex = intern(string("defex")); - error = intern(string("error")); - type_error = intern(string("type_error")); - internal_err = intern(string("internal_error")); - numeric_err = intern(string("numeric_error")); - range_err = intern(string("range_error")); - query_error = intern(string("query_error")); - file_error = intern(string("file_error")); - process_error = intern(string("process_error")); + nil_string = string(L"nil"); + + null = intern(string(L"null")); + t = intern(string(L"t")); + cons_t = intern(string(L"cons")); + str_t = intern(string(L"str")); + chr_t = intern(string(L"chr")); + num_t = intern(string(L"num")); + sym_t = intern(string(L"sym")); + fun_t = intern(string(L"fun")); + vec_t = intern(string(L"vec")); + stream_t = intern(string(L"stream")); + hash_t = intern(string(L"hash")); + lcons_t = intern(string(L"lcons")); + lstr_t = intern(string(L"lstr")); + cobj_t = intern(string(L"cobj")); + var = intern(string(L"$var")); + regex = intern(string(L"$regex")); + set = intern(string(L"set")); + cset = intern(string(L"cset")); + wild = intern(string(L"wild")); + oneplus = intern(string(L"1+")); + zeroplus = intern(string(L"0+")); + optional = intern(string(L"?")); + compound = intern(string(L"compound")); + or = intern(string(L"or")); + quasi = intern(string(L"$quasi")); + skip = intern(string(L"skip")); + trailer = intern(string(L"trailer")); + block = intern(string(L"block")); + next = intern(string(L"next")); + freeform = intern(string(L"freeform")); + fail = intern(string(L"fail")); + accept = intern(string(L"accept")); + all = intern(string(L"all")); + some = intern(string(L"some")); + none = intern(string(L"none")); + maybe = intern(string(L"maybe")); + cases = intern(string(L"cases")); + collect = intern(string(L"collect")); + until = intern(string(L"until")); + coll = intern(string(L"coll")); + define = intern(string(L"define")); + output = intern(string(L"output")); + single = intern(string(L"single")); + frst = intern(string(L"first")); + lst = intern(string(L"last")); + empty = intern(string(L"empty")); + repeat = intern(string(L"repeat")); + rep = intern(string(L"rep")); + flattn = intern(string(L"flatten")); + forget = intern(string(L"forget")); + local = intern(string(L"local")); + mrge = intern(string(L"merge")); + bind = intern(string(L"bind")); + cat = intern(string(L"cat")); + args = intern(string(L"args")); + try = intern(string(L"try")); + catch = intern(string(L"catch")); + finally = intern(string(L"finally")); + nothrow = intern(string(L"nothrow")); + throw = intern(string(L"throw")); + defex = intern(string(L"defex")); + error = intern(string(L"error")); + type_error = intern(string(L"type_error")); + internal_err = intern(string(L"internal_error")); + numeric_err = intern(string(L"numeric_error")); + range_err = intern(string(L"range_error")); + query_error = intern(string(L"query_error")); + file_error = intern(string(L"file_error")); + process_error = intern(string(L"process_error")); interned_syms = cons(nil, interned_syms); @@ -1788,7 +1800,7 @@ static void obj_init(void) maxint = num(NUM_MAX); minint = num(NUM_MIN); - null_string = string(""); + null_string = string(L""); null_list = cons(nil, nil); @@ -1800,7 +1812,7 @@ static void obj_init(void) void obj_print(obj_t *obj, obj_t *out) { if (obj == nil) { - put_cstring(out, "nil"); + put_cstring(out, L"nil"); return; } @@ -1817,7 +1829,7 @@ void obj_print(obj_t *obj, obj_t *out) } else if (consp(cdr(iter))) { put_cchar(out, ' '); } else { - put_cstring(out, " . "); + put_cstring(out, L" . "); obj_print(cdr(iter), out); put_cchar(out, ')'); } @@ -1826,22 +1838,22 @@ void obj_print(obj_t *obj, obj_t *out) return; case STR: { - const char *ptr; + const wchar_t *ptr; put_cchar(out, '"'); for (ptr = obj->st.str; *ptr; ptr++) { switch (*ptr) { - case '\a': put_cstring(out, "\\a"); break; - case '\b': put_cstring(out, "\\b"); break; - case '\t': put_cstring(out, "\\t"); break; - case '\n': put_cstring(out, "\\n"); break; - case '\v': put_cstring(out, "\\v"); break; - case '\f': put_cstring(out, "\\f"); break; - case '\r': put_cstring(out, "\\r"); break; - case '"': put_cstring(out, "\\\""); break; - case '\\': put_cstring(out, "\\\\"); break; - case 27: put_cstring(out, "\\e"); break; + case L'\a': put_cstring(out, L"\\a"); break; + case L'\b': put_cstring(out, L"\\b"); break; + case L'\t': put_cstring(out, L"\\t"); break; + case L'\n': put_cstring(out, L"\\n"); break; + case L'\v': put_cstring(out, L"\\v"); break; + case L'\f': put_cstring(out, L"\\f"); break; + case L'\r': put_cstring(out, L"\\r"); break; + case L'"': put_cstring(out, L"\\\""); break; + case L'\\': put_cstring(out, L"\\\\"); break; + case 27: put_cstring(out, L"\\e"); break; default: - if (isprint(*ptr)) + if (iswprint(*ptr)) put_cchar(out, *ptr); else cformat(out, "\\%03o", (int) *ptr); @@ -1856,18 +1868,18 @@ void obj_print(obj_t *obj, obj_t *out) put_cchar(out, '\''); switch (ch) { - case '\a': put_cstring(out, "\\a"); break; - case '\b': put_cstring(out, "\\b"); break; - case '\t': put_cstring(out, "\\t"); break; - case '\n': put_cstring(out, "\\n"); break; - case '\v': put_cstring(out, "\\v"); break; - case '\f': put_cstring(out, "\\f"); break; - case '\r': put_cstring(out, "\\r"); break; - case '"': put_cstring(out, "\\\""); break; - case '\\': put_cstring(out, "\\\\"); break; - case 27: put_cstring(out, "\\e"); break; + case L'\a': put_cstring(out, L"\\a"); break; + case L'\b': put_cstring(out, L"\\b"); break; + case L'\t': put_cstring(out, L"\\t"); break; + case L'\n': put_cstring(out, L"\\n"); break; + case L'\v': put_cstring(out, L"\\v"); break; + case L'\f': put_cstring(out, L"\\f"); break; + case L'\r': put_cstring(out, L"\\r"); break; + case L'"': put_cstring(out, L"\\\""); break; + case L'\\': put_cstring(out, L"\\\\"); break; + case 27: put_cstring(out, L"\\e"); break; default: - if (isprint(ch)) + if (iswprint(ch)) put_cchar(out, ch); else cformat(out, "\\%03o", ch); @@ -1887,7 +1899,7 @@ void obj_print(obj_t *obj, obj_t *out) case VEC: { long i, fill = c_num(obj->v.vec[vec_fill]); - put_cstring(out, "#("); + put_cstring(out, L"#("); for (i = 0; i < fill; i++) { obj_print(obj->v.vec[i], out); if (i < fill - 1) @@ -1898,7 +1910,7 @@ void obj_print(obj_t *obj, obj_t *out) return; case LSTR: obj_print(obj->ls.prefix, out); - put_cstring(out, "#<... lazy string>"); + put_cstring(out, L"#<... lazy string>"); return; case COBJ: obj->co.ops->print(obj, out); @@ -1911,7 +1923,7 @@ void obj_print(obj_t *obj, obj_t *out) void obj_pprint(obj_t *obj, obj_t *out) { if (obj == nil) { - put_cstring(out, "nil"); + put_cstring(out, L"nil"); return; } @@ -1928,7 +1940,7 @@ void obj_pprint(obj_t *obj, obj_t *out) } else if (consp(cdr(iter))) { put_cchar(out, ' '); } else { - put_cstring(out, " . "); + put_cstring(out, L" . "); obj_pprint(cdr(iter), out); put_cchar(out, ')'); } @@ -1953,7 +1965,7 @@ void obj_pprint(obj_t *obj, obj_t *out) case VEC: { long i, fill = c_num(obj->v.vec[vec_fill]); - put_cstring(out, "#("); + put_cstring(out, L"#("); for (i = 0; i < fill; i++) { obj_pprint(obj->v.vec[i], out); if (i < fill - 1) @@ -1964,7 +1976,7 @@ void obj_pprint(obj_t *obj, obj_t *out) return; case LSTR: obj_pprint(obj->ls.prefix, out); - put_cstring(out, "..."); + put_cstring(out, L"..."); return; case COBJ: obj->co.ops->print(obj, out); @@ -1974,7 +1986,7 @@ void obj_pprint(obj_t *obj, obj_t *out) cformat(out, "#<garbage: %p>", (void *) obj); } -void init(const char *pn, void *(*oom)(void *, size_t), +void init(const wchar_t *pn, void *(*oom)(void *, size_t), obj_t **stack_bottom) { progname = pn; @@ -64,7 +64,7 @@ struct cons { struct string { type_t type; - char *str; + wchar_t *str; obj_t *len; }; @@ -176,7 +176,7 @@ extern obj_t *null_list; /* (nil) */ extern obj_t *identity_f; extern obj_t *equal_f; -extern const char *progname; +extern const wchar_t *progname; extern obj_t *prog_string; extern void *(*oom_realloc)(void *, size_t); @@ -216,7 +216,7 @@ obj_t *nump(obj_t *num); obj_t *equal(obj_t *left, obj_t *right); void *chk_malloc(size_t size); void *chk_realloc(void*, size_t size); -void *chk_strdup(const char *str); +void *chk_strdup(const wchar_t *str); obj_t *cons(obj_t *car, obj_t *cdr); obj_t *list(obj_t *first, ...); /* terminated by nao */ obj_t *consp(obj_t *obj); @@ -238,16 +238,17 @@ obj_t *le(obj_t *anum, obj_t *bnum); obj_t *numeq(obj_t *anum, obj_t *bnum); obj_t *max2(obj_t *anum, obj_t *bnum); obj_t *min2(obj_t *anum, obj_t *bnum); -obj_t *string_own(char *str); -obj_t *string(const char *str); +obj_t *string_own(wchar_t *str); +obj_t *string(const wchar_t *str); +obj_t *string_utf8(const unsigned char *str); obj_t *mkstring(obj_t *len, obj_t *ch); obj_t *mkustring(obj_t *len); /* must initialize immediately with init_str! */ -obj_t *init_str(obj_t *str, const char *); +obj_t *init_str(obj_t *str, const wchar_t *); obj_t *copy_str(obj_t *str); obj_t *stringp(obj_t *str); obj_t *lazy_stringp(obj_t *str); obj_t *length_str(obj_t *str); -const char *c_str(obj_t *str); +const wchar_t *c_str(obj_t *str); obj_t *search_str(obj_t *haystack, obj_t *needle, obj_t *start_num, obj_t *from_end); obj_t *search_str_tree(obj_t *haystack, obj_t *tree, obj_t *start_num, @@ -315,7 +316,7 @@ obj_t *sort(obj_t *list, obj_t *lessfun, obj_t *keyfun); void obj_print(obj_t *obj, obj_t *stream); void obj_pprint(obj_t *obj, obj_t *stream); -void init(const char *progname, void *(*oom_realloc)(void *, size_t), +void init(const wchar_t *progname, void *(*oom_realloc)(void *, size_t), obj_t **stack_bottom); void dump(obj_t *obj, obj_t *stream); obj_t *snarf(obj_t *in); @@ -32,6 +32,7 @@ #include <dirent.h> #include <setjmp.h> #include <stdarg.h> +#include <wchar.h> #include "lib.h" #include "gc.h" #include "unwind.h" @@ -39,28 +40,29 @@ #include "stream.h" #include "parser.h" #include "txr.h" +#include "utf8.h" #include "match.h" int output_produced; -static void debugf(const char *fmt, ...) +static void debugf(const wchar_t *fmt, ...) { if (opt_loglevel >= 2) { va_list vl; va_start (vl, fmt); - format(std_error, "~a: ", prog_string, nao); + format(std_error, L"~a: ", prog_string, nao); vformat(std_error, fmt, vl); put_cchar(std_error, '\n'); va_end (vl); } } -static void debuglf(obj_t *line, const char *fmt, ...) +static void debuglf(obj_t *line, const wchar_t *fmt, ...) { if (opt_loglevel >= 2) { va_list vl; va_start (vl, fmt); - format(std_error, "~a: (~a:~a) ", prog_string, spec_file_str, line, nao); + format(std_error, L"~a: (~a:~a) ", prog_string, spec_file_str, line, nao); vformat(std_error, fmt, vl); put_cchar(std_error, '\n'); va_end (vl); @@ -72,21 +74,21 @@ static void debuglcf(obj_t *line, const char *fmt, ...) if (opt_loglevel >= 2) { va_list vl; va_start (vl, fmt); - format(std_error, "~a: (~a:~a) ", prog_string, spec_file_str, line, nao); + format(std_error, L"~a: (~a:~a) ", prog_string, spec_file_str, line, nao); vcformat(std_error, fmt, vl); put_cchar(std_error, '\n'); va_end (vl); } } -static void sem_error(obj_t *line, const char *fmt, ...) +static void sem_error(obj_t *line, const wchar_t *fmt, ...) { va_list vl; obj_t *stream = make_string_output_stream(); va_start (vl, fmt); if (line) - format(stream, "(~a:~a) ", spec_file_str, line, nao); + format(stream, L"(~a:~a) ", spec_file_str, line, nao); (void) vformat(stream, fmt, vl); va_end (vl); @@ -94,14 +96,14 @@ static void sem_error(obj_t *line, const char *fmt, ...) abort(); } -static void file_err(obj_t *line, const char *fmt, ...) +static void file_err(obj_t *line, const wchar_t *fmt, ...) { va_list vl; obj_t *stream = make_string_output_stream(); va_start (vl, fmt); if (line) - format(stream, "(~a:~a) ", spec_file_str, line, nao); + format(stream, L"(~a:~a) ", spec_file_str, line, nao); (void) vformat(stream, fmt, vl); va_end (vl); @@ -110,43 +112,43 @@ static void file_err(obj_t *line, const char *fmt, ...) } -void dump_shell_string(const char *str) +void dump_shell_string(const wchar_t *str) { int ch; - putchar('"'); + putwchar('"'); while ((ch = *str++) != 0) { switch (ch) { case '"': case '`': case '$': case '\\': case '\n': - putchar('\\'); + putwchar('\\'); /* fallthrough */ default: - putchar(ch); + putwchar(ch); } } - putchar('"'); + putwchar('"'); } -void dump_var(const char *name, char *pfx1, size_t len1, - char *pfx2, size_t len2, obj_t *value, int level) +void dump_var(const wchar_t *name, wchar_t *pfx1, size_t len1, + wchar_t *pfx2, size_t len2, obj_t *value, int level) { if (len1 >= 112 || len2 >= 112) internal_error("too much depth in bindings"); if (stringp(value) || chrp(value)) { - fputs(name, stdout); - fputs(pfx1, stdout); - fputs(pfx2, stdout); - putchar('='); + fputws(name, stdout); + fputws(pfx1, stdout); + fputws(pfx2, stdout); + putwchar('='); if (stringp(value)) { dump_shell_string(c_str(value)); } else { - char mini[2]; + wchar_t mini[2]; mini[0] = c_chr(value); mini[1] = 0; dump_shell_string(mini); } - putchar('\n'); + putwchar('\n'); } else { obj_t *iter; int i; @@ -154,10 +156,10 @@ void dump_var(const char *name, char *pfx1, size_t len1, for (i = 0, iter = value; iter; iter = cdr(iter), i++) { if (level < opt_arraydims) { - add2 = sprintf(pfx2 + len2, "[%d]", i); + add2 = swprintf(pfx2 + len2, 12, L"[%d]", i); add1 = 0; } else { - add1 = sprintf(pfx1 + len1, "_%d", i); + add1 = swprintf(pfx1 + len1, 12, L"_%d", i); add2 = 0; } @@ -174,10 +176,10 @@ void dump_bindings(obj_t *bindings) } while (bindings) { - char pfx1[128], pfx2[128]; + wchar_t pfx1[128], pfx2[128]; obj_t *var = car(car(bindings)); obj_t *value = cdr(car(bindings)); - const char *name = c_str(symbol_name(var)); + const wchar_t *name = c_str(symbol_name(var)); *pfx1 = 0; *pfx2 = 0; dump_var(name, pfx1, 0, pfx2, 0, value, 0); bindings = cdr(bindings); @@ -241,7 +243,7 @@ obj_t *dest_bind(obj_t *bindings, obj_t *pattern, obj_t *value) return bindings; if (tree_find(cdr(existing), value)) return bindings; - debugf("bind variable mismatch: ~a", pattern, nao); + debugf(L"bind variable mismatch: ~a", pattern, nao); return t; } return cons(cons(pattern, value), bindings); @@ -274,16 +276,16 @@ obj_t *match_line(obj_t *bindings, obj_t *specline, obj_t *dataline, obj_t *file) { #define LOG_MISMATCH(KIND) \ - debuglf(spec_lineno, KIND " mismatch, position ~a (~a:~a)", pos, \ + debuglf(spec_lineno, KIND L" mismatch, position ~a (~a:~a)", pos, \ file, data_lineno, nao); \ - debuglf(spec_lineno, " ~a", dataline, nao); \ + debuglf(spec_lineno, L" ~a", dataline, nao); \ if (c_num(pos) < 77) \ debuglcf(spec_lineno, " %*s^", (int) c_num(pos), "") #define LOG_MATCH(KIND, EXTENT) \ - debuglf(spec_lineno, KIND " matched, position ~a-~a (~a:~a)", \ + debuglf(spec_lineno, KIND L" matched, position ~a-~a (~a:~a)", \ pos, EXTENT, file, data_lineno, nao); \ - debuglf(spec_lineno, " ~a", dataline, nao); \ + debuglf(spec_lineno, L" ~a", dataline, nao); \ if (c_num(EXTENT) < 77) \ debuglcf(spec_lineno, " %*s%-*s^", (int) c_num(pos), \ "", (int) (c_num(EXTENT) - c_num(pos)), "^") @@ -320,18 +322,18 @@ obj_t *match_line(obj_t *bindings, obj_t *specline, obj_t *dataline, if (length_str_lt(dataline, past) || lt(past, pos)) { - LOG_MISMATCH("fixed field size"); + LOG_MISMATCH(L"fixed field size"); return nil; } if (!tree_find(trim_str(sub_str(dataline, pos, past)), cdr(pair))) { - LOG_MISMATCH("fixed field contents"); + LOG_MISMATCH(L"fixed field contents"); return nil; } - LOG_MATCH("fixed field", past); + LOG_MATCH(L"fixed field", past); pos = past; specline = cdr(specline); } else { @@ -342,20 +344,20 @@ obj_t *match_line(obj_t *bindings, obj_t *specline, obj_t *dataline, if (consp(modifier)) { obj_t *past = match_regex(dataline, car(modifier), pos); if (nullp(past)) { - LOG_MISMATCH("var positive regex"); + LOG_MISMATCH(L"var positive regex"); return nil; } - LOG_MATCH("var positive regex", past); + LOG_MATCH(L"var positive regex", past); bindings = acons_new(bindings, sym, sub_str(dataline, pos, past)); pos = past; } else if (nump(modifier)) { obj_t *past = plus(pos, modifier); if (length_str_lt(dataline, past) || lt(past, pos)) { - LOG_MISMATCH("count based var"); + LOG_MISMATCH(L"count based var"); return nil; } - LOG_MATCH("count based var", past); + LOG_MATCH(L"count based var", past); bindings = acons_new(bindings, sym, trim_str(sub_str(dataline, pos, past))); pos = past; } else { @@ -365,10 +367,10 @@ obj_t *match_line(obj_t *bindings, obj_t *specline, obj_t *dataline, } else if (type(pat) == STR) { obj_t *find = search_str(dataline, pat, pos, modifier); if (!find) { - LOG_MISMATCH("var delimiting string"); + LOG_MISMATCH(L"var delimiting string"); return nil; } - LOG_MATCH("var delimiting string", find); + LOG_MATCH(L"var delimiting string", find); bindings = acons_new(bindings, sym, sub_str(dataline, pos, find)); pos = plus(find, length_str(pat)); } else if (consp(pat) && typeof(first(pat)) == regex) { @@ -376,10 +378,10 @@ obj_t *match_line(obj_t *bindings, obj_t *specline, obj_t *dataline, obj_t *fpos = car(find); obj_t *flen = cdr(find); if (!find) { - LOG_MISMATCH("var delimiting regex"); + LOG_MISMATCH(L"var delimiting regex"); return nil; } - LOG_MATCH("var delimiting regex", fpos); + LOG_MATCH(L"var delimiting regex", fpos); bindings = acons_new(bindings, sym, sub_str(dataline, pos, fpos)); pos = plus(fpos, flen); } else if (consp(pat) && first(pat) == var) { @@ -389,7 +391,7 @@ obj_t *match_line(obj_t *bindings, obj_t *specline, obj_t *dataline, obj_t *pair = assoc(bindings, second_sym); /* var exists already? */ if (!pair) - sem_error(spec_lineno, "consecutive unbound variables", nao); + sem_error(spec_lineno, L"consecutive unbound variables", nao); /* Re-generate a new spec with an edited version of the element we just processed, and repeat. */ @@ -406,14 +408,14 @@ obj_t *match_line(obj_t *bindings, obj_t *specline, obj_t *dataline, } else if (consp(pat) && (consp(first(pat)) || stringp(first(pat)))) { cons_bind (find, len, search_str(dataline, pat, pos, modifier)); if (!find) { - LOG_MISMATCH("string"); + LOG_MISMATCH(L"string"); return nil; } bindings = acons_new(bindings, sym, sub_str(dataline, pos, find)); pos = plus(find, len); } else { sem_error(spec_lineno, - "variable followed by invalid element", nao); + L"variable followed by invalid element", nao); } } else if (typeof(directive) == regex) { obj_t *past = match_regex(dataline, directive, pos); @@ -475,7 +477,7 @@ obj_t *match_line(obj_t *bindings, obj_t *specline, obj_t *dataline, if (!bindings_coll) { - debuglf(spec_lineno, "nothing was collected", nao); + debuglf(spec_lineno, L"nothing was collected", nao); return nil; } @@ -489,15 +491,15 @@ obj_t *match_line(obj_t *bindings, obj_t *specline, obj_t *dataline, obj_t *newpos; if (find == nil || !equal(find, pos)) { - LOG_MISMATCH("string tree"); + LOG_MISMATCH(L"string tree"); return nil; } newpos = plus(find, len); - LOG_MATCH("string tree", newpos); + LOG_MATCH(L"string tree", newpos); pos = newpos; } else { - sem_error(spec_lineno, "unknown directive: ~a", directive, nao); + sem_error(spec_lineno, L"unknown directive: ~a", directive, nao); } } break; @@ -515,7 +517,7 @@ obj_t *match_line(obj_t *bindings, obj_t *specline, obj_t *dataline, break; } default: - sem_error(spec_lineno, "unsupported object in spec: ~s", elem, nao); + sem_error(spec_lineno, L"unsupported object in spec: ~s", elem, nao); } specline = cdr(specline); @@ -628,27 +630,30 @@ fpip_t complex_open(obj_t *name, obj_t *output) { fpip_t ret = { 0 }; - const char *namestr = c_str(name); + const wchar_t *namestr = c_str(name); long len = c_num(length_str(name)); if (len == 0) return ret; - if (!strcmp(namestr, "-")) { + if (!wcscmp(namestr, L"-")) { ret.close = fpip_fclose; ret.f = output ? stdout : stdin; output_produced = output ? 1 : 0; } else if (namestr[0] == '!') { ret.close = fpip_pclose; - ret.f = popen(namestr+1, output ? "w" : "r"); + ret.f = w_popen(namestr+1, output ? L"w" : L"r"); } else if (namestr[0] == '$') { + char *name; if (output) return ret; + name = (char *) utf8_dup_to(namestr+1); ret.close = fpip_closedir; - ret.d = opendir(namestr+1); + ret.d = opendir(name); + free(name); } else { ret.close = fpip_fclose; - ret.f = fopen(namestr, output ? "w" : "r"); + ret.f = w_fopen(namestr, output ? L"w" : L"r"); } return ret; @@ -758,8 +763,8 @@ void do_output_line(obj_t *bindings, obj_t *specline, if (directive == var) { obj_t *str = cat_str(subst_vars(cons(elem, nil), bindings), nil); if (str == nil) - sem_error(spec_lineno, "bad substitution: ~a", second(elem), nao); - fputs(c_str(str), out); + sem_error(spec_lineno, L"bad substitution: ~a", second(elem), nao); + fputws(c_str(str), out); } else if (directive == rep) { obj_t *main_clauses = second(elem); obj_t *single_clauses = third(elem); @@ -798,17 +803,17 @@ void do_output_line(obj_t *bindings, obj_t *specline, } } else { - sem_error(spec_lineno, "unknown directive: ~a", directive, nao); + sem_error(spec_lineno, L"unknown directive: ~a", directive, nao); } } break; case STR: - fputs(c_str(elem), out); + fputws(c_str(elem), out); break; case 0: break; default: - sem_error(spec_lineno, "unsupported object in output spec: ~s", elem); + sem_error(spec_lineno, L"unsupported object in output spec: ~s", elem); } } } @@ -866,7 +871,7 @@ void do_output(obj_t *bindings, obj_t *specs, FILE *out) } do_output_line(bindings, specline, spec_lineno, out); - putc('\n', out); + putwc('\n', out); } } @@ -888,21 +893,21 @@ obj_t *match_files(obj_t *spec, obj_t *files, obj_t *first_spec_item = second(first(spec)); if (consp(first_spec_item) && eq(first(first_spec_item), next)) { - debugf("not opening source ~a since query starts with next directive", + debugf(L"not opening source ~a since query starts with next directive", name, nao); } else { - debugf("opening data source ~a", name, nao); + debugf(L"opening data source ~a", name, nao); if (complex_open_failed(fp)) { if (consp(source_spec) && car(source_spec) == nothrow) { - debugf("could not open ~a: treating as failed match due to nothrow", + debugf(L"could not open ~a: treating as failed match due to nothrow", name, nao); return nil; } else if (errno != 0) - file_err(nil, "could not open ~a (error ~a/~a)", name, - num(errno), string(strerror(errno)), nao); + file_err(nil, L"could not open ~a (error ~a/~a)", name, + num(errno), string_utf8(strerror(errno)), nao); else - file_err(nil, "could not open ~a", name, nao); + file_err(nil, L"could not open ~a", name, nao); return nil; } @@ -931,7 +936,7 @@ repeat_spec_same_data: if (rest(specline)) sem_error(spec_linenum, - "unexpected material after skip directive", nao); + L"unexpected material after skip directive", nao); if ((spec = rest(spec)) == nil) break; @@ -944,12 +949,12 @@ repeat_spec_same_data: data, num(data_lineno)); if (result) { - debuglf(spec_linenum, "skip matched ~a:~a", first(files), + debuglf(spec_linenum, L"skip matched ~a:~a", first(files), num(data_lineno), nao); break; } - debuglf(spec_linenum, "skip didn't match ~a:~a", first(files), + debuglf(spec_linenum, L"skip didn't match ~a:~a", first(files), num(data_lineno), nao); data = rest(data); data_lineno++; @@ -962,12 +967,12 @@ repeat_spec_same_data: return result; } - debuglf(spec_linenum, "skip failed", nao); + debuglf(spec_linenum, L"skip failed", nao); return nil; } else if (sym == trailer) { if (rest(specline)) sem_error(spec_linenum, - "unexpected material after trailer directive", nao); + L"unexpected material after trailer directive", nao); if ((spec = rest(spec)) == nil) break; @@ -989,7 +994,7 @@ repeat_spec_same_data: if ((spec = rest(spec)) == nil) { sem_error(spec_linenum, - "freeform must be followed by a query line", nao); + L"freeform must be followed by a query line", nao); } else { obj_t *limit = or2(if2(nump(first(vals)), first(vals)), if2(nump(second(vals)), second(vals))); @@ -1003,7 +1008,7 @@ repeat_spec_same_data: spec_linenum, num(data_lineno), first(files))); if (!success) { - debuglf(spec_linenum, "freeform match failure", nao); + debuglf(spec_linenum, L"freeform match failure", nao); return nil; } @@ -1021,7 +1026,7 @@ repeat_spec_same_data: obj_t *name = first(rest(first_spec)); if (rest(specline)) sem_error(spec_linenum, - "unexpected material after block directive", nao); + L"unexpected material after block directive", nao); if ((spec = rest(spec)) == nil) break; { @@ -1034,7 +1039,7 @@ repeat_spec_same_data: obj_t *target = first(rest(first_spec)); if (rest(specline)) - sem_error(spec_linenum, "unexpected material after ~a", sym, nao); + sem_error(spec_linenum, L"unexpected material after ~a", sym, nao); uw_block_return(target, if2(sym == accept, @@ -1042,15 +1047,15 @@ repeat_spec_same_data: if3(data, cons(data, num(data_lineno)), t)))); /* TODO: uw_block_return could just throw this */ if (target) - sem_error(spec_linenum, "~a: no block named ~a in scope", + sem_error(spec_linenum, L"~a: no block named ~a in scope", sym, target, nao); else - sem_error(spec_linenum, "%~a: no anonymous block in scope", sym, nao); + sem_error(spec_linenum, L"%~a: no anonymous block in scope", sym, nao); return nil; } else if (sym == next) { if (rest(first_spec) && rest(specline)) sem_error(spec_linenum, - "invalid combination of old and new next syntax", nao); + L"invalid combination of old and new next syntax", nao); if ((spec = rest(spec)) == nil) break; @@ -1061,7 +1066,7 @@ repeat_spec_same_data: if (eq(first(source), nothrow)) push(nil, &source); else if (eq(first(source), args)) { - obj_t *input_name = string("args"); + obj_t *input_name = string(L"args"); cons_bind (new_bindings, success, match_files(spec, cons(input_name, files), bindings, files, one)); @@ -1076,7 +1081,7 @@ repeat_spec_same_data: obj_t *name = cdr(val); if (!val) - sem_error(spec_linenum, "next: unbound variable in form ~a", + sem_error(spec_linenum, L"next: unbound variable in form ~a", first(source), nao); if (eq(second(source), nothrow)) { @@ -1085,7 +1090,7 @@ repeat_spec_same_data: } else { files = rest(files); if (!files) { - debuglf(spec_linenum, "next: out of arguments", nao); + debuglf(spec_linenum, L"next: out of arguments", nao); return nil; } files = cons(cons(nothrow, first(files)), rest(files)); @@ -1096,7 +1101,7 @@ repeat_spec_same_data: } else { files = rest(files); if (!files) - sem_error(spec_linenum, "next: out of arguments", nao); + sem_error(spec_linenum, L"next: out of arguments", nao); files = cons(cons(nothrow, first(files)), rest(files)); } } @@ -1105,14 +1110,14 @@ repeat_spec_same_data: obj_t *sub = subst_vars(rest(specline), bindings); obj_t *str = cat_str(sub, nil); if (str == nil) { - sem_error(spec_linenum, "bad substitution in next file spec", nao); + sem_error(spec_linenum, L"bad substitution in next file spec", nao); continue; } files = cons(cons(nothrow, str), files); } else { files = rest(files); if (!files) - sem_error(spec_linenum, "next: out of arguments", nao); + sem_error(spec_linenum, L"next: out of arguments", nao); } /* We recursively process the file list, but the new @@ -1166,17 +1171,17 @@ repeat_spec_same_data: } if (sym == all && !all_match) { - debuglf(spec_linenum, "all: some clauses didn't match", nao); + debuglf(spec_linenum, L"all: some clauses didn't match", nao); return nil; } if ((sym == some || sym == cases) && !some_match) { - debuglf(spec_linenum, "some/cases: no clauses matched", nao); + debuglf(spec_linenum, L"some/cases: no clauses matched", nao); return nil; } if (sym == none && some_match) { - debuglf(spec_linenum, "none: some clauses matched", nao); + debuglf(spec_linenum, L"none: some clauses matched", nao); return nil; } @@ -1255,7 +1260,7 @@ repeat_spec_same_data: data = new_data; data_lineno = new_lineno; } else { - debuglf(spec_linenum, "collect consumed entire file", nao); + debuglf(spec_linenum, L"collect consumed entire file", nao); data = nil; } } else { @@ -1267,12 +1272,12 @@ repeat_spec_same_data: uw_block_end; if (!result) { - debuglf(spec_linenum, "collect explicitly failed", nao); + debuglf(spec_linenum, L"collect explicitly failed", nao); return nil; } if (!bindings_coll) { - debuglf(spec_linenum, "nothing was collected", nao); + debuglf(spec_linenum, L"nothing was collected", nao); return nil; } @@ -1293,7 +1298,7 @@ repeat_spec_same_data: obj_t *sym = first(iter); if (!symbolp(sym)) { - sem_error(spec_linenum, "non-symbol in flatten directive", nao); + sem_error(spec_linenum, L"non-symbol in flatten directive", nao); } else { obj_t *existing = assoc(bindings, sym); @@ -1319,7 +1324,7 @@ repeat_spec_same_data: obj_t *merged = nil; if (!target || !symbolp(target)) - sem_error(spec_linenum, "bad merge directive", nao); + sem_error(spec_linenum, L"bad merge directive", nao); for (; args; args = rest(args)) { obj_t *other_sym = first(args); @@ -1328,9 +1333,9 @@ repeat_spec_same_data: obj_t *other_lookup = assoc(bindings, other_sym); if (!symbolp(other_sym)) - sem_error(spec_linenum, "non-symbol in merge directive", nao); + sem_error(spec_linenum, L"non-symbol in merge directive", nao); else if (!other_lookup) - sem_error(spec_linenum, "merge: nonexistent symbol ~a", + sem_error(spec_linenum, L"merge: nonexistent symbol ~a", other_sym, nao); if (merged) @@ -1353,7 +1358,7 @@ repeat_spec_same_data: obj_t *val = eval_form(form, bindings); if (!val) - sem_error(spec_linenum, "bind: unbound variable on right side", nao); + sem_error(spec_linenum, L"bind: unbound variable on right side", nao); bindings = dest_bind(bindings, pattern, cdr(val)); @@ -1371,7 +1376,7 @@ repeat_spec_same_data: obj_t *sym = first(iter); if (!symbolp(sym)) { - sem_error(spec_linenum, "non-symbol in cat directive", nao); + sem_error(spec_linenum, L"non-symbol in cat directive", nao); } else { obj_t *existing = assoc(bindings, sym); obj_t *sep = nil; @@ -1408,28 +1413,28 @@ repeat_spec_same_data: obj_t *val = eval_form(form, bindings); if (!val) - sem_error(spec_linenum, "output: unbound variable in form ~a", + sem_error(spec_linenum, L"output: unbound variable in form ~a", form, nao); nt = eq(second(new_style_dest), nothrow); - dest = or2(cdr(val), string("-")); + dest = or2(cdr(val), string(L"-")); } } fpip_t fp = (errno = 0, complex_open(dest, t)); - debugf("opening data sink ~a", dest, nao); + debugf(L"opening data sink ~a", dest, nao); if (complex_open_failed(fp)) { if (nt) { - debugf("could not open ~a: treating as failed match due to nothrow", + debugf(L"could not open ~a: treating as failed match due to nothrow", dest, nao); return nil; } else if (errno != 0) { - file_err(nil, "could not open ~a (error ~a/~a)", dest, - num(errno), string(strerror(errno)), nao); + file_err(nil, L"could not open ~a (error ~a/~a)", dest, + num(errno), string_utf8(strerror(errno)), nao); } else { - file_err(nil, "could not open ~a", dest, nao); + file_err(nil, L"could not open ~a", dest, nao); } } else { do_output(bindings, specs, fp.f); @@ -1447,7 +1452,7 @@ repeat_spec_same_data: obj_t *params = second(args); if (rest(specline)) - sem_error(spec_linenum, "unexpected material after define", nao); + sem_error(spec_linenum, L"unexpected material after define", nao); uw_set_func(name, cons(params, body)); @@ -1590,7 +1595,7 @@ repeat_spec_same_data: } else if (sym == defex) { obj_t *types = rest(first_spec); if (!all_satisfy(types, func_n1(symbolp), nil)) - sem_error(spec_linenum, "defex arguments must all be symbols", nao); + sem_error(spec_linenum, L"defex arguments must all be symbols", nao); (void) reduce_left(func_n2(uw_register_subtype), types, nil, nil); if ((spec = rest(spec)) == nil) break; @@ -1599,7 +1604,7 @@ repeat_spec_same_data: obj_t *type = second(first_spec); obj_t *args = rest(rest(first_spec)); if (!symbolp(type)) - sem_error(spec_linenum, "throw: ~a is not a type symbol", + sem_error(spec_linenum, L"throw: ~a is not a type symbol", first(first_spec), nao); { obj_t *values = mapcar(bind2other(func_n2(eval_form), bindings), @@ -1618,7 +1623,7 @@ repeat_spec_same_data: obj_t *bindings_cp = copy_alist(bindings); if (!equal(length(args), length(params))) - sem_error(spec_linenum, "function ~a takes ~a argument(s)", + sem_error(spec_linenum, L"function ~a takes ~a argument(s)", sym, length(params), nao); for (piter = params, aiter = args; piter; @@ -1641,7 +1646,7 @@ repeat_spec_same_data: obj_t *val = eval_form(arg, bindings); if (!val) sem_error(spec_linenum, - "unbound variable in function argument form", nao); + L"unbound variable in function argument form", nao); bindings_cp = acons_new(bindings_cp, param, cdr(val)); } } @@ -1655,7 +1660,7 @@ repeat_spec_same_data: uw_block_end; if (!result) { - debuglf(spec_linenum, "function failed", nao); + debuglf(spec_linenum, L"function failed", nao); return nil; } @@ -1671,8 +1676,8 @@ repeat_spec_same_data: if (newbind) { bindings = dest_bind(bindings, arg, cdr(newbind)); if (bindings == t) { - debuglf(spec_linenum, "binding mismatch on ~a " - "when returning from ~a", arg, sym, nao); + debuglf(spec_linenum, L"binding mismatch on ~a " + L"when returning from ~a", arg, sym, nao); return nil; } } @@ -1686,7 +1691,7 @@ repeat_spec_same_data: data = car(success); data_lineno = c_num(cdr(success)); } else { - debuglf(spec_linenum, "function consumed entire file", nao); + debuglf(spec_linenum, L"function consumed entire file", nao); data = nil; } } @@ -1709,7 +1714,7 @@ repeat_spec_same_data: spec_linenum, num(data_lineno), first(files))); if (nump(success) && c_num(success) < c_num(length_str(dataline))) { - debuglf(spec_linenum, "spec only matches line to position ~a: ~a", + debuglf(spec_linenum, L"spec only matches line to position ~a: ~a", success, dataline, nao); return nil; } @@ -28,7 +28,7 @@ long lineno; extern int errors; extern obj_t *yyin_stream; -extern const char *spec_file; +extern const wchar_t *spec_file; extern obj_t *spec_file_str; int yyparse(void); obj_t *get_spec(void); @@ -33,10 +33,12 @@ #include <limits.h> #include <errno.h> #include <dirent.h> +#include <wchar.h> #include "y.tab.h" #include "lib.h" #include "gc.h" #include "stream.h" +#include "utf8.h" #include "parser.h" #define YY_NO_UNPUT @@ -73,7 +75,7 @@ void yyerrorf(const char *s, ...) if (opt_loglevel >= 1) { va_list vl; va_start (vl, s); - fprintf(stderr, "%s: (%s:%ld): ", progname, spec_file, lineno); + fprintf(stderr, "%ls: (%ls:%ld): ", progname, spec_file, lineno); vfprintf(stderr, s, vl); putc('\n', stderr); va_end (vl); @@ -127,33 +129,33 @@ void yybadtoken(int tok, const char *context) yyerrorf("unexpected end of input"); } -static int char_esc(int letter) +static wchar_t char_esc(int letter) { switch (letter) { - case 'a': return '\a'; - case 'b': return '\b'; - case 't': return '\t'; - case 'n': return '\n'; - case 'v': return '\v'; - case 'f': return '\f'; - case 'r': return '\r'; + case 'a': return L'\a'; + case 'b': return L'\b'; + case 't': return L'\t'; + case 'n': return L'\n'; + case 'v': return L'\v'; + case 'f': return L'\f'; + case 'r': return L'\r'; case 'e': return 27; - case '"': return '"'; - case '\'': return '\''; - case '`': return '`'; + case '"': return L'"'; + case '\'': return L'\''; + case '`': return L'`'; } abort(); } -static int num_esc(char *num) +static wchar_t num_esc(char *num) { if (num[0] == 'x') { - if (strlen(num) > 3) + if (strlen(num) > 7) yyerror("too many digits in hex character escape"); return strtol(num + 1, 0, 16); } else { - if (strlen(num) > 3) + if (strlen(num) > 8) yyerror("too many digits in octal character escape"); return strtol(num, 0, 8); } @@ -170,6 +172,17 @@ WS [\t ]* HEX [0-9A-Fa-f] OCT [0-7] +ASC [\x00-\x7f] +ASCN [\x00-\t\v-\x7f] +U [\x80-\xbf] +U2 [\xc2-\xdf] +U3 [\xe0-\xef] +U4 [\xf0-\xf4] + +UANY {ASC}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} +UANYN {ASCN}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} +UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} + %x SPECIAL NESTED REGEX REGCLASS STRLIT CHRLIT QSILIT %% @@ -188,7 +201,7 @@ OCT [0-7] if (*errp != 0) { /* not a number */ - yylval.lexeme = strdup(yytext); + yylval.lexeme = utf8_dup_from(yytext); return IDENT; } @@ -346,7 +359,7 @@ OCT [0-7] <SPECIAL>@ { yy_pop_state(); - yylval.lexeme = strdup("@"); + yylval.lexeme = wcsdup(L"@"); return TEXT; } @@ -365,26 +378,25 @@ OCT [0-7] } <SPECIAL>[\\][abtnvfre] { - char lexeme[2]; + wchar_t lexeme[2]; lexeme[0] = char_esc(yytext[1]); lexeme[1] = 0; - yylval.lexeme = strdup(lexeme); + yylval.lexeme = wcsdup(lexeme); yy_pop_state(); return TEXT; } <SPECIAL>[\\](x{HEX}+|{OCT}+) { - char lexeme[2]; + wchar_t lexeme[2]; lexeme[0] = num_esc(yytext + 1); lexeme[1] = 0; - yylval.lexeme = strdup(lexeme); + yylval.lexeme = wcsdup(lexeme); yy_pop_state(); return TEXT; } -<SPECIAL,NESTED>. { - yyerrorf("bad character in directive: '%c'", - yytext[0]); +<SPECIAL,NESTED>{UANYN} { + yyerrorf("bad character in directive: '%s'", yytext); } <REGEX>[/] { @@ -433,15 +445,17 @@ OCT [0-7] return REGCHAR; } -<REGEX>. { - yylval.chr = yytext[0]; +<REGEX>{UANYN} { + wchar_t buf[8]; + utf8_from(buf, yytext); + yylval.chr = buf[0]; return REGCHAR; } -<INITIAL>[^@\n]+ { - yylval.lexeme = strdup(yytext); - return TEXT; - } +<INITIAL>({UONLY}|[^@\n])+ { + yylval.lexeme = utf8_dup_from(yytext); + return TEXT; + } <INITIAL>\n { lineno++; @@ -515,9 +529,11 @@ OCT [0-7] yy_push_state(SPECIAL); } -<STRLIT,CHRLIT,QSILIT>. { - yylval.chr = yytext[0]; - return LITCHAR; - } +<STRLIT,CHRLIT,QSILIT>{UANYN} { + wchar_t buf[8]; + utf8_from(buf, yytext); + yylval.chr = buf[0]; + return LITCHAR; + } %% @@ -32,6 +32,7 @@ #include <dirent.h> #include "lib.h" #include "regex.h" +#include "utf8.h" #include "parser.h" int yylex(void); @@ -46,9 +47,9 @@ static obj_t *parsed_spec; %} %union { - char *lexeme; + wchar_t *lexeme; union obj *obj; - char chr; + wchar_t chr; long num; } @@ -528,9 +528,9 @@ int nfa_move(nfa_state_t **in, int nin, nfa_state_t **out, int ch) * determines the match length (defaulting to zero * if no acceptance states were encountered). */ -long nfa_run(nfa_t nfa, const char *str) +long nfa_run(nfa_t nfa, const wchar_t *str) { - const char *last_accept_pos = 0, *ptr = str; + const wchar_t *last_accept_pos = 0, *ptr = str; unsigned visited = nfa.start->a.visited + 1; nfa_state_t **move = chk_malloc(NFA_SET_SIZE * sizeof *move); nfa_state_t **clos = chk_malloc(NFA_SET_SIZE * sizeof *clos); @@ -697,7 +697,7 @@ obj_t *search_regex(obj_t *haystack, obj_t *needle_regex, obj_t *start, if (from_end) { long i; long s = c_num(start); - const char *h = c_str(haystack); + const wchar_t *h = c_str(haystack); for (i = c_num(length_str(haystack)) - 1; i >= s; i--) { long span = nfa_run(*pnfa, h + i); @@ -110,7 +110,7 @@ typedef struct nfa_machine { nfa_t nfa_compile_regex(obj_t *regex); void nfa_free(nfa_t); -long nfa_run(nfa_t nfa, const char *str); +long nfa_run(nfa_t nfa, const wchar_t *str); void nfa_machine_reset(nfa_machine_t *); void nfa_machine_init(nfa_machine_t *, nfa_t); void nfa_machine_cleanup(nfa_machine_t *); @@ -32,21 +32,23 @@ #include <assert.h> #include <setjmp.h> #include <errno.h> +#include <wchar.h> #include "lib.h" #include "gc.h" #include "unwind.h" #include "stream.h" +#include "utf8.h" obj_t *std_input, *std_output, *std_error; struct strm_ops { struct cobj_ops cobj_ops; - obj_t *(*put_string)(obj_t *, const char *); + obj_t *(*put_string)(obj_t *, const wchar_t *); obj_t *(*put_char)(obj_t *, int); obj_t *(*get_line)(obj_t *); obj_t *(*get_char)(obj_t *); obj_t *(*vcformat)(obj_t *, const char *fmt, va_list vl); - obj_t *(*vformat)(obj_t *, const char *fmt, va_list vl); + obj_t *(*vformat)(obj_t *, const wchar_t *fmt, va_list vl); obj_t *(*close)(obj_t *, obj_t *); }; @@ -60,7 +62,7 @@ static void common_destroy(obj_t *obj) (void) close_stream(obj, nil); } -obj_t *common_vformat(obj_t *stream, const char *fmt, va_list vl) +obj_t *common_vformat(obj_t *stream, const wchar_t *fmt, va_list vl) { int ch; @@ -109,7 +111,7 @@ struct stdio_handle { void stdio_stream_print(obj_t *stream, obj_t *out) { struct stdio_handle *h = (struct stdio_handle *) stream->co.handle; - format(out, "#<~s ~s>", stream->co.cls, h->descr, nao); + format(out, L"#<~s ~s>", stream->co.cls, h->descr, nao); } void stdio_stream_destroy(obj_t *stream) @@ -130,8 +132,8 @@ static obj_t *stdio_maybe_read_error(obj_t *stream) struct stdio_handle *h = (struct stdio_handle *) stream->co.handle; if (ferror(h->f)) { clearerr(h->f); - uw_throwf(file_error, "error reading ~a: ~a/~s", - stream, num(errno), string(strerror(errno))); + uw_throwf(file_error, L"error reading ~a: ~a/~s", + stream, num(errno), string_utf8(strerror(errno))); } return nil; } @@ -141,16 +143,16 @@ static obj_t *stdio_maybe_write_error(obj_t *stream) struct stdio_handle *h = (struct stdio_handle *) stream->co.handle; if (ferror(h->f)) { clearerr(h->f); - uw_throwf(file_error, "error writing ~a: ~a/~s", - stream, num(errno), string(strerror(errno))); + uw_throwf(file_error, L"error writing ~a: ~a/~s", + stream, num(errno), string_utf8(strerror(errno))); } return nil; } -static obj_t *stdio_put_string(obj_t *stream, const char *s) +static obj_t *stdio_put_string(obj_t *stream, const wchar_t *s) { struct stdio_handle *h = (struct stdio_handle *) stream->co.handle; - return (h->f && fputs(s, h->f) != EOF) ? t : stdio_maybe_write_error(stream); + return (h->f && fputws(s, h->f) != EOF) ? t : stdio_maybe_write_error(stream); } static obj_t *stdio_put_char(obj_t *stream, int ch) @@ -159,12 +161,12 @@ static obj_t *stdio_put_char(obj_t *stream, int ch) return (h->f && putc(ch, h->f) != EOF) ? t : stdio_maybe_write_error(stream); } -static char *snarf_line(FILE *in) +static wchar_t *snarf_line(FILE *in) { const size_t min_size = 512; size_t size = 0; size_t fill = 0; - char *buf = 0; + wchar_t *buf = 0; for (;;) { int ch = getc(in); @@ -174,7 +176,7 @@ static char *snarf_line(FILE *in) if (fill >= size) { size_t newsize = size ? size * 2 : min_size; - buf = chk_realloc(buf, newsize); + buf = chk_realloc(buf, newsize * sizeof *buf); size = newsize; } @@ -186,7 +188,7 @@ static char *snarf_line(FILE *in) } if (buf) - buf = chk_realloc(buf, fill); + buf = chk_realloc(buf, fill * sizeof *buf); return buf; } @@ -197,7 +199,7 @@ static obj_t *stdio_get_line(obj_t *stream) return nil; } else { struct stdio_handle *h = (struct stdio_handle *) stream->co.handle; - char *line = snarf_line(h->f); + wchar_t *line = snarf_line(h->f); if (!line) return stdio_maybe_read_error(stream); return string_own(line); @@ -233,8 +235,8 @@ static obj_t *stdio_close(obj_t *stream, obj_t *throw_on_error) int result = fclose(h->f); h->f = 0; if (result == EOF && throw_on_error) { - uw_throwf(file_error, "error closing ~a: ~a/~s", - stream, num(errno), string(strerror(errno))); + uw_throwf(file_error, L"error closing ~a: ~a/~s", + stream, num(errno), string_utf8(strerror(errno))); } return result != EOF ? t : nil; } @@ -266,22 +268,23 @@ static obj_t *pipe_close(obj_t *stream, obj_t *throw_on_error) if (status != 0 && throw_on_error) { if (status < 0) { - uw_throwf(process_error, "unable to obtain status of command ~a: ~a/~s", - stream, num(errno), string(strerror(errno)), nao); + uw_throwf(process_error, + L"unable to obtain status of command ~a: ~a/~s", + stream, num(errno), string_utf8(strerror(errno)), nao); } else if (WIFEXITED(status)) { int exitstatus = WEXITSTATUS(status); - uw_throwf(process_error, "pipe ~a terminated with status ~a", + uw_throwf(process_error, L"pipe ~a terminated with status ~a", stream, num(exitstatus), nao); } else if (WIFSIGNALED(status)) { int termsig = WTERMSIG(status); - uw_throwf(process_error, "pipe ~a terminated by signal ~a", + uw_throwf(process_error, L"pipe ~a terminated by signal ~a", stream, num(termsig), nao); } else if (WIFSTOPPED(status) || WIFCONTINUED(status)) { - uw_throwf(process_error, "processes of closed pipe ~a still running", + uw_throwf(process_error, L"processes of closed pipe ~a still running", stream, nao); } else { - uw_throwf(file_error, "strange status in when closing pipe ~a", + uw_throwf(file_error, L"strange status in when closing pipe ~a", stream, nao); } } @@ -356,7 +359,7 @@ static struct strm_ops string_in_ops = { }; struct string_output { - char *buf; + wchar_t *buf; size_t size; size_t fill; }; @@ -373,14 +376,14 @@ static void string_out_stream_destroy(obj_t *stream) } } -static obj_t *string_out_put_string(obj_t *stream, const char *s) +static obj_t *string_out_put_string(obj_t *stream, const wchar_t *s) { struct string_output *so = (struct string_output *) stream->co.handle; if (so == 0) { return nil; } else { - size_t len = strlen(s); + size_t len = wcslen(s); size_t old_size = so->size; size_t required_size = len + so->fill + 1; @@ -393,8 +396,8 @@ static obj_t *string_out_put_string(obj_t *stream, const char *s) return nil; } - so->buf = chk_realloc(so->buf, so->size); - memcpy(so->buf + so->fill, s, len + 1); + so->buf = chk_realloc(so->buf, so->size * sizeof *so->buf); + memcpy(so->buf + so->fill, s, (len + 1) * sizeof *so->buf); so->fill += len; return t; } @@ -402,7 +405,7 @@ static obj_t *string_out_put_string(obj_t *stream, const char *s) static obj_t *string_out_put_char(obj_t *stream, int ch) { - char mini[2]; + wchar_t mini[2]; mini[0] = ch; mini[1] = 0; return string_out_put_string(stream, mini); @@ -415,11 +418,12 @@ obj_t *string_out_vcformat(obj_t *stream, const char *fmt, va_list vl) if (so == 0) { return nil; } else { - int nchars, nchars2; + int nchars, nchars2, nchars3; char dummy_buf[1]; size_t old_size = so->size; size_t required_size; va_list vl_copy; + char *utf8_buf; #if defined va_copy va_copy (vl_copy, vl); @@ -437,21 +441,31 @@ obj_t *string_out_vcformat(obj_t *stream, const char *fmt, va_list vl) bug_unless (nchars >= 0); - required_size = so->fill + nchars + 1; + utf8_buf = chk_malloc(nchars + 1); + nchars2 = vsnprintf(utf8_buf, nchars + 1, fmt, vl); + bug_unless (nchars == nchars2); + + nchars3 = utf8_from(0, utf8_buf); - if (required_size < so->fill) + required_size = so->fill + nchars3 + 1; + + if (required_size < so->fill) { + free(utf8_buf); return nil; + } while (so->size <= required_size) { so->size *= 2; - if (so->size < old_size) + if (so->size < old_size) { + free(utf8_buf); return nil; + } } - so->buf = chk_realloc(so->buf, so->size); - nchars2 = vsnprintf(so->buf + so->fill, so->size-so->fill, fmt, vl); - bug_unless (nchars == nchars2); - so->fill += nchars; + so->buf = chk_realloc(so->buf, so->size * sizeof *so->buf); + utf8_from(so->buf, utf8_buf); + free(utf8_buf); + so->fill += nchars3; return t; } } @@ -483,7 +497,7 @@ static obj_t *dir_get_line(obj_t *stream) return nil; if (!strcmp(e->d_name, ".") || !strcmp(e->d_name, "..")) continue; - return string(e->d_name); + return string_utf8(e->d_name); } } } @@ -539,7 +553,7 @@ obj_t *make_string_output_stream(void) { struct string_output *so = (struct string_output *) chk_malloc(sizeof *so); so->size = 128; - so->buf = (char *) chk_malloc(so->size); + so->buf = (wchar_t *) chk_malloc(so->size * sizeof so->buf); so->fill = 0; so->buf[0] = 0; return cobj((void *) so, stream_t, &string_out_ops.cobj_ops); @@ -548,7 +562,7 @@ obj_t *make_string_output_stream(void) obj_t *get_string_from_stream(obj_t *stream) { type_check (stream, COBJ); - type_assert (stream->co.cls == stream_t, ("~a is not a stream", stream)); + type_assert (stream->co.cls == stream_t, (L"~a is not a stream", stream)); if (stream->co.ops == &string_out_ops.cobj_ops) { struct string_output *so = (struct string_output *) stream->co.handle; @@ -579,7 +593,7 @@ obj_t *make_dir_stream(DIR *dir) obj_t *close_stream(obj_t *stream, obj_t *throw_on_error) { type_check (stream, COBJ); - type_assert (stream->co.cls == stream_t, ("~a is not a stream", stream)); + type_assert (stream->co.cls == stream_t, (L"~a is not a stream", stream)); { struct strm_ops *ops = (struct strm_ops *) stream->co.ops; @@ -590,7 +604,7 @@ obj_t *close_stream(obj_t *stream, obj_t *throw_on_error) obj_t *get_line(obj_t *stream) { type_check (stream, COBJ); - type_assert (stream->co.cls == stream_t, ("~a is not a stream", stream)); + type_assert (stream->co.cls == stream_t, (L"~a is not a stream", stream)); { struct strm_ops *ops = (struct strm_ops *) stream->co.ops; @@ -601,7 +615,7 @@ obj_t *get_line(obj_t *stream) obj_t *get_char(obj_t *stream) { type_check (stream, COBJ); - type_assert (stream->co.cls == stream_t, ("~a is not a stream", stream)); + type_assert (stream->co.cls == stream_t, (L"~a is not a stream", stream)); { struct strm_ops *ops = (struct strm_ops *) stream->co.ops; @@ -609,10 +623,10 @@ obj_t *get_char(obj_t *stream) } } -obj_t *vformat(obj_t *stream, const char *str, va_list vl) +obj_t *vformat(obj_t *stream, const wchar_t *str, va_list vl) { type_check (stream, COBJ); - type_assert (stream->co.cls == stream_t, ("~a is not a stream", stream)); + type_assert (stream->co.cls == stream_t, (L"~a is not a stream", stream)); { struct strm_ops *ops = (struct strm_ops *) stream->co.ops; @@ -623,7 +637,7 @@ obj_t *vformat(obj_t *stream, const char *str, va_list vl) obj_t *vcformat(obj_t *stream, const char *string, va_list vl) { type_check (stream, COBJ); - type_assert (stream->co.cls == stream_t, ("~a is not a stream", stream)); + type_assert (stream->co.cls == stream_t, (L"~a is not a stream", stream)); { struct strm_ops *ops = (struct strm_ops *) stream->co.ops; @@ -631,10 +645,10 @@ obj_t *vcformat(obj_t *stream, const char *string, va_list vl) } } -obj_t *format(obj_t *stream, const char *str, ...) +obj_t *format(obj_t *stream, const wchar_t *str, ...) { type_check (stream, COBJ); - type_assert (stream->co.cls == stream_t, ("~a is not a stream", stream)); + type_assert (stream->co.cls == stream_t, (L"~a is not a stream", stream)); { struct strm_ops *ops = (struct strm_ops *) stream->co.ops; @@ -651,7 +665,7 @@ obj_t *format(obj_t *stream, const char *str, ...) obj_t *cformat(obj_t *stream, const char *string, ...) { type_check (stream, COBJ); - type_assert (stream->co.cls == stream_t, ("~a is not a stream", stream)); + type_assert (stream->co.cls == stream_t, (L"~a is not a stream", stream)); { struct strm_ops *ops = (struct strm_ops *) stream->co.ops; @@ -668,7 +682,7 @@ obj_t *cformat(obj_t *stream, const char *string, ...) obj_t *put_string(obj_t *stream, obj_t *string) { type_check (stream, COBJ); - type_assert (stream->co.cls == stream_t, ("~a is not a stream", stream)); + type_assert (stream->co.cls == stream_t, (L"~a is not a stream", stream)); { struct strm_ops *ops = (struct strm_ops *) stream->co.ops; @@ -676,10 +690,10 @@ obj_t *put_string(obj_t *stream, obj_t *string) } } -obj_t *put_cstring(obj_t *stream, const char *str) +obj_t *put_cstring(obj_t *stream, const wchar_t *str) { type_check (stream, COBJ); - type_assert (stream->co.cls == stream_t, ("~a is not a stream", stream)); + type_assert (stream->co.cls == stream_t, (L"~a is not a stream", stream)); { struct strm_ops *ops = (struct strm_ops *) stream->co.ops; @@ -690,7 +704,7 @@ obj_t *put_cstring(obj_t *stream, const char *str) obj_t *put_char(obj_t *stream, obj_t *ch) { type_check (stream, COBJ); - type_assert (stream->co.cls == stream_t, ("~a is not a stream", stream)); + type_assert (stream->co.cls == stream_t, (L"~a is not a stream", stream)); { struct strm_ops *ops = (struct strm_ops *) stream->co.ops; @@ -701,7 +715,7 @@ obj_t *put_char(obj_t *stream, obj_t *ch) obj_t *put_cchar(obj_t *stream, int ch) { type_check (stream, COBJ); - type_assert (stream->co.cls == stream_t, ("~a is not a stream", stream)); + type_assert (stream->co.cls == stream_t, (L"~a is not a stream", stream)); { struct strm_ops *ops = (struct strm_ops *) stream->co.ops; @@ -717,7 +731,7 @@ obj_t *put_line(obj_t *stream, obj_t *string) void stream_init(void) { protect(&std_input, &std_output, &std_error, (obj_t **) 0); - std_input = make_stdio_stream(stdin, string("stdin"), t, nil); - std_output = make_stdio_stream(stdout, string("stdout"), nil, t); - std_error = make_stdio_stream(stderr, string("stderr"), nil, t); + std_input = make_stdio_stream(stdin, string(L"stdin"), t, nil); + std_output = make_stdio_stream(stdout, string(L"stdout"), nil, t); + std_error = make_stdio_stream(stderr, string(L"stderr"), nil, t); } @@ -35,13 +35,13 @@ obj_t *make_dir_stream(DIR *); obj_t *close_stream(obj_t *stream, obj_t *throw_on_error); obj_t *get_line(obj_t *); obj_t *get_char(obj_t *); -obj_t *vformat(obj_t *stream, const char *string, va_list); /* nao-terminated */ +obj_t *vformat(obj_t *stream, const wchar_t *string, va_list); /* nao-term */ obj_t *vcformat(obj_t *stream, const char *string, va_list); /* printf-style */ -obj_t *format(obj_t *stream, const char *string, ...); +obj_t *format(obj_t *stream, const wchar_t *string, ...); obj_t *cformat(obj_t *stream, const char *string, ...); obj_t *put_string(obj_t *stream, obj_t *string); obj_t *put_line(obj_t *stream, obj_t *string); -obj_t *put_cstring(obj_t *stream, const char *); +obj_t *put_cstring(obj_t *stream, const wchar_t *); obj_t *put_char(obj_t *stream, obj_t *ch); obj_t *put_cchar(obj_t *stream, int ch); @@ -31,17 +31,19 @@ #include <limits.h> #include <dirent.h> #include <setjmp.h> +#include <wchar.h> #include "lib.h" #include "stream.h" #include "gc.h" #include "unwind.h" #include "parser.h" #include "match.h" +#include "utf8.h" #include "txr.h" -const char *version = "020"; -const char *progname = "txr"; -const char *spec_file = "stdin"; +const wchar_t *version = L"020"; +const wchar_t *progname = L"txr"; +const wchar_t *spec_file = L"stdin"; obj_t *spec_file_str; /* @@ -51,7 +53,7 @@ obj_t *spec_file_str; */ void *oom_realloc_handler(void *old, size_t size) { - fprintf(stderr, "%s: out of memory\n", progname); + fprintf(stderr, "%ls: out of memory\n", progname); puts("false"); abort(); } @@ -60,13 +62,13 @@ void help(void) { const char *text = "\n" -"txr version %s\n" +"txr version %ls\n" "\n" "copyright 2009, Kaz Kylheku <kkylheku@gmail.com>\n" "\n" "usage:\n" "\n" -" %s [ options ] query-file { data-file }*\n" +" %ls [ options ] query-file { data-file }*\n" "\n" "The query-file or data-file arguments may be specified as -, in which case\n" "standard input is used. All data-file arguments which begin with a !\n" @@ -110,7 +112,7 @@ void help(void) void hint(void) { - fprintf(stderr, "%s: incorrect arguments: try --help\n", progname); + fprintf(stderr, "%ls: incorrect arguments: try --help\n", progname); } obj_t *remove_hash_bang_line(obj_t *spec) @@ -119,7 +121,7 @@ obj_t *remove_hash_bang_line(obj_t *spec) return spec; { - obj_t *shbang = string("#!"); + obj_t *shbang = string(L"#!"); obj_t *firstline = first(spec); obj_t *items = rest(firstline); @@ -138,7 +140,7 @@ static int txr_main(int argc, char **argv); int main(int argc, char **argv) { obj_t *stack_bottom = nil; - progname = argv[0] ? argv[0] : progname; + progname = argv[0] ? utf8_dup_from(argv[0]) : progname; init(progname, oom_realloc_handler, &stack_bottom); return txr_main(argc, argv); } @@ -149,7 +151,6 @@ static int txr_main(int argc, char **argv) obj_t *spec = nil; obj_t *bindings = nil; int match_loglevel = opt_loglevel; - progname = argv[0] ? argv[0] : progname; prot1(&spec_file_str); @@ -189,7 +190,7 @@ static int txr_main(int argc, char **argv) val[piece] = 0; - list = cons(string(val), list); + list = cons(string_utf8(val), list); if (!comma_p) break; @@ -198,13 +199,13 @@ static int txr_main(int argc, char **argv) } list = nreverse(list); - bindings = cons(cons(intern(string(var)), list), bindings); + bindings = cons(cons(intern(string_utf8(var)), list), bindings); } else if (equals) { char *val = equals + 1; *equals = 0; - bindings = cons(cons(intern(string(var)), string(val)), bindings); + bindings = cons(cons(intern(string_utf8(var)), string_utf8(val)), bindings); } else { - bindings = cons(cons(intern(string(var)), null_string), bindings); + bindings = cons(cons(intern(string_utf8(var)), null_string), bindings); } argc--, argv++; @@ -212,7 +213,7 @@ static int txr_main(int argc, char **argv) } if (!strcmp(*argv, "--version")) { - printf("%s: version %s\n", progname, version); + printf("%ls: version %ls\n", progname, version); return 0; } @@ -227,7 +228,7 @@ static int txr_main(int argc, char **argv) char opt = (*argv)[1]; if (argc == 1) { - fprintf(stderr, "%s: option %c needs argument\n", progname, opt); + fprintf(stderr, "%ls: option %c needs argument\n", progname, opt); return EXIT_FAILURE; } @@ -238,7 +239,7 @@ static int txr_main(int argc, char **argv) case 'a': val = strtol(*argv, &errp, 10); if (*errp != 0) { - fprintf(stderr, "%s: option %c needs numeric argument, not %s\n", + fprintf(stderr, "%ls: option %c needs numeric argument, not %s\n", progname, opt, *argv); return EXIT_FAILURE; } @@ -246,10 +247,10 @@ static int txr_main(int argc, char **argv) opt_arraydims = val; break; case 'c': - specstring = string(*argv); + specstring = string_utf8(*argv); break; case 'f': - spec_file_str = string(*argv); + spec_file_str = string_utf8(*argv); break; } @@ -279,14 +280,14 @@ static int txr_main(int argc, char **argv) case 'a': case 'c': case 'D': - fprintf(stderr, "%s: option -%c does not clump\n", progname, *popt); + fprintf(stderr, "%ls: option -%c does not clump\n", progname, *popt); return EXIT_FAILURE; case '-': - fprintf(stderr, "%s: unrecognized long option: --%s\n", + fprintf(stderr, "%ls: unrecognized long option: --%s\n", progname, popt + 1); return EXIT_FAILURE; default: - fprintf(stderr, "%s: unrecognized option: %c\n", progname, *popt); + fprintf(stderr, "%ls: unrecognized option: %c\n", progname, *popt); return EXIT_FAILURE; } } @@ -296,22 +297,22 @@ static int txr_main(int argc, char **argv) } if (specstring && spec_file_str) { - fprintf(stderr, "%s: cannot specify both -f and -c\n", progname); + fprintf(stderr, "%ls: cannot specify both -f and -c\n", progname); return EXIT_FAILURE; } if (specstring) { - spec_file = "cmdline"; + spec_file = L"cmdline"; spec_file_str = string(spec_file); yyin_stream = make_string_input_stream(specstring); } else if (spec_file_str) { - if (strcmp(c_str(spec_file_str), "-") != 0) { - FILE *in = fopen(c_str(spec_file_str), "r"); + if (wcscmp(c_str(spec_file_str), L"-") != 0) { + FILE *in = w_fopen(c_str(spec_file_str), L"r"); if (in == 0) uw_throwcf(file_error, "unable to open %s", c_str(spec_file_str)); yyin_stream = make_stdio_stream(in, spec_file_str, t, nil); } else { - spec_file = "stdin"; + spec_file = L"stdin"; } } else { if (argc < 1) { @@ -323,10 +324,10 @@ static int txr_main(int argc, char **argv) FILE *in = fopen(*argv, "r"); if (in == 0) uw_throwcf(file_error, "unable to open %s", *argv); - yyin_stream = make_stdio_stream(in, string(*argv), t, nil); - spec_file = *argv; + yyin_stream = make_stdio_stream(in, string_utf8(*argv), t, nil); + spec_file = utf8_dup_from(*argv); } else { - spec_file = "stdin"; + spec_file = L"stdin"; } argc--, argv++; spec_file_str = string(spec_file); @@ -345,8 +346,8 @@ static int txr_main(int argc, char **argv) opt_loglevel = match_loglevel; if (opt_loglevel >= 2) { - format(std_error, "spec:\n~s\n", spec, nao); - format(std_error, "bindings:\n~s\n", bindings, nao); + format(std_error, L"spec:\n~s\n", spec, nao); + format(std_error, L"bindings:\n~s\n", bindings, nao); } { @@ -354,7 +355,7 @@ static int txr_main(int argc, char **argv) list_collect_decl(filenames, iter); while (*argv) - list_collect(iter, string(*argv++)); + list_collect(iter, string_utf8(*argv++)); retval = extract(spec, filenames, bindings); @@ -28,6 +28,6 @@ extern int opt_loglevel; extern int opt_nobindings; extern int opt_arraydims; extern int opt_gc_debug; -extern const char *version; -extern const char *progname; +extern const wchar_t *version; +extern const wchar_t *progname; extern int output_produced; @@ -217,15 +217,15 @@ obj_t *uw_throw(obj_t *sym, obj_t *exception) if (ex == 0) { if (opt_loglevel >= 1) { obj_t *s = stringp(exception); - format(std_error, "~a: unhandled exception of type ~a:\n", + format(std_error, L"~a: unhandled exception of type ~a:\n", prog_string, sym, nao); - format(std_error, s ? "~a: ~a\n" : "~a: ~s\n", + format(std_error, s ? L"~a: ~a\n" : L"~a: ~s\n", prog_string, exception, nao); } if (uw_exception_subtype_p(sym, query_error) || uw_exception_subtype_p(sym, file_error)) { if (!output_produced) - put_cstring(std_output, "false\n"); + put_cstring(std_output, L"false\n"); exit(EXIT_FAILURE); } abort(); @@ -238,7 +238,7 @@ obj_t *uw_throw(obj_t *sym, obj_t *exception) abort(); } -obj_t *uw_throwf(obj_t *sym, const char *fmt, ...) +obj_t *uw_throwf(obj_t *sym, const wchar_t *fmt, ...) { va_list vl; obj_t *stream = make_string_output_stream(); @@ -251,7 +251,7 @@ obj_t *uw_throwf(obj_t *sym, const char *fmt, ...) abort(); } -obj_t *uw_errorf(const char *fmt, ...) +obj_t *uw_errorf(const wchar_t *fmt, ...) { va_list vl; obj_t *stream = make_string_output_stream(); @@ -290,7 +290,7 @@ obj_t *uw_errorcf(const char *fmt, ...) abort(); } -obj_t *type_mismatch(const char *fmt, ...) +obj_t *type_mismatch(const wchar_t *fmt, ...) { va_list vl; obj_t *stream = make_string_output_stream(); @@ -317,21 +317,21 @@ obj_t *uw_register_subtype(obj_t *sub, obj_t *sup) if (sub == t) { if (sup == t) return sup; - uw_throwf(type_error, "cannot define ~a as an exception subtype of ~a", + uw_throwf(type_error, L"cannot define ~a as an exception subtype of ~a", sub, sup, nao); } if (sup == nil) { - uw_throwf(type_error, "cannot define ~a as an exception subtype of ~a", + uw_throwf(type_error, L"cannot define ~a as an exception subtype of ~a", sub, sup, nao); } if (uw_exception_subtype_p(sub, sup)) - uw_throwf(type_error, "~a is already an exception subtype of ~a", + uw_throwf(type_error, L"~a is already an exception subtype of ~a", sub, sup, nao); if (uw_exception_subtype_p(sup, sub)) - uw_throwf(type_error, "~a is already an exception supertype of ~a", + uw_throwf(type_error, L"~a is already an exception supertype of ~a", sub, sup, nao); /* If sup symbol not registered, then we make it @@ -79,8 +79,8 @@ obj_t *uw_set_func(obj_t *sym, obj_t *value); obj_t *uw_block_return(obj_t *tag, obj_t *result); void uw_push_catch(uw_frame_t *, obj_t *matches); noreturn obj_t *uw_throw(obj_t *sym, obj_t *exception); -noreturn obj_t *uw_throwf(obj_t *sym, const char *fmt, ...); -noreturn obj_t *uw_errorf(const char *fmt, ...); +noreturn obj_t *uw_throwf(obj_t *sym, const wchar_t *fmt, ...); +noreturn obj_t *uw_errorf(const wchar_t *fmt, ...); noreturn obj_t *uw_throwcf(obj_t *sym, const char *fmt, ...); noreturn obj_t *uw_errorcf(const char *fmt, ...); obj_t *uw_register_subtype(obj_t *sub, obj_t *super); @@ -89,7 +89,7 @@ void uw_continue(uw_frame_t *curr, uw_frame_t *target); void uw_pop_frame(uw_frame_t *); void uw_init(void); -noreturn obj_t *type_mismatch(const char *, ...); +noreturn obj_t *type_mismatch(const wchar_t *, ...); #define uw_block_begin(TAG, RESULTVAR) \ obj_t *RESULTVAR = nil; \ @@ -0,0 +1,168 @@ +/* Copyright 2009 + * Kaz Kylheku <kkylheku@gmail.com> + * Vancouver, Canada + * All rights reserved. + * + * BSD License: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include "lib.h" + +size_t utf8_from(wchar_t *wdst, const unsigned char *src) +{ + size_t nchar = 1; + enum { init, more1, more2, more3 } state; + const char *backtrack = 0; + int ch; + wchar_t wch = 0; + + for (state = init; (ch = *src); src++) { + switch (state) { + case init: + if (ch < 0x80) { + if (wdst) + *wdst++ = ch; + nchar++; + } else if (ch >= 0xc2 && ch <= 0xe0) { + state = more1; + wch = (ch & 0x1f); + } else if (ch >= 0xe0 && ch <= 0xef) { + state = more2; + wch = (ch & 0xf); + } else if (ch >= 0xf0 && ch < 0xf5) { + state = more3; + wch = (ch & 0x7); + } else { + if (wdst) + *wdst++ = 0xdc00 | ch; + nchar++; + } + backtrack = src; + break; + case more1: + case more2: + case more3: + if (ch >= 0x80 && ch < 0xc0) { + wch <<= 6; + wch |= (ch & 0x3f); + if (wdst) + *wdst++ = wch; + nchar++; + state--; + } else { + src = backtrack; + if (wdst) + *wdst++ = 0xdc00 | *src; + nchar++; + state = init; + } + break; + } + } + + if (state != init) { + if (wdst) + *wdst++ = 0xdc00 | *backtrack; + nchar++; + } + + if (wdst) + *wdst++ = 0; + return nchar; +} + +size_t utf8_to(unsigned char *dst, const wchar_t *wsrc) +{ + size_t nbyte = 1; + wchar_t wch; + + while ((wch = *wsrc++)) { + if (wch < 0x80) { + nbyte += 1; + if (dst) + *dst++ = wch; + } else if (wch < 0x800) { + nbyte += 2; + if (dst) { + *dst++ = 0xC0 | (wch >> 6); + *dst++ = 0x80 | (wch & 0x3F); + } + } else if (wch < 0x10000) { + nbyte += 3; + if (dst) { + *dst++ = 0xE0 | (wch >> 12); + *dst++ = 0x80 | ((wch >> 6) & 0x3F); + *dst++ = 0x80 | (wch & 0x3F); + } + } else if (wch < 0x110000) { + nbyte += 4; + if (dst) { + *dst++ = 0xF0 | (wch >> 18); + *dst++ = 0x80 | ((wch >> 12) & 0x3F); + *dst++ = 0x80 | ((wch >> 6) & 0x3F); + *dst++ = 0x80 | (wch & 0x3F); + } + } + } + + if (dst) + *dst++ = 0; + return nbyte; +} + +wchar_t *utf8_dup_from(const unsigned char *str) +{ + size_t nchar = utf8_from(0, str); + wchar_t *wstr = chk_malloc(sizeof *wstr * nchar); + utf8_from(wstr, str); + return wstr; +} + +unsigned char *utf8_dup_to(const wchar_t *wstr) +{ + size_t nbyte = utf8_to(0, wstr); + unsigned char *str = chk_malloc(nbyte); + utf8_to(str, wstr); + return str; +} + +FILE *w_fopen(const wchar_t *wname, const wchar_t *wmode) +{ + char *name = (char *) utf8_dup_to(wname); + char *mode = (char *) utf8_dup_to(wmode); + FILE *f = fopen(name, mode); + free(name); + free(mode); + return f; +} + +FILE *w_popen(const wchar_t *wcmd, const wchar_t *wmode) +{ + char *cmd = (char *) utf8_dup_to(wcmd); + char *mode = (char *) utf8_dup_to(wmode); + FILE *f = popen(cmd, mode); + free(cmd); + free(mode); + return f; +} @@ -0,0 +1,32 @@ +/* Copyright 2009 + * Kaz Kylheku <kkylheku@gmail.com> + * Vancouver, Canada + * All rights reserved. + * + * BSD License: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ + +size_t utf8_from(wchar_t *, const unsigned char *); +size_t utf8_to(unsigned char *, const wchar_t *); +wchar_t *utf8_dup_from(const unsigned char *); +unsigned char *utf8_dup_to(const wchar_t *); +FILE *w_fopen(const wchar_t *, const wchar_t *); +FILE *w_popen(const wchar_t *, const wchar_t *); |