diff options
author | Kaz Kylheku <kaz@kylheku.com> | 2009-11-12 16:34:27 -0800 |
---|---|---|
committer | Kaz Kylheku <kaz@kylheku.com> | 2009-11-12 16:34:27 -0800 |
commit | aa4420347f132039a3e37d6996d1e31096fc10de (patch) | |
tree | cfebd82beda9e272899efae5e5f5dcfb0fc767fd | |
parent | 52501f18487dbefaf0282f1bf1cc328b3fe1ab00 (diff) | |
download | txr-aa4420347f132039a3e37d6996d1e31096fc10de.tar.gz txr-aa4420347f132039a3e37d6996d1e31096fc10de.tar.bz2 txr-aa4420347f132039a3e37d6996d1e31096fc10de.zip |
Documenting extended characters in man page.
Cleaned up some more issues related to extended characters.
-rw-r--r-- | ChangeLog | 22 | ||||
-rw-r--r-- | parser.l | 15 | ||||
-rw-r--r-- | stream.c | 60 | ||||
-rw-r--r-- | stream.h | 2 | ||||
-rw-r--r-- | txr.1 | 22 |
5 files changed, 107 insertions, 14 deletions
@@ -1,5 +1,27 @@ 2009-11-12 Kaz Kylheku <kkylheku@gmail.com> + Documenting extended characters in man page. + Cleaned up some more issues related to extended characters. + + * parser.l (grammar): Added error actions for invalid UTF-8 bytes. + + * stream.c (BROKEN_POPEN_GETWC): New macro. Enables workaround + for a glibc bug, whereby getwc blows up when applied to a FILE * + stream returned from a popen call. + (struct strm_ops): put_char function takes wchar_t. + (common_format): Use wchar_t rather than int. + (stdio_put_string): fputws returns -1, not EOF. + (stdio_put_char, put_cchar): Character argument changed to wchar_t. + Output done with putwc used instead of putc. + (snarf_line, stdio_get_char): Use getwc to read from the stream. + (pipe_close, make_pipe_stream): Implement workaround for glibc bug. + + * stream.h (put_cchar): Declaration updated. + + * txr.1: Added notes about international characters. + +2009-11-12 Kaz Kylheku <kkylheku@gmail.com> + Regular expression module updated to do unicode character sets. Most of the changes are in the area of representing sets. @@ -399,6 +399,11 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} yyerrorf("bad character in directive: '%s'", yytext); } +<SPECIAL,NESTED>. { + yyerrorf("non-UTF-8 byte in directive: '\\x%02x'", + (unsigned char) yytext[0]); + } + <REGEX>[/] { yy_pop_state(); if (yy_top_state() == INITIAL @@ -452,6 +457,11 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} return REGCHAR; } +<REGEX>. { + yyerrorf("non-UTF-8 byte in regex: '\\x%02x'", + (unsigned char) yytext[0]); + } + <INITIAL>({UONLY}|[^@\n])+ { yylval.lexeme = utf8_dup_from(yytext); return TEXT; @@ -536,4 +546,9 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} return LITCHAR; } +<STRLIT,CHRLIT,QSILIT>. { + yyerrorf("non-UTF-8 byte in literal: '\\x%02x'", + (unsigned char) yytext[0]); + } + %% @@ -24,6 +24,12 @@ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
*/ +/* + * Enable code to work around getwc crash in glibc, + * which happens on FILE * handles from popen. + */ +#define BROKEN_POPEN_GETWC + #include <stdio.h> #include <string.h> #include <dirent.h> @@ -33,6 +39,7 @@ #include <setjmp.h> #include <errno.h> #include <wchar.h> +#include <unistd.h> #include "lib.h" #include "gc.h" #include "unwind.h" @@ -44,7 +51,7 @@ obj_t *std_input, *std_output, *std_error; struct strm_ops { struct cobj_ops cobj_ops; obj_t *(*put_string)(obj_t *, const wchar_t *); - obj_t *(*put_char)(obj_t *, int); + obj_t *(*put_char)(obj_t *, wchar_t); obj_t *(*get_line)(obj_t *); obj_t *(*get_char)(obj_t *); obj_t *(*vcformat)(obj_t *, const char *fmt, va_list vl); @@ -64,7 +71,7 @@ static void common_destroy(obj_t *obj) obj_t *common_vformat(obj_t *stream, const wchar_t *fmt, va_list vl) { - int ch; + wchar_t ch; for (; (ch = *fmt) != 0; fmt++) { obj_t *obj; @@ -105,6 +112,9 @@ obj_t *common_vformat(obj_t *stream, const wchar_t *fmt, va_list vl) struct stdio_handle { FILE *f; +#ifdef BROKEN_POPEN_GETWC + FILE *f_orig_pipe; +#endif obj_t *descr; }; @@ -152,13 +162,14 @@ static obj_t *stdio_maybe_write_error(obj_t *stream) static obj_t *stdio_put_string(obj_t *stream, const wchar_t *s) { struct stdio_handle *h = (struct stdio_handle *) stream->co.handle; - return (h->f && fputws(s, h->f) != EOF) ? t : stdio_maybe_write_error(stream); + return (h->f && fputws(s, h->f) != -1) ? t : stdio_maybe_write_error(stream); } -static obj_t *stdio_put_char(obj_t *stream, int ch) +static obj_t *stdio_put_char(obj_t *stream, wchar_t ch) { struct stdio_handle *h = (struct stdio_handle *) stream->co.handle; - return (h->f && putc(ch, h->f) != EOF) ? t : stdio_maybe_write_error(stream); + return (h->f && putwc(ch, h->f) != WEOF) + ? 
t : stdio_maybe_write_error(stream); } static wchar_t *snarf_line(FILE *in) @@ -169,9 +180,9 @@ static wchar_t *snarf_line(FILE *in) wchar_t *buf = 0; for (;;) { - int ch = getc(in); + wint_t ch = getwc(in); - if (ch == EOF && buf == 0) + if (ch == WEOF && buf == 0) break; if (fill >= size) { @@ -180,7 +191,7 @@ static wchar_t *snarf_line(FILE *in) size = newsize; } - if (ch == '\n' || ch == EOF) { + if (ch == '\n' || ch == WEOF) { buf[fill++] = 0; break; } @@ -210,8 +221,8 @@ obj_t *stdio_get_char(obj_t *stream) { struct stdio_handle *h = (struct stdio_handle *) stream->co.handle; if (h->f) { - int ch = getc(h->f); - return (ch != EOF) ? chr(ch) : stdio_maybe_read_error(stream); + wint_t ch = getwc(h->f); + return (ch != WEOF) ? chr(ch) : stdio_maybe_read_error(stream); } return nil; } @@ -262,9 +273,13 @@ static obj_t *pipe_close(obj_t *stream, obj_t *throw_on_error) struct stdio_handle *h = (struct stdio_handle *) stream->co.handle; if (h->f != 0) { +#ifdef BROKEN_POPEN_GETWC + int status = (fclose(h->f), pclose(h->f_orig_pipe)); + h->f = h->f_orig_pipe = 0; +#else int status = pclose(h->f); - h->f = 0; +#endif if (status != 0 && throw_on_error) { if (status < 0) { @@ -403,7 +418,7 @@ static obj_t *string_out_put_string(obj_t *stream, const wchar_t *s) } } -static obj_t *string_out_put_char(obj_t *stream, int ch) +static obj_t *string_out_put_char(obj_t *stream, wchar_t ch) { wchar_t mini[2]; mini[0] = ch; @@ -539,8 +554,27 @@ obj_t *make_stdio_stream(FILE *f, obj_t *descr, obj_t *input, obj_t *output) obj_t *make_pipe_stream(FILE *f, obj_t *descr, obj_t *input, obj_t *output) { struct stdio_handle *h = (struct stdio_handle *) chk_malloc(sizeof *h); +#ifdef BROKEN_POPEN_GETWC + int dup_fd = dup(fileno(f)); + FILE *dup_f = (dup_fd != -1) ? fdopen(dup_fd, output ? 
"w" : "r") : 0; + + if (dup_fd == -1 || dup_f == 0) { + int error = errno; + if (dup_f != 0) + fclose(dup_f); + else if (dup_fd != -1) + close(dup_fd); + free(h); + uw_throwf(process_error, L"unable to create pipe ~a: ~a/~s", descr, + num(error), string_utf8(strerror(error)), nao); + } + + h->f_orig_pipe = f; + h->f = dup_f; +#else h->f = f; h->descr = descr; +#endif return cobj((void *) h, stream_t, &pipe_ops.cobj_ops); } @@ -712,7 +746,7 @@ obj_t *put_char(obj_t *stream, obj_t *ch) } } -obj_t *put_cchar(obj_t *stream, int ch) +obj_t *put_cchar(obj_t *stream, wchar_t ch) { type_check (stream, COBJ); type_assert (stream->co.cls == stream_t, (L"~a is not a stream", stream)); @@ -43,6 +43,6 @@ obj_t *put_string(obj_t *stream, obj_t *string); obj_t *put_line(obj_t *stream, obj_t *string); obj_t *put_cstring(obj_t *stream, const wchar_t *); obj_t *put_char(obj_t *stream, obj_t *ch); -obj_t *put_cchar(obj_t *stream, int ch); +obj_t *put_cchar(obj_t *stream, wchar_t ch); void stream_init(void); @@ -396,6 +396,28 @@ does not split the line into two; it's embedded into the line and thus cannot match anything. However, @\en may be useful in the @(cat) directive and in @(output). +.SS International Characters + +.B txr +represents text internally using wide characters, which are used to represent +Unicode code points. The query language, as well as all data sources, are +assumed to be in the UTF-8 encoding. In the query language, extended +characters can be used directly in comments, literal text, string literals, +quasiliterals and regular expressions. Extended characters can also be +expressed indirectly using hexadecimal or octal escapes. +On some platforms, wide characters may be restricted to 16 bits, so that +.B txr +can only work with characters in the BMP (Basic Multilingual Plane) +subset of Unicode. + +If +.B txr +encounters invalid bytes in the UTF-8 input, what happens depends on the +context in which this occurs. 
Invalid bytes in a query are reported as errors. +Invalid bytes in data are currently treated in an unspecified way. In +the future, invalid bytes in data will be mapped to the Unicode codes +U+DC00 through U+DCFF. + .SS Variables Much of the query syntax consists of arbitrary text, which matches file data |