summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog22
-rw-r--r--parser.l15
-rw-r--r--stream.c60
-rw-r--r--stream.h2
-rw-r--r--txr.122
5 files changed, 107 insertions, 14 deletions
diff --git a/ChangeLog b/ChangeLog
index 4fbbf5bb..82ee1edf 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,27 @@
2009-11-12 Kaz Kylheku <kkylheku@gmail.com>
+ Documenting extended characters in man page.
+ Cleaned up some more issues related to extended characters.
+
+ * parser.l (grammar): Added error sctions for invalid UTF-8 bytes.
+
+ * stream.c (BROKEN_POPEN_GETWC): New macro. Enables workaround
+ for a glibc bug, whereby getwc blows up when applied to a FILE *
+ stream returned from a popen call.
+ (struct strm_ops): put_char function takes wchar_t.
+ (common_format): Use wchar_t rather than int.
+ (stdio_put_string): fputws returns -1, not EOF.
+ (stdio_put_char, put_cchar): Character argument changed to wchar_t.
+ Output done with putwc used instead of putc.
+ (snarf_line, stdio_get_char): Use getwc to read from the stream.
+ (pipe_close, make_pipe_stream): Implement workaround form glibc bug.
+
+ * stream.h (put_cchar): Declaration updated.
+
+ * txr.1: Added notes about international characters.
+
+2009-11-12 Kaz Kylheku <kkylheku@gmail.com>
+
Regular expression module updated to do unicode character sets.
Most of the changes are in the area of representing sets.
diff --git a/parser.l b/parser.l
index 5919f929..b15f5ad1 100644
--- a/parser.l
+++ b/parser.l
@@ -399,6 +399,11 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
yyerrorf("bad character in directive: '%s'", yytext);
}
+<SPECIAL,NESTED>. {
+ yyerrorf("non-UTF-8 byte in directive: '\\x%02x'",
+ (unsigned char) yytext[0]);
+ }
+
<REGEX>[/] {
yy_pop_state();
if (yy_top_state() == INITIAL
@@ -452,6 +457,11 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
return REGCHAR;
}
+<REGEX>. {
+ yyerrorf("non-UTF-8 byte in regex: '\\x%02x'",
+ (unsigned char) yytext[0]);
+ }
+
<INITIAL>({UONLY}|[^@\n])+ {
yylval.lexeme = utf8_dup_from(yytext);
return TEXT;
@@ -536,4 +546,9 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
return LITCHAR;
}
+<STRLIT,CHRLIT,QSILIT>. {
+ yyerrorf("non-UTF-8 byte in literal: '\\x%02x'",
+ (unsigned char) yytext[0]);
+ }
+
%%
diff --git a/stream.c b/stream.c
index 91ee2e85..0c1050f1 100644
--- a/stream.c
+++ b/stream.c
@@ -24,6 +24,12 @@
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
*/
+/*
+ * Enable code to work around getwc crash in glibc,
+ * which happens on FILE * handles from popen.
+ */
+#define BROKEN_POPEN_GETWC
+
#include <stdio.h>
#include <string.h>
#include <dirent.h>
@@ -33,6 +39,7 @@
#include <setjmp.h>
#include <errno.h>
#include <wchar.h>
+#include <unistd.h>
#include "lib.h"
#include "gc.h"
#include "unwind.h"
@@ -44,7 +51,7 @@ obj_t *std_input, *std_output, *std_error;
struct strm_ops {
struct cobj_ops cobj_ops;
obj_t *(*put_string)(obj_t *, const wchar_t *);
- obj_t *(*put_char)(obj_t *, int);
+ obj_t *(*put_char)(obj_t *, wchar_t);
obj_t *(*get_line)(obj_t *);
obj_t *(*get_char)(obj_t *);
obj_t *(*vcformat)(obj_t *, const char *fmt, va_list vl);
@@ -64,7 +71,7 @@ static void common_destroy(obj_t *obj)
obj_t *common_vformat(obj_t *stream, const wchar_t *fmt, va_list vl)
{
- int ch;
+ wchar_t ch;
for (; (ch = *fmt) != 0; fmt++) {
obj_t *obj;
@@ -105,6 +112,9 @@ obj_t *common_vformat(obj_t *stream, const wchar_t *fmt, va_list vl)
struct stdio_handle {
FILE *f;
+#ifdef BROKEN_POPEN_GETWC
+ FILE *f_orig_pipe;
+#endif
obj_t *descr;
};
@@ -152,13 +162,14 @@ static obj_t *stdio_maybe_write_error(obj_t *stream)
static obj_t *stdio_put_string(obj_t *stream, const wchar_t *s)
{
struct stdio_handle *h = (struct stdio_handle *) stream->co.handle;
- return (h->f && fputws(s, h->f) != EOF) ? t : stdio_maybe_write_error(stream);
+ return (h->f && fputws(s, h->f) != -1) ? t : stdio_maybe_write_error(stream);
}
-static obj_t *stdio_put_char(obj_t *stream, int ch)
+static obj_t *stdio_put_char(obj_t *stream, wchar_t ch)
{
struct stdio_handle *h = (struct stdio_handle *) stream->co.handle;
- return (h->f && putc(ch, h->f) != EOF) ? t : stdio_maybe_write_error(stream);
+ return (h->f && putwc(ch, h->f) != WEOF)
+ ? t : stdio_maybe_write_error(stream);
}
static wchar_t *snarf_line(FILE *in)
@@ -169,9 +180,9 @@ static wchar_t *snarf_line(FILE *in)
wchar_t *buf = 0;
for (;;) {
- int ch = getc(in);
+ wint_t ch = getwc(in);
- if (ch == EOF && buf == 0)
+ if (ch == WEOF && buf == 0)
break;
if (fill >= size) {
@@ -180,7 +191,7 @@ static wchar_t *snarf_line(FILE *in)
size = newsize;
}
- if (ch == '\n' || ch == EOF) {
+ if (ch == '\n' || ch == WEOF) {
buf[fill++] = 0;
break;
}
@@ -210,8 +221,8 @@ obj_t *stdio_get_char(obj_t *stream)
{
struct stdio_handle *h = (struct stdio_handle *) stream->co.handle;
if (h->f) {
- int ch = getc(h->f);
- return (ch != EOF) ? chr(ch) : stdio_maybe_read_error(stream);
+ wint_t ch = getwc(h->f);
+ return (ch != WEOF) ? chr(ch) : stdio_maybe_read_error(stream);
}
return nil;
}
@@ -262,9 +273,13 @@ static obj_t *pipe_close(obj_t *stream, obj_t *throw_on_error)
struct stdio_handle *h = (struct stdio_handle *) stream->co.handle;
if (h->f != 0) {
+#ifdef BROKEN_POPEN_GETWC
+ int status = (fclose(h->f), pclose(h->f_orig_pipe));
+ h->f = h->f_orig_pipe = 0;
+#else
int status = pclose(h->f);
-
h->f = 0;
+#endif
if (status != 0 && throw_on_error) {
if (status < 0) {
@@ -403,7 +418,7 @@ static obj_t *string_out_put_string(obj_t *stream, const wchar_t *s)
}
}
-static obj_t *string_out_put_char(obj_t *stream, int ch)
+static obj_t *string_out_put_char(obj_t *stream, wchar_t ch)
{
wchar_t mini[2];
mini[0] = ch;
@@ -539,8 +554,27 @@ obj_t *make_stdio_stream(FILE *f, obj_t *descr, obj_t *input, obj_t *output)
obj_t *make_pipe_stream(FILE *f, obj_t *descr, obj_t *input, obj_t *output)
{
struct stdio_handle *h = (struct stdio_handle *) chk_malloc(sizeof *h);
+#ifdef BROKEN_POPEN_GETWC
+ int dup_fd = dup(fileno(f));
+ FILE *dup_f = (dup_fd != -1) ? fdopen(dup_fd, output ? "w" : "r") : 0;
+
+ if (dup_fd == -1 || dup_f == 0) {
+ int error = errno;
+ if (dup_f != 0)
+ fclose(dup_f);
+ else if (dup_fd != -1)
+ close(dup_fd);
+ free(h);
+ uw_throwf(process_error, L"unable to create pipe ~a: ~a/~s", descr,
+ num(error), string_utf8(strerror(error)), nao);
+ }
+
+ h->f_orig_pipe = f;
+ h->f = dup_f;
+#else
h->f = f;
h->descr = descr;
+#endif
return cobj((void *) h, stream_t, &pipe_ops.cobj_ops);
}
@@ -712,7 +746,7 @@ obj_t *put_char(obj_t *stream, obj_t *ch)
}
}
-obj_t *put_cchar(obj_t *stream, int ch)
+obj_t *put_cchar(obj_t *stream, wchar_t ch)
{
type_check (stream, COBJ);
type_assert (stream->co.cls == stream_t, (L"~a is not a stream", stream));
diff --git a/stream.h b/stream.h
index 78f83f93..13b428c0 100644
--- a/stream.h
+++ b/stream.h
@@ -43,6 +43,6 @@ obj_t *put_string(obj_t *stream, obj_t *string);
obj_t *put_line(obj_t *stream, obj_t *string);
obj_t *put_cstring(obj_t *stream, const wchar_t *);
obj_t *put_char(obj_t *stream, obj_t *ch);
-obj_t *put_cchar(obj_t *stream, int ch);
+obj_t *put_cchar(obj_t *stream, wchar_t ch);
void stream_init(void);
diff --git a/txr.1 b/txr.1
index 19ffeb30..e62b30e1 100644
--- a/txr.1
+++ b/txr.1
@@ -396,6 +396,28 @@ does not split the line into two; it's embedded into the line and
thus cannot match anything. However, @\en may be useful in the @(cat)
directive and in @(output).
+.SS International Characters
+
+.B txr
+represents text internally using wide characters, which are used to represent
+Unicode code points. The query language, as well as all data sources, are
+assumed to be in the UTF-8 encoding. In the query language, extended
+characters can be used directly in comments, literal text, string literals,
+quasiliterals and regular expressions. Extended characters can also be
+expressed indirectly using hexadecimal or octal escapes.
+On some platforms, wide characters may be restricted to 16 bits, so that
+.B txr
+can only work with characters in the BMP (Basic Multilingual Plane)
+subset of Unicode.
+
+If
+.B txr
+encounters an invalid bytes in the UTF-8 input, what happens depends on the
+context in which this occurs. Invalid bytes in a query are reported as errors.
+Invalid bytes in data are currently treated in an unspecified way. In
+the future, invalid bytes in data will be mapped to the Unicode codes
+U+DC00 through U+DCFF.
+
.SS Variables
Much of the query syntax consists of arbitrary text, which matches file data