summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 15
-rw-r--r-- txr.1 9
-rw-r--r-- utf8.c 3
-rw-r--r-- utf8.h 3
4 files changed, 29 insertions, 1 deletion
diff --git a/ChangeLog b/ChangeLog
index 00fff529..92e3d13e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,18 @@
+2014-02-15 Kaz Kylheku <kaz@kylheku.com>
+
+ A trivial change in the UTF-8 decoder allows TXR to handle null bytes
+ in text.
+
+ * utf8.h (UTF8_ADMIT_NUL): New preprocessor symbol.
+ (utf8_decoder): New member, flags.
+
+ * utf8.c (utf8_decoder_init): Initialize flags to 0.
+ (utf8_decode): If a null byte is encountered in the input,
+ then convert it to 0xDC00, rather than keeping it as zero,
+ unless flags contains UTF8_ADMIT_NUL.
+
+ * txr.1: Document handling of null bytes.
+
2014-02-14 Kaz Kylheku <kaz@kylheku.com>
* hash.c (hash_update): Avoid double cdr.
diff --git a/txr.1 b/txr.1
index dc692dd2..d69b8645 100644
--- a/txr.1
+++ b/txr.1
@@ -478,7 +478,7 @@ does not split the line into two; it's embedded into the line and
thus cannot match anything. However, @\en may be useful in the @(cat)
directive and in @(output).
-.SS International Characters
+.SS Character Handling and International Characters
.B TXR
represents text internally using wide characters, which are used to represent
@@ -519,6 +519,13 @@ mapping it to the Unicode character range U+DC00 through U+DCFF. The decoding
resumes afresh at the following byte, expecting that byte to be the start
of a UTF-8 code.
+Furthermore, because TXR internally uses a null-terminated character
+representation of strings which easily interoperates with C language
+interfaces, when a null character is read from a stream, TXR converts it to
+the code U+DC00. On output, this code converts back to a null byte,
+as explained in the previous paragraph. By means of this representational
+trick, TXR can handle textual data containing null bytes.
+
.SS Regular Expression Directives
In place of a piece of text (see section Text above), a regular expression
diff --git a/utf8.c b/utf8.c
index 26e5795d..e3ef3e7a 100644
--- a/utf8.c
+++ b/utf8.c
@@ -260,6 +260,7 @@ int utf8_encode(wchar_t wch, int (*put)(int ch, mem_t *ctx), mem_t *ctx)
void utf8_decoder_init(utf8_decoder_t *ud)
{
ud->state = utf8_init;
+ ud->flags = 0;
ud->wch = 0;
ud->head = ud->tail = ud->back = 0;
}
@@ -295,6 +296,8 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx)
case 0x0: case 0x1: case 0x2: case 0x3:
case 0x4: case 0x5: case 0x6: case 0x7:
ud->back = ud->tail;
+ if (ch == 0 && (ud->flags & UTF8_ADMIT_NUL) == 0)
+ return 0xDC00;
return ch;
case 0xC: case 0xD:
ud->state = utf8_more1;
diff --git a/utf8.h b/utf8.h
index c4915488..67dee69a 100644
--- a/utf8.h
+++ b/utf8.h
@@ -35,8 +35,11 @@ unsigned char *utf8_dup_to_uc(const wchar_t *);
enum utf8_state { utf8_init, utf8_more1, utf8_more2, utf8_more3 };
+#define UTF8_ADMIT_NUL 1
+
typedef struct utf8_decoder {
enum utf8_state state;
+ int flags;
wchar_t wch, wch_min;
int head, tail, back;
int buf[8];