diff options
-rw-r--r-- | ChangeLog | 15 | ||||
-rw-r--r-- | txr.1 | 9 | ||||
-rw-r--r-- | utf8.c | 3 | ||||
-rw-r--r-- | utf8.h | 3 |
4 files changed, 29 insertions, 1 deletion
@@ -1,3 +1,18 @@ +2014-02-15 Kaz Kylheku <kaz@kylheku.com> + + A trivial change in the UTF-8 decoder allows TXR to handle null bytes + in text. + + * utf8.h (UTF8_ADMIT_NUL): New preprocessor symbol. + (utf8_decoder): New member, flags. + + * utf8.c (utf8_decoder_init): Initialize flags to 0. + (utf8_decode): If a null byte is encountered in the input, + then convert it to 0xDC00, rather than keeping it as zero, + unless flags contains UTF8_ADMIT_NUL. + + * txr.1: Document handling of null bytes. + 2014-02-14 Kaz Kylheku <kaz@kylheku.com> * hash.c (hash_update): Avoid double cdr. @@ -478,7 +478,7 @@ does not split the line into two; it's embedded into the line and thus cannot match anything. However, @\en may be useful in the @(cat) directive and in @(output). -.SS International Characters +.SS Character Handling and International Characters .B TXR represents text internally using wide characters, which are used to represent @@ -519,6 +519,13 @@ mapping it to the Unicode character range U+DC00 through U+DCFF. The decoding resumes afresh at the following byte, expecting that byte to be the start of a UTF-8 code. +Furthermore, because TXR internally uses a null-terminated character +representation of strings which easily interoperates with C language +interfaces, when a null character is read from a stream, TXR converts it to +the code U+DC00. On output, this code converts back to a null byte, +as explained in the previous paragraph. By means of this representational +trick, TXR can handle textual data containing null bytes. 
+ .SS Regular Expression Directives In place of a piece of text (see section Text above), a regular expression @@ -260,6 +260,7 @@ int utf8_encode(wchar_t wch, int (*put)(int ch, mem_t *ctx), mem_t *ctx) void utf8_decoder_init(utf8_decoder_t *ud) { ud->state = utf8_init; + ud->flags = 0; ud->wch = 0; ud->head = ud->tail = ud->back = 0; } @@ -295,6 +296,8 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx) case 0x0: case 0x1: case 0x2: case 0x3: case 0x4: case 0x5: case 0x6: case 0x7: ud->back = ud->tail; + if (ch == 0 && (ud->flags & UTF8_ADMIT_NUL) == 0) + return 0xDC00; return ch; case 0xC: case 0xD: ud->state = utf8_more1; @@ -35,8 +35,11 @@ unsigned char *utf8_dup_to_uc(const wchar_t *); enum utf8_state { utf8_init, utf8_more1, utf8_more2, utf8_more3 }; +#define UTF8_ADMIT_NUL 1 + typedef struct utf8_decoder { enum utf8_state state; + int flags; wchar_t wch, wch_min; int head, tail, back; int buf[8]; |