diff options
author | Kaz Kylheku <kaz@kylheku.com> | 2019-12-14 10:19:54 -0800 |
---|---|---|
committer | Kaz Kylheku <kaz@kylheku.com> | 2019-12-14 10:19:54 -0800 |
commit | d2a2336acab95d0f57cd4edbcbc4f47025bc999a (patch) | |
tree | 43def6b62ac1c338b2d9289ad93fa998c78d63b8 | |
parent | 399a2ca0791bbb59a6b008543f52ae4b0b06043f (diff) | |
download | txr-d2a2336acab95d0f57cd4edbcbc4f47025bc999a.tar.gz txr-d2a2336acab95d0f57cd4edbcbc4f47025bc999a.tar.bz2 txr-d2a2336acab95d0f57cd4edbcbc4f47025bc999a.zip |
ffi: new type zchar
The zchar type, when used as an array element, specifies an
optionally null-terminated or padded field, which is subject
to UTF-8 conversion.
* ffi.c (zchar_s): New symbol variable.
(enum char_conv): New member, conv_zchar.
(ffi_zchar_array_get): New static function.
(ffi_array_in, ffi_array_get_common): Handle conv_zchar via
ffi_zchar_array_get.
(ffi_array_put): Handle conv_char together with conv_zchar.
(ffi_type_compile): Handle zchar array element type, mapping
to conv_zchar.
(ffi_init_types): Register zchar type.
(ffi_init): Initialize zchar_s symbol variable.
* txr.1: Documented.
-rw-r--r-- | ffi.c | 42 | ||||
-rw-r--r-- | txr.1 | 105 |
2 files changed, 126 insertions, 21 deletions
@@ -101,7 +101,7 @@ val uint16_s, int16_s; val uint32_s, int32_s; val uint64_s, int64_s; -val char_s, uchar_s, bchar_s, wchar_s; +val char_s, zchar_s, uchar_s, bchar_s, wchar_s; val short_s, ushort_s; val int_s, uint_s; val long_s, ulong_s; @@ -172,6 +172,7 @@ struct smemb { enum char_conv { conv_none, conv_char, + conv_zchar, conv_wchar, conv_bchar }; @@ -2228,6 +2229,25 @@ static void ffi_char_array_put(struct txr_ffi_type *tft, val str, mem_t *dst, dst[nelem - 1] = 0; } +static val ffi_zchar_array_get(struct txr_ffi_type *tft, mem_t *src, + cnum nelem) +{ + if (nelem == 0) { + return null_string; + } else { + const char *chptr = coerce(const char *, src); + if (tft->null_term) { + return string_utf8(chptr); + } else if (memchr(chptr, 0, nelem)) { + return string_utf8(chptr); + } else { + wchar_t *wch = utf8_dup_from_buf(chptr, nelem); + return string_own(wch); + } + } +} + + static val ffi_wchar_array_get(struct txr_ffi_type *tft, mem_t *src, cnum nelem) { @@ -2330,6 +2350,11 @@ static val ffi_array_in(struct txr_ffi_type *tft, int copy, mem_t *src, val str = ffi_char_array_get(tft, src, tft->nelem); return if3(vec, replace(vec, str, zero, t), str); } + case conv_zchar: + { + val str = ffi_zchar_array_get(tft, src, tft->nelem); + return if3(vec, replace(vec, str, zero, t), str); + } case conv_wchar: { val str = ffi_wchar_array_get(tft, src, tft->nelem); @@ -2398,6 +2423,7 @@ static void ffi_array_put(struct txr_ffi_type *tft, val vec, mem_t *dst, if (tft->ch_conv != conv_none && stringp(vec)) { switch (tft->ch_conv) { case conv_char: + case conv_zchar: ffi_char_array_put(tft, vec, dst, tft->nelem); break; case conv_wchar: @@ -2465,6 +2491,8 @@ static val ffi_array_get_common(struct txr_ffi_type *tft, mem_t *src, val self, switch (tft->ch_conv) { case conv_char: return ffi_char_array_get(tft, src, nelem); + case conv_zchar: + return ffi_zchar_array_get(tft, src, nelem); case conv_wchar: return ffi_wchar_array_get(tft, src, nelem); case conv_bchar: @@ -3420,6 
+3448,8 @@ val ffi_type_compile(val syntax) } if (etft->syntax == char_s) tft->ch_conv = conv_char; + else if (etft->syntax == zchar_s) + tft->ch_conv = conv_zchar; else if (etft->syntax == wchar_s) tft->ch_conv = conv_wchar; else if (etft->syntax == bchar_s) @@ -3462,6 +3492,8 @@ val ffi_type_compile(val syntax) if (etft->syntax == char_s) tft->ch_conv = conv_char; + else if (etft->syntax == zchar_s) + tft->ch_conv = conv_zchar; else if (etft->syntax == wchar_s) tft->ch_conv = conv_wchar; else if (etft->syntax == bchar_s) @@ -3858,6 +3890,13 @@ static void ffi_init_types(void) ffi_char_get, ifbe(ffi_char_rput), ifbe(ffi_char_rget))); + ffi_typedef(zchar_s, make_ffi_type_builtin(zchar_s, integer_s, + FFI_KIND_NUM, + 1, 1, + ffi_char, ffi_char_put, + ffi_char_get, + ifbe(ffi_char_rput), + ifbe(ffi_char_rget))); ffi_typedef(bchar_s, make_ffi_type_builtin(bchar_s, char_s, FFI_KIND_NUM, 1, 1, @@ -5821,6 +5860,7 @@ void ffi_init(void) uint64_s = intern(lit("uint64"), user_package); int64_s = intern(lit("int64"), user_package); char_s = intern(lit("char"), user_package); + zchar_s = intern(lit("zchar"), user_package); uchar_s = intern(lit("uchar"), user_package); bchar_s = intern(lit("bchar"), user_package); wchar_s = intern(lit("wchar"), user_package); @@ -63891,35 +63891,61 @@ basic type, which corresponds to the C type .SS* Simple FFI Types -.coNP FFI types @, char @ uchar and @ bchar -These first of these two types correspond to the C character types +.coNP FFI types @, char @, zchar @ uchar and @ bchar +These first two of these types, .code char and -.codn "unsigned char" , -respectively. The +.code zchar +correspond to the C character type +.codn char . +The +.code uchar +and .code bchar -type (byte char) -also corresponds to +types correspond to .codn "unsigned char" . Both Lisp integers and character values convert to these representation, if they are in their numeric range. Out-of-range values produce an exception. 
A foreign -.code char +.codn char , +.codn zchar , and .code bchar value converts to a Lisp character, whereas a .code uchar -value converts to an integer. Moreover, +value converts to an integer. + +If these types are used for representing individual scalar values, +there is no difference among +.codn char , +.code zchar +and +.codn bchar . + +What is different among these three types is that the .code array and .code zarray -type constructors treat -.code char -and +type constructors treat them specially. Arrays of these types are +subject to conversion to and from Lisp strings. The variation among +these types expresses different conversion semantics. That is to say, +an array of +.code bchar +converts between the foreign and native Lisp representation differently +from an array of +.codn zchar , +which in turn converts differently from an array of +.codn char . + +Note: it is recommended to avoid using the types .code bchar -specially, but apply no special treatment to -.codn uchar . +and +.code zchar +other than for expressing the element type of an +.code array +or +.codn zarray . .coNP FFI types @, short @, ushort @, int @, uint @, long @ and @ ulong These types correspond to the C integer types @@ -64799,12 +64825,13 @@ In addition, several types are treated specially: when .meta type is one of .codn char , +.codn zchar , .code bchar or .codn wchar , the array type establishes a special correspondence with Lisp strings. When the C array is decoded, a Lisp string is created or updated in place -to reflect the new contents. +to reflect the new contents. This is described in detail below. The second form, whose syntax omits the .meta dim @@ -64820,6 +64847,30 @@ Since the type has unknown length, it has a trivial get operation which returns It is useful for passing a variable amount of data into a foreign function by pointer. +An array of +.code char +represents non-null-terminated UTF-8 character data, which converts to +and from a Lisp string. 
Any null bytes in the data correspond to +the pseudo-null character +.code #\exDC00 +also notated as +.codn #\epnul . + +An array of +.code zchar +represents a field of optionally null-terminated UTF-8 character data. +If a null byte occurs in the data then the text terminates before that +null byte, otherwise the data comprises the entire foreign array. +Thus, null bytes do not occur in the data. A null byte in the array will +not generate a pseudo-null character in the Lisp string. + +An array of +.code bchar +values represents 8-bit character data that isn't UTF-8 encoded, +and is not null terminated. Each byte holds a character whose code is +in the range 0 to 255. If a null byte occurs in the data, is interpreted +as a string terminator. + .coNP FFI type @ zarray .synb .mets (zarray < dim << type ) @@ -64861,27 +64912,41 @@ has five elements, then the fifth one will be decoded from the C array in earnest; it is not expected to be null. However, when that Lisp representation is converted back to C, that extra element will be ignored and output as a zero bytes. + Lastly, the .code zarray further extends the special treatment which the .code array type applies to the types +.codn zchar , .codn char , .code wchar and .codn bchar . -Namely, +The .code zarray -assumes, and depends on the incoming data being null-terminated, and converts it to a Lisp -string accordingly. The regular +type assumes, and depends on the incoming data being null-terminated, and +converts it to a Lisp string accordingly. The regular .code array -type doesn't assume null termination. In particular, this means that an +type doesn't assume null termination. In particular, this means that whereas .code "(array 42 char)" -will decode 42 bytes of UTF-8, even if some of them are null. The null bytes -convert to U+DC00. 
In contrast, a +will decode 42 bytes of UTF-8, even if some of them are null, converting +those null bytes to the U+DC00 pseudo-null, in contrast, a .code zarray will treat the 42 bytes as a null-terminated string, and decode UTF-8 only up to the first null. +In the other direction, when converting from Lisp string to the foreign array, +.code zarray +ensures null termination. + +Note that the type combination +.code zarray +of +.code zchar +behaves in a manner indistinguishable from a +.code zarray +of +.codn char . The one-argument variant of the .code zarray |