summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKaz Kylheku <kaz@kylheku.com>2019-12-14 10:19:54 -0800
committerKaz Kylheku <kaz@kylheku.com>2019-12-14 10:19:54 -0800
commitd2a2336acab95d0f57cd4edbcbc4f47025bc999a (patch)
tree43def6b62ac1c338b2d9289ad93fa998c78d63b8
parent399a2ca0791bbb59a6b008543f52ae4b0b06043f (diff)
downloadtxr-d2a2336acab95d0f57cd4edbcbc4f47025bc999a.tar.gz
txr-d2a2336acab95d0f57cd4edbcbc4f47025bc999a.tar.bz2
txr-d2a2336acab95d0f57cd4edbcbc4f47025bc999a.zip
ffi: new type zchar
The zchar type, when used as an array element, specifies an optionally null-terminated or padded field, which is subject to UTF-8 conversion. * ffi.c (zchar_s): New symbol variable. (enum char_conv): New member, conv_zchar. (ffi_zchar_array_get): New static function. (ffi_array_in, ffi_array_get_common): Handle conv_zchar via ffi_zchar_array_get. (ffi_array_put): Handle conv_char together with conv_zchar. (ffi_type_compile): Handle zchar array element type, mapping to conv_zchar. (ffi_init_types): Register zchar type. (ffi_init): Initialize zchar_s symbol variable. * txr.1: Documented.
-rw-r--r--ffi.c42
-rw-r--r--txr.1105
2 files changed, 126 insertions, 21 deletions
diff --git a/ffi.c b/ffi.c
index ca678a1a..21e9024d 100644
--- a/ffi.c
+++ b/ffi.c
@@ -101,7 +101,7 @@ val uint16_s, int16_s;
val uint32_s, int32_s;
val uint64_s, int64_s;
-val char_s, uchar_s, bchar_s, wchar_s;
+val char_s, zchar_s, uchar_s, bchar_s, wchar_s;
val short_s, ushort_s;
val int_s, uint_s;
val long_s, ulong_s;
@@ -172,6 +172,7 @@ struct smemb {
enum char_conv { /* string conversion mode of an array type, chosen from its element type */
conv_none, /* no string conversion applies */
conv_char, /* element type is char */
+ conv_zchar, /* element type is zchar */
conv_wchar, /* element type is wchar */
conv_bchar /* element type is bchar */
};
@@ -2228,6 +2229,25 @@ static void ffi_char_array_put(struct txr_ffi_type *tft, val str, mem_t *dst,
dst[nelem - 1] = 0;
}
+static val ffi_zchar_array_get(struct txr_ffi_type *tft, mem_t *src,
+ cnum nelem)
+{ /* Converts the nelem-byte foreign zchar array at src into a Lisp string. */
+ if (nelem == 0) { /* zero-length array: nothing to decode */
+ return null_string;
+ } else {
+ const char *chptr = coerce(const char *, src); /* view the raw bytes as a C string */
+ if (tft->null_term) { /* type is flagged null-terminated (presumably a zarray -- confirm): trust the terminator */
+ return string_utf8(chptr);
+ } else if (memchr(chptr, 0, nelem)) { /* a null byte lies within the first nelem bytes: text ends before it */
+ return string_utf8(chptr);
+ } else { /* no null byte: the text occupies the entire array */
+ wchar_t *wch = utf8_dup_from_buf(chptr, nelem); /* conversion bounded by nelem, not by a terminator */
+ return string_own(wch); /* presumably the new string takes ownership of wch -- confirm */
+ }
+ }
+}
+
+
static val ffi_wchar_array_get(struct txr_ffi_type *tft, mem_t *src,
cnum nelem)
{
@@ -2330,6 +2350,11 @@ static val ffi_array_in(struct txr_ffi_type *tft, int copy, mem_t *src,
val str = ffi_char_array_get(tft, src, tft->nelem);
return if3(vec, replace(vec, str, zero, t), str);
}
+ case conv_zchar:
+ {
+ val str = ffi_zchar_array_get(tft, src, tft->nelem);
+ return if3(vec, replace(vec, str, zero, t), str);
+ }
case conv_wchar:
{
val str = ffi_wchar_array_get(tft, src, tft->nelem);
@@ -2398,6 +2423,7 @@ static void ffi_array_put(struct txr_ffi_type *tft, val vec, mem_t *dst,
if (tft->ch_conv != conv_none && stringp(vec)) {
switch (tft->ch_conv) {
case conv_char:
+ case conv_zchar:
ffi_char_array_put(tft, vec, dst, tft->nelem);
break;
case conv_wchar:
@@ -2465,6 +2491,8 @@ static val ffi_array_get_common(struct txr_ffi_type *tft, mem_t *src, val self,
switch (tft->ch_conv) {
case conv_char:
return ffi_char_array_get(tft, src, nelem);
+ case conv_zchar:
+ return ffi_zchar_array_get(tft, src, nelem);
case conv_wchar:
return ffi_wchar_array_get(tft, src, nelem);
case conv_bchar:
@@ -3420,6 +3448,8 @@ val ffi_type_compile(val syntax)
}
if (etft->syntax == char_s)
tft->ch_conv = conv_char;
+ else if (etft->syntax == zchar_s)
+ tft->ch_conv = conv_zchar;
else if (etft->syntax == wchar_s)
tft->ch_conv = conv_wchar;
else if (etft->syntax == bchar_s)
@@ -3462,6 +3492,8 @@ val ffi_type_compile(val syntax)
if (etft->syntax == char_s)
tft->ch_conv = conv_char;
+ else if (etft->syntax == zchar_s)
+ tft->ch_conv = conv_zchar;
else if (etft->syntax == wchar_s)
tft->ch_conv = conv_wchar;
else if (etft->syntax == bchar_s)
@@ -3858,6 +3890,13 @@ static void ffi_init_types(void)
ffi_char_get,
ifbe(ffi_char_rput),
ifbe(ffi_char_rget)));
+ ffi_typedef(zchar_s, make_ffi_type_builtin(zchar_s, integer_s,
+ FFI_KIND_NUM,
+ 1, 1,
+ ffi_char, ffi_char_put,
+ ffi_char_get,
+ ifbe(ffi_char_rput),
+ ifbe(ffi_char_rget)));
ffi_typedef(bchar_s, make_ffi_type_builtin(bchar_s, char_s,
FFI_KIND_NUM,
1, 1,
@@ -5821,6 +5860,7 @@ void ffi_init(void)
uint64_s = intern(lit("uint64"), user_package);
int64_s = intern(lit("int64"), user_package);
char_s = intern(lit("char"), user_package);
+ zchar_s = intern(lit("zchar"), user_package);
uchar_s = intern(lit("uchar"), user_package);
bchar_s = intern(lit("bchar"), user_package);
wchar_s = intern(lit("wchar"), user_package);
diff --git a/txr.1 b/txr.1
index b3416835..f37216c4 100644
--- a/txr.1
+++ b/txr.1
@@ -63891,35 +63891,61 @@ basic type, which corresponds to the C type
.SS* Simple FFI Types
-.coNP FFI types @, char @ uchar and @ bchar
-These first of these two types correspond to the C character types
+.coNP FFI types @, char @, zchar @ uchar and @ bchar
+The first two of these types,
.code char
and
-.codn "unsigned char" ,
-respectively. The
+.code zchar
+correspond to the C character type
+.codn char .
+The
+.code uchar
+and
.code bchar
-type (byte char)
-also corresponds to
+types correspond to
.codn "unsigned char" .
Both Lisp integers and character values
convert to these representations, if they are in their numeric range.
Out-of-range values produce an exception.
A foreign
-.code char
+.codn char ,
+.codn zchar ,
and
.code bchar
value converts to a Lisp character, whereas a
.code uchar
-value converts to an integer. Moreover,
+value converts to an integer.
+
+If these types are used for representing individual scalar values,
+there is no difference among
+.codn char ,
+.code zchar
+and
+.codn bchar .
+
+What is different among these three types is that the
.code array
and
.code zarray
-type constructors treat
-.code char
-and
+type constructors treat them specially. Arrays of these types are
+subject to conversion to and from Lisp strings. The variation among
+these types expresses different conversion semantics. That is to say,
+an array of
+.code bchar
+converts between the foreign and native Lisp representation differently
+from an array of
+.codn zchar ,
+which in turn converts differently from an array of
+.codn char .
+
+Note: it is recommended to avoid using the types
.code bchar
-specially, but apply no special treatment to
-.codn uchar .
+and
+.code zchar
+other than for expressing the element type of an
+.code array
+or
+.codn zarray .
.coNP FFI types @, short @, ushort @, int @, uint @, long @ and @ ulong
These types correspond to the C integer types
@@ -64799,12 +64825,13 @@ In addition, several types are treated specially: when
.meta type
is one of
.codn char ,
+.codn zchar ,
.code bchar
or
.codn wchar ,
the array type establishes a special correspondence with Lisp strings.
When the C array is decoded, a Lisp string is created or updated in place
-to reflect the new contents.
+to reflect the new contents. This is described in detail below.
The second form, whose syntax omits the
.meta dim
@@ -64820,6 +64847,30 @@ Since the type has unknown length, it has a trivial get operation which returns
It is useful for passing a variable amount of data into a foreign
function by pointer.
+An array of
+.code char
+represents non-null-terminated UTF-8 character data, which converts to
+and from a Lisp string. Any null bytes in the data correspond to
+the pseudo-null character
+.code #\exDC00
+also notated as
+.codn #\epnul .
+
+An array of
+.code zchar
+represents a field of optionally null-terminated UTF-8 character data.
+If a null byte occurs in the data then the text terminates before that
+null byte, otherwise the data comprises the entire foreign array.
+Thus, null bytes do not occur in the data. A null byte in the array will
+not generate a pseudo-null character in the Lisp string.
+
+An array of
+.code bchar
+values represents 8-bit character data that isn't UTF-8 encoded,
+and is not null terminated. Each byte holds a character whose code is
+in the range 0 to 255. If a null byte occurs in the data, it is interpreted
+as a string terminator.
+
.coNP FFI type @ zarray
.synb
.mets (zarray < dim << type )
@@ -64861,27 +64912,41 @@ has five elements, then the fifth one will be decoded from the C array
in earnest; it is not expected to be null. However, when that Lisp
representation is converted back to C, that extra element will be ignored and
output as a zero byte.
+
Lastly, the
.code zarray
further extends the special treatment which the
.code array
type applies to the types
+.codn zchar ,
.codn char ,
.code wchar
and
.codn bchar .
-Namely,
+The
.code zarray
-assumes, and depends on the incoming data being null-terminated, and converts it to a Lisp
-string accordingly. The regular
+type assumes, and depends on the incoming data being null-terminated, and
+converts it to a Lisp string accordingly. The regular
.code array
-type doesn't assume null termination. In particular, this means that an
+type doesn't assume null termination. In particular, this means that whereas
.code "(array 42 char)"
-will decode 42 bytes of UTF-8, even if some of them are null. The null bytes
-convert to U+DC00. In contrast, a
+will decode 42 bytes of UTF-8, even if some of them are null, converting
+those null bytes to the U+DC00 pseudo-null, in contrast, a
.code zarray
will treat the 42 bytes as a null-terminated string, and decode UTF-8 only
up to the first null.
+In the other direction, when converting from Lisp string to the foreign array,
+.code zarray
+ensures null termination.
+
+Note that the type combination
+.code zarray
+of
+.code zchar
+behaves in a manner indistinguishable from a
+.code zarray
+of
+.codn char .
The one-argument variant of the
.code zarray