ffi: new type zchar

The zchar type, when used as an array element, specifies an optionally null-terminated or padded field, which is subject to UTF-8 conversion. * ffi.c (zchar_s): New symbol variable. (enum char_conv): New member, conv_zchar. (ffi_zchar_array_get): New static function. (ffi_array_in, ffi_array_get_common): Handle conv_zchar via ffi_zchar_array_get. (ffi_array_put): Handle conv_char together with conv_zchar. (ffi_type_compile): Handle zchar array element type, mapping to conv_zchar. (ffi_init_types): Register zchar type. (ffi_init): Initialize zchar_s symbol variable. * txr.1: Documented.
author: Kaz Kylheku <kaz@kylheku.com> 2019-12-14 10:19:54 -0800
committer: Kaz Kylheku <kaz@kylheku.com> 2019-12-14 10:19:54 -0800
commit: d2a2336acab95d0f57cd4edbcbc4f47025bc999a (patch)
tree: 43def6b62ac1c338b2d9289ad93fa998c78d63b8
parent: 399a2ca0791bbb59a6b008543f52ae4b0b06043f (diff)
download: txr-d2a2336acab95d0f57cd4edbcbc4f47025bc999a.tar.gz
txr-d2a2336acab95d0f57cd4edbcbc4f47025bc999a.tar.bz2
txr-d2a2336acab95d0f57cd4edbcbc4f47025bc999a.zip
2 files changed, 126 insertions, 21 deletions
diff --git a/ffi.c b/ffi.c
index ca678a1a..21e9024d 100644
--- a/ffi.c
+++ b/ffi.c
@@ -101,7 +101,7 @@ val uint16_s, int16_s;
 val uint32_s, int32_s;
 val uint64_s, int64_s;
 
-val char_s, uchar_s, bchar_s, wchar_s;
+val char_s, zchar_s, uchar_s, bchar_s, wchar_s;
 val short_s, ushort_s;
 val int_s, uint_s;
 val long_s, ulong_s;
@@ -172,6 +172,7 @@ struct smemb {
 enum char_conv {
   conv_none,
   conv_char,
+  conv_zchar,
   conv_wchar,
   conv_bchar
 };
@@ -2228,6 +2229,25 @@ static void ffi_char_array_put(struct txr_ffi_type *tft, val str, mem_t *dst,
     dst[nelem - 1] = 0;
 }
 
+static val ffi_zchar_array_get(struct txr_ffi_type *tft, mem_t *src, /* decode a zchar array at src into a Lisp string */
+                               cnum nelem) /* nelem: number of bytes in the foreign array */
+{
+  if (nelem == 0) { /* zero-length array: nothing to decode */
+    return null_string;
+  } else {
+    const char *chptr = coerce(const char *, src);
+    if (tft->null_term) { /* type is flagged null_term: no scan for a null byte is done here (presumably the data is guaranteed terminated; confirm) */
+      return string_utf8(chptr);
+    } else if (memchr(chptr, 0, nelem)) { /* a null byte lies within the nelem bytes, so src can be treated as a C string */
+      return string_utf8(chptr);
+    } else { /* no null byte anywhere: convert exactly nelem bytes */
+      wchar_t *wch = utf8_dup_from_buf(chptr, nelem);
+      return string_own(wch); /* NOTE(review): presumably the string takes ownership of wch; confirm against string_own */
+    }
+  }
+}
+
+
 static val ffi_wchar_array_get(struct txr_ffi_type *tft, mem_t *src,
                                cnum nelem)
 {
@@ -2330,6 +2350,11 @@ static val ffi_array_in(struct txr_ffi_type *tft, int copy, mem_t *src,
         val str = ffi_char_array_get(tft, src, tft->nelem);
         return if3(vec, replace(vec, str, zero, t), str);
       }
+    case conv_zchar:
+      {
+        val str = ffi_zchar_array_get(tft, src, tft->nelem);
+        return if3(vec, replace(vec, str, zero, t), str);
+      }
     case conv_wchar:
       {
         val str = ffi_wchar_array_get(tft, src, tft->nelem);
@@ -2398,6 +2423,7 @@ static void ffi_array_put(struct txr_ffi_type *tft, val vec, mem_t *dst,
   if (tft->ch_conv != conv_none && stringp(vec)) {
     switch (tft->ch_conv) {
     case conv_char:
+    case conv_zchar:
       ffi_char_array_put(tft, vec, dst, tft->nelem);
       break;
     case conv_wchar:
@@ -2465,6 +2491,8 @@ static val ffi_array_get_common(struct txr_ffi_type *tft, mem_t *src, val self,
   switch (tft->ch_conv) {
   case conv_char:
     return ffi_char_array_get(tft, src, nelem);
+  case conv_zchar:
+    return ffi_zchar_array_get(tft, src, nelem);
   case conv_wchar:
     return ffi_wchar_array_get(tft, src, nelem);
   case conv_bchar:
@@ -3420,6 +3448,8 @@ val ffi_type_compile(val syntax)
         }
         if (etft->syntax == char_s)
           tft->ch_conv = conv_char;
+        else if (etft->syntax == zchar_s)
+          tft->ch_conv = conv_zchar;
         else if (etft->syntax == wchar_s)
           tft->ch_conv = conv_wchar;
         else if (etft->syntax == bchar_s)
@@ -3462,6 +3492,8 @@ val ffi_type_compile(val syntax)
 
           if (etft->syntax == char_s)
             tft->ch_conv = conv_char;
+          else if (etft->syntax == zchar_s)
+            tft->ch_conv = conv_zchar;
           else if (etft->syntax == wchar_s)
             tft->ch_conv = conv_wchar;
           else if (etft->syntax == bchar_s)
@@ -3858,6 +3890,13 @@ static void ffi_init_types(void)
                                             ffi_char_get,
                                             ifbe(ffi_char_rput),
                                             ifbe(ffi_char_rget)));
+  ffi_typedef(zchar_s, make_ffi_type_builtin(zchar_s, integer_s,
+                                             FFI_KIND_NUM,
+                                             1, 1,
+                                             ffi_char, ffi_char_put,
+                                             ffi_char_get,
+                                             ifbe(ffi_char_rput),
+                                             ifbe(ffi_char_rget)));
   ffi_typedef(bchar_s, make_ffi_type_builtin(bchar_s, char_s,
                                              FFI_KIND_NUM,
                                              1, 1,
@@ -5821,6 +5860,7 @@ void ffi_init(void)
   uint64_s = intern(lit("uint64"), user_package);
   int64_s = intern(lit("int64"), user_package);
   char_s = intern(lit("char"), user_package);
+  zchar_s = intern(lit("zchar"), user_package);
   uchar_s = intern(lit("uchar"), user_package);
   bchar_s = intern(lit("bchar"), user_package);
   wchar_s = intern(lit("wchar"), user_package);
diff --git a/txr.1 b/txr.1
index b3416835..f37216c4 100644
--- a/txr.1
+++ b/txr.1
@@ -63891,35 +63891,61 @@ basic type, which corresponds to the C type
 
 .SS* Simple FFI Types
 
-.coNP FFI types @, char @ uchar and @ bchar
-These first of these two types correspond to the C character types
+.coNP FFI types @, char @, zchar @ uchar and @ bchar
+The first two of these types,
 .code char
 and
-.codn "unsigned char" ,
-respectively. The
+.code zchar
+correspond to the C character type
+.codn char .
+The
+.code uchar
+and
 .code bchar
-type (byte char)
-also corresponds to
+types correspond to
 .codn "unsigned char" .
 Both Lisp integers and character values
 convert to these representations, if they are in their numeric range.
 Out-of-range values produce an exception.
 A foreign
-.code char
+.codn char ,
+.codn zchar ,
 and
 .code bchar
 value converts to a Lisp character, whereas a
 .code uchar
-value converts to an integer. Moreover,
+value converts to an integer.
+
+If these types are used for representing individual scalar values,
+there is no difference among
+.codn char ,
+.code zchar
+and
+.codn bchar .
+
+What is different among these three types is that the
 .code array
 and
 .code zarray
-type constructors treat
-.code char
-and
+type constructors treat them specially. Arrays of these types are
+subject to conversion to and from Lisp strings. The variation among
+these types expresses different conversion semantics. That is to say,
+an array of
+.code bchar
+converts between the foreign and native Lisp representation differently
+from an array of
+.codn zchar ,
+which in turn converts differently from an array of
+.codn char .
+
+Note: it is recommended to avoid using the types
 .code bchar
-specially, but apply no special treatment to
-.codn uchar .
+and
+.code zchar
+other than for expressing the element type of an
+.code array
+or
+.codn zarray .
 
 .coNP FFI types @, short @, ushort @, int @, uint @, long @ and @ ulong
 These types correspond to the C integer types
@@ -64799,12 +64825,13 @@ In addition, several types are treated specially: when
 .meta type
 is one of
 .codn char ,
+.codn zchar ,
 .code bchar
 or
 .codn wchar ,
 the array type establishes a special correspondence with Lisp strings.
 When the C array is decoded, a Lisp string is created or updated in place
-to reflect the new contents.
+to reflect the new contents. This is described in detail below.
 
 The second form, whose syntax omits the
 .meta dim
@@ -64820,6 +64847,30 @@ Since the type has unknown length, it has a trivial get operation which returns
 It is useful for passing a variable amount of data into a foreign
 function by pointer.
 
+An array of
+.code char
+represents non-null-terminated UTF-8 character data, which converts to
+and from a Lisp string. Any null bytes in the data correspond to
+the pseudo-null character
+.code #\exDC00
+also notated as
+.codn #\epnul .
+
+An array of
+.code zchar
+represents a field of optionally null-terminated UTF-8 character data.
+If a null byte occurs in the data then the text terminates before that
+null byte, otherwise the data comprises the entire foreign array.
+Thus, null bytes do not occur in the data. A null byte in the array will
+not generate a pseudo-null character in the Lisp string.
+
+An array of
+.code bchar
+values represents 8-bit character data that isn't UTF-8 encoded,
+and is not null terminated. Each byte holds a character whose code is
+in the range 0 to 255. If a null byte occurs in the data, it is interpreted
+as a string terminator.
+
 .coNP FFI type @ zarray
 .synb
 .mets (zarray < dim << type )
@@ -64861,27 +64912,41 @@ has five elements, then the fifth one will be decoded from the C array
 in earnest; it is not expected to be null. However, when that Lisp
 representation is converted back to C, that extra element will be ignored and
 output as a zero byte.
+
 Lastly, the
 .code zarray
 further extends the special treatment which the
 .code array
 type applies to the types
+.codn zchar ,
 .codn char ,
 .code wchar
 and
 .codn bchar .
-Namely,
+The
 .code zarray
-assumes, and depends on the incoming data being null-terminated, and converts it to a Lisp
-string accordingly. The regular
+type assumes, and depends on the incoming data being null-terminated, and
+converts it to a Lisp string accordingly. The regular
 .code array
-type doesn't assume null termination. In particular, this means that an
+type doesn't assume null termination. In particular, this means that whereas
 .code "(array 42 char)"
-will decode 42 bytes of UTF-8, even if some of them are null. The null bytes
-convert to U+DC00. In contrast, a
+will decode 42 bytes of UTF-8, even if some of them are null, converting
+those null bytes to the U+DC00 pseudo-null, in contrast, a
 .code zarray
 will treat the 42 bytes as a null-terminated string, and decode UTF-8 only
 up to the first null.
+In the other direction, when converting from Lisp string to the foreign array,
+.code zarray
+ensures null termination.
+
+Note that the type combination
+.code zarray
+of
+.code zchar
+behaves in a manner indistinguishable from a
+.code zarray
+of
+.codn char .
 
 The one-argument variant of the
 .code zarray
author	Kaz Kylheku <kaz@kylheku.com>	2019-12-14 10:19:54 -0800
committer	Kaz Kylheku <kaz@kylheku.com>	2019-12-14 10:19:54 -0800
commit	d2a2336acab95d0f57cd4edbcbc4f47025bc999a (patch)
tree	43def6b62ac1c338b2d9289ad93fa998c78d63b8
parent	399a2ca0791bbb59a6b008543f52ae4b0b06043f (diff)
download	txr-d2a2336acab95d0f57cd4edbcbc4f47025bc999a.tar.gz txr-d2a2336acab95d0f57cd4edbcbc4f47025bc999a.tar.bz2 txr-d2a2336acab95d0f57cd4edbcbc4f47025bc999a.zip