ffi: adjust semantics of zarray of characters.

We want to be able to extract null-terminated UTF-8 strings from arrays, without trailing junk, yet retain the ability to extract the entire array including embedded nulls. The natural way is to use the array/zarray distinction.

* ffi.c (ffi_array_in, ffi_array_get): Don't try to guess whether the array is null-terminated; just rely on the null_term flag, and treat the array accordingly.

* txr.1: Doc updated.
author: Kaz Kylheku <kaz@kylheku.com> 2017-05-10 06:41:50 -0700
committer: Kaz Kylheku <kaz@kylheku.com> 2017-05-10 06:41:50 -0700
commit: 471149c262613e4b69fc14b9599fe541106084e4 (patch)
tree: e784850dd3089f325d37482621aa5d3ae7c2c045
parent: fe81a856d6a1db346c519897581925d1797913ad (diff)
download: txr-471149c262613e4b69fc14b9599fe541106084e4.tar.gz
txr-471149c262613e4b69fc14b9599fe541106084e4.tar.bz2
txr-471149c262613e4b69fc14b9599fe541106084e4.zip
2 files changed, 34 insertions, 14 deletions
diff --git a/ffi.c b/ffi.c
index 5918933d..8c99f3d9 100644
--- a/ffi.c
+++ b/ffi.c
@@ -1008,7 +1008,7 @@ static val ffi_array_in(struct txr_ffi_type *tft, int copy, mem_t *src,
       str = null_string;
     } else {
       const char *chptr = coerce(const char *, src);
-      if (chptr[tft->size - 1] == 0) {
+      if (tft->null_term) {
         str = string_utf8(chptr);
       } else {
         wchar_t *wch = utf8_dup_from_buf(chptr, tft->size);
@@ -1022,13 +1022,12 @@ static val ffi_array_in(struct txr_ffi_type *tft, int copy, mem_t *src,
     if (nelem == 0) {
       str = null_string;
     } else {
-      cnum nchar = tft->size / sizeof (wchar_t);
       const wchar_t *wchptr = coerce(const wchar_t *, src);
 
-      if (wchptr[nchar - 1] == 0) {
+      if (tft->null_term) {
         str = string(wchptr);
       } else {
-        val ustr = mkustring(num_fast(nchar));
+        val ustr = mkustring(num_fast(nelem));
         str = init_str(ustr, wchptr);
       }
     }
@@ -1040,7 +1039,7 @@ static val ffi_array_in(struct txr_ffi_type *tft, int copy, mem_t *src,
       str = null_string;
     } else {
       const unsigned char *chptr = coerce(const unsigned char *, src);
-      if (chptr[tft->size - 1] == 0)
+      if (tft->null_term)
         str = string_8bit(chptr);
       else
         str = string_8bit_size(chptr, tft->size);
@@ -1129,7 +1128,7 @@ static val ffi_array_get(struct txr_ffi_type *tft, mem_t *src, val self)
       return null_string;
     } else {
       const char *chptr = coerce(const char *, src);
-      if (chptr[tft->size - 1] == 0) {
+      if (tft->null_term) {
         return string_utf8(chptr);
       } else {
         wchar_t *wch = utf8_dup_from_buf(chptr, tft->size);
@@ -1140,13 +1139,12 @@ static val ffi_array_get(struct txr_ffi_type *tft, mem_t *src, val self)
     if (nelem == 0) {
       return null_string;
     } else {
-      cnum nchar = tft->size / sizeof (wchar_t);
       const wchar_t *wchptr = coerce(const wchar_t *, src);
 
-      if (wchptr[nchar - 1] == 0) {
+      if (tft->null_term) {
         return string(wchptr);
       } else {
-        val ustr = mkustring(num_fast(nchar));
+        val ustr = mkustring(num_fast(nelem));
         return init_str(ustr, wchptr);
       }
     }
@@ -1155,7 +1153,7 @@ static val ffi_array_get(struct txr_ffi_type *tft, mem_t *src, val self)
       return null_string;
     } else {
       const unsigned char *chptr = coerce(const unsigned char *, src);
-      if (chptr[tft->size - 1] == 0)
+      if (tft->null_term)
         return string_8bit(chptr);
       else
         return string_8bit_size(chptr, tft->size);
diff --git a/txr.1 b/txr.1
index fa48abd5..a56cac44 100644
--- a/txr.1
+++ b/txr.1
@@ -53597,17 +53597,39 @@ When converting from Lisp to C, it ensures that the array is null-terminated.
 This means that the last element of the array is written out as all zero bytes.
 The
 .code zarray
-type also allows the Lisp object to be one element short. For instance,
+type is useful for handling null-terminated character arrays representing
+strings, and for null-terminated vectors.
+Unlike
+.codn array ,
+.code zarray
+allows the Lisp object to be one element short. For instance,
 when a
 .code "(zarray 5 int)"
 passed by pointer to a foreign function is converted back to Lisp,
 the Lisp object is required to have only four elements. If the Lisp object
 has five elements, then the fifth one will be decoded from the C array
 in earnest; it is not expected to be null.
-The
+Lastly, the
 .code zarray
-type is useful for handling null terminated character arrays representing
-strings, and for null terminated vectors.
+type further extends the special treatment which the
+.code array
+type applies to the types
+.codn char ,
+.code wchar
+and
+.codn bchar .
+Namely,
+.code zarray
+assumes, and depends on, the incoming data being null-terminated, and converts it to a Lisp
+string accordingly. The regular
+.code array
+type doesn't assume null termination. In particular, this means that an
+.code "(array 42 char)"
+will decode 42 bytes of UTF-8, even if some of them are null. The null bytes
+convert to U+DC00. In contrast, a
+.code zarray
+will treat the 42 bytes as a null-terminated string, and decode UTF-8 only
+up to the first null.
 
 .meIP (zarray << type )
 The
author	Kaz Kylheku <kaz@kylheku.com>	2017-05-10 06:41:50 -0700
committer	Kaz Kylheku <kaz@kylheku.com>	2017-05-10 06:41:50 -0700
commit	471149c262613e4b69fc14b9599fe541106084e4 (patch)
tree	e784850dd3089f325d37482621aa5d3ae7c2c045
parent	fe81a856d6a1db346c519897581925d1797913ad (diff)
download	txr-471149c262613e4b69fc14b9599fe541106084e4.tar.gz txr-471149c262613e4b69fc14b9599fe541106084e4.tar.bz2 txr-471149c262613e4b69fc14b9599fe541106084e4.zip