UTF-8 API overhaul: security, and other concerns.

The main aim here is to pave the way for conversion between arbitrary buffers of bytes (that may include embedded NUL characters) and a wide string. Also, a potential security hole is closed. When we convert a TXR string to UTF-8 for use with some C library API, any embedded pnul characters (U+DC00) turn into NUL bytes which effectively cut the UTF-8 string short, and silently so. The C library function receives a shortened string. This could be exploitable in some situations. * lib.c (int_str): Use utf8_dup_to_buf instead of utf8_dup_to_uc. Pass 1 to have the buffer null-terminated, since mp_read_radix depends on it. * stream.c (make_string_byte_input_stream): Use utf8_dup_to_buf. This gives us the size, so we don't have to call strlen. The buffer is no longer null terminated, but the byte input stream implementation never relied on this. * utf8.c (utf8_from_buf): Replacement for utf8_from_uc which doesn't assume that the buffer of bytes is null-terminated. It can produce a wide string containing U+DC00 characters corresponding to embedded nulls in the original buffer. (utf8_from): Calculate length of null-terminated string and use utf8_from_buf. (utf8_to_buf): Replacement for utf8_to_uc. Can produce a buffer which is or is not null-terminated, based on new argument. (utf8_to): Use utf8_to_buf, and ask it to null-terminate, thus preserving behavior. (utf8_dup_from_uc): This function was not used anywhere and is removed. (utf8_dup_to_buf): Replacement for utf8_dup_to_uc which takes an extra argument, whether to null-terminate or not. (utf8_dup_to): Apply security check here: is the resulting string as long as utf8_to says it should be? If not, it contains embedded nulls. Throw an exception. * utf.h (utf8_from_uc, utf8_to_uc, utf8_dup_from_uc, utf8_dup_to_uc): Declarations removed. (utf8_from_buf, utf8_to_buf, utf8_dup_to_buf): Declared.
author: Kaz Kylheku <kaz@kylheku.com> 2016-03-31 20:53:03 -0700
committer: Kaz Kylheku <kaz@kylheku.com> 2016-03-31 20:53:03 -0700
commit: c27f83bdae5eb00206a478f7764df4fdaa48fc76 (patch)
tree: 3fdcd29807e120c1836a7ba59de6098a0460b636 /utf8.c
parent: 98b26ff13eeb8a9f730801720c4cba30eba9e61d (diff)
download: txr-c27f83bdae5eb00206a478f7764df4fdaa48fc76.tar.gz
txr-c27f83bdae5eb00206a478f7764df4fdaa48fc76.tar.bz2
txr-c27f83bdae5eb00206a478f7764df4fdaa48fc76.zip
1 files changed, 33 insertions, 34 deletions
diff --git a/utf8.c b/utf8.c
index 03154bae..7c719cf5 100644
--- a/utf8.c
+++ b/utf8.c
@@ -27,6 +27,7 @@
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 #include <wchar.h>
 #include <signal.h>
 #include "config.h"
@@ -47,34 +48,27 @@ static void conversion_error(void)
 }
 #endif
 
-size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
+size_t utf8_from_buf(wchar_t *wdst, const unsigned char *src, size_t nbytes)
 {
   size_t nchar = 1;
   enum utf8_state state = utf8_init;
   const unsigned char *backtrack = 0;
   wchar_t wch = 0, wch_min = 0;
 
-  for (;;) {
+  while (nbytes-- > 0) {
     int ch = *src++;
 
-    if (ch == 0) {
-      if (state == utf8_init)
-        break;
-      src = backtrack;
-      if (wdst)
-        *wdst++ = 0xDC00 | *src;
-      nchar++;
-      state = utf8_init;
-      continue;
-    }
-
     switch (state) {
     case utf8_init:
       switch (ch >> 4) {
       case 0x0: case 0x1: case 0x2: case 0x3:
       case 0x4: case 0x5: case 0x6: case 0x7:
-        if (wdst)
-          *wdst++ = ch;
+        if (wdst) {
+          if (ch)
+            *wdst++ = ch;
+          else
+            *wdst++ = 0xDC00;
+        }
         nchar++;
         break;
       case 0xC: case 0xD:
@@ -146,12 +140,13 @@ size_t utf8_from_uc(wchar_t *wdst, const unsigned char *src)
 
 size_t utf8_from(wchar_t *wdst, const char *src)
 {
-  return utf8_from_uc(wdst, coerce(const unsigned char *, src));
+  size_t nbytes = strlen(src);
+  return utf8_from_buf(wdst, coerce(const unsigned char *, src), nbytes);
 }
 
-size_t utf8_to_uc(unsigned char *dst, const wchar_t *wsrc)
+size_t utf8_to_buf(unsigned char *dst, const wchar_t *wsrc, int null_term)
 {
-  size_t nbyte = 1;
+  size_t nbyte = 0;
   wchar_t wch;
 
   while ((wch = *wsrc++)) {
@@ -189,22 +184,18 @@ size_t utf8_to_uc(unsigned char *dst, const wchar_t *wsrc)
     }
   }
 
-  if (dst)
-    *dst++ = 0;
+  if (null_term) {
+    if (dst)
+      *dst++ = 0;
+    nbyte++;
+  }
+
   return nbyte;
 }
 
 size_t utf8_to(char *dst, const wchar_t *wsrc)
 {
-  return utf8_to_uc(coerce(unsigned char *, dst), wsrc);
-}
-
-wchar_t *utf8_dup_from_uc(const unsigned char *str)
-{
-  size_t nchar = utf8_from_uc(0, str);
-  wchar_t *wstr = chk_wmalloc(nchar);
-  utf8_from_uc(wstr, str);
-  return wstr;
+  return utf8_to_buf(coerce(unsigned char *, dst), wsrc, 1);
 }
 
 wchar_t *utf8_dup_from(const char *str)
@@ -215,19 +206,27 @@ wchar_t *utf8_dup_from(const char *str)
   return wstr;
 }
 
-unsigned char *utf8_dup_to_uc(const wchar_t *wstr)
+unsigned char *utf8_dup_to_buf(const wchar_t *wstr, size_t *pnbytes,
+                               int null_term)
 {
-  size_t nbyte = utf8_to_uc(0, wstr);
+  size_t nbyte = utf8_to_buf(0, wstr, null_term);
   unsigned char *str = chk_malloc(nbyte);
-  utf8_to_uc(str, wstr);
+  utf8_to_buf(str, wstr, null_term);
+  *pnbytes = nbyte;
   return str;
 }
 
 char *utf8_dup_to(const wchar_t *wstr)
 {
-  size_t nbyte = utf8_to(0, wstr);
-  char *str = coerce(char *, chk_malloc(nbyte));
+  size_t len = utf8_to(0, wstr) - 1;
+  char *str = coerce(char *, chk_malloc(len + 1));
   utf8_to(str, wstr);
+  str[len] = 0;
+  if (strlen(str) != len) {
+    free(str);
+    uw_throw(error_s,
+             lit("Cannot convert string with embedded NUL to UTF-8 string"));
+  }
   return str;
 }
author	Kaz Kylheku <kaz@kylheku.com>	2016-03-31 20:53:03 -0700
committer	Kaz Kylheku <kaz@kylheku.com>	2016-03-31 20:53:03 -0700
commit	c27f83bdae5eb00206a478f7764df4fdaa48fc76 (patch)
tree	3fdcd29807e120c1836a7ba59de6098a0460b636 /utf8.c
parent	98b26ff13eeb8a9f730801720c4cba30eba9e61d (diff)
download	txr-c27f83bdae5eb00206a478f7764df4fdaa48fc76.tar.gz txr-c27f83bdae5eb00206a478f7764df4fdaa48fc76.tar.bz2 txr-c27f83bdae5eb00206a478f7764df4fdaa48fc76.zip