From: Pavel Charvat <pchar@ucw.cz>
Date: Thu, 14 Dec 2017 09:08:00 +0000 (+0100)
Subject: Unicode: Added reading of 32bit UTF-8 unicode values with protection against buffer... 
X-Git-Tag: v6.5.7~4
X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;h=445e507caadb330b5ad640b3c5a357e6326c7855;p=libucw.git

Unicode: Added reading of 32bit UTF-8 unicode values with protection against buffer overflow.

Until now safe reading was possible only in fastbufs, which are
in some cases inefficient to initialize.

I'm still not sure about this change, so it is added to only one of the many
similar functions: it multiplies the size of the already long source code.
---

diff --git a/ucw/unicode.h b/ucw/unicode.h
index 4ec1c6b2..2cb1c02f 100644
--- a/ucw/unicode.h
+++ b/ucw/unicode.h
@@ -89,6 +89,7 @@ put1: *p++ = 0x80 | (u & 0x3f);
 }
 
 #define UTF8_GET_NEXT if (unlikely((*p & 0xc0) != 0x80)) goto bad; u = (u << 6) | (*p++ & 0x3f)
+#define UTF8_CHECK_AVAIL(n) if (unlikely(avail < n)) goto bad  /* uses `avail` and label `bad` from the enclosing function */
 #define UTF8_CHECK_RANGE(r) if (unlikely(u < r)) goto bad
 
 /**
@@ -184,6 +185,75 @@ bad:
   return (byte *)p;
 }
 
+/**
+ * Decode a value from the range `[0, 0x7FFFFFFF]` and store it to @uu,
+ * or store @repl there if the encoding is corrupted or cut short by @stop.
+ * This function never reads the byte at @stop nor anything behind it.
+ * At least one byte must be available (@stop > @p).
+ * Returns the position in the input where decoding stopped.
+ **/
+static inline byte *utf8_32_get_repl_safe(const byte *p, const byte *stop, uint *uu, uint repl)
+{
+  uint u = *p++;  /* lead byte, read unconditionally (caller guarantees @stop > @p) */
+  if (u < 0x80)  /* single-byte value, no continuation bytes follow */
+    goto ok;
+  else if (unlikely(u < 0xc0))  /* 0x80..0xbf has the continuation-byte pattern, invalid as a lead byte */
+    goto bad;
+  uint limit;  /* smallest value accepted for the detected sequence length */
+  size_t avail = stop - p;  /* bytes left before @stop, lead byte already consumed */
+  if (u < 0xe0)  /* lead byte of a 2-byte sequence */
+    {
+      UTF8_CHECK_AVAIL(1);
+      u &= 0x1f;
+      limit = 0x80;
+      goto get1;
+    }
+  else if (u < 0xf0)  /* lead byte of a 3-byte sequence */
+    {
+      UTF8_CHECK_AVAIL(2);
+      u &= 0x0f;
+      limit = 0x800;
+      goto get2;
+    }
+  else if (u < 0xf8)  /* lead byte of a 4-byte sequence */
+    {
+      UTF8_CHECK_AVAIL(3);
+      u &= 0x07;
+      limit = 1 << 16;
+      goto get3;
+    }
+  else if (u < 0xfc)  /* lead byte of a 5-byte sequence */
+    {
+      UTF8_CHECK_AVAIL(4);
+      u &= 0x03;
+      limit = 1 << 21;
+      goto get4;
+    }
+  else if (u < 0xfe)  /* lead byte of a 6-byte sequence */
+    {
+      UTF8_CHECK_AVAIL(5);
+      u &= 0x01;
+      limit = 1 << 26;
+      UTF8_GET_NEXT;
+get4: UTF8_GET_NEXT;  /* get4..get1: entry points for the shorter sequences, each falls through */
+get3: UTF8_GET_NEXT;
+get2: UTF8_GET_NEXT;
+get1: UTF8_GET_NEXT;
+      if (unlikely(u < limit))  /* value would fit a shorter sequence: reject the overlong form */
+	goto bad;
+    }
+  else  /* lead byte >= 0xfe is not accepted */
+    goto bad;
+
+ok:
+  *uu = u;
+  return (byte *)p;
+
+bad:
+  /* Incorrect or truncated byte sequence; bytes consumed so far stay consumed */
+  *uu = repl;
+  return (byte *)p;
+}
+
 /**
  * Decode a value from the range `[0, 0xFFFF]` (basic multilingual plane)
  * or return `UNI_REPLACEMENT` if the encoding has been corrupted.