From 4f09a030963aea5c12876eb49db76d1ce4df380d Mon Sep 17 00:00:00 2001
From: Pavel Charvat <pchar@ucw.cz>
Date: Fri, 29 Dec 2017 08:40:24 +0100
Subject: [PATCH] UTF-8: Generator of long inlined functions. Also added
 "utf8_full" variants for 0..0x10ffff range.

---
 ucw/ff-unicode.c  | 185 +++++++++++-----------------------------------
 ucw/ff-unicode.h  |  23 ++++++
 ucw/unicode-gen.h | 172 ++++++++++++++++++++++++++++++++++++++++++
 ucw/unicode.h     | 184 ++++++++++++++-------------------------------
 4 files changed, 295 insertions(+), 269 deletions(-)
 create mode 100644 ucw/unicode-gen.h

diff --git a/ucw/ff-unicode.c b/ucw/ff-unicode.c
index e0faa0c8..de6df9d7 100644
--- a/ucw/ff-unicode.c
+++ b/ucw/ff-unicode.c
@@ -19,164 +19,65 @@
 int
 bget_utf8_slow(struct fastbuf *b, uint repl)
 {
-  int c = bgetc(b);
-  int code;
-
-  if (c < 0x80)				/* Includes EOF */
-    return c;
-  if (c < 0xc0)				/* Incorrect combination */
-    return repl;
-  if (c >= 0xf0)			/* Too large, skip it */
-    {
-      while ((c = bgetc(b)) >= 0x80 && c < 0xc0)
-	;
-      goto wrong;
-    }
-  if (c >= 0xe0)			/* 3 bytes */
-    {
-      code = c & 0x0f;
-      if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
-	goto wrong;
-      code = (code << 6) | (c & 0x3f);
-      if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
-	goto wrong;
-      code = (code << 6) | (c & 0x3f);
-      if (code < 0x800)
-	goto wrong2;
-    }
-  else					/* 2 bytes */
-    {
-      code = c & 0x1f;
-      if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
-	goto wrong;
-      code = (code << 6) | (c & 0x3f);
-      if (code < 0x80)
-	goto wrong2;
-    }
-  return code;
+  #define UNI_WANT_GET_UTF8
+  #define UNI_GIVE_FIRST_GETC { int x = bgetc(b); if (x < 0) return -1; c = x; }
+  #define UNI_GIVE_PEEKC { int x = bpeekc(b); if (x < 0) goto bad; c = x; }
+  #define UNI_GIVE_SKIPC b->bptr++
+  #define UNI_GIVE_OK return u
+  #define UNI_GIVE_BAD u = repl
+  #include <ucw/unicode-gen.h>
+}
 
-wrong:
-  if (c >= 0)
-    bungetc(b);
-wrong2:
-  return repl;
+int
+bget_utf8_full_slow(struct fastbuf *b, uint repl)
+{
+  #define UNI_WANT_GET_UTF8
+  #define UNI_WANT_UTF8_FULL
+  #define UNI_GIVE_FIRST_GETC { int x = bgetc(b); if (x < 0) return -1; c = x; }
+  #define UNI_GIVE_PEEKC { int x = bpeekc(b); if (x < 0) goto bad; c = x; }
+  #define UNI_GIVE_SKIPC b->bptr++
+  #define UNI_GIVE_OK return u
+  #define UNI_GIVE_BAD u = repl
+  #include <ucw/unicode-gen.h>
 }
 
 int
 bget_utf8_32_slow(struct fastbuf *b, uint repl)
 {
-  int c = bgetc(b);
-  int code;
-  int nr;
-  int limit;
-
-  if (c < 0x80)				/* Includes EOF */
-    return c;
-  if (c < 0xc0)				/* Incorrect combination */
-    return repl;
-  if (c < 0xe0)
-    {
-      code = c & 0x1f;
-      nr = 1;
-      limit = 0x80;
-    }
-  else if (c < 0xf0)
-    {
-      code = c & 0x0f;
-      nr = 2;
-      limit = 0x800;
-    }
-  else if (c < 0xf8)
-    {
-      code = c & 0x07;
-      nr = 3;
-      limit = 1 << 16;
-    }
-  else if (c < 0xfc)
-    {
-      code = c & 0x03;
-      nr = 4;
-      limit = 1 << 21;
-    }
-  else if (c < 0xfe)
-    {
-      code = c & 0x01;
-      nr = 5;
-      limit = 1 << 26;
-    }
-  else					/* Too large */
-    goto wrong2;
-  while (nr-- > 0)
-    {
-      if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
-	goto wrong;
-      code = (code << 6) | (c & 0x3f);
-    }
-  if (code < limit)
-    goto wrong2;
-  return code;
-
-wrong:
-  if (c >= 0)
-    bungetc(b);
-wrong2:
-  return repl;
+  #define UNI_WANT_GET_UTF8
+  #define UNI_WANT_UTF8_32
+  #define UNI_GIVE_FIRST_GETC { int x = bgetc(b); if (x < 0) return -1; c = x; }
+  #define UNI_GIVE_PEEKC { int x = bpeekc(b); if (x < 0) goto bad; c = x; }
+  #define UNI_GIVE_SKIPC b->bptr++
+  #define UNI_GIVE_OK return u
+  #define UNI_GIVE_BAD u = repl
+  #include <ucw/unicode-gen.h>
 }
 
 void
 bput_utf8_slow(struct fastbuf *b, uint u)
 {
-  ASSERT(u < 65536);
-  if (u < 0x80)
-    bputc(b, u);
-  else
-    {
-      if (u < 0x800)
-	bputc(b, 0xc0 | (u >> 6));
-      else
-	{
-	  bputc(b, 0xe0 | (u >> 12));
-	  bputc(b, 0x80 | ((u >> 6) & 0x3f));
-	}
-      bputc(b, 0x80 | (u & 0x3f));
-    }
+  #define UNI_WANT_PUT_UTF8
+  #define UNI_GIVE_PUTC bputc(b, c)
+  #include <ucw/unicode-gen.h>
+}
+
+void
+bput_utf8_full_slow(struct fastbuf *b, uint u)
+{
+  #define UNI_WANT_PUT_UTF8
+  #define UNI_WANT_UTF8_FULL
+  #define UNI_GIVE_PUTC bputc(b, c)
+  #include <ucw/unicode-gen.h>
 }
 
 void
 bput_utf8_32_slow(struct fastbuf *b, uint u)
 {
-  ASSERT(u < (1U<<31));
-  if (u < 0x80)
-    bputc(b, u);
-  else
-    {
-      if (u < 0x800)
-	bputc(b, 0xc0 | (u >> 6));
-      else
-	{
-	  if (u < (1<<16))
-	    bputc(b, 0xe0 | (u >> 12));
-	  else
-	    {
-	      if (u < (1<<21))
-		bputc(b, 0xf0 | (u >> 18));
-	      else
-		{
-		  if (u < (1<<26))
-		    bputc(b, 0xf8 | (u >> 24));
-		  else
-		    {
-		      bputc(b, 0xfc | (u >> 30));
-		      bputc(b, 0x80 | ((u >> 24) & 0x3f));
-		    }
-		  bputc(b, 0x80 | ((u >> 18) & 0x3f));
-		}
-	      bputc(b, 0x80 | ((u >> 12) & 0x3f));
-	    }
-	  bputc(b, 0x80 | ((u >> 6) & 0x3f));
-	}
-      bputc(b, 0x80 | (u & 0x3f));
-    }
+  #define UNI_WANT_PUT_UTF8
+  #define UNI_WANT_UTF8_32
+  #define UNI_GIVE_PUTC bputc(b, c)
+  #include <ucw/unicode-gen.h>
 }
 
 /*** UTF-16 ***/
diff --git a/ucw/ff-unicode.h b/ucw/ff-unicode.h
index 79a0c493..addd58b8 100644
--- a/ucw/ff-unicode.h
+++ b/ucw/ff-unicode.h
@@ -23,14 +23,17 @@
 #define bput_utf16_be_slow ucw_bput_utf16_be_slow
 #define bput_utf16_le_slow ucw_bput_utf16_le_slow
 #define bput_utf8_32_slow ucw_bput_utf8_32_slow
+#define bput_utf8_full_slow ucw_bput_utf8_full_slow
 #define bput_utf8_slow ucw_bput_utf8_slow
 #endif
 
 /* ** UTF-8 ** */
 
 int bget_utf8_slow(struct fastbuf *b, uint repl);
+int bget_utf8_full_slow(struct fastbuf *b, uint repl);
 int bget_utf8_32_slow(struct fastbuf *b, uint repl);
 void bput_utf8_slow(struct fastbuf *b, uint u);
+void bput_utf8_full_slow(struct fastbuf *b, uint u);
 void bput_utf8_32_slow(struct fastbuf *b, uint u);
 
 static inline int bget_utf8_repl(struct fastbuf *b, uint repl)
@@ -45,6 +48,18 @@ static inline int bget_utf8_repl(struct fastbuf *b, uint repl)
     return bget_utf8_slow(b, repl);
 }
 
+static inline int bget_utf8_full_repl(struct fastbuf *b, uint repl)
+{
+  uint u;
+  if (bavailr(b) >= 4)
+    {
+      b->bptr = utf8_full_get_repl(b->bptr, &u, repl);
+      return u;
+    }
+  else
+    return bget_utf8_full_slow(b, repl);
+}
+
 static inline int bget_utf8_32_repl(struct fastbuf *b, uint repl)
 {
   uint u;
@@ -75,6 +90,14 @@ static inline void bput_utf8(struct fastbuf *b, uint u) /** Write a single utf8
     bput_utf8_slow(b, u);
 }
 
+static inline void bput_utf8_full(struct fastbuf *b, uint u) /** Write a single utf8 character from range [0, 0x10ffff]. **/
+{
+  if (bavailw(b) >= 4)
+    b->bptr = utf8_full_put(b->bptr, u);
+  else
+    bput_utf8_full_slow(b, u);
+}
+
 static inline void bput_utf8_32(struct fastbuf *b, uint u) /** Write a single utf8 character (from the whole unicode range). **/
 {
   if (bavailw(b) >= 6)
diff --git a/ucw/unicode-gen.h b/ucw/unicode-gen.h
new file mode 100644
index 00000000..6e10b8a9
--- /dev/null
+++ b/ucw/unicode-gen.h
@@ -0,0 +1,172 @@
+#if defined(UNI_WANT_UTF8_32)
+#define UNI_MAX_UTF8_BYTES 6
+#elif defined(UNI_WANT_UTF8_FULL)
+#define UNI_MAX_UTF8_BYTES 4
+#else
+#define UNI_MAX_UTF8_BYTES 3
+#endif
+
+/* Writing UTF-8 */
+
+#ifdef UNI_WANT_PUT_UTF8
+
+#define UNI_PUT_NEXT(_c) do { byte c = (_c); UNI_GIVE_PUTC; } while (0)
+
+{
+  if (u < 0x80)
+    UNI_PUT_NEXT(u);
+  else if (u < 0x800)
+    {
+      UNI_PUT_NEXT(0xc0 | (u >> 6));
+      goto put1;
+    }
+  else if (u < (1<<16))
+    {
+      UNI_PUT_NEXT(0xe0 | (u >> 12));
+#if UNI_MAX_UTF8_BYTES > 3
+      goto put2;
+    }
+#ifdef UNI_WANT_UTF8_FULL
+  else if (u <= 0x10ffff)
+#else
+  else if (u < (1<<21))
+#endif
+    {
+      UNI_PUT_NEXT(0xf0 | (u >> 18));
+#if UNI_MAX_UTF8_BYTES > 4
+      goto put3;
+    }
+  else if (u < (1<<26))
+    {
+      UNI_PUT_NEXT(0xf8 | (u >> 24));
+      goto put4;
+    }
+  else if (u < (1U<<31))
+    {
+      UNI_PUT_NEXT(0xfc | (u >> 30));
+      UNI_PUT_NEXT(0x80 | ((u >> 24) & 0x3f));
+put4:
+      UNI_PUT_NEXT(0x80 | ((u >> 18) & 0x3f));
+put3:
+#endif
+      UNI_PUT_NEXT(0x80 | ((u >> 12) & 0x3f));
+put2:
+#endif
+      UNI_PUT_NEXT(0x80 | ((u >> 6) & 0x3f));
+put1:
+      UNI_PUT_NEXT(0x80 | (u & 0x3f));
+    }
+  else
+    ASSERT(0);
+}
+
+#endif
+
+/* Reading UTF-8 */
+
+#ifdef UNI_WANT_GET_UTF8
+
+#define UNI_GET_NEXT \
+   do { \
+     UNI_GIVE_PEEKC; \
+     if (unlikely((c & 0xc0) != 0x80)) goto bad; \
+     u = (u << 6) | (c & 0x3f); \
+     UNI_GIVE_SKIPC; \
+  } while (0)
+
+{
+  byte c;
+  uint u, limit;
+#ifdef UNI_GIVE_FIRST_GETC
+  UNI_GIVE_FIRST_GETC;
+#else
+  UNI_GIVE_PEEKC;
+  UNI_GIVE_SKIPC;
+#endif
+  u = c;
+  if (u < 0x80)
+    ;
+  else if (unlikely(u < 0xc0))
+    goto bad;
+  else if (u < 0xe0)
+    {
+      u &= 0x1f;
+      limit = 0x80;
+      goto get1;
+    }
+  else if (u < 0xf0)
+    {
+      u &= 0x0f;
+      limit = 0x800;
+#if UNI_MAX_UTF8_BYTES > 3
+      goto get2;
+    }
+  else if (u < 0xf8)
+    {
+      u &= 0x07;
+      limit = 1 << 16;
+#if UNI_MAX_UTF8_BYTES > 4
+      goto get3;
+    }
+  else if (u < 0xfc)
+    {
+      u &= 0x03;
+      limit = 1 << 21;
+      goto get4;
+    }
+  else if (u < 0xfe)
+    {
+      u &= 0x01;
+      limit = 1 << 26;
+
+      UNI_GET_NEXT;
+get4:
+      UNI_GET_NEXT;
+get3:
+#endif
+      UNI_GET_NEXT;
+get2:
+#endif
+      UNI_GET_NEXT;
+get1:
+      UNI_GET_NEXT;
+
+      if (unlikely(u < limit))
+	goto bad;
+#ifdef UNI_WANT_UTF8_FULL
+      if (unlikely(u > 0x10ffff))
+	goto bad;
+#endif
+    }
+  else
+    {
+bad:
+#ifdef UNI_GIVE_BAD
+      UNI_GIVE_BAD;
+#else
+      u = UNI_REPLACEMENT;
+#endif
+    }
+
+  UNI_GIVE_OK;
+}
+
+#endif
+
+#undef UNI_WANT_PUT_UTF8
+#undef UNI_WANT_GET_UTF8
+
+#undef UNI_WANT_UTF8_32
+#undef UNI_WANT_UTF8_FULL
+
+#undef UNI_MAX_UTF8_BYTES
+
+#undef UNI_GIVE_PUTC
+#undef UNI_GIVE_FIRST_GETC
+#undef UNI_GIVE_PEEKC
+#undef UNI_GIVE_SKIPC
+#undef UNI_GIVE_OK
+#undef UNI_GIVE_BAD
+
+#undef UNI_PUT_NEXT
+#undef UNI_GET_NEXT
diff --git a/ucw/unicode.h b/ucw/unicode.h
index 4ec1c6b2..b35531cd 100644
--- a/ucw/unicode.h
+++ b/ucw/unicode.h
@@ -29,20 +29,22 @@
  **/
 static inline byte *utf8_put(byte *p, uint u)
 {
-  if (u < 0x80)
-    *p++ = u;
-  else if (u < 0x800)
-    {
-      *p++ = 0xc0 | (u >> 6);
-      *p++ = 0x80 | (u & 0x3f);
-    }
-  else
-    {
-      ASSERT(u < 0x10000);
-      *p++ = 0xe0 | (u >> 12);
-      *p++ = 0x80 | ((u >> 6) & 0x3f);
-      *p++ = 0x80 | (u & 0x3f);
-    }
+  #define UNI_WANT_PUT_UTF8
+  #define UNI_GIVE_PUTC *p++ = c
+  #include <ucw/unicode-gen.h>
+  return p;
+}
+
+/**
+ * Encode a value from the range `[0, 0x10FFFF]`
+ * (full Unicode range); up to 4 bytes needed (RFC 3629).
+ **/
+static inline byte *utf8_full_put(byte *p, uint u)
+{
+  #define UNI_WANT_PUT_UTF8
+  #define UNI_WANT_UTF8_FULL
+  #define UNI_GIVE_PUTC *p++ = c
+  #include <ucw/unicode-gen.h>
   return p;
 }
 
@@ -52,77 +54,40 @@ static inline byte *utf8_put(byte *p, uint u)
  **/
 static inline byte *utf8_32_put(byte *p, uint u)
 {
-  if (u < 0x80)
-    *p++ = u;
-  else if (u < 0x800)
-    {
-      *p++ = 0xc0 | (u >> 6);
-      goto put1;
-    }
-  else if (u < (1<<16))
-    {
-      *p++ = 0xe0 | (u >> 12);
-      goto put2;
-    }
-  else if (u < (1<<21))
-    {
-      *p++ = 0xf0 | (u >> 18);
-      goto put3;
-    }
-  else if (u < (1<<26))
-    {
-      *p++ = 0xf8 | (u >> 24);
-      goto put4;
-    }
-  else if (u < (1U<<31))
-    {
-      *p++ = 0xfc | (u >> 30);
-      *p++ = 0x80 | ((u >> 24) & 0x3f);
-put4: *p++ = 0x80 | ((u >> 18) & 0x3f);
-put3: *p++ = 0x80 | ((u >> 12) & 0x3f);
-put2: *p++ = 0x80 | ((u >> 6) & 0x3f);
-put1: *p++ = 0x80 | (u & 0x3f);
-    }
-  else
-    ASSERT(0);
+  #define UNI_WANT_PUT_UTF8
+  #define UNI_WANT_UTF8_32
+  #define UNI_GIVE_PUTC *p++ = c
+  #include <ucw/unicode-gen.h>
   return p;
 }
 
-#define UTF8_GET_NEXT if (unlikely((*p & 0xc0) != 0x80)) goto bad; u = (u << 6) | (*p++ & 0x3f)
-#define UTF8_CHECK_RANGE(r) if (unlikely(u < r)) goto bad
-
 /**
  * Decode a value from the range `[0, 0xFFFF]` (basic multilingual plane)
  * or return @repl if the encoding has been corrupted.
  **/
 static inline byte *utf8_get_repl(const byte *p, uint *uu, uint repl)
 {
-  uint u = *p++;
-  if (u < 0x80)
-    ;
-  else if (unlikely(u < 0xc0))
-    {
-      /* Incorrect byte sequence */
-    bad:
-      u = repl;
-    }
-  else if (u < 0xe0)
-    {
-      u &= 0x1f;
-      UTF8_GET_NEXT;
-      UTF8_CHECK_RANGE(0x80);
-    }
-  else if (likely(u < 0xf0))
-    {
-      u &= 0x0f;
-      UTF8_GET_NEXT;
-      UTF8_GET_NEXT;
-      UTF8_CHECK_RANGE(0x800);
-    }
-  else
-    goto bad;
-  *uu = u;
-  return (byte *)p;
+  #define UNI_WANT_GET_UTF8
+  #define UNI_GIVE_PEEKC c = *p
+  #define UNI_GIVE_SKIPC p++
+  #define UNI_GIVE_OK { *uu = u; return (byte *)p; }
+  #define UNI_GIVE_BAD u = repl
+  #include <ucw/unicode-gen.h>
+}
+
+/**
+ * Decode a value from the range `[0, 0x10FFFF]` (full Unicode range)
+ * or return @repl if the encoding has been corrupted.
+ **/
+static inline byte *utf8_full_get_repl(const byte *p, uint *uu, uint repl)
+{
+  #define UNI_WANT_GET_UTF8
+  #define UNI_WANT_UTF8_FULL
+  #define UNI_GIVE_PEEKC c = *p
+  #define UNI_GIVE_SKIPC p++
+  #define UNI_GIVE_OK { *uu = u; return (byte *)p; }
+  #define UNI_GIVE_BAD u = repl
+  #include <ucw/unicode-gen.h>
 }
 
 /**
@@ -131,57 +96,13 @@ static inline byte *utf8_get_repl(const byte *p, uint *uu, uint repl)
  **/
 static inline byte *utf8_32_get_repl(const byte *p, uint *uu, uint repl)
 {
-  uint u = *p++;
-  uint limit;
-  if (u < 0x80)
-    ;
-  else if (unlikely(u < 0xc0))
-    goto bad;
-  else if (u < 0xe0)
-    {
-      u &= 0x1f;
-      limit = 0x80;
-      goto get1;
-    }
-  else if (u < 0xf0)
-    {
-      u &= 0x0f;
-      limit = 0x800;
-      goto get2;
-    }
-  else if (u < 0xf8)
-    {
-      u &= 0x07;
-      limit = 1 << 16;
-      goto get3;
-    }
-  else if (u < 0xfc)
-    {
-      u &= 0x03;
-      limit = 1 << 21;
-      goto get4;
-    }
-  else if (u < 0xfe)
-    {
-      u &= 0x01;
-      limit = 1 << 26;
-      UTF8_GET_NEXT;
-get4: UTF8_GET_NEXT;
-get3: UTF8_GET_NEXT;
-get2: UTF8_GET_NEXT;
-get1: UTF8_GET_NEXT;
-      if (unlikely(u < limit))
-	goto bad;
-    }
-  else
-    goto bad;
-  *uu = u;
-  return (byte *)p;
-
-bad:
-  /* Incorrect byte sequence */
-  *uu = repl;
-  return (byte *)p;
+  #define UNI_WANT_GET_UTF8
+  #define UNI_WANT_UTF8_32
+  #define UNI_GIVE_PEEKC c = *p
+  #define UNI_GIVE_SKIPC p++
+  #define UNI_GIVE_OK { *uu = u; return (byte *)p; }
+  #define UNI_GIVE_BAD u = repl
+  #include <ucw/unicode-gen.h>
 }
 
 /**
@@ -193,6 +114,15 @@ static inline byte *utf8_get(const byte *p, uint *uu)
   return utf8_get_repl(p, uu, UNI_REPLACEMENT);
 }
 
+/**
+ * Decode a value from the range `[0, 0x10FFFF]` (full Unicode range)
+ * or return `UNI_REPLACEMENT` if the encoding has been corrupted.
+ **/
+static inline byte *utf8_full_get(const byte *p, uint *uu)
+{
+  return utf8_full_get_repl(p, uu, UNI_REPLACEMENT);
+}
+
 /**
  * Decode a value from the range `[0, 0x7FFFFFFF]`
  * or return `UNI_REPLACEMENT` if the encoding has been corrupted.
-- 
2.39.5