UTF-8: Generator of long inlined functions. Also added "utf8_full" variants for 0...

author Pavel Charvat <pchar@ucw.cz>

Fri, 29 Dec 2017 07:40:24 +0000 (08:40 +0100)

committer Pavel Charvat <pchar@ucw.cz>

Fri, 29 Dec 2017 07:40:24 +0000 (08:40 +0100)
author Pavel Charvat <pchar@ucw.cz>
Fri, 29 Dec 2017 07:40:24 +0000 (08:40 +0100)
committer Pavel Charvat <pchar@ucw.cz>
Fri, 29 Dec 2017 07:40:24 +0000 (08:40 +0100)
diff --git a/ucw/ff-unicode.c b/ucw/ff-unicode.c

index e0faa0c85ae4aeb31a917bc87f34b24844522761..de6df9d7e12f962e727afac56790b1cefdffef88 100644 (file)
--- a/ucw/ff-unicode.c
+++ b/ucw/ff-unicode.c
@@ -19,164 +19,65 @@
  int
  bget_utf8_slow(struct fastbuf *b, uint repl)
  {
-  int c = bgetc(b);
-  int code;
-
-  if (c < 0x80)                                /* Includes EOF */
-    return c;
-  if (c < 0xc0)                                /* Incorrect combination */
-    return repl;
-  if (c >= 0xf0)                       /* Too large, skip it */
-    {
-      while ((c = bgetc(b)) >= 0x80 && c < 0xc0)
-       ;
-      goto wrong;
-    }
-  if (c >= 0xe0)                       /* 3 bytes */
-    {
-      code = c & 0x0f;
-      if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
-       goto wrong;
-      code = (code << 6) | (c & 0x3f);
-      if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
-       goto wrong;
-      code = (code << 6) | (c & 0x3f);
-      if (code < 0x800)
-       goto wrong2;
-    }
-  else                                 /* 2 bytes */
-    {
-      code = c & 0x1f;
-      if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
-       goto wrong;
-      code = (code << 6) | (c & 0x3f);
-      if (code < 0x80)
-       goto wrong2;
-    }
-  return code;
+  #define UNI_WANT_GET_UTF8
+  #define UNI_GIVE_FIRST_GETC { int x = bgetc(b); if (x < 0) return -1; c = x; }
+  #define UNI_GIVE_PEEKC { int x = bpeekc(b); if (x < 0) goto bad; c = x; }
+  #define UNI_GIVE_SKIPC b->bptr++
+  #define UNI_GIVE_OK return u
+  #define UNI_GIVE_BAD u = repl
+  #include <ucw/unicode-gen.h>
+}
  
-wrong:
-  if (c >= 0)
-    bungetc(b);
-wrong2:
-  return repl;
+int
+bget_utf8_full_slow(struct fastbuf *b, uint repl)
+{
+  #define UNI_WANT_GET_UTF8
+  #define UNI_WANT_UTF8_FULL
+  #define UNI_GIVE_FIRST_GETC { int x = bgetc(b); if (x < 0) return -1; c = x; }
+  #define UNI_GIVE_PEEKC { int x = bpeekc(b); if (x < 0) goto bad; c = x; }
+  #define UNI_GIVE_SKIPC b->bptr++
+  #define UNI_GIVE_OK return u
+  #define UNI_GIVE_BAD u = repl
+  #include <ucw/unicode-gen.h>
  }
  
  int
  bget_utf8_32_slow(struct fastbuf *b, uint repl)
  {
-  int c = bgetc(b);
-  int code;
-  int nr;
-  int limit;
-
-  if (c < 0x80)                                /* Includes EOF */
-    return c;
-  if (c < 0xc0)                                /* Incorrect combination */
-    return repl;
-  if (c < 0xe0)
-    {
-      code = c & 0x1f;
-      nr = 1;
-      limit = 0x80;
-    }
-  else if (c < 0xf0)
-    {
-      code = c & 0x0f;
-      nr = 2;
-      limit = 0x800;
-    }
-  else if (c < 0xf8)
-    {
-      code = c & 0x07;
-      nr = 3;
-      limit = 1 << 16;
-    }
-  else if (c < 0xfc)
-    {
-      code = c & 0x03;
-      nr = 4;
-      limit = 1 << 21;
-    }
-  else if (c < 0xfe)
-    {
-      code = c & 0x01;
-      nr = 5;
-      limit = 1 << 26;
-    }
-  else                                 /* Too large */
-    goto wrong2;
-  while (nr-- > 0)
-    {
-      if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
-       goto wrong;
-      code = (code << 6) | (c & 0x3f);
-    }
-  if (code < limit)
-    goto wrong2;
-  return code;
-
-wrong:
-  if (c >= 0)
-    bungetc(b);
-wrong2:
-  return repl;
+  #define UNI_WANT_GET_UTF8
+  #define UNI_WANT_UTF8_32
+  #define UNI_GIVE_FIRST_GETC { int x = bgetc(b); if (x < 0) return -1; c = x; }
+  #define UNI_GIVE_PEEKC { int x = bpeekc(b); if (x < 0) goto bad; c = x; }
+  #define UNI_GIVE_SKIPC b->bptr++
+  #define UNI_GIVE_OK return u
+  #define UNI_GIVE_BAD u = repl
+  #include <ucw/unicode-gen.h>
  }
  
  void
  bput_utf8_slow(struct fastbuf *b, uint u)
  {
-  ASSERT(u < 65536);
-  if (u < 0x80)
-    bputc(b, u);
-  else
-    {
-      if (u < 0x800)
-       bputc(b, 0xc0 | (u >> 6));
-      else
-       {
-         bputc(b, 0xe0 | (u >> 12));
-         bputc(b, 0x80 | ((u >> 6) & 0x3f));
-       }
-      bputc(b, 0x80 | (u & 0x3f));
-    }
+  #define UNI_WANT_PUT_UTF8
+  #define UNI_GIVE_PUTC bputc(b, c)
+  #include <ucw/unicode-gen.h>
+}
+
+void
+bput_utf8_full_slow(struct fastbuf *b, uint u)
+{
+  #define UNI_WANT_PUT_UTF8
+  #define UNI_WANT_UTF8_FULL
+  #define UNI_GIVE_PUTC bputc(b, c)
+  #include <ucw/unicode-gen.h>
  }
  
  void
  bput_utf8_32_slow(struct fastbuf *b, uint u)
  {
-  ASSERT(u < (1U<<31));
-  if (u < 0x80)
-    bputc(b, u);
-  else
-    {
-      if (u < 0x800)
-       bputc(b, 0xc0 | (u >> 6));
-      else
-       {
-         if (u < (1<<16))
-           bputc(b, 0xe0 | (u >> 12));
-         else
-           {
-             if (u < (1<<21))
-               bputc(b, 0xf0 | (u >> 18));
-             else
-               {
-                 if (u < (1<<26))
-                   bputc(b, 0xf8 | (u >> 24));
-                 else
-                   {
-                     bputc(b, 0xfc | (u >> 30));
-                     bputc(b, 0x80 | ((u >> 24) & 0x3f));
-                   }
-                 bputc(b, 0x80 | ((u >> 18) & 0x3f));
-               }
-             bputc(b, 0x80 | ((u >> 12) & 0x3f));
-           }
-         bputc(b, 0x80 | ((u >> 6) & 0x3f));
-       }
-      bputc(b, 0x80 | (u & 0x3f));
-    }
+  #define UNI_WANT_PUT_UTF8
+  #define UNI_WANT_UTF8_32
+  #define UNI_GIVE_PUTC bputc(b, c)
+  #include <ucw/unicode-gen.h>
  }
  
  /*** UTF-16 ***/
diff --git a/ucw/ff-unicode.h b/ucw/ff-unicode.h

index 79a0c4935be88e1b8b7f976f3ebffd6dec7c9627..addd58b83dc16c62a4336278f450bf405d1b1064 100644 (file)
--- a/ucw/ff-unicode.h
+++ b/ucw/ff-unicode.h
@@ -23,14 +23,17 @@
  #define bput_utf16_be_slow ucw_bput_utf16_be_slow
  #define bput_utf16_le_slow ucw_bput_utf16_le_slow
  #define bput_utf8_32_slow ucw_bput_utf8_32_slow
+#define bput_utf8_full_slow ucw_bput_utf8_full_slow
  #define bput_utf8_slow ucw_bput_utf8_slow
  #endif
  
  /* ** UTF-8 ** */
  
  int bget_utf8_slow(struct fastbuf *b, uint repl);
+int bget_utf8_full_slow(struct fastbuf *b, uint repl);
  int bget_utf8_32_slow(struct fastbuf *b, uint repl);
  void bput_utf8_slow(struct fastbuf *b, uint u);
+void bput_utf8_full_slow(struct fastbuf *b, uint u);
  void bput_utf8_32_slow(struct fastbuf *b, uint u);
  
  static inline int bget_utf8_repl(struct fastbuf *b, uint repl)
@@ -45,6 +48,18 @@ static inline int bget_utf8_repl(struct fastbuf *b, uint repl)
      return bget_utf8_slow(b, repl);
  }
  
+static inline int bget_utf8_full_repl(struct fastbuf *b, uint repl)
+{
+  uint u;
+  if (bavailr(b) >= 4)
+    {
+      b->bptr = utf8_full_get_repl(b->bptr, &u, repl);
+      return u;
+    }
+  else
+    return bget_utf8_full_slow(b, repl);
+}
+
  static inline int bget_utf8_32_repl(struct fastbuf *b, uint repl)
  {
    uint u;
@@ -75,6 +90,14 @@ static inline void bput_utf8(struct fastbuf *b, uint u) /** Write a single utf8
      bput_utf8_slow(b, u);
  }
  
+static inline void bput_utf8_full(struct fastbuf *b, uint u) /** Write a single utf8 character from range [0, 0x10ffff]. **/
+{
+  if (bavailw(b) >= 4)
+    b->bptr = utf8_full_put(b->bptr, u);
+  else
+    bput_utf8_full_slow(b, u);
+}
+
  static inline void bput_utf8_32(struct fastbuf *b, uint u) /** Write a single utf8 character (from the whole unicode range). **/
  {
    if (bavailw(b) >= 6)
diff --git a/ucw/unicode-gen.h b/ucw/unicode-gen.h

new file mode 100644 (file)

index 0000000..6e10b8a
--- /dev/null
+++ b/ucw/unicode-gen.h
@@ -0,0 +1,172 @@
+#if defined(UNI_WANT_UTF8_32)
+#define UNI_MAX_UTF8_BYTES 6
+#elif defined(UNI_WANT_UTF8_FULL)
+#define UNI_MAX_UTF8_BYTES 4
+#else
+#define UNI_MAX_UTF8_BYTES 3
+#endif
+
+/* Writing UTF-8 */
+
+#ifdef UNI_WANT_PUT_UTF8
+
+#define UNI_PUT_NEXT(_c) do { byte c = (_c); UNI_GIVE_PUTC; } while (0)
+
+{
+  if (u < 0x80)
+    UNI_PUT_NEXT(u);
+  else if (u < 0x800)
+    {
+      UNI_PUT_NEXT(0xc0 | (u >> 6));
+      goto put1;
+    }
+  else if (u < (1<<16))
+    {
+      UNI_PUT_NEXT(0xe0 | (u >> 12));
+#if UNI_MAX_UTF8_BYTES > 3
+      goto put2;
+    }
+#ifdef UNI_WANT_UTF8_FULL
+  else if (u <= 0x10ffff)
+#else
+  else if (u < (1<<21))
+#endif
+    {
+      UNI_PUT_NEXT(0xf0 | (u >> 18));
+#if UNI_MAX_UTF8_BYTES > 4
+      goto put3;
+    }
+  else if (u < (1<<26))
+    {
+      UNI_PUT_NEXT(0xf8 | (u >> 24));
+      goto put4;
+    }
+  else if (u < (1U<<31))
+    {
+      UNI_PUT_NEXT(0xfc | (u >> 30));
+      UNI_PUT_NEXT(0x80 | ((u >> 24) & 0x3f));
+put4:
+      UNI_PUT_NEXT(0x80 | ((u >> 18) & 0x3f));
+put3:
+#endif
+      UNI_PUT_NEXT(0x80 | ((u >> 12) & 0x3f));
+put2:
+#endif
+      UNI_PUT_NEXT(0x80 | ((u >> 6) & 0x3f));
+put1:
+      UNI_PUT_NEXT(0x80 | (u & 0x3f));
+    }
+  else
+    ASSERT(0);
+}
+
+#endif
+
+/* Reading UTF-8 */
+
+#ifdef UNI_WANT_GET_UTF8
+
+#define UNI_GET_NEXT \
+   do { \
+     UNI_GIVE_PEEKC; \
+     if (unlikely((c & 0xc0) != 0x80)) goto bad; \
+     u = (u << 6) | (c & 0x3f); \
+     UNI_GIVE_SKIPC; \
+  } while (0)
+
+{
+  byte c;
+  uint u, limit;
+#ifdef UNI_GIVE_FIRST_GETC
+  UNI_GIVE_FIRST_GETC;
+#else
+  UNI_GIVE_PEEKC;
+  UNI_GIVE_SKIPC;
+#endif
+  u = c;
+  if (u < 0x80)
+    ;
+  else if (unlikely(u < 0xc0))
+    goto bad;
+  else if (u < 0xe0)
+    {
+      u &= 0x1f;
+      limit = 0x80;
+      goto get1;
+    }
+  else if (u < 0xf0)
+    {
+      u &= 0x0f;
+      limit = 0x800;
+#if UNI_MAX_UTF8_BYTES > 3
+      goto get2;
+    }
+  else if (u < 0xf8)
+    {
+      u &= 0x07;
+      limit = 1 << 16;
+#if UNI_MAX_UTF8_BYTES > 4
+      goto get3;
+    }
+  else if (u < 0xfc)
+    {
+      u &= 0x03;
+      limit = 1 << 21;
+      goto get4;
+    }
+  else if (u < 0xfe)
+    {
+      u &= 0x01;
+      limit = 1 << 26;
+
+      UNI_GET_NEXT;
+get4:
+      UNI_GET_NEXT;
+get3:
+#endif
+      UNI_GET_NEXT;
+get2:
+#endif
+      UNI_GET_NEXT;
+get1:
+      UNI_GET_NEXT;
+
+      if (unlikely(u < limit))
+       goto bad;
+#ifdef UNI_WANT_UTF8_FULL
+      if (unlikely(u > 0x10ffff))
+       goto bad;
+#endif
+    }
+  else
+    {
+bad:
+#ifdef UNI_GIVE_BAD
+      UNI_GIVE_BAD;
+#else
+      u = UNI_REPLACEMENT;
+#endif
+    }
+
+  UNI_GIVE_OK;
+}
+
+#endif
+
+#undef UNI_WANT_PUT_UTF8
+#undef UNI_WANT_GET_UTF8
+
+#undef UNI_WANT_UTF8_32
+#undef UNI_WANT_UTF8_FULL
+
+#undef UNI_MAX_UTF8_BYTES
+
+#undef UNI_GIVE_PUTC
+#undef UNI_GIVE_FIRST_GETC
+#undef UNI_GIVE_PEEKC
+#undef UNI_GIVE_SKIPC
+#undef UNI_GIVE_OK
+#undef UNI_GIVE_BAD
+
+#undef UNI_PUT_NEXT
+#undef UNI_GET_NEXT
diff --git a/ucw/unicode.h b/ucw/unicode.h

index 4ec1c6b2ca8aa010733f0a09d46824ec2ae10f25..b35531cd239cb409034e9a631d8844b0554b1084 100644 (file)
--- a/ucw/unicode.h
+++ b/ucw/unicode.h
@@ -29,20 +29,22 @@
   **/
  static inline byte *utf8_put(byte *p, uint u)
  {
-  if (u < 0x80)
-    *p++ = u;
-  else if (u < 0x800)
-    {
-      *p++ = 0xc0 | (u >> 6);
-      *p++ = 0x80 | (u & 0x3f);
-    }
-  else
-    {
-      ASSERT(u < 0x10000);
-      *p++ = 0xe0 | (u >> 12);
-      *p++ = 0x80 | ((u >> 6) & 0x3f);
-      *p++ = 0x80 | (u & 0x3f);
-    }
+  #define UNI_WANT_PUT_UTF8
+  #define UNI_GIVE_PUTC *p++ = c
+  #include <ucw/unicode-gen.h>
+  return p;
+}
+
+/**
+ * Encode a value from the range `[0, 0x10FFFF]`
+ * (full Unicode range); up to 4 bytes needed (RFC3629).
+ **/
+static inline byte *utf8_full_put(byte *p, uint u)
+{
+  #define UNI_WANT_PUT_UTF8
+  #define UNI_WANT_UTF8_FULL
+  #define UNI_GIVE_PUTC *p++ = c
+  #include <ucw/unicode-gen.h>
    return p;
  }
  
@@ -52,77 +54,40 @@ static inline byte *utf8_put(byte *p, uint u)
   **/
  static inline byte *utf8_32_put(byte *p, uint u)
  {
-  if (u < 0x80)
-    *p++ = u;
-  else if (u < 0x800)
-    {
-      *p++ = 0xc0 | (u >> 6);
-      goto put1;
-    }
-  else if (u < (1<<16))
-    {
-      *p++ = 0xe0 | (u >> 12);
-      goto put2;
-    }
-  else if (u < (1<<21))
-    {
-      *p++ = 0xf0 | (u >> 18);
-      goto put3;
-    }
-  else if (u < (1<<26))
-    {
-      *p++ = 0xf8 | (u >> 24);
-      goto put4;
-    }
-  else if (u < (1U<<31))
-    {
-      *p++ = 0xfc | (u >> 30);
-      *p++ = 0x80 | ((u >> 24) & 0x3f);
-put4: *p++ = 0x80 | ((u >> 18) & 0x3f);
-put3: *p++ = 0x80 | ((u >> 12) & 0x3f);
-put2: *p++ = 0x80 | ((u >> 6) & 0x3f);
-put1: *p++ = 0x80 | (u & 0x3f);
-    }
-  else
-    ASSERT(0);
+  #define UNI_WANT_PUT_UTF8
+  #define UNI_WANT_UTF8_32
+  #define UNI_GIVE_PUTC *p++ = c
+  #include <ucw/unicode-gen.h>
    return p;
  }
  
-#define UTF8_GET_NEXT if (unlikely((*p & 0xc0) != 0x80)) goto bad; u = (u << 6) | (*p++ & 0x3f)
-#define UTF8_CHECK_RANGE(r) if (unlikely(u < r)) goto bad
-
  /**
   * Decode a value from the range `[0, 0xFFFF]` (basic multilingual plane)
   * or return @repl if the encoding has been corrupted.
   **/
  static inline byte *utf8_get_repl(const byte *p, uint *uu, uint repl)
  {
-  uint u = *p++;
-  if (u < 0x80)
-    ;
-  else if (unlikely(u < 0xc0))
-    {
-      /* Incorrect byte sequence */
-    bad:
-      u = repl;
-    }
-  else if (u < 0xe0)
-    {
-      u &= 0x1f;
-      UTF8_GET_NEXT;
-      UTF8_CHECK_RANGE(0x80);
-    }
-  else if (likely(u < 0xf0))
-    {
-      u &= 0x0f;
-      UTF8_GET_NEXT;
-      UTF8_GET_NEXT;
-      UTF8_CHECK_RANGE(0x800);
-    }
-  else
-    goto bad;
-  *uu = u;
-  return (byte *)p;
+  #define UNI_WANT_GET_UTF8
+  #define UNI_GIVE_PEEKC c = *p
+  #define UNI_GIVE_SKIPC p++
+  #define UNI_GIVE_OK { *uu = u; return (byte *)p; }
+  #define UNI_GIVE_BAD u = repl
+  #include <ucw/unicode-gen.h>
+}
+
+/**
+ * Decode a value from the range `[0, 0x10FFFF]` (full Unicode range)
+ * or return @repl if the encoding has been corrupted.
+ **/
+static inline byte *utf8_full_get_repl(const byte *p, uint *uu, uint repl)
+{
+  #define UNI_WANT_GET_UTF8
+  #define UNI_WANT_UTF8_FULL
+  #define UNI_GIVE_PEEKC c = *p
+  #define UNI_GIVE_SKIPC p++
+  #define UNI_GIVE_OK { *uu = u; return (byte *)p; }
+  #define UNI_GIVE_BAD u = repl
+  #include <ucw/unicode-gen.h>
  }
  
  /**
@@ -131,57 +96,13 @@ static inline byte *utf8_get_repl(const byte *p, uint *uu, uint repl)
   **/
  static inline byte *utf8_32_get_repl(const byte *p, uint *uu, uint repl)
  {
-  uint u = *p++;
-  uint limit;
-  if (u < 0x80)
-    ;
-  else if (unlikely(u < 0xc0))
-    goto bad;
-  else if (u < 0xe0)
-    {
-      u &= 0x1f;
-      limit = 0x80;
-      goto get1;
-    }
-  else if (u < 0xf0)
-    {
-      u &= 0x0f;
-      limit = 0x800;
-      goto get2;
-    }
-  else if (u < 0xf8)
-    {
-      u &= 0x07;
-      limit = 1 << 16;
-      goto get3;
-    }
-  else if (u < 0xfc)
-    {
-      u &= 0x03;
-      limit = 1 << 21;
-      goto get4;
-    }
-  else if (u < 0xfe)
-    {
-      u &= 0x01;
-      limit = 1 << 26;
-      UTF8_GET_NEXT;
-get4: UTF8_GET_NEXT;
-get3: UTF8_GET_NEXT;
-get2: UTF8_GET_NEXT;
-get1: UTF8_GET_NEXT;
-      if (unlikely(u < limit))
-       goto bad;
-    }
-  else
-    goto bad;
-  *uu = u;
-  return (byte *)p;
-
-bad:
-  /* Incorrect byte sequence */
-  *uu = repl;
-  return (byte *)p;
+  #define UNI_WANT_GET_UTF8
+  #define UNI_WANT_UTF8_32
+  #define UNI_GIVE_PEEKC c = *p
+  #define UNI_GIVE_SKIPC p++
+  #define UNI_GIVE_OK { *uu = u; return (byte *)p; }
+  #define UNI_GIVE_BAD u = repl
+  #include <ucw/unicode-gen.h>
  }
  
  /**
@@ -193,6 +114,15 @@ static inline byte *utf8_get(const byte *p, uint *uu)
    return utf8_get_repl(p, uu, UNI_REPLACEMENT);
  }
  
+/**
+ * Decode a value from the range `[0, 0x10FFFF]` (full Unicode range)
+ * or return `UNI_REPLACEMENT` if the encoding has been corrupted.
+ **/
+static inline byte *utf8_full_get(const byte *p, uint *uu)
+{
+  return utf8_full_get_repl(p, uu, UNI_REPLACEMENT);
+}
+
  /**
   * Decode a value from the range `[0, 0x7FFFFFFF]`
   * or return `UNI_REPLACEMENT` if the encoding has been corrupted.
author	Pavel Charvat <pchar@ucw.cz>
	Fri, 29 Dec 2017 07:40:24 +0000 (08:40 +0100)
committer	Pavel Charvat <pchar@ucw.cz>
	Fri, 29 Dec 2017 07:40:24 +0000 (08:40 +0100)
ucw/ff-unicode.c		patch \| blob \| history
ucw/ff-unicode.h		patch \| blob \| history
ucw/unicode-gen.h	[new file with mode: 0644]	patch \| blob
ucw/unicode.h		patch \| blob \| history