- added {get,put}_utf8_32() covering the full range of UTF-8 codes (sequences of up to 6 bytes)

author Robert Spalek <robert@ucw.cz>

Fri, 20 Aug 2004 09:34:02 +0000 (09:34 +0000)

committer Robert Spalek <robert@ucw.cz>

Fri, 20 Aug 2004 09:34:02 +0000 (09:34 +0000)
author Robert Spalek <robert@ucw.cz>
Fri, 20 Aug 2004 09:34:02 +0000 (09:34 +0000)
committer Robert Spalek <robert@ucw.cz>
Fri, 20 Aug 2004 09:34:02 +0000 (09:34 +0000)
diff --git a/lib/ff-utf8.c b/lib/ff-utf8.c

index f55719b1ada99c02811bee8d2da1141912f9d89b..b30cb581790048bc07ccefbb1d3206bdb6d7983f 100644 (file)
--- a/lib/ff-utf8.c
+++ b/lib/ff-utf8.c
@@ -2,6 +2,7 @@
   *     Sherlock Library: Reading and writing of UTF-8 on Fastbuf Streams
   *
   *     (c) 2001--2004 Martin Mares <mj@ucw.cz>
+ *     (c) 2004 Robert Spalek <robert@ucw.cz>
   *
   *     This software may be freely distributed and used according to the terms
   *     of the GNU Lesser General Public License.
@@ -53,6 +54,62 @@ bget_utf8_slow(struct fastbuf *b)
    return UNI_REPLACEMENT;
  }
  
+/*
+ * Byte-by-byte decoder for the extended UTF-8 encoding (sequences of up
+ * to 6 bytes), reading its input with bgetc().
+ *
+ * Returns the decoded value.  A first byte below 0x80 is returned
+ * unchanged; per the original author's note this case also covers the
+ * EOF value of bgetc().  A malformed sequence yields UNI_REPLACEMENT and
+ * the byte that broke the sequence is pushed back with bungetc(), unless
+ * that byte was negative.
+ */
+int
+bget_utf8_32_slow(struct fastbuf *b)
+{
+  int first = bgetc(b);
+  int cur = first;
+  int value = 0;
+  int left;
+
+  if (first < 0x80)
+    return first;
+  if (first < 0xc0)                    /* continuation byte without a lead byte */
+    return UNI_REPLACEMENT;
+
+  if (first >= 0xfe)
+    {
+      /* 0xfe and 0xff cannot start a sequence: drop the continuation bytes after them */
+      do
+        cur = bgetc(b);
+      while (cur >= 0x80 && cur < 0xc0);
+    }
+  else
+    {
+      /* The lead byte fixes its own payload bits and the number of continuation bytes */
+      if (first < 0xe0)
+        {
+          value = first & 0x1f;
+          left = 1;
+        }
+      else if (first < 0xf0)
+        {
+          value = first & 0x0f;
+          left = 2;
+        }
+      else if (first < 0xf8)
+        {
+          value = first & 0x07;
+          left = 3;
+        }
+      else if (first < 0xfc)
+        {
+          value = first & 0x03;
+          left = 4;
+        }
+      else
+        {
+          value = first & 0x01;
+          left = 5;
+        }
+
+      /* Each continuation byte (10xxxxxx) contributes 6 more bits */
+      for (; left > 0; left--)
+        {
+          cur = bgetc(b);
+          if (cur < 0x80 || cur >= 0xc0)
+            break;
+          value = (value << 6) | (cur & 0x3f);
+        }
+      if (left == 0)
+        return value;
+    }
+
+  /* Malformed input: give back the offending byte, if there was one */
+  if (cur >= 0)
+    bungetc(b);
+  return UNI_REPLACEMENT;
+}
+
  void
  bput_utf8_slow(struct fastbuf *b, uns u)
  {
@@ -71,3 +128,40 @@ bput_utf8_slow(struct fastbuf *b, uns u)
        bputc(b, 0x80 | (u & 0x3f));
      }
  }
+
+/*
+ * Write u to the fastbuf as extended UTF-8 (1 to 6 bytes), one bputc()
+ * per output byte.  u must be below 2^31 (checked by ASSERT).
+ */
+void
+bput_utf8_32_slow(struct fastbuf *b, uns u)
+{
+  uns lead;            /* marker bits of the first byte */
+  uns shift;           /* position of the bits carried by the first byte */
+
+  ASSERT(u < (1U<<31));
+  if (u < 0x80)
+    {
+      bputc(b, u);
+      return;
+    }
+
+  if (u < 0x800)
+    {
+      lead = 0xc0;
+      shift = 6;
+    }
+  else if (u < (1<<16))
+    {
+      lead = 0xe0;
+      shift = 12;
+    }
+  else if (u < (1<<21))
+    {
+      lead = 0xf0;
+      shift = 18;
+    }
+  else if (u < (1<<26))
+    {
+      lead = 0xf8;
+      shift = 24;
+    }
+  else
+    {
+      lead = 0xfc;
+      shift = 30;
+    }
+
+  bputc(b, lead | (u >> shift));
+  /* Remaining bits go out in 6-bit groups, most significant group first */
+  while (shift)
+    {
+      shift -= 6;
+      bputc(b, 0x80 | ((u >> shift) & 0x3f));
+    }
+}
diff --git a/lib/ff-utf8.h b/lib/ff-utf8.h

index 752c7187aea9096f02b6e1945c98f0d94108279e..dd86ba6504227aabc432c908ff8e9d30615597da 100644 (file)
--- a/lib/ff-utf8.h
+++ b/lib/ff-utf8.h
@@ -2,6 +2,7 @@
   *     Sherlock Library: Reading and writing of UTF-8 on Fastbuf Streams
   *
   *     (c) 2001--2004 Martin Mares <mj@ucw.cz>
+ *     (c) 2004 Robert Spalek <robert@ucw.cz>
   *
   *     This software may be freely distributed and used according to the terms
   *     of the GNU Lesser General Public License.
@@ -14,14 +15,16 @@
  #include "lib/unicode.h"
  
  int bget_utf8_slow(struct fastbuf *b);
+int bget_utf8_32_slow(struct fastbuf *b);
  void bput_utf8_slow(struct fastbuf *b, uns u);
+void bput_utf8_32_slow(struct fastbuf *b, uns u);
  
  static inline int
  bget_utf8(struct fastbuf *b)
  {
    uns u;
  
-  if (bavailr(b) >= 5)
+  if (bavailr(b) >= 3)
      {
        GET_UTF8(b->bptr, u);
        return u;
@@ -40,4 +43,28 @@ bput_utf8(struct fastbuf *b, uns u)
      bput_utf8_slow(b, u);
  }
  
+/*
+ * Read one character in the extended (up to 6-byte) UTF-8 encoding.
+ *
+ * The inline decoder is used only when at least 6 bytes are available
+ * and the lead byte is below 0xfe.  For the invalid lead bytes 0xfe and
+ * 0xff, GET_UTF8_32 skips continuation bytes in a loop that has no idea
+ * how many bytes are available, so it could run past them; those lead
+ * bytes are therefore handled by bget_utf8_32_slow(), which treats them
+ * the same way but fetches every byte through bgetc().
+ *
+ * NOTE(review): assumes GET_UTF8_CHAR consumes at most 3 bytes for lead
+ * bytes below 0xf0 -- confirm against its definition.
+ */
+static inline int
+bget_utf8_32(struct fastbuf *b)
+{
+  uns u;
+
+  if (bavailr(b) >= 6 && *b->bptr < 0xfe)
+    {
+      GET_UTF8_32(b->bptr, u);
+      return u;
+    }
+  else
+    return bget_utf8_32_slow(b);
+}
+
+/*
+ * Write u (which must be below 2^31) as extended UTF-8.  Uses the inline
+ * PUT_UTF8_32 encoder when bavailw() reports room for the longest
+ * (6-byte) sequence, and bput_utf8_32_slow() otherwise.
+ */
+static inline void
+bput_utf8_32(struct fastbuf *b, uns u)
+{
+  ASSERT(u < (1U<<31));
+  if (bavailw(b) < 6)
+    {
+      bput_utf8_32_slow(b, u);
+      return;
+    }
+  PUT_UTF8_32(b->bptr, u);
+}
+
  #endif
diff --git a/lib/unicode.h b/lib/unicode.h

index 199b3d7042cd2e3e511cb4b8f0584750ae5c0c4e..6358e9aa729b5c843d1a60b1b87956b3b63802a3 100644 (file)
--- a/lib/unicode.h
+++ b/lib/unicode.h
@@ -2,6 +2,7 @@
   *     Sherlock Library -- Unicode Characters
   *
   *     (c) 1997--2004 Martin Mares <mj@ucw.cz>
+ *     (c) 2004 Robert Spalek <robert@ucw.cz>
   *
   *     This software may be freely distributed and used according to the terms
   *     of the GNU Lesser General Public License.
@@ -30,6 +31,35 @@
      }                                  \
    } while(0)
  
+#define PUT_UTF8_32(p,u) do {          /* Store u at p as UTF-8 (up to 6 bytes), advancing p past the bytes written */ \
+  if (u < (1<<16))                     /* fits in 16 bits: left to PUT_UTF8 */ \
+    PUT_UTF8(p,u);                     \
+  else if (u < (1<<21))                        /* fits in 21 bits: 4 bytes, lead byte 11110xxx */ \
+    {                                  \
+      *p++ = 0xf0 | (u >> 18);         \
+      *p++ = 0x80 | ((u >> 12) & 0x3f);        \
+      *p++ = 0x80 | ((u >> 6) & 0x3f); \
+      *p++ = 0x80 | (u & 0x3f);                \
+    }                                  \
+  else if (u < (1<<26))                        /* fits in 26 bits: 5 bytes, lead byte 111110xx */ \
+    {                                  \
+      *p++ = 0xf8 | (u >> 24);         \
+      *p++ = 0x80 | ((u >> 18) & 0x3f);        \
+      *p++ = 0x80 | ((u >> 12) & 0x3f);        \
+      *p++ = 0x80 | ((u >> 6) & 0x3f); \
+      *p++ = 0x80 | (u & 0x3f);                \
+    }                                  \
+  else if (u < (1U<<31))               /* fits in 31 bits: 6 bytes, lead byte 1111110x; a larger u writes nothing */ \
+    {                                  \
+      *p++ = 0xfc | (u >> 30);         \
+      *p++ = 0x80 | ((u >> 24) & 0x3f);        \
+      *p++ = 0x80 | ((u >> 18) & 0x3f);        \
+      *p++ = 0x80 | ((u >> 12) & 0x3f);        \
+      *p++ = 0x80 | ((u >> 6) & 0x3f); \
+      *p++ = 0x80 | (u & 0x3f);                \
+    }                                  \
+  } while(0)   /* p and u are expanded several times and unparenthesized: pass simple, side-effect-free arguments */
+
  #define IS_UTF8(c) ((c) >= 0xc0)
  
  #define GET_UTF8_CHAR(p,u) do {                \
@@ -56,12 +86,66 @@
        }                                        \
    } while (0)                          \
  
+#define GET_UTF8_32_CHAR(p,u) do {     /* Decode the multi-byte sequence at p into u, advancing p (p is expanded many times) */ \
+    if (*p < 0xf0)                     /* lead byte below 0xf0: left to GET_UTF8_CHAR */ \
+      GET_UTF8_CHAR(p,u);              \
+    else if (*p < 0xf8)                        /* 11110xxx: 3 payload bits, up to 3 continuation bytes */ \
+      {                                        \
+       u = *p++ & 0x07;                \
+       if ((*p & 0xc0) == 0x80)        /* a byte is consumed only if it is a continuation byte (10xxxxxx); */ \
+         u = (u << 6) | (*p++ & 0x3f); /* a missing one is silently skipped, no replacement char is produced */ \
+       if ((*p & 0xc0) == 0x80)        \
+         u = (u << 6) | (*p++ & 0x3f); \
+       if ((*p & 0xc0) == 0x80)        \
+         u = (u << 6) | (*p++ & 0x3f); \
+      }                                        \
+    else if (*p < 0xfc)                        /* 111110xx: 2 payload bits, up to 4 continuation bytes */ \
+      {                                        \
+       u = *p++ & 0x03;                \
+       if ((*p & 0xc0) == 0x80)        \
+         u = (u << 6) | (*p++ & 0x3f); \
+       if ((*p & 0xc0) == 0x80)        \
+         u = (u << 6) | (*p++ & 0x3f); \
+       if ((*p & 0xc0) == 0x80)        \
+         u = (u << 6) | (*p++ & 0x3f); \
+       if ((*p & 0xc0) == 0x80)        \
+         u = (u << 6) | (*p++ & 0x3f); \
+      }                                        \
+    else if (*p < 0xfe)                        /* 1111110x: 1 payload bit, up to 5 continuation bytes */ \
+      {                                        \
+       u = *p++ & 0x01;                \
+       if ((*p & 0xc0) == 0x80)        \
+         u = (u << 6) | (*p++ & 0x3f); \
+       if ((*p & 0xc0) == 0x80)        \
+         u = (u << 6) | (*p++ & 0x3f); \
+       if ((*p & 0xc0) == 0x80)        \
+         u = (u << 6) | (*p++ & 0x3f); \
+       if ((*p & 0xc0) == 0x80)        \
+         u = (u << 6) | (*p++ & 0x3f); \
+       if ((*p & 0xc0) == 0x80)        \
+         u = (u << 6) | (*p++ & 0x3f); \
+      }                                        \
+    else                               \
+      {        /* Too large, use replacement char */   \
+       p++;                            \
+       while ((*p & 0xc0) == 0x80)     /* skips every following continuation byte; the loop has no length limit */ \
+         p++;                          \
+       u = UNI_REPLACEMENT;            \
+      }                                        \
+  } while (0)                          /* the trailing backslash joins the next line to the macro, so that line must stay empty */ \
+
  #define GET_UTF8(p,u)                  \
      if (IS_UTF8(*p))                   \
        GET_UTF8_CHAR(p,u);              \
      else                               \
        u = *p++
  
+#define GET_UTF8_32(p,u)               /* Read one character at p into u and advance p past it */ \
+    if (IS_UTF8(*p))                   /* byte >= 0xc0: decode as a multi-byte sequence */ \
+      GET_UTF8_32_CHAR(p,u);           \
+    else                               /* any other byte, stray continuation bytes included, is taken as is */ \
+      u = *p++   /* bare if/else with no trailing semicolon: the caller supplies it */
+
  #define UTF8_SKIP(p) do {                              \
      uns c = *p++;                                      \
      if (c >= 0xc0)                                     \
@@ -69,7 +153,7 @@
          p++, c <<= 1;                                  \
    } while (0)
  
-#define UTF8_SKIP_BWD(p) while ((--*(p) & 0xc0) == 0x80)
+#define UTF8_SKIP_BWD(p) while ((*--(p) & 0xc0) == 0x80)      /* Step p backwards until it points at a byte that is not a continuation byte (10xxxxxx); the statement following the macro is the loop body, normally just `;` */
  
  static inline uns
  utf8_space(uns u)
@@ -78,7 +162,13 @@ utf8_space(uns u)
      return 1;
    if (u < 0x800)
      return 2;
-  return 3;
+  if (u < (1<<16))
+    return 3;
+  if (u < (1<<21))
+    return 4;
+  if (u < (1<<26))
+    return 5;
+  return 6;
  }
  
  static inline uns
@@ -86,10 +176,16 @@ utf8_encoding_len(uns c)
  {
    if (c < 0x80)
      return 1;
-  ASSERT(c >= 0xc0 && c < 0xf0);
+  ASSERT(c >= 0xc0 && c < 0xfe);
    if (c < 0xe0)
      return 2;
-  return 3;
+  if (c < 0xf0)
+    return 3;
+  if (c < 0xf8)
+    return 4;
+  if (c < 0xfc)
+    return 5;
+  return 6;
  }
  
  /* unicode-utf8.c */
author	Robert Spalek <robert@ucw.cz>
	Fri, 20 Aug 2004 09:34:02 +0000 (09:34 +0000)
committer	Robert Spalek <robert@ucw.cz>
	Fri, 20 Aug 2004 09:34:02 +0000 (09:34 +0000)
lib/ff-utf8.c		patch \| blob \| history
lib/ff-utf8.h		patch \| blob \| history
lib/unicode.h		patch \| blob \| history