From 542dedf4edb4302c008004d751a9bb055dfaee3d Mon Sep 17 00:00:00 2001
From: Robert Spalek <robert@ucw.cz>
Date: Fri, 20 Aug 2004 09:34:02 +0000
Subject: [PATCH] - added {get,put}_utf8_32() for all full 6-byte codes - fixed
 UTF8_SKIP_BWD(); it is never used - upgraded utf8_space() and
 utf8_encoding_len()

---
 lib/ff-utf8.c |  94 +++++++++++++++++++++++++++++++++++++++++++++
 lib/ff-utf8.h |  29 +++++++++++++-
 lib/unicode.h | 104 ++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 222 insertions(+), 5 deletions(-)

diff --git a/lib/ff-utf8.c b/lib/ff-utf8.c
index f55719b1..b30cb581 100644
--- a/lib/ff-utf8.c
+++ b/lib/ff-utf8.c
@@ -2,6 +2,7 @@
  *	Sherlock Library: Reading and writing of UTF-8 on Fastbuf Streams
  *
  *	(c) 2001--2004 Martin Mares <mj@ucw.cz>
+ *	(c) 2004 Robert Spalek <robert@ucw.cz>
  *
  *	This software may be freely distributed and used according to the terms
  *	of the GNU Lesser General Public License.
@@ -53,6 +54,62 @@ bget_utf8_slow(struct fastbuf *b)
   return UNI_REPLACEMENT;
 }
 
+int
+bget_utf8_32_slow(struct fastbuf *b)
+{
+  int c = bgetc(b);
+  int code;
+  int nr;
+
+  if (c < 0x80)				/* Includes EOF */
+    return c;
+  if (c < 0xc0)				/* Incorrect combination */
+    return UNI_REPLACEMENT;
+  if (c < 0xe0)
+    {
+      code = c & 0x1f;
+      nr = 1;
+    }
+  else if (c < 0xf0)
+    {
+      code = c & 0x0f;
+      nr = 2;
+    }
+  else if (c < 0xf8)
+    {
+      code = c & 0x07;
+      nr = 3;
+    }
+  else if (c < 0xfc)
+    {
+      code = c & 0x03;
+      nr = 4;
+    }
+  else if (c < 0xfe)
+    {
+      code = c & 0x01;
+      nr = 5;
+    }
+  else					/* Too large, skip it */
+    {
+      while ((c = bgetc(b)) >= 0x80 && c < 0xc0)
+	;
+      goto wrong;
+    }
+  while (nr-- > 0)
+    {
+      if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
+	goto wrong;
+      code = (code << 6) | (c & 0x3f);
+    }
+  return code;
+
+ wrong:
+  if (c >= 0)
+    bungetc(b);
+  return UNI_REPLACEMENT;
+}
+
 void
 bput_utf8_slow(struct fastbuf *b, uns u)
 {
@@ -71,3 +128,40 @@ bput_utf8_slow(struct fastbuf *b, uns u)
       bputc(b, 0x80 | (u & 0x3f));
     }
 }
+
+void
+bput_utf8_32_slow(struct fastbuf *b, uns u)
+{
+  ASSERT(u < (1U<<31));
+  if (u < 0x80)
+    bputc(b, u);
+  else
+    {
+      if (u < 0x800)
+	bputc(b, 0xc0 | (u >> 6));
+      else
+	{
+	  if (u < (1<<16))
+	    bputc(b, 0xe0 | (u >> 12));
+	  else
+	    {
+	      if (u < (1<<21))
+		bputc(b, 0xf0 | (u >> 18));
+	      else
+		{
+		  if (u < (1<<26))
+		    bputc(b, 0xf8 | (u >> 24));
+		  else
+		    {
+		      bputc(b, 0xfc | (u >> 30));
+		      bputc(b, 0x80 | ((u >> 24) & 0x3f));
+		    }
+		  bputc(b, 0x80 | ((u >> 18) & 0x3f));
+		}
+	      bputc(b, 0x80 | ((u >> 12) & 0x3f));
+	    }
+	  bputc(b, 0x80 | ((u >> 6) & 0x3f));
+	}
+      bputc(b, 0x80 | (u & 0x3f));
+    }
+}
diff --git a/lib/ff-utf8.h b/lib/ff-utf8.h
index 752c7187..dd86ba65 100644
--- a/lib/ff-utf8.h
+++ b/lib/ff-utf8.h
@@ -2,6 +2,7 @@
  *	Sherlock Library: Reading and writing of UTF-8 on Fastbuf Streams
  *
  *	(c) 2001--2004 Martin Mares <mj@ucw.cz>
+ *	(c) 2004 Robert Spalek <robert@ucw.cz>
  *
  *	This software may be freely distributed and used according to the terms
  *	of the GNU Lesser General Public License.
@@ -14,14 +15,16 @@
 #include "lib/unicode.h"
 
 int bget_utf8_slow(struct fastbuf *b);
+int bget_utf8_32_slow(struct fastbuf *b);
 void bput_utf8_slow(struct fastbuf *b, uns u);
+void bput_utf8_32_slow(struct fastbuf *b, uns u);
 
 static inline int
 bget_utf8(struct fastbuf *b)
 {
   uns u;
 
-  if (bavailr(b) >= 5)
+  if (bavailr(b) >= 3)
     {
       GET_UTF8(b->bptr, u);
       return u;
@@ -40,4 +43,28 @@ bput_utf8(struct fastbuf *b, uns u)
     bput_utf8_slow(b, u);
 }
 
+static inline int
+bget_utf8_32(struct fastbuf *b)
+{
+  uns u;
+
+  if (bavailr(b) >= 6)
+    {
+      GET_UTF8_32(b->bptr, u);
+      return u;
+    }
+  else
+    return bget_utf8_32_slow(b);
+}
+
+static inline void
+bput_utf8_32(struct fastbuf *b, uns u)
+{
+  ASSERT(u < (1U<<31));
+  if (bavailw(b) >= 6)
+    PUT_UTF8_32(b->bptr, u);
+  else
+    bput_utf8_32_slow(b, u);
+}
+
 #endif
diff --git a/lib/unicode.h b/lib/unicode.h
index 199b3d70..6358e9aa 100644
--- a/lib/unicode.h
+++ b/lib/unicode.h
@@ -2,6 +2,7 @@
  *	Sherlock Library -- Unicode Characters
  *
  *	(c) 1997--2004 Martin Mares <mj@ucw.cz>
+ *	(c) 2004 Robert Spalek <robert@ucw.cz>
  *
  *	This software may be freely distributed and used according to the terms
  *	of the GNU Lesser General Public License.
@@ -30,6 +31,35 @@
     }					\
   } while(0)
 
+#define PUT_UTF8_32(p,u) do {		\
+  if (u < (1<<16))			\
+    PUT_UTF8(p,u);			\
+  else if (u < (1<<21))			\
+    {					\
+      *p++ = 0xf0 | (u >> 18);		\
+      *p++ = 0x80 | ((u >> 12) & 0x3f);	\
+      *p++ = 0x80 | ((u >> 6) & 0x3f);	\
+      *p++ = 0x80 | (u & 0x3f);		\
+    }					\
+  else if (u < (1<<26))			\
+    {					\
+      *p++ = 0xf8 | (u >> 24);		\
+      *p++ = 0x80 | ((u >> 18) & 0x3f);	\
+      *p++ = 0x80 | ((u >> 12) & 0x3f);	\
+      *p++ = 0x80 | ((u >> 6) & 0x3f);	\
+      *p++ = 0x80 | (u & 0x3f);		\
+    }					\
+  else if (u < (1U<<31))		\
+    {					\
+      *p++ = 0xfc | (u >> 30);		\
+      *p++ = 0x80 | ((u >> 24) & 0x3f);	\
+      *p++ = 0x80 | ((u >> 18) & 0x3f);	\
+      *p++ = 0x80 | ((u >> 12) & 0x3f);	\
+      *p++ = 0x80 | ((u >> 6) & 0x3f);	\
+      *p++ = 0x80 | (u & 0x3f);		\
+    }					\
+  } while(0)
+
 #define IS_UTF8(c) ((c) >= 0xc0)
 
 #define GET_UTF8_CHAR(p,u) do {		\
@@ -56,12 +86,66 @@
       }					\
   } while (0)				\
 
+#define GET_UTF8_32_CHAR(p,u) do {	\
+    if (*p < 0xf0)			\
+      GET_UTF8_CHAR(p,u);		\
+    else if (*p < 0xf8)			\
+      {					\
+	u = *p++ & 0x07;		\
+	if ((*p & 0xc0) == 0x80)       	\
+	  u = (u << 6) | (*p++ & 0x3f);	\
+	if ((*p & 0xc0) == 0x80)	\
+	  u = (u << 6) | (*p++ & 0x3f);	\
+	if ((*p & 0xc0) == 0x80)	\
+	  u = (u << 6) | (*p++ & 0x3f);	\
+      }					\
+    else if (*p < 0xfc)			\
+      {					\
+	u = *p++ & 0x03;		\
+	if ((*p & 0xc0) == 0x80)       	\
+	  u = (u << 6) | (*p++ & 0x3f);	\
+	if ((*p & 0xc0) == 0x80)       	\
+	  u = (u << 6) | (*p++ & 0x3f);	\
+	if ((*p & 0xc0) == 0x80)	\
+	  u = (u << 6) | (*p++ & 0x3f);	\
+	if ((*p & 0xc0) == 0x80)	\
+	  u = (u << 6) | (*p++ & 0x3f);	\
+      }					\
+    else if (*p < 0xfe)			\
+      {					\
+	u = *p++ & 0x01;		\
+	if ((*p & 0xc0) == 0x80)       	\
+	  u = (u << 6) | (*p++ & 0x3f);	\
+	if ((*p & 0xc0) == 0x80)       	\
+	  u = (u << 6) | (*p++ & 0x3f);	\
+	if ((*p & 0xc0) == 0x80)       	\
+	  u = (u << 6) | (*p++ & 0x3f);	\
+	if ((*p & 0xc0) == 0x80)	\
+	  u = (u << 6) | (*p++ & 0x3f);	\
+	if ((*p & 0xc0) == 0x80)	\
+	  u = (u << 6) | (*p++ & 0x3f);	\
+      }					\
+    else				\
+      {	/* Too large, use replacement char */	\
+	p++;				\
+	while ((*p & 0xc0) == 0x80)	\
+	  p++;				\
+	u = UNI_REPLACEMENT;		\
+      }					\
+  } while (0)				\
+
 #define GET_UTF8(p,u)			\
     if (IS_UTF8(*p))			\
       GET_UTF8_CHAR(p,u);		\
     else				\
       u = *p++
 
+#define GET_UTF8_32(p,u)		\
+    if (IS_UTF8(*p))			\
+      GET_UTF8_32_CHAR(p,u);		\
+    else				\
+      u = *p++
+
 #define UTF8_SKIP(p) do {				\
     uns c = *p++;					\
     if (c >= 0xc0)					\
@@ -69,7 +153,7 @@
         p++, c <<= 1;					\
   } while (0)
 
-#define UTF8_SKIP_BWD(p) while ((--*(p) & 0xc0) == 0x80)
+#define UTF8_SKIP_BWD(p) while ((*--(p) & 0xc0) == 0x80)
 
 static inline uns
 utf8_space(uns u)
@@ -78,7 +162,13 @@ utf8_space(uns u)
     return 1;
   if (u < 0x800)
     return 2;
-  return 3;
+  if (u < (1<<16))
+    return 3;
+  if (u < (1<<21))
+    return 4;
+  if (u < (1<<26))
+    return 5;
+  return 6;
 }
 
 static inline uns
@@ -86,10 +176,16 @@ utf8_encoding_len(uns c)
 {
   if (c < 0x80)
     return 1;
-  ASSERT(c >= 0xc0 && c < 0xf0);
+  ASSERT(c >= 0xc0 && c < 0xfe);
   if (c < 0xe0)
     return 2;
-  return 3;
+  if (c < 0xf0)
+    return 3;
+  if (c < 0xf8)
+    return 4;
+  if (c < 0xfc)
+    return 5;
+  return 6;
 }
 
 /* unicode-utf8.c */
-- 
2.39.5