* Sherlock Library: Reading and writing of UTF-8 on Fastbuf Streams
*
* (c) 2001--2004 Martin Mares <mj@ucw.cz>
+ * (c) 2004 Robert Spalek <robert@ucw.cz>
*
* This software may be freely distributed and used according to the terms
* of the GNU Lesser General Public License.
return UNI_REPLACEMENT;
}
+/*
+ * Read one character in the extended (up to 6-byte) UTF-8 encoding from the
+ * fastbuf, fetching it byte by byte with bgetc().
+ *
+ * Returns the decoded value; the result of bgetc() unchanged when it is
+ * below 0x80 (ASCII, and also EOF); or UNI_REPLACEMENT when the sequence is
+ * malformed.  In the malformed case the byte that ended the sequence is
+ * pushed back with bungetc(), unless it is negative.
+ */
+int
+bget_utf8_32_slow(struct fastbuf *b)
+{
+  int lead = bgetc(b);
+  int value, tail, next;
+
+  if (lead < 0x80)		/* Includes EOF */
+    return lead;
+  if (lead < 0xc0)		/* Continuation byte without a lead byte */
+    return UNI_REPLACEMENT;
+
+  if (lead >= 0xfe)		/* Not a usable lead byte: swallow its continuation bytes */
+    {
+      do
+        next = bgetc(b);
+      while (next >= 0x80 && next < 0xc0);
+      if (next >= 0)
+        bungetc(b);
+      return UNI_REPLACEMENT;
+    }
+
+  /* The lead byte fixes the number of continuation bytes and carries the top bits */
+  if (lead < 0xe0)
+    {
+      value = lead & 0x1f;
+      tail = 1;
+    }
+  else if (lead < 0xf0)
+    {
+      value = lead & 0x0f;
+      tail = 2;
+    }
+  else if (lead < 0xf8)
+    {
+      value = lead & 0x07;
+      tail = 3;
+    }
+  else if (lead < 0xfc)
+    {
+      value = lead & 0x03;
+      tail = 4;
+    }
+  else
+    {
+      value = lead & 0x01;
+      tail = 5;
+    }
+
+  for (; tail > 0; tail--)
+    {
+      next = bgetc(b);
+      if (next < 0x80 || next >= 0xc0)
+        {
+          /* Sequence cut short: give the offending byte back unless it is negative */
+          if (next >= 0)
+            bungetc(b);
+          return UNI_REPLACEMENT;
+        }
+      value = (value << 6) | (next & 0x3f);
+    }
+  return value;
+}
+
void
bput_utf8_slow(struct fastbuf *b, uns u)
{
bputc(b, 0x80 | (u & 0x3f));
}
}
+
+/*
+ * Write the value u to the fastbuf in the extended UTF-8 encoding
+ * (1 to 6 bytes), emitting every byte with bputc().
+ * u must be below 2^31 (checked by ASSERT).
+ */
+void
+bput_utf8_32_slow(struct fastbuf *b, uns u)
+{
+  uns lead;
+  int tail;
+
+  ASSERT(u < (1U<<31));
+  if (u < 0x80)
+    {
+      bputc(b, u);
+      return;
+    }
+
+  /* Choose the lead byte and the number of continuation bytes behind it */
+  if (u < 0x800)
+    {
+      lead = 0xc0 | (u >> 6);
+      tail = 1;
+    }
+  else if (u < (1<<16))
+    {
+      lead = 0xe0 | (u >> 12);
+      tail = 2;
+    }
+  else if (u < (1<<21))
+    {
+      lead = 0xf0 | (u >> 18);
+      tail = 3;
+    }
+  else if (u < (1<<26))
+    {
+      lead = 0xf8 | (u >> 24);
+      tail = 4;
+    }
+  else
+    {
+      lead = 0xfc | (u >> 30);
+      tail = 5;
+    }
+
+  bputc(b, lead);
+  /* Continuation bytes carry 6 bits each, most significant group first */
+  while (tail-- > 0)
+    bputc(b, 0x80 | ((u >> (6*tail)) & 0x3f));
+}
* Sherlock Library: Reading and writing of UTF-8 on Fastbuf Streams
*
* (c) 2001--2004 Martin Mares <mj@ucw.cz>
+ * (c) 2004 Robert Spalek <robert@ucw.cz>
*
* This software may be freely distributed and used according to the terms
* of the GNU Lesser General Public License.
#include "lib/unicode.h"
int bget_utf8_slow(struct fastbuf *b);
+int bget_utf8_32_slow(struct fastbuf *b);
void bput_utf8_slow(struct fastbuf *b, uns u);
+void bput_utf8_32_slow(struct fastbuf *b, uns u);
static inline int
bget_utf8(struct fastbuf *b)
{
uns u;
- if (bavailr(b) >= 5)
+ if (bavailr(b) >= 3)
{
GET_UTF8(b->bptr, u);
return u;
bput_utf8_slow(b, u);
}
+/*
+ * Read one extended UTF-8 character (up to 6 bytes) from the fastbuf.
+ *
+ * When bavailr() reports at least 6 bytes, the character is decoded in
+ * place by GET_UTF8_32(), which advances b->bptr; otherwise the byte-wise
+ * bget_utf8_32_slow() is used.
+ *
+ * Lead bytes 0xfe and 0xff also take the slow path: for them
+ * GET_UTF8_32_CHAR() skips continuation bytes in an unbounded loop, which
+ * the 6-byte check cannot cover, whereas the slow path fetches every byte
+ * through bgetc().  Both paths return UNI_REPLACEMENT for such input.
+ */
+static inline int
+bget_utf8_32(struct fastbuf *b)
+{
+  uns u;
+
+  if (bavailr(b) >= 6 && *b->bptr < 0xfe)
+    {
+      GET_UTF8_32(b->bptr, u);
+      return u;
+    }
+  else
+    return bget_utf8_32_slow(b);
+}
+
+/*
+ * Write the value u (below 2^31, checked by ASSERT) to the fastbuf in the
+ * extended UTF-8 encoding.  When bavailw() reports at least 6 bytes, the
+ * bytes are stored directly through b->bptr by PUT_UTF8_32(); otherwise
+ * bput_utf8_32_slow() does the work.
+ */
+static inline void
+bput_utf8_32(struct fastbuf *b, uns u)
+{
+  ASSERT(u < (1U<<31));
+  if (bavailw(b) < 6)
+    {
+      bput_utf8_32_slow(b, u);
+      return;
+    }
+  PUT_UTF8_32(b->bptr, u);
+}
+
#endif
* Sherlock Library -- Unicode Characters
*
* (c) 1997--2004 Martin Mares <mj@ucw.cz>
+ * (c) 2004 Robert Spalek <robert@ucw.cz>
*
* This software may be freely distributed and used according to the terms
* of the GNU Lesser General Public License.
} \
} while(0)
+/*
+ * Store the value u at p in the extended UTF-8 encoding and advance p past
+ * the bytes written.  Values below 2^16 are delegated to PUT_UTF8(); larger
+ * ones take 4, 5 or 6 bytes.  An unsigned u of 2^31 or more matches no
+ * branch and stores nothing.
+ *
+ * Both arguments are evaluated several times, so they must be free of side
+ * effects; p must be a modifiable pointer.  The arguments are parenthesized
+ * so that expression arguments keep their meaning next to >> and <.
+ */
+#define PUT_UTF8_32(p,u) do { \
+    if ((u) < (1<<16)) \
+      PUT_UTF8(p, (u)); \
+    else if ((u) < (1<<21)) \
+      { \
+        *(p)++ = 0xf0 | ((u) >> 18); \
+        *(p)++ = 0x80 | (((u) >> 12) & 0x3f); \
+        *(p)++ = 0x80 | (((u) >> 6) & 0x3f); \
+        *(p)++ = 0x80 | ((u) & 0x3f); \
+      } \
+    else if ((u) < (1<<26)) \
+      { \
+        *(p)++ = 0xf8 | ((u) >> 24); \
+        *(p)++ = 0x80 | (((u) >> 18) & 0x3f); \
+        *(p)++ = 0x80 | (((u) >> 12) & 0x3f); \
+        *(p)++ = 0x80 | (((u) >> 6) & 0x3f); \
+        *(p)++ = 0x80 | ((u) & 0x3f); \
+      } \
+    else if ((u) < (1U<<31)) \
+      { \
+        *(p)++ = 0xfc | ((u) >> 30); \
+        *(p)++ = 0x80 | (((u) >> 24) & 0x3f); \
+        *(p)++ = 0x80 | (((u) >> 18) & 0x3f); \
+        *(p)++ = 0x80 | (((u) >> 12) & 0x3f); \
+        *(p)++ = 0x80 | (((u) >> 6) & 0x3f); \
+        *(p)++ = 0x80 | ((u) & 0x3f); \
+      } \
+  } while(0)
+
#define IS_UTF8(c) ((c) >= 0xc0)
#define GET_UTF8_CHAR(p,u) do { \
} \
} while (0) \
+/*
+ * Decode the multi-byte character starting at p into u and advance p.
+ *
+ * Lead bytes below 0xf0 are delegated to GET_UTF8_CHAR().  For lead bytes
+ * of 4-, 5- and 6-byte sequences, up to 3, 4 or 5 continuation bytes
+ * (10xxxxxx) are folded in, 6 bits each.  A byte that is not a continuation
+ * byte is left unconsumed and u simply keeps the bits gathered so far; no
+ * replacement character is produced in that case.
+ *
+ * Lead bytes 0xfe and 0xff are skipped together with all continuation bytes
+ * that follow, and u is set to UNI_REPLACEMENT.  That skip is unbounded, so
+ * the data at p must eventually contain a non-continuation byte.
+ *
+ * p is evaluated many times and must be a modifiable pointer free of side
+ * effects; the comparisons with 0xf0..0xfe presumably expect it to point to
+ * unsigned bytes -- confirm against callers.
+ */
+#define GET_UTF8_32_CHAR(p,u) do { \
+    if (*(p) < 0xf0) \
+      GET_UTF8_CHAR(p,u); \
+    else if (*(p) < 0xf8) \
+      { \
+        (u) = *(p)++ & 0x07; \
+        if ((*(p) & 0xc0) == 0x80) \
+          (u) = ((u) << 6) | (*(p)++ & 0x3f); \
+        if ((*(p) & 0xc0) == 0x80) \
+          (u) = ((u) << 6) | (*(p)++ & 0x3f); \
+        if ((*(p) & 0xc0) == 0x80) \
+          (u) = ((u) << 6) | (*(p)++ & 0x3f); \
+      } \
+    else if (*(p) < 0xfc) \
+      { \
+        (u) = *(p)++ & 0x03; \
+        if ((*(p) & 0xc0) == 0x80) \
+          (u) = ((u) << 6) | (*(p)++ & 0x3f); \
+        if ((*(p) & 0xc0) == 0x80) \
+          (u) = ((u) << 6) | (*(p)++ & 0x3f); \
+        if ((*(p) & 0xc0) == 0x80) \
+          (u) = ((u) << 6) | (*(p)++ & 0x3f); \
+        if ((*(p) & 0xc0) == 0x80) \
+          (u) = ((u) << 6) | (*(p)++ & 0x3f); \
+      } \
+    else if (*(p) < 0xfe) \
+      { \
+        (u) = *(p)++ & 0x01; \
+        if ((*(p) & 0xc0) == 0x80) \
+          (u) = ((u) << 6) | (*(p)++ & 0x3f); \
+        if ((*(p) & 0xc0) == 0x80) \
+          (u) = ((u) << 6) | (*(p)++ & 0x3f); \
+        if ((*(p) & 0xc0) == 0x80) \
+          (u) = ((u) << 6) | (*(p)++ & 0x3f); \
+        if ((*(p) & 0xc0) == 0x80) \
+          (u) = ((u) << 6) | (*(p)++ & 0x3f); \
+        if ((*(p) & 0xc0) == 0x80) \
+          (u) = ((u) << 6) | (*(p)++ & 0x3f); \
+      } \
+    else \
+      { /* Too large, use replacement char */ \
+        (p)++; \
+        while ((*(p) & 0xc0) == 0x80) \
+          (p)++; \
+        (u) = UNI_REPLACEMENT; \
+      } \
+  } while (0)
+
#define GET_UTF8(p,u) \
if (IS_UTF8(*p)) \
GET_UTF8_CHAR(p,u); \
else \
u = *p++
+/*
+ * Decode one character at p into u and advance p: a first byte for which
+ * IS_UTF8() holds is handed to GET_UTF8_32_CHAR(), any other byte is stored
+ * in u as is.  Expands to a bare if/else without a trailing semicolon (the
+ * caller supplies it) and evaluates p more than once.
+ */
+#define GET_UTF8_32(p,u) \
+ if (IS_UTF8(*p)) \
+ GET_UTF8_32_CHAR(p,u); \
+ else \
+ u = *p++
+
#define UTF8_SKIP(p) do { \
uns c = *p++; \
if (c >= 0xc0) \
p++, c <<= 1; \
} while (0)
-#define UTF8_SKIP_BWD(p) while ((--*(p) & 0xc0) == 0x80)
+/*
+ * Move p backwards by one character: p is pre-decremented, and the loop
+ * repeats as long as the byte it then points to is a continuation byte
+ * (10xxxxxx).  Expands to a bare `while (...)` header; the caller supplies
+ * the loop body, normally an empty statement.
+ */
+#define UTF8_SKIP_BWD(p) while ((*--(p) & 0xc0) == 0x80)
static inline uns
utf8_space(uns u)
return 1;
if (u < 0x800)
return 2;
- return 3;
+ if (u < (1<<16))
+ return 3;
+ if (u < (1<<21))
+ return 4;
+ if (u < (1<<26))
+ return 5;
+ return 6;
}
static inline uns
{
if (c < 0x80)
return 1;
- ASSERT(c >= 0xc0 && c < 0xf0);
+ ASSERT(c >= 0xc0 && c < 0xfe);
if (c < 0xe0)
return 2;
- return 3;
+ if (c < 0xf0)
+ return 3;
+ if (c < 0xf8)
+ return 4;
+ if (c < 0xfc)
+ return 5;
+ return 6;
}
/* unicode-utf8.c */