int
bget_utf8_slow(struct fastbuf *b, uint repl)
{
- int c = bgetc(b);
- int code;
-
- if (c < 0x80) /* Includes EOF */
- return c;
- if (c < 0xc0) /* Incorrect combination */
- return repl;
- if (c >= 0xf0) /* Too large, skip it */
- {
- while ((c = bgetc(b)) >= 0x80 && c < 0xc0)
- ;
- goto wrong;
- }
- if (c >= 0xe0) /* 3 bytes */
- {
- code = c & 0x0f;
- if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
- goto wrong;
- code = (code << 6) | (c & 0x3f);
- if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
- goto wrong;
- code = (code << 6) | (c & 0x3f);
- if (code < 0x800)
- goto wrong2;
- }
- else /* 2 bytes */
- {
- code = c & 0x1f;
- if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
- goto wrong;
- code = (code << 6) | (c & 0x3f);
- if (code < 0x80)
- goto wrong2;
- }
- return code;
+ #define UNI_WANT_GET_UTF8 /* Instantiate the decoder template; with no variant macro it handles sequences of at most 3 bytes */
+ #define UNI_GIVE_FIRST_GETC { int x = bgetc(b); if (x < 0) return -1; c = x; } /* Lead byte: a negative bgetc() result ends the function with -1 */
+ #define UNI_GIVE_PEEKC { int x = bpeekc(b); if (x < 0) goto bad; c = x; } /* Later bytes are only peeked; a negative bpeekc() result is treated as malformed input */
+ #define UNI_GIVE_SKIPC b->bptr++ /* Step over the byte that was just peeked */
+ #define UNI_GIVE_OK return u /* The decoded (or replaced) value is the function result */
+ #define UNI_GIVE_BAD u = repl /* Malformed input decodes to the caller's replacement value */
+ #include <ucw/unicode-gen.h> /* Pastes the decoder body here */
+}
-wrong:
- if (c >= 0)
- bungetc(b);
-wrong2:
- return repl;
+int
+bget_utf8_full_slow(struct fastbuf *b, uint repl) /* Decode one UTF-8 character of at most 4 bytes; values above 0x10ffff and malformed input yield repl */
+{
+ #define UNI_WANT_GET_UTF8 /* Instantiate the decoder template */
+ #define UNI_WANT_UTF8_FULL /* Variant: sequences of up to 4 bytes, result limited to 0x10ffff */
+ #define UNI_GIVE_FIRST_GETC { int x = bgetc(b); if (x < 0) return -1; c = x; } /* Lead byte: a negative bgetc() result ends the function with -1 */
+ #define UNI_GIVE_PEEKC { int x = bpeekc(b); if (x < 0) goto bad; c = x; } /* Later bytes are only peeked; a negative bpeekc() result is treated as malformed input */
+ #define UNI_GIVE_SKIPC b->bptr++ /* Step over the byte that was just peeked */
+ #define UNI_GIVE_OK return u /* The decoded (or replaced) value is the function result */
+ #define UNI_GIVE_BAD u = repl /* Replacement value supplied by the caller */
+ #include <ucw/unicode-gen.h> /* Pastes the decoder body here */
}
int
bget_utf8_32_slow(struct fastbuf *b, uint repl)
{
- int c = bgetc(b);
- int code;
- int nr;
- int limit;
-
- if (c < 0x80) /* Includes EOF */
- return c;
- if (c < 0xc0) /* Incorrect combination */
- return repl;
- if (c < 0xe0)
- {
- code = c & 0x1f;
- nr = 1;
- limit = 0x80;
- }
- else if (c < 0xf0)
- {
- code = c & 0x0f;
- nr = 2;
- limit = 0x800;
- }
- else if (c < 0xf8)
- {
- code = c & 0x07;
- nr = 3;
- limit = 1 << 16;
- }
- else if (c < 0xfc)
- {
- code = c & 0x03;
- nr = 4;
- limit = 1 << 21;
- }
- else if (c < 0xfe)
- {
- code = c & 0x01;
- nr = 5;
- limit = 1 << 26;
- }
- else /* Too large */
- goto wrong2;
- while (nr-- > 0)
- {
- if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
- goto wrong;
- code = (code << 6) | (c & 0x3f);
- }
- if (code < limit)
- goto wrong2;
- return code;
-
-wrong:
- if (c >= 0)
- bungetc(b);
-wrong2:
- return repl;
+ #define UNI_WANT_GET_UTF8 /* Instantiate the decoder template */
+ #define UNI_WANT_UTF8_32 /* Variant: sequences of up to 6 bytes */
+ #define UNI_GIVE_FIRST_GETC { int x = bgetc(b); if (x < 0) return -1; c = x; } /* Lead byte: a negative bgetc() result ends the function with -1 */
+ #define UNI_GIVE_PEEKC { int x = bpeekc(b); if (x < 0) goto bad; c = x; } /* Later bytes are only peeked; a negative bpeekc() result is treated as malformed input */
+ #define UNI_GIVE_SKIPC b->bptr++ /* Step over the byte that was just peeked */
+ #define UNI_GIVE_OK return u /* The decoded (or replaced) value is the function result */
+ #define UNI_GIVE_BAD u = repl /* Replacement value supplied by the caller */
+ #include <ucw/unicode-gen.h> /* Pastes the decoder body here */
}
void
bput_utf8_slow(struct fastbuf *b, uint u)
{
- ASSERT(u < 65536);
- if (u < 0x80)
- bputc(b, u);
- else
- {
- if (u < 0x800)
- bputc(b, 0xc0 | (u >> 6));
- else
- {
- bputc(b, 0xe0 | (u >> 12));
- bputc(b, 0x80 | ((u >> 6) & 0x3f));
- }
- bputc(b, 0x80 | (u & 0x3f));
- }
+ #define UNI_WANT_PUT_UTF8 /* Instantiate the encoder template; with no variant macro it emits at most 3 bytes */
+ #define UNI_GIVE_PUTC bputc(b, c) /* The template hands over each encoded byte in its local "c" */
+ #include <ucw/unicode-gen.h> /* Pastes the encoder body here */
+}
+
+void
+bput_utf8_full_slow(struct fastbuf *b, uint u) /* Encode u (at most 0x10ffff) as UTF-8, one bputc() per output byte */
+{
+ #define UNI_WANT_PUT_UTF8 /* Instantiate the encoder template */
+ #define UNI_WANT_UTF8_FULL /* Variant: up to 4 bytes; values above 0x10ffff hit the template's ASSERT(0) */
+ #define UNI_GIVE_PUTC bputc(b, c) /* The template hands over each encoded byte in its local "c" */
+ #include <ucw/unicode-gen.h> /* Pastes the encoder body here */
}
void
bput_utf8_32_slow(struct fastbuf *b, uint u)
{
- ASSERT(u < (1U<<31));
- if (u < 0x80)
- bputc(b, u);
- else
- {
- if (u < 0x800)
- bputc(b, 0xc0 | (u >> 6));
- else
- {
- if (u < (1<<16))
- bputc(b, 0xe0 | (u >> 12));
- else
- {
- if (u < (1<<21))
- bputc(b, 0xf0 | (u >> 18));
- else
- {
- if (u < (1<<26))
- bputc(b, 0xf8 | (u >> 24));
- else
- {
- bputc(b, 0xfc | (u >> 30));
- bputc(b, 0x80 | ((u >> 24) & 0x3f));
- }
- bputc(b, 0x80 | ((u >> 18) & 0x3f));
- }
- bputc(b, 0x80 | ((u >> 12) & 0x3f));
- }
- bputc(b, 0x80 | ((u >> 6) & 0x3f));
- }
- bputc(b, 0x80 | (u & 0x3f));
- }
+ #define UNI_WANT_PUT_UTF8 /* Instantiate the encoder template */
+ #define UNI_WANT_UTF8_32 /* Variant: up to 6 bytes, values below 1U<<31 */
+ #define UNI_GIVE_PUTC bputc(b, c) /* The template hands over each encoded byte in its local "c" */
+ #include <ucw/unicode-gen.h> /* Pastes the encoder body here */
}
/*** UTF-16 ***/
#define bput_utf16_be_slow ucw_bput_utf16_be_slow
#define bput_utf16_le_slow ucw_bput_utf16_le_slow
#define bput_utf8_32_slow ucw_bput_utf8_32_slow
+#define bput_utf8_full_slow ucw_bput_utf8_full_slow
#define bput_utf8_slow ucw_bput_utf8_slow
#endif
/* ** UTF-8 ** */
int bget_utf8_slow(struct fastbuf *b, uint repl);
+int bget_utf8_full_slow(struct fastbuf *b, uint repl); /* Fallback of bget_utf8_full_repl() when fewer than 4 bytes are reported by bavailr() */
int bget_utf8_32_slow(struct fastbuf *b, uint repl);
void bput_utf8_slow(struct fastbuf *b, uint u);
+void bput_utf8_full_slow(struct fastbuf *b, uint u); /* Fallback of bput_utf8_full() when bavailw() reports fewer than 4 bytes */
void bput_utf8_32_slow(struct fastbuf *b, uint u);
static inline int bget_utf8_repl(struct fastbuf *b, uint repl)
return bget_utf8_slow(b, repl);
}
+static inline int bget_utf8_full_repl(struct fastbuf *b, uint repl) /** Read a single UTF-8 character from range [0, 0x10ffff]; malformed input yields @repl. **/
+{
+ uint u;
+ if (bavailr(b) >= 4) /* 4 is the longest sequence utf8_full_get_repl() reads */
+ {
+ b->bptr = utf8_full_get_repl(b->bptr, &u, repl); /* Decode in place and move the buffer pointer past the consumed bytes */
+ return u;
+ }
+ else
+ return bget_utf8_full_slow(b, repl); /* Otherwise go byte by byte through bgetc()/bpeekc() */
+}
+
static inline int bget_utf8_32_repl(struct fastbuf *b, uint repl)
{
uint u;
bput_utf8_slow(b, u);
}
+static inline void bput_utf8_full(struct fastbuf *b, uint u) /** Write a single UTF-8 character from range [0, 0x10ffff] (at most 4 bytes). **/
+{
+ if (bavailw(b) >= 4) /* 4 is the longest sequence utf8_full_put() emits */
+ b->bptr = utf8_full_put(b->bptr, u); /* Encode in place and move the buffer pointer past the written bytes */
+ else
+ bput_utf8_full_slow(b, u); /* Otherwise emit byte by byte through bputc() */
+}
+
static inline void bput_utf8_32(struct fastbuf *b, uint u) /** Write a single utf8 character (from the whole unicode range). **/
{
if (bavailw(b) >= 6)
--- /dev/null
+#if defined(UNI_WANT_UTF8_32) /* Longest sequence handled by the variant the includer selected */
+#define UNI_MAX_UTF8_BYTES 6 /* values below 1U<<31 */
+#elif defined(UNI_WANT_UTF8_FULL)
+#define UNI_MAX_UTF8_BYTES 4 /* values up to 0x10ffff */
+#else
+#define UNI_MAX_UTF8_BYTES 3 /* values below 1<<16 */
+#endif
+
+/* Writing UTF-8: encoder body, pasted into a function that has the value in "u" */
+
+#ifdef UNI_WANT_PUT_UTF8
+
+#define UNI_PUT_NEXT(_c) do { byte c = (_c); UNI_GIVE_PUTC; } while (0) /* Hand one output byte, named "c", to the includer's UNI_GIVE_PUTC */
+
+{
+ if (u < 0x80)
+ UNI_PUT_NEXT(u);
+ else if (u < 0x800)
+ {
+ UNI_PUT_NEXT(0xc0 | (u >> 6));
+ goto put1; /* one continuation byte left */
+ }
+ else if (u < (1<<16))
+ {
+ UNI_PUT_NEXT(0xe0 | (u >> 12));
+#if UNI_MAX_UTF8_BYTES > 3 /* longer variants: jump past their extra branches to the shared tail */
+ goto put2;
+ }
+#ifdef UNI_WANT_UTF8_FULL
+ else if (u <= 0x10ffff) /* FULL variant: anything larger ends in ASSERT(0) */
+#else
+ else if (u < (1<<21))
+#endif
+ {
+ UNI_PUT_NEXT(0xf0 | (u >> 18));
+#if UNI_MAX_UTF8_BYTES > 4
+ goto put3;
+ }
+ else if (u < (1<<26))
+ {
+ UNI_PUT_NEXT(0xf8 | (u >> 24));
+ goto put4;
+ }
+ else if (u < (1U<<31))
+ {
+ UNI_PUT_NEXT(0xfc | (u >> 30));
+ UNI_PUT_NEXT(0x80 | ((u >> 24) & 0x3f));
+put4:
+ UNI_PUT_NEXT(0x80 | ((u >> 18) & 0x3f));
+put3:
+#endif /* UNI_MAX_UTF8_BYTES > 4 */
+ UNI_PUT_NEXT(0x80 | ((u >> 12) & 0x3f));
+put2:
+#endif /* UNI_MAX_UTF8_BYTES > 3 */
+ UNI_PUT_NEXT(0x80 | ((u >> 6) & 0x3f));
+put1:
+ UNI_PUT_NEXT(0x80 | (u & 0x3f));
+ }
+ else
+ ASSERT(0); /* value too large for the selected variant */
+}
+
+#endif /* UNI_WANT_PUT_UTF8 */
+
+/* Reading UTF-8: decoder body; the includer supplies UNI_GIVE_PEEKC, UNI_GIVE_SKIPC and UNI_GIVE_OK */
+
+#ifdef UNI_WANT_GET_UTF8
+
+#define UNI_GET_NEXT \
+ do { \
+ UNI_GIVE_PEEKC; \
+ if (unlikely((c & 0xc0) != 0x80)) goto bad; /* not a 10xxxxxx continuation byte; it is left unconsumed */ \
+ u = (u << 6) | (c & 0x3f); \
+ UNI_GIVE_SKIPC; \
+ } while (0)
+
+{
+ byte c;
+ uint u, limit;
+#ifdef UNI_GIVE_FIRST_GETC /* optional hook for fetching the lead byte */
+ UNI_GIVE_FIRST_GETC;
+#else
+ UNI_GIVE_PEEKC;
+ UNI_GIVE_SKIPC;
+#endif
+ u = c;
+ if (u < 0x80)
+ ; /* single byte: u already holds the value */
+ else if (unlikely(u < 0xc0))
+ goto bad; /* continuation byte in lead position */
+ else if (u < 0xe0)
+ {
+ u &= 0x1f;
+ limit = 0x80; /* smallest value that needs this length; lower results are rejected below */
+ goto get1;
+ }
+ else if (u < 0xf0)
+ {
+ u &= 0x0f;
+ limit = 0x800;
+#if UNI_MAX_UTF8_BYTES > 3 /* longer variants: jump past their extra branches to the shared tail */
+ goto get2;
+ }
+ else if (u < 0xf8)
+ {
+ u &= 0x07;
+ limit = 1 << 16;
+#if UNI_MAX_UTF8_BYTES > 4
+ goto get3;
+ }
+ else if (u < 0xfc)
+ {
+ u &= 0x03;
+ limit = 1 << 21;
+ goto get4;
+ }
+ else if (u < 0xfe)
+ {
+ u &= 0x01;
+ limit = 1 << 26;
+
+ UNI_GET_NEXT;
+get4:
+ UNI_GET_NEXT;
+get3:
+#endif /* UNI_MAX_UTF8_BYTES > 4 */
+ UNI_GET_NEXT;
+get2:
+#endif /* UNI_MAX_UTF8_BYTES > 3 */
+ UNI_GET_NEXT;
+get1:
+ UNI_GET_NEXT;
+
+ if (unlikely(u < limit)) /* overlong encoding */
+ goto bad;
+#ifdef UNI_WANT_UTF8_FULL
+ if (unlikely(u > 0x10ffff)) /* beyond the range of the FULL variant */
+ goto bad;
+#endif
+ }
+ else
+ {
+bad: /* also entered by goto from the checks above */
+#ifdef UNI_GIVE_BAD
+ UNI_GIVE_BAD;
+#else
+ u = UNI_REPLACEMENT; /* default when the includer defines no UNI_GIVE_BAD */
+#endif
+ }
+
+ UNI_GIVE_OK; /* reached for decoded and for replaced values alike */
+}
+
+#endif /* UNI_WANT_GET_UTF8 */
+
+#undef UNI_WANT_PUT_UTF8 /* Drop every parameter so the file can be included again with another configuration */
+#undef UNI_WANT_GET_UTF8
+
+#undef UNI_WANT_UTF8_32
+#undef UNI_WANT_UTF8_FULL
+
+#undef UNI_MAX_UTF8_BYTES
+
+#undef UNI_GIVE_PUTC
+#undef UNI_GIVE_FIRST_GETC
+#undef UNI_GIVE_PEEKC
+#undef UNI_GIVE_SKIPC
+#undef UNI_GIVE_OK
+#undef UNI_GIVE_BAD
+
+#undef UNI_PUT_NEXT
+#undef UNI_GET_NEXT
**/
static inline byte *utf8_put(byte *p, uint u)
{
- if (u < 0x80)
- *p++ = u;
- else if (u < 0x800)
- {
- *p++ = 0xc0 | (u >> 6);
- *p++ = 0x80 | (u & 0x3f);
- }
- else
- {
- ASSERT(u < 0x10000);
- *p++ = 0xe0 | (u >> 12);
- *p++ = 0x80 | ((u >> 6) & 0x3f);
- *p++ = 0x80 | (u & 0x3f);
- }
+ #define UNI_WANT_PUT_UTF8 /* Instantiate the encoder template; with no variant macro it emits at most 3 bytes */
+ #define UNI_GIVE_PUTC *p++ = c /* Store each encoded byte and advance the output pointer */
+ #include <ucw/unicode-gen.h> /* Pastes the encoder body here */
+ return p;
+}
+
+/**
+ * Encode a value from the range `[0, 0x10FFFF]`
+ * (full Unicode range); up to 4 bytes needed (RFC3629).
+ **/
+static inline byte *utf8_full_put(byte *p, uint u)
+{
+ #define UNI_WANT_PUT_UTF8 /* Instantiate the encoder template */
+ #define UNI_WANT_UTF8_FULL /* Variant: up to 4 bytes; values above 0x10ffff hit the template's ASSERT(0) */
+ #define UNI_GIVE_PUTC *p++ = c /* Store each encoded byte and advance the output pointer */
+ #include <ucw/unicode-gen.h> /* Pastes the encoder body here */
return p;
}
**/
static inline byte *utf8_32_put(byte *p, uint u)
{
- if (u < 0x80)
- *p++ = u;
- else if (u < 0x800)
- {
- *p++ = 0xc0 | (u >> 6);
- goto put1;
- }
- else if (u < (1<<16))
- {
- *p++ = 0xe0 | (u >> 12);
- goto put2;
- }
- else if (u < (1<<21))
- {
- *p++ = 0xf0 | (u >> 18);
- goto put3;
- }
- else if (u < (1<<26))
- {
- *p++ = 0xf8 | (u >> 24);
- goto put4;
- }
- else if (u < (1U<<31))
- {
- *p++ = 0xfc | (u >> 30);
- *p++ = 0x80 | ((u >> 24) & 0x3f);
-put4: *p++ = 0x80 | ((u >> 18) & 0x3f);
-put3: *p++ = 0x80 | ((u >> 12) & 0x3f);
-put2: *p++ = 0x80 | ((u >> 6) & 0x3f);
-put1: *p++ = 0x80 | (u & 0x3f);
- }
- else
- ASSERT(0);
+ #define UNI_WANT_PUT_UTF8 /* Instantiate the encoder template */
+ #define UNI_WANT_UTF8_32 /* Variant: up to 6 bytes, values below 1U<<31 */
+ #define UNI_GIVE_PUTC *p++ = c /* Store each encoded byte and advance the output pointer */
+ #include <ucw/unicode-gen.h> /* Pastes the encoder body here */
return p;
}
-#define UTF8_GET_NEXT if (unlikely((*p & 0xc0) != 0x80)) goto bad; u = (u << 6) | (*p++ & 0x3f)
-#define UTF8_CHECK_RANGE(r) if (unlikely(u < r)) goto bad
-
/**
* Decode a value from the range `[0, 0xFFFF]` (basic multilingual plane)
* or return @repl if the encoding has been corrupted.
**/
static inline byte *utf8_get_repl(const byte *p, uint *uu, uint repl)
{
- uint u = *p++;
- if (u < 0x80)
- ;
- else if (unlikely(u < 0xc0))
- {
- /* Incorrect byte sequence */
- bad:
- u = repl;
- }
- else if (u < 0xe0)
- {
- u &= 0x1f;
- UTF8_GET_NEXT;
- UTF8_CHECK_RANGE(0x80);
- }
- else if (likely(u < 0xf0))
- {
- u &= 0x0f;
- UTF8_GET_NEXT;
- UTF8_GET_NEXT;
- UTF8_CHECK_RANGE(0x800);
- }
- else
- goto bad;
- *uu = u;
- return (byte *)p;
+ #define UNI_WANT_GET_UTF8 /* Instantiate the decoder template; with no variant macro it handles sequences of at most 3 bytes */
+ #define UNI_GIVE_PEEKC c = *p /* Look at the current byte without advancing */
+ #define UNI_GIVE_SKIPC p++ /* Consume the byte just examined */
+ #define UNI_GIVE_OK { *uu = u; return (byte *)p; } /* Store the result and return the position after the consumed bytes */
+ #define UNI_GIVE_BAD u = repl /* Malformed input decodes to the caller's replacement value */
+ #include <ucw/unicode-gen.h> /* Pastes the decoder body here */
+}
+
+/**
+ * Decode a value from the range `[0, 0x10FFFF]` (full Unicode range)
+ * or return @repl if the encoding has been corrupted. At most 4 bytes
+ * are read; the returned pointer is the position after the consumed bytes.
+ **/
+static inline byte *utf8_full_get_repl(const byte *p, uint *uu, uint repl)
+{
+ #define UNI_WANT_GET_UTF8 /* Instantiate the decoder template */
+ #define UNI_WANT_UTF8_FULL /* Variant: up to 4 bytes, results above 0x10ffff are rejected */
+ #define UNI_GIVE_PEEKC c = *p /* Look at the current byte without advancing */
+ #define UNI_GIVE_SKIPC p++ /* Consume the byte just examined */
+ #define UNI_GIVE_OK { *uu = u; return (byte *)p; } /* Store the result and return the new position */
+ #define UNI_GIVE_BAD u = repl /* Malformed input decodes to the caller's replacement value */
+ #include <ucw/unicode-gen.h> /* Pastes the decoder body here */
}
/**
**/
static inline byte *utf8_32_get_repl(const byte *p, uint *uu, uint repl)
{
- uint u = *p++;
- uint limit;
- if (u < 0x80)
- ;
- else if (unlikely(u < 0xc0))
- goto bad;
- else if (u < 0xe0)
- {
- u &= 0x1f;
- limit = 0x80;
- goto get1;
- }
- else if (u < 0xf0)
- {
- u &= 0x0f;
- limit = 0x800;
- goto get2;
- }
- else if (u < 0xf8)
- {
- u &= 0x07;
- limit = 1 << 16;
- goto get3;
- }
- else if (u < 0xfc)
- {
- u &= 0x03;
- limit = 1 << 21;
- goto get4;
- }
- else if (u < 0xfe)
- {
- u &= 0x01;
- limit = 1 << 26;
- UTF8_GET_NEXT;
-get4: UTF8_GET_NEXT;
-get3: UTF8_GET_NEXT;
-get2: UTF8_GET_NEXT;
-get1: UTF8_GET_NEXT;
- if (unlikely(u < limit))
- goto bad;
- }
- else
- goto bad;
- *uu = u;
- return (byte *)p;
-
-bad:
- /* Incorrect byte sequence */
- *uu = repl;
- return (byte *)p;
+ #define UNI_WANT_GET_UTF8 /* Instantiate the decoder template */
+ #define UNI_WANT_UTF8_32 /* Variant: sequences of up to 6 bytes */
+ #define UNI_GIVE_PEEKC c = *p /* Look at the current byte without advancing */
+ #define UNI_GIVE_SKIPC p++ /* Consume the byte just examined */
+ #define UNI_GIVE_OK { *uu = u; return (byte *)p; } /* Store the result and return the position after the consumed bytes */
+ #define UNI_GIVE_BAD u = repl /* Malformed input decodes to the caller's replacement value */
+ #include <ucw/unicode-gen.h> /* Pastes the decoder body here */
}
/**
return utf8_get_repl(p, uu, UNI_REPLACEMENT);
}
+/**
+ * Decode a value from the range `[0, 0x10FFFF]` (full Unicode range)
+ * or return `UNI_REPLACEMENT` if the encoding has been corrupted.
+ * The decoded value is stored to @uu; the function result is the
+ * pointer returned by utf8_full_get_repl().
+ **/
+static inline byte *utf8_full_get(const byte *p, uint *uu)
+{
+ return utf8_full_get_repl(p, uu, UNI_REPLACEMENT); /* Same decoder with UNI_REPLACEMENT as the replacement value */
+}
+
/**
* Decode a value from the range `[0, 0x7FFFFFFF]`
* or return `UNI_REPLACEMENT` if the encoding has been corrupted.