From 4f09a030963aea5c12876eb49db76d1ce4df380d Mon Sep 17 00:00:00 2001 From: Pavel Charvat Date: Fri, 29 Dec 2017 08:40:24 +0100 Subject: [PATCH] UTF-8: Generator of long inlined functions. Also added "utf8_full" variants for 0..0x10ffff range. --- ucw/ff-unicode.c | 185 +++++++++++----------------------------------- ucw/ff-unicode.h | 23 ++++++ ucw/unicode-gen.h | 172 ++++++++++++++++++++++++++++++++++++++++++ ucw/unicode.h | 184 ++++++++++++++------------------------------- 4 files changed, 295 insertions(+), 269 deletions(-) create mode 100644 ucw/unicode-gen.h diff --git a/ucw/ff-unicode.c b/ucw/ff-unicode.c index e0faa0c8..de6df9d7 100644 --- a/ucw/ff-unicode.c +++ b/ucw/ff-unicode.c @@ -19,164 +19,65 @@ int bget_utf8_slow(struct fastbuf *b, uint repl) { - int c = bgetc(b); - int code; - - if (c < 0x80) /* Includes EOF */ - return c; - if (c < 0xc0) /* Incorrect combination */ - return repl; - if (c >= 0xf0) /* Too large, skip it */ - { - while ((c = bgetc(b)) >= 0x80 && c < 0xc0) - ; - goto wrong; - } - if (c >= 0xe0) /* 3 bytes */ - { - code = c & 0x0f; - if ((c = bgetc(b)) < 0x80 || c >= 0xc0) - goto wrong; - code = (code << 6) | (c & 0x3f); - if ((c = bgetc(b)) < 0x80 || c >= 0xc0) - goto wrong; - code = (code << 6) | (c & 0x3f); - if (code < 0x800) - goto wrong2; - } - else /* 2 bytes */ - { - code = c & 0x1f; - if ((c = bgetc(b)) < 0x80 || c >= 0xc0) - goto wrong; - code = (code << 6) | (c & 0x3f); - if (code < 0x80) - goto wrong2; - } - return code; + #define UNI_WANT_GET_UTF8 + #define UNI_GIVE_FIRST_GETC { int x = bgetc(b); if (x < 0) return -1; c = x; } + #define UNI_GIVE_PEEKC { int x = bpeekc(b); if (x < 0) goto bad; c = x; } + #define UNI_GIVE_SKIPC b->bptr++ + #define UNI_GIVE_OK return u + #define UNI_GIVE_BAD u = repl + #include <ucw/unicode-gen.h> +} -wrong: - if (c >= 0) - bungetc(b); -wrong2: - return repl; +int +bget_utf8_full_slow(struct fastbuf *b, uint repl) +{ + #define UNI_WANT_GET_UTF8 + #define UNI_WANT_UTF8_FULL + #define UNI_GIVE_FIRST_GETC { int x =
bgetc(b); if (x < 0) return -1; c = x; } + #define UNI_GIVE_PEEKC { int x = bpeekc(b); if (x < 0) goto bad; c = x; } + #define UNI_GIVE_SKIPC b->bptr++ + #define UNI_GIVE_OK return u + #define UNI_GIVE_BAD u = repl + #include <ucw/unicode-gen.h> } int bget_utf8_32_slow(struct fastbuf *b, uint repl) { - int c = bgetc(b); - int code; - int nr; - int limit; - - if (c < 0x80) /* Includes EOF */ - return c; - if (c < 0xc0) /* Incorrect combination */ - return repl; - if (c < 0xe0) - { - code = c & 0x1f; - nr = 1; - limit = 0x80; - } - else if (c < 0xf0) - { - code = c & 0x0f; - nr = 2; - limit = 0x800; - } - else if (c < 0xf8) - { - code = c & 0x07; - nr = 3; - limit = 1 << 16; - } - else if (c < 0xfc) - { - code = c & 0x03; - nr = 4; - limit = 1 << 21; - } - else if (c < 0xfe) - { - code = c & 0x01; - nr = 5; - limit = 1 << 26; - } - else /* Too large */ - goto wrong2; - while (nr-- > 0) - { - if ((c = bgetc(b)) < 0x80 || c >= 0xc0) - goto wrong; - code = (code << 6) | (c & 0x3f); - } - if (code < limit) - goto wrong2; - return code; - -wrong: - if (c >= 0) - bungetc(b); -wrong2: - return repl; + #define UNI_WANT_GET_UTF8 + #define UNI_WANT_UTF8_32 + #define UNI_GIVE_FIRST_GETC { int x = bgetc(b); if (x < 0) return -1; c = x; } + #define UNI_GIVE_PEEKC { int x = bpeekc(b); if (x < 0) goto bad; c = x; } + #define UNI_GIVE_SKIPC b->bptr++ + #define UNI_GIVE_OK return u + #define UNI_GIVE_BAD u = repl + #include <ucw/unicode-gen.h> } void bput_utf8_slow(struct fastbuf *b, uint u) { - ASSERT(u < 65536); - if (u < 0x80) - bputc(b, u); - else - { - if (u < 0x800) - bputc(b, 0xc0 | (u >> 6)); - else - { - bputc(b, 0xe0 | (u >> 12)); - bputc(b, 0x80 | ((u >> 6) & 0x3f)); - } - bputc(b, 0x80 | (u & 0x3f)); - } + #define UNI_WANT_PUT_UTF8 + #define UNI_GIVE_PUTC bputc(b, c) + #include <ucw/unicode-gen.h> +} + +void +bput_utf8_full_slow(struct fastbuf *b, uint u) +{ + #define UNI_WANT_PUT_UTF8 + #define UNI_WANT_UTF8_FULL + #define UNI_GIVE_PUTC bputc(b, c) + #include <ucw/unicode-gen.h> } void bput_utf8_32_slow(struct fastbuf *b, uint u) { - ASSERT(u <
(1U<<31)); - if (u < 0x80) - bputc(b, u); - else - { - if (u < 0x800) - bputc(b, 0xc0 | (u >> 6)); - else - { - if (u < (1<<16)) - bputc(b, 0xe0 | (u >> 12)); - else - { - if (u < (1<<21)) - bputc(b, 0xf0 | (u >> 18)); - else - { - if (u < (1<<26)) - bputc(b, 0xf8 | (u >> 24)); - else - { - bputc(b, 0xfc | (u >> 30)); - bputc(b, 0x80 | ((u >> 24) & 0x3f)); - } - bputc(b, 0x80 | ((u >> 18) & 0x3f)); - } - bputc(b, 0x80 | ((u >> 12) & 0x3f)); - } - bputc(b, 0x80 | ((u >> 6) & 0x3f)); - } - bputc(b, 0x80 | (u & 0x3f)); - } + #define UNI_WANT_PUT_UTF8 + #define UNI_WANT_UTF8_32 + #define UNI_GIVE_PUTC bputc(b, c) + #include <ucw/unicode-gen.h> } /*** UTF-16 ***/ diff --git a/ucw/ff-unicode.h b/ucw/ff-unicode.h index 79a0c493..addd58b8 100644 --- a/ucw/ff-unicode.h +++ b/ucw/ff-unicode.h @@ -23,14 +23,17 @@ #define bput_utf16_be_slow ucw_bput_utf16_be_slow #define bput_utf16_le_slow ucw_bput_utf16_le_slow #define bput_utf8_32_slow ucw_bput_utf8_32_slow +#define bput_utf8_full_slow ucw_bput_utf8_full_slow #define bput_utf8_slow ucw_bput_utf8_slow #endif /* ** UTF-8 ** */ int bget_utf8_slow(struct fastbuf *b, uint repl); +int bget_utf8_full_slow(struct fastbuf *b, uint repl); int bget_utf8_32_slow(struct fastbuf *b, uint repl); void bput_utf8_slow(struct fastbuf *b, uint u); +void bput_utf8_full_slow(struct fastbuf *b, uint u); void bput_utf8_32_slow(struct fastbuf *b, uint u); static inline int bget_utf8_repl(struct fastbuf *b, uint repl) @@ -45,6 +48,18 @@ static inline int bget_utf8_repl(struct fastbuf *b, uint repl) return bget_utf8_slow(b, repl); } +static inline int bget_utf8_full_repl(struct fastbuf *b, uint repl) +{ + uint u; + if (bavailr(b) >= 4) + { + b->bptr = utf8_full_get_repl(b->bptr, &u, repl); + return u; + } + else + return bget_utf8_full_slow(b, repl); +} + static inline int bget_utf8_32_repl(struct fastbuf *b, uint repl) { uint u; @@ -75,6 +90,14 @@ static inline void bput_utf8(struct fastbuf *b, uint u) /** Write a single utf8 bput_utf8_slow(b, u); } +static inline void
bput_utf8_full(struct fastbuf *b, uint u) /** Write a single utf8 character from range [0, 0x10ffff]. **/ +{ + if (bavailw(b) >= 4) + b->bptr = utf8_full_put(b->bptr, u); + else + bput_utf8_full_slow(b, u); +} + static inline void bput_utf8_32(struct fastbuf *b, uint u) /** Write a single utf8 character (from the whole unicode range). **/ { if (bavailw(b) >= 6) diff --git a/ucw/unicode-gen.h b/ucw/unicode-gen.h new file mode 100644 index 00000000..6e10b8a9 --- /dev/null +++ b/ucw/unicode-gen.h @@ -0,0 +1,172 @@ +#if defined(UNI_WANT_UTF8_32) +#define UNI_MAX_UTF8_BYTES 6 +#elif defined(UNI_WANT_UTF8_FULL) +#define UNI_MAX_UTF8_BYTES 4 +#else +#define UNI_MAX_UTF8_BYTES 3 +#endif + +/* Writing UTF-8 */ + +#ifdef UNI_WANT_PUT_UTF8 + +#define UNI_PUT_NEXT(_c) do { byte c = (_c); UNI_GIVE_PUTC; } while (0) + +{ + if (u < 0x80) + UNI_PUT_NEXT(u); + else if (u < 0x800) + { + UNI_PUT_NEXT(0xc0 | (u >> 6)); + goto put1; + } + else if (u < (1<<16)) + { + UNI_PUT_NEXT(0xe0 | (u >> 12)); +#if UNI_MAX_UTF8_BYTES > 3 + goto put2; + } +#ifdef UNI_WANT_UTF8_FULL + else if (u <= 0x10ffff) +#else + else if (u < (1<<21)) +#endif + { + UNI_PUT_NEXT(0xf0 | (u >> 18)); +#if UNI_MAX_UTF8_BYTES > 4 + goto put3; + } + else if (u < (1<<26)) + { + UNI_PUT_NEXT(0xf8 | (u >> 24)); + goto put4; + } + else if (u < (1U<<31)) + { + UNI_PUT_NEXT(0xfc | (u >> 30)); + UNI_PUT_NEXT(0x80 | ((u >> 24) & 0x3f)); +put4: + UNI_PUT_NEXT(0x80 | ((u >> 18) & 0x3f)); +put3: +#endif + UNI_PUT_NEXT(0x80 | ((u >> 12) & 0x3f)); +put2: +#endif + UNI_PUT_NEXT(0x80 | ((u >> 6) & 0x3f)); +put1: + UNI_PUT_NEXT(0x80 | (u & 0x3f)); + } + else + ASSERT(0); +} + +#endif + +/* Reading UTF-8 */ + +#ifdef UNI_WANT_GET_UTF8 + +#define UNI_GET_NEXT \ + do { \ + UNI_GIVE_PEEKC; \ + if (unlikely((c & 0xc0) != 0x80)) goto bad; \ + u = (u << 6) | (c & 0x3f); \ + UNI_GIVE_SKIPC; \ + } while (0) + +{ + byte c; + uint u, limit; +#ifdef UNI_GIVE_FIRST_GETC + UNI_GIVE_FIRST_GETC; +#else + UNI_GIVE_PEEKC; + UNI_GIVE_SKIPC; +#endif + u = 
c; + if (u < 0x80) + ; + else if (unlikely(u < 0xc0)) + goto bad; + else if (u < 0xe0) + { + u &= 0x1f; + limit = 0x80; + goto get1; + } + else if (u < 0xf0) + { + u &= 0x0f; + limit = 0x800; +#if UNI_MAX_UTF8_BYTES > 3 + goto get2; + } + else if (u < 0xf8) + { + u &= 0x07; + limit = 1 << 16; +#if UNI_MAX_UTF8_BYTES > 4 + goto get3; + } + else if (u < 0xfc) + { + u &= 0x03; + limit = 1 << 21; + goto get4; + } + else if (u < 0xfe) + { + u &= 0x01; + limit = 1 << 26; + + UNI_GET_NEXT; +get4: + UNI_GET_NEXT; +get3: +#endif + UNI_GET_NEXT; +get2: +#endif + UNI_GET_NEXT; +get1: + UNI_GET_NEXT; + + if (unlikely(u < limit)) + goto bad; +#ifdef UNI_WANT_UTF8_FULL + if (unlikely(u > 0x10ffff)) + goto bad; +#endif + } + else + { +bad: +#ifdef UNI_GIVE_BAD + UNI_GIVE_BAD; +#else + u = UNI_REPLACEMENT; +#endif + } + + UNI_GIVE_OK; +} + +#endif + +#undef UNI_WANT_PUT_UTF8 +#undef UNI_WANT_GET_UTF8 + +#undef UNI_WANT_UTF8_32 +#undef UNI_WANT_UTF8_FULL + +#undef UNI_MAX_UTF8_BYTES + +#undef UNI_GIVE_PUTC +#undef UNI_GIVE_FIRST_GETC +#undef UNI_GIVE_PEEKC +#undef UNI_GIVE_SKIPC +#undef UNI_GIVE_OK +#undef UNI_GIVE_BAD + +#undef UNI_PUT_NEXT +#undef UNI_GET_NEXT diff --git a/ucw/unicode.h b/ucw/unicode.h index 4ec1c6b2..b35531cd 100644 --- a/ucw/unicode.h +++ b/ucw/unicode.h @@ -29,20 +29,22 @@ **/ static inline byte *utf8_put(byte *p, uint u) { - if (u < 0x80) - *p++ = u; - else if (u < 0x800) - { - *p++ = 0xc0 | (u >> 6); - *p++ = 0x80 | (u & 0x3f); - } - else - { - ASSERT(u < 0x10000); - *p++ = 0xe0 | (u >> 12); - *p++ = 0x80 | ((u >> 6) & 0x3f); - *p++ = 0x80 | (u & 0x3f); - } + #define UNI_WANT_PUT_UTF8 + #define UNI_GIVE_PUTC *p++ = c + #include <ucw/unicode-gen.h> + return p; +} + +/** + * Encode a value from the range `[0, 0x10FFFF]` + * (full Unicode range); up to 4 bytes needed (RFC3629).
+ **/ +static inline byte *utf8_full_put(byte *p, uint u) +{ + #define UNI_WANT_PUT_UTF8 + #define UNI_WANT_UTF8_FULL + #define UNI_GIVE_PUTC *p++ = c + #include <ucw/unicode-gen.h> return p; } @@ -52,77 +54,40 @@ static inline byte *utf8_put(byte *p, uint u) **/ static inline byte *utf8_32_put(byte *p, uint u) { - if (u < 0x80) - *p++ = u; - else if (u < 0x800) - { - *p++ = 0xc0 | (u >> 6); - goto put1; - } - else if (u < (1<<16)) - { - *p++ = 0xe0 | (u >> 12); - goto put2; - } - else if (u < (1<<21)) - { - *p++ = 0xf0 | (u >> 18); - goto put3; - } - else if (u < (1<<26)) - { - *p++ = 0xf8 | (u >> 24); - goto put4; - } - else if (u < (1U<<31)) - { - *p++ = 0xfc | (u >> 30); - *p++ = 0x80 | ((u >> 24) & 0x3f); -put4: *p++ = 0x80 | ((u >> 18) & 0x3f); -put3: *p++ = 0x80 | ((u >> 12) & 0x3f); -put2: *p++ = 0x80 | ((u >> 6) & 0x3f); -put1: *p++ = 0x80 | (u & 0x3f); - } - else - ASSERT(0); + #define UNI_WANT_PUT_UTF8 + #define UNI_WANT_UTF8_32 + #define UNI_GIVE_PUTC *p++ = c + #include <ucw/unicode-gen.h> return p; } -#define UTF8_GET_NEXT if (unlikely((*p & 0xc0) != 0x80)) goto bad; u = (u << 6) | (*p++ & 0x3f) -#define UTF8_CHECK_RANGE(r) if (unlikely(u < r)) goto bad - /** * Decode a value from the range `[0, 0xFFFF]` (basic multilingual plane) * or return @repl if the encoding has been corrupted.
**/ static inline byte *utf8_get_repl(const byte *p, uint *uu, uint repl) { - uint u = *p++; - if (u < 0x80) - ; - else if (unlikely(u < 0xc0)) - { - /* Incorrect byte sequence */ - bad: - u = repl; - } - else if (u < 0xe0) - { - u &= 0x1f; - UTF8_GET_NEXT; - UTF8_CHECK_RANGE(0x80); - } - else if (likely(u < 0xf0)) - { - u &= 0x0f; - UTF8_GET_NEXT; - UTF8_GET_NEXT; - UTF8_CHECK_RANGE(0x800); - } - else - goto bad; - *uu = u; - return (byte *)p; + #define UNI_WANT_GET_UTF8 + #define UNI_GIVE_PEEKC c = *p + #define UNI_GIVE_SKIPC p++ + #define UNI_GIVE_OK { *uu = u; return (byte *)p; } + #define UNI_GIVE_BAD u = repl + #include <ucw/unicode-gen.h> +} + +/** + * Decode a value from the range `[0, 0x10FFFF]` (full Unicode range) + * or return @repl if the encoding has been corrupted. + **/ +static inline byte *utf8_full_get_repl(const byte *p, uint *uu, uint repl) +{ + #define UNI_WANT_GET_UTF8 + #define UNI_WANT_UTF8_FULL + #define UNI_GIVE_PEEKC c = *p + #define UNI_GIVE_SKIPC p++ + #define UNI_GIVE_OK { *uu = u; return (byte *)p; } + #define UNI_GIVE_BAD u = repl + #include <ucw/unicode-gen.h> } /** @@ -131,57 +96,13 @@ static inline byte *utf8_get_repl(const byte *p, uint *uu, uint repl) **/ static inline byte *utf8_32_get_repl(const byte *p, uint *uu, uint repl) { - uint u = *p++; - uint limit; - if (u < 0x80) - ; - else if (unlikely(u < 0xc0)) - goto bad; - else if (u < 0xe0) - { - u &= 0x1f; - limit = 0x80; - goto get1; - } - else if (u < 0xf0) - { - u &= 0x0f; - limit = 0x800; - goto get2; - } - else if (u < 0xf8) - { - u &= 0x07; - limit = 1 << 16; - goto get3; - } - else if (u < 0xfc) - { - u &= 0x03; - limit = 1 << 21; - goto get4; - } - else if (u < 0xfe) - { - u &= 0x01; - limit = 1 << 26; - UTF8_GET_NEXT; -get4: UTF8_GET_NEXT; -get3: UTF8_GET_NEXT; -get2: UTF8_GET_NEXT; -get1: UTF8_GET_NEXT; - if (unlikely(u < limit)) - goto bad; - } - else - goto bad; - *uu = u; - return (byte *)p; - -bad: - /* Incorrect byte sequence */ - *uu = repl; - return (byte *)p; + #define UNI_WANT_GET_UTF8 + #define
UNI_WANT_UTF8_32 + #define UNI_GIVE_PEEKC c = *p + #define UNI_GIVE_SKIPC p++ + #define UNI_GIVE_OK { *uu = u; return (byte *)p; } + #define UNI_GIVE_BAD u = repl + #include <ucw/unicode-gen.h> } /** @@ -193,6 +114,15 @@ static inline byte *utf8_get(const byte *p, uint *uu) return utf8_get_repl(p, uu, UNI_REPLACEMENT); } +/** + * Decode a value from the range `[0, 0x10FFFF]` (full Unicode range) + * or return `UNI_REPLACEMENT` if the encoding has been corrupted. + **/ +static inline byte *utf8_full_get(const byte *p, uint *uu) +{ + return utf8_full_get_repl(p, uu, UNI_REPLACEMENT); +} + /** * Decode a value from the range `[0, 0x7FFFFFFF]` * or return `UNI_REPLACEMENT` if the encoding has been corrupted. -- 2.39.5