From 542dedf4edb4302c008004d751a9bb055dfaee3d Mon Sep 17 00:00:00 2001
From: Robert Spalek
Date: Fri, 20 Aug 2004 09:34:02 +0000
Subject: [PATCH] - added {get,put}_utf8_32() for all full 6-byte codes - fixed UTF8_SKIP_BWD(); it is never used - upgraded utf8_space() and utf8_encoding_len()

---
 lib/ff-utf8.c |  94 +++++++++++++++++++++++++++++++++++++++++++++
 lib/ff-utf8.h |  29 +++++++++++++-
 lib/unicode.h | 104 ++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 222 insertions(+), 5 deletions(-)

diff --git a/lib/ff-utf8.c b/lib/ff-utf8.c
index f55719b1..b30cb581 100644
--- a/lib/ff-utf8.c
+++ b/lib/ff-utf8.c
@@ -2,6 +2,7 @@
  *	Sherlock Library: Reading and writing of UTF-8 on Fastbuf Streams
  *
  *	(c) 2001--2004 Martin Mares
+ *	(c) 2004 Robert Spalek
  *
  *	This software may be freely distributed and used according to the terms
  *	of the GNU Lesser General Public License.
@@ -53,6 +54,62 @@ bget_utf8_slow(struct fastbuf *b)
   return UNI_REPLACEMENT;
 }
 
+int
+bget_utf8_32_slow(struct fastbuf *b)	/* Byte-wise decoder: reads one UTF-8 sequence of up to 6 bytes with bgetc() */
+{
+  int c = bgetc(b);	/* Lead byte of the sequence */
+  int code;	/* Value assembled so far */
+  int nr;	/* Continuation bytes still expected */
+
+  if (c < 0x80)				/* Includes EOF */
+    return c;
+  if (c < 0xc0)				/* Incorrect combination */
+    return UNI_REPLACEMENT;
+  if (c < 0xe0)	/* 110xxxxx: 5 payload bits, 1 continuation byte */
+    {
+      code = c & 0x1f;
+      nr = 1;
+    }
+  else if (c < 0xf0)	/* 1110xxxx: 4 payload bits, 2 continuation bytes */
+    {
+      code = c & 0x0f;
+      nr = 2;
+    }
+  else if (c < 0xf8)	/* 11110xxx: 3 payload bits, 3 continuation bytes */
+    {
+      code = c & 0x07;
+      nr = 3;
+    }
+  else if (c < 0xfc)	/* 111110xx: 2 payload bits, 4 continuation bytes */
+    {
+      code = c & 0x03;
+      nr = 4;
+    }
+  else if (c < 0xfe)	/* 1111110x: 1 payload bit, 5 continuation bytes */
+    {
+      code = c & 0x01;
+      nr = 5;
+    }
+  else					/* Too large, skip it */
+    {
+      while ((c = bgetc(b)) >= 0x80 && c < 0xc0)	/* Consume every following continuation byte (10xxxxxx) */
+	;
+      goto wrong;
+    }
+  while (nr-- > 0)	/* Append 6 payload bits per continuation byte */
+    {
+      if ((c = bgetc(b)) < 0x80 || c >= 0xc0)	/* Not a continuation byte (or EOF): sequence is truncated */
+	goto wrong;
+      code = (code << 6) | (c & 0x3f);
+    }
+  return code;
+
+ wrong:	/* Malformed input: c holds the byte that ended the sequence */
+  if (c >= 0)	/* Unget that byte unless bgetc() returned a negative value (EOF) */
+    bungetc(b);
+  return UNI_REPLACEMENT;
+}
+
 void
 bput_utf8_slow(struct fastbuf *b, uns u)
 {
@@ -71,3 +128,40 @@ bput_utf8_slow(struct fastbuf *b, uns u)
       bputc(b, 0x80 | (u & 0x3f));
     }
 }
+
+void
+bput_utf8_32_slow(struct fastbuf *b, uns u)	/* Byte-wise encoder: writes u as 1 to 6 UTF-8 bytes with bputc() */
+{
+  ASSERT(u < (1U<<31));	/* The 6-byte form carries at most 31 bits */
+  if (u < 0x80)
+    bputc(b, u);
+  else	/* Nested ifs pick the lead byte; continuation bytes are emitted as each level unwinds */
+    {
+      if (u < 0x800)
+	bputc(b, 0xc0 | (u >> 6));
+      else
+	{
+	  if (u < (1<<16))
+	    bputc(b, 0xe0 | (u >> 12));
+	  else
+	    {
+	      if (u < (1<<21))
+		bputc(b, 0xf0 | (u >> 18));
+	      else
+		{
+		  if (u < (1<<26))
+		    bputc(b, 0xf8 | (u >> 24));
+		  else
+		    {
+		      bputc(b, 0xfc | (u >> 30));	/* 6-byte lead: only bit 30 remains */
+		      bputc(b, 0x80 | ((u >> 24) & 0x3f));
+		    }
+		  bputc(b, 0x80 | ((u >> 18) & 0x3f));
+		}
+	      bputc(b, 0x80 | ((u >> 12) & 0x3f));
+	    }
+	  bputc(b, 0x80 | ((u >> 6) & 0x3f));
+	}
+      bputc(b, 0x80 | (u & 0x3f));	/* Lowest 6 bits end every multi-byte form */
+    }
+}
diff --git a/lib/ff-utf8.h b/lib/ff-utf8.h
index 752c7187..dd86ba65 100644
--- a/lib/ff-utf8.h
+++ b/lib/ff-utf8.h
@@ -2,6 +2,7 @@
  *	Sherlock Library: Reading and writing of UTF-8 on Fastbuf Streams
  *
  *	(c) 2001--2004 Martin Mares
+ *	(c) 2004 Robert Spalek
  *
  *	This software may be freely distributed and used according to the terms
  *	of the GNU Lesser General Public License.
@@ -14,14 +15,16 @@
 #include "lib/unicode.h"
 
 int bget_utf8_slow(struct fastbuf *b);
+int bget_utf8_32_slow(struct fastbuf *b);	/* Fallback used by bget_utf8_32() */
 void bput_utf8_slow(struct fastbuf *b, uns u);
+void bput_utf8_32_slow(struct fastbuf *b, uns u);	/* Fallback used by bput_utf8_32() */
 
 static inline int
 bget_utf8(struct fastbuf *b)
 {
   uns u;
 
-  if (bavailr(b) >= 5)
+  if (bavailr(b) >= 3)	/* NOTE(review): lowered from 5; assumes GET_UTF8 never touches more than 3 bytes - confirm against GET_UTF8_CHAR's handling of lead bytes >= 0xf0 */
     {
       GET_UTF8(b->bptr, u);
       return u;
@@ -40,4 +43,28 @@ bput_utf8(struct fastbuf *b, uns u)
     bput_utf8_slow(b, u);
 }
 
+static inline int
+bget_utf8_32(struct fastbuf *b)	/* Read one UTF-8 character of up to 6 bytes from b */
+{
+  uns u;
+
+  if (bavailr(b) >= 6)	/* Decode in place when bavailr() reports at least 6 bytes */
+    {
+      GET_UTF8_32(b->bptr, u);	/* NOTE(review): after a 0xfe/0xff lead GET_UTF8_32_CHAR skips continuation bytes without bound, which can step past the 6 bytes checked above - confirm buffer guarantees */
+      return u;
+    }
+  else
+    return bget_utf8_32_slow(b);
+}
+
+static inline void
+bput_utf8_32(struct fastbuf *b, uns u)	/* Write u (< 2^31) as UTF-8 to b */
+{
+  ASSERT(u < (1U<<31));
+  if (bavailw(b) >= 6)	/* Encode in place when bavailw() reports room for the longest (6-byte) form */
+    PUT_UTF8_32(b->bptr, u);
+  else
+    bput_utf8_32_slow(b, u);
+}
+
 #endif
diff --git a/lib/unicode.h b/lib/unicode.h
index 199b3d70..6358e9aa 100644
--- a/lib/unicode.h
+++ b/lib/unicode.h
@@ -2,6 +2,7 @@
  *	Sherlock Library -- Unicode Characters
  *
  *	(c) 1997--2004 Martin Mares
+ *	(c) 2004 Robert Spalek
  *
  *	This software may be freely distributed and used according to the terms
  *	of the GNU Lesser General Public License.
@@ -30,6 +31,35 @@
       }						\
   } while(0)
 
+#define PUT_UTF8_32(p,u) do {	/* Store u as UTF-8 at p, advancing p; p and u are unparenthesized and evaluated several times */	\
+    if (u < (1<<16))				\
+      PUT_UTF8(p,u);	/* Forms of up to 3 bytes are delegated to PUT_UTF8 */	\
+    else if (u < (1<<21))	/* 4-byte form */	\
+      {						\
+	*p++ = 0xf0 | (u >> 18);		\
+	*p++ = 0x80 | ((u >> 12) & 0x3f);	\
+	*p++ = 0x80 | ((u >> 6) & 0x3f);	\
+	*p++ = 0x80 | (u & 0x3f);		\
+      }						\
+    else if (u < (1<<26))	/* 5-byte form */	\
+      {						\
+	*p++ = 0xf8 | (u >> 24);		\
+	*p++ = 0x80 | ((u >> 18) & 0x3f);	\
+	*p++ = 0x80 | ((u >> 12) & 0x3f);	\
+	*p++ = 0x80 | ((u >> 6) & 0x3f);	\
+	*p++ = 0x80 | (u & 0x3f);		\
+      }						\
+    else if (u < (1U<<31))	/* 6-byte form; values of 1U<<31 or more store nothing */	\
+      {						\
+	*p++ = 0xfc | (u >> 30);		\
+	*p++ = 0x80 | ((u >> 24) & 0x3f);	\
+	*p++ = 0x80 | ((u >> 18) & 0x3f);	\
+	*p++ = 0x80 | ((u >> 12) & 0x3f);	\
+	*p++ = 0x80 | ((u >> 6) & 0x3f);	\
+	*p++ = 0x80 | (u & 0x3f);		\
+      }						\
+  } while(0)
+
 #define IS_UTF8(c) ((c) >= 0xc0)
 
 #define GET_UTF8_CHAR(p,u) do {			\
@@ -56,12 +86,66 @@
       }						\
   } while (0)					\
 
+#define GET_UTF8_32_CHAR(p,u) do {	/* Decode a multi-byte sequence at p into u, advancing p; a missing continuation byte stops consumption and leaves a partial value in u */	\
+    if (*p < 0xf0)				\
+      GET_UTF8_CHAR(p,u);	/* Lead bytes below 0xf0 are delegated to GET_UTF8_CHAR */	\
+    else if (*p < 0xf8)	/* 4-byte form: up to 3 continuation bytes */	\
+      {						\
+	u = *p++ & 0x07;			\
+	if ((*p & 0xc0) == 0x80)		\
+	  u = (u << 6) | (*p++ & 0x3f);		\
+	if ((*p & 0xc0) == 0x80)		\
+	  u = (u << 6) | (*p++ & 0x3f);		\
+	if ((*p & 0xc0) == 0x80)		\
+	  u = (u << 6) | (*p++ & 0x3f);		\
+      }						\
+    else if (*p < 0xfc)	/* 5-byte form: up to 4 continuation bytes */	\
+      {						\
+	u = *p++ & 0x03;			\
+	if ((*p & 0xc0) == 0x80)		\
+	  u = (u << 6) | (*p++ & 0x3f);		\
+	if ((*p & 0xc0) == 0x80)		\
+	  u = (u << 6) | (*p++ & 0x3f);		\
+	if ((*p & 0xc0) == 0x80)		\
+	  u = (u << 6) | (*p++ & 0x3f);		\
+	if ((*p & 0xc0) == 0x80)		\
+	  u = (u << 6) | (*p++ & 0x3f);		\
+      }						\
+    else if (*p < 0xfe)	/* 6-byte form: up to 5 continuation bytes */	\
+      {						\
+	u = *p++ & 0x01;			\
+	if ((*p & 0xc0) == 0x80)		\
+	  u = (u << 6) | (*p++ & 0x3f);		\
+	if ((*p & 0xc0) == 0x80)		\
+	  u = (u << 6) | (*p++ & 0x3f);		\
+	if ((*p & 0xc0) == 0x80)		\
+	  u = (u << 6) | (*p++ & 0x3f);		\
+	if ((*p & 0xc0) == 0x80)		\
+	  u = (u << 6) | (*p++ & 0x3f);		\
+	if ((*p & 0xc0) == 0x80)		\
+	  u = (u << 6) | (*p++ & 0x3f);		\
+      }						\
+    else					\
+      {	/* Too large, use replacement char */	\
+	p++;					\
+	while ((*p & 0xc0) == 0x80)	/* No upper bound: stops only at the first non-continuation byte */	\
+	  p++;					\
+	u = UNI_REPLACEMENT;			\
+      }						\
+  } while (0)	/* NOTE(review): the trailing backslash splices the following (blank) line into the macro */	\
+
 #define GET_UTF8(p,u)				\
     if (IS_UTF8(*p))				\
       GET_UTF8_CHAR(p,u);			\
     else					\
       u = *p++
 
+#define GET_UTF8_32(p,u)	/* Decode one character at p into u; bare if/else, so unsafe as the body of an unbraced if */	\
+    if (IS_UTF8(*p))				\
+      GET_UTF8_32_CHAR(p,u);			\
+    else					\
+      u = *p++
+
 #define UTF8_SKIP(p) do {			\
     uns c = *p++;				\
     if (c >= 0xc0)				\
@@ -69,7 +153,7 @@
 	p++, c <<= 1;				\
   } while (0)
 
-#define UTF8_SKIP_BWD(p) while ((--*(p) & 0xc0) == 0x80)
+#define UTF8_SKIP_BWD(p) while ((*--(p) & 0xc0) == 0x80)	/* Steps p back until it points at a non-continuation byte; expands to a while with no body, so the caller's ';' supplies it */
 
 static inline uns
 utf8_space(uns u)
@@ -78,7 +162,13 @@ utf8_space(uns u)
     return 1;
   if (u < 0x800)
     return 2;
-  return 3;
+  if (u < (1<<16))	/* Fits in 16 bits */
+    return 3;
+  if (u < (1<<21))	/* Fits in 21 bits */
+    return 4;
+  if (u < (1<<26))	/* Fits in 26 bits */
+    return 5;
+  return 6;	/* Everything else is counted as the 6-byte form */
 }
 
 static inline uns
@@ -86,10 +176,16 @@ utf8_encoding_len(uns c)
 {
   if (c < 0x80)
     return 1;
-  ASSERT(c >= 0xc0 && c < 0xf0);
+  ASSERT(c >= 0xc0 && c < 0xfe);	/* c must be a lead byte: continuation bytes and 0xfe/0xff are rejected */
   if (c < 0xe0)
     return 2;
-  return 3;
+  if (c < 0xf0)	/* 1110xxxx */
+    return 3;
+  if (c < 0xf8)	/* 11110xxx */
+    return 4;
+  if (c < 0xfc)	/* 111110xx */
+    return 5;
+  return 6;	/* 1111110x */
 }
 
 /* unicode-utf8.c */
-- 
2.39.5