X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;ds=sidebyside;f=lib%2Fff-unicode.c;h=6057e2408dd0c3b9328079d2089dc3077fe97dcd;hb=e371dcc1cd2857036374dd9597705faed0427006;hp=69a5247b0500c109c89c5dc1c36f5b06186e29e9;hpb=10f1d0ab666c28cf8aeca9c04a254af5c6ed6b22;p=libucw.git diff --git a/lib/ff-unicode.c b/lib/ff-unicode.c index 69a5247b..6057e240 100644 --- a/lib/ff-unicode.c +++ b/lib/ff-unicode.c @@ -12,6 +12,9 @@ #include "lib/fastbuf.h" #include "lib/unicode.h" #include "lib/ff-unicode.h" +#include "lib/ff-binary.h" + +/*** UTF-8 ***/ int bget_utf8_slow(struct fastbuf *b, uns repl) @@ -165,3 +168,179 @@ bput_utf8_32_slow(struct fastbuf *b, uns u) bputc(b, 0x80 | (u & 0x3f)); } } + +/*** UTF-16 ***/ + +int +bget_utf16_be_slow(struct fastbuf *b, uns repl) +{ + if (bpeekc(b) < 0) + return -1; + uns u = bgetw_be(b), x, y; + if ((int)u < 0) + return repl; + if ((x = u - 0xd800) >= 0x800) + return u; + if (x >= 0x400 || bpeekc(b) < 0 || (y = bgetw_be(b) - 0xdc00) >= 0x400) + return repl; + return 0x10000 + (x << 10) + y; +} + +int +bget_utf16_le_slow(struct fastbuf *b, uns repl) +{ + if (bpeekc(b) < 0) + return -1; + uns u = bgetw_le(b), x, y; + if ((int)u < 0) + return repl; + if ((x = u - 0xd800) >= 0x800) + return u; + if (x >= 0x400 || bpeekc(b) < 0 || (y = bgetw_le(b) - 0xdc00) >= 0x400) + return repl; + return 0x10000 + (x << 10) + y; +} + +void +bput_utf16_be_slow(struct fastbuf *b, uns u) +{ + if (u < 0xd800 || (u < 0x10000 && u >= 0xe000)) + { + bputc(b, u >> 8); + bputc(b, u & 0xff); + } + else if ((u -= 0x10000) < 0x100000) + { + bputc(b, 0xd8 | (u >> 18)); + bputc(b, (u >> 10) & 0xff); + bputc(b, 0xdc | ((u >> 8) & 0x3)); + bputc(b, u & 0xff); + } + else + ASSERT(0); +} + +void +bput_utf16_le_slow(struct fastbuf *b, uns u) +{ + if (u < 0xd800 || (u < 0x10000 && u >= 0xe000)) + { + bputc(b, u & 0xff); + bputc(b, u >> 8); + } + else if ((u -= 0x10000) < 0x100000) + { + bputc(b, (u >> 10) & 0xff); + bputc(b, 0xd8 | (u >> 18)); + bputc(b, u & 0xff); + bputc(b, 0xdc | ((u >> 8) & 0x3)); + } + else + ASSERT(0); +} + +#ifdef TEST + +#include +#include + +int main(int argc, char **argv) +{ +#define FUNCS \ + F(BGET_UTF8) F(BGET_UTF8_32) F(BGET_UTF16_BE) F(BGET_UTF16_LE) \ + F(BPUT_UTF8) F(BPUT_UTF8_32) F(BPUT_UTF16_BE) F(BPUT_UTF16_LE) + + enum { +#define F(x) FUNC_##x, + FUNCS +#undef F + }; + char *names[] = { +#define F(x) [FUNC_##x] = #x, + FUNCS +#undef F + }; + + uns func = ~0U; + if (argc > 1) + for (uns i = 0; i < ARRAY_SIZE(names); i++) + if (!strcasecmp(names[i], argv[1])) + func = i; + if (!~func) + { + fprintf(stderr, "Invalid usage!\n"); + return 1; + } + + struct fastbuf *b = fbgrow_create(8); + if (func < FUNC_BPUT_UTF8) + { + uns u; + while (scanf("%x", &u) == 1) + bputc(b, u); + fbgrow_rewind(b); + while (bpeekc(b) >= 0) + { + if (btell(b)) + putchar(' '); + switch (func) + { + case FUNC_BGET_UTF8: + u = bget_utf8_slow(b, UNI_REPLACEMENT); + break; + case FUNC_BGET_UTF8_32: + u = bget_utf8_32_slow(b, UNI_REPLACEMENT); + break; + case FUNC_BGET_UTF16_BE: + u = bget_utf16_be_slow(b, UNI_REPLACEMENT); + break; + case FUNC_BGET_UTF16_LE: + u = bget_utf16_le_slow(b, UNI_REPLACEMENT); + break; + default: + ASSERT(0); + } + printf("%04x", u); + } + putchar('\n'); + } + else + { + uns u, i = 0; + while (scanf("%x", &u) == 1) + { + switch (func) + { + case FUNC_BPUT_UTF8: + bput_utf8_slow(b, u); + break; + case FUNC_BPUT_UTF8_32: + bput_utf8_32_slow(b, u); + break; + case FUNC_BPUT_UTF16_BE: + bput_utf16_be_slow(b, u); + break; + case FUNC_BPUT_UTF16_LE: + bput_utf16_le_slow(b, u); + break; + default: + ASSERT(0); + } + fbgrow_rewind(b); + u = 0; + while (bpeekc(b) >= 0) + { + if (i++) + putchar(' '); + printf("%02x", bgetc(b)); + } + fbgrow_reset(b); + } + putchar('\n'); + } + bclose(b); + + return 0; +} + +#endif