X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=charset%2Fcharconv.c;h=bf431dd1b93569fdf68ade711848801c4e18aedf;hb=d5fdccbecd2acde9a6e067b54fcd69b02f31a820;hp=c5d3717b65cfa8c62281dc21e765cb077ebf83ad;hpb=2926a57424946281a78b9cc5b8098b1cff39e45a;p=libucw.git

diff --git a/charset/charconv.c b/charset/charconv.c
index c5d3717b..bf431dd1 100644
--- a/charset/charconv.c
+++ b/charset/charconv.c
@@ -1,13 +1,16 @@
 /*
- *	Character Set Conversion Library 1.1
+ *	Character Set Conversion Library 1.2
  *
- *	(c) 1998--2001 Martin Mares
+ *	(c) 1998--2004 Martin Mares
+ *	(c) 2007 Pavel Charvat
  *
  *	This software may be freely distributed and used according to the terms
- *	of the GNU General Public License.
+ *	of the GNU Lesser General Public License.
  */
 
-#include "lib/lib.h"
+#include "ucw/lib.h"
+#include "ucw/unicode.h"
+#include "ucw/unaligned.h"
 #include "charset/charconv.h"
 #include "charset/chartable.h"
 
@@ -26,175 +29,343 @@ conv_none(struct conv_context *c)
   return CONV_SOURCE_END | CONV_DEST_END | CONV_SKIP;
 }
 
+/* States of conv_slow(); CLEAN is 0, the value c->state is reset to when nothing is pending */
+enum state {
+  CLEAN,
+  SINGLE_WRITE,
+  SEQ_WRITE,
+  UTF8_READ,
+  UTF8_WRITE_START,
+  UTF8_WRITE_CONT,
+  UTF16_BE_WRITE,
+  UTF16_LE_WRITE,
+  UTF16_BE_READ,
+  UTF16_BE_READ_1,
+  UTF16_BE_READ_2,
+  UTF16_BE_READ_3,
+  UTF16_LE_READ,
+  UTF16_LE_READ_1,
+  UTF16_LE_READ_2,
+  UTF16_LE_READ_3,
+};
+
+/*
+ * Resumable byte-at-a-time state machine, entered with c->state set to one
+ * of the states above.  Returns 0 once a pending write has been completed,
+ * -1 once a whole character has been read into c->code, CONV_SOURCE_END if
+ * it ran out of input and CONV_DEST_END if it ran out of output space; in
+ * the last two cases the progress made so far is kept in the context.
+ */
 static int
-conv_from_utf8(struct conv_context *c)
+conv_slow(struct conv_context *c)
 {
-  unsigned short *x_to_out = c->x_to_out;
   const unsigned char *s = c->source;
   const unsigned char *se = c->source_end;
   unsigned char *d = c->dest;
   unsigned char *de = c->dest_end;
-  unsigned char *strings = string_table - 0x100;
-  unsigned int counter, code, cc;
-
-  if (c->state)
-    goto go_slow;
 
-  while (s < se)			/* Optimized for speed, beware of spaghetti code */
+  switch (c->state)
     {
-      cc = *s++;
-      if (cc < 0x80)
-	code = cc;
-      else if (cc >= 0xc0)
+    case SINGLE_WRITE:
+      if (d >= de)
+	goto cde;
+      *d++ = c->code;
+      break;
+    case SEQ_WRITE:
+seq:
+      while (c->remains)
 	{
-	  if (s + 6 > se)
-	    goto go_slow_1;
-	  if (cc < 0xe0)
-	    {
-	      if ((s[0] & 0xc0) != 0x80)
-		goto nocode;
-	      code = cc & 0x1f;
-	      code = (code << 6) | (*s++ & 0x3f);
-	    }
-	  else if (cc < 0xf0)
-	    {
-	      if ((s[0] & 0xc0) != 0x80 || (s[1] & 0xc0) != 0x80)
-		goto nocode;
-	      code = cc & 0x0f;
-	      code = (code << 6) | (*s++ & 0x3f);
-	      code = (code << 6) | (*s++ & 0x3f);
-	    }
-	  else if (cc < 0xfc)
+	  if (d >= de)
+	    goto cde;
+	  *d++ = *c->string_at++;
+	  c->remains--;
+	}
+      break;
+
+    case UTF8_READ:
+      while (c->remains)
+	{
+	  if (s >= se)
+	    goto cse;
+	  if ((*s & 0xc0) != 0x80)
 	    {
-	      while (cc & 0x80)
-		{
-		  if ((*s++ & 0xc0) != 0x80)
-		    break;
-		  cc <<= 1;
-		}
-	      goto nocode;
+	      c->code = 0xfffd;
+	      break;
 	    }
-	  else
-	    goto nocode;
+	  c->code = (c->code << 6) | (*s++ & 0x3f);
+	  c->remains--;
+	}
+      if (c->code >= 0x10000)
+	c->code = 0xfffd;
+got_char:
+      c->source = s;
+      c->state = 0;
+      return -1;
+
+    /* Writing of UTF-8 */
+    case UTF8_WRITE_START:
+      if (d >= de)
+	goto cde;
+      if (c->code < 0x80)
+	{
+	  *d++ = c->code;
+	  break;
+	}
+      else if (c->code < 0x800)
+	{
+	  *d++ = 0xc0 | (c->code >> 6);
+	  c->code <<= 10;
+	  c->remains = 1;
 	}
       else
 	{
-	nocode:
-	  code = 0xfffd;
+	  *d++ = 0xe0 | (c->code >> 12);
+	  c->code <<= 4;
+	  c->remains = 2;
 	}
-    uni_again:
-      code = x_to_out[uni_to_x[code >> 8U][code & 0xff]];
-    code_again:
-      if (code < 0x100)
+      c->code &= 0xffff;
+      c->state = UTF8_WRITE_CONT;
+      /* fall-thru */
+    case UTF8_WRITE_CONT:
+      while (c->remains)
 	{
 	  if (d >= de)
-	    goto dend;
-	  *d++ = code;
+	    goto cde;
+	  *d++ = 0x80 | ((c->code >> 10) & 0x3f);
+	  c->code <<= 6;
+	  c->remains--;
 	}
+      break;
+
+    /* Writing of UTF-16BE */
+    case UTF16_BE_WRITE:
+      {
+	void *p = &c->code;
+	uns code = c->code;
+	c->string_at = p;
+	if (code < 0xd800 || code - 0xe000 < 0x2000)
+	  {}
+	else if ((code -= 0x10000) < 0x100000)
+	  {
+	    put_u16_be(p, 0xd800 | (code >> 10));
+	    put_u16_be((unsigned char *)p + 2, 0xdc00 | (code & 0x3ff));
+	    c->remains = 4;
+	    c->state = SEQ_WRITE;
+	    goto seq;
+	  }
+	else
+	  code = UNI_REPLACEMENT;
+	put_u16_be(p, code);
+	c->remains = 2;
+	c->state = SEQ_WRITE;
+	goto seq;
+      }
+
+    /* Writing of UTF-16LE */
+    case UTF16_LE_WRITE:
+      {
+	void *p = &c->code;
+	uns code = c->code;
+	c->string_at = p;
+	if (code < 0xd800 || code - 0xe000 < 0x2000)
+	  {}
+	else if ((code -= 0x10000) < 0x100000)
+	  {
+	    put_u16_le(p, 0xd800 | (code >> 10));
+	    put_u16_le((unsigned char *)p + 2, 0xdc00 | (code & 0x3ff));
+	    c->remains = 4;
+	    c->state = SEQ_WRITE;
+	    goto seq;
+	  }
+	else
+	  code = UNI_REPLACEMENT;
+	put_u16_le(p, code);
+	c->remains = 2;
+	c->state = SEQ_WRITE;
+	goto seq;
+      }
+
+    /* Reading of UTF16-BE */
+    case UTF16_BE_READ:
+      if (s >= se)
+	goto cse;
+      c->code = *s++;
+      c->state = UTF16_BE_READ_1;
+      /* fall-thru */
+    case UTF16_BE_READ_1:
+      if (s >= se)
+	goto cse;
+      c->code = (c->code << 8) | *s++;
+      if (c->code - 0xd800 >= 0x800)
+	goto got_char;
+      c->code = (c->code - 0xd800) << 10;
+      c->state = UTF16_BE_READ_2;
+      /* fall-thru */
+    case UTF16_BE_READ_2:
+      if (s >= se)
+	goto cse;
+      if ((uns)(*s - 0xdc) >= 4)
+	c->code = ~0U;
       else
-	{
-	  unsigned char *k = strings + code;
-	  unsigned int len = *k++;
+	c->code |= (*s - 0xdc) << 8;
+      s++;
+      c->state = UTF16_BE_READ_3;
+      /* fall-thru */
+    case UTF16_BE_READ_3:
+      if (s >= se)
+	goto cse;
+      if ((int)c->code >= 0)
+	c->code += 0x10000 + *s;
+      else
+	c->code = UNI_REPLACEMENT;
+      s++;
+      goto got_char;
 
-	  if (d + len > de)
-	    goto dend;
-	  while (len--)
-	    *d++ = *k++;
-	}
+    /* Reading of UTF16-LE */
+    case UTF16_LE_READ:
+      if (s >= se)
+	goto cse;
+      c->code = *s++;
+      c->state = UTF16_LE_READ_1;
+      /* fall-thru */
+    case UTF16_LE_READ_1:
+      if (s >= se)
+	goto cse;
+      c->code |= *s++ << 8;
+      if (c->code - 0xd800 >= 0x800)
+	goto got_char;
+      c->code = (c->code - 0xd800) << 10;
+      c->state = UTF16_LE_READ_2;
+      /* fall-thru */
+    case UTF16_LE_READ_2:
+      if (s >= se)
+	goto cse;
+      c->code |= *s++;
+      c->state = UTF16_LE_READ_3;
+      /* fall-thru */
+    case UTF16_LE_READ_3:
+      if (s >= se)
+	goto cse;
+      if ((uns)(*s - 0xdc) < 4)
+	c->code += 0x10000 + ((*s - 0xdc) << 8);
+      else
+	c->code = UNI_REPLACEMENT;
+      s++;
+      goto got_char;
+
+    default:
+      ASSERT(0);
     }
-  c->state = 0;
-send_noreset:
   c->source = s;
   c->dest = d;
-  return CONV_SOURCE_END;
+  c->state = 0;
+  return 0;
 
-dend:
-  c->state = ~0;
-  c->value = code;
+ cse:
   c->source = s;
+  return CONV_SOURCE_END;
+
+ cde:
   c->dest = d;
   return CONV_DEST_END;
+}
 
-go_slow:
-  code = c->value;
-  counter = c->state;
-  if (counter == ~0U)
-    goto code_again;
-  goto go_slow_2;
+/* Generate inlined routines */
 
-go_slow_1:
-  if (cc < 0xe0) { code = cc & 0x1f; counter = 1; }
-  else if (cc < 0xf0) { code = cc & 0x0f; counter = 2; }
-  else
-    {
-      code = ~0;
-      if (cc < 0xf8) counter = 3;
-      else if (cc < 0xfc) counter = 4;
-      else if (cc < 0xfe) counter = 5;
-      else goto nocode;
-    }
-go_slow_2:
-  while (counter)
-    {
-      if (s >= se)
-	{
-	  c->state = counter;
-	  c->value = code;
-	  goto send_noreset;
-	}
-      if ((*s & 0xc0) != 0x80)
-	goto nocode;
-      code = (code << 6) | (*s++ & 0x3f);
-      counter--;
-    }
-  if (code >= 0x10000)
-    goto nocode;
-  goto uni_again;
+static int
+conv_std_to_utf8(struct conv_context *c)
+{
+#define CONV_READ_STD
+#define CONV_WRITE_UTF8
+#include "charset/charconv-gen.h"
 }
 
 static int
-conv_to_utf8(struct conv_context *c)
+conv_utf8_to_std(struct conv_context *c)
 {
-  unsigned short *in_to_x = c->in_to_x;
-  const unsigned char *s = c->source;
-  const unsigned char *se = c->source_end;
-  unsigned char *d = c->dest;
-  unsigned char *de = c->dest_end;
+#define CONV_READ_UTF8
+#define CONV_WRITE_STD
+#include "charset/charconv-gen.h"
+}
 
-  while (s < se)
-    {
-      unsigned int code = x_to_uni[in_to_x[*s]];
-      if (code < 0x80)
-	{
-	  if (d >= de)
-	    goto dend;
-	  *d++ = code;
-	}
-      else if (code < 0x800)
-	{
-	  if (d + 2 > de)
-	    goto dend;
-	  *d++ = 0xc0 | (code >> 6);
-	  *d++ = 0x80 | (code & 0x3f);
-	}
-      else
-	{
-	  if (d + 3 > de)
-	    goto dend;
-	  *d++ = 0xc0 | (code >> 12);
-	  *d++ = 0x80 | ((code >> 6) & 0x3f);
-	  *d++ = 0x80 | (code & 0x3f);
-	}
-      s++;
-    }
-  c->source = s;
-  c->dest = d;
-  return CONV_SOURCE_END;
+static int
+conv_std_to_utf16_be(struct conv_context *c)
+{
+#define CONV_READ_STD
+#define CONV_WRITE_UTF16_BE
+#include "charset/charconv-gen.h"
+}
 
-dend:
-  c->source = s;
-  c->dest = d;
-  return CONV_DEST_END;
+static int
+conv_utf16_be_to_std(struct conv_context *c)
+{
+#define CONV_READ_UTF16_BE
+#define CONV_WRITE_STD
+#include "charset/charconv-gen.h"
+}
+
+static int
+conv_std_to_utf16_le(struct conv_context *c)
+{
+#define CONV_READ_STD
+#define CONV_WRITE_UTF16_LE
+#include "charset/charconv-gen.h"
+}
+
+static int
+conv_utf16_le_to_std(struct conv_context *c)
+{
+#define CONV_READ_UTF16_LE
+#define CONV_WRITE_STD
+#include "charset/charconv-gen.h"
+}
+
+static int
+conv_utf8_to_utf16_be(struct conv_context *c)
+{
+#define CONV_READ_UTF8
+#define CONV_WRITE_UTF16_BE
+#include "charset/charconv-gen.h"
+}
+
+static int
+conv_utf16_be_to_utf8(struct conv_context *c)
+{
+#define CONV_READ_UTF16_BE
+#define CONV_WRITE_UTF8
+#include "charset/charconv-gen.h"
+}
+
+static int
+conv_utf8_to_utf16_le(struct conv_context *c)
+{
+#define CONV_READ_UTF8
+#define CONV_WRITE_UTF16_LE
+#include "charset/charconv-gen.h"
+}
+
+static int
+conv_utf16_le_to_utf8(struct conv_context *c)
+{
+#define CONV_READ_UTF16_LE
+#define CONV_WRITE_UTF8
+#include "charset/charconv-gen.h"
+}
+
+static int
+conv_utf16_le_to_utf16_be(struct conv_context *c)
+{
+#define CONV_READ_UTF16_LE
+#define CONV_WRITE_UTF16_BE
+#include "charset/charconv-gen.h"
+}
+
+static int
+conv_utf16_be_to_utf16_le(struct conv_context *c)
+{
+#define CONV_READ_UTF16_BE
+#define CONV_WRITE_UTF16_LE
+#include "charset/charconv-gen.h"
 }
 
 static int
@@ -202,28 +373,33 @@ conv_standard(struct conv_context *c)
 {
   unsigned short *in_to_x = c->in_to_x;
   unsigned short *x_to_out = c->x_to_out;
-  const unsigned char *s = c->source;
-  const unsigned char *se = c->source_end;
-  unsigned char *d = c->dest;
-  unsigned char *de = c->dest_end;
-  unsigned char *strings = string_table - 0x100;
+  const unsigned char *s, *se;
+  unsigned char *d, *de, *k;
+  unsigned int len, e;
+
+  if (unlikely(c->state))
+    goto slow;
+main:
+  s = c->source;
+  se = c->source_end;
+  d = c->dest;
+  de = c->dest_end;
 
   while (s < se)
     {
       unsigned int code = x_to_out[in_to_x[*s]];
       if (code < 0x100)
 	{
-	  if (d >= de)
+	  if (unlikely(d >= de))
 	    goto dend;
 	  *d++ = code;
 	}
       else
 	{
-	  unsigned char *k = strings + code;
-	  unsigned int len = *k++;
-
-	  if (d + len > de)
-	    goto dend;
+	  k = string_table + code - 0x100;
+	  len = *k++;
+	  if (unlikely(d + len > de))
+	    goto dend_str;
 	  while (len--)
 	    *d++ = *k++;
 	}
@@ -237,24 +413,54 @@ dend:
   c->source = s;
   c->dest = d;
   return CONV_DEST_END;
+
+dend_str:
+  c->source = s + 1;	/* this input byte is consumed; conv_slow() finishes its output string */
+  c->dest = d;
+  c->state = SEQ_WRITE;
+  c->string_at = k;
+  c->remains = len;
+slow:
+  e = conv_slow(c);
+  if (e)
+    return e;
+  goto main;
 }
 
+/*
+ * Select the conversion routine for the given source and destination
+ * charsets and reset the converter state.  Charsets other than UTF-8 and
+ * UTF-16 are converted through the in_to_x / x_to_out tables.
+ */
 void
 conv_set_charset(struct conv_context *c, int src, int dest)
 {
+  c->source_charset = src;
+  c->dest_charset = dest;
   if (src == dest)
-    c->convert = conv_none;
+    {
+      c->convert = conv_none;
+      c->in_to_x = NULL;
+      c->x_to_out = NULL;
+    }
   else
     {
-      c->convert = conv_standard;
-      if (src == CONV_CHARSET_UTF8)
-	c->convert = conv_from_utf8;
-      else
-	c->in_to_x = input_to_x[src];
-      if (dest == CONV_CHARSET_UTF8)
-	c->convert = conv_to_utf8;
-      else
-	c->x_to_out = x_to_output[dest];
+      static uns lookup[] = {
+	[CONV_CHARSET_UTF8] = 1,
+	[CONV_CHARSET_UTF16_BE] = 2,
+	[CONV_CHARSET_UTF16_LE] = 3,
+      };
+      static int (*tab[4][4])(struct conv_context *c) = {
+	{ conv_standard,	conv_std_to_utf8,	conv_std_to_utf16_be,	conv_std_to_utf16_le },
+	{ conv_utf8_to_std,	conv_none,		conv_utf8_to_utf16_be,	conv_utf8_to_utf16_le },
+	{ conv_utf16_be_to_std,	conv_utf16_be_to_utf8,	conv_none,		conv_utf16_be_to_utf16_le },
+	{ conv_utf16_le_to_std,	conv_utf16_le_to_utf8,	conv_utf16_le_to_utf16_be,	conv_none },
+      };
+      uns src_idx = ((uns)src < ARRAY_SIZE(lookup)) ? lookup[src] : 0;
+      uns dest_idx = ((uns)dest < ARRAY_SIZE(lookup)) ? lookup[dest] : 0;
+      c->convert = tab[src_idx][dest_idx];
+      c->in_to_x = src_idx ? NULL : input_to_x[src];
+      c->x_to_out = dest_idx ? NULL : x_to_output[dest];
     }
   c->state = 0;
 }
@@ -276,3 +482,32 @@ conv_x_count(void)
 {
   return sizeof(x_to_uni) / sizeof(x_to_uni[0]);
 }
+
+/*
+ * Translate byte y of the source charset to its x_to_uni[] entry.
+ * Dereferences c->in_to_x, which conv_set_charset() leaves NULL for
+ * UTF-8/UTF-16 sources and when source and destination are the same.
+ */
+int
+conv_in_to_ucs(struct conv_context *c, unsigned int y)
+{
+  return x_to_uni[c->in_to_x[y]];
+}
+
+/*
+ * Translate code point ucs to a single byte of the destination charset.
+ * Returns -1 when ucs is above 0xffff, when uni_to_x[] yields 256, or
+ * when the x_to_out[] entry is >= 256 (conv_standard() treats such
+ * entries as string_table references, not as output bytes).
+ */
+int conv_ucs_to_out(struct conv_context *c, unsigned int ucs)
+{
+  uns x;
+
+  if (ucs >= 0x10000)
+    return -1;
+  x = uni_to_x[ucs >> 8U][ucs & 0xff];
+  if (x == 256 || c->x_to_out[x] >= 256)
+    return -1;
+  return c->x_to_out[x];
+}