X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=charset%2Fcharconv.c;h=2418f4d2567223672dcedfd6226d5848d13640e8;hb=9adf0cac23ff0639f64a9d510c71e5aab44c27e9;hp=ed07f2f96e4adeb1a54958c592614caa26b3485e;hpb=fece0021c9d669b6e0175906d0137a857b4bf4a9;p=libucw.git diff --git a/charset/charconv.c b/charset/charconv.c index ed07f2f9..2418f4d2 100644 --- a/charset/charconv.c +++ b/charset/charconv.c @@ -2,14 +2,17 @@ * Character Set Conversion Library 1.2 * * (c) 1998--2004 Martin Mares + * (c) 2007 Pavel Charvat * * This software may be freely distributed and used according to the terms * of the GNU Lesser General Public License. */ -#include "lib/lib.h" -#include "charset/charconv.h" -#include "charset/chartable.h" +#include +#include +#include +#include +#include void conv_init(struct conv_context *c) @@ -32,7 +35,17 @@ enum state { SEQ_WRITE, UTF8_READ, UTF8_WRITE_START, - UTF8_WRITE_CONT + UTF8_WRITE_CONT, + UTF16_BE_WRITE, + UTF16_LE_WRITE, + UTF16_BE_READ, + UTF16_BE_READ_1, + UTF16_BE_READ_2, + UTF16_BE_READ_3, + UTF16_LE_READ, + UTF16_LE_READ_1, + UTF16_LE_READ_2, + UTF16_LE_READ_3, }; static int @@ -51,6 +64,7 @@ conv_slow(struct conv_context *c) *d++ = c->code; break; case SEQ_WRITE: +seq: while (c->remains) { if (d >= de) @@ -59,6 +73,7 @@ conv_slow(struct conv_context *c) c->remains--; } break; + case UTF8_READ: while (c->remains) { @@ -74,9 +89,12 @@ conv_slow(struct conv_context *c) } if (c->code >= 0x10000) c->code = 0xfffd; +got_char: c->source = s; c->state = 0; return -1; + + /* Writing of UTF-8 */ case UTF8_WRITE_START: if (d >= de) goto cde; @@ -97,6 +115,7 @@ conv_slow(struct conv_context *c) c->code <<= 4; c->remains = 2; } + c->code &= 0xffff; c->state = UTF8_WRITE_CONT; /* fall-thru */ case UTF8_WRITE_CONT: @@ -109,6 +128,124 @@ conv_slow(struct conv_context *c) c->remains--; } break; + + /* Writing of UTF-16BE */ + case UTF16_BE_WRITE: + { + void *p = &c->code; + c->string_at = p; + uint code = c->code; + c->string_at = p; + if (code < 0xd800 || code - 0xe000 < 0x2000) + {} + else if ((code -= 0x10000) < 0x100000) + { + put_u16_be(p, 0xd800 | (code >> 10)); + put_u16_be(p + 2, 0xdc00 | (code & 0x3ff)); + c->remains = 4; + c->state = SEQ_WRITE; + goto seq; + } + else + code = UNI_REPLACEMENT; + put_u16_be(p, code); + c->remains = 2; + c->state = SEQ_WRITE; + goto seq; + } + + /* Writing of UTF-16LE */ + case UTF16_LE_WRITE: + { + void *p = &c->code; + c->string_at = p; + uint code = c->code; + c->string_at = p; + if (code < 0xd800 || code - 0xe000 < 0x2000) + {} + else if ((code -= 0x10000) < 0x100000) + { + put_u16_le(p, 0xd800 | (code >> 10)); + put_u16_le(p + 2, 0xdc00 | (code & 0x3ff)); + c->remains = 4; + c->state = SEQ_WRITE; + } + else + code = UNI_REPLACEMENT; + put_u16_le(p, code); + c->remains = 2; + c->state = SEQ_WRITE; + goto seq; + } + + /* Reading of UTF16-BE */ + case UTF16_BE_READ: + if (s >= se) + goto cse; + c->code = *s++; + c->state = UTF16_BE_READ_1; + /* fall-thru */ + case UTF16_BE_READ_1: + if (s >= se) + goto cse; + c->code = (c->code << 8) | *s++; + if (c->code - 0xd800 >= 0x800) + goto got_char; + c->code = (c->code - 0xd800) << 10; + c->state = UTF16_BE_READ_2; + /* fall-thru */ + case UTF16_BE_READ_2: + if (s >= se) + goto cse; + if (*s - 0xdc >= 4) + c->code = ~0U; + else + c->code |= (*s - 0xdc) << 8; + s++; + c->state = UTF16_BE_READ_3; + /* fall-thru */ + case UTF16_BE_READ_3: + if (s >= se) + goto cse; + if ((int)c->code >= 0) + c->code += 0x10000 + *s; + else + c->code = UNI_REPLACEMENT; + s++; + goto got_char; + + /* Reading of UTF16-LE */ + case UTF16_LE_READ: + if (s >= se) + goto cse; + c->code = *s++; + c->state = UTF16_LE_READ_1; + /* fall-thru */ + case UTF16_LE_READ_1: + if (s >= se) + goto cse; + c->code |= *s++ << 8; + if (c->code - 0xd800 >= 0x800) + goto got_char; + c->code = (c->code - 0xd800) << 10; + c->state = UTF16_LE_READ_2; + /* fall-thru */ + case UTF16_LE_READ_2: + if (s >= se) + goto cse; + c->code |= *s++; + c->state = UTF16_LE_READ_3; + /* fall-thru */ + case UTF16_LE_READ_3: + if (s >= se) + goto cse; + if (*s - 0xdc < 4) + c->code += 0x10000 + ((*s - 0xdc) << 8); + else + c->code = UNI_REPLACEMENT; + s++; + goto got_char; + default: ASSERT(0); } @@ -126,191 +263,94 @@ conv_slow(struct conv_context *c) return CONV_DEST_END; } +/* Generate inlined routines */ + static int -conv_from_utf8(struct conv_context *c) +conv_std_to_utf8(struct conv_context *c) { - unsigned short *x_to_out = c->x_to_out; - const unsigned char *s, *se; - unsigned char *d, *de, *k; - unsigned int code, cc, len; - int e; +#define CONV_READ_STD +#define CONV_WRITE_UTF8 +#include +} - if (unlikely(c->state)) - goto slow; +static int +conv_utf8_to_std(struct conv_context *c) +{ +#define CONV_READ_UTF8 +#define CONV_WRITE_STD +#include +} -main: - s = c->source; - se = c->source_end; - d = c->dest; - de = c->dest_end; - while (s < se) /* Optimized for speed, beware of spaghetti code */ - { - cc = *s++; - if (cc < 0x80) - code = cc; - else if (cc >= 0xc0) - { - if (s + 6 > se) - goto send_utf; - if (cc < 0xe0) - { - if ((s[0] & 0xc0) != 0x80) - goto nocode; - code = cc & 0x1f; - code = (code << 6) | (*s++ & 0x3f); - } - else if (cc < 0xf0) - { - if ((s[0] & 0xc0) != 0x80 || (s[1] & 0xc0) != 0x80) - goto nocode; - code = cc & 0x0f; - code = (code << 6) | (*s++ & 0x3f); - code = (code << 6) | (*s++ & 0x3f); - } - else if (cc < 0xfc) - { - while (cc & 0x80) - { - if ((*s++ & 0xc0) != 0x80) - break; - cc <<= 1; - } - goto nocode; - } - else - goto nocode; - } - else - { - nocode: - code = 0xfffd; - } - got_code: - code = x_to_out[uni_to_x[code >> 8U][code & 0xff]]; - if (code < 0x100) - { - if (d >= de) - goto dend_char; - *d++ = code; - } - else - { - k = string_table + code - 0x100; - len = *k++; - if (d + len > de) - goto dend_str; - while (len--) - *d++ = *k++; - } - } - c->source = s; - c->dest = d; - return CONV_SOURCE_END; +static int +conv_std_to_utf16_be(struct conv_context *c) +{ +#define CONV_READ_STD +#define CONV_WRITE_UTF16_BE +#include +} -send_utf: - c->state = UTF8_WRITE_START; - if (cc < 0xe0) { c->code = cc & 0x1f; c->remains = 1; } - else if (cc < 0xf0) { c->code = cc & 0x0f; c->remains = 2; } - else - { - c->code = ~0U; - if (cc < 0xf8) c->remains = 3; - else if (cc < 0xfc) c->remains = 4; - else if (cc < 0xfe) c->remains = 5; - else goto nocode; - } - goto go_slow; +static int +conv_utf16_be_to_std(struct conv_context *c) +{ +#define CONV_READ_UTF16_BE +#define CONV_WRITE_STD +#include +} -dend_str: - c->state = SEQ_WRITE; - c->string_at = k; - c->remains = len; +static int +conv_std_to_utf16_le(struct conv_context *c) +{ +#define CONV_READ_STD +#define CONV_WRITE_UTF16_LE +#include +} -dend_char: - c->state = SINGLE_WRITE; - c->code = code; - goto go_slow; -go_slow: - c->source = s; - c->dest = d; -slow: - e = conv_slow(c); - if (e < 0) - { - code = c->code; - s = c->source; - se = c->source_end; - d = c->dest; - de = c->dest_end; - goto got_code; - } - if (e) - return e; - goto main; +static int +conv_utf16_le_to_std(struct conv_context *c) +{ +#define CONV_READ_UTF16_LE +#define CONV_WRITE_STD +#include } static int -conv_to_utf8(struct conv_context *c) +conv_utf8_to_utf16_be(struct conv_context *c) { - unsigned short *in_to_x = c->in_to_x; - const unsigned char *s, *se; - unsigned char *d, *de; - unsigned int code; - int e; +#define CONV_READ_UTF8 +#define CONV_WRITE_UTF16_BE +#include +} - if (unlikely(c->state)) - goto slow; +static int +conv_utf16_be_to_utf8(struct conv_context *c) +{ +#define CONV_READ_UTF16_BE +#define CONV_WRITE_UTF8 +#include +} -main: - s = c->source; - se = c->source_end; - d = c->dest; - de = c->dest_end; - while (s < se) - { - code = x_to_uni[in_to_x[*s]]; - if (code < 0x80) - { - if (d >= de) - goto dend; - *d++ = code; - } - else if (code < 0x800) - { - if (d + 2 > de) - goto dend_utf; - *d++ = 0xc0 | (code >> 6); - *d++ = 0x80 | (code & 0x3f); - } - else - { - if (d + 3 > de) - goto dend_utf; - *d++ = 0xe0 | (code >> 12); - *d++ = 0x80 | ((code >> 6) & 0x3f); - *d++ = 0x80 | (code & 0x3f); - } - s++; - } - c->source = s; - c->dest = d; - return CONV_SOURCE_END; +static int +conv_utf8_to_utf16_le(struct conv_context *c) +{ +#define CONV_READ_UTF8 +#define CONV_WRITE_UTF16_LE +#include +} -dend: - c->source = s; - c->dest = d; - return CONV_DEST_END; +static int +conv_utf16_le_to_utf8(struct conv_context *c) +{ +#define CONV_READ_UTF16_LE +#define CONV_WRITE_UTF8 +#include +} -dend_utf: - c->source = s; - c->dest = d; - c->state = UTF8_WRITE_START; - c->code = code; -slow: - e = conv_slow(c); - if (e) - return e; - goto main; +static int +conv_utf16_be_to_utf16_le(struct conv_context *c) +{ +#define CONV_READ_UTF16_BE +#define CONV_WRITE_UTF16_LE +#include } static int @@ -320,7 +360,7 @@ conv_standard(struct conv_context *c) unsigned short *x_to_out = c->x_to_out; const unsigned char *s, *se; unsigned char *d, *de, *k; - unsigned int len, e; + uint len, e; if (unlikely(c->state)) goto slow; @@ -332,7 +372,7 @@ main: de = c->dest_end; while (s < se) { - unsigned int code = x_to_out[in_to_x[*s]]; + uint code = x_to_out[in_to_x[*s]]; if (code < 0x100) { if (unlikely(d >= de)) @@ -378,36 +418,62 @@ conv_set_charset(struct conv_context *c, int src, int dest) c->source_charset = src; c->dest_charset = dest; if (src == dest) - c->convert = conv_none; + { + c->convert = conv_none; + c->in_to_x = NULL; + c->x_to_out = NULL; + } else { - c->convert = conv_standard; - if (src == CONV_CHARSET_UTF8) - c->convert = conv_from_utf8; - else - c->in_to_x = input_to_x[src]; - if (dest == CONV_CHARSET_UTF8) - c->convert = conv_to_utf8; - else - c->x_to_out = x_to_output[dest]; + static uint lookup[] = { + [CONV_CHARSET_UTF8] = 1, + [CONV_CHARSET_UTF16_BE] = 2, + [CONV_CHARSET_UTF16_LE] = 3, + }; + static int (*tab[4][4])(struct conv_context *c) = { + { conv_standard, conv_std_to_utf8, conv_std_to_utf16_be, conv_std_to_utf16_le }, + { conv_utf8_to_std, conv_none, conv_utf8_to_utf16_be, conv_utf8_to_utf16_le }, + { conv_utf16_be_to_std, conv_utf16_be_to_utf8, conv_none, conv_utf16_be_to_utf16_le }, + { conv_utf16_le_to_std, conv_utf16_le_to_utf8, conv_utf16_be_to_utf16_le, conv_none }, + }; + uint src_idx = ((uint)src < ARRAY_SIZE(lookup)) ? lookup[src] : 0; + uint dest_idx = ((uint)dest < ARRAY_SIZE(lookup)) ? lookup[dest] : 0; + c->convert = tab[src_idx][dest_idx]; + c->in_to_x = src_idx ? NULL : input_to_x[src]; + c->x_to_out = dest_idx ? NULL : x_to_output[dest]; } c->state = 0; } -unsigned int -conv_x_to_ucs(unsigned int x) +uint +conv_x_to_ucs(uint x) { return x_to_uni[x]; } -unsigned int -conv_ucs_to_x(unsigned int ucs) +uint +conv_ucs_to_x(uint ucs) { return uni_to_x[ucs >> 8U][ucs & 0xff]; } -unsigned int +uint conv_x_count(void) { return sizeof(x_to_uni) / sizeof(x_to_uni[0]); } + +int +conv_in_to_ucs(struct conv_context *c, uint y) +{ + return x_to_uni[c->in_to_x[y]]; +} + +int conv_ucs_to_out(struct conv_context *c, uint ucs) +{ + uint x = uni_to_x[ucs >> 8U][ucs & 0xff]; + if (x == 256 || c->x_to_out[x] >= 256) + return -1; + else + return c->x_to_out[x]; +}