2 * Character Set Conversion Library 1.2
4 * (c) 1998--2004 Martin Mares <mj@ucw.cz>
5 * (c) 2007 Pavel Charvat <pchar@ucw.cz>
7 * This software may be freely distributed and used according to the terms
8 * of the GNU Lesser General Public License.
12 #include <ucw/unicode.h>
13 #include <ucw/unaligned.h>
14 #include <charset/charconv.h>
15 #include <charset/chartable.h>
/*
 * Detach all buffers from a conversion context: both source pointers and
 * all three destination pointers are set to NULL. No other field of the
 * context is assigned in the lines visible here.
 */
18 conv_init(struct conv_context *c)
20 c->source = c->source_end = NULL;
21 c->dest = c->dest_start = c->dest_end = NULL;
/*
 * Pass-through "conversion": no bytes are copied. The destination range is
 * made to alias the source buffer instead.
 */
25 conv_none(struct conv_context *c)
/* Output starts where the input starts ... */
27 c->dest_start = (char *) c->source;
/* ... and the write cursor is placed at the end of the input. */
28 c->dest = (char *) c->source_end;
/*
 * All three flags are reported at once. NOTE(review): CONV_SKIP presumably
 * tells the caller that the result was not written into its own destination
 * buffer -- confirm against charconv.h.
 */
29 return CONV_SOURCE_END | CONV_DEST_END | CONV_SKIP;
/*
 * Byte-at-a-time conversion routine that appears to be organised as a state
 * machine on c->state. Only part of the body is visible in this excerpt; the
 * states seen here are UTF8_WRITE_START, UTF8_WRITE_CONT, SEQ_WRITE,
 * UTF16_BE_READ_1..3 and UTF16_LE_READ_1..3. The visible exits return
 * CONV_SOURCE_END or CONV_DEST_END.
 */
52 conv_slow(struct conv_context *c)
/* Local copies of the input/output cursors and their limits. */
54 const unsigned char *s = c->source;
55 const unsigned char *se = c->source_end;
56 unsigned char *d = c->dest;
57 unsigned char *de = c->dest_end;
/* Copy one byte from the pending string at c->string_at, advancing both cursors. */
72 *d++ = *c->string_at++;
/* True when the input byte is not of the form 10xxxxxx (UTF-8 continuation byte). */
82 if ((*s & 0xc0) != 0x80)
/* Append the 6 payload bits of the continuation byte to the code being assembled. */
87 c->code = (c->code << 6) | (*s++ & 0x3f);
/* Code point beyond the BMP; how it is handled is in lines not shown here. */
90 if (c->code >= 0x10000)
97 /* Writing of UTF-8 */
98 case UTF8_WRITE_START:
/* Codes below 0x800 take the two-byte form: lead byte 110xxxxx. */
106 else if (c->code < 0x800)
108 *d++ = 0xc0 | (c->code >> 6);
/* Three-byte form: lead byte 1110xxxx. */
114 *d++ = 0xe0 | (c->code >> 12);
119 c->state = UTF8_WRITE_CONT;
121 case UTF8_WRITE_CONT:
/*
 * Emit a byte with the 10xxxxxx marker. NOTE(review): no 0x3f mask is applied,
 * so c->code must have been re-packed by lines not shown such that
 * c->code >> 10 fits in 6 bits -- confirm.
 */
126 *d++ = 0x80 | (c->code >> 10);
132 /* Writing of UTF-16BE */
/*
 * Single-unit case: below the surrogate range, or in 0xe000..0xffff
 * (the subtraction wraps for smaller values, assuming code is unsigned).
 */
139 if (code < 0xd800 || code - 0xe000 < 0x2000)
/* Supplementary range 0x10000..0x10ffff; code now holds the 20-bit offset. */
141 else if ((code -= 0x10000) < 0x100000)
/* Surrogate pair: top 10 bits under 0xd800, low 10 bits under 0xdc00. */
143 put_u16_be(p, 0xd800 | (code >> 10));
144 put_u16_be(p + 2, 0xdc00 | (code & 0x3ff));
146 c->state = SEQ_WRITE;
/* Anything else is replaced by UNI_REPLACEMENT. */
150 code = UNI_REPLACEMENT;
153 c->state = SEQ_WRITE;
157 /* Writing of UTF-16LE */
/* Same classification as the UTF-16BE writer; only the store helper differs. */
164 if (code < 0xd800 || code - 0xe000 < 0x2000)
166 else if ((code -= 0x10000) < 0x100000)
168 put_u16_le(p, 0xd800 | (code >> 10));
169 put_u16_le(p + 2, 0xdc00 | (code & 0x3ff));
171 c->state = SEQ_WRITE;
174 code = UNI_REPLACEMENT;
177 c->state = SEQ_WRITE;
181 /* Reading of UTF-16BE */
186 c->state = UTF16_BE_READ_1;
188 case UTF16_BE_READ_1:
/* Second byte of the unit is the low byte: shift the earlier byte up and merge. */
191 c->code = (c->code << 8) | *s++;
/* Outside 0xd800..0xdfff: the unit is not a surrogate. */
192 if (c->code - 0xd800 >= 0x800)
/* Keep the surrogate's offset from 0xd800, moved up by 10 bits. */
194 c->code = (c->code - 0xd800) << 10;
195 c->state = UTF16_BE_READ_2;
197 case UTF16_BE_READ_2:
/*
 * Merge the high byte of the second unit, rebased to 0xdc. A byte below 0xdc
 * gives a negative value whose sign bit is tested in UTF16_BE_READ_3.
 * NOTE(review): left-shifting a negative int is undefined behaviour in ISO C;
 * this relies on the compiler treating it as an arithmetic shift.
 */
203 c->code |= (*s - 0xdc) << 8;
205 c->state = UTF16_BE_READ_3;
207 case UTF16_BE_READ_3:
/* Sign bit clear: add the 0x10000 base and the final low byte. */
210 if ((int)c->code >= 0)
211 c->code += 0x10000 + *s;
/* Otherwise substitute UNI_REPLACEMENT. */
213 c->code = UNI_REPLACEMENT;
217 /* Reading of UTF-16LE */
222 c->state = UTF16_LE_READ_1;
224 case UTF16_LE_READ_1:
/* Little-endian: the second byte is the high byte of the unit. */
227 c->code |= *s++ << 8;
/* Outside 0xd800..0xdfff: the unit is not a surrogate. */
228 if (c->code - 0xd800 >= 0x800)
230 c->code = (c->code - 0xd800) << 10;
231 c->state = UTF16_LE_READ_2;
233 case UTF16_LE_READ_2:
237 c->state = UTF16_LE_READ_3;
239 case UTF16_LE_READ_3:
/*
 * Add the 0x10000 base and the high byte of the second unit, rebased to 0xdc.
 * NOTE(review): (*s - 0xdc) can be negative here as well; the guard for this
 * statement is in lines not shown.
 */
243 c->code += 0x10000 + ((*s - 0xdc) << 8);
245 c->code = UNI_REPLACEMENT;
/* Exit reporting the source side (presumably: input exhausted). */
259 return CONV_SOURCE_END;
/* Exit reporting the destination side (presumably: output buffer full). */
263 return CONV_DEST_END;
266 /* Generate inlined routines */
/*
 * "std" -> UTF-8 converter. The body comes from including
 * charset/charconv-gen.h with the two macros below selecting which reader and
 * which writer the template expands.
 */
269 conv_std_to_utf8(struct conv_context *c)
271 #define CONV_READ_STD
272 #define CONV_WRITE_UTF8
273 #include <charset/charconv-gen.h>
/*
 * UTF-8 -> "std" converter; the body comes from charset/charconv-gen.h with
 * the UTF-8 reader and the STD writer selected by the macros below.
 */
277 conv_utf8_to_std(struct conv_context *c)
279 #define CONV_READ_UTF8
280 #define CONV_WRITE_STD
281 #include <charset/charconv-gen.h>
/*
 * "std" -> UTF-16BE converter; the body comes from charset/charconv-gen.h with
 * the STD reader and the UTF-16BE writer selected by the macros below.
 */
285 conv_std_to_utf16_be(struct conv_context *c)
287 #define CONV_READ_STD
288 #define CONV_WRITE_UTF16_BE
289 #include <charset/charconv-gen.h>
/*
 * UTF-16BE -> "std" converter; the body comes from charset/charconv-gen.h with
 * the UTF-16BE reader and the STD writer selected by the macros below.
 */
293 conv_utf16_be_to_std(struct conv_context *c)
295 #define CONV_READ_UTF16_BE
296 #define CONV_WRITE_STD
297 #include <charset/charconv-gen.h>
/*
 * "std" -> UTF-16LE converter; the body comes from charset/charconv-gen.h with
 * the STD reader and the UTF-16LE writer selected by the macros below.
 */
301 conv_std_to_utf16_le(struct conv_context *c)
303 #define CONV_READ_STD
304 #define CONV_WRITE_UTF16_LE
305 #include <charset/charconv-gen.h>
/*
 * UTF-16LE -> "std" converter; the body comes from charset/charconv-gen.h with
 * the UTF-16LE reader and the STD writer selected by the macros below.
 */
309 conv_utf16_le_to_std(struct conv_context *c)
311 #define CONV_READ_UTF16_LE
312 #define CONV_WRITE_STD
313 #include <charset/charconv-gen.h>
/*
 * UTF-8 -> UTF-16BE converter; the body comes from charset/charconv-gen.h with
 * the UTF-8 reader and the UTF-16BE writer selected by the macros below.
 */
317 conv_utf8_to_utf16_be(struct conv_context *c)
319 #define CONV_READ_UTF8
320 #define CONV_WRITE_UTF16_BE
321 #include <charset/charconv-gen.h>
/*
 * UTF-16BE -> UTF-8 converter; the body comes from charset/charconv-gen.h with
 * the UTF-16BE reader and the UTF-8 writer selected by the macros below.
 */
325 conv_utf16_be_to_utf8(struct conv_context *c)
327 #define CONV_READ_UTF16_BE
328 #define CONV_WRITE_UTF8
329 #include <charset/charconv-gen.h>
/*
 * UTF-8 -> UTF-16LE converter; the body comes from charset/charconv-gen.h with
 * the UTF-8 reader and the UTF-16LE writer selected by the macros below.
 */
333 conv_utf8_to_utf16_le(struct conv_context *c)
335 #define CONV_READ_UTF8
336 #define CONV_WRITE_UTF16_LE
337 #include <charset/charconv-gen.h>
/*
 * UTF-16LE -> UTF-8 converter; the body comes from charset/charconv-gen.h with
 * the UTF-16LE reader and the UTF-8 writer selected by the macros below.
 */
341 conv_utf16_le_to_utf8(struct conv_context *c)
343 #define CONV_READ_UTF16_LE
344 #define CONV_WRITE_UTF8
345 #include <charset/charconv-gen.h>
/*
 * UTF-16BE -> UTF-16LE converter; the body comes from charset/charconv-gen.h
 * with the UTF-16BE reader and the UTF-16LE writer selected by the macros
 * below. The dispatch table in conv_set_charset also installs this routine for
 * the UTF-16LE -> UTF-16BE direction.
 */
349 conv_utf16_be_to_utf16_le(struct conv_context *c)
351 #define CONV_READ_UTF16_BE
352 #define CONV_WRITE_UTF16_LE
353 #include <charset/charconv-gen.h>
/*
 * Table-driven converter: each input byte is translated through two lookup
 * tables taken from the context. Only part of the body is visible in this
 * excerpt; the visible exits return CONV_SOURCE_END or CONV_DEST_END.
 */
357 conv_standard(struct conv_context *c)
/* Translation tables: input byte -> x index, and x index -> output code. */
359 unsigned short *in_to_x = c->in_to_x;
360 unsigned short *x_to_out = c->x_to_out;
361 const unsigned char *s, *se;
362 unsigned char *d, *de, *k;
/* A non-zero state is treated as the uncommon case (presumably work left over from an earlier call). */
365 if (unlikely(c->state))
/* Two-stage lookup for the current input byte. */
375 uint code = x_to_out[in_to_x[*s]];
/* Destination buffer exhausted. */
378 if (unlikely(d >= de))
/*
 * The code is rebased by 0x100 to form an offset into string_table.
 * NOTE(review): presumably codes >= 0x100 denote a multi-byte replacement
 * string rather than a single output byte -- confirm against chartable.h.
 */
384 k = string_table + code - 0x100;
/* Not enough room left for len more bytes. */
386 if (unlikely(d + len > de))
395 return CONV_SOURCE_END;
400 return CONV_DEST_END;
/* Switch the context to the SEQ_WRITE state; the surrounding logic is not visible here. */
405 c->state = SEQ_WRITE;
/*
 * Record the source and destination charset ids in the context and select the
 * matching conversion routine and translation tables.
 */
416 conv_set_charset(struct conv_context *c, int src, int dest)
418 c->source_charset = src;
419 c->dest_charset = dest;
/*
 * Pass-through converter. The condition guarding this assignment is not
 * visible in this excerpt (presumably src == dest).
 */
422 c->convert = conv_none;
/*
 * Charset id -> row/column index of tab[]. Ids without an initialiser are
 * zero-initialised and therefore map to index 0.
 */
428 static uint lookup[] = {
429 [CONV_CHARSET_UTF8] = 1,
430 [CONV_CHARSET_UTF16_BE] = 2,
431 [CONV_CHARSET_UTF16_LE] = 3,
/*
 * Dispatch table indexed as tab[source index][destination index], with
 * index 0 = table-driven ("std"), 1 = UTF-8, 2 = UTF-16BE, 3 = UTF-16LE.
 */
433 static int (*tab[4][4])(struct conv_context *c) = {
434 { conv_standard, conv_std_to_utf8, conv_std_to_utf16_be, conv_std_to_utf16_le },
435 { conv_utf8_to_std, conv_none, conv_utf8_to_utf16_be, conv_utf8_to_utf16_le },
436 { conv_utf16_be_to_std, conv_utf16_be_to_utf8, conv_none, conv_utf16_be_to_utf16_le },
/*
 * NOTE(review): column 2 of this row is the UTF-16LE -> UTF-16BE direction,
 * yet it reuses conv_utf16_be_to_utf16_le, which is built with
 * CONV_READ_UTF16_BE. That reader sees every unit byte-swapped, so a unit
 * whose low byte is 0xd8..0xdf (e.g. U+00DF) would presumably be taken for a
 * surrogate. Confirm, and consider a dedicated LE -> BE routine.
 */
437 { conv_utf16_le_to_std, conv_utf16_le_to_utf8, conv_utf16_be_to_utf16_le, conv_none },
/* Ids outside lookup[] (including negative ones, via the unsigned cast) fall back to index 0. */
439 uint src_idx = ((uint)src < ARRAY_SIZE(lookup)) ? lookup[src] : 0;
440 uint dest_idx = ((uint)dest < ARRAY_SIZE(lookup)) ? lookup[dest] : 0;
441 c->convert = tab[src_idx][dest_idx];
/* Translation tables are set only for an index-0 endpoint; a UTF endpoint gets NULL. */
442 c->in_to_x = src_idx ? NULL : input_to_x[src];
443 c->x_to_out = dest_idx ? NULL : x_to_output[dest];
/*
 * NOTE(review): only the signature is visible in this excerpt. Judging by the
 * name it presumably maps an internal x index to a UCS code -- confirm against
 * the body.
 */
449 conv_x_to_ucs(uint x)
/*
 * Look up the internal x index for a UCS code in the two-level table
 * uni_to_x: the bits above the low 8 select the row, the low 8 bits the
 * column.
 */
455 conv_ucs_to_x(uint ucs)
/*
 * NOTE(review): ucs is not range-checked here, so ucs >> 8 must stay below
 * the number of rows of uni_to_x (declared elsewhere) -- confirm callers
 * guarantee this.
 */
457 return uni_to_x[ucs >> 8U][ucs & 0xff];
463 return sizeof(x_to_uni) / sizeof(x_to_uni[0]);
/*
 * Translate value y of the context's source charset to a UCS code: y is
 * mapped to an x index through c->in_to_x, then to UCS through x_to_uni.
 */
467 conv_in_to_ucs(struct conv_context *c, uint y)
/*
 * NOTE(review): conv_set_charset stores NULL in c->in_to_x for UTF sources,
 * so this is only usable with a table-driven source charset. y is not
 * range-checked here.
 */
469 return x_to_uni[c->in_to_x[y]];
/*
 * Translate a UCS code to a value of the context's destination charset
 * via uni_to_x and c->x_to_out. The statement taken when the check below
 * fails is not visible in this excerpt.
 */
472 int conv_ucs_to_out(struct conv_context *c, uint ucs)
/* Two-level lookup: the bits above the low 8 select the row, the low 8 bits the column. */
474 uint x = uni_to_x[ucs >> 8U][ucs & 0xff];
/*
 * Reject x == 256 (presumably the "no mapping" marker) and any output code
 * that is not a single byte value.
 * NOTE(review): conv_set_charset stores NULL in c->x_to_out for UTF
 * destinations, so this is only usable with a table-driven destination.
 */
475 if (x == 256 || c->x_to_out[x] >= 256)
478 return c->x_to_out[x];