charset/charconv.c

   1 /*
   2  *      Character Set Conversion Library 1.2
   3  *
   4  *      (c) 1998--2004 Martin Mares <mj@ucw.cz>
   5  *      (c) 2007 Pavel Charvat <pchar@ucw.cz>
   6  *
   7  *      This software may be freely distributed and used according to the terms
   8  *      of the GNU Lesser General Public License.
   9  */
  10
  11 #include <ucw/lib.h>
  12 #include <ucw/unicode.h>
  13 #include <ucw/unaligned.h>
  14 #include <charset/charconv.h>
  15 #include <charset/chartable.h>
  16
  17 void
  18 conv_init(struct conv_context *c)
  19 {
  20   c->source = c->source_end = NULL;
  21   c->dest = c->dest_start = c->dest_end = NULL;
  22 }
  23
  24 static int
  25 conv_none(struct conv_context *c)
  26 {
  27   c->dest_start = (char *) c->source;
  28   c->dest = (char *) c->source_end;
  29   return CONV_SOURCE_END | CONV_DEST_END | CONV_SKIP;
  30 }
  31
  32 enum state {
  33   CLEAN,
  34   SINGLE_WRITE,
  35   SEQ_WRITE,
  36   UTF8_READ,
  37   UTF8_WRITE_START,
  38   UTF8_WRITE_CONT,
  39   UTF16_BE_WRITE,
  40   UTF16_LE_WRITE,
  41   UTF16_BE_READ,
  42   UTF16_BE_READ_1,
  43   UTF16_BE_READ_2,
  44   UTF16_BE_READ_3,
  45   UTF16_LE_READ,
  46   UTF16_LE_READ_1,
  47   UTF16_LE_READ_2,
  48   UTF16_LE_READ_3,
  49 };
  50
  51 static int
  52 conv_slow(struct conv_context *c)
  53 {
  54   const unsigned char *s = c->source;
  55   const unsigned char *se = c->source_end;
  56   unsigned char *d = c->dest;
  57   unsigned char *de = c->dest_end;
  58
  59   switch (c->state)
  60     {
  61     case SINGLE_WRITE:
  62       if (d >= de)
  63         goto cde;
  64       *d++ = c->code;
  65       break;
  66     case SEQ_WRITE:
  67 seq:
  68       while (c->remains)
  69         {
  70           if (d >= de)
  71             goto cde;
  72           *d++ = *c->string_at++;
  73           c->remains--;
  74         }
  75       break;
  76
  77     case UTF8_READ:
  78       while (c->remains)
  79         {
  80           if (s >= se)
  81             goto cse;
  82           if ((*s & 0xc0) != 0x80)
  83             {
  84               c->code = 0xfffd;
  85               break;
  86             }
  87           c->code = (c->code << 6) | (*s++ & 0x3f);
  88           c->remains--;
  89         }
  90       if (c->code >= 0x10000)
  91         c->code = 0xfffd;
  92 got_char:
  93       c->source = s;
  94       c->state = 0;
  95       return -1;
  96
  97     /* Writing of UTF-8 */
  98     case UTF8_WRITE_START:
  99       if (d >= de)
 100         goto cde;
 101       if (c->code < 0x80)
 102         {
 103           *d++ = c->code;
 104           break;
 105         }
 106       else if (c->code < 0x800)
 107         {
 108           *d++ = 0xc0 | (c->code >> 6);
 109           c->code <<= 10;
 110           c->remains = 1;
 111         }
 112       else
 113         {
 114           *d++ = 0xe0 | (c->code >> 12);
 115           c->code <<= 4;
 116           c->remains = 2;
 117         }
 118       c->code &= 0xffff;
 119       c->state = UTF8_WRITE_CONT;
 120       /* fall-thru */
 121     case UTF8_WRITE_CONT:
 122       while (c->remains)
 123         {
 124           if (d >= de)
 125             goto cde;
 126           *d++ = 0x80 | (c->code >> 10);
 127           c->code <<= 6;
 128           c->remains--;
 129         }
 130       break;
 131
 132     /* Writing of UTF-16BE */
 133     case UTF16_BE_WRITE:
 134       {
 135         void *p = &c->code;
 136         c->string_at = p;
 137         uns code = c->code;
 138         c->string_at = p;
 139         if (code < 0xd800 || code - 0xe000 < 0x2000)
 140           {}
 141         else if ((code -= 0x10000) < 0x100000)
 142           {
 143             put_u16_be(p, 0xd800 | (code >> 10));
 144             put_u16_be(p + 2, 0xdc00 | (code & 0x3ff));
 145             c->remains = 4;
 146             c->state = SEQ_WRITE;
 147             goto seq;
 148           }
 149         else
 150           code = UNI_REPLACEMENT;
 151         put_u16_be(p, code);
 152         c->remains = 2;
 153         c->state = SEQ_WRITE;
 154         goto seq;
 155       }
 156
 157     /* Writing of UTF-16LE */
 158     case UTF16_LE_WRITE:
 159       {
 160         void *p = &c->code;
 161         c->string_at = p;
 162         uns code = c->code;
 163         c->string_at = p;
 164         if (code < 0xd800 || code - 0xe000 < 0x2000)
 165           {}
 166         else if ((code -= 0x10000) < 0x100000)
 167           {
 168             put_u16_le(p, 0xd800 | (code >> 10));
 169             put_u16_le(p + 2, 0xdc00 | (code & 0x3ff));
 170             c->remains = 4;
 171             c->state = SEQ_WRITE;
 172           }
 173         else
 174           code = UNI_REPLACEMENT;
 175         put_u16_le(p, code);
 176         c->remains = 2;
 177         c->state = SEQ_WRITE;
 178         goto seq;
 179       }
 180
 181     /* Reading of UTF16-BE */
 182     case UTF16_BE_READ:
 183       if (s >= se)
 184         goto cse;
 185       c->code = *s++;
 186       c->state = UTF16_BE_READ_1;
 187       /* fall-thru */
 188     case UTF16_BE_READ_1:
 189       if (s >= se)
 190         goto cse;
 191       c->code = (c->code << 8) | *s++;
 192       if (c->code - 0xd800 >= 0x800)
 193         goto got_char;
 194       c->code = (c->code - 0xd800) << 10;
 195       c->state = UTF16_BE_READ_2;
 196       /* fall-thru */
 197     case UTF16_BE_READ_2:
 198       if (s >= se)
 199         goto cse;
 200       if (*s - 0xdc >= 4)
 201         c->code = ~0U;
 202       else
 203         c->code |= (*s - 0xdc) << 8;
 204       s++;
 205       c->state = UTF16_BE_READ_3;
 206       /* fall-thru */
 207     case UTF16_BE_READ_3:
 208       if (s >= se)
 209         goto cse;
 210       if ((int)c->code >= 0)
 211         c->code += 0x10000 + *s;
 212       else
 213         c->code = UNI_REPLACEMENT;
 214       s++;
 215       goto got_char;
 216
 217     /* Reading of UTF16-LE */
 218     case UTF16_LE_READ:
 219       if (s >= se)
 220         goto cse;
 221       c->code = *s++;
 222       c->state = UTF16_LE_READ_1;
 223       /* fall-thru */
 224     case UTF16_LE_READ_1:
 225       if (s >= se)
 226         goto cse;
 227       c->code |= *s++ << 8;
 228       if (c->code - 0xd800 >= 0x800)
 229         goto got_char;
 230       c->code = (c->code - 0xd800) << 10;
 231       c->state = UTF16_LE_READ_2;
 232       /* fall-thru */
 233     case UTF16_LE_READ_2:
 234       if (s >= se)
 235         goto cse;
 236       c->code |= *s++;
 237       c->state = UTF16_LE_READ_3;
 238       /* fall-thru */
 239     case UTF16_LE_READ_3:
 240       if (s >= se)
 241         goto cse;
 242       if (*s - 0xdc < 4)
 243         c->code += 0x10000 + ((*s - 0xdc) << 8);
 244       else
 245         c->code = UNI_REPLACEMENT;
 246       s++;
 247       goto got_char;
 248
 249     default:
 250       ASSERT(0);
 251     }
 252   c->source = s;
 253   c->dest = d;
 254   c->state = 0;
 255   return 0;
 256
 257  cse:
 258   c->source = s;
 259   return CONV_SOURCE_END;
 260
 261  cde:
 262   c->dest = d;
 263   return CONV_DEST_END;
 264 }
 265
 266 /* Generate inlined routines */
 267
 268 static int
 269 conv_std_to_utf8(struct conv_context *c)
 270 {
 271 #define CONV_READ_STD
 272 #define CONV_WRITE_UTF8
 273 #include <charset/charconv-gen.h>
 274 }
 275
 276 static int
 277 conv_utf8_to_std(struct conv_context *c)
 278 {
 279 #define CONV_READ_UTF8
 280 #define CONV_WRITE_STD
 281 #include <charset/charconv-gen.h>
 282 }
 283
 284 static int
 285 conv_std_to_utf16_be(struct conv_context *c)
 286 {
 287 #define CONV_READ_STD
 288 #define CONV_WRITE_UTF16_BE
 289 #include <charset/charconv-gen.h>
 290 }
 291
 292 static int
 293 conv_utf16_be_to_std(struct conv_context *c)
 294 {
 295 #define CONV_READ_UTF16_BE
 296 #define CONV_WRITE_STD
 297 #include <charset/charconv-gen.h>
 298 }
 299
 300 static int
 301 conv_std_to_utf16_le(struct conv_context *c)
 302 {
 303 #define CONV_READ_STD
 304 #define CONV_WRITE_UTF16_LE
 305 #include <charset/charconv-gen.h>
 306 }
 307
 308 static int
 309 conv_utf16_le_to_std(struct conv_context *c)
 310 {
 311 #define CONV_READ_UTF16_LE
 312 #define CONV_WRITE_STD
 313 #include <charset/charconv-gen.h>
 314 }
 315
 316 static int
 317 conv_utf8_to_utf16_be(struct conv_context *c)
 318 {
 319 #define CONV_READ_UTF8
 320 #define CONV_WRITE_UTF16_BE
 321 #include <charset/charconv-gen.h>
 322 }
 323
 324 static int
 325 conv_utf16_be_to_utf8(struct conv_context *c)
 326 {
 327 #define CONV_READ_UTF16_BE
 328 #define CONV_WRITE_UTF8
 329 #include <charset/charconv-gen.h>
 330 }
 331
 332 static int
 333 conv_utf8_to_utf16_le(struct conv_context *c)
 334 {
 335 #define CONV_READ_UTF8
 336 #define CONV_WRITE_UTF16_LE
 337 #include <charset/charconv-gen.h>
 338 }
 339
 340 static int
 341 conv_utf16_le_to_utf8(struct conv_context *c)
 342 {
 343 #define CONV_READ_UTF16_LE
 344 #define CONV_WRITE_UTF8
 345 #include <charset/charconv-gen.h>
 346 }
 347
 348 static int
 349 conv_utf16_be_to_utf16_le(struct conv_context *c)
 350 {
 351 #define CONV_READ_UTF16_BE
 352 #define CONV_WRITE_UTF16_LE
 353 #include <charset/charconv-gen.h>
 354 }
 355
 356 static int
 357 conv_standard(struct conv_context *c)
 358 {
 359   unsigned short *in_to_x = c->in_to_x;
 360   unsigned short *x_to_out = c->x_to_out;
 361   const unsigned char *s, *se;
 362   unsigned char *d, *de, *k;
 363   unsigned int len, e;
 364
 365   if (unlikely(c->state))
 366     goto slow;
 367
 368 main:
 369   s = c->source;
 370   se = c->source_end;
 371   d = c->dest;
 372   de = c->dest_end;
 373   while (s < se)
 374     {
 375       unsigned int code = x_to_out[in_to_x[*s]];
 376       if (code < 0x100)
 377         {
 378           if (unlikely(d >= de))
 379             goto dend;
 380           *d++ = code;
 381         }
 382       else
 383         {
 384           k = string_table + code - 0x100;
 385           len = *k++;
 386           if (unlikely(d + len > de))
 387             goto dend_str;
 388           while (len--)
 389             *d++ = *k++;
 390         }
 391       s++;
 392     }
 393   c->source = s;
 394   c->dest = d;
 395   return CONV_SOURCE_END;
 396
 397 dend:
 398   c->source = s;
 399   c->dest = d;
 400   return CONV_DEST_END;
 401
 402 dend_str:
 403   c->source = s;
 404   c->dest = d;
 405   c->state = SEQ_WRITE;
 406   c->string_at = k;
 407   c->remains = len;
 408 slow:
 409   e = conv_slow(c);
 410   if (e)
 411     return e;
 412   goto main;
 413 }
 414
 415 void
 416 conv_set_charset(struct conv_context *c, int src, int dest)
 417 {
 418   c->source_charset = src;
 419   c->dest_charset = dest;
 420   if (src == dest)
 421     {
 422       c->convert = conv_none;
 423       c->in_to_x = NULL;
 424       c->x_to_out = NULL;
 425     }
 426   else
 427     {
 428       static uns lookup[] = {
 429         [CONV_CHARSET_UTF8] = 1,
 430         [CONV_CHARSET_UTF16_BE] = 2,
 431         [CONV_CHARSET_UTF16_LE] = 3,
 432       };
 433       static int (*tab[4][4])(struct conv_context *c) = {
 434         { conv_standard,        conv_std_to_utf8,       conv_std_to_utf16_be,   conv_std_to_utf16_le },
 435         { conv_utf8_to_std,     conv_none,              conv_utf8_to_utf16_be,  conv_utf8_to_utf16_le },
 436         { conv_utf16_be_to_std, conv_utf16_be_to_utf8,  conv_none,              conv_utf16_be_to_utf16_le },
 437         { conv_utf16_le_to_std, conv_utf16_le_to_utf8,  conv_utf16_be_to_utf16_le,      conv_none },
 438       };
 439       uns src_idx = ((uns)src < ARRAY_SIZE(lookup)) ? lookup[src] : 0;
 440       uns dest_idx = ((uns)dest < ARRAY_SIZE(lookup)) ? lookup[dest] : 0;
 441       c->convert = tab[src_idx][dest_idx];
 442       c->in_to_x = src_idx ? NULL : input_to_x[src];
 443       c->x_to_out = dest_idx ? NULL : x_to_output[dest];
 444     }
 445   c->state = 0;
 446 }
 447
 448 unsigned int
 449 conv_x_to_ucs(unsigned int x)
 450 {
 451   return x_to_uni[x];
 452 }
 453
 454 unsigned int
 455 conv_ucs_to_x(unsigned int ucs)
 456 {
 457   return uni_to_x[ucs >> 8U][ucs & 0xff];
 458 }
 459
 460 unsigned int
 461 conv_x_count(void)
 462 {
 463   return sizeof(x_to_uni) / sizeof(x_to_uni[0]);
 464 }
 465
 466 int
 467 conv_in_to_ucs(struct conv_context *c, unsigned int y)
 468 {
 469   return x_to_uni[c->in_to_x[y]];
 470 }
 471
 472 int conv_ucs_to_out(struct conv_context *c, unsigned int ucs)
 473 {
 474   uns x = uni_to_x[ucs >> 8U][ucs & 0xff];
 475   if (x == 256 || c->x_to_out[x] >= 256)
 476     return -1;
 477   else
 478     return c->x_to_out[x];
 479 }