ucw/unicode.h

   1 /*
   2  *      UCW Library -- Unicode Characters
   3  *
   4  *      (c) 1997--2004 Martin Mares <mj@ucw.cz>
   5  *      (c) 2004 Robert Spalek <robert@ucw.cz>
   6  *      (c) 2007 Pavel Charvat <pchar@ucw.cz>
   7  *
   8  *      This software may be freely distributed and used according to the terms
   9  *      of the GNU Lesser General Public License.
  10  */
  11
  12 #ifndef _UCW_UNICODE_H
  13 #define _UCW_UNICODE_H
  14
  15 #include <ucw/unaligned.h>
  16
  17 #ifdef CONFIG_UCW_CLEAN_ABI
  18 #define utf8_strlen ucw_utf8_strlen
  19 #define utf8_strnlen ucw_utf8_strnlen
  20 #endif
  21
  22 /* Macros for handling UTF-8 */
  23
  24 #define UNI_REPLACEMENT 0xfffc  /** Unicode value used as a default replacement of invalid characters. **/
  25
  26 /**
  27  * Encode a value from the range `[0, 0xFFFF]`
  28  * (basic multilingual plane); up to 3 bytes needed (RFC2279).
  29  **/
  30 static inline byte *utf8_put(byte *p, uint u)
  31 {
  32   if (u < 0x80)
  33     *p++ = u;
  34   else if (u < 0x800)
  35     {
  36       *p++ = 0xc0 | (u >> 6);
  37       *p++ = 0x80 | (u & 0x3f);
  38     }
  39   else
  40     {
  41       ASSERT(u < 0x10000);
  42       *p++ = 0xe0 | (u >> 12);
  43       *p++ = 0x80 | ((u >> 6) & 0x3f);
  44       *p++ = 0x80 | (u & 0x3f);
  45     }
  46   return p;
  47 }
  48
  49 /**
  50  * Encode a value from the range `[0, 0x7FFFFFFF]`;
  51  * (superset of Unicode 4.0) up to 6 bytes needed (RFC2279).
  52  **/
  53 static inline byte *utf8_32_put(byte *p, uint u)
  54 {
  55   if (u < 0x80)
  56     *p++ = u;
  57   else if (u < 0x800)
  58     {
  59       *p++ = 0xc0 | (u >> 6);
  60       goto put1;
  61     }
  62   else if (u < (1<<16))
  63     {
  64       *p++ = 0xe0 | (u >> 12);
  65       goto put2;
  66     }
  67   else if (u < (1<<21))
  68     {
  69       *p++ = 0xf0 | (u >> 18);
  70       goto put3;
  71     }
  72   else if (u < (1<<26))
  73     {
  74       *p++ = 0xf8 | (u >> 24);
  75       goto put4;
  76     }
  77   else if (u < (1U<<31))
  78     {
  79       *p++ = 0xfc | (u >> 30);
  80       *p++ = 0x80 | ((u >> 24) & 0x3f);
  81 put4: *p++ = 0x80 | ((u >> 18) & 0x3f);
  82 put3: *p++ = 0x80 | ((u >> 12) & 0x3f);
  83 put2: *p++ = 0x80 | ((u >> 6) & 0x3f);
  84 put1: *p++ = 0x80 | (u & 0x3f);
  85     }
  86   else
  87     ASSERT(0);
  88   return p;
  89 }
  90
  91 #define UTF8_GET_NEXT if (unlikely((*p & 0xc0) != 0x80)) goto bad; u = (u << 6) | (*p++ & 0x3f)
  92 #define UTF8_CHECK_AVAIL(n) if (unlikely(avail < n)) goto bad
  93 #define UTF8_CHECK_RANGE(r) if (unlikely(u < r)) goto bad
  94
  95 /**
  96  * Decode a value from the range `[0, 0xFFFF]` (basic multilingual plane)
  97  * or return @repl if the encoding has been corrupted.
  98  **/
  99 static inline byte *utf8_get_repl(const byte *p, uint *uu, uint repl)
 100 {
 101   uint u = *p++;
 102   if (u < 0x80)
 103     ;
 104   else if (unlikely(u < 0xc0))
 105     {
 106       /* Incorrect byte sequence */
 107     bad:
 108       u = repl;
 109     }
 110   else if (u < 0xe0)
 111     {
 112       u &= 0x1f;
 113       UTF8_GET_NEXT;
 114       UTF8_CHECK_RANGE(0x80);
 115     }
 116   else if (likely(u < 0xf0))
 117     {
 118       u &= 0x0f;
 119       UTF8_GET_NEXT;
 120       UTF8_GET_NEXT;
 121       UTF8_CHECK_RANGE(0x800);
 122     }
 123   else
 124     goto bad;
 125   *uu = u;
 126   return (byte *)p;
 127 }
 128
 129 /**
 130  * Decode a value from the range `[0, 0x7FFFFFFF]`
 131  * or return @repl if the encoding has been corrupted.
 132  **/
 133 static inline byte *utf8_32_get_repl(const byte *p, uint *uu, uint repl)
 134 {
 135   uint u = *p++;
 136   uint limit;
 137   if (u < 0x80)
 138     ;
 139   else if (unlikely(u < 0xc0))
 140     goto bad;
 141   else if (u < 0xe0)
 142     {
 143       u &= 0x1f;
 144       limit = 0x80;
 145       goto get1;
 146     }
 147   else if (u < 0xf0)
 148     {
 149       u &= 0x0f;
 150       limit = 0x800;
 151       goto get2;
 152     }
 153   else if (u < 0xf8)
 154     {
 155       u &= 0x07;
 156       limit = 1 << 16;
 157       goto get3;
 158     }
 159   else if (u < 0xfc)
 160     {
 161       u &= 0x03;
 162       limit = 1 << 21;
 163       goto get4;
 164     }
 165   else if (u < 0xfe)
 166     {
 167       u &= 0x01;
 168       limit = 1 << 26;
 169       UTF8_GET_NEXT;
 170 get4: UTF8_GET_NEXT;
 171 get3: UTF8_GET_NEXT;
 172 get2: UTF8_GET_NEXT;
 173 get1: UTF8_GET_NEXT;
 174       if (unlikely(u < limit))
 175         goto bad;
 176     }
 177   else
 178     goto bad;
 179   *uu = u;
 180   return (byte *)p;
 181
 182 bad:
 183   /* Incorrect byte sequence */
 184   *uu = repl;
 185   return (byte *)p;
 186 }
 187
 188 /**
 189  * Decode a value from the range `[0, 0x7FFFFFFF]`
 190  * or return @repl if the encoding has been corrupted.
 191  * This function never reads behind @stop (including).
 192  * At least one byte must be available (@stop > @p).
 193  **/
 194 static inline byte *utf8_32_get_repl_safe(const byte *p, const byte *stop, uint *uu, uint repl)
 195 {
 196   uint u = *p++;
 197   if (u < 0x80)
 198     goto ok;
 199   else if (unlikely(u < 0xc0))
 200     goto bad;
 201   uint limit;
 202   size_t avail = stop - p;
 203   if (u < 0xe0)
 204     {
 205       UTF8_CHECK_AVAIL(1);
 206       u &= 0x1f;
 207       limit = 0x80;
 208       goto get1;
 209     }
 210   else if (u < 0xf0)
 211     {
 212       UTF8_CHECK_AVAIL(2);
 213       u &= 0x0f;
 214       limit = 0x800;
 215       goto get2;
 216     }
 217   else if (u < 0xf8)
 218     {
 219       UTF8_CHECK_AVAIL(3);
 220       u &= 0x07;
 221       limit = 1 << 16;
 222       goto get3;
 223     }
 224   else if (u < 0xfc)
 225     {
 226       UTF8_CHECK_AVAIL(4);
 227       u &= 0x03;
 228       limit = 1 << 21;
 229       goto get4;
 230     }
 231   else if (u < 0xfe)
 232     {
 233       UTF8_CHECK_AVAIL(5);
 234       u &= 0x01;
 235       limit = 1 << 26;
 236       UTF8_GET_NEXT;
 237 get4: UTF8_GET_NEXT;
 238 get3: UTF8_GET_NEXT;
 239 get2: UTF8_GET_NEXT;
 240 get1: UTF8_GET_NEXT;
 241       if (unlikely(u < limit))
 242         goto bad;
 243     }
 244   else
 245     goto bad;
 246
 247 ok:
 248   *uu = u;
 249   return (byte *)p;
 250
 251 bad:
 252   /* Incorrect byte sequence */
 253   *uu = repl;
 254   return (byte *)p;
 255 }
 256
 257 /**
 258  * Decode a value from the range `[0, 0xFFFF]` (basic multilingual plane)
 259  * or return `UNI_REPLACEMENT` if the encoding has been corrupted.
 260  **/
 261 static inline byte *utf8_get(const byte *p, uint *uu)
 262 {
 263   return utf8_get_repl(p, uu, UNI_REPLACEMENT);
 264 }
 265
 266 /**
 267  * Decode a value from the range `[0, 0x7FFFFFFF]`
 268  * or return `UNI_REPLACEMENT` if the encoding has been corrupted.
 269  **/
 270 static inline byte *utf8_32_get(const byte *p, uint *uu)
 271 {
 272   return utf8_32_get_repl(p, uu, UNI_REPLACEMENT);
 273 }
 274
 275 #define UTF8_SKIP(p) do {                               \
 276     uint c = *p++;                                      \
 277     if (c >= 0xc0)                                      \
 278       while (c & 0x40 && *p >= 0x80 && *p < 0xc0)       \
 279         p++, c <<= 1;                                   \
 280   } while (0)
 281
 282 #define UTF8_SKIP_BWD(p) while ((*--(p) & 0xc0) == 0x80)
 283
 284 /**
 285  * Return the number of bytes needed to encode a given value from the range `[0, 0x7FFFFFFF]` to UTF-8.
 286  **/
 287 static inline uint utf8_space(uint u)
 288 {
 289   if (u < 0x80)
 290     return 1;
 291   if (u < 0x800)
 292     return 2;
 293   if (u < (1<<16))
 294     return 3;
 295   if (u < (1<<21))
 296     return 4;
 297   if (u < (1<<26))
 298     return 5;
 299   return 6;
 300 }
 301
 302 /**
 303  * Compute the length of a single UTF-8 character from its first byte. The encoding must be valid.
 304  **/
 305 static inline uint utf8_encoding_len(uint c)
 306 {
 307   if (c < 0x80)
 308     return 1;
 309   ASSERT(c >= 0xc0 && c < 0xfe);
 310   if (c < 0xe0)
 311     return 2;
 312   if (c < 0xf0)
 313     return 3;
 314   if (c < 0xf8)
 315     return 4;
 316   if (c < 0xfc)
 317     return 5;
 318   return 6;
 319 }
 320
/** Maximum number of bytes a UTF-8 character can have. **/
 322 #define UTF8_MAX_LEN 6
 323
 324 /**
 325  * Encode an UTF-16LE character from the range `[0, 0xD7FF]` or `[0xE000,0x11FFFF]`;
 326  * up to 4 bytes needed.
 327  **/
 328 static inline void *utf16_le_put(void *p, uint u)
 329 {
 330   if (u < 0xd800 || (u < 0x10000 && u >= 0xe000))
 331     {
 332       put_u16_le(p, u);
 333       return p + 2;
 334     }
 335   else if ((u -= 0x10000) < 0x100000)
 336     {
 337       put_u16_le(p, 0xd800 | (u >> 10));
 338       put_u16_le(p + 2, 0xdc00 | (u & 0x3ff));
 339       return p + 4;
 340     }
 341   else
 342     ASSERT(0);
 343 }
 344
 345 /**
 346  * Encode a UTF-16BE character from the range `[0, 0xD7FF]` or `[0xE000,0x11FFFF]`;
 347  * up to 4 bytes needed.
 348  **/
 349 static inline void *utf16_be_put(void *p, uint u)
 350 {
 351   if (u < 0xd800 || (u < 0x10000 && u >= 0xe000))
 352     {
 353       put_u16_be(p, u);
 354       return p + 2;
 355     }
 356   else if ((u -= 0x10000) < 0x100000)
 357     {
 358       put_u16_be(p, 0xd800 | (u >> 10));
 359       put_u16_be(p + 2, 0xdc00 | (u & 0x3ff));
 360       return p + 4;
 361     }
 362   else
 363     ASSERT(0);
 364 }
 365
 366 /**
 367  * Decode a UTF-16LE character from the range `[0, 0xD7FF]` or `[0xE000,11FFFF]`
 368  * or return @repl if the encoding has been corrupted.
 369  **/
 370 static inline void *utf16_le_get_repl(const void *p, uint *uu, uint repl)
 371 {
 372   uint u = get_u16_le(p), x, y;
 373   x = u - 0xd800;
 374   if (x < 0x800)
 375     if (x < 0x400 && (y = get_u16_le(p + 2) - 0xdc00) < 0x400)
 376       {
 377         u = 0x10000 + (x << 10) + y;
 378         p += 2;
 379       }
 380     else
 381       u = repl;
 382   *uu = u;
 383   return (void *)(p + 2);
 384 }
 385
 386 /**
 387  * Decode a UTF-16BE character from the range `[0, 0xD7FF]` or `[0xE000,11FFFF]`
 388  * or return @repl if the encoding has been corrupted.
 389  **/
 390 static inline void *utf16_be_get_repl(const void *p, uint *uu, uint repl)
 391 {
 392   uint u = get_u16_be(p), x, y;
 393   x = u - 0xd800;
 394   if (x < 0x800)
 395     if (x < 0x400 && (y = get_u16_be(p + 2) - 0xdc00) < 0x400)
 396       {
 397         u = 0x10000 + (x << 10) + y;
 398         p += 2;
 399       }
 400     else
 401       u = repl;
 402   *uu = u;
 403   return (void *)(p + 2);
 404 }
 405
 406 /**
 407  * Decode a UTF-16LE  character from the range `[0, 0xD7FF]` or `[0xE000,11FFFF]`
 408  * or return `UNI_REPLACEMENT` if the encoding has been corrupted.
 409  **/
 410 static inline void *utf16_le_get(const void *p, uint *uu)
 411 {
 412   return utf16_le_get_repl(p, uu, UNI_REPLACEMENT);
 413 }
 414
 415 /**
 416  * Decode a UTF-16BE  character from the range `[0, 0xD7FF]` or `[0xE000,11FFFF]`
 417  * or return `UNI_REPLACEMENT` if the encoding has been corrupted.
 418  **/
 419 static inline void *utf16_be_get(const void *p, uint *uu)
 420 {
 421   return utf16_be_get_repl(p, uu, UNI_REPLACEMENT);
 422 }
 423
 424 /**
 425  * Basic sanity check on Unicode characters. Return `UNI_REPLACEMENT` if the input
 426  * character is a surrogate, ASCII or Latin-1 control character different from the tab,
 427  * or if it lies outside the basic plane. In all other cases, it acts as an identity.
 428  **/
 429 static inline uint unicode_sanitize_char(uint u)
 430 {
 431   if (u >= 0x10000 ||                   // We don't accept anything outside the basic plane
 432       u >= 0xd800 && u < 0xf900 ||      // neither we do surrogates and private use characters
 433       u >= 0x80 && u < 0xa0 ||          // nor latin-1 control chars
 434       u < 0x20 && u != '\t')
 435     return UNI_REPLACEMENT;
 436   return u;
 437 }
 438
 439 /* unicode-utf8.c */
 440
 441 /**
 442  * Count the number of Unicode characters in a zero-terminated UTF-8 string.
 443  * Returned value for corrupted encoding is undefined, but is never greater than strlen().
 444  **/
 445 size_t utf8_strlen(const byte *str);
 446
 447 /**
 448  * Same as @utf8_strlen(), but returns at most @n characters.
 449  **/
 450 size_t utf8_strnlen(const byte *str, size_t n);
 451
 452 #endif