ucw/unicode.h

   1 /*
   2  *      UCW Library -- Unicode Characters
   3  *
   4  *      (c) 1997--2004 Martin Mares <mj@ucw.cz>
   5  *      (c) 2004 Robert Spalek <robert@ucw.cz>
   6  *      (c) 2007 Pavel Charvat <pchar@ucw.cz>
   7  *
   8  *      This software may be freely distributed and used according to the terms
   9  *      of the GNU Lesser General Public License.
  10  */
  11
  12 #ifndef _UCW_UNICODE_H
  13 #define _UCW_UNICODE_H
  14
  15 #include <ucw/unaligned.h>
  16
  17 #ifdef CONFIG_UCW_CLEAN_ABI
  18 #define utf8_strlen ucw_utf8_strlen
  19 #define utf8_strnlen ucw_utf8_strnlen
  20 #endif
  21
  22 /* Macros for handling UTF-8 */
  23
  24 #define UNI_REPLACEMENT 0xfffc  /** Unicode value used as a default replacement of invalid characters. **/
  25
  26 /**
  27  * Encode a value from the range `[0, 0xFFFF]`
  28  * (basic multilingual plane); up to 3 bytes needed (RFC2279).
  29  **/
  30 static inline byte *utf8_put(byte *p, uns u)
  31 {
  32   if (u < 0x80)
  33     *p++ = u;
  34   else if (u < 0x800)
  35     {
  36       *p++ = 0xc0 | (u >> 6);
  37       *p++ = 0x80 | (u & 0x3f);
  38     }
  39   else
  40     {
  41       ASSERT(u < 0x10000);
  42       *p++ = 0xe0 | (u >> 12);
  43       *p++ = 0x80 | ((u >> 6) & 0x3f);
  44       *p++ = 0x80 | (u & 0x3f);
  45     }
  46   return p;
  47 }
  48
  49 /**
  50  * Encode a value from the range `[0, 0x7FFFFFFF]`;
  51  * (superset of Unicode 4.0) up to 6 bytes needed (RFC2279).
  52  **/
  53 static inline byte *utf8_32_put(byte *p, uns u)
  54 {
  55   if (u < 0x80)
  56     *p++ = u;
  57   else if (u < 0x800)
  58     {
  59       *p++ = 0xc0 | (u >> 6);
  60       goto put1;
  61     }
  62   else if (u < (1<<16))
  63     {
  64       *p++ = 0xe0 | (u >> 12);
  65       goto put2;
  66     }
  67   else if (u < (1<<21))
  68     {
  69       *p++ = 0xf0 | (u >> 18);
  70       goto put3;
  71     }
  72   else if (u < (1<<26))
  73     {
  74       *p++ = 0xf8 | (u >> 24);
  75       goto put4;
  76     }
  77   else if (u < (1U<<31))
  78     {
  79       *p++ = 0xfc | (u >> 30);
  80       *p++ = 0x80 | ((u >> 24) & 0x3f);
  81 put4: *p++ = 0x80 | ((u >> 18) & 0x3f);
  82 put3: *p++ = 0x80 | ((u >> 12) & 0x3f);
  83 put2: *p++ = 0x80 | ((u >> 6) & 0x3f);
  84 put1: *p++ = 0x80 | (u & 0x3f);
  85     }
  86   else
  87     ASSERT(0);
  88   return p;
  89 }
  90
  91 #define UTF8_GET_NEXT if (unlikely((*p & 0xc0) != 0x80)) goto bad; u = (u << 6) | (*p++ & 0x3f)
  92
  93 /**
  94  * Decode a value from the range `[0, 0xFFFF]` (basic multilingual plane)
  95  * or return @repl if the encoding has been corrupted.
  96  **/
  97 static inline byte *utf8_get_repl(const byte *p, uns *uu, uns repl)
  98 {
  99   uns u = *p++;
 100   if (u < 0x80)
 101     ;
 102   else if (unlikely(u < 0xc0))
 103     {
 104       /* Incorrect byte sequence */
 105     bad:
 106       u = repl;
 107     }
 108   else if (u < 0xe0)
 109     {
 110       u &= 0x1f;
 111       UTF8_GET_NEXT;
 112     }
 113   else if (likely(u < 0xf0))
 114     {
 115       u &= 0x0f;
 116       UTF8_GET_NEXT;
 117       UTF8_GET_NEXT;
 118     }
 119   else
 120     goto bad;
 121   *uu = u;
 122   return (byte *)p;
 123 }
 124
 125 /**
 126  * Decode a value from the range `[0, 0x7FFFFFFF]`
 127  * or return @repl if the encoding has been corrupted.
 128  **/
 129 static inline byte *utf8_32_get_repl(const byte *p, uns *uu, uns repl)
 130 {
 131   uns u = *p++;
 132   if (u < 0x80)
 133     ;
 134   else if (unlikely(u < 0xc0))
 135     {
 136       /* Incorrect byte sequence */
 137     bad:
 138       u = repl;
 139     }
 140   else if (u < 0xe0)
 141     {
 142       u &= 0x1f;
 143       goto get1;
 144     }
 145   else if (u < 0xf0)
 146     {
 147       u &= 0x0f;
 148       goto get2;
 149     }
 150   else if (u < 0xf8)
 151     {
 152       u &= 0x07;
 153       goto get3;
 154     }
 155   else if (u < 0xfc)
 156     {
 157       u &= 0x03;
 158       goto get4;
 159     }
 160   else if (u < 0xfe)
 161     {
 162       u &= 0x01;
 163       UTF8_GET_NEXT;
 164 get4: UTF8_GET_NEXT;
 165 get3: UTF8_GET_NEXT;
 166 get2: UTF8_GET_NEXT;
 167 get1: UTF8_GET_NEXT;
 168     }
 169   else
 170     goto bad;
 171   *uu = u;
 172   return (byte *)p;
 173 }
 174
 175 /**
 176  * Decode a value from the range `[0, 0xFFFF]` (basic multilingual plane)
 177  * or return `UNI_REPLACEMENT` if the encoding has been corrupted.
 178  **/
 179 static inline byte *utf8_get(const byte *p, uns *uu)
 180 {
 181   return utf8_get_repl(p, uu, UNI_REPLACEMENT);
 182 }
 183
 184 /**
 185  * Decode a value from the range `[0, 0x7FFFFFFF]`
 186  * or return `UNI_REPLACEMENT` if the encoding has been corrupted.
 187  **/
 188 static inline byte *utf8_32_get(const byte *p, uns *uu)
 189 {
 190   return utf8_32_get_repl(p, uu, UNI_REPLACEMENT);
 191 }
 192
 193 #define UTF8_SKIP(p) do {                               \
 194     uns c = *p++;                                       \
 195     if (c >= 0xc0)                                      \
 196       while (c & 0x40 && *p >= 0x80 && *p < 0xc0)       \
 197         p++, c <<= 1;                                   \
 198   } while (0)
 199
 200 #define UTF8_SKIP_BWD(p) while ((*--(p) & 0xc0) == 0x80)
 201
 202 /**
 203  * Return the number of bytes needed to encode a given value from the range `[0, 0x7FFFFFFF]` to UTF-8.
 204  **/
 205 static inline uns utf8_space(uns u)
 206 {
 207   if (u < 0x80)
 208     return 1;
 209   if (u < 0x800)
 210     return 2;
 211   if (u < (1<<16))
 212     return 3;
 213   if (u < (1<<21))
 214     return 4;
 215   if (u < (1<<26))
 216     return 5;
 217   return 6;
 218 }
 219
 220 /**
 221  * Compute the length of a single UTF-8 character from its first byte. The encoding must be valid.
 222  **/
 223 static inline uns utf8_encoding_len(uns c)
 224 {
 225   if (c < 0x80)
 226     return 1;
 227   ASSERT(c >= 0xc0 && c < 0xfe);
 228   if (c < 0xe0)
 229     return 2;
 230   if (c < 0xf0)
 231     return 3;
 232   if (c < 0xf8)
 233     return 4;
 234   if (c < 0xfc)
 235     return 5;
 236   return 6;
 237 }
 238
 239 /**
 240  * Encode an UTF-16LE character from the range `[0, 0xD7FF]` or `[0xE000,0x11FFFF]`;
 241  * up to 4 bytes needed.
 242  **/
 243 static inline void *utf16_le_put(void *p, uns u)
 244 {
 245   if (u < 0xd800 || (u < 0x10000 && u >= 0xe000))
 246     {
 247       put_u16_le(p, u);
 248       return p + 2;
 249     }
 250   else if ((u -= 0x10000) < 0x100000)
 251     {
 252       put_u16_le(p, 0xd800 | (u >> 10));
 253       put_u16_le(p + 2, 0xdc00 | (u & 0x3ff));
 254       return p + 4;
 255     }
 256   else
 257     ASSERT(0);
 258 }
 259
 260 /**
 261  * Encode a UTF-16BE character from the range `[0, 0xD7FF]` or `[0xE000,0x11FFFF]`;
 262  * up to 4 bytes needed.
 263  **/
 264 static inline void *utf16_be_put(void *p, uns u)
 265 {
 266   if (u < 0xd800 || (u < 0x10000 && u >= 0xe000))
 267     {
 268       put_u16_be(p, u);
 269       return p + 2;
 270     }
 271   else if ((u -= 0x10000) < 0x100000)
 272     {
 273       put_u16_be(p, 0xd800 | (u >> 10));
 274       put_u16_be(p + 2, 0xdc00 | (u & 0x3ff));
 275       return p + 4;
 276     }
 277   else
 278     ASSERT(0);
 279 }
 280
 281 /**
 282  * Decode a UTF-16LE character from the range `[0, 0xD7FF]` or `[0xE000,11FFFF]`
 283  * or return @repl if the encoding has been corrupted.
 284  **/
 285 static inline void *utf16_le_get_repl(const void *p, uns *uu, uns repl)
 286 {
 287   uns u = get_u16_le(p), x, y;
 288   x = u - 0xd800;
 289   if (x < 0x800)
 290     if (x < 0x400 && (y = get_u16_le(p + 2) - 0xdc00) < 0x400)
 291       {
 292         u = 0x10000 + (x << 10) + y;
 293         p += 2;
 294       }
 295     else
 296       u = repl;
 297   *uu = u;
 298   return (void *)(p + 2);
 299 }
 300
 301 /**
 302  * Decode a UTF-16BE character from the range `[0, 0xD7FF]` or `[0xE000,11FFFF]`
 303  * or return @repl if the encoding has been corrupted.
 304  **/
 305 static inline void *utf16_be_get_repl(const void *p, uns *uu, uns repl)
 306 {
 307   uns u = get_u16_be(p), x, y;
 308   x = u - 0xd800;
 309   if (x < 0x800)
 310     if (x < 0x400 && (y = get_u16_be(p + 2) - 0xdc00) < 0x400)
 311       {
 312         u = 0x10000 + (x << 10) + y;
 313         p += 2;
 314       }
 315     else
 316       u = repl;
 317   *uu = u;
 318   return (void *)(p + 2);
 319 }
 320
 321 /**
 322  * Decode a UTF-16LE  character from the range `[0, 0xD7FF]` or `[0xE000,11FFFF]`
 323  * or return `UNI_REPLACEMENT` if the encoding has been corrupted.
 324  **/
 325 static inline void *utf16_le_get(const void *p, uns *uu)
 326 {
 327   return utf16_le_get_repl(p, uu, UNI_REPLACEMENT);
 328 }
 329
 330 /**
 331  * Decode a UTF-16BE  character from the range `[0, 0xD7FF]` or `[0xE000,11FFFF]`
 332  * or return `UNI_REPLACEMENT` if the encoding has been corrupted.
 333  **/
 334 static inline void *utf16_be_get(const void *p, uns *uu)
 335 {
 336   return utf16_be_get_repl(p, uu, UNI_REPLACEMENT);
 337 }
 338
 339 /**
 340  * Basic sanity check on Unicode characters. Return `UNI_REPLACEMENT` if the input
 341  * character is a surrogate, ASCII or Latin-1 control character different from the tab,
 342  * or if it lies outside the basic plane. In all other cases, it acts as an identity.
 343  **/
 344 static inline uns unicode_sanitize_char(uns u)
 345 {
 346   if (u >= 0x10000 ||                   // We don't accept anything outside the basic plane
 347       u >= 0xd800 && u < 0xf900 ||      // neither we do surrogates
 348       u >= 0x80 && u < 0xa0 ||          // nor latin-1 control chars
 349       u < 0x20 && u != '\t')
 350     return UNI_REPLACEMENT;
 351   return u;
 352 }
 353
 354 /* unicode-utf8.c */
 355
 356 /**
 357  * Count the number of Unicode characters in a zero-terminated UTF-8 string.
 358  * Returned value for corrupted encoding is undefined, but is never greater than strlen().
 359  **/
 360 uns utf8_strlen(const byte *str);
 361
 362 /**
 363  * Same as @utf8_strlen(), but returns at most @n characters.
 364  **/
 365 uns utf8_strnlen(const byte *str, uns n);
 366
 367 #endif