ucw/unicode.h

   1 /*
   2  *      UCW Library -- Unicode Characters
   3  *
   4  *      (c) 1997--2004 Martin Mares <mj@ucw.cz>
   5  *      (c) 2004 Robert Spalek <robert@ucw.cz>
   6  *      (c) 2007 Pavel Charvat <pchar@ucw.cz>
   7  *
   8  *      This software may be freely distributed and used according to the terms
   9  *      of the GNU Lesser General Public License.
  10  */
  11
  12 #ifndef _UCW_UNICODE_H
  13 #define _UCW_UNICODE_H
  14
  15 #include "ucw/unaligned.h"
  16
  17 /* Macros for handling UTF-8 */
  18
   19 #define UNI_REPLACEMENT 0xfffc  /** Default value substituted for invalid input by the decoding helpers and unicode_sanitize_char() in this header. NOTE(review): 0xfffc is U+FFFC; the Unicode REPLACEMENT CHARACTER is U+FFFD -- confirm the value is intentional. **/
  20
  21 /**
  22  * Encode a value from the range `[0, 0xFFFF]`
  23  * (basic multilingual plane); up to 3 bytes needed (RFC2279).
  24  **/
  25 static inline byte *utf8_put(byte *p, uns u)
  26 {
  27   if (u < 0x80)
  28     *p++ = u;
  29   else if (u < 0x800)
  30     {
  31       *p++ = 0xc0 | (u >> 6);
  32       *p++ = 0x80 | (u & 0x3f);
  33     }
  34   else
  35     {
  36       ASSERT(u < 0x10000);
  37       *p++ = 0xe0 | (u >> 12);
  38       *p++ = 0x80 | ((u >> 6) & 0x3f);
  39       *p++ = 0x80 | (u & 0x3f);
  40     }
  41   return p;
  42 }
  43
  44 /**
  45  * Encode a value from the range `[0, 0x7FFFFFFF]`;
  46  * (superset of Unicode 4.0) up to 6 bytes needed (RFC2279).
  47  **/
  48 static inline byte *utf8_32_put(byte *p, uns u)
  49 {
  50   if (u < 0x80)
  51     *p++ = u;
  52   else if (u < 0x800)
  53     {
  54       *p++ = 0xc0 | (u >> 6);
  55       goto put1;
  56     }
  57   else if (u < (1<<16))
  58     {
  59       *p++ = 0xe0 | (u >> 12);
  60       goto put2;
  61     }
  62   else if (u < (1<<21))
  63     {
  64       *p++ = 0xf0 | (u >> 18);
  65       goto put3;
  66     }
  67   else if (u < (1<<26))
  68     {
  69       *p++ = 0xf8 | (u >> 24);
  70       goto put4;
  71     }
  72   else if (u < (1U<<31))
  73     {
  74       *p++ = 0xfc | (u >> 30);
  75       *p++ = 0x80 | ((u >> 24) & 0x3f);
  76 put4: *p++ = 0x80 | ((u >> 18) & 0x3f);
  77 put3: *p++ = 0x80 | ((u >> 12) & 0x3f);
  78 put2: *p++ = 0x80 | ((u >> 6) & 0x3f);
  79 put1: *p++ = 0x80 | (u & 0x3f);
  80     }
  81   else
  82     ASSERT(0);
  83   return p;
  84 }
  85
  86 #define UTF8_GET_NEXT if (unlikely((*p & 0xc0) != 0x80)) goto bad; u = (u << 6) | (*p++ & 0x3f)
  87
  88 /**
  89  * Decode a value from the range `[0, 0xFFFF]` (basic multilingual plane)
  90  * or return @repl if the encoding has been corrupted.
  91  **/
  92 static inline byte *utf8_get_repl(const byte *p, uns *uu, uns repl)
  93 {
  94   uns u = *p++;
  95   if (u < 0x80)
  96     ;
  97   else if (unlikely(u < 0xc0))
  98     {
  99       /* Incorrect byte sequence */
 100     bad:
 101       u = repl;
 102     }
 103   else if (u < 0xe0)
 104     {
 105       u &= 0x1f;
 106       UTF8_GET_NEXT;
 107     }
 108   else if (likely(u < 0xf0))
 109     {
 110       u &= 0x0f;
 111       UTF8_GET_NEXT;
 112       UTF8_GET_NEXT;
 113     }
 114   else
 115     goto bad;
 116   *uu = u;
 117   return (byte *)p;
 118 }
 119
 120 /**
 121  * Decode a value from the range `[0, 0x7FFFFFFF]`
 122  * or return @repl if the encoding has been corrupted.
 123  **/
 124 static inline byte *utf8_32_get_repl(const byte *p, uns *uu, uns repl)
 125 {
 126   uns u = *p++;
 127   if (u < 0x80)
 128     ;
 129   else if (unlikely(u < 0xc0))
 130     {
 131       /* Incorrect byte sequence */
 132     bad:
 133       u = repl;
 134     }
 135   else if (u < 0xe0)
 136     {
 137       u &= 0x1f;
 138       goto get1;
 139     }
 140   else if (u < 0xf0)
 141     {
 142       u &= 0x0f;
 143       goto get2;
 144     }
 145   else if (u < 0xf8)
 146     {
 147       u &= 0x07;
 148       goto get3;
 149     }
 150   else if (u < 0xfc)
 151     {
 152       u &= 0x03;
 153       goto get4;
 154     }
 155   else if (u < 0xfe)
 156     {
 157       u &= 0x01;
 158       UTF8_GET_NEXT;
 159 get4: UTF8_GET_NEXT;
 160 get3: UTF8_GET_NEXT;
 161 get2: UTF8_GET_NEXT;
 162 get1: UTF8_GET_NEXT;
 163     }
 164   else
 165     goto bad;
 166   *uu = u;
 167   return (byte *)p;
 168 }
 169
 170 /**
 171  * Decode a value from the range `[0, 0xFFFF]` (basic multilignual plane)
 172  * or return `UNI_REPLACEMENT` if the encoding has been corrupted.
 173  **/
 174 static inline byte *utf8_get(const byte *p, uns *uu)
 175 {
 176   return utf8_get_repl(p, uu, UNI_REPLACEMENT);
 177 }
 178
 179 /**
 180  * Decode a value from the range `[0, 0x7FFFFFFF]`
 181  * or return `UNI_REPLACEMENT` if the encoding has been corrupted.
 182  **/
 183 static inline byte *utf8_32_get(const byte *p, uns *uu)
 184 {
 185   return utf8_32_get_repl(p, uu, UNI_REPLACEMENT);
 186 }
 187
 188 #define UTF8_SKIP(p) do {                               \
 189     uns c = *p++;                                       \
 190     if (c >= 0xc0)                                      \
 191       while (c & 0x40 && *p >= 0x80 && *p < 0xc0)       \
 192         p++, c <<= 1;                                   \
 193   } while (0)
 194
 195 #define UTF8_SKIP_BWD(p) while ((*--(p) & 0xc0) == 0x80)
 196
 197 /**
 198  * Return the number of bytes needed to encode a given value from the range `[0, 0x7FFFFFFF]` to UTF-8.
 199  **/
 200 static inline uns utf8_space(uns u)
 201 {
 202   if (u < 0x80)
 203     return 1;
 204   if (u < 0x800)
 205     return 2;
 206   if (u < (1<<16))
 207     return 3;
 208   if (u < (1<<21))
 209     return 4;
 210   if (u < (1<<26))
 211     return 5;
 212   return 6;
 213 }
 214
 215 /**
 216  * Compute the length of a single UTF-8 character from its first byte. The encoding must be valid.
 217  **/
 218 static inline uns utf8_encoding_len(uns c)
 219 {
 220   if (c < 0x80)
 221     return 1;
 222   ASSERT(c >= 0xc0 && c < 0xfe);
 223   if (c < 0xe0)
 224     return 2;
 225   if (c < 0xf0)
 226     return 3;
 227   if (c < 0xf8)
 228     return 4;
 229   if (c < 0xfc)
 230     return 5;
 231   return 6;
 232 }
 233
 234 /**
 235  * Encode an UTF-16LE character from the range `[0, 0xD7FF]` or `[0xE000,0x11FFFF]`;
 236  * up to 4 bytes needed.
 237  **/
 238 static inline void *utf16_le_put(void *p, uns u)
 239 {
 240   if (u < 0xd800 || (u < 0x10000 && u >= 0xe000))
 241     {
 242       put_u16_le(p, u);
 243       return p + 2;
 244     }
 245   else if ((u -= 0x10000) < 0x100000)
 246     {
 247       put_u16_le(p, 0xd800 | (u >> 10));
 248       put_u16_le(p + 2, 0xdc00 | (u & 0x3ff));
 249       return p + 4;
 250     }
 251   else
 252     ASSERT(0);
 253 }
 254
 255 /**
 256  * Encode a UTF-16BE character from the range `[0, 0xD7FF]` or `[0xE000,0x11FFFF]`;
 257  * up to 4 bytes needed.
 258  **/
 259 static inline void *utf16_be_put(void *p, uns u)
 260 {
 261   if (u < 0xd800 || (u < 0x10000 && u >= 0xe000))
 262     {
 263       put_u16_be(p, u);
 264       return p + 2;
 265     }
 266   else if ((u -= 0x10000) < 0x100000)
 267     {
 268       put_u16_be(p, 0xd800 | (u >> 10));
 269       put_u16_be(p + 2, 0xdc00 | (u & 0x3ff));
 270       return p + 4;
 271     }
 272   else
 273     ASSERT(0);
 274 }
 275
 276 /**
 277  * Decode a UTF-16LE character from the range `[0, 0xD7FF]` or `[0xE000,11FFFF]`
 278  * or return @repl if the encoding has been corrupted.
 279  **/
 280 static inline void *utf16_le_get_repl(const void *p, uns *uu, uns repl)
 281 {
 282   uns u = get_u16_le(p), x, y;
 283   x = u - 0xd800;
 284   if (x < 0x800)
 285     if (x < 0x400 && (y = get_u16_le(p + 2) - 0xdc00) < 0x400)
 286       {
 287         u = 0x10000 + (x << 10) + y;
 288         p += 2;
 289       }
 290     else
 291       u = repl;
 292   *uu = u;
 293   return (void *)(p + 2);
 294 }
 295
 296 /**
 297  * Decode a UTF-16BE character from the range `[0, 0xD7FF]` or `[0xE000,11FFFF]`
 298  * or return @repl if the encoding has been corrupted.
 299  **/
 300 static inline void *utf16_be_get_repl(const void *p, uns *uu, uns repl)
 301 {
 302   uns u = get_u16_be(p), x, y;
 303   x = u - 0xd800;
 304   if (x < 0x800)
 305     if (x < 0x400 && (y = get_u16_be(p + 2) - 0xdc00) < 0x400)
 306       {
 307         u = 0x10000 + (x << 10) + y;
 308         p += 2;
 309       }
 310     else
 311       u = repl;
 312   *uu = u;
 313   return (void *)(p + 2);
 314 }
 315
 316 /**
 317  * Decode a UTF-16LE  character from the range `[0, 0xD7FF]` or `[0xE000,11FFFF]`
 318  * or return `UNI_REPLACEMENT` if the encoding has been corrupted.
 319  **/
 320 static inline void *utf16_le_get(const void *p, uns *uu)
 321 {
 322   return utf16_le_get_repl(p, uu, UNI_REPLACEMENT);
 323 }
 324
 325 /**
 326  * Decode a UTF-16BE  character from the range `[0, 0xD7FF]` or `[0xE000,11FFFF]`
 327  * or return `UNI_REPLACEMENT` if the encoding has been corrupted.
 328  **/
 329 static inline void *utf16_be_get(const void *p, uns *uu)
 330 {
 331   return utf16_be_get_repl(p, uu, UNI_REPLACEMENT);
 332 }
 333
 334 /**
 335  * Basic sanity check on Unicode characters. Return `UNI_REPLACEMENT` if the input
 336  * character is a surrogate, ASCII or Latin-1 control character different from the tab,
 337  * or if it lies outside the basic plane. In all other cases, it acts as an identity.
 338  **/
 339 static inline uns unicode_sanitize_char(uns u)
 340 {
 341   if (u >= 0x10000 ||                   // We don't accept anything outside the basic plane
 342       u >= 0xd800 && u < 0xf900 ||      // neither we do surrogates
 343       u >= 0x80 && u < 0xa0 ||          // nor latin-1 control chars
 344       u < 0x20 && u != '\t')
 345     return UNI_REPLACEMENT;
 346   return u;
 347 }
 348
 349 /* unicode-utf8.c */
 350
 351 /**
 352  * Count the number of Unicode characters in a zero-terminated UTF-8 string.
 353  * Returned value for corrupted encoding is undefined, but is never greater than strlen().
 354  **/
 355 uns utf8_strlen(const byte *str);
 356
 357 /**
 358  * Same as @utf8_strlen(), but returns at most @n characters.
 359  **/
 360 uns utf8_strnlen(const byte *str, uns n);
 361
 362 #endif