lib/unicode.h

   1 /*
   2  *      UCW Library -- Unicode Characters
   3  *
   4  *      (c) 1997--2004 Martin Mares <mj@ucw.cz>
   5  *      (c) 2004 Robert Spalek <robert@ucw.cz>
   6  *
   7  *      This software may be freely distributed and used according to the terms
   8  *      of the GNU Lesser General Public License.
   9  */
  10
  11 #ifndef _UCW_UNICODE_H
  12 #define _UCW_UNICODE_H
  13
  14 /* Macros for handling UTF-8 */
  15
  /* Value the decoders in this header store into their output when the
   * input byte sequence is malformed.
   * NOTE(review): Unicode names U+FFFD "REPLACEMENT CHARACTER", while U+FFFC
   * is "OBJECT REPLACEMENT CHARACTER" -- confirm that 0xfffc is intentional. */
  16 #define UNI_REPLACEMENT 0xfffc
  17
  18 /* Encode a character from the basic multilingual plane [0, 0xFFFF]
  19  * (subset of Unicode 4.0); up to 3 bytes needed (RFC2279) */
  20 static inline byte *
  21 utf8_put(byte *p, uns u)
  22 {
  23   if (u < 0x80)
  24     *p++ = u;
  25   else if (u < 0x800)
  26     {
  27       *p++ = 0xc0 | (u >> 6);
  28       *p++ = 0x80 | (u & 0x3f);
  29     }
  30   else
  31     {
  32       ASSERT(u < 0x10000);
  33       *p++ = 0xe0 | (u >> 12);
  34       *p++ = 0x80 | ((u >> 6) & 0x3f);
  35       *p++ = 0x80 | (u & 0x3f);
  36     }
  37   return p;
  38 }
  39
  40 /* Encode a value from the range [0, 0x7FFFFFFF];
  41  * (superset of Unicode 4.0) up to 6 bytes needed (RFC2279) */
  42 static inline byte *
  43 utf8_32_put(byte *p, uns u)
  44 {
  45   if (u < 0x80)
  46     *p++ = u;
  47   else if (u < 0x800)
  48     {
  49       *p++ = 0xc0 | (u >> 6);
  50       goto put1;
  51     }
  52   else if (u < (1<<16))
  53     {
  54       *p++ = 0xe0 | (u >> 12);
  55       goto put2;
  56     }
  57   else if (u < (1<<21))
  58     {
  59       *p++ = 0xf0 | (u >> 18);
  60       goto put3;
  61     }
  62   else if (u < (1<<26))
  63     {
  64       *p++ = 0xf8 | (u >> 24);
  65       goto put4;
  66     }
  67   else if (u < (1U<<31))
  68     {
  69       *p++ = 0xfc | (u >> 30);
  70       *p++ = 0x80 | ((u >> 24) & 0x3f);
  71 put4: *p++ = 0x80 | ((u >> 18) & 0x3f);
  72 put3: *p++ = 0x80 | ((u >> 12) & 0x3f);
  73 put2: *p++ = 0x80 | ((u >> 6) & 0x3f);
  74 put1: *p++ = 0x80 | (u & 0x3f);
  75     }
  76   else
  77     ASSERT(0);
  78   return p;
  79 }
  80
  /* Consume one UTF-8 continuation byte inside a decoder.
   *
   * The enclosing function has to provide a byte pointer `p`, an accumulator
   * `u` and a label `bad`.  If *p is not of the form 10xxxxxx, control jumps
   * to `bad` with `p` still pointing at the offending byte; otherwise the low
   * six bits of *p are shifted into `u` and `p` advances by one.
   *
   * The expansion is wrapped in do { } while (0) so that it is exactly one
   * statement: as two bare statements, `if (x) UTF8_GET_NEXT;` would guard
   * only the check and still perform the shift unconditionally. */
  #define UTF8_GET_NEXT do {                              \
      if (unlikely((*p & 0xc0) != 0x80))                  \
        goto bad;                                         \
      u = (u << 6) | (*p++ & 0x3f);                       \
    } while (0)
  82
  83 /* Decode a character from the basic multilingual plane [0, 0xFFFF]
  84  * or return UNI_REPLACEMENT if the encoding has been corrupted */
  85 static inline byte *
  86 utf8_get(const byte *p, uns *uu)
  87 {
  88   uns u = *p++;
  89   if (u < 0x80)
  90     ;
  91   else if (unlikely(u < 0xc0))
  92     {
  93       /* Incorrect byte sequence */
  94     bad:
  95       u = UNI_REPLACEMENT;
  96     }
  97   else if (u < 0xe0)
  98     {
  99       u &= 0x1f;
 100       UTF8_GET_NEXT;
 101     }
 102   else if (likely(u < 0xf0))
 103     {
 104       u &= 0x0f;
 105       UTF8_GET_NEXT;
 106       UTF8_GET_NEXT;
 107     }
 108   else
 109     goto bad;
 110   *uu = u;
 111   return (byte *)p;
 112 }
 113
 114 /* Decode a value from the range [0, 0x7FFFFFFF]
 115  * or return UNI_REPLACEMENT if the encoding has been corrupted */
 116 static inline byte *
 117 utf8_32_get(const byte *p, uns *uu)
 118 {
 119   uns u = *p++;
 120   if (u < 0x80)
 121     ;
 122   else if (unlikely(u < 0xc0))
 123     {
 124       /* Incorrect byte sequence */
 125     bad:
 126       u = UNI_REPLACEMENT;
 127     }
 128   else if (u < 0xe0)
 129     {
 130       u &= 0x1f;
 131       goto get1;
 132     }
 133   else if (u < 0xf0)
 134     {
 135       u &= 0x0f;
 136       goto get2;
 137     }
 138   else if (u < 0xf8)
 139     {
 140       u &= 0x07;
 141       goto get3;
 142     }
 143   else if (u < 0xfc)
 144     {
 145       u &= 0x03;
 146       goto get4;
 147     }
 148   else if (u < 0xfe)
 149     {
 150       u &= 0x01;
 151       UTF8_GET_NEXT;
 152 get4: UTF8_GET_NEXT;
 153 get3: UTF8_GET_NEXT;
 154 get2: UTF8_GET_NEXT;
 155 get1: UTF8_GET_NEXT;
 156     }
 157   else
 158     goto bad;
 159   *uu = u;
 160   return (byte *)p;
 161 }
 162
 /* Statement-style wrappers around utf8_put() / utf8_get(): both assign the
  * advanced pointer back to `p`, so `p` is evaluated twice and has to be an
  * lvalue.  GET_UTF8 stores the decoded value into the lvalue `u`. */
 163 #define PUT_UTF8(p,u) p = utf8_put(p, u)
 164 #define GET_UTF8(p,u) p = (byte*)utf8_get(p, &(u))
 165
 /* The same wrappers for utf8_32_put() / utf8_32_get(). */
 166 #define PUT_UTF8_32(p,u) p = utf8_32_put(p, u)
 167 #define GET_UTF8_32(p,u) p = (byte*)utf8_32_get(p, &(u))
 168
 /* Advance the byte pointer `p` past one UTF-8 sequence without decoding it.
  *
  * The lead byte is always consumed.  If it is 0xc0 or above, continuation
  * bytes (0x80..0xbf) are consumed as well for as long as the lead byte's
  * length bits, examined one per step by shifting a copy left, are set; the
  * scan stops early at the first byte that is not a continuation byte.
  *
  * `p` has to be a modifiable lvalue and is evaluated several times.  Every
  * use is parenthesized so that arguments such as `*pp` advance the intended
  * pointer (unparenthesized, `*p++` would expand to `**pp++`).  The expansion
  * declares a local named `c`, so the argument must not refer to a variable
  * of that name. */
 #define UTF8_SKIP(p) do {                               \
     uns c = *(p)++;                                     \
     if (c >= 0xc0)                                      \
       while (c & 0x40 && *(p) >= 0x80 && *(p) < 0xc0)   \
         (p)++, c <<= 1;                                 \
   } while (0)

 /* Step `p` backwards until it points at a byte that is not a continuation
  * byte (10xxxxxx).  Expands to a while loop without a body, so the caller
  * supplies the terminating `;`.  No check against the start of the buffer
  * is made. */
 #define UTF8_SKIP_BWD(p) while ((*--(p) & 0xc0) == 0x80)
 177
 178 static inline uns
 179 utf8_space(uns u)
 180 {
 181   if (u < 0x80)
 182     return 1;
 183   if (u < 0x800)
 184     return 2;
 185   if (u < (1<<16))
 186     return 3;
 187   if (u < (1<<21))
 188     return 4;
 189   if (u < (1<<26))
 190     return 5;
 191   return 6;
 192 }
 193
 194 static inline uns
 195 utf8_encoding_len(uns c)
 196 {
 197   if (c < 0x80)
 198     return 1;
 199   ASSERT(c >= 0xc0 && c < 0xfe);
 200   if (c < 0xe0)
 201     return 2;
 202   if (c < 0xf0)
 203     return 3;
 204   if (c < 0xf8)
 205     return 4;
 206   if (c < 0xfc)
 207     return 5;
 208   return 6;
 209 }
 210
 211 /* unicode-utf8.c */
 212
 /* Declarations only; the marker above points at unicode-utf8.c for the
  * implementation.
  * NOTE(review): the names suggest these count the characters of a UTF-8
  * string, with utf8_strnlen bounded by `n` -- confirm the exact contract
  * (unit of `n`, handling of malformed input) against the implementation. */
 213 uns utf8_strlen(const byte *str);
 214 uns utf8_strnlen(const byte *str, uns n);
 215
 216 #endif