lib/unicode.h

   1 /*
   2  *      Sherlock Library -- Unicode Characters
   3  *
   4  *      (c) 1997--2004 Martin Mares <mj@ucw.cz>
   5  *      (c) 2004 Robert Spalek <robert@ucw.cz>
   6  *
   7  *      This software may be freely distributed and used according to the terms
   8  *      of the GNU Lesser General Public License.
   9  */
  10
  11 #ifndef _UNICODE_H
  12 #define _UNICODE_H
  13
  14 /* Macros for handling UTF-8 */
  15
/*
 * Code point that GET_UTF8_CHAR and GET_UTF8_32_CHAR store in their output
 * argument when the lead byte is outside the range they decode.
 * NOTE(review): Unicode names U+FFFD "REPLACEMENT CHARACTER" and U+FFFC
 * "OBJECT REPLACEMENT CHARACTER" -- confirm that 0xfffc is the intended value.
 */
#define UNI_REPLACEMENT 0xfffc
  17
  18 #define PUT_UTF8(p,u) do {              \
  19   if (u < 0x80)                         \
  20     *p++ = u;                           \
  21   else if (u < 0x800)                   \
  22     {                                   \
  23       *p++ = 0xc0 | (u >> 6);           \
  24       *p++ = 0x80 | (u & 0x3f);         \
  25     }                                   \
  26   else                                  \
  27     {                                   \
  28       *p++ = 0xe0 | (u >> 12);          \
  29       *p++ = 0x80 | ((u >> 6) & 0x3f);  \
  30       *p++ = 0x80 | (u & 0x3f);         \
  31     }                                   \
  32   } while(0)
  33
  34 #define PUT_UTF8_32(p,u) do {           \
  35   if (u < (1<<16))                      \
  36     PUT_UTF8(p,u);                      \
  37   else if (u < (1<<21))                 \
  38     {                                   \
  39       *p++ = 0xf0 | (u >> 18);          \
  40       *p++ = 0x80 | ((u >> 12) & 0x3f); \
  41       *p++ = 0x80 | ((u >> 6) & 0x3f);  \
  42       *p++ = 0x80 | (u & 0x3f);         \
  43     }                                   \
  44   else if (u < (1<<26))                 \
  45     {                                   \
  46       *p++ = 0xf8 | (u >> 24);          \
  47       *p++ = 0x80 | ((u >> 18) & 0x3f); \
  48       *p++ = 0x80 | ((u >> 12) & 0x3f); \
  49       *p++ = 0x80 | ((u >> 6) & 0x3f);  \
  50       *p++ = 0x80 | (u & 0x3f);         \
  51     }                                   \
  52   else if (u < (1U<<31))                \
  53     {                                   \
  54       *p++ = 0xfc | (u >> 30);          \
  55       *p++ = 0x80 | ((u >> 24) & 0x3f); \
  56       *p++ = 0x80 | ((u >> 18) & 0x3f); \
  57       *p++ = 0x80 | ((u >> 12) & 0x3f); \
  58       *p++ = 0x80 | ((u >> 6) & 0x3f);  \
  59       *p++ = 0x80 | (u & 0x3f);         \
  60     }                                   \
  61   } while(0)
  62
  63 #define IS_UTF8(c) ((c) >= 0xc0)
  64
  65 #define GET_UTF8_CHAR(p,u) do {         \
  66     if (*p >= 0xf0)                     \
  67       { /* Too large, use replacement char */   \
  68         p++;                            \
  69         while ((*p & 0xc0) == 0x80)     \
  70           p++;                          \
  71         u = UNI_REPLACEMENT;            \
  72       }                                 \
  73     else if (*p >= 0xe0)                \
  74       {                                 \
  75         u = *p++ & 0x0f;                \
  76         if ((*p & 0xc0) == 0x80)        \
  77           u = (u << 6) | (*p++ & 0x3f); \
  78         if ((*p & 0xc0) == 0x80)        \
  79           u = (u << 6) | (*p++ & 0x3f); \
  80       }                                 \
  81     else                                \
  82       {                                 \
  83         u = *p++ & 0x1f;                \
  84         if ((*p & 0xc0) == 0x80)        \
  85           u = (u << 6) | (*p++ & 0x3f); \
  86       }                                 \
  87   } while (0)                           \
  88
  89 #define GET_UTF8_32_CHAR(p,u) do {      \
  90     if (*p < 0xf0)                      \
  91       GET_UTF8_CHAR(p,u);               \
  92     else if (*p < 0xf8)                 \
  93       {                                 \
  94         u = *p++ & 0x07;                \
  95         if ((*p & 0xc0) == 0x80)        \
  96           u = (u << 6) | (*p++ & 0x3f); \
  97         if ((*p & 0xc0) == 0x80)        \
  98           u = (u << 6) | (*p++ & 0x3f); \
  99         if ((*p & 0xc0) == 0x80)        \
 100           u = (u << 6) | (*p++ & 0x3f); \
 101       }                                 \
 102     else if (*p < 0xfc)                 \
 103       {                                 \
 104         u = *p++ & 0x03;                \
 105         if ((*p & 0xc0) == 0x80)        \
 106           u = (u << 6) | (*p++ & 0x3f); \
 107         if ((*p & 0xc0) == 0x80)        \
 108           u = (u << 6) | (*p++ & 0x3f); \
 109         if ((*p & 0xc0) == 0x80)        \
 110           u = (u << 6) | (*p++ & 0x3f); \
 111         if ((*p & 0xc0) == 0x80)        \
 112           u = (u << 6) | (*p++ & 0x3f); \
 113       }                                 \
 114     else if (*p < 0xfe)                 \
 115       {                                 \
 116         u = *p++ & 0x01;                \
 117         if ((*p & 0xc0) == 0x80)        \
 118           u = (u << 6) | (*p++ & 0x3f); \
 119         if ((*p & 0xc0) == 0x80)        \
 120           u = (u << 6) | (*p++ & 0x3f); \
 121         if ((*p & 0xc0) == 0x80)        \
 122           u = (u << 6) | (*p++ & 0x3f); \
 123         if ((*p & 0xc0) == 0x80)        \
 124           u = (u << 6) | (*p++ & 0x3f); \
 125         if ((*p & 0xc0) == 0x80)        \
 126           u = (u << 6) | (*p++ & 0x3f); \
 127       }                                 \
 128     else                                \
 129       { /* Too large, use replacement char */   \
 130         p++;                            \
 131         while ((*p & 0xc0) == 0x80)     \
 132           p++;                          \
 133         u = UNI_REPLACEMENT;            \
 134       }                                 \
 135   } while (0)                           \
 136
 137 #define GET_UTF8(p,u)                   \
 138     if (IS_UTF8(*p))                    \
 139       GET_UTF8_CHAR(p,u);               \
 140     else                                \
 141       u = *p++
 142
 143 #define GET_UTF8_32(p,u)                \
 144     if (IS_UTF8(*p))                    \
 145       GET_UTF8_32_CHAR(p,u);            \
 146     else                                \
 147       u = *p++
 148
 149 #define UTF8_SKIP(p) do {                               \
 150     uns c = *p++;                                       \
 151     if (c >= 0xc0)                                      \
 152       while (c & 0x40 && *p >= 0x80 && *p < 0xc0)       \
 153         p++, c <<= 1;                                   \
 154   } while (0)
 155
 156 #define UTF8_SKIP_BWD(p) while ((*--(p) & 0xc0) == 0x80)
 157
 158 static inline uns
 159 utf8_space(uns u)
 160 {
 161   if (u < 0x80)
 162     return 1;
 163   if (u < 0x800)
 164     return 2;
 165   if (u < (1<<16))
 166     return 3;
 167   if (u < (1<<21))
 168     return 4;
 169   if (u < (1<<26))
 170     return 5;
 171   return 6;
 172 }
 173
 174 static inline uns
 175 utf8_encoding_len(uns c)
 176 {
 177   if (c < 0x80)
 178     return 1;
 179   ASSERT(c >= 0xc0 && c < 0xfe);
 180   if (c < 0xe0)
 181     return 2;
 182   if (c < 0xf0)
 183     return 3;
 184   if (c < 0xf8)
 185     return 4;
 186   if (c < 0xfc)
 187     return 5;
 188   return 6;
 189 }
 190
/* unicode-utf8.c */

/*
 * Declarations only; the definitions are not part of this header.
 * NOTE(review): judging by the names, utf8_strlen presumably returns the
 * length of the UTF-8 string str counted in characters, and utf8_strnlen
 * presumably does the same while looking at no more than n bytes --
 * confirm against the implementation.
 */
uns utf8_strlen(byte *str);
uns utf8_strnlen(byte *str, uns n);
 195
 196 #endif