charset/unicode.h

   1 /*
   2  *      The UniCode Library
   3  *
   4  *      (c) 1997--2003 Martin Mares <mj@ucw.cz>
   5  *
   6  *      This software may be freely distributed and used according to the terms
   7  *      of the GNU Lesser General Public License.
   8  */
   9
  10 #ifndef _UNICODE_H
  11 #define _UNICODE_H
  12
/*
 * Two-level character property tables, defined outside this header.
 * The accessors below index the first level by (x >> 8) and the second
 * level by (x & 0xff); a NULL first-level entry means there is no table
 * for that 256-character page.
 */
extern const byte *_U_cat[];	/* Category bit masks, read by Ucategory() */
extern const word *_U_upper[], *_U_lower[], *_U_unaccent[];	/* Character mappings; a 0 entry is treated as "no mapping" */
  15
  16 static inline uns Ucategory(uns x)
  17 {
  18   if (_U_cat[x >> 8U])
  19     return _U_cat[x >> 8U][x & 0xff];
  20   else
  21     return 0;
  22 }
  23
  24 static inline uns Utoupper(uns x)
  25 {
  26   word w = (_U_upper[x >> 8U]) ? _U_upper[x >> 8U][x & 0xff] : 0;
  27   return w ? w : x;
  28 }
  29
  30 static inline uns Utolower(uns x)
  31 {
  32   word w = (_U_lower[x >> 8U]) ? _U_lower[x >> 8U][x & 0xff] : 0;
  33   return w ? w : x;
  34 }
  35
  36 static inline uns Uunaccent(uns x)
  37 {
  38   word w = (_U_unaccent[x >> 8U]) ? _U_unaccent[x >> 8U][x & 0xff] : 0;
  39   return w ? w : x;
  40 }
  41
/*
 * Ligature expansion lookup, defined outside this header.
 * NOTE(review): presumably returns the expansion of a character flagged
 * _U_LIGATURE; the terminator convention and the result for characters
 * that are not ligatures are not visible here -- confirm against the
 * definition.
 */
extern const word *Uexpand_lig(uns x);
  43
  44 enum unicode_char_type {
  45   _U_LETTER = 1,                /* Letters */
  46   _U_UPPER = 2,                 /* Upper-case letters */
  47   _U_LOWER = 4,                 /* Lower-case letters */
  48   _U_CTRL = 8,                  /* Control characters */
  49   _U_DIGIT = 16,                /* Digits */
  50   _U_XDIGIT = 32,               /* Hexadecimal digits */
  51   _U_SPACE = 64,                /* White spaces (spaces, tabs, newlines) */
  52   _U_LIGATURE = 128,            /* Compatibility ligature (to be expanded) */
  53 };
  54
  55 #define _U_LUPPER (_U_LETTER | _U_UPPER)
  56 #define _U_LLOWER (_U_LETTER | _U_LOWER)
  57
  58 #define UCat(x,y) (Ucategory(x) & (y))
  59
  60 #define Ualpha(x) UCat(x, _U_LETTER)
  61 #define Uupper(x) UCat(x, _U_UPPER)
  62 #define Ulower(x) UCat(x, _U_LOWER)
  63 #define Udigit(x) UCat(x, _U_DIGIT)
  64 #define Uxdigit(x) UCat(x, (_U_DIGIT | _U_XDIGIT))
  65 #define Ualnum(x) UCat(x, (_U_LETTER | _U_DIGIT))
  66 #define Uctrl(x) UCat(x, _U_CTRL)
  67 #define Uprint(x) !Uctrl(x)
  68 #define Uspace(x) UCat(x, _U_SPACE)
  69
/*
 * Value GET_UTF8_CHAR() stores for sequences it does not decode
 * (lead byte >= 0xf0).
 * NOTE(review): 0xfffc is U+FFFC OBJECT REPLACEMENT CHARACTER; the Unicode
 * REPLACEMENT CHARACTER is U+FFFD.  Left unchanged because callers may
 * compare against this value -- confirm which one is intended.
 */
#define UNI_REPLACEMENT 0xfffc
  71
  72 #define PUT_UTF8(p,u) do {              \
  73   if (u < 0x80)                         \
  74     *p++ = u;                           \
  75   else if (u < 0x800)                   \
  76     {                                   \
  77       *p++ = 0xc0 | (u >> 6);           \
  78       *p++ = 0x80 | (u & 0x3f);         \
  79     }                                   \
  80   else                                  \
  81     {                                   \
  82       *p++ = 0xe0 | (u >> 12);          \
  83       *p++ = 0x80 | ((u >> 6) & 0x3f);  \
  84       *p++ = 0x80 | (u & 0x3f);         \
  85     }                                   \
  86   } while(0)
  87
/*
 * IS_UTF8(c): true when byte c is >= 0xc0, which GET_UTF8() treats as the
 * lead byte of a multi-byte sequence.  c must be an unsigned byte value;
 * a negative (signed char) value always compares false.
 */
#define IS_UTF8(c) ((c) >= 0xc0)
  89
  90 #define GET_UTF8_CHAR(p,u) do {         \
  91     if (*p >= 0xf0)                     \
  92       { /* Too large, use replacement char */   \
  93         p++;                            \
  94         while ((*p & 0xc0) == 0x80)     \
  95           p++;                          \
  96         u = UNI_REPLACEMENT;            \
  97       }                                 \
  98     else if (*p >= 0xe0)                \
  99       {                                 \
 100         u = *p++ & 0x0f;                \
 101         if ((*p & 0xc0) == 0x80)        \
 102           u = (u << 6) | (*p++ & 0x3f); \
 103         if ((*p & 0xc0) == 0x80)        \
 104           u = (u << 6) | (*p++ & 0x3f); \
 105       }                                 \
 106     else                                \
 107       {                                 \
 108         u = *p++ & 0x1f;                \
 109         if ((*p & 0xc0) == 0x80)        \
 110           u = (u << 6) | (*p++ & 0x3f); \
 111       }                                 \
 112   } while (0)                           \
 113
 114 #define GET_UTF8(p,u)                   \
 115     if (IS_UTF8(*p))                    \
 116       GET_UTF8_CHAR(p,u);               \
 117     else                                \
 118       u = *p++
 119
 120 #define UTF8_SKIP(p) do {                               \
 121     uns c = *p++;                                       \
 122     if (c >= 0xc0)                                      \
 123       while (c & 0x40 && *p >= 0x80 && *p < 0xc0)       \
 124         p++, c <<= 1;                                   \
 125   } while (0)
 126
 127 #define UTF8_SKIP_BWD(p) while ((--*(p) & 0xc0) == 0x80)
 128
 129 #define UTF8_SPACE(u) ((u) < 0x80 ? 1 : (u) < 0x800 ? 2 : 3)
 130
/*
 * Conversion and length helpers implemented outside this header.
 * NOTE(review): the notes below are inferred from the names and parameter
 * types only; argument roles, termination and the meaning of the results
 * are not visible here -- confirm against the definitions.
 */
uns ucs2_to_utf8(byte *, word *);	/* presumably converts a UCS-2 (word) string to UTF-8 (byte) */
uns utf8_to_ucs2(word *, byte *);	/* presumably converts a UTF-8 (byte) string to UCS-2 (word) */
byte *static_ucs2_to_utf8(word *);	/* presumably as ucs2_to_utf8() but returning a static buffer -- verify lifetime */
uns Ustrlen(word *);			/* presumably the length of a UCS-2 string */
uns utf8_strlen(byte *str);		/* presumably the length of a UTF-8 string in characters */
uns utf8_strnlen(byte *str, uns n);	/* presumably as utf8_strlen(), limited by n -- unit of n to be confirmed */
 137
 138 #endif