charset/unicode.h

   1 /*
   2  *      The UniCode Library
   3  *
   4  *      (c) 1997--2003 Martin Mares <mj@ucw.cz>
   5  *
   6  *      This software may be freely distributed and used according to the terms
   7  *      of the GNU Lesser General Public License.
   8  */
   9
  10 #ifndef _UNICODE_H
  11 #define _UNICODE_H
  12
  13 #include "lib/chartype.h"
  14
  15 extern byte *_U_cat[];
  16 extern word *_U_upper[], *_U_lower[], *_U_unaccent[];
  17
  18 static inline uns Ucategory(word x)
  19 {
  20   if (_U_cat[x >> 8U])
  21     return _U_cat[x >> 8U][x & 0xff];
  22   else
  23     return 0;
  24 }
  25
  26 static inline word Utoupper(word x)
  27 {
  28   word w = (_U_upper[x >> 8U]) ? _U_upper[x >> 8U][x & 0xff] : 0;
  29   return w ? w : x;
  30 }
  31
  32 static inline word Utolower(word x)
  33 {
  34   word w = (_U_lower[x >> 8U]) ? _U_lower[x >> 8U][x & 0xff] : 0;
  35   return w ? w : x;
  36 }
  37
  38 static inline word Uunaccent(word x)
  39 {
  40   word w = (_U_unaccent[x >> 8U]) ? _U_unaccent[x >> 8U][x & 0xff] : 0;
  41   return w ? w : x;
  42 }
  43
  44 #define UCat(x,y) (Ucategory(x) & (y))
  45
  46 #define Uupper(x) UCat(x, _C_UPPER)
  47 #define Ulower(x) UCat(x, _C_LOWER)
  48 #define Ualpha(x) UCat(x, _C_ALPHA)
  49 #define Ualnum(x) UCat(x, _C_ALNUM)
  50 #define Uprint(x) !Uctrl(x)
  51 #define Udigit(x) UCat(x, _C_DIGIT)
  52 #define Uxdigit(x) UCat(x, _C_XDIGIT)
  53 #define Uword(x) UCat(x, _C_WORD)
  54 #define Ublank(x) UCat(x, _C_BLANK)
  55 #define Uctrl(x) UCat(x, _C_CTRL)
  56 #define Uspace(x) Ublank(x)
  57
  58 #define UNI_REPLACEMENT 0xfffc	/* Value GET_UTF8_CHAR stores for sequences whose lead byte is >= 0xf0.  NOTE(review): Unicode's REPLACEMENT CHARACTER is U+FFFD; U+FFFC is OBJECT REPLACEMENT CHARACTER -- confirm 0xfffc is intended. */
  59
  60 #define PUT_UTF8(p,u) do {              \
  61   if (u < 0x80)                         \
  62     *p++ = u;                           \
  63   else if (u < 0x800)                   \
  64     {                                   \
  65       *p++ = 0xc0 | (u >> 6);           \
  66       *p++ = 0x80 | (u & 0x3f);         \
  67     }                                   \
  68   else                                  \
  69     {                                   \
  70       *p++ = 0xe0 | (u >> 12);          \
  71       *p++ = 0x80 | ((u >> 6) & 0x3f);  \
  72       *p++ = 0x80 | (u & 0x3f);         \
  73     }                                   \
  74   } while(0)
  75
  76 #define IS_UTF8(c) ((c) >= 0xc0)
  77
  78 #define GET_UTF8_CHAR(p,u) do {         \
  79     if (*p >= 0xf0)                     \
  80       { /* Too large, use replacement char */   \
  81         p++;                            \
  82         while ((*p & 0xc0) == 0x80)     \
  83           p++;                          \
  84         u = UNI_REPLACEMENT;            \
  85       }                                 \
  86     else if (*p >= 0xe0)                \
  87       {                                 \
  88         u = *p++ & 0x0f;                \
  89         if ((*p & 0xc0) == 0x80)        \
  90           u = (u << 6) | (*p++ & 0x3f); \
  91         if ((*p & 0xc0) == 0x80)        \
  92           u = (u << 6) | (*p++ & 0x3f); \
  93       }                                 \
  94     else                                \
  95       {                                 \
  96         u = *p++ & 0x1f;                \
  97         if ((*p & 0xc0) == 0x80)        \
  98           u = (u << 6) | (*p++ & 0x3f); \
  99       }                                 \
 100   } while (0)                           \
 101
 102 #define GET_UTF8(p,u)                   \
 103     if (IS_UTF8(*p))                    \
 104       GET_UTF8_CHAR(p,u);               \
 105     else                                \
 106       u = *p++
 107
 108 #define UTF8_SKIP(p) do {                               \
 109     uns c = *p++;                                       \
 110     if (c >= 0xc0)                                      \
 111       while (c & 0x40 && *p >= 0x80 && *p < 0xc0)       \
 112         p++, c <<= 1;                                   \
 113   } while (0)
 114
 115 #define UTF8_SPACE(u) ((u) < 0x80 ? 1 : (u) < 0x800 ? 2 : 3)
 116
 117 uns ucs2_to_utf8(byte *, word *);	/* Implemented outside this header.  Presumably converts a UCS-2 string (word *) to UTF-8 (byte *); meaning of the return value not visible here -- confirm. */
 118 uns utf8_to_ucs2(word *, byte *);	/* Presumably the reverse conversion, UTF-8 (byte *) to UCS-2 (word *) -- confirm against the implementation. */
 119 byte *static_ucs2_to_utf8(word *);	/* NOTE(review): the name suggests the result lives in a static buffer owned by the callee -- confirm before keeping the pointer across calls. */
 120 uns Ustrlen(word *);	/* Presumably the length of a UCS-2 string -- confirm the terminator and unit of the result. */
 121 uns utf8_strlen(byte *str);	/* Presumably the length of a UTF-8 string in characters rather than bytes -- confirm. */
 122 uns utf8_strnlen(byte *str, uns n);	/* Presumably like utf8_strlen() but bounded by n -- confirm whether n counts bytes or characters. */
 123
 124 #endif