charset/unicode.h

   1 /*
   2  *      The UniCode Library
   3  *
   4  *      (c) 1997 Martin Mares, <mj@atrey.karlin.mff.cuni.cz>
   5  */
   6
   7 #ifndef _UNICODE_H
   8 #define _UNICODE_H
   9
  10 #include "lib/config.h"
  11
  12 extern byte *_U_cat[], *_U_sig[];
  13 extern word *_U_upper[], *_U_lower[], *_U_unaccent[];
  14
  15 #define _C_UPPER 1                      /* Upper-case letters */
  16 #define _C_LOWER 2                      /* Lower-case letters */
  17 #define _C_PRINT 4                      /* Printable */
  18 #define _C_DIGIT 8                      /* Digits */
  19 #define _C_CTRL 16                      /* Control characters */
  20 #define _C_XDIGIT 32                    /* Hexadecimal digits */
  21 #define _C_BLANK 64                     /* Blanks */
  22 #define _C_INNER 128                    /* `inner punctuation' -- underscore etc. */
  23
  24 #define _C_ALPHA (_C_UPPER | _C_LOWER)
  25 #define _C_ALNUM (_C_ALPHA | _C_DIGIT)
  26 #define _C_WORD (_C_ALNUM | _C_INNER)
  27 #define _C_WSTART (_C_ALPHA | _C_INNER)
  28
  29 static inline uns Ucategory(word x)
  30 {
  31   if (_U_cat[x >> 8U])
  32     return _U_cat[x >> 8U][x & 0xff];
  33   else
  34     return 0;
  35 }
  36
  37 static inline word Utoupper(word x)
  38 {
  39   word w = (_U_upper[x >> 8U]) ? _U_upper[x >> 8U][x & 0xff] : 0;
  40   return w ? w : x;
  41 }
  42
  43 static inline word Utolower(word x)
  44 {
  45   word w = (_U_lower[x >> 8U]) ? _U_lower[x >> 8U][x & 0xff] : 0;
  46   return w ? w : x;
  47 }
  48
  49 static inline word Uunaccent(word x)
  50 {
  51   word w = (_U_unaccent[x >> 8U]) ? _U_unaccent[x >> 8U][x & 0xff] : 0;
  52   return w ? w : x;
  53 }
  54
  55 static inline byte Usig(word x)
  56 {
  57   if (_U_sig[x >> 8U])
  58     return _U_sig[x >> 8U][x & 0xff] ? : 0xff;
  59   else
  60     return 0xff;
  61 }
  62
  63 #define UCat(x,y) (Ucategory(x) & (y))
  64
  65 #define Uupper(x) UCat(x, _C_UPPER)
  66 #define Ulower(x) UCat(x, _C_LOWER)
  67 #define Ualpha(x) UCat(x, _C_ALPHA)
  68 #define Ualnum(x) UCat(x, _C_ALNUM)
  69 #define Uprint(x) !Uctrl(x)
  70 #define Udigit(x) UCat(x, _C_DIGIT)
  71 #define Uxdigit(x) UCat(x, _C_XDIGIT)
  72 #define Uword(x) UCat(x, _C_WORD)
  73 #define Ublank(x) UCat(x, _C_BLANK)
  74 #define Uctrl(x) UCat(x, _C_CTRL)
  75 #define Uspace(x) Ublank(x)
  76
  77 #define UNI_REPLACEMENT 0xfffc
  78
  79 #define PUT_UTF8(p,u) do {              \
  80   if (u < 0x80)                         \
  81     *p++ = u;                           \
  82   else if (u < 0x800)                   \
  83     {                                   \
  84       *p++ = 0xc0 | (u >> 6);           \
  85       *p++ = 0x80 | (u & 0x3f);         \
  86     }                                   \
  87   else                                  \
  88     {                                   \
  89       *p++ = 0xe0 | (u >> 12);          \
  90       *p++ = 0x80 | ((u >> 6) & 0x3f);  \
  91       *p++ = 0x80 | (u & 0x3f);         \
  92     }                                   \
  93   } while(0)
  94
  95 #define IS_UTF8(c) ((c) >= 0xc0)
  96
  97 #define GET_UTF8_CHAR(p,u) do {         \
  98     if (*p >= 0xf0)                     \
  99       { /* Too large, use replacement char */   \
 100         p++;                            \
 101         while ((*p & 0xc0) == 0x80)     \
 102           p++;                          \
 103         u = UNI_REPLACEMENT;            \
 104       }                                 \
 105     else if (*p >= 0xe0)                \
 106       {                                 \
 107         u = *p++ & 0x0f;                \
 108         if ((*p & 0xc0) == 0x80)        \
 109           u = (u << 6) | (*p++ & 0x3f); \
 110         if ((*p & 0xc0) == 0x80)        \
 111           u = (u << 6) | (*p++ & 0x3f); \
 112       }                                 \
 113     else                                \
 114       {                                 \
 115         u = *p++ & 0x1f;                \
 116         if ((*p & 0xc0) == 0x80)        \
 117           u = (u << 6) | (*p++ & 0x3f); \
 118       }                                 \
 119   } while (0)                           \
 120
 121 #define GET_UTF8(p,u)                   \
 122     if (IS_UTF8(*p))                    \
 123       GET_UTF8_CHAR(p,u);               \
 124     else                                \
 125       u = *p++
 126
 127 #define UTF8_SPACE(u) ((u) < 0x80 ? 1 : (u) < 0x800 ? 2 : 3)
 128
 129 uns ucs2_to_utf8(byte *, word *);
 130 uns utf8_to_ucs2(word *, byte *);
 131 byte *static_ucs2_to_utf8(word *);
 132 uns Ustrlen(word *);
 133
 134 #endif