4 * (c) 1997--2003 Martin Mares <mj@ucw.cz>
6 * This software may be freely distributed and used according to the terms
7 * of the GNU Lesser General Public License.
13 extern const byte *_U_cat[];
14 extern const word *_U_upper[], *_U_lower[], *_U_unaccent[];
16 static inline uns Ucategory(uns x)
19 return _U_cat[x >> 8U][x & 0xff];
24 static inline uns Utoupper(uns x)
26 word w = (_U_upper[x >> 8U]) ? _U_upper[x >> 8U][x & 0xff] : 0;
30 static inline uns Utolower(uns x)
32 word w = (_U_lower[x >> 8U]) ? _U_lower[x >> 8U][x & 0xff] : 0;
36 static inline uns Uunaccent(uns x)
38 word w = (_U_unaccent[x >> 8U]) ? _U_unaccent[x >> 8U][x & 0xff] : 0;
42 extern const word *Uexpand_lig(uns x);
44 enum unicode_char_type {
45 _U_LETTER = 1, /* Letters */
46 _U_UPPER = 2, /* Upper-case letters */
47 _U_LOWER = 4, /* Lower-case letters */
48 _U_CTRL = 8, /* Control characters */
49 _U_DIGIT = 16, /* Digits */
50 _U_XDIGIT = 32, /* Hexadecimal digits */
51 _U_SPACE = 64, /* White spaces (spaces, tabs, newlines) */
52 _U_LIGATURE = 128, /* Compatibility ligature (to be expanded) */
/*
 * Combined category masks: the letter bit together with the upper-case
 * or lower-case bit of enum unicode_char_type.
 */
#define _U_LUPPER (_U_LETTER | _U_UPPER)
#define _U_LLOWER (_U_LETTER | _U_LOWER)
/*
 * Character-class predicates built on Ucategory().
 *
 * UCat(x,y) yields the category bits of character x selected by mask y,
 * so the result is non-zero iff x has at least one of the requested
 * categories.  Apart from Uprint(), the predicates return that masked
 * value itself rather than a normalized 0/1.
 */
#define UCat(x,y) (Ucategory(x) & (y))

#define Ualpha(x) UCat(x, _U_LETTER)			/* letter */
#define Uupper(x) UCat(x, _U_UPPER)			/* upper-case letter */
#define Ulower(x) UCat(x, _U_LOWER)			/* lower-case letter */
#define Udigit(x) UCat(x, _U_DIGIT)			/* digit */
#define Uxdigit(x) UCat(x, (_U_DIGIT | _U_XDIGIT))	/* digit or hexadecimal digit */
#define Ualnum(x) UCat(x, (_U_LETTER | _U_DIGIT))	/* letter or digit */
#define Uctrl(x) UCat(x, _U_CTRL)			/* control character */
/* 1 iff x is not a control character; parenthesized so the expansion stays one operand. */
#define Uprint(x) (!Uctrl(x))
#define Uspace(x) UCat(x, _U_SPACE)			/* white space */
/*
 * Code point that GET_UTF8_CHAR stores when a sequence is too large to
 * be decoded.
 * NOTE(review): 0xfffc is U+FFFC (OBJECT REPLACEMENT CHARACTER); the
 * Unicode REPLACEMENT CHARACTER is U+FFFD.  The value is kept as found --
 * confirm whether it is intentional before changing it.
 */
#define UNI_REPLACEMENT 0xfffc
72 #define PUT_UTF8(p,u) do { \
77 *p++ = 0xc0 | (u >> 6); \
78 *p++ = 0x80 | (u & 0x3f); \
82 *p++ = 0xe0 | (u >> 12); \
83 *p++ = 0x80 | ((u >> 6) & 0x3f); \
84 *p++ = 0x80 | (u & 0x3f); \
/*
 * Non-zero when byte value c is 0xc0 or above, i.e. both top bits are set,
 * which is the range used by lead bytes of multi-byte UTF-8 sequences.
 * ASCII (below 0x80) and continuation bytes (0x80..0xbf) yield 0.
 * c must be an unsigned byte value: if it has a signed char type, bytes
 * of 0x80 and above are negative and the comparison is always false.
 */
#define IS_UTF8(c) ((c) >= 0xc0)
90 #define GET_UTF8_CHAR(p,u) do { \
92 { /* Too large, use replacement char */ \
94 while ((*p & 0xc0) == 0x80) \
96 u = UNI_REPLACEMENT; \
98 else if (*p >= 0xe0) \
101 if ((*p & 0xc0) == 0x80) \
102 u = (u << 6) | (*p++ & 0x3f); \
103 if ((*p & 0xc0) == 0x80) \
104 u = (u << 6) | (*p++ & 0x3f); \
109 if ((*p & 0xc0) == 0x80) \
110 u = (u << 6) | (*p++ & 0x3f); \
114 #define GET_UTF8(p,u) \
116 GET_UTF8_CHAR(p,u); \
120 #define UTF8_SKIP(p) do { \
123 while (c & 0x40 && *p >= 0x80 && *p < 0xc0) \
/*
 * Move p backwards to the first byte of the preceding UTF-8 character:
 * step back one byte at a time for as long as the byte just reached is a
 * continuation byte (10xxxxxx).  The macro expands to a `while' whose body
 * is the empty statement supplied by the caller's semicolon.
 *
 * The pointer itself must be decremented before it is read, hence *--(p);
 * writing --*(p) would instead decrement the byte p points to and never
 * move the pointer.  p is evaluated once per step and must be an lvalue.
 */
#define UTF8_SKIP_BWD(p) while ((*--(p) & 0xc0) == 0x80)
/*
 * Number of bytes needed to store code point u in UTF-8: 1 below 0x80,
 * 2 below 0x800 and 3 for everything else (so the result is only
 * meaningful for values up to 0xffff).  u may be evaluated twice.
 */
#define UTF8_SPACE(u) ((u) < 0x80 ? 1 : (u) < 0x800 ? 2 : 3)
131 uns ucs2_to_utf8(byte *, word *);
132 uns utf8_to_ucs2(word *, byte *);
133 byte *static_ucs2_to_utf8(word *);
135 uns utf8_strlen(byte *str);
136 uns utf8_strnlen(byte *str, uns n);