/*
* The UniCode Library
*
- * (c) 1997 Martin Mares, <mj@atrey.karlin.mff.cuni.cz>
+ * (c) 1997--2003 Martin Mares <mj@ucw.cz>
+ *
+ * This software may be freely distributed and used according to the terms
+ * of the GNU Lesser General Public License.
*/
#ifndef _UNICODE_H
#define _UNICODE_H
-#include "lib/config.h"
-
-extern byte *_U_cat[], *_U_sig[];
-extern word *_U_upper[], *_U_lower[], *_U_unaccent[];
-
-#define _C_UPPER 1 /* Upper-case letters */
-#define _C_LOWER 2 /* Lower-case letters */
-#define _C_PRINT 4 /* Printable */
-#define _C_DIGIT 8 /* Digits */
-#define _C_CTRL 16 /* Control characters */
-#define _C_XDIGIT 32 /* Hexadecimal digits */
-#define _C_BLANK 64 /* Blanks */
-#define _C_INNER 128 /* `inner punctuation' -- underscore etc. */
+extern const byte *_U_cat[];
+extern const word *_U_upper[], *_U_lower[], *_U_unaccent[];
-#define _C_ALPHA (_C_UPPER | _C_LOWER)
-#define _C_ALNUM (_C_ALPHA | _C_DIGIT)
-#define _C_WORD (_C_ALNUM | _C_INNER)
-#define _C_WSTART (_C_ALPHA | _C_INNER)
-
-static inline uns Ucategory(word x)
+static inline uns Ucategory(uns x)	/* Look up x in the two-level _U_cat table: page x >> 8, slot x & 0xff.
+ * Returns the stored byte (UCat() masks it against the _U_* flags), or 0 when the page pointer is null.
+ * NOTE(review): x >> 8 is not range-checked here -- confirm callers only pass code points the table covers.
+ */
{
if (_U_cat[x >> 8U])
return _U_cat[x >> 8U][x & 0xff];
return 0;
}
-static inline word Utoupper(word x)
+static inline uns Utoupper(uns x)	/* Map x through the two-level _U_upper table (presumably to its upper-case form).
+ * A null page or a zero table entry means "no mapping": x is returned unchanged.
+ * NOTE(review): entries are `word` and x >> 8 is not range-checked -- confirm the expected input range.
+ */
{
word w = (_U_upper[x >> 8U]) ? _U_upper[x >> 8U][x & 0xff] : 0;
return w ? w : x;
}
-static inline word Utolower(word x)
+static inline uns Utolower(uns x)	/* Map x through the two-level _U_lower table (presumably to its lower-case form).
+ * A null page or a zero table entry means "no mapping": x is returned unchanged.
+ * NOTE(review): entries are `word` and x >> 8 is not range-checked -- confirm the expected input range.
+ */
{
word w = (_U_lower[x >> 8U]) ? _U_lower[x >> 8U][x & 0xff] : 0;
return w ? w : x;
}
-static inline word Uunaccent(word x)
+static inline uns Uunaccent(uns x)	/* Map x through the two-level _U_unaccent table (presumably to its accent-free form).
+ * A null page or a zero table entry means "no mapping": x is returned unchanged.
+ * NOTE(review): entries are `word` and x >> 8 is not range-checked -- confirm the expected input range.
+ */
{
word w = (_U_unaccent[x >> 8U]) ? _U_unaccent[x >> 8U][x & 0xff] : 0;
return w ? w : x;
}
-static inline byte Usig(word x)
-{
- if (_U_sig[x >> 8U])
- return _U_sig[x >> 8U][x & 0xff] ? : 0xff;
- else
- return 0xff;
-}
+extern const word *Uexpand_lig(uns x);
+
+enum unicode_char_type {
+ _U_LETTER = 1, /* Letters */
+ _U_UPPER = 2, /* Upper-case letters */
+ _U_LOWER = 4, /* Lower-case letters */
+ _U_CTRL = 8, /* Control characters */
+ _U_DIGIT = 16, /* Digits */
+ _U_XDIGIT = 32, /* Hexadecimal digits */
+ _U_SPACE = 64, /* White spaces (spaces, tabs, newlines) */
+ _U_LIGATURE = 128, /* Compatibility ligature (to be expanded) */
+};
+
+#define _U_LUPPER (_U_LETTER | _U_UPPER)
+#define _U_LLOWER (_U_LETTER | _U_LOWER)
#define UCat(x,y) (Ucategory(x) & (y))
-#define Uupper(x) UCat(x, _C_UPPER)
-#define Ulower(x) UCat(x, _C_LOWER)
-#define Ualpha(x) UCat(x, _C_ALPHA)
-#define Ualnum(x) UCat(x, _C_ALNUM)
+#define Ualpha(x) UCat(x, _U_LETTER)
+#define Uupper(x) UCat(x, _U_UPPER)
+#define Ulower(x) UCat(x, _U_LOWER)
+#define Udigit(x) UCat(x, _U_DIGIT)
+#define Uxdigit(x) UCat(x, (_U_DIGIT | _U_XDIGIT))
+#define Ualnum(x) UCat(x, (_U_LETTER | _U_DIGIT))
+#define Uctrl(x) UCat(x, _U_CTRL)
#define Uprint(x) !Uctrl(x)
-#define Udigit(x) UCat(x, _C_DIGIT)
-#define Uxdigit(x) UCat(x, _C_XDIGIT)
-#define Uword(x) UCat(x, _C_WORD)
-#define Ublank(x) UCat(x, _C_BLANK)
-#define Uctrl(x) UCat(x, _C_CTRL)
-#define Uspace(x) Ublank(x)
+#define Uspace(x) UCat(x, _U_SPACE)
#define UNI_REPLACEMENT 0xfffc
else \
u = *p++
+/*
+ * Advance p past one UTF-8 encoded character: the lead byte plus as many
+ * continuation bytes (0x80..0xbf) as the lead byte's length bits announce.
+ * Skipping stops at the first non-continuation byte, so a truncated
+ * sequence never swallows the following character or a terminating zero.
+ *
+ * p must be a modifiable pointer lvalue and is evaluated several times.
+ * It is expected to point to unsigned bytes (with a signed char type the
+ * ">= 0x80" test cannot succeed).  The argument is parenthesized so that
+ * expressions such as *pp or s->ptr advance the intended object.
+ */
+#define UTF8_SKIP(p) do {						\
+    uns utf8_skip_lead_ = *(p)++;					\
+    if (utf8_skip_lead_ >= 0xc0)					\
+      while (utf8_skip_lead_ & 0x40 && *(p) >= 0x80 && *(p) < 0xc0)	\
+        (p)++, utf8_skip_lead_ <<= 1;					\
+  } while (0)
+
+/*
+ * Step p back to the first byte of the previous UTF-8 character: keep
+ * pre-decrementing the pointer while the byte it lands on is a
+ * continuation byte (10xxxxxx).  The pointer is moved; the bytes it
+ * points to are only read.
+ *
+ * Expands to a bare `while (...)`, so the caller's trailing semicolon is
+ * the empty loop body.  No lower bound is checked: the caller must ensure
+ * a non-continuation byte exists before p within the buffer.
+ */
+#define UTF8_SKIP_BWD(p) while ((*--(p) & 0xc0) == 0x80)
+
#define UTF8_SPACE(u) ((u) < 0x80 ? 1 : (u) < 0x800 ? 2 : 3)
uns ucs2_to_utf8(byte *, word *);
uns utf8_to_ucs2(word *, byte *);
byte *static_ucs2_to_utf8(word *);
uns Ustrlen(word *);
+uns utf8_strlen(byte *str);
+uns utf8_strnlen(byte *str, uns n);
#endif