X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;ds=sidebyside;f=charset%2Funicode.h;h=7bc6f235812ef3c5529dac1de956e768924eea7f;hb=2b797fef8d8133e5b1cb40706aabc92718da40bd;hp=80c4ff88a7815bf562b80960d82dbc44bc8e8bb3;hpb=0f448197aa5c08a75e91d56ed2f9acde3e132f64;p=libucw.git

diff --git a/charset/unicode.h b/charset/unicode.h
index 80c4ff88..7bc6f235 100644
--- a/charset/unicode.h
+++ b/charset/unicode.h
@@ -10,12 +10,10 @@
 #ifndef _UNICODE_H
 #define _UNICODE_H
 
-#include "lib/chartype.h"
+extern const byte *_U_cat[];	/* Category pages indexed by x >> 8, each page indexed by x & 0xff; a NULL page reads as category 0 */
+extern const word *_U_upper[], *_U_lower[], *_U_unaccent[];	/* Mapping pages indexed the same way; a NULL page or a zero entry leaves x unchanged */
 
-extern byte *_U_cat[];
-extern word *_U_upper[], *_U_lower[], *_U_unaccent[];
-
-static inline uns Ucategory(word x)
+static inline uns Ucategory(uns x)
 {
   if (_U_cat[x >> 8U])
     return _U_cat[x >> 8U][x & 0xff];
@@ -23,37 +21,51 @@ static inline uns Ucategory(word x)
     return 0;
 }
 
-static inline word Utoupper(word x)
+static inline uns Utoupper(uns x)	/* Looks x up in _U_upper; returns the entry, or x itself when the page is NULL or the entry is 0 */
 {
   word w = (_U_upper[x >> 8U]) ? _U_upper[x >> 8U][x & 0xff] : 0;
   return w ? w : x;
 }
 
-static inline word Utolower(word x)
+static inline uns Utolower(uns x)	/* Looks x up in _U_lower; returns the entry, or x itself when the page is NULL or the entry is 0 */
 {
   word w = (_U_lower[x >> 8U]) ? _U_lower[x >> 8U][x & 0xff] : 0;
   return w ? w : x;
 }
 
-static inline word Uunaccent(word x)
+static inline uns Uunaccent(uns x)	/* Looks x up in _U_unaccent; returns the entry, or x itself when the page is NULL or the entry is 0 */
 {
   word w = (_U_unaccent[x >> 8U]) ? _U_unaccent[x >> 8U][x & 0xff] : 0;
   return w ? w : x;
 }
 
+extern const word *Uexpand_lig(uns x);	/* NOTE(review): defined elsewhere; presumably yields the expansion of a _U_LIGATURE character -- confirm result format and behaviour for non-ligatures */
+
+enum unicode_char_type {	/* Single-bit flags; UCat() ANDs them with the value returned by Ucategory() */
+  _U_LETTER = 1,		/* Letters */
+  _U_UPPER = 2,			/* Upper-case letters */
+  _U_LOWER = 4,			/* Lower-case letters */
+  _U_CTRL = 8,			/* Control characters */
+  _U_DIGIT = 16,		/* Digits */
+  _U_XDIGIT = 32,		/* Hexadecimal digits */
+  _U_SPACE = 64,		/* White spaces (spaces, tabs, newlines) */
+  _U_LIGATURE = 128,		/* Compatibility ligature (to be expanded) */
+};
+
+#define _U_LUPPER (_U_LETTER | _U_UPPER)	/* Combined mask: letter bit plus upper-case bit */
+#define _U_LLOWER (_U_LETTER | _U_LOWER)	/* Combined mask: letter bit plus lower-case bit */
+
 #define UCat(x,y) (Ucategory(x) & (y))
 
-#define Uupper(x) UCat(x, _C_UPPER)
-#define Ulower(x) UCat(x, _C_LOWER)
-#define Ualpha(x) UCat(x, _C_ALPHA)
-#define Ualnum(x) UCat(x, _C_ALNUM)
+#define Ualpha(x) UCat(x, _U_LETTER)	/* Non-zero (the masked bits, not necessarily 1) if x is a letter */
+#define Uupper(x) UCat(x, _U_UPPER)	/* Non-zero if x has the upper-case bit */
+#define Ulower(x) UCat(x, _U_LOWER)	/* Non-zero if x has the lower-case bit */
+#define Udigit(x) UCat(x, _U_DIGIT)	/* Non-zero if x has the digit bit */
+#define Uxdigit(x) UCat(x, (_U_DIGIT | _U_XDIGIT))	/* Non-zero if x has the digit or the hexadecimal-digit bit */
+#define Ualnum(x) UCat(x, (_U_LETTER | _U_DIGIT))	/* Non-zero if x has the letter or the digit bit */
+#define Uctrl(x) UCat(x, _U_CTRL)	/* Non-zero if x has the control-character bit */
 #define Uprint(x) !Uctrl(x)
-#define Udigit(x) UCat(x, _C_DIGIT)
-#define Uxdigit(x) UCat(x, _C_XDIGIT)
-#define Uword(x) UCat(x, _C_WORD)
-#define Ublank(x) UCat(x, _C_BLANK)
-#define Uctrl(x) UCat(x, _C_CTRL)
-#define Uspace(x) Ublank(x)
+#define Uspace(x) UCat(x, _U_SPACE)	/* Non-zero if x has the white-space bit */
 
 #define UNI_REPLACEMENT 0xfffc
 
@@ -112,6 +124,8 @@ static inline word Uunaccent(word x)
         p++, c <<= 1;					\
   } while (0)
 
+#define UTF8_SKIP_BWD(p) while ((*--(p) & 0xc0) == 0x80)	/* Step p back until it points at a byte that is not a UTF-8 continuation byte (10xxxxxx); caller adds the ';' */
+
 #define UTF8_SPACE(u) ((u) < 0x80 ? 1 : (u) < 0x800 ? 2 : 3)
 
 uns ucs2_to_utf8(byte *, word *);