X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=charset%2Funicode.h;h=7bc6f235812ef3c5529dac1de956e768924eea7f;hb=2b797fef8d8133e5b1cb40706aabc92718da40bd;hp=a01eb97c23d22b004cca1a60a9b17b7c103c480e;hpb=679480caf343ae93611a854cf899c258acfaa02f;p=libucw.git

diff --git a/charset/unicode.h b/charset/unicode.h
index a01eb97c..7bc6f235 100644
--- a/charset/unicode.h
+++ b/charset/unicode.h
@@ -1,32 +1,19 @@
 /*
  *	The UniCode Library
  *
- *	(c) 1997 Martin Mares, <mj@atrey.karlin.mff.cuni.cz>
+ *	(c) 1997--2003 Martin Mares <mj@ucw.cz>
+ *
+ *	This software may be freely distributed and used according to the terms
+ *	of the GNU Lesser General Public License.
  */
 
 #ifndef _UNICODE_H
 #define _UNICODE_H
 
-#include "lib/config.h"
-
-extern byte *_U_cat[], *_U_sig[];
-extern word *_U_upper[], *_U_lower[], *_U_unaccent[];
-
-#define _C_UPPER 1			/* Upper-case letters */
-#define _C_LOWER 2			/* Lower-case letters */
-#define _C_PRINT 4			/* Printable */
-#define _C_DIGIT 8			/* Digits */
-#define _C_CTRL 16			/* Control characters */
-#define _C_XDIGIT 32			/* Hexadecimal digits */
-#define _C_BLANK 64			/* Blanks */
-#define _C_INNER 128			/* `inner punctuation' -- underscore etc. */
+extern const byte *_U_cat[];
+extern const word *_U_upper[], *_U_lower[], *_U_unaccent[];
 
-#define _C_ALPHA (_C_UPPER | _C_LOWER)
-#define _C_ALNUM (_C_ALPHA | _C_DIGIT)
-#define _C_WORD (_C_ALNUM | _C_INNER)
-#define _C_WSTART (_C_ALPHA | _C_INNER)
-
-static inline uns Ucategory(word x)
+static inline uns Ucategory(uns x)
 {
   if (_U_cat[x >> 8U])
     return _U_cat[x >> 8U][x & 0xff];
@@ -34,45 +21,51 @@ static inline uns Ucategory(word x)
     return 0;
 }
 
-static inline word Utoupper(word x)
+static inline uns Utoupper(uns x)
 {
   word w = (_U_upper[x >> 8U]) ? _U_upper[x >> 8U][x & 0xff] : 0;
   return w ? w : x;
 }
 
-static inline word Utolower(word x)
+static inline uns Utolower(uns x)
 {
   word w = (_U_lower[x >> 8U]) ? _U_lower[x >> 8U][x & 0xff] : 0;
   return w ? w : x;
 }
 
-static inline word Uunaccent(word x)
+static inline uns Uunaccent(uns x)
 {
   word w = (_U_unaccent[x >> 8U]) ? _U_unaccent[x >> 8U][x & 0xff] : 0;
   return w ? w : x;
 }
 
-static inline byte Usig(word x)
-{
-  if (_U_sig[x >> 8U])
-    return _U_sig[x >> 8U][x & 0xff] ? : 0xff;
-  else
-    return 0xff;
-}
+extern const word *Uexpand_lig(uns x);
+
+enum unicode_char_type {
+  _U_LETTER = 1,		/* Letters */
+  _U_UPPER = 2,			/* Upper-case letters */
+  _U_LOWER = 4,			/* Lower-case letters */
+  _U_CTRL = 8,			/* Control characters */
+  _U_DIGIT = 16,		/* Digits */
+  _U_XDIGIT = 32,		/* Hexadecimal digits */
+  _U_SPACE = 64,		/* White spaces (spaces, tabs, newlines) */
+  _U_LIGATURE = 128,		/* Compatibility ligature (to be expanded) */
+};
+
+#define _U_LUPPER (_U_LETTER | _U_UPPER)
+#define _U_LLOWER (_U_LETTER | _U_LOWER)
 
 #define UCat(x,y) (Ucategory(x) & (y))
 
-#define Uupper(x) UCat(x, _C_UPPER)
-#define Ulower(x) UCat(x, _C_LOWER)
-#define Ualpha(x) UCat(x, _C_ALPHA)
-#define Ualnum(x) UCat(x, _C_ALNUM)
+#define Ualpha(x) UCat(x, _U_LETTER)
+#define Uupper(x) UCat(x, _U_UPPER)
+#define Ulower(x) UCat(x, _U_LOWER)
+#define Udigit(x) UCat(x, _U_DIGIT)
+#define Uxdigit(x) UCat(x, (_U_DIGIT | _U_XDIGIT))
+#define Ualnum(x) UCat(x, (_U_LETTER | _U_DIGIT))
+#define Uctrl(x) UCat(x, _U_CTRL)
 #define Uprint(x) !Uctrl(x)
-#define Udigit(x) UCat(x, _C_DIGIT)
-#define Uxdigit(x) UCat(x, _C_XDIGIT)
-#define Uword(x) UCat(x, _C_WORD)
-#define Ublank(x) UCat(x, _C_BLANK)
-#define Uctrl(x) UCat(x, _C_CTRL)
-#define Uspace(x) Ublank(x)
+#define Uspace(x) UCat(x, _U_SPACE)
 
 #define UNI_REPLACEMENT 0xfffc
 
@@ -124,11 +117,22 @@ static inline byte Usig(word x)
     else				\
       u = *p++
 
+#define UTF8_SKIP(p) do {	/* Advance p past one UTF-8 character; p is evaluated several times */ \
+    uns c = *(p)++;		/* first byte of the character */	\
+    if (c >= 0xc0)		/* lead byte of a multi-byte sequence */ \
+      while (c & 0x40 && *(p) >= 0x80 && *(p) < 0xc0)	\
+        (p)++, c <<= 1;	/* skip one continuation byte (0x80..0xbf) per remaining length bit */ \
+  } while (0)
+
+#define UTF8_SKIP_BWD(p) while ((*--(p) & 0xc0) == 0x80)	/* Step p back to the first byte of the previous UTF-8 character; expands to a body-less while, so follow it with ';' */
+
 #define UTF8_SPACE(u) ((u) < 0x80 ? 1 : (u) < 0x800 ? 2 : 3)
 
 uns ucs2_to_utf8(byte *, word *);
 uns utf8_to_ucs2(word *, byte *);
 byte *static_ucs2_to_utf8(word *);
 uns Ustrlen(word *);
+uns utf8_strlen(byte *str);
+uns utf8_strnlen(byte *str, uns n);
 
 #endif