Interface to our own regex library.

[libucw.git] / charset / unicode.h
diff --git a/charset/unicode.h b/charset/unicode.h

index a01eb97c23d22b004cca1a60a9b17b7c103c480e..7bc6f235812ef3c5529dac1de956e768924eea7f 100644 (file)
--- a/charset/unicode.h
+++ b/charset/unicode.h
@@ -1,32 +1,19 @@
  /*
   *     The UniCode Library
   *
- *     (c) 1997 Martin Mares, <mj@atrey.karlin.mff.cuni.cz>
+ *     (c) 1997--2003 Martin Mares <mj@ucw.cz>
+ *
+ *     This software may be freely distributed and used according to the terms
+ *     of the GNU Lesser General Public License.
   */
  
  #ifndef _UNICODE_H
  #define _UNICODE_H
  
-#include "lib/config.h"
-
-extern byte *_U_cat[], *_U_sig[];
-extern word *_U_upper[], *_U_lower[], *_U_unaccent[];
-
-#define _C_UPPER 1                     /* Upper-case letters */
-#define _C_LOWER 2                     /* Lower-case letters */
-#define _C_PRINT 4                     /* Printable */
-#define _C_DIGIT 8                     /* Digits */
-#define _C_CTRL 16                     /* Control characters */
-#define _C_XDIGIT 32                   /* Hexadecimal digits */
-#define _C_BLANK 64                    /* Blanks */
-#define _C_INNER 128                   /* `inner punctuation' -- underscore etc. */
+extern const byte *_U_cat[];
+extern const word *_U_upper[], *_U_lower[], *_U_unaccent[];
  
-#define _C_ALPHA (_C_UPPER | _C_LOWER)
-#define _C_ALNUM (_C_ALPHA | _C_DIGIT)
-#define _C_WORD (_C_ALNUM | _C_INNER)
-#define _C_WSTART (_C_ALPHA | _C_INNER)
-
-static inline uns Ucategory(word x)
+static inline uns Ucategory(uns x)
  {
    if (_U_cat[x >> 8U])
      return _U_cat[x >> 8U][x & 0xff];
@@ -34,45 +21,51 @@ static inline uns Ucategory(word x)
      return 0;
  }
  
-static inline word Utoupper(word x)
+static inline uns Utoupper(uns x)
  {
    word w = (_U_upper[x >> 8U]) ? _U_upper[x >> 8U][x & 0xff] : 0;
    return w ? w : x;
  }
  
-static inline word Utolower(word x)
+static inline uns Utolower(uns x)
  {
    word w = (_U_lower[x >> 8U]) ? _U_lower[x >> 8U][x & 0xff] : 0;
    return w ? w : x;
  }
  
-static inline word Uunaccent(word x)
+static inline uns Uunaccent(uns x)
  {
    word w = (_U_unaccent[x >> 8U]) ? _U_unaccent[x >> 8U][x & 0xff] : 0;
    return w ? w : x;
  }
  
-static inline byte Usig(word x)
-{
-  if (_U_sig[x >> 8U])
-    return _U_sig[x >> 8U][x & 0xff] ? : 0xff;
-  else
-    return 0xff;
-}
+extern const word *Uexpand_lig(uns x);
+
+enum unicode_char_type {
+  _U_LETTER = 1,               /* Letters */
+  _U_UPPER = 2,                        /* Upper-case letters */
+  _U_LOWER = 4,                        /* Lower-case letters */
+  _U_CTRL = 8,                 /* Control characters */
+  _U_DIGIT = 16,               /* Digits */
+  _U_XDIGIT = 32,              /* Hexadecimal digits */
+  _U_SPACE = 64,               /* White spaces (spaces, tabs, newlines) */
+  _U_LIGATURE = 128,           /* Compatibility ligature (to be expanded) */
+};
+
+#define _U_LUPPER (_U_LETTER | _U_UPPER)
+#define _U_LLOWER (_U_LETTER | _U_LOWER)
  
  #define UCat(x,y) (Ucategory(x) & (y))
  
-#define Uupper(x) UCat(x, _C_UPPER)
-#define Ulower(x) UCat(x, _C_LOWER)
-#define Ualpha(x) UCat(x, _C_ALPHA)
-#define Ualnum(x) UCat(x, _C_ALNUM)
+#define Ualpha(x) UCat(x, _U_LETTER)
+#define Uupper(x) UCat(x, _U_UPPER)
+#define Ulower(x) UCat(x, _U_LOWER)
+#define Udigit(x) UCat(x, _U_DIGIT)
+#define Uxdigit(x) UCat(x, (_U_DIGIT | _U_XDIGIT))
+#define Ualnum(x) UCat(x, (_U_LETTER | _U_DIGIT))
+#define Uctrl(x) UCat(x, _U_CTRL)
  #define Uprint(x) !Uctrl(x)
-#define Udigit(x) UCat(x, _C_DIGIT)
-#define Uxdigit(x) UCat(x, _C_XDIGIT)
-#define Uword(x) UCat(x, _C_WORD)
-#define Ublank(x) UCat(x, _C_BLANK)
-#define Uctrl(x) UCat(x, _C_CTRL)
-#define Uspace(x) Ublank(x)
+#define Uspace(x) UCat(x, _U_SPACE)
  
  #define UNI_REPLACEMENT 0xfffc
  
@@ -124,11 +117,22 @@ static inline byte Usig(word x)
      else                               \
        u = *p++
  
+#define UTF8_SKIP(p) do {       /* advance p past one UTF-8 character; p is evaluated several times */ \
+    uns c = *(p)++;             /* first byte; its leading 1 bits bound the sequence length */ \
+    if (c >= 0xc0)                                     \
+      while (c & 0x40 && *(p) >= 0x80 && *(p) < 0xc0)  /* stop early at a non-continuation byte */ \
+        (p)++, c <<= 1;                                \
+  } while (0)
+
+#define UTF8_SKIP_BWD(p) while ((*--(p) & 0xc0) == 0x80)  /* step p back over continuation bytes to the previous character's first byte */
+
  #define UTF8_SPACE(u) ((u) < 0x80 ? 1 : (u) < 0x800 ? 2 : 3)
  
  uns ucs2_to_utf8(byte *, word *);
  uns utf8_to_ucs2(word *, byte *);
  byte *static_ucs2_to_utf8(word *);
  uns Ustrlen(word *);
+uns utf8_strlen(byte *str);
+uns utf8_strnlen(byte *str, uns n);
  
  #endif