Several improvements to the unicode library:

author Martin Mares <mj@ucw.cz>

Sat, 11 Oct 2003 10:19:40 +0000 (10:19 +0000)

committer Martin Mares <mj@ucw.cz>

Sat, 11 Oct 2003 10:19:40 +0000 (10:19 +0000)
author Martin Mares <mj@ucw.cz>
Sat, 11 Oct 2003 10:19:40 +0000 (10:19 +0000)
committer Martin Mares <mj@ucw.cz>
Sat, 11 Oct 2003 10:19:40 +0000 (10:19 +0000)
diff --git a/charset/misc/gen-basic b/charset/misc/gen-basic

index 6c3b8fd89efb0f42c570508e5985139bb9b4398a..bd0044b82745ed59f30bae24910d89739a6ac2aa 100755 (executable)
--- a/charset/misc/gen-basic
+++ b/charset/misc/gen-basic
@@ -1,26 +1,30 @@
  #!/usr/bin/perl
  #
  #  Split Unicode Data File
-#  (c) 1997--2001 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+#  (c) 1997--2003 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
  #
  
  open(I, "unidata/UnicodeData.txt") || die "Unable to open UniCode data file";
  open(C, ">misc/u-cat") || die "cat file open";
  open(U, ">misc/u-upper") || die "upper file open";
  open(L, ">misc/u-lower") || die "lower file open";
+open(G, ">misc/u-ligatures") || die "lig file open";
  while (<I>) {
         chomp;
         (/^$/ || /^#/) && next;
         ($code,$name,$cat,$comb,$bidir,$decomp,$d0,$d1,$n0,$mirr,$cmt1,$cmt2,$upper,$lower,$title) = split /;/;
         $code =~ /^....$/ || next;
-       if ($cat =~ /^C/) { $ccat = "_C_CTRL"; }
-       elsif ($cat =~ /^Z/) { $ccat = "_C_BLANK"; }
-       elsif ($cat =~ /^Ll/) { $ccat = "_C_LOWER"; }
-       elsif ($cat =~ /^Lu/) { $ccat = "_C_UPPER"; }
-       elsif ($code ge "0030" && $code le "0039") { $ccat = "_C_DIGIT|_C_XDIGIT"; }
-       elsif ($code eq "005F") { $ccat = "_C_INNER"; }
+       if ($cat =~ /^C/) { $ccat = "_U_CTRL"; }
+       elsif ($cat =~ /^Z/) { $ccat = "_U_SPACE"; }
+       elsif ($decomp =~ /<compat>/ && $name =~ / LIGATURE /) {
+               $ccat = "_U_LIGATURE";
+               print G "$code\n";
+       } elsif ($cat =~ /^Ll/) { $ccat = "_U_LLOWER"; }
+       elsif ($cat =~ /^Lu/) { $ccat = "_U_LUPPER"; }
+       elsif ($cat =~ /^L/) { $ccat = "_U_LETTER"; }
+       elsif ($code ge "0030" && $code le "0039") { $ccat = "_U_DIGIT | _U_XDIGIT"; }
         else { $ccat = ""; }
-       if ($code ge "0041" && $code le "0046" || $code ge "0061" && $code le "0066") { $ccat = $ccat . "|_C_XDIGIT"; }
+       if ($code ge "0041" && $code le "0046" || $code ge "0061" && $code le "0066") { $ccat = $ccat . "|_U_XDIGIT"; }
         if ($ccat ne "") { print C "$code\t$ccat\n"; }
         if ($upper ne "") { print U "$code\t0x$upper\n"; }
         if ($lower ne "") { print L "$code\t0x$lower\n"; }
@@ -29,3 +33,4 @@ close I;
  close C;
  close U;
  close L;
+close G;
diff --git a/charset/unicode.h b/charset/unicode.h

index 80c4ff88a7815bf562b80960d82dbc44bc8e8bb3..3246f1346162a33c996a4abd8c08323d4a3b2162 100644 (file)
--- a/charset/unicode.h
+++ b/charset/unicode.h
@@ -10,12 +10,10 @@
  #ifndef _UNICODE_H
  #define _UNICODE_H
  
-#include "lib/chartype.h"
+extern const byte *_U_cat[];
+extern const word *_U_upper[], *_U_lower[], *_U_unaccent[];
  
-extern byte *_U_cat[];
-extern word *_U_upper[], *_U_lower[], *_U_unaccent[];
-
-static inline uns Ucategory(word x)
+static inline uns Ucategory(uns x)
  {
    if (_U_cat[x >> 8U])
      return _U_cat[x >> 8U][x & 0xff];
@@ -23,37 +21,51 @@ static inline uns Ucategory(word x)
      return 0;
  }
  
-static inline word Utoupper(word x)
+static inline uns Utoupper(uns x)
  {
    word w = (_U_upper[x >> 8U]) ? _U_upper[x >> 8U][x & 0xff] : 0;
    return w ? w : x;
  }
  
-static inline word Utolower(word x)
+static inline uns Utolower(uns x)
  {
    word w = (_U_lower[x >> 8U]) ? _U_lower[x >> 8U][x & 0xff] : 0;
    return w ? w : x;
  }
  
-static inline word Uunaccent(word x)
+static inline uns Uunaccent(uns x)
  {
    word w = (_U_unaccent[x >> 8U]) ? _U_unaccent[x >> 8U][x & 0xff] : 0;
    return w ? w : x;
  }
  
+extern const word *Uexpand_lig(uns x);
+
+enum unicode_char_type {      /* Bit flags (fit a byte of _U_cat); OR to combine, test via UCat() */
+  _U_LETTER = 1,               /* Letters */
+  _U_UPPER = 2,                /* Upper-case letters */
+  _U_LOWER = 4,                /* Lower-case letters */
+  _U_CTRL = 8,                 /* Control characters */
+  _U_DIGIT = 16,               /* Digits */
+  _U_XDIGIT = 32,              /* Hexadecimal digits */
+  _U_SPACE = 64,               /* White spaces (spaces, tabs, newlines) */
+  _U_LIGATURE = 128,           /* Compatibility ligature (to be expanded) */
+};
+
+#define _U_LUPPER (_U_LETTER | _U_UPPER)
+#define _U_LLOWER (_U_LETTER | _U_LOWER)
+
  #define UCat(x,y) (Ucategory(x) & (y))
  
-#define Uupper(x) UCat(x, _C_UPPER)
-#define Ulower(x) UCat(x, _C_LOWER)
-#define Ualpha(x) UCat(x, _C_ALPHA)
-#define Ualnum(x) UCat(x, _C_ALNUM)
+#define Ualpha(x) UCat(x, _U_LETTER)
+#define Uupper(x) UCat(x, _U_UPPER)
+#define Ulower(x) UCat(x, _U_LOWER)
+#define Udigit(x) UCat(x, _U_DIGIT)
+#define Uxdigit(x) UCat(x, (_U_DIGIT | _U_XDIGIT))
+#define Ualnum(x) UCat(x, (_U_LETTER | _U_DIGIT))
+#define Uctrl(x) UCat(x, _U_CTRL)
  #define Uprint(x) !Uctrl(x)
-#define Udigit(x) UCat(x, _C_DIGIT)
-#define Uxdigit(x) UCat(x, _C_XDIGIT)
-#define Uword(x) UCat(x, _C_WORD)
-#define Ublank(x) UCat(x, _C_BLANK)
-#define Uctrl(x) UCat(x, _C_CTRL)
-#define Uspace(x) Ublank(x)
+#define Uspace(x) UCat(x, _U_SPACE)
  
  #define UNI_REPLACEMENT 0xfffc
author	Martin Mares <mj@ucw.cz>
	Sat, 11 Oct 2003 10:19:40 +0000 (10:19 +0000)
committer	Martin Mares <mj@ucw.cz>
	Sat, 11 Oct 2003 10:19:40 +0000 (10:19 +0000)
charset/misc/gen-basic		patch \| blob \| history
charset/unicode.h		patch \| blob \| history