From ef6639d2cf305018f9edb2f7944969c236afbfea Mon Sep 17 00:00:00 2001
From: Martin Mares
Date: Sat, 11 Oct 2003 10:19:40 +0000
Subject: [PATCH] Several improvements to the unicode library:

o  All tables are now const.
o  Redefined the categories:
     - now using _U_* instead of _C_*
     - introduced _U_LETTER modified with either _U_UPPER
       or _U_LOWER or none (titlecase letters, letter modifiers etc.)
o  Added the ligature expansions and _U_LIGATURE.
o  Minor cleanups.
---
 charset/misc/gen-basic | 21 +++++++++++-------
 charset/unicode.h      | 48 ++++++++++++++++++++++++++----------------
 2 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/charset/misc/gen-basic b/charset/misc/gen-basic
index 6c3b8fd8..bd0044b8 100755
--- a/charset/misc/gen-basic
+++ b/charset/misc/gen-basic
@@ -1,26 +1,30 @@
 #!/usr/bin/perl
 #
 # Split Unicode Data File
-# (c) 1997--2001 Martin Mares
+# (c) 1997--2003 Martin Mares
 #
 
 open(I, "unidata/UnicodeData.txt") || die "Unable to open UniCode data file";
 open(C, ">misc/u-cat") || die "cat file open";
 open(U, ">misc/u-upper") || die "upper file open";
 open(L, ">misc/u-lower") || die "lower file open";
+open(G, ">misc/u-ligatures") || die "lig file open";
 while (<I>) {
 	chomp;
 	(/^$/ || /^#/) && next;
 	($code,$name,$cat,$comb,$bidir,$decomp,$d0,$d1,$n0,$mirr,$cmt1,$cmt2,$upper,$lower,$title) = split /;/;
 	$code =~ /^....$/ || next;
-	if ($cat =~ /^C/) { $ccat = "_C_CTRL"; }
-	elsif ($cat =~ /^Z/) { $ccat = "_C_BLANK"; }
-	elsif ($cat =~ /^Ll/) { $ccat = "_C_LOWER"; }
-	elsif ($cat =~ /^Lu/) { $ccat = "_C_UPPER"; }
-	elsif ($code ge "0030" && $code le "0039") { $ccat = "_C_DIGIT|_C_XDIGIT"; }
-	elsif ($code eq "005F") { $ccat = "_C_INNER"; }
+	if ($cat =~ /^C/) { $ccat = "_U_CTRL"; }
+	elsif ($cat =~ /^Z/) { $ccat = "_U_SPACE"; }
+	elsif ($decomp =~ /<compat>/ && $name =~ / LIGATURE /) {
+		$ccat = "_U_LIGATURE";
+		print G "$code\n";
+	} elsif ($cat =~ /^Ll/) { $ccat = "_U_LLOWER"; }
+	elsif ($cat =~ /^Lu/) { $ccat = "_U_LUPPER"; }
+	elsif ($cat =~ /^L/) { $ccat = "_U_LETTER"; }
+	elsif ($code ge "0030" && $code le "0039") { $ccat = "_U_DIGIT | _U_XDIGIT"; }
 	else { $ccat = ""; }
-	if ($code ge "0041" && $code le "0046" || $code ge "0061" && $code le "0066") { $ccat = $ccat . "|_C_XDIGIT"; }
+	if ($code ge "0041" && $code le "0046" || $code ge "0061" && $code le "0066") { $ccat = $ccat . "|_U_XDIGIT"; }
 	if ($ccat ne "") { print C "$code\t$ccat\n"; }
 	if ($upper ne "") { print U "$code\t0x$upper\n"; }
 	if ($lower ne "") { print L "$code\t0x$lower\n"; }
@@ -29,3 +33,4 @@ close I;
 close C;
 close U;
 close L;
+close G;
diff --git a/charset/unicode.h b/charset/unicode.h
index 80c4ff88..3246f134 100644
--- a/charset/unicode.h
+++ b/charset/unicode.h
@@ -10,12 +10,10 @@
 #ifndef _UNICODE_H
 #define _UNICODE_H
 
-#include "lib/chartype.h"
+extern const byte *_U_cat[];
+extern const word *_U_upper[], *_U_lower[], *_U_unaccent[];
 
-extern byte *_U_cat[];
-extern word *_U_upper[], *_U_lower[], *_U_unaccent[];
-
-static inline uns Ucategory(word x)
+static inline uns Ucategory(uns x)
 {
 	if (_U_cat[x >> 8U])
 		return _U_cat[x >> 8U][x & 0xff];
@@ -23,37 +21,51 @@ static inline uns Ucategory(word x)
 		return 0;
 }
 
-static inline word Utoupper(word x)
+static inline uns Utoupper(uns x)
 {
 	word w = (_U_upper[x >> 8U]) ? _U_upper[x >> 8U][x & 0xff] : 0;
 	return w ? w : x;
 }
 
-static inline word Utolower(word x)
+static inline uns Utolower(uns x)
 {
 	word w = (_U_lower[x >> 8U]) ? _U_lower[x >> 8U][x & 0xff] : 0;
 	return w ? w : x;
 }
 
-static inline word Uunaccent(word x)
+static inline uns Uunaccent(uns x)
 {
 	word w = (_U_unaccent[x >> 8U]) ? _U_unaccent[x >> 8U][x & 0xff] : 0;
 	return w ? w : x;
 }
 
+extern const word *Uexpand_lig(uns x);
+
+enum unicode_char_type {	/* Bit flags: OR-ed together below, tested with & in UCat() */
+	_U_LETTER = 0x01,	/* Letters */
+	_U_UPPER = 0x02,	/* Upper-case letters */
+	_U_LOWER = 0x04,	/* Lower-case letters */
+	_U_CTRL = 0x08,		/* Control characters */
+	_U_DIGIT = 0x10,	/* Digits */
+	_U_XDIGIT = 0x20,	/* Hexadecimal digits */
+	_U_SPACE = 0x40,	/* White spaces (spaces, tabs, newlines) */
+	_U_LIGATURE = 0x80,	/* Compatibility ligature (to be expanded) */
+};
+
+#define _U_LUPPER (_U_LETTER | _U_UPPER)
+#define _U_LLOWER (_U_LETTER | _U_LOWER)
+
 #define UCat(x,y) (Ucategory(x) & (y))
 
-#define Uupper(x) UCat(x, _C_UPPER)
-#define Ulower(x) UCat(x, _C_LOWER)
-#define Ualpha(x) UCat(x, _C_ALPHA)
-#define Ualnum(x) UCat(x, _C_ALNUM)
+#define Ualpha(x) UCat(x, _U_LETTER)
+#define Uupper(x) UCat(x, _U_UPPER)
+#define Ulower(x) UCat(x, _U_LOWER)
+#define Udigit(x) UCat(x, _U_DIGIT)
+#define Uxdigit(x) UCat(x, (_U_DIGIT | _U_XDIGIT))
+#define Ualnum(x) UCat(x, (_U_LETTER | _U_DIGIT))
+#define Uctrl(x) UCat(x, _U_CTRL)
 #define Uprint(x) !Uctrl(x)
-#define Udigit(x) UCat(x, _C_DIGIT)
-#define Uxdigit(x) UCat(x, _C_XDIGIT)
-#define Uword(x) UCat(x, _C_WORD)
-#define Ublank(x) UCat(x, _C_BLANK)
-#define Uctrl(x) UCat(x, _C_CTRL)
-#define Uspace(x) Ublank(x)
+#define Uspace(x) UCat(x, _U_SPACE)
 
 #define UNI_REPLACEMENT 0xfffc
 
-- 
2.39.2