X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=charset%2Funicode.h;h=7bc6f235812ef3c5529dac1de956e768924eea7f;hb=2b797fef8d8133e5b1cb40706aabc92718da40bd;hp=a01eb97c23d22b004cca1a60a9b17b7c103c480e;hpb=679480caf343ae93611a854cf899c258acfaa02f;p=libucw.git diff --git a/charset/unicode.h b/charset/unicode.h index a01eb97c..7bc6f235 100644 --- a/charset/unicode.h +++ b/charset/unicode.h @@ -1,32 +1,19 @@ /* * The UniCode Library * - * (c) 1997 Martin Mares, + * (c) 1997--2003 Martin Mares + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. */ #ifndef _UNICODE_H #define _UNICODE_H -#include "lib/config.h" - -extern byte *_U_cat[], *_U_sig[]; -extern word *_U_upper[], *_U_lower[], *_U_unaccent[]; - -#define _C_UPPER 1 /* Upper-case letters */ -#define _C_LOWER 2 /* Lower-case letters */ -#define _C_PRINT 4 /* Printable */ -#define _C_DIGIT 8 /* Digits */ -#define _C_CTRL 16 /* Control characters */ -#define _C_XDIGIT 32 /* Hexadecimal digits */ -#define _C_BLANK 64 /* Blanks */ -#define _C_INNER 128 /* `inner punctuation' -- underscore etc. */ +extern const byte *_U_cat[]; +extern const word *_U_upper[], *_U_lower[], *_U_unaccent[]; -#define _C_ALPHA (_C_UPPER | _C_LOWER) -#define _C_ALNUM (_C_ALPHA | _C_DIGIT) -#define _C_WORD (_C_ALNUM | _C_INNER) -#define _C_WSTART (_C_ALPHA | _C_INNER) - -static inline uns Ucategory(word x) +static inline uns Ucategory(uns x) { if (_U_cat[x >> 8U]) return _U_cat[x >> 8U][x & 0xff]; @@ -34,45 +21,51 @@ static inline uns Ucategory(word x) return 0; } -static inline word Utoupper(word x) +static inline uns Utoupper(uns x) { word w = (_U_upper[x >> 8U]) ? _U_upper[x >> 8U][x & 0xff] : 0; return w ? w : x; } -static inline word Utolower(word x) +static inline uns Utolower(uns x) { word w = (_U_lower[x >> 8U]) ? _U_lower[x >> 8U][x & 0xff] : 0; return w ? w : x; } -static inline word Uunaccent(word x) +static inline uns Uunaccent(uns x) { word w = (_U_unaccent[x >> 8U]) ? _U_unaccent[x >> 8U][x & 0xff] : 0; return w ? w : x; } -static inline byte Usig(word x) -{ - if (_U_sig[x >> 8U]) - return _U_sig[x >> 8U][x & 0xff] ? : 0xff; - else - return 0xff; -} +extern const word *Uexpand_lig(uns x); + +enum unicode_char_type { + _U_LETTER = 1, /* Letters */ + _U_UPPER = 2, /* Upper-case letters */ + _U_LOWER = 4, /* Lower-case letters */ + _U_CTRL = 8, /* Control characters */ + _U_DIGIT = 16, /* Digits */ + _U_XDIGIT = 32, /* Hexadecimal digits */ + _U_SPACE = 64, /* White spaces (spaces, tabs, newlines) */ + _U_LIGATURE = 128, /* Compatibility ligature (to be expanded) */ +}; + +#define _U_LUPPER (_U_LETTER | _U_UPPER) +#define _U_LLOWER (_U_LETTER | _U_LOWER) #define UCat(x,y) (Ucategory(x) & (y)) -#define Uupper(x) UCat(x, _C_UPPER) -#define Ulower(x) UCat(x, _C_LOWER) -#define Ualpha(x) UCat(x, _C_ALPHA) -#define Ualnum(x) UCat(x, _C_ALNUM) +#define Ualpha(x) UCat(x, _U_LETTER) +#define Uupper(x) UCat(x, _U_UPPER) +#define Ulower(x) UCat(x, _U_LOWER) +#define Udigit(x) UCat(x, _U_DIGIT) +#define Uxdigit(x) UCat(x, (_U_DIGIT | _U_XDIGIT)) +#define Ualnum(x) UCat(x, (_U_LETTER | _U_DIGIT)) +#define Uctrl(x) UCat(x, _U_CTRL) #define Uprint(x) !Uctrl(x) -#define Udigit(x) UCat(x, _C_DIGIT) -#define Uxdigit(x) UCat(x, _C_XDIGIT) -#define Uword(x) UCat(x, _C_WORD) -#define Ublank(x) UCat(x, _C_BLANK) -#define Uctrl(x) UCat(x, _C_CTRL) -#define Uspace(x) Ublank(x) +#define Uspace(x) UCat(x, _U_SPACE) #define UNI_REPLACEMENT 0xfffc @@ -124,11 +117,22 @@ static inline byte Usig(word x) else \ u = *p++ +#define UTF8_SKIP(p) do { \ + uns c = *p++; \ + if (c >= 0xc0) \ + while (c & 0x40 && *p >= 0x80 && *p < 0xc0) \ + p++, c <<= 1; \ + } while (0) + +#define UTF8_SKIP_BWD(p) while ((--*(p) & 0xc0) == 0x80) + #define UTF8_SPACE(u) ((u) < 0x80 ? 1 : (u) < 0x800 ? 2 : 3) uns ucs2_to_utf8(byte *, word *); uns utf8_to_ucs2(word *, byte *); byte *static_ucs2_to_utf8(word *); uns Ustrlen(word *); +uns utf8_strlen(byte *str); +uns utf8_strnlen(byte *str, uns n); #endif