X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=lib%2Findex.h;h=d484ee8ee8e177b3d008ff47cad5f56ae5f7f1e4;hb=995458e48d482bd5fa349882fba5cb4d6350076d;hp=017cf7b2d58771e540c106efca38d879a631ba8b;hpb=d54d0a248b4d24092ba2d2ed1824406d14367053;p=libucw.git

diff --git a/lib/index.h b/lib/index.h
index 017cf7b2..d484ee8e 100644
--- a/lib/index.h
+++ b/lib/index.h
@@ -25,6 +25,40 @@ enum word_type {
 	WT_ALT				/* Alternate texts for graphical elements */
 };
 
+#define WORD_TYPE_NAMES				\
+	T(WORD, ~0)				\
+	T(TEXT, 1 << WT_TEXT)			\
+	T(EMPH, 1 << WT_EMPH)			\
+	T(SMALL, 1 << WT_SMALL)			\
+	T(TITLE, 1 << WT_TITLE)			\
+	T(HDR, (1 << WT_SMALL_HEADING) | (1 << WT_BIG_HEADING))	\
+	T(HDR1, 1 << WT_SMALL_HEADING)		\
+	T(HDR2, 1 << WT_BIG_HEADING)		\
+	T(KEYWD, 1 << WT_KEYWORD)		\
+	T(META, 1 << WT_META)			\
+	T(ALT, 1 << WT_ALT)
+
+/* String types */
+
+enum string_type {
+	ST_RESERVED,			/* Reserved string type */
+	ST_URL,				/* URL of the document */
+	ST_HOST,			/* Host name */
+	ST_DOMAIN,			/* Domain name */
+	ST_REF,				/* URL reference */
+	ST_BACKREF,			/* Back-reference (frame or redirect source) */
+};
+
+#define STRING_TYPE_NAMES			\
+	T(URL, 1 << ST_URL)			\
+	T(HOST, 1 << ST_HOST)			\
+	T(DOMAIN, 1 << ST_DOMAIN)		\
+	T(REF, 1 << ST_REF)			\
+	T(BACKREF, 1 << ST_BACKREF)
+
+#define STRING_TYPES_URL ((1 << ST_URL) | (1 << ST_REF) | (1 << ST_BACKREF))
+#define STRING_TYPES_CASE_INSENSITIVE ((1 << ST_HOST) | (1 << ST_DOMAIN))
+
 /* Index card attributes */
 
 struct card_attr {
@@ -42,6 +76,8 @@ enum card_flag {
 	CARD_FLAG_MERGED = 8,		/* Destination of a merge [merger] */
 };
 
+#define CARD_POS_SHIFT 5		/* Card positions are shifted this # of bits to the right */
+
 /* String fingerprints */
 
 struct fingerprint {
@@ -49,3 +85,55 @@ struct fingerprint {
 };
 
 void fingerprint(byte *string, struct fingerprint *fp);
+
+/*
+ * Fold the first four bytes of a fingerprint into a 32-bit hash value,
+ * hash[0] being the most significant byte.
+ *
+ * Each element is widened to u32 before shifting: if the elements are
+ * narrower than int, they would otherwise be promoted to signed int and
+ * shifting a value >= 0x80 left by 24 bits would overflow it.
+ */
+static inline u32
+fp_hash(struct fingerprint *fp)
+{
+	return ((u32) fp->hash[0] << 24) |
+	       ((u32) fp->hash[1] << 16) |
+	       ((u32) fp->hash[2] << 8) |
+	       (u32) fp->hash[3];
+}
+
+/*
+ * Reading of tagged text (Unicode values, tags mapped to 0x80000000 and higher)
+ *
+ * GET_TAGGED_CHAR(p,u) decodes one item at p into u and advances p past it:
+ *
+ *	first byte	bytes read	value stored in u
+ *	0x00-0x7f	1		the byte itself
+ *	0x80-0x9f	1		0x80000000 + byte
+ *	0xa0-0xaf	2		0x80010000 + ((byte & 0x0f) << 6) + (next byte & 0x3f)
+ *	0xb0-0xbf	1		0x80020000 + byte
+ *	0xc0-0xff	(GET_UTF8)	left to GET_UTF8(p,u), which is defined elsewhere
+ *
+ * Both arguments must be modifiable lvalues without side effects: each is
+ * evaluated several times.
+ * NOTE(review): u presumably has to be an unsigned type of at least 32 bits
+ * to hold the tag values -- confirm against callers.
+ */
+
+#define GET_TAGGED_CHAR(p,u) do {				\
+	(u) = *(p);						\
+	if ((u) >= 0xc0)					\
+		GET_UTF8(p,u);					\
+	else if ((u) >= 0x80)					\
+	{							\
+		(p)++;						\
+		if ((u) >= 0xb0)				\
+			(u) += 0x80020000;			\
+		else if ((u) >= 0xa0)				\
+			(u) = 0x80010000 + (((u) & 0x0f) << 6) + (*(p)++ & 0x3f);	\
+		else						\
+			(u) += 0x80000000;			\
+	}							\
+	else							\
+		(p)++;						\
+} while (0)