2 * Sherlock Gatherer: Data structures used in indices
4 * (c) 2001 Martin Mares <mj@ucw.cz>
/*
 * Upper bound on the length of a word.
 * NOTE(review): the unit (bytes vs. characters) and whether a terminator
 * is included are not visible in this header -- confirm against the users.
 */
9 #define MAX_WORD_LEN 64
14 WT_RESERVED, /* Reserved word type */
15 WT_TEXT, /* Ordinary text */
16 WT_EMPH, /* Emphasized text */
17 WT_SMALL, /* Small font */
18 WT_TITLE, /* Document title */
19 WT_SMALL_HEADING, /* Heading */
20 WT_BIG_HEADING, /* Larger heading */
21 WT_KEYWORD, /* Explicitly marked keyword */
22 WT_META, /* Various meta-information */
23 WT_ALT, /* Alternate texts for graphical elements */
24 WT_URL1, /* Word extracted from document URL (low and high weight) */
29 /* Descriptive names used for user output, listed in the order of the
   WT_xxx constants (starting with WT_RESERVED). The list has 16 entries;
   positions without a descriptive name carry a "typeN" placeholder, where
   N is the position in the list. */
30 #define WORD_TYPE_USER_NAMES \
31 "reserved", "text", "emph", "small", "title", "hdr1", "hdr2", "keywd", \
32 "meta", "alt", "urlword1", "urlword2", "type12", "type13", "type14", "type15"
34 /* Keywords for word type names */
35 #define WORD_TYPE_NAMES \
37 T(TEXT, 1 << WT_TEXT) \
38 T(EMPH, 1 << WT_EMPH) \
39 T(SMALL, 1 << WT_SMALL) \
40 T(TITLE, 1 << WT_TITLE) \
41 T(HDR, (1 << WT_SMALL_HEADING) | (1 << WT_BIG_HEADING)) \
42 T(HDR1, 1 << WT_SMALL_HEADING) \
43 T(HDR2, 1 << WT_BIG_HEADING) \
44 T(KEYWD, 1 << WT_KEYWORD) \
45 T(META, 1 << WT_META) \
47 T(URLWORD, (1 << WT_URL1) | (1 << WT_URL2))
49 /* These types are not shown in document contexts
   (bit mask: one bit per word type, bit number = WT_xxx value) */
50 #define WORD_TYPES_HIDDEN ((1 << WT_URL1) | (1 << WT_URL2))
52 /* These types are always matched without accents if accent mode is set to "auto"
   (bit mask: one bit per word type, bit number = WT_xxx value) */
53 #define WORD_TYPES_NO_AUTO_ACCENT ((1 << WT_URL1) | (1 << WT_URL2))
58 ST_RESERVED, /* Reserved string type */
59 ST_URL, /* URL of the document */
60 ST_HOST, /* Host name */
61 ST_DOMAIN, /* Domain name */
62 ST_REF, /* URL reference */
63 ST_BACKREF, /* Back-reference (frame or redirect source) */
/*
 * Descriptive names of string types used for user output, listed in the
 * order of the ST_xxx constants so that the list can be indexed by string
 * type. Slot 0 belongs to ST_RESERVED, hence the leading "reserved" entry
 * (the same layout WORD_TYPE_USER_NAMES uses for WT_RESERVED); without it
 * every name would be attributed to the preceding type. Type numbers with
 * no descriptive name get a "typeN" placeholder. 16 entries in total.
 */
#define STRING_TYPE_USER_NAMES \
  "reserved", "URL", "host", "domain", "ref", "backref", "type6", "type7", \
  "type8", "type9", "type10", "type11", "type12", "type13", "type14", "type15"
71 #define STRING_TYPE_NAMES \
73 T(HOST, 1 << ST_HOST) \
74 T(DOMAIN, 1 << ST_DOMAIN) \
76 T(BACKREF, 1 << ST_BACKREF)
/* Bit mask (one bit per string type, bit number = ST_xxx value) selecting
   the document URL, URL reference and back-reference string types */
78 #define STRING_TYPES_URL ((1 << ST_URL) | (1 << ST_REF) | (1 << ST_BACKREF))
79 /* These must be indexed in lowercase form
   (bit mask selecting the host name and domain name string types) */
80 #define STRING_TYPES_CASE_INSENSITIVE ((1 << ST_HOST) | (1 << ST_DOMAIN))
82 /* Index card attributes */
85 u32 card; /* Reference to card description (either oid or filepos) */
93 CARD_FLAG_EMPTY = 1, /* Empty document (redirect, robot file etc.) [scanner] */
94 CARD_FLAG_ACCENTED = 2, /* Document contains accented characters [scanner] */
95 CARD_FLAG_DUP = 4, /* Removed as a duplicate [merger] */
96 CARD_FLAG_MERGED = 8, /* Destination of a merge [merger] */
99 #define CARD_POS_SHIFT 5 /* Card positions are shifted to the right by this amount. NOTE(review): this comment used to say "# of bytes", but a shift count is presumably in bits (positions then being in units of 1 << CARD_POS_SHIFT bytes) -- confirm against the code using it. */
101 /* String fingerprints */
/* NOTE(review): presumably computes the fingerprint of `string` and stores
   it in *fp; the definition is not part of this header -- confirm there,
   including whether `string` must be NUL-terminated. */
107 void fingerprint(byte *string, struct fingerprint *fp);
110 fp_hash(struct fingerprint *fp)
112 return (fp->hash[0] << 24) | (fp->hash[1] << 16) | (fp->hash[2] << 8) | fp->hash[3];
115 /* Reading of tagged text (Unicode values, tags mapped to 0x80000000 and higher) */
117 #define GET_TAGGED_CHAR(p,u) do { \
121 else if (u >= 0x80) \
130 else if (u >= 0xa0) \
132 ASSERT(*p >= 0x80 && *p <= 0xbf); \
133 u = 0x80010000 + ((u & 0x0f) << 6) + (*p++ & 0x3f); \