X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=lib%2Findex.h;h=70a8186d7e2eaf6078811ba45cd4ff4264acba34;hb=91373ee6489fbe08db12bef2cf6df704b31e1378;hp=33e7ef733cfea07af3bf8afe975ca39d50fddf23;hpb=fc7ef7712e802fe3deb66c89d7dff0315d1d7ce2;p=libucw.git

diff --git a/lib/index.h b/lib/index.h
index 33e7ef73..70a8186d 100644
--- a/lib/index.h
+++ b/lib/index.h
@@ -4,6 +4,13 @@
  * (c) 2001 Martin Mares
  */
 
+/*
+ * Clamp x (converted to int) to the range [min,max]. This is a GNU statement
+ * expression: x is evaluated exactly once, but min and max may be evaluated
+ * more than once, so they must be free of side effects.
+ */
+#define CLAMP(x,min,max) ({ int _t=(x); (_t < (min)) ? (min) : (_t > (max)) ? (max) : _t; })
+
 /* Words */
 
 #define MAX_WORD_LEN 64
@@ -11,6 +18,7 @@
 /* Word types */
 
 enum word_type {
+  WT_RESERVED,                  /* Reserved word type */
   WT_TEXT,                      /* Ordinary text */
   WT_EMPH,                      /* Emphasized text */
   WT_SMALL,                     /* Small font */
@@ -21,3 +29,73 @@ enum word_type {
   WT_META,                      /* Various meta-information */
   WT_ALT                        /* Alternate texts for graphical elements */
 };
+
+/* String types */
+
+enum string_type {
+  ST_RESERVED,                  /* Reserved string type */
+  ST_URL,                       /* URL of the document */
+  ST_HOST,                      /* Host name */
+  ST_DOMAIN,                    /* Domain name */
+  ST_REF,                       /* URL reference */
+  ST_BACKREF,                   /* Back-reference (frame or redirect source) */
+};
+
+/* Index card attributes */
+
+struct card_attr {
+  u32 card;                     /* Reference to card description (either oid or filepos) */
+  u32 site_id;
+  byte weight;
+  byte flags;                   /* presumably a mask of CARD_FLAG_xxx bits -- TODO confirm */
+  byte rfu[2];
+};
+
+enum card_flag {
+  CARD_FLAG_EMPTY = 1,          /* Empty document (redirect, robot file etc.) [scanner] */
+  CARD_FLAG_ACCENTED = 2,       /* Document contains accented characters [scanner] */
+  CARD_FLAG_DUP = 4,            /* Removed as a duplicate [merger] */
+  CARD_FLAG_MERGED = 8,         /* Destination of a merge [merger] */
+};
+
+#define CARD_POS_SHIFT 5        /* Card positions are shifted this # of bytes to the right */
+
+/* String fingerprints */
+
+struct fingerprint {
+  byte hash[12];
+};
+
+void fingerprint(byte *string, struct fingerprint *fp);
+
+/*
+ * Reading of tagged text (Unicode values, tags mapped to 0x80000000 and higher)
+ *
+ * GET_TAGGED_CHAR(p,u) loads *p into u, advances p and maps the value:
+ *   below 0x80       u is left as read
+ *   0x80 to 0x9f     u += 0x80000000
+ *   0xa0 to 0xaf     u = 0x80010000 + (low 4 bits of u << 6) + (low 6 bits of
+ *                    the following element, which is consumed as well)
+ *   0xb0 to 0xbf     u += 0x80020000
+ *   0xc0 and above   handed over to GET_UTF8(p,u) without advancing p here
+ *
+ * p and u must be lvalues; both are evaluated more than once.
+ */
+
+#define GET_TAGGED_CHAR(p,u) do { \
+  (u) = *(p); \
+  if ((u) >= 0xc0) \
+    GET_UTF8(p,u); \
+  else if ((u) >= 0x80) \
+    { \
+      (p)++; \
+      if ((u) >= 0xb0) \
+        (u) += 0x80020000; \
+      else if ((u) >= 0xa0) \
+        (u) = 0x80010000 + (((u) & 0x0f) << 6) + (*(p)++ & 0x3f); \
+      else \
+        (u) += 0x80000000; \
+    } \
+  else \
+    (p)++; \
+} while (0)