X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=lib%2Findex.h;h=6579fc4d09aec556531726687a2ef1a3715272f0;hb=89556b90f1afb9b7ebe247c05721e0a04a7232c0;hp=0bd9a63583a1b26df843c02394ced78b28efbf5c;hpb=13bc5e1c30abb77c6e1120b2e0bf50cd0ab94a6a;p=libucw.git

Note: this patch was recovered from a copy whose line breaks had been collapsed.
Line structure matches the hunk headers; indentation inside lines is approximate.

diff --git a/lib/index.h b/lib/index.h
index 0bd9a635..6579fc4d 100644
--- a/lib/index.h
+++ b/lib/index.h
@@ -1,23 +1,39 @@
 /*
  * Sherlock: Data structures used in indices
  *
- * (c) 2001 Martin Mares
+ * (c) 2001--2002 Martin Mares
  */
 
+#ifndef _SHERLOCK_INDEX_H
+#define _SHERLOCK_INDEX_H
+
+#include "lib/fastbuf.h"
+#include SHERLOCK_CUSTOM
+#include "charset/unistream.h"
+
 /* Words */
 
 #define MAX_WORD_LEN 64
+#define MAX_COMPLEX_LEN 10
 
 /* Word and string types are defined in lib/custom.h */
 
+/* Global index parameters */
+
+struct index_params {
+  sh_time_t ref_time; /* Reference time (for document ages etc.) */
+};
+
 /* Index card attributes */
 
 struct card_attr {
   u32 card; /* Reference to card description (either oid or filepos) */
   u32 site_id;
+  CUSTOM_CARD_ATTRS /* Include all custom attributes */
   byte weight;
   byte flags;
-  byte rfu[2];
+  byte age; /* Document age in pseudo-logarithmic units wrt. reference time */
+  // byte rfu[1]; /* If no custom attributes are defined */
 };
 
 enum card_flag {
@@ -25,6 +41,7 @@ enum card_flag {
   CARD_FLAG_ACCENTED = 2, /* Document contains accented characters [scanner] */
   CARD_FLAG_DUP = 4, /* Removed as a duplicate [merger] */
   CARD_FLAG_MERGED = 8, /* Destination of a merge [merger] */
+  CARD_FLAG_IMAGE = 16, /* Is an image object [scanner] */
 };
 
 #define CARD_POS_SHIFT 5 /* Card positions are shifted this # of bytes to the right */
@@ -54,8 +71,7 @@ fp_hash(struct fingerprint *fp)
         p++; \
         if (u >= 0xb0) \
           { \
-            if (u != 0xb0) \
-              ASSERT(0); \
+            ASSERT(u == 0xb0); \
             u += 0x80020000; \
           } \
         else if (u >= 0xa0) \
@@ -69,3 +85,58 @@ fp_hash(struct fingerprint *fp)
     else \
       p++; \
   } while (0)
+
+static inline uns
+bget_tagged_char(struct fastbuf *f)
+{
+  uns u = bgetc(f);
+  if ((int)u < 0x80)
+    ;
+  else if (u < 0xc0)
+    {
+      if (u >= 0xb0)
+        {
+          ASSERT(u == 0xb0);
+          u += 0x80020000;
+        }
+      else if (u >= 0xa0)
+        {
+          uns v = bgetc(f);
+          ASSERT(v >= 0x80 && v <= 0xbf);
+          u = 0x80010000 + ((u & 0x0f) << 6) + (v & 0x3f);
+        }
+      else
+        u += 0x80000000;
+    }
+  else
+    {
+      bungetc(f);
+      u = bget_utf8(f);
+    }
+  return u;
+}
+
+/* Conversion of document age from seconds to our internal units */
+
+static inline int
+convert_age(sh_time_t lastmod, sh_time_t reftime)
+{
+  sh_time_t age;
+  if (reftime < lastmod) /* past times */
+    return -1;
+  age = (reftime - lastmod) / 3600;
+  if (age < 48) /* last 2 days: 1 hour resolution */
+    return age;
+  age = (age-48) / 24;
+  if (age < 64) /* next 64 days: 1 day resolution */
+    return 48 + age;
+  age = (age-64) / 7;
+  if (age < 135) /* next 135 weeks: 1 week resolution */
+    return 112 + age;
+  age = (age-135) / 52;
+  if (age < 8) /* next 8 years: 1 year resolution */
+    return 247 + age;
+  return 255; /* then just "infinite future" */
+}
+
+#endif