/*
- * Sherlock Gatherer: Data structures used in indices
+ * Sherlock: Data structures used in indices
*
- * (c) 2001 Martin Mares <mj@ucw.cz>
+ * (c) 2001--2002 Martin Mares <mj@ucw.cz>
*/
-#define CLAMP(x,min,max) ({ int _t=x; (_t < min) ? min : (_t > max) ? max : _t; })
-
/* Words */
#define MAX_WORD_LEN 64
-/* Word types */
-
-enum word_type {
- WT_RESERVED, /* Reserved word type */
- WT_TEXT, /* Ordinary text */
- WT_EMPH, /* Emphasized text */
- WT_SMALL, /* Small font */
- WT_TITLE, /* Document title */
- WT_SMALL_HEADING, /* Heading */
- WT_BIG_HEADING, /* Larger heading */
- WT_KEYWORD, /* Explicitly marked keyword */
- WT_META, /* Various meta-information */
- WT_ALT /* Alternate texts for graphical elements */
-};
-
-#define WORD_TYPE_NAMES \
- T(WORD, ~0) \
- T(TEXT, 1 << WT_TEXT) \
- T(EMPH, 1 << WT_EMPH) \
- T(SMALL, 1 << WT_SMALL) \
- T(TITLE, 1 << WT_TITLE) \
- T(HDR, (1 << WT_SMALL_HEADING) | (1 << WT_BIG_HEADING)) \
- T(HDR1, 1 << WT_SMALL_HEADING) \
- T(HDR2, 1 << WT_BIG_HEADING) \
- T(KEYWD, 1 << WT_KEYWORD) \
- T(META, 1 << WT_META) \
- T(ALT, 1 << WT_ALT)
-
-/* String types */
-
-enum string_type {
- ST_RESERVED, /* Reserved string type */
- ST_URL, /* URL of the document */
- ST_HOST, /* Host name */
- ST_DOMAIN, /* Domain name */
- ST_REF, /* URL reference */
- ST_BACKREF, /* Back-reference (frame or redirect source) */
-};
+/* Word and string types are defined in lib/custom.h */
-#define STRING_TYPE_NAMES \
- T(URL, 1 << ST_URL) \
- T(HOST, 1 << ST_HOST) \
- T(DOMAIN, 1 << ST_DOMAIN) \
- T(REF, 1 << ST_REF) \
- T(BACKREF, 1 << ST_BACKREF)
+/* Global index parameters */
-#define STRING_TYPES_URL ((1 << ST_URL) | (1 << ST_REF) | (1 << ST_BACKREF))
-#define STRING_TYPES_CASE_INSENSITIVE ((1 << ST_HOST) | (1 << ST_DOMAIN))
+struct index_params { /* Parameters that apply to the index as a whole */
+ sh_time_t ref_time; /* Reference time: the instant against which document ages etc. are measured */
+};
/* Index card attributes */
struct card_attr {
u32 card; /* Reference to card description (either oid or filepos) */
u32 site_id;
+#define INT_ATTR(t,i,o,k,g,p) t i;
+ CUSTOM_ATTRS /* Include all custom attributes: with INT_ATTR defined as above, every INT_ATTR(t,i,...) entry expands to the field declaration `t i;` (CUSTOM_ATTRS itself is defined outside this chunk) */
+#undef INT_ATTR
byte weight;
byte flags;
- byte rfu[2];
+ byte age; /* Document age in pseudo-logarithmic units wrt. reference time. NOTE(review): presumably filled from convert_age(); its -1 result would read back as 255 in a byte -- confirm how writers handle that case */
+ // byte rfu[1]; /* If no custom attributes are defined */
};
enum card_flag {
void fingerprint(byte *string, struct fingerprint *fp);
+/*
+ * Combine the first four elements of fp->hash into one 32-bit value,
+ * hash[0] ending up in the most significant byte and hash[3] in the least.
+ *
+ * Every element is widened to u32 before it is shifted. Without the cast a
+ * narrow element is promoted to (signed) int, and shifting a value of 0x80
+ * or more left by 24 bits overflows int, which is undefined behaviour.
+ * NOTE(review): struct fingerprint is declared outside this chunk; the
+ * shift pattern suggests hash[] holds bytes -- confirm.
+ */
+static inline u32
+fp_hash(struct fingerprint *fp)
+{
+  return ((u32) fp->hash[0] << 24)
+       | ((u32) fp->hash[1] << 16)
+       | ((u32) fp->hash[2] << 8)
+       |  (u32) fp->hash[3];
+}
+
/*
 * Reading of tagged text (Unicode values, tags mapped to 0x80000000 and higher).
 *
 * GET_TAGGED_CHAR(p,u) fetches the item starting at *p into u and moves p past it.
 * What happens depends on the first byte:
 *
 *   below 0x80      u is the byte itself; p advances by 1
 *   0x80 .. 0x9f    u = byte + 0x80000000; p advances by 1
 *   0xa0 .. 0xaf    two-byte form: u = 0x80010000 + (low 4 bits of the first byte, shifted
 *                   left by 6) + (low 6 bits of the second byte); p advances by 2.
 *                   The second byte is asserted to lie in 0x80 .. 0xbf.
 *   0xb0 .. 0xbf    only 0xb0 passes the assertion; u = byte + 0x80020000; p advances by 1
 *   0xc0 and above  handed over to GET_UTF8_CHAR(p,u), which is defined elsewhere
 *
 * Both arguments are expanded several times, so they must be plain lvalues
 * without side effects.
 * NOTE(review): the comparisons assume that *p yields unsigned byte values and that u
 * is an unsigned type at least 32 bits wide -- confirm at the call sites.
 */
#define GET_TAGGED_CHAR(p,u) do { \
u = *p; \
if (u >= 0xc0) \
- GET_UTF8(p,u); \
+ GET_UTF8_CHAR(p,u); \
else if (u >= 0x80) \
{ \
p++; \
if (u >= 0xb0) \
- u += 0x80020000; \
+ { \
+ if (u != 0xb0) \
+ ASSERT(0); \
+ u += 0x80020000; \
+ } \
else if (u >= 0xa0) \
- u = 0x80010000 + ((u & 0x0f) << 6) + (*p++ & 0x3f); \
+ { \
+ ASSERT(*p >= 0x80 && *p <= 0xbf); \
+ u = 0x80010000 + ((u & 0x0f) << 6) + (*p++ & 0x3f); \
+ } \
else \
u += 0x80000000; \
} \
else \
p++; \
} while (0)
+
+/* Conversion of document age from seconds to our internal units */
+
+/*
+ * Map the time elapsed between lastmod and reftime (both in seconds; the
+ * difference is divided by 3600 to get hours) onto a scale whose resolution
+ * gets coarser as the age grows:
+ *
+ *	-1		lastmod is later than reftime
+ *	0 .. 47		whole hours elapsed (the first 2 days)
+ *	48 .. 111	one step per day for the following 64 days
+ *	112 .. 246	one step per 7 days for the following 135 weeks
+ *	247 .. 254	one step per 52 weeks for the following 8 "years"
+ *	255		anything older than that
+ */
+static inline int
+convert_age(sh_time_t lastmod, sh_time_t reftime)
+{
+  sh_time_t hours, days, weeks, years;
+
+  if (lastmod > reftime)		/* modified after the reference time */
+    return -1;
+
+  hours = (reftime - lastmod) / 3600;
+  if (hours < 48)			/* hourly steps */
+    return hours;
+
+  days = (hours - 48) / 24;		/* days beyond the first two */
+  if (days < 64)			/* daily steps */
+    return 48 + days;
+
+  weeks = (days - 64) / 7;		/* weeks beyond the daily range */
+  if (weeks < 135)			/* weekly steps */
+    return 112 + weeks;
+
+  years = (weeks - 135) / 52;		/* 52-week years beyond the weekly range */
+  if (years < 8)			/* yearly steps */
+    return 247 + years;
+
+  return 255;				/* older than the scale can express */
+}