X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=lib%2Findex.h;h=696a92191c3996011a5cf3774e47cdd9b8ad2ec7;hb=8299c27047b1a992e2b38463421ec4a4a69f8cec;hp=c102be10979c4136a4569e04b060c7cb4d6513f2;hpb=58403754674e781bb733748a6b2b720b40ce797c;p=libucw.git

diff --git a/lib/index.h b/lib/index.h
index c102be10..696a9219 100644
--- a/lib/index.h
+++ b/lib/index.h
@@ -1,92 +1,39 @@
 /*
- *	Sherlock Gatherer: Data structures used in indices
+ *	Sherlock: Data structures used in indices
  *
- *	(c) 2001 Martin Mares <mj@ucw.cz>
+ *	(c) 2001--2002 Martin Mares <mj@ucw.cz>
  */
 
-/* Words */
+#ifndef _SHERLOCK_INDEX_H
+#define _SHERLOCK_INDEX_H
 
-#define MAX_WORD_LEN		64
+#include "lib/fastbuf.h"
+#include "charset/unistream.h"
 
-/* Word types */
-
-enum word_type {
-  WT_RESERVED,				/* Reserved word type */
-  WT_TEXT,				/* Ordinary text */
-  WT_EMPH,				/* Emphasized text */
-  WT_SMALL,				/* Small font */
-  WT_TITLE,				/* Document title */
-  WT_SMALL_HEADING,			/* Heading */
-  WT_BIG_HEADING,			/* Larger heading */
-  WT_KEYWORD,				/* Explicitly marked keyword */
-  WT_META,				/* Various meta-information */
-  WT_ALT,				/* Alternate texts for graphical elements */
-  WT_URL1,				/* Word extracted from document URL (low and high weight) */
-  WT_URL2,
-  WT_MAX
-};
+/* Words */
 
-/* Descriptive names used for user output */
-#define WORD_TYPE_USER_NAMES							\
-   "reserved", "text", "emph", "small", "title", "hdr1", "hdr2", "keywd",	\
-   "meta", "alt", "urlword1", "urlword2", "type12", "type13", "type14", "type15"
-
-/* Keywords for word type names */
-#define WORD_TYPE_NAMES	       			\
-	T(WORD, ~0)				\
-	T(TEXT, 1 << WT_TEXT)			\
-	T(EMPH, 1 << WT_EMPH)			\
-	T(SMALL, 1 << WT_SMALL)			\
-	T(TITLE, 1 << WT_TITLE)			\
-	T(HDR, (1 << WT_SMALL_HEADING) | (1 << WT_BIG_HEADING))  \
-	T(HDR1, 1 << WT_SMALL_HEADING)		\
-	T(HDR2, 1 << WT_BIG_HEADING)		\
-	T(KEYWD, 1 << WT_KEYWORD)		\
-	T(META, 1 << WT_META)			\
-	T(ALT, 1 << WT_ALT)			\
-	T(URLWORD, (1 << WT_URL1) | (1 << WT_URL2))
-
-/* These types are not shown in document contexts */
-#define WORD_TYPES_HIDDEN ((1 << WT_URL1) | (1 << WT_URL2))
-
-/* These types are always matched without accents if accent mode is set to "auto" */
-#define WORD_TYPES_NO_AUTO_ACCENT ((1 << WT_URL1) | (1 << WT_URL2))
-
-/* String types */
-
-enum string_type {
-  ST_RESERVED,				/* Reserved string type */
-  ST_URL,				/* URL of the document */
-  ST_HOST,				/* Host name */
-  ST_DOMAIN,				/* Domain name */
-  ST_REF,				/* URL reference */
-  ST_BACKREF,				/* Back-reference (frame or redirect source) */
-  ST_MAX
-};
+#define MAX_WORD_LEN		64
 
-#define STRING_TYPE_USER_NAMES							\
-   "URL", "host", "domain", "ref", "backref", "type5", "type6", "type7",	\
-   "type8", "type9", "type10", "type11", "type12", "type13", "type14", "type15"
+/* Word and string types are defined in lib/custom.h */
 
-#define STRING_TYPE_NAMES			\
-	T(URL, 1 << ST_URL)			\
-	T(HOST, 1 << ST_HOST)			\
-	T(DOMAIN, 1 << ST_DOMAIN)		\
-	T(REF, 1 << ST_REF)			\
-	T(BACKREF, 1 << ST_BACKREF)
+/* Global index parameters */
 
-#define STRING_TYPES_URL ((1 << ST_URL) | (1 << ST_REF) | (1 << ST_BACKREF))
-/* These must be indexed in lowercase form */
-#define STRING_TYPES_CASE_INSENSITIVE ((1 << ST_HOST) | (1 << ST_DOMAIN))
+struct index_params {
+  sh_time_t ref_time;			/* Reference time (for document ages etc.); presumably the reftime given to convert_age() -- confirm */
+};
 
 /* Index card attributes */
 
 struct card_attr {
   u32 card;				/* Reference to card description (either oid or filepos) */
   u32 site_id;
+#define INT_ATTR(t,i,o,k,g,p) t i;	/* Each INT_ATTR entry expands to a field of type t named i */
+  CUSTOM_ATTRS				/* Include all custom attributes (list defined outside this file) */
+#undef INT_ATTR
   byte weight;
   byte flags;
-  byte rfu[2];
+  byte age;				/* Document age in pseudo-logarithmic units wrt. reference time (see convert_age()) */
+  // byte rfu[1];			/* If no custom attributes are defined */
 };
 
 enum card_flag {
@@ -94,6 +41,7 @@ enum card_flag {
   CARD_FLAG_ACCENTED = 2,		/* Document contains accented characters [scanner] */
   CARD_FLAG_DUP = 4,			/* Removed as a duplicate [merger] */
   CARD_FLAG_MERGED = 8,			/* Destination of a merge [merger] */
+  CARD_FLAG_IMAGE = 16,			/* Is an image object [scanner] */
 };
 
 #define CARD_POS_SHIFT 5		/* Card positions are shifted this # of bytes to the right */
@@ -123,8 +71,7 @@ fp_hash(struct fingerprint *fp)
       p++;							\
       if (u >= 0xb0)						\
         {							\
-	  if (u != 0xb0)					\
-            ASSERT(0);						\
+	  ASSERT(u == 0xb0);					\
 	  u += 0x80020000;					\
         }							\
       else if (u >= 0xa0)					\
@@ -138,3 +85,58 @@ fp_hash(struct fingerprint *fp)
   else								\
     p++;							\
 } while (0)
+
+/*
+ * Read one character of tagged text from fastbuf f.  Values below 0x80
+ * (as a signed int) are returned unchanged, bytes 0x80..0xb0 are decoded
+ * as tags and returned with bit 31 set (0xb1..0xbf are rejected by ASSERT),
+ * and anything from 0xc0 up is pushed back and decoded by bget_utf8().
+ */
+static inline uns
+bget_tagged_char(struct fastbuf *f)
+{
+  uns c = bgetc(f);
+
+  if ((int)c < 0x80)			/* ASCII, or a negative bgetc() result (presumably EOF) */
+    return c;
+  if (c >= 0xc0)			/* Not a tag byte: let bget_utf8() re-read and decode it */
+    {
+      bungetc(f);
+      return bget_utf8(f);
+    }
+  if (c < 0xa0)				/* 0x80..0x9f: tag held in a single byte */
+    return c + 0x80000000;
+  if (c < 0xb0)				/* 0xa0..0xaf: tag continued in a second byte */
+    {
+      uns low = bgetc(f);
+      ASSERT(low >= 0x80 && low <= 0xbf);
+      return 0x80010000 + ((c & 0x0f) << 6) + (low & 0x3f);
+    }
+  ASSERT(c == 0xb0);			/* 0xb0 is the only value accepted in 0xb0..0xbf */
+  return c + 0x80020000;
+}
+
+/* Convert document age (reftime - lastmod, in seconds) to internal units:
+ * 0..254 at decreasing resolution, 255 when older, -1 if lastmod > reftime. */
+static inline int
+convert_age(sh_time_t lastmod, sh_time_t reftime)
+{
+  sh_time_t t;
+  if (lastmod > reftime)		/* modified after the reference time */
+    return -1;
+  t = (reftime - lastmod) / 3600;	/* seconds -> whole hours */
+  if (t < 48)				/* first 48 hours: 1 hour per unit (0..47) */
+    return t;
+  t = (t - 48) / 24;			/* hours past the first 48 -> whole days */
+  if (t < 64)				/* next 64 days: 1 day per unit (48..111) */
+    return t + 48;
+  t = (t - 64) / 7;			/* days past those 64 -> whole weeks */
+  if (t < 135)				/* next 135 weeks: 1 week per unit (112..246) */
+    return t + 112;
+  t = (t - 135) / 52;			/* weeks past those 135 -> 52-week years */
+  if (t < 8)				/* next 8 years: 1 year per unit (247..254) */
+    return t + 247;
+  return 255;				/* anything older saturates at 255 */
+}
+
+#endif