mj.ucw.cz Git - libucw.git/blobdiff - lib/index.h
As usual, stuff in lib/* is LGPL'ed.
[libucw.git] / lib / index.h
index ebd28f3aa29f27cbab22bcd74e753a8d987dbe4d..8bfbe88d4d6862f58118de370d52f41d3d483553 100644 (file)
@@ -1,71 +1,41 @@
 /*
 /*
- *     Sherlock Gatherer: Data structures used in indices
+ *     Sherlock: Data structures used in indices
  *
  *
- *     (c) 2001 Martin Mares <mj@ucw.cz>
+ *     (c) 2001--2002 Martin Mares <mj@ucw.cz>
  */
 
  */
 
+#ifndef _SHERLOCK_INDEX_H
+#define _SHERLOCK_INDEX_H
+
+#include "lib/fastbuf.h"
+#include SHERLOCK_CUSTOM
+#include "charset/unistream.h"
+
 /* Words */
 
 #define MAX_WORD_LEN           64
 /* Words */
 
 #define MAX_WORD_LEN           64
+#define MAX_COMPLEX_LEN                10
 
 
-/* Word types */
-
-enum word_type {
-  WT_RESERVED,                         /* Reserved word type */
-  WT_TEXT,                             /* Ordinary text */
-  WT_EMPH,                             /* Emphasized text */
-  WT_SMALL,                            /* Small font */
-  WT_TITLE,                            /* Document title */
-  WT_SMALL_HEADING,                    /* Heading */
-  WT_BIG_HEADING,                      /* Larger heading */
-  WT_KEYWORD,                          /* Explicitly marked keyword */
-  WT_META,                             /* Various meta-information */
-  WT_ALT                               /* Alternate texts for graphical elements */
-};
-
-#define WORD_TYPE_NAMES                                \
-       T(WORD, ~0)                             \
-       T(TEXT, 1 << WT_TEXT)                   \
-       T(EMPH, 1 << WT_EMPH)                   \
-       T(SMALL, 1 << WT_SMALL)                 \
-       T(TITLE, 1 << WT_TITLE)                 \
-       T(HDR, (1 << WT_SMALL_HEADING) | (1 << WT_BIG_HEADING))  \
-       T(HDR1, 1 << WT_SMALL_HEADING)          \
-       T(HDR2, 1 << WT_BIG_HEADING)            \
-       T(KEYWD, 1 << WT_KEYWORD)               \
-       T(META, 1 << WT_META)                   \
-       T(ALT, 1 << WT_ALT)
-
-/* String types */
-
-enum string_type {
-  ST_RESERVED,                         /* Reserved string type */
-  ST_URL,                              /* URL of the document */
-  ST_HOST,                             /* Host name */
-  ST_DOMAIN,                           /* Domain name */
-  ST_REF,                              /* URL reference */
-  ST_BACKREF,                          /* Back-reference (frame or redirect source) */
-};
+/* Word and string types are defined in lib/custom.h */
 
 
-#define STRING_TYPE_NAMES                      \
-       T(URL, 1 << ST_URL)                     \
-       T(HOST, 1 << ST_HOST)                   \
-       T(DOMAIN, 1 << ST_DOMAIN)               \
-       T(REF, 1 << ST_REF)                     \
-       T(BACKREF, 1 << ST_BACKREF)
+/* Global index parameters */
 
 
-#define STRING_TYPES_URL ((1 << ST_URL) | (1 << ST_REF) | (1 << ST_BACKREF))
-/* These must be indexed in lowercase form */
-#define STRING_TYPES_CASE_INSENSITIVE ((1 << ST_HOST) | (1 << ST_DOMAIN))
+struct index_params {                  /* Parameters that apply to the index as a whole */
+  sh_time_t ref_time;                  /* Reference time (for document ages etc.); presumably the reftime given to convert_age() -- confirm */
+};
 
 /* Index card attributes */
 
 struct card_attr {
   u32 card;                            /* Reference to card description (either oid or filepos) */
 
 /* Index card attributes */
 
 struct card_attr {
   u32 card;                            /* Reference to card description (either oid or filepos) */
+#ifdef CONFIG_SITES
   u32 site_id;
   u32 site_id;
+#endif
+  CUSTOM_CARD_ATTRS                    /* Include all custom attributes */
   byte weight;
   byte flags;
   byte weight;
   byte flags;
-  byte rfu[2];
+  byte age;                            /* Document age in pseudo-logarithmic units wrt. reference time */
+  // byte rfu[1];                      /* If no custom attributes are defined */
 };
 
 enum card_flag {
 };
 
 enum card_flag {
@@ -73,6 +43,7 @@ enum card_flag {
   CARD_FLAG_ACCENTED = 2,              /* Document contains accented characters [scanner] */
   CARD_FLAG_DUP = 4,                   /* Removed as a duplicate [merger] */
   CARD_FLAG_MERGED = 8,                        /* Destination of a merge [merger] */
   CARD_FLAG_ACCENTED = 2,              /* Document contains accented characters [scanner] */
   CARD_FLAG_DUP = 4,                   /* Removed as a duplicate [merger] */
   CARD_FLAG_MERGED = 8,                        /* Destination of a merge [merger] */
+  CARD_FLAG_IMAGE = 16,                        /* Is an image object [scanner] */
 };
 
 #define CARD_POS_SHIFT 5               /* Card positions are shifted this # of bytes to the right */
 };
 
 #define CARD_POS_SHIFT 5               /* Card positions are shifted this # of bytes to the right */
@@ -96,17 +67,89 @@ fp_hash(struct fingerprint *fp)
 #define GET_TAGGED_CHAR(p,u) do {                              \
   u = *p;                                                      \
   if (u >= 0xc0)                                               \
 #define GET_TAGGED_CHAR(p,u) do {                              \
   u = *p;                                                      \
   if (u >= 0xc0)                                               \
-    GET_UTF8(p,u);                                             \
+    GET_UTF8_CHAR(p,u);                                                \
   else if (u >= 0x80)                                          \
     {                                                          \
       p++;                                                     \
       if (u >= 0xb0)                                           \
   else if (u >= 0x80)                                          \
     {                                                          \
       p++;                                                     \
       if (u >= 0xb0)                                           \
-       u += 0x80020000;                                        \
+        {                                                      \
+         ASSERT(u == 0xb0);                                    \
+         u += 0x80020000;                                      \
+        }                                                      \
       else if (u >= 0xa0)                                      \
       else if (u >= 0xa0)                                      \
-       u = 0x80010000 + ((u & 0x0f) << 6) + (*p++ & 0x3f);     \
+        {                                                      \
+         ASSERT(*p >= 0x80 && *p <= 0xbf);                     \
+         u = 0x80010000 + ((u & 0x0f) << 6) + (*p++ & 0x3f);   \
+        }                                                      \
       else                                                     \
        u += 0x80000000;                                        \
     }                                                          \
   else                                                         \
     p++;                                                       \
 } while (0)
       else                                                     \
        u += 0x80000000;                                        \
     }                                                          \
   else                                                         \
     p++;                                                       \
 } while (0)
+/*
+ * SKIP_TAGGED_CHAR(p): advance the byte pointer p past one tagged character
+ * without decoding it.
+ *
+ *  - A lead byte in 0x80..0xbf is consumed; if it lies in 0xa0..0xaf and the
+ *    byte after it is in 0x80..0xbf, that second byte is consumed as well.
+ *  - Any other lead byte is left to UTF8_SKIP(p), which is defined elsewhere.
+ *
+ * Unlike GET_TAGGED_CHAR, nothing is asserted here: a 0xa0..0xaf byte that is
+ * not followed by a 0x80..0xbf byte is skipped on its own.
+ *
+ * p is evaluated several times and modified, so it must be a plain lvalue.
+ * The expansion declares a local named `u', so p must not be (or mention)
+ * a variable called u.
+ * NOTE(review): the range tests assume *p yields an unsigned byte value --
+ * confirm the pointer type used by callers.
+ */
+#define SKIP_TAGGED_CHAR(p) do {                               \
+  if (*p >= 0x80 && *p < 0xc0)                                 \
+    {                                                          \
+      uns u = *p++;                                            \
+      if (u >= 0xa0 && u < 0xb0 && *p >= 0x80 && *p < 0xc0)    \
+       p++;                                                    \
+    }                                                          \
+  else                                                         \
+    UTF8_SKIP(p);                                              \
+} while (0)
+/*
+ * bget_tagged_char(f): read one tagged character from fastbuf f and return
+ * its decoded value. The first byte selects the form:
+ *
+ *   below 0x80    returned unchanged; a negative bgetc() result takes the
+ *                 same path and is returned as is (presumably end of input
+ *                 -- confirm against bgetc())
+ *   0x80..0x9f    0x80000000 + byte
+ *   0xa0..0xaf    a second byte v is read;
+ *                 0x80010000 + ((byte & 0x0f) << 6) + (v & 0x3f)
+ *   0xb0          0x80020000 + byte (0xb1..0xbf trip the ASSERT)
+ *   0xc0 and up   the byte is pushed back and bget_utf8(f) decodes it
+ *
+ * The ASSERTs check the input only where ASSERT is compiled in.
+ */
+static inline uns
+bget_tagged_char(struct fastbuf *f)
+{
+  uns u = bgetc(f);
+  if ((int)u < 0x80)                   /* Signed compare: also catches a negative bgetc() result */
+    ;
+  else if (u < 0xc0)                   /* 0x80..0xbf: tag bytes, decoded here */
+    {
+      if (u >= 0xb0)
+       {
+         ASSERT(u == 0xb0);            /* Only 0xb0 is expected in the 0xb0..0xbf range */
+         u += 0x80020000;
+       }
+      else if (u >= 0xa0)              /* Two-byte form: 4 bits from the first byte, 6 from the second */
+       {
+         uns v = bgetc(f);
+         ASSERT(v >= 0x80 && v <= 0xbf);
+         u = 0x80010000 + ((u & 0x0f) << 6) + (v & 0x3f);
+       }
+      else                             /* 0x80..0x9f: single-byte form */
+       u += 0x80000000;
+    }
+  else                                 /* 0xc0 and up: let the UTF-8 reader see the lead byte again */
+    {
+      bungetc(f);
+      u = bget_utf8(f);
+    }
+  return u;
+}
+
+/*
+ * Conversion of document age from seconds to our internal units.
+ *
+ * The age is reftime - lastmod. It is mapped to a code whose resolution
+ * gets coarser as the document gets older:
+ *
+ *     0 ..  47   age in whole hours (first 2 days)
+ *    48 .. 111   whole days past those 2 days (next 64 days)
+ *   112 .. 246   whole weeks past that (next 135 weeks)
+ *   247 .. 254   52-week years past that (next 8 years)
+ *   255          anything older
+ *
+ * Returns -1 when lastmod is later than reftime. Every division truncates,
+ * so partial units are dropped at each step.
+ * NOTE(review): presumably the result is stored in card_attr.age, a byte,
+ * where -1 would be indistinguishable from 255 -- confirm how callers
+ * handle the -1 case.
+ */
+
+static inline int
+convert_age(sh_time_t lastmod, sh_time_t reftime)
+{
+  sh_time_t age;
+  if (reftime < lastmod)               /* lastmod lies after reftime: no valid age */
+    return -1;
+  age = (reftime - lastmod) / 3600;    /* seconds -> whole hours */
+  if (age < 48)                                /* last 2 days: 1 hour resolution */
+    return age;
+  age = (age-48) / 24;                 /* hours past the first 2 days -> whole days */
+  if (age < 64)                                /* next 64 days: 1 day resolution */
+    return 48 + age;
+  age = (age-64) / 7;                  /* days past that -> whole weeks */
+  if (age < 135)                       /* next 135 weeks: 1 week resolution */
+    return 112 + age;
+  age = (age-135) / 52;                        /* weeks past that -> 52-week years */
+  if (age < 8)                         /* next 8 years: 1 year resolution */
+    return 247 + age;
+  return 255;                          /* anything older: saturate at the maximum code */
+}
+
+#endif