lib/index.h

   1 /*
   2  *      Sherlock: Data structures used in indices
   3  *
   4  *      (c) 2001--2003 Martin Mares <mj@ucw.cz>
   5  */
   6
   7 #ifndef _SHERLOCK_INDEX_H
   8 #define _SHERLOCK_INDEX_H
   9
  10 #include "lib/fastbuf.h"
  11 #include SHERLOCK_CUSTOM
  12 #include "charset/unistream.h"
  13
  14 /*
  15  *  Words and word complexes
  16  *
  17  *  MAX_WORD_LEN is the maximum length (measured in UTF-8 characters, excluding
  18  *  the terminating zero byte if there's any) of any word which may appear in the
  19  *  indices or in the bucket file. Naturally, the same constant also bounds
  20  *  the number of UCS-2 characters in a word.
  21  *
  22  *  Caveat: If you are upcasing/downcasing the word, the UTF-8 encoding can
  23  *  expand, although at most twice, so you need to reserve 2*MAX_WORD_LEN bytes.
  24  *
  25  *  MAX_COMPLEX_LEN is the upper bound on number of words in any word complex.
  26  */
  27
  28 #define MAX_WORD_LEN            64      /* a multiple of 4 */
  29 #define MAX_COMPLEX_LEN         10
  30
  31 /* Word and string types are defined in lib/custom.h */
  32
  33 /* Global index parameters */
  34
  35 struct index_params {
  36   sh_time_t ref_time;                   /* Reference time (for document ages etc.) */
  37 };
  38
  39 /* Index card attributes */
  40
  41 struct card_attr {
  42   u32 card;                             /* Reference to card description (either oid or filepos) */
  43 #ifdef CONFIG_SITES
  44   u32 site_id;
  45 #endif
  46   CUSTOM_CARD_ATTRS                     /* Include all custom attributes */
  47   byte weight;
  48   byte flags;
  49 #ifdef CONFIG_LASTMOD
  50   byte age;                             /* Document age in pseudo-logarithmic units wrt. reference time */
  51 #endif
  52 #ifdef CONFIG_FILETYPE
  53   byte type_flags;                      /* File type flags (see below) */
  54 #endif
  55 };
  56
  57 enum card_flag {
  58   CARD_FLAG_EMPTY = 1,                  /* Empty document (redirect, robot file etc.) [scanner] */
  59   CARD_FLAG_ACCENTED = 2,               /* Document contains accented characters [scanner] */
  60   CARD_FLAG_DUP = 4,                    /* Removed as a duplicate [merger] */
  61   CARD_FLAG_MERGED = 8,                 /* Destination of a merge [merger] */
  62   CARD_FLAG_IMAGE = 16,                 /* Is an image object [scanner] */
  63   CARD_FLAG_FRAMESET = 32,              /* Contains a frameset to be ignored [scanner] */
  64 };
  65
  66 #define CARD_POS_SHIFT 5                /* Card positions are shifted this # of bits to the right */
  67
  68 /*
  69  *  We store document type and several other properties in card_attr->type_flags.
  70  *  Here we define only the basic structure, the details and also how to match the
  71  *  types are defined in custom.h.
  72  *
  73  *  bits 7--5   file type: (0-3: text types, 4-7: other types, defined by custom.h)
  74  *  bits 4--0   type-dependent information, for text types it's document language code
  75  */
  76
  77 #define CA_GET_FILE_TYPE(a) ((a)->type_flags >> 5)
  78 #define CA_GET_FILE_INFO(a) ((a)->type_flags & 0x1f)
  79 #define CA_GET_FILE_LANG(a) ((a)->type_flags & 0x80 ? 0 : CA_GET_FILE_INFO(a))
  80
  81 /* String fingerprints */
  82
  83 struct fingerprint {
  84   byte hash[12];
  85 };
  86
  87 void fingerprint(byte *string, struct fingerprint *fp);
  88
  89 static inline u32
  90 fp_hash(struct fingerprint *fp)
  91 {
  92   return (fp->hash[0] << 24) | (fp->hash[1] << 16) | (fp->hash[2] << 8) | fp->hash[3];
  93 }
  94
  95 /* Reading of tagged text (Unicode values, tags mapped to 0x80000000 and higher) */
  96
  97 #define GET_TAGGED_CHAR(p,u) do {                               \
  98   u = *p;                                                       \
  99   if (u >= 0xc0)                                                \
 100     GET_UTF8_CHAR(p,u);                                         \
 101   else if (u >= 0x80)                                           \
 102     {                                                           \
 103       p++;                                                      \
 104       if (u >= 0xb0)                                            \
 105         {                                                       \
 106           ASSERT(u == 0xb0);                                    \
 107           u += 0x80020000;                                      \
 108         }                                                       \
 109       else if (u >= 0xa0)                                       \
 110         {                                                       \
 111           ASSERT(*p >= 0x80 && *p <= 0xbf);                     \
 112           u = 0x80010000 + ((u & 0x0f) << 6) + (*p++ & 0x3f);   \
 113         }                                                       \
 114       else                                                      \
 115         u += 0x80000000;                                        \
 116     }                                                           \
 117   else                                                          \
 118     p++;                                                        \
 119 } while (0)
 120
 121 #define SKIP_TAGGED_CHAR(p) do {                                \
 122   if (*p >= 0x80 && *p < 0xc0)                                  \
 123     {                                                           \
 124       uns u = *p++;                                             \
 125       if (u >= 0xa0 && u < 0xb0 && *p >= 0x80 && *p < 0xc0)     \
 126         p++;                                                    \
 127     }                                                           \
 128   else                                                          \
 129     UTF8_SKIP(p);                                               \
 130 } while (0)
 131
 132 static inline uns
 133 bget_tagged_char(struct fastbuf *f)
 134 {
 135   uns u = bgetc(f);
 136   if ((int)u < 0x80)
 137     ;
 138   else if (u < 0xc0)
 139     {
 140       if (u >= 0xb0)
 141         {
 142           ASSERT(u == 0xb0);
 143           u += 0x80020000;
 144         }
 145       else if (u >= 0xa0)
 146         {
 147           uns v = bgetc(f);
 148           ASSERT(v >= 0x80 && v <= 0xbf);
 149           u = 0x80010000 + ((u & 0x0f) << 6) + (v & 0x3f);
 150         }
 151       else
 152         u += 0x80000000;
 153     }
 154   else
 155     {
 156       bungetc(f);
 157       u = bget_utf8(f);
 158     }
 159   return u;
 160 }
 161
 162 /* Conversion of document age from seconds to our internal units */
 163
 164 static inline int
 165 convert_age(sh_time_t lastmod, sh_time_t reftime)
 166 {
 167   sh_time_t age;
 168   if (reftime < lastmod)                /* past times */
 169     return -1;
 170   age = (reftime - lastmod) / 3600;
 171   if (age < 48)                         /* last 2 days: 1 hour resolution */
 172     return age;
 173   age = (age-48) / 24;
 174   if (age < 64)                         /* next 64 days: 1 day resolution */
 175     return 48 + age;
 176   age = (age-64) / 7;
 177   if (age < 135)                        /* next 135 weeks: 1 week resolution */
 178     return 112 + age;
 179   age = (age-135) / 52;
 180   if (age < 8)                          /* next 8 years: 1 year resolution */
 181     return 247 + age;
 182   return 255;                           /* then just "infinite future" */
 183 }
 184
 185 #endif