lib/index.h

   1 /*
   2  *      Sherlock: Data structures used in indices
   3  *
   4  *      (c) 2001--2003 Martin Mares <mj@ucw.cz>
   5  */
   6
   7 #ifndef _SHERLOCK_INDEX_H
   8 #define _SHERLOCK_INDEX_H
   9
  10 #include "lib/fastbuf.h"
  11 #include SHERLOCK_CUSTOM
  12 #include "charset/unistream.h"
  13
/*
 *  Version stamp of the index format. sizeof(struct card_attr) is added to
 *  the base constant, so the value also changes whenever the configured size
 *  of struct card_attr changes. Because of the sizeof, the expression has
 *  type size_t.
 */
#define INDEX_VERSION (0x32240100+sizeof(struct card_attr))     /* Increase with each incompatible change in index format */
  15
  16 /*
  17  *  Words and word complexes
  18  *
  19  *  MAX_WORD_LEN is the maximum length (measured in UTF-8 characters, excluding
  20  *  the terminating zero byte if there's any) of any word which may appear in the
  21  *  indices or in the bucket file. Naturally, the same constant also bounds
  22  *  the number of UCS-2 characters in a word.
  23  *
  24  *  Caveat: If you are upcasing/downcasing the word, the UTF-8 encoding can
  25  *  expand, although at most twice, so you need to reserve 2*MAX_WORD_LEN bytes.
  26  *
  27  *  MAX_COMPLEX_LEN is the upper bound on number of words in any word complex.
  28  */
  29
#define MAX_WORD_LEN            64      /* Max. word length in characters; a multiple of 4 */
#define MAX_COMPLEX_LEN         10      /* Max. number of words in a word complex */
  32
  33 /* Word and string types are defined in lib/custom.h */
  34
  35 /* Index card attributes */
  36
/*
 *  Per-card attribute record. Its size is folded into INDEX_VERSION, so
 *  adding, removing or resizing a field (including via the CONFIG_* switches
 *  and CUSTOM_CARD_ATTRS) changes the index version.
 */
struct card_attr {
  u32 card;                             /* Reference to card description (either oid or filepos) */
#ifdef CONFIG_SITES
  u32 site_id;                          /* Present only with CONFIG_SITES; presumably a site identifier -- TODO confirm */
#endif
  CUSTOM_CARD_ATTRS                     /* Include all custom attributes */
  byte weight;
  byte flags;                           /* Presumably a combination of enum card_flag bits -- TODO confirm */
#ifdef CONFIG_LASTMOD
  byte age;                             /* Document age in pseudo-logarithmic units wrt. reference time */
#endif
#ifdef CONFIG_FILETYPE
  byte type_flags;                      /* File type flags (see below), decoded by the CA_GET_FILE_* macros */
#endif
};
  52
  53 enum card_flag {
  54   CARD_FLAG_EMPTY = 1,                  /* Empty document (redirect, robot file etc.) [scanner] */
  55   CARD_FLAG_ACCENTED = 2,               /* Document contains accented characters [scanner] */
  56   CARD_FLAG_DUP = 4,                    /* Removed as a duplicate [merger] */
  57   CARD_FLAG_MERGED = 8,                 /* Destination of a merge [merger] */
  58   CARD_FLAG_IMAGE = 16,                 /* Is an image object [scanner] */
  59   CARD_FLAG_FRAMESET = 32,              /* Contains a frameset to be ignored [scanner] */
  60 };
  61
#define CARD_POS_SHIFT 5                /* Card positions are shifted right by this number of bits */
  63
  64 /*
  65  *  We store document type and several other properties in card_attr->type_flags.
  66  *  Here we define only the basic structure, the details are defined in custom.h
  67  *  (the list of type names custom_file_type_names[] and also setting of the file
  68  *  types in custom_create_attrs()).
  69  *
  70  *  bits 7--5   file type: (0-3: text types, 4-7: other types, defined by custom.h)
  71  *  bits 4--0   type-dependent information, for text types it's document language code
  72  */
  73
#ifdef CONFIG_FILETYPE
/* Bits 7--5 of type_flags: the file type, a value in 0..7 */
#define CA_GET_FILE_TYPE(a) ((a)->type_flags >> 5)
/* Bits 4--0 of type_flags: type-dependent information */
#define CA_GET_FILE_INFO(a) ((a)->type_flags & 0x1f)
/* Language code: the info bits when bit 7 is clear (file types 0--3, the text types), 0 otherwise */
#define CA_GET_FILE_LANG(a) ((a)->type_flags & 0x80 ? 0 : CA_GET_FILE_INFO(a))
/* File type attribute description; SMALL_SET_ATTR is defined outside this file */
#define FILETYPE_ATTRS SMALL_SET_ATTR(ftype, FILETYPE, CA_GET_FILE_TYPE, ext_ft_parse)
/* Parser handed to SMALL_SET_ATTR above; implemented elsewhere, exact contract not visible here */
byte *ext_ft_parse(u32 *dest, byte *value, uns intval);
/* Names of the file types; 8 entries match the 3-bit file type field (presumably indexed by it -- TODO confirm) */
extern byte *custom_file_type_names[8];
#else
/* Without CONFIG_FILETYPE the file type attribute expands to nothing */
#define FILETYPE_ATTRS
#endif
  84
#ifdef CONFIG_LANG
/* You can use language matching without CONFIG_FILETYPE, but you have to define CA_GET_FILE_LANG yourself. */
/* Language attribute description; SMALL_SET_ATTR is defined outside this file */
#define LANG_ATTRS SMALL_SET_ATTR(lang, LANG, CA_GET_FILE_LANG, ext_lang_parse)
/* Parser handed to SMALL_SET_ATTR above; implemented elsewhere, exact contract not visible here */
byte *ext_lang_parse(u32 *dest, byte *value, uns intval);
#else
/* Without CONFIG_LANG the language attribute expands to nothing */
#define LANG_ATTRS
#endif
  92
/* All extended attributes in one list: the custom ones, then file type and language (either may be empty) */
#define EXTENDED_ATTRS CUSTOM_ATTRS FILETYPE_ATTRS LANG_ATTRS
  94
  95 /* String fingerprints */
  96
/* A string fingerprint: 12 bytes of hash */
struct fingerprint {
  byte hash[12];
};

/*
 *  Implemented elsewhere; presumably computes the fingerprint of `string'
 *  and stores it to *fp -- TODO confirm against the definition.
 */
void fingerprint(byte *string, struct fingerprint *fp);
 102
 103 static inline u32
 104 fp_hash(struct fingerprint *fp)
 105 {
 106   return (fp->hash[0] << 24) | (fp->hash[1] << 16) | (fp->hash[2] << 8) | fp->hash[3];
 107 }
 108
 109 /* Reading of tagged text (Unicode values, tags mapped to 0x80000000 and higher) */
 110
/*
 *  GET_TAGGED_CHAR(p,u): decode one tagged character at p into u and advance
 *  p past it. Dispatch on the lead byte:
 *
 *    < 0x80       u = the byte itself
 *    0x80--0x9f   one byte, u = 0x80000000 + byte
 *    0xa0--0xaf   two bytes, u = 0x80010000 + ((lead & 0x0f) << 6) + (next & 0x3f);
 *                 the second byte is ASSERTed to lie in 0x80--0xbf
 *    0xb0--0xbf   one byte, ASSERTed to be exactly 0xb0, u = 0x80020000 + byte
 *    >= 0xc0      handed over to GET_UTF8_CHAR(p,u)
 *
 *  Both arguments are expanded several times and without parentheses, so they
 *  must be plain lvalues free of side effects.
 *  NOTE(review): the comparisons assume *p yields an unsigned byte value --
 *  confirm that callers always pass a byte pointer.
 */
#define GET_TAGGED_CHAR(p,u) do {                               \
  u = *p;                                                       \
  if (u >= 0xc0)                                                \
    GET_UTF8_CHAR(p,u);                                         \
  else if (u >= 0x80)                                           \
    {                                                           \
      p++;                                                      \
      if (u >= 0xb0)                                            \
        {                                                       \
          ASSERT(u == 0xb0);                                    \
          u += 0x80020000;                                      \
        }                                                       \
      else if (u >= 0xa0)                                       \
        {                                                       \
          ASSERT(*p >= 0x80 && *p <= 0xbf);                     \
          u = 0x80010000 + ((u & 0x0f) << 6) + (*p++ & 0x3f);   \
        }                                                       \
      else                                                      \
        u += 0x80000000;                                        \
    }                                                           \
  else                                                          \
    p++;                                                        \
} while (0)
 134
/*
 *  SKIP_TAGGED_CHAR(p): advance p over one tagged character without decoding it.
 *
 *  A lead byte in 0x80--0xbf is a tag: it is skipped, and for leads in
 *  0xa0--0xaf one more byte is skipped if it lies in 0x80--0xbf. Unlike
 *  GET_TAGGED_CHAR, nothing is ASSERTed here; a missing second byte is
 *  simply not consumed. Everything else is left to UTF8_SKIP(p).
 *
 *  p is expanded several times and without parentheses, so it must be a plain
 *  lvalue free of side effects. The macro declares its own local `u' inside
 *  the inner block.
 */
#define SKIP_TAGGED_CHAR(p) do {                                \
  if (*p >= 0x80 && *p < 0xc0)                                  \
    {                                                           \
      uns u = *p++;                                             \
      if (u >= 0xa0 && u < 0xb0 && *p >= 0x80 && *p < 0xc0)     \
        p++;                                                    \
    }                                                           \
  else                                                          \
    UTF8_SKIP(p);                                               \
} while (0)
 145
 146 static inline uns
 147 bget_tagged_char(struct fastbuf *f)
 148 {
 149   uns u = bgetc(f);
 150   if ((int)u < 0x80)
 151     ;
 152   else if (u < 0xc0)
 153     {
 154       if (u >= 0xb0)
 155         {
 156           ASSERT(u == 0xb0);
 157           u += 0x80020000;
 158         }
 159       else if (u >= 0xa0)
 160         {
 161           uns v = bgetc(f);
 162           ASSERT(v >= 0x80 && v <= 0xbf);
 163           u = 0x80010000 + ((u & 0x0f) << 6) + (v & 0x3f);
 164         }
 165       else
 166         u += 0x80000000;
 167     }
 168   else
 169     {
 170       bungetc(f);
 171       u = bget_utf8(f);
 172     }
 173   return u;
 174 }
 175
 176 /* Conversion of document age from seconds to our internal units */
 177
 178 static inline int
 179 convert_age(sh_time_t lastmod, sh_time_t reftime)
 180 {
 181   sh_time_t age;
 182   if (reftime < lastmod)                /* past times */
 183     return -1;
 184   age = (reftime - lastmod) / 3600;
 185   if (age < 48)                         /* last 2 days: 1 hour resolution */
 186     return age;
 187   age = (age-48) / 24;
 188   if (age < 64)                         /* next 64 days: 1 day resolution */
 189     return 48 + age;
 190   age = (age-64) / 7;
 191   if (age < 135)                        /* next 135 weeks: 1 week resolution */
 192     return 112 + age;
 193   age = (age-135) / 52;
 194   if (age < 8)                          /* next 8 years: 1 year resolution */
 195     return 247 + age;
 196   return 255;                           /* then just "infinite future" */
 197 }
 198
 199 #endif