]> mj.ucw.cz Git - libucw.git/blobdiff - lib/index.h
Several bug fixes in the logger:
[libucw.git] / lib / index.h
index 0585141245e440dd4b03e0dc75068956ee00abea..b17e9a62626440c342d1632086ffa94aa8bc7e1b 100644 (file)
@@ -4,9 +4,29 @@
  *     (c) 2001--2002 Martin Mares <mj@ucw.cz>
  */
 
-/* Words */
+#ifndef _SHERLOCK_INDEX_H
+#define _SHERLOCK_INDEX_H
 
-#define MAX_WORD_LEN           64
+#include "lib/fastbuf.h"
+#include SHERLOCK_CUSTOM
+#include "charset/unistream.h"
+
+/*
+ *  Words and word complexes
+ *
+ *  MAX_WORD_LEN is the maximum length (measured in UTF-8 characters, excluding
+ *  the terminating zero byte if there is one) of any word which may appear in the
+ *  indices or in the bucket file. Naturally, the same constant also bounds
+ *  the number of UCS-2 characters in a word.
+ *
+ *  Caveat: If you are upcasing/downcasing the word, the UTF-8 encoding can
+ *  expand, although at most twice, so you need to reserve 2*MAX_WORD_LEN bytes.
+ *
+ *  MAX_COMPLEX_LEN is the upper bound on the number of words in any word complex.
+ */
+
+#define MAX_WORD_LEN           64      /* a multiple of 4 */
+#define MAX_COMPLEX_LEN                10
 
 /* Word and string types are defined in lib/custom.h */
 
@@ -20,10 +40,10 @@ struct index_params {
 
 struct card_attr {
   u32 card;                            /* Reference to card description (either oid or filepos) */
+#ifdef CONFIG_SITES
   u32 site_id;
-#define INT_ATTR(t,i,o,k,g,p) t i;
-  CUSTOM_ATTRS                         /* Include all custom attributes */
-#undef INT_ATTR
+#endif
+  CUSTOM_CARD_ATTRS                    /* Include all custom attributes */
   byte weight;
   byte flags;
   byte age;                            /* Document age in pseudo-logarithmic units wrt. reference time */
@@ -35,6 +55,7 @@ enum card_flag {
   CARD_FLAG_ACCENTED = 2,              /* Document contains accented characters [scanner] */
   CARD_FLAG_DUP = 4,                   /* Removed as a duplicate [merger] */
   CARD_FLAG_MERGED = 8,                        /* Destination of a merge [merger] */
+  CARD_FLAG_IMAGE = 16,                        /* Is an image object [scanner] */
 };
 
 #define CARD_POS_SHIFT 5               /* Card positions are shifted this # of bits to the right */
@@ -64,8 +85,7 @@ fp_hash(struct fingerprint *fp)
       p++;                                                     \
       if (u >= 0xb0)                                           \
         {                                                      \
-         if (u != 0xb0)                                        \
-            ASSERT(0);                                         \
+         ASSERT(u == 0xb0);                                    \
          u += 0x80020000;                                      \
         }                                                      \
       else if (u >= 0xa0)                                      \
@@ -80,6 +100,47 @@ fp_hash(struct fingerprint *fp)
     p++;                                                       \
 } while (0)
 
+#define SKIP_TAGGED_CHAR(p) do {       /* Advance p past one tagged or UTF-8 character; p is modified and evaluated more than once */ \
+  if (*p >= 0x80 && *p < 0xc0)         /* Byte in 0x80..0xbf: handled here instead of by UTF8_SKIP */ \
+    {                                                          \
+      uns u = *p++;                    /* Consume the first byte */ \
+      if (u >= 0xa0 && u < 0xb0 && *p >= 0x80 && *p < 0xc0)    /* 0xa0..0xaf followed by 0x80..0xbf: two-byte form */ \
+       p++;                            /* Consume the second byte as well */ \
+    }                                                          \
+  else                                                         \
+    UTF8_SKIP(p);                      /* Every other byte value is left to UTF8_SKIP */ \
+} while (0)
+
+static inline uns                      /* Read one tagged or UTF-8 character from f and return its code */
+bget_tagged_char(struct fastbuf *f)
+{
+  uns u = bgetc(f);
+  if ((int)u < 0x80)                   /* Signed compare: values below 0x80, negative ones included (presumably bgetc's end-of-input result -- confirm), are returned unchanged */
+    ;
+  else if (u < 0xc0)                   /* 0x80..0xbf: tag byte, mapped to 0x80000000 and above */
+    {
+      if (u >= 0xb0)
+       {
+         ASSERT(u == 0xb0);            /* Of 0xb0..0xbf only 0xb0 is accepted */
+         u += 0x80020000;
+       }
+      else if (u >= 0xa0)              /* 0xa0..0xaf: two-byte form, a second byte follows */
+       {
+         uns v = bgetc(f);
+         ASSERT(v >= 0x80 && v <= 0xbf);
+         u = 0x80010000 + ((u & 0x0f) << 6) + (v & 0x3f);     /* Low 4 bits of u and low 6 bits of v form a 10-bit value */
+       }
+      else                             /* 0x80..0x9f: single byte, offset by 0x80000000 */
+       u += 0x80000000;
+    }
+  else                                 /* 0xc0 and above: undo the read with bungetc and let bget_utf8 decode the character */
+    {
+      bungetc(f);
+      u = bget_utf8(f);
+    }
+  return u;
+}
+
 /* Conversion of document age from seconds to our internal units */
 
 static inline int
@@ -102,3 +163,5 @@ convert_age(sh_time_t lastmod, sh_time_t reftime)
     return 247 + age;
   return 255;                          /* then just "infinite future" */
 }
+
+#endif