-/* Descriptive names used for user output */
-#define WORD_TYPE_USER_NAMES \
- "reserved", "text", "emph", "small", "title", "hdr1", "hdr2", "keywd", \
- "meta", "alt", "urlword1", "urlword2", "type12", "type13", "type14", "type15"
-
-/* Keywords for word type names */
-#define WORD_TYPE_NAMES \
- T(WORD, ~0) \
- T(TEXT, 1 << WT_TEXT) \
- T(EMPH, 1 << WT_EMPH) \
- T(SMALL, 1 << WT_SMALL) \
- T(TITLE, 1 << WT_TITLE) \
- T(HDR, (1 << WT_SMALL_HEADING) | (1 << WT_BIG_HEADING)) \
- T(HDR1, 1 << WT_SMALL_HEADING) \
- T(HDR2, 1 << WT_BIG_HEADING) \
- T(KEYWD, 1 << WT_KEYWORD) \
- T(META, 1 << WT_META) \
- T(ALT, 1 << WT_ALT) \
- T(URLWORD, (1 << WT_URL1) | (1 << WT_URL2))
-
-/* These types are not shown in document contexts */
-#define WORD_TYPES_HIDDEN ((1 << WT_URL1) | (1 << WT_URL2))
-
-/* These types are always matched without accents if accent mode is set to "auto" */
-#define WORD_TYPES_NO_AUTO_ACCENT ((1 << WT_URL1) | (1 << WT_URL2))
-
-/* String types */
-
-enum string_type {
- ST_RESERVED, /* Reserved string type */
- ST_URL, /* URL of the document */
- ST_HOST, /* Host name */
- ST_DOMAIN, /* Domain name */
- ST_REF, /* URL reference */
- ST_BACKREF, /* Back-reference (frame or redirect source) */
- ST_MAX
-};
+#include "lib/fastbuf.h"
+#include SHERLOCK_CUSTOM
+#include "charset/unistream.h"
+
+/*
+ * Words and word complexes
+ *
+ * MAX_WORD_LEN is the maximum length (measured in UTF-8 characters, excluding
+ * the terminating zero byte, if any) of any word which may appear in the
+ * indices or in the bucket file. Naturally, the same constant also bounds
+ * the number of UCS-2 characters in a word.
+ *
+ * Caveat: If you are upcasing/downcasing the word, the UTF-8 encoding can
+ * at most double in size, so you need to reserve 2*MAX_WORD_LEN bytes.
+ *
+ * MAX_COMPLEX_LEN is the upper bound on the number of words in any word complex.
+ */
+
+#define MAX_WORD_LEN 64 /* max word length in UTF-8 characters; a multiple of 4 */
+#define MAX_COMPLEX_LEN 10 /* max number of words in a word complex */