X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=lib%2Findex.h;h=b17e9a62626440c342d1632086ffa94aa8bc7e1b;hb=8a5c234a1c81e4d199d6c8a20b25548068d843f7;hp=8bfbe88d4d6862f58118de370d52f41d3d483553;hpb=7a84ff60630fb4211e60320843955ff2cf013ac2;p=libucw.git diff --git a/lib/index.h b/lib/index.h index 8bfbe88d..b17e9a62 100644 --- a/lib/index.h +++ b/lib/index.h @@ -11,9 +11,21 @@ #include SHERLOCK_CUSTOM #include "charset/unistream.h" -/* Words */ +/* + * Words and word complexes + * + * MAX_WORD_LEN is the maximum length (measured in UTF-8 characters, excluding + * the terminating zero byte, if any) of any word which may appear in the + * indices or in the bucket file. Naturally, the same constant also bounds + * the number of UCS-2 characters in a word. + * + * Caveat: If you are upcasing/downcasing the word, the UTF-8 encoding can + * expand, although to at most twice its length, so you need to reserve 2*MAX_WORD_LEN bytes. + * + * MAX_COMPLEX_LEN is the upper bound on the number of words in any word complex. + */ -#define MAX_WORD_LEN 64 +#define MAX_WORD_LEN 64 /* a multiple of 4 */ #define MAX_COMPLEX_LEN 10 /* Word and string types are defined in lib/custom.h */