X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=lib%2Fkmp.h;h=67889a17e9e78c03ce30e5eee9f39935ad0ffac4;hb=b5f9a4654516e0ec37373e198e6b3a6f6a2a186d;hp=d4836988dda500336fde89df4c03f80a1ac05e02;hpb=e1ccb1a25eae9368b00589d948cc0e23718f2fbb;p=libucw.git diff --git a/lib/kmp.h b/lib/kmp.h index d4836988..67889a17 100644 --- a/lib/kmp.h +++ b/lib/kmp.h @@ -14,34 +14,74 @@ * preprocessor macros, it generates KMP structures and functions * with the parameters given. * + * This file contains only construction of the automaton. The search + * itself can be generated by inclusion of file lib/kmp-search.h. + * Separeted headers allow the user to define multiple search + * routines for one common set of key strings. + * + * Example: + * + * #define KMP_PREFIX(x) kmp_##x + * #define KMP_WANT_CLEANUP + * #define KMP_WANT_SEARCH // includes lib/kmp-search.h automatically + * #define KMPS_FOUND(kmp,src,s) printf("found\n") + * #include "lib/kmp.h" + * + * [...] + * + * struct kmp_struct kmp; // a structure describing the whole automaton + * kmp_init(&kmp); // initialization (must be called before all other functions) + * + * // add key strings we want to search + * kmp_add(&kmp, "aaa"); + * kmp_add(&kmp, "abc"); + * + * // complete the automaton, no more strings can be added later + * kmp_build(&kmp); + * + * // example of search, should print single "found" to stdout + * kmp_run(&kmp, "aabaabca"); + * + * // destroy all internal structures + * kmp_cleanup(&kmp); + * + * + * Brief description of all parameters: * * Basic parameters: * KMP_PREFIX(x) macro to add a name prefix (used on all global names - * defined by the KMP generator); mandatory + * defined by the KMP generator); mandatory; + * we abbreviate this to P(x) below * * KMP_CHAR alphabet type, the default is u16 - * - * KMP_SOURCE user-defined text source; KMP_GET_CHAR must - * KMP_GET_CHAR(kmp,src,c) return next character from the input or zero at the end; + * + * KMP_SOURCE user-defined text source; KMP_GET_CHAR must + * KMP_GET_CHAR(kmp,src,c) return zero at the end or nonzero together with the next character in c otherwise; * if not defined, zero-terminated array of bytes is used as the input - * - * KMP_VARS user-defined data in main structure (a structure describing - * the whole automaton) - * KMP_STATE_VARS user-defined data in each state of the automaton + * + * KMP_VARS user-defined variables in 'struct P(struct)' + * -- a structure describing the whole automaton; + * these variables are stored in .u substructure to avoid collisions + * KMP_STATE_VARS user-defined variables in 'struct P(state)' + * -- created for each state of the automaton; + * these variables are stored in .u substructure to avoid collisions * * Parameters which select how the input is interpreted (if KMP_SOURCE is unset): * KMP_USE_ASCII reads single bytes from the input (default) * KMP_USE_UTF8 reads UTF-8 characters from the input (valid UTF-8 needed) * KMP_TOLOWER converts all to lowercase * KMP_UNACCENT removes accents - * KMP_ONLYALPHA converts non-alphas to KMP_CONTROL_CHAR - * KMP_CONTROL_CHAR special control character (default is ':') + * KMP_ONLYALPHA converts non-alphas to KMP_CONTROL_CHAR (see below) * - * Parameters controlling add(): - * KMP_ADD_EXTRA_ARGS extra arguments - * KMP_ADD_INIT(kmp,src) - * KMP_ADD_NEW(kmp,src,s) - * KMP_ADD_DUP(kmp,src,s) + * Parameters controlling add(kmp, src): + * KMP_ADD_EXTRA_ARGS extra arguments, should be used carefully because of possible collisions + * KMP_ADD_INIT(kmp,src) called in the beginning of add(), src is the first + * KMP_INIT_STATE(kmp,s) initialization of a new state s (called before KMP_ADD_{NEW,DUP}); + * null state is not included and should be handled after init() if necessary; + * all user-defined data are filled by zeros before call to KMP_INIT_STATE + * KMP_ADD_NEW(kmp,src,s) initialize last state of every new key string (called after KMP_INIT_STATE); + * the string must be parsed before so src is after the last string's character + * KMP_ADD_DUP(kmp,src,s) analogy of KMP_ADD_NEW called for duplicates * * Parameters to build(): * KMP_BUILD_STATE(kmp,s) called for all states (including null) in order of non-decreasing tree depth @@ -49,13 +89,19 @@ * Other parameters: * KMP_WANT_CLEANUP define cleanup() * KMP_WANT_SEARCH includes lib/kmp-search.h with the same prefix; - * there can be multiple search variants for a single KMP structure - * + * there can be multiple search variants for a single KMP automaton * KMP_USE_POOL allocates in a given pool - * - * KMP_GIVE_ALLOC - * KMP_GIVE_HASHFN - * KMP_GIVE_EQ + * KMP_CONTROL_CHAR special control character (default is ':') + * KMP_GIVE_ALLOC if set, you must supply custom allocation functions: + * void *alloc(unsigned int size) -- allocate space for + * a state. Default is pooled allocation from a local pool or HASH_USE_POOL. + * void free(void *) -- the converse. + * KMP_GIVE_HASHFN if set, you must supply custom hash function: + * unsigned int hash(struct P(struct) *kmp, struct P(state) *state, KMP_CHAR c); + * default hash function works only for integer character types + * KMP_GIVE_EQ if set, you must supply custom compare function of two characters: + * int eq(struct P(struct) *kmp, KMP_CHAR a, KMP_CHAR b); + * default is 'a == b' */ #ifndef KMP_PREFIX @@ -85,15 +131,15 @@ typedef struct {} P(node_t); struct P(struct); struct P(state) { - struct P(state) *from; /* state with previous character */ - struct P(state) *back; /* backwards edge to the largest shorter state */ - struct P(state) *next; /* largest shorter match */ - P(len_t) len; /* largest match, zero otherwise */ - P(char_t) c; /* last character */ + struct P(state) *from; /* state with the previous character (forms a tree with null state in the root) */ + struct P(state) *back; /* backwards edge to the longest shorter state with same suffix */ + struct P(state) *next; /* the longest of shorter matches (or NULL) */ + P(len_t) len; /* state depth if it represents a key string, zero otherwise */ + P(char_t) c; /* last character of the represented string */ struct { # ifdef KMP_STATE_VARS KMP_STATE_VARS -# endif +# endif } u; /* user-defined data*/ }; @@ -167,6 +213,10 @@ static inline void P(hash_init_key) (struct P(hash_table) *t UNUSED, struct P(state) *s, struct P(state) *f, P(char_t) c) { bzero(s, sizeof(*s)); +# ifdef KMP_INIT_STATE + struct P(struct) *kmp = (struct P(struct) *)t; + { KMP_INIT_STATE(kmp, s); } +# endif s->from = f; s->c = c; s->next = f->back; /* the pointers hold the link-list of sons... changed in build() */ @@ -174,7 +224,7 @@ P(hash_init_key) (struct P(hash_table) *t UNUSED, struct P(state) *s, struct P(s } #undef P -#define HASH_PREFIX(x) KMP_PREFIX(GLUE(hash_,x)) +#define HASH_PREFIX(x) KMP_PREFIX(hash_##x) #define HASH_NODE struct KMP_PREFIX(state) #define HASH_KEY_COMPLEX(x) x from, x c #define HASH_KEY_DECL struct KMP_PREFIX(state) *from, KMP_PREFIX(char_t) c @@ -237,7 +287,7 @@ P(get_char) (struct P(struct) *kmp UNUSED, P(source_t) *src, P(char_t) *c) else if (!Ualpha(cc)) cc = P(control)(); else -# endif +# endif { # ifdef KMP_TOLOWER cc = Utolower(cc); @@ -352,7 +402,7 @@ P(build) (struct P(struct) *kmp) null->back = NULL; # ifdef KMP_BUILD_STATE { KMP_BUILD_STATE(kmp, null); } -# endif +# endif while (read != write) { struct P(state) *s = fifo[read++], *t; @@ -375,7 +425,7 @@ P(build) (struct P(struct) *kmp) } # ifdef KMP_BUILD_STATE { KMP_BUILD_STATE(kmp, s); } -# endif +# endif } } @@ -396,6 +446,7 @@ P(build) (struct P(struct) *kmp) #undef KMP_ADD_INIT #undef KMP_ADD_NEW #undef KMP_ADD_DUP +#undef KMP_INIT_STATE #undef KMP_BUILD_STATE #undef KMP_USE_POOL #undef KMP_GIVE_ALLOC