From cea5b11fa465e902552c06a54e1b1255f8b71fd5 Mon Sep 17 00:00:00 2001 From: Pavel Charvat Date: Thu, 20 Apr 2006 17:56:09 +0200 Subject: [PATCH] Some description and corrections... --- lib/kmp-search.h | 47 +++++++++++++++--------- lib/kmp.h | 93 +++++++++++++++++++++++++++++++++++------------- 2 files changed, 100 insertions(+), 40 deletions(-) diff --git a/lib/kmp-search.h b/lib/kmp-search.h index 4b7c7028..a399030b 100644 --- a/lib/kmp-search.h +++ b/lib/kmp-search.h @@ -12,29 +12,44 @@ * This is not a normal header file, it's a generator of KMP algorithm. * Each time you include it with parameters set in the corresponding * preprocessor macros, it generates KMP structures and functions - * with the parameters given. Macros marked with [*] are mandatory. + * with the parameters given. See lib/kmp.h before reading this description. + * + * This file defines: + * + * struct search structure with both the internal and the user-defined variables + * used during the search and accessible from all macros + * + * void search(kmp,search,src) executes the search; search structure is allocated by the caller (possible input/output) + * + * void run(kmp,src) the same, but automatically allocates search structre from the stack + * + * + * Macros marked with [*] are mandatory. * * [*] KMPS_PREFIX(x) macro to add a name prefix (used on all global names * defined by the KMP search generator) - * [*] KMPS_KMP_PREFIX(x) prefix used for lib/kmp.h; - * more variants of kmp-search can be used for single lib/kmp.h + * [*] KMPS_KMP_PREFIX(x) prefix used for lib/kmp.h * - * KMPS_SOURCE user-defined search input (together with KMPS_GET_CHAR); - * if unset, the one from lib/kmp.h is used - * KMPS_GET_CHAR(kmp,src,s) + * KMPS_SOURCE user-defined text source (together with KMPS_GET_CHAR); + * if unset, the one from lib/kmp.h is taken + * KMPS_GET_CHAR(kmp,src,search) analogy to KMP_GET_CHAR, but it must store the next character to search->c * - * KMPS_ADD_CONTROLS add control characters at both ends of the input string + * KMPS_ADD_CONTROLS add control characters (see KMP_CONTROL_CHAR in kmp.h) at both ends of the input string * KMPS_MERGE_CONTROLS merge adjacent control characters to a single one * - * KMPS_VARS + * KMPS_VARS user-defined variables in struct search (in .u substructure to avoid collisions) * - * KMPS_INIT(kmp,src,s) - * KMPS_EXIT(kmp,src,s) - * KMPS_STEP(kmp,src,s) - * KMPS_FOUND(kmp,src,s) - * KMPS_FOUND_CHAIN(kmp,src,s) - * - * KMPS_WANT_BEST + * KMPS_INIT(kmp,src,search) statement executed at the beginning of search() + * KMPS_EXIT(kmp,src,search) ... at the end + * KMPS_STEP(kmp,src,search) ... after each step (read of next character + current state update) + * of the algorithm, but before KMPS_FOUND[_CHAIN] + * KMPS_FOUND_CHAIN(kmp,src,search) ... for each state representing locally longest match + * (stored in search->out - NOT necessary search.s!); + * all matches forms a NULL-terminated link list (search->out, search->out->next, ...) + * in order of decreasing length + * KMPS_FOUND(kmp,src,search) ... called for every match (in search->out) + * KMPS_WANT_BEST algorithm computes globally longest match, which is available + * in search.best in KMPS_EXIT; if there is no match, it points to the null state */ #define P(x) KMPS_PREFIX(x) @@ -85,7 +100,7 @@ P(search) (struct KP(struct) *kmp, struct P(search) *s, P(search_source_t) src) # endif # ifndef KMPS_ADD_CONTROLS goto start_read; -#endif +# endif for (;;) { for (struct KP(state) *t = s->s; t && !(s->s = KP(hash_find)(&kmp->hash, t, s->c)); t = t->back); diff --git a/lib/kmp.h b/lib/kmp.h index aa97aa68..f4d14049 100644 --- a/lib/kmp.h +++ b/lib/kmp.h @@ -14,35 +14,74 @@ * preprocessor macros, it generates KMP structures and functions * with the parameters given. * + * This file contains only construction of the automaton. The search + * itself can be generated by inclusion of file lib/kmp-search.h. + * Separeted headers allows the user to define multiple search + * routines for one common set of key strings. + * + * Example: + * + * #define KMP_PREFIX(x) GLUE_(kmp,x) + * #define KMP_WANT_CLEANUP + * #define KMP_WANT_SEARCH // includes lib/kmp-search.h automatically + * #define KMPS_FOUND(kmp,src,s) printf("found\n") + * #include "lib/kmp.h" + * + * [...] + * + * struct kmp_struct kmp; // a structure describing the whole automaton + * kmp_init(&kmp); // initialization (must be called before all other functions) + * + * // add key strings we want to search + * kmp_add(&kmp, "aaa"); + * kmp_add(&kmp, "abc"); + * + * // complete the automaton, no more strings can be added later + * kmp_build(&kmp); + * + * // example of search, should print single "found" to stdout + * kmp_run(&kmp, "aabaabca"); + * + * // destroy all internal structures + * kmp_cleanup(&kmp); + * + * + * Brief description of all parameters: * * Basic parameters: * KMP_PREFIX(x) macro to add a name prefix (used on all global names - * defined by the KMP generator); mandatory + * defined by the KMP generator); mandatory; + * below we use P(x) alias * * KMP_CHAR alphabet type, the default is u16 * * KMP_SOURCE user-defined text source; KMP_GET_CHAR must - * KMP_GET_CHAR(kmp,src,c) return next character from the input or zero at the end; + * KMP_GET_CHAR(kmp,src,c) return zero at the end or nonzero together with the next character in c otherwise; * if not defined, zero-terminated array of bytes is used as the input * - * KMP_VARS user-defined data in main structure (a structure describing - * the whole automaton) - * KMP_STATE_VARS user-defined data in each state of the automaton + * KMP_VARS user-defined variables in 'struct P(struct)' + * -- a structure describing the whole automaton; + * these variables are stored in .u substructure to avoid collisions + * KMP_STATE_VARS user-defined variables in 'struct P(state)' + * -- created for each state of the automaton; + * these variables are stored in .u substructure to avoid collisions * * Parameters which select how the input is interpreted (if KMP_SOURCE is unset): * KMP_USE_ASCII reads single bytes from the input (default) * KMP_USE_UTF8 reads UTF-8 characters from the input (valid UTF-8 needed) * KMP_TOLOWER converts all to lowercase * KMP_UNACCENT removes accents - * KMP_ONLYALPHA converts non-alphas to KMP_CONTROL_CHAR - * KMP_CONTROL_CHAR special control character (default is ':') + * KMP_ONLYALPHA converts non-alphas to KMP_CONTROL_CHAR (see below) * - * Parameters controlling add(): - * KMP_ADD_EXTRA_ARGS extra arguments - * KMP_ADD_INIT(kmp,src) - * KMP_ADD_NEW(kmp,src,s) - * KMP_ADD_DUP(kmp,src,s) - * KMP_INIT_STATE(kmp,s) initialize new state (called before KMP_ADD_{NEW,DUP}) + * Parameters controlling add(kmp, src): + * KMP_ADD_EXTRA_ARGS extra arguments, should be used carefully because of possible collisions + * KMP_ADD_INIT(kmp,src) called in the beginning of add(), src is the first + * KMP_INIT_STATE(kmp,s) initialization of a new state s (called before KMP_ADD_{NEW,DUP}); + * null state is not included and should be handled after init() if necessary; + * all user-defined data are filled by zeros before call to KMP_INIT_STATE + * KMP_ADD_NEW(kmp,src,s) initialize last state of every new key string (called after KMP_INIT_STATE); + * the string must be parsed before so src is after the last string's character + * KMP_ADD_DUP(kmp,src,s) analogy of KMP_ADD_NEW called for duplicates * * Parameters to build(): * KMP_BUILD_STATE(kmp,s) called for all states (including null) in order of non-decreasing tree depth @@ -50,13 +89,19 @@ * Other parameters: * KMP_WANT_CLEANUP define cleanup() * KMP_WANT_SEARCH includes lib/kmp-search.h with the same prefix; - * there can be multiple search variants for a single KMP structure - * + * there can be multiple search variants for a single KMP automaton * KMP_USE_POOL allocates in a given pool - * - * KMP_GIVE_ALLOC - * KMP_GIVE_HASHFN - * KMP_GIVE_EQ + * KMP_CONTROL_CHAR special control character (default is ':') + * KMP_GIVE_ALLOC if set, you must supply custom allocation functions: + * void *alloc(unsigned int size) -- allocate space for + * a state. Default is pooled allocation from a local pool or HASH_USE_POOL. + * void free(void *) -- the converse. + * KMP_GIVE_HASHFN if set, you must supply custom hash function: + * unsigned int hash(struct P(struct) *kmp, struct P(state) *state, KMP_CHAR c); + * default hash function works only for integer character types + * KMP_GIVE_EQ if set, you must supply custom compare function of two characters: + * int eq(struct P(struct) *kmp, KMP_CHAR a, KMP_CHAR b); + * default is 'a == b' */ #ifndef KMP_PREFIX @@ -86,11 +131,11 @@ typedef struct {} P(node_t); struct P(struct); struct P(state) { - struct P(state) *from; /* state with previous character */ - struct P(state) *back; /* backwards edge to the largest shorter state */ - struct P(state) *next; /* largest shorter match */ - P(len_t) len; /* largest match, zero otherwise */ - P(char_t) c; /* last character */ + struct P(state) *from; /* state with previous character (forms a tree with null state in the root) */ + struct P(state) *back; /* backwards edge to the longest shorter state with same suffix */ + struct P(state) *next; /* longest shorter match (or NULL) */ + P(len_t) len; /* state depth if it represents a key string, zero otherwise */ + P(char_t) c; /* last character of represented string */ struct { # ifdef KMP_STATE_VARS KMP_STATE_VARS -- 2.39.2