* This is not a normal header file, it's a generator of KMP algorithm.
* Each time you include it with parameters set in the corresponding
* preprocessor macros, it generates KMP structures and functions
- * with the parameters given. Macros marked with [*] are mandatory.
+ * with the parameters given. See lib/kmp.h before reading this description.
+ *
+ * This file defines:
+ *
+ * struct search structure with both the internal and the user-defined variables
+ * used during the search and accessible from all macros
+ *
+ * void search(kmp,search,src) executes the search; search structure is allocated by the caller (possible input/output)
+ *
+ * void run(kmp,src) the same, but automatically allocates search structre from the stack
+ *
+ *
+ * Macros marked with [*] are mandatory.
*
* [*] KMPS_PREFIX(x) macro to add a name prefix (used on all global names
* defined by the KMP search generator)
- * [*] KMPS_KMP_PREFIX(x) prefix used for lib/kmp.h;
- * more variants of kmp-search can be used for single lib/kmp.h
+ * [*] KMPS_KMP_PREFIX(x) prefix used for lib/kmp.h
*
- * KMPS_SOURCE user-defined search input (together with KMPS_GET_CHAR);
- * if unset, the one from lib/kmp.h is used
- * KMPS_GET_CHAR(kmp,src,s)
+ * KMPS_SOURCE user-defined text source (together with KMPS_GET_CHAR);
+ * if unset, the one from lib/kmp.h is taken
+ * KMPS_GET_CHAR(kmp,src,search) analogy to KMP_GET_CHAR, but it must store the next character to search->c
*
- * KMPS_ADD_CONTROLS add control characters at both ends of the input string
+ * KMPS_ADD_CONTROLS add control characters (see KMP_CONTROL_CHAR in kmp.h) at both ends of the input string
* KMPS_MERGE_CONTROLS merge adjacent control characters to a single one
*
- * KMPS_VARS
+ * KMPS_VARS user-defined variables in struct search (in .u substructure to avoid collisions)
*
- * KMPS_INIT(kmp,src,s)
- * KMPS_EXIT(kmp,src,s)
- * KMPS_STEP(kmp,src,s)
- * KMPS_FOUND(kmp,src,s)
- * KMPS_FOUND_CHAIN(kmp,src,s)
- *
- * KMPS_WANT_BEST
+ * KMPS_INIT(kmp,src,search) statement executed at the beginning of search()
+ * KMPS_EXIT(kmp,src,search) ... at the end
+ * KMPS_STEP(kmp,src,search) ... after each step (read of next character + current state update)
+ * of the algorithm, but before KMPS_FOUND[_CHAIN]
+ * KMPS_FOUND_CHAIN(kmp,src,search) ... for each state representing locally longest match
+ * (stored in search->out - NOT necessary search.s!);
+ * all matches forms a NULL-terminated link list (search->out, search->out->next, ...)
+ * in order of decreasing length
+ * KMPS_FOUND(kmp,src,search) ... called for every match (in search->out)
+ * KMPS_WANT_BEST algorithm computes globally longest match, which is available
+ * in search.best in KMPS_EXIT; if there is no match, it points to the null state
*/
#define P(x) KMPS_PREFIX(x)
# endif
# ifndef KMPS_ADD_CONTROLS
goto start_read;
-#endif
+# endif
for (;;)
{
for (struct KP(state) *t = s->s; t && !(s->s = KP(hash_find)(&kmp->hash, t, s->c)); t = t->back);
* preprocessor macros, it generates KMP structures and functions
* with the parameters given.
*
+ * This file contains only construction of the automaton. The search
+ * itself can be generated by inclusion of file lib/kmp-search.h.
+ * Separeted headers allows the user to define multiple search
+ * routines for one common set of key strings.
+ *
+ * Example:
+ *
+ * #define KMP_PREFIX(x) GLUE_(kmp,x)
+ * #define KMP_WANT_CLEANUP
+ * #define KMP_WANT_SEARCH // includes lib/kmp-search.h automatically
+ * #define KMPS_FOUND(kmp,src,s) printf("found\n")
+ * #include "lib/kmp.h"
+ *
+ * [...]
+ *
+ * struct kmp_struct kmp; // a structure describing the whole automaton
+ * kmp_init(&kmp); // initialization (must be called before all other functions)
+ *
+ * // add key strings we want to search
+ * kmp_add(&kmp, "aaa");
+ * kmp_add(&kmp, "abc");
+ *
+ * // complete the automaton, no more strings can be added later
+ * kmp_build(&kmp);
+ *
+ * // example of search, should print single "found" to stdout
+ * kmp_run(&kmp, "aabaabca");
+ *
+ * // destroy all internal structures
+ * kmp_cleanup(&kmp);
+ *
+ *
+ * Brief description of all parameters:
*
* Basic parameters:
* KMP_PREFIX(x) macro to add a name prefix (used on all global names
- * defined by the KMP generator); mandatory
+ * defined by the KMP generator); mandatory;
+ * below we use P(x) alias
*
* KMP_CHAR alphabet type, the default is u16
*
* KMP_SOURCE user-defined text source; KMP_GET_CHAR must
- * KMP_GET_CHAR(kmp,src,c) return next character from the input or zero at the end;
+ * KMP_GET_CHAR(kmp,src,c) return zero at the end or nonzero together with the next character in c otherwise;
* if not defined, zero-terminated array of bytes is used as the input
*
- * KMP_VARS user-defined data in main structure (a structure describing
- * the whole automaton)
- * KMP_STATE_VARS user-defined data in each state of the automaton
+ * KMP_VARS user-defined variables in 'struct P(struct)'
+ * -- a structure describing the whole automaton;
+ * these variables are stored in .u substructure to avoid collisions
+ * KMP_STATE_VARS user-defined variables in 'struct P(state)'
+ * -- created for each state of the automaton;
+ * these variables are stored in .u substructure to avoid collisions
*
* Parameters which select how the input is interpreted (if KMP_SOURCE is unset):
* KMP_USE_ASCII reads single bytes from the input (default)
* KMP_USE_UTF8 reads UTF-8 characters from the input (valid UTF-8 needed)
* KMP_TOLOWER converts all to lowercase
* KMP_UNACCENT removes accents
- * KMP_ONLYALPHA converts non-alphas to KMP_CONTROL_CHAR
- * KMP_CONTROL_CHAR special control character (default is ':')
+ * KMP_ONLYALPHA converts non-alphas to KMP_CONTROL_CHAR (see below)
*
- * Parameters controlling add():
- * KMP_ADD_EXTRA_ARGS extra arguments
- * KMP_ADD_INIT(kmp,src)
- * KMP_ADD_NEW(kmp,src,s)
- * KMP_ADD_DUP(kmp,src,s)
- * KMP_INIT_STATE(kmp,s) initialize new state (called before KMP_ADD_{NEW,DUP})
+ * Parameters controlling add(kmp, src):
+ * KMP_ADD_EXTRA_ARGS extra arguments, should be used carefully because of possible collisions
+ * KMP_ADD_INIT(kmp,src) called in the beginning of add(), src is the first
+ * KMP_INIT_STATE(kmp,s) initialization of a new state s (called before KMP_ADD_{NEW,DUP});
+ * null state is not included and should be handled after init() if necessary;
+ * all user-defined data are filled by zeros before call to KMP_INIT_STATE
+ * KMP_ADD_NEW(kmp,src,s) initialize last state of every new key string (called after KMP_INIT_STATE);
+ * the string must be parsed before so src is after the last string's character
+ * KMP_ADD_DUP(kmp,src,s) analogy of KMP_ADD_NEW called for duplicates
*
* Parameters to build():
* KMP_BUILD_STATE(kmp,s) called for all states (including null) in order of non-decreasing tree depth
* Other parameters:
* KMP_WANT_CLEANUP define cleanup()
* KMP_WANT_SEARCH includes lib/kmp-search.h with the same prefix;
- * there can be multiple search variants for a single KMP structure
- *
+ * there can be multiple search variants for a single KMP automaton
* KMP_USE_POOL allocates in a given pool
- *
- * KMP_GIVE_ALLOC
- * KMP_GIVE_HASHFN
- * KMP_GIVE_EQ
+ * KMP_CONTROL_CHAR special control character (default is ':')
+ * KMP_GIVE_ALLOC if set, you must supply custom allocation functions:
+ * void *alloc(unsigned int size) -- allocate space for
+ * a state. Default is pooled allocation from a local pool or HASH_USE_POOL.
+ * void free(void *) -- the converse.
+ * KMP_GIVE_HASHFN if set, you must supply custom hash function:
+ * unsigned int hash(struct P(struct) *kmp, struct P(state) *state, KMP_CHAR c);
+ * default hash function works only for integer character types
+ * KMP_GIVE_EQ if set, you must supply custom compare function of two characters:
+ * int eq(struct P(struct) *kmp, KMP_CHAR a, KMP_CHAR b);
+ * default is 'a == b'
*/
#ifndef KMP_PREFIX
struct P(struct);
struct P(state) {
- struct P(state) *from; /* state with previous character */
- struct P(state) *back; /* backwards edge to the largest shorter state */
- struct P(state) *next; /* largest shorter match */
- P(len_t) len; /* largest match, zero otherwise */
- P(char_t) c; /* last character */
+ struct P(state) *from; /* state with previous character (forms a tree with null state in the root) */
+ struct P(state) *back; /* backwards edge to the longest shorter state with same suffix */
+ struct P(state) *next; /* longest shorter match (or NULL) */
+ P(len_t) len; /* state depth if it represents a key string, zero otherwise */
+ P(char_t) c; /* last character of represented string */
struct {
# ifdef KMP_STATE_VARS
KMP_STATE_VARS