2 * Knuth-Morris-Pratt's Substring Search for N given strings
4 * (c) 1999--2005, Robert Spalek <robert@ucw.cz>
5 * (c) 2006, Pavel Charvat <pchar@ucw.cz>
7 * (In fact, the algorithm is usually referred to as Aho-McCorasick,
8 * but that's just an extension of KMP to multiple strings.)
12 * This is not a normal header file, it's a generator of KMP algorithm.
13 * Each time you include it with parameters set in the corresponding
14 * preprocessor macros, it generates KMP structures and functions
15 * with the parameters given.
17 * This file contains only construction of the automaton. The search
18 * itself can be generated by inclusion of file lib/kmp-search.h.
19 * Separeted headers allow the user to define multiple search
20 * routines for one common set of key strings.
24 * #define KMP_PREFIX(x) kmp_##x
25 * #define KMP_WANT_CLEANUP
26 * #define KMP_WANT_SEARCH // includes lib/kmp-search.h automatically
27 * #define KMPS_FOUND(kmp,src,s) printf("found\n")
28 * #include "lib/kmp.h"
32 * struct kmp_struct kmp; // a structure describing the whole automaton
33 * kmp_init(&kmp); // initialization (must be called before all other functions)
35 * // add key strings we want to search
36 * kmp_add(&kmp, "aaa");
37 * kmp_add(&kmp, "abc");
39 * // complete the automaton, no more strings can be added later
42 * // example of search, should print single "found" to stdout
43 * kmp_run(&kmp, "aabaabca");
45 * // destroy all internal structures
49 * Brief description of all parameters:
52 * KMP_PREFIX(x) macro to add a name prefix (used on all global names
53 * defined by the KMP generator); mandatory;
54 * we abbreviate this to P(x) below
56 * KMP_CHAR alphabet type, the default is u16
58 * KMP_SOURCE user-defined text source; KMP_GET_CHAR must
59 * KMP_GET_CHAR(kmp,src,c) return zero at the end or nonzero together with the next character in c otherwise;
60 * if not defined, zero-terminated array of bytes is used as the input
62 * KMP_VARS user-defined variables in 'struct P(struct)'
63 * -- a structure describing the whole automaton;
64 * these variables are stored in .u substructure to avoid collisions
65 * KMP_STATE_VARS user-defined variables in 'struct P(state)'
66 * -- created for each state of the automaton;
67 * these variables are stored in .u substructure to avoid collisions
69 * Parameters which select how the input is interpreted (if KMP_SOURCE is unset):
70 * KMP_USE_ASCII reads single bytes from the input (default)
71 * KMP_USE_UTF8 reads UTF-8 characters from the input (valid UTF-8 needed)
72 * KMP_TOLOWER converts all to lowercase
73 * KMP_UNACCENT removes accents
74 * KMP_ONLYALPHA converts non-alphas to KMP_CONTROL_CHAR (see below)
76 * Parameters controlling add(kmp, src):
77 * KMP_ADD_EXTRA_ARGS extra arguments, should be used carefully because of possible collisions
78 * KMP_ADD_INIT(kmp,src) called in the beginning of add(), src is the first
79 * KMP_INIT_STATE(kmp,s) initialization of a new state s (called before KMP_ADD_{NEW,DUP});
80 * null state is not included and should be handled after init() if necessary;
81 * all user-defined data are filled by zeros before call to KMP_INIT_STATE
82 * KMP_ADD_NEW(kmp,src,s) initialize last state of every new key string (called after KMP_INIT_STATE);
83 * the string must be parsed before so src is after the last string's character
84 * KMP_ADD_DUP(kmp,src,s) analogy of KMP_ADD_NEW called for duplicates
86 * Parameters to build():
87 * KMP_BUILD_STATE(kmp,s) called for all states (including null) in order of non-decreasing tree depth
90 * KMP_WANT_CLEANUP define cleanup()
91 * KMP_WANT_SEARCH includes lib/kmp-search.h with the same prefix;
92 * there can be multiple search variants for a single KMP automaton
93 * KMP_USE_POOL allocates in a given pool
94 * KMP_CONTROL_CHAR special control character (default is ':')
95 * KMP_GIVE_ALLOC if set, you must supply custom allocation functions:
96 * void *alloc(unsigned int size) -- allocate space for
97 * a state. Default is pooled allocation from a local pool or HASH_USE_POOL.
98 * void free(void *) -- the converse.
99 * KMP_GIVE_HASHFN if set, you must supply custom hash function:
100 * unsigned int hash(struct P(struct) *kmp, struct P(state) *state, KMP_CHAR c);
101 * default hash function works only for integer character types
102 * KMP_GIVE_EQ if set, you must supply custom compare function of two characters:
103 * int eq(struct P(struct) *kmp, KMP_CHAR a, KMP_CHAR b);
104 * default is 'a == b'
108 #error Missing KMP_PREFIX
111 #include "lib/mempool.h"
115 #define P(x) KMP_PREFIX(x)
118 typedef KMP_CHAR P(char_t);
120 typedef u16 P(char_t);
123 typedef u32 P(len_t);
126 typedef KMP_NODE P(node_t);
128 typedef struct {} P(node_t);
134 struct P(state) *from; /* state with the previous character (forms a tree with null state in the root) */
135 struct P(state) *back; /* backwards edge to the longest shorter state with same suffix */
136 struct P(state) *next; /* the longest of shorter matches (or NULL) */
137 P(len_t) len; /* state depth if it represents a key string, zero otherwise */
138 P(char_t) c; /* last character of the represented string */
140 # ifdef KMP_STATE_VARS
143 } u; /* user-defined data*/
147 static inline P(char_t)
150 # ifdef KMP_CONTROL_CHAR
151 return KMP_CONTROL_CHAR;
157 /* User-defined source */
158 struct P(hash_table);
160 #define HASH_GIVE_HASHFN
161 #ifdef KMP_GIVE_HASHFN
163 P(hash_hash) (struct P(hash_table) *t, struct P(state) *f, P(char_t) c)
165 return P(hash) ((struct P(struct) *) t, f, c);
169 P(hash_hash) (struct P(hash_table) *t UNUSED, struct P(state) *f, P(char_t) c)
171 return (((uns)c) << 16) + (uns)(uintptr_t)f;
177 P(eq) (struct P(struct) *kmp UNUSED, P(char_t) c1, P(char_t) c2)
184 P(is_control) (struct P(struct) *kmp, P(char_t) c)
186 return P(eq) (kmp, c, P(control)());
191 P(hash_eq) (struct P(hash_table) *t, struct P(state) *f1, P(char_t) c1, struct P(state) *f2, P(char_t) c2)
193 return f1 == f2 && P(eq)((struct P(struct) *) t, c1, c2);
196 #ifdef KMP_GIVE_ALLOC
197 #define HASH_GIVE_ALLOC
199 P(hash_alloc) (struct P(hash_table) *t, uns size)
201 return P(alloc) ((struct P(struct) *) t, size);
205 P(hash_free) (struct P(hash_table) *t, void *ptr)
207 P(free) ((struct P(struct) *) t, ptr);
211 #define HASH_GIVE_INIT_KEY
213 P(hash_init_key) (struct P(hash_table) *t UNUSED, struct P(state) *s, struct P(state) *f, P(char_t) c)
215 bzero(s, sizeof(*s));
216 # ifdef KMP_INIT_STATE
217 struct P(struct) *kmp = (struct P(struct) *)t;
218 { KMP_INIT_STATE(kmp, s); }
222 s->next = f->back; /* the pointers hold the link-list of sons... changed in build() */
227 #define HASH_PREFIX(x) KMP_PREFIX(hash_##x)
228 #define HASH_NODE struct KMP_PREFIX(state)
229 #define HASH_KEY_COMPLEX(x) x from, x c
230 #define HASH_KEY_DECL struct KMP_PREFIX(state) *from, KMP_PREFIX(char_t) c
231 #define HASH_WANT_NEW
232 #define HASH_WANT_FIND
233 #ifdef KMP_WANT_CLEANUP
234 #define HASH_WANT_CLEANUP
236 #if defined(KMP_USE_POOL)
237 #define HASH_USE_POOL KMP_USE_POOL
239 #define HASH_AUTO_POOL 4096
241 #define HASH_CONSERVE_SPACE
242 #define HASH_TABLE_DYNAMIC
243 #include "lib/hashtable.h"
244 #define P(x) KMP_PREFIX(x)
247 struct P(hash_table) hash; /* hash table of state transitions */
248 struct P(state) null; /* null state */
253 } u; /* user-defined data */
257 typedef KMP_SOURCE P(source_t);
259 typedef char *P(source_t);
264 P(get_char) (struct P(struct) *kmp UNUSED, P(source_t) *src UNUSED, P(char_t) *c UNUSED)
266 return KMP_GET_CHAR(kmp, (*src), (*c));
269 # if defined(KMP_USE_UTF8)
270 # include "lib/unicode.h"
271 # if defined(KMP_ONLYALPHA) || defined(KMP_TOLOWER) || defined(KMP_UNACCENT)
272 # include "charset/unicat.h"
274 # elif defined(KMP_USE_ASCII)
275 # if defined(KMP_ONLYALPHA) || defined(KMP_TOLOWER)
276 # include "lib/chartype.h"
280 P(get_char) (struct P(struct) *kmp UNUSED, P(source_t) *src, P(char_t) *c)
284 *src = utf8_get(*src, &cc);
285 # ifdef KMP_ONLYALPHA
287 else if (!Ualpha(cc))
301 # ifdef KMP_ONLYALPHA
303 else if (!Calpha(cc))
311 # error Do not know how to unaccent ASCII characters
319 static struct P(state) *
320 P(add) (struct P(struct) *kmp, P(source_t) src
321 # ifdef KMP_ADD_EXTRA_ARGS
327 { KMP_ADD_INIT(kmp, src); }
331 if (!P(get_char)(kmp, &src, &c))
333 struct P(state) *p = &kmp->null, *s;
337 s = P(hash_find)(&kmp->hash, p, c);
341 s = P(hash_new)(&kmp->hash, p, c);
343 if (!(P(get_char)(kmp, &src, &c)))
350 while (P(get_char)(kmp, &src, &c));
354 { KMP_ADD_DUP(kmp, src, s); }
361 { KMP_ADD_NEW(kmp, src, s); }
367 P(init) (struct P(struct) *kmp)
369 bzero(&kmp->null, sizeof(struct P(state)));
370 P(hash_init)(&kmp->hash);
373 #ifdef KMP_WANT_CLEANUP
375 P(cleanup) (struct P(struct) *kmp)
377 P(hash_cleanup)(&kmp->hash);
382 P(empty) (struct P(struct) *kmp)
384 return !kmp->hash.hash_count;
387 static inline struct P(state) *
388 P(chain_start) (struct P(state) *s)
390 return s->len ? s : s->next;
394 P(build) (struct P(struct) *kmp)
398 uns read = 0, write = 0;
399 struct P(state) *fifo[kmp->hash.hash_count], *null = &kmp->null;
400 for (struct P(state) *s = null->back; s; s = s->next)
403 # ifdef KMP_BUILD_STATE
404 { KMP_BUILD_STATE(kmp, null); }
406 while (read != write)
408 struct P(state) *s = fifo[read++], *t;
409 for (t = s->back; t; t = t->next)
411 for (t = s->from->back; 1; t = t->back)
419 s->back = P(hash_find)(&kmp->hash, t, s->c);
422 s->next = s->back->len ? s->back : s->back->next;
426 # ifdef KMP_BUILD_STATE
427 { KMP_BUILD_STATE(kmp, s); }
437 #undef KMP_STATE_VARS
444 #undef KMP_CONTROL_CHAR
445 #undef KMP_ADD_EXTRA_ARGS
449 #undef KMP_INIT_STATE
450 #undef KMP_BUILD_STATE
452 #undef KMP_GIVE_ALLOC
453 #undef KMP_GIVE_HASHFN
456 #ifdef KMP_WANT_SEARCH
457 # undef KMP_WANT_SEARCH
458 # define KMPS_PREFIX(x) KMP_PREFIX(x)
459 # define KMPS_KMP_PREFIX(x) KMP_PREFIX(x)
460 # include "lib/kmp-search.h"