2 * Knuth-Morris-Pratt's Substring Search for N given strings
4 * (c) 1999--2005, Robert Spalek <robert@ucw.cz>
5 * (c) 2006, Pavel Charvat <pchar@ucw.cz>
7 * (In fact, the algorithm is usually referred to as Aho-McCorasick,
8 * but that's just an extension of KMP to multiple strings.)
12 * This is not a normal header file, it's a generator of KMP algorithm.
13 * Each time you include it with parameters set in the corresponding
14 * preprocessor macros, it generates KMP structures and functions
15 * with the parameters given.
19 * KMP_PREFIX(x) macro to add a name prefix (used on all global names
20 * defined by the KMP generator); mandatory
22 * KMP_CHAR alphabet type, the default is u16
24 * KMP_SOURCE user-defined text source; KMP_GET_CHAR must
25 * KMP_GET_CHAR(kmp,src,c) return next character from the input or zero at the end;
26 * if not defined, zero-terminated array of bytes is used as the input
28 * KMP_VARS user-defined data in main structure (a structure describing
29 * the whole automaton)
30 * KMP_STATE_VARS user-defined data in each state of the automaton
32 * Parameters which select how the input is interpreted (if KMP_SOURCE is unset):
33 * KMP_USE_ASCII reads single bytes from the input (default)
34 * KMP_USE_UTF8 reads UTF-8 characters from the input (valid UTF-8 needed)
35 * KMP_TOLOWER converts all to lowercase
36 * KMP_UNACCENT removes accents
37 * KMP_ONLYALPHA converts non-alphas to KMP_CONTROL_CHAR
38 * KMP_CONTROL_CHAR special control character (default is ':')
40 * Parameters controlling add():
41 * KMP_ADD_EXTRA_ARGS extra arguments
42 * KMP_ADD_INIT(kmp,src)
43 * KMP_ADD_NEW(kmp,src,s)
44 * KMP_ADD_DUP(kmp,src,s)
45 * KMP_INIT_STATE(kmp,s) initialize new state (called before KMP_ADD_{NEW,DUP})
47 * Parameters to build():
48 * KMP_BUILD_STATE(kmp,s) called for all states (including null) in order of non-decreasing tree depth
51 * KMP_WANT_CLEANUP define cleanup()
52 * KMP_WANT_SEARCH includes lib/kmp-search.h with the same prefix;
53 * there can be multiple search variants for a single KMP structure
55 * KMP_USE_POOL allocates in a given pool
63 #error Missing KMP_PREFIX
66 #include "lib/mempool.h"
70 #define P(x) KMP_PREFIX(x)
73 typedef KMP_CHAR P(char_t);
75 typedef u16 P(char_t);
81 typedef KMP_NODE P(node_t);
83 typedef struct {} P(node_t);
89 struct P(state) *from; /* state with previous character */
90 struct P(state) *back; /* backwards edge to the largest shorter state */
91 struct P(state) *next; /* largest shorter match */
92 P(len_t) len; /* largest match, zero otherwise */
93 P(char_t) c; /* last character */
95 # ifdef KMP_STATE_VARS
98 } u; /* user-defined data*/
102 static inline P(char_t)
105 # ifdef KMP_CONTROL_CHAR
106 return KMP_CONTROL_CHAR;
112 /* User-defined source */
113 struct P(hash_table);
115 #define HASH_GIVE_HASHFN
116 #ifdef KMP_GIVE_HASHFN
118 P(hash_hash) (struct P(hash_table) *t, struct P(state) *f, P(char_t) c)
120 return P(hash) ((struct P(struct) *) t, f, c);
124 P(hash_hash) (struct P(hash_table) *t UNUSED, struct P(state) *f, P(char_t) c)
126 return (((uns)c) << 16) + (uns)(addr_int_t)f;
132 P(eq) (struct P(struct) *kmp UNUSED, P(char_t) c1, P(char_t) c2)
139 P(is_control) (struct P(struct) *kmp, P(char_t) c)
141 return P(eq) (kmp, c, P(control)());
146 P(hash_eq) (struct P(hash_table) *t, struct P(state) *f1, P(char_t) c1, struct P(state) *f2, P(char_t) c2)
148 return f1 == f2 && P(eq)((struct P(struct) *) t, c1, c2);
151 #ifdef KMP_GIVE_ALLOC
152 #define HASH_GIVE_ALLOC
154 P(hash_alloc) (struct P(hash_table) *t, uns size)
156 return P(alloc) ((struct P(struct) *) t, size);
160 P(hash_free) (struct P(hash_table) *t, void *ptr)
162 P(free) ((struct P(struct) *) t, ptr);
166 #define HASH_GIVE_INIT_KEY
168 P(hash_init_key) (struct P(hash_table) *t UNUSED, struct P(state) *s, struct P(state) *f, P(char_t) c)
170 bzero(s, sizeof(*s));
171 # ifdef KMP_INIT_STATE
172 struct P(struct) *kmp = (struct P(struct) *)t;
173 { KMP_INIT_STATE(kmp, s); }
177 s->next = f->back; /* the pointers hold the link-list of sons... changed in build() */
182 #define HASH_PREFIX(x) KMP_PREFIX(GLUE(hash_,x))
183 #define HASH_NODE struct KMP_PREFIX(state)
184 #define HASH_KEY_COMPLEX(x) x from, x c
185 #define HASH_KEY_DECL struct KMP_PREFIX(state) *from, KMP_PREFIX(char_t) c
186 #define HASH_WANT_NEW
187 #define HASH_WANT_FIND
188 #ifdef KMP_WANT_CLEANUP
189 #define HASH_WANT_CLEANUP
191 #if defined(KMP_USE_POOL)
192 #define HASH_USE_POOL KMP_USE_POOL
194 #define HASH_AUTO_POOL 4096
196 #define HASH_CONSERVE_SPACE
197 #define HASH_TABLE_DYNAMIC
198 #include "lib/hashtable.h"
199 #define P(x) KMP_PREFIX(x)
202 struct P(hash_table) hash; /* hash table of state transitions */
203 struct P(state) null; /* null state */
208 } u; /* user-defined data */
212 typedef KMP_SOURCE P(source_t);
214 typedef byte *P(source_t);
219 P(get_char) (struct P(struct) *kmp UNUSED, P(source_t) *src UNUSED, P(char_t) *c UNUSED)
221 return KMP_GET_CHAR(kmp, (*src), (*c));
224 # if defined(KMP_USE_UTF8)
225 # include "lib/unicode.h"
226 # if defined(KMP_ONLYALPHA) || defined(KMP_TOLOWER) || defined(KMP_UNACCENT)
227 # include "charset/unicat.h"
229 # elif defined(KMP_USE_ASCII)
230 # if defined(KMP_ONLYALPHA) || defined(KMP_TOLOWER)
231 # include "lib/chartype.h"
235 P(get_char) (struct P(struct) *kmp UNUSED, P(source_t) *src, P(char_t) *c)
239 *src = (byte *)utf8_get(*src, &cc);
240 # ifdef KMP_ONLYALPHA
242 else if (!Ualpha(cc))
256 # ifdef KMP_ONLYALPHA
258 else if (!Calpha(cc))
266 # error Do not know how to unaccent ASCII characters
274 static struct P(state) *
275 P(add) (struct P(struct) *kmp, P(source_t) src
276 # ifdef KMP_ADD_EXTRA_ARGS
282 { KMP_ADD_INIT(kmp, src); }
286 if (!P(get_char)(kmp, &src, &c))
288 struct P(state) *p = &kmp->null, *s;
292 s = P(hash_find)(&kmp->hash, p, c);
296 s = P(hash_new)(&kmp->hash, p, c);
298 if (!(P(get_char)(kmp, &src, &c)))
305 while (P(get_char)(kmp, &src, &c));
309 { KMP_ADD_DUP(kmp, src, s); }
316 { KMP_ADD_NEW(kmp, src, s); }
322 P(init) (struct P(struct) *kmp)
324 bzero(&kmp->null, sizeof(struct P(state)));
325 P(hash_init)(&kmp->hash);
328 #ifdef KMP_WANT_CLEANUP
330 P(cleanup) (struct P(struct) *kmp)
332 P(hash_cleanup)(&kmp->hash);
337 P(empty) (struct P(struct) *kmp)
339 return !kmp->hash.hash_count;
342 static inline struct P(state) *
343 P(chain_start) (struct P(state) *s)
345 return s->len ? s : s->next;
349 P(build) (struct P(struct) *kmp)
353 uns read = 0, write = 0;
354 struct P(state) *fifo[kmp->hash.hash_count], *null = &kmp->null;
355 for (struct P(state) *s = null->back; s; s = s->next)
358 # ifdef KMP_BUILD_STATE
359 { KMP_BUILD_STATE(kmp, null); }
361 while (read != write)
363 struct P(state) *s = fifo[read++], *t;
364 for (t = s->back; t; t = t->next)
366 for (t = s->from->back; 1; t = t->back)
374 s->back = P(hash_find)(&kmp->hash, t, s->c);
377 s->next = s->back->len ? s->back : s->back->next;
381 # ifdef KMP_BUILD_STATE
382 { KMP_BUILD_STATE(kmp, s); }
392 #undef KMP_STATE_VARS
399 #undef KMP_CONTROL_CHAR
400 #undef KMP_ADD_EXTRA_ARGS
404 #undef KMP_INIT_STATE
405 #undef KMP_BUILD_STATE
407 #undef KMP_GIVE_ALLOC
408 #undef KMP_GIVE_HASHFN
411 #ifdef KMP_WANT_SEARCH
412 # undef KMP_WANT_SEARCH
413 # define KMPS_PREFIX(x) KMP_PREFIX(x)
414 # define KMPS_KMP_PREFIX(x) KMP_PREFIX(x)
415 # include "lib/kmp-search.h"