2 * Knuth-Morris-Pratt's Substring Search for N given strings
4 * (c) 1999--2005, Robert Spalek <robert@ucw.cz>
6 * This is a preprocessor template: you need to set KMP_GET_CHAR
7 * to a macro for reading a single character from the input and
8 * translating it according to the MF_xxx flags. See below for
9 * some pre-defined values.
11 * Don't touch this file, it can bite.
13 * (In fact, the algorithm is usually referred to as Aho-McCorasick,
14 * but that's just an extension of KMP to multiple strings.)
20 #include "lib/lists.h"
23 * Input conversion flags (the conversion is handled exclusively by the KMP_GET_CHAR
24 * macro, so you can define your own conversion modes, soo).
28 #define MF_ONLYALPHA 4 /* Convert non-alphas to KMP_CONTROL_CHAR */
30 #define KMP_CONTROL_CHAR ':'
32 /* Pre-defined input functions */
34 #define KMP_GET_RAW(pos, c, flags) c=*pos++
36 #define KMP_GET_ASCII(pos, c, flags) do { \
39 if (flags & MF_TOLOWER) \
41 if (flags & MF_ONLYALPHA && !Calpha(c)) \
42 c = KMP_CONTROL_CHAR; \
46 /* Types and structures */
48 typedef uns kmp_state_t;
49 typedef word kmp_char_t;
51 struct kmp_transition {
52 struct node n; /* link list of sons for a given node */
53 struct kmp_transition *next; /* collision in the hash-table of all transitions */
57 struct kmp_transitions {
59 struct list *sons; /* link-list of all sons for each given node */
61 struct kmp_transition **chain; /* hash-table of [node, char]->son */
65 struct kmp_output *next; /* output link list for every node */
73 int modify_flags; /* which nocase/noaccent mode is this kmp for */
74 int words_len; /* total length of searched words */
75 struct kmp_transitions g; /* hash table of forward transitions of automat */
76 kmp_state_t *f; /* back transitions of automat */
77 struct kmp_output **out; /* found words for every state */
81 struct node n; /* strings with non-zero frequency are put into a link-list */
86 struct kmp *kmp_new(struct mempool *mp, int words_len, uns modify_flags);
87 void kmp_enter_raw_string(struct kmp *kmp, const byte *str, uns id);
88 void kmp_build(struct kmp *kmp);
91 kmp_get_char(const byte **str, kmp_char_t *c, uns modify_flags UNUSED)
96 KMP_GET_CHAR((*str), new_c, modify_flags);
97 if (new_c != KMP_CONTROL_CHAR || *c != KMP_CONTROL_CHAR)
106 kmp_enter_string(struct kmp *kmp, const byte *str, uns id)
108 /* To avoid dependencies between libucw and other libraries (which might
109 * be referenced by the KMP_GET_CHAR macro), we have to split kmp_enter_string()
110 * to a conversion wrapper (this function) and the rest, which resides in kmp.c
111 * and uses KMP_GET_RAW to read its input.
113 byte buf[3*strlen(str)+1], *str2 = buf;
117 kmp_get_char(&str, &c, kmp->modify_flags);
118 str2 = utf8_put(str2, c);
121 kmp_enter_raw_string(kmp, buf, id);
125 transition_hashf(struct kmp_transitions *l UNUSED, struct kmp_transition *tr)
127 return tr->from + (tr->c << 16);
131 transition_compare(struct kmp_transition *a, struct kmp_transition *b)
133 if (a->from == b->from && a->c == b->c)
139 static inline struct kmp_transition **
140 transition_search(struct kmp_transitions *l, struct kmp_transition *tr)
142 uns hf = transition_hashf(l, tr) % l->hash_size;
143 struct kmp_transition **last = l->chain + hf;
144 while (*last && transition_compare(*last, tr))
145 last = &(*last)->next;
151 add_result(struct list *nonzeroes, struct kmp_result *freq, struct kmp_output *out)
153 for (; out; out = out->next)
154 if (!freq[out->id].occur++)
155 add_tail(nonzeroes, &freq[out->id].n);
159 kmp_search_internal(struct kmp *kmp, byte *str, uns len, struct list *nonzeroes, struct kmp_result *freq, struct kmp_output *out)
160 /* For every found string with id ID, it increments freq[ID].
161 * Also, it finds the longest among the leftmost matches. */
163 byte *str_end = str + len;
165 kmp_char_t c = KMP_CONTROL_CHAR;
166 struct kmp_transition tr, **prev;
168 if (kmp->words_len <= 1)
170 //TRACE(20, "kmp.c: Searching string %s", str);
171 byte *largest_match = NULL;
176 prev = transition_search(&kmp->g, &tr);
177 while (tr.from && !*prev)
179 tr.from = kmp->f[ tr.from ];
180 prev = transition_search(&kmp->g, &tr);
182 s = *prev ? (*prev)->to : 0;
184 add_result(nonzeroes, freq, kmp->out[s]);
185 /* Beware that out->len is measured in modified characters of
186 * the search pattern, hence it is not very reliable if you use
188 struct kmp_output *kout = kmp->out[s];
189 if (kout && (!largest_match || str - kout->len <= largest_match))
191 largest_match = str - kout->len;
200 kmp_get_char((const byte **)&str, &c, kmp->modify_flags);
203 /* Insert KMP_CONTROL_CHAR at the beginning and at the end too. */
204 c = KMP_CONTROL_CHAR;
208 return largest_match;
212 kmp_search(struct kmp *kmp, const byte *str, struct list *nonzeroes, struct kmp_result *freq)
214 kmp_search_internal(kmp, (byte*) str, strlen(str), nonzeroes, freq, NULL);
218 kmp_find_first(struct kmp *kmp, byte *str, uns len, struct kmp_output *out)
220 return kmp_search_internal(kmp, str, len, NULL, NULL, out);