2 * Knuth-Morris-Pratt's Substring Search for N given strings
4 * (c) 1999--2005, Robert Spalek <robert@ucw.cz>
6 * This is a preprocessor template: you need to set KMP_GET_CHAR
7 * to a macro for reading a single character from the input and
8 * translating it according to the MF_xxx flags. See below for
9 * some pre-defined values.
11 * Don't touch this file, it can bite.
13 * (In fact, the algorithm is usually referred to as Aho-McCorasick,
14 * but that's just an extension of KMP to multiple strings.)
20 #include "lib/lists.h"
25 * Input conversion flags (the conversion is handled exclusively by the KMP_GET_CHAR
26 * macro, so you can define your own conversion modes, soo).
30 #define MF_ONLYALPHA 4 /* Convert non-alphas to KMP_CONTROL_CHAR */
32 #define KMP_CONTROL_CHAR ':'
34 /* Pre-defined input functions */
36 #define KMP_GET_RAW(pos, c, flags) c=*pos++
38 #define KMP_GET_ASCII(pos, c, flags) do { \
41 if (flags & MF_TOLOWER) \
43 if (flags & MF_ONLYALPHA && !Calpha(c)) \
44 c = KMP_CONTROL_CHAR; \
48 /* Types and structures */
50 typedef uns kmp_state_t;
51 typedef word kmp_char_t;
53 struct kmp_transition {
54 struct node n; /* link list of sons for a given node */
55 struct kmp_transition *next; /* collision in the hash-table of all transitions */
59 struct kmp_transitions {
61 struct list *sons; /* link-list of all sons for each given node */
63 struct kmp_transition **chain; /* hash-table of [node, char]->son */
67 struct kmp_output *next; /* output link list for every node */
75 int modify_flags; /* which nocase/noaccent mode is this kmp for */
76 int words_len; /* total length of searched words */
77 struct kmp_transitions g; /* hash table of forward transitions of automat */
78 kmp_state_t *f; /* back transitions of automat */
79 struct kmp_output **out; /* found words for every state */
83 struct node n; /* strings with non-zero frequency are put into a link-list */
88 struct kmp *kmp_new(struct mempool *mp, int words_len, uns modify_flags);
89 void kmp_enter_raw_string(struct kmp *kmp, const byte *str, uns id);
90 void kmp_build(struct kmp *kmp);
93 kmp_get_char(const byte **str, kmp_char_t *c, uns modify_flags UNUSED)
98 KMP_GET_CHAR((*str), new_c, modify_flags);
99 if (new_c != KMP_CONTROL_CHAR || *c != KMP_CONTROL_CHAR)
108 kmp_enter_string(struct kmp *kmp, const byte *str, uns id)
110 /* To avoid dependencies between libucw and other libraries (which might
111 * be referenced by the KMP_GET_CHAR macro), we have to split kmp_enter_string()
112 * to a conversion wrapper (this function) and the rest, which resides in kmp.c
113 * and uses KMP_GET_RAW to read its input.
115 byte buf[3*strlen(str)+1], *str2 = buf;
119 kmp_get_char(&str, &c, kmp->modify_flags);
120 str2 = utf8_put(str2, c);
123 kmp_enter_raw_string(kmp, buf, id);
127 transition_hashf(struct kmp_transitions *l UNUSED, struct kmp_transition *tr)
129 return tr->from + (tr->c << 16);
133 transition_compare(struct kmp_transition *a, struct kmp_transition *b)
135 if (a->from == b->from && a->c == b->c)
141 static inline struct kmp_transition **
142 transition_search(struct kmp_transitions *l, struct kmp_transition *tr)
144 uns hf = transition_hashf(l, tr) % l->hash_size;
145 struct kmp_transition **last = l->chain + hf;
146 while (*last && transition_compare(*last, tr))
147 last = &(*last)->next;
153 add_result(struct list *nonzeroes, struct kmp_result *freq, struct kmp_output *out)
155 for (; out; out = out->next)
156 if (!freq[out->id].occur++)
157 add_tail(nonzeroes, &freq[out->id].n);
161 kmp_search_internal(struct kmp *kmp, byte *str, uns len, struct list *nonzeroes, struct kmp_result *freq, struct kmp_output *out)
162 /* For every found string with id ID, it increments freq[ID].
163 * Also, it finds the longest among the leftmost matches. */
165 byte *str_end = str + len;
167 kmp_char_t c = KMP_CONTROL_CHAR;
168 struct kmp_transition tr, **prev;
170 if (kmp->words_len <= 1)
172 //TRACE(20, "kmp.c: Searching string %s", str);
173 byte *largest_match = NULL;
178 prev = transition_search(&kmp->g, &tr);
179 while (tr.from && !*prev)
181 tr.from = kmp->f[ tr.from ];
182 prev = transition_search(&kmp->g, &tr);
184 s = *prev ? (*prev)->to : 0;
186 add_result(nonzeroes, freq, kmp->out[s]);
187 /* Beware that out->len is measured in modified characters of
188 * the search pattern, hence it is not very reliable if you use
190 struct kmp_output *kout = kmp->out[s];
191 if (kout && (!largest_match || str - kout->len <= largest_match))
193 largest_match = str - kout->len;
202 kmp_get_char((const byte **)&str, &c, kmp->modify_flags);
205 /* Insert KMP_CONTROL_CHAR at the beginning and at the end too. */
206 c = KMP_CONTROL_CHAR;
210 return largest_match;
214 kmp_search(struct kmp *kmp, const byte *str, struct list *nonzeroes, struct kmp_result *freq)
216 kmp_search_internal(kmp, (byte*) str, strlen(str), nonzeroes, freq, NULL);
220 kmp_find_first(struct kmp *kmp, byte *str, uns len, struct kmp_output *out)
222 return kmp_search_internal(kmp, str, len, NULL, NULL, out);