2 * Knuth-Morris-Pratt's search automat for N given strings
4 * (c) 1999--2005, Robert Spalek <robert@ucw.cz>
6 * Set KMP_TRANSLATE to one of {UNICODE,ASCII,NO}_TRANSLATE.
8 * Don't touch this file, because it is a mess.
14 #include "lib/lists.h"
18 #define MF_ONLYALPHA 4
19 /* how to modify characters in the string */
20 #define CONTROL_CHAR ':'
21 /* all non-alphabetic characters are treated as CONTROL_CHAR */
23 #define UNICODE_TRANSLATE(new_c, c, flags) do { \
27 if (flags & MF_TOLOWER) \
29 if (flags & MF_UNACCENT) \
31 if (flags & MF_ONLYALPHA && !Ualpha(c)) \
36 #define ASCII_TRANSLATE(new_c, c, flags) do { \
40 if (flags & MF_TOLOWER) \
42 if (flags & MF_ONLYALPHA && !Calpha(c)) \
47 #define NO_TRANSLATE(new_c, c, flags) do { new_c = c; } while (0)
49 typedef uns kmp_state_t;
50 typedef word kmp_char_t;
52 struct kmp_transition {
53 struct node n; /* link list of sons for a given node */
54 struct kmp_transition *next; /* collision in the hash-table of all transitions */
58 struct kmp_transitions {
60 struct list *sons; /* link-list of all sons for each given node */
62 struct kmp_transition **chain; /* hash-table of [node, char]->son */
66 struct kmp_output *next; /* output link list for every node */
74 int modify_flags; /* which nocase/noaccent mode is this kmp for */
75 int words_len; /* total length of searched words */
76 struct kmp_transitions g; /* hash table of forward transitions of automat */
77 kmp_state_t *f; /* back transitions of automat */
78 struct kmp_output **out; /* found words for every state */
82 struct kmp *kmp_new(struct mempool *mp, int words_len, uns modify_flags);
83 void kmp_enter_raw_string(struct kmp *kmp, const byte *str, uns id);
84 void kmp_build(struct kmp *kmp);
87 kmp_get_char(const byte **str, kmp_char_t *c, uns modify_flags UNUSED)
93 GET_TAGGED_CHAR((*str), w);
96 KMP_TRANSLATE(new_c, w, modify_flags);
97 if (new_c != CONTROL_CHAR || *c != CONTROL_CHAR)
106 kmp_enter_string(struct kmp *kmp, const byte *str, uns id)
108 /* Ugly hack to avoid linking libucw with libcharset:
109 * Convert the entered string here in the header and then call the
110 * library function. This function calls kmp_get_char() without any
112 byte buf[strlen(str)+1], *str2 = buf;
116 kmp_get_char(&str, &c, kmp->modify_flags);
117 str2 = utf8_put(str2, c);
120 kmp_enter_raw_string(kmp, buf, id);
124 struct node n; /* strings with non-zero frequency are put into a link-list */
129 transition_hashf(struct kmp_transitions *l UNUSED, struct kmp_transition *tr)
131 return tr->from + (tr->c << 16);
135 transition_compare(struct kmp_transition *a, struct kmp_transition *b)
137 if (a->from == b->from && a->c == b->c)
143 static inline struct kmp_transition **
144 transition_search(struct kmp_transitions *l, struct kmp_transition *tr)
146 uns hf = transition_hashf(l, tr) % l->hash_size;
147 struct kmp_transition **last = l->chain + hf;
148 while (*last && transition_compare(*last, tr))
149 last = &(*last)->next;
155 add_result(struct list *nonzeroes, struct kmp_result *freq, struct kmp_output *out)
157 for (; out; out = out->next)
158 if (!freq[out->id].occur++)
159 add_tail(nonzeroes, &freq[out->id].n);
163 kmp_search_internal(struct kmp *kmp, byte *str, uns len, struct list *nonzeroes, struct kmp_result *freq, struct kmp_output *out)
164 /* For every found string with id ID, it increments freq[ID].
165 * Also, it finds the longest among the leftmost matches. */
167 byte *str_end = str + len;
169 kmp_char_t c = CONTROL_CHAR;
170 struct kmp_transition tr, **prev;
172 if (kmp->words_len <= 1)
174 //TRACE(20, "kmp.c: Searching string %s", str);
175 byte *largest_match = NULL;
180 prev = transition_search(&kmp->g, &tr);
181 while (tr.from && !*prev)
183 tr.from = kmp->f[ tr.from ];
184 prev = transition_search(&kmp->g, &tr);
186 s = *prev ? (*prev)->to : 0;
188 add_result(nonzeroes, freq, kmp->out[s]);
189 /* Beware that out->len is measured in modified characters of
190 * the search pattern, hence it is not very reliable if you use
192 struct kmp_output *kout = kmp->out[s];
193 if (kout && (!largest_match || str - kout->len <= largest_match))
195 largest_match = str - kout->len;
204 kmp_get_char((const byte **)&str, &c, kmp->modify_flags);
207 /* Insert CONTROL_CHAR at the beginning and at the end too. */
212 return largest_match;
216 kmp_search(struct kmp *kmp, const byte *str, struct list *nonzeroes, struct kmp_result *freq)
218 kmp_search_internal(kmp, (byte*) str, strlen(str), nonzeroes, freq, NULL);
222 kmp_find_first(struct kmp *kmp, byte *str, uns len, struct kmp_output *out)
224 return kmp_search_internal(kmp, str, len, NULL, NULL, out);