--- /dev/null
+/*
+ * Knuth-Morris-Pratt's search automat for N given strings
+ * (taken from ad2 super-server and pruned :-)
+ *
+ * (c) 1999--2005, Robert Spalek <robert@ucw.cz>
+ */
+
+#include "lib/lib.h"
+#include "lib/mempool.h"
+#include "lib/lists.h"
+#include "sherlock/tagged-text.h"
+#include "lib/unicode.h"
+
+#define KMP_TRANSLATE(new_c, c, flags) NO_TRANSLATE(new_c, c, flags)
+#include "lib/kmp.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <alloca.h>
+
+#define TRACE(level, mask...) if (0) fprintf(stderr, mask)
+
+struct kmp *
+kmp_new(struct mempool *mp, int words_len, uns modify_flags)
+{
+ struct kmp *kmp = mp_alloc_zero(mp, sizeof(struct kmp));
+ kmp->mp = mp;
+ kmp->modify_flags = modify_flags;
+ kmp->words_len = words_len;
+ int size = words_len;
+ kmp->g.count = 1;
+ kmp->g.size = size;
+ kmp->g.sons = mp_alloc_zero(mp, size * sizeof(struct list));
+ init_list(kmp->g.sons + 0);
+ if (words_len > 1)
+ size = words_len * fls(words_len);
+ else
+ size = 1;
+ kmp->g.hash_size = size;
+ kmp->g.chain = mp_alloc_zero(mp, size * sizeof(struct kmp_transition *));
+ kmp->f = mp_alloc_zero(mp, words_len * sizeof(kmp_state_t));
+ kmp->out = mp_alloc_zero(mp, words_len * sizeof(struct kmp_output *));
+ return kmp;
+}
+
+/*
+ * The only merge operation is that son includes output of his father (and also
+ * his father,...), so we can merge the link-lists.
+ */
+static void
+merge_output(struct kmp_output **target, struct kmp_output *src)
+{
+ while (*target)
+ target = &(*target)->next;
+ *target = src;
+}
+
+static struct kmp_output *
+new_output(struct kmp *kmp, uns id, uns len)
+{
+ struct kmp_output *out = mp_alloc(kmp->mp, sizeof(struct kmp_output));
+ out->next = NULL;
+ out->id = id;
+ out->len = len;
+ return out;
+}
+
+void
+kmp_enter_raw_string(struct kmp *kmp, const byte *str, uns id)
+{
+ struct kmp_transition tr = { .next=NULL, .from=0 }, **prev;
+ struct kmp_output *new_out;
+ const byte *orig_str = str;
+ uns len = 0;
+ kmp_char_t c = 'a';
+
+ TRACE(20, "kmp.c: Entering string %s", str);
+ kmp_get_char(&str, &c, 0);
+ len++;
+ if (!c)
+ return;
+ while (c)
+ {
+ tr.c = c;
+ prev = transition_search(&kmp->g, &tr);
+ if (!*prev)
+ break;
+ tr.from = (*prev)->to;
+ kmp_get_char(&str, &c, 0);
+ len++;
+ }
+ while (c)
+ {
+ *prev = mp_alloc_zero(kmp->mp, sizeof(struct kmp_transition));
+ tr.to = kmp->g.count++;
+ **prev = tr;
+ add_tail(kmp->g.sons + tr.from, &(*prev)->n);
+ init_list(kmp->g.sons + tr.to);
+ kmp_get_char(&str, &c, 0);
+ len++;
+ tr.from = tr.to;
+ tr.c = c;
+ prev = transition_search(&kmp->g, &tr);
+ ASSERT(!*prev);
+ }
+ if (kmp->out[tr.from])
+ TRACE(5, "kmp.c: string %s is inserted more than once", orig_str);
+ new_out = new_output(kmp, id, len-1);
+ merge_output(kmp->out + tr.from, new_out);
+}
+
+static void
+construct_f_out(struct kmp *kmp)
+{
+ kmp_state_t *fifo;
+ int read, write;
+ struct kmp_transition *son;
+
+ fifo = alloca(kmp->words_len * sizeof(kmp_state_t));
+ read = write = 0;
+ kmp->f[0] = 0;
+ WALK_LIST(son, kmp->g.sons[0])
+ {
+ ASSERT(son->from == 0);
+ kmp->f[son->to] = 0;
+ fifo[write++] = son->to;
+ }
+ while (read != write)
+ {
+ kmp_state_t r, s, t;
+ r = fifo[read++];
+ WALK_LIST(son, kmp->g.sons[r])
+ {
+ struct kmp_transition tr, **prev;
+ ASSERT(son->from == r);
+ tr.c = son->c;
+ s = son->to;
+ fifo[write++] = s;
+ t = kmp->f[r];
+ while (1)
+ {
+ tr.from = t;
+ prev = transition_search(&kmp->g, &tr);
+ if (*prev || !tr.from)
+ break;
+ t = kmp->f[t];
+ }
+ kmp->f[s] = *prev ? (*prev)->to : 0;
+ merge_output(kmp->out + s, kmp->out[ kmp->f[s] ]);
+ }
+ }
+}
+
+void
+kmp_build(struct kmp *kmp)
+{
+ ASSERT(kmp->g.count <= kmp->words_len);
+ construct_f_out(kmp);
+ if (kmp->words_len > 1)
+ TRACE(0, "Built KMP with modify flags %d for total words len %d, it has %d nodes", kmp->modify_flags, kmp->words_len, kmp->g.count);
+}
--- /dev/null
+/*
+ * Knuth-Morris-Pratt's search automat for N given strings
+ *
+ * (c) 1999--2005, Robert Spalek <robert@ucw.cz>
+ *
+ * Set KMP_TRANSLATE to one of {UNICODE,ASCII,NO}_TRANSLATE.
+ *
+ * Don't touch this file, because it is a mess.
+ */
+
+#ifndef _UCW_KMP_H
+#define _UCW_KMP_H
+
+#include "lib/lists.h"
+
+#define MF_TOLOWER 1
+#define MF_UNACCENT 2
+#define MF_ONLYALPHA 4
+ /* how to modify characters in the string */
+#define CONTROL_CHAR ':'
+ /* all non-alphabetic characters are treated as CONTROL_CHAR */
+
+#define UNICODE_TRANSLATE(new_c, c, flags) do { \
+ if (!c) \
+ new_c = 0; \
+ else { \
+ if (flags & MF_TOLOWER) \
+ c = Utolower(c); \
+ if (flags & MF_UNACCENT) \
+ c = Uunaccent(c); \
+ if (flags & MF_ONLYALPHA && !Ualpha(c)) \
+ c = CONTROL_CHAR; \
+ new_c = c; \
+ } \
+} while (0)
+#define ASCII_TRANSLATE(new_c, c, flags) do { \
+ if (!c) \
+ new_c = 0; \
+ else { \
+ if (flags & MF_TOLOWER) \
+ c = Clocase(c); \
+ if (flags & MF_ONLYALPHA && !Calpha(c)) \
+ c = CONTROL_CHAR; \
+ new_c = c; \
+ } \
+} while (0)
+#define NO_TRANSLATE(new_c, c, flags) do { new_c = c; } while (0)
+
+typedef uns kmp_state_t;
+typedef word kmp_char_t;
+
+struct kmp_transition {
+ struct node n; /* link list of sons for a given node */
+ struct kmp_transition *next; /* collision in the hash-table of all transitions */
+ kmp_state_t from, to;
+ kmp_char_t c;
+};
+struct kmp_transitions {
+ int count, size;
+ struct list *sons; /* link-list of all sons for each given node */
+ uns hash_size;
+ struct kmp_transition **chain; /* hash-table of [node, char]->son */
+};
+
+struct kmp_output {
+ struct kmp_output *next; /* output link list for every node */
+ uns id;
+ uns len;
+};
+
+struct mempool;
+struct kmp {
+ struct mempool *mp;
+ int modify_flags; /* which nocase/noaccent mode is this kmp for */
+ int words_len; /* total length of searched words */
+ struct kmp_transitions g; /* hash table of forward transitions of automat */
+ kmp_state_t *f; /* back transitions of automat */
+ struct kmp_output **out; /* found words for every state */
+};
+
+/* kmp.c */
+struct kmp *kmp_new(struct mempool *mp, int words_len, uns modify_flags);
+void kmp_enter_raw_string(struct kmp *kmp, const byte *str, uns id);
+void kmp_build(struct kmp *kmp);
+
+static inline void
+kmp_get_char(const byte **str, kmp_char_t *c, uns modify_flags UNUSED)
+{
+ while (1)
+ {
+ uns w;
+ kmp_char_t new_c;
+ GET_TAGGED_CHAR((*str), w);
+ if (w >= 0x80000000)
+ w = CONTROL_CHAR;
+ KMP_TRANSLATE(new_c, w, modify_flags);
+ if (new_c != CONTROL_CHAR || *c != CONTROL_CHAR)
+ {
+ *c = new_c;
+ return;
+ }
+ }
+}
+
+static inline void
+kmp_enter_string(struct kmp *kmp, const byte *str, uns id)
+{
+ /* Ugly hack to avoid linking libucw with libcharset:
+ * Convert the entered string here in the header and then call the
+ * library function. This function calls kmp_get_char() without any
+ * conversion. */
+ byte buf[strlen(str)+1], *str2 = buf;
+ kmp_char_t c;
+ do
+ {
+ kmp_get_char(&str, &c, kmp->modify_flags);
+ str2 = utf8_put(str2, c);
+ }
+ while (c);
+ kmp_enter_raw_string(kmp, str2, id);
+}
+
+struct kmp_result {
+ struct node n; /* strings with non-zero frequency are put into a link-list */
+ uns occur;
+};
+
+static inline uns
+transition_hashf(struct kmp_transitions *l UNUSED, struct kmp_transition *tr)
+{
+ return tr->from + (tr->c << 16);
+}
+
+static inline int
+transition_compare(struct kmp_transition *a, struct kmp_transition *b)
+{
+ if (a->from == b->from && a->c == b->c)
+ return 0;
+ else
+ return 1;
+}
+
+static inline struct kmp_transition **
+transition_search(struct kmp_transitions *l, struct kmp_transition *tr)
+{
+ uns hf = transition_hashf(l, tr) % l->hash_size;
+ struct kmp_transition **last = l->chain + hf;
+ while (*last && transition_compare(*last, tr))
+ last = &(*last)->next;
+ ASSERT(last);
+ return last;
+}
+
+static inline void
+add_result(struct list *nonzeroes, struct kmp_result *freq, struct kmp_output *out)
+{
+ for (; out; out = out->next)
+ if (!freq[out->id].occur++)
+ add_tail(nonzeroes, &freq[out->id].n);
+}
+
+static inline byte *
+kmp_search_internal(struct kmp *kmp, byte *str, uns len, struct list *nonzeroes, struct kmp_result *freq, struct kmp_output *out)
+ /* For every found string with id ID, it increments freq[ID].
+ * Also, it finds the longest among the leftmost matches. */
+{
+ byte *str_end = str + len;
+ kmp_state_t s = 0;
+ kmp_char_t c = CONTROL_CHAR;
+ struct kmp_transition tr, **prev;
+ byte eof = 0;
+ if (kmp->words_len <= 1)
+ return NULL;
+ //TRACE(20, "kmp.c: Searching string %s", str);
+ byte *largest_match = NULL;
+ while (1)
+ {
+ tr.from = s;
+ tr.c = c;
+ prev = transition_search(&kmp->g, &tr);
+ while (tr.from && !*prev)
+ {
+ tr.from = kmp->f[ tr.from ];
+ prev = transition_search(&kmp->g, &tr);
+ }
+ s = *prev ? (*prev)->to : 0;
+ if (nonzeroes)
+ add_result(nonzeroes, freq, kmp->out[s]);
+ /* Beware that out->len is measured in modified characters of
+ * the search pattern, hence it is not very reliable if you use
+ * unaccenting. */
+ struct kmp_output *kout = kmp->out[s];
+ if (kout && (!largest_match || str - kout->len <= largest_match))
+ {
+ largest_match = str - kout->len;
+ if (out)
+ *out = *kout;
+ }
+ if (eof)
+ break;
+ if (str >= str_end)
+ c = 0;
+ else
+ kmp_get_char((const byte **)&str, &c, kmp->modify_flags);
+ if (!c)
+ {
+ /* Insert CONTROL_CHAR at the beginning and at the end too. */
+ c = CONTROL_CHAR;
+ eof = 1;
+ }
+ }
+ return largest_match;
+}
+
+static inline void
+kmp_search(struct kmp *kmp, const byte *str, struct list *nonzeroes, struct kmp_result *freq)
+{
+ kmp_search_internal(kmp, (byte*) str, strlen(str), nonzeroes, freq, NULL);
+}
+
+static inline byte *
+kmp_find_first(struct kmp *kmp, byte *str, uns len, struct kmp_output *out)
+{
+ return kmp_search_internal(kmp, str, len, NULL, NULL, out);
+}
+
+#endif