From d1dbec1886a409be6fe2442bc496888bb9eec2ce Mon Sep 17 00:00:00 2001 From: Robert Spalek Date: Tue, 20 Sep 2005 12:50:17 +0000 Subject: [PATCH] kmp search automaton moved from lang/ to lib/ --- lib/Makefile | 2 +- lib/kmp.c | 162 ++++++++++++++++++++++++++++++++++++ lib/kmp.h | 227 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 390 insertions(+), 1 deletion(-) create mode 100644 lib/kmp.c create mode 100644 lib/kmp.h diff --git a/lib/Makefile b/lib/Makefile index 2f06bb18..734ef01a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -9,7 +9,7 @@ endif LIBUCW_MODS= \ alloc alloc_str realloc mempool mempool-str mempool-fmt \ mmap pagecache partmap hashfunc \ - lists sorter bitsig \ + lists sorter bitsig kmp \ log log-file proctitle \ conf ipaccess \ profile \ diff --git a/lib/kmp.c b/lib/kmp.c new file mode 100644 index 00000000..15aff3d1 --- /dev/null +++ b/lib/kmp.c @@ -0,0 +1,162 @@ +/* + * Knuth-Morris-Pratt's search automat for N given strings + * (taken from ad2 super-server and pruned :-) + * + * (c) 1999--2005, Robert Spalek + */ + +#include "lib/lib.h" +#include "lib/mempool.h" +#include "lib/lists.h" +#include "sherlock/tagged-text.h" +#include "lib/unicode.h" + +#define KMP_TRANSLATE(new_c, c, flags) NO_TRANSLATE(new_c, c, flags) +#include "lib/kmp.h" + +#include +#include +#include +#include + +#define TRACE(level, mask...) if (0) fprintf(stderr, mask) + +struct kmp * +kmp_new(struct mempool *mp, int words_len, uns modify_flags) +{ + struct kmp *kmp = mp_alloc_zero(mp, sizeof(struct kmp)); + kmp->mp = mp; + kmp->modify_flags = modify_flags; + kmp->words_len = words_len; + int size = words_len; + kmp->g.count = 1; + kmp->g.size = size; + kmp->g.sons = mp_alloc_zero(mp, size * sizeof(struct list)); + init_list(kmp->g.sons + 0); + if (words_len > 1) + size = words_len * fls(words_len); + else + size = 1; + kmp->g.hash_size = size; + kmp->g.chain = mp_alloc_zero(mp, size * sizeof(struct kmp_transition *)); + kmp->f = mp_alloc_zero(mp, words_len * sizeof(kmp_state_t)); + kmp->out = mp_alloc_zero(mp, words_len * sizeof(struct kmp_output *)); + return kmp; +} + +/* + * The only merge operation is that son includes output of his father (and also + * his father,...), so we can merge the link-lists. + */ +static void +merge_output(struct kmp_output **target, struct kmp_output *src) +{ + while (*target) + target = &(*target)->next; + *target = src; +} + +static struct kmp_output * +new_output(struct kmp *kmp, uns id, uns len) +{ + struct kmp_output *out = mp_alloc(kmp->mp, sizeof(struct kmp_output)); + out->next = NULL; + out->id = id; + out->len = len; + return out; +} + +void +kmp_enter_raw_string(struct kmp *kmp, const byte *str, uns id) +{ + struct kmp_transition tr = { .next=NULL, .from=0 }, **prev; + struct kmp_output *new_out; + const byte *orig_str = str; + uns len = 0; + kmp_char_t c = 'a'; + + TRACE(20, "kmp.c: Entering string %s", str); + kmp_get_char(&str, &c, 0); + len++; + if (!c) + return; + while (c) + { + tr.c = c; + prev = transition_search(&kmp->g, &tr); + if (!*prev) + break; + tr.from = (*prev)->to; + kmp_get_char(&str, &c, 0); + len++; + } + while (c) + { + *prev = mp_alloc_zero(kmp->mp, sizeof(struct kmp_transition)); + tr.to = kmp->g.count++; + **prev = tr; + add_tail(kmp->g.sons + tr.from, &(*prev)->n); + init_list(kmp->g.sons + tr.to); + kmp_get_char(&str, &c, 0); + len++; + tr.from = tr.to; + tr.c = c; + prev = transition_search(&kmp->g, &tr); + ASSERT(!*prev); + } + if (kmp->out[tr.from]) + TRACE(5, "kmp.c: string %s is inserted more than once", orig_str); + new_out = new_output(kmp, id, len-1); + merge_output(kmp->out + tr.from, new_out); +} + +static void +construct_f_out(struct kmp *kmp) +{ + kmp_state_t *fifo; + int read, write; + struct kmp_transition *son; + + fifo = alloca(kmp->words_len * sizeof(kmp_state_t)); + read = write = 0; + kmp->f[0] = 0; + WALK_LIST(son, kmp->g.sons[0]) + { + ASSERT(son->from == 0); + kmp->f[son->to] = 0; + fifo[write++] = son->to; + } + while (read != write) + { + kmp_state_t r, s, t; + r = fifo[read++]; + WALK_LIST(son, kmp->g.sons[r]) + { + struct kmp_transition tr, **prev; + ASSERT(son->from == r); + tr.c = son->c; + s = son->to; + fifo[write++] = s; + t = kmp->f[r]; + while (1) + { + tr.from = t; + prev = transition_search(&kmp->g, &tr); + if (*prev || !tr.from) + break; + t = kmp->f[t]; + } + kmp->f[s] = *prev ? (*prev)->to : 0; + merge_output(kmp->out + s, kmp->out[ kmp->f[s] ]); + } + } +} + +void +kmp_build(struct kmp *kmp) +{ + ASSERT(kmp->g.count <= kmp->words_len); + construct_f_out(kmp); + if (kmp->words_len > 1) + TRACE(0, "Built KMP with modify flags %d for total words len %d, it has %d nodes", kmp->modify_flags, kmp->words_len, kmp->g.count); +} diff --git a/lib/kmp.h b/lib/kmp.h new file mode 100644 index 00000000..649ac4dd --- /dev/null +++ b/lib/kmp.h @@ -0,0 +1,227 @@ +/* + * Knuth-Morris-Pratt's search automat for N given strings + * + * (c) 1999--2005, Robert Spalek + * + * Set KMP_TRANSLATE to one of {UNICODE,ASCII,NO}_TRANSLATE. + * + * Don't touch this file, because it is a mess. + */ + +#ifndef _UCW_KMP_H +#define _UCW_KMP_H + +#include "lib/lists.h" + +#define MF_TOLOWER 1 +#define MF_UNACCENT 2 +#define MF_ONLYALPHA 4 + /* how to modify characters in the string */ +#define CONTROL_CHAR ':' + /* all non-alphabetic characters are treated as CONTROL_CHAR */ + +#define UNICODE_TRANSLATE(new_c, c, flags) do { \ + if (!c) \ + new_c = 0; \ + else { \ + if (flags & MF_TOLOWER) \ + c = Utolower(c); \ + if (flags & MF_UNACCENT) \ + c = Uunaccent(c); \ + if (flags & MF_ONLYALPHA && !Ualpha(c)) \ + c = CONTROL_CHAR; \ + new_c = c; \ + } \ +} while (0) +#define ASCII_TRANSLATE(new_c, c, flags) do { \ + if (!c) \ + new_c = 0; \ + else { \ + if (flags & MF_TOLOWER) \ + c = Clocase(c); \ + if (flags & MF_ONLYALPHA && !Calpha(c)) \ + c = CONTROL_CHAR; \ + new_c = c; \ + } \ +} while (0) +#define NO_TRANSLATE(new_c, c, flags) do { new_c = c; } while (0) + +typedef uns kmp_state_t; +typedef word kmp_char_t; + +struct kmp_transition { + struct node n; /* link list of sons for a given node */ + struct kmp_transition *next; /* collision in the hash-table of all transitions */ + kmp_state_t from, to; + kmp_char_t c; +}; +struct kmp_transitions { + int count, size; + struct list *sons; /* link-list of all sons for each given node */ + uns hash_size; + struct kmp_transition **chain; /* hash-table of [node, char]->son */ +}; + +struct kmp_output { + struct kmp_output *next; /* output link list for every node */ + uns id; + uns len; +}; + +struct mempool; +struct kmp { + struct mempool *mp; + int modify_flags; /* which nocase/noaccent mode is this kmp for */ + int words_len; /* total length of searched words */ + struct kmp_transitions g; /* hash table of forward transitions of automat */ + kmp_state_t *f; /* back transitions of automat */ + struct kmp_output **out; /* found words for every state */ +}; + +/* kmp.c */ +struct kmp *kmp_new(struct mempool *mp, int words_len, uns modify_flags); +void kmp_enter_raw_string(struct kmp *kmp, const byte *str, uns id); +void kmp_build(struct kmp *kmp); + +static inline void +kmp_get_char(const byte **str, kmp_char_t *c, uns modify_flags UNUSED) +{ + while (1) + { + uns w; + kmp_char_t new_c; + GET_TAGGED_CHAR((*str), w); + if (w >= 0x80000000) + w = CONTROL_CHAR; + KMP_TRANSLATE(new_c, w, modify_flags); + if (new_c != CONTROL_CHAR || *c != CONTROL_CHAR) + { + *c = new_c; + return; + } + } +} + +static inline void +kmp_enter_string(struct kmp *kmp, const byte *str, uns id) +{ + /* Ugly hack to avoid linking libucw with libcharset: + * Convert the entered string here in the header and then call the + * library function. This function calls kmp_get_char() without any + * conversion. */ + byte buf[strlen(str)+1], *str2 = buf; + kmp_char_t c; + do + { + kmp_get_char(&str, &c, kmp->modify_flags); + str2 = utf8_put(str2, c); + } + while (c); + kmp_enter_raw_string(kmp, str2, id); +} + +struct kmp_result { + struct node n; /* strings with non-zero frequency are put into a link-list */ + uns occur; +}; + +static inline uns +transition_hashf(struct kmp_transitions *l UNUSED, struct kmp_transition *tr) +{ + return tr->from + (tr->c << 16); +} + +static inline int +transition_compare(struct kmp_transition *a, struct kmp_transition *b) +{ + if (a->from == b->from && a->c == b->c) + return 0; + else + return 1; +} + +static inline struct kmp_transition ** +transition_search(struct kmp_transitions *l, struct kmp_transition *tr) +{ + uns hf = transition_hashf(l, tr) % l->hash_size; + struct kmp_transition **last = l->chain + hf; + while (*last && transition_compare(*last, tr)) + last = &(*last)->next; + ASSERT(last); + return last; +} + +static inline void +add_result(struct list *nonzeroes, struct kmp_result *freq, struct kmp_output *out) +{ + for (; out; out = out->next) + if (!freq[out->id].occur++) + add_tail(nonzeroes, &freq[out->id].n); +} + +static inline byte * +kmp_search_internal(struct kmp *kmp, byte *str, uns len, struct list *nonzeroes, struct kmp_result *freq, struct kmp_output *out) + /* For every found string with id ID, it increments freq[ID]. + * Also, it finds the longest among the leftmost matches. */ +{ + byte *str_end = str + len; + kmp_state_t s = 0; + kmp_char_t c = CONTROL_CHAR; + struct kmp_transition tr, **prev; + byte eof = 0; + if (kmp->words_len <= 1) + return NULL; + //TRACE(20, "kmp.c: Searching string %s", str); + byte *largest_match = NULL; + while (1) + { + tr.from = s; + tr.c = c; + prev = transition_search(&kmp->g, &tr); + while (tr.from && !*prev) + { + tr.from = kmp->f[ tr.from ]; + prev = transition_search(&kmp->g, &tr); + } + s = *prev ? (*prev)->to : 0; + if (nonzeroes) + add_result(nonzeroes, freq, kmp->out[s]); + /* Beware that out->len is measured in modified characters of + * the search pattern, hence it is not very reliable if you use + * unaccenting. */ + struct kmp_output *kout = kmp->out[s]; + if (kout && (!largest_match || str - kout->len <= largest_match)) + { + largest_match = str - kout->len; + if (out) + *out = *kout; + } + if (eof) + break; + if (str >= str_end) + c = 0; + else + kmp_get_char((const byte **)&str, &c, kmp->modify_flags); + if (!c) + { + /* Insert CONTROL_CHAR at the beginning and at the end too. */ + c = CONTROL_CHAR; + eof = 1; + } + } + return largest_match; +} + +static inline void +kmp_search(struct kmp *kmp, const byte *str, struct list *nonzeroes, struct kmp_result *freq) +{ + kmp_search_internal(kmp, (byte*) str, strlen(str), nonzeroes, freq, NULL); +} + +static inline byte * +kmp_find_first(struct kmp *kmp, byte *str, uns len, struct kmp_output *out) +{ + return kmp_search_internal(kmp, str, len, NULL, NULL, out); +} + +#endif -- 2.39.2