X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;ds=sidebyside;f=lib%2Fkmp.h;h=aa889f694d08b25209b71cc17970af7ad8144806;hb=c9bb8b99608f4442d493081b616f6707043ddb6c;hp=7d392bf95b4dbf87c338a0676443bc8a922ca969;hpb=38cc9359c327c57df93cd649d6e4c538e89619e9;p=libucw.git diff --git a/lib/kmp.h b/lib/kmp.h index 7d392bf9..aa889f69 100644 --- a/lib/kmp.h +++ b/lib/kmp.h @@ -1,225 +1,463 @@ /* - * Knuth-Morris-Pratt's Substring Search for N given strings - * - * (c) 1999--2005, Robert Spalek + * Knuth-Morris-Pratt's Substring Search for N given strings * - * This is a preprocessor template: you need to set KMP_GET_CHAR - * to a macro for reading a single character from the input and - * translating it according to the MF_xxx flags. See below for - * some pre-defined values. + * (c) 1999--2005, Robert Spalek + * (c) 2006, Pavel Charvat * - * Don't touch this file, it can bite. - * - * (In fact, the algorithm is usually referred to as Aho-McCorasick, - * but that's just an extension of KMP to multiple strings.) + * (In fact, the algorithm is usually referred to as Aho-McCorasick, + * but that's just an extension of KMP to multiple strings.) */ -#ifndef _UCW_KMP_H -#define _UCW_KMP_H +/* + * This is not a normal header file, it's a generator of KMP algorithm. + * Each time you include it with parameters set in the corresponding + * preprocessor macros, it generates KMP structures and functions + * with the parameters given. + * + * This file contains only construction of the automaton. The search + * itself can be generated by inclusion of file lib/kmp-search.h. + * Separeted headers allow the user to define multiple search + * routines for one common set of key strings. + * + * Example: + * + * #define KMP_PREFIX(x) kmp_##x + * #define KMP_WANT_CLEANUP + * #define KMP_WANT_SEARCH // includes lib/kmp-search.h automatically + * #define KMPS_FOUND(kmp,src,s) printf("found\n") + * #include "lib/kmp.h" + * + * [...] + * + * struct kmp_struct kmp; // a structure describing the whole automaton + * kmp_init(&kmp); // initialization (must be called before all other functions) + * + * // add key strings we want to search + * kmp_add(&kmp, "aaa"); + * kmp_add(&kmp, "abc"); + * + * // complete the automaton, no more strings can be added later + * kmp_build(&kmp); + * + * // example of search, should print single "found" to stdout + * kmp_run(&kmp, "aabaabca"); + * + * // destroy all internal structures + * kmp_cleanup(&kmp); + * + * + * Brief description of all parameters: + * + * Basic parameters: + * KMP_PREFIX(x) macro to add a name prefix (used on all global names + * defined by the KMP generator); mandatory; + * we abbreviate this to P(x) below + * + * KMP_CHAR alphabet type, the default is u16 + * + * KMP_SOURCE user-defined text source; KMP_GET_CHAR must + * KMP_GET_CHAR(kmp,src,c) return zero at the end or nonzero together with the next character in c otherwise; + * if not defined, zero-terminated array of bytes is used as the input + * + * KMP_VARS user-defined variables in 'struct P(struct)' + * -- a structure describing the whole automaton; + * these variables are stored in .u substructure to avoid collisions + * KMP_STATE_VARS user-defined variables in 'struct P(state)' + * -- created for each state of the automaton; + * these variables are stored in .u substructure to avoid collisions + * + * Parameters which select how the input is interpreted (if KMP_SOURCE is unset): + * KMP_USE_ASCII reads single bytes from the input (default) + * KMP_USE_UTF8 reads UTF-8 characters from the input (valid UTF-8 needed) + * KMP_TOLOWER converts all to lowercase + * KMP_UNACCENT removes accents + * KMP_ONLYALPHA converts non-alphas to KMP_CONTROL_CHAR (see below) + * + * Parameters controlling add(kmp, src): + * KMP_ADD_EXTRA_ARGS extra arguments, should be used carefully because of possible collisions + * KMP_ADD_INIT(kmp,src) called in the beginning of add(), src is the first + * KMP_INIT_STATE(kmp,s) initialization of a new state s (called before KMP_ADD_{NEW,DUP}); + * null state is not included and should be handled after init() if necessary; + * all user-defined data are filled by zeros before call to KMP_INIT_STATE + * KMP_ADD_NEW(kmp,src,s) initialize last state of every new key string (called after KMP_INIT_STATE); + * the string must be parsed before so src is after the last string's character + * KMP_ADD_DUP(kmp,src,s) analogy of KMP_ADD_NEW called for duplicates + * + * Parameters to build(): + * KMP_BUILD_STATE(kmp,s) called for all states (including null) in order of non-decreasing tree depth + * + * Other parameters: + * KMP_WANT_CLEANUP define cleanup() + * KMP_WANT_SEARCH includes lib/kmp-search.h with the same prefix; + * there can be multiple search variants for a single KMP automaton + * KMP_USE_POOL allocates in a given pool + * KMP_CONTROL_CHAR special control character (default is ':') + * KMP_GIVE_ALLOC if set, you must supply custom allocation functions: + * void *alloc(unsigned int size) -- allocate space for + * a state. Default is pooled allocation from a local pool or HASH_USE_POOL. + * void free(void *) -- the converse. + * KMP_GIVE_HASHFN if set, you must supply custom hash function: + * unsigned int hash(struct P(struct) *kmp, struct P(state) *state, KMP_CHAR c); + * default hash function works only for integer character types + * KMP_GIVE_EQ if set, you must supply custom compare function of two characters: + * int eq(struct P(struct) *kmp, KMP_CHAR a, KMP_CHAR b); + * default is 'a == b' + */ -#include "lib/lists.h" +#ifndef KMP_PREFIX +#error Missing KMP_PREFIX +#endif +#include "lib/mempool.h" +#include #include -/* - * Input conversion flags (the conversion is handled exclusively by the KMP_GET_CHAR - * macro, so you can define your own conversion modes, soo). - */ -#define MF_TOLOWER 1 -#define MF_UNACCENT 2 -#define MF_ONLYALPHA 4 /* Convert non-alphas to KMP_CONTROL_CHAR */ +#define P(x) KMP_PREFIX(x) -#define KMP_CONTROL_CHAR ':' +#ifdef KMP_CHAR +typedef KMP_CHAR P(char_t); +#else +typedef u16 P(char_t); +#endif -/* Pre-defined input functions */ +typedef u32 P(len_t); -#define KMP_GET_RAW(pos, c, flags) do { uns cc; pos = utf8_get(pos, &cc); c=cc; } while(0) +#ifdef KMP_NODE +typedef KMP_NODE P(node_t); +#else +typedef struct {} P(node_t); +#endif -#define KMP_GET_ASCII(pos, c, flags) do { \ - c = *pos++; \ - if (c) { \ - if (flags & MF_TOLOWER) \ - c = Clocase(c); \ - if (flags & MF_ONLYALPHA && !Calpha(c)) \ - c = KMP_CONTROL_CHAR; \ - } \ -} while (0) +struct P(struct); -/* Types and structures */ +struct P(state) { + struct P(state) *from; /* state with the previous character (forms a tree with null state in the root) */ + struct P(state) *back; /* backwards edge to the longest shorter state with same suffix */ + struct P(state) *next; /* the longest of shorter matches (or NULL) */ + P(len_t) len; /* state depth if it represents a key string, zero otherwise */ + P(char_t) c; /* last character of the represented string */ + struct { +# ifdef KMP_STATE_VARS + KMP_STATE_VARS +# endif + } u; /* user-defined data*/ +}; -typedef uns kmp_state_t; -typedef word kmp_char_t; +/* Control char */ +static inline P(char_t) +P(control) (void) +{ +# ifdef KMP_CONTROL_CHAR + return KMP_CONTROL_CHAR; +# else + return ':'; +# endif +} -struct kmp_transition { - struct node n; /* link list of sons for a given node */ - struct kmp_transition *next; /* collision in the hash-table of all transitions */ - kmp_state_t from, to; - kmp_char_t c; -}; -struct kmp_transitions { - int count, size; - struct list *sons; /* link-list of all sons for each given node */ - uns hash_size; - struct kmp_transition **chain; /* hash-table of [node, char]->son */ -}; +/* User-defined source */ +struct P(hash_table); -struct kmp_output { - struct kmp_output *next; /* output link list for every node */ - uns id; - uns len; -}; +#define HASH_GIVE_HASHFN +#ifdef KMP_GIVE_HASHFN +static inline uns +P(hash_hash) (struct P(hash_table) *t, struct P(state) *f, P(char_t) c) +{ + return P(hash) ((struct P(struct) *) t, f, c); +} +#else +static inline uns +P(hash_hash) (struct P(hash_table) *t UNUSED, struct P(state) *f, P(char_t) c) +{ + return (((uns)c) << 16) + (uns)(uintptr_t)f; +} +#endif -struct mempool; -struct kmp { - struct mempool *mp; - int modify_flags; /* which nocase/noaccent mode is this kmp for */ - int words_len; /* total length of searched words */ - struct kmp_transitions g; /* hash table of forward transitions of automat */ - kmp_state_t *f; /* back transitions of automat */ - struct kmp_output **out; /* found words for every state */ -}; +#ifndef KMP_GIVE_EQ +static inline int +P(eq) (struct P(struct) *kmp UNUSED, P(char_t) c1, P(char_t) c2) +{ + return c1 == c2; +} +#endif -struct kmp_result { - struct node n; /* strings with non-zero frequency are put into a link-list */ - uns occur; -}; +static inline int +P(is_control) (struct P(struct) *kmp, P(char_t) c) +{ + return P(eq) (kmp, c, P(control)()); +} -/* kmp.c */ -struct kmp *kmp_new(struct mempool *mp, int words_len, uns modify_flags); -void kmp_enter_raw_string(struct kmp *kmp, const byte *str, uns id); -void kmp_build(struct kmp *kmp); +#define HASH_GIVE_EQ +static inline int +P(hash_eq) (struct P(hash_table) *t, struct P(state) *f1, P(char_t) c1, struct P(state) *f2, P(char_t) c2) +{ + return f1 == f2 && P(eq)((struct P(struct) *) t, c1, c2); +} -static inline void -kmp_get_char(const byte **str, kmp_char_t *c, uns modify_flags UNUSED) -{ - while (1) - { - kmp_char_t new_c; - KMP_GET_CHAR((*str), new_c, modify_flags); - if (new_c != KMP_CONTROL_CHAR || *c != KMP_CONTROL_CHAR) - { - *c = new_c; - return; - } - } +#ifdef KMP_GIVE_ALLOC +#define HASH_GIVE_ALLOC +static inline void * +P(hash_alloc) (struct P(hash_table) *t, uns size) +{ + return P(alloc) ((struct P(struct) *) t, size); } static inline void -kmp_enter_string(struct kmp *kmp, const byte *str, uns id) -{ - /* To avoid dependencies between libucw and other libraries (which might - * be referenced by the KMP_GET_CHAR macro), we have to split kmp_enter_string() - * to a conversion wrapper (this function) and the rest, which resides in kmp.c - * and uses KMP_GET_RAW to read its input. - */ - byte buf[3*strlen(str)+1], *str2 = buf; - kmp_char_t c = 0; - do - { - kmp_get_char(&str, &c, kmp->modify_flags); - str2 = utf8_put(str2, c); - } - while (c); - kmp_enter_raw_string(kmp, buf, id); +P(hash_free) (struct P(hash_table) *t, void *ptr) +{ + P(free) ((struct P(struct) *) t, ptr); } +#endif -static inline uns -transition_hashf(struct kmp_transitions *l UNUSED, struct kmp_transition *tr) +#define HASH_GIVE_INIT_KEY +static inline void +P(hash_init_key) (struct P(hash_table) *t UNUSED, struct P(state) *s, struct P(state) *f, P(char_t) c) { - return tr->from + (tr->c << 16); + bzero(s, sizeof(*s)); +# ifdef KMP_INIT_STATE + struct P(struct) *kmp = (struct P(struct) *)t; + { KMP_INIT_STATE(kmp, s); } +# endif + s->from = f; + s->c = c; + s->next = f->back; /* the pointers hold the link-list of sons... changed in build() */ + f->back = s; } +#undef P +#define HASH_PREFIX(x) KMP_PREFIX(hash_##x) +#define HASH_NODE struct KMP_PREFIX(state) +#define HASH_KEY_COMPLEX(x) x from, x c +#define HASH_KEY_DECL struct KMP_PREFIX(state) *from, KMP_PREFIX(char_t) c +#define HASH_WANT_NEW +#define HASH_WANT_FIND +#ifdef KMP_WANT_CLEANUP +#define HASH_WANT_CLEANUP +#endif +#if defined(KMP_USE_POOL) +#define HASH_USE_POOL KMP_USE_POOL +#else +#define HASH_AUTO_POOL 4096 +#endif +#define HASH_CONSERVE_SPACE +#define HASH_TABLE_DYNAMIC +#include "lib/hashtable.h" +#define P(x) KMP_PREFIX(x) + +struct P(struct) { + struct P(hash_table) hash; /* hash table of state transitions */ + struct P(state) null; /* null state */ + struct { +# ifdef KMP_VARS + KMP_VARS +# endif + } u; /* user-defined data */ +}; + +#ifdef KMP_SOURCE +typedef KMP_SOURCE P(source_t); +#else +typedef byte *P(source_t); +#endif + +#ifdef KMP_GET_CHAR +static inline int +P(get_char) (struct P(struct) *kmp UNUSED, P(source_t) *src UNUSED, P(char_t) *c UNUSED) +{ + return KMP_GET_CHAR(kmp, (*src), (*c)); +} +#else +# if defined(KMP_USE_UTF8) +# include "lib/unicode.h" +# if defined(KMP_ONLYALPHA) || defined(KMP_TOLOWER) || defined(KMP_UNACCENT) +# include "charset/unicat.h" +# endif +# elif defined(KMP_USE_ASCII) +# if defined(KMP_ONLYALPHA) || defined(KMP_TOLOWER) +# include "lib/chartype.h" +# endif +# endif static inline int -transition_compare(struct kmp_transition *a, struct kmp_transition *b) +P(get_char) (struct P(struct) *kmp UNUSED, P(source_t) *src, P(char_t) *c) { - if (a->from == b->from && a->c == b->c) - return 0; - else - return 1; +# ifdef KMP_USE_UTF8 + uns cc; + *src = (byte *)utf8_get(*src, &cc); +# ifdef KMP_ONLYALPHA + if (!cc) {} + else if (!Ualpha(cc)) + cc = P(control)(); + else +# endif + { +# ifdef KMP_TOLOWER + cc = Utolower(cc); +# endif +# ifdef KMP_UNACCENT + cc = Uunaccent(cc); +# endif + } +# else + uns cc = *(*src)++; +# ifdef KMP_ONLYALPHA + if (!cc) {} + else if (!Calpha(cc)) + cc = P(control)(); + else +# endif +# ifdef KMP_TOLOWER + cc = Clocase(cc); +# endif +# ifdef KMP_UNACCENT +# error Do not know how to unaccent ASCII characters +# endif +# endif + *c = cc; + return !!cc; } +#endif -static inline struct kmp_transition ** -transition_search(struct kmp_transitions *l, struct kmp_transition *tr) +static struct P(state) * +P(add) (struct P(struct) *kmp, P(source_t) src +# ifdef KMP_ADD_EXTRA_ARGS + , KMP_ADD_EXTRA_ARGS +# endif +) { - uns hf = transition_hashf(l, tr) % l->hash_size; - struct kmp_transition **last = l->chain + hf; - while (*last && transition_compare(*last, tr)) - last = &(*last)->next; - ASSERT(last); - return last; +# ifdef KMP_ADD_INIT + { KMP_ADD_INIT(kmp, src); } +# endif + + P(char_t) c; + if (!P(get_char)(kmp, &src, &c)) + return NULL; + struct P(state) *p = &kmp->null, *s; + uns len = 0; + do + { + s = P(hash_find)(&kmp->hash, p, c); + if (!s) + for (;;) + { + s = P(hash_new)(&kmp->hash, p, c); + len++; + if (!(P(get_char)(kmp, &src, &c))) + goto enter_new; + p = s; + } + p = s; + len++; + } + while (P(get_char)(kmp, &src, &c)); + if (s->len) + { +# ifdef KMP_ADD_DUP + { KMP_ADD_DUP(kmp, src, s); } +# endif + return s; + } +enter_new: + s->len = len; +# ifdef KMP_ADD_NEW + { KMP_ADD_NEW(kmp, src, s); } +# endif + return s; } -static inline void -add_result(struct list *nonzeroes, struct kmp_result *freq, struct kmp_output *out) -{ - for (; out; out = out->next) - if (!freq[out->id].occur++) - add_tail(nonzeroes, &freq[out->id].n); -} - -static inline byte * -kmp_search_internal(struct kmp *kmp, byte *str, uns len, struct list *nonzeroes, struct kmp_result *freq, struct kmp_output *out) - /* For every found string with id ID, it increments freq[ID]. - * Also, it finds the longest among the leftmost matches. */ -{ - byte *str_end = str + len; - kmp_state_t s = 0; - kmp_char_t c = KMP_CONTROL_CHAR; - struct kmp_transition tr, **prev; - byte eof = 0; - if (kmp->words_len <= 1) - return NULL; - //TRACE(20, "kmp.c: Searching string %s", str); - byte *largest_match = NULL; - while (1) - { - tr.from = s; - tr.c = c; - prev = transition_search(&kmp->g, &tr); - while (tr.from && !*prev) - { - tr.from = kmp->f[ tr.from ]; - prev = transition_search(&kmp->g, &tr); - } - s = *prev ? (*prev)->to : 0; - if (nonzeroes) - add_result(nonzeroes, freq, kmp->out[s]); - /* Beware that out->len is measured in modified characters of - * the search pattern, hence it is not very reliable if you use - * unaccenting. */ - struct kmp_output *kout = kmp->out[s]; - if (kout && (!largest_match || str - kout->len <= largest_match)) - { - largest_match = str - kout->len; - if (out) - *out = *kout; - } - if (eof) - break; - if (str >= str_end) - c = 0; - else - kmp_get_char((const byte **)&str, &c, kmp->modify_flags); - if (!c) - { - /* Insert KMP_CONTROL_CHAR at the beginning and at the end too. */ - c = KMP_CONTROL_CHAR; - eof = 1; - } - } - return largest_match; +static void +P(init) (struct P(struct) *kmp) +{ + bzero(&kmp->null, sizeof(struct P(state))); + P(hash_init)(&kmp->hash); } +#ifdef KMP_WANT_CLEANUP static inline void -kmp_search(struct kmp *kmp, const byte *str, struct list *nonzeroes, struct kmp_result *freq) +P(cleanup) (struct P(struct) *kmp) { - kmp_search_internal(kmp, (byte*) str, strlen(str), nonzeroes, freq, NULL); + P(hash_cleanup)(&kmp->hash); } +#endif -static inline byte * -kmp_find_first(struct kmp *kmp, byte *str, uns len, struct kmp_output *out) +static inline int +P(empty) (struct P(struct) *kmp) +{ + return !kmp->hash.hash_count; +} + +static inline struct P(state) * +P(chain_start) (struct P(state) *s) +{ + return s->len ? s : s->next; +} + +static void +P(build) (struct P(struct) *kmp) { - return kmp_search_internal(kmp, str, len, NULL, NULL, out); + if (P(empty)(kmp)) + return; + uns read = 0, write = 0; + struct P(state) *fifo[kmp->hash.hash_count], *null = &kmp->null; + for (struct P(state) *s = null->back; s; s = s->next) + fifo[write++] = s; + null->back = NULL; +# ifdef KMP_BUILD_STATE + { KMP_BUILD_STATE(kmp, null); } +# endif + while (read != write) + { + struct P(state) *s = fifo[read++], *t; + for (t = s->back; t; t = t->next) + fifo[write++] = t; + for (t = s->from->back; 1; t = t->back) + { + if (!t) + { + s->back = null; + s->next = NULL; + break; + } + s->back = P(hash_find)(&kmp->hash, t, s->c); + if (s->back) + { + s->next = s->back->len ? s->back : s->back->next; + break; + } + } +# ifdef KMP_BUILD_STATE + { KMP_BUILD_STATE(kmp, s); } +# endif + } } +#undef P +#undef KMP_CHAR +#undef KMP_SOURCE +#undef KMP_GET_CHAR +#undef KMP_VARS +#undef KMP_STATE_VARS +#undef KMP_CONTEXT +#undef KMP_USE_ASCII +#undef KMP_USE_UTF8 +#undef KMP_TOLOWER +#undef KMP_UNACCENT +#undef KMP_ONLYALPHA +#undef KMP_CONTROL_CHAR +#undef KMP_ADD_EXTRA_ARGS +#undef KMP_ADD_INIT +#undef KMP_ADD_NEW +#undef KMP_ADD_DUP +#undef KMP_INIT_STATE +#undef KMP_BUILD_STATE +#undef KMP_USE_POOL +#undef KMP_GIVE_ALLOC +#undef KMP_GIVE_HASHFN +#undef KMP_GIVE_EQ + +#ifdef KMP_WANT_SEARCH +# undef KMP_WANT_SEARCH +# define KMPS_PREFIX(x) KMP_PREFIX(x) +# define KMPS_KMP_PREFIX(x) KMP_PREFIX(x) +# include "lib/kmp-search.h" #endif + +#undef KMP_PREFIX