From: Pavel Charvat Date: Thu, 20 Apr 2006 07:19:58 +0000 (+0200) Subject: Removed old KMP completely X-Git-Tag: holmes-import~645^2~19 X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;h=42b76ab379f0419d9521c9028ca41ae82b6ce890;p=libucw.git Removed old KMP completely --- diff --git a/lib/Makefile b/lib/Makefile index a026f39e..8555ad27 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -9,7 +9,7 @@ endif LIBUCW_MODS= \ alloc alloc_str realloc mempool mempool-str mempool-fmt \ mmap pagecache partmap hashfunc \ - lists slists sorter bitsig kmp \ + lists slists sorter bitsig \ log log-file proctitle \ conf ipaccess \ profile \ diff --git a/lib/kmp-new.h b/lib/kmp-new.h deleted file mode 100644 index 2ce01b3a..00000000 --- a/lib/kmp-new.h +++ /dev/null @@ -1,416 +0,0 @@ -/* - * Knuth-Morris-Pratt's Substring Search for N given strings - * - * (c) 1999--2005, Robert Spalek - * (c) 2006, Pavel Charvat - * - * (In fact, the algorithm is usually referred to as Aho-McCorasick, - * but that's just an extension of KMP to multiple strings.) - */ - -/* - * This is not a normal header file, it's a generator of KMP algorithm. - * Each time you include it with parameters set in the corresponding - * preprocessor macros, it generates KMP structures and functions - * with the parameters given. - * - * - * Basic parameters: - * KMP_PREFIX(x) macro to add a name prefix (used on all global names - * defined by the KMP generator); mandatory - * - * KMP_CHAR alphabet type, the default is u16 - * - * KMP_SOURCE user-defined text source; KMP_GET_CHAR must - * KMP_GET_CHAR(ctx,src,c) return next character from the input or zero at the end; - * if not defined, zero-terminated array of bytes is used as the input - * - * KMP_NODE user-defined data in each state of the automaton - * KMP_CONTEXT user-defined data in struct context (a structure describing - * the whole automaton) - * - * Parameters which select how the input is interpreted (if KMP_SOURCE is unset): - * KMP_USE_ASCII reads single bytes from the input (default) - * KMP_USE_UTF8 reads UTF-8 characters from the input (valid UTF-8 needed) - * KMP_TOLOWER converts all to lowercase - * KMP_UNACCENT removes accents - * KMP_ONLYALPHA converts non-alphas to KMP_CONTROL_CHAR - * KMP_CONTROL_CHAR special control character (default is ':') - * - * Parameters controlling add(): - * KMP_ADD_EXTRA_ARGS extra arguments - * KMP_ADD_EXTRA_VAR structure with extra local variables - * KMP_ADD_INIT(ctx,src,v) - * KMP_ADD_NEW(ctx,src,v,s) - * KMP_ADD_DUP(ctx,src,v,s) - * KMP_NO_DUPS no support for duplicates - * - * Parameters to build(): - * KMP_BUILD_STATE(ctx,s) called for all states (including null) in order of non-decreasing tree depth - * - * Other parameters: - * KMP_WANT_CLEANUP define cleanup() - * KMP_WANT_SEARCH includes lib/kmp-search.h with the same prefix; - * there can be multiple search variants for a single KMP structure - * - * KMP_USE_POOL allocates in a given pool - * - * KMP_GIVE_ALLOC - * KMP_GIVE_HASHFN - * KMP_GIVE_EQ - */ - -#ifndef KMP_PREFIX -#error Missing KMP_PREFIX -#endif - -#include "lib/mempool.h" -#include -#include - -#define P(x) KMP_PREFIX(x) - -#ifdef KMP_CHAR -typedef KMP_CHAR P(char_t); -#else -typedef u16 P(char_t); -#endif - -typedef u32 P(len_t); - -#ifdef KMP_NODE -typedef KMP_NODE P(node_t); -#else -typedef struct {} P(node_t); -#endif - -struct P(context); - -struct P(state) { - struct P(state) *from; /* state with previous character */ - struct P(state) *back; /* backwards edge to the largest shorter state */ - struct P(state) *next; /* largest shorter match */ - P(len_t) len; /* largest match, zero otherwise */ - P(char_t) c; /* last character */ - P(node_t) n; /* user-defined data */ -}; - -/* Control char */ -static inline P(char_t) -P(control) (void) -{ -#ifdef KMP_CONTROL_CHAR - return KMP_CONTROL_CHAR; -#else - return ':'; -#endif -} - -/* User-defined source */ -struct P(hash_table); - -#define HASH_GIVE_HASHFN -#ifdef KMP_GIVE_HASHFN -static inline uns -P(hash_hash) (struct P(hash_table) *t, struct P(state) *f, P(char_t) c) -{ - return P(hash) ((struct P(context) *) t, f, c); -} -#else -static inline uns -P(hash_hash) (struct P(hash_table) *t UNUSED, struct P(state) *f, P(char_t) c) -{ - return (((uns)c) << 16) + (uns)(addr_int_t)f; -} -#endif - -#ifndef KMP_GIVE_EQ -static inline int -P(eq) (struct P(context) *ctx UNUSED, P(char_t) c1, P(char_t) c2) -{ - return c1 == c2; -} -#endif - -static inline int -P(is_control) (struct P(context) *ctx, P(char_t) c) -{ - return P(eq) (ctx, c, P(control)()); -} - -#define HASH_GIVE_EQ -static inline int -P(hash_eq) (struct P(hash_table) *t, struct P(state) *f1, P(char_t) c1, struct P(state) *f2, P(char_t) c2) -{ - return f1 == f2 && P(eq)((struct P(context) *) t, c1, c2); -} - -#ifdef KMP_GIVE_ALLOC -#define HASH_GIVE_ALLOC -static inline void * -P(hash_alloc) (struct P(hash_table) *t, uns size) -{ - return P(alloc) ((struct P(context) *) t, size); -} - -static inline void -P(hash_free) (struct P(hash_table) *t, void *ptr) -{ - P(free) ((struct P(context) *) t, ptr); -} -#endif - -#define HASH_GIVE_INIT_KEY -static inline void -P(hash_init_key) (struct P(hash_table) *t UNUSED, struct P(state) *s, struct P(state) *f, P(char_t) c) -{ - bzero(s, sizeof(*s)); - s->from = f; - s->c = c; - s->next = f->back; /* the pointers hold the link-list of sons... changed in build() */ - f->back = s; -} - -#undef P -#define HASH_PREFIX(x) KMP_PREFIX(GLUE(hash_,x)) -#define HASH_NODE struct KMP_PREFIX(state) -#define HASH_KEY_COMPLEX(x) x from, x c -#define HASH_KEY_DECL struct KMP_PREFIX(state) *from, KMP_PREFIX(char_t) c -#define HASH_WANT_NEW -#define HASH_WANT_FIND -#ifdef KMP_WANT_CLEANUP -#define HASH_WANT_CLEANUP -#endif -#if defined(KMP_USE_POOL) -#define HASH_USE_POOL KMP_USE_POOL -#else -#define HASH_AUTO_POOL 4096 -#endif -#define HASH_CONSERVE_SPACE -#define HASH_TABLE_DYNAMIC -#include "lib/hashtable.h" -#define P(x) KMP_PREFIX(x) - -struct P(context) { - struct P(hash_table) hash; /* hash table of state transitions */ - struct P(state) null; /* null state */ -# ifdef KMP_CONTEXT - KMP_CONTEXT v; /* user defined data */ -# endif -}; - -#ifdef KMP_SOURCE -typedef KMP_SOURCE P(source_t); -#else -typedef byte *P(source_t); -#endif - -#ifdef KMP_GET_CHAR -static inline int -P(get_char) (struct P(context) *ctx UNUSED, P(source_t) *src UNUSED, P(char_t) *c UNUSED) -{ - return KMP_GET_CHAR(ctx, (*src), (*c)); -} -#else -# if defined(KMP_USE_UTF8) -# include "lib/unicode.h" -# if defined(KMP_ONLYALPHA) || defined(KMP_TOLOWER) || defined(KMP_UNACCENT) -# include "charset/unicat.h" -# endif -# elif defined(KMP_USE_ASCII) -# if defined(KMP_ONLYALPHA) || defined(KMP_TOLOWER) -# include "lib/chartype.h" -# endif -# endif -static inline int -P(get_char) (struct P(context) *ctx UNUSED, P(source_t) *src, P(char_t) *c) -{ -# ifdef KMP_USE_UTF8 - uns cc; - *src = (byte *)utf8_get(*src, &cc); -# ifdef KMP_ONLYALPHA - if (!cc) {} - else if (!Ualpha(cc)) - cc = P(control)(); - else -# endif - { -# ifdef KMP_TOLOWER - cc = Utolower(cc); -# endif -# ifdef KMP_UNACCENT - cc = Uunaccent(cc); -# endif - } -# else - uns cc = *(*src)++; -# ifdef KMP_ONLYALPHA - if (!cc) {} - else if (!Calpha(cc)) - cc = P(control)(); - else -# endif -# ifdef KMP_TOLOWER - cc = Clocase(cc); -# endif -# ifdef KMP_UNACCENT -# error Do not know how to unaccent ASCII characters -# endif -# endif - *c = cc; - return !!cc; -} -#endif - -static struct P(state) * -P(add) (struct P(context) *ctx, P(source_t) src -# ifdef KMP_ADD_EXTRA_ARGS - , KMP_ADD_EXTRA_ARGS -# endif -) -{ -# ifdef KMP_ADD_EXTRA_VAR - KMP_ADD_EXTRA_VAR v; -# endif -# ifdef KMP_ADD_INIT - { KMP_ADD_INIT(ctx, src, v); } -# endif - - P(char_t) c; - if (!P(get_char)(ctx, &src, &c)) - return NULL; - struct P(state) *p = &ctx->null, *s; - uns len = 0; - do - { - s = P(hash_find)(&ctx->hash, p, c); - if (!s) - for (;;) - { - s = P(hash_new)(&ctx->hash, p, c); - len++; - if (!(P(get_char)(ctx, &src, &c))) - goto enter_new; - p = s; - } - p = s; - len++; - } - while (P(get_char)(ctx, &src, &c)); -# ifdef KMP_NO_DUPS - ASSERT(!s->len); -# else - if (s->len) - { -# ifdef KMP_ADD_DUP - { KMP_ADD_DUP(ctx, src, v, s); } -# endif - return s; - } -# endif -enter_new: - s->len = len; -# ifdef KMP_ADD_NEW - { KMP_ADD_NEW(ctx, src, v, s); } -# endif - return s; -} - -static void -P(init) (struct P(context) *ctx) -{ - bzero(&ctx->null, sizeof(struct P(state))); - P(hash_init)(&ctx->hash); -} - -#ifdef KMP_WANT_CLEANUP -static inline void -P(cleanup) (struct P(context) *ctx) -{ - P(hash_cleanup)(&ctx->hash); -} -#endif - -static inline int -P(empty) (struct P(context) *ctx) -{ - return !ctx->hash.hash_count; -} - -static inline struct P(state) * -P(chain_start) (struct P(state) *s) -{ - return s->len ? s : s->next; -} - -static void -P(build) (struct P(context) *ctx) -{ - if (P(empty)(ctx)) - return; - uns read = 0, write = 0; - struct P(state) *fifo[ctx->hash.hash_count], *null = &ctx->null; - for (struct P(state) *s = null->back; s; s = s->next) - fifo[write++] = s; - null->back = NULL; -# ifdef KMP_BUILD_STATE - { KMP_BUILD_STATE(ctx, null); } -# endif - while (read != write) - { - struct P(state) *s = fifo[read++], *t; - for (t = s->back; t; t = t->next) - fifo[write++] = t; - for (t = s->from->back; 1; t = t->back) - { - if (!t) - { - s->back = null; - s->next = NULL; - break; - } - s->back = P(hash_find)(&ctx->hash, t, s->c); - if (s->back) - { - s->next = s->back->len ? s->back : s->back->next; - break; - } - } -# ifdef KMP_BUILD_STATE - { KMP_BUILD_STATE(ctx, s); } -# endif - } -} - -#undef P -#undef KMP_CHAR -#undef KMP_SOURCE -#undef KMP_GET_CHAR -#undef KMP_NODE -#undef KMP_CONTEXT -#undef KMP_USE_ASCII -#undef KMP_USE_UTF8 -#undef KMP_TOLOWER -#undef KMP_UNACCENT -#undef KMP_ONLYALPHA -#undef KMP_CONTROL_CHAR -#undef KMP_ADD_EXTRA_ARGS -#undef KMP_ADD_EXTRA_VAR -#undef KMP_ADD_INIT -#undef KMP_ADD_NEW -#undef KMP_ADD_DUP -#undef KMP_NO_DUPS -#undef KMP_BUILD_STATE -#undef KMP_USE_POOL -#undef KMP_GIVE_ALLOC -#undef KMP_GIVE_HASHFN -#undef KMP_GIVE_EQ - -#ifdef KMP_WANT_SEARCH -# undef KMP_WANT_SEARCH -# define KMPS_PREFIX(x) KMP_PREFIX(x) -# define KMPS_KMP_PREFIX(x) KMP_PREFIX(x) -# include "lib/kmp-search.h" -#endif - -#undef KMP_PREFIX diff --git a/lib/kmp-test.c b/lib/kmp-test.c index efa29a27..100291eb 100644 --- a/lib/kmp-test.c +++ b/lib/kmp-test.c @@ -18,7 +18,7 @@ #define KMP_PREFIX(x) GLUE_(kmp1,x) #define KMP_WANT_CLEANUP -#include "lib/kmp-new.h" +#include "lib/kmp.h" #define KMPS_PREFIX(x) GLUE_(kmp1s1,x) #define KMPS_KMP_PREFIX(x) GLUE_(kmp1,x) #define KMPS_WANT_BEST @@ -74,7 +74,7 @@ test1(void) #define KMPS_FOUND(ctx,src,s) do{ TRACE("String %s with id %d found", s.out->n.str, s.out->n.id); }while(0) #define KMPS_STEP(ctx,src,s) do{ TRACE("Got to state %p after reading %d", s.s, s.c); }while(0) #define KMPS_EXIT(ctx,src,s) do{ if (s.best->len) TRACE("Best match is %s", s.best->n.str); } while(0) -#include "lib/kmp-new.h" +#include "lib/kmp.h" static void test2(void) @@ -107,7 +107,7 @@ test2(void) #define KMP_WANT_SEARCH #define KMPS_EXTRA_ARGS uns *cnt, uns *sum #define KMPS_FOUND(ctx,src,s) do{ ASSERT(cnt[s.out->n]); cnt[s.out->n]--; sum[0]--; }while(0) -#include "lib/kmp-new.h" +#include "lib/kmp.h" static void test3(void) @@ -183,7 +183,7 @@ kmp4_hash(struct kmp4_context *ctx UNUSED, struct kmp4_state *s, byte *c) #define KMPS_FOUND(ctx,src,s) do{ TRACE("found"); }while(0) #define KMPS_ADD_CONTROLS #define KMPS_MERGE_CONTROLS -#include "lib/kmp-new.h" +#include "lib/kmp.h" static void test4(void) diff --git a/lib/kmp.c b/lib/kmp.c deleted file mode 100644 index 95a709d4..00000000 --- a/lib/kmp.c +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Knuth-Morris-Pratt's Substring Search for N given strings - * - * (c) 1999--2005, Robert Spalek - */ - -#include "lib/lib.h" -#include "lib/bitops.h" -#include "lib/mempool.h" -#include "lib/lists.h" -#include "lib/unicode.h" - -#define KMP_GET_CHAR(pos, c, flags) ASSERT(0) -#include "lib/kmp.h" - -#include -#include -#include - -#define TRACE(level, mask...) if (0) fprintf(stderr, mask) - -struct kmp * -kmp_new(struct mempool *mp, int words_len, uns modify_flags) -{ - struct kmp *kmp = mp_alloc_zero(mp, sizeof(struct kmp)); - kmp->mp = mp; - kmp->modify_flags = modify_flags; - kmp->words_len = words_len; - int size = words_len; - kmp->g.count = 1; - kmp->g.size = size; - kmp->g.sons = mp_alloc_zero(mp, size * sizeof(struct list)); - init_list(kmp->g.sons + 0); - if (words_len > 1) - size = words_len * bit_fls(words_len); - else - size = 1; - kmp->g.hash_size = size; - kmp->g.chain = mp_alloc_zero(mp, size * sizeof(struct kmp_transition *)); - kmp->f = mp_alloc_zero(mp, words_len * sizeof(kmp_state_t)); - kmp->out = mp_alloc_zero(mp, words_len * sizeof(struct kmp_output *)); - return kmp; -} - -/* - * The only merge operation is that son includes output of his father (and also - * his father,...), so we can merge the link-lists. - */ -static void -merge_output(struct kmp_output **target, struct kmp_output *src) -{ - while (*target) - target = &(*target)->next; - *target = src; -} - -static struct kmp_output * -new_output(struct kmp *kmp, uns id, uns len) -{ - struct kmp_output *out = mp_alloc(kmp->mp, sizeof(struct kmp_output)); - out->next = NULL; - out->id = id; - out->len = len; - return out; -} - -void -kmp_enter_raw_string(struct kmp *kmp, kmp_char_t *str, uns id) -{ - struct kmp_transition tr = { .next=NULL, .from=0 }, **prev; - struct kmp_output *new_out; - uns len = 0; - kmp_char_t c = 'a'; - - TRACE(20, "kmp.c: Entering string"); - c = *str++; - len++; - if (!c) - return; - while (c) - { - tr.c = c; - prev = transition_search(&kmp->g, &tr); - if (!*prev) - break; - tr.from = (*prev)->to; - c = *str++; - len++; - } - while (c) - { - *prev = mp_alloc_zero(kmp->mp, sizeof(struct kmp_transition)); - tr.to = kmp->g.count++; - **prev = tr; - add_tail(kmp->g.sons + tr.from, &(*prev)->n); - init_list(kmp->g.sons + tr.to); - c = *str++; - len++; - tr.from = tr.to; - tr.c = c; - prev = transition_search(&kmp->g, &tr); - ASSERT(!*prev); - } - if (kmp->out[tr.from]) - TRACE(5, "kmp.c: string is inserted more than once"); - new_out = new_output(kmp, id, len-1); - merge_output(kmp->out + tr.from, new_out); -} - -static void -construct_f_out(struct kmp *kmp) -{ - kmp_state_t *fifo; - int read, write; - struct kmp_transition *son; - - fifo = alloca(kmp->words_len * sizeof(kmp_state_t)); - read = write = 0; - kmp->f[0] = 0; - WALK_LIST(son, kmp->g.sons[0]) - { - ASSERT(son->from == 0); - kmp->f[son->to] = 0; - fifo[write++] = son->to; - } - while (read != write) - { - kmp_state_t r, s, t; - r = fifo[read++]; - WALK_LIST(son, kmp->g.sons[r]) - { - struct kmp_transition tr, **prev; - ASSERT(son->from == r); - tr.c = son->c; - s = son->to; - fifo[write++] = s; - t = kmp->f[r]; - while (1) - { - tr.from = t; - prev = transition_search(&kmp->g, &tr); - if (*prev || !tr.from) - break; - t = kmp->f[t]; - } - kmp->f[s] = *prev ? (*prev)->to : 0; - merge_output(kmp->out + s, kmp->out[ kmp->f[s] ]); - } - } -} - -void -kmp_build(struct kmp *kmp) -{ - ASSERT(kmp->g.count <= kmp->words_len); - construct_f_out(kmp); - if (kmp->words_len > 1) - TRACE(0, "Built KMP with modify flags %d for total words len %d, it has %d nodes", kmp->modify_flags, kmp->words_len, kmp->g.count); -} diff --git a/lib/kmp.h b/lib/kmp.h index fd37a6e3..2ce01b3a 100644 --- a/lib/kmp.h +++ b/lib/kmp.h @@ -1,225 +1,416 @@ /* - * Knuth-Morris-Pratt's Substring Search for N given strings - * - * (c) 1999--2005, Robert Spalek + * Knuth-Morris-Pratt's Substring Search for N given strings * - * This is a preprocessor template: you need to set KMP_GET_CHAR - * to a macro for reading a single character from the input and - * translating it according to the MF_xxx flags. See below for - * some pre-defined values. + * (c) 1999--2005, Robert Spalek + * (c) 2006, Pavel Charvat * - * Don't touch this file, it can bite. - * - * (In fact, the algorithm is usually referred to as Aho-McCorasick, - * but that's just an extension of KMP to multiple strings.) + * (In fact, the algorithm is usually referred to as Aho-McCorasick, + * but that's just an extension of KMP to multiple strings.) */ -#ifndef _UCW_KMP_H -#define _UCW_KMP_H +/* + * This is not a normal header file, it's a generator of KMP algorithm. + * Each time you include it with parameters set in the corresponding + * preprocessor macros, it generates KMP structures and functions + * with the parameters given. + * + * + * Basic parameters: + * KMP_PREFIX(x) macro to add a name prefix (used on all global names + * defined by the KMP generator); mandatory + * + * KMP_CHAR alphabet type, the default is u16 + * + * KMP_SOURCE user-defined text source; KMP_GET_CHAR must + * KMP_GET_CHAR(ctx,src,c) return next character from the input or zero at the end; + * if not defined, zero-terminated array of bytes is used as the input + * + * KMP_NODE user-defined data in each state of the automaton + * KMP_CONTEXT user-defined data in struct context (a structure describing + * the whole automaton) + * + * Parameters which select how the input is interpreted (if KMP_SOURCE is unset): + * KMP_USE_ASCII reads single bytes from the input (default) + * KMP_USE_UTF8 reads UTF-8 characters from the input (valid UTF-8 needed) + * KMP_TOLOWER converts all to lowercase + * KMP_UNACCENT removes accents + * KMP_ONLYALPHA converts non-alphas to KMP_CONTROL_CHAR + * KMP_CONTROL_CHAR special control character (default is ':') + * + * Parameters controlling add(): + * KMP_ADD_EXTRA_ARGS extra arguments + * KMP_ADD_EXTRA_VAR structure with extra local variables + * KMP_ADD_INIT(ctx,src,v) + * KMP_ADD_NEW(ctx,src,v,s) + * KMP_ADD_DUP(ctx,src,v,s) + * KMP_NO_DUPS no support for duplicates + * + * Parameters to build(): + * KMP_BUILD_STATE(ctx,s) called for all states (including null) in order of non-decreasing tree depth + * + * Other parameters: + * KMP_WANT_CLEANUP define cleanup() + * KMP_WANT_SEARCH includes lib/kmp-search.h with the same prefix; + * there can be multiple search variants for a single KMP structure + * + * KMP_USE_POOL allocates in a given pool + * + * KMP_GIVE_ALLOC + * KMP_GIVE_HASHFN + * KMP_GIVE_EQ + */ -#include "lib/lists.h" +#ifndef KMP_PREFIX +#error Missing KMP_PREFIX +#endif +#include "lib/mempool.h" +#include #include -/* - * Input conversion flags (the conversion is handled exclusively by the KMP_GET_CHAR - * macro, so you can define your own conversion modes, soo). - */ -#define MF_TOLOWER 1 -#define MF_UNACCENT 2 -#define MF_ONLYALPHA 4 /* Convert non-alphas to KMP_CONTROL_CHAR */ +#define P(x) KMP_PREFIX(x) -#define KMP_CONTROL_CHAR ':' +#ifdef KMP_CHAR +typedef KMP_CHAR P(char_t); +#else +typedef u16 P(char_t); +#endif -/* Pre-defined input functions */ +typedef u32 P(len_t); -#define KMP_GET_UTF8(pos, c, flags) do { uns cc; pos = utf8_get(pos, &cc); c = cc; } while(0) +#ifdef KMP_NODE +typedef KMP_NODE P(node_t); +#else +typedef struct {} P(node_t); +#endif -#define KMP_GET_ASCII(pos, c, flags) do { \ - c = *pos++; \ - if (c) { \ - if (flags & MF_TOLOWER) \ - c = Clocase(c); \ - if (flags & MF_ONLYALPHA && !Calpha(c)) \ - c = KMP_CONTROL_CHAR; \ - } \ -} while (0) +struct P(context); -/* Types and structures */ +struct P(state) { + struct P(state) *from; /* state with previous character */ + struct P(state) *back; /* backwards edge to the largest shorter state */ + struct P(state) *next; /* largest shorter match */ + P(len_t) len; /* largest match, zero otherwise */ + P(char_t) c; /* last character */ + P(node_t) n; /* user-defined data */ +}; -typedef uns kmp_state_t; -typedef word kmp_char_t; +/* Control char */ +static inline P(char_t) +P(control) (void) +{ +#ifdef KMP_CONTROL_CHAR + return KMP_CONTROL_CHAR; +#else + return ':'; +#endif +} -struct kmp_transition { - struct node n; /* link list of sons for a given node */ - struct kmp_transition *next; /* collision in the hash-table of all transitions */ - kmp_state_t from, to; - kmp_char_t c; -}; -struct kmp_transitions { - int count, size; - struct list *sons; /* link-list of all sons for each given node */ - uns hash_size; - struct kmp_transition **chain; /* hash-table of [node, char]->son */ -}; +/* User-defined source */ +struct P(hash_table); -struct kmp_output { - struct kmp_output *next; /* output link list for every node */ - uns id; - uns len; -}; +#define HASH_GIVE_HASHFN +#ifdef KMP_GIVE_HASHFN +static inline uns +P(hash_hash) (struct P(hash_table) *t, struct P(state) *f, P(char_t) c) +{ + return P(hash) ((struct P(context) *) t, f, c); +} +#else +static inline uns +P(hash_hash) (struct P(hash_table) *t UNUSED, struct P(state) *f, P(char_t) c) +{ + return (((uns)c) << 16) + (uns)(addr_int_t)f; +} +#endif -struct mempool; -struct kmp { - struct mempool *mp; - int modify_flags; /* which nocase/noaccent mode is this kmp for */ - int words_len; /* total length of searched words */ - struct kmp_transitions g; /* hash table of forward transitions of automat */ - kmp_state_t *f; /* back transitions of automat */ - struct kmp_output **out; /* found words for every state */ -}; +#ifndef KMP_GIVE_EQ +static inline int +P(eq) (struct P(context) *ctx UNUSED, P(char_t) c1, P(char_t) c2) +{ + return c1 == c2; +} +#endif -struct kmp_result { - struct node n; /* strings with non-zero frequency are put into a link-list */ - uns occur; -}; +static inline int +P(is_control) (struct P(context) *ctx, P(char_t) c) +{ + return P(eq) (ctx, c, P(control)()); +} -/* kmp.c */ -struct kmp *kmp_new(struct mempool *mp, int words_len, uns modify_flags); -void kmp_enter_raw_string(struct kmp *kmp, kmp_char_t *str, uns id); -void kmp_build(struct kmp *kmp); +#define HASH_GIVE_EQ +static inline int +P(hash_eq) (struct P(hash_table) *t, struct P(state) *f1, P(char_t) c1, struct P(state) *f2, P(char_t) c2) +{ + return f1 == f2 && P(eq)((struct P(context) *) t, c1, c2); +} -static inline void -kmp_get_char(const byte **str UNUSED, kmp_char_t *c, uns modify_flags UNUSED) -{ - while (1) - { - kmp_char_t new_c; - KMP_GET_CHAR((*str), new_c, modify_flags); - if (new_c != KMP_CONTROL_CHAR || *c != KMP_CONTROL_CHAR) - { - *c = new_c; - return; - } - } +#ifdef KMP_GIVE_ALLOC +#define HASH_GIVE_ALLOC +static inline void * +P(hash_alloc) (struct P(hash_table) *t, uns size) +{ + return P(alloc) ((struct P(context) *) t, size); } static inline void -kmp_enter_string(struct kmp *kmp, const byte *str, uns id) -{ - /* To avoid dependencies between libucw and other libraries (which might - * be referenced by the KMP_GET_CHAR macro), we have to split kmp_enter_string() - * to a conversion wrapper (this function) and the rest, which resides in kmp.c - * and uses zero-terminated array of kmp_char_t characters as its input. - */ - kmp_char_t buf[strlen(str)+1], *str2 = buf, c = 0; - do - { - kmp_get_char(&str, &c, kmp->modify_flags); - *str2++ = c; - } - while (c); - kmp_enter_raw_string(kmp, buf, id); +P(hash_free) (struct P(hash_table) *t, void *ptr) +{ + P(free) ((struct P(context) *) t, ptr); } +#endif -static inline uns -transition_hashf(struct kmp_transitions *l UNUSED, struct kmp_transition *tr) +#define HASH_GIVE_INIT_KEY +static inline void +P(hash_init_key) (struct P(hash_table) *t UNUSED, struct P(state) *s, struct P(state) *f, P(char_t) c) { - return tr->from + (tr->c << 16); + bzero(s, sizeof(*s)); + s->from = f; + s->c = c; + s->next = f->back; /* the pointers hold the link-list of sons... changed in build() */ + f->back = s; } +#undef P +#define HASH_PREFIX(x) KMP_PREFIX(GLUE(hash_,x)) +#define HASH_NODE struct KMP_PREFIX(state) +#define HASH_KEY_COMPLEX(x) x from, x c +#define HASH_KEY_DECL struct KMP_PREFIX(state) *from, KMP_PREFIX(char_t) c +#define HASH_WANT_NEW +#define HASH_WANT_FIND +#ifdef KMP_WANT_CLEANUP +#define HASH_WANT_CLEANUP +#endif +#if defined(KMP_USE_POOL) +#define HASH_USE_POOL KMP_USE_POOL +#else +#define HASH_AUTO_POOL 4096 +#endif +#define HASH_CONSERVE_SPACE +#define HASH_TABLE_DYNAMIC +#include "lib/hashtable.h" +#define P(x) KMP_PREFIX(x) + +struct P(context) { + struct P(hash_table) hash; /* hash table of state transitions */ + struct P(state) null; /* null state */ +# ifdef KMP_CONTEXT + KMP_CONTEXT v; /* user defined data */ +# endif +}; + +#ifdef KMP_SOURCE +typedef KMP_SOURCE P(source_t); +#else +typedef byte *P(source_t); +#endif + +#ifdef KMP_GET_CHAR static inline int -transition_compare(struct kmp_transition *a, struct kmp_transition *b) +P(get_char) (struct P(context) *ctx UNUSED, P(source_t) *src UNUSED, P(char_t) *c UNUSED) { - if (a->from == b->from && a->c == b->c) - return 0; - else - return 1; + return KMP_GET_CHAR(ctx, (*src), (*c)); } +#else +# if defined(KMP_USE_UTF8) +# include "lib/unicode.h" +# if defined(KMP_ONLYALPHA) || defined(KMP_TOLOWER) || defined(KMP_UNACCENT) +# include "charset/unicat.h" +# endif +# elif defined(KMP_USE_ASCII) +# if defined(KMP_ONLYALPHA) || defined(KMP_TOLOWER) +# include "lib/chartype.h" +# endif +# endif +static inline int +P(get_char) (struct P(context) *ctx UNUSED, P(source_t) *src, P(char_t) *c) +{ +# ifdef KMP_USE_UTF8 + uns cc; + *src = (byte *)utf8_get(*src, &cc); +# ifdef KMP_ONLYALPHA + if (!cc) {} + else if (!Ualpha(cc)) + cc = P(control)(); + else +# endif + { +# ifdef KMP_TOLOWER + cc = Utolower(cc); +# endif +# ifdef KMP_UNACCENT + cc = Uunaccent(cc); +# endif + } +# else + uns cc = *(*src)++; +# ifdef KMP_ONLYALPHA + if (!cc) {} + else if (!Calpha(cc)) + cc = P(control)(); + else +# endif +# ifdef KMP_TOLOWER + cc = Clocase(cc); +# endif +# ifdef KMP_UNACCENT +# error Do not know how to unaccent ASCII characters +# endif +# endif + *c = cc; + return !!cc; +} +#endif -static inline struct kmp_transition ** -transition_search(struct kmp_transitions *l, struct kmp_transition *tr) +static struct P(state) * +P(add) (struct P(context) *ctx, P(source_t) src +# ifdef KMP_ADD_EXTRA_ARGS + , KMP_ADD_EXTRA_ARGS +# endif +) { - uns hf = transition_hashf(l, tr) % l->hash_size; - struct kmp_transition **last = l->chain + hf; - while (*last && transition_compare(*last, tr)) - last = &(*last)->next; - ASSERT(last); - return last; +# ifdef KMP_ADD_EXTRA_VAR + KMP_ADD_EXTRA_VAR v; +# endif +# ifdef KMP_ADD_INIT + { KMP_ADD_INIT(ctx, src, v); } +# endif + + P(char_t) c; + if (!P(get_char)(ctx, &src, &c)) + return NULL; + struct P(state) *p = &ctx->null, *s; + uns len = 0; + do + { + s = P(hash_find)(&ctx->hash, p, c); + if (!s) + for (;;) + { + s = P(hash_new)(&ctx->hash, p, c); + len++; + if (!(P(get_char)(ctx, &src, &c))) + goto enter_new; + p = s; + } + p = s; + len++; + } + while (P(get_char)(ctx, &src, &c)); +# ifdef KMP_NO_DUPS + ASSERT(!s->len); +# else + if (s->len) + { +# ifdef KMP_ADD_DUP + { KMP_ADD_DUP(ctx, src, v, s); } +# endif + return s; + } +# endif +enter_new: + s->len = len; +# ifdef KMP_ADD_NEW + { KMP_ADD_NEW(ctx, src, v, s); } +# endif + return s; } -static inline void -add_result(struct list *nonzeroes, struct kmp_result *freq, struct kmp_output *out) -{ - for (; out; out = out->next) - if (!freq[out->id].occur++) - add_tail(nonzeroes, &freq[out->id].n); -} - -static inline byte * -kmp_search_internal(struct kmp *kmp, byte *str, uns len, struct list *nonzeroes, struct kmp_result *freq, struct kmp_output *out) - /* For every found string with id ID, it increments freq[ID]. - * Also, it finds the longest among the leftmost matches. */ -{ - if (!len) - return NULL; - kmp_state_t s = 0; - kmp_char_t c = KMP_CONTROL_CHAR; - struct kmp_transition tr, **prev; - byte eof = 0; - if (kmp->words_len <= 1) - return NULL; - //TRACE(20, "kmp.c: Searching string %s", str); - byte *largest_match = NULL; - while (1) - { - tr.from = s; - tr.c = c; - prev = transition_search(&kmp->g, &tr); - while (tr.from && !*prev) - { - tr.from = kmp->f[ tr.from ]; - prev = transition_search(&kmp->g, &tr); - } - s = *prev ? (*prev)->to : 0; - if (nonzeroes) - add_result(nonzeroes, freq, kmp->out[s]); - /* Beware that out->len is measured in modified characters of - * the search pattern, hence it is not very reliable if you use - * unaccenting. */ - struct kmp_output *kout = kmp->out[s]; - if (kout && (!largest_match || str - kout->len <= largest_match)) - { - largest_match = str - kout->len; - if (out) - *out = *kout; - } - if (eof) - break; - if (!--len) - c = 0; - else - kmp_get_char((const byte **)&str, &c, kmp->modify_flags); - if (!c) - { - /* Insert KMP_CONTROL_CHAR at the beginning and at the end too. */ - c = KMP_CONTROL_CHAR; - eof = 1; - } - } - return largest_match; +static void +P(init) (struct P(context) *ctx) +{ + bzero(&ctx->null, sizeof(struct P(state))); + P(hash_init)(&ctx->hash); } +#ifdef KMP_WANT_CLEANUP static inline void -kmp_search(struct kmp *kmp, const byte *str, uns len, struct list *nonzeroes, struct kmp_result *freq) +P(cleanup) (struct P(context) *ctx) { - kmp_search_internal(kmp, (byte*) str, len, nonzeroes, freq, NULL); + P(hash_cleanup)(&ctx->hash); } +#endif -static inline byte * -kmp_find_first(struct kmp *kmp, byte *str, uns len, struct kmp_output *out) +static inline int +P(empty) (struct P(context) *ctx) { - return kmp_search_internal(kmp, str, len, NULL, NULL, out); + return !ctx->hash.hash_count; } +static inline struct P(state) * +P(chain_start) (struct P(state) *s) +{ + return s->len ? s : s->next; +} + +static void +P(build) (struct P(context) *ctx) +{ + if (P(empty)(ctx)) + return; + uns read = 0, write = 0; + struct P(state) *fifo[ctx->hash.hash_count], *null = &ctx->null; + for (struct P(state) *s = null->back; s; s = s->next) + fifo[write++] = s; + null->back = NULL; +# ifdef KMP_BUILD_STATE + { KMP_BUILD_STATE(ctx, null); } +# endif + while (read != write) + { + struct P(state) *s = fifo[read++], *t; + for (t = s->back; t; t = t->next) + fifo[write++] = t; + for (t = s->from->back; 1; t = t->back) + { + if (!t) + { + s->back = null; + s->next = NULL; + break; + } + s->back = P(hash_find)(&ctx->hash, t, s->c); + if (s->back) + { + s->next = s->back->len ? s->back : s->back->next; + break; + } + } +# ifdef KMP_BUILD_STATE + { KMP_BUILD_STATE(ctx, s); } +# endif + } +} + +#undef P +#undef KMP_CHAR +#undef KMP_SOURCE +#undef KMP_GET_CHAR +#undef KMP_NODE +#undef KMP_CONTEXT +#undef KMP_USE_ASCII +#undef KMP_USE_UTF8 +#undef KMP_TOLOWER +#undef KMP_UNACCENT +#undef KMP_ONLYALPHA +#undef KMP_CONTROL_CHAR +#undef KMP_ADD_EXTRA_ARGS +#undef KMP_ADD_EXTRA_VAR +#undef KMP_ADD_INIT +#undef KMP_ADD_NEW +#undef KMP_ADD_DUP +#undef KMP_NO_DUPS +#undef KMP_BUILD_STATE +#undef KMP_USE_POOL +#undef KMP_GIVE_ALLOC +#undef KMP_GIVE_HASHFN +#undef KMP_GIVE_EQ + +#ifdef KMP_WANT_SEARCH +# undef KMP_WANT_SEARCH +# define KMPS_PREFIX(x) KMP_PREFIX(x) +# define KMPS_KMP_PREFIX(x) KMP_PREFIX(x) +# include "lib/kmp-search.h" #endif + +#undef KMP_PREFIX