From: Pavel Charvat Date: Sun, 16 Apr 2006 11:40:08 +0000 (+0200) Subject: Rewritten KMP... incomplete comments and some useful hooks for statistics X-Git-Tag: holmes-import~650^2~9 X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;h=1ebc934f9776b6f30785351fcfb7ddb526c9cd0c;p=libucw.git Rewritten KMP... incomplete comments and some useful hooks for statistics --- diff --git a/lib/Makefile b/lib/Makefile index 8535b687..35b865a1 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -75,6 +75,7 @@ $(o)/lib/asort-test: $(o)/lib/asort-test.o $(LIBUCW) $(o)/lib/redblack-test: $(o)/lib/redblack-test.o $(LIBUCW) $(o)/lib/binheap-test: $(o)/lib/binheap-test.o $(LIBUCW) $(o)/lib/lizard-test: $(o)/lib/lizard-test.o $(LIBUCW) +$(o)/lib/kmp-test: $(o)/lib/kmp-test.o $(LIBUCW) $(LIBCHARSET) TESTS+=$(addprefix $(o)/lib/,regex.test unicode-utf8.test hash-test.test mempool.test stkstring.test slists.test) $(o)/lib/regex.test: $(o)/lib/regex-t diff --git a/lib/kmp-new.h b/lib/kmp-new.h new file mode 100644 index 00000000..2d677aa5 --- /dev/null +++ b/lib/kmp-new.h @@ -0,0 +1,351 @@ +/* + * Knuth-Morris-Pratt's Substring Search for N given strings + * + * (c) 1999--2005, Robert Spalek + * (c) 2006, Pavel Charvat + * + * (In fact, the algorithm is usually referred to as Aho-Corasick, + * but that's just an extension of KMP to multiple strings.) + */ + +/* + * This is not a normal header file, it's a generator of KMP algorithm. + * Each time you include it with parameters set in the corresponding + * preprocessor macros, it generates KMP structures and functions + * with the parameters given. + * + * + * [*] KMP_PREFIX(x) macro to add a name prefix (used on all global names + * defined by the KMP generator). 
+ * + * KMP_CHAR alphabet type, the default is u16 + * + * KMP_SOURCE user-defined source; KMP_GET_CHAR must + * return next character from the input or zero at the end; + * if not defined, zero-terminated array of bytes is used as the input + * KMP_GET_CHAR(ctx,src,c) + * + * KMP_NODE user-defined data stored in each added string + * + * Parameters to default get_char(): + * KMP_USE_ASCII reads single bytes from the input (default) + * KMP_USE_UTF8 reads UTF-8 characters from the input (valid UTF-8 needed) + * KMP_TOLOWER converts all to lowercase + * KMP_UNACCENT removes accents + * KMP_ONLYALPHA converts nonalphas to KMP_CONTROL_CHAR + * KMP_CONTROL_CHAR special control character (default is ':') + * + * Parameters to add(): + * KMP_ADD_EXTRA_ARGS extra arguments + * KMP_ADD_EXTRA_VAR structure with extra local varriables + * KMP_ADD_INIT(ctx,src,v) + * KMP_ADD_NEW(ctx,src,v,s) + * KMP_ADD_DUP(ctx,src,v,s) + * KMP_NO_DUPS no support for duplicates + * + * Parameters to build(): + * KMP_BUILD_STATE(ctx,s) called for all states (except null) in order of non-decreasing tree depth + * + * KMP_WANT_CLEANUP cleanup() + * KMP_WANT_SEARCH includes lib/kmp-search.h with the same prefix; + * there can be multiple search variants for a single KMP structure + * + * KMP_USE_POOL allocates on a given pool + */ + +#ifndef KMP_PREFIX +#error Missing KMP_PREFIX +#endif + +#include "lib/mempool.h" +#include + +#define P(x) KMP_PREFIX(x) + +#ifdef KMP_CHAR +typedef KMP_CHAR P(char_t); +#else +typedef u16 P(char_t); +#endif + +typedef u32 P(len_t); + +#ifdef KMP_NODE +typedef KMP_NODE P(node_t); +#else +typedef struct {} P(node_t); +#endif + +struct P(state) { + struct P(state) *from; /* state with previous character */ + struct P(state) *back; /* backwards edge to the largest shorter state */ + struct P(state) *next; /* largest shorter match */ + P(len_t) len; /* largest match, zero otherwise */ + P(char_t) c; /* last character */ + P(node_t) n; /* user-defined data */ +}; + +/* 
Control char */
+static inline P(char_t)
+P(control_char) (void)
+{
+#ifdef KMP_CONTROL_CHAR
+  return KMP_CONTROL_CHAR;
+#else
+  return ':';
+#endif
+}
+
+/* Forward declaration of the hash table type taken by the callbacks below
+ * (presumably defined by the lib/hashtable.h instance included further down) */
+struct P(hash_table);
+/* Hash of an edge key: the character shifted left by 16 bits plus the
+ * address of the parent state truncated to uns. */
+static inline uns
+P(hash_hash) (struct P(hash_table) *t UNUSED, struct P(state) *f, P(char_t) c)
+{
+  return (((uns)c) << 16) + (uns)(addr_int_t)f;
+}
+/* Two edge keys are equal iff both the parent state and the character are equal. */
+static inline int
+P(hash_eq) (struct P(hash_table) *t UNUSED, struct P(state) *f1, P(char_t) c1, struct P(state) *f2, P(char_t) c2)
+{
+  return f1 == f2 && c1 == c2;
+}
+/* Initialize a new state s as the child of f reached by character c
+ * and prepend it to the list of children of f. */
+static inline void
+P(hash_init_key) (struct P(hash_table) *t UNUSED, struct P(state) *s, struct P(state) *f, P(char_t) c)
+{
+  s->from = f;
+  s->c = c;
+  s->len = 0;			/* not an end of any inserted string yet */
+  s->back = NULL;		/* no children yet */
+  s->next = f->back;		/* until build() runs, f->back heads the list of children of f linked through ->next */
+  f->back = s;
+}
+/* Instantiate lib/hashtable.h as the table of edges keyed by (from, c).
+ * P() is undefined around the include and defined again afterwards.
+ * NOTE(review): presumably because the hash table generator uses its own P() -- confirm. */
+#undef P
+#define HASH_PREFIX(x) KMP_PREFIX(GLUE(hash_,x))
+#define HASH_NODE struct KMP_PREFIX(state)
+#define HASH_KEY_COMPLEX(x) x from, x c
+#define HASH_KEY_DECL struct KMP_PREFIX(state) *from, KMP_PREFIX(char_t) c
+#define HASH_WANT_NEW
+#define HASH_WANT_FIND
+#ifdef KMP_WANT_CLEANUP
+#define HASH_WANT_CLEANUP
+#endif
+#define HASH_GIVE_HASHFN
+#define HASH_GIVE_EQ
+#define HASH_GIVE_INIT_KEY
+#ifdef KMP_USE_POOL
+#define HASH_USE_POOL KMP_USE_POOL
+#else
+#define HASH_AUTO_POOL 4096
+#endif
+#define HASH_CONSERVE_SPACE
+#define HASH_TABLE_DYNAMIC
+#include "lib/hashtable.h"
+#define P(x) KMP_PREFIX(x)
+/* The whole automaton: the table of edges and the root (null) state */
+struct P(context) {
+  struct P(hash_table) hash;		/* hash table */
+  struct P(state) null;			/* null state */
+};
+/* Type of the input passed to add(); a pointer to bytes unless KMP_SOURCE is given */
+#ifdef KMP_SOURCE
+typedef KMP_SOURCE P(source_t);
+#else
+typedef byte *P(source_t);
+#endif
+/* get_char(): with KMP_GET_CHAR it only forwards the dereferenced arguments
+ * to the user's macro and returns its value. */
+#ifdef KMP_GET_CHAR
+static inline int
+P(get_char) (struct P(context) *ctx, P(source_t) *src, P(char_t) *c)
+{
+  return KMP_GET_CHAR(*ctx, *src, *c);
+}
+#else
+#  if defined(KMP_USE_UTF8)
+#    include "lib/unicode.h"
+#    if defined(KMP_ONLYALPHA) || defined(KMP_TOLOWER) || defined(KMP_UNACCENT)
+#      include "charset/unicat.h"
+#    endif
+#  elif
defined(KMP_USE_ASCII)
+#    if defined(KMP_ONLYALPHA) || defined(KMP_TOLOWER)
+#      include "lib/chartype.h"
+#    endif
+#  endif
+/*
+ * Default reader: fetch the next character from *src, advance *src,
+ * store the (possibly normalized) character to *c and return non-zero
+ * unless the stored character is zero, which marks the end of the input.
+ * With KMP_ONLYALPHA, non-alphabetic characters are replaced by the control
+ * character and only alphabetic ones go through KMP_TOLOWER / KMP_UNACCENT.
+ */
+static inline int
+P(get_char) (struct P(context) *ctx UNUSED, P(source_t) *src, P(char_t) *c)
+{
+# ifdef KMP_USE_UTF8
+  uns cc;
+  *src = (byte *)utf8_get(*src, &cc);
+#   ifdef KMP_ONLYALPHA
+  if (unlikely(!cc)) {}
+  else if (!Ualpha(cc))
+    cc = P(control_char)();
+  else
+#   endif
+    {
+#   ifdef KMP_TOLOWER
+      cc = Utolower(cc);
+#   endif
+#   ifdef KMP_UNACCENT
+      cc = Uunaccent(cc);
+#   endif
+    }
+# else
+  uns cc = *(*src)++;
+#   ifdef KMP_ONLYALPHA
+  if (unlikely(!cc)) {}
+  else if (!Calpha(cc))
+    cc = P(control_char)();
+  else
+#   endif
+    {
+      /* The braces keep the "else" above from capturing the store to *c
+       * below when KMP_TOLOWER is not defined. */
+#   ifdef KMP_TOLOWER
+      cc = Clocase(cc);		/* convert the character itself, not the output pointer */
+#   endif
+    }
+# endif
+  *c = cc;
+  return !!cc;
+}
+#endif
+
+/*
+ * Insert the string read from src. Returns the state corresponding to the
+ * whole string, or NULL if the string is empty. If the string has already
+ * been inserted, KMP_ADD_DUP is invoked (with KMP_NO_DUPS an assertion fails
+ * instead); otherwise the length is recorded in the state and KMP_ADD_NEW
+ * is invoked.
+ */
+static struct P(state) *
+P(add) (struct P(context) *ctx, P(source_t) src
+# ifdef KMP_ADD_EXTRA_ARGS
+  , KMP_ADD_EXTRA_ARGS
+# endif
+)
+{
+# ifdef KMP_ADD_EXTRA_VAR
+  KMP_ADD_EXTRA_VAR v;
+# endif
+# ifdef KMP_ADD_INIT
+  { KMP_ADD_INIT(ctx, src, v); }
+# endif
+
+  P(char_t) c;
+  if (unlikely(!P(get_char)(ctx, &src, &c)))
+    return NULL;
+  struct P(state) *p = &ctx->null, *s;
+  uns len = 0;
+  do
+    {
+      s = P(hash_find)(&ctx->hash, p, c);
+      if (!s)
+	/* Left the existing trie: every remaining character creates a new state,
+	 * so the resulting state is certainly new and the duplicate check is skipped. */
+	for (;;)
+	  {
+	    s = P(hash_new)(&ctx->hash, p, c);
+	    len++;
+	    if (unlikely(!(P(get_char)(ctx, &src, &c))))
+	      goto enter_new;
+	    p = s;
+	  }
+      p = s;
+      len++;
+    }
+  while (P(get_char)(ctx, &src, &c));
+  /* The whole string followed existing edges; a non-zero len marks a duplicate. */
+# ifdef KMP_NO_DUPS
+  ASSERT(!s->len);
+# else
+  if (s->len)
+    {
+#     ifdef KMP_ADD_DUP
+      { KMP_ADD_DUP(ctx, src, v, s); }
+#     endif
+      return s;
+    }
+# endif
+enter_new:
+  s->len = len;
+# ifdef KMP_ADD_NEW
+  { KMP_ADD_NEW(ctx, src, v, s); }
+# endif
+  return s;
+}
+
+/* Zero the whole context (including the null state) and initialize the hash table. */
+static void
+P(init) (struct P(context) *ctx)
+{
+  memset(ctx, 0, sizeof(*ctx));
+  P(hash_init)(&ctx->hash);
+}
+
+#ifdef KMP_WANT_CLEANUP
+/* Clean up the hash table of the context. */
+static inline void
+P(cleanup) (struct P(context) *ctx)
+{
+  P(hash_cleanup)(&ctx->hash);
+}
+#endif
+
+/* Non-zero iff the hash table holds no states, i.e., nothing has been added. */
+static inline int
+P(empty) (struct P(context) *ctx)
+{
+  return
!ctx->hash.hash_count; +} + +static void +P(build) (struct P(context) *ctx) +{ + if (P(empty)(ctx)) + return; + uns read = 0, write = 0; + struct P(state) *fifo[ctx->hash.hash_count]; + for (struct P(state) *s = ctx->null.back; s; s = s->next) + fifo[write++] = s; + ctx->null.back = NULL; + while (read != write) + { + struct P(state) *s = fifo[read++], *t; + for (t = s->back; t; t = t->next) + fifo[write++] = t; + for (t = s->from->back; 1; t = t->back) + { + if (!t) + { + s->back = &ctx->null; + s->next = NULL; + break; + } + s->back = P(hash_find)(&ctx->hash, t, s->c); + if (s->back) + { + s->next = s->back->len ? s->back : s->back->next; + break; + } + } +#ifdef KMP_BUILD_STATE + { KMP_BUILD_STATE(ctx, s); } +#endif + } +} + +#undef P +#undef KMP_CHAR +#undef KMP_SOURCE +#undef KMP_GET_CHAR +#undef KMP_NODE +#undef KMP_USE_ASCII +#undef KMP_USE_UTF8 +#undef KMP_TOLOWER +#undef KMP_UNACCENT +#undef KMP_ONLYALPHA +#undef KMP_CONTROL_CHAR +#undef KMP_ADD_EXTRA_ARGS +#undef KMP_ADD_EXTRA_VAR +#undef KMP_ADD_INIT +#undef KMP_ADD_NEW +#undef KMP_ADD_DUP +#undef KMP_NO_DUPS +#undef KMP_BUILD_STATE +#undef KMP_USE_POOL + +#ifdef KMP_WANT_SEARCH +# undef KMP_WANT_SEARCH +# define KMPS_PREFIX(x) KMP_PREFIX(x) +# define KMPS_KMP_PREFIX(x) KMP_PREFIX(x) +# include "lib/kmp-search.h" +#endif + +#undef KMP_PREFIX diff --git a/lib/kmp-search.h b/lib/kmp-search.h new file mode 100644 index 00000000..0c572b65 --- /dev/null +++ b/lib/kmp-search.h @@ -0,0 +1,179 @@ +/* + * Knuth-Morris-Pratt's Substring Search for N given strings + * + * (c) 1999--2005, Robert Spalek + * (c) 2006, Pavel Charvat + * + * (In fact, the algorithm is usually referred to as Aho-McCorasick, + * but that's just an extension of KMP to multiple strings.) + */ + +/* + * This is not a normal header file, it's a generator of KMP algorithm. + * Each time you include it with parameters set in the corresponding + * preprocessor macros, it generates KMP structures and functions + * with the parameters given. 
+ * + * [*] KMPS_PREFIX(x) macro to add a name prefix (used on all global names + * defined by the KMP search generator). + * [*] KMPS_KMP_PREFIX(x) prefix used for lib/kmp.h; + * more variants of kmp-search can be used for single lib/kmp.h + * + * KMPS_SOURCE user-defined search input (together with KMPS_GET_CHAR); + * if unset, the one from lib/kmp.h is used + * KMPS_GET_CHAR(ctx,src,s) + * + * KMPS_ADD_CONTROLS adds control characters to start and the end + * KMPS_MERGE_CONTROLS merges adjacent control characterss to a single one + * + * KMPS_EXTRA_ARGS extra arguments to the search routine + * KMPS_EXTRA_VAR extra user-defined structure in search structures + * KMPS_INIT(ctx,src,s) + * KMPS_EXIT(ctx,src,s) + * KMPS_FOUND(ctx,src,s) + * KMPS_FOUND_CHAIN(ctx,src,s) + * KMPS_STEP(ctx,src,s) + * KMPS_T + * + * KMPS_WANT_BEST + */ + +#define P(x) KMPS_PREFIX(x) +#define KP(x) KMPS_KMP_PREFIX(x) + +#ifdef KMPS_SOURCE +typedef KMPS_SOURCE P(search_source_t); +#else +typedef KP(source_t) P(search_source_t); +#endif + +#ifndef KMPS_GET_CHAR +#define KMPS_GET_CHAR(ctx,src,s) ({ KP(get_char)(ctx, &src, &s.c); }) +#endif + +struct P(search) { + struct KP(state) *s; /* current state */ + struct KP(state) *out; /* output state */ +# ifdef KMPS_WANT_BEST + struct KP(state) *best; /* largest match */ +# endif + KP(char_t) c; /* last character */ +# ifdef KMPS_EXTRA_VAR + KMPS_EXTRA_VAR v; /* user-defined */ +# endif +# ifdef KMPS_ADD_CONTROLS + uns eof; +# endif +}; + +#ifdef KMPS_T +static KMPS_T +#else +static void +#endif +P(search) (struct KP(context) *ctx, P(search_source_t) src +# ifdef KMPS_EXTRA_ARGS + , KMPS_EXTRA_ARGS +# endif +) +{ + struct P(search) s; + s.s = &ctx->null; +# ifdef KMPS_WANT_BEST + s.best = &ctx->null; +# endif +# ifdef KMPS_ADD_CONTROLS + s.c = KP(control_char)(); + s.eof = 0; +# else + s.c = 0; +# endif +# ifdef KMPS_INIT + { KMPS_INIT(ctx, src, s); } +# endif +# ifndef KMPS_ADD_CONTROLS + goto start_read; +#endif + for (;;) + { + for (struct 
KP(state) *t = s.s; t && !(s.s = KP(hash_find)(&ctx->hash, t, s.c)); t = t->back); + s.s = s.s ? : &ctx->null; + +# ifdef KMPS_STEP + { KMPS_STEP(ctx, src, s); } +# endif + +# if defined(KMPS_FOUND) || defined(KMPS_FOUND_CHAIN) || defined(KMPS_WANT_BEST) + s.out = s.s->len ? s.s : s.s->next; + if (s.out) + { +# ifdef KMPS_WANT_BEST + if (s.out->len > s.best->len) + s.best = s.out; +# endif + #ifdef KMPS_FOUND_CHAIN + { KMPS_FOUND_CHAIN(ctx, src, s); } +# endif +# ifdef KMPS_FOUND + do + { KMPS_FOUND(ctx, src, s); } + while (s.out = s.out->next); +# endif + } +# endif + +# ifdef KMPS_ADD_CONTROLS + if (unlikely(s.eof)) + break; +# endif + +# ifndef KMPS_ADD_CONTROLS +start_read: ; +# endif +# ifdef KMPS_MERGE_CONTROLS + KP(char_t) last_c = s.c; +# endif + + do + { + if (unlikely(!KMPS_GET_CHAR(ctx, src, s))) + { +# ifdef KMPS_ADD_CONTROLS + if (s.c != KP(control_char)()) + { + s.c = KP(control_char)(); + s.eof = 1; + break; + } +# endif + goto exit; + } + } + while (0 +# ifdef KMPS_MERGE_CONTROLS + || (last_c == KP(control_char)() && s.c == KP(control_char)()) +# endif + ); + } +exit: ; +# ifdef KMPS_EXIT + { KMPS_EXIT(ctx, src, s); } +# endif +} + +#undef P +#undef KMPS_PREFIX +#undef KMPS_KMP_PREFIX +#undef KMPS_SOURCE +#undef KMPS_GET_CHAR +#undef KMPS_ADD_CONTROLS +#undef KMPS_MERGE_CONTROLS +#undef KMPS_EXTRA_ARGS +#undef KMPS_EXTRA_VAR +#undef KMPS_INIT +#undef KMPS_EXIT +#undef KMPS_FOUND +#undef KMPS_FOUND_CHAIN +#undef KMPS_STEP +#undef KMPS_T +#undef KMPS_WANT_BEST diff --git a/lib/kmp-test.c b/lib/kmp-test.c new file mode 100644 index 00000000..fa07f060 --- /dev/null +++ b/lib/kmp-test.c @@ -0,0 +1,140 @@ +#include "lib/lib.h" +#include "lib/mempool.h" +#include + +#if 0 +#define TRACE(x...) do{log(L_DEBUG, x);}while(0) +#else +#define TRACE(x...) 
do{}while(0)
+#endif
+/* Variant kmp1: default parameters of the generator; the search routine
+ * returns (from the KMPS_EXIT hook) the length of the largest match found. */
+#define KMP_PREFIX(x) GLUE_(kmp1,x)
+#define KMP_WANT_CLEANUP
+#define KMP_WANT_SEARCH
+#define KMPS_WANT_BEST
+#define KMPS_T uns
+#define KMPS_EXIT(ctx,src,s) do{ return s.best->len; }while(0)
+#include "lib/kmp-new.h"
+/* Three overlapping words; the searched text contains "aho" and "hoj",
+ * but not "ahoj", so the largest match is expected to have 3 characters. */
+static void
+test1(void)
+{
+  log(L_INFO, "Running test1");
+  struct kmp1_context ctx;
+  kmp1_init(&ctx);
+  kmp1_add(&ctx, "ahoj");
+  kmp1_add(&ctx, "hoj");
+  kmp1_add(&ctx, "aho");
+  kmp1_build(&ctx);
+  UNUSED uns best = kmp1_search(&ctx, "asjlahslhalahosjkjhojsas");
+  TRACE("Best match has %d characters", best);
+  ASSERT(best == 3);
+  kmp1_cleanup(&ctx);
+}
+/* Variant kmp2: UTF-8 input with KMP_TOLOWER and KMP_ONLYALPHA; every state
+ * remembers the inserted string and its id; the search adds and merges
+ * control characters and all its hooks only trace what happens. */
+#define KMP_PREFIX(x) GLUE_(kmp2,x)
+#define KMP_USE_UTF8
+#define KMP_TOLOWER
+#define KMP_ONLYALPHA
+#define KMP_NODE struct { byte *str; uns id; }
+#define KMP_ADD_EXTRA_ARGS uns id
+#define KMP_ADD_EXTRA_VAR byte *
+#define KMP_ADD_INIT(ctx,src,var) do{ var = src; }while(0)
+#define KMP_ADD_NEW(ctx,src,var,state) do{ TRACE("Inserting string %s with id %d", var, id); \
+  state->n.str = var; state->n.id = id; }while(0)
+#define KMP_ADD_DUP(ctx,src,var,state) do{ TRACE("String %s already inserted", var); }while(0)
+#define KMP_WANT_CLEANUP
+#define KMP_WANT_SEARCH
+#define KMPS_ADD_CONTROLS
+#define KMPS_MERGE_CONTROLS
+#define KMPS_WANT_BEST
+#define KMPS_FOUND(ctx,src,s) do{ TRACE("String %s with id %d found", s.out->n.str, s.out->n.id); }while(0)
+#define KMPS_STEP(ctx,src,s) do{ TRACE("Got to state %p after reading %d", s.s, s.c); }while(0)
+#define KMPS_EXIT(ctx,src,s) do{ if (s.best->len) TRACE("Best match is %s", s.best->n.str); } while(0)
+#include "lib/kmp-new.h"
+/* Exercises duplicate insertions ("ahoj" and "aba" are added twice) and
+ * non-ASCII text; the outcome is only traced, nothing is asserted here. */
+static void
+test2(void)
+{
+  log(L_INFO, "Running test2");
+  struct kmp2_context ctx;
+  kmp2_init(&ctx);
+  kmp2_add(&ctx, "ahoj", 1);
+  kmp2_add(&ctx, "ahoj", 2);
+  kmp2_add(&ctx, "hoj", 3);
+  kmp2_add(&ctx, "aho", 4);
+  kmp2_add(&ctx, "aba", 5);
+  kmp2_add(&ctx, "aba", 5);
+  kmp2_add(&ctx, "pěl", 5);
+  kmp2_build(&ctx);
+  kmp2_search(&ctx, "Šíleně žluťoučký kůň úpěl ďábelské ódy labababaks
sdahojdhsaladsjhla");
+  kmp2_cleanup(&ctx);
+}
+/* Variant kmp3: every state stores the index of its string. A duplicate
+ * string is truncated to an empty one (*v = 0), so that the brute-force
+ * counting in test3 skips it. Every reported match decrements the expected
+ * count of its string and the total sum passed as extra arguments. */
+#define KMP_PREFIX(x) GLUE_(kmp3,x)
+#define KMP_NODE uns
+#define KMP_ADD_EXTRA_ARGS uns index
+#define KMP_ADD_EXTRA_VAR byte *
+#define KMP_ADD_INIT(ctx,src,v) do{ v = src; }while(0)
+#define KMP_ADD_NEW(ctx,src,v,s) do{ s->n = index; }while(0)
+#define KMP_ADD_DUP(ctx,src,v,s) do{ *v = 0; }while(0)
+#define KMP_WANT_CLEANUP
+#define KMP_WANT_SEARCH
+#define KMPS_EXTRA_ARGS uns *cnt, uns *sum
+#define KMPS_FOUND(ctx,src,s) do{ ASSERT(cnt[s.out->n]); cnt[s.out->n]--; sum[0]--; }while(0)
+#include "lib/kmp-new.h"
+/* Randomized cross-check: 100 rounds, each with a random dictionary over
+ * {a,b,c} and 10 random texts over {a,b,c,d}. The occurrences are counted
+ * by brute force first and the search must report exactly the same ones.
+ * NOTE(review): if random_max(100) can return 0, s[] and cnt[] become
+ * zero-length variable-length arrays -- confirm this is acceptable. */
+static void
+test3(void)
+{
+  log(L_INFO, "Running test3");
+  struct mempool *pool = mp_new(1024);
+  for (uns testn = 0; testn < 100; testn++)
+    {
+      mp_flush(pool);
+      uns n = random_max(100);
+      byte *s[n];
+      struct kmp3_context ctx;
+      kmp3_init(&ctx);
+      for (uns i = 0; i < n; i++)
+	{
+	  uns m = random_max(10);
+	  s[i] = mp_alloc(pool, m + 1);
+	  for (uns j = 0; j < m; j++)
+	    s[i][j] = 'a' + random_max(3);
+	  s[i][m] = 0;
+	  kmp3_add(&ctx, s[i], i);
+	}
+      kmp3_build(&ctx);
+      for (uns i = 0; i < 10; i++)
+	{
+	  uns m = random_max(100);
+	  byte b[m + 1];
+	  for (uns j = 0; j < m; j++)
+	    b[j] = 'a' + random_max(4);
+	  b[m] = 0;
+	  uns cnt[n], sum = 0;
+	  for (uns j = 0; j < n; j++)
+	    {
+	      cnt[j] = 0;
+	      if (*s[j])	/* skip empty strings, including the blanked duplicates */
+		for (uns k = 0; k < m; k++)
+		  if (!strncmp(b + k, s[j], strlen(s[j])))
+		    cnt[j]++, sum++;
+	    }
+	  kmp3_search(&ctx, b, cnt, &sum);
+	  ASSERT(sum == 0);	/* every expected occurrence has been reported */
+	}
+      kmp3_cleanup(&ctx);
+    }
+  mp_delete(pool);
+}
+/* Runs the three tests in sequence */
+int
+main(void)
+{
+  test1();
+  test2();
+  test3();
+  return 0;
+}