X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=lib%2Fkmp.h;h=aa889f694d08b25209b71cc17970af7ad8144806;hb=8977e6ac284666f602be2a8d954842c343854e41;hp=2ce01b3a910e0dee29505b59c1d041c5e6bf5b3f;hpb=42b76ab379f0419d9521c9028ca41ae82b6ce890;p=libucw.git diff --git a/lib/kmp.h b/lib/kmp.h index 2ce01b3a..aa889f69 100644 --- a/lib/kmp.h +++ b/lib/kmp.h @@ -14,50 +14,94 @@ * preprocessor macros, it generates KMP structures and functions * with the parameters given. * + * This file contains only construction of the automaton. The search + * itself can be generated by inclusion of file lib/kmp-search.h. + * Separeted headers allow the user to define multiple search + * routines for one common set of key strings. + * + * Example: + * + * #define KMP_PREFIX(x) kmp_##x + * #define KMP_WANT_CLEANUP + * #define KMP_WANT_SEARCH // includes lib/kmp-search.h automatically + * #define KMPS_FOUND(kmp,src,s) printf("found\n") + * #include "lib/kmp.h" + * + * [...] + * + * struct kmp_struct kmp; // a structure describing the whole automaton + * kmp_init(&kmp); // initialization (must be called before all other functions) + * + * // add key strings we want to search + * kmp_add(&kmp, "aaa"); + * kmp_add(&kmp, "abc"); + * + * // complete the automaton, no more strings can be added later + * kmp_build(&kmp); + * + * // example of search, should print single "found" to stdout + * kmp_run(&kmp, "aabaabca"); + * + * // destroy all internal structures + * kmp_cleanup(&kmp); + * + * + * Brief description of all parameters: * * Basic parameters: * KMP_PREFIX(x) macro to add a name prefix (used on all global names - * defined by the KMP generator); mandatory + * defined by the KMP generator); mandatory; + * we abbreviate this to P(x) below * * KMP_CHAR alphabet type, the default is u16 - * - * KMP_SOURCE user-defined text source; KMP_GET_CHAR must - * KMP_GET_CHAR(ctx,src,c) return next character from the input or zero at the end; + * + * KMP_SOURCE user-defined text source; KMP_GET_CHAR must + * KMP_GET_CHAR(kmp,src,c) return zero at the end or nonzero together with the next character in c otherwise; * if not defined, zero-terminated array of bytes is used as the input - * - * KMP_NODE user-defined data in each state of the automaton - * KMP_CONTEXT user-defined data in struct context (a structure describing - * the whole automaton) + * + * KMP_VARS user-defined variables in 'struct P(struct)' + * -- a structure describing the whole automaton; + * these variables are stored in .u substructure to avoid collisions + * KMP_STATE_VARS user-defined variables in 'struct P(state)' + * -- created for each state of the automaton; + * these variables are stored in .u substructure to avoid collisions * * Parameters which select how the input is interpreted (if KMP_SOURCE is unset): * KMP_USE_ASCII reads single bytes from the input (default) * KMP_USE_UTF8 reads UTF-8 characters from the input (valid UTF-8 needed) * KMP_TOLOWER converts all to lowercase * KMP_UNACCENT removes accents - * KMP_ONLYALPHA converts non-alphas to KMP_CONTROL_CHAR - * KMP_CONTROL_CHAR special control character (default is ':') + * KMP_ONLYALPHA converts non-alphas to KMP_CONTROL_CHAR (see below) * - * Parameters controlling add(): - * KMP_ADD_EXTRA_ARGS extra arguments - * KMP_ADD_EXTRA_VAR structure with extra local variables - * KMP_ADD_INIT(ctx,src,v) - * KMP_ADD_NEW(ctx,src,v,s) - * KMP_ADD_DUP(ctx,src,v,s) - * KMP_NO_DUPS no support for duplicates + * Parameters controlling add(kmp, src): + * KMP_ADD_EXTRA_ARGS extra arguments, should be used carefully because of possible collisions + * KMP_ADD_INIT(kmp,src) called in the beginning of add(), src is the first + * KMP_INIT_STATE(kmp,s) initialization of a new state s (called before KMP_ADD_{NEW,DUP}); + * null state is not included and should be handled after init() if necessary; + * all user-defined data are filled by zeros before call to KMP_INIT_STATE + * KMP_ADD_NEW(kmp,src,s) initialize last state of every new key string (called after KMP_INIT_STATE); + * the string must be parsed before so src is after the last string's character + * KMP_ADD_DUP(kmp,src,s) analogy of KMP_ADD_NEW called for duplicates * * Parameters to build(): - * KMP_BUILD_STATE(ctx,s) called for all states (including null) in order of non-decreasing tree depth + * KMP_BUILD_STATE(kmp,s) called for all states (including null) in order of non-decreasing tree depth * * Other parameters: * KMP_WANT_CLEANUP define cleanup() * KMP_WANT_SEARCH includes lib/kmp-search.h with the same prefix; - * there can be multiple search variants for a single KMP structure - * + * there can be multiple search variants for a single KMP automaton * KMP_USE_POOL allocates in a given pool - * - * KMP_GIVE_ALLOC - * KMP_GIVE_HASHFN - * KMP_GIVE_EQ + * KMP_CONTROL_CHAR special control character (default is ':') + * KMP_GIVE_ALLOC if set, you must supply custom allocation functions: + * void *alloc(unsigned int size) -- allocate space for + * a state. Default is pooled allocation from a local pool or HASH_USE_POOL. + * void free(void *) -- the converse. + * KMP_GIVE_HASHFN if set, you must supply custom hash function: + * unsigned int hash(struct P(struct) *kmp, struct P(state) *state, KMP_CHAR c); + * default hash function works only for integer character types + * KMP_GIVE_EQ if set, you must supply custom compare function of two characters: + * int eq(struct P(struct) *kmp, KMP_CHAR a, KMP_CHAR b); + * default is 'a == b' */ #ifndef KMP_PREFIX @@ -84,26 +128,30 @@ typedef KMP_NODE P(node_t); typedef struct {} P(node_t); #endif -struct P(context); +struct P(struct); struct P(state) { - struct P(state) *from; /* state with previous character */ - struct P(state) *back; /* backwards edge to the largest shorter state */ - struct P(state) *next; /* largest shorter match */ - P(len_t) len; /* largest match, zero otherwise */ - P(char_t) c; /* last character */ - P(node_t) n; /* user-defined data */ + struct P(state) *from; /* state with the previous character (forms a tree with null state in the root) */ + struct P(state) *back; /* backwards edge to the longest shorter state with same suffix */ + struct P(state) *next; /* the longest of shorter matches (or NULL) */ + P(len_t) len; /* state depth if it represents a key string, zero otherwise */ + P(char_t) c; /* last character of the represented string */ + struct { +# ifdef KMP_STATE_VARS + KMP_STATE_VARS +# endif + } u; /* user-defined data*/ }; /* Control char */ static inline P(char_t) P(control) (void) { -#ifdef KMP_CONTROL_CHAR +# ifdef KMP_CONTROL_CHAR return KMP_CONTROL_CHAR; -#else +# else return ':'; -#endif +# endif } /* User-defined source */ @@ -114,35 +162,35 @@ struct P(hash_table); static inline uns P(hash_hash) (struct P(hash_table) *t, struct P(state) *f, P(char_t) c) { - return P(hash) ((struct P(context) *) t, f, c); + return P(hash) ((struct P(struct) *) t, f, c); } #else static inline uns P(hash_hash) (struct P(hash_table) *t UNUSED, struct P(state) *f, P(char_t) c) { - return (((uns)c) << 16) + (uns)(addr_int_t)f; + return (((uns)c) << 16) + (uns)(uintptr_t)f; } #endif #ifndef KMP_GIVE_EQ static inline int -P(eq) (struct P(context) *ctx UNUSED, P(char_t) c1, P(char_t) c2) +P(eq) (struct P(struct) *kmp UNUSED, P(char_t) c1, P(char_t) c2) { return c1 == c2; } #endif static inline int -P(is_control) (struct P(context) *ctx, P(char_t) c) +P(is_control) (struct P(struct) *kmp, P(char_t) c) { - return P(eq) (ctx, c, P(control)()); + return P(eq) (kmp, c, P(control)()); } #define HASH_GIVE_EQ static inline int P(hash_eq) (struct P(hash_table) *t, struct P(state) *f1, P(char_t) c1, struct P(state) *f2, P(char_t) c2) { - return f1 == f2 && P(eq)((struct P(context) *) t, c1, c2); + return f1 == f2 && P(eq)((struct P(struct) *) t, c1, c2); } #ifdef KMP_GIVE_ALLOC @@ -150,13 +198,13 @@ P(hash_eq) (struct P(hash_table) *t, struct P(state) *f1, P(char_t) c1, struct P static inline void * P(hash_alloc) (struct P(hash_table) *t, uns size) { - return P(alloc) ((struct P(context) *) t, size); + return P(alloc) ((struct P(struct) *) t, size); } static inline void P(hash_free) (struct P(hash_table) *t, void *ptr) { - P(free) ((struct P(context) *) t, ptr); + P(free) ((struct P(struct) *) t, ptr); } #endif @@ -165,6 +213,10 @@ static inline void P(hash_init_key) (struct P(hash_table) *t UNUSED, struct P(state) *s, struct P(state) *f, P(char_t) c) { bzero(s, sizeof(*s)); +# ifdef KMP_INIT_STATE + struct P(struct) *kmp = (struct P(struct) *)t; + { KMP_INIT_STATE(kmp, s); } +# endif s->from = f; s->c = c; s->next = f->back; /* the pointers hold the link-list of sons... changed in build() */ @@ -172,7 +224,7 @@ P(hash_init_key) (struct P(hash_table) *t UNUSED, struct P(state) *s, struct P(s } #undef P -#define HASH_PREFIX(x) KMP_PREFIX(GLUE(hash_,x)) +#define HASH_PREFIX(x) KMP_PREFIX(hash_##x) #define HASH_NODE struct KMP_PREFIX(state) #define HASH_KEY_COMPLEX(x) x from, x c #define HASH_KEY_DECL struct KMP_PREFIX(state) *from, KMP_PREFIX(char_t) c @@ -191,12 +243,14 @@ P(hash_init_key) (struct P(hash_table) *t UNUSED, struct P(state) *s, struct P(s #include "lib/hashtable.h" #define P(x) KMP_PREFIX(x) -struct P(context) { +struct P(struct) { struct P(hash_table) hash; /* hash table of state transitions */ struct P(state) null; /* null state */ -# ifdef KMP_CONTEXT - KMP_CONTEXT v; /* user defined data */ -# endif + struct { +# ifdef KMP_VARS + KMP_VARS +# endif + } u; /* user-defined data */ }; #ifdef KMP_SOURCE @@ -207,9 +261,9 @@ typedef byte *P(source_t); #ifdef KMP_GET_CHAR static inline int -P(get_char) (struct P(context) *ctx UNUSED, P(source_t) *src UNUSED, P(char_t) *c UNUSED) +P(get_char) (struct P(struct) *kmp UNUSED, P(source_t) *src UNUSED, P(char_t) *c UNUSED) { - return KMP_GET_CHAR(ctx, (*src), (*c)); + return KMP_GET_CHAR(kmp, (*src), (*c)); } #else # if defined(KMP_USE_UTF8) @@ -223,7 +277,7 @@ P(get_char) (struct P(context) *ctx UNUSED, P(source_t) *src UNUSED, P(char_t) * # endif # endif static inline int -P(get_char) (struct P(context) *ctx UNUSED, P(source_t) *src, P(char_t) *c) +P(get_char) (struct P(struct) *kmp UNUSED, P(source_t) *src, P(char_t) *c) { # ifdef KMP_USE_UTF8 uns cc; @@ -233,7 +287,7 @@ P(get_char) (struct P(context) *ctx UNUSED, P(source_t) *src, P(char_t) *c) else if (!Ualpha(cc)) cc = P(control)(); else -# endif +# endif { # ifdef KMP_TOLOWER cc = Utolower(cc); @@ -263,78 +317,71 @@ P(get_char) (struct P(context) *ctx UNUSED, P(source_t) *src, P(char_t) *c) #endif static struct P(state) * -P(add) (struct P(context) *ctx, P(source_t) src +P(add) (struct P(struct) *kmp, P(source_t) src # ifdef KMP_ADD_EXTRA_ARGS , KMP_ADD_EXTRA_ARGS # endif ) { -# ifdef KMP_ADD_EXTRA_VAR - KMP_ADD_EXTRA_VAR v; -# endif # ifdef KMP_ADD_INIT - { KMP_ADD_INIT(ctx, src, v); } + { KMP_ADD_INIT(kmp, src); } # endif P(char_t) c; - if (!P(get_char)(ctx, &src, &c)) + if (!P(get_char)(kmp, &src, &c)) return NULL; - struct P(state) *p = &ctx->null, *s; + struct P(state) *p = &kmp->null, *s; uns len = 0; do { - s = P(hash_find)(&ctx->hash, p, c); + s = P(hash_find)(&kmp->hash, p, c); if (!s) for (;;) { - s = P(hash_new)(&ctx->hash, p, c); + s = P(hash_new)(&kmp->hash, p, c); len++; - if (!(P(get_char)(ctx, &src, &c))) + if (!(P(get_char)(kmp, &src, &c))) goto enter_new; p = s; } p = s; len++; } - while (P(get_char)(ctx, &src, &c)); -# ifdef KMP_NO_DUPS - ASSERT(!s->len); -# else + while (P(get_char)(kmp, &src, &c)); if (s->len) { # ifdef KMP_ADD_DUP - { KMP_ADD_DUP(ctx, src, v, s); } + { KMP_ADD_DUP(kmp, src, s); } # endif return s; } -# endif enter_new: s->len = len; # ifdef KMP_ADD_NEW - { KMP_ADD_NEW(ctx, src, v, s); } + { KMP_ADD_NEW(kmp, src, s); } # endif return s; } static void -P(init) (struct P(context) *ctx) +P(init) (struct P(struct) *kmp) { - bzero(&ctx->null, sizeof(struct P(state))); - P(hash_init)(&ctx->hash); + bzero(&kmp->null, sizeof(struct P(state))); + P(hash_init)(&kmp->hash); } #ifdef KMP_WANT_CLEANUP static inline void -P(cleanup) (struct P(context) *ctx) +P(cleanup) (struct P(struct) *kmp) { - P(hash_cleanup)(&ctx->hash); + P(hash_cleanup)(&kmp->hash); } #endif static inline int -P(empty) (struct P(context) *ctx) +P(empty) (struct P(struct) *kmp) { - return !ctx->hash.hash_count; + return !kmp->hash.hash_count; } static inline struct P(state) * @@ -344,18 +391,18 @@ P(chain_start) (struct P(state) *s) } static void -P(build) (struct P(context) *ctx) +P(build) (struct P(struct) *kmp) { - if (P(empty)(ctx)) + if (P(empty)(kmp)) return; uns read = 0, write = 0; - struct P(state) *fifo[ctx->hash.hash_count], *null = &ctx->null; + struct P(state) *fifo[kmp->hash.hash_count], *null = &kmp->null; for (struct P(state) *s = null->back; s; s = s->next) fifo[write++] = s; null->back = NULL; # ifdef KMP_BUILD_STATE - { KMP_BUILD_STATE(ctx, null); } -# endif + { KMP_BUILD_STATE(kmp, null); } +# endif while (read != write) { struct P(state) *s = fifo[read++], *t; @@ -369,7 +416,7 @@ P(build) (struct P(context) *ctx) s->next = NULL; break; } - s->back = P(hash_find)(&ctx->hash, t, s->c); + s->back = P(hash_find)(&kmp->hash, t, s->c); if (s->back) { s->next = s->back->len ? s->back : s->back->next; @@ -377,8 +424,8 @@ P(build) (struct P(context) *ctx) } } # ifdef KMP_BUILD_STATE - { KMP_BUILD_STATE(ctx, s); } -# endif + { KMP_BUILD_STATE(kmp, s); } +# endif } } @@ -386,7 +433,8 @@ P(build) (struct P(context) *ctx) #undef KMP_CHAR #undef KMP_SOURCE #undef KMP_GET_CHAR -#undef KMP_NODE +#undef KMP_VARS +#undef KMP_STATE_VARS #undef KMP_CONTEXT #undef KMP_USE_ASCII #undef KMP_USE_UTF8 @@ -395,11 +443,10 @@ P(build) (struct P(context) *ctx) #undef KMP_ONLYALPHA #undef KMP_CONTROL_CHAR #undef KMP_ADD_EXTRA_ARGS -#undef KMP_ADD_EXTRA_VAR #undef KMP_ADD_INIT #undef KMP_ADD_NEW #undef KMP_ADD_DUP -#undef KMP_NO_DUPS +#undef KMP_INIT_STATE #undef KMP_BUILD_STATE #undef KMP_USE_POOL #undef KMP_GIVE_ALLOC