* preprocessor macros, it generates KMP structures and functions
* with the parameters given.
*
+ * This file contains only construction of the automaton. The search
+ * itself can be generated by inclusion of file lib/kmp-search.h.
+ * Separeted headers allow the user to define multiple search
+ * routines for one common set of key strings.
+ *
+ * Example:
+ *
+ * #define KMP_PREFIX(x) kmp_##x
+ * #define KMP_WANT_CLEANUP
+ * #define KMP_WANT_SEARCH // includes lib/kmp-search.h automatically
+ * #define KMPS_FOUND(kmp,src,s) printf("found\n")
+ * #include "lib/kmp.h"
+ *
+ * [...]
+ *
+ * struct kmp_struct kmp; // a structure describing the whole automaton
+ * kmp_init(&kmp); // initialization (must be called before all other functions)
+ *
+ * // add key strings we want to search
+ * kmp_add(&kmp, "aaa");
+ * kmp_add(&kmp, "abc");
+ *
+ * // complete the automaton, no more strings can be added later
+ * kmp_build(&kmp);
+ *
+ * // example of search, should print single "found" to stdout
+ * kmp_run(&kmp, "aabaabca");
+ *
+ * // destroy all internal structures
+ * kmp_cleanup(&kmp);
+ *
+ *
+ * Brief description of all parameters:
*
* Basic parameters:
* KMP_PREFIX(x) macro to add a name prefix (used on all global names
- * defined by the KMP generator); mandatory
+ * defined by the KMP generator); mandatory;
+ * we abbreviate this to P(x) below
*
* KMP_CHAR alphabet type, the default is u16
- *
- * KMP_SOURCE user-defined text source; KMP_GET_CHAR must
- * KMP_GET_CHAR(ctx,src,c) return next character from the input or zero at the end;
+ *
+ * KMP_SOURCE user-defined text source; KMP_GET_CHAR must
+ * KMP_GET_CHAR(kmp,src,c) return zero at the end or nonzero together with the next character in c otherwise;
* if not defined, zero-terminated array of bytes is used as the input
- *
- * KMP_NODE user-defined data in each state of the automaton
- * KMP_CONTEXT user-defined data in struct context (a structure describing
- * the whole automaton)
+ *
+ * KMP_VARS user-defined variables in 'struct P(struct)'
+ * -- a structure describing the whole automaton;
+ * these variables are stored in .u substructure to avoid collisions
+ * KMP_STATE_VARS user-defined variables in 'struct P(state)'
+ * -- created for each state of the automaton;
+ * these variables are stored in .u substructure to avoid collisions
*
* Parameters which select how the input is interpreted (if KMP_SOURCE is unset):
* KMP_USE_ASCII reads single bytes from the input (default)
* KMP_USE_UTF8 reads UTF-8 characters from the input (valid UTF-8 needed)
* KMP_TOLOWER converts all to lowercase
* KMP_UNACCENT removes accents
- * KMP_ONLYALPHA converts non-alphas to KMP_CONTROL_CHAR
- * KMP_CONTROL_CHAR special control character (default is ':')
+ * KMP_ONLYALPHA converts non-alphas to KMP_CONTROL_CHAR (see below)
*
- * Parameters controlling add():
- * KMP_ADD_EXTRA_ARGS extra arguments
- * KMP_ADD_EXTRA_VAR structure with extra local variables
- * KMP_ADD_INIT(ctx,src,v)
- * KMP_ADD_NEW(ctx,src,v,s)
- * KMP_ADD_DUP(ctx,src,v,s)
- * KMP_NO_DUPS no support for duplicates
+ * Parameters controlling add(kmp, src):
+ * KMP_ADD_EXTRA_ARGS extra arguments, should be used carefully because of possible collisions
+ * KMP_ADD_INIT(kmp,src) called in the beginning of add(), src is the first
+ * KMP_INIT_STATE(kmp,s) initialization of a new state s (called before KMP_ADD_{NEW,DUP});
+ * null state is not included and should be handled after init() if necessary;
+ * all user-defined data are filled by zeros before call to KMP_INIT_STATE
+ * KMP_ADD_NEW(kmp,src,s) initialize last state of every new key string (called after KMP_INIT_STATE);
+ * the string must be parsed before so src is after the last string's character
+ * KMP_ADD_DUP(kmp,src,s) analogy of KMP_ADD_NEW called for duplicates
*
* Parameters to build():
- * KMP_BUILD_STATE(ctx,s) called for all states (including null) in order of non-decreasing tree depth
+ * KMP_BUILD_STATE(kmp,s) called for all states (including null) in order of non-decreasing tree depth
*
* Other parameters:
* KMP_WANT_CLEANUP define cleanup()
* KMP_WANT_SEARCH includes lib/kmp-search.h with the same prefix;
- * there can be multiple search variants for a single KMP structure
- *
+ * there can be multiple search variants for a single KMP automaton
* KMP_USE_POOL allocates in a given pool
- *
- * KMP_GIVE_ALLOC
- * KMP_GIVE_HASHFN
- * KMP_GIVE_EQ
+ * KMP_CONTROL_CHAR special control character (default is ':')
+ * KMP_GIVE_ALLOC if set, you must supply custom allocation functions:
+ * void *alloc(unsigned int size) -- allocate space for
+ * a state. Default is pooled allocation from a local pool or HASH_USE_POOL.
+ * void free(void *) -- the converse.
+ * KMP_GIVE_HASHFN if set, you must supply custom hash function:
+ * unsigned int hash(struct P(struct) *kmp, struct P(state) *state, KMP_CHAR c);
+ * default hash function works only for integer character types
+ * KMP_GIVE_EQ if set, you must supply custom compare function of two characters:
+ * int eq(struct P(struct) *kmp, KMP_CHAR a, KMP_CHAR b);
+ * default is 'a == b'
*/
#ifndef KMP_PREFIX
typedef struct {} P(node_t);
#endif
-struct P(context);
+struct P(struct);
struct P(state) {
- struct P(state) *from; /* state with previous character */
- struct P(state) *back; /* backwards edge to the largest shorter state */
- struct P(state) *next; /* largest shorter match */
- P(len_t) len; /* largest match, zero otherwise */
- P(char_t) c; /* last character */
- P(node_t) n; /* user-defined data */
+ struct P(state) *from; /* state with the previous character (forms a tree with null state in the root) */
+ struct P(state) *back; /* backwards edge to the longest shorter state with same suffix */
+ struct P(state) *next; /* the longest of shorter matches (or NULL) */
+ P(len_t) len; /* state depth if it represents a key string, zero otherwise */
+ P(char_t) c; /* last character of the represented string */
+ struct {
+# ifdef KMP_STATE_VARS
+ KMP_STATE_VARS
+# endif
+ } u; /* user-defined data*/
};
/* Control char */
static inline P(char_t)
P(control) (void)
{
-#ifdef KMP_CONTROL_CHAR
+# ifdef KMP_CONTROL_CHAR
return KMP_CONTROL_CHAR;
-#else
+# else
return ':';
-#endif
+# endif
}
/* User-defined source */
static inline uns
P(hash_hash) (struct P(hash_table) *t, struct P(state) *f, P(char_t) c)
{
- return P(hash) ((struct P(context) *) t, f, c);
+ return P(hash) ((struct P(struct) *) t, f, c);
}
#else
static inline uns
P(hash_hash) (struct P(hash_table) *t UNUSED, struct P(state) *f, P(char_t) c)
{
- return (((uns)c) << 16) + (uns)(addr_int_t)f;
+ return (((uns)c) << 16) + (uns)(uintptr_t)f;
}
#endif
#ifndef KMP_GIVE_EQ
static inline int
-P(eq) (struct P(context) *ctx UNUSED, P(char_t) c1, P(char_t) c2)
+P(eq) (struct P(struct) *kmp UNUSED, P(char_t) c1, P(char_t) c2)
{
return c1 == c2;
}
#endif
static inline int
-P(is_control) (struct P(context) *ctx, P(char_t) c)
+P(is_control) (struct P(struct) *kmp, P(char_t) c)
{
- return P(eq) (ctx, c, P(control)());
+ return P(eq) (kmp, c, P(control)());
}
#define HASH_GIVE_EQ
static inline int
P(hash_eq) (struct P(hash_table) *t, struct P(state) *f1, P(char_t) c1, struct P(state) *f2, P(char_t) c2)
{
- return f1 == f2 && P(eq)((struct P(context) *) t, c1, c2);
+ return f1 == f2 && P(eq)((struct P(struct) *) t, c1, c2);
}
#ifdef KMP_GIVE_ALLOC
static inline void *
P(hash_alloc) (struct P(hash_table) *t, uns size)
{
- return P(alloc) ((struct P(context) *) t, size);
+ return P(alloc) ((struct P(struct) *) t, size);
}
static inline void
P(hash_free) (struct P(hash_table) *t, void *ptr)
{
- P(free) ((struct P(context) *) t, ptr);
+ P(free) ((struct P(struct) *) t, ptr);
}
#endif
P(hash_init_key) (struct P(hash_table) *t UNUSED, struct P(state) *s, struct P(state) *f, P(char_t) c)
{
bzero(s, sizeof(*s));
+# ifdef KMP_INIT_STATE
+ struct P(struct) *kmp = (struct P(struct) *)t;
+ { KMP_INIT_STATE(kmp, s); }
+# endif
s->from = f;
s->c = c;
s->next = f->back; /* the pointers hold the link-list of sons... changed in build() */
}
#undef P
-#define HASH_PREFIX(x) KMP_PREFIX(GLUE(hash_,x))
+#define HASH_PREFIX(x) KMP_PREFIX(hash_##x)
#define HASH_NODE struct KMP_PREFIX(state)
#define HASH_KEY_COMPLEX(x) x from, x c
#define HASH_KEY_DECL struct KMP_PREFIX(state) *from, KMP_PREFIX(char_t) c
#include "lib/hashtable.h"
#define P(x) KMP_PREFIX(x)
-struct P(context) {
+struct P(struct) {
struct P(hash_table) hash; /* hash table of state transitions */
struct P(state) null; /* null state */
-# ifdef KMP_CONTEXT
- KMP_CONTEXT v; /* user defined data */
-# endif
+ struct {
+# ifdef KMP_VARS
+ KMP_VARS
+# endif
+ } u; /* user-defined data */
};
#ifdef KMP_SOURCE
typedef KMP_SOURCE P(source_t);
#else
-typedef byte *P(source_t);
+typedef char *P(source_t);
#endif
#ifdef KMP_GET_CHAR
static inline int
-P(get_char) (struct P(context) *ctx UNUSED, P(source_t) *src UNUSED, P(char_t) *c UNUSED)
+P(get_char) (struct P(struct) *kmp UNUSED, P(source_t) *src UNUSED, P(char_t) *c UNUSED)
{
- return KMP_GET_CHAR(ctx, (*src), (*c));
+ return KMP_GET_CHAR(kmp, (*src), (*c));
}
#else
# if defined(KMP_USE_UTF8)
# endif
# endif
static inline int
-P(get_char) (struct P(context) *ctx UNUSED, P(source_t) *src, P(char_t) *c)
+P(get_char) (struct P(struct) *kmp UNUSED, P(source_t) *src, P(char_t) *c)
{
# ifdef KMP_USE_UTF8
uns cc;
- *src = (byte *)utf8_get(*src, &cc);
+ *src = utf8_get(*src, &cc);
# ifdef KMP_ONLYALPHA
if (!cc) {}
else if (!Ualpha(cc))
cc = P(control)();
else
-# endif
+# endif
{
# ifdef KMP_TOLOWER
cc = Utolower(cc);
#endif
static struct P(state) *
-P(add) (struct P(context) *ctx, P(source_t) src
+P(add) (struct P(struct) *kmp, P(source_t) src
# ifdef KMP_ADD_EXTRA_ARGS
, KMP_ADD_EXTRA_ARGS
# endif
)
{
-# ifdef KMP_ADD_EXTRA_VAR
- KMP_ADD_EXTRA_VAR v;
-# endif
# ifdef KMP_ADD_INIT
- { KMP_ADD_INIT(ctx, src, v); }
+ { KMP_ADD_INIT(kmp, src); }
# endif
P(char_t) c;
- if (!P(get_char)(ctx, &src, &c))
+ if (!P(get_char)(kmp, &src, &c))
return NULL;
- struct P(state) *p = &ctx->null, *s;
+ struct P(state) *p = &kmp->null, *s;
uns len = 0;
do
{
- s = P(hash_find)(&ctx->hash, p, c);
+ s = P(hash_find)(&kmp->hash, p, c);
if (!s)
for (;;)
{
- s = P(hash_new)(&ctx->hash, p, c);
+ s = P(hash_new)(&kmp->hash, p, c);
len++;
- if (!(P(get_char)(ctx, &src, &c)))
+ if (!(P(get_char)(kmp, &src, &c)))
goto enter_new;
p = s;
}
p = s;
len++;
}
- while (P(get_char)(ctx, &src, &c));
-# ifdef KMP_NO_DUPS
- ASSERT(!s->len);
-# else
+ while (P(get_char)(kmp, &src, &c));
if (s->len)
{
# ifdef KMP_ADD_DUP
- { KMP_ADD_DUP(ctx, src, v, s); }
+ { KMP_ADD_DUP(kmp, src, s); }
# endif
return s;
}
-# endif
enter_new:
s->len = len;
# ifdef KMP_ADD_NEW
- { KMP_ADD_NEW(ctx, src, v, s); }
+ { KMP_ADD_NEW(kmp, src, s); }
# endif
return s;
}
static void
-P(init) (struct P(context) *ctx)
+P(init) (struct P(struct) *kmp)
{
- bzero(&ctx->null, sizeof(struct P(state)));
- P(hash_init)(&ctx->hash);
+ bzero(&kmp->null, sizeof(struct P(state)));
+ P(hash_init)(&kmp->hash);
}
#ifdef KMP_WANT_CLEANUP
static inline void
-P(cleanup) (struct P(context) *ctx)
+P(cleanup) (struct P(struct) *kmp)
{
- P(hash_cleanup)(&ctx->hash);
+ P(hash_cleanup)(&kmp->hash);
}
#endif
static inline int
-P(empty) (struct P(context) *ctx)
+P(empty) (struct P(struct) *kmp)
{
- return !ctx->hash.hash_count;
+ return !kmp->hash.hash_count;
}
static inline struct P(state) *
}
static void
-P(build) (struct P(context) *ctx)
+P(build) (struct P(struct) *kmp)
{
- if (P(empty)(ctx))
+ if (P(empty)(kmp))
return;
uns read = 0, write = 0;
- struct P(state) *fifo[ctx->hash.hash_count], *null = &ctx->null;
+ struct P(state) *fifo[kmp->hash.hash_count], *null = &kmp->null;
for (struct P(state) *s = null->back; s; s = s->next)
fifo[write++] = s;
null->back = NULL;
# ifdef KMP_BUILD_STATE
- { KMP_BUILD_STATE(ctx, null); }
-# endif
+ { KMP_BUILD_STATE(kmp, null); }
+# endif
while (read != write)
{
struct P(state) *s = fifo[read++], *t;
s->next = NULL;
break;
}
- s->back = P(hash_find)(&ctx->hash, t, s->c);
+ s->back = P(hash_find)(&kmp->hash, t, s->c);
if (s->back)
{
s->next = s->back->len ? s->back : s->back->next;
}
}
# ifdef KMP_BUILD_STATE
- { KMP_BUILD_STATE(ctx, s); }
-# endif
+ { KMP_BUILD_STATE(kmp, s); }
+# endif
}
}
#undef KMP_CHAR
#undef KMP_SOURCE
#undef KMP_GET_CHAR
-#undef KMP_NODE
+#undef KMP_VARS
+#undef KMP_STATE_VARS
#undef KMP_CONTEXT
#undef KMP_USE_ASCII
#undef KMP_USE_UTF8
#undef KMP_ONLYALPHA
#undef KMP_CONTROL_CHAR
#undef KMP_ADD_EXTRA_ARGS
-#undef KMP_ADD_EXTRA_VAR
#undef KMP_ADD_INIT
#undef KMP_ADD_NEW
#undef KMP_ADD_DUP
-#undef KMP_NO_DUPS
+#undef KMP_INIT_STATE
#undef KMP_BUILD_STATE
#undef KMP_USE_POOL
#undef KMP_GIVE_ALLOC