+++ /dev/null
-/*
- * Knuth-Morris-Pratt's Substring Search for N given strings
- *
- * (c) 1999--2005, Robert Spalek <robert@ucw.cz>
- * (c) 2006, Pavel Charvat <pchar@ucw.cz>
- *
- * (In fact, the algorithm is usually referred to as Aho-McCorasick,
- * but that's just an extension of KMP to multiple strings.)
- */
-
-/*
- * This is not a normal header file, it's a generator of KMP algorithm.
- * Each time you include it with parameters set in the corresponding
- * preprocessor macros, it generates KMP structures and functions
- * with the parameters given.
- *
- *
- * Basic parameters:
- * KMP_PREFIX(x) macro to add a name prefix (used on all global names
- * defined by the KMP generator); mandatory
- *
- * KMP_CHAR alphabet type, the default is u16
- *
- * KMP_SOURCE user-defined text source; KMP_GET_CHAR must
- * KMP_GET_CHAR(ctx,src,c) return next character from the input or zero at the end;
- * if not defined, zero-terminated array of bytes is used as the input
- *
- * KMP_NODE user-defined data in each state of the automaton
- * KMP_CONTEXT user-defined data in struct context (a structure describing
- * the whole automaton)
- *
- * Parameters which select how the input is interpreted (if KMP_SOURCE is unset):
- * KMP_USE_ASCII reads single bytes from the input (default)
- * KMP_USE_UTF8 reads UTF-8 characters from the input (valid UTF-8 needed)
- * KMP_TOLOWER converts all to lowercase
- * KMP_UNACCENT removes accents
- * KMP_ONLYALPHA converts non-alphas to KMP_CONTROL_CHAR
- * KMP_CONTROL_CHAR special control character (default is ':')
- *
- * Parameters controlling add():
- * KMP_ADD_EXTRA_ARGS extra arguments
- * KMP_ADD_EXTRA_VAR structure with extra local variables
- * KMP_ADD_INIT(ctx,src,v)
- * KMP_ADD_NEW(ctx,src,v,s)
- * KMP_ADD_DUP(ctx,src,v,s)
- * KMP_NO_DUPS no support for duplicates
- *
- * Parameters to build():
- * KMP_BUILD_STATE(ctx,s) called for all states (including null) in order of non-decreasing tree depth
- *
- * Other parameters:
- * KMP_WANT_CLEANUP define cleanup()
- * KMP_WANT_SEARCH includes lib/kmp-search.h with the same prefix;
- * there can be multiple search variants for a single KMP structure
- *
- * KMP_USE_POOL allocates in a given pool
- *
- * KMP_GIVE_ALLOC
- * KMP_GIVE_HASHFN
- * KMP_GIVE_EQ
- */
-
-#ifndef KMP_PREFIX
-#error Missing KMP_PREFIX
-#endif
-
-#include "lib/mempool.h"
-#include <alloca.h>
-#include <string.h>
-
-#define P(x) KMP_PREFIX(x)
-
-#ifdef KMP_CHAR
-typedef KMP_CHAR P(char_t);
-#else
-typedef u16 P(char_t);
-#endif
-
-typedef u32 P(len_t);
-
-#ifdef KMP_NODE
-typedef KMP_NODE P(node_t);
-#else
-typedef struct {} P(node_t);
-#endif
-
-struct P(context);
-
-struct P(state) {
- struct P(state) *from; /* state with previous character */
- struct P(state) *back; /* backwards edge to the largest shorter state */
- struct P(state) *next; /* largest shorter match */
- P(len_t) len; /* largest match, zero otherwise */
- P(char_t) c; /* last character */
- P(node_t) n; /* user-defined data */
-};
-
-/* Control char */
-static inline P(char_t)
-P(control) (void)
-{
-#ifdef KMP_CONTROL_CHAR
- return KMP_CONTROL_CHAR;
-#else
- return ':';
-#endif
-}
-
-/* User-defined source */
-struct P(hash_table);
-
-#define HASH_GIVE_HASHFN
-#ifdef KMP_GIVE_HASHFN
-static inline uns
-P(hash_hash) (struct P(hash_table) *t, struct P(state) *f, P(char_t) c)
-{
- return P(hash) ((struct P(context) *) t, f, c);
-}
-#else
-static inline uns
-P(hash_hash) (struct P(hash_table) *t UNUSED, struct P(state) *f, P(char_t) c)
-{
- return (((uns)c) << 16) + (uns)(addr_int_t)f;
-}
-#endif
-
-#ifndef KMP_GIVE_EQ
-static inline int
-P(eq) (struct P(context) *ctx UNUSED, P(char_t) c1, P(char_t) c2)
-{
- return c1 == c2;
-}
-#endif
-
-static inline int
-P(is_control) (struct P(context) *ctx, P(char_t) c)
-{
- return P(eq) (ctx, c, P(control)());
-}
-
-#define HASH_GIVE_EQ
-static inline int
-P(hash_eq) (struct P(hash_table) *t, struct P(state) *f1, P(char_t) c1, struct P(state) *f2, P(char_t) c2)
-{
- return f1 == f2 && P(eq)((struct P(context) *) t, c1, c2);
-}
-
-#ifdef KMP_GIVE_ALLOC
-#define HASH_GIVE_ALLOC
-static inline void *
-P(hash_alloc) (struct P(hash_table) *t, uns size)
-{
- return P(alloc) ((struct P(context) *) t, size);
-}
-
-static inline void
-P(hash_free) (struct P(hash_table) *t, void *ptr)
-{
- P(free) ((struct P(context) *) t, ptr);
-}
-#endif
-
-#define HASH_GIVE_INIT_KEY
-static inline void
-P(hash_init_key) (struct P(hash_table) *t UNUSED, struct P(state) *s, struct P(state) *f, P(char_t) c)
-{
- bzero(s, sizeof(*s));
- s->from = f;
- s->c = c;
- s->next = f->back; /* the pointers hold the link-list of sons... changed in build() */
- f->back = s;
-}
-
-#undef P
-#define HASH_PREFIX(x) KMP_PREFIX(GLUE(hash_,x))
-#define HASH_NODE struct KMP_PREFIX(state)
-#define HASH_KEY_COMPLEX(x) x from, x c
-#define HASH_KEY_DECL struct KMP_PREFIX(state) *from, KMP_PREFIX(char_t) c
-#define HASH_WANT_NEW
-#define HASH_WANT_FIND
-#ifdef KMP_WANT_CLEANUP
-#define HASH_WANT_CLEANUP
-#endif
-#if defined(KMP_USE_POOL)
-#define HASH_USE_POOL KMP_USE_POOL
-#else
-#define HASH_AUTO_POOL 4096
-#endif
-#define HASH_CONSERVE_SPACE
-#define HASH_TABLE_DYNAMIC
-#include "lib/hashtable.h"
-#define P(x) KMP_PREFIX(x)
-
-struct P(context) {
- struct P(hash_table) hash; /* hash table of state transitions */
- struct P(state) null; /* null state */
-# ifdef KMP_CONTEXT
- KMP_CONTEXT v; /* user defined data */
-# endif
-};
-
-#ifdef KMP_SOURCE
-typedef KMP_SOURCE P(source_t);
-#else
-typedef byte *P(source_t);
-#endif
-
-#ifdef KMP_GET_CHAR
-static inline int
-P(get_char) (struct P(context) *ctx UNUSED, P(source_t) *src UNUSED, P(char_t) *c UNUSED)
-{
- return KMP_GET_CHAR(ctx, (*src), (*c));
-}
-#else
-# if defined(KMP_USE_UTF8)
-# include "lib/unicode.h"
-# if defined(KMP_ONLYALPHA) || defined(KMP_TOLOWER) || defined(KMP_UNACCENT)
-# include "charset/unicat.h"
-# endif
-# elif defined(KMP_USE_ASCII)
-# if defined(KMP_ONLYALPHA) || defined(KMP_TOLOWER)
-# include "lib/chartype.h"
-# endif
-# endif
-static inline int
-P(get_char) (struct P(context) *ctx UNUSED, P(source_t) *src, P(char_t) *c)
-{
-# ifdef KMP_USE_UTF8
- uns cc;
- *src = (byte *)utf8_get(*src, &cc);
-# ifdef KMP_ONLYALPHA
- if (!cc) {}
- else if (!Ualpha(cc))
- cc = P(control)();
- else
-# endif
- {
-# ifdef KMP_TOLOWER
- cc = Utolower(cc);
-# endif
-# ifdef KMP_UNACCENT
- cc = Uunaccent(cc);
-# endif
- }
-# else
- uns cc = *(*src)++;
-# ifdef KMP_ONLYALPHA
- if (!cc) {}
- else if (!Calpha(cc))
- cc = P(control)();
- else
-# endif
-# ifdef KMP_TOLOWER
- cc = Clocase(cc);
-# endif
-# ifdef KMP_UNACCENT
-# error Do not know how to unaccent ASCII characters
-# endif
-# endif
- *c = cc;
- return !!cc;
-}
-#endif
-
-static struct P(state) *
-P(add) (struct P(context) *ctx, P(source_t) src
-# ifdef KMP_ADD_EXTRA_ARGS
- , KMP_ADD_EXTRA_ARGS
-# endif
-)
-{
-# ifdef KMP_ADD_EXTRA_VAR
- KMP_ADD_EXTRA_VAR v;
-# endif
-# ifdef KMP_ADD_INIT
- { KMP_ADD_INIT(ctx, src, v); }
-# endif
-
- P(char_t) c;
- if (!P(get_char)(ctx, &src, &c))
- return NULL;
- struct P(state) *p = &ctx->null, *s;
- uns len = 0;
- do
- {
- s = P(hash_find)(&ctx->hash, p, c);
- if (!s)
- for (;;)
- {
- s = P(hash_new)(&ctx->hash, p, c);
- len++;
- if (!(P(get_char)(ctx, &src, &c)))
- goto enter_new;
- p = s;
- }
- p = s;
- len++;
- }
- while (P(get_char)(ctx, &src, &c));
-# ifdef KMP_NO_DUPS
- ASSERT(!s->len);
-# else
- if (s->len)
- {
-# ifdef KMP_ADD_DUP
- { KMP_ADD_DUP(ctx, src, v, s); }
-# endif
- return s;
- }
-# endif
-enter_new:
- s->len = len;
-# ifdef KMP_ADD_NEW
- { KMP_ADD_NEW(ctx, src, v, s); }
-# endif
- return s;
-}
-
-static void
-P(init) (struct P(context) *ctx)
-{
- bzero(&ctx->null, sizeof(struct P(state)));
- P(hash_init)(&ctx->hash);
-}
-
-#ifdef KMP_WANT_CLEANUP
-static inline void
-P(cleanup) (struct P(context) *ctx)
-{
- P(hash_cleanup)(&ctx->hash);
-}
-#endif
-
-static inline int
-P(empty) (struct P(context) *ctx)
-{
- return !ctx->hash.hash_count;
-}
-
-static inline struct P(state) *
-P(chain_start) (struct P(state) *s)
-{
- return s->len ? s : s->next;
-}
-
-static void
-P(build) (struct P(context) *ctx)
-{
- if (P(empty)(ctx))
- return;
- uns read = 0, write = 0;
- struct P(state) *fifo[ctx->hash.hash_count], *null = &ctx->null;
- for (struct P(state) *s = null->back; s; s = s->next)
- fifo[write++] = s;
- null->back = NULL;
-# ifdef KMP_BUILD_STATE
- { KMP_BUILD_STATE(ctx, null); }
-# endif
- while (read != write)
- {
- struct P(state) *s = fifo[read++], *t;
- for (t = s->back; t; t = t->next)
- fifo[write++] = t;
- for (t = s->from->back; 1; t = t->back)
- {
- if (!t)
- {
- s->back = null;
- s->next = NULL;
- break;
- }
- s->back = P(hash_find)(&ctx->hash, t, s->c);
- if (s->back)
- {
- s->next = s->back->len ? s->back : s->back->next;
- break;
- }
- }
-# ifdef KMP_BUILD_STATE
- { KMP_BUILD_STATE(ctx, s); }
-# endif
- }
-}
-
-#undef P
-#undef KMP_CHAR
-#undef KMP_SOURCE
-#undef KMP_GET_CHAR
-#undef KMP_NODE
-#undef KMP_CONTEXT
-#undef KMP_USE_ASCII
-#undef KMP_USE_UTF8
-#undef KMP_TOLOWER
-#undef KMP_UNACCENT
-#undef KMP_ONLYALPHA
-#undef KMP_CONTROL_CHAR
-#undef KMP_ADD_EXTRA_ARGS
-#undef KMP_ADD_EXTRA_VAR
-#undef KMP_ADD_INIT
-#undef KMP_ADD_NEW
-#undef KMP_ADD_DUP
-#undef KMP_NO_DUPS
-#undef KMP_BUILD_STATE
-#undef KMP_USE_POOL
-#undef KMP_GIVE_ALLOC
-#undef KMP_GIVE_HASHFN
-#undef KMP_GIVE_EQ
-
-#ifdef KMP_WANT_SEARCH
-# undef KMP_WANT_SEARCH
-# define KMPS_PREFIX(x) KMP_PREFIX(x)
-# define KMPS_KMP_PREFIX(x) KMP_PREFIX(x)
-# include "lib/kmp-search.h"
-#endif
-
-#undef KMP_PREFIX
+++ /dev/null
-/*
- * Knuth-Morris-Pratt's Substring Search for N given strings
- *
- * (c) 1999--2005, Robert Spalek <robert@ucw.cz>
- */
-
-#include "lib/lib.h"
-#include "lib/bitops.h"
-#include "lib/mempool.h"
-#include "lib/lists.h"
-#include "lib/unicode.h"
-
-#define KMP_GET_CHAR(pos, c, flags) ASSERT(0)
-#include "lib/kmp.h"
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <alloca.h>
-
-#define TRACE(level, mask...) if (0) fprintf(stderr, mask)
-
-struct kmp *
-kmp_new(struct mempool *mp, int words_len, uns modify_flags)
-{
- struct kmp *kmp = mp_alloc_zero(mp, sizeof(struct kmp));
- kmp->mp = mp;
- kmp->modify_flags = modify_flags;
- kmp->words_len = words_len;
- int size = words_len;
- kmp->g.count = 1;
- kmp->g.size = size;
- kmp->g.sons = mp_alloc_zero(mp, size * sizeof(struct list));
- init_list(kmp->g.sons + 0);
- if (words_len > 1)
- size = words_len * bit_fls(words_len);
- else
- size = 1;
- kmp->g.hash_size = size;
- kmp->g.chain = mp_alloc_zero(mp, size * sizeof(struct kmp_transition *));
- kmp->f = mp_alloc_zero(mp, words_len * sizeof(kmp_state_t));
- kmp->out = mp_alloc_zero(mp, words_len * sizeof(struct kmp_output *));
- return kmp;
-}
-
-/*
- * The only merge operation is that son includes output of his father (and also
- * his father,...), so we can merge the link-lists.
- */
-static void
-merge_output(struct kmp_output **target, struct kmp_output *src)
-{
- while (*target)
- target = &(*target)->next;
- *target = src;
-}
-
-static struct kmp_output *
-new_output(struct kmp *kmp, uns id, uns len)
-{
- struct kmp_output *out = mp_alloc(kmp->mp, sizeof(struct kmp_output));
- out->next = NULL;
- out->id = id;
- out->len = len;
- return out;
-}
-
-void
-kmp_enter_raw_string(struct kmp *kmp, kmp_char_t *str, uns id)
-{
- struct kmp_transition tr = { .next=NULL, .from=0 }, **prev;
- struct kmp_output *new_out;
- uns len = 0;
- kmp_char_t c = 'a';
-
- TRACE(20, "kmp.c: Entering string");
- c = *str++;
- len++;
- if (!c)
- return;
- while (c)
- {
- tr.c = c;
- prev = transition_search(&kmp->g, &tr);
- if (!*prev)
- break;
- tr.from = (*prev)->to;
- c = *str++;
- len++;
- }
- while (c)
- {
- *prev = mp_alloc_zero(kmp->mp, sizeof(struct kmp_transition));
- tr.to = kmp->g.count++;
- **prev = tr;
- add_tail(kmp->g.sons + tr.from, &(*prev)->n);
- init_list(kmp->g.sons + tr.to);
- c = *str++;
- len++;
- tr.from = tr.to;
- tr.c = c;
- prev = transition_search(&kmp->g, &tr);
- ASSERT(!*prev);
- }
- if (kmp->out[tr.from])
- TRACE(5, "kmp.c: string is inserted more than once");
- new_out = new_output(kmp, id, len-1);
- merge_output(kmp->out + tr.from, new_out);
-}
-
-static void
-construct_f_out(struct kmp *kmp)
-{
- kmp_state_t *fifo;
- int read, write;
- struct kmp_transition *son;
-
- fifo = alloca(kmp->words_len * sizeof(kmp_state_t));
- read = write = 0;
- kmp->f[0] = 0;
- WALK_LIST(son, kmp->g.sons[0])
- {
- ASSERT(son->from == 0);
- kmp->f[son->to] = 0;
- fifo[write++] = son->to;
- }
- while (read != write)
- {
- kmp_state_t r, s, t;
- r = fifo[read++];
- WALK_LIST(son, kmp->g.sons[r])
- {
- struct kmp_transition tr, **prev;
- ASSERT(son->from == r);
- tr.c = son->c;
- s = son->to;
- fifo[write++] = s;
- t = kmp->f[r];
- while (1)
- {
- tr.from = t;
- prev = transition_search(&kmp->g, &tr);
- if (*prev || !tr.from)
- break;
- t = kmp->f[t];
- }
- kmp->f[s] = *prev ? (*prev)->to : 0;
- merge_output(kmp->out + s, kmp->out[ kmp->f[s] ]);
- }
- }
-}
-
-void
-kmp_build(struct kmp *kmp)
-{
- ASSERT(kmp->g.count <= kmp->words_len);
- construct_f_out(kmp);
- if (kmp->words_len > 1)
- TRACE(0, "Built KMP with modify flags %d for total words len %d, it has %d nodes", kmp->modify_flags, kmp->words_len, kmp->g.count);
-}
/*
- * Knuth-Morris-Pratt's Substring Search for N given strings
- *
- * (c) 1999--2005, Robert Spalek <robert@ucw.cz>
+ * Knuth-Morris-Pratt's Substring Search for N given strings
*
- * This is a preprocessor template: you need to set KMP_GET_CHAR
- * to a macro for reading a single character from the input and
- * translating it according to the MF_xxx flags. See below for
- * some pre-defined values.
+ * (c) 1999--2005, Robert Spalek <robert@ucw.cz>
+ * (c) 2006, Pavel Charvat <pchar@ucw.cz>
*
- * Don't touch this file, it can bite.
- *
- * (In fact, the algorithm is usually referred to as Aho-McCorasick,
- * but that's just an extension of KMP to multiple strings.)
+ * (In fact, the algorithm is usually referred to as Aho-McCorasick,
+ * but that's just an extension of KMP to multiple strings.)
*/
-#ifndef _UCW_KMP_H
-#define _UCW_KMP_H
+/*
+ * This is not a normal header file, it's a generator of KMP algorithm.
+ * Each time you include it with parameters set in the corresponding
+ * preprocessor macros, it generates KMP structures and functions
+ * with the parameters given.
+ *
+ *
+ * Basic parameters:
+ * KMP_PREFIX(x) macro to add a name prefix (used on all global names
+ * defined by the KMP generator); mandatory
+ *
+ * KMP_CHAR alphabet type, the default is u16
+ *
+ * KMP_SOURCE user-defined text source; KMP_GET_CHAR must
+ * KMP_GET_CHAR(ctx,src,c) return next character from the input or zero at the end;
+ * if not defined, zero-terminated array of bytes is used as the input
+ *
+ * KMP_NODE user-defined data in each state of the automaton
+ * KMP_CONTEXT user-defined data in struct context (a structure describing
+ * the whole automaton)
+ *
+ * Parameters which select how the input is interpreted (if KMP_SOURCE is unset):
+ * KMP_USE_ASCII reads single bytes from the input (default)
+ * KMP_USE_UTF8 reads UTF-8 characters from the input (valid UTF-8 needed)
+ * KMP_TOLOWER converts all to lowercase
+ * KMP_UNACCENT removes accents
+ * KMP_ONLYALPHA converts non-alphas to KMP_CONTROL_CHAR
+ * KMP_CONTROL_CHAR special control character (default is ':')
+ *
+ * Parameters controlling add():
+ * KMP_ADD_EXTRA_ARGS extra arguments
+ * KMP_ADD_EXTRA_VAR structure with extra local variables
+ * KMP_ADD_INIT(ctx,src,v)
+ * KMP_ADD_NEW(ctx,src,v,s)
+ * KMP_ADD_DUP(ctx,src,v,s)
+ * KMP_NO_DUPS no support for duplicates
+ *
+ * Parameters to build():
+ * KMP_BUILD_STATE(ctx,s) called for all states (including null) in order of non-decreasing tree depth
+ *
+ * Other parameters:
+ * KMP_WANT_CLEANUP define cleanup()
+ * KMP_WANT_SEARCH includes lib/kmp-search.h with the same prefix;
+ * there can be multiple search variants for a single KMP structure
+ *
+ * KMP_USE_POOL allocates in a given pool
+ *
+ * KMP_GIVE_ALLOC
+ * KMP_GIVE_HASHFN
+ * KMP_GIVE_EQ
+ */
-#include "lib/lists.h"
+#ifndef KMP_PREFIX
+#error Missing KMP_PREFIX
+#endif
+#include "lib/mempool.h"
+#include <alloca.h>
#include <string.h>
-/*
- * Input conversion flags (the conversion is handled exclusively by the KMP_GET_CHAR
- * macro, so you can define your own conversion modes, soo).
- */
-#define MF_TOLOWER 1
-#define MF_UNACCENT 2
-#define MF_ONLYALPHA 4 /* Convert non-alphas to KMP_CONTROL_CHAR */
+#define P(x) KMP_PREFIX(x)
-#define KMP_CONTROL_CHAR ':'
+#ifdef KMP_CHAR
+typedef KMP_CHAR P(char_t);
+#else
+typedef u16 P(char_t);
+#endif
-/* Pre-defined input functions */
+typedef u32 P(len_t);
-#define KMP_GET_UTF8(pos, c, flags) do { uns cc; pos = utf8_get(pos, &cc); c = cc; } while(0)
+#ifdef KMP_NODE
+typedef KMP_NODE P(node_t);
+#else
+typedef struct {} P(node_t);
+#endif
-#define KMP_GET_ASCII(pos, c, flags) do { \
- c = *pos++; \
- if (c) { \
- if (flags & MF_TOLOWER) \
- c = Clocase(c); \
- if (flags & MF_ONLYALPHA && !Calpha(c)) \
- c = KMP_CONTROL_CHAR; \
- } \
-} while (0)
+struct P(context);
-/* Types and structures */
+struct P(state) {
+ struct P(state) *from; /* state with previous character */
+ struct P(state) *back; /* backwards edge to the largest shorter state */
+ struct P(state) *next; /* largest shorter match */
+ P(len_t) len; /* largest match, zero otherwise */
+ P(char_t) c; /* last character */
+ P(node_t) n; /* user-defined data */
+};
-typedef uns kmp_state_t;
-typedef word kmp_char_t;
+/* Control char */
+static inline P(char_t)
+P(control) (void)
+{
+#ifdef KMP_CONTROL_CHAR
+ return KMP_CONTROL_CHAR;
+#else
+ return ':';
+#endif
+}
-struct kmp_transition {
- struct node n; /* link list of sons for a given node */
- struct kmp_transition *next; /* collision in the hash-table of all transitions */
- kmp_state_t from, to;
- kmp_char_t c;
-};
-struct kmp_transitions {
- int count, size;
- struct list *sons; /* link-list of all sons for each given node */
- uns hash_size;
- struct kmp_transition **chain; /* hash-table of [node, char]->son */
-};
+/* User-defined source */
+struct P(hash_table);
-struct kmp_output {
- struct kmp_output *next; /* output link list for every node */
- uns id;
- uns len;
-};
+#define HASH_GIVE_HASHFN
+#ifdef KMP_GIVE_HASHFN
+static inline uns
+P(hash_hash) (struct P(hash_table) *t, struct P(state) *f, P(char_t) c)
+{
+ return P(hash) ((struct P(context) *) t, f, c);
+}
+#else
+static inline uns
+P(hash_hash) (struct P(hash_table) *t UNUSED, struct P(state) *f, P(char_t) c)
+{
+ return (((uns)c) << 16) + (uns)(addr_int_t)f;
+}
+#endif
-struct mempool;
-struct kmp {
- struct mempool *mp;
- int modify_flags; /* which nocase/noaccent mode is this kmp for */
- int words_len; /* total length of searched words */
- struct kmp_transitions g; /* hash table of forward transitions of automat */
- kmp_state_t *f; /* back transitions of automat */
- struct kmp_output **out; /* found words for every state */
-};
+#ifndef KMP_GIVE_EQ
+static inline int
+P(eq) (struct P(context) *ctx UNUSED, P(char_t) c1, P(char_t) c2)
+{
+ return c1 == c2;
+}
+#endif
-struct kmp_result {
- struct node n; /* strings with non-zero frequency are put into a link-list */
- uns occur;
-};
+static inline int
+P(is_control) (struct P(context) *ctx, P(char_t) c)
+{
+ return P(eq) (ctx, c, P(control)());
+}
-/* kmp.c */
-struct kmp *kmp_new(struct mempool *mp, int words_len, uns modify_flags);
-void kmp_enter_raw_string(struct kmp *kmp, kmp_char_t *str, uns id);
-void kmp_build(struct kmp *kmp);
+#define HASH_GIVE_EQ
+static inline int
+P(hash_eq) (struct P(hash_table) *t, struct P(state) *f1, P(char_t) c1, struct P(state) *f2, P(char_t) c2)
+{
+ return f1 == f2 && P(eq)((struct P(context) *) t, c1, c2);
+}
-static inline void
-kmp_get_char(const byte **str UNUSED, kmp_char_t *c, uns modify_flags UNUSED)
-{
- while (1)
- {
- kmp_char_t new_c;
- KMP_GET_CHAR((*str), new_c, modify_flags);
- if (new_c != KMP_CONTROL_CHAR || *c != KMP_CONTROL_CHAR)
- {
- *c = new_c;
- return;
- }
- }
+#ifdef KMP_GIVE_ALLOC
+#define HASH_GIVE_ALLOC
+static inline void *
+P(hash_alloc) (struct P(hash_table) *t, uns size)
+{
+ return P(alloc) ((struct P(context) *) t, size);
}
static inline void
-kmp_enter_string(struct kmp *kmp, const byte *str, uns id)
-{
- /* To avoid dependencies between libucw and other libraries (which might
- * be referenced by the KMP_GET_CHAR macro), we have to split kmp_enter_string()
- * to a conversion wrapper (this function) and the rest, which resides in kmp.c
- * and uses zero-terminated array of kmp_char_t characters as its input.
- */
- kmp_char_t buf[strlen(str)+1], *str2 = buf, c = 0;
- do
- {
- kmp_get_char(&str, &c, kmp->modify_flags);
- *str2++ = c;
- }
- while (c);
- kmp_enter_raw_string(kmp, buf, id);
+P(hash_free) (struct P(hash_table) *t, void *ptr)
+{
+ P(free) ((struct P(context) *) t, ptr);
}
+#endif
-static inline uns
-transition_hashf(struct kmp_transitions *l UNUSED, struct kmp_transition *tr)
+#define HASH_GIVE_INIT_KEY
+static inline void
+P(hash_init_key) (struct P(hash_table) *t UNUSED, struct P(state) *s, struct P(state) *f, P(char_t) c)
{
- return tr->from + (tr->c << 16);
+ bzero(s, sizeof(*s));
+ s->from = f;
+ s->c = c;
+ s->next = f->back; /* the pointers hold the link-list of sons... changed in build() */
+ f->back = s;
}
+#undef P
+#define HASH_PREFIX(x) KMP_PREFIX(GLUE(hash_,x))
+#define HASH_NODE struct KMP_PREFIX(state)
+#define HASH_KEY_COMPLEX(x) x from, x c
+#define HASH_KEY_DECL struct KMP_PREFIX(state) *from, KMP_PREFIX(char_t) c
+#define HASH_WANT_NEW
+#define HASH_WANT_FIND
+#ifdef KMP_WANT_CLEANUP
+#define HASH_WANT_CLEANUP
+#endif
+#if defined(KMP_USE_POOL)
+#define HASH_USE_POOL KMP_USE_POOL
+#else
+#define HASH_AUTO_POOL 4096
+#endif
+#define HASH_CONSERVE_SPACE
+#define HASH_TABLE_DYNAMIC
+#include "lib/hashtable.h"
+#define P(x) KMP_PREFIX(x)
+
+struct P(context) {
+ struct P(hash_table) hash; /* hash table of state transitions */
+ struct P(state) null; /* null state */
+# ifdef KMP_CONTEXT
+ KMP_CONTEXT v; /* user defined data */
+# endif
+};
+
+#ifdef KMP_SOURCE
+typedef KMP_SOURCE P(source_t);
+#else
+typedef byte *P(source_t);
+#endif
+
+#ifdef KMP_GET_CHAR
static inline int
-transition_compare(struct kmp_transition *a, struct kmp_transition *b)
+P(get_char) (struct P(context) *ctx UNUSED, P(source_t) *src UNUSED, P(char_t) *c UNUSED)
{
- if (a->from == b->from && a->c == b->c)
- return 0;
- else
- return 1;
+ return KMP_GET_CHAR(ctx, (*src), (*c));
}
+#else
+# if defined(KMP_USE_UTF8)
+# include "lib/unicode.h"
+# if defined(KMP_ONLYALPHA) || defined(KMP_TOLOWER) || defined(KMP_UNACCENT)
+# include "charset/unicat.h"
+# endif
+# elif defined(KMP_USE_ASCII)
+# if defined(KMP_ONLYALPHA) || defined(KMP_TOLOWER)
+# include "lib/chartype.h"
+# endif
+# endif
+static inline int
+P(get_char) (struct P(context) *ctx UNUSED, P(source_t) *src, P(char_t) *c)
+{
+# ifdef KMP_USE_UTF8
+ uns cc;
+ *src = (byte *)utf8_get(*src, &cc);
+# ifdef KMP_ONLYALPHA
+ if (!cc) {}
+ else if (!Ualpha(cc))
+ cc = P(control)();
+ else
+# endif
+ {
+# ifdef KMP_TOLOWER
+ cc = Utolower(cc);
+# endif
+# ifdef KMP_UNACCENT
+ cc = Uunaccent(cc);
+# endif
+ }
+# else
+ uns cc = *(*src)++;
+# ifdef KMP_ONLYALPHA
+ if (!cc) {}
+ else if (!Calpha(cc))
+ cc = P(control)();
+ else
+# endif
+# ifdef KMP_TOLOWER
+ cc = Clocase(cc);
+# endif
+# ifdef KMP_UNACCENT
+# error Do not know how to unaccent ASCII characters
+# endif
+# endif
+ *c = cc;
+ return !!cc;
+}
+#endif
-static inline struct kmp_transition **
-transition_search(struct kmp_transitions *l, struct kmp_transition *tr)
+static struct P(state) *
+P(add) (struct P(context) *ctx, P(source_t) src
+# ifdef KMP_ADD_EXTRA_ARGS
+ , KMP_ADD_EXTRA_ARGS
+# endif
+)
{
- uns hf = transition_hashf(l, tr) % l->hash_size;
- struct kmp_transition **last = l->chain + hf;
- while (*last && transition_compare(*last, tr))
- last = &(*last)->next;
- ASSERT(last);
- return last;
+# ifdef KMP_ADD_EXTRA_VAR
+ KMP_ADD_EXTRA_VAR v;
+# endif
+# ifdef KMP_ADD_INIT
+ { KMP_ADD_INIT(ctx, src, v); }
+# endif
+
+ P(char_t) c;
+ if (!P(get_char)(ctx, &src, &c))
+ return NULL;
+ struct P(state) *p = &ctx->null, *s;
+ uns len = 0;
+ do
+ {
+ s = P(hash_find)(&ctx->hash, p, c);
+ if (!s)
+ for (;;)
+ {
+ s = P(hash_new)(&ctx->hash, p, c);
+ len++;
+ if (!(P(get_char)(ctx, &src, &c)))
+ goto enter_new;
+ p = s;
+ }
+ p = s;
+ len++;
+ }
+ while (P(get_char)(ctx, &src, &c));
+# ifdef KMP_NO_DUPS
+ ASSERT(!s->len);
+# else
+ if (s->len)
+ {
+# ifdef KMP_ADD_DUP
+ { KMP_ADD_DUP(ctx, src, v, s); }
+# endif
+ return s;
+ }
+# endif
+enter_new:
+ s->len = len;
+# ifdef KMP_ADD_NEW
+ { KMP_ADD_NEW(ctx, src, v, s); }
+# endif
+ return s;
}
-static inline void
-add_result(struct list *nonzeroes, struct kmp_result *freq, struct kmp_output *out)
-{
- for (; out; out = out->next)
- if (!freq[out->id].occur++)
- add_tail(nonzeroes, &freq[out->id].n);
-}
-
-static inline byte *
-kmp_search_internal(struct kmp *kmp, byte *str, uns len, struct list *nonzeroes, struct kmp_result *freq, struct kmp_output *out)
- /* For every found string with id ID, it increments freq[ID].
- * Also, it finds the longest among the leftmost matches. */
-{
- if (!len)
- return NULL;
- kmp_state_t s = 0;
- kmp_char_t c = KMP_CONTROL_CHAR;
- struct kmp_transition tr, **prev;
- byte eof = 0;
- if (kmp->words_len <= 1)
- return NULL;
- //TRACE(20, "kmp.c: Searching string %s", str);
- byte *largest_match = NULL;
- while (1)
- {
- tr.from = s;
- tr.c = c;
- prev = transition_search(&kmp->g, &tr);
- while (tr.from && !*prev)
- {
- tr.from = kmp->f[ tr.from ];
- prev = transition_search(&kmp->g, &tr);
- }
- s = *prev ? (*prev)->to : 0;
- if (nonzeroes)
- add_result(nonzeroes, freq, kmp->out[s]);
- /* Beware that out->len is measured in modified characters of
- * the search pattern, hence it is not very reliable if you use
- * unaccenting. */
- struct kmp_output *kout = kmp->out[s];
- if (kout && (!largest_match || str - kout->len <= largest_match))
- {
- largest_match = str - kout->len;
- if (out)
- *out = *kout;
- }
- if (eof)
- break;
- if (!--len)
- c = 0;
- else
- kmp_get_char((const byte **)&str, &c, kmp->modify_flags);
- if (!c)
- {
- /* Insert KMP_CONTROL_CHAR at the beginning and at the end too. */
- c = KMP_CONTROL_CHAR;
- eof = 1;
- }
- }
- return largest_match;
+static void
+P(init) (struct P(context) *ctx)
+{
+ bzero(&ctx->null, sizeof(struct P(state)));
+ P(hash_init)(&ctx->hash);
}
+#ifdef KMP_WANT_CLEANUP
static inline void
-kmp_search(struct kmp *kmp, const byte *str, uns len, struct list *nonzeroes, struct kmp_result *freq)
+P(cleanup) (struct P(context) *ctx)
{
- kmp_search_internal(kmp, (byte*) str, len, nonzeroes, freq, NULL);
+ P(hash_cleanup)(&ctx->hash);
}
+#endif
-static inline byte *
-kmp_find_first(struct kmp *kmp, byte *str, uns len, struct kmp_output *out)
+static inline int
+P(empty) (struct P(context) *ctx)
{
- return kmp_search_internal(kmp, str, len, NULL, NULL, out);
+ return !ctx->hash.hash_count;
}
+static inline struct P(state) *
+P(chain_start) (struct P(state) *s)
+{
+ return s->len ? s : s->next;
+}
+
+static void
+P(build) (struct P(context) *ctx)
+{
+ if (P(empty)(ctx))
+ return;
+ uns read = 0, write = 0;
+ struct P(state) *fifo[ctx->hash.hash_count], *null = &ctx->null;
+ for (struct P(state) *s = null->back; s; s = s->next)
+ fifo[write++] = s;
+ null->back = NULL;
+# ifdef KMP_BUILD_STATE
+ { KMP_BUILD_STATE(ctx, null); }
+# endif
+ while (read != write)
+ {
+ struct P(state) *s = fifo[read++], *t;
+ for (t = s->back; t; t = t->next)
+ fifo[write++] = t;
+ for (t = s->from->back; 1; t = t->back)
+ {
+ if (!t)
+ {
+ s->back = null;
+ s->next = NULL;
+ break;
+ }
+ s->back = P(hash_find)(&ctx->hash, t, s->c);
+ if (s->back)
+ {
+ s->next = s->back->len ? s->back : s->back->next;
+ break;
+ }
+ }
+# ifdef KMP_BUILD_STATE
+ { KMP_BUILD_STATE(ctx, s); }
+# endif
+ }
+}
+
+#undef P
+#undef KMP_CHAR
+#undef KMP_SOURCE
+#undef KMP_GET_CHAR
+#undef KMP_NODE
+#undef KMP_CONTEXT
+#undef KMP_USE_ASCII
+#undef KMP_USE_UTF8
+#undef KMP_TOLOWER
+#undef KMP_UNACCENT
+#undef KMP_ONLYALPHA
+#undef KMP_CONTROL_CHAR
+#undef KMP_ADD_EXTRA_ARGS
+#undef KMP_ADD_EXTRA_VAR
+#undef KMP_ADD_INIT
+#undef KMP_ADD_NEW
+#undef KMP_ADD_DUP
+#undef KMP_NO_DUPS
+#undef KMP_BUILD_STATE
+#undef KMP_USE_POOL
+#undef KMP_GIVE_ALLOC
+#undef KMP_GIVE_HASHFN
+#undef KMP_GIVE_EQ
+
+#ifdef KMP_WANT_SEARCH
+# undef KMP_WANT_SEARCH
+# define KMPS_PREFIX(x) KMP_PREFIX(x)
+# define KMPS_KMP_PREFIX(x) KMP_PREFIX(x)
+# include "lib/kmp-search.h"
#endif
+
+#undef KMP_PREFIX