From: Martin Mares
Date: Sat, 27 Mar 1999 12:48:07 +0000 (+0000)
Subject: Added fast wildcard matcher.
X-Git-Tag: holmes-import~1672
X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;h=98890a92740cbfc0c5ddab9844f1ffe8d715ac8a;p=libucw.git

Added fast wildcard matcher.
---

diff --git a/lib/config.h b/lib/config.h
index a699aa22..66fffa98 100644
--- a/lib/config.h
+++ b/lib/config.h
@@ -17,11 +17,16 @@ typedef unsigned char byte; /* exactly 8 bits, unsigned */
 typedef signed char sbyte; /* exactly 8 bits, signed */
 typedef unsigned short word; /* exactly 16 bits, unsigned */
 typedef short sword; /* exactly 16 bits, signed */
+typedef unsigned short u16; /* exactly 16 bits, unsigned */
+typedef short s16; /* exactly 16 bits, signed */
 typedef unsigned int ulg; /* exactly 32 bits, unsigned */
 typedef int slg; /* exactly 32 bits, signed */
+typedef unsigned int u32; /* exactly 32 bits, unsigned */
+typedef int s32; /* exactly 32 bits, signed */
 typedef unsigned int uns; /* at least 32 bits */
 typedef unsigned long long int u64; /* exactly 64 bits, unsigned */
 typedef long long int s64; /* exactly 64 bits, signed */
+typedef unsigned long addr_int_t; /* Both integer and address */
 
 #ifndef NULL
 #define NULL (void *)0
diff --git a/lib/wildmatch.c b/lib/wildmatch.c
new file mode 100644
index 00000000..545487ce
--- /dev/null
+++ b/lib/wildmatch.c
@@ -0,0 +1,260 @@
+/*
+ *      Fast Pattern Matcher for Short Wildcard Patterns (only `?' and `*' supported)
+ *
+ *      Traditional NFA -> DFA method with on-the-fly DFA construction.
+ *
+ *      (c) 1999 Martin Mares
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "lib.h"
+#include "pools.h"
+#include "wildmatch.h"
+
+#define MAX_STATES 32           /* Must be <= 32, state 0 is reserved, state 1 is initial */
+#define MAX_CACHED 256          /* Maximum number of cached DFA states */
+#define HASH_SIZE 512           /* Number of entries in DFA hash table (at least MAX_CACHED+MAX_STATES) */
+#define HASH_SKIP 137           /* Probing step in the DFA hash table */
+
+/*
+ * One NFA state. State numbers double as bit positions in the u32 state
+ * sets used throughout this file; bit 0 never stands for a state.
+ */
+struct nfa_state {
+  byte ch;                      /* 0 for non-matching state */
+  byte final;                   /* Accepting state */
+  u32 match_states;             /* States to go to when input character == ch */
+  u32 default_states;           /* States to go to whatever the input is */
+};
+
+/* One DFA state, standing for a set of NFA states; built on demand by wp_new_state(). */
+struct dfa_state {
+  addr_int_t edge[256];         /* Outgoing DFA edges. Bit 0 is set for incomplete edges which
+                                 * contain just state set and clear for complete ones which point
+                                 * to other states. NULL means `no match'.
+                                 */
+  u32 nfa_set;                  /* A set of NFA states this DFA state represents */
+  int final;                    /* This is an accepting state */
+  struct dfa_state *next;       /* Next in the chain of free states */
+};
+
+/* A compiled pattern: its NFA and the DFA states constructed so far. */
+struct wildpatt {
+  struct nfa_state nfa[MAX_STATES];     /* Indexed by state number; entry 0 is unused */
+  struct dfa_state *hash[HASH_SIZE];    /* Open-addressed table of DFA states keyed by nfa_set */
+  struct dfa_state *dfa_start;          /* DFA state for the set holding just NFA state 1 */
+  int nfa_states;                       /* Number of the last NFA state, which is the accepting one */
+  int dfa_cache_counter;                /* Number of DFA states currently in the hash table */
+  struct mempool *pool;                 /* Pool the DFA states are allocated from */
+  struct dfa_state *free_states;        /* Recycled DFA states, linked through ->next */
+};
+
+/* Map a set of NFA states to its home slot in the DFA hash table. */
+static inline unsigned
+wp_hash(u32 set)
+{
+  set ^= set >> 16;
+  set ^= set >> 8;
+  return set % HASH_SIZE;
+}
+
+/*
+ * Return the DFA state for the given set of NFA states, creating and
+ * caching it if it does not exist yet. All edges of a fresh state are
+ * incomplete (a state set with bit 0 set) or zero; wp_match() turns the
+ * incomplete ones into pointers when it first follows them.
+ */
+static struct dfa_state *
+wp_new_state(struct wildpatt *w, u32 set)
+{
+  unsigned h = wp_hash(set);
+  struct dfa_state *d;
+  unsigned bit;
+  u32 def_set;
+
+  while ((d = w->hash[h]))
+    {
+      if (d->nfa_set == set)
+        return d;
+      h = (h + HASH_SKIP) % HASH_SIZE;
+    }
+  if ((d = w->free_states))
+    w->free_states = d->next;
+  else
+    d = pool_alloc(w->pool, sizeof(*d));
+  w->hash[h] = d;
+  memset(d, 0, sizeof(*d));
+  d->nfa_set = set;
+  def_set = 0;
+  for(bit=1; bit <= w->nfa_states; bit++)
+    if (set & ((u32) 1 << bit))         /* Unsigned shift: bit may be 31 */
+      {
+        struct nfa_state *n = &w->nfa[bit];
+        if (n->ch)
+          d->edge[n->ch] |= n->match_states | 1;
+        d->final |= n->final;
+        def_set |= n->default_states;
+      }
+  if (def_set)
+    {
+      unsigned i;
+      def_set |= 1;
+      for(i=0; i<256; i++)
+        d->edge[i] |= def_set;
+    }
+  w->dfa_cache_counter++;
+  return d;
+}
+
+/*
+ * Compile the pattern p. The result and all DFA states created for it
+ * later are allocated from the given pool. Returns NULL if the pattern
+ * needs more NFA states than MAX_STATES allows.
+ */
+struct wildpatt *
+wp_compile(byte *p, struct mempool *pool)
+{
+  struct wildpatt *w;
+  byte *q;
+  uns i;
+
+  /*
+   * Every character except `*' adds a state and the last state must be
+   * below MAX_STATES: it indexes nfa[] and is a bit number in a u32 set.
+   */
+  for(i=1, q=p; *q; q++)
+    if (*q != '*' && ++i >= MAX_STATES)
+      return NULL;                      /* Too long */
+  w = pool_alloc(pool, sizeof(*w));
+  memset(w, 0, sizeof(*w));             /* Clear the whole structure, not just sizeof(w) bytes of it */
+  w->pool = pool;
+  for(i=1; *p; p++)
+    {
+      struct nfa_state *n = w->nfa + i;
+      if (*p == '?')
+        n->default_states |= (u32) 1 << (++i);  /* Default edge to a new state */
+      else if (*p == '*')
+        n->default_states |= (u32) 1 << i;      /* Default edge to the same state */
+      else
+        {
+          n->ch = *p;                           /* Edge to new state labelled with 'c' */
+          n->match_states = (u32) 1 << (++i);
+        }
+    }
+  w->nfa[i].final = 1;
+  w->nfa_states = i;
+  w->dfa_start = wp_new_state(w, 1 << 1);
+  return w;
+}
+
+/*
+ * Recycle every cached DFA state and rebuild the initial one. The initial
+ * state cannot be kept: its resolved edges point to states being recycled.
+ */
+static void
+wp_prune_cache(struct wildpatt *w)
+{
+  /*
+   *    I was unable to trigger cache overflow on my large set of
+   *    test cases, so I decided to handle it in an extremely dumb
+   *    way.   --mj
+   */
+  int i;
+  for(i=0; i<HASH_SIZE; i++)
+    if (w->hash[i])
+      {
+        struct dfa_state *d = w->hash[i];
+        w->hash[i] = NULL;
+        d->next = w->free_states;
+        w->free_states = d;
+      }
+  w->dfa_cache_counter = 0;
+  w->dfa_start = wp_new_state(w, 1 << 1);       /* Counts itself, leaving the counter at 1 */
+}
+
+/*
+ * Match the string s against the compiled pattern. Returns non-zero iff
+ * the whole string matches. DFA states missing on the way are created and
+ * cached in w, so a call may modify w.
+ */
+int
+wp_match(struct wildpatt *w, byte *s)
+{
+  struct dfa_state *d;
+
+  if (w->dfa_cache_counter >= MAX_CACHED)
+    wp_prune_cache(w);
+  d = w->dfa_start;
+  while (*s)
+    {
+      addr_int_t next = d->edge[*s];
+      if (next & 1)
+        {
+          /* Need to lookup/create the destination state */
+          struct dfa_state *new = wp_new_state(w, next & ~1);
+          d->edge[*s] = (addr_int_t) new;
+          d = new;
+        }
+      else if (!next)
+        return 0;
+      else
+        d = (struct dfa_state *) next;
+      s++;
+    }
+  return d->final;
+}
+
+#ifdef TEST
+
+/* Print the NFA, the state set of every cached DFA state and the cache counter. */
+void
+wp_dump(struct wildpatt *w)
+{
+  int i;
+
+  puts("NFA:");
+  for(i=1; i<=w->nfa_states; i++)
+    {
+      struct nfa_state *n = w->nfa + i;
+      printf("%2d: %d %02x %08x %08x\n", i, n->final, n->ch, n->match_states, n->default_states);
+    }
+  puts("DFA:");
+  for(i=0; i<HASH_SIZE; i++)
+    if (w->hash[i])
+      printf("%3d: %08x\n", i, w->hash[i]->nfa_set);
+  printf("%d DFA states cached.\n", w->dfa_cache_counter);
+}
+
+/* Test driver: compiles argv[1] and prints the lines of stdin it matches. */
+int main(int argc, char **argv)
+{
+  struct wildpatt *w;
+  char buf[1024];
+
+  if (argc != 2) return 1;
+  w = wp_compile((byte *) argv[1], new_pool(65536));
+  if (!w)
+    {
+      puts("Compile error");
+      return 1;
+    }
+  wp_dump(w);
+  while (fgets(buf, sizeof(buf)-1, stdin))
+    {
+      char *c = strchr(buf, '\n');
+      if (!c) break;
+      *c = 0;
+#if 0
+      printf("%d\n", wp_match(w, (byte *) buf));
+#else
+      if (wp_match(w, (byte *) buf))
+        puts(buf);
+#endif
+    }
+  wp_dump(w);
+  return 0;
+}
+
+#endif
diff --git a/lib/wildmatch.h b/lib/wildmatch.h
new file mode 100644
index 00000000..27a553b5
--- /dev/null
+++ b/lib/wildmatch.h
@@ -0,0 +1,14 @@
+/*
+ *      Fast Wildcard Pattern Matcher (only `?' and `*' supported)
+ *
+ *      (c) 1999 Martin Mares
+ */
+
+struct wildpatt;
+struct mempool;
+
+/* Compile a pattern into the given pool; returns NULL if it needs too many states. */
+struct wildpatt *wp_compile(byte *, struct mempool *);
+
+/* Returns non-zero iff the whole string matches the compiled pattern. */
+int wp_match(struct wildpatt *, byte *);