From 183cfc814c9b677980aae0092b4983039d6d7d88 Mon Sep 17 00:00:00 2001 From: Pavel Charvat Date: Tue, 8 Jul 2008 08:25:58 +0200 Subject: [PATCH] Implemented byte-bases trie. --- lib/trie-test.c | 216 +++++++++++ lib/trie-test.t | 7 + lib/trie.h | 955 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1178 insertions(+) create mode 100644 lib/trie-test.c create mode 100644 lib/trie-test.t create mode 100644 lib/trie.h diff --git a/lib/trie-test.c b/lib/trie-test.c new file mode 100644 index 00000000..d2ed9f34 --- /dev/null +++ b/lib/trie-test.c @@ -0,0 +1,216 @@ +/* + * UCW Library -- Byte-based trie -- Testing utility + * + * (c) 2008 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#undef LOCAL_DEBUG + +#include "lib/lib.h" +#include "lib/getopt.h" + +#include +#include +#include + +#define TRIE_PREFIX(x) basic_##x +#define TRIE_NODE_TYPE char +#define TRIE_WANT_CLEANUP +#define TRIE_WANT_ADD +#define TRIE_WANT_DELETE +#define TRIE_WANT_FIND +#define TRIE_WANT_AUDIT +#ifdef LOCAL_DEBUG +#define TRIE_TRACE +#endif +#include "lib/trie.h" + +static void +basic_test(void) +{ + basic_init(); + basic_add("str1"); + basic_add("str2"); + if (!basic_find("str1") || !basic_find("str2") || basic_find("x") || basic_find("str123")) + ASSERT(0); + basic_audit(); + basic_delete("str1"); + if (basic_find("str1") || !basic_find("str2")) + ASSERT(0); + basic_audit(); + basic_cleanup(); +} + +#define TRIE_PREFIX(x) dynamic_##x +#define TRIE_NODE_TYPE char +#define TRIE_REV +#define TRIE_DYNAMIC +#define TRIE_WANT_CLEANUP +#define TRIE_WANT_ADD +#define TRIE_WANT_FIND +#define TRIE_WANT_AUDIT +#ifdef LOCAL_DEBUG +#define TRIE_TRACE +#endif +#include "lib/trie.h" + +static void +dynamic_test(void) +{ + struct dynamic_trie trie1, trie2; + dynamic_init(&trie1); + dynamic_init(&trie2); + dynamic_add(&trie1, "str1"); + dynamic_add(&trie2, "str2"); + if (!dynamic_find(&trie1, "str1") || dynamic_find(&trie1, "str2") || !dynamic_find(&trie2, "str2")) + ASSERT(0); + dynamic_audit(&trie1); + dynamic_audit(&trie2); + dynamic_cleanup(&trie1); + dynamic_cleanup(&trie2); +} + + +#define TRIE_PREFIX(x) random_##x +#define TRIE_NODE_TYPE char +#define TRIE_LEN_TYPE u16 +#undef TRIE_REV +#define TRIE_WANT_CLEANUP +#define TRIE_WANT_FIND +#define TRIE_WANT_ADD +#define TRIE_WANT_REMOVE +#define TRIE_WANT_AUDIT +#ifdef LOCAL_DEBUG +#define TRIE_TRACE +#endif +#include "lib/trie.h" + +#define MAX_STRINGS 200 + +static uns count; +static char *str[MAX_STRINGS]; + +static char * +gen_string(void) +{ + uns l = random_max(11); + char *s = xmalloc(l + 1); + for (uns i = 0; i < l; i++) + s[i] = random_max('z' - 'a') + 'a'; + s[l] = 0; + return s; +} + +static char * +gen_unique_string(void) +{ + char *s; +again: + s = gen_string(); + for (uns i = 0; i < count; i++) + if (!strcmp(s, str[i])) + { + xfree(s); + goto again; + } + return s; +} + +static void +insert(void) +{ + if (count == MAX_STRINGS) + return; + char *s = gen_unique_string(); + str[count++] = s; + DBG("add '%s'", s); + random_add(s); + random_audit(); +} + +static void +delete(void) +{ + if (!count) + return; + uns i = random_max(count); + DBG("remove '%s'", str[i]); + random_remove(str[i]); + random_audit(); + xfree(str[i]); + str[i] = str[--count]; +} + +static void +find(void) +{ + if (!count || !random_max(4)) + { + char *s = gen_unique_string(); + DBG("negative find '%s'", s); + if (random_find(s)) + ASSERT(0); + xfree(s); + } + else + { + uns i = random_max(count); + DBG("positive find '%s'", str[i]); + if (random_find(str[i]) != str[i]) + ASSERT(0); + } +} + +static void +reset(void) +{ + DBG("reset"); + random_cleanup(); + for (uns i = 0; i < count; i++) + xfree(str[i]); + count = 0; + random_init(); + random_audit(); +} + +static void +random_test(void) +{ + random_init(); + for (uns i = 0; i < 10000; i++) + { + int r = random_max(1000); + if ((r -= 300) < 0) + insert(); + else if ((r -= 150) < 0) + delete(); + else if ((r -= 300) < 0) + find(); + else if ((r -= 1) < 0) + reset(); + } + random_cleanup(); +} + +int main(int argc, char **argv) +{ + log_init(argv[0]); + if (cf_getopt(argc, argv, CF_SHORT_OPTS, CF_NO_LONG_OPTS, NULL) >= 0 || optind + 1 != argc) + die("Invalid usage, see the source code"); + srandom(time(NULL)); + + char *test = argv[optind]; + if (!strcmp(test, "basic")) + basic_test(); + else if (!strcmp(test, "dynamic")) + dynamic_test(); + else if (!strcmp(test, "random")) + random_test(); + else + die("Unknown test case"); + + return 0; +} diff --git a/lib/trie-test.t b/lib/trie-test.t new file mode 100644 index 00000000..d272f580 --- /dev/null +++ b/lib/trie-test.t @@ -0,0 +1,7 @@ +# Tests for tries + +Run: ../obj/lib/trie-test basic + +Run: ../obj/lib/trie-test dynamic + +Run: ../obj/lib/trie-test random diff --git a/lib/trie.h b/lib/trie.h new file mode 100644 index 00000000..d78d6d38 --- /dev/null +++ b/lib/trie.h @@ -0,0 +1,955 @@ +/* + * UCW Library -- Byte-based trie + * + * (c) 2008 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +/* + * This is not a normal header file, it's a generator of tries. + * Each time you include it with parameters set in the corresponding + * preprocessor macros, it generates a trie with the parameters given. + * + * You need to specify: + * + * [*] TRIE_PREFIX(x) macro to add a name prefix (used on all global names + * defined by the trie generator). + * + * [*] TRIE_NODE_TYPE data type where a node dwells (usually a struct). + * TRIE_NODE_KEY(node) macro to return the pointer to the key (default=&x) + * TRIE_NODE_LEN(node) macro to return the length of the key (default=str_len(TRIE_NODE_KEY(node))) + * TRIE_LEN_TYPE integer type large enough to hold length of any inserted string (default=u32). + * TRIE_REV work with reversed strings. + * TRIE_DYNAMIC + * + * TRIE_WANT_CLEANUP cleanup() + * + * TRIE_WANT_FIND node *find(char *str) + * TRIE_WANT_FIND_BUF node *find_buf(byte *ptr, uns len) + * TRIE_WANT_ADD add(*node) + * TRIE_WANT_REPLACE node *replace(*node) + * TRIE_WANT_DELETE delete(char *str) + * TRIE_WANT_DELETE_BUF delete_buf(byte *ptr, uns len) + * TRIE_WANT_REMOVE remove(*node) + * + * TRIE_WANT_AUDIT audit() + * TRIE_WANT_STATS + */ + +/*** Define once ***/ + +#ifndef _SHERLOCK_UCW_TRIE_H +#define _SHERLOCK_UCW_TRIE_H + +#include "lib/eltpool.h" +#include "lib/hashfunc.h" + +#include + +#define TRIE_FLAG_DEG 0x01ff // mask for edge degree (0-256) +#define TRIE_FLAG_HASH 0x0200 // sons are stored in a hash table +#define TRIE_FLAG_NODE 0x0400 // edge contains inserted data + +#endif + +/*** Defaults ***/ + +#ifndef TRIE_PREFIX +#error Undefined mandatory macro TRIE_PREFIX +#endif +#define P(x) TRIE_PREFIX(x) + +#ifndef TRIE_NODE_TYPE +#error Undefined mandatory macro TRIE_NODE_TYPE +#endif +typedef TRIE_NODE_TYPE P(node_t); + +#ifndef TRIE_NODE_KEY +#define TRIE_NODE_KEY(node) ((char *)&(node)) +#endif + +#ifndef TRIE_NODE_LEN +#define TRIE_NODE_LEN(node) (str_len(TRIE_NODE_KEY(node))) +#endif + +#ifndef TRIE_LEN_TYPE +#define TRIE_LEN_TYPE u32 +#endif +typedef TRIE_LEN_TYPE P(len_t); + +#ifndef TRIE_ELTPOOL_SIZE +#define TRIE_ELTPOOL_SIZE 1024 +#endif + +#ifndef TRIE_HASH_THRESHOLD +#define TRIE_HASH_THRESHOLD (6 - sizeof(P(len_t))) +#endif + +#ifndef TRIE_BUCKET_RANK +#define TRIE_BUCKET_RANK (2U + (sizeof(void *) > 4)) +#endif +#define TRIE_BUCKET_SIZE (1U << TRIE_BUCKET_RANK) +#define TRIE_BUCKET_MASK (TRIE_BUCKET_SIZE - 1) +enum { P(bucket_rank) = TRIE_BUCKET_RANK }; + +#define TRIE_COMPILE_ASSERT(x, y) typedef char TRIE_PREFIX(x##_compile_assert)[!!(y)-1] +TRIE_COMPILE_ASSERT(len_type, sizeof(P(len_t)) <= sizeof(uns)); +TRIE_COMPILE_ASSERT(hash_threshold, TRIE_HASH_THRESHOLD >= 2); +TRIE_COMPILE_ASSERT(bucket_size, TRIE_BUCKET_RANK >= 1 && TRIE_BUCKET_MASK < sizeof(void *)); + +#ifdef TRIE_TRACE +#define TRIE_DBG(x...) msg(L_DEBUG, "TRIE: " x) +#else +#define TRIE_DBG(x...) do{}while(0) +#endif + +/*** Solve dependencies ***/ + +#if !defined(TRIE_WANT_DO_FIND) && (defined(TRIE_WANT_FIND) || defined(TRIE_WANT_FIND_BUF)) +#define TRIE_WANT_DO_FIND +#endif + +#if !defined(TRIE_WANT_DO_LOOKUP) && (defined(TRIE_WANT_ADD) || defined(TRIE_WANT_REPLACE)) +#define TRIE_WANT_DO_LOOKUP +#endif + +#if !defined(TRIE_WANT_DO_DELETE) && (defined(TRIE_WANT_DELETE) || defined(TRIE_WANT_DELETE_BUF) || defined(TRIE_WANT_REMOVE)) +#define TRIE_WANT_DO_DELETE +#endif + +#if !defined(TRIE_WANT_DO_LOOKUP) +#error You must request at least one method for inserting nodes +#endif + +/*** Data structures ***/ + +struct P(trie) { + struct P(edge) *root; // root edge or NULL + struct eltpool *epool[TRIE_HASH_THRESHOLD + 1]; // eltpools for edges with array of sons + struct eltpool *hpool[9]; // eltpools for edges with hash table +}; + +struct P(edge) { + u16 flags; // TRIE_FLAG_x + union { + byte trans[TRIE_HASH_THRESHOLD]; // transition characters (!TRIE_FLAG_HASH) + struct { + byte hash_rank; // logarithmic hash size (TRIE_FLAG_HASH) + byte hash_deleted; // number of deleted items + }; + }; + P(len_t) len; // sum of all ancestor edges with their trasition + // characters plus the length of the current edge + union { + P(node_t) *node; // inserted data (TRIE_FLAG_NODE) + struct P(edge) *leaf; // reference to a descendant with data (!TRIE_FLAG_NODE) + }; + struct P(edge) *son[0]; // array of sons (!TRIE_FLAG_HASH) + // or hash table (TRIE_FLAG_HASH) +}; + +#ifdef TRIE_DYNAMIC +#define T (*trie) +#define TA struct P(trie) *trie +#define TAC TA, +#define TT trie +#define TTC trie, +#else +static struct P(trie) P(trie); +#define T P(trie) +#define TA void +#define TAC +#define TT +#define TTC +#endif + +/*** Memory management ***/ + +static void +P(init)(TA) +{ + TRIE_DBG("Initializing"); + bzero(&T, sizeof(T)); + for (uns i = 0; i < ARRAY_SIZE(T.epool); i++) + { + uns size = sizeof(struct P(edge)) + i * sizeof(void *); + T.epool[i] = ep_new(size, MAX(TRIE_ELTPOOL_SIZE / size, 1)); + } + for (uns i = 0; i < ARRAY_SIZE(T.hpool); i++) + { + uns size = sizeof(struct P(edge)) + ((sizeof(void *) << TRIE_BUCKET_RANK) << i); + T.hpool[i] = ep_new(size, MAX(TRIE_ELTPOOL_SIZE / size, 1)); + } +} + +#ifdef TRIE_WANT_CLEANUP +static void +P(cleanup)(TA) +{ + TRIE_DBG("Cleaning up"); + for (uns i = 0; i < ARRAY_SIZE(T.epool); i++) + ep_delete(T.epool[i]); + for (uns i = 0; i < ARRAY_SIZE(T.hpool); i++) + ep_delete(T.hpool[i]); +} +#endif + +static struct P(edge) * +P(edge_alloc)(TAC uns flags) +{ + struct P(edge) *edge; + if (flags & TRIE_FLAG_HASH) + { + uns rank = 0, deg = flags & TRIE_FLAG_DEG; + while ((TRIE_BUCKET_MASK << rank) < deg * 2) // 25-50% density + rank++; + ASSERT(rank < ARRAY_SIZE(T.hpool)); + edge = ep_alloc(T.hpool[rank]); + edge->hash_rank = rank; + edge->hash_deleted = 0; + bzero(edge->son, (sizeof(void *) << TRIE_BUCKET_RANK) << rank); + } + else + edge = ep_alloc(T.epool[flags & TRIE_FLAG_DEG]); + edge->flags = flags; + TRIE_DBG("Allocated edge %p, flags=0x%x", edge, flags); + return edge; +} + +static void +P(edge_free)(TAC struct P(edge) *edge) +{ + TRIE_DBG("Freeing edge %p, flags=0x%x", edge, edge->flags); + if (edge->flags & TRIE_FLAG_HASH) + ep_free(T.hpool[edge->hash_rank], edge); + else + ep_free(T.epool[edge->flags & TRIE_FLAG_DEG], edge); +} + +/*** Manipulation with strings ***/ + +static inline byte * +P(str_get)(P(node_t) *node) +{ + return TRIE_NODE_KEY((*node)); +} + +static inline uns +P(str_len)(P(node_t) *node) +{ + return TRIE_NODE_LEN((*node)); +} + +static inline uns +P(str_char)(byte *ptr, uns len UNUSED, uns pos) +{ +#ifndef TRIE_REV + return ptr[pos]; +#else + return ptr[len - pos - 1]; +#endif +} + +static inline byte * +P(str_prefix)(byte *ptr, uns len UNUSED, uns prefix UNUSED) +{ +#ifndef TRIE_REV + return ptr; +#else + return ptr + len - prefix; +#endif +} + +static inline byte * +P(str_suffix)(byte *ptr, uns len UNUSED, uns suffix UNUSED) +{ +#ifndef TRIE_REV + return ptr + len - suffix; +#else + return ptr; +#endif +} + +static inline uns +P(common_prefix)(byte *ptr1, uns len1, byte *ptr2, uns len2) +{ + uns l = MIN(len1, len2), i; + for (i = 0; i < l; i++) + if (P(str_char)(ptr1, len1, i) != P(str_char)(ptr2, len2, i)) + break; + return i; +} + +/*** Sons ***/ + +static inline uns +P(hash_func)(uns c) +{ + return hash_u32(c) >> 16; +} + +static inline struct P(edge) ** +P(hash_find)(struct P(edge) *edge, uns c) +{ + uns mask = (TRIE_BUCKET_SIZE << edge->hash_rank) - 1; + for (uns i = P(hash_func)(c); ; i++) + if (((i &= mask) & TRIE_BUCKET_MASK) && (uintptr_t)edge->son[i] != 1) + if (!edge->son[i]) + return NULL; + else if (((byte *)&edge->son[i & ~TRIE_BUCKET_MASK])[i & TRIE_BUCKET_MASK] == c) + return &edge->son[i]; +} + +static inline struct P(edge) ** +P(hash_insert)(struct P(edge) *edge, uns c) +{ + uns mask = (TRIE_BUCKET_SIZE << edge->hash_rank) - 1; + for (uns i = P(hash_func)(c); ; i++) + if (((i &= mask) & TRIE_BUCKET_MASK) && (uintptr_t)edge->son[i] <= 1) + { + edge->hash_deleted -= (uintptr_t)edge->son[i]; + edge->son[i] = NULL; + ((byte *)&edge->son[i & ~TRIE_BUCKET_MASK])[i & TRIE_BUCKET_MASK] = c; + return &edge->son[i]; + } +} + +#ifdef TRIE_WANT_DO_DELETE +static inline void +P(hash_delete)(struct P(edge) *edge, uns c) +{ + uns mask = (TRIE_BUCKET_SIZE << edge->hash_rank) - 1; + for (uns i = P(hash_func)(c); ; i++) + if (((i &= mask) & TRIE_BUCKET_MASK) && (uintptr_t)edge->son[i] > 1 && + ((byte *)&edge->son[i & ~TRIE_BUCKET_MASK])[i & TRIE_BUCKET_MASK] == c) + { + edge->hash_deleted++; + edge->son[i] = (void *)1; + return; + } +} +#endif + +#define TRIE_HASH_FOR_ALL(xedge, xtrans, xson) do { \ + struct P(edge) *_edge = (xedge); \ + for (uns _i = (TRIE_BUCKET_SIZE << _edge->hash_rank); _i--; ) \ + if ((_i & TRIE_BUCKET_MASK) && (uintptr_t)_edge->son[_i] > 1) { \ + UNUSED uns xtrans = ((byte *)&_edge->son[_i & ~TRIE_BUCKET_MASK])[_i & TRIE_BUCKET_MASK]; \ + UNUSED struct P(edge) *xson = _edge->son[_i]; \ + do { +#define TRIE_HASH_END_FOR }while(0);}}while(0) + +static void +P(hash_realloc)(TAC struct P(edge) **ref) +{ + struct P(edge) *old = *ref, *edge = *ref = P(edge_alloc)(TTC old->flags); + TRIE_DBG("Reallocating hash table"); + edge->node = old->node; + edge->len = old->len; + TRIE_HASH_FOR_ALL(old, trans, son) + *P(hash_insert)(edge, trans) = son; + TRIE_HASH_END_FOR; + P(edge_free)(TTC old); +} + +/*** Finding/inserting/deleting sons ***/ + +static struct P(edge) ** +P(son_find)(struct P(edge) *edge, uns c) +{ + if (edge->flags & TRIE_FLAG_HASH) + return P(hash_find)(edge, c); + else + for (uns i = edge->flags & TRIE_FLAG_DEG; i--; ) + if (edge->trans[i] == c) + return &edge->son[i]; + return NULL; +} + +static struct P(edge) ** +P(son_insert)(TAC struct P(edge) **ref, uns c) +{ + struct P(edge) *old = *ref, *edge; + uns deg = old->flags & TRIE_FLAG_DEG; + if (old->flags & TRIE_FLAG_HASH) + { + old->flags++; + if ((deg + 1 + old->hash_deleted) * 4 > (TRIE_BUCKET_MASK << old->hash_rank) * 3) // >75% density + { + P(hash_realloc)(TTC ref); + edge = *ref; + } + else + edge = old; + } + else + { + if (deg < TRIE_HASH_THRESHOLD) + { + TRIE_DBG("Growing array"); + edge = P(edge_alloc)(TTC old->flags + 1); + memcpy((byte *)edge + sizeof(edge->flags), (byte *)old + sizeof(edge->flags), + sizeof(*old) - sizeof(edge->flags) + deg * sizeof(*old->son)); + edge->trans[deg] = c; + edge->son[deg] = NULL; + P(edge_free)(TTC old); + *ref = edge; + return &edge->son[deg]; + } + else + { + TRIE_DBG("Growing array to hash table"); + edge = P(edge_alloc)(TTC (old->flags + 1) | TRIE_FLAG_HASH); + edge->node = old->node; + edge->len = old->len; + for (uns i = 0; i < deg; i++) + *P(hash_insert)(edge, old->trans[i]) = old->son[i]; + P(edge_free)(TTC old); + } + } + *ref = edge; + return P(hash_insert)(edge, c); +} + +#ifdef TRIE_WANT_DO_DELETE +static void +P(son_delete)(TAC struct P(edge) **ref, uns c) +{ + struct P(edge) *old = *ref, *edge; + uns deg = old->flags & TRIE_FLAG_DEG; + ASSERT(deg); + if (old->flags & TRIE_FLAG_HASH) + { + P(hash_delete)(old, c); + old->flags--; + deg--; + if (deg <= TRIE_HASH_THRESHOLD / 2) + { + TRIE_DBG("Reducing hash table to array"); + edge = P(edge_alloc)(TTC old->flags & ~TRIE_FLAG_HASH); + uns k = 0; + TRIE_HASH_FOR_ALL(old, trans, son) + edge->trans[k] = trans; + edge->son[k] = son; + k++; + TRIE_HASH_END_FOR; + ASSERT(k == deg); + } + else if (deg * 6 >= (TRIE_BUCKET_MASK << old->hash_rank)) // >= 16% + return; + else + { + P(hash_realloc)(TTC ref); + edge = *ref; + return; + } + } + else + { + TRIE_DBG("Reducing array"); + edge = P(edge_alloc)(TTC old->flags - 1); + uns j = 0; + for (uns i = 0; i < deg; i++) + if (old->trans[i] != c) + { + edge->trans[j] = old->trans[i]; + edge->son[j] = old->son[i]; + j++; + } + ASSERT(j == deg - 1); + } + edge->node = old->node; + edge->len = old->len; + P(edge_free)(TTC old); + *ref = edge; +} +#endif + +#ifdef TRIE_WANT_DO_DELETE +static struct P(edge) * +P(son_any)(struct P(edge) *edge) +{ + ASSERT(edge->flags & TRIE_FLAG_DEG); + if (!(edge->flags & TRIE_FLAG_HASH)) + return edge->son[0]; + else + for (uns i = 0; ; i++) + if ((i & TRIE_BUCKET_MASK) && (uintptr_t)edge->son[i] > 1) + return edge->son[i]; +} +#endif + +/*** Find/insert/delete ***/ + +#ifdef TRIE_WANT_DO_FIND +static struct P(edge) * +P(do_find)(TAC byte *ptr, uns len) +{ + TRIE_DBG("do_find('%.*s')", len, ptr); + struct P(edge) **ref = &T.root, *edge; + do + { + if (!(edge = *ref) || edge->len > len) + return NULL; + else if (edge->len == len) + return ((edge->flags & TRIE_FLAG_NODE) && !memcmp(ptr, P(str_get)(edge->node), len)) ? edge : NULL; + } + while (ref = P(son_find)(edge, P(str_char)(ptr, len, edge->len))); + return NULL; +} +#endif + +static struct P(edge) * +P(do_lookup)(TAC byte *ptr, uns len) +{ + TRIE_DBG("do_lookup('%.*s')", len, ptr); + struct P(edge) **ref, *edge, *leaf, *newleaf; + uns prefix, elen, trans, pos; + byte *eptr; + + if (!(edge = T.root)) + { + TRIE_DBG("Creating first edge"); + edge = T.root = P(edge_alloc)(TTC TRIE_FLAG_NODE); + edge->node = NULL; + edge->len = len; + return edge; + } + else + { + while (edge->len < len && (ref = P(son_find)(edge, P(str_char)(ptr, len, edge->len)))) + edge = *ref; + if (!(edge->flags & TRIE_FLAG_NODE)) + edge = edge->leaf; + ASSERT(edge->flags & TRIE_FLAG_NODE); + eptr = P(str_get)(edge->node); + elen = edge->len; + prefix = P(common_prefix)(ptr, len, eptr, elen); + if (prefix == len && prefix == elen) + return edge; + TRIE_DBG("The longest common prefix is '%.*s'", prefix, P(str_prefix)(ptr, len, prefix)); + + if (prefix < len) + { + TRIE_DBG("Creating a new leaf"); + newleaf = P(edge_alloc)(TTC TRIE_FLAG_NODE); + newleaf->node = NULL; + newleaf->len = len; + } + else + newleaf = NULL; + + ref = &T.root; + while (edge = *ref) + { + pos = edge->len; + if (prefix < pos) + { + leaf = (edge->flags & TRIE_FLAG_NODE) ? edge : edge->leaf; + TRIE_DBG("Splitting edge '%.*s'", leaf->len, P(str_get)(leaf->node)); + trans = P(str_char)(P(str_get)(leaf->node), leaf->len, prefix); + if (len == prefix) + { + edge = P(edge_alloc)(TTC 1 | TRIE_FLAG_NODE); + edge->len = prefix; + edge->node = NULL; + edge->trans[0] = trans; + edge->son[0] = *ref; + return *ref = edge; + } + else + { + edge = P(edge_alloc)(TTC 2); + edge->len = prefix; + edge->leaf = leaf; + edge->trans[0] = trans; + edge->son[0] = *ref; + edge->trans[1] = P(str_char)(ptr, len, prefix); + *ref = edge; + return edge->son[1] = newleaf; + } + } + if (pos == len) + { + TRIE_DBG("Adding the node to an already existing edge"); + edge->flags |= TRIE_FLAG_NODE; + edge->node = NULL; + return edge; + } + if (!(edge->flags & TRIE_FLAG_NODE) && newleaf) + edge->leaf = newleaf; + trans = P(str_char)(ptr, len, pos); + if (pos < prefix) + ref = P(son_find)(edge, trans); + else + ref = P(son_insert)(TTC ref, trans); + } + } + return *ref = newleaf; +} + +#ifdef TRIE_WANT_DO_DELETE +static P(node_t) * +P(do_delete)(TAC byte *ptr, uns len) +{ + TRIE_DBG("do_delete('%.*s')", len, ptr); + struct P(edge) **ref = &T.root, **pref = NULL, *edge, *parent, *leaf, *pold = NULL; + while (1) + { + if (!(edge = *ref) || edge->len > len) + return NULL; + else if (edge->len == len) + if ((edge->flags & TRIE_FLAG_NODE) && !memcmp(ptr, P(str_get)(edge->node), len)) + break; + else + return NULL; + pref = ref; + if (!(ref = P(son_find)(edge, P(str_char)(ptr, len, edge->len)))) + return NULL; + } + + P(node_t) *node = edge->node; + uns deg = edge->flags & TRIE_FLAG_DEG; + + if (!deg) + { + if (!pref) + { + TRIE_DBG("Deleting last edge"); + T.root = NULL; + P(edge_free)(TTC edge); + return node; + } + else + { + TRIE_DBG("Deleting a leaf"); + pold = *pref; + P(son_delete)(TTC pref, P(str_char)(ptr, len, pold->len)); + parent = *pref; + if ((parent->flags & (TRIE_FLAG_DEG | TRIE_FLAG_NODE)) <= 1) + { + ASSERT((parent->flags & (TRIE_FLAG_DEG | TRIE_FLAG_HASH)) == 1); + TRIE_DBG("... and its parent"); + leaf = *pref = parent->son[0]; + P(edge_free)(TTC parent); + } + else if (parent->flags & TRIE_FLAG_NODE) + leaf = parent; + else + leaf = P(son_any)(parent); + } + P(edge_free)(TTC edge); + } + else if (deg == 1) + { + TRIE_DBG("Deleting internal edge"); + ASSERT(!(edge->flags & TRIE_FLAG_HASH)); + leaf = *ref = edge->son[0]; + P(edge_free)(TTC edge); + } + else + { + TRIE_DBG("Deleting node, but leaving edge"); + leaf = P(son_any)(edge); + if (!(leaf->flags & TRIE_FLAG_NODE)) + leaf = leaf->leaf; + edge->leaf = leaf; + edge->flags &= ~TRIE_FLAG_NODE; + } + + TRIE_DBG("Updating leaf pointers"); + if (!(leaf->flags & TRIE_FLAG_NODE)) + leaf = leaf->leaf; + ASSERT(leaf->flags & TRIE_FLAG_NODE); + for (ref = &T.root; ref && (*ref)->len < len; ref = P(son_find)(*ref, P(str_char)(ptr, len, (*ref)->len))) + if ((*ref)->leaf == edge || (*ref)->leaf == pold) + (*ref)->leaf = leaf; + return node; +} +#endif + +#ifdef TRIE_WANT_FIND +static inline P(node_t) * +P(find)(TAC char *str) +{ + struct P(edge) *edge = P(do_find)(TTC str, str_len(str)); + return edge ? edge->node : NULL; +} +#endif + +#ifdef TRIE_WANT_FIND_BUF +static inline P(node_t) * +P(find_buf)(TAC byte *ptr, uns len) +{ + struct P(edge) *edge = P(do_find)(TTC ptr, len); + return edge ? edge->node : NULL; +} +#endif + +#ifdef TRIE_WANT_ADD +static inline void +P(add)(TAC P(node_t) *node) +{ + struct P(edge) *edge = P(do_lookup)(TTC P(str_get)(node), P(str_len)(node)); + ASSERT(!edge->node); + edge->node = node; +} +#endif + +#ifdef TRIE_WANT_REPLACE +static inline P(node_t) * +P(replace)(TAC P(node_t) *node) +{ + struct P(edge) *edge = P(do_lookup)(TTC P(str_get)(node), P(str_len)(node)); + P(node_t) *over = edge->node; + edge->node = node; + return over; +} +#endif + +#ifdef TRIE_WANT_DELETE +static inline P(node_t) * +P(delete)(TAC char *str) +{ + return P(do_delete)(TTC str, str_len(str)); +} +#endif + +#ifdef TRIE_WANT_DELETE_BUF +static inline P(node_t) * +P(delete_buf)(TAC byte *ptr, uns len) +{ + return P(do_delete)(TTC ptr, len); +} +#endif + +#ifdef TRIE_WANT_REMOVE +static inline void +P(remove)(TAC P(node_t) *node) +{ + if (unlikely(P(do_delete)(TTC P(str_get)(node), P(str_len)(node)) != node)) + ASSERT(0); +} +#endif + +/*** Traversing prefixes and subtrees ***/ + +#ifndef TRIE_FOR_ALL + +// for all matched edges until the first >=xlen (including) +#define TRIE_FOR_PREFIX_EDGES(px, xtrie, xptr, xlen, xedge) \ + do \ + { \ + byte *_ptr = (xptr); \ + uns _len = (xlen); \ + struct px##trie *_trie = (xtrie); \ + struct px##edge *xedge, **_ref; \ + if (!(xedge = _trie->root)) \ + break; \ + while (xedge->len < _len && (_ref = px##son_find(xedge, px##str_char(_ptr, _len, xedge->len)))) \ + xedge = *_ref; \ + if (!(xedge->flags & TRIE_FLAG_NODE)) \ + xedge = xedge->leaf; \ + uns _prefix = px##common_prefix(_ptr, _len, px##str_get(xedge->node), xedge->len); \ + for (_ref = &_trie->root; _ref && ((xedge = *_ref)->len <= _prefix || _prefix == _len); \ + _ref = (xedge->len < _prefix) ? px##son_find(xedge, px##str_char(_ptr, _len, xedge->len)) : NULL) \ + { +#define TRIE_END_PREFIX_EDGES \ + } \ + } \ + while (0) + +// for entire subtree starting in the xstart edge +#define TRIE_FOR_SUBTREE_EDGES(px, xstart, xedge) \ + do \ + { \ + struct { struct px##edge *edge; uns pos; } \ + *_sbuf = alloca(sizeof(*_sbuf) * 16), \ + *_sptr = _sbuf, *_send = _sbuf + 16; \ + struct px##edge *_next = (xstart), *xedge; \ + while (xedge = _next) \ + { \ + if (xedge->flags & TRIE_FLAG_DEG) \ + { \ + if (_sptr == _send) \ + { \ + uns stack_size = _sptr - _sbuf; \ + _sptr = alloca(sizeof(*_sptr) * (stack_size * 2)); \ + memcpy(_sptr, _sbuf, sizeof(*_sptr) * stack_size); \ + _sbuf = _sptr; \ + _send = _sptr + stack_size * 2; \ + } \ + _sptr->edge = xedge; \ + _sptr->pos = (xedge->flags & TRIE_FLAG_HASH) ? \ + (1U << px##bucket_rank) << xedge->hash_rank : \ + (xedge->flags & TRIE_FLAG_DEG); \ + _sptr++; \ + } \ + while (1) \ + { \ + if (_sptr == _sbuf) \ + { \ + _next = NULL; \ + break; \ + } \ + _next = (--_sptr)->edge; \ + uns pos = --(_sptr->pos); \ + uns flags = _next->flags; \ + _next = _next->son[pos]; \ + if (pos) \ + _sptr++; \ + if (!(flags & TRIE_FLAG_HASH) || \ + ((pos & ((1U << px##bucket_rank) - 1)) && \ + (uintptr_t)_next > 1)) \ + break; \ + } +#define TRIE_END_SUBTREE_EDGES \ + } \ + } \ + while (0) + +#define TRIE_FOR_SUBTREE(px, xstart, xnode) \ + TRIE_FOR_SUBTREE_EDGES(px, xstart, _edge) \ + if (_edge->flags & TRIE_FLAG_NODE) \ + { \ + px##node_t *xnode = _edge->node; +#define TRIE_END_SUBTREE \ + } \ + TRIE_END_SUBTREE_EDGES; + +#define TRIE_FOR_ALL_EDGES(px, xtrie, xedge) TRIE_FOR_SUBTREE_EDGES(px, (xtrie)->root, xedge) +#define TRIE_END_ALL_EDGES TRIE_END_SUBTREE_EDGES + +#define TRIE_FOR_ALL(px, xtrie, xnode) TRIE_FOR_SUBTREE(px, (xtrie)->root, xnode) +#define TRIE_END_ALL TRIE_END_SUBTREE + +#endif + +/*** Check consistency ***/ + +#ifdef TRIE_WANT_AUDIT + +static void +P(audit)(TA) +{ + uns count = 0; + TRIE_FOR_ALL_EDGES(TRIE_PREFIX(), &T, edge) + { + ASSERT(edge); + uns deg = edge->flags & TRIE_FLAG_DEG; + ASSERT(edge->node); + struct P(edge) * leaf = (edge->flags & TRIE_FLAG_NODE) ? edge : edge->leaf; + if (leaf != edge) + { + ASSERT(leaf->flags & TRIE_FLAG_NODE); + ASSERT(leaf->len > edge->len); + ASSERT(leaf->node); + } + TRIE_DBG("Checking edge %p, %s=%p, flags=0x%x, key='%.*s'", + edge, (edge->flags & TRIE_FLAG_NODE) ? "node" : "leaf", edge->node, edge->flags, + edge->len, P(str_prefix)(P(str_get)(leaf->node), leaf->len, edge->len)); + ASSERT(deg >= 2 || (edge->flags & TRIE_FLAG_NODE)); + if (edge->flags & TRIE_FLAG_HASH) + { + ASSERT(deg > 1 && deg <= 256); + uns count = 0, deleted = 0; + for (uns i = TRIE_BUCKET_SIZE << edge->hash_rank; i--; ) + if (i & TRIE_BUCKET_MASK) + if ((uintptr_t)edge->son[i] == 1) + deleted++; + else if (edge->son[i]) + { + ASSERT(edge->son[i]->len > edge->len); + count++; + } + ASSERT(count == deg); + ASSERT(deleted == edge->hash_deleted); + } + else + { + ASSERT(deg <= TRIE_HASH_THRESHOLD); + for (uns i = 0; i < deg; i++) + ASSERT(edge->son[i]->len > edge->len); + } + count++; + } + TRIE_END_ALL_EDGES; + TRIE_DBG("Found %u edges", count); +} + +#endif + +/*** Statistics ***/ + +#ifdef TRIE_WANT_STATS + +struct P(stats) { + u64 total_size; + u64 small_size; + u64 hash_size; +}; + +static void +P(stats)(TAC struct P(stats) *stats) +{ + bzero(stats, sizeof(*stats)); + for (uns i = 0; i < ARRAY_SIZE(T.epool); i++) + stats->small_size += ep_total_size(T.epool[i]); + for (uns i = 0; i < ARRAY_SIZE(T.hpool); i++) + stats->hash_size += ep_total_size(T.hpool[i]); + stats->total_size = stats->small_size + stats->hash_size + sizeof(T); +} + +static inline u64 +P(total_size)(TA) +{ + struct P(stats) stats; + P(stats)(TTC &stats); + return stats.total_size; +} + +#endif + +/*** Clean up local macros ***/ + +#undef P +#undef T +#undef TA +#undef TAC +#undef TT +#undef TTC + +#undef TRIE_PREFIX +#undef TRIE_NODE_TYPE +#undef TRIE_NODE_KEY +#undef TRIE_NODE_LEN +#undef TRIE_LEN_TYPE +#undef TRIE_REV +#undef TRIE_DYNAMIC +#undef TRIE_ELTPOOL_SIZE +#undef TRIE_HASH_THRESHOLD +#undef TRIE_BUCKET_RANK +#undef TRIE_BUCKET_SIZE +#undef TRIE_BUCKET_MASK +#undef TRIE_TRACE +#undef TRIE_DBG +#undef TRIE_HASH_FOR_ALL +#undef TRIE_HASH_END_FOR + +#undef TRIE_WANT_CLEANUP + +#undef TRIE_WANT_DO_FIND +#undef TRIE_WANT_DO_LOOKUP +#undef TRIE_WANT_DO_DELETE + +#undef TRIE_WANT_FIND +#undef TRIE_WANT_FIND_BUF +#undef TRIE_WANT_ADD +#undef TRIE_WANT_ADD_OVER +#undef TRIE_WANT_DELETE +#undef TRIE_WANT_DELETE_BUF +#undef TRIE_WANT_REMOVE + +#undef TRIE_WANT_AUDIT -- 2.39.2