From fa5fc267286efc42f0c610c4e54a117a9bc07232 Mon Sep 17 00:00:00 2001 From: Martin Mares Date: Thu, 31 Jan 2002 11:36:45 +0000 Subject: [PATCH] Added a data structure for very efficient probabilistic representation of sets. For more info, consult comments at the start of bitsig.c. --- lib/Makefile | 2 +- lib/bitsig.c | 148 +++++++++++++++++++++++++++++++++++++++++++++++++++ lib/bitsig.h | 11 ++++ 3 files changed, 160 insertions(+), 1 deletion(-) create mode 100644 lib/bitsig.c create mode 100644 lib/bitsig.h diff --git a/lib/Makefile b/lib/Makefile index f9492a38..9cf5c53b 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -7,7 +7,7 @@ SHLIB_OBJS=alloc.o alloc_str.o ctmatch.o db.o fastbuf.o fb-file.o fb-mem.o lists log.o log2.o md5.o md5hex.o mmap.o pagecache.o patimatch.o patmatch.o pool.o \ prime.o random.o realloc.o regex.o timer.o url.o wildmatch.o \ wordsplit.o str_ctype.o str_upper.o bucket.o conf.o object.o sorter.o \ - finger.o proctitle.o ipaccess.o profile.o + finger.o proctitle.o ipaccess.o profile.o bitsig.o obj/lib/libsh.a: $(addprefix obj/lib/,$(SHLIB_OBJS)) diff --git a/lib/bitsig.c b/lib/bitsig.c new file mode 100644 index 00000000..1f76fa91 --- /dev/null +++ b/lib/bitsig.c @@ -0,0 +1,148 @@ +/* + * Bit Array Signatures -- A Dubious Detector of Duplicates + * + * (c) 2002 Martin Mares + * + * FIXME: Ref to the original article. + * + * This data structure provides a very compact representation + * of a set of strings with insertion and membership search, + * but with a certain low probability it cheats by incidentally + * reporting a non-member as a member. Generally the larger you + * create the structure, the lower this probability is. + * + * How does it work: the structure is just an array of M bits + * and each possible element is hashed to a set of (at most) L + * bit positions. For each element of the represented set, we + * set its L bits to ones and we report as present all elements + * whose all L bits ar set. + * + * Analysis: Let's assume N items have already been stored and let A + * denote L/M (density of the hash function). The probability that + * a fixed bit of the array is set by any of the N items is + * 1 - (1-1/M)^(NL) = 1 - ((1-1/M)^M)^NA = approx. 1 - e^-NA. + * This is minimized by setting A=(ln 2)/N (try taking derivative). + * Given a non-present item, the probability that all of the bits + * corresponding to this item are set by the other items (that is, + * the structure gives a false answer) is (1-e^-NA)^L = 2^-L. + * Hence, if we want to give false answers with probability less + * than epsilon, we take L := -log_2 epsilon, M := 1.45*N*L. + * + * Example: For a set of 10^7 items with P[error] < 10^-6, we set + * L := 20 and M := 290*10^6 bits = cca 34.5 MB (29 bits per item). + * + * We leave L and an upper bound for N as parameters set during + * creation of the structure. Currently, the structure is limited + * to 4 Gb = 512 MB. + */ + +#include "lib/lib.h" +#include "lib/bitsig.h" +#include "lib/md5.h" + +#include + +struct bitsig { + uns l, m, n, maxn, max_m_mult; + u32 tmp; + byte array[0]; +}; + +struct bitsig * +bitsig_init(uns perrlog, uns maxn) +{ + struct bitsig *b; + u64 m; + uns mbytes; + + m = ((u64) maxn * perrlog * 145 + 99) / 100; + if (m >= (u64) 1 << 32) + die("bitsig_init: bitsig array too large (maximum is 4 Gb)"); + mbytes = (m + 7) >> 3U; + b = xmalloc(sizeof(struct bitsig) + mbytes); + b->l = perrlog; + b->m = m; + b->n = 0; + b->maxn = maxn; + b->max_m_mult = (0xffffffff / m) * m; + bzero(b->array, mbytes); + log(L_DEBUG, "Initialized bitsig array with l=%d, m=%u (%u KB), expecting %d items", b->l, b->m, (mbytes+1023)/1024, maxn); + return b; +} + +static void +bitsig_hash_init(struct bitsig *b, byte *item) +{ + struct MD5Context c; + u32 digest[4]; + + MD5Init(&c); + MD5Update(&c, item, strlen(item)); + MD5Final((byte *) digest, &c); + b->tmp = digest[0]; +} + +static inline uns +bitsig_hash_bit(struct bitsig *b) +{ + do + { + /* FIXME: Check */ + b->tmp *= 3006477127U; + } + while (b->tmp >= b->max_m_mult); + return b->tmp % b->m; +} + +int +bitsig_member(struct bitsig *b, byte *item) +{ + uns i, bit; + + bitsig_hash_init(b, item); + for (i=0; il; i++) + { + bit = bitsig_hash_bit(b); + if (!(b->array[bit >> 3] & (1 << (bit & 7)))) + return 0; + } + return 1; +} + +int +bitsig_insert(struct bitsig *b, byte *item) +{ + uns i, bit, was; + + bitsig_hash_init(b, item); + was = 1; + for (i=0; il; i++) + { + bit = bitsig_hash_bit(b); + if (!(b->array[bit >> 3] & (1 << (bit & 7)))) + { + was = 0; + b->array[bit >> 3] |= (1 << (bit & 7)); + } + } + if (!was && b->n++ == b->maxn+1) + log(L_ERROR, "bitsig: Too many items inserted, error rate will be higher than estimated!"); + return was; +} + +#ifdef TEST + +#include + +int main(void) +{ + struct bitsig *b = bitsig_init(10, 23000); + byte buf[1024]; + + while (fgets(buf, 1024, stdin)) + printf("%d\n", bitsig_insert(b, buf)); + + return 0; +} + +#endif diff --git a/lib/bitsig.h b/lib/bitsig.h new file mode 100644 index 00000000..88aba524 --- /dev/null +++ b/lib/bitsig.h @@ -0,0 +1,11 @@ +/* + * Bit Array Signatures -- A Dubious Detector of Duplicates + * + * (c) 2002 Martin Mares + */ + +struct bitsig; + +struct bitsig *bitsig_init(uns perrlog, uns maxn); +int bitsig_member(struct bitsig *b, byte *item); +int bitsig_insert(struct bitsig *b, byte *item); -- 2.39.2