2 * UCW Library -- Bit Array Signatures -- A Dubious Detector of Duplicates
4 * (c) 2002 Martin Mares <mj@ucw.cz>
6 * Greatly inspired by: Faloutsos, C. and Christodoulakis, S.: Signature files
7 * (An access method for documents and its analytical performance evaluation),
8 * ACM Trans. Office Inf. Syst., 2(4):267--288, Oct. 1984.
10 * This data structure provides a very compact representation
11 * of a set of strings with insertion and membership search,
12 * but with a certain low probability it cheats by incidentally
13 * reporting a non-member as a member. Generally the larger you
14 * create the structure, the lower this probability is.
16 * How does it work: the structure is just an array of M bits
17 * and each possible element is hashed to a set of (at most) L
18 * bit positions. For each element of the represented set, we
19 * set its L bits to ones and we report as present all elements
20 * whose all L bits ar set.
22 * Analysis: Let's assume N items have already been stored and let A
23 * denote L/M (density of the hash function). The probability that
24 * a fixed bit of the array is set by any of the N items is
25 * 1 - (1-1/M)^(NL) = 1 - ((1-1/M)^M)^NA = approx. 1 - e^-NA.
26 * This is minimized by setting A=(ln 2)/N (try taking derivative).
27 * Given a non-present item, the probability that all of the bits
28 * corresponding to this item are set by the other items (that is,
29 * the structure gives a false answer) is (1-e^-NA)^L = 2^-L.
30 * Hence, if we want to give false answers with probability less
31 * than epsilon, we take L := -log_2 epsilon, M := 1.45*N*L.
33 * Example: For a set of 10^7 items with P[error] < 10^-6, we set
34 * L := 20 and M := 290*10^6 bits = cca 34.5 MB (29 bits per item).
36 * We leave L and an upper bound for N as parameters set during
37 * creation of the structure. Currently, the structure is limited
40 * This software may be freely distributed and used according to the terms
41 * of the GNU Lesser General Public License.
45 #include <ucw/bitsig.h>
51 uns l, m, n, maxn, max_m_mult;
58 bitsig_init(uns perrlog, uns maxn)
64 m = ((u64) maxn * perrlog * 145 + 99) / 100;
65 if (m >= (u64) 1 << 32)
66 die("bitsig_init: bitsig array too large (maximum is 4 Gb)");
67 mbytes = (m + 7) >> 3U;
68 b = xmalloc(sizeof(struct bitsig) + mbytes);
73 b->max_m_mult = (0xffffffff / m) * m;
74 bzero(b->array, mbytes);
75 msg(L_DEBUG, "Initialized bitsig array with l=%d, m=%u (%u KB), expecting %d items", b->l, b->m, (mbytes+1023)/1024, maxn);
80 bitsig_free(struct bitsig *b)
86 bitsig_hash_init(struct bitsig *b, byte *item)
88 md5_hash_buffer((byte *) b->hash, item, strlen(item));
93 bitsig_hash_bit(struct bitsig *b)
98 h = b->hash[b->hindex];
99 b->hash[b->hindex] *= 3006477127U;
100 b->hindex = (b->hindex+1) % 4;
102 while (h >= b->max_m_mult);
107 bitsig_member(struct bitsig *b, byte *item)
111 bitsig_hash_init(b, item);
112 for (i=0; i<b->l; i++)
114 bit = bitsig_hash_bit(b);
115 if (!(b->array[bit >> 3] & (1 << (bit & 7))))
122 bitsig_insert(struct bitsig *b, byte *item)
126 bitsig_hash_init(b, item);
128 for (i=0; i<b->l; i++)
130 bit = bitsig_hash_bit(b);
131 if (!(b->array[bit >> 3] & (1 << (bit & 7))))
134 b->array[bit >> 3] |= (1 << (bit & 7));
137 if (!was && b->n++ == b->maxn+1)
138 msg(L_ERROR, "bitsig: Too many items inserted, error rate will be higher than estimated!");
147 int main(int argc, char **argv)
149 struct bitsig *b = bitsig_init(atol(argv[1]), atol(argv[2]));
152 while (fgets(buf, 1024, stdin))
153 printf("%d\n", bitsig_insert(b, buf));