2 * Sherlock Library -- String Fingerprints
4 * (c) 2001--2002 Martin Mares <mj@ucw.cz>
6 * This software may be freely distributed and used according to the terms
7 * of the GNU Lesser General Public License.
11 * We use a hashing function to map all the URL's and other
12 * hairy strings we work with to a much simpler universe
13 * of constant length bit strings (currently 96-bit ones).
14 * With a random hashing function (which is equivalent to
15 * having a fixed function and random input), the probability
16 * of at least one collision happening is at most c*n^2/m
17 * where n is the number of strings we hash, m is the size
18 * of our bit string universe (2^96) and c is a small constant.
19 * We set m sufficiently large and expect no collisions
20 * to occur. On the other hand, the worst thing which could
21 * be cause by a collision is mixing up two strings or labels
22 * of two documents which is relatively harmless.
27 #include "lib/index.h"
32 static uns finger_www_hack;
34 static struct cfitem finger_config[] = {
35 { "Fingerprints", CT_SECTION, NULL },
36 { "WWWHack", CT_INT, &finger_www_hack },
37 { NULL, CT_STOP, NULL }
40 static void CONSTRUCTOR finger_conf_init(void)
42 cf_register(finger_config);
46 fingerprint(byte *string, struct fingerprint *fp)
49 uns len = strlen(string);
53 if (finger_www_hack && len >= 11 && !memcmp(string, "http://www.", 11))
55 /* FIXME: This is a dirty hack, but it has to stay until we get real handling of duplicates */
56 MD5Update(&c, string, 7);
57 MD5Update(&c, string+11, len-11);
60 MD5Update(&c, string, len);
62 memcpy(fp->hash, digest, 12);