From b4d8192e80f3e272b8cc4e4aed4001f33430a11e Mon Sep 17 00:00:00 2001 From: Martin Mares Date: Thu, 15 Feb 2001 19:05:39 +0000 Subject: [PATCH] Added URL fingerprints. --- lib/Makefile | 3 ++- lib/finger.c | 36 ++++++++++++++++++++++++++++++++++++ lib/index.h | 17 +++++++++++++++++ 3 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 lib/finger.c diff --git a/lib/Makefile b/lib/Makefile index 0179e9b8..ac13deea 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -6,7 +6,8 @@ PROGS+=obj/lib/db-test obj/lib/db-rebuild obj/lib/buckettool obj/lib/sort-test SHLIB_OBJS=alloc.o alloc_str.o ctmatch.o db.o fastbuf.o fb-file.o fb-mem.o lists.o \ log.o log2.o md5.o md5hex.o mmap.o pagecache.o patimatch.o patmatch.o pool.o \ prime.o random.o realloc.o regex.o temp.o timer.o url.o wildmatch.o \ - wordsplit.o str_ctype.o str_upper.o bucket.o conf.o object.o sorter.o + wordsplit.o str_ctype.o str_upper.o bucket.o conf.o object.o sorter.o \ + finger.o obj/lib/libsh.a: $(addprefix obj/lib/,$(SHLIB_OBJS)) diff --git a/lib/finger.c b/lib/finger.c new file mode 100644 index 00000000..64205349 --- /dev/null +++ b/lib/finger.c @@ -0,0 +1,36 @@ +/* + * Sherlock Library -- String Fingerprints + * + * (c) 2001 Martin Mares + */ + +/* + * We use a hashing function to map all the URL's and other + * hairy strings we work with to a much simpler universe + * of constant length bit strings (currently 96-bit ones). + * With a random hashing function (which is equivalent to + * having a fixed function and random input), the probability + * of at least one collision happening is at most c*n^2/m + * where n is the number of strings we hash, m is the size + * of our bit string universe (2^96) and c is a small constant. + * We set m sufficiently large and expect no collisions + * to occur. On the other hand, the worst thing which could + * be caused by a collision is mixing up two strings or labels + * of two documents which is relatively harmless. 
+ */ + +#include "lib/lib.h" +#include "lib/index.h" +#include "lib/md5.h" + +void +fingerprint(byte *string, struct fingerprint *fp) /* Hash the NUL-terminated string with the MD5 routines and store the first 12 digest bytes in fp->hash. */ +{ + struct MD5Context c; + byte digest[16]; /* filled in by MD5Final() below */ + + MD5Init(&c); + MD5Update(&c, string, strlen(string)); /* strlen() stops at the NUL, so the terminator itself is not hashed. NOTE(review): string is byte * while strlen() takes const char * -- confirm how byte is defined, a cast may be needed; also confirm that lib/lib.h provides the strlen()/memcpy() declarations. */ + MD5Final(digest, &c); + memcpy(fp->hash, digest, 12); /* only 12 of the 16 digest bytes are kept; this is a raw byte copy into hash[], so the u32 values presumably depend on host byte order -- TODO confirm whether that matters to callers */ +} diff --git a/lib/index.h b/lib/index.h index c334ca5a..f2b49160 100644 --- a/lib/index.h +++ b/lib/index.h @@ -22,3 +22,20 @@ enum word_type { WT_META, /* Various meta-information */ WT_ALT /* Alternate texts for graphical elements */ }; + +/* Index card attributes */ + +struct card_attr { + u32 card; /* Reference to card description (either oid or filepos) */ + u32 site_id; + byte weight; + byte rfu[3]; /* presumably "reserved for future use" -- TODO confirm */ +}; + +/* String fingerprints: hash[] is filled by fingerprint() with the first 12 bytes of the string's MD5 digest */ + +struct fingerprint { + u32 hash[3]; +}; + +void fingerprint(byte *string, struct fingerprint *fp); /* string must be NUL-terminated (its length is taken with strlen()); the result is written to *fp */ -- 2.39.2