# Object files listed as prerequisites of obj/lib/libsh.a; the rule below prefixes each name with obj/lib/.
SHLIB_OBJS=alloc.o alloc_str.o ctmatch.o db.o fastbuf.o fb-file.o fb-mem.o lists.o \
log.o log2.o md5.o md5hex.o mmap.o pagecache.o patimatch.o patmatch.o pool.o \
prime.o random.o realloc.o regex.o temp.o timer.o url.o wildmatch.o \
- wordsplit.o str_ctype.o str_upper.o bucket.o conf.o object.o sorter.o
+ wordsplit.o str_ctype.o str_upper.o bucket.o conf.o object.o sorter.o \
+ finger.o
obj/lib/libsh.a: $(addprefix obj/lib/,$(SHLIB_OBJS))
--- /dev/null
+/*
+ * Sherlock Library -- String Fingerprints
+ *
+ * (c) 2001 Martin Mares <mj@ucw.cz>
+ */
+
+/*
+ * We use a hashing function to map all the URLs and other
+ * hairy strings we work with to a much simpler universe
+ * of constant length bit strings (currently 96-bit ones).
+ * With a random hashing function (which is equivalent to
+ * having a fixed function and random input), the probability
+ * of at least one collision happening is at most c*n^2/m
+ * where n is the number of strings we hash, m is the size
+ * of our bit string universe (2^96) and c is a small constant.
+ * We set m sufficiently large and expect no collisions
+ * to occur. On the other hand, the worst thing which could
+ * be caused by a collision is mixing up two strings or labels
+ * of two documents which is relatively harmless.
+ */
+
+#include "lib/lib.h"
+#include "lib/index.h"
+#include "lib/md5.h"
+
+/*
+ * Compute the fingerprint of a NUL-terminated string.
+ *
+ * The string (without its terminating NUL) is hashed with MD5 and the
+ * leading bytes of the 16-byte digest are copied into fp->hash, as many
+ * as fp->hash can hold. Nothing is returned; the result is written
+ * through fp.
+ */
+void
+fingerprint(byte *string, struct fingerprint *fp)
+{
+ struct MD5Context c;
+ byte digest[16];
+
+ MD5Init(&c);
+ /* strlen() takes a char *; byte is presumably unsigned char, so cast explicitly */
+ MD5Update(&c, string, strlen((char *) string));
+ MD5Final(digest, &c);
+ /* Let the copy length follow the struct instead of a hard-coded 12 */
+ memcpy(fp->hash, digest, sizeof(fp->hash));
+}
WT_META, /* Various meta-information */
WT_ALT /* Alternate texts for graphical elements */
};
+
+/* Index card attributes */
+
+/*
+ * Attribute record kept for each index card.
+ * NOTE(review): assuming u32 is 4 bytes and byte is 1 byte, the fields
+ * add up to 12 bytes with no implicit padding -- confirm before changing
+ * the field order or sizes.
+ */
+struct card_attr {
+ u32 card; /* Reference to card description (either oid or filepos) */
+ u32 site_id; /* presumably identifies the site the card belongs to -- TODO confirm */
+ byte weight; /* presumably the card's ranking weight -- TODO confirm */
+ byte rfu[3]; /* presumably "reserved for future use" filler -- TODO confirm */
+};
+
+/* String fingerprints */
+
+/*
+ * Fixed-size fingerprint of a string. fingerprint() fills hash[] with a
+ * raw byte copy of the leading 12 bytes of the string's MD5 digest, so
+ * the numeric values of the individual words presumably depend on host
+ * byte order -- TODO confirm whether fingerprints are ever exchanged
+ * between machines.
+ */
+struct fingerprint {
+ u32 hash[3];
+};
+
+/*
+ * Compute the fingerprint of `string` and store it in *fp.
+ * `string` must be NUL-terminated: its length is taken with strlen().
+ */
+void fingerprint(byte *string, struct fingerprint *fp);