Added URL fingerprints.

author Martin Mares <mj@ucw.cz>

Thu, 15 Feb 2001 19:05:39 +0000 (19:05 +0000)

committer Martin Mares <mj@ucw.cz>

Thu, 15 Feb 2001 19:05:39 +0000 (19:05 +0000)
author Martin Mares <mj@ucw.cz>
Thu, 15 Feb 2001 19:05:39 +0000 (19:05 +0000)
committer Martin Mares <mj@ucw.cz>
Thu, 15 Feb 2001 19:05:39 +0000 (19:05 +0000)
diff --git a/lib/Makefile b/lib/Makefile

index 0179e9b85bc7da0e3a60bb16f17ee5a554880cd5..ac13deea5014ce6751ec07d77101e179bb6a8b63 100644 (file)
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -6,7 +6,8 @@ PROGS+=obj/lib/db-test obj/lib/db-rebuild obj/lib/buckettool obj/lib/sort-test
  SHLIB_OBJS=alloc.o alloc_str.o ctmatch.o db.o fastbuf.o fb-file.o fb-mem.o lists.o \
         log.o log2.o md5.o md5hex.o mmap.o pagecache.o patimatch.o patmatch.o pool.o \
         prime.o random.o realloc.o regex.o temp.o timer.o url.o wildmatch.o \
-       wordsplit.o str_ctype.o str_upper.o bucket.o conf.o object.o sorter.o
+       wordsplit.o str_ctype.o str_upper.o bucket.o conf.o object.o sorter.o \
+       finger.o
  
  obj/lib/libsh.a: $(addprefix obj/lib/,$(SHLIB_OBJS))
  
diff --git a/lib/finger.c b/lib/finger.c

new file mode 100644 (file)

index 0000000..6420534
--- /dev/null
+++ b/lib/finger.c
@@ -0,0 +1,36 @@
+/*
+ *     Sherlock Library -- String Fingerprints
+ *
+ *     (c) 2001 Martin Mares <mj@ucw.cz>
+ */
+
+/*
+ *  We use a hashing function to map all the URLs and other
+ *  hairy strings we work with to a much simpler universe
+ *  of constant length bit strings (currently 96-bit ones).
+ *  With a random hashing function (which is equivalent to
+ *  having a fixed function and random input), the probability
+ *  of at least one collision happening is at most c*n^2/m
+ *  where n is the number of strings we hash, m is the size
+ *  of our bit string universe (2^96) and c is a small constant.
+ *  We set m sufficiently large and expect no collisions
+ *  to occur. On the other hand, the worst thing which could
+ *  be caused by a collision is mixing up two strings or labels
+ *  of two documents which is relatively harmless.
+ */
+
+#include "lib/lib.h"
+#include "lib/index.h"
+#include "lib/md5.h"
+
+void
+fingerprint(byte *string, struct fingerprint *fp)  /* Store the MD5-based fingerprint of the NUL-terminated string in *fp */
+{
+  struct MD5Context c;
+  byte digest[16];                     /* Receives the digest produced by MD5Final() */
+
+  MD5Init(&c);
+  MD5Update(&c, string, strlen(string));  /* Length comes from strlen(), so the terminating NUL is not hashed */
+  MD5Final(digest, &c);
+  memcpy(fp->hash, digest, 12);        /* Keep only the first 12 of the 16 digest bytes (96 bits) */
+}
diff --git a/lib/index.h b/lib/index.h

index c334ca5a90174d5efe885eaff42abe3d76f41e71..f2b491601b3aa72381bdf52b0e4bc77ec02a7fb7 100644 (file)
--- a/lib/index.h
+++ b/lib/index.h
@@ -22,3 +22,20 @@ enum word_type {
    WT_META,                             /* Various meta-information */
    WT_ALT                               /* Alternate texts for graphical elements */
  };
+
+/* Index card attributes */
+
+struct card_attr {
+  u32 card;                            /* Reference to card description (either oid or filepos) */
+  u32 site_id;                         /* NOTE(review): presumably identifies the site the card belongs to -- confirm */
+  byte weight;                         /* NOTE(review): presumably a ranking weight for the card -- confirm */
+  byte rfu[3];                         /* NOTE(review): looks like padding reserved for future use -- confirm */
+};
+
+/* String fingerprints */
+
+struct fingerprint {
+  u32 hash[3];                         /* fingerprint() copies the first 12 bytes of the MD5 digest here */
+};
+
+void fingerprint(byte *string, struct fingerprint *fp);  /* Hash the NUL-terminated string with MD5 and store a truncated digest in *fp */
author	Martin Mares <mj@ucw.cz>
	Thu, 15 Feb 2001 19:05:39 +0000 (19:05 +0000)
committer	Martin Mares <mj@ucw.cz>
	Thu, 15 Feb 2001 19:05:39 +0000 (19:05 +0000)
lib/Makefile		patch \| blob \| history
lib/finger.c	[new file with mode: 0644]	patch \| blob
lib/index.h		patch \| blob \| history