From 5274a12fc393232680f9932cbbf8f2755b023c96 Mon Sep 17 00:00:00 2001 From: Martin Mares Date: Wed, 11 Jun 2003 13:26:04 +0000 Subject: [PATCH] Split URL fingerprinting inside indexer from the other fingerprints. URL fingerprints will include server equivalence mappings and other such hacks (for now the "www." hack), the other fingerprints (used e.g. for hashing of strings in the index) won't. --- lib/Makefile | 2 +- lib/finger.c | 30 +++--------------------------- lib/index.h | 5 +++++ lib/urlkey.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+), 28 deletions(-) create mode 100644 lib/urlkey.c diff --git a/lib/Makefile b/lib/Makefile index 48abd228..6ed3976a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -8,7 +8,7 @@ LIBSH_MODS=alloc alloc_str ctmatch db fastbuf fb-file fb-mem lists \ prime random realloc regex timer url wildmatch \ wordsplit str_ctype str_upper bucket conf object sorter \ finger proctitle ipaccess profile bitsig randomkey \ - hashfunc base64 base224 fb-temp fb-mmap fb-printf + hashfunc base64 base224 fb-temp fb-mmap fb-printf urlkey LIBSH_MOD_PATHS=$(addprefix obj/lib/,$(LIBSH_MODS)) $(CUSTOM_LIB_MODULES) obj/lib/libsh.a: $(addsuffix .o,$(LIBSH_MOD_PATHS)) diff --git a/lib/finger.c b/lib/finger.c index a9ce7b49..585678a0 100644 --- a/lib/finger.c +++ b/lib/finger.c @@ -1,7 +1,7 @@ /* * Sherlock Library -- String Fingerprints * - * (c) 2001--2002 Martin Mares + * (c) 2001--2003 Martin Mares * * This software may be freely distributed and used according to the terms * of the GNU Lesser General Public License. @@ -18,46 +18,22 @@ * of our bit string universe (2^96) and c is a small constant. * We set m sufficiently large and expect no collisions * to occur. On the other hand, the worst thing which could - * be cause by a collision is mixing up two strings or labels + * be caused by a collision is mixing up two strings or labels * of two documents which is relatively harmless. 
*/ #include "lib/lib.h" -#include "lib/conf.h" #include "lib/index.h" #include "lib/md5.h" -#include - -static uns finger_www_hack; - -static struct cfitem finger_config[] = { - { "Fingerprints", CT_SECTION, NULL }, - { "WWWHack", CT_INT, &finger_www_hack }, - { NULL, CT_STOP, NULL } -}; - -static void CONSTRUCTOR finger_conf_init(void) -{ - cf_register(finger_config); -} - void fingerprint(byte *string, struct fingerprint *fp) { struct MD5Context c; - uns len = strlen(string); byte digest[16]; MD5Init(&c); - if (finger_www_hack && len >= 11 && !memcmp(string, "http://www.", 11)) - { - /* FIXME: This is a dirty hack, but it has to stay until we get real handling of duplicates */ - MD5Update(&c, string, 7); - MD5Update(&c, string+11, len-11); - } - else - MD5Update(&c, string, len); + MD5Update(&c, string, strlen(string)); MD5Final(digest, &c); memcpy(fp->hash, digest, 12); } diff --git a/lib/index.h b/lib/index.h index cd794c93..8ee0b51e 100644 --- a/lib/index.h +++ b/lib/index.h @@ -108,6 +108,11 @@ fp_hash(struct fingerprint *fp) return fp->hash[0] ^ fp->hash[1] ^ fp->hash[2] ^ fp->hash[3]; } +/* URL keys */ + +byte *url_key(byte *url, byte *buf); +void url_fingerprint(byte *url, struct fingerprint *fp); + /* Reading of tagged text (Unicode values, tags mapped to 0x80000000 and higher) */ #define GET_TAGGED_CHAR(p,u) do { \ diff --git a/lib/urlkey.c b/lib/urlkey.c new file mode 100644 index 00000000..3e437b61 --- /dev/null +++ b/lib/urlkey.c @@ -0,0 +1,48 @@ +/* + * Sherlock Library -- URL Keys & URL Fingerprints + * + * (c) 2003 Martin Mares + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. 
+ */
+
+#include "lib/lib.h"
+#include "lib/conf.h"
+#include "lib/index.h"
+#include "lib/url.h"
+
+#include <string.h>
+
+static uns urlkey_www_hack;		/* Set from URLKey.WWWHack; non-zero enables the "www." hack */
+
+static struct cfitem urlkey_config[] = {
+  { "URLKey", CT_SECTION, NULL },
+  { "WWWHack", CT_INT, &urlkey_www_hack },
+  { NULL, CT_STOP, NULL }
+};
+
+static void CONSTRUCTOR urlkey_conf_init(void)	/* Passes the URLKey section to cf_register() */
+{
+  cf_register(urlkey_config);
+}
+
+/* Key of @url: "http://www.X" is rewritten to "http://X" in @buf (assumed >= MAX_URL_SIZE bytes -- confirm with callers), anything else is returned as is. */
+byte *
+url_key(byte *url, byte *buf)
+{
+  uns len = strlen(url);
+  if (!urlkey_www_hack || strncmp(url, "http://www.", 11) || len - 4 >= MAX_URL_SIZE)
+    return url;				/* Hack off, prefix absent, or the key would not fit in buf */
+  memcpy(buf, "http://", 7);
+  memcpy(buf+7, url+11, len-10);	/* Rest of the URL including the terminating NUL */
+  return buf;
+}
+
+/* Compute the fingerprint of the key of @url (as produced by url_key) and store it to @fp. */
+void
+url_fingerprint(byte *url, struct fingerprint *fp)
+{
+  byte buf[MAX_URL_SIZE];
+  fingerprint(url_key(url, buf), fp);	/* Plain call: a void function must not return an expression */
+}
-- 
2.39.2