From: Martin Mares Date: Sun, 27 Oct 2002 20:06:47 +0000 (+0000) Subject: Worked around problems with "www.xyz.cz" and "xyz.cz" being considered identical X-Git-Tag: holmes-import~1315 X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;h=61192ac10b2c1ee0f39f652a94c9a2d681ab2b6a;p=libucw.git Worked around problems with "www.xyz.cz" and "xyz.cz" being considered identical in the gatherer and different in the indexer by adding a hack to the calculation of fingerprints (we cannot afford calling filters for each fingerprint, one of the reasons being speed, another being that filters are unavailable in the search server). Closes bug #302. --- diff --git a/lib/finger.c b/lib/finger.c index 6c5fa9d2..a9ce7b49 100644 --- a/lib/finger.c +++ b/lib/finger.c @@ -1,7 +1,7 @@ /* * Sherlock Library -- String Fingerprints * - * (c) 2001 Martin Mares + * (c) 2001--2002 Martin Mares * * This software may be freely distributed and used according to the terms * of the GNU Lesser General Public License. @@ -23,19 +23,41 @@ */ #include "lib/lib.h" +#include "lib/conf.h" #include "lib/index.h" #include "lib/md5.h" #include <string.h> +static uns finger_www_hack; + +static struct cfitem finger_config[] = { + { "Fingerprints", CT_SECTION, NULL }, + { "WWWHack", CT_INT, &finger_www_hack }, + { NULL, CT_STOP, NULL } +}; + +static void CONSTRUCTOR finger_conf_init(void) +{ + cf_register(finger_config); +} + void fingerprint(byte *string, struct fingerprint *fp) { struct MD5Context c; + uns len = strlen(string); byte digest[16]; MD5Init(&c); - MD5Update(&c, string, strlen(string)); + if (finger_www_hack && len >= 11 && !memcmp(string, "http://www.", 11)) + { + /* FIXME: This is a dirty hack, but it has to stay until we get real handling of duplicates */ + MD5Update(&c, string, 7); + MD5Update(&c, string+11, len-11); + } + else + MD5Update(&c, string, len); MD5Final(digest, &c); memcpy(fp->hash, digest, 12); }