prime random realloc regex timer url wildmatch \
wordsplit str_ctype str_upper bucket conf object sorter \
finger proctitle ipaccess profile bitsig randomkey \
- hashfunc base64 base224 fb-temp fb-mmap fb-printf
+ hashfunc base64 base224 fb-temp fb-mmap fb-printf urlkey
LIBSH_MOD_PATHS=$(addprefix obj/lib/,$(LIBSH_MODS)) $(CUSTOM_LIB_MODULES)
obj/lib/libsh.a: $(addsuffix .o,$(LIBSH_MOD_PATHS))
/*
* Sherlock Library -- String Fingerprints
*
- * (c) 2001--2002 Martin Mares <mj@ucw.cz>
+ * (c) 2001--2003 Martin Mares <mj@ucw.cz>
*
* This software may be freely distributed and used according to the terms
* of the GNU Lesser General Public License.
* of our bit string universe (2^96) and c is a small constant.
* We set m sufficiently large and expect no collisions
* to occur. On the other hand, the worst thing which could
- * be cause by a collision is mixing up two strings or labels
+ * be caused by a collision is mixing up two strings or labels
* of two documents which is relatively harmless.
*/
#include "lib/lib.h"
-#include "lib/conf.h"
#include "lib/index.h"
#include "lib/md5.h"
-#include <string.h>
-
-static uns finger_www_hack;
-
-static struct cfitem finger_config[] = {
- { "Fingerprints", CT_SECTION, NULL },
- { "WWWHack", CT_INT, &finger_www_hack },
- { NULL, CT_STOP, NULL }
-};
-
-static void CONSTRUCTOR finger_conf_init(void)
-{
- cf_register(finger_config);
-}
-
+/*
+ * Compute the fingerprint of a NUL-terminated string: the string is hashed
+ * with MD5 and the first 12 bytes of the digest are stored to fp->hash.
+ */
void
fingerprint(byte *string, struct fingerprint *fp)
{
struct MD5Context c;
- uns len = strlen(string);
byte digest[16];
MD5Init(&c);
- if (finger_www_hack && len >= 11 && !memcmp(string, "http://www.", 11))
- {
- /* FIXME: This is a dirty hack, but it has to stay until we get real handling of duplicates */
- MD5Update(&c, string, 7);
- MD5Update(&c, string+11, len-11);
- }
- else
- MD5Update(&c, string, len);
+ /* The whole string is hashed as-is; no "http://www." special case here. */
+ MD5Update(&c, string, strlen(string));
MD5Final(digest, &c);
+ /* Only 12 of the 16 digest bytes are kept. */
memcpy(fp->hash, digest, 12);
}
return fp->hash[0] ^ fp->hash[1] ^ fp->hash[2] ^ fp->hash[3];
}
+/* URL keys */
+
+/*
+ * Return the key of `url': when the WWWHack option of the URLKey section is
+ * non-zero and the URL starts with "http://www.", the URL without the "www."
+ * part is written to `buf' and `buf' is returned; otherwise `url' itself is
+ * returned and `buf' is not written.
+ */
+byte *url_key(byte *url, byte *buf);
+/* Store the fingerprint of url_key(url) to `fp'. */
+void url_fingerprint(byte *url, struct fingerprint *fp);
+
/* Reading of tagged text (Unicode values, tags mapped to 0x80000000 and higher) */
#define GET_TAGGED_CHAR(p,u) do { \
--- /dev/null
+/*
+ * Sherlock Library -- URL Keys & URL Fingerprints
+ *
+ * (c) 2003 Martin Mares <mj@ucw.cz>
+ *
+ * This software may be freely distributed and used according to the terms
+ * of the GNU Lesser General Public License.
+ */
+
+#include "lib/lib.h"
+#include "lib/conf.h"
+#include "lib/index.h"
+#include "lib/url.h"
+
+#include <string.h>
+
+/* Value of the WWWHack option; url_key() tests it for non-zero. Zero unless set, as it has static storage. */
+static uns urlkey_www_hack;
+
+/*
+ * Configuration table: integer item "WWWHack" in section "URLKey", bound to
+ * urlkey_www_hack. The CT_STOP entry presumably terminates the table -- confirm
+ * against lib/conf.h.
+ */
+static struct cfitem urlkey_config[] = {
+ { "URLKey", CT_SECTION, NULL },
+ { "WWWHack", CT_INT, &urlkey_www_hack },
+ { NULL, CT_STOP, NULL }
+};
+
+/*
+ * Pass the urlkey_config table to cf_register().
+ * NOTE(review): CONSTRUCTOR presumably makes this run automatically at
+ * program start-up -- confirm against the macro's definition.
+ */
+static void CONSTRUCTOR urlkey_conf_init(void)
+{
+ cf_register(urlkey_config);
+}
+
+/*
+ * Map a URL to its key.
+ *
+ * If the WWWHack option is non-zero and `url' begins with "http://www.",
+ * the URL with the "www." part left out is written to `buf' and `buf' is
+ * returned.  In every other case `url' is returned unchanged and `buf' is
+ * not touched.
+ *
+ * NOTE(review): the copy is not bounded, so `buf' presumably has to hold at
+ * least strlen(url)-3 bytes -- confirm against the callers.
+ */
+byte *
+url_key(byte *url, byte *buf)
+{
+  /* Nothing to rewrite: hack disabled or prefix not present. */
+  if (!urlkey_www_hack || strncmp(url, "http://www.", 11) != 0)
+    return url;
+
+  /* 7 == strlen("http://"), 11 == strlen("http://www.") */
+  memcpy(buf, "http://", 7);
+  strcpy(buf + 7, url + 11);
+  return buf;
+}
+
+/*
+ * Compute the fingerprint of a URL's key: `url' is passed through url_key()
+ * and the result is handed to fingerprint(), which fills `fp'.
+ */
+void
+url_fingerprint(byte *url, struct fingerprint *fp)
+{
+  /* Scratch space url_key() may write the rewritten URL into. */
+  byte buf[MAX_URL_SIZE];
+
+  /* Plain call: ISO C does not allow `return expr;' in a void function. */
+  fingerprint(url_key(url, buf), fp);
+}