2 * Sherlock Library -- URL Keys & URL Fingerprints
4 * (c) 2003 Martin Mares <mj@ucw.cz>
6 * This software may be freely distributed and used according to the terms
7 * of the GNU Lesser General Public License.
14 #include "lib/index.h"
16 #include "lib/fastbuf.h"
17 #include "lib/chartype.h"
18 #include "lib/hashfunc.h"
23 /*** Prefix recognition table ***/
/* Prefix-tree node associated with this right-hand side; url_key() compares it with the matched rule. */
26 struct pxtab_node *node;
/* Node this one hangs under; it is part of the hash key and is NULL for a first component. */
32 struct pxtab_node *parent;
/* Right-hand side attached to this node; url_key() tests it for NULL before use. */
33 struct pxtab_rhs *rhs;
/*
 * Parameters for the generic hash table pulled in below from lib/hashtable.h.
 * Generated identifiers carry the pxtab_ prefix (pxtab_find, pxtab_lookup, ...).
 * A key is the triple (parent node, component bytes, component length).
 */
38 #define HASH_NODE struct pxtab_node
39 #define HASH_PREFIX(p) pxtab_##p
40 #define HASH_KEY_COMPLEX(x) x parent, x component, x len
41 #define HASH_KEY_DECL struct pxtab_node *parent UNUSED, byte *component UNUSED, uns len UNUSED
/* Request the find and lookup operations used by pxtab_find_rule() and pxtab_add_rule(). */
42 #define HASH_WANT_FIND
43 #define HASH_WANT_LOOKUP
/* This file supplies its own pxtab_hash(), pxtab_extra_size() and pxtab_init_key(). */
44 #define HASH_GIVE_HASHFN
46 #define HASH_GIVE_EXTRA_SIZE
47 #define HASH_GIVE_INIT_KEY
/* NOTE(review): presumably nodes are allocated from the pool cfpool -- confirm in lib/hashtable.h. */
48 #define HASH_USE_POOL cfpool
/*
 * Hash of a key triple: the parent pointer converted to uns, XORed with
 * hash_block() over the len bytes of component.
 * NOTE(review): if uns is narrower than a pointer, the cast drops the upper
 * address bits. That is harmless for hashing, but compilers may warn -- confirm.
 */
51 pxtab_hash(HASH_KEY_DECL)
53 return ((uns)parent) ^ hash_block(component, len);
/*
 * Key equality: same parent node, same length and identical component bytes.
 * The length test precedes memcmp(), so only equally long components are compared.
 */
57 pxtab_eq(struct pxtab_node *p1, byte *c1, uns l1, struct pxtab_node *p2, byte *c2, uns l2)
59 return p1 == p2 && l1 == l2 && !memcmp(c1, c2, l1);
/* Presumably the extra-size hook announced by HASH_GIVE_EXTRA_SIZE; it receives the same key triple as pxtab_hash(). */
63 pxtab_extra_size(HASH_KEY_DECL)
/*
 * Store a key in node: record the parent and copy the len component bytes
 * into node->component. The memcpy() itself adds no terminator.
 */
69 pxtab_init_key(struct pxtab_node *node, HASH_KEY_DECL)
71 node->parent = parent;
73 memcpy(node->component, component, len);
77 #include "lib/hashtable.h"
/*
 * Step over the first component of a URL.
 * The first loop stops at ':' or at the end of the string; the second one
 * keeps going for as long as x does not point at two consecutive slashes.
 */
80 pxtab_skip_first_comp(byte *x)
82 while (*x && *x != ':')
85 while (*x != '/' || x[1] != '/')
/* Step over one further component starting at x; callers treat the result as the end of that component. */
95 pxtab_skip_next_comp(byte *x)
/*
 * Look up the prefix rule matching the URL lhs.
 * The URL is consumed component by component; each component is looked up
 * in the hash table under the node found for the previous one.
 * Returns the node found last, or parent if the final lookup failed
 * (parent starts out as NULL).
 */
107 static struct pxtab_node *
108 pxtab_find_rule(byte *lhs)
111 struct pxtab_node *node, *parent = NULL;
/* First components are stored with a NULL parent. */
113 next = pxtab_skip_first_comp(lhs);
/* NOTE(review): next-lhs is a pointer difference, while a printf-style "%.*s" expects an int precision -- confirm how DBG forwards its arguments. */
114 DBG("\tfirst: %.*s", next-lhs, lhs);
115 node = pxtab_find(NULL, lhs, next-lhs);
/* Keep descending while the previous component matched and input remains. */
116 while (node && *next)
120 next = pxtab_skip_next_comp(lhs);
121 DBG("\tnext: %.*s", next-lhs, lhs);
122 node = pxtab_find(parent, lhs, next-lhs);
/* GNU "?:" with omitted middle operand: node if non-NULL, otherwise parent. */
124 return node ? : parent;
/*
 * Enter the URL prefix lhs into the prefix table and attach rhs to it.
 * Unlike pxtab_find_rule(), this uses pxtab_lookup() for each component.
 * pxtab_load() treats a NULL return as a duplicate rule.
 */
127 static struct pxtab_node *
128 pxtab_add_rule(byte *lhs, struct pxtab_rhs *rhs)
/* Remember the start so the total prefix length can be computed at the end. */
130 byte *lhs_start = lhs;
132 struct pxtab_node *node, *parent;
/* First components are stored with a NULL parent. */
134 next = pxtab_skip_first_comp(lhs);
135 DBG("\tfirst: %.*s", next-lhs, lhs);
136 node = pxtab_lookup(NULL, lhs, next-lhs);
144 next = pxtab_skip_next_comp(lhs);
146 DBG("\tnext: %.*s", next-lhs, lhs);
147 node = pxtab_lookup(parent, lhs, next-lhs);
149 DBG("\tsetting rhs, %d to eat", next-lhs_start);
/* Number of bytes of the URL covered by this rule; url_key() skips that many before appending the rest. */
151 node->total_len = next - lhs_start;
/*
 * Create a right-hand side record for the URL rhs and enter the URL itself
 * into the prefix table, pointing at the new record.
 * pxtab_load() treats a NULL return as "already mapped".
 */
155 static struct pxtab_rhs *
156 pxtab_add_rhs(byte *rhs)
158 uns len = strlen(rhs);
/* NOTE(review): len+1 bytes are copied below but only sizeof(*r)+len are allocated. That is safe only if the declared rhs[] member already has room for one byte -- confirm against struct pxtab_rhs. */
159 struct pxtab_rhs *r = cfg_malloc(sizeof(*r) + len);
/* The +1 copies the terminating zero as well. */
161 memcpy(r->rhs, rhs, len+1);
/* The RHS URL is entered as a rule of its own, mapped to r. */
162 struct pxtab_node *node = pxtab_add_rule(rhs, r);
/*
 * Load the prefix table from the file name.
 * The file is read line by line. Each URL is canonicalized, a right-hand
 * side is created by pxtab_add_rhs() and left-hand sides are added by
 * pxtab_add_rule(). Any malformed or duplicate entry terminates the program
 * via die() with the file name and line number.
 */
168 pxtab_load(byte *name)
/* Right-hand side handed to pxtab_add_rule() for left-hand sides; NULL until one has been created. */
171 struct pxtab_rhs *rhs = NULL;
172 byte line[MAX_URL_SIZE], url[MAX_URL_SIZE], *c, *d;
176 DBG("Loading prefix table %s", name);
177 f = bopen(name, O_RDONLY, 4096);
178 while (bgets(f, line, sizeof(line)))
/* Empty lines and lines starting with '#' are singled out here (presumably skipped). */
184 if (!*c || *c == '#')
/* Intentional assignment: err keeps the error code for url_error(). */
186 if (err = url_auto_canonicalize(c, url))
187 die("%s, line %d: Invalid URL (%s)", name, lino, url_error(err));
/* The last '/' must exist and be the final character. */
188 if (!(d = strrchr(c, '/')) || d[1])
189 die("%s, line %d: Prefix rules must end with a slash", name, lino);
192 DBG("Creating RHS <%s>", c);
193 if (!(rhs = pxtab_add_rhs(c)))
194 die("%s, line %d: Right-hand side already mapped", name, lino);
197 die("%s, line %d: Syntax error", name, lino);
200 DBG("Adding LHS <%s>", c);
201 if (!pxtab_add_rule(c, rhs))
202 die("%s, line %d: Duplicate rule", name, lino);
208 /*** Configuration ***/
/* Non-zero enables the "http://www." prefix rewrite in url_key_remove_www(). */
210 static uns urlkey_www_hack;
/* Path of the prefix table file; no table is loaded while this is NULL. */
211 static byte *urlkey_pxtab_path;

/* Configuration items of section "URLKey": WWWHack (integer) and PrefixTable (string). */
213 static struct cfitem urlkey_config[] = {
214 { "URLKey", CT_SECTION, NULL },
215 { "WWWHack", CT_INT, &urlkey_www_hack },
216 { "PrefixTable", CT_STRING, &urlkey_pxtab_path },
217 { NULL, CT_STOP, NULL }
/* Register the URLKey configuration items; the function is marked with CONSTRUCTOR. */
220 static void CONSTRUCTOR urlkey_conf_init(void)
222 cf_register(urlkey_config);
/* Load the prefix table only when a PrefixTable path has been configured. */
229 if (urlkey_pxtab_path)
230 pxtab_load(urlkey_pxtab_path);
/*
 * Optional "WWW hack": when enabled and url starts with "http://www.",
 * write a copy without the "www." part into the caller's buffer.
 * pbuf points to the caller's buffer pointer and is advanced when the
 * buffer has been used.
 */
234 url_key_remove_www(byte *url, byte **pbuf)
/* 11 is the length of "http://www.". */
236 if (urlkey_www_hack && !strncmp(url, "http://www.", 11))
/* NOTE(review): both strcpy() calls are unbounded; presumably url is shorter than MAX_URL_SIZE -- confirm with callers. */
239 strcpy(buf, "http://");
/* buf+7 is just past "http://"; url+11 is just past "http://www.". */
240 strcpy(buf+7, url+11);
241 DBG("\tWWW hack: %s -> %s", url, buf);
/* Move the caller's buffer pointer MAX_URL_SIZE bytes on, so later output does not overwrite the rewritten URL. */
243 *pbuf = buf + MAX_URL_SIZE;
/*
 * Compute the URL key of url, using buf as working space.
 * The WWW hack is applied first; then the prefix table is consulted, and if
 * a rule with a foreign right-hand side matches, the matched prefix is
 * replaced by that right-hand side.
 * Callers use the returned pointer as the key string.
 */
249 url_key(byte *url, byte *buf)
251 DBG("Generating URL key for %s", url);
/* May replace url by a rewritten copy and advance buf past it. */
252 url = url_key_remove_www(url, &buf);
253 struct pxtab_node *rule = pxtab_find_rule(url);
/* Rewrite only if the rule has a right-hand side and is not the node that right-hand side itself points to. */
254 if (rule && rule->rhs && rule->rhs->node != rule)
256 struct pxtab_rhs *rhs = rule->rhs;
257 DBG("\tApplying rule <%s>, remove %d, add %d", rhs->rhs, rule->total_len, rhs->len);
/* NOTE(review): no bound on the combined length is visible here; presumably the key buffer is sized for it -- confirm URL_KEY_BUF_SIZE. */
258 memcpy(buf, rhs->rhs, rhs->len);
/* Append whatever follows the matched prefix, including the terminating zero. */
259 strcpy(buf + rhs->len, url + rule->total_len);
263 DBG("\tOutput: %s", url);
/* Compute the fingerprint of the URL key of url and store it in fp; the key is built in a local buffer. */
268 url_fingerprint(byte *url, struct fingerprint *fp)
270 byte buf[URL_KEY_BUF_SIZE];
271 fingerprint(url_key(url, buf), fp);
/*
 * Test driver: for every command-line argument, print the fingerprint of
 * its URL key in hexadecimal, followed by the key itself.
 */
276 int main(int argc, char **argv)
280 for (int i=1; i<argc; i++)
282 byte buf[URL_KEY_BUF_SIZE];
283 struct fingerprint fp;
284 byte *key = url_key(argv[i], buf);
285 fingerprint(key, &fp);
/* NOTE(review): 12 is presumably the number of bytes in fp.hash -- confirm against struct fingerprint. */
286 for (int j=0; j<12; j++)
287 printf("%02x", fp.hash[j]);
288 printf(" %s\n", key);