2 * Sherlock Library -- URL Keys & URL Fingerprints
4 * (c) 2003 Martin Mares <mj@ucw.cz>
6 * This software may be freely distributed and used according to the terms
7 * of the GNU Lesser General Public License.
14 #include "lib/index.h"
16 #include "lib/fastbuf.h"
17 #include "lib/chartype.h"
18 #include "lib/hashfunc.h"
23 /*** Prefix recognition table ***/
/* Back-pointer from a right-hand side to a prefix-table node; url_key() compares it with the matched rule's node. */
26 struct pxtab_node *node;
/* Node of the preceding component; first components are looked up with a NULL parent. */
32 struct pxtab_node *parent;
/* Right-hand side attached to this node; url_key() tests it for NULL before using it. */
33 struct pxtab_rhs *rhs;
/*
 * Parameters for the generic hash table pulled in by the later include of
 * lib/hashtable.h. The exact effect of each HASH_WANT_ and HASH_GIVE_ switch
 * is defined by that header, which is not visible here.
 */
/* Type of the stored nodes. */
38 #define HASH_NODE struct pxtab_node
/* Every generated or user-supplied name gets the pxtab_ prefix. */
39 #define HASH_PREFIX(p) pxtab_##p
/* The key is the triple (parent, component, len). */
40 #define HASH_KEY_COMPLEX(x) x parent, x component, x len
/* Parameter list used by the key callbacks; UNUSED marks each parameter as possibly unused. */
41 #define HASH_KEY_DECL struct pxtab_node *parent UNUSED, byte *component UNUSED, uns len UNUSED
/* Presumably request pxtab_find() and pxtab_lookup(), both of which are called further down. */
42 #define HASH_WANT_FIND
43 #define HASH_WANT_LOOKUP
/* Presumably announce that this file supplies its own pxtab_hash(), pxtab_extra_size(), pxtab_init_key() and allocator. */
44 #define HASH_GIVE_HASHFN
46 #define HASH_GIVE_EXTRA_SIZE
47 #define HASH_GIVE_INIT_KEY
48 #define HASH_GIVE_ALLOC
/*
 * Hash of a (parent, component, len) key: the parent pointer converted to
 * uns, XORed with hash_block() over the len bytes of the component.
 * NOTE(review): if pointers are wider than uns, the cast drops the upper
 * bits of the address -- confirm the definition of uns.
 */
51 pxtab_hash(HASH_KEY_DECL)
53 return ((uns)parent) ^ hash_block(component, len);
/*
 * Key equality: two keys match when they have the same parent pointer, the
 * same length and byte-identical components. The memcmp() is only reached
 * once the parents and the lengths are already known to be equal.
 */
57 pxtab_eq(struct pxtab_node *p1, byte *c1, uns l1, struct pxtab_node *p2, byte *c2, uns l2)
59 return p1 == p2 && l1 == l2 && !memcmp(c1, c2, l1);
/*
 * Callback paired with HASH_GIVE_EXTRA_SIZE; it receives the key triple.
 * The body is not visible in this excerpt. Presumably it reports how many
 * bytes beyond the node structure a key needs -- TODO confirm.
 */
63 pxtab_extra_size(HASH_KEY_DECL)
/*
 * Fills a node from its key: records the parent link and copies the len
 * bytes of the component into the node. Other statements of the body may
 * lie on lines that are not visible in this excerpt.
 */
69 pxtab_init_key(struct pxtab_node *node, HASH_KEY_DECL)
71 node->parent = parent;
/* Exactly len bytes are copied; no terminator is added by this call. */
73 memcpy(node->component, component, len);
/*
 * Tail of a function whose header is not visible here (presumably the
 * allocator announced by HASH_GIVE_ALLOC): the request is handed to cfg_malloc().
 */
80 return cfg_malloc(size);
83 #include "lib/hashtable.h"
/*
 * Scans the first component of x; pxtab_find_rule() and pxtab_add_rule()
 * take the returned pointer as the end of that component. The loop bodies
 * and the return statements are not visible in this excerpt.
 */
86 pxtab_skip_first_comp(byte *x)
/* Runs while the current byte is neither the terminator nor a colon. */
88 while (*x && *x != ':')
/*
 * Runs until x points at two consecutive slashes.
 * NOTE(review): this condition does not test for the terminator itself;
 * confirm that the loop body stops at end of string.
 */
91 while (*x != '/' || x[1] != '/')
/*
 * Counterpart of pxtab_skip_first_comp() for the following components; the
 * callers treat the result as the end of the component starting at x.
 * The body is not visible in this excerpt.
 */
101 pxtab_skip_next_comp(byte *x)
/*
 * Walks the prefix table along the components of lhs and returns the last
 * node that matched, or NULL when not even the first component is known.
 * Some statements of the loop (presumably the updates of parent and lhs)
 * are on lines not visible in this excerpt.
 */
113 static struct pxtab_node *
114 pxtab_find_rule(byte *lhs)
117 struct pxtab_node *node, *parent = NULL;
/* The first component is looked up with a NULL parent. */
119 next = pxtab_skip_first_comp(lhs);
/* NOTE(review): the precision argument is a pointer difference, whereas a printf-style precision is an int -- confirm how DBG treats it. */
120 DBG("\tfirst: %.*s", next-lhs, lhs);
121 node = pxtab_find(NULL, lhs, next-lhs);
/* Continue while the previous component matched and input remains. */
122 while (node && *next)
126 next = pxtab_skip_next_comp(lhs);
127 DBG("\tnext: %.*s", next-lhs, lhs);
128 node = pxtab_find(parent, lhs, next-lhs);
/* GNU conditional with omitted middle operand: yields node when it is non-NULL, otherwise parent. */
130 return node ? : parent;
/*
 * Enters the components of lhs into the prefix table via pxtab_lookup()
 * and records on the final node how many bytes of the original string the
 * rule covers. The loop header, the assignment of rhs and the return
 * statements are not visible in this excerpt; the callers treat a NULL
 * result as "already present".
 */
133 static struct pxtab_node *
134 pxtab_add_rule(byte *lhs, struct pxtab_rhs *rhs)
/* Remember the start so that the total consumed length can be computed at the end. */
136 byte *lhs_start = lhs;
138 struct pxtab_node *node, *parent;
140 next = pxtab_skip_first_comp(lhs);
141 DBG("\tfirst: %.*s", next-lhs, lhs);
/* The first component is entered with a NULL parent. */
142 node = pxtab_lookup(NULL, lhs, next-lhs);
150 next = pxtab_skip_next_comp(lhs);
152 DBG("\tnext: %.*s", next-lhs, lhs);
153 node = pxtab_lookup(parent, lhs, next-lhs);
/* NOTE(review): a pointer difference is passed for an int conversion here -- confirm how DBG treats it. */
155 DBG("\tsetting rhs, %d to eat", next-lhs_start);
/* Number of bytes from the start of the rule to the end of its last component; url_key() skips this many bytes of the URL. */
157 node->total_len = next - lhs_start;
/*
 * Creates a right-hand side for the string rhs and enters that string into
 * the prefix table as a rule pointing at the new record. The remaining
 * statements and the return are not visible in this excerpt; pxtab_load()
 * treats a NULL result as "right-hand side already mapped".
 */
161 static struct pxtab_rhs *
162 pxtab_add_rhs(byte *rhs)
164 uns len = strlen(rhs);
/* Allocates the record plus len extra bytes for the string. */
165 struct pxtab_rhs *r = cfg_malloc(sizeof(*r) + len);
/* Copies len+1 bytes, i.e. including the terminator; presumably the structure itself already reserves one byte for it -- TODO confirm against the struct definition. */
167 memcpy(r->rhs, rhs, len+1);
168 struct pxtab_node *node = pxtab_add_rule(rhs, r);
/*
 * Reads the prefix table from the file called name. Each accepted line is
 * canonicalized and must end with a slash; it is then entered either as a
 * new right-hand side or as a left-hand side mapped to the current rhs.
 * Any problem terminates the program through die(). The conditions that
 * pick between the two branches, and several other statements, are on
 * lines not visible in this excerpt.
 */
174 pxtab_load(byte *name)
/* Most recently created right-hand side; NULL until the first one is created. */
177 struct pxtab_rhs *rhs = NULL;
178 byte line[MAX_URL_SIZE], url[MAX_URL_SIZE], *c, *d;
182 DBG("Loading prefix table %s", name);
183 f = bopen(name, O_RDONLY, 4096);
184 while (bgets(f, line, sizeof(line)))
/* Empty lines and lines starting with a hash sign get special handling (the statement itself is not visible; presumably they are skipped). */
190 if (!*c || *c == '#')
/* Assignment inside the condition is intended: a non-zero result is an error code. */
192 if (err = url_auto_canonicalize(c, url))
193 die("%s, line %d: Invalid URL (%s)", name, lino, url_error(err));
/*
 * The last slash must be the final character of the text.
 * NOTE(review): the canonical form was written to url, yet this check and
 * the calls below use c -- confirm whether c is redirected on a line not shown.
 */
194 if (!(d = strrchr(c, '/')) || d[1])
195 die("%s, line %d: Prefix rules must end with a slash", name, lino);
198 DBG("Creating RHS <%s>", c);
199 if (!(rhs = pxtab_add_rhs(c)))
200 die("%s, line %d: Right-hand side already mapped", name, lino);
203 die("%s, line %d: Syntax error", name, lino);
206 DBG("Adding LHS <%s>", c);
207 if (!pxtab_add_rule(c, rhs))
208 die("%s, line %d: Duplicate rule", name, lino);
214 /*** Configuration ***/
/* When non-zero, url_key_remove_www() strips a leading "www." host label from http URLs. */
216 static uns urlkey_www_hack;
/* Name of the prefix table file; the table is loaded only when this is set. */
217 static byte *urlkey_pxtab_path;
/* Configuration items of the URLKey section, each bound to one of the variables above. */
219 static struct cfitem urlkey_config[] = {
220 { "URLKey", CT_SECTION, NULL },
221 { "WWWHack", CT_INT, &urlkey_www_hack },
222 { "PrefixTable", CT_STRING, &urlkey_pxtab_path },
/* Terminating entry of the table. */
223 { NULL, CT_STOP, NULL }
/*
 * Registers the URLKey configuration items with the configuration system.
 * The CONSTRUCTOR marker presumably makes this run automatically at
 * program start -- confirm against the macro's definition.
 */
226 static void CONSTRUCTOR urlkey_conf_init(void)
228 cf_register(urlkey_config);
/*
 * Fragment of a function whose header is not visible in this excerpt: the
 * prefix table is loaded only when a PrefixTable path has been configured.
 */
235 if (urlkey_pxtab_path)
236 pxtab_load(urlkey_pxtab_path);
/*
 * Optional "WWW hack": when enabled and the URL starts with the 11-byte
 * http + www. prefix tested below, a copy of the URL without the "www."
 * label is built in a buffer and *pbuf is moved past that buffer.
 * The declaration of buf and the return statements are not visible in
 * this excerpt; url_key() uses the result as the URL to process further.
 */
240 url_key_remove_www(byte *url, byte **pbuf)
242 if (urlkey_www_hack && !strncmp(url, "http://www.", 11))
/* 7 is the length of the scheme part kept; 11 is the length of the prefix that was matched. */
245 strcpy(buf, "http://");
246 strcpy(buf+7, url+11);
247 DBG("\tWWW hack: %s -> %s", url, buf);
/* Advance the caller's buffer pointer by MAX_URL_SIZE so that later output does not overwrite the rewritten URL. */
249 *pbuf = buf + MAX_URL_SIZE;
/*
 * Computes the key of a URL: applies the optional WWW hack, then looks for
 * a matching prefix rule and, if one applies, replaces the matched prefix
 * by the rule's right-hand side in buf. The return statement is not
 * visible in this excerpt; url_fingerprint() passes the result on to
 * fingerprint().
 */
255 url_key(byte *url, byte *buf)
257 DBG("Generating URL key for %s", url);
/* May replace url by a rewritten copy and advance buf. */
258 url = url_key_remove_www(url, &buf);
259 struct pxtab_node *rule = pxtab_find_rule(url);
/* Rewrite only when the matched node has a right-hand side and is not that right-hand side's own node. */
260 if (rule && rule->rhs && rule->rhs->node != rule)
262 struct pxtab_rhs *rhs = rule->rhs;
263 DBG("\tApplying rule <%s>, remove %d, add %d", rhs->rhs, rule->total_len, rhs->len);
/* Output is the right-hand side followed by the part of the URL after the matched prefix. */
264 memcpy(buf, rhs->rhs, rhs->len);
265 strcpy(buf + rhs->len, url + rule->total_len);
269 DBG("\tOutput: %s", url);
/*
 * Computes the fingerprint of a URL: derives its key with url_key() and
 * passes that key to fingerprint(), whose result is returned.
 */
274 url_fingerprint(byte *url, struct fingerprint *fp)
/* Scratch space for url_key(). */
276 byte buf[URL_KEY_BUF_SIZE];
277 return fingerprint(url_key(url, buf), fp);