2 * Sherlock Library -- URL Functions (according to RFC 1738 and 1808)
4 * (c) 1997--2002 Martin Mares <mj@ucw.cz>
5 * (c) 2001 Robert Spalek <robert@ucw.cz>
7 * This software may be freely distributed and used according to the terms
8 * of the GNU Lesser General Public License.
13 #include "lib/chartype.h"
/*
 * Tunables of the URL module. Each one is listed in url_config below under
 * the configuration item shown in parentheses.
 */
/* (IgnoreSpaces) consulted by url_deescape. */
22 static uns url_ignore_spaces;
/* (IgnoreUnderflow) when zero, relpath_merge returns URL_PATH_UNDERFLOW. */
23 static uns url_ignore_underflow;
/* (ComponentSeparators) separator set given to strpbrk in url_has_repeated_component. */
24 static byte *url_component_separators = "";
/* (MinRepeatCount) threshold compared with the component count and with repeat_count results. */
25 static uns url_min_repeat_count = 0x7fffffff;
/*
 * (MaxRepeatLength) upper bound on the repeated-sequence length tried by
 * url_has_repeated_component. The search loop starts at length 1, so the
 * default of 0 means no length is tried.
 */
26 static uns url_max_repeat_length = 0;
/*
 * Configuration table handed to cf_register by url_init_config.
 * The first entry is tagged CT_SECTION and names the section "URL"; each
 * following entry pairs an item name with a type tag (CT_INT or CT_STRING)
 * and the address of the variable it refers to. The last entry is tagged
 * CT_STOP.
 */
28 static struct cfitem url_config[] = {
29 { "URL", CT_SECTION, NULL },
30 { "IgnoreSpaces", CT_INT, &url_ignore_spaces },
31 { "IgnoreUnderflow", CT_INT, &url_ignore_underflow },
32 { "ComponentSeparators", CT_STRING, &url_component_separators },
33 { "MinRepeatCount", CT_INT, &url_min_repeat_count },
34 { "MaxRepeatLength", CT_INT, &url_max_repeat_length },
35 { NULL, CT_STOP, NULL }
/*
 * Registers url_config with the configuration subsystem.
 * NOTE(review): CONSTRUCTOR presumably makes this run automatically at
 * program start -- confirm against the macro's definition.
 */
38 static void CONSTRUCTOR url_init_config(void)
40 cf_register(url_config);
43 /* Escaping and de-escaping */
/*
 * Maps x to a character: values below 10 become '0' + x, all other values
 * become 'A' + (x - 10), i.e. upper-case hexadecimal digits for 0..15.
 * The enclosing function header is not visible in this excerpt; presumably
 * this is the enhex helper called by url_enescape -- confirm.
 */
48 return (x<10) ? (x + '0') : (x - 10 + 'A');
/*
 * url_deescape: decode escape sequences while processing s into d.
 * Only part of the body is visible in this excerpt; the notes below describe
 * just the visible statements.
 * Error codes returned on the visible paths: URL_ERR_TOO_LONG,
 * URL_ERR_INVALID_ESCAPE, URL_ERR_INVALID_ESCAPED_CHAR, URL_ERR_INVALID_CHAR.
 */
52 url_deescape(byte *s, byte *d)
/* Output limit: MAX_URL_SIZE - 10 bytes past the start of d. */
55 byte *end = d + MAX_URL_SIZE - 10;
59 return URL_ERR_TOO_LONG;
/* The two characters following s[0] must both be hexadecimal digits ... */
63 if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
64 return URL_ERR_INVALID_ESCAPE;
/* ... and are combined into one value, high nibble first. */
65 val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
67 return URL_ERR_INVALID_ESCAPED_CHAR;
/*
 * Some decoded values are replaced by NCC_* codes. The case labels that
 * select each replacement are not visible here; judging by the names they
 * stand for ';', '/', '?', ':', '=' and '#' -- confirm against the NCC_*
 * definitions.
 */
71 val = NCC_SEMICOLON; break;
73 val = NCC_SLASH; break;
75 val = NCC_QUEST; break;
77 val = NCC_COLON; break;
81 val = NCC_EQUAL; break;
85 val = NCC_HASH; break;
/*
 * True when space-ignoring is switched off, or when it is on but we are
 * neither at the end of the input (*s is non-zero) nor at the start of the
 * output (d differs from dstart).
 */
97 if (!url_ignore_spaces || !(!*s || d == dstart))
102 return URL_ERR_TOO_LONG;
108 return URL_ERR_INVALID_CHAR;
/*
 * url_enescape: produce an escaped form of s in d.
 * Only part of the body is visible in this excerpt. Visible error return:
 * URL_ERR_TOO_LONG.
 */
115 url_enescape(byte *s, byte *d)
/* Output limit: MAX_URL_SIZE - 10 bytes past the start of d. */
117 byte *end = d + MAX_URL_SIZE - 10;
123 return URL_ERR_TOO_LONG;
/*
 * Test whether c belongs to the set listed below. The action taken for such
 * characters is not visible here, and part of the condition may be missing
 * from this excerpt.
 */
124 if (Calnum(c) || /* RFC 1738(2.2): Only alphanumerics ... */
125 c == '$' || c == '-' || c == '_' || c == '.' || c == '+' || /* ... and several other exceptions ... */
126 c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
128 c == '/' || c == '?' || c == ':' || c == '@' || /* ... and reserved chars used for reserved purpose */
129 c == '=' || c == '&' || c == '#' || c == ';')
/* Bytes below NCC_MAX are translated through NCC_CHARS; others are used as they are. */
133 uns val = (*s < NCC_MAX) ? NCC_CHARS[*s] : *s;
/* Emit the value as two hexadecimal digits, high nibble first. */
135 *d++ = enhex(val >> 4);
136 *d++ = enhex(val & 0x0f);
144 /* Split a URL (several parts may be copied to the destination buffer) */
/*
 * Per-protocol tables, both with URL_PROTO_MAX entries and indexed by
 * protocol id.
 */
/* Protocol names, matched case-insensitively by identify_protocol. Not static. */
146 byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
/*
 * Per-protocol flag. url_split, url_normalize and url_canonicalize test it
 * for being non-zero; url_split's comment describes a set flag as "the
 * protocol requires complete host spec".
 */
147 static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
/*
 * identify_protocol: look up the protocol name p in url_proto_names,
 * ignoring case. Returns URL_PROTO_UNKNOWN when no entry matches; the value
 * returned on a match is not visible in this excerpt (presumably the index
 * i -- confirm).
 */
150 identify_protocol(byte *p)
/* The search starts at index 1, so entry 0 is never compared. */
154 for(i=1; i<URL_PROTO_MAX; i++)
155 if (!strcasecmp(p, url_proto_names[i]))
157 return URL_PROTO_UNKNOWN;
/*
 * url_split: break the URL string s into components stored in *u, using d
 * as the buffer for parts that have to be copied.
 * Only part of the body is visible in this excerpt. Visible error return:
 * URL_ERR_INVALID_PORT.
 */
161 url_split(byte *s, struct url *u, byte *d)
/* Start from an all-zero structure. */
163 bzero(u, sizeof(struct url));
/* Remember the buffer limit (MAX_URL_SIZE - 10 bytes past d) in the structure. */
165 u->bufend = d + MAX_URL_SIZE - 10;
167 if (s[0] != '/') /* Seek for "protocol:" */
170 while (*p && Calnum(*p))
/* Requires p to have moved away from s and to rest on a ':'. */
172 if (p != s && *p == ':')
178 u->protoid = identify_protocol(u->protocol);
/* Flagged protocol, but the text that follows does not start with "//". */
180 if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
182 /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
183 int len = d - u->protocol;
192 if (s[0] == '/') /* Host spec or absolute path */
194 if (s[1] == '/') /* Host spec */
201 while (*s && *s != '/') /* Copy user:passwd@host:port */
205 if (w) /* user:passwd present */
/* Intentional assignment: e becomes the first ':' in q, or NULL. */
209 if (e = strchr(q, ':'))
218 if (e) /* host:port present */
/* Decimal conversion; ep receives the first unconverted character. */
222 p = strtoul(e, &ep, 10);
/* Evaluated as (ep && *ep) || p > 65535: text left after the number, or a value above 65535. */
223 if (ep && *ep || p > 65535)
224 return URL_ERR_INVALID_PORT;
225 else if (p) /* Port 0 (e.g. in :/) is treated as default port */
237 /* Normalization according to given base URL */
/*
 * Indexed by protocol id: url_normalize assigns std_ports[u->protoid] to
 * u->port, and url_pack compares u->port with it.
 */
239 static uns std_ports[] = URL_DEFPORTS; /* Default port numbers */
/*
 * relpath_merge: called from url_normalize as relpath_merge(u, b) to combine
 * the relative part of u with the base URL b.
 * Only part of the body is visible in this excerpt; the declarations of the
 * working pointers (a, o, d, p) are among the missing lines.
 * Error codes returned on the visible paths: URL_PATH_UNDERFLOW,
 * URL_ERR_TOO_LONG, URL_ERR_REL_NOTHING.
 */
242 relpath_merge(struct url *u, struct url *b)
250 if (a[0] == '/') /* Absolute path => OK */
253 return URL_PATH_UNDERFLOW;
255 if (!a[0]) /* Empty URL -> inherit everything */
261 u->rest = d; /* We know we'll need to copy the path somewhere else */
263 if (a[0] == '#') /* Another fragment */
/* Advance p through o up to the first '#' or the end of the string. */
265 for(p=o; *p && *p != '#'; p++)
269 if (a[0] == '?') /* New query */
/* Advance p through o up to the first '#' or '?', or the end of the string. */
271 for(p=o; *p && *p != '#' && *p != '?'; p++)
275 if (a[0] == ';') /* Change parameters */
/* Advance p through o up to the first ';', '?' or '#', or the end of the string. */
277 for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++)
282 p = NULL; /* Copy original path and find the last slash */
283 while (*o && *o != ';' && *o != '?' && *o != '#')
286 return URL_ERR_TOO_LONG;
/* Copy one byte from o to d and test whether it was a slash. */
287 if ((*d++ = *o++) == '/')
291 return URL_ERR_REL_NOTHING;
298 if (a[1] == '/' || !a[1]) /* Skip "./" and ".$" */
305 else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
313 * RFC 1808 says we should leave ".." as a path segment, but
314 * we intentionally break the rule and refuse the URL.
/* The refusal applies only while the IgnoreUnderflow setting is zero. */
316 if (!url_ignore_underflow)
317 return URL_PATH_UNDERFLOW;
321 d--; /* Discard trailing slash */
/* Loop over the characters of a up to the next '/' or the end of the string. */
328 while (a[0] && a[0] != '/')
331 return URL_ERR_TOO_LONG;
343 copy: /* Combine part of old URL with the new one */
348 return URL_ERR_TOO_LONG;
353 return URL_ERR_TOO_LONG;
/*
 * url_normalize: check the split URL u and complete it from the base URL b.
 * b may be NULL (url_canon_split calls it that way).
 * Only part of the body is visible in this excerpt. Error codes returned on
 * the visible paths: URL_SYNTAX_ERROR, URL_ERR_REL_NOTHING.
 */
358 url_normalize(struct url *u, struct url *b)
/*
 * Consistency checks; && binds tighter than ||, so each line below is one
 * alternative (part of the condition may be missing from this excerpt):
 *   - flagged protocol without a host,
 *   - host present but empty,
 *   - user without a host,
 *   - password without a user.
 */
363 if (url_proto_path_flags[u->protoid] && !u->host ||
364 u->host && !*u->host ||
365 !u->host && u->user ||
366 !u->user && u->pass ||
368 return URL_SYNTAX_ERROR;
372 /* Now we know it's a relative URL. Do we have any base? */
/* A usable base must exist and its protocol must have the path flag set. */
373 if (!b || !url_proto_path_flags[b->protoid])
374 return URL_ERR_REL_NOTHING;
/* Take the protocol over from the base. */
375 u->protocol = b->protocol;
376 u->protoid = b->protoid;
378 /* Reference to the same host */
/* Intentional assignment: err holds the result of the merge. */
385 if (err = relpath_merge(u, b))
390 /* Fill in missing info */
/* Default port of the protocol; the condition guarding this line is not visible here. */
392 u->port = std_ports[u->protoid];
397 /* Name canonicalization */
/*
 * Tests whether *b is an ASCII upper-case letter. Neither the enclosing
 * function header nor the action taken is visible in this excerpt;
 * presumably this is the lowercase helper called by url_canonicalize --
 * confirm.
 */
405 if (*b >= 'A' && *b <= 'Z')
/*
 * kill_end_dot: examines the last character of the string b.
 * Only part of the body is visible in this excerpt.
 */
412 kill_end_dot(byte *b)
/*
 * k points at the last character of b.
 * NOTE(review): for an empty string this computes b - 1, a pointer before
 * the buffer, unless a guard among the lines missing from this excerpt
 * prevents it -- confirm.
 */
418 k = b + strlen(b) - 1;
/* k > b means a string consisting of a single "." does not satisfy the test. */
419 if (k > b && *k == '.')
/*
 * url_canonicalize: bring the components of u into canonical form.
 * Only part of the body is visible in this excerpt.
 */
425 url_canonicalize(struct url *u)
429 lowercase(u->protocol);
431 kill_end_dot(u->host);
/* Missing or empty rest on a flagged protocol; the action taken is not visible here. */
432 if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
/* c is set to the first '#' in rest, if there is one. */
434 if (u->rest && (c = strchr(u->rest, '#'))) /* Kill fragment reference */
439 /* Pack a broken-down URL */
/*
 * append: helper of url_pack, called there as d = append(d, text, e) with d
 * the current output position and e the end-of-buffer limit. The body is not
 * visible in this excerpt.
 */
442 append(byte *d, byte *s, byte *e)
/*
 * url_pack: write the components of u into d as one string.
 * Visible pieces, in order: protocol, ":", "//", user, ":", password, "@",
 * host, ":" before the port, rest. The conditions selecting the optional
 * pieces are mostly not visible in this excerpt.
 * Visible error return: URL_ERR_TOO_LONG.
 * Side effect: u->protoid is recomputed from u->protocol.
 */
455 url_pack(struct url *u, byte *d)
/* Output limit: MAX_URL_SIZE - 10 bytes past the start of d. */
457 byte *e = d + MAX_URL_SIZE - 10;
461 d = append(d, u->protocol, e);
462 d = append(d, ":", e);
463 u->protoid = identify_protocol(u->protocol);
467 d = append(d, "//", e);
470 d = append(d, u->user, e);
473 d = append(d, ":", e);
474 d = append(d, u->pass, e);
476 d = append(d, "@", e);
478 d = append(d, u->host, e);
/* True when the port differs from the protocol's default and is not ~0U. */
479 if (u->port != std_ports[u->protoid] && u->port != ~0U)
/*
 * NOTE(review): the port is compared with ~0U above, which suggests an
 * unsigned field; if so, "%u" would be the matching conversion -- confirm
 * against struct url.
 */
482 sprintf(z, "%d", u->port);
483 d = append(d, ":", e);
488 d = append(d, u->rest, e);
490 return URL_ERR_TOO_LONG;
/*
 * Error message strings. The range check below compares err with the
 * number of entries in this table.
 */
497 static char *errmsg[] = {
498 "Something is wrong",
502 "Invalid escaped character",
503 "Invalid port number",
504 "Relative URL not allowed",
/*
 * True when err is not a valid index into errmsg. The enclosing function
 * header and the handling of this case are not visible in this excerpt.
 */
513 if (err >= sizeof(errmsg) / sizeof(char *))
518 /* Standard cookbook recipes */
/*
 * url_canon_split: run the string u through url_deescape (into buf1),
 * url_split (into *url, with buf2 as its buffer), url_normalize without a
 * base URL, and finally url_canonicalize, whose result is returned.
 * The statements executed when an earlier step yields a non-zero code are
 * not visible in this excerpt (presumably the code is returned -- confirm).
 */
521 url_canon_split(byte *u, byte *buf1, byte *buf2, struct url *url)
525 if (err = url_deescape(u, buf1))
527 if (err = url_split(buf1, url, buf2))
529 if (err = url_normalize(url, NULL))
531 return url_canonicalize(url);
/*
 * url_auto_canonicalize: canonicalize the URL string src into dst using
 * three local work buffers of MAX_URL_SIZE bytes each.
 */
535 url_auto_canonicalize(byte *src, byte *dst)
537 byte buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
/*
 * The || chain stops at the first step that yields a non-zero code, leaving
 * that code in err: split and canonicalize, pack into buf3, escape into dst.
 */
541 (void)((err = url_canon_split(src, buf1, buf2, &ur)) ||
542 (err = url_pack(&ur, buf3)) ||
543 (err = url_enescape(buf3, dst)));
/*
 * Test driver: passes argv[1] through deescape, split, normalize (against a
 * fixed base URL), canonicalize, pack and enescape, printing the outcome of
 * every stage. The statements that follow each error message are not visible
 * in this excerpt.
 */
551 int main(int argc, char **argv)
553 char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
/* url is the URL under test, url0 the base URL. */
555 struct url url, url0;
559 if (err = url_deescape(argv[1], buf1))
561 printf("deesc: error %d\n", err);
564 printf("deesc: %s\n", buf1);
565 if (err = url_split(buf1, &url, buf2))
567 printf("split: error %d\n", err);
570 printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
/* Fixed base URL against which the argument is normalized. */
571 if (err = url_split("http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment", &url0, buf3))
573 printf("split base: error %d\n", err);
576 if (err = url_normalize(&url0, NULL))
578 printf("normalize base: error %d\n", err);
581 printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest);
582 if (err = url_normalize(&url, &url0))
584 printf("normalize: error %d\n", err);
587 printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
588 if (err = url_canonicalize(&url))
590 printf("canonicalize: error %d\n", err);
593 printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
594 if (err = url_pack(&url, buf4))
596 printf("pack: error %d\n", err);
599 printf("pack: %s\n", buf4);
/* buf2 is reused as the output buffer of the final escaping step. */
600 if (err = url_enescape(buf4, buf2))
602 printf("enesc: error %d\n", err);
605 printf("enesc: %s\n", buf2);
/*
 * hashf: hash of the bytes at start; url_has_repeated_component calls it as
 * hashf(start, length) for every component. Only part of the body is visible
 * in this excerpt.
 */
618 hashf(byte *start, int length)
/*
 * Mix in the next byte: (hf << 8 | hf >> 24) is a rotation left by 8 bits
 * if hf is 32 bits wide (its declaration is not visible -- confirm), then
 * the byte is XORed in.
 */
622 hf = (hf << 8 | hf >> 24) ^ *start++;
/*
 * repeat_count: called by url_has_repeated_component as
 * repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len); its result
 * is compared there with url_min_repeat_count.
 * Only part of the body is visible in this excerpt.
 */
627 repeat_count(struct component *comp, uns count, uns len)
/* Keep the starting position for comparison with later positions. */
629 struct component *orig_comp = comp;
/*
 * Compare len components at comp with the len components at orig_comp.
 * A pair differs when the hash, the length or the bytes themselves differ;
 * the action taken on a difference is not visible here.
 */
639 for (i=0; i<len; i++)
640 if (comp[i].hash != orig_comp[i].hash
641 || comp[i].length != orig_comp[i].length
642 || memcmp(comp[i].start, orig_comp[i].start, comp[i].length))
/*
 * url_has_repeated_component: split url into components at the characters
 * of url_component_separators and look for a sequence of components that
 * repeats at least url_min_repeat_count times.
 * Only part of the body is visible in this excerpt; in particular the return
 * statements are missing.
 */
648 url_has_repeated_component(byte *url)
650 struct component *comp;
651 uns comps, comp_len, rep_prefix;
/* First pass: count the components. */
655 for (comps=0, c=url; c; comps++)
657 c = strpbrk(c, url_component_separators);
/* Fewer components than the threshold; the action taken is not visible here. */
661 if (comps < url_min_repeat_count)
/*
 * NOTE(review): the array lives on the stack and its size grows with the
 * number of components in the input string.
 */
663 comp = alloca(comps * sizeof(struct component));
/* Second pass: record the start and length of every component. */
664 for (i=0, c=url; c; i++)
667 c = strpbrk(c, url_component_separators);
/* Length up to position c; presumably used when a separator was found -- confirm. */
670 comp[i].length = c - comp[i].start;
/* Length up to the end of the string. */
674 comp[i].length = strlen(comp[i].start);
/* Hash every component. */
677 for (i=0; i<comps; i++)
678 comp[i].hash = hashf(comp[i].start, comp[i].length);
/*
 * Try every sequence length from 1 up to the smaller of
 * url_max_repeat_length and comps, and every starting offset at which such
 * a sequence still fits.
 */
679 for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
680 for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
681 if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)