2 * UCW Library -- URL Functions
4 * (c) 1997--2004 Martin Mares <mj@ucw.cz>
5 * (c) 2001--2005 Robert Spalek <robert@ucw.cz>
7 * This software may be freely distributed and used according to the terms
8 * of the GNU Lesser General Public License.
10 * XXX: The buffer handling in this module is really horrible, but it works.
15 #include "ucw/chartype.h"
17 #include "ucw/prime.h"
/*
 * Module tunables. Each variable is bound to the configuration item quoted
 * in its comment through the url_config section, which url_init_config()
 * declares under the section name "URL".
 */
26 static uns url_ignore_spaces;	/* "IgnoreSpaces": consulted by url_deescape() */
27 static uns url_ignore_underflow;	/* "IgnoreUnderflow": when zero, the ".." handling in relpath_merge() returns URL_PATH_UNDERFLOW */
28 static char *url_component_separators = "";	/* "ComponentSeparators": accept set passed to strpbrk() in url_has_repeated_component(); empty by default */
29 static uns url_min_repeat_count = 0x7fffffff;	/* "MinRepeatCount": threshold compared with the component count and with repeat_count() results */
30 static uns url_max_repeat_length = 0;	/* "MaxRepeatLength": upper bound of the comp_len loop in url_has_repeated_component() */
31 static uns url_max_occurences = ~0U;	/* "MaxOccurences": threshold for the per-component occurrence counter */
34 static struct cf_section url_config = {
36 CF_UNS("IgnoreSpaces", &url_ignore_spaces),
37 CF_UNS("IgnoreUnderflow", &url_ignore_underflow),
38 CF_STRING("ComponentSeparators", &url_component_separators),
39 CF_UNS("MinRepeatCount", &url_min_repeat_count),
40 CF_UNS("MaxRepeatLength", &url_max_repeat_length),
41 CF_UNS("MaxOccurences", &url_max_occurences),
46 static void CONSTRUCTOR url_init_config(void)
48 cf_declare_section("URL", &url_config, 0);
52 /* Escaping and de-escaping */
57 return (x<10) ? (x + '0') : (x - 10 + 'A');
61 url_deescape(const char *s, char *d)
64 char *end = d + MAX_URL_SIZE - 10;
68 return URL_ERR_TOO_LONG;
72 if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
73 return URL_ERR_INVALID_ESCAPE;
74 val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
76 return URL_ERR_INVALID_ESCAPED_CHAR;
80 val = NCC_SEMICOLON; break;
82 val = NCC_SLASH; break;
84 val = NCC_QUEST; break;
86 val = NCC_COLON; break;
90 val = NCC_EQUAL; break;
94 val = NCC_HASH; break;
95 #ifndef CONFIG_URL_ESCAPE_COMPAT
97 val = NCC_DOLLAR; break;
99 val = NCC_PLUS; break;
101 val = NCC_COMMA; break;
107 else if ((byte) *s > 0x20)
114 if (!url_ignore_spaces || !(!*s || d == dstart))
119 return URL_ERR_TOO_LONG;
125 return URL_ERR_INVALID_CHAR;
132 url_enescape(const char *s, char *d)
134 char *end = d + MAX_URL_SIZE - 10;
140 return URL_ERR_TOO_LONG;
141 if (Calnum(c) || /* RFC 2396 (2.1-2.3): Only alphanumerics ... */
142 c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' || /* ... and some exceptions and reserved chars */
143 c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||
144 c == ',' || c == '=' || c == '&' || c == '#' || c == ';' ||
145 c == '/' || c == '?' || c == ':' || c == '@'
146 #ifndef CONFIG_URL_ESCAPE_COMPAT
153 uns val = ((byte)*s < NCC_MAX) ? NCC_CHARS[(byte)*s] : *s;
155 *d++ = enhex(val >> 4);
156 *d++ = enhex(val & 0x0f);
165 url_enescape_friendly(const char *src, char *dest)
167 char *end = dest + MAX_URL_SIZE - 10;
168 const byte *srcb = src;
172 return URL_ERR_TOO_LONG;
174 *dest++ = NCC_CHARS[*srcb++];
175 else if (*srcb >= 0x20 && *srcb < 0x7f)
180 *dest++ = enhex(*srcb >> 4);
181 *dest++ = enhex(*srcb++ & 0x0f);
188 /* Split a URL (several parts may be copied to the destination buffer) */
/* Protocol names; url_identify_protocol() compares them case-insensitively, scanning from index 1. */
190 char *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
/*
 * Per-protocol flags, indexed by protoid. A non-zero entry appears to mark
 * protocols that require a complete host spec (see the check in url_split())
 * -- NOTE(review): confirm against the definition of URL_PATH_FLAGS.
 */
191 static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
194 url_identify_protocol(const char *p)
198 for(i=1; i<URL_PROTO_MAX; i++)
199 if (!strcasecmp(p, url_proto_names[i]))
201 return URL_PROTO_UNKNOWN;
205 url_split(char *s, struct url *u, char *d)
207 bzero(u, sizeof(struct url));
209 u->bufend = d + MAX_URL_SIZE - 10;
211 if (s[0] != '/') /* Seek for "protocol:" */
214 while (*p && Calnum(*p))
216 if (p != s && *p == ':')
222 u->protoid = url_identify_protocol(u->protocol);
224 if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
226 /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
227 int len = d - u->protocol;
236 if (s[0] == '/') /* Host spec or absolute path */
238 if (s[1] == '/') /* Host spec */
246 while (*s && *s != '/' && *s != '?') /* Copy user:passwd@host:port */
255 else /* This shouldn't happen with sane URL's, but we need to be sure */
260 if (at) /* user:passwd present */
263 if (e = strchr(q, ':'))
272 if (e) /* host:port present */
276 p = strtoul(e, &ep, 10);
277 if (ep && *ep || p > 65535)
278 return URL_ERR_INVALID_PORT;
279 else if (p) /* Port 0 (e.g. in :/) is treated as default port */
291 /* Normalization according to given base URL */
293 static uns std_ports[] = URL_DEFPORTS; /* Default port numbers, indexed by protoid (see url_normalize() and url_pack()) */
296 relpath_merge(struct url *u, struct url *b)
304 if (a[0] == '/') /* Absolute path => OK */
306 if (o[0] != '/' && o[0] != '?')
307 return URL_PATH_UNDERFLOW;
309 if (!a[0]) /* Empty URL -> inherit everything */
315 u->rest = d; /* We know we'll need to copy the path somewhere else */
317 if (a[0] == '#') /* Another fragment */
319 for(p=o; *p && *p != '#'; p++)
323 if (a[0] == '?') /* New query */
325 for(p=o; *p && *p != '#' && *p != '?'; p++)
330 p = NULL; /* Copy original path and find the last slash */
331 while (*o && *o != '?' && *o != '#')
334 return URL_ERR_TOO_LONG;
335 if ((*d++ = *o++) == '/')
339 return URL_ERR_REL_NOTHING;
346 if (a[1] == '/' || !a[1]) /* Skip "./" and ".$" */
353 else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
361 * RFC 1808 says we should leave ".." as a path segment, but
362 * we intentionally break the rule and refuse the URL.
364 if (!url_ignore_underflow)
365 return URL_PATH_UNDERFLOW;
369 d--; /* Discard trailing slash */
376 while (a[0] && a[0] != '/')
379 return URL_ERR_TOO_LONG;
391 copy: /* Combine part of old URL with the new one */
396 return URL_ERR_TOO_LONG;
401 return URL_ERR_TOO_LONG;
406 url_normalize(struct url *u, struct url *b)
411 if (url_proto_path_flags[u->protoid] && (!u->host || !*u->host) ||
412 !u->host && u->user ||
413 !u->user && u->pass ||
415 return URL_SYNTAX_ERROR;
419 /* Now we know it's a relative URL. Do we have any base? */
420 if (!b || !url_proto_path_flags[b->protoid])
421 return URL_ERR_REL_NOTHING;
422 u->protocol = b->protocol;
423 u->protoid = b->protoid;
425 /* Reference to the same host */
432 if (err = relpath_merge(u, b))
437 /* Change path "?" to "/?" because it's the true meaning */
438 if (u->rest[0] == '?')
440 int l = strlen(u->rest);
441 if (u->bufend - u->buf < l+1)
442 return URL_ERR_TOO_LONG;
444 memcpy(u->buf+1, u->rest, l+1);
449 /* Fill in missing info */
451 u->port = std_ports[u->protoid];
456 /* Name canonicalization */
464 if (*b >= 'A' && *b <= 'Z')
471 kill_end_dot(char *b)
477 k = b + strlen(b) - 1;
478 while (k > b && *k == '.')
484 url_canonicalize(struct url *u)
488 lowercase(u->protocol);
490 kill_end_dot(u->host);
491 if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
493 if (u->rest && (c = strchr(u->rest, '#'))) /* Kill fragment reference */
498 /* Pack a broken-down URL */
501 append(char *d, const char *s, char *e)
514 url_pack(struct url *u, char *d)
516 char *e = d + MAX_URL_SIZE - 10;
520 d = append(d, u->protocol, e);
521 d = append(d, ":", e);
522 u->protoid = url_identify_protocol(u->protocol);
526 d = append(d, "//", e);
529 d = append(d, u->user, e);
532 d = append(d, ":", e);
533 d = append(d, u->pass, e);
535 d = append(d, "@", e);
537 d = append(d, u->host, e);
538 if (u->port != std_ports[u->protoid] && u->port != ~0U)
541 sprintf(z, "%d", u->port);
542 d = append(d, ":", e);
547 d = append(d, u->rest, e);
549 return URL_ERR_TOO_LONG;
556 static char *errmsg[] = {
557 "Something is wrong",
561 "Invalid escaped character",
562 "Invalid port number",
563 "Relative URL not allowed",
572 if (err >= sizeof(errmsg) / sizeof(char *))
577 /* Standard cookbook recipes */
580 url_canon_split_rel(const char *u, char *buf1, char *buf2, struct url *url, struct url *base)
584 if (err = url_deescape(u, buf1))
586 if (err = url_split(buf1, url, buf2))
588 if (err = url_normalize(url, base))
590 return url_canonicalize(url);
594 url_auto_canonicalize_rel(const char *src, char *dst, struct url *base)
596 char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
600 (void)((err = url_canon_split_rel(src, buf1, buf2, &ur, base)) ||
601 (err = url_pack(&ur, buf3)) ||
602 (err = url_enescape(buf3, dst)));
610 int main(int argc, char **argv)
612 char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
614 struct url url, url0;
615 char *base = "http://mj@www.hell.org/123/sub_dir;param/index.html;param?query&zzz/sub;query+#fragment?";
617 if (argc != 2 && argc != 3)
621 if (err = url_deescape(argv[1], buf1))
623 printf("deesc: error %d\n", err);
626 printf("deesc: %s\n", buf1);
627 if (err = url_split(buf1, &url, buf2))
629 printf("split: error %d\n", err);
632 printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
633 if (err = url_split(base, &url0, buf3))
635 printf("split base: error %d\n", err);
638 if (err = url_normalize(&url0, NULL))
640 printf("normalize base: error %d\n", err);
643 printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest);
644 if (err = url_normalize(&url, &url0))
646 printf("normalize: error %d\n", err);
649 printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
650 if (err = url_canonicalize(&url))
652 printf("canonicalize: error %d\n", err);
655 printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
656 if (err = url_pack(&url, buf4))
658 printf("pack: error %d\n", err);
661 printf("pack: %s\n", buf4);
662 if (err = url_enescape(buf4, buf2))
664 printf("enesc: error %d\n", err);
667 printf("enesc: %s\n", buf2);
681 hashf(const char *start, int length)
685 hf = (hf << 8 | hf >> 24) ^ *start++;
690 repeat_count(struct component *comp, uns count, uns len)
692 struct component *orig_comp = comp;
702 for (i=0; i<len; i++)
703 if (comp[i].hash != orig_comp[i].hash
704 || comp[i].length != orig_comp[i].length
705 || memcmp(comp[i].start, orig_comp[i].start, comp[i].length))
711 url_has_repeated_component(const char *url)
713 struct component *comp;
714 uns comps, comp_len, rep_prefix, hash_size, *hash, *next;
718 for (comps=0, c=url; c; comps++)
720 c = strpbrk(c, url_component_separators);
724 if (comps < url_min_repeat_count && comps <= url_max_occurences)
726 comp = alloca(comps * sizeof(*comp));
727 for (i=0, c=url; c; i++)
730 c = strpbrk(c, url_component_separators);
733 comp[i].length = c - comp[i].start;
737 comp[i].length = strlen(comp[i].start);
740 for (i=0; i<comps; i++)
741 comp[i].hash = hashf(comp[i].start, comp[i].length);
742 if (comps > url_max_occurences)
744 hash_size = next_table_prime(comps);
745 hash = alloca(hash_size * sizeof(*hash));
746 next = alloca(comps * sizeof(*next));
747 memset(hash, 255, hash_size * sizeof(*hash));
748 for (i=0; i<comps; i++)
750 j = comp[i].hash % hash_size;
751 for (k = hash[j]; ~k && (comp[i].hash != comp[k].hash || comp[i].length != comp[k].length ||
752 memcmp(comp[k].start, comp[i].start, comp[i].length)); k = next[k]);
761 if (comp[k].count++ >= url_max_occurences)
766 for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
767 for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
768 if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)