/*
- * Sherlock Library -- URL Functions (according to RFC 1738 and 1808)
+ * UCW Library -- URL Functions
*
- * (c) 1997 Martin Mares, <mj@atrey.karlin.mff.cuni.cz>
+ * (c) 1997--2004 Martin Mares <mj@ucw.cz>
+ * (c) 2001--2005 Robert Spalek <robert@ucw.cz>
+ *
+ * This software may be freely distributed and used according to the terms
+ * of the GNU Lesser General Public License.
+ *
+ * The URL syntax corresponds to RFC 2396 with several exceptions:
+ *
+ * o Escaping of special characters still follows RFC 1738.
+ * o Interpretation of path parameters follows RFC 1808.
+ *
+ * XXX: The buffer handling in this module is really horrible, but it works.
*/
+#include "lib/lib.h"
+#include "lib/url.h"
+#include "lib/chartype.h"
+#include "lib/conf.h"
+
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
+#include <alloca.h>
+
+/* Configuration */
+
+/* Configuration knobs for the "URL" section (registered by url_init_config). */
+static uns url_ignore_spaces;
+static uns url_ignore_underflow;
+static byte *url_component_separators = "";
+static uns url_min_repeat_count = 0x7fffffff;
+static uns url_max_repeat_length = 0;
+
+/* Description of the "URL" configuration section. */
+static struct cf_section url_config = {
+ CF_ITEMS {
+ CF_UNS("IgnoreSpaces", &url_ignore_spaces),
+ CF_UNS("IgnoreUnderflow", &url_ignore_underflow),
+ CF_STRING("ComponentSeparators", &url_component_separators),
+ CF_UNS("MinRepeatCount", &url_min_repeat_count),
+ CF_UNS("MaxRepeatLength", &url_max_repeat_length),
+ CF_END
+ }
+};
-#include "lib.h"
-#include "url.h"
-#include "string.h"
+/* Register the "URL" configuration section at program startup (constructor). */
+static void CONSTRUCTOR url_init_config(void)
+{
+ cf_declare_section("URL", &url_config, 0);
+}
/* Escaping and de-escaping */
int
url_deescape(byte *s, byte *d)
{
+ byte *dstart = d;
byte *end = d + MAX_URL_SIZE - 10;
while (*s)
{
if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
return URL_ERR_INVALID_ESCAPE;
val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
- if (!Cprint(val))
+ if (val < 0x20)
return URL_ERR_INVALID_ESCAPED_CHAR;
switch (val)
{
val = NCC_EQUAL; break;
case '&':
val = NCC_AND; break;
+ case '#':
+ val = NCC_HASH; break;
}
*d++ = val;
s += 3;
}
- else if (*s >= 0x20 && *s <= 0x7e || *s >= 0xa0)
+ else if (*s > 0x20)
*d++ = *s++;
+ else if (Cspace(*s))
+ {
+ byte *s0 = s;
+ while (Cspace(*s))
+ s++;
+ if (!url_ignore_spaces || !(!*s || d == dstart))
+ {
+ while (Cspace(*s0))
+ {
+ if (d >= end)
+ return URL_ERR_TOO_LONG;
+ *d++ = *s0++;
+ }
+ }
+ }
else
return URL_ERR_INVALID_CHAR;
}
url_enescape(byte *s, byte *d)
{
byte *end = d + MAX_URL_SIZE - 10;
+ unsigned int c;
- while (*s)
+ while (c = *s)
{
if (d >= end)
return URL_ERR_TOO_LONG;
- if ( *s >= 'A' && *s <= 'Z'
- || *s >= 'a' && *s <= 'z'
- || *s >= '0' && *s <= '9'
- || *s == '$' || *s == '-' || *s == '.' || *s == '+'
- || *s == '!' || *s == '*' || *s == '\'' || *s == '('
- || *s == ')' || *s == '_' || *s == ';' || *s == '/'
- || *s == '?' || *s == ':' || *s == '@' || *s == '='
- || *s == '&')
+ if (Calnum(c) || /* RFC 1738(2.2): Only alphanumerics ... */
+ c == '$' || c == '-' || c == '_' || c == '.' || c == '+' || /* ... and several other exceptions ... */
+ c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
+ c == ',' ||
+ c == '/' || c == '?' || c == ':' || c == '@' || /* ... and reserved chars used for reserved purpose */
+ c == '=' || c == '&' || c == '#' || c == ';')
*d++ = *s++;
else
{
- uns val = (*s < NCC_MAX) ? ";/?:@=&"[*s] : *s;
+ uns val = (*s < NCC_MAX) ? NCC_CHARS[*s] : *s;
*d++ = '%';
*d++ = enhex(val >> 4);
*d++ = enhex(val & 0x0f);
return 0;
}
+/*
+ * Escape an URL for friendly display: internal NCC_* codes (values below
+ * NCC_MAX) are mapped back to their characters via NCC_CHARS, printable
+ * ASCII (0x20--0x7e) is copied verbatim, everything else is %-escaped.
+ * Returns 0 on success or URL_ERR_TOO_LONG when the fixed-size buffer
+ * would overflow.
+ */
+int
+url_enescape_friendly(byte *src, byte *dest)
+{
+ byte *end = dest + MAX_URL_SIZE - 10;
+ while (*src)
+ {
+ if (dest >= end)
+ return URL_ERR_TOO_LONG;
+ if (*src < NCC_MAX)
+ *dest++ = NCC_CHARS[*src++];
+ else if (*src >= 0x20 && *src < 0x7f)
+ *dest++ = *src++;
+ else
+ {
+ *dest++ = '%';
+ *dest++ = enhex(*src >> 4);
+ *dest++ = enhex(*src++ & 0x0f);
+ }
+ }
+ *dest = 0;
+ return 0;
+}
+
/* Split an URL (several parts may be copied to the destination buffer) */
+/* Table of protocol names and per-protocol flag telling whether the
+ * protocol requires a host part and a path (indexed by URL_PROTO_*). */
+byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
+static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
+
+/* Translate a protocol name to its URL_PROTO_* identifier by a
+ * case-insensitive scan of url_proto_names[]; unknown names yield
+ * URL_PROTO_UNKNOWN. */
uns
identify_protocol(byte *p)
{
- if (!strcasecmp(p, "http"))
- return URL_PROTO_HTTP;
- if (!strcasecmp(p, "ftp"))
- return URL_PROTO_FTP;
- return 0;
+ uns i;
+
+ for(i=1; i<URL_PROTO_MAX; i++)
+ if (!strcasecmp(p, url_proto_names[i]))
+ return i;
+ return URL_PROTO_UNKNOWN;
}
int
*d++ = 0;
u->protoid = identify_protocol(u->protocol);
s++;
+ if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
+ {
+ /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
+ int len = d - u->protocol;
+ d -= len;
+ s -= len;
+ u->protocol = NULL;
+ u->protoid = 0;
+ }
}
}
{
if (s[1] == '/') /* Host spec */
{
- byte *q, *w, *e;
+ byte *q, *e;
+ byte *at = NULL;
char *ep;
s += 2;
q = d;
- while (*s && *s != '/') /* Copy user:passwd@host:port */
- *d++ = *s++;
+ while (*s && *s != '/' && *s != '?') /* Copy user:passwd@host:port */
+ {
+ if (*s != '@')
+ *d++ = *s;
+ else if (!at)
+ {
+ *d++ = 0;
+ at = d;
+ }
+ else /* This shouldn't happen with sane URLs, but we need to be sure */
+ *d++ = NCC_AT;
+ s++;
+ }
*d++ = 0;
- w = strchr(q, '@');
- if (w) /* user:passwd present */
+ if (at) /* user:passwd present */
{
- *w++ = 0;
u->user = q;
+ if (e = strchr(q, ':'))
+ {
+ *e++ = 0;
+ u->pass = e;
+ }
}
else
- w = q;
- e = strchr(w, ':');
+ at = q;
+ e = strchr(at, ':');
if (e) /* host:port present */
{
uns p;
else if (p) /* Port 0 (e.g. in :/) is treated as default port */
u->port = p;
}
- u->host = w;
+ u->host = at;
}
}
/* Normalization according to given base URL */
-static uns std_ports[] = { ~0, 80, 21 }; /* Default port numbers */
+static uns std_ports[] = URL_DEFPORTS; /* Default port numbers */
static int
relpath_merge(struct url *u, struct url *b)
if (a[0] == '/') /* Absolute path => OK */
return 0;
- if (o[0] != '/')
+ if (o[0] != '/' && o[0] != '?')
return URL_PATH_UNDERFLOW;
- if (!a[0]) /* Empty relative URL is a special case */
+ if (!a[0]) /* Empty URL -> inherit everything */
{
u->rest = b->rest;
return 0;
}
- u->rest = d;
- p = strrchr(o, '/'); /* Must be found! */
- while (o <= p) /* Copy original path */
+ u->rest = d; /* We know we'll need to copy the path somewhere else */
+
+ if (a[0] == '#') /* Another fragment */
+ {
+ for(p=o; *p && *p != '#'; p++)
+ ;
+ goto copy;
+ }
+ if (a[0] == '?') /* New query */
+ {
+ for(p=o; *p && *p != '#' && *p != '?'; p++)
+ ;
+ goto copy;
+ }
+ if (a[0] == ';') /* Change parameters */
+ {
+ for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++)
+ ;
+ goto copy;
+ }
+
+ p = NULL; /* Copy original path and find the last slash */
+ while (*o && *o != ';' && *o != '?' && *o != '#')
{
if (d >= e)
return URL_ERR_TOO_LONG;
- *d++ = *o++;
+ if ((*d++ = *o++) == '/')
+ p = d;
}
+ if (!p)
+ return URL_ERR_REL_NOTHING;
+ d = p;
while (*a)
{
else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
{
a += 2;
- if (d <= u->buf + 1)
- return URL_PATH_UNDERFLOW;
- d--; /* Discard trailing slash */
- while (d[-1] != '/')
- d--;
if (a[0])
a++;
+ if (d <= u->buf + 1)
+ {
+ /*
+ * RFC 1808 says we should leave ".." as a path segment, but
+ * we intentionally break the rule and refuse the URL.
+ */
+ if (!url_ignore_underflow)
+ return URL_PATH_UNDERFLOW;
+ }
+ else
+ {
+ d--; /* Discard trailing slash */
+ while (d[-1] != '/')
+ d--;
+ }
continue;
}
}
*d++ = *a++;
}
+okay:
*d++ = 0;
u->buf = d;
return 0;
+
+copy: /* Combine part of old URL with the new one */
+ while (o < p)
+ if (d < e)
+ *d++ = *o++;
+ else
+ return URL_ERR_TOO_LONG;
+ while (*a)
+ if (d < e)
+ *d++ = *a++;
+ else
+ return URL_ERR_TOO_LONG;
+ goto okay;
}
+/*
+ * Normalize a split URL, resolving it against the base URL <b> when it is
+ * relative: a missing protocol/host inherits from the base and relative
+ * paths are merged by relpath_merge().  Also rewrites a path starting with
+ * "?" to "/?" and fills in the protocol's default port.  Returns 0 or a
+ * URL_ERR_* / URL_SYNTAX_ERROR code.
+ */
int
url_normalize(struct url *u, struct url *b)
{
- byte *k;
-
- if (u->protocol && !u->protoid)
- return 0;
-
- if ((u->protoid == URL_PROTO_HTTP || (!u->protoid && b && b->protoid == URL_PROTO_HTTP))
- && u->rest && (k = strchr(u->rest, '#')))
- *k = 0; /* Kill fragment reference */
-
- if (u->port == ~0)
- u->port = std_ports[u->protoid];
+ int err;
- if ( u->protocol && !u->host
- || u->host && !*u->host
- || !u->host && u->user
- || !u->rest)
+ /* Basic checks */
+ if (url_proto_path_flags[u->protoid] && (!u->host || !*u->host) ||
+ !u->host && u->user ||
+ !u->user && u->pass ||
+ !u->rest)
return URL_SYNTAX_ERROR;
- if (u->protocol) /* Absolute URL */
- return 0;
-
- if (!b) /* Relative to something? */
- return URL_ERR_REL_NOTHING;
- if (!b->protoid)
- return URL_ERR_UNKNOWN_PROTOCOL;
-
if (!u->protocol)
{
+ /* Now we know it's a relative URL. Do we have any base? */
+ if (!b || !url_proto_path_flags[b->protoid])
+ return URL_ERR_REL_NOTHING;
u->protocol = b->protocol;
u->protoid = b->protoid;
+
+ /* Reference to the same host */
+ if (!u->host)
+ {
+ u->host = b->host;
+ u->user = b->user;
+ u->pass = b->pass;
+ u->port = b->port;
+ if (err = relpath_merge(u, b))
+ return err;
+ }
}
- if (!u->host)
+ /* Change path "?" to "/?" because it's the true meaning */
+ if (u->rest[0] == '?')
{
- u->host = b->host;
- u->user = b->user;
- u->port = b->port;
- return relpath_merge(u, b);
+ int l = strlen(u->rest);
+ if (u->bufend - u->buf < l+1)
+ return URL_ERR_TOO_LONG;
+ u->buf[0] = '/';
+ memcpy(u->buf+1, u->rest, l+1);
+ u->rest = u->buf;
+ u->buf += l+2;
}
+ /* Fill in missing info */
+ if (u->port == ~0U)
+ u->port = std_ports[u->protoid];
+
return 0;
}
if (b)
{
k = b + strlen(b) - 1;
- if (k > b && *k == '.')
- *k = 0;
+ while (k > b && *k == '.')
+ *k-- = 0;
}
}
+/*
+ * Canonicalize a normalized URL: lowercase the protocol and host, strip
+ * trailing dots from the host, default an empty path to "/" for protocols
+ * that carry a path, and drop the fragment reference.  Always returns 0.
+ */
int
url_canonicalize(struct url *u)
{
+ char *c;
+
lowercase(u->protocol);
lowercase(u->host);
kill_end_dot(u->host);
- if ((!u->rest || !*u->rest) && (u->protoid == URL_PROTO_HTTP || u->protoid == URL_PROTO_FTP))
+ if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
u->rest = "/";
+ if (u->rest && (c = strchr(u->rest, '#'))) /* Kill fragment reference */
+ *c = 0;
return 0;
}
/* Pack a broken-down URL */
-byte *
+static byte *
append(byte *d, byte *s, byte *e)
{
if (d)
if (u->user)
{
d = append(d, u->user, e);
+ if (u->pass)
+ {
+ d = append(d, ":", e);
+ d = append(d, u->pass, e);
+ }
d = append(d, "@", e);
}
d = append(d, u->host, e);
- if (u->port != std_ports[u->protoid] && u->port != ~0)
+ if (u->port != std_ports[u->protoid] && u->port != ~0U)
{
char z[10];
sprintf(z, "%d", u->port);
return errmsg[err];
}
-/* A "macro" for canonical split */
+/* Standard cookbook recipes */
int
-url_canon_split(byte *u, byte *buf1, byte *buf2, struct url *url)
+url_canon_split_rel(byte *u, byte *buf1, byte *buf2, struct url *url, struct url *base)
{
int err;
return err;
if (err = url_split(buf1, url, buf2))
return err;
- if (err = url_normalize(url, NULL))
+ if (err = url_normalize(url, base))
return err;
return url_canonicalize(url);
}
+/*
+ * One-shot cookbook recipe: split <src> relative to <base> (via
+ * url_canon_split_rel), then pack and re-escape the canonical result into
+ * <dst>.  Returns the first error encountered, 0 on success.
+ */
+int
+url_auto_canonicalize_rel(byte *src, byte *dst, struct url *base)
+{
+ byte buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
+ int err;
+ struct url ur;
+
+ (void)((err = url_canon_split_rel(src, buf1, buf2, &ur, base)) ||
+ (err = url_pack(&ur, buf3)) ||
+ (err = url_enescape(buf3, dst)));
+ return err;
+}
+
/* Testing */
#ifdef TEST
char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
int err;
struct url url, url0;
+ char *base = "http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment";
- if (argc != 2)
+ if (argc != 2 && argc != 3)
return 1;
+ if (argc == 3)
+ base = argv[2];
if (err = url_deescape(argv[1], buf1))
{
printf("deesc: error %d\n", err);
printf("split: error %d\n", err);
return 1;
}
- printf("split: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
- if (err = url_split("http://mj@www.hell.org/123/sub_dir/index.html", &url0, buf3))
+ printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
+ if (err = url_split(base, &url0, buf3))
{
printf("split base: error %d\n", err);
return 1;
printf("normalize base: error %d\n", err);
return 1;
}
- printf("base: @%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.host, url0.port, url0.rest);
+ printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest);
if (err = url_normalize(&url, &url0))
{
printf("normalize: error %d\n", err);
return 1;
}
- printf("normalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
+ printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
if (err = url_canonicalize(&url))
{
printf("canonicalize: error %d\n", err);
return 1;
}
- printf("canonicalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
+ printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
if (err = url_pack(&url, buf4))
{
printf("pack: error %d\n", err);
return 1;
}
- printf("pack: %s\n", buf1);
+ printf("pack: %s\n", buf4);
if (err = url_enescape(buf4, buf2))
{
printf("enesc: error %d\n", err);
}
#endif
+
+/* One separator-delimited URL component with its precomputed hash,
+ * used by url_has_repeated_component() below. */
+struct component {
+ byte *start;
+ int length;
+ u32 hash;
+};
+
+/* Simple rotate-and-xor hash of <length> bytes, seeded with the length. */
+static inline u32
+hashf(byte *start, int length)
+{
+ u32 hf = length;
+ while (length-- > 0)
+ hf = (hf << 8 | hf >> 24) ^ *start++;
+ return hf;
+}
+
+/*
+ * Count how many times the pattern formed by the first <len> components of
+ * <comp> repeats consecutively at the start of the <count>-element array.
+ * Components are compared by hash first, then length, then contents.
+ */
+static inline uns
+repeat_count(struct component *comp, uns count, uns len)
+{
+ struct component *orig_comp = comp;
+ uns found = 0;
+ while (1)
+ {
+ uns i;
+ comp += len;
+ count -= len;
+ found++;
+ if (count < len)
+ return found;
+ for (i=0; i<len; i++)
+ if (comp[i].hash != orig_comp[i].hash
+ || comp[i].length != orig_comp[i].length
+ || memcmp(comp[i].start, orig_comp[i].start, comp[i].length))
+ return found;
+ }
+}
+
+/*
+ * Detect repeated component sequences in an URL.  The URL is split at
+ * url_component_separators; if some sequence of at most
+ * url_max_repeat_length components repeats at least url_min_repeat_count
+ * times consecutively anywhere in the URL, the pattern length is returned,
+ * otherwise 0.
+ *
+ * NOTE(review): the alloca() below is sized by the number of components,
+ * which grows with URL length -- presumably inputs are already bounded by
+ * MAX_URL_SIZE; verify that callers guarantee this.
+ */
+int
+url_has_repeated_component(byte *url)
+{
+ struct component *comp;
+ uns comps, comp_len, rep_prefix;
+ byte *c;
+ uns i;
+
+ for (comps=0, c=url; c; comps++)
+ {
+ c = strpbrk(c, url_component_separators);
+ if (c)
+ c++;
+ }
+ if (comps < url_min_repeat_count)
+ return 0;
+ comp = alloca(comps * sizeof(struct component));
+ for (i=0, c=url; c; i++)
+ {
+ comp[i].start = c;
+ c = strpbrk(c, url_component_separators);
+ if (c)
+ {
+ comp[i].length = c - comp[i].start;
+ c++;
+ }
+ else
+ comp[i].length = strlen(comp[i].start);
+ }
+ ASSERT(i == comps);
+ for (i=0; i<comps; i++)
+ comp[i].hash = hashf(comp[i].start, comp[i].length);
+ for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
+ for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
+ if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)
+ return comp_len;
+ return 0;
+}