X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=lib%2Furl.c;h=c4662372c99fc5afc493e83e486496a756b46890;hb=9778f344521a2a0a34582fae70f4d631c82dd7a6;hp=7ecd066d3b3e6854c6946877dd087881bd9c3211;hpb=4ecd6b5eabaf81c764a8ecf4ba8bacb7452a26d1;p=libucw.git diff --git a/lib/url.c b/lib/url.c index 7ecd066d..c4662372 100644 --- a/lib/url.c +++ b/lib/url.c @@ -1,16 +1,53 @@ /* - * Sherlock Library -- URL Functions (according to RFC 1738 and 1808) + * UCW Library -- URL Functions * - * (c) 1997 Martin Mares, + * (c) 1997--2004 Martin Mares + * (c) 2001--2005 Robert Spalek + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + * + * The URL syntax corresponds to RFC 2396 with several exceptions: + * + * o Escaping of special characters still follows RFC 1738. + * o Interpretation of path parameters follows RFC 1808. + * + * XXX: The buffer handling in this module is really horrible, but it works. */ +#include "lib/lib.h" +#include "lib/url.h" +#include "lib/chartype.h" +#include "lib/conf.h" + #include #include #include +#include + +/* Configuration */ + +static uns url_ignore_spaces; +static uns url_ignore_underflow; +static byte *url_component_separators = ""; +static uns url_min_repeat_count = 0x7fffffff; +static uns url_max_repeat_length = 0; + +static struct cf_section url_config = { + CF_ITEMS { + CF_UNS("IgnoreSpaces", &url_ignore_spaces), + CF_UNS("IgnoreUnderflow", &url_ignore_underflow), + CF_STRING("ComponentSeparators", &url_component_separators), + CF_UNS("MinRepeatCount", &url_min_repeat_count), + CF_UNS("MaxRepeatLength", &url_max_repeat_length), + CF_END + } +}; -#include "lib.h" -#include "url.h" -#include "string.h" +static void CONSTRUCTOR url_init_config(void) +{ + cf_declare_section("URL", &url_config, 0); +} /* Escaping and de-escaping */ @@ -23,6 +60,7 @@ enhex(uns x) int url_deescape(byte *s, byte *d) { + byte *dstart = d; byte *end = d + MAX_URL_SIZE - 10; while (*s) { @@ -34,7 +72,7 @@ url_deescape(byte *s, byte *d) if (!Cxdigit(s[1]) || !Cxdigit(s[2])) return URL_ERR_INVALID_ESCAPE; val = Cxvalue(s[1])*16 + Cxvalue(s[2]); - if (!Cprint(val)) + if (val < 0x20) return URL_ERR_INVALID_ESCAPED_CHAR; switch (val) { @@ -52,12 +90,29 @@ url_deescape(byte *s, byte *d) val = NCC_EQUAL; break; case '&': val = NCC_AND; break; + case '#': + val = NCC_HASH; break; } *d++ = val; s += 3; } - else if (*s >= 0x20 && *s <= 0x7e || *s >= 0xa0) + else if (*s > 0x20) *d++ = *s++; + else if (Cspace(*s)) + { + byte *s0 = s; + while (Cspace(*s)) + s++; + if (!url_ignore_spaces || !(!*s || d == dstart)) + { + while (Cspace(*s0)) + { + if (d >= end) + return URL_ERR_TOO_LONG; + *d++ = *s0++; + } + } + } else return URL_ERR_INVALID_CHAR; } @@ -69,19 +124,18 @@ int url_enescape(byte *s, byte *d) { byte *end = d + MAX_URL_SIZE - 10; + unsigned int c; - while (*s) + while (c = *s) { if (d >= end) return URL_ERR_TOO_LONG; - if ( *s >= 'A' && *s <= 'Z' - || *s >= 'a' && *s <= 'z' - || *s >= '0' && *s <= '9' - || *s == '$' || *s == '-' || *s == '.' || *s == '+' - || *s == '!' || *s == '*' || *s == '\'' || *s == '(' - || *s == ')' || *s == '_' || *s == ';' || *s == '/' - || *s == '?' || *s == ':' || *s == '@' || *s == '=' - || *s == '&') + if (Calnum(c) || /* RFC 1738(2.2): Only alphanumerics ... */ + c == '$' || c == '-' || c == '_' || c == '.' || c == '+' || /* ... and several other exceptions ... */ + c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' || + c == ',' || + c == '/' || c == '?' || c == ':' || c == '@' || /* ... and reserved chars used for reserved purpose */ + c == '=' || c == '&' || c == '#' || c == ';') *d++ = *s++; else { @@ -96,9 +150,33 @@ url_enescape(byte *s, byte *d) return 0; } +int +url_enescape_friendly(byte *src, byte *dest) +{ + byte *end = dest + MAX_URL_SIZE - 10; + while (*src) + { + if (dest >= end) + return URL_ERR_TOO_LONG; + if (*src < NCC_MAX) + *dest++ = NCC_CHARS[*src++]; + else if (*src >= 0x20 && *src < 0x7f) + *dest++ = *src++; + else + { + *dest++ = '%'; + *dest++ = enhex(*src >> 4); + *dest++ = enhex(*src++ & 0x0f); + } + } + *dest = 0; + return 0; +} + /* Split an URL (several parts may be copied to the destination buffer) */ byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES; +static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS; uns identify_protocol(byte *p) @@ -131,6 +209,15 @@ url_split(byte *s, struct url *u, byte *d) *d++ = 0; u->protoid = identify_protocol(u->protocol); s++; + if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/')) + { + /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */ + int len = d - u->protocol; + d -= len; + s -= len; + u->protocol = NULL; + u->protoid = 0; + } } } @@ -138,23 +225,38 @@ url_split(byte *s, struct url *u, byte *d) { if (s[1] == '/') /* Host spec */ { - byte *q, *w, *e; + byte *q, *e; + byte *at = NULL; char *ep; s += 2; q = d; - while (*s && *s != '/') /* Copy user:passwd@host:port */ - *d++ = *s++; + while (*s && *s != '/' && *s != '?') /* Copy user:passwd@host:port */ + { + if (*s != '@') + *d++ = *s; + else if (!at) + { + *d++ = 0; + at = d; + } + else /* This shouldn't happen with sane URL's, but we need to be sure */ + *d++ = NCC_AT; + s++; + } *d++ = 0; - w = strchr(q, '@'); - if (w) /* user:passwd present */ + if (at) /* user:passwd present */ { - *w++ = 0; u->user = q; + if (e = strchr(q, ':')) + { + *e++ = 0; + u->pass = e; + } } else - w = q; - e = strchr(w, ':'); + at = q; + e = strchr(at, ':'); if (e) /* host:port present */ { uns p; @@ -165,7 +267,7 @@ url_split(byte *s, struct url *u, byte *d) else if (p) /* Port 0 (e.g. in :/) is treated as default port */ u->port = p; } - u->host = w; + u->host = at; } } @@ -189,23 +291,47 @@ relpath_merge(struct url *u, struct url *b) if (a[0] == '/') /* Absolute path => OK */ return 0; - if (o[0] != '/') + if (o[0] != '/' && o[0] != '?') return URL_PATH_UNDERFLOW; - if (!a[0]) /* Empty relative URL is a special case */ + if (!a[0]) /* Empty URL -> inherit everything */ { u->rest = b->rest; return 0; } - u->rest = d; - p = strrchr(o, '/'); /* Must be found! */ - while (o <= p) /* Copy original path */ + u->rest = d; /* We know we'll need to copy the path somewhere else */ + + if (a[0] == '#') /* Another fragment */ + { + for(p=o; *p && *p != '#'; p++) + ; + goto copy; + } + if (a[0] == '?') /* New query */ + { + for(p=o; *p && *p != '#' && *p != '?'; p++) + ; + goto copy; + } + if (a[0] == ';') /* Change parameters */ + { + for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++) + ; + goto copy; + } + + p = NULL; /* Copy original path and find the last slash */ + while (*o && *o != ';' && *o != '?' && *o != '#') { if (d >= e) return URL_ERR_TOO_LONG; - *d++ = *o++; + if ((*d++ = *o++) == '/') + p = d; } + if (!p) + return URL_ERR_REL_NOTHING; + d = p; while (*a) { @@ -221,13 +347,23 @@ relpath_merge(struct url *u, struct url *b) else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */ { a += 2; - if (d <= u->buf + 1) - return URL_PATH_UNDERFLOW; - d--; /* Discard trailing slash */ - while (d[-1] != '/') - d--; if (a[0]) a++; + if (d <= u->buf + 1) + { + /* + * RFC 1808 says we should leave ".." as a path segment, but + * we intentionally break the rule and refuse the URL. + */ + if (!url_ignore_underflow) + return URL_PATH_UNDERFLOW; + } + else + { + d--; /* Discard trailing slash */ + while (d[-1] != '/') + d--; + } continue; } } @@ -241,54 +377,73 @@ relpath_merge(struct url *u, struct url *b) *d++ = *a++; } +okay: *d++ = 0; u->buf = d; return 0; + +copy: /* Combine part of old URL with the new one */ + while (o < p) + if (d < e) + *d++ = *o++; + else + return URL_ERR_TOO_LONG; + while (*a) + if (d < e) + *d++ = *a++; + else + return URL_ERR_TOO_LONG; + goto okay; } int url_normalize(struct url *u, struct url *b) { - byte *k; - - if (u->protocol && !u->protoid) - return 0; - - if ((u->protoid == URL_PROTO_HTTP || (!u->protoid && b && b->protoid == URL_PROTO_HTTP)) - && u->rest && (k = strchr(u->rest, '#'))) - *k = 0; /* Kill fragment reference */ - - if (u->port == ~0) - u->port = std_ports[u->protoid]; + int err; - if ( u->protocol && !u->host - || u->host && !*u->host - || !u->host && u->user - || !u->rest) + /* Basic checks */ + if (url_proto_path_flags[u->protoid] && (!u->host || !*u->host) || + !u->host && u->user || + !u->user && u->pass || + !u->rest) return URL_SYNTAX_ERROR; - if (u->protocol) /* Absolute URL */ - return 0; - - if (!b) /* Relative to something? */ - return URL_ERR_REL_NOTHING; - if (!b->protoid) - return URL_ERR_UNKNOWN_PROTOCOL; - if (!u->protocol) { + /* Now we know it's a relative URL. Do we have any base? */ + if (!b || !url_proto_path_flags[b->protoid]) + return URL_ERR_REL_NOTHING; u->protocol = b->protocol; u->protoid = b->protoid; + + /* Reference to the same host */ + if (!u->host) + { + u->host = b->host; + u->user = b->user; + u->pass = b->pass; + u->port = b->port; + if (err = relpath_merge(u, b)) + return err; + } } - if (!u->host) + /* Change path "?" to "/?" because it's the true meaning */ + if (u->rest[0] == '?') { - u->host = b->host; - u->user = b->user; - u->port = b->port; - return relpath_merge(u, b); + int l = strlen(u->rest); + if (u->bufend - u->buf < l+1) + return URL_ERR_TOO_LONG; + u->buf[0] = '/'; + memcpy(u->buf+1, u->rest, l+1); + u->rest = u->buf; + u->buf += l+2; } + /* Fill in missing info */ + if (u->port == ~0U) + u->port = std_ports[u->protoid]; + return 0; } @@ -314,25 +469,29 @@ kill_end_dot(byte *b) if (b) { k = b + strlen(b) - 1; - if (k > b && *k == '.') - *k = 0; + while (k > b && *k == '.') + *k-- = 0; } } int url_canonicalize(struct url *u) { + char *c; + lowercase(u->protocol); lowercase(u->host); kill_end_dot(u->host); - if ((!u->rest || !*u->rest) && (u->protoid == URL_PROTO_HTTP || u->protoid == URL_PROTO_FTP)) + if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid]) u->rest = "/"; + if (u->rest && (c = strchr(u->rest, '#'))) /* Kill fragment reference */ + *c = 0; return 0; } /* Pack a broken-down URL */ -byte * +static byte * append(byte *d, byte *s, byte *e) { if (d) @@ -362,10 +521,15 @@ url_pack(struct url *u, byte *d) if (u->user) { d = append(d, u->user, e); + if (u->pass) + { + d = append(d, ":", e); + d = append(d, u->pass, e); + } d = append(d, "@", e); } d = append(d, u->host, e); - if (u->port != std_ports[u->protoid] && u->port != ~0) + if (u->port != std_ports[u->protoid] && u->port != ~0U) { char z[10]; sprintf(z, "%d", u->port); @@ -404,10 +568,10 @@ url_error(uns err) return errmsg[err]; } -/* A "macro" for canonical split */ +/* Standard cookbook recipes */ int -url_canon_split(byte *u, byte *buf1, byte *buf2, struct url *url) +url_canon_split_rel(byte *u, byte *buf1, byte *buf2, struct url *url, struct url *base) { int err; @@ -415,11 +579,24 @@ url_canon_split(byte *u, byte *buf1, byte *buf2, struct url *url) return err; if (err = url_split(buf1, url, buf2)) return err; - if (err = url_normalize(url, NULL)) + if (err = url_normalize(url, base)) return err; return url_canonicalize(url); } +int +url_auto_canonicalize_rel(byte *src, byte *dst, struct url *base) +{ + byte buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE]; + int err; + struct url ur; + + (void)((err = url_canon_split_rel(src, buf1, buf2, &ur, base)) || + (err = url_pack(&ur, buf3)) || + (err = url_enescape(buf3, dst))); + return err; +} + /* Testing */ #ifdef TEST @@ -429,9 +606,12 @@ int main(int argc, char **argv) char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE]; int err; struct url url, url0; + char *base = "http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment"; - if (argc != 2) + if (argc != 2 && argc != 3) return 1; + if (argc == 3) + base = argv[2]; if (err = url_deescape(argv[1], buf1)) { printf("deesc: error %d\n", err); @@ -443,8 +623,8 @@ int main(int argc, char **argv) printf("split: error %d\n", err); return 1; } - printf("split: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest); - if (err = url_split("http://mj@www.hell.org/123/sub_dir/index.html", &url0, buf3)) + printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest); + if (err = url_split(base, &url0, buf3)) { printf("split base: error %d\n", err); return 1; @@ -454,25 +634,25 @@ int main(int argc, char **argv) printf("normalize base: error %d\n", err); return 1; } - printf("base: @%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.host, url0.port, url0.rest); + printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest); if (err = url_normalize(&url, &url0)) { printf("normalize: error %d\n", err); return 1; } - printf("normalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest); + printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest); if (err = url_canonicalize(&url)) { printf("canonicalize: error %d\n", err); return 1; } - printf("canonicalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest); + printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest); if (err = url_pack(&url, buf4)) { printf("pack: error %d\n", err); return 1; } - printf("pack: %s\n", buf1); + printf("pack: %s\n", buf4); if (err = url_enescape(buf4, buf2)) { printf("enesc: error %d\n", err); @@ -483,3 +663,78 @@ int main(int argc, char **argv) } #endif + +struct component { + byte *start; + int length; + u32 hash; +}; + +static inline u32 +hashf(byte *start, int length) +{ + u32 hf = length; + while (length-- > 0) + hf = (hf << 8 | hf >> 24) ^ *start++; + return hf; +} + +static inline uns +repeat_count(struct component *comp, uns count, uns len) +{ + struct component *orig_comp = comp; + uns found = 0; + while (1) + { + uns i; + comp += len; + count -= len; + found++; + if (count < len) + return found; + for (i=0; i= url_min_repeat_count) + return comp_len; + return 0; +}