2 * Sherlock Library -- URL Functions (according to RFC 1738 and 1808)
4 * (c) 1997--1999 Martin Mares <mj@ucw.cz>
9 #include "lib/chartype.h"
15 /* Escaping and de-escaping */
20 return (x<10) ? (x + '0') : (x - 10 + 'A');
24 url_deescape(byte *s, byte *d)
26 byte *end = d + MAX_URL_SIZE - 10;
30 return URL_ERR_TOO_LONG;
34 if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
35 return URL_ERR_INVALID_ESCAPE;
36 val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
38 return URL_ERR_INVALID_ESCAPED_CHAR;
42 val = NCC_SEMICOLON; break;
44 val = NCC_SLASH; break;
46 val = NCC_QUEST; break;
48 val = NCC_COLON; break;
52 val = NCC_EQUAL; break;
56 val = NCC_HASH; break;
64 return URL_ERR_INVALID_CHAR;
71 url_enescape(byte *s, byte *d)
73 byte *end = d + MAX_URL_SIZE - 10;
79 return URL_ERR_TOO_LONG;
80 if (Calnum(c) || /* RFC 1738(2.2): Only alphanumerics ... */
81 c == '$' || c == '-' || c == '_' || c == '.' || c == '+' || /* ... and several other exceptions ... */
82 c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
84 c == '/' || c == '?' || c == ':' || c == '@' || /* ... and reserved chars used for reserved purpose */
85 c == '=' || c == '&' || c == '#' || c == ';')
89 uns val = (*s < NCC_MAX) ? NCC_CHARS[*s] : *s;
91 *d++ = enhex(val >> 4);
92 *d++ = enhex(val & 0x0f);
100 /* Split a URL (several parts may be copied to the destination buffer) */
/*
 * Protocol name strings, indexed by protocol ID and initialized from
 * URL_PNAMES (defined elsewhere). identify_protocol() compares its argument
 * against entries 1..URL_PROTO_MAX-1 with strcasecmp(), i.e. the match is
 * case-insensitive and entry 0 is not searched by that loop.
 */
102 byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
/*
 * Per-protocol flags, indexed by protocol ID (u->protoid) and initialized
 * from URL_PATH_FLAGS (defined elsewhere). url_split() treats a non-zero
 * entry as "the protocol requires a complete host spec"; the same test is
 * used by url_normalize() and url_canonicalize().
 * NOTE(review): the exact meaning of individual flag values is not visible
 * here -- confirm against the URL_PATH_FLAGS definition.
 */
103 static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
106 identify_protocol(byte *p)
110 for(i=1; i<URL_PROTO_MAX; i++)
111 if (!strcasecmp(p, url_proto_names[i]))
113 return URL_PROTO_UNKNOWN;
117 url_split(byte *s, struct url *u, byte *d)
119 bzero(u, sizeof(struct url));
121 u->bufend = d + MAX_URL_SIZE - 10;
123 if (s[0] != '/') /* Seek for "protocol:" */
126 while (*p && Calnum(*p))
128 if (p != s && *p == ':')
134 u->protoid = identify_protocol(u->protocol);
136 if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
138 /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
139 int len = d - u->protocol;
148 if (s[0] == '/') /* Host spec or absolute path */
150 if (s[1] == '/') /* Host spec */
157 while (*s && *s != '/') /* Copy user:passwd@host:port */
161 if (w) /* user:passwd present */
169 if (e) /* host:port present */
173 p = strtoul(e, &ep, 10);
174 if (ep && *ep || p > 65535)
175 return URL_ERR_INVALID_PORT;
176 else if (p) /* Port 0 (e.g. in :/) is treated as default port */
188 /* Normalization according to given base URL */
/*
 * Default port for each protocol, indexed by protocol ID (u->protoid) and
 * initialized from URL_DEFPORTS (defined elsewhere). url_normalize() reads
 * it when filling in missing info, and url_pack() compares u->port against
 * it when deciding whether to emit an explicit ":port".
 */
190 static uns std_ports[] = URL_DEFPORTS; /* Default port numbers */
193 relpath_merge(struct url *u, struct url *b)
201 if (a[0] == '/') /* Absolute path => OK */
204 return URL_PATH_UNDERFLOW;
206 if (!a[0]) /* Empty URL -> inherit everything */
212 u->rest = d; /* We know we'll need to copy the path somewhere else */
214 if (a[0] == '#') /* Another fragment */
216 for(p=o; *p && *p != '#'; p++)
220 if (a[0] == '?') /* New query */
222 for(p=o; *p && *p != '#' && *p != '?'; p++)
226 if (a[0] == ';') /* Change parameters */
228 for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++)
233 p = NULL; /* Copy original path and find the last slash */
234 while (*o && *o != ';' && *o != '?' && *o != '#')
237 return URL_ERR_TOO_LONG;
238 if ((*d++ = *o++) == '/')
242 return URL_ERR_REL_NOTHING;
249 if (a[1] == '/' || !a[1]) /* Skip "./" and ".$" */
256 else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
261 * RFC 1808 says we should leave ".." as a path segment, but
262 * we intentionally break the rule and refuse the URL.
264 return URL_PATH_UNDERFLOW;
265 d--; /* Discard trailing slash */
273 while (a[0] && a[0] != '/')
276 return URL_ERR_TOO_LONG;
288 copy: /* Combine part of old URL with the new one */
293 return URL_ERR_TOO_LONG;
298 return URL_ERR_TOO_LONG;
303 url_normalize(struct url *u, struct url *b)
308 if (url_proto_path_flags[u->protoid] && !u->host ||
309 u->host && !*u->host ||
310 !u->host && u->user ||
312 return URL_SYNTAX_ERROR;
316 /* Now we know it's a relative URL. Do we have any base? */
317 if (!b || !url_proto_path_flags[b->protoid])
318 return URL_ERR_REL_NOTHING;
319 u->protocol = b->protocol;
320 u->protoid = b->protoid;
322 /* Reference to the same host */
328 if (err = relpath_merge(u, b))
333 /* Fill in missing info */
335 u->port = std_ports[u->protoid];
340 /* Name canonicalization */
348 if (*b >= 'A' && *b <= 'Z')
355 kill_end_dot(byte *b)
361 k = b + strlen(b) - 1;
362 if (k > b && *k == '.')
368 url_canonicalize(struct url *u)
372 lowercase(u->protocol);
374 kill_end_dot(u->host);
375 if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
377 if (u->rest && (c = strchr(u->rest, '#'))) /* Kill fragment reference */
382 /* Pack a broken-down URL */
385 append(byte *d, byte *s, byte *e)
398 url_pack(struct url *u, byte *d)
400 byte *e = d + MAX_URL_SIZE - 10;
404 d = append(d, u->protocol, e);
405 d = append(d, ":", e);
406 u->protoid = identify_protocol(u->protocol);
410 d = append(d, "//", e);
413 d = append(d, u->user, e);
414 d = append(d, "@", e);
416 d = append(d, u->host, e);
417 if (u->port != std_ports[u->protoid] && u->port != ~0U)
420 sprintf(z, "%d", u->port);
421 d = append(d, ":", e);
426 d = append(d, u->rest, e);
428 return URL_ERR_TOO_LONG;
435 static char *errmsg[] = {
436 "Something is wrong",
440 "Invalid escaped character",
441 "Invalid port number",
442 "Relative URL not allowed",
451 if (err >= sizeof(errmsg) / sizeof(char *))
456 /* A "macro" for canonical split */
459 url_canon_split(byte *u, byte *buf1, byte *buf2, struct url *url)
463 if (err = url_deescape(u, buf1))
465 if (err = url_split(buf1, url, buf2))
467 if (err = url_normalize(url, NULL))
469 return url_canonicalize(url);
476 int main(int argc, char **argv)
478 char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
480 struct url url, url0;
484 if (err = url_deescape(argv[1], buf1))
486 printf("deesc: error %d\n", err);
489 printf("deesc: %s\n", buf1);
490 if (err = url_split(buf1, &url, buf2))
492 printf("split: error %d\n", err);
495 printf("split: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
496 if (err = url_split("http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment", &url0, buf3))
498 printf("split base: error %d\n", err);
501 if (err = url_normalize(&url0, NULL))
503 printf("normalize base: error %d\n", err);
506 printf("base: @%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.host, url0.port, url0.rest);
507 if (err = url_normalize(&url, &url0))
509 printf("normalize: error %d\n", err);
512 printf("normalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
513 if (err = url_canonicalize(&url))
515 printf("canonicalize: error %d\n", err);
518 printf("canonicalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
519 if (err = url_pack(&url, buf4))
521 printf("pack: error %d\n", err);
524 printf("pack: %s\n", buf4);
525 if (err = url_enescape(buf4, buf2))
527 printf("enesc: error %d\n", err);
530 printf("enesc: %s\n", buf2);