2 * Sherlock Library -- URL Functions (according to RFC 1738 and 1808)
4 * (c) 1997 Martin Mares, <mj@atrey.karlin.mff.cuni.cz>
15 /* Escaping and de-escaping */
20 return (x<10) ? (x + '0') : (x - 10 + 'A');
/*
 * url_deescape(s, d): reads the string `s` and works against the buffer `d`,
 * using `end` (MAX_URL_SIZE - 10 bytes past `d`) as the limit.
 * Error codes visible here: URL_ERR_TOO_LONG, URL_ERR_INVALID_ESCAPE,
 * URL_ERR_INVALID_ESCAPED_CHAR and URL_ERR_INVALID_CHAR.
 * NOTE(review): this listing omits lines of the function (the embedded line
 * numbers have gaps), so the control flow joining these statements and the
 * success return value are not shown -- check the full source.
 */
24 url_deescape(byte *s, byte *d)
26 byte *end = d + MAX_URL_SIZE - 10;
30 return URL_ERR_TOO_LONG;
/* An escape needs two hex digits in s[1] and s[2]; they are combined into one value, high nibble first. */
34 if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
35 return URL_ERR_INVALID_ESCAPE;
36 val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
38 return URL_ERR_INVALID_ESCAPED_CHAR;
/* Certain values are replaced by NCC_* codes; the labels selecting each assignment are not visible here (presumably a switch on val -- confirm). */
42 val = NCC_SEMICOLON; break;
44 val = NCC_SLASH; break;
46 val = NCC_QUEST; break;
48 val = NCC_COLON; break;
52 val = NCC_EQUAL; break;
/* && binds tighter than ||, so this tests (0x20 <= *s <= 0x7e) or (*s >= 0xa0). */
59 else if (*s >= 0x20 && *s <= 0x7e || *s >= 0xa0)
62 return URL_ERR_INVALID_CHAR;
/*
 * url_enescape(s, d): reads the string `s` and writes into the buffer `d`,
 * using `end` (MAX_URL_SIZE - 10 bytes past `d`) as the limit;
 * URL_ERR_TOO_LONG is the only error return visible here.
 * NOTE(review): this listing omits lines of the function, including the end
 * of the condition below and the branch bodies -- check the full source.
 */
69 url_enescape(byte *s, byte *d)
71 byte *end = d + MAX_URL_SIZE - 10;
76 return URL_ERR_TOO_LONG;
/* Character test: ASCII letters, digits and the punctuation listed; the rest of the condition is not shown. */
77 if ( *s >= 'A' && *s <= 'Z'
78 || *s >= 'a' && *s <= 'z'
79 || *s >= '0' && *s <= '9'
80 || *s == '$' || *s == '-' || *s == '.' || *s == '+'
81 || *s == '!' || *s == '*' || *s == '\'' || *s == '('
82 || *s == ')' || *s == '_' || *s == ';' || *s == '/'
83 || *s == '?' || *s == ':' || *s == '@' || *s == '='
/* Bytes below NCC_MAX are translated through NCC_CHARS, others are used unchanged; the value is then written as two hex digits via enhex(), high nibble first. */
88 uns val = (*s < NCC_MAX) ? NCC_CHARS[*s] : *s;
90 *d++ = enhex(val >> 4);
91 *d++ = enhex(val & 0x0f);
99 /* Split a URL (several parts may be copied to the destination buffer) */
/* Table of URL_PROTO_MAX protocol name strings, initialised from URL_PNAMES; identify_protocol() compares against entries 1 and up. */
101 byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
/*
 * identify_protocol(p): compares `p` case-insensitively with
 * url_proto_names[1] .. url_proto_names[URL_PROTO_MAX-1] (index 0 is not
 * scanned) and returns URL_PROTO_UNKNOWN on the path shown last.
 * NOTE(review): the statement executed on a match is not visible in this
 * listing (presumably it returns the index i -- confirm).
 */
104 identify_protocol(byte *p)
108 for(i=1; i<URL_PROTO_MAX; i++)
109 if (!strcasecmp(p, url_proto_names[i]))
111 return URL_PROTO_UNKNOWN;
/*
 * url_split(s, u, d): clears *u, records d + MAX_URL_SIZE - 10 as u->bufend,
 * then examines `s` for a "protocol:" prefix, a "//" host specification
 * (user:passwd@host:port) and a port number. URL_ERR_INVALID_PORT is the only
 * error return visible here.
 * NOTE(review): this listing omits lines of the function (declarations,
 * branch bodies, final return) -- check the full source.
 */
115 url_split(byte *s, struct url *u, byte *d)
117 bzero(u, sizeof(struct url));
119 u->bufend = d + MAX_URL_SIZE - 10;
121 if (s[0] != '/') /* Seek for "protocol:" */
/* Scan a run of alphanumerics; it counts as a protocol only if it is non-empty and followed by ':'. */
124 while (*p && Calnum(*p))
126 if (p != s && *p == ':')
132 u->protoid = identify_protocol(u->protocol);
137 if (s[0] == '/') /* Host spec or absolute path */
139 if (s[1] == '/') /* Host spec */
146 while (*s && *s != '/') /* Copy user:passwd@host:port */
150 if (w) /* user:passwd present */
158 if (e) /* host:port present */
/*
 * The port is parsed as decimal; it is rejected when characters remain after
 * the number or when it exceeds 65535.
 * NOTE(review): strtoul() always stores through a non-NULL end pointer, so
 * the `ep &&` part of the test is always true here. An empty digit string
 * yields 0, which falls into the "port 0" case described below.
 */
162 p = strtoul(e, &ep, 10);
163 if (ep && *ep || p > 65535)
164 return URL_ERR_INVALID_PORT;
165 else if (p) /* Port 0 (e.g. in :/) is treated as default port */
177 /* Normalization according to given base URL */
/* Indexed by a url's protoid (see the std_ports[u->protoid] uses in url_normalize and url_pack). */
179 static uns std_ports[] = URL_DEFPORTS; /* Default port numbers */
/*
 * relpath_merge(u, b): path merging for a url `u` against a base `b`.
 * The visible lines handle an absolute path, an empty relative path, copying
 * of the original path up to its last '/', and "." / ".." segments.
 * Error returns visible here: URL_PATH_UNDERFLOW and URL_ERR_TOO_LONG.
 * NOTE(review): this listing omits lines of the function (declarations of
 * a, o, p, d, the branch bodies and the final return) -- check the full source.
 */
182 relpath_merge(struct url *u, struct url *b)
190 if (a[0] == '/') /* Absolute path => OK */
193 return URL_PATH_UNDERFLOW;
195 if (!a[0]) /* Empty relative URL is a special case */
/* NOTE(review): strrchr() returns NULL when `o` contains no '/'; the loop below relies on a match, as the original comment says -- confirm the caller guarantees it. */
202 p = strrchr(o, '/'); /* Must be found! */
203 while (o <= p) /* Copy original path */
206 return URL_ERR_TOO_LONG;
/* ".$" in the next comment means a "." that ends the string (a[1] is the terminator). */
214 if (a[1] == '/' || !a[1]) /* Skip "./" and ".$" */
221 else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
225 return URL_PATH_UNDERFLOW;
226 d--; /* Discard trailing slash */
/* Scan one path segment, up to the next '/' or the end of the string. */
234 while (a[0] && a[0] != '/')
237 return URL_ERR_TOO_LONG;
/*
 * url_normalize(u, b): normalizes `u`, optionally against a base url `b`
 * (`b` may be NULL; see the !b test below). The final visible path returns
 * the result of relpath_merge(u, b).
 * Error returns visible here: URL_SYNTAX_ERROR, URL_ERR_REL_NOTHING and
 * URL_ERR_UNKNOWN_PROTOCOL.
 * NOTE(review): this listing omits lines of the function (several branch
 * bodies and conditions) -- check the full source.
 */
250 url_normalize(struct url *u, struct url *b)
254 if (u->protocol && !u->protoid)
/* For HTTP (u's own protocol, or the base's when u has no protoid) the rest part is cut at the first '#'. */
257 if ((u->protoid == URL_PROTO_HTTP || (!u->protoid && b && b->protoid == URL_PROTO_HTTP))
258 && u->rest && (k = strchr(u->rest, '#')))
259 *k = 0; /* Kill fragment reference */
262 u->port = std_ports[u->protoid];
/* Combinations rejected as syntax errors: protocol without host, empty host, user without host. A further line of this condition is not shown. */
264 if ( u->protocol && !u->host
265 || u->host && !*u->host
266 || !u->host && u->user
268 return URL_SYNTAX_ERROR;
270 if (u->protocol) /* Absolute URL */
273 if (!b) /* Relative to something? */
274 return URL_ERR_REL_NOTHING;
276 return URL_ERR_UNKNOWN_PROTOCOL;
/* Protocol and protocol id are taken over from the base. */
280 u->protocol = b->protocol;
281 u->protoid = b->protoid;
289 return relpath_merge(u, b);
295 /* Name canonicalization */
303 if (*b >= 'A' && *b <= 'Z')
/*
 * kill_end_dot(b): points k at the last character of `b` and tests for a
 * trailing '.' in a string longer than one character; the action taken is
 * not visible in this listing.
 * NOTE(review): for an empty string k becomes b - 1. The k > b test is then
 * false, but forming a pointer before the start of the buffer is not
 * guaranteed by C -- confirm whether an omitted line guards against empty or
 * NULL `b`.
 */
310 kill_end_dot(byte *b)
316 k = b + strlen(b) - 1;
317 if (k > b && *k == '.')
/*
 * url_canonicalize(u): passes u->protocol to lowercase() and u->host to
 * kill_end_dot(), then tests for a missing or empty u->rest on HTTP and FTP
 * urls.
 * NOTE(review): this listing omits lines of the function, including what is
 * done when that test succeeds and the return value -- check the full source.
 */
323 url_canonicalize(struct url *u)
325 lowercase(u->protocol);
327 kill_end_dot(u->host);
328 if ((!u->rest || !*u->rest) && (u->protoid == URL_PROTO_HTTP || u->protoid == URL_PROTO_FTP))
333 /* Pack a broken-down URL */
/*
 * append(d, s, e): helper used by url_pack() as `d = append(d, str, e)`,
 * where `e` is the limit computed from the output buffer.
 * NOTE(review): the body is not visible in this listing; presumably it copies
 * `s` to `d` without passing `e` and returns the new write position -- confirm.
 */
336 append(byte *d, byte *s, byte *e)
/*
 * url_pack(u, d): assembles a url string in `d` through append(), with
 * `e` (MAX_URL_SIZE - 10 bytes past `d`) as the limit. Pieces appear in the
 * order protocol, ":", "//", user, "@", host, ":" (port), rest.
 * u->protoid is recomputed from u->protocol as a side effect.
 * URL_ERR_TOO_LONG is the only error return visible here.
 * NOTE(review): this listing omits lines of the function (the conditions
 * guarding each piece, the declaration of z, the success return) -- check the
 * full source.
 */
349 url_pack(struct url *u, byte *d)
351 byte *e = d + MAX_URL_SIZE - 10;
355 d = append(d, u->protocol, e);
356 d = append(d, ":", e);
357 u->protoid = identify_protocol(u->protocol);
361 d = append(d, "//", e);
364 d = append(d, u->user, e);
365 d = append(d, "@", e);
367 d = append(d, u->host, e);
/* The port is written only when it differs from the protocol's default and from ~0. */
368 if (u->port != std_ports[u->protoid] && u->port != ~0)
/* NOTE(review): std_ports is `uns` and the port is compared with ~0; if u->port is unsigned, "%u" is the matching conversion -- confirm against struct url. Also confirm z is large enough for the formatted number. */
371 sprintf(z, "%d", u->port);
372 d = append(d, ":", e);
377 d = append(d, u->rest, e);
379 return URL_ERR_TOO_LONG;
/*
 * Table of error message strings.
 * NOTE(review): only some entries are visible in this listing; presumably the
 * table is indexed by the URL_ERR_* code -- confirm the order matches the
 * error constants.
 */
386 static char *errmsg[] = {
387 "Something is wrong",
391 "Invalid escaped character",
392 "Invalid port number",
393 "Relative URL not allowed",
/* Range check against the number of table entries. The comparison is unsigned (size_t), so if err is a signed int a negative value also takes this branch. */
402 if (err >= sizeof(errmsg) / sizeof(char *))
407 /* A "macro" for canonical split */
/*
 * url_canon_split(u, buf1, buf2, url): runs the steps in sequence --
 * url_deescape(u -> buf1), url_split(buf1 -> url, using buf2),
 * url_normalize(url, no base) -- and finally returns the result of
 * url_canonicalize(url).
 * The assignments inside the conditions are intentional: each step's result
 * is stored in err and tested for non-zero.
 * NOTE(review): the statements run when a step fails are not visible in this
 * listing (presumably `return err;` -- confirm).
 */
410 url_canon_split(byte *u, byte *buf1, byte *buf2, struct url *url)
414 if (err = url_deescape(u, buf1))
416 if (err = url_split(buf1, url, buf2))
418 if (err = url_normalize(url, NULL))
420 return url_canonicalize(url);
/*
 * Test driver: takes a url from argv[1] and pushes it through
 * url_deescape, url_split, url_normalize (against a fixed base url),
 * url_canonicalize, url_pack and url_enescape, printing the result or the
 * error code of each stage.
 * NOTE(review): this listing omits lines of the function (the declaration of
 * err, any argc check, and what follows each error message) -- check the
 * full source.
 */
427 int main(int argc, char **argv)
429 char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
431 struct url url, url0;
435 if (err = url_deescape(argv[1], buf1))
437 printf("deesc: error %d\n", err);
440 printf("deesc: %s\n", buf1);
441 if (err = url_split(buf1, &url, buf2))
443 printf("split: error %d\n", err);
/* NOTE(review): url_split() zeroes the struct first, so fields it does not fill may be NULL here; passing NULL to %s is not defined by the C standard -- confirm. */
446 printf("split: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
/* Fixed base url used for normalizing the url under test. */
447 if (err = url_split("http://mj@www.hell.org/123/sub_dir/index.html", &url0, buf3))
449 printf("split base: error %d\n", err);
452 if (err = url_normalize(&url0, NULL))
454 printf("normalize base: error %d\n", err);
457 printf("base: @%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.host, url0.port, url0.rest);
458 if (err = url_normalize(&url, &url0))
460 printf("normalize: error %d\n", err);
463 printf("normalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
464 if (err = url_canonicalize(&url))
466 printf("canonicalize: error %d\n", err);
469 printf("canonicalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
470 if (err = url_pack(&url, buf4))
472 printf("pack: error %d\n", err);
/* NOTE(review): url_pack() was given buf4 as its output, but this prints buf1 (the de-escaped input); looks like it should print buf4 -- confirm. */
475 printf("pack: %s\n", buf1);
476 if (err = url_enescape(buf4, buf2))
478 printf("enesc: error %d\n", err);
481 printf("enesc: %s\n", buf2);