lib/url.c

   1 /*
   2  *      Sherlock Library -- URL Functions (according to RFC 1738 and 1808)
   3  *
   4  *      (c) 1997 Martin Mares, <mj@atrey.karlin.mff.cuni.cz>
   5  */
   6
   7 #include <string.h>
   8 #include <stdlib.h>
   9 #include <stdio.h>
  10
  11 #include "lib.h"
  12 #include "url.h"
  13 #include "string.h"
  14
  15 /* Escaping and de-escaping */
  16
  17 static uns
  18 enhex(uns x)
  19 {
  20   return (x<10) ? (x + '0') : (x - 10 + 'A');
  21 }
  22
  23 int
  24 url_deescape(byte *s, byte *d)
  25 {
  26   byte *end = d + MAX_URL_SIZE - 10;
  27   while (*s)
  28     {
  29       if (d >= end)
  30         return URL_ERR_TOO_LONG;
  31       if (*s == '%')
  32         {
  33           unsigned int val;
  34           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  35             return URL_ERR_INVALID_ESCAPE;
  36           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  37           if (!Cprint(val))
  38             return URL_ERR_INVALID_ESCAPED_CHAR;
  39           switch (val)
  40             {
  41             case ';':
  42               val = NCC_SEMICOLON; break;
  43             case '/':
  44               val = NCC_SLASH; break;
  45             case '?':
  46               val = NCC_QUEST; break;
  47             case ':':
  48               val = NCC_COLON; break;
  49             case '@':
  50               val = NCC_AT; break;
  51             case '=':
  52               val = NCC_EQUAL; break;
  53             case '&':
  54               val = NCC_AND; break;
  55             }
  56           *d++ = val;
  57           s += 3;
  58         }
  59       else if (*s >= 0x20 && *s <= 0x7e || *s >= 0xa0)
  60         *d++ = *s++;
  61       else
  62         return URL_ERR_INVALID_CHAR;
  63     }
  64   *d = 0;
  65   return 0;
  66 }
  67
  68 int
  69 url_enescape(byte *s, byte *d)
  70 {
  71   byte *end = d + MAX_URL_SIZE - 10;
  72
  73   while (*s)
  74     {
  75       if (d >= end)
  76         return URL_ERR_TOO_LONG;
  77       if (   *s >= 'A' && *s <= 'Z'
  78              || *s >= 'a' && *s <= 'z'
  79              || *s >= '0' && *s <= '9'
  80              || *s == '$' || *s == '-' || *s == '.'
  81              || *s == '!' || *s == '*' || *s == '\'' || *s == '('
  82              || *s == ')' || *s == '_' || *s == ';' || *s == '/'
  83              || *s == '?' || *s == ':' || *s == '@' || *s == '='
  84              || *s == '&')
  85         *d++ = *s++;
  86       else
  87         {
  88           uns val = (*s < NCC_MAX) ? NCC_CHARS[*s] : *s;
  89           *d++ = '%';
  90           *d++ = enhex(val >> 4);
  91           *d++ = enhex(val & 0x0f);
  92           s++;
  93         }
  94     }
  95   *d = 0;
  96   return 0;
  97 }
  98
  99 /* Split an URL (several parts may be copied to the destination buffer) */
 100
 101 byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
 102
 103 uns
 104 identify_protocol(byte *p)
 105 {
 106   uns i;
 107
 108   for(i=1; i<URL_PROTO_MAX; i++)
 109     if (!strcasecmp(p, url_proto_names[i]))
 110       return i;
 111   return URL_PROTO_UNKNOWN;
 112 }
 113
 114 int
 115 url_split(byte *s, struct url *u, byte *d)
 116 {
 117   bzero(u, sizeof(struct url));
 118   u->port = ~0;
 119   u->bufend = d + MAX_URL_SIZE - 10;
 120
 121   if (s[0] != '/')                      /* Seek for "protocol:" */
 122     {
 123       byte *p = s;
 124       while (*p && Calnum(*p))
 125         p++;
 126       if (p != s && *p == ':')
 127         {
 128           u->protocol = d;
 129           while (s < p)
 130             *d++ = *s++;
 131           *d++ = 0;
 132           u->protoid = identify_protocol(u->protocol);
 133           s++;
 134         }
 135     }
 136
 137   if (s[0] == '/')                      /* Host spec or absolute path */
 138     {
 139       if (s[1] == '/')                  /* Host spec */
 140         {
 141           byte *q, *w, *e;
 142           char *ep;
 143
 144           s += 2;
 145           q = d;
 146           while (*s && *s != '/')       /* Copy user:passwd@host:port */
 147             *d++ = *s++;
 148           *d++ = 0;
 149           w = strchr(q, '@');
 150           if (w)                        /* user:passwd present */
 151             {
 152               *w++ = 0;
 153               u->user = q;
 154             }
 155           else
 156             w = q;
 157           e = strchr(w, ':');
 158           if (e)                        /* host:port present */
 159             {
 160               uns p;
 161               *e++ = 0;
 162               p = strtoul(e, &ep, 10);
 163               if (ep && *ep || p > 65535)
 164                 return URL_ERR_INVALID_PORT;
 165               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 166                 u->port = p;
 167             }
 168           u->host = w;
 169         }
 170     }
 171
 172   u->rest = s;
 173   u->buf = d;
 174   return 0;
 175 }
 176
 177 /* Normalization according to given base URL */
 178
 179 static uns std_ports[] = URL_DEFPORTS;  /* Default port numbers */
 180
 181 static int
 182 relpath_merge(struct url *u, struct url *b)
 183 {
 184   byte *a = u->rest;
 185   byte *o = b->rest;
 186   byte *d = u->buf;
 187   byte *e = u->bufend;
 188   byte *p;
 189
 190   if (a[0] == '/')                      /* Absolute path => OK */
 191     return 0;
 192   if (o[0] != '/')
 193     return URL_PATH_UNDERFLOW;
 194
 195   if (!a[0])                            /* Empty relative URL is a special case */
 196     {
 197       u->rest = b->rest;
 198       return 0;
 199     }
 200
 201   u->rest = d;
 202   p = strrchr(o, '/');                  /* Must be found! */
 203   while (o <= p)                        /* Copy original path */
 204     {
 205       if (d >= e)
 206         return URL_ERR_TOO_LONG;
 207       *d++ = *o++;
 208     }
 209
 210   while (*a)
 211     {
 212       if (a[0] == '.')
 213         {
 214           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 215             {
 216               a++;
 217               if (a[0])
 218                 a++;
 219               continue;
 220             }
 221           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 222             {
 223               a += 2;
 224               if (d <= u->buf + 1)
 225                 return URL_PATH_UNDERFLOW;
 226               d--;                      /* Discard trailing slash */
 227               while (d[-1] != '/')
 228                 d--;
 229               if (a[0])
 230                 a++;
 231               continue;
 232             }
 233         }
 234       while (a[0] && a[0] != '/')
 235         {
 236           if (d >= e)
 237             return URL_ERR_TOO_LONG;
 238           *d++ = *a++;
 239         }
 240       if (a[0])
 241         *d++ = *a++;
 242     }
 243
 244   *d++ = 0;
 245   u->buf = d;
 246   return 0;
 247 }
 248
 249 int
 250 url_normalize(struct url *u, struct url *b)
 251 {
 252   byte *k;
 253
 254   if (u->protocol && !u->protoid)
 255     return 0;
 256
 257   if ((u->protoid == URL_PROTO_HTTP || (!u->protoid && b && b->protoid == URL_PROTO_HTTP))
 258       && u->rest && (k = strchr(u->rest, '#')))
 259     *k = 0;                             /* Kill fragment reference */
 260
 261   if (u->port == ~0U)
 262     u->port = std_ports[u->protoid];
 263
 264   if (   u->protocol && !u->host
 265          || u->host && !*u->host
 266          || !u->host && u->user
 267          || !u->rest)
 268     return URL_SYNTAX_ERROR;
 269
 270   if (u->protocol)                      /* Absolute URL */
 271     return 0;
 272
 273   if (!b)                               /* Relative to something? */
 274     return URL_ERR_REL_NOTHING;
 275   if (!b->protoid)
 276     return URL_ERR_UNKNOWN_PROTOCOL;
 277
 278   if (!u->protocol)
 279     {
 280       u->protocol = b->protocol;
 281       u->protoid = b->protoid;
 282     }
 283
 284   if (!u->host)
 285     {
 286       u->host = b->host;
 287       u->user = b->user;
 288       u->port = b->port;
 289       return relpath_merge(u, b);
 290     }
 291
 292   return 0;
 293 }
 294
 295 /* Name canonicalization */
 296
 297 static void
 298 lowercase(byte *b)
 299 {
 300   if (b)
 301     while (*b)
 302       {
 303         if (*b >= 'A' && *b <= 'Z')
 304           *b = *b + 0x20;
 305         b++;
 306       }
 307 }
 308
 309 static void
 310 kill_end_dot(byte *b)
 311 {
 312   byte *k;
 313
 314   if (b)
 315     {
 316       k = b + strlen(b) - 1;
 317       if (k > b && *k == '.')
 318         *k = 0;
 319     }
 320 }
 321
 322 int
 323 url_canonicalize(struct url *u)
 324 {
 325   lowercase(u->protocol);
 326   lowercase(u->host);
 327   kill_end_dot(u->host);
 328   if ((!u->rest || !*u->rest) && (u->protoid == URL_PROTO_HTTP || u->protoid == URL_PROTO_FTP))
 329     u->rest = "/";
 330   return 0;
 331 }
 332
 333 /* Pack a broken-down URL */
 334
 335 byte *
 336 append(byte *d, byte *s, byte *e)
 337 {
 338   if (d)
 339     while (*s)
 340       {
 341         if (d >= e)
 342           return NULL;
 343         *d++ = *s++;
 344       }
 345   return d;
 346 }
 347
 348 int
 349 url_pack(struct url *u, byte *d)
 350 {
 351   byte *e = d + MAX_URL_SIZE - 10;
 352
 353   if (u->protocol)
 354     {
 355       d = append(d, u->protocol, e);
 356       d = append(d, ":", e);
 357       u->protoid = identify_protocol(u->protocol);
 358     }
 359   if (u->host)
 360     {
 361       d = append(d, "//", e);
 362       if (u->user)
 363         {
 364           d = append(d, u->user, e);
 365           d = append(d, "@", e);
 366         }
 367       d = append(d, u->host, e);
 368       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 369         {
 370           char z[10];
 371           sprintf(z, "%d", u->port);
 372           d = append(d, ":", e);
 373           d = append(d, z, e);
 374         }
 375     }
 376   if (u->rest)
 377     d = append(d, u->rest, e);
 378   if (!d)
 379     return URL_ERR_TOO_LONG;
 380   *d = 0;
 381   return 0;
 382 }
 383
 384 /* Error messages */
 385
 386 static char *errmsg[] = {
 387   "Something is wrong",
 388   "Too long",
 389   "Invalid character",
 390   "Invalid escape",
 391   "Invalid escaped character",
 392   "Invalid port number",
 393   "Relative URL not allowed",
 394   "Unknown protocol",
 395   "Syntax error",
 396   "Path underflow"
 397 };
 398
 399 char *
 400 url_error(uns err)
 401 {
 402   if (err >= sizeof(errmsg) / sizeof(char *))
 403     err = 0;
 404   return errmsg[err];
 405 }
 406
 407 /* A "macro" for canonical split */
 408
 409 int
 410 url_canon_split(byte *u, byte *buf1, byte *buf2, struct url *url)
 411 {
 412   int err;
 413
 414   if (err = url_deescape(u, buf1))
 415     return err;
 416   if (err = url_split(buf1, url, buf2))
 417     return err;
 418   if (err = url_normalize(url, NULL))
 419     return err;
 420   return url_canonicalize(url);
 421 }
 422
 423 /* Testing */
 424
 425 #ifdef TEST
 426
 427 int main(int argc, char **argv)
 428 {
 429   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
 430   int err;
 431   struct url url, url0;
 432
 433   if (argc != 2)
 434     return 1;
 435   if (err = url_deescape(argv[1], buf1))
 436     {
 437       printf("deesc: error %d\n", err);
 438       return 1;
 439     }
 440   printf("deesc: %s\n", buf1);
 441   if (err = url_split(buf1, &url, buf2))
 442     {
 443       printf("split: error %d\n", err);
 444       return 1;
 445     }
 446   printf("split: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 447   if (err = url_split("http://mj@www.hell.org/123/sub_dir/index.html", &url0, buf3))
 448     {
 449       printf("split base: error %d\n", err);
 450       return 1;
 451     }
 452   if (err = url_normalize(&url0, NULL))
 453     {
 454       printf("normalize base: error %d\n", err);
 455       return 1;
 456     }
 457   printf("base: @%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.host, url0.port, url0.rest);
 458   if (err = url_normalize(&url, &url0))
 459     {
 460       printf("normalize: error %d\n", err);
 461       return 1;
 462     }
 463   printf("normalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 464   if (err = url_canonicalize(&url))
 465     {
 466       printf("canonicalize: error %d\n", err);
 467       return 1;
 468     }
 469   printf("canonicalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 470   if (err = url_pack(&url, buf4))
 471     {
 472       printf("pack: error %d\n", err);
 473       return 1;
 474     }
 475   printf("pack: %s\n", buf1);
 476   if (err = url_enescape(buf4, buf2))
 477     {
 478       printf("enesc: error %d\n", err);
 479       return 1;
 480     }
 481   printf("enesc: %s\n", buf2);
 482   return 0;
 483 }
 484
 485 #endif