lib/url.c

   1 /*
   2  *      Sherlock Library -- URL Functions (according to RFC 1738 and 1808)
   3  *
   4  *      (c) 1997--1999 Martin Mares <mj@ucw.cz>
   5  */
   6
   7 #include "lib/lib.h"
   8 #include "lib/url.h"
   9 #include "lib/chartype.h"
  10
  11 #include <string.h>
  12 #include <stdlib.h>
  13 #include <stdio.h>
  14
  15 /* Escaping and de-escaping */
  16
  17 static uns
  18 enhex(uns x)
  19 {
  20   return (x<10) ? (x + '0') : (x - 10 + 'A');
  21 }
  22
  23 int
  24 url_deescape(byte *s, byte *d)
  25 {
  26   byte *end = d + MAX_URL_SIZE - 10;
  27   while (*s)
  28     {
  29       if (d >= end)
  30         return URL_ERR_TOO_LONG;
  31       if (*s == '%')
  32         {
  33           unsigned int val;
  34           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  35             return URL_ERR_INVALID_ESCAPE;
  36           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  37           if (val < 0x20)
  38             return URL_ERR_INVALID_ESCAPED_CHAR;
  39           switch (val)
  40             {
  41             case ';':
  42               val = NCC_SEMICOLON; break;
  43             case '/':
  44               val = NCC_SLASH; break;
  45             case '?':
  46               val = NCC_QUEST; break;
  47             case ':':
  48               val = NCC_COLON; break;
  49             case '@':
  50               val = NCC_AT; break;
  51             case '=':
  52               val = NCC_EQUAL; break;
  53             case '&':
  54               val = NCC_AND; break;
  55             case '#':
  56               val = NCC_HASH; break;
  57             }
  58           *d++ = val;
  59           s += 3;
  60         }
  61       else if (*s >= 0x20)
  62         *d++ = *s++;
  63       else
  64         return URL_ERR_INVALID_CHAR;
  65     }
  66   *d = 0;
  67   return 0;
  68 }
  69
  70 int
  71 url_enescape(byte *s, byte *d)
  72 {
  73   byte *end = d + MAX_URL_SIZE - 10;
  74   unsigned int c;
  75
  76   while (c = *s)
  77     {
  78       if (d >= end)
  79         return URL_ERR_TOO_LONG;
  80       if (Calnum(c) ||                                                  /* RFC 1738(2.2): Only alphanumerics ... */
  81           c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||   /* ... and several other exceptions ... */
  82           c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
  83           c == ',' ||
  84           c == '/' || c == '?' || c == ':' || c == '@' ||               /* ... and reserved chars used for reserved purpose */
  85           c == '=' || c == '&' || c == '#' || c == ';')
  86         *d++ = *s++;
  87       else
  88         {
  89           uns val = (*s < NCC_MAX) ? NCC_CHARS[*s] : *s;
  90           *d++ = '%';
  91           *d++ = enhex(val >> 4);
  92           *d++ = enhex(val & 0x0f);
  93           s++;
  94         }
  95     }
  96   *d = 0;
  97   return 0;
  98 }
  99
 100 /* Split an URL (several parts may be copied to the destination buffer) */
 101
 102 byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
 103 static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
 104
 105 uns
 106 identify_protocol(byte *p)
 107 {
 108   uns i;
 109
 110   for(i=1; i<URL_PROTO_MAX; i++)
 111     if (!strcasecmp(p, url_proto_names[i]))
 112       return i;
 113   return URL_PROTO_UNKNOWN;
 114 }
 115
 116 int
 117 url_split(byte *s, struct url *u, byte *d)
 118 {
 119   bzero(u, sizeof(struct url));
 120   u->port = ~0;
 121   u->bufend = d + MAX_URL_SIZE - 10;
 122
 123   if (s[0] != '/')                      /* Seek for "protocol:" */
 124     {
 125       byte *p = s;
 126       while (*p && Calnum(*p))
 127         p++;
 128       if (p != s && *p == ':')
 129         {
 130           u->protocol = d;
 131           while (s < p)
 132             *d++ = *s++;
 133           *d++ = 0;
 134           u->protoid = identify_protocol(u->protocol);
 135           s++;
 136           if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
 137             {
 138               /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
 139               int len = d - u->protocol;
 140               d -= len;
 141               s -= len;
 142               u->protocol = NULL;
 143               u->protoid = 0;
 144             }
 145         }
 146     }
 147
 148   if (s[0] == '/')                      /* Host spec or absolute path */
 149     {
 150       if (s[1] == '/')                  /* Host spec */
 151         {
 152           byte *q, *w, *e;
 153           char *ep;
 154
 155           s += 2;
 156           q = d;
 157           while (*s && *s != '/')       /* Copy user:passwd@host:port */
 158             *d++ = *s++;
 159           *d++ = 0;
 160           w = strchr(q, '@');
 161           if (w)                        /* user:passwd present */
 162             {
 163               *w++ = 0;
 164               u->user = q;
 165             }
 166           else
 167             w = q;
 168           e = strchr(w, ':');
 169           if (e)                        /* host:port present */
 170             {
 171               uns p;
 172               *e++ = 0;
 173               p = strtoul(e, &ep, 10);
 174               if (ep && *ep || p > 65535)
 175                 return URL_ERR_INVALID_PORT;
 176               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 177                 u->port = p;
 178             }
 179           u->host = w;
 180         }
 181     }
 182
 183   u->rest = s;
 184   u->buf = d;
 185   return 0;
 186 }
 187
 188 /* Normalization according to given base URL */
 189
 190 static uns std_ports[] = URL_DEFPORTS;  /* Default port numbers */
 191
 192 static int
 193 relpath_merge(struct url *u, struct url *b)
 194 {
 195   byte *a = u->rest;
 196   byte *o = b->rest;
 197   byte *d = u->buf;
 198   byte *e = u->bufend;
 199   byte *p;
 200
 201   if (a[0] == '/')                      /* Absolute path => OK */
 202     return 0;
 203   if (o[0] != '/')
 204     return URL_PATH_UNDERFLOW;
 205
 206   if (!a[0])                            /* Empty URL -> inherit everything */
 207     {
 208       u->rest = b->rest;
 209       return 0;
 210     }
 211
 212   u->rest = d;                          /* We know we'll need to copy the path somewhere else */
 213
 214   if (a[0] == '#')                      /* Another fragment */
 215     {
 216       for(p=o; *p && *p != '#'; p++)
 217         ;
 218       goto copy;
 219     }
 220   if (a[0] == '?')                      /* New query */
 221     {
 222       for(p=o; *p && *p != '#' && *p != '?'; p++)
 223         ;
 224       goto copy;
 225     }
 226   if (a[0] == ';')                      /* Change parameters */
 227     {
 228       for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++)
 229         ;
 230       goto copy;
 231     }
 232
 233   p = NULL;                             /* Copy original path and find the last slash */
 234   while (*o && *o != ';' && *o != '?' && *o != '#')
 235     {
 236       if (d >= e)
 237         return URL_ERR_TOO_LONG;
 238       if ((*d++ = *o++) == '/')
 239         p = d;
 240     }
 241   if (!p)
 242     return URL_ERR_REL_NOTHING;
 243   d = p;
 244
 245   while (*a)
 246     {
 247       if (a[0] == '.')
 248         {
 249           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 250             {
 251               a++;
 252               if (a[0])
 253                 a++;
 254               continue;
 255             }
 256           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 257             {
 258               a += 2;
 259               if (d <= u->buf + 1)
 260                 /*
 261                  * RFC 1808 says we should leave ".." as a path segment, but
 262                  * we intentionally break the rule and refuse the URL.
 263                  */
 264                 return URL_PATH_UNDERFLOW;
 265               d--;                      /* Discard trailing slash */
 266               while (d[-1] != '/')
 267                 d--;
 268               if (a[0])
 269                 a++;
 270               continue;
 271             }
 272         }
 273       while (a[0] && a[0] != '/')
 274         {
 275           if (d >= e)
 276             return URL_ERR_TOO_LONG;
 277           *d++ = *a++;
 278         }
 279       if (a[0])
 280         *d++ = *a++;
 281     }
 282
 283 okay:
 284   *d++ = 0;
 285   u->buf = d;
 286   return 0;
 287
 288 copy:                                   /* Combine part of old URL with the new one */
 289   while (o < p)
 290     if (d < e)
 291       *d++ = *o++;
 292     else
 293       return URL_ERR_TOO_LONG;
 294   while (*a)
 295     if (d < e)
 296       *d++ = *a++;
 297     else
 298       return URL_ERR_TOO_LONG;
 299   goto okay;
 300 }
 301
 302 int
 303 url_normalize(struct url *u, struct url *b)
 304 {
 305   int err;
 306
 307   /* Basic checks */
 308   if (url_proto_path_flags[u->protoid] && !u->host ||
 309       u->host && !*u->host ||
 310       !u->host && u->user ||
 311       !u->rest)
 312     return URL_SYNTAX_ERROR;
 313
 314   if (!u->protocol)
 315     {
 316       /* Now we know it's a relative URL. Do we have any base? */
 317       if (!b || !url_proto_path_flags[b->protoid])
 318         return URL_ERR_REL_NOTHING;
 319       u->protocol = b->protocol;
 320       u->protoid = b->protoid;
 321
 322       /* Reference to the same host */
 323       if (!u->host)
 324         {
 325           u->host = b->host;
 326           u->user = b->user;
 327           u->port = b->port;
 328           if (err = relpath_merge(u, b))
 329             return err;
 330         }
 331     }
 332
 333   /* Fill in missing info */
 334   if (u->port == ~0U)
 335     u->port = std_ports[u->protoid];
 336
 337   return 0;
 338 }
 339
 340 /* Name canonicalization */
 341
 342 static void
 343 lowercase(byte *b)
 344 {
 345   if (b)
 346     while (*b)
 347       {
 348         if (*b >= 'A' && *b <= 'Z')
 349           *b = *b + 0x20;
 350         b++;
 351       }
 352 }
 353
 354 static void
 355 kill_end_dot(byte *b)
 356 {
 357   byte *k;
 358
 359   if (b)
 360     {
 361       k = b + strlen(b) - 1;
 362       if (k > b && *k == '.')
 363         *k = 0;
 364     }
 365 }
 366
 367 int
 368 url_canonicalize(struct url *u)
 369 {
 370   char *c;
 371
 372   lowercase(u->protocol);
 373   lowercase(u->host);
 374   kill_end_dot(u->host);
 375   if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
 376     u->rest = "/";
 377   if (u->rest && (c = strchr(u->rest, '#')))    /* Kill fragment reference */
 378     *c = 0;
 379   return 0;
 380 }
 381
 382 /* Pack a broken-down URL */
 383
 384 static byte *
 385 append(byte *d, byte *s, byte *e)
 386 {
 387   if (d)
 388     while (*s)
 389       {
 390         if (d >= e)
 391           return NULL;
 392         *d++ = *s++;
 393       }
 394   return d;
 395 }
 396
 397 int
 398 url_pack(struct url *u, byte *d)
 399 {
 400   byte *e = d + MAX_URL_SIZE - 10;
 401
 402   if (u->protocol)
 403     {
 404       d = append(d, u->protocol, e);
 405       d = append(d, ":", e);
 406       u->protoid = identify_protocol(u->protocol);
 407     }
 408   if (u->host)
 409     {
 410       d = append(d, "//", e);
 411       if (u->user)
 412         {
 413           d = append(d, u->user, e);
 414           d = append(d, "@", e);
 415         }
 416       d = append(d, u->host, e);
 417       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 418         {
 419           char z[10];
 420           sprintf(z, "%d", u->port);
 421           d = append(d, ":", e);
 422           d = append(d, z, e);
 423         }
 424     }
 425   if (u->rest)
 426     d = append(d, u->rest, e);
 427   if (!d)
 428     return URL_ERR_TOO_LONG;
 429   *d = 0;
 430   return 0;
 431 }
 432
 433 /* Error messages */
 434
 435 static char *errmsg[] = {
 436   "Something is wrong",
 437   "Too long",
 438   "Invalid character",
 439   "Invalid escape",
 440   "Invalid escaped character",
 441   "Invalid port number",
 442   "Relative URL not allowed",
 443   "Unknown protocol",
 444   "Syntax error",
 445   "Path underflow"
 446 };
 447
 448 char *
 449 url_error(uns err)
 450 {
 451   if (err >= sizeof(errmsg) / sizeof(char *))
 452     err = 0;
 453   return errmsg[err];
 454 }
 455
 456 /* A "macro" for canonical split */
 457
 458 int
 459 url_canon_split(byte *u, byte *buf1, byte *buf2, struct url *url)
 460 {
 461   int err;
 462
 463   if (err = url_deescape(u, buf1))
 464     return err;
 465   if (err = url_split(buf1, url, buf2))
 466     return err;
 467   if (err = url_normalize(url, NULL))
 468     return err;
 469   return url_canonicalize(url);
 470 }
 471
 472 /* Testing */
 473
 474 #ifdef TEST
 475
 476 int main(int argc, char **argv)
 477 {
 478   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
 479   int err;
 480   struct url url, url0;
 481
 482   if (argc != 2)
 483     return 1;
 484   if (err = url_deescape(argv[1], buf1))
 485     {
 486       printf("deesc: error %d\n", err);
 487       return 1;
 488     }
 489   printf("deesc: %s\n", buf1);
 490   if (err = url_split(buf1, &url, buf2))
 491     {
 492       printf("split: error %d\n", err);
 493       return 1;
 494     }
 495   printf("split: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 496   if (err = url_split("http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment", &url0, buf3))
 497     {
 498       printf("split base: error %d\n", err);
 499       return 1;
 500     }
 501   if (err = url_normalize(&url0, NULL))
 502     {
 503       printf("normalize base: error %d\n", err);
 504       return 1;
 505     }
 506   printf("base: @%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.host, url0.port, url0.rest);
 507   if (err = url_normalize(&url, &url0))
 508     {
 509       printf("normalize: error %d\n", err);
 510       return 1;
 511     }
 512   printf("normalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 513   if (err = url_canonicalize(&url))
 514     {
 515       printf("canonicalize: error %d\n", err);
 516       return 1;
 517     }
 518   printf("canonicalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 519   if (err = url_pack(&url, buf4))
 520     {
 521       printf("pack: error %d\n", err);
 522       return 1;
 523     }
 524   printf("pack: %s\n", buf4);
 525   if (err = url_enescape(buf4, buf2))
 526     {
 527       printf("enesc: error %d\n", err);
 528       return 1;
 529     }
 530   printf("enesc: %s\n", buf2);
 531   return 0;
 532 }
 533
 534 #endif