lib/url.c

   1 /*
   2  *      Sherlock Library -- URL Functions (according to RFC 1738 and 1808)
   3  *
   4  *      (c) 1997--1999 Martin Mares, <mj@atrey.karlin.mff.cuni.cz>
   5  */
   6
   7 #include <string.h>
   8 #include <stdlib.h>
   9 #include <stdio.h>
  10
  11 #include "lib.h"
  12 #include "url.h"
  13 #include "string.h"
  14
  15 /* Escaping and de-escaping */
  16
  17 static uns
  18 enhex(uns x)
  19 {
  20   return (x<10) ? (x + '0') : (x - 10 + 'A');
  21 }
  22
  23 int
  24 url_deescape(byte *s, byte *d)
  25 {
  26   byte *end = d + MAX_URL_SIZE - 10;
  27   while (*s)
  28     {
  29       if (d >= end)
  30         return URL_ERR_TOO_LONG;
  31       if (*s == '%')
  32         {
  33           unsigned int val;
  34           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  35             return URL_ERR_INVALID_ESCAPE;
  36           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  37           if (val < 0x20)
  38             return URL_ERR_INVALID_ESCAPED_CHAR;
  39           switch (val)
  40             {
  41             case ';':
  42               val = NCC_SEMICOLON; break;
  43             case '/':
  44               val = NCC_SLASH; break;
  45             case '?':
  46               val = NCC_QUEST; break;
  47             case ':':
  48               val = NCC_COLON; break;
  49             case '@':
  50               val = NCC_AT; break;
  51             case '=':
  52               val = NCC_EQUAL; break;
  53             case '&':
  54               val = NCC_AND; break;
  55             case '#':
  56               val = NCC_HASH; break;
  57             }
  58           *d++ = val;
  59           s += 3;
  60         }
  61       else if (*s >= 0x20)
  62         *d++ = *s++;
  63       else
  64         return URL_ERR_INVALID_CHAR;
  65     }
  66   *d = 0;
  67   return 0;
  68 }
  69
  70 int
  71 url_enescape(byte *s, byte *d)
  72 {
  73   byte *end = d + MAX_URL_SIZE - 10;
  74   unsigned int c;
  75
  76   while (c = *s)
  77     {
  78       if (d >= end)
  79         return URL_ERR_TOO_LONG;
  80       if (Calnum(c) ||                                                  /* RFC 1738(2.2): Only alphanumerics ... */
  81           c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||   /* ... and several other exceptions ... */
  82           c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
  83           c == ',' ||
  84           c == '/' || c == '?' || c == ':' || c == '@' ||               /* ... and reserved chars used for reserved purpose */
  85           c == '=' || c == '&' || c == '#' || c == ';')
  86         *d++ = *s++;
  87       else
  88         {
  89           uns val = (*s < NCC_MAX) ? NCC_CHARS[*s] : *s;
  90           *d++ = '%';
  91           *d++ = enhex(val >> 4);
  92           *d++ = enhex(val & 0x0f);
  93           s++;
  94         }
  95     }
  96   *d = 0;
  97   return 0;
  98 }
  99
 100 /* Split an URL (several parts may be copied to the destination buffer) */
 101
 102 byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
 103 static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
 104
 105 uns
 106 identify_protocol(byte *p)
 107 {
 108   uns i;
 109
 110   for(i=1; i<URL_PROTO_MAX; i++)
 111     if (!strcasecmp(p, url_proto_names[i]))
 112       return i;
 113   return URL_PROTO_UNKNOWN;
 114 }
 115
 116 int
 117 url_split(byte *s, struct url *u, byte *d)
 118 {
 119   bzero(u, sizeof(struct url));
 120   u->port = ~0;
 121   u->bufend = d + MAX_URL_SIZE - 10;
 122
 123   if (s[0] != '/')                      /* Seek for "protocol:" */
 124     {
 125       byte *p = s;
 126       while (*p && Calnum(*p))
 127         p++;
 128       if (p != s && *p == ':')
 129         {
 130           u->protocol = d;
 131           while (s < p)
 132             *d++ = *s++;
 133           *d++ = 0;
 134           u->protoid = identify_protocol(u->protocol);
 135           s++;
 136           if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
 137             {
 138               /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
 139               int len = d - u->protocol;
 140               d -= len;
 141               s -= len;
 142               u->protocol = NULL;
 143               u->protoid = 0;
 144             }
 145         }
 146     }
 147
 148   if (s[0] == '/')                      /* Host spec or absolute path */
 149     {
 150       if (s[1] == '/')                  /* Host spec */
 151         {
 152           byte *q, *w, *e;
 153           char *ep;
 154
 155           s += 2;
 156           q = d;
 157           while (*s && *s != '/')       /* Copy user:passwd@host:port */
 158             *d++ = *s++;
 159           *d++ = 0;
 160           w = strchr(q, '@');
 161           if (w)                        /* user:passwd present */
 162             {
 163               *w++ = 0;
 164               u->user = q;
 165             }
 166           else
 167             w = q;
 168           e = strchr(w, ':');
 169           if (e)                        /* host:port present */
 170             {
 171               uns p;
 172               *e++ = 0;
 173               p = strtoul(e, &ep, 10);
 174               if (ep && *ep || p > 65535)
 175                 return URL_ERR_INVALID_PORT;
 176               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 177                 u->port = p;
 178             }
 179           u->host = w;
 180         }
 181     }
 182
 183   u->rest = s;
 184   u->buf = d;
 185   return 0;
 186 }
 187
 188 /* Normalization according to given base URL */
 189
 190 static uns std_ports[] = URL_DEFPORTS;  /* Default port numbers */
 191
 192 static int
 193 relpath_merge(struct url *u, struct url *b)
 194 {
 195   byte *a = u->rest;
 196   byte *o = b->rest;
 197   byte *d = u->buf;
 198   byte *e = u->bufend;
 199   byte *p;
 200
 201   if (a[0] == '/')                      /* Absolute path => OK */
 202     return 0;
 203   if (o[0] != '/')
 204     return URL_PATH_UNDERFLOW;
 205
 206   if (!a[0])                            /* Empty URL -> inherit everything */
 207     {
 208       u->rest = b->rest;
 209       return 0;
 210     }
 211
 212   u->rest = d;                          /* We know we'll need to copy the path somewhere else */
 213
 214   if (a[0] == '#')                      /* Another fragment */
 215     {
 216       for(p=o; *p && *p != '#'; p++)
 217         ;
 218       goto copy;
 219     }
 220   if (a[0] == '?')                      /* New query */
 221     {
 222       for(p=o; *p && *p != '#' && *p != '?'; p++)
 223         ;
 224       goto copy;
 225     }
 226   if (a[0] == ';')                      /* Change parameters */
 227     {
 228       for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++)
 229         ;
 230       goto copy;
 231     }
 232
 233   p = NULL;                             /* Copy original path and find the last slash */
 234   while (*o && *o != ';' && *o != '?' && *o != '#')
 235     {
 236       if (d >= e)
 237         return URL_ERR_TOO_LONG;
 238       if ((*d++ = *o++) == '/')
 239         p = d;
 240     }
 241   if (!p)
 242     return URL_ERR_REL_NOTHING;
 243   d = p;
 244
 245   while (*a)
 246     {
 247       if (a[0] == '.')
 248         {
 249           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 250             {
 251               a++;
 252               if (a[0])
 253                 a++;
 254               continue;
 255             }
 256           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 257             {
 258               a += 2;
 259               if (d <= u->buf + 1)
 260                 /*
 261                  * RFC 1808 says we should leave ".." as a path segment, but
 262                  * we intentionally break the rule and refuse the URL.
 263                  */
 264                 return URL_PATH_UNDERFLOW;
 265               d--;                      /* Discard trailing slash */
 266               while (d[-1] != '/')
 267                 d--;
 268               if (a[0])
 269                 a++;
 270               continue;
 271             }
 272         }
 273       while (a[0] && a[0] != '/')
 274         {
 275           if (d >= e)
 276             return URL_ERR_TOO_LONG;
 277           *d++ = *a++;
 278         }
 279       if (a[0])
 280         *d++ = *a++;
 281     }
 282
 283 okay:
 284   *d++ = 0;
 285   u->buf = d;
 286   return 0;
 287
 288 copy:                                   /* Combine part of old URL with the new one */
 289   while (o < p)
 290     if (d < e)
 291       *d++ = *o++;
 292     else
 293       return URL_ERR_TOO_LONG;
 294   while (*a)
 295     if (d < e)
 296       *d++ = *a++;
 297     else
 298       return URL_ERR_TOO_LONG;
 299   goto okay;
 300 }
 301
 302 int
 303 url_normalize(struct url *u, struct url *b)
 304 {
 305   byte *k;
 306   int err;
 307
 308   /* Basic checks */
 309   if (url_proto_path_flags[u->protoid] && !u->host ||
 310       u->host && !*u->host ||
 311       !u->host && u->user ||
 312       !u->rest)
 313     return URL_SYNTAX_ERROR;
 314
 315   if (!u->protocol)
 316     {
 317       /* Now we know it's a relative URL. Do we have any base? */
 318       if (!b || !url_proto_path_flags[b->protoid])
 319         return URL_ERR_REL_NOTHING;
 320       u->protocol = b->protocol;
 321       u->protoid = b->protoid;
 322
 323       /* Reference to the same host */
 324       if (!u->host)
 325         {
 326           u->host = b->host;
 327           u->user = b->user;
 328           u->port = b->port;
 329           if (err = relpath_merge(u, b))
 330             return err;
 331         }
 332     }
 333
 334   /* Fill in missing info */
 335   if (u->port == ~0U)
 336     u->port = std_ports[u->protoid];
 337
 338   return 0;
 339 }
 340
 341 /* Name canonicalization */
 342
 343 static void
 344 lowercase(byte *b)
 345 {
 346   if (b)
 347     while (*b)
 348       {
 349         if (*b >= 'A' && *b <= 'Z')
 350           *b = *b + 0x20;
 351         b++;
 352       }
 353 }
 354
 355 static void
 356 kill_end_dot(byte *b)
 357 {
 358   byte *k;
 359
 360   if (b)
 361     {
 362       k = b + strlen(b) - 1;
 363       if (k > b && *k == '.')
 364         *k = 0;
 365     }
 366 }
 367
 368 int
 369 url_canonicalize(struct url *u)
 370 {
 371   char *c;
 372
 373   lowercase(u->protocol);
 374   lowercase(u->host);
 375   kill_end_dot(u->host);
 376   if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
 377     u->rest = "/";
 378   if (u->rest && (c = strchr(u->rest, '#')))    /* Kill fragment reference */
 379     *c = 0;
 380   return 0;
 381 }
 382
 383 /* Pack a broken-down URL */
 384
 385 byte *
 386 append(byte *d, byte *s, byte *e)
 387 {
 388   if (d)
 389     while (*s)
 390       {
 391         if (d >= e)
 392           return NULL;
 393         *d++ = *s++;
 394       }
 395   return d;
 396 }
 397
 398 int
 399 url_pack(struct url *u, byte *d)
 400 {
 401   byte *e = d + MAX_URL_SIZE - 10;
 402
 403   if (u->protocol)
 404     {
 405       d = append(d, u->protocol, e);
 406       d = append(d, ":", e);
 407       u->protoid = identify_protocol(u->protocol);
 408     }
 409   if (u->host)
 410     {
 411       d = append(d, "//", e);
 412       if (u->user)
 413         {
 414           d = append(d, u->user, e);
 415           d = append(d, "@", e);
 416         }
 417       d = append(d, u->host, e);
 418       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 419         {
 420           char z[10];
 421           sprintf(z, "%d", u->port);
 422           d = append(d, ":", e);
 423           d = append(d, z, e);
 424         }
 425     }
 426   if (u->rest)
 427     d = append(d, u->rest, e);
 428   if (!d)
 429     return URL_ERR_TOO_LONG;
 430   *d = 0;
 431   return 0;
 432 }
 433
 434 /* Error messages */
 435
 436 static char *errmsg[] = {
 437   "Something is wrong",
 438   "Too long",
 439   "Invalid character",
 440   "Invalid escape",
 441   "Invalid escaped character",
 442   "Invalid port number",
 443   "Relative URL not allowed",
 444   "Unknown protocol",
 445   "Syntax error",
 446   "Path underflow"
 447 };
 448
 449 char *
 450 url_error(uns err)
 451 {
 452   if (err >= sizeof(errmsg) / sizeof(char *))
 453     err = 0;
 454   return errmsg[err];
 455 }
 456
 457 /* A "macro" for canonical split */
 458
 459 int
 460 url_canon_split(byte *u, byte *buf1, byte *buf2, struct url *url)
 461 {
 462   int err;
 463
 464   if (err = url_deescape(u, buf1))
 465     return err;
 466   if (err = url_split(buf1, url, buf2))
 467     return err;
 468   if (err = url_normalize(url, NULL))
 469     return err;
 470   return url_canonicalize(url);
 471 }
 472
 473 /* Testing */
 474
 475 #ifdef TEST
 476
 477 int main(int argc, char **argv)
 478 {
 479   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
 480   int err;
 481   struct url url, url0;
 482
 483   if (argc != 2)
 484     return 1;
 485   if (err = url_deescape(argv[1], buf1))
 486     {
 487       printf("deesc: error %d\n", err);
 488       return 1;
 489     }
 490   printf("deesc: %s\n", buf1);
 491   if (err = url_split(buf1, &url, buf2))
 492     {
 493       printf("split: error %d\n", err);
 494       return 1;
 495     }
 496   printf("split: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 497   if (err = url_split("http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment", &url0, buf3))
 498     {
 499       printf("split base: error %d\n", err);
 500       return 1;
 501     }
 502   if (err = url_normalize(&url0, NULL))
 503     {
 504       printf("normalize base: error %d\n", err);
 505       return 1;
 506     }
 507   printf("base: @%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.host, url0.port, url0.rest);
 508   if (err = url_normalize(&url, &url0))
 509     {
 510       printf("normalize: error %d\n", err);
 511       return 1;
 512     }
 513   printf("normalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 514   if (err = url_canonicalize(&url))
 515     {
 516       printf("canonicalize: error %d\n", err);
 517       return 1;
 518     }
 519   printf("canonicalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 520   if (err = url_pack(&url, buf4))
 521     {
 522       printf("pack: error %d\n", err);
 523       return 1;
 524     }
 525   printf("pack: %s\n", buf4);
 526   if (err = url_enescape(buf4, buf2))
 527     {
 528       printf("enesc: error %d\n", err);
 529       return 1;
 530     }
 531   printf("enesc: %s\n", buf2);
 532   return 0;
 533 }
 534
 535 #endif