lib/url.c

   1 /*
   2  *      Sherlock Library -- URL Functions (according to RFC 1738 and 1808)
   3  *
   4  *      (c) 1997--2001 Martin Mares <mj@ucw.cz>
   5  *      (c) 2001 Robert Spalek <robert@ucw.cz>
   6  */
   7
   8 #include "lib/lib.h"
   9 #include "lib/url.h"
  10 #include "lib/chartype.h"
  11 #include "lib/conf.h"
  12
  13 #include <string.h>
  14 #include <stdlib.h>
  15 #include <stdio.h>
  16
  17 /* Configuration */
  18
  19 static uns url_ignore_spaces;
  20 static uns url_ignore_underflow;
  21 static byte *url_component_separators = "/&?";
  22 static uns url_min_repeat_count = 0x7fffffff;
  23 static uns url_max_repeat_length = 0;
  24
  25 static struct cfitem url_config[] = {
  26   { "URL",              CT_SECTION,     NULL },
  27   { "IgnoreSpaces",     CT_INT,         &url_ignore_spaces },
  28   { "IgnoreUnderflow",  CT_INT,         &url_ignore_underflow },
  29   { "ComponentSeparators",      CT_STRING,      &url_component_separators },
  30   { "MinRepeatCount",           CT_INT,         &url_min_repeat_count },
  31   { "MaxRepeatLength",          CT_INT,         &url_max_repeat_length },
  32   { NULL,               CT_STOP,        NULL }
  33 };
  34
  35 static void CONSTRUCTOR url_init_config(void)
  36 {
  37   cf_register(url_config);
  38 }
  39
  40 /* Escaping and de-escaping */
  41
  42 static uns
  43 enhex(uns x)
  44 {
  45   return (x<10) ? (x + '0') : (x - 10 + 'A');
  46 }
  47
  48 int
  49 url_deescape(byte *s, byte *d)
  50 {
  51   byte *dstart = d;
  52   byte *end = d + MAX_URL_SIZE - 10;
  53   while (*s)
  54     {
  55       if (d >= end)
  56         return URL_ERR_TOO_LONG;
  57       if (*s == '%')
  58         {
  59           unsigned int val;
  60           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  61             return URL_ERR_INVALID_ESCAPE;
  62           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  63           if (val < 0x20)
  64             return URL_ERR_INVALID_ESCAPED_CHAR;
  65           switch (val)
  66             {
  67             case ';':
  68               val = NCC_SEMICOLON; break;
  69             case '/':
  70               val = NCC_SLASH; break;
  71             case '?':
  72               val = NCC_QUEST; break;
  73             case ':':
  74               val = NCC_COLON; break;
  75             case '@':
  76               val = NCC_AT; break;
  77             case '=':
  78               val = NCC_EQUAL; break;
  79             case '&':
  80               val = NCC_AND; break;
  81             case '#':
  82               val = NCC_HASH; break;
  83             }
  84           *d++ = val;
  85           s += 3;
  86         }
  87       else if (*s > 0x20)
  88         *d++ = *s++;
  89       else if (Cspace(*s))
  90         {
  91           byte *s0 = s;
  92           while (Cspace(*s))
  93             s++;
  94           if (!url_ignore_spaces || !(!*s || d == dstart))
  95             {
  96               while (Cspace(*s0))
  97                 {
  98                   if (d >= end)
  99                     return URL_ERR_TOO_LONG;
 100                   *d++ = *s0++;
 101                 }
 102             }
 103         }
 104       else
 105         return URL_ERR_INVALID_CHAR;
 106     }
 107   *d = 0;
 108   return 0;
 109 }
 110
 111 int
 112 url_enescape(byte *s, byte *d)
 113 {
 114   byte *end = d + MAX_URL_SIZE - 10;
 115   unsigned int c;
 116
 117   while (c = *s)
 118     {
 119       if (d >= end)
 120         return URL_ERR_TOO_LONG;
 121       if (Calnum(c) ||                                                  /* RFC 1738(2.2): Only alphanumerics ... */
 122           c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||   /* ... and several other exceptions ... */
 123           c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
 124           c == ',' ||
 125           c == '/' || c == '?' || c == ':' || c == '@' ||               /* ... and reserved chars used for reserved purpose */
 126           c == '=' || c == '&' || c == '#' || c == ';')
 127         *d++ = *s++;
 128       else
 129         {
 130           uns val = (*s < NCC_MAX) ? NCC_CHARS[*s] : *s;
 131           *d++ = '%';
 132           *d++ = enhex(val >> 4);
 133           *d++ = enhex(val & 0x0f);
 134           s++;
 135         }
 136     }
 137   *d = 0;
 138   return 0;
 139 }
 140
 141 /* Split an URL (several parts may be copied to the destination buffer) */
 142
 143 byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
 144 static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
 145
 146 uns
 147 identify_protocol(byte *p)
 148 {
 149   uns i;
 150
 151   for(i=1; i<URL_PROTO_MAX; i++)
 152     if (!strcasecmp(p, url_proto_names[i]))
 153       return i;
 154   return URL_PROTO_UNKNOWN;
 155 }
 156
 157 int
 158 url_split(byte *s, struct url *u, byte *d)
 159 {
 160   bzero(u, sizeof(struct url));
 161   u->port = ~0;
 162   u->bufend = d + MAX_URL_SIZE - 10;
 163
 164   if (s[0] != '/')                      /* Seek for "protocol:" */
 165     {
 166       byte *p = s;
 167       while (*p && Calnum(*p))
 168         p++;
 169       if (p != s && *p == ':')
 170         {
 171           u->protocol = d;
 172           while (s < p)
 173             *d++ = *s++;
 174           *d++ = 0;
 175           u->protoid = identify_protocol(u->protocol);
 176           s++;
 177           if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
 178             {
 179               /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
 180               int len = d - u->protocol;
 181               d -= len;
 182               s -= len;
 183               u->protocol = NULL;
 184               u->protoid = 0;
 185             }
 186         }
 187     }
 188
 189   if (s[0] == '/')                      /* Host spec or absolute path */
 190     {
 191       if (s[1] == '/')                  /* Host spec */
 192         {
 193           byte *q, *w, *e;
 194           char *ep;
 195
 196           s += 2;
 197           q = d;
 198           while (*s && *s != '/')       /* Copy user:passwd@host:port */
 199             *d++ = *s++;
 200           *d++ = 0;
 201           w = strchr(q, '@');
 202           if (w)                        /* user:passwd present */
 203             {
 204               *w++ = 0;
 205               u->user = q;
 206             }
 207           else
 208             w = q;
 209           e = strchr(w, ':');
 210           if (e)                        /* host:port present */
 211             {
 212               uns p;
 213               *e++ = 0;
 214               p = strtoul(e, &ep, 10);
 215               if (ep && *ep || p > 65535)
 216                 return URL_ERR_INVALID_PORT;
 217               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 218                 u->port = p;
 219             }
 220           u->host = w;
 221         }
 222     }
 223
 224   u->rest = s;
 225   u->buf = d;
 226   return 0;
 227 }
 228
 229 /* Normalization according to given base URL */
 230
 231 static uns std_ports[] = URL_DEFPORTS;  /* Default port numbers */
 232
 233 static int
 234 relpath_merge(struct url *u, struct url *b)
 235 {
 236   byte *a = u->rest;
 237   byte *o = b->rest;
 238   byte *d = u->buf;
 239   byte *e = u->bufend;
 240   byte *p;
 241
 242   if (a[0] == '/')                      /* Absolute path => OK */
 243     return 0;
 244   if (o[0] != '/')
 245     return URL_PATH_UNDERFLOW;
 246
 247   if (!a[0])                            /* Empty URL -> inherit everything */
 248     {
 249       u->rest = b->rest;
 250       return 0;
 251     }
 252
 253   u->rest = d;                          /* We know we'll need to copy the path somewhere else */
 254
 255   if (a[0] == '#')                      /* Another fragment */
 256     {
 257       for(p=o; *p && *p != '#'; p++)
 258         ;
 259       goto copy;
 260     }
 261   if (a[0] == '?')                      /* New query */
 262     {
 263       for(p=o; *p && *p != '#' && *p != '?'; p++)
 264         ;
 265       goto copy;
 266     }
 267   if (a[0] == ';')                      /* Change parameters */
 268     {
 269       for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++)
 270         ;
 271       goto copy;
 272     }
 273
 274   p = NULL;                             /* Copy original path and find the last slash */
 275   while (*o && *o != ';' && *o != '?' && *o != '#')
 276     {
 277       if (d >= e)
 278         return URL_ERR_TOO_LONG;
 279       if ((*d++ = *o++) == '/')
 280         p = d;
 281     }
 282   if (!p)
 283     return URL_ERR_REL_NOTHING;
 284   d = p;
 285
 286   while (*a)
 287     {
 288       if (a[0] == '.')
 289         {
 290           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 291             {
 292               a++;
 293               if (a[0])
 294                 a++;
 295               continue;
 296             }
 297           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 298             {
 299               a += 2;
 300               if (a[0])
 301                 a++;
 302               if (d <= u->buf + 1)
 303                 {
 304                   /*
 305                    * RFC 1808 says we should leave ".." as a path segment, but
 306                    * we intentionally break the rule and refuse the URL.
 307                    */
 308                   if (!url_ignore_underflow)
 309                     return URL_PATH_UNDERFLOW;
 310                 }
 311               else
 312                 {
 313                   d--;                  /* Discard trailing slash */
 314                   while (d[-1] != '/')
 315                     d--;
 316                 }
 317               continue;
 318             }
 319         }
 320       while (a[0] && a[0] != '/')
 321         {
 322           if (d >= e)
 323             return URL_ERR_TOO_LONG;
 324           *d++ = *a++;
 325         }
 326       if (a[0])
 327         *d++ = *a++;
 328     }
 329
 330 okay:
 331   *d++ = 0;
 332   u->buf = d;
 333   return 0;
 334
 335 copy:                                   /* Combine part of old URL with the new one */
 336   while (o < p)
 337     if (d < e)
 338       *d++ = *o++;
 339     else
 340       return URL_ERR_TOO_LONG;
 341   while (*a)
 342     if (d < e)
 343       *d++ = *a++;
 344     else
 345       return URL_ERR_TOO_LONG;
 346   goto okay;
 347 }
 348
 349 int
 350 url_normalize(struct url *u, struct url *b)
 351 {
 352   int err;
 353
 354   /* Basic checks */
 355   if (url_proto_path_flags[u->protoid] && !u->host ||
 356       u->host && !*u->host ||
 357       !u->host && u->user ||
 358       !u->rest)
 359     return URL_SYNTAX_ERROR;
 360
 361   if (!u->protocol)
 362     {
 363       /* Now we know it's a relative URL. Do we have any base? */
 364       if (!b || !url_proto_path_flags[b->protoid])
 365         return URL_ERR_REL_NOTHING;
 366       u->protocol = b->protocol;
 367       u->protoid = b->protoid;
 368
 369       /* Reference to the same host */
 370       if (!u->host)
 371         {
 372           u->host = b->host;
 373           u->user = b->user;
 374           u->port = b->port;
 375           if (err = relpath_merge(u, b))
 376             return err;
 377         }
 378     }
 379
 380   /* Fill in missing info */
 381   if (u->port == ~0U)
 382     u->port = std_ports[u->protoid];
 383
 384   return 0;
 385 }
 386
 387 /* Name canonicalization */
 388
 389 static void
 390 lowercase(byte *b)
 391 {
 392   if (b)
 393     while (*b)
 394       {
 395         if (*b >= 'A' && *b <= 'Z')
 396           *b = *b + 0x20;
 397         b++;
 398       }
 399 }
 400
 401 static void
 402 kill_end_dot(byte *b)
 403 {
 404   byte *k;
 405
 406   if (b)
 407     {
 408       k = b + strlen(b) - 1;
 409       if (k > b && *k == '.')
 410         *k = 0;
 411     }
 412 }
 413
 414 int
 415 url_canonicalize(struct url *u)
 416 {
 417   char *c;
 418
 419   lowercase(u->protocol);
 420   lowercase(u->host);
 421   kill_end_dot(u->host);
 422   if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
 423     u->rest = "/";
 424   if (u->rest && (c = strchr(u->rest, '#')))    /* Kill fragment reference */
 425     *c = 0;
 426   return 0;
 427 }
 428
 429 /* Pack a broken-down URL */
 430
 431 static byte *
 432 append(byte *d, byte *s, byte *e)
 433 {
 434   if (d)
 435     while (*s)
 436       {
 437         if (d >= e)
 438           return NULL;
 439         *d++ = *s++;
 440       }
 441   return d;
 442 }
 443
 444 int
 445 url_pack(struct url *u, byte *d)
 446 {
 447   byte *e = d + MAX_URL_SIZE - 10;
 448
 449   if (u->protocol)
 450     {
 451       d = append(d, u->protocol, e);
 452       d = append(d, ":", e);
 453       u->protoid = identify_protocol(u->protocol);
 454     }
 455   if (u->host)
 456     {
 457       d = append(d, "//", e);
 458       if (u->user)
 459         {
 460           d = append(d, u->user, e);
 461           d = append(d, "@", e);
 462         }
 463       d = append(d, u->host, e);
 464       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 465         {
 466           char z[10];
 467           sprintf(z, "%d", u->port);
 468           d = append(d, ":", e);
 469           d = append(d, z, e);
 470         }
 471     }
 472   if (u->rest)
 473     d = append(d, u->rest, e);
 474   if (!d)
 475     return URL_ERR_TOO_LONG;
 476   *d = 0;
 477   return 0;
 478 }
 479
 480 /* Error messages */
 481
 482 static char *errmsg[] = {
 483   "Something is wrong",
 484   "Too long",
 485   "Invalid character",
 486   "Invalid escape",
 487   "Invalid escaped character",
 488   "Invalid port number",
 489   "Relative URL not allowed",
 490   "Unknown protocol",
 491   "Syntax error",
 492   "Path underflow"
 493 };
 494
 495 char *
 496 url_error(uns err)
 497 {
 498   if (err >= sizeof(errmsg) / sizeof(char *))
 499     err = 0;
 500   return errmsg[err];
 501 }
 502
 503 /* A "macro" for canonical split */
 504
 505 int
 506 url_canon_split(byte *u, byte *buf1, byte *buf2, struct url *url)
 507 {
 508   int err;
 509
 510   if (err = url_deescape(u, buf1))
 511     return err;
 512   if (err = url_split(buf1, url, buf2))
 513     return err;
 514   if (err = url_normalize(url, NULL))
 515     return err;
 516   return url_canonicalize(url);
 517 }
 518
 519 /* Testing */
 520
 521 #ifdef TEST
 522
 523 int main(int argc, char **argv)
 524 {
 525   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
 526   int err;
 527   struct url url, url0;
 528
 529   if (argc != 2)
 530     return 1;
 531   if (err = url_deescape(argv[1], buf1))
 532     {
 533       printf("deesc: error %d\n", err);
 534       return 1;
 535     }
 536   printf("deesc: %s\n", buf1);
 537   if (err = url_split(buf1, &url, buf2))
 538     {
 539       printf("split: error %d\n", err);
 540       return 1;
 541     }
 542   printf("split: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 543   if (err = url_split("http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment", &url0, buf3))
 544     {
 545       printf("split base: error %d\n", err);
 546       return 1;
 547     }
 548   if (err = url_normalize(&url0, NULL))
 549     {
 550       printf("normalize base: error %d\n", err);
 551       return 1;
 552     }
 553   printf("base: @%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.host, url0.port, url0.rest);
 554   if (err = url_normalize(&url, &url0))
 555     {
 556       printf("normalize: error %d\n", err);
 557       return 1;
 558     }
 559   printf("normalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 560   if (err = url_canonicalize(&url))
 561     {
 562       printf("canonicalize: error %d\n", err);
 563       return 1;
 564     }
 565   printf("canonicalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 566   if (err = url_pack(&url, buf4))
 567     {
 568       printf("pack: error %d\n", err);
 569       return 1;
 570     }
 571   printf("pack: %s\n", buf4);
 572   if (err = url_enescape(buf4, buf2))
 573     {
 574       printf("enesc: error %d\n", err);
 575       return 1;
 576     }
 577   printf("enesc: %s\n", buf2);
 578   return 0;
 579 }
 580
 581 #endif
 582
 583 int
 584 url_has_repeated_component(byte *url)
 585 {
 586         return 0;
 587 }