lib/url.c

   1 /*
   2  *      UCW Library -- URL Functions
   3  *
   4  *      (c) 1997--2004 Martin Mares <mj@ucw.cz>
   5  *      (c) 2001--2005 Robert Spalek <robert@ucw.cz>
   6  *
   7  *      This software may be freely distributed and used according to the terms
   8  *      of the GNU Lesser General Public License.
   9  *
  10  *      The URL syntax corresponds to RFC 2396 with several exceptions:
  11  *
  12  *         o  Escaping of special characters still follows RFC 1738.
  13  *         o  Interpretation of path parameters follows RFC 1808.
  14  *
  15  *      XXX: The buffer handling in this module is really horrible, but it works.
  16  */
  17
  18 #include "lib/lib.h"
  19 #include "lib/url.h"
  20 #include "lib/chartype.h"
  21 #include "lib/conf.h"
  22
  23 #include <string.h>
  24 #include <stdlib.h>
  25 #include <stdio.h>
  26 #include <alloca.h>
  27
  28 /* Configuration */
  29
  30 static uns url_ignore_spaces;                 /* Nonzero: url_deescape() drops whitespace runs at the very start and end of the URL */
  31 static uns url_ignore_underflow;              /* Nonzero: relpath_merge() skips a ".." that would climb above the root instead of failing */
  32 static char *url_component_separators = "";   /* Characters at which url_has_repeated_component() cuts an URL into components */
  33 static uns url_min_repeat_count = 0x7fffffff; /* Back-to-back repetitions needed before a repeated sequence is reported */
  34 static uns url_max_repeat_length = 0;         /* Longest sequence (counted in components) scanned for repetition; 0 disables the scan */
  35 static uns url_max_occurences = ~0U;          /* A single component occurring more often than this gets the URL reported */
  36
  37 static struct cf_section url_config = {       /* Item table of the configuration section declared as "URL" by url_init_config() */
  38   CF_ITEMS {                                  /* Each entry binds a configuration item name to one of the static tunables */
  39     CF_UNS("IgnoreSpaces", &url_ignore_spaces),
  40     CF_UNS("IgnoreUnderflow", &url_ignore_underflow),
  41     CF_STRING("ComponentSeparators", &url_component_separators),
  42     CF_UNS("MinRepeatCount", &url_min_repeat_count),
  43     CF_UNS("MaxRepeatLength", &url_max_repeat_length),
  44     CF_UNS("MaxOccurences", &url_max_occurences),
  45     CF_END                                    /* NOTE(review): presumably the table terminator expected by lib/conf.h -- confirm */
  46   }
  47 };
  48
  49 static void CONSTRUCTOR url_init_config(void) /* Declares the "URL" configuration section */
  50 {
  51   cf_declare_section("URL", &url_config, 0);  /* NOTE(review): CONSTRUCTOR presumably makes this run automatically at startup -- confirm in lib/lib.h */
  52 }
  53
  54 /* Escaping and de-escaping */
  55
  56 static uns
  57 enhex(uns x)
  58 {
  59   return (x<10) ? (x + '0') : (x - 10 + 'A');
  60 }
  61
  62 int
  63 url_deescape(const byte *s, byte *d)
  64 {
  65   byte *dstart = d;
  66   byte *end = d + MAX_URL_SIZE - 10;
  67   while (*s)
  68     {
  69       if (d >= end)
  70         return URL_ERR_TOO_LONG;
  71       if (*s == '%')
  72         {
  73           unsigned int val;
  74           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  75             return URL_ERR_INVALID_ESCAPE;
  76           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  77           if (val < 0x20)
  78             return URL_ERR_INVALID_ESCAPED_CHAR;
  79           switch (val)
  80             {
  81             case ';':
  82               val = NCC_SEMICOLON; break;
  83             case '/':
  84               val = NCC_SLASH; break;
  85             case '?':
  86               val = NCC_QUEST; break;
  87             case ':':
  88               val = NCC_COLON; break;
  89             case '@':
  90               val = NCC_AT; break;
  91             case '=':
  92               val = NCC_EQUAL; break;
  93             case '&':
  94               val = NCC_AND; break;
  95             case '#':
  96               val = NCC_HASH; break;
  97             }
  98           *d++ = val;
  99           s += 3;
 100         }
 101       else if (*s > 0x20)
 102         *d++ = *s++;
 103       else if (Cspace(*s))
 104         {
 105           const byte *s0 = s;
 106           while (Cspace(*s))
 107             s++;
 108           if (!url_ignore_spaces || !(!*s || d == dstart))
 109             {
 110               while (Cspace(*s0))
 111                 {
 112                   if (d >= end)
 113                     return URL_ERR_TOO_LONG;
 114                   *d++ = *s0++;
 115                 }
 116             }
 117         }
 118       else
 119         return URL_ERR_INVALID_CHAR;
 120     }
 121   *d = 0;
 122   return 0;
 123 }
 124
 125 int
 126 url_enescape(const byte *s, byte *d)
 127 {
 128   byte *end = d + MAX_URL_SIZE - 10;
 129   unsigned int c;
 130
 131   while (c = *s)
 132     {
 133       if (d >= end)
 134         return URL_ERR_TOO_LONG;
 135       if (Calnum(c) ||                                                  /* RFC 1738(2.2): Only alphanumerics ... */
 136           c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||   /* ... and several other exceptions ... */
 137           c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
 138           c == ',' ||
 139           c == '/' || c == '?' || c == ':' || c == '@' ||               /* ... and reserved chars used for reserved purpose */
 140           c == '=' || c == '&' || c == '#' || c == ';')
 141         *d++ = *s++;
 142       else
 143         {
 144           uns val = (*s < NCC_MAX) ? NCC_CHARS[*s] : *s;
 145           *d++ = '%';
 146           *d++ = enhex(val >> 4);
 147           *d++ = enhex(val & 0x0f);
 148           s++;
 149         }
 150     }
 151   *d = 0;
 152   return 0;
 153 }
 154
 155 int
 156 url_enescape_friendly(const byte *src, byte *dest)
 157 {
 158   byte *end = dest + MAX_URL_SIZE - 10;
 159   while (*src)
 160     {
 161       if (dest >= end)
 162         return URL_ERR_TOO_LONG;
 163       if (*src < NCC_MAX)
 164         *dest++ = NCC_CHARS[*src++];
 165       else if (*src >= 0x20 && *src < 0x7f)
 166         *dest++ = *src++;
 167       else
 168         {
 169           *dest++ = '%';
 170           *dest++ = enhex(*src >> 4);
 171           *dest++ = enhex(*src++ & 0x0f);
 172         }
 173     }
 174   *dest = 0;
 175   return 0;
 176 }
 177
 178 /* Split an URL (several parts may be copied to the destination buffer) */
 179
 180 byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;                /* Protocol names indexed by protocol ID; searched by identify_protocol() */
 181 static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;  /* Nonzero: url_split() and url_normalize() require a "//host" part for this protocol */
 182
 183 uns
 184 identify_protocol(const byte *p)
 185 {
 186   uns i;
 187
 188   for(i=1; i<URL_PROTO_MAX; i++)
 189     if (!strcasecmp(p, url_proto_names[i]))
 190       return i;
 191   return URL_PROTO_UNKNOWN;
 192 }
 193
 194 int
 195 url_split(byte *s, struct url *u, byte *d)
 196 {
 197   bzero(u, sizeof(struct url));
 198   u->port = ~0;
 199   u->bufend = d + MAX_URL_SIZE - 10;
 200
 201   if (s[0] != '/')                      /* Seek for "protocol:" */
 202     {
 203       byte *p = s;
 204       while (*p && Calnum(*p))
 205         p++;
 206       if (p != s && *p == ':')
 207         {
 208           u->protocol = d;
 209           while (s < p)
 210             *d++ = *s++;
 211           *d++ = 0;
 212           u->protoid = identify_protocol(u->protocol);
 213           s++;
 214           if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
 215             {
 216               /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
 217               int len = d - u->protocol;
 218               d -= len;
 219               s -= len;
 220               u->protocol = NULL;
 221               u->protoid = 0;
 222             }
 223         }
 224     }
 225
 226   if (s[0] == '/')                      /* Host spec or absolute path */
 227     {
 228       if (s[1] == '/')                  /* Host spec */
 229         {
 230           byte *q, *e;
 231           byte *at = NULL;
 232           char *ep;
 233
 234           s += 2;
 235           q = d;
 236           while (*s && *s != '/' && *s != '?')  /* Copy user:passwd@host:port */
 237             {
 238               if (*s != '@')
 239                 *d++ = *s;
 240               else if (!at)
 241                 {
 242                   *d++ = 0;
 243                   at = d;
 244                 }
 245               else                      /* This shouldn't happen with sane URL's, but we need to be sure */
 246                 *d++ = NCC_AT;
 247               s++;
 248             }
 249           *d++ = 0;
 250           if (at)                       /* user:passwd present */
 251             {
 252               u->user = q;
 253               if (e = strchr(q, ':'))
 254                 {
 255                   *e++ = 0;
 256                   u->pass = e;
 257                 }
 258             }
 259           else
 260             at = q;
 261           e = strchr(at, ':');
 262           if (e)                        /* host:port present */
 263             {
 264               uns p;
 265               *e++ = 0;
 266               p = strtoul(e, &ep, 10);
 267               if (ep && *ep || p > 65535)
 268                 return URL_ERR_INVALID_PORT;
 269               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 270                 u->port = p;
 271             }
 272           u->host = at;
 273         }
 274     }
 275
 276   u->rest = s;
 277   u->buf = d;
 278   return 0;
 279 }
 280
 281 /* Normalization according to given base URL */
 282
 283 static uns std_ports[] = URL_DEFPORTS;  /* Default port of each protocol, indexed by protocol ID */
 284
 285 static int
 286 relpath_merge(struct url *u, struct url *b)
 287 {
 288   byte *a = u->rest;
 289   byte *o = b->rest;
 290   byte *d = u->buf;
 291   byte *e = u->bufend;
 292   byte *p;
 293
 294   if (a[0] == '/')                      /* Absolute path => OK */
 295     return 0;
 296   if (o[0] != '/' && o[0] != '?')
 297     return URL_PATH_UNDERFLOW;
 298
 299   if (!a[0])                            /* Empty URL -> inherit everything */
 300     {
 301       u->rest = b->rest;
 302       return 0;
 303     }
 304
 305   u->rest = d;                          /* We know we'll need to copy the path somewhere else */
 306
 307   if (a[0] == '#')                      /* Another fragment */
 308     {
 309       for(p=o; *p && *p != '#'; p++)
 310         ;
 311       goto copy;
 312     }
 313   if (a[0] == '?')                      /* New query */
 314     {
 315       for(p=o; *p && *p != '#' && *p != '?'; p++)
 316         ;
 317       goto copy;
 318     }
 319   if (a[0] == ';')                      /* Change parameters */
 320     {
 321       for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++)
 322         ;
 323       goto copy;
 324     }
 325
 326   p = NULL;                             /* Copy original path and find the last slash */
 327   while (*o && *o != ';' && *o != '?' && *o != '#')
 328     {
 329       if (d >= e)
 330         return URL_ERR_TOO_LONG;
 331       if ((*d++ = *o++) == '/')
 332         p = d;
 333     }
 334   if (!p)
 335     return URL_ERR_REL_NOTHING;
 336   d = p;
 337
 338   while (*a)
 339     {
 340       if (a[0] == '.')
 341         {
 342           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 343             {
 344               a++;
 345               if (a[0])
 346                 a++;
 347               continue;
 348             }
 349           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 350             {
 351               a += 2;
 352               if (a[0])
 353                 a++;
 354               if (d <= u->buf + 1)
 355                 {
 356                   /*
 357                    * RFC 1808 says we should leave ".." as a path segment, but
 358                    * we intentionally break the rule and refuse the URL.
 359                    */
 360                   if (!url_ignore_underflow)
 361                     return URL_PATH_UNDERFLOW;
 362                 }
 363               else
 364                 {
 365                   d--;                  /* Discard trailing slash */
 366                   while (d[-1] != '/')
 367                     d--;
 368                 }
 369               continue;
 370             }
 371         }
 372       while (a[0] && a[0] != '/')
 373         {
 374           if (d >= e)
 375             return URL_ERR_TOO_LONG;
 376           *d++ = *a++;
 377         }
 378       if (a[0])
 379         *d++ = *a++;
 380     }
 381
 382 okay:
 383   *d++ = 0;
 384   u->buf = d;
 385   return 0;
 386
 387 copy:                                   /* Combine part of old URL with the new one */
 388   while (o < p)
 389     if (d < e)
 390       *d++ = *o++;
 391     else
 392       return URL_ERR_TOO_LONG;
 393   while (*a)
 394     if (d < e)
 395       *d++ = *a++;
 396     else
 397       return URL_ERR_TOO_LONG;
 398   goto okay;
 399 }
 400
 401 int
 402 url_normalize(struct url *u, struct url *b)
 403 {
 404   int err;
 405
 406   /* Basic checks */
 407   if (url_proto_path_flags[u->protoid] && (!u->host || !*u->host) ||
 408       !u->host && u->user ||
 409       !u->user && u->pass ||
 410       !u->rest)
 411     return URL_SYNTAX_ERROR;
 412
 413   if (!u->protocol)
 414     {
 415       /* Now we know it's a relative URL. Do we have any base? */
 416       if (!b || !url_proto_path_flags[b->protoid])
 417         return URL_ERR_REL_NOTHING;
 418       u->protocol = b->protocol;
 419       u->protoid = b->protoid;
 420
 421       /* Reference to the same host */
 422       if (!u->host)
 423         {
 424           u->host = b->host;
 425           u->user = b->user;
 426           u->pass = b->pass;
 427           u->port = b->port;
 428           if (err = relpath_merge(u, b))
 429             return err;
 430         }
 431     }
 432
 433   /* Change path "?" to "/?" because it's the true meaning */
 434   if (u->rest[0] == '?')
 435     {
 436       int l = strlen(u->rest);
 437       if (u->bufend - u->buf < l+1)
 438         return URL_ERR_TOO_LONG;
 439       u->buf[0] = '/';
 440       memcpy(u->buf+1, u->rest, l+1);
 441       u->rest = u->buf;
 442       u->buf += l+2;
 443     }
 444
 445   /* Fill in missing info */
 446   if (u->port == ~0U)
 447     u->port = std_ports[u->protoid];
 448
 449   return 0;
 450 }
 451
 452 /* Name canonicalization */
 453
 454 static void
 455 lowercase(byte *b)
 456 {
 457   if (b)
 458     while (*b)
 459       {
 460         if (*b >= 'A' && *b <= 'Z')
 461           *b = *b + 0x20;
 462         b++;
 463       }
 464 }
 465
 466 static void
 467 kill_end_dot(byte *b)
 468 {
 469   byte *k;
 470
 471   if (b)
 472     {
 473       k = b + strlen(b) - 1;
 474       while (k > b && *k == '.')
 475         *k-- = 0;
 476     }
 477 }
 478
 479 int
 480 url_canonicalize(struct url *u)
 481 {
 482   char *c;
 483
 484   lowercase(u->protocol);
 485   lowercase(u->host);
 486   kill_end_dot(u->host);
 487   if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
 488     u->rest = "/";
 489   if (u->rest && (c = strchr(u->rest, '#')))    /* Kill fragment reference */
 490     *c = 0;
 491   return 0;
 492 }
 493
 494 /* Pack a broken-down URL */
 495
 496 static byte *
 497 append(byte *d, const byte *s, byte *e)
 498 {
 499   if (d)
 500     while (*s)
 501       {
 502         if (d >= e)
 503           return NULL;
 504         *d++ = *s++;
 505       }
 506   return d;
 507 }
 508
 509 int
 510 url_pack(struct url *u, byte *d)
 511 {
 512   byte *e = d + MAX_URL_SIZE - 10;
 513
 514   if (u->protocol)
 515     {
 516       d = append(d, u->protocol, e);
 517       d = append(d, ":", e);
 518       u->protoid = identify_protocol(u->protocol);
 519     }
 520   if (u->host)
 521     {
 522       d = append(d, "//", e);
 523       if (u->user)
 524         {
 525           d = append(d, u->user, e);
 526           if (u->pass)
 527             {
 528               d = append(d, ":", e);
 529               d = append(d, u->pass, e);
 530             }
 531           d = append(d, "@", e);
 532         }
 533       d = append(d, u->host, e);
 534       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 535         {
 536           char z[10];
 537           sprintf(z, "%d", u->port);
 538           d = append(d, ":", e);
 539           d = append(d, z, e);
 540         }
 541     }
 542   if (u->rest)
 543     d = append(d, u->rest, e);
 544   if (!d)
 545     return URL_ERR_TOO_LONG;
 546   *d = 0;
 547   return 0;
 548 }
 549
 550 /* Error messages */
 551
 552 static char *errmsg[] = {          /* Messages returned by url_error(), indexed by error code */
 553   "Something is wrong",            /* Also used by url_error() for codes beyond the end of the table */
 554   "Too long",                      /* NOTE(review): order presumably follows the URL_ERR_xxx codes in lib/url.h -- confirm */
 555   "Invalid character",
 556   "Invalid escape",
 557   "Invalid escaped character",
 558   "Invalid port number",
 559   "Relative URL not allowed",
 560   "Unknown protocol",
 561   "Syntax error",
 562   "Path underflow"
 563 };
 564
 565 char *
 566 url_error(uns err)
 567 {
 568   if (err >= sizeof(errmsg) / sizeof(char *))
 569     err = 0;
 570   return errmsg[err];
 571 }
 572
 573 /* Standard cookbook recipes */
 574
 575 int
 576 url_canon_split_rel(const byte *u, byte *buf1, byte *buf2, struct url *url, struct url *base)
 577 {
 578   int err;
 579
 580   if (err = url_deescape(u, buf1))
 581     return err;
 582   if (err = url_split(buf1, url, buf2))
 583     return err;
 584   if (err = url_normalize(url, base))
 585     return err;
 586   return url_canonicalize(url);
 587 }
 588
 589 int
 590 url_auto_canonicalize_rel(const byte *src, byte *dst, struct url *base)
 591 {
 592   byte buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
 593   int err;
 594   struct url ur;
 595
 596   (void)((err = url_canon_split_rel(src, buf1, buf2, &ur, base)) ||
 597    (err = url_pack(&ur, buf3)) ||
 598    (err = url_enescape(buf3, dst)));
 599   return err;
 600 }
 601
 602 /* Testing */
 603
 604 #ifdef TEST
 605
 606 int main(int argc, char **argv)
 607 {
 608   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
 609   int err;
 610   struct url url, url0;
 611   char *base = "http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment";
 612
 613   if (argc != 2 && argc != 3)
 614     return 1;
 615   if (argc == 3)
 616     base = argv[2];
 617   if (err = url_deescape(argv[1], buf1))
 618     {
 619       printf("deesc: error %d\n", err);
 620       return 1;
 621     }
 622   printf("deesc: %s\n", buf1);
 623   if (err = url_split(buf1, &url, buf2))
 624     {
 625       printf("split: error %d\n", err);
 626       return 1;
 627     }
 628   printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 629   if (err = url_split(base, &url0, buf3))
 630     {
 631       printf("split base: error %d\n", err);
 632       return 1;
 633     }
 634   if (err = url_normalize(&url0, NULL))
 635     {
 636       printf("normalize base: error %d\n", err);
 637       return 1;
 638     }
 639   printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest);
 640   if (err = url_normalize(&url, &url0))
 641     {
 642       printf("normalize: error %d\n", err);
 643       return 1;
 644     }
 645   printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 646   if (err = url_canonicalize(&url))
 647     {
 648       printf("canonicalize: error %d\n", err);
 649       return 1;
 650     }
 651   printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 652   if (err = url_pack(&url, buf4))
 653     {
 654       printf("pack: error %d\n", err);
 655       return 1;
 656     }
 657   printf("pack: %s\n", buf4);
 658   if (err = url_enescape(buf4, buf2))
 659     {
 660       printf("enesc: error %d\n", err);
 661       return 1;
 662     }
 663   printf("enesc: %s\n", buf2);
 664   return 0;
 665 }
 666
 667 #endif
 668
 669 struct component {                 /* One piece of an URL between two separators, as used by url_has_repeated_component() */
 670         const byte *start;         /* First byte of the component inside the URL (not NUL-terminated) */
 671         int length;                /* Length of the component in bytes */
 672         uns count;                 /* Occurrences seen so far; set and updated only on the first occurrence of each distinct component */
 673         u32 hash;                  /* hashf(start, length) */
 674 };
 675
 676 static inline u32
 677 hashf(const byte *start, int length)
 678 {
 679         u32 hf = length;
 680         while (length-- > 0)
 681                 hf = (hf << 8 | hf >> 24) ^ *start++;
 682         return hf;
 683 }
 684
 685 static inline uns
 686 repeat_count(struct component *comp, uns count, uns len)
 687 {
 688         struct component *orig_comp = comp;
 689         uns found = 0;
 690         while (1)
 691         {
 692                 uns i;
 693                 comp += len;
 694                 count -= len;
 695                 found++;
 696                 if (count < len)
 697                         return found;
 698                 for (i=0; i<len; i++)
 699                         if (comp[i].hash != orig_comp[i].hash
 700                         || comp[i].length != orig_comp[i].length
 701                         || memcmp(comp[i].start, orig_comp[i].start, comp[i].length))
 702                                 return found;
 703         }
 704 }
 705
     /*
      * Check an URL for suspicious repetition of its components.
      *
      * The URL is cut into components at every character listed in
      * url_component_separators.  The function returns
      *
      *   1         if a single component occurs more than url_max_occurences
      *             times (tested only when the URL has more components than
      *             that limit; a first occurrence alone never triggers it),
      *   comp_len  if a sequence of comp_len consecutive components
      *             (1 <= comp_len <= url_max_repeat_length) occurs
      *             back-to-back at least url_min_repeat_count times,
      *   0         otherwise.
      *
      * A return value of 1 is therefore ambiguous between the two tests.
      * All working arrays are obtained by alloca().
      */
 706 int
 707 url_has_repeated_component(const byte *url)
 708 {
 709         struct component *comp;
 710         uns comps, comp_len, rep_prefix, hash_size, *hash, *next;
 711         const byte *c;
 712         uns i, j, k;
 713
              /* First pass: count the components (separators + 1) */
 714         for (comps=0, c=url; c; comps++)
 715         {
 716                 c = strpbrk(c, url_component_separators);
 717                 if (c)
 718                         c++;
 719         }
              /* Too few components for either test to fire */
 720         if (comps < url_min_repeat_count && comps <= url_max_occurences)
 721                 return 0;
              /* Second pass: record start and length of every component */
 722         comp = alloca(comps * sizeof(*comp));
 723         for (i=0, c=url; c; i++)
 724         {
 725                 comp[i].start = c;
 726                 c = strpbrk(c, url_component_separators);
 727                 if (c)
 728                 {
 729                         comp[i].length = c - comp[i].start;
 730                         c++;
 731                 }
 732                 else
 733                         comp[i].length = strlen(comp[i].start);
 734         }
 735         ASSERT(i == comps);
 736         for (i=0; i<comps; i++)
 737                 comp[i].hash = hashf(comp[i].start, comp[i].length);
              /* Occurrence test: chained hash table of distinct components */
 738         if (comps > url_max_occurences)
 739         {
 740                 hash_size = next_table_prime(comps);
 741                 hash = alloca(hash_size * sizeof(*hash));
 742                 next = alloca(comps * sizeof(*next));
 743                 memset(hash, 255, hash_size * sizeof(*hash));     /* All-ones (~0U) marks an empty bucket and the end of a chain */
 744                 for (i=0; i<comps; i++)
 745                 {
 746                         j = comp[i].hash % hash_size;
                              /* Walk the chain until an equal component or the end marker is found */
 747                         for (k = hash[j]; ~k && (comp[i].hash != comp[k].hash || comp[i].length != comp[k].length ||
 748                             memcmp(comp[k].start, comp[i].start, comp[i].length)); k = next[k]);
 749                         if (!~k)
 750                         {
                                      /* First occurrence: insert at the head of the bucket */
 751                                 next[i] = hash[j];
 752                                 hash[j] = i;
 753                                 comp[i].count = 1;
 754                         }
 755                         else
 756                         {
                                      /* Seen before: the counter is kept on the first occurrence */
 757                                 if (comp[k].count++ >= url_max_occurences)
 758                                         return 1;
 759                         }
 760                 }
 761         }
              /* Repetition test: try every sequence length and every starting position */
 762         for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
 763                 for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
 764                         if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)
 765                                 return comp_len;
 766         return 0;
 767 }
 766         return 0;
 767 }