lib/url.c

   1 /*
   2  *      UCW Library -- URL Functions
   3  *
   4  *      (c) 1997--2004 Martin Mares <mj@ucw.cz>
   5  *      (c) 2001--2005 Robert Spalek <robert@ucw.cz>
   6  *
   7  *      This software may be freely distributed and used according to the terms
   8  *      of the GNU Lesser General Public License.
   9  *
  10  *      The URL syntax corresponds to RFC 2396 with several exceptions:
  11  *
  12  *         o  Escaping of special characters still follows RFC 1738.
  13  *         o  Interpretation of path parameters follows RFC 1808.
  14  *
  15  *      XXX: The buffer handling in this module is really horrible, but it works.
  16  */
  17
  18 #include "lib/lib.h"
  19 #include "lib/url.h"
  20 #include "lib/chartype.h"
  21 #include "lib/conf2.h"
  22
  23 #include <string.h>
  24 #include <stdlib.h>
  25 #include <stdio.h>
  26 #include <alloca.h>
  27
  28 /* Configuration */
  29
  30 static uns url_ignore_spaces;
  31 static uns url_ignore_underflow;
  32 static byte *url_component_separators = "";
  33 static uns url_min_repeat_count = 0x7fffffff;
  34 static uns url_max_repeat_length = 0;
  35
  36 static struct cf_section url_config = {
  37   CF_ITEMS {
  38     CF_UNS("IgnoreSpaces", &url_ignore_spaces),
  39     CF_UNS("IgnoreUnderflow", &url_ignore_underflow),
  40     CF_STRING("ComponentSeparators", &url_component_separators),
  41     CF_UNS("MinRepeatCount", &url_min_repeat_count),
  42     CF_UNS("MaxRepeatLength", &url_max_repeat_length),
  43     CF_END
  44   }
  45 };
  46
  47 static void CONSTRUCTOR url_init_config(void)
  48 {
  49   cf_declare_section("URL", &url_config, 0);
  50 }
  51
  52 /* Escaping and de-escaping */
  53
  54 static uns
  55 enhex(uns x)
  56 {
  57   return (x<10) ? (x + '0') : (x - 10 + 'A');
  58 }
  59
  60 int
  61 url_deescape(byte *s, byte *d)
  62 {
  63   byte *dstart = d;
  64   byte *end = d + MAX_URL_SIZE - 10;
  65   while (*s)
  66     {
  67       if (d >= end)
  68         return URL_ERR_TOO_LONG;
  69       if (*s == '%')
  70         {
  71           unsigned int val;
  72           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  73             return URL_ERR_INVALID_ESCAPE;
  74           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  75           if (val < 0x20)
  76             return URL_ERR_INVALID_ESCAPED_CHAR;
  77           switch (val)
  78             {
  79             case ';':
  80               val = NCC_SEMICOLON; break;
  81             case '/':
  82               val = NCC_SLASH; break;
  83             case '?':
  84               val = NCC_QUEST; break;
  85             case ':':
  86               val = NCC_COLON; break;
  87             case '@':
  88               val = NCC_AT; break;
  89             case '=':
  90               val = NCC_EQUAL; break;
  91             case '&':
  92               val = NCC_AND; break;
  93             case '#':
  94               val = NCC_HASH; break;
  95             }
  96           *d++ = val;
  97           s += 3;
  98         }
  99       else if (*s > 0x20)
 100         *d++ = *s++;
 101       else if (Cspace(*s))
 102         {
 103           byte *s0 = s;
 104           while (Cspace(*s))
 105             s++;
 106           if (!url_ignore_spaces || !(!*s || d == dstart))
 107             {
 108               while (Cspace(*s0))
 109                 {
 110                   if (d >= end)
 111                     return URL_ERR_TOO_LONG;
 112                   *d++ = *s0++;
 113                 }
 114             }
 115         }
 116       else
 117         return URL_ERR_INVALID_CHAR;
 118     }
 119   *d = 0;
 120   return 0;
 121 }
 122
 123 int
 124 url_enescape(byte *s, byte *d)
 125 {
 126   byte *end = d + MAX_URL_SIZE - 10;
 127   unsigned int c;
 128
 129   while (c = *s)
 130     {
 131       if (d >= end)
 132         return URL_ERR_TOO_LONG;
 133       if (Calnum(c) ||                                                  /* RFC 1738(2.2): Only alphanumerics ... */
 134           c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||   /* ... and several other exceptions ... */
 135           c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
 136           c == ',' ||
 137           c == '/' || c == '?' || c == ':' || c == '@' ||               /* ... and reserved chars used for reserved purpose */
 138           c == '=' || c == '&' || c == '#' || c == ';')
 139         *d++ = *s++;
 140       else
 141         {
 142           uns val = (*s < NCC_MAX) ? NCC_CHARS[*s] : *s;
 143           *d++ = '%';
 144           *d++ = enhex(val >> 4);
 145           *d++ = enhex(val & 0x0f);
 146           s++;
 147         }
 148     }
 149   *d = 0;
 150   return 0;
 151 }
 152
 153 int
 154 url_enescape_friendly(byte *src, byte *dest)
 155 {
 156   byte *end = dest + MAX_URL_SIZE - 10;
 157   while (*src)
 158     {
 159       if (dest >= end)
 160         return URL_ERR_TOO_LONG;
 161       if (*src < NCC_MAX)
 162         *dest++ = NCC_CHARS[*src++];
 163       else if (*src >= 0x20 && *src < 0x7f)
 164         *dest++ = *src++;
 165       else
 166         {
 167           *dest++ = '%';
 168           *dest++ = enhex(*src >> 4);
 169           *dest++ = enhex(*src++ & 0x0f);
 170         }
 171     }
 172   *dest = 0;
 173   return 0;
 174 }
 175
 176 /* Split an URL (several parts may be copied to the destination buffer) */
 177
 178 byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
 179 static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
 180
 181 uns
 182 identify_protocol(byte *p)
 183 {
 184   uns i;
 185
 186   for(i=1; i<URL_PROTO_MAX; i++)
 187     if (!strcasecmp(p, url_proto_names[i]))
 188       return i;
 189   return URL_PROTO_UNKNOWN;
 190 }
 191
 192 int
 193 url_split(byte *s, struct url *u, byte *d)
 194 {
 195   bzero(u, sizeof(struct url));
 196   u->port = ~0;
 197   u->bufend = d + MAX_URL_SIZE - 10;
 198
 199   if (s[0] != '/')                      /* Seek for "protocol:" */
 200     {
 201       byte *p = s;
 202       while (*p && Calnum(*p))
 203         p++;
 204       if (p != s && *p == ':')
 205         {
 206           u->protocol = d;
 207           while (s < p)
 208             *d++ = *s++;
 209           *d++ = 0;
 210           u->protoid = identify_protocol(u->protocol);
 211           s++;
 212           if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
 213             {
 214               /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
 215               int len = d - u->protocol;
 216               d -= len;
 217               s -= len;
 218               u->protocol = NULL;
 219               u->protoid = 0;
 220             }
 221         }
 222     }
 223
 224   if (s[0] == '/')                      /* Host spec or absolute path */
 225     {
 226       if (s[1] == '/')                  /* Host spec */
 227         {
 228           byte *q, *e;
 229           byte *at = NULL;
 230           char *ep;
 231
 232           s += 2;
 233           q = d;
 234           while (*s && *s != '/' && *s != '?')  /* Copy user:passwd@host:port */
 235             {
 236               if (*s != '@')
 237                 *d++ = *s;
 238               else if (!at)
 239                 {
 240                   *d++ = 0;
 241                   at = d;
 242                 }
 243               else                      /* This shouldn't happen with sane URL's, but we need to be sure */
 244                 *d++ = NCC_AT;
 245               s++;
 246             }
 247           *d++ = 0;
 248           if (at)                       /* user:passwd present */
 249             {
 250               u->user = q;
 251               if (e = strchr(q, ':'))
 252                 {
 253                   *e++ = 0;
 254                   u->pass = e;
 255                 }
 256             }
 257           else
 258             at = q;
 259           e = strchr(at, ':');
 260           if (e)                        /* host:port present */
 261             {
 262               uns p;
 263               *e++ = 0;
 264               p = strtoul(e, &ep, 10);
 265               if (ep && *ep || p > 65535)
 266                 return URL_ERR_INVALID_PORT;
 267               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 268                 u->port = p;
 269             }
 270           u->host = at;
 271         }
 272     }
 273
 274   u->rest = s;
 275   u->buf = d;
 276   return 0;
 277 }
 278
 279 /* Normalization according to given base URL */
 280
 281 static uns std_ports[] = URL_DEFPORTS;  /* Default port numbers */
 282
 283 static int
 284 relpath_merge(struct url *u, struct url *b)
 285 {
 286   byte *a = u->rest;
 287   byte *o = b->rest;
 288   byte *d = u->buf;
 289   byte *e = u->bufend;
 290   byte *p;
 291
 292   if (a[0] == '/')                      /* Absolute path => OK */
 293     return 0;
 294   if (o[0] != '/' && o[0] != '?')
 295     return URL_PATH_UNDERFLOW;
 296
 297   if (!a[0])                            /* Empty URL -> inherit everything */
 298     {
 299       u->rest = b->rest;
 300       return 0;
 301     }
 302
 303   u->rest = d;                          /* We know we'll need to copy the path somewhere else */
 304
 305   if (a[0] == '#')                      /* Another fragment */
 306     {
 307       for(p=o; *p && *p != '#'; p++)
 308         ;
 309       goto copy;
 310     }
 311   if (a[0] == '?')                      /* New query */
 312     {
 313       for(p=o; *p && *p != '#' && *p != '?'; p++)
 314         ;
 315       goto copy;
 316     }
 317   if (a[0] == ';')                      /* Change parameters */
 318     {
 319       for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++)
 320         ;
 321       goto copy;
 322     }
 323
 324   p = NULL;                             /* Copy original path and find the last slash */
 325   while (*o && *o != ';' && *o != '?' && *o != '#')
 326     {
 327       if (d >= e)
 328         return URL_ERR_TOO_LONG;
 329       if ((*d++ = *o++) == '/')
 330         p = d;
 331     }
 332   if (!p)
 333     return URL_ERR_REL_NOTHING;
 334   d = p;
 335
 336   while (*a)
 337     {
 338       if (a[0] == '.')
 339         {
 340           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 341             {
 342               a++;
 343               if (a[0])
 344                 a++;
 345               continue;
 346             }
 347           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 348             {
 349               a += 2;
 350               if (a[0])
 351                 a++;
 352               if (d <= u->buf + 1)
 353                 {
 354                   /*
 355                    * RFC 1808 says we should leave ".." as a path segment, but
 356                    * we intentionally break the rule and refuse the URL.
 357                    */
 358                   if (!url_ignore_underflow)
 359                     return URL_PATH_UNDERFLOW;
 360                 }
 361               else
 362                 {
 363                   d--;                  /* Discard trailing slash */
 364                   while (d[-1] != '/')
 365                     d--;
 366                 }
 367               continue;
 368             }
 369         }
 370       while (a[0] && a[0] != '/')
 371         {
 372           if (d >= e)
 373             return URL_ERR_TOO_LONG;
 374           *d++ = *a++;
 375         }
 376       if (a[0])
 377         *d++ = *a++;
 378     }
 379
 380 okay:
 381   *d++ = 0;
 382   u->buf = d;
 383   return 0;
 384
 385 copy:                                   /* Combine part of old URL with the new one */
 386   while (o < p)
 387     if (d < e)
 388       *d++ = *o++;
 389     else
 390       return URL_ERR_TOO_LONG;
 391   while (*a)
 392     if (d < e)
 393       *d++ = *a++;
 394     else
 395       return URL_ERR_TOO_LONG;
 396   goto okay;
 397 }
 398
 399 int
 400 url_normalize(struct url *u, struct url *b)
 401 {
 402   int err;
 403
 404   /* Basic checks */
 405   if (url_proto_path_flags[u->protoid] && (!u->host || !*u->host) ||
 406       !u->host && u->user ||
 407       !u->user && u->pass ||
 408       !u->rest)
 409     return URL_SYNTAX_ERROR;
 410
 411   if (!u->protocol)
 412     {
 413       /* Now we know it's a relative URL. Do we have any base? */
 414       if (!b || !url_proto_path_flags[b->protoid])
 415         return URL_ERR_REL_NOTHING;
 416       u->protocol = b->protocol;
 417       u->protoid = b->protoid;
 418
 419       /* Reference to the same host */
 420       if (!u->host)
 421         {
 422           u->host = b->host;
 423           u->user = b->user;
 424           u->pass = b->pass;
 425           u->port = b->port;
 426           if (err = relpath_merge(u, b))
 427             return err;
 428         }
 429     }
 430
 431   /* Change path "?" to "/?" because it's the true meaning */
 432   if (u->rest[0] == '?')
 433     {
 434       int l = strlen(u->rest);
 435       if (u->bufend - u->buf < l+1)
 436         return URL_ERR_TOO_LONG;
 437       u->buf[0] = '/';
 438       memcpy(u->buf+1, u->rest, l+1);
 439       u->rest = u->buf;
 440       u->buf += l+2;
 441     }
 442
 443   /* Fill in missing info */
 444   if (u->port == ~0U)
 445     u->port = std_ports[u->protoid];
 446
 447   return 0;
 448 }
 449
 450 /* Name canonicalization */
 451
 452 static void
 453 lowercase(byte *b)
 454 {
 455   if (b)
 456     while (*b)
 457       {
 458         if (*b >= 'A' && *b <= 'Z')
 459           *b = *b + 0x20;
 460         b++;
 461       }
 462 }
 463
 464 static void
 465 kill_end_dot(byte *b)
 466 {
 467   byte *k;
 468
 469   if (b)
 470     {
 471       k = b + strlen(b) - 1;
 472       while (k > b && *k == '.')
 473         *k-- = 0;
 474     }
 475 }
 476
 477 int
 478 url_canonicalize(struct url *u)
 479 {
 480   char *c;
 481
 482   lowercase(u->protocol);
 483   lowercase(u->host);
 484   kill_end_dot(u->host);
 485   if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
 486     u->rest = "/";
 487   if (u->rest && (c = strchr(u->rest, '#')))    /* Kill fragment reference */
 488     *c = 0;
 489   return 0;
 490 }
 491
 492 /* Pack a broken-down URL */
 493
 494 static byte *
 495 append(byte *d, byte *s, byte *e)
 496 {
 497   if (d)
 498     while (*s)
 499       {
 500         if (d >= e)
 501           return NULL;
 502         *d++ = *s++;
 503       }
 504   return d;
 505 }
 506
 507 int
 508 url_pack(struct url *u, byte *d)
 509 {
 510   byte *e = d + MAX_URL_SIZE - 10;
 511
 512   if (u->protocol)
 513     {
 514       d = append(d, u->protocol, e);
 515       d = append(d, ":", e);
 516       u->protoid = identify_protocol(u->protocol);
 517     }
 518   if (u->host)
 519     {
 520       d = append(d, "//", e);
 521       if (u->user)
 522         {
 523           d = append(d, u->user, e);
 524           if (u->pass)
 525             {
 526               d = append(d, ":", e);
 527               d = append(d, u->pass, e);
 528             }
 529           d = append(d, "@", e);
 530         }
 531       d = append(d, u->host, e);
 532       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 533         {
 534           char z[10];
 535           sprintf(z, "%d", u->port);
 536           d = append(d, ":", e);
 537           d = append(d, z, e);
 538         }
 539     }
 540   if (u->rest)
 541     d = append(d, u->rest, e);
 542   if (!d)
 543     return URL_ERR_TOO_LONG;
 544   *d = 0;
 545   return 0;
 546 }
 547
 548 /* Error messages */
 549
 550 static char *errmsg[] = {
 551   "Something is wrong",
 552   "Too long",
 553   "Invalid character",
 554   "Invalid escape",
 555   "Invalid escaped character",
 556   "Invalid port number",
 557   "Relative URL not allowed",
 558   "Unknown protocol",
 559   "Syntax error",
 560   "Path underflow"
 561 };
 562
 563 char *
 564 url_error(uns err)
 565 {
 566   if (err >= sizeof(errmsg) / sizeof(char *))
 567     err = 0;
 568   return errmsg[err];
 569 }
 570
 571 /* Standard cookbook recipes */
 572
 573 int
 574 url_canon_split_rel(byte *u, byte *buf1, byte *buf2, struct url *url, struct url *base)
 575 {
 576   int err;
 577
 578   if (err = url_deescape(u, buf1))
 579     return err;
 580   if (err = url_split(buf1, url, buf2))
 581     return err;
 582   if (err = url_normalize(url, base))
 583     return err;
 584   return url_canonicalize(url);
 585 }
 586
 587 int
 588 url_auto_canonicalize_rel(byte *src, byte *dst, struct url *base)
 589 {
 590   byte buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
 591   int err;
 592   struct url ur;
 593
 594   (void)((err = url_canon_split_rel(src, buf1, buf2, &ur, base)) ||
 595    (err = url_pack(&ur, buf3)) ||
 596    (err = url_enescape(buf3, dst)));
 597   return err;
 598 }
 599
 600 /* Testing */
 601
 602 #ifdef TEST
 603
 604 int main(int argc, char **argv)
 605 {
 606   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
 607   int err;
 608   struct url url, url0;
 609   char *base = "http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment";
 610
 611   if (argc != 2 && argc != 3)
 612     return 1;
 613   if (argc == 3)
 614     base = argv[2];
 615   if (err = url_deescape(argv[1], buf1))
 616     {
 617       printf("deesc: error %d\n", err);
 618       return 1;
 619     }
 620   printf("deesc: %s\n", buf1);
 621   if (err = url_split(buf1, &url, buf2))
 622     {
 623       printf("split: error %d\n", err);
 624       return 1;
 625     }
 626   printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 627   if (err = url_split(base, &url0, buf3))
 628     {
 629       printf("split base: error %d\n", err);
 630       return 1;
 631     }
 632   if (err = url_normalize(&url0, NULL))
 633     {
 634       printf("normalize base: error %d\n", err);
 635       return 1;
 636     }
 637   printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest);
 638   if (err = url_normalize(&url, &url0))
 639     {
 640       printf("normalize: error %d\n", err);
 641       return 1;
 642     }
 643   printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 644   if (err = url_canonicalize(&url))
 645     {
 646       printf("canonicalize: error %d\n", err);
 647       return 1;
 648     }
 649   printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 650   if (err = url_pack(&url, buf4))
 651     {
 652       printf("pack: error %d\n", err);
 653       return 1;
 654     }
 655   printf("pack: %s\n", buf4);
 656   if (err = url_enescape(buf4, buf2))
 657     {
 658       printf("enesc: error %d\n", err);
 659       return 1;
 660     }
 661   printf("enesc: %s\n", buf2);
 662   return 0;
 663 }
 664
 665 #endif
 666
 667 struct component {
 668         byte *start;
 669         int length;
 670         u32 hash;
 671 };
 672
 673 static inline u32
 674 hashf(byte *start, int length)
 675 {
 676         u32 hf = length;
 677         while (length-- > 0)
 678                 hf = (hf << 8 | hf >> 24) ^ *start++;
 679         return hf;
 680 }
 681
 682 static inline uns
 683 repeat_count(struct component *comp, uns count, uns len)
 684 {
 685         struct component *orig_comp = comp;
 686         uns found = 0;
 687         while (1)
 688         {
 689                 uns i;
 690                 comp += len;
 691                 count -= len;
 692                 found++;
 693                 if (count < len)
 694                         return found;
 695                 for (i=0; i<len; i++)
 696                         if (comp[i].hash != orig_comp[i].hash
 697                         || comp[i].length != orig_comp[i].length
 698                         || memcmp(comp[i].start, orig_comp[i].start, comp[i].length))
 699                                 return found;
 700         }
 701 }
 702
 703 int
 704 url_has_repeated_component(byte *url)
 705 {
 706         struct component *comp;
 707         uns comps, comp_len, rep_prefix;
 708         byte *c;
 709         uns i;
 710
 711         for (comps=0, c=url; c; comps++)
 712         {
 713                 c = strpbrk(c, url_component_separators);
 714                 if (c)
 715                         c++;
 716         }
 717         if (comps < url_min_repeat_count)
 718                 return 0;
 719         comp = alloca(comps * sizeof(struct component));
 720         for (i=0, c=url; c; i++)
 721         {
 722                 comp[i].start = c;
 723                 c = strpbrk(c, url_component_separators);
 724                 if (c)
 725                 {
 726                         comp[i].length = c - comp[i].start;
 727                         c++;
 728                 }
 729                 else
 730                         comp[i].length = strlen(comp[i].start);
 731         }
 732         ASSERT(i == comps);
 733         for (i=0; i<comps; i++)
 734                 comp[i].hash = hashf(comp[i].start, comp[i].length);
 735         for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
 736                 for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
 737                         if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)
 738                                 return comp_len;
 739         return 0;
 740 }