lib/url.c

   1 /*
   2  *      UCW Library -- URL Functions
   3  *
   4  *      (c) 1997--2004 Martin Mares <mj@ucw.cz>
   5  *      (c) 2001--2005 Robert Spalek <robert@ucw.cz>
   6  *
   7  *      This software may be freely distributed and used according to the terms
   8  *      of the GNU Lesser General Public License.
   9  *
  10  *      The URL syntax corresponds to RFC 2396 with several exceptions:
  11  *
  12  *         o  Escaping of special characters still follows RFC 1738.
  13  *         o  Interpretation of path parameters follows RFC 1808.
  14  *
  15  *      XXX: The buffer handling in this module is really horrible, but it works.
  16  */
  17
  18 #include "lib/lib.h"
  19 #include "lib/url.h"
  20 #include "lib/chartype.h"
  21 #include "lib/conf.h"
  22
  23 #include <string.h>
  24 #include <stdlib.h>
  25 #include <stdio.h>
  26 #include <alloca.h>
  27
  28 /* Configuration */
  29
  30 static uns url_ignore_spaces;		/* Non-zero: url_deescape() drops whitespace runs at the very start or end of the URL */
  31 static uns url_ignore_underflow;		/* Non-zero: relpath_merge() ignores ".." at the root instead of failing with URL_PATH_UNDERFLOW */
  32 static byte *url_component_separators = "";	/* Character set passed to strpbrk() when url_has_repeated_component() cuts an URL into components */
  33 static uns url_min_repeat_count = 0x7fffffff;	/* Repetition threshold; URLs with fewer components than this are never reported */
  34 static uns url_max_repeat_length = 0;	/* Longest repeated sequence (in components) that is searched for; 0 means the search loop never runs */
  35
     /*
      * Table binding the variables above to item names in the "URL" section.
      * NOTE(review): the meaning of CT_* and of cf_register() comes from
      * lib/conf.h, which is not visible here -- presumably the configuration
      * parser fills the variables in through these pointers.
      */
  36 static struct cfitem url_config[] = {
  37   { "URL",                              CT_SECTION,     NULL },
  38   { "IgnoreSpaces",                     CT_INT,         &url_ignore_spaces },
  39   { "IgnoreUnderflow",                  CT_INT,         &url_ignore_underflow },
  40   { "ComponentSeparators",              CT_STRING,      &url_component_separators },
  41   { "MinRepeatCount",                   CT_INT,         &url_min_repeat_count },
  42   { "MaxRepeatLength",                  CT_INT,         &url_max_repeat_length },
  43   { NULL,                               CT_STOP,        NULL }
  44 };
  45
     /* Hands the table above to cf_register(); CONSTRUCTOR presumably makes this run before main() -- confirm in lib/lib.h */
  46 static void CONSTRUCTOR url_init_config(void)
  47 {
  48   cf_register(url_config);
  49 }
  50
  51 /* Escaping and de-escaping */
  52
  /* Map a nibble value to its hexadecimal ASCII digit: 0..9 -> '0'..'9', 10..15 -> 'A'..'F'. */
  static uns
  enhex(uns x)
  {
    if (x < 10)
      return x + '0';
    return x - 10 + 'A';
  }
  58
  59 int
  60 url_deescape(byte *s, byte *d)
  61 {
  62   byte *dstart = d;
  63   byte *end = d + MAX_URL_SIZE - 10;
  64   while (*s)
  65     {
  66       if (d >= end)
  67         return URL_ERR_TOO_LONG;
  68       if (*s == '%')
  69         {
  70           unsigned int val;
  71           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  72             return URL_ERR_INVALID_ESCAPE;
  73           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  74           if (val < 0x20)
  75             return URL_ERR_INVALID_ESCAPED_CHAR;
  76           switch (val)
  77             {
  78             case ';':
  79               val = NCC_SEMICOLON; break;
  80             case '/':
  81               val = NCC_SLASH; break;
  82             case '?':
  83               val = NCC_QUEST; break;
  84             case ':':
  85               val = NCC_COLON; break;
  86             case '@':
  87               val = NCC_AT; break;
  88             case '=':
  89               val = NCC_EQUAL; break;
  90             case '&':
  91               val = NCC_AND; break;
  92             case '#':
  93               val = NCC_HASH; break;
  94             }
  95           *d++ = val;
  96           s += 3;
  97         }
  98       else if (*s > 0x20)
  99         *d++ = *s++;
 100       else if (Cspace(*s))
 101         {
 102           byte *s0 = s;
 103           while (Cspace(*s))
 104             s++;
 105           if (!url_ignore_spaces || !(!*s || d == dstart))
 106             {
 107               while (Cspace(*s0))
 108                 {
 109                   if (d >= end)
 110                     return URL_ERR_TOO_LONG;
 111                   *d++ = *s0++;
 112                 }
 113             }
 114         }
 115       else
 116         return URL_ERR_INVALID_CHAR;
 117     }
 118   *d = 0;
 119   return 0;
 120 }
 121
 122 int
 123 url_enescape(byte *s, byte *d)
 124 {
 125   byte *end = d + MAX_URL_SIZE - 10;
 126   unsigned int c;
 127
 128   while (c = *s)
 129     {
 130       if (d >= end)
 131         return URL_ERR_TOO_LONG;
 132       if (Calnum(c) ||                                                  /* RFC 1738(2.2): Only alphanumerics ... */
 133           c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||   /* ... and several other exceptions ... */
 134           c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
 135           c == ',' ||
 136           c == '/' || c == '?' || c == ':' || c == '@' ||               /* ... and reserved chars used for reserved purpose */
 137           c == '=' || c == '&' || c == '#' || c == ';')
 138         *d++ = *s++;
 139       else
 140         {
 141           uns val = (*s < NCC_MAX) ? NCC_CHARS[*s] : *s;
 142           *d++ = '%';
 143           *d++ = enhex(val >> 4);
 144           *d++ = enhex(val & 0x0f);
 145           s++;
 146         }
 147     }
 148   *d = 0;
 149   return 0;
 150 }
 151
 152 int
 153 url_enescape_friendly(byte *src, byte *dest)
 154 {
 155   byte *end = dest + MAX_URL_SIZE - 10;
 156   while (*src)
 157     {
 158       if (dest >= end)
 159         return URL_ERR_TOO_LONG;
 160       if (*src < NCC_MAX)
 161         *dest++ = NCC_CHARS[*src++];
 162       else if (*src >= 0x20 && *src < 0x7f)
 163         *dest++ = *src++;
 164       else
 165         {
 166           *dest++ = '%';
 167           *dest++ = enhex(*src >> 4);
 168           *dest++ = enhex(*src++ & 0x0f);
 169         }
 170     }
 171   *dest = 0;
 172   return 0;
 173 }
 174
 175 /* Split a URL (several parts may be copied to the destination buffer) */
 176
 177 byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;	/* Protocol names indexed by protocol ID; identify_protocol() searches entries 1..URL_PROTO_MAX-1 */
     /*
      * Per-protocol flag, indexed by protocol ID.  Non-zero means the protocol
      * needs a "//host" part: url_split() then refuses to treat "name:" as a
      * protocol unless "//" follows, url_normalize() demands a non-empty host
      * and such a base URL, and url_canonicalize() turns an empty path into "/".
      */
 178 static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
 179
 180 uns
 181 identify_protocol(byte *p)
 182 {
 183   uns i;
 184
 185   for(i=1; i<URL_PROTO_MAX; i++)
 186     if (!strcasecmp(p, url_proto_names[i]))
 187       return i;
 188   return URL_PROTO_UNKNOWN;
 189 }
 190
 191 int
 192 url_split(byte *s, struct url *u, byte *d)
 193 {
 194   bzero(u, sizeof(struct url));
 195   u->port = ~0;
 196   u->bufend = d + MAX_URL_SIZE - 10;
 197
 198   if (s[0] != '/')                      /* Seek for "protocol:" */
 199     {
 200       byte *p = s;
 201       while (*p && Calnum(*p))
 202         p++;
 203       if (p != s && *p == ':')
 204         {
 205           u->protocol = d;
 206           while (s < p)
 207             *d++ = *s++;
 208           *d++ = 0;
 209           u->protoid = identify_protocol(u->protocol);
 210           s++;
 211           if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
 212             {
 213               /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
 214               int len = d - u->protocol;
 215               d -= len;
 216               s -= len;
 217               u->protocol = NULL;
 218               u->protoid = 0;
 219             }
 220         }
 221     }
 222
 223   if (s[0] == '/')                      /* Host spec or absolute path */
 224     {
 225       if (s[1] == '/')                  /* Host spec */
 226         {
 227           byte *q, *e;
 228           byte *at = NULL;
 229           char *ep;
 230
 231           s += 2;
 232           q = d;
 233           while (*s && *s != '/' && *s != '?')  /* Copy user:passwd@host:port */
 234             {
 235               if (*s != '@')
 236                 *d++ = *s;
 237               else if (!at)
 238                 {
 239                   *d++ = 0;
 240                   at = d;
 241                 }
 242               else                      /* This shouldn't happen with sane URL's, but we need to be sure */
 243                 *d++ = NCC_AT;
 244               s++;
 245             }
 246           *d++ = 0;
 247           if (at)                       /* user:passwd present */
 248             {
 249               u->user = q;
 250               if (e = strchr(q, ':'))
 251                 {
 252                   *e++ = 0;
 253                   u->pass = e;
 254                 }
 255             }
 256           else
 257             at = q;
 258           e = strchr(at, ':');
 259           if (e)                        /* host:port present */
 260             {
 261               uns p;
 262               *e++ = 0;
 263               p = strtoul(e, &ep, 10);
 264               if (ep && *ep || p > 65535)
 265                 return URL_ERR_INVALID_PORT;
 266               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 267                 u->port = p;
 268             }
 269           u->host = at;
 270         }
 271     }
 272
 273   u->rest = s;
 274   u->buf = d;
 275   return 0;
 276 }
 277
 278 /* Normalization according to given base URL */
 279
 280 static uns std_ports[] = URL_DEFPORTS;  /* Default port numbers, indexed by protocol ID (read by url_normalize() and url_pack()) */
 281
 /*
  * Resolve the relative reference u->rest against the path of base URL b.
  *
  * The merged path is built in u's scratch area (u->buf .. u->bufend); on
  * success u->rest points to it and u->buf is moved past its terminating
  * NUL.  Two cases need no copying: an absolute u->rest is left alone and
  * an empty one simply inherits b->rest.
  *
  * Returns 0 on success, URL_PATH_UNDERFLOW (base path starts with neither
  * '/' nor '?', or ".." climbs above the root while url_ignore_underflow is
  * off), URL_ERR_TOO_LONG, or URL_ERR_REL_NOTHING (no '/' in the base path).
  */
 282 static int
 283 relpath_merge(struct url *u, struct url *b)
 284 {
 285   byte *a = u->rest;			/* The relative reference being resolved */
 286   byte *o = b->rest;			/* Path (+params/query/fragment) of the base */
 287   byte *d = u->buf;			/* Output cursor */
 288   byte *e = u->bufend;			/* Output limit */
 289   byte *p;				/* End of the part of the base to keep / position after last slash */
 290
 291   if (a[0] == '/')                      /* Absolute path => OK */
 292     return 0;
 293   if (o[0] != '/' && o[0] != '?')
 294     return URL_PATH_UNDERFLOW;
 295
 296   if (!a[0])                            /* Empty URL -> inherit everything */
 297     {
 298       u->rest = b->rest;
 299       return 0;
 300     }
 301
 302   u->rest = d;                          /* We know we'll need to copy the path somewhere else */
 303
       /*
        * References that replace only the tail of the base: find where that
        * tail starts in the base (p) and let the "copy" code glue o..p and a.
        */
 304   if (a[0] == '#')                      /* Another fragment */
 305     {
 306       for(p=o; *p && *p != '#'; p++)
 307         ;
 308       goto copy;
 309     }
 310   if (a[0] == '?')                      /* New query */
 311     {
 312       for(p=o; *p && *p != '#' && *p != '?'; p++)
 313         ;
 314       goto copy;
 315     }
 316   if (a[0] == ';')                      /* Change parameters */
 317     {
 318       for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++)
 319         ;
 320       goto copy;
 321     }
 322
 323   p = NULL;                             /* Copy original path and find the last slash */
 324   while (*o && *o != ';' && *o != '?' && *o != '#')
 325     {
 326       if (d >= e)
 327         return URL_ERR_TOO_LONG;
 328       if ((*d++ = *o++) == '/')
 329         p = d;
 330     }
 331   if (!p)
 332     return URL_ERR_REL_NOTHING;
 333   d = p;				/* Drop the last segment of the base, keep its directory */
 334
       /*
        * Append the reference segment by segment, resolving "." and "..".
        * NOTE(review): the loop runs to the end of `a', so slashes inside a
        * query or fragment of the reference are treated as segment separators
        * too -- confirm this is intended.
        */
 335   while (*a)
 336     {
 337       if (a[0] == '.')
 338         {
 339           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 340             {
 341               a++;
 342               if (a[0])
 343                 a++;
 344               continue;
 345             }
 346           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 347             {
 348               a += 2;
 349               if (a[0])
 350                 a++;
 351               if (d <= u->buf + 1)	/* Only the leading "/" is left in the output */
 352                 {
 353                   /*
 354                    * RFC 1808 says we should leave ".." as a path segment, but
 355                    * we intentionally break the rule and refuse the URL.
 356                    */
 357                   if (!url_ignore_underflow)
 358                     return URL_PATH_UNDERFLOW;
 359                 }
 360               else
 361                 {
 362                   d--;                  /* Discard trailing slash */
 363                   while (d[-1] != '/')	/* ... and the segment in front of it */
 364                     d--;
 365                 }
 366               continue;
 367             }
 368         }
 369       while (a[0] && a[0] != '/')	/* Ordinary segment */
 370         {
 371           if (d >= e)
 372             return URL_ERR_TOO_LONG;
 373           *d++ = *a++;
 374         }
           /* NOTE(review): this write and the NUL at "okay" are not checked against `e'; presumably covered by the 10 bytes url_split() keeps behind bufend -- confirm */
 375       if (a[0])
 376         *d++ = *a++;
 377     }
 378
 379 okay:
 380   *d++ = 0;
 381   u->buf = d;			/* Scratch space after the merged path stays available */
 382   return 0;
 383
 384 copy:                                   /* Combine part of old URL with the new one */
 385   while (o < p)
 386     if (d < e)
 387       *d++ = *o++;
 388     else
 389       return URL_ERR_TOO_LONG;
 390   while (*a)
 391     if (d < e)
 392       *d++ = *a++;
 393     else
 394       return URL_ERR_TOO_LONG;
 395   goto okay;
 396 }
 397
 398 int
 399 url_normalize(struct url *u, struct url *b)
 400 {
 401   int err;
 402
 403   /* Basic checks */
 404   if (url_proto_path_flags[u->protoid] && (!u->host || !*u->host) ||
 405       !u->host && u->user ||
 406       !u->user && u->pass ||
 407       !u->rest)
 408     return URL_SYNTAX_ERROR;
 409
 410   if (!u->protocol)
 411     {
 412       /* Now we know it's a relative URL. Do we have any base? */
 413       if (!b || !url_proto_path_flags[b->protoid])
 414         return URL_ERR_REL_NOTHING;
 415       u->protocol = b->protocol;
 416       u->protoid = b->protoid;
 417
 418       /* Reference to the same host */
 419       if (!u->host)
 420         {
 421           u->host = b->host;
 422           u->user = b->user;
 423           u->pass = b->pass;
 424           u->port = b->port;
 425           if (err = relpath_merge(u, b))
 426             return err;
 427         }
 428     }
 429
 430   /* Change path "?" to "/?" because it's the true meaning */
 431   if (u->rest[0] == '?')
 432     {
 433       int l = strlen(u->rest);
 434       if (u->bufend - u->buf < l+1)
 435         return URL_ERR_TOO_LONG;
 436       u->buf[0] = '/';
 437       memcpy(u->buf+1, u->rest, l+1);
 438       u->rest = u->buf;
 439       u->buf += l+2;
 440     }
 441
 442   /* Fill in missing info */
 443   if (u->port == ~0U)
 444     u->port = std_ports[u->protoid];
 445
 446   return 0;
 447 }
 448
 449 /* Name canonicalization */
 450
 /* Convert ASCII upper-case letters of string `b' to lower case in place; NULL is accepted and ignored. */
 static void
 lowercase(byte *b)
 {
   if (!b)
     return;
   for (; *b; b++)
     if (*b >= 'A' && *b <= 'Z')
       *b += 'a' - 'A';
 }
 462
 /*
  * Strip trailing dots from string `b' in place (e.g. "host.org." becomes
  * "host.org").  The first character is never removed, so "." stays "." and
  * "..." becomes ".".  NULL and the empty string are left untouched.
  *
  * Works with an index rather than a pointer to the last character, because
  * for an empty string such a pointer would lie before the start of the
  * array, which is undefined behaviour even if it is never dereferenced.
  */
 static void
 kill_end_dot(byte *b)
 {
   size_t len;

   if (!b)
     return;
   len = strlen((char *) b);
   while (len > 1 && b[len-1] == '.')
     b[--len] = 0;
 }
 475
 476 int
 477 url_canonicalize(struct url *u)
 478 {
 479   char *c;
 480
 481   lowercase(u->protocol);
 482   lowercase(u->host);
 483   kill_end_dot(u->host);
 484   if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
 485     u->rest = "/";
 486   if (u->rest && (c = strchr(u->rest, '#')))    /* Kill fragment reference */
 487     *c = 0;
 488   return 0;
 489 }
 490
 491 /* Pack a broken-down URL */
 492
 /*
  * Copy string `s' (without its NUL) to `d', never writing at or beyond `e'.
  * Returns the new end of output, or NULL if the string did not fit.  A NULL
  * `d' is passed through, so a chain of calls needs only one check at the end.
  */
 static byte *
 append(byte *d, byte *s, byte *e)
 {
   if (!d)
     return NULL;
   for (; *s; s++)
     {
       if (d >= e)
         return NULL;
       *d++ = *s;
     }
   return d;
 }
 505
 506 int
 507 url_pack(struct url *u, byte *d)
 508 {
 509   byte *e = d + MAX_URL_SIZE - 10;
 510
 511   if (u->protocol)
 512     {
 513       d = append(d, u->protocol, e);
 514       d = append(d, ":", e);
 515       u->protoid = identify_protocol(u->protocol);
 516     }
 517   if (u->host)
 518     {
 519       d = append(d, "//", e);
 520       if (u->user)
 521         {
 522           d = append(d, u->user, e);
 523           if (u->pass)
 524             {
 525               d = append(d, ":", e);
 526               d = append(d, u->pass, e);
 527             }
 528           d = append(d, "@", e);
 529         }
 530       d = append(d, u->host, e);
 531       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 532         {
 533           char z[10];
 534           sprintf(z, "%d", u->port);
 535           d = append(d, ":", e);
 536           d = append(d, z, e);
 537         }
 538     }
 539   if (u->rest)
 540     d = append(d, u->rest, e);
 541   if (!d)
 542     return URL_ERR_TOO_LONG;
 543   *d = 0;
 544   return 0;
 545 }
 546
 547 /* Error messages */
 548
 /* Messages for the error codes returned by this module, indexed by code */
 static char *errmsg[] = {
   /* 0 */ "Something is wrong",
   /* 1 */ "Too long",
   /* 2 */ "Invalid character",
   /* 3 */ "Invalid escape",
   /* 4 */ "Invalid escaped character",
   /* 5 */ "Invalid port number",
   /* 6 */ "Relative URL not allowed",
   /* 7 */ "Unknown protocol",
   /* 8 */ "Syntax error",
   /* 9 */ "Path underflow"
 };

 /* Translate error code `err' to its message; codes outside the table yield message 0. */
 char *
 url_error(uns err)
 {
   uns known = sizeof(errmsg) / sizeof(errmsg[0]);

   return errmsg[(err < known) ? err : 0];
 }
 569
 570 /* Standard cookbook recipes */
 571
 572 int
 573 url_canon_split_rel(byte *u, byte *buf1, byte *buf2, struct url *url, struct url *base)
 574 {
 575   int err;
 576
 577   if (err = url_deescape(u, buf1))
 578     return err;
 579   if (err = url_split(buf1, url, buf2))
 580     return err;
 581   if (err = url_normalize(url, base))
 582     return err;
 583   return url_canonicalize(url);
 584 }
 585
 586 int
 587 url_auto_canonicalize_rel(byte *src, byte *dst, struct url *base)
 588 {
 589   byte buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
 590   int err;
 591   struct url ur;
 592
 593   (void)((err = url_canon_split_rel(src, buf1, buf2, &ur, base)) ||
 594    (err = url_pack(&ur, buf3)) ||
 595    (err = url_enescape(buf3, dst)));
 596   return err;
 597 }
 598
 599 /* Testing */
 600
 601 #ifdef TEST
 602
 603 int main(int argc, char **argv)
 604 {
 605   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
 606   int err;
 607   struct url url, url0;
 608   char *base = "http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment";
 609
 610   if (argc != 2 && argc != 3)
 611     return 1;
 612   if (argc == 3)
 613     base = argv[2];
 614   if (err = url_deescape(argv[1], buf1))
 615     {
 616       printf("deesc: error %d\n", err);
 617       return 1;
 618     }
 619   printf("deesc: %s\n", buf1);
 620   if (err = url_split(buf1, &url, buf2))
 621     {
 622       printf("split: error %d\n", err);
 623       return 1;
 624     }
 625   printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 626   if (err = url_split(base, &url0, buf3))
 627     {
 628       printf("split base: error %d\n", err);
 629       return 1;
 630     }
 631   if (err = url_normalize(&url0, NULL))
 632     {
 633       printf("normalize base: error %d\n", err);
 634       return 1;
 635     }
 636   printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest);
 637   if (err = url_normalize(&url, &url0))
 638     {
 639       printf("normalize: error %d\n", err);
 640       return 1;
 641     }
 642   printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 643   if (err = url_canonicalize(&url))
 644     {
 645       printf("canonicalize: error %d\n", err);
 646       return 1;
 647     }
 648   printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 649   if (err = url_pack(&url, buf4))
 650     {
 651       printf("pack: error %d\n", err);
 652       return 1;
 653     }
 654   printf("pack: %s\n", buf4);
 655   if (err = url_enescape(buf4, buf2))
 656     {
 657       printf("enesc: error %d\n", err);
 658       return 1;
 659     }
 660   printf("enesc: %s\n", buf2);
 661   return 0;
 662 }
 663
 664 #endif
 665
 666 struct component {			/* One piece of an URL between two separators, see url_has_repeated_component() */
 667         byte *start;			/* First byte of the piece inside the URL string (not NUL-terminated) */
 668         int length;			/* Number of bytes in the piece, separator excluded */
 669         u32 hash;			/* hashf(start, length); compared first in repeat_count() */
 670 };
 671
 /*
  * Hash `length' bytes starting at `start': the value is seeded with the
  * length and, for every byte, rotated left by 8 bits and XORed with it.
  */
 static inline u32
 hashf(byte *start, int length)
 {
   u32 hf = length;
   int i;

   for (i = 0; i < length; i++)
     hf = ((hf << 8) | (hf >> 24)) ^ start[i];
   return hf;
 }
 680
 681 static inline uns
 682 repeat_count(struct component *comp, uns count, uns len)
 683 {
 684         struct component *orig_comp = comp;
 685         uns found = 0;
 686         while (1)
 687         {
 688                 uns i;
 689                 comp += len;
 690                 count -= len;
 691                 found++;
 692                 if (count < len)
 693                         return found;
 694                 for (i=0; i<len; i++)
 695                         if (comp[i].hash != orig_comp[i].hash
 696                         || comp[i].length != orig_comp[i].length
 697                         || memcmp(comp[i].start, orig_comp[i].start, comp[i].length))
 698                                 return found;
 699         }
 700 }
 701
 702 int
 703 url_has_repeated_component(byte *url)
 704 {
 705         struct component *comp;
 706         uns comps, comp_len, rep_prefix;
 707         byte *c;
 708         uns i;
 709
 710         for (comps=0, c=url; c; comps++)
 711         {
 712                 c = strpbrk(c, url_component_separators);
 713                 if (c)
 714                         c++;
 715         }
 716         if (comps < url_min_repeat_count)
 717                 return 0;
 718         comp = alloca(comps * sizeof(struct component));
 719         for (i=0, c=url; c; i++)
 720         {
 721                 comp[i].start = c;
 722                 c = strpbrk(c, url_component_separators);
 723                 if (c)
 724                 {
 725                         comp[i].length = c - comp[i].start;
 726                         c++;
 727                 }
 728                 else
 729                         comp[i].length = strlen(comp[i].start);
 730         }
 731         ASSERT(i == comps);
 732         for (i=0; i<comps; i++)
 733                 comp[i].hash = hashf(comp[i].start, comp[i].length);
 734         for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
 735                 for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
 736                         if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)
 737                                 return comp_len;
 738         return 0;
 739 }