ucw/url.c

   1 /*
   2  *      UCW Library -- URL Functions
   3  *
   4  *      (c) 1997--2004 Martin Mares <mj@ucw.cz>
   5  *      (c) 2001--2005 Robert Spalek <robert@ucw.cz>
   6  *
   7  *      This software may be freely distributed and used according to the terms
   8  *      of the GNU Lesser General Public License.
   9  *
  10  *      XXX: The buffer handling in this module is really horrible, but it works.
  11  */
  12
  13 #include "ucw/lib.h"
  14 #include "ucw/url.h"
  15 #include "ucw/chartype.h"
  16 #include "ucw/conf.h"
  17 #include "ucw/prime.h"
  18
  19 #include <string.h>
  20 #include <stdlib.h>
  21 #include <stdio.h>
  22 #include <alloca.h>
  23
  24 /* Configuration */
  25
  26 static uns url_ignore_spaces;
  27 static uns url_ignore_underflow;
  28 static char *url_component_separators = "";
  29 static uns url_min_repeat_count = 0x7fffffff;
  30 static uns url_max_repeat_length = 0;
  31 static uns url_max_occurences = ~0U;
  32
  33 static struct cf_section url_config = {
  34   CF_ITEMS {
  35     CF_UNS("IgnoreSpaces", &url_ignore_spaces),
  36     CF_UNS("IgnoreUnderflow", &url_ignore_underflow),
  37     CF_STRING("ComponentSeparators", &url_component_separators),
  38     CF_UNS("MinRepeatCount", &url_min_repeat_count),
  39     CF_UNS("MaxRepeatLength", &url_max_repeat_length),
  40     CF_UNS("MaxOccurences", &url_max_occurences),
  41     CF_END
  42   }
  43 };
  44
  45 static void CONSTRUCTOR url_init_config(void)
  46 {
  47   cf_declare_section("URL", &url_config, 0);
  48 }
  49
  50 /* Escaping and de-escaping */
  51
  52 static uns
  53 enhex(uns x)
  54 {
  55   return (x<10) ? (x + '0') : (x - 10 + 'A');
  56 }
  57
  58 int
  59 url_deescape(const char *s, char *d)
  60 {
  61   char *dstart = d;
  62   char *end = d + MAX_URL_SIZE - 10;
  63   while (*s)
  64     {
  65       if (d >= end)
  66         return URL_ERR_TOO_LONG;
  67       if (*s == '%')
  68         {
  69           unsigned int val;
  70           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  71             return URL_ERR_INVALID_ESCAPE;
  72           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  73           if (val < 0x20)
  74             return URL_ERR_INVALID_ESCAPED_CHAR;
  75           switch (val)
  76             {
  77             case ';':
  78               val = NCC_SEMICOLON; break;
  79             case '/':
  80               val = NCC_SLASH; break;
  81             case '?':
  82               val = NCC_QUEST; break;
  83             case ':':
  84               val = NCC_COLON; break;
  85             case '@':
  86               val = NCC_AT; break;
  87             case '=':
  88               val = NCC_EQUAL; break;
  89             case '&':
  90               val = NCC_AND; break;
  91             case '#':
  92               val = NCC_HASH; break;
  93             case '$':
  94               val = NCC_DOLLAR; break;
  95             case '+':
  96               val = NCC_PLUS; break;
  97             case ',':
  98               val = NCC_COMMA; break;
  99             }
 100           *d++ = val;
 101           s += 3;
 102         }
 103       else if ((byte) *s > 0x20)
 104         *d++ = *s++;
 105       else if (Cspace(*s))
 106         {
 107           const char *s0 = s;
 108           while (Cspace(*s))
 109             s++;
 110           if (!url_ignore_spaces || !(!*s || d == dstart))
 111             {
 112               while (Cspace(*s0))
 113                 {
 114                   if (d >= end)
 115                     return URL_ERR_TOO_LONG;
 116                   *d++ = *s0++;
 117                 }
 118             }
 119         }
 120       else
 121         return URL_ERR_INVALID_CHAR;
 122     }
 123   *d = 0;
 124   return 0;
 125 }
 126
 127 int
 128 url_enescape(const char *s, char *d)
 129 {
 130   char *end = d + MAX_URL_SIZE - 10;
 131   unsigned int c;
 132
 133   while (c = *s)
 134     {
 135       if (d >= end)
 136         return URL_ERR_TOO_LONG;
 137       if (Calnum(c) ||                                                  /* RFC 2396 (2.1-2.3): Only alphanumerics ... */
 138           c == '-' || c == '_' || c == '.' || c == '+' || c == '~' ||   /* ... and several other exceptions ... */
 139           c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
 140           c == '/' || c == '?' || c == ':' || c == '@' ||               /* ... and reserved chars used for reserved purpose */
 141           c == '=' || c == '&' || c == '#' || c == ';' ||
 142           c == '$' || c == '+' || c == ',')
 143         *d++ = *s++;
 144       else
 145         {
 146           uns val = ((byte)*s < NCC_MAX) ? NCC_CHARS[(byte)*s] : *s;
 147           *d++ = '%';
 148           *d++ = enhex(val >> 4);
 149           *d++ = enhex(val & 0x0f);
 150           s++;
 151         }
 152     }
 153   *d = 0;
 154   return 0;
 155 }
 156
 157 int
 158 url_enescape_friendly(const char *src, char *dest)
 159 {
 160   char *end = dest + MAX_URL_SIZE - 10;
 161   const byte *srcb = src;
 162   while (*srcb)
 163     {
 164       if (dest >= end)
 165         return URL_ERR_TOO_LONG;
 166       if (*srcb < NCC_MAX)
 167         *dest++ = NCC_CHARS[*srcb++];
 168       else if (*srcb >= 0x20 && *srcb < 0x7f)
 169         *dest++ = *srcb++;
 170       else
 171         {
 172           *dest++ = '%';
 173           *dest++ = enhex(*srcb >> 4);
 174           *dest++ = enhex(*srcb++ & 0x0f);
 175         }
 176     }
 177   *dest = 0;
 178   return 0;
 179 }
 180
 181 /* Split an URL (several parts may be copied to the destination buffer) */
 182
 183 char *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
 184 static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
 185
 186 uns
 187 url_identify_protocol(const char *p)
 188 {
 189   uns i;
 190
 191   for(i=1; i<URL_PROTO_MAX; i++)
 192     if (!strcasecmp(p, url_proto_names[i]))
 193       return i;
 194   return URL_PROTO_UNKNOWN;
 195 }
 196
 197 int
 198 url_split(char *s, struct url *u, char *d)
 199 {
 200   bzero(u, sizeof(struct url));
 201   u->port = ~0;
 202   u->bufend = d + MAX_URL_SIZE - 10;
 203
 204   if (s[0] != '/')                      /* Seek for "protocol:" */
 205     {
 206       char *p = s;
 207       while (*p && Calnum(*p))
 208         p++;
 209       if (p != s && *p == ':')
 210         {
 211           u->protocol = d;
 212           while (s < p)
 213             *d++ = *s++;
 214           *d++ = 0;
 215           u->protoid = url_identify_protocol(u->protocol);
 216           s++;
 217           if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
 218             {
 219               /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
 220               int len = d - u->protocol;
 221               d -= len;
 222               s -= len;
 223               u->protocol = NULL;
 224               u->protoid = 0;
 225             }
 226         }
 227     }
 228
 229   if (s[0] == '/')                      /* Host spec or absolute path */
 230     {
 231       if (s[1] == '/')                  /* Host spec */
 232         {
 233           char *q, *e;
 234           char *at = NULL;
 235           char *ep;
 236
 237           s += 2;
 238           q = d;
 239           while (*s && *s != '/' && *s != '?')  /* Copy user:passwd@host:port */
 240             {
 241               if (*s != '@')
 242                 *d++ = *s;
 243               else if (!at)
 244                 {
 245                   *d++ = 0;
 246                   at = d;
 247                 }
 248               else                      /* This shouldn't happen with sane URL's, but we need to be sure */
 249                 *d++ = NCC_AT;
 250               s++;
 251             }
 252           *d++ = 0;
 253           if (at)                       /* user:passwd present */
 254             {
 255               u->user = q;
 256               if (e = strchr(q, ':'))
 257                 {
 258                   *e++ = 0;
 259                   u->pass = e;
 260                 }
 261             }
 262           else
 263             at = q;
 264           e = strchr(at, ':');
 265           if (e)                        /* host:port present */
 266             {
 267               uns p;
 268               *e++ = 0;
 269               p = strtoul(e, &ep, 10);
 270               if (ep && *ep || p > 65535)
 271                 return URL_ERR_INVALID_PORT;
 272               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 273                 u->port = p;
 274             }
 275           u->host = at;
 276         }
 277     }
 278
 279   u->rest = s;
 280   u->buf = d;
 281   return 0;
 282 }
 283
 284 /* Normalization according to given base URL */
 285
 286 static uns std_ports[] = URL_DEFPORTS;  /* Default port numbers */
 287
 288 static int
 289 relpath_merge(struct url *u, struct url *b)
 290 {
 291   char *a = u->rest;
 292   char *o = b->rest;
 293   char *d = u->buf;
 294   char *e = u->bufend;
 295   char *p;
 296
 297   if (a[0] == '/')                      /* Absolute path => OK */
 298     return 0;
 299   if (o[0] != '/' && o[0] != '?')
 300     return URL_PATH_UNDERFLOW;
 301
 302   if (!a[0])                            /* Empty URL -> inherit everything */
 303     {
 304       u->rest = b->rest;
 305       return 0;
 306     }
 307
 308   u->rest = d;                          /* We know we'll need to copy the path somewhere else */
 309
 310   if (a[0] == '#')                      /* Another fragment */
 311     {
 312       for(p=o; *p && *p != '#'; p++)
 313         ;
 314       goto copy;
 315     }
 316   if (a[0] == '?')                      /* New query */
 317     {
 318       for(p=o; *p && *p != '#' && *p != '?'; p++)
 319         ;
 320       goto copy;
 321     }
 322
 323   p = NULL;                             /* Copy original path and find the last slash */
 324   while (*o && *o != '?' && *o != '#')
 325     {
 326       if (d >= e)
 327         return URL_ERR_TOO_LONG;
 328       if ((*d++ = *o++) == '/')
 329         p = d;
 330     }
 331   if (!p)
 332     return URL_ERR_REL_NOTHING;
 333   d = p;
 334
 335   while (*a)
 336     {
 337       if (a[0] == '.')
 338         {
 339           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 340             {
 341               a++;
 342               if (a[0])
 343                 a++;
 344               continue;
 345             }
 346           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 347             {
 348               a += 2;
 349               if (a[0])
 350                 a++;
 351               if (d <= u->buf + 1)
 352                 {
 353                   /*
 354                    * RFC 1808 says we should leave ".." as a path segment, but
 355                    * we intentionally break the rule and refuse the URL.
 356                    */
 357                   if (!url_ignore_underflow)
 358                     return URL_PATH_UNDERFLOW;
 359                 }
 360               else
 361                 {
 362                   d--;                  /* Discard trailing slash */
 363                   while (d[-1] != '/')
 364                     d--;
 365                 }
 366               continue;
 367             }
 368         }
 369       while (a[0] && a[0] != '/')
 370         {
 371           if (d >= e)
 372             return URL_ERR_TOO_LONG;
 373           *d++ = *a++;
 374         }
 375       if (a[0])
 376         *d++ = *a++;
 377     }
 378
 379 okay:
 380   *d++ = 0;
 381   u->buf = d;
 382   return 0;
 383
 384 copy:                                   /* Combine part of old URL with the new one */
 385   while (o < p)
 386     if (d < e)
 387       *d++ = *o++;
 388     else
 389       return URL_ERR_TOO_LONG;
 390   while (*a)
 391     if (d < e)
 392       *d++ = *a++;
 393     else
 394       return URL_ERR_TOO_LONG;
 395   goto okay;
 396 }
 397
 398 int
 399 url_normalize(struct url *u, struct url *b)
 400 {
 401   int err;
 402
 403   /* Basic checks */
 404   if (url_proto_path_flags[u->protoid] && (!u->host || !*u->host) ||
 405       !u->host && u->user ||
 406       !u->user && u->pass ||
 407       !u->rest)
 408     return URL_SYNTAX_ERROR;
 409
 410   if (!u->protocol)
 411     {
 412       /* Now we know it's a relative URL. Do we have any base? */
 413       if (!b || !url_proto_path_flags[b->protoid])
 414         return URL_ERR_REL_NOTHING;
 415       u->protocol = b->protocol;
 416       u->protoid = b->protoid;
 417
 418       /* Reference to the same host */
 419       if (!u->host)
 420         {
 421           u->host = b->host;
 422           u->user = b->user;
 423           u->pass = b->pass;
 424           u->port = b->port;
 425           if (err = relpath_merge(u, b))
 426             return err;
 427         }
 428     }
 429
 430   /* Change path "?" to "/?" because it's the true meaning */
 431   if (u->rest[0] == '?')
 432     {
 433       int l = strlen(u->rest);
 434       if (u->bufend - u->buf < l+1)
 435         return URL_ERR_TOO_LONG;
 436       u->buf[0] = '/';
 437       memcpy(u->buf+1, u->rest, l+1);
 438       u->rest = u->buf;
 439       u->buf += l+2;
 440     }
 441
 442   /* Fill in missing info */
 443   if (u->port == ~0U)
 444     u->port = std_ports[u->protoid];
 445
 446   return 0;
 447 }
 448
 449 /* Name canonicalization */
 450
 451 static void
 452 lowercase(char *b)
 453 {
 454   if (b)
 455     while (*b)
 456       {
 457         if (*b >= 'A' && *b <= 'Z')
 458           *b = *b + 0x20;
 459         b++;
 460       }
 461 }
 462
 463 static void
 464 kill_end_dot(char *b)
 465 {
 466   char *k;
 467
 468   if (b)
 469     {
 470       k = b + strlen(b) - 1;
 471       while (k > b && *k == '.')
 472         *k-- = 0;
 473     }
 474 }
 475
 476 int
 477 url_canonicalize(struct url *u)
 478 {
 479   char *c;
 480
 481   lowercase(u->protocol);
 482   lowercase(u->host);
 483   kill_end_dot(u->host);
 484   if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
 485     u->rest = "/";
 486   if (u->rest && (c = strchr(u->rest, '#')))    /* Kill fragment reference */
 487     *c = 0;
 488   return 0;
 489 }
 490
 491 /* Pack a broken-down URL */
 492
 493 static char *
 494 append(char *d, const char *s, char *e)
 495 {
 496   if (d)
 497     while (*s)
 498       {
 499         if (d >= e)
 500           return NULL;
 501         *d++ = *s++;
 502       }
 503   return d;
 504 }
 505
 506 int
 507 url_pack(struct url *u, char *d)
 508 {
 509   char *e = d + MAX_URL_SIZE - 10;
 510
 511   if (u->protocol)
 512     {
 513       d = append(d, u->protocol, e);
 514       d = append(d, ":", e);
 515       u->protoid = url_identify_protocol(u->protocol);
 516     }
 517   if (u->host)
 518     {
 519       d = append(d, "//", e);
 520       if (u->user)
 521         {
 522           d = append(d, u->user, e);
 523           if (u->pass)
 524             {
 525               d = append(d, ":", e);
 526               d = append(d, u->pass, e);
 527             }
 528           d = append(d, "@", e);
 529         }
 530       d = append(d, u->host, e);
 531       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 532         {
 533           char z[10];
 534           sprintf(z, "%d", u->port);
 535           d = append(d, ":", e);
 536           d = append(d, z, e);
 537         }
 538     }
 539   if (u->rest)
 540     d = append(d, u->rest, e);
 541   if (!d)
 542     return URL_ERR_TOO_LONG;
 543   *d = 0;
 544   return 0;
 545 }
 546
 547 /* Error messages */
 548
 549 static char *errmsg[] = {
 550   "Something is wrong",
 551   "Too long",
 552   "Invalid character",
 553   "Invalid escape",
 554   "Invalid escaped character",
 555   "Invalid port number",
 556   "Relative URL not allowed",
 557   "Unknown protocol",
 558   "Syntax error",
 559   "Path underflow"
 560 };
 561
 562 char *
 563 url_error(uns err)
 564 {
 565   if (err >= sizeof(errmsg) / sizeof(char *))
 566     err = 0;
 567   return errmsg[err];
 568 }
 569
 570 /* Standard cookbook recipes */
 571
 572 int
 573 url_canon_split_rel(const char *u, char *buf1, char *buf2, struct url *url, struct url *base)
 574 {
 575   int err;
 576
 577   if (err = url_deescape(u, buf1))
 578     return err;
 579   if (err = url_split(buf1, url, buf2))
 580     return err;
 581   if (err = url_normalize(url, base))
 582     return err;
 583   return url_canonicalize(url);
 584 }
 585
 586 int
 587 url_auto_canonicalize_rel(const char *src, char *dst, struct url *base)
 588 {
 589   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
 590   int err;
 591   struct url ur;
 592
 593   (void)((err = url_canon_split_rel(src, buf1, buf2, &ur, base)) ||
 594    (err = url_pack(&ur, buf3)) ||
 595    (err = url_enescape(buf3, dst)));
 596   return err;
 597 }
 598
 599 /* Testing */
 600
 601 #ifdef TEST
 602
 603 int main(int argc, char **argv)
 604 {
 605   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
 606   int err;
 607   struct url url, url0;
 608   char *base = "http://mj@www.hell.org/123/sub_dir;param/index.html;param?query&zzz/sub;query+#fragment?";
 609
 610   if (argc != 2 && argc != 3)
 611     return 1;
 612   if (argc == 3)
 613     base = argv[2];
 614   if (err = url_deescape(argv[1], buf1))
 615     {
 616       printf("deesc: error %d\n", err);
 617       return 1;
 618     }
 619   printf("deesc: %s\n", buf1);
 620   if (err = url_split(buf1, &url, buf2))
 621     {
 622       printf("split: error %d\n", err);
 623       return 1;
 624     }
 625   printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 626   if (err = url_split(base, &url0, buf3))
 627     {
 628       printf("split base: error %d\n", err);
 629       return 1;
 630     }
 631   if (err = url_normalize(&url0, NULL))
 632     {
 633       printf("normalize base: error %d\n", err);
 634       return 1;
 635     }
 636   printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest);
 637   if (err = url_normalize(&url, &url0))
 638     {
 639       printf("normalize: error %d\n", err);
 640       return 1;
 641     }
 642   printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 643   if (err = url_canonicalize(&url))
 644     {
 645       printf("canonicalize: error %d\n", err);
 646       return 1;
 647     }
 648   printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 649   if (err = url_pack(&url, buf4))
 650     {
 651       printf("pack: error %d\n", err);
 652       return 1;
 653     }
 654   printf("pack: %s\n", buf4);
 655   if (err = url_enescape(buf4, buf2))
 656     {
 657       printf("enesc: error %d\n", err);
 658       return 1;
 659     }
 660   printf("enesc: %s\n", buf2);
 661   return 0;
 662 }
 663
 664 #endif
 665
 666 struct component {
 667         const char *start;
 668         int length;
 669         uns count;
 670         u32 hash;
 671 };
 672
 673 static inline u32
 674 hashf(const char *start, int length)
 675 {
 676         u32 hf = length;
 677         while (length-- > 0)
 678                 hf = (hf << 8 | hf >> 24) ^ *start++;
 679         return hf;
 680 }
 681
 682 static inline uns
 683 repeat_count(struct component *comp, uns count, uns len)
 684 {
 685         struct component *orig_comp = comp;
 686         uns found = 0;
 687         while (1)
 688         {
 689                 uns i;
 690                 comp += len;
 691                 count -= len;
 692                 found++;
 693                 if (count < len)
 694                         return found;
 695                 for (i=0; i<len; i++)
 696                         if (comp[i].hash != orig_comp[i].hash
 697                         || comp[i].length != orig_comp[i].length
 698                         || memcmp(comp[i].start, orig_comp[i].start, comp[i].length))
 699                                 return found;
 700         }
 701 }
 702
 703 int
 704 url_has_repeated_component(const char *url)
 705 {
 706         struct component *comp;
 707         uns comps, comp_len, rep_prefix, hash_size, *hash, *next;
 708         const char *c;
 709         uns i, j, k;
 710
 711         for (comps=0, c=url; c; comps++)
 712         {
 713                 c = strpbrk(c, url_component_separators);
 714                 if (c)
 715                         c++;
 716         }
 717         if (comps < url_min_repeat_count && comps <= url_max_occurences)
 718                 return 0;
 719         comp = alloca(comps * sizeof(*comp));
 720         for (i=0, c=url; c; i++)
 721         {
 722                 comp[i].start = c;
 723                 c = strpbrk(c, url_component_separators);
 724                 if (c)
 725                 {
 726                         comp[i].length = c - comp[i].start;
 727                         c++;
 728                 }
 729                 else
 730                         comp[i].length = strlen(comp[i].start);
 731         }
 732         ASSERT(i == comps);
 733         for (i=0; i<comps; i++)
 734                 comp[i].hash = hashf(comp[i].start, comp[i].length);
 735         if (comps > url_max_occurences)
 736         {
 737                 hash_size = next_table_prime(comps);
 738                 hash = alloca(hash_size * sizeof(*hash));
 739                 next = alloca(comps * sizeof(*next));
 740                 memset(hash, 255, hash_size * sizeof(*hash));
 741                 for (i=0; i<comps; i++)
 742                 {
 743                         j = comp[i].hash % hash_size;
 744                         for (k = hash[j]; ~k && (comp[i].hash != comp[k].hash || comp[i].length != comp[k].length ||
 745                             memcmp(comp[k].start, comp[i].start, comp[i].length)); k = next[k]);
 746                         if (!~k)
 747                         {
 748                                 next[i] = hash[j];
 749                                 hash[j] = i;
 750                                 comp[i].count = 1;
 751                         }
 752                         else
 753                         {
 754                                 if (comp[k].count++ >= url_max_occurences)
 755                                         return 1;
 756                         }
 757                 }
 758         }
 759         for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
 760                 for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
 761                         if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)
 762                                 return comp_len;
 763         return 0;
 764 }