ucw/url.c

   1 /*
   2  *      UCW Library -- URL Functions
   3  *
   4  *      (c) 1997--2004 Martin Mares <mj@ucw.cz>
   5  *      (c) 2001--2005 Robert Spalek <robert@ucw.cz>
   6  *
   7  *      This software may be freely distributed and used according to the terms
   8  *      of the GNU Lesser General Public License.
   9  *
  10  *      The URL syntax corresponds to RFC 2396 with several exceptions:
  11  *
  12  *         o  Interpretation of path parameters follows RFC 1808.
  13  *
  14  *      XXX: The buffer handling in this module is really horrible, but it works.
  15  */
  16
  17 #include "ucw/lib.h"
  18 #include "ucw/url.h"
  19 #include "ucw/chartype.h"
  20 #include "ucw/conf.h"
  21 #include "ucw/prime.h"
  22
  23 #include <string.h>
  24 #include <stdlib.h>
  25 #include <stdio.h>
  26 #include <alloca.h>
  27
  28 /* Configuration */
  29
/* Nonzero: url_deescape() drops runs of whitespace at the very start or very end of the URL. */
static uns url_ignore_spaces;
/* Nonzero: relpath_merge() silently ignores ".." segments that would climb above the root. */
static uns url_ignore_underflow;
/* Set of characters which split a URL into components in url_has_repeated_component(). */
static char *url_component_separators = "";
/* Number of consecutive repetitions of a component sequence needed to report the URL. */
static uns url_min_repeat_count = 0x7fffffff;
/* Longest component sequence (counted in components) examined for repetition; 0 disables the search. */
static uns url_max_repeat_length = 0;
/* A URL is reported once an identical component occurs more than this many times. */
static uns url_max_occurences = ~0U;

/* Items of the configuration section, binding option names to the variables above. */
static struct cf_section url_config = {
  CF_ITEMS {
    CF_UNS("IgnoreSpaces", &url_ignore_spaces),
    CF_UNS("IgnoreUnderflow", &url_ignore_underflow),
    CF_STRING("ComponentSeparators", &url_component_separators),
    CF_UNS("MinRepeatCount", &url_min_repeat_count),
    CF_UNS("MaxRepeatLength", &url_max_repeat_length),
    CF_UNS("MaxOccurences", &url_max_occurences),
    CF_END
  }
};

/*
 * Declare the section under the name "URL".
 * NOTE(review): CONSTRUCTOR presumably makes this run automatically at program
 * start-up -- confirm against its definition in ucw/lib.h.
 */
static void CONSTRUCTOR url_init_config(void)
{
  cf_declare_section("URL", &url_config, 0);
}
  53
  54 /* Escaping and de-escaping */
  55
  56 static uns
  57 enhex(uns x)
  58 {
  59   return (x<10) ? (x + '0') : (x - 10 + 'A');
  60 }
  61
  62 int
  63 url_deescape(const char *s, char *d)
  64 {
  65   char *dstart = d;
  66   char *end = d + MAX_URL_SIZE - 10;
  67   while (*s)
  68     {
  69       if (d >= end)
  70         return URL_ERR_TOO_LONG;
  71       if (*s == '%')
  72         {
  73           unsigned int val;
  74           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  75             return URL_ERR_INVALID_ESCAPE;
  76           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  77           if (val < 0x20)
  78             return URL_ERR_INVALID_ESCAPED_CHAR;
  79           switch (val)
  80             {
  81             case ';':
  82               val = NCC_SEMICOLON; break;
  83             case '/':
  84               val = NCC_SLASH; break;
  85             case '?':
  86               val = NCC_QUEST; break;
  87             case ':':
  88               val = NCC_COLON; break;
  89             case '@':
  90               val = NCC_AT; break;
  91             case '=':
  92               val = NCC_EQUAL; break;
  93             case '&':
  94               val = NCC_AND; break;
  95             case '#':
  96               val = NCC_HASH; break;
  97             case '$':
  98               val = NCC_DOLLAR; break;
  99             case '+':
 100               val = NCC_PLUS; break;
 101             case ',':
 102               val = NCC_COMMA; break;
 103             }
 104           *d++ = val;
 105           s += 3;
 106         }
 107       else if ((byte) *s > 0x20)
 108         *d++ = *s++;
 109       else if (Cspace(*s))
 110         {
 111           const char *s0 = s;
 112           while (Cspace(*s))
 113             s++;
 114           if (!url_ignore_spaces || !(!*s || d == dstart))
 115             {
 116               while (Cspace(*s0))
 117                 {
 118                   if (d >= end)
 119                     return URL_ERR_TOO_LONG;
 120                   *d++ = *s0++;
 121                 }
 122             }
 123         }
 124       else
 125         return URL_ERR_INVALID_CHAR;
 126     }
 127   *d = 0;
 128   return 0;
 129 }
 130
 131 int
 132 url_enescape(const char *s, char *d)
 133 {
 134   char *end = d + MAX_URL_SIZE - 10;
 135   unsigned int c;
 136
 137   while (c = *s)
 138     {
 139       if (d >= end)
 140         return URL_ERR_TOO_LONG;
 141       if (Calnum(c) ||                                                  /* RFC 2396 (2.1-2.3): Only alphanumerics ... */
 142           c == '-' || c == '_' || c == '.' || c == '+' || c == '~' ||   /* ... and several other exceptions ... */
 143           c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
 144           c == '/' || c == '?' || c == ':' || c == '@' ||               /* ... and reserved chars used for reserved purpose */
 145           c == '=' || c == '&' || c == '#' || c == ';' ||
 146           c == '$' || c == '+' || c == ',')
 147         *d++ = *s++;
 148       else
 149         {
 150           uns val = ((byte)*s < NCC_MAX) ? NCC_CHARS[(byte)*s] : *s;
 151           *d++ = '%';
 152           *d++ = enhex(val >> 4);
 153           *d++ = enhex(val & 0x0f);
 154           s++;
 155         }
 156     }
 157   *d = 0;
 158   return 0;
 159 }
 160
 161 int
 162 url_enescape_friendly(const char *src, char *dest)
 163 {
 164   char *end = dest + MAX_URL_SIZE - 10;
 165   const byte *srcb = src;
 166   while (*srcb)
 167     {
 168       if (dest >= end)
 169         return URL_ERR_TOO_LONG;
 170       if (*srcb < NCC_MAX)
 171         *dest++ = NCC_CHARS[*srcb++];
 172       else if (*srcb >= 0x20 && *srcb < 0x7f)
 173         *dest++ = *srcb++;
 174       else
 175         {
 176           *dest++ = '%';
 177           *dest++ = enhex(*srcb >> 4);
 178           *dest++ = enhex(*srcb++ & 0x0f);
 179         }
 180     }
 181   *dest = 0;
 182   return 0;
 183 }
 184
 185 /* Split an URL (several parts may be copied to the destination buffer) */
 186
 187 char *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
 188 static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
 189
 190 uns
 191 identify_protocol(const char *p)
 192 {
 193   uns i;
 194
 195   for(i=1; i<URL_PROTO_MAX; i++)
 196     if (!strcasecmp(p, url_proto_names[i]))
 197       return i;
 198   return URL_PROTO_UNKNOWN;
 199 }
 200
 201 int
 202 url_split(char *s, struct url *u, char *d)
 203 {
 204   bzero(u, sizeof(struct url));
 205   u->port = ~0;
 206   u->bufend = d + MAX_URL_SIZE - 10;
 207
 208   if (s[0] != '/')                      /* Seek for "protocol:" */
 209     {
 210       char *p = s;
 211       while (*p && Calnum(*p))
 212         p++;
 213       if (p != s && *p == ':')
 214         {
 215           u->protocol = d;
 216           while (s < p)
 217             *d++ = *s++;
 218           *d++ = 0;
 219           u->protoid = identify_protocol(u->protocol);
 220           s++;
 221           if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
 222             {
 223               /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
 224               int len = d - u->protocol;
 225               d -= len;
 226               s -= len;
 227               u->protocol = NULL;
 228               u->protoid = 0;
 229             }
 230         }
 231     }
 232
 233   if (s[0] == '/')                      /* Host spec or absolute path */
 234     {
 235       if (s[1] == '/')                  /* Host spec */
 236         {
 237           char *q, *e;
 238           char *at = NULL;
 239           char *ep;
 240
 241           s += 2;
 242           q = d;
 243           while (*s && *s != '/' && *s != '?')  /* Copy user:passwd@host:port */
 244             {
 245               if (*s != '@')
 246                 *d++ = *s;
 247               else if (!at)
 248                 {
 249                   *d++ = 0;
 250                   at = d;
 251                 }
 252               else                      /* This shouldn't happen with sane URL's, but we need to be sure */
 253                 *d++ = NCC_AT;
 254               s++;
 255             }
 256           *d++ = 0;
 257           if (at)                       /* user:passwd present */
 258             {
 259               u->user = q;
 260               if (e = strchr(q, ':'))
 261                 {
 262                   *e++ = 0;
 263                   u->pass = e;
 264                 }
 265             }
 266           else
 267             at = q;
 268           e = strchr(at, ':');
 269           if (e)                        /* host:port present */
 270             {
 271               uns p;
 272               *e++ = 0;
 273               p = strtoul(e, &ep, 10);
 274               if (ep && *ep || p > 65535)
 275                 return URL_ERR_INVALID_PORT;
 276               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 277                 u->port = p;
 278             }
 279           u->host = at;
 280         }
 281     }
 282
 283   u->rest = s;
 284   u->buf = d;
 285   return 0;
 286 }
 287
 288 /* Normalization according to given base URL */
 289
static uns std_ports[] = URL_DEFPORTS;  /* Default port number of each protocol, indexed by protocol ID */
 291
 292 static int
 293 relpath_merge(struct url *u, struct url *b)
 294 {
 295   char *a = u->rest;
 296   char *o = b->rest;
 297   char *d = u->buf;
 298   char *e = u->bufend;
 299   char *p;
 300
 301   if (a[0] == '/')                      /* Absolute path => OK */
 302     return 0;
 303   if (o[0] != '/' && o[0] != '?')
 304     return URL_PATH_UNDERFLOW;
 305
 306   if (!a[0])                            /* Empty URL -> inherit everything */
 307     {
 308       u->rest = b->rest;
 309       return 0;
 310     }
 311
 312   u->rest = d;                          /* We know we'll need to copy the path somewhere else */
 313
 314   if (a[0] == '#')                      /* Another fragment */
 315     {
 316       for(p=o; *p && *p != '#'; p++)
 317         ;
 318       goto copy;
 319     }
 320   if (a[0] == '?')                      /* New query */
 321     {
 322       for(p=o; *p && *p != '#' && *p != '?'; p++)
 323         ;
 324       goto copy;
 325     }
 326   if (a[0] == ';')                      /* Change parameters */
 327     {
 328       for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++)
 329         ;
 330       goto copy;
 331     }
 332
 333   p = NULL;                             /* Copy original path and find the last slash */
 334   while (*o && *o != ';' && *o != '?' && *o != '#')
 335     {
 336       if (d >= e)
 337         return URL_ERR_TOO_LONG;
 338       if ((*d++ = *o++) == '/')
 339         p = d;
 340     }
 341   if (!p)
 342     return URL_ERR_REL_NOTHING;
 343   d = p;
 344
 345   while (*a)
 346     {
 347       if (a[0] == '.')
 348         {
 349           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 350             {
 351               a++;
 352               if (a[0])
 353                 a++;
 354               continue;
 355             }
 356           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 357             {
 358               a += 2;
 359               if (a[0])
 360                 a++;
 361               if (d <= u->buf + 1)
 362                 {
 363                   /*
 364                    * RFC 1808 says we should leave ".." as a path segment, but
 365                    * we intentionally break the rule and refuse the URL.
 366                    */
 367                   if (!url_ignore_underflow)
 368                     return URL_PATH_UNDERFLOW;
 369                 }
 370               else
 371                 {
 372                   d--;                  /* Discard trailing slash */
 373                   while (d[-1] != '/')
 374                     d--;
 375                 }
 376               continue;
 377             }
 378         }
 379       while (a[0] && a[0] != '/')
 380         {
 381           if (d >= e)
 382             return URL_ERR_TOO_LONG;
 383           *d++ = *a++;
 384         }
 385       if (a[0])
 386         *d++ = *a++;
 387     }
 388
 389 okay:
 390   *d++ = 0;
 391   u->buf = d;
 392   return 0;
 393
 394 copy:                                   /* Combine part of old URL with the new one */
 395   while (o < p)
 396     if (d < e)
 397       *d++ = *o++;
 398     else
 399       return URL_ERR_TOO_LONG;
 400   while (*a)
 401     if (d < e)
 402       *d++ = *a++;
 403     else
 404       return URL_ERR_TOO_LONG;
 405   goto okay;
 406 }
 407
 408 int
 409 url_normalize(struct url *u, struct url *b)
 410 {
 411   int err;
 412
 413   /* Basic checks */
 414   if (url_proto_path_flags[u->protoid] && (!u->host || !*u->host) ||
 415       !u->host && u->user ||
 416       !u->user && u->pass ||
 417       !u->rest)
 418     return URL_SYNTAX_ERROR;
 419
 420   if (!u->protocol)
 421     {
 422       /* Now we know it's a relative URL. Do we have any base? */
 423       if (!b || !url_proto_path_flags[b->protoid])
 424         return URL_ERR_REL_NOTHING;
 425       u->protocol = b->protocol;
 426       u->protoid = b->protoid;
 427
 428       /* Reference to the same host */
 429       if (!u->host)
 430         {
 431           u->host = b->host;
 432           u->user = b->user;
 433           u->pass = b->pass;
 434           u->port = b->port;
 435           if (err = relpath_merge(u, b))
 436             return err;
 437         }
 438     }
 439
 440   /* Change path "?" to "/?" because it's the true meaning */
 441   if (u->rest[0] == '?')
 442     {
 443       int l = strlen(u->rest);
 444       if (u->bufend - u->buf < l+1)
 445         return URL_ERR_TOO_LONG;
 446       u->buf[0] = '/';
 447       memcpy(u->buf+1, u->rest, l+1);
 448       u->rest = u->buf;
 449       u->buf += l+2;
 450     }
 451
 452   /* Fill in missing info */
 453   if (u->port == ~0U)
 454     u->port = std_ports[u->protoid];
 455
 456   return 0;
 457 }
 458
 459 /* Name canonicalization */
 460
 461 static void
 462 lowercase(char *b)
 463 {
 464   if (b)
 465     while (*b)
 466       {
 467         if (*b >= 'A' && *b <= 'Z')
 468           *b = *b + 0x20;
 469         b++;
 470       }
 471 }
 472
 473 static void
 474 kill_end_dot(char *b)
 475 {
 476   char *k;
 477
 478   if (b)
 479     {
 480       k = b + strlen(b) - 1;
 481       while (k > b && *k == '.')
 482         *k-- = 0;
 483     }
 484 }
 485
 486 int
 487 url_canonicalize(struct url *u)
 488 {
 489   char *c;
 490
 491   lowercase(u->protocol);
 492   lowercase(u->host);
 493   kill_end_dot(u->host);
 494   if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
 495     u->rest = "/";
 496   if (u->rest && (c = strchr(u->rest, '#')))    /* Kill fragment reference */
 497     *c = 0;
 498   return 0;
 499 }
 500
 501 /* Pack a broken-down URL */
 502
 503 static char *
 504 append(char *d, const char *s, char *e)
 505 {
 506   if (d)
 507     while (*s)
 508       {
 509         if (d >= e)
 510           return NULL;
 511         *d++ = *s++;
 512       }
 513   return d;
 514 }
 515
 516 int
 517 url_pack(struct url *u, char *d)
 518 {
 519   char *e = d + MAX_URL_SIZE - 10;
 520
 521   if (u->protocol)
 522     {
 523       d = append(d, u->protocol, e);
 524       d = append(d, ":", e);
 525       u->protoid = identify_protocol(u->protocol);
 526     }
 527   if (u->host)
 528     {
 529       d = append(d, "//", e);
 530       if (u->user)
 531         {
 532           d = append(d, u->user, e);
 533           if (u->pass)
 534             {
 535               d = append(d, ":", e);
 536               d = append(d, u->pass, e);
 537             }
 538           d = append(d, "@", e);
 539         }
 540       d = append(d, u->host, e);
 541       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 542         {
 543           char z[10];
 544           sprintf(z, "%d", u->port);
 545           d = append(d, ":", e);
 546           d = append(d, z, e);
 547         }
 548     }
 549   if (u->rest)
 550     d = append(d, u->rest, e);
 551   if (!d)
 552     return URL_ERR_TOO_LONG;
 553   *d = 0;
 554   return 0;
 555 }
 556
 557 /* Error messages */
 558
 559 static char *errmsg[] = {
 560   "Something is wrong",
 561   "Too long",
 562   "Invalid character",
 563   "Invalid escape",
 564   "Invalid escaped character",
 565   "Invalid port number",
 566   "Relative URL not allowed",
 567   "Unknown protocol",
 568   "Syntax error",
 569   "Path underflow"
 570 };
 571
 572 char *
 573 url_error(uns err)
 574 {
 575   if (err >= sizeof(errmsg) / sizeof(char *))
 576     err = 0;
 577   return errmsg[err];
 578 }
 579
 580 /* Standard cookbook recipes */
 581
 582 int
 583 url_canon_split_rel(const char *u, char *buf1, char *buf2, struct url *url, struct url *base)
 584 {
 585   int err;
 586
 587   if (err = url_deescape(u, buf1))
 588     return err;
 589   if (err = url_split(buf1, url, buf2))
 590     return err;
 591   if (err = url_normalize(url, base))
 592     return err;
 593   return url_canonicalize(url);
 594 }
 595
 596 int
 597 url_auto_canonicalize_rel(const char *src, char *dst, struct url *base)
 598 {
 599   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
 600   int err;
 601   struct url ur;
 602
 603   (void)((err = url_canon_split_rel(src, buf1, buf2, &ur, base)) ||
 604    (err = url_pack(&ur, buf3)) ||
 605    (err = url_enescape(buf3, dst)));
 606   return err;
 607 }
 608
 609 /* Testing */
 610
 611 #ifdef TEST
 612
 613 int main(int argc, char **argv)
 614 {
 615   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
 616   int err;
 617   struct url url, url0;
 618   char *base = "http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment";
 619
 620   if (argc != 2 && argc != 3)
 621     return 1;
 622   if (argc == 3)
 623     base = argv[2];
 624   if (err = url_deescape(argv[1], buf1))
 625     {
 626       printf("deesc: error %d\n", err);
 627       return 1;
 628     }
 629   printf("deesc: %s\n", buf1);
 630   if (err = url_split(buf1, &url, buf2))
 631     {
 632       printf("split: error %d\n", err);
 633       return 1;
 634     }
 635   printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 636   if (err = url_split(base, &url0, buf3))
 637     {
 638       printf("split base: error %d\n", err);
 639       return 1;
 640     }
 641   if (err = url_normalize(&url0, NULL))
 642     {
 643       printf("normalize base: error %d\n", err);
 644       return 1;
 645     }
 646   printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest);
 647   if (err = url_normalize(&url, &url0))
 648     {
 649       printf("normalize: error %d\n", err);
 650       return 1;
 651     }
 652   printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 653   if (err = url_canonicalize(&url))
 654     {
 655       printf("canonicalize: error %d\n", err);
 656       return 1;
 657     }
 658   printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 659   if (err = url_pack(&url, buf4))
 660     {
 661       printf("pack: error %d\n", err);
 662       return 1;
 663     }
 664   printf("pack: %s\n", buf4);
 665   if (err = url_enescape(buf4, buf2))
 666     {
 667       printf("enesc: error %d\n", err);
 668       return 1;
 669     }
 670   printf("enesc: %s\n", buf2);
 671   return 0;
 672 }
 673
 674 #endif
 675
/* One component of a URL as delimited by the ComponentSeparators characters */
struct component {
	const char *start;	/* First character of the component inside the URL string */
	int length;		/* Length in bytes, not including the separator */
	uns count;		/* Number of occurrences seen so far; set only for the first occurrence of each distinct component */
	u32 hash;		/* hashf() of the component's bytes */
};
 682
 683 static inline u32
 684 hashf(const char *start, int length)
 685 {
 686         u32 hf = length;
 687         while (length-- > 0)
 688                 hf = (hf << 8 | hf >> 24) ^ *start++;
 689         return hf;
 690 }
 691
 692 static inline uns
 693 repeat_count(struct component *comp, uns count, uns len)
 694 {
 695         struct component *orig_comp = comp;
 696         uns found = 0;
 697         while (1)
 698         {
 699                 uns i;
 700                 comp += len;
 701                 count -= len;
 702                 found++;
 703                 if (count < len)
 704                         return found;
 705                 for (i=0; i<len; i++)
 706                         if (comp[i].hash != orig_comp[i].hash
 707                         || comp[i].length != orig_comp[i].length
 708                         || memcmp(comp[i].start, orig_comp[i].start, comp[i].length))
 709                                 return found;
 710         }
 711 }
 712
 713 int
 714 url_has_repeated_component(const char *url)
 715 {
 716         struct component *comp;
 717         uns comps, comp_len, rep_prefix, hash_size, *hash, *next;
 718         const char *c;
 719         uns i, j, k;
 720
 721         for (comps=0, c=url; c; comps++)
 722         {
 723                 c = strpbrk(c, url_component_separators);
 724                 if (c)
 725                         c++;
 726         }
 727         if (comps < url_min_repeat_count && comps <= url_max_occurences)
 728                 return 0;
 729         comp = alloca(comps * sizeof(*comp));
 730         for (i=0, c=url; c; i++)
 731         {
 732                 comp[i].start = c;
 733                 c = strpbrk(c, url_component_separators);
 734                 if (c)
 735                 {
 736                         comp[i].length = c - comp[i].start;
 737                         c++;
 738                 }
 739                 else
 740                         comp[i].length = strlen(comp[i].start);
 741         }
 742         ASSERT(i == comps);
 743         for (i=0; i<comps; i++)
 744                 comp[i].hash = hashf(comp[i].start, comp[i].length);
 745         if (comps > url_max_occurences)
 746         {
 747                 hash_size = next_table_prime(comps);
 748                 hash = alloca(hash_size * sizeof(*hash));
 749                 next = alloca(comps * sizeof(*next));
 750                 memset(hash, 255, hash_size * sizeof(*hash));
 751                 for (i=0; i<comps; i++)
 752                 {
 753                         j = comp[i].hash % hash_size;
 754                         for (k = hash[j]; ~k && (comp[i].hash != comp[k].hash || comp[i].length != comp[k].length ||
 755                             memcmp(comp[k].start, comp[i].start, comp[i].length)); k = next[k]);
 756                         if (!~k)
 757                         {
 758                                 next[i] = hash[j];
 759                                 hash[j] = i;
 760                                 comp[i].count = 1;
 761                         }
 762                         else
 763                         {
 764                                 if (comp[k].count++ >= url_max_occurences)
 765                                         return 1;
 766                         }
 767                 }
 768         }
 769         for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
 770                 for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
 771                         if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)
 772                                 return comp_len;
 773         return 0;
 774 }