ucw/url.c

   1 /*
   2  *      UCW Library -- URL Functions
   3  *
   4  *      (c) 1997--2004 Martin Mares <mj@ucw.cz>
   5  *      (c) 2001--2005 Robert Spalek <robert@ucw.cz>
   6  *
   7  *      This software may be freely distributed and used according to the terms
   8  *      of the GNU Lesser General Public License.
   9  *
  10  *      The URL syntax corresponds to RFC 2396 with several exceptions:
  11  *
  12  *         o  Escaping of special characters still follows RFC 1738.
  13  *         o  Interpretation of path parameters follows RFC 1808.
  14  *
  15  *      XXX: The buffer handling in this module is really horrible, but it works.
  16  */
  17
  18 #include "ucw/lib.h"
  19 #include "ucw/url.h"
  20 #include "ucw/chartype.h"
  21 #include "ucw/conf.h"
  22
  23 #include <string.h>
  24 #include <stdlib.h>
  25 #include <stdio.h>
  26 #include <alloca.h>
  27
  28 /* Configuration */
  29
  30 static uns url_ignore_spaces;
  31 static uns url_ignore_underflow;
  32 static char *url_component_separators = "";
  33 static uns url_min_repeat_count = 0x7fffffff;
  34 static uns url_max_repeat_length = 0;
  35 static uns url_max_occurences = ~0U;
  36
  37 static struct cf_section url_config = {
  38   CF_ITEMS {
  39     CF_UNS("IgnoreSpaces", &url_ignore_spaces),
  40     CF_UNS("IgnoreUnderflow", &url_ignore_underflow),
  41     CF_STRING("ComponentSeparators", &url_component_separators),
  42     CF_UNS("MinRepeatCount", &url_min_repeat_count),
  43     CF_UNS("MaxRepeatLength", &url_max_repeat_length),
  44     CF_UNS("MaxOccurences", &url_max_occurences),
  45     CF_END
  46   }
  47 };
  48
  49 static void CONSTRUCTOR url_init_config(void)
  50 {
  51   cf_declare_section("URL", &url_config, 0);
  52 }
  53
  54 /* Escaping and de-escaping */
  55
  56 static uns
  57 enhex(uns x)
  58 {
  59   return (x<10) ? (x + '0') : (x - 10 + 'A');
  60 }
  61
  62 int
  63 url_deescape(const char *s, char *d)
  64 {
  65   char *dstart = d;
  66   char *end = d + MAX_URL_SIZE - 10;
  67   while (*s)
  68     {
  69       if (d >= end)
  70         return URL_ERR_TOO_LONG;
  71       if (*s == '%')
  72         {
  73           unsigned int val;
  74           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  75             return URL_ERR_INVALID_ESCAPE;
  76           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  77           if (val < 0x20)
  78             return URL_ERR_INVALID_ESCAPED_CHAR;
  79           switch (val)
  80             {
  81             case ';':
  82               val = NCC_SEMICOLON; break;
  83             case '/':
  84               val = NCC_SLASH; break;
  85             case '?':
  86               val = NCC_QUEST; break;
  87             case ':':
  88               val = NCC_COLON; break;
  89             case '@':
  90               val = NCC_AT; break;
  91             case '=':
  92               val = NCC_EQUAL; break;
  93             case '&':
  94               val = NCC_AND; break;
  95             case '#':
  96               val = NCC_HASH; break;
  97             }
  98           *d++ = val;
  99           s += 3;
 100         }
 101       else if ((byte) *s > 0x20)
 102         *d++ = *s++;
 103       else if (Cspace(*s))
 104         {
 105           const char *s0 = s;
 106           while (Cspace(*s))
 107             s++;
 108           if (!url_ignore_spaces || !(!*s || d == dstart))
 109             {
 110               while (Cspace(*s0))
 111                 {
 112                   if (d >= end)
 113                     return URL_ERR_TOO_LONG;
 114                   *d++ = *s0++;
 115                 }
 116             }
 117         }
 118       else
 119         return URL_ERR_INVALID_CHAR;
 120     }
 121   *d = 0;
 122   return 0;
 123 }
 124
 125 int
 126 url_enescape(const char *s, char *d)
 127 {
 128   char *end = d + MAX_URL_SIZE - 10;
 129   unsigned int c;
 130
 131   while (c = *s)
 132     {
 133       if (d >= end)
 134         return URL_ERR_TOO_LONG;
 135       if (Calnum(c) ||                                                  /* RFC 1738(2.2): Only alphanumerics ... */
 136           c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||   /* ... and several other exceptions ... */
 137           c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
 138           c == ',' ||
 139           c == '/' || c == '?' || c == ':' || c == '@' ||               /* ... and reserved chars used for reserved purpose */
 140           c == '=' || c == '&' || c == '#' || c == ';')
 141         *d++ = *s++;
 142       else
 143         {
 144           uns val = ((byte)*s < NCC_MAX) ? NCC_CHARS[(byte)*s] : *s;
 145           *d++ = '%';
 146           *d++ = enhex(val >> 4);
 147           *d++ = enhex(val & 0x0f);
 148           s++;
 149         }
 150     }
 151   *d = 0;
 152   return 0;
 153 }
 154
 155 int
 156 url_enescape_friendly(const char *src, char *dest)
 157 {
 158   char *end = dest + MAX_URL_SIZE - 10;
 159   const byte *srcb = src;
 160   while (*srcb)
 161     {
 162       if (dest >= end)
 163         return URL_ERR_TOO_LONG;
 164       if (*srcb < NCC_MAX)
 165         *dest++ = NCC_CHARS[*srcb++];
 166       else if (*srcb >= 0x20 && *srcb < 0x7f)
 167         *dest++ = *srcb++;
 168       else
 169         {
 170           *dest++ = '%';
 171           *dest++ = enhex(*srcb >> 4);
 172           *dest++ = enhex(*srcb++ & 0x0f);
 173         }
 174     }
 175   *dest = 0;
 176   return 0;
 177 }
 178
 179 /* Split an URL (several parts may be copied to the destination buffer) */
 180
 181 char *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
 182 static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
 183
 184 uns
 185 identify_protocol(const char *p)
 186 {
 187   uns i;
 188
 189   for(i=1; i<URL_PROTO_MAX; i++)
 190     if (!strcasecmp(p, url_proto_names[i]))
 191       return i;
 192   return URL_PROTO_UNKNOWN;
 193 }
 194
 195 int
 196 url_split(char *s, struct url *u, char *d)
 197 {
 198   bzero(u, sizeof(struct url));
 199   u->port = ~0;
 200   u->bufend = d + MAX_URL_SIZE - 10;
 201
 202   if (s[0] != '/')                      /* Seek for "protocol:" */
 203     {
 204       char *p = s;
 205       while (*p && Calnum(*p))
 206         p++;
 207       if (p != s && *p == ':')
 208         {
 209           u->protocol = d;
 210           while (s < p)
 211             *d++ = *s++;
 212           *d++ = 0;
 213           u->protoid = identify_protocol(u->protocol);
 214           s++;
 215           if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
 216             {
 217               /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
 218               int len = d - u->protocol;
 219               d -= len;
 220               s -= len;
 221               u->protocol = NULL;
 222               u->protoid = 0;
 223             }
 224         }
 225     }
 226
 227   if (s[0] == '/')                      /* Host spec or absolute path */
 228     {
 229       if (s[1] == '/')                  /* Host spec */
 230         {
 231           char *q, *e;
 232           char *at = NULL;
 233           char *ep;
 234
 235           s += 2;
 236           q = d;
 237           while (*s && *s != '/' && *s != '?')  /* Copy user:passwd@host:port */
 238             {
 239               if (*s != '@')
 240                 *d++ = *s;
 241               else if (!at)
 242                 {
 243                   *d++ = 0;
 244                   at = d;
 245                 }
 246               else                      /* This shouldn't happen with sane URL's, but we need to be sure */
 247                 *d++ = NCC_AT;
 248               s++;
 249             }
 250           *d++ = 0;
 251           if (at)                       /* user:passwd present */
 252             {
 253               u->user = q;
 254               if (e = strchr(q, ':'))
 255                 {
 256                   *e++ = 0;
 257                   u->pass = e;
 258                 }
 259             }
 260           else
 261             at = q;
 262           e = strchr(at, ':');
 263           if (e)                        /* host:port present */
 264             {
 265               uns p;
 266               *e++ = 0;
 267               p = strtoul(e, &ep, 10);
 268               if (ep && *ep || p > 65535)
 269                 return URL_ERR_INVALID_PORT;
 270               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 271                 u->port = p;
 272             }
 273           u->host = at;
 274         }
 275     }
 276
 277   u->rest = s;
 278   u->buf = d;
 279   return 0;
 280 }
 281
 282 /* Normalization according to given base URL */
 283
 284 static uns std_ports[] = URL_DEFPORTS;  /* Default port numbers */
 285
 286 static int
 287 relpath_merge(struct url *u, struct url *b)
 288 {
 289   char *a = u->rest;
 290   char *o = b->rest;
 291   char *d = u->buf;
 292   char *e = u->bufend;
 293   char *p;
 294
 295   if (a[0] == '/')                      /* Absolute path => OK */
 296     return 0;
 297   if (o[0] != '/' && o[0] != '?')
 298     return URL_PATH_UNDERFLOW;
 299
 300   if (!a[0])                            /* Empty URL -> inherit everything */
 301     {
 302       u->rest = b->rest;
 303       return 0;
 304     }
 305
 306   u->rest = d;                          /* We know we'll need to copy the path somewhere else */
 307
 308   if (a[0] == '#')                      /* Another fragment */
 309     {
 310       for(p=o; *p && *p != '#'; p++)
 311         ;
 312       goto copy;
 313     }
 314   if (a[0] == '?')                      /* New query */
 315     {
 316       for(p=o; *p && *p != '#' && *p != '?'; p++)
 317         ;
 318       goto copy;
 319     }
 320   if (a[0] == ';')                      /* Change parameters */
 321     {
 322       for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++)
 323         ;
 324       goto copy;
 325     }
 326
 327   p = NULL;                             /* Copy original path and find the last slash */
 328   while (*o && *o != ';' && *o != '?' && *o != '#')
 329     {
 330       if (d >= e)
 331         return URL_ERR_TOO_LONG;
 332       if ((*d++ = *o++) == '/')
 333         p = d;
 334     }
 335   if (!p)
 336     return URL_ERR_REL_NOTHING;
 337   d = p;
 338
 339   while (*a)
 340     {
 341       if (a[0] == '.')
 342         {
 343           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 344             {
 345               a++;
 346               if (a[0])
 347                 a++;
 348               continue;
 349             }
 350           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 351             {
 352               a += 2;
 353               if (a[0])
 354                 a++;
 355               if (d <= u->buf + 1)
 356                 {
 357                   /*
 358                    * RFC 1808 says we should leave ".." as a path segment, but
 359                    * we intentionally break the rule and refuse the URL.
 360                    */
 361                   if (!url_ignore_underflow)
 362                     return URL_PATH_UNDERFLOW;
 363                 }
 364               else
 365                 {
 366                   d--;                  /* Discard trailing slash */
 367                   while (d[-1] != '/')
 368                     d--;
 369                 }
 370               continue;
 371             }
 372         }
 373       while (a[0] && a[0] != '/')
 374         {
 375           if (d >= e)
 376             return URL_ERR_TOO_LONG;
 377           *d++ = *a++;
 378         }
 379       if (a[0])
 380         *d++ = *a++;
 381     }
 382
 383 okay:
 384   *d++ = 0;
 385   u->buf = d;
 386   return 0;
 387
 388 copy:                                   /* Combine part of old URL with the new one */
 389   while (o < p)
 390     if (d < e)
 391       *d++ = *o++;
 392     else
 393       return URL_ERR_TOO_LONG;
 394   while (*a)
 395     if (d < e)
 396       *d++ = *a++;
 397     else
 398       return URL_ERR_TOO_LONG;
 399   goto okay;
 400 }
 401
 402 int
 403 url_normalize(struct url *u, struct url *b)
 404 {
 405   int err;
 406
 407   /* Basic checks */
 408   if (url_proto_path_flags[u->protoid] && (!u->host || !*u->host) ||
 409       !u->host && u->user ||
 410       !u->user && u->pass ||
 411       !u->rest)
 412     return URL_SYNTAX_ERROR;
 413
 414   if (!u->protocol)
 415     {
 416       /* Now we know it's a relative URL. Do we have any base? */
 417       if (!b || !url_proto_path_flags[b->protoid])
 418         return URL_ERR_REL_NOTHING;
 419       u->protocol = b->protocol;
 420       u->protoid = b->protoid;
 421
 422       /* Reference to the same host */
 423       if (!u->host)
 424         {
 425           u->host = b->host;
 426           u->user = b->user;
 427           u->pass = b->pass;
 428           u->port = b->port;
 429           if (err = relpath_merge(u, b))
 430             return err;
 431         }
 432     }
 433
 434   /* Change path "?" to "/?" because it's the true meaning */
 435   if (u->rest[0] == '?')
 436     {
 437       int l = strlen(u->rest);
 438       if (u->bufend - u->buf < l+1)
 439         return URL_ERR_TOO_LONG;
 440       u->buf[0] = '/';
 441       memcpy(u->buf+1, u->rest, l+1);
 442       u->rest = u->buf;
 443       u->buf += l+2;
 444     }
 445
 446   /* Fill in missing info */
 447   if (u->port == ~0U)
 448     u->port = std_ports[u->protoid];
 449
 450   return 0;
 451 }
 452
 453 /* Name canonicalization */
 454
 455 static void
 456 lowercase(char *b)
 457 {
 458   if (b)
 459     while (*b)
 460       {
 461         if (*b >= 'A' && *b <= 'Z')
 462           *b = *b + 0x20;
 463         b++;
 464       }
 465 }
 466
 467 static void
 468 kill_end_dot(char *b)
 469 {
 470   char *k;
 471
 472   if (b)
 473     {
 474       k = b + strlen(b) - 1;
 475       while (k > b && *k == '.')
 476         *k-- = 0;
 477     }
 478 }
 479
 480 int
 481 url_canonicalize(struct url *u)
 482 {
 483   char *c;
 484
 485   lowercase(u->protocol);
 486   lowercase(u->host);
 487   kill_end_dot(u->host);
 488   if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
 489     u->rest = "/";
 490   if (u->rest && (c = strchr(u->rest, '#')))    /* Kill fragment reference */
 491     *c = 0;
 492   return 0;
 493 }
 494
 495 /* Pack a broken-down URL */
 496
 497 static char *
 498 append(char *d, const char *s, char *e)
 499 {
 500   if (d)
 501     while (*s)
 502       {
 503         if (d >= e)
 504           return NULL;
 505         *d++ = *s++;
 506       }
 507   return d;
 508 }
 509
 510 int
 511 url_pack(struct url *u, char *d)
 512 {
 513   char *e = d + MAX_URL_SIZE - 10;
 514
 515   if (u->protocol)
 516     {
 517       d = append(d, u->protocol, e);
 518       d = append(d, ":", e);
 519       u->protoid = identify_protocol(u->protocol);
 520     }
 521   if (u->host)
 522     {
 523       d = append(d, "//", e);
 524       if (u->user)
 525         {
 526           d = append(d, u->user, e);
 527           if (u->pass)
 528             {
 529               d = append(d, ":", e);
 530               d = append(d, u->pass, e);
 531             }
 532           d = append(d, "@", e);
 533         }
 534       d = append(d, u->host, e);
 535       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 536         {
 537           char z[10];
 538           sprintf(z, "%d", u->port);
 539           d = append(d, ":", e);
 540           d = append(d, z, e);
 541         }
 542     }
 543   if (u->rest)
 544     d = append(d, u->rest, e);
 545   if (!d)
 546     return URL_ERR_TOO_LONG;
 547   *d = 0;
 548   return 0;
 549 }
 550
 551 /* Error messages */
 552
 553 static char *errmsg[] = {
 554   "Something is wrong",
 555   "Too long",
 556   "Invalid character",
 557   "Invalid escape",
 558   "Invalid escaped character",
 559   "Invalid port number",
 560   "Relative URL not allowed",
 561   "Unknown protocol",
 562   "Syntax error",
 563   "Path underflow"
 564 };
 565
 566 char *
 567 url_error(uns err)
 568 {
 569   if (err >= sizeof(errmsg) / sizeof(char *))
 570     err = 0;
 571   return errmsg[err];
 572 }
 573
 574 /* Standard cookbook recipes */
 575
 576 int
 577 url_canon_split_rel(const char *u, char *buf1, char *buf2, struct url *url, struct url *base)
 578 {
 579   int err;
 580
 581   if (err = url_deescape(u, buf1))
 582     return err;
 583   if (err = url_split(buf1, url, buf2))
 584     return err;
 585   if (err = url_normalize(url, base))
 586     return err;
 587   return url_canonicalize(url);
 588 }
 589
 590 int
 591 url_auto_canonicalize_rel(const char *src, char *dst, struct url *base)
 592 {
 593   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
 594   int err;
 595   struct url ur;
 596
 597   (void)((err = url_canon_split_rel(src, buf1, buf2, &ur, base)) ||
 598    (err = url_pack(&ur, buf3)) ||
 599    (err = url_enescape(buf3, dst)));
 600   return err;
 601 }
 602
 603 /* Testing */
 604
 605 #ifdef TEST
 606
 607 int main(int argc, char **argv)
 608 {
 609   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
 610   int err;
 611   struct url url, url0;
 612   char *base = "http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment";
 613
 614   if (argc != 2 && argc != 3)
 615     return 1;
 616   if (argc == 3)
 617     base = argv[2];
 618   if (err = url_deescape(argv[1], buf1))
 619     {
 620       printf("deesc: error %d\n", err);
 621       return 1;
 622     }
 623   printf("deesc: %s\n", buf1);
 624   if (err = url_split(buf1, &url, buf2))
 625     {
 626       printf("split: error %d\n", err);
 627       return 1;
 628     }
 629   printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 630   if (err = url_split(base, &url0, buf3))
 631     {
 632       printf("split base: error %d\n", err);
 633       return 1;
 634     }
 635   if (err = url_normalize(&url0, NULL))
 636     {
 637       printf("normalize base: error %d\n", err);
 638       return 1;
 639     }
 640   printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest);
 641   if (err = url_normalize(&url, &url0))
 642     {
 643       printf("normalize: error %d\n", err);
 644       return 1;
 645     }
 646   printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 647   if (err = url_canonicalize(&url))
 648     {
 649       printf("canonicalize: error %d\n", err);
 650       return 1;
 651     }
 652   printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 653   if (err = url_pack(&url, buf4))
 654     {
 655       printf("pack: error %d\n", err);
 656       return 1;
 657     }
 658   printf("pack: %s\n", buf4);
 659   if (err = url_enescape(buf4, buf2))
 660     {
 661       printf("enesc: error %d\n", err);
 662       return 1;
 663     }
 664   printf("enesc: %s\n", buf2);
 665   return 0;
 666 }
 667
 668 #endif
 669
 670 struct component {
 671         const char *start;
 672         int length;
 673         uns count;
 674         u32 hash;
 675 };
 676
 677 static inline u32
 678 hashf(const char *start, int length)
 679 {
 680         u32 hf = length;
 681         while (length-- > 0)
 682                 hf = (hf << 8 | hf >> 24) ^ *start++;
 683         return hf;
 684 }
 685
 686 static inline uns
 687 repeat_count(struct component *comp, uns count, uns len)
 688 {
 689         struct component *orig_comp = comp;
 690         uns found = 0;
 691         while (1)
 692         {
 693                 uns i;
 694                 comp += len;
 695                 count -= len;
 696                 found++;
 697                 if (count < len)
 698                         return found;
 699                 for (i=0; i<len; i++)
 700                         if (comp[i].hash != orig_comp[i].hash
 701                         || comp[i].length != orig_comp[i].length
 702                         || memcmp(comp[i].start, orig_comp[i].start, comp[i].length))
 703                                 return found;
 704         }
 705 }
 706
 707 int
 708 url_has_repeated_component(const char *url)
 709 {
 710         struct component *comp;
 711         uns comps, comp_len, rep_prefix, hash_size, *hash, *next;
 712         const char *c;
 713         uns i, j, k;
 714
 715         for (comps=0, c=url; c; comps++)
 716         {
 717                 c = strpbrk(c, url_component_separators);
 718                 if (c)
 719                         c++;
 720         }
 721         if (comps < url_min_repeat_count && comps <= url_max_occurences)
 722                 return 0;
 723         comp = alloca(comps * sizeof(*comp));
 724         for (i=0, c=url; c; i++)
 725         {
 726                 comp[i].start = c;
 727                 c = strpbrk(c, url_component_separators);
 728                 if (c)
 729                 {
 730                         comp[i].length = c - comp[i].start;
 731                         c++;
 732                 }
 733                 else
 734                         comp[i].length = strlen(comp[i].start);
 735         }
 736         ASSERT(i == comps);
 737         for (i=0; i<comps; i++)
 738                 comp[i].hash = hashf(comp[i].start, comp[i].length);
 739         if (comps > url_max_occurences)
 740         {
 741                 hash_size = next_table_prime(comps);
 742                 hash = alloca(hash_size * sizeof(*hash));
 743                 next = alloca(comps * sizeof(*next));
 744                 memset(hash, 255, hash_size * sizeof(*hash));
 745                 for (i=0; i<comps; i++)
 746                 {
 747                         j = comp[i].hash % hash_size;
 748                         for (k = hash[j]; ~k && (comp[i].hash != comp[k].hash || comp[i].length != comp[k].length ||
 749                             memcmp(comp[k].start, comp[i].start, comp[i].length)); k = next[k]);
 750                         if (!~k)
 751                         {
 752                                 next[i] = hash[j];
 753                                 hash[j] = i;
 754                                 comp[i].count = 1;
 755                         }
 756                         else
 757                         {
 758                                 if (comp[k].count++ >= url_max_occurences)
 759                                         return 1;
 760                         }
 761                 }
 762         }
 763         for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
 764                 for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
 765                         if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)
 766                                 return comp_len;
 767         return 0;
 768 }