ucw/url.c

   1 /*
   2  *      UCW Library -- URL Functions
   3  *
   4  *      (c) 1997--2004 Martin Mares <mj@ucw.cz>
   5  *      (c) 2001--2005 Robert Spalek <robert@ucw.cz>
   6  *
   7  *      This software may be freely distributed and used according to the terms
   8  *      of the GNU Lesser General Public License.
   9  *
  10  *      The URL syntax corresponds to RFC 2396 with several exceptions:
  11  *
  12  *         o  Escaping of special characters still follows RFC 1738.
  13  *         o  Interpretation of path parameters follows RFC 1808.
  14  *
  15  *      XXX: The buffer handling in this module is really horrible, but it works.
  16  */
  17
  18 #include "ucw/lib.h"
  19 #include "ucw/url.h"
  20 #include "ucw/chartype.h"
  21 #include "ucw/conf.h"
  22 #include "ucw/prime.h"
  23
  24 #include <string.h>
  25 #include <stdlib.h>
  26 #include <stdio.h>
  27 #include <alloca.h>
  28
  29 /* Configuration */
  30
/*
 * Tunables of this module. They are exported to the configuration system
 * as section "URL" by url_init_config() below.
 */

/* Nonzero: url_deescape() drops whitespace at the very start and at the very end of the URL. */
static uns url_ignore_spaces;
/* Nonzero: relpath_merge() silently ignores ".." segments climbing above the root instead of failing. */
static uns url_ignore_underflow;
/* Set of characters delimiting components in url_has_repeated_component(). */
static char *url_component_separators = "";
/* Number of consecutive repetitions of a component sequence which is reported as a repeat. */
static uns url_min_repeat_count = 0x7fffffff;
/* Longest component sequence (in components) examined for repetition; the default 0 disables the search. */
static uns url_max_repeat_length = 0;
/* A component occurring more often than this is reported; the default ~0U can never be exceeded. */
static uns url_max_occurences = ~0U;

/* Mapping of configuration item names to the variables above. */
static struct cf_section url_config = {
  CF_ITEMS {
    CF_UNS("IgnoreSpaces", &url_ignore_spaces),
    CF_UNS("IgnoreUnderflow", &url_ignore_underflow),
    CF_STRING("ComponentSeparators", &url_component_separators),
    CF_UNS("MinRepeatCount", &url_min_repeat_count),
    CF_UNS("MaxRepeatLength", &url_max_repeat_length),
    CF_UNS("MaxOccurences", &url_max_occurences),
    CF_END
  }
};

/*
 * Declare the "URL" configuration section.
 * NOTE(review): CONSTRUCTOR presumably makes this run automatically at program start-up -- confirm in ucw/lib.h.
 */
static void CONSTRUCTOR url_init_config(void)
{
  cf_declare_section("URL", &url_config, 0);
}
  54
  55 /* Escaping and de-escaping */
  56
  57 static uns
  58 enhex(uns x)
  59 {
  60   return (x<10) ? (x + '0') : (x - 10 + 'A');
  61 }
  62
  63 int
  64 url_deescape(const char *s, char *d)
  65 {
  66   char *dstart = d;
  67   char *end = d + MAX_URL_SIZE - 10;
  68   while (*s)
  69     {
  70       if (d >= end)
  71         return URL_ERR_TOO_LONG;
  72       if (*s == '%')
  73         {
  74           unsigned int val;
  75           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  76             return URL_ERR_INVALID_ESCAPE;
  77           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  78           if (val < 0x20)
  79             return URL_ERR_INVALID_ESCAPED_CHAR;
  80           switch (val)
  81             {
  82             case ';':
  83               val = NCC_SEMICOLON; break;
  84             case '/':
  85               val = NCC_SLASH; break;
  86             case '?':
  87               val = NCC_QUEST; break;
  88             case ':':
  89               val = NCC_COLON; break;
  90             case '@':
  91               val = NCC_AT; break;
  92             case '=':
  93               val = NCC_EQUAL; break;
  94             case '&':
  95               val = NCC_AND; break;
  96             case '#':
  97               val = NCC_HASH; break;
  98             }
  99           *d++ = val;
 100           s += 3;
 101         }
 102       else if ((byte) *s > 0x20)
 103         *d++ = *s++;
 104       else if (Cspace(*s))
 105         {
 106           const char *s0 = s;
 107           while (Cspace(*s))
 108             s++;
 109           if (!url_ignore_spaces || !(!*s || d == dstart))
 110             {
 111               while (Cspace(*s0))
 112                 {
 113                   if (d >= end)
 114                     return URL_ERR_TOO_LONG;
 115                   *d++ = *s0++;
 116                 }
 117             }
 118         }
 119       else
 120         return URL_ERR_INVALID_CHAR;
 121     }
 122   *d = 0;
 123   return 0;
 124 }
 125
 126 int
 127 url_enescape(const char *s, char *d)
 128 {
 129   char *end = d + MAX_URL_SIZE - 10;
 130   unsigned int c;
 131
 132   while (c = *s)
 133     {
 134       if (d >= end)
 135         return URL_ERR_TOO_LONG;
 136       if (Calnum(c) ||                                                  /* RFC 1738(2.2): Only alphanumerics ... */
 137           c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||   /* ... and several other exceptions ... */
 138           c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
 139           c == ',' ||
 140           c == '/' || c == '?' || c == ':' || c == '@' ||               /* ... and reserved chars used for reserved purpose */
 141           c == '=' || c == '&' || c == '#' || c == ';')
 142         *d++ = *s++;
 143       else
 144         {
 145           uns val = ((byte)*s < NCC_MAX) ? NCC_CHARS[(byte)*s] : *s;
 146           *d++ = '%';
 147           *d++ = enhex(val >> 4);
 148           *d++ = enhex(val & 0x0f);
 149           s++;
 150         }
 151     }
 152   *d = 0;
 153   return 0;
 154 }
 155
 156 int
 157 url_enescape_friendly(const char *src, char *dest)
 158 {
 159   char *end = dest + MAX_URL_SIZE - 10;
 160   const byte *srcb = src;
 161   while (*srcb)
 162     {
 163       if (dest >= end)
 164         return URL_ERR_TOO_LONG;
 165       if (*srcb < NCC_MAX)
 166         *dest++ = NCC_CHARS[*srcb++];
 167       else if (*srcb >= 0x20 && *srcb < 0x7f)
 168         *dest++ = *srcb++;
 169       else
 170         {
 171           *dest++ = '%';
 172           *dest++ = enhex(*srcb >> 4);
 173           *dest++ = enhex(*srcb++ & 0x0f);
 174         }
 175     }
 176   *dest = 0;
 177   return 0;
 178 }
 179
 180 /* Split an URL (several parts may be copied to the destination buffer) */
 181
/* Names of the known protocols, indexed by protocol ID; searched by identify_protocol(). */
char *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
/*
 * Per-protocol flag, indexed by protocol ID. Nonzero means that URLs of the
 * protocol must carry a "//host" part (see url_split() and url_normalize()).
 */
static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
 184
 185 uns
 186 identify_protocol(const char *p)
 187 {
 188   uns i;
 189
 190   for(i=1; i<URL_PROTO_MAX; i++)
 191     if (!strcasecmp(p, url_proto_names[i]))
 192       return i;
 193   return URL_PROTO_UNKNOWN;
 194 }
 195
 196 int
 197 url_split(char *s, struct url *u, char *d)
 198 {
 199   bzero(u, sizeof(struct url));
 200   u->port = ~0;
 201   u->bufend = d + MAX_URL_SIZE - 10;
 202
 203   if (s[0] != '/')                      /* Seek for "protocol:" */
 204     {
 205       char *p = s;
 206       while (*p && Calnum(*p))
 207         p++;
 208       if (p != s && *p == ':')
 209         {
 210           u->protocol = d;
 211           while (s < p)
 212             *d++ = *s++;
 213           *d++ = 0;
 214           u->protoid = identify_protocol(u->protocol);
 215           s++;
 216           if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
 217             {
 218               /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
 219               int len = d - u->protocol;
 220               d -= len;
 221               s -= len;
 222               u->protocol = NULL;
 223               u->protoid = 0;
 224             }
 225         }
 226     }
 227
 228   if (s[0] == '/')                      /* Host spec or absolute path */
 229     {
 230       if (s[1] == '/')                  /* Host spec */
 231         {
 232           char *q, *e;
 233           char *at = NULL;
 234           char *ep;
 235
 236           s += 2;
 237           q = d;
 238           while (*s && *s != '/' && *s != '?')  /* Copy user:passwd@host:port */
 239             {
 240               if (*s != '@')
 241                 *d++ = *s;
 242               else if (!at)
 243                 {
 244                   *d++ = 0;
 245                   at = d;
 246                 }
 247               else                      /* This shouldn't happen with sane URL's, but we need to be sure */
 248                 *d++ = NCC_AT;
 249               s++;
 250             }
 251           *d++ = 0;
 252           if (at)                       /* user:passwd present */
 253             {
 254               u->user = q;
 255               if (e = strchr(q, ':'))
 256                 {
 257                   *e++ = 0;
 258                   u->pass = e;
 259                 }
 260             }
 261           else
 262             at = q;
 263           e = strchr(at, ':');
 264           if (e)                        /* host:port present */
 265             {
 266               uns p;
 267               *e++ = 0;
 268               p = strtoul(e, &ep, 10);
 269               if (ep && *ep || p > 65535)
 270                 return URL_ERR_INVALID_PORT;
 271               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 272                 u->port = p;
 273             }
 274           u->host = at;
 275         }
 276     }
 277
 278   u->rest = s;
 279   u->buf = d;
 280   return 0;
 281 }
 282
 283 /* Normalization according to given base URL */
 284
/*
 * Default port numbers, indexed by protocol ID. Used by url_normalize() to
 * fill in a missing port and by url_pack() to omit a port equal to the default.
 */
static uns std_ports[] = URL_DEFPORTS;  /* Default port numbers */
 286
 287 static int
 288 relpath_merge(struct url *u, struct url *b)
 289 {
 290   char *a = u->rest;
 291   char *o = b->rest;
 292   char *d = u->buf;
 293   char *e = u->bufend;
 294   char *p;
 295
 296   if (a[0] == '/')                      /* Absolute path => OK */
 297     return 0;
 298   if (o[0] != '/' && o[0] != '?')
 299     return URL_PATH_UNDERFLOW;
 300
 301   if (!a[0])                            /* Empty URL -> inherit everything */
 302     {
 303       u->rest = b->rest;
 304       return 0;
 305     }
 306
 307   u->rest = d;                          /* We know we'll need to copy the path somewhere else */
 308
 309   if (a[0] == '#')                      /* Another fragment */
 310     {
 311       for(p=o; *p && *p != '#'; p++)
 312         ;
 313       goto copy;
 314     }
 315   if (a[0] == '?')                      /* New query */
 316     {
 317       for(p=o; *p && *p != '#' && *p != '?'; p++)
 318         ;
 319       goto copy;
 320     }
 321   if (a[0] == ';')                      /* Change parameters */
 322     {
 323       for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++)
 324         ;
 325       goto copy;
 326     }
 327
 328   p = NULL;                             /* Copy original path and find the last slash */
 329   while (*o && *o != ';' && *o != '?' && *o != '#')
 330     {
 331       if (d >= e)
 332         return URL_ERR_TOO_LONG;
 333       if ((*d++ = *o++) == '/')
 334         p = d;
 335     }
 336   if (!p)
 337     return URL_ERR_REL_NOTHING;
 338   d = p;
 339
 340   while (*a)
 341     {
 342       if (a[0] == '.')
 343         {
 344           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 345             {
 346               a++;
 347               if (a[0])
 348                 a++;
 349               continue;
 350             }
 351           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 352             {
 353               a += 2;
 354               if (a[0])
 355                 a++;
 356               if (d <= u->buf + 1)
 357                 {
 358                   /*
 359                    * RFC 1808 says we should leave ".." as a path segment, but
 360                    * we intentionally break the rule and refuse the URL.
 361                    */
 362                   if (!url_ignore_underflow)
 363                     return URL_PATH_UNDERFLOW;
 364                 }
 365               else
 366                 {
 367                   d--;                  /* Discard trailing slash */
 368                   while (d[-1] != '/')
 369                     d--;
 370                 }
 371               continue;
 372             }
 373         }
 374       while (a[0] && a[0] != '/')
 375         {
 376           if (d >= e)
 377             return URL_ERR_TOO_LONG;
 378           *d++ = *a++;
 379         }
 380       if (a[0])
 381         *d++ = *a++;
 382     }
 383
 384 okay:
 385   *d++ = 0;
 386   u->buf = d;
 387   return 0;
 388
 389 copy:                                   /* Combine part of old URL with the new one */
 390   while (o < p)
 391     if (d < e)
 392       *d++ = *o++;
 393     else
 394       return URL_ERR_TOO_LONG;
 395   while (*a)
 396     if (d < e)
 397       *d++ = *a++;
 398     else
 399       return URL_ERR_TOO_LONG;
 400   goto okay;
 401 }
 402
 403 int
 404 url_normalize(struct url *u, struct url *b)
 405 {
 406   int err;
 407
 408   /* Basic checks */
 409   if (url_proto_path_flags[u->protoid] && (!u->host || !*u->host) ||
 410       !u->host && u->user ||
 411       !u->user && u->pass ||
 412       !u->rest)
 413     return URL_SYNTAX_ERROR;
 414
 415   if (!u->protocol)
 416     {
 417       /* Now we know it's a relative URL. Do we have any base? */
 418       if (!b || !url_proto_path_flags[b->protoid])
 419         return URL_ERR_REL_NOTHING;
 420       u->protocol = b->protocol;
 421       u->protoid = b->protoid;
 422
 423       /* Reference to the same host */
 424       if (!u->host)
 425         {
 426           u->host = b->host;
 427           u->user = b->user;
 428           u->pass = b->pass;
 429           u->port = b->port;
 430           if (err = relpath_merge(u, b))
 431             return err;
 432         }
 433     }
 434
 435   /* Change path "?" to "/?" because it's the true meaning */
 436   if (u->rest[0] == '?')
 437     {
 438       int l = strlen(u->rest);
 439       if (u->bufend - u->buf < l+1)
 440         return URL_ERR_TOO_LONG;
 441       u->buf[0] = '/';
 442       memcpy(u->buf+1, u->rest, l+1);
 443       u->rest = u->buf;
 444       u->buf += l+2;
 445     }
 446
 447   /* Fill in missing info */
 448   if (u->port == ~0U)
 449     u->port = std_ports[u->protoid];
 450
 451   return 0;
 452 }
 453
 454 /* Name canonicalization */
 455
 456 static void
 457 lowercase(char *b)
 458 {
 459   if (b)
 460     while (*b)
 461       {
 462         if (*b >= 'A' && *b <= 'Z')
 463           *b = *b + 0x20;
 464         b++;
 465       }
 466 }
 467
 468 static void
 469 kill_end_dot(char *b)
 470 {
 471   char *k;
 472
 473   if (b)
 474     {
 475       k = b + strlen(b) - 1;
 476       while (k > b && *k == '.')
 477         *k-- = 0;
 478     }
 479 }
 480
 481 int
 482 url_canonicalize(struct url *u)
 483 {
 484   char *c;
 485
 486   lowercase(u->protocol);
 487   lowercase(u->host);
 488   kill_end_dot(u->host);
 489   if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
 490     u->rest = "/";
 491   if (u->rest && (c = strchr(u->rest, '#')))    /* Kill fragment reference */
 492     *c = 0;
 493   return 0;
 494 }
 495
 496 /* Pack a broken-down URL */
 497
 498 static char *
 499 append(char *d, const char *s, char *e)
 500 {
 501   if (d)
 502     while (*s)
 503       {
 504         if (d >= e)
 505           return NULL;
 506         *d++ = *s++;
 507       }
 508   return d;
 509 }
 510
 511 int
 512 url_pack(struct url *u, char *d)
 513 {
 514   char *e = d + MAX_URL_SIZE - 10;
 515
 516   if (u->protocol)
 517     {
 518       d = append(d, u->protocol, e);
 519       d = append(d, ":", e);
 520       u->protoid = identify_protocol(u->protocol);
 521     }
 522   if (u->host)
 523     {
 524       d = append(d, "//", e);
 525       if (u->user)
 526         {
 527           d = append(d, u->user, e);
 528           if (u->pass)
 529             {
 530               d = append(d, ":", e);
 531               d = append(d, u->pass, e);
 532             }
 533           d = append(d, "@", e);
 534         }
 535       d = append(d, u->host, e);
 536       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 537         {
 538           char z[10];
 539           sprintf(z, "%d", u->port);
 540           d = append(d, ":", e);
 541           d = append(d, z, e);
 542         }
 543     }
 544   if (u->rest)
 545     d = append(d, u->rest, e);
 546   if (!d)
 547     return URL_ERR_TOO_LONG;
 548   *d = 0;
 549   return 0;
 550 }
 551
 552 /* Error messages */
 553
/*
 * Human-readable error messages, indexed by the error code; see url_error().
 * Entry 0 doubles as the fallback for codes outside the table.
 * NOTE(review): the order presumably follows the URL_ERR_xxx values
 * defined in ucw/url.h -- keep both in sync.
 */
static char *errmsg[] = {
  "Something is wrong",
  "Too long",
  "Invalid character",
  "Invalid escape",
  "Invalid escaped character",
  "Invalid port number",
  "Relative URL not allowed",
  "Unknown protocol",
  "Syntax error",
  "Path underflow"
};
 566
 567 char *
 568 url_error(uns err)
 569 {
 570   if (err >= sizeof(errmsg) / sizeof(char *))
 571     err = 0;
 572   return errmsg[err];
 573 }
 574
 575 /* Standard cookbook recipes */
 576
 577 int
 578 url_canon_split_rel(const char *u, char *buf1, char *buf2, struct url *url, struct url *base)
 579 {
 580   int err;
 581
 582   if (err = url_deescape(u, buf1))
 583     return err;
 584   if (err = url_split(buf1, url, buf2))
 585     return err;
 586   if (err = url_normalize(url, base))
 587     return err;
 588   return url_canonicalize(url);
 589 }
 590
 591 int
 592 url_auto_canonicalize_rel(const char *src, char *dst, struct url *base)
 593 {
 594   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
 595   int err;
 596   struct url ur;
 597
 598   (void)((err = url_canon_split_rel(src, buf1, buf2, &ur, base)) ||
 599    (err = url_pack(&ur, buf3)) ||
 600    (err = url_enescape(buf3, dst)));
 601   return err;
 602 }
 603
 604 /* Testing */
 605
 606 #ifdef TEST
 607
 608 int main(int argc, char **argv)
 609 {
 610   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
 611   int err;
 612   struct url url, url0;
 613   char *base = "http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment";
 614
 615   if (argc != 2 && argc != 3)
 616     return 1;
 617   if (argc == 3)
 618     base = argv[2];
 619   if (err = url_deescape(argv[1], buf1))
 620     {
 621       printf("deesc: error %d\n", err);
 622       return 1;
 623     }
 624   printf("deesc: %s\n", buf1);
 625   if (err = url_split(buf1, &url, buf2))
 626     {
 627       printf("split: error %d\n", err);
 628       return 1;
 629     }
 630   printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 631   if (err = url_split(base, &url0, buf3))
 632     {
 633       printf("split base: error %d\n", err);
 634       return 1;
 635     }
 636   if (err = url_normalize(&url0, NULL))
 637     {
 638       printf("normalize base: error %d\n", err);
 639       return 1;
 640     }
 641   printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest);
 642   if (err = url_normalize(&url, &url0))
 643     {
 644       printf("normalize: error %d\n", err);
 645       return 1;
 646     }
 647   printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 648   if (err = url_canonicalize(&url))
 649     {
 650       printf("canonicalize: error %d\n", err);
 651       return 1;
 652     }
 653   printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 654   if (err = url_pack(&url, buf4))
 655     {
 656       printf("pack: error %d\n", err);
 657       return 1;
 658     }
 659   printf("pack: %s\n", buf4);
 660   if (err = url_enescape(buf4, buf2))
 661     {
 662       printf("enesc: error %d\n", err);
 663       return 1;
 664     }
 665   printf("enesc: %s\n", buf2);
 666   return 0;
 667 }
 668
 669 #endif
 670
/* One component of a URL as seen by url_has_repeated_component(). */
struct component {
        const char *start;      /* First character of the component inside the URL (not zero-terminated) */
        int length;             /* Length in bytes, the separator is not included */
        uns count;              /* Number of occurrences; kept only in the first occurrence of each distinct component */
        u32 hash;               /* hashf() of the component, used to speed up comparisons */
};
 677
 678 static inline u32
 679 hashf(const char *start, int length)
 680 {
 681         u32 hf = length;
 682         while (length-- > 0)
 683                 hf = (hf << 8 | hf >> 24) ^ *start++;
 684         return hf;
 685 }
 686
 687 static inline uns
 688 repeat_count(struct component *comp, uns count, uns len)
 689 {
 690         struct component *orig_comp = comp;
 691         uns found = 0;
 692         while (1)
 693         {
 694                 uns i;
 695                 comp += len;
 696                 count -= len;
 697                 found++;
 698                 if (count < len)
 699                         return found;
 700                 for (i=0; i<len; i++)
 701                         if (comp[i].hash != orig_comp[i].hash
 702                         || comp[i].length != orig_comp[i].length
 703                         || memcmp(comp[i].start, orig_comp[i].start, comp[i].length))
 704                                 return found;
 705         }
 706 }
 707
/*
 * Check whether the URL contains suspiciously repeated components, which
 * is typical for URLs produced by broken relative links or spider traps.
 *
 * The URL is cut to components at the characters listed in
 * url_component_separators. Two tests are then performed:
 *
 *   o  If some component occurs more than url_max_occurences times
 *      (anywhere in the URL), 1 is returned.
 *   o  If a sequence of at most url_max_repeat_length components is
 *      repeated at least url_min_repeat_count times in a row, the length
 *      of the sequence (in components) is returned.
 *
 * Otherwise the result is 0. Beware that the return value 1 is ambiguous:
 * it is produced by the first test as well as by a repeated sequence of
 * length 1.
 *
 * All temporary arrays are allocated by alloca(), so the stack usage grows
 * with the number of components.
 */
int
url_has_repeated_component(const char *url)
{
        struct component *comp;
        uns comps, comp_len, rep_prefix, hash_size, *hash, *next;
        const char *c;
        uns i, j, k;

        /* Pass 1: count the components (a URL without separators is a single component) */
        for (comps=0, c=url; c; comps++)
        {
                c = strpbrk(c, url_component_separators);
                if (c)
                        c++;
        }
        /* Too few components for any of the two tests to trigger */
        if (comps < url_min_repeat_count && comps <= url_max_occurences)
                return 0;
        /* Pass 2: record the start and length of each component */
        comp = alloca(comps * sizeof(*comp));
        for (i=0, c=url; c; i++)
        {
                comp[i].start = c;
                c = strpbrk(c, url_component_separators);
                if (c)
                {
                        comp[i].length = c - comp[i].start;
                        c++;
                }
                else
                        comp[i].length = strlen(comp[i].start);
        }
        ASSERT(i == comps);
        for (i=0; i<comps; i++)
                comp[i].hash = hashf(comp[i].start, comp[i].length);
        /* Test 1: count occurrences of distinct components; it cannot trigger unless comps > url_max_occurences */
        if (comps > url_max_occurences)
        {
                /*
                 * A chained hash table: hash[] holds the index of the head
                 * component of each bucket, next[] links the components
                 * within a bucket. ~0U (produced by the memset with 255)
                 * marks the end of a chain.
                 */
                hash_size = next_table_prime(comps);
                hash = alloca(hash_size * sizeof(*hash));
                next = alloca(comps * sizeof(*next));
                memset(hash, 255, hash_size * sizeof(*hash));
                for (i=0; i<comps; i++)
                {
                        j = comp[i].hash % hash_size;
                        /* Walk the chain until an equal component is found or the chain ends (k == ~0U) */
                        for (k = hash[j]; ~k && (comp[i].hash != comp[k].hash || comp[i].length != comp[k].length ||
                            memcmp(comp[k].start, comp[i].start, comp[i].length)); k = next[k]);
                        if (!~k)
                        {
                                /* First occurrence: put it to the head of the bucket */
                                next[i] = hash[j];
                                hash[j] = i;
                                comp[i].count = 1;
                        }
                        else
                        {
                                /* Another occurrence: the counter lives in the first one */
                                if (comp[k].count++ >= url_max_occurences)
                                        return 1;
                        }
                }
        }
        /* Test 2: try every sequence length and every starting position */
        for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
                for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
                        if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)
                                return comp_len;
        return 0;
}