ucw/url.c

   1 /*
   2  *      UCW Library -- URL Functions
   3  *
   4  *      (c) 1997--2004 Martin Mares <mj@ucw.cz>
   5  *      (c) 2001--2005 Robert Spalek <robert@ucw.cz>
   6  *
   7  *      This software may be freely distributed and used according to the terms
   8  *      of the GNU Lesser General Public License.
   9  *
  10  *      XXX: The buffer handling in this module is really horrible, but it works.
  11  */
  12
  13 #include <ucw/lib.h>
  14 #include <ucw/url.h>
  15 #include <ucw/chartype.h>
  16 #include <ucw/conf.h>
  17 #include <ucw/prime.h>
  18
  19 #include <string.h>
  20 #include <stdlib.h>
  21 #include <stdio.h>
  22 #include <alloca.h>
  23
  24 /* Configuration */
  25
  26 static uint url_ignore_spaces;
  27 static uint url_ignore_underflow;
  28 static char *url_component_separators = "";
  29 static uint url_min_repeat_count = 0x7fffffff;
  30 static uint url_max_repeat_length = 0;
  31 static uint url_max_occurences = ~0U;
  32
  33 #ifndef TEST
  34 static struct cf_section url_config = {
  35   CF_ITEMS {
  36     CF_UINT("IgnoreSpaces", &url_ignore_spaces),
  37     CF_UINT("IgnoreUnderflow", &url_ignore_underflow),
  38     CF_STRING("ComponentSeparators", &url_component_separators),
  39     CF_UINT("MinRepeatCount", &url_min_repeat_count),
  40     CF_UINT("MaxRepeatLength", &url_max_repeat_length),
  41     CF_UINT("MaxOccurences", &url_max_occurences),
  42     CF_END
  43   }
  44 };
  45
  46 static void CONSTRUCTOR url_init_config(void)
  47 {
  48   cf_declare_section("URL", &url_config, 0);
  49 }
  50 #endif
  51
  52 /* Escaping and de-escaping */
  53
  54 static uint
  55 enhex(uint x)
  56 {
  57   return (x<10) ? (x + '0') : (x - 10 + 'A');
  58 }
  59
  60 int
  61 url_deescape(const char *s, char *d)
  62 {
  63   char *dstart = d;
  64   char *end = d + MAX_URL_SIZE - 10;
  65   while (*s)
  66     {
  67       if (d >= end)
  68         return URL_ERR_TOO_LONG;
  69       if (*s == '%')
  70         {
  71           uint val;
  72           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  73             return URL_ERR_INVALID_ESCAPE;
  74           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  75           if (val < 0x20)
  76             return URL_ERR_INVALID_ESCAPED_CHAR;
  77           switch (val)
  78             {
  79             case ';':
  80               val = NCC_SEMICOLON; break;
  81             case '/':
  82               val = NCC_SLASH; break;
  83             case '?':
  84               val = NCC_QUEST; break;
  85             case ':':
  86               val = NCC_COLON; break;
  87             case '@':
  88               val = NCC_AT; break;
  89             case '=':
  90               val = NCC_EQUAL; break;
  91             case '&':
  92               val = NCC_AND; break;
  93             case '#':
  94               val = NCC_HASH; break;
  95             case '$':
  96               val = NCC_DOLLAR; break;
  97             case '+':
  98               val = NCC_PLUS; break;
  99             case ',':
 100               val = NCC_COMMA; break;
 101             }
 102           *d++ = val;
 103           s += 3;
 104         }
 105       else if ((byte) *s > 0x20)
 106         *d++ = *s++;
 107       else if (Cspace(*s))
 108         {
 109           const char *s0 = s;
 110           while (Cspace(*s))
 111             s++;
 112           if (!url_ignore_spaces || !(!*s || d == dstart))
 113             {
 114               while (Cspace(*s0))
 115                 {
 116                   if (d >= end)
 117                     return URL_ERR_TOO_LONG;
 118                   *d++ = *s0++;
 119                 }
 120             }
 121         }
 122       else
 123         return URL_ERR_INVALID_CHAR;
 124     }
 125   *d = 0;
 126   return 0;
 127 }
 128
 129 int
 130 url_enescape(const char *s, char *d)
 131 {
 132   char *end = d + MAX_URL_SIZE - 10;
 133   uint c;
 134
 135   while (c = *s)
 136     {
 137       if (d >= end)
 138         return URL_ERR_TOO_LONG;
 139       if (Calnum(c) ||                                                  /* RFC 2396 (2.1-2.3): Only alphanumerics ... */
 140           c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||  /* ... and some exceptions and reserved chars */
 141           c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||
 142           c == ',' || c == '=' || c == '&' || c == '#' || c == ';' ||
 143           c == '/' || c == '?' || c == ':' || c == '@' || c == '~'
 144         )
 145         *d++ = *s++;
 146       else
 147         {
 148           uint val = (byte)(((byte)*s < NCC_MAX) ? NCC_CHARS[(byte)*s] : *s);
 149           *d++ = '%';
 150           *d++ = enhex(val >> 4);
 151           *d++ = enhex(val & 0x0f);
 152           s++;
 153         }
 154     }
 155   *d = 0;
 156   return 0;
 157 }
 158
 159 int
 160 url_enescape_friendly(const char *src, char *dest)
 161 {
 162   char *end = dest + MAX_URL_SIZE - 10;
 163   const byte *srcb = src;
 164   while (*srcb)
 165     {
 166       if (dest >= end)
 167         return URL_ERR_TOO_LONG;
 168       if ((byte)*srcb < NCC_MAX)
 169         *dest++ = NCC_CHARS[*srcb++];
 170       else if (*srcb >= 0x20 && *srcb < 0x7f)
 171         *dest++ = *srcb++;
 172       else
 173         {
 174           *dest++ = '%';
 175           *dest++ = enhex((byte)*srcb >> 4);
 176           *dest++ = enhex(*srcb++ & 0x0f);
 177         }
 178     }
 179   *dest = 0;
 180   return 0;
 181 }
 182
 183 /* Split an URL (several parts may be copied to the destination buffer) */
 184
 185 char *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
 186 static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
 187
 188 uint
 189 url_identify_protocol(const char *p)
 190 {
 191   uint i;
 192
 193   for(i=1; i<URL_PROTO_MAX; i++)
 194     if (!strcasecmp(p, url_proto_names[i]))
 195       return i;
 196   return URL_PROTO_UNKNOWN;
 197 }
 198
 199 int
 200 url_split(char *s, struct url *u, char *d)
 201 {
 202   bzero(u, sizeof(struct url));
 203   u->port = ~0;
 204   u->bufend = d + MAX_URL_SIZE - 10;
 205
 206   if (s[0] != '/')                      /* Seek for "protocol:" */
 207     {
 208       char *p = s;
 209       while (*p && Calnum(*p))
 210         p++;
 211       if (p != s && *p == ':')
 212         {
 213           u->protocol = d;
 214           while (s < p)
 215             *d++ = *s++;
 216           *d++ = 0;
 217           u->protoid = url_identify_protocol(u->protocol);
 218           s++;
 219           if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
 220             {
 221               /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
 222               int len = d - u->protocol;
 223               d -= len;
 224               s -= len;
 225               u->protocol = NULL;
 226               u->protoid = 0;
 227             }
 228         }
 229     }
 230
 231   if (s[0] == '/')                      /* Host spec or absolute path */
 232     {
 233       if (s[1] == '/')                  /* Host spec */
 234         {
 235           char *q, *e;
 236           char *at = NULL;
 237           char *ep;
 238
 239           s += 2;
 240           q = d;
 241           while (*s && *s != '/' && *s != '?')  /* Copy user:passwd@host:port */
 242             {
 243               if (*s != '@')
 244                 *d++ = *s;
 245               else if (!at)
 246                 {
 247                   *d++ = 0;
 248                   at = d;
 249                 }
 250               else                      /* This shouldn't happen with sane URL's, but we need to be sure */
 251                 *d++ = NCC_AT;
 252               s++;
 253             }
 254           *d++ = 0;
 255           if (at)                       /* user:passwd present */
 256             {
 257               u->user = q;
 258               if (e = strchr(q, ':'))
 259                 {
 260                   *e++ = 0;
 261                   u->pass = e;
 262                 }
 263             }
 264           else
 265             at = q;
 266           e = strchr(at, ':');
 267           if (e)                        /* host:port present */
 268             {
 269               uint p;
 270               *e++ = 0;
 271               p = strtoul(e, &ep, 10);
 272               if (ep && *ep || p > 65535)
 273                 return URL_ERR_INVALID_PORT;
 274               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 275                 u->port = p;
 276             }
 277           u->host = at;
 278         }
 279     }
 280
 281   u->rest = s;
 282   u->buf = d;
 283   return 0;
 284 }
 285
 286 /* Normalization according to given base URL */
 287
 288 static uint std_ports[] = URL_DEFPORTS; /* Default port numbers */
 289
 290 static int
 291 relpath_merge(struct url *u, struct url *b)
 292 {
 293   char *a = u->rest;
 294   char *o = b->rest;
 295   char *d = u->buf;
 296   char *e = u->bufend;
 297   char *p;
 298
 299   if (a[0] == '/')                      /* Absolute path => OK */
 300     return 0;
 301   if (o[0] != '/' && o[0] != '?')
 302     return URL_PATH_UNDERFLOW;
 303
 304   if (!a[0])                            /* Empty URL -> inherit everything */
 305     {
 306       u->rest = b->rest;
 307       return 0;
 308     }
 309
 310   u->rest = d;                          /* We know we'll need to copy the path somewhere else */
 311
 312   if (a[0] == '#')                      /* Another fragment */
 313     {
 314       for(p=o; *p && *p != '#'; p++)
 315         ;
 316       goto copy;
 317     }
 318   if (a[0] == '?')                      /* New query */
 319     {
 320       for(p=o; *p && *p != '#' && *p != '?'; p++)
 321         ;
 322       goto copy;
 323     }
 324
 325   p = NULL;                             /* Copy original path and find the last slash */
 326   while (*o && *o != '?' && *o != '#')
 327     {
 328       if (d >= e)
 329         return URL_ERR_TOO_LONG;
 330       if ((*d++ = *o++) == '/')
 331         p = d;
 332     }
 333   if (!p)
 334     return URL_ERR_REL_NOTHING;
 335   d = p;
 336
 337   while (*a)
 338     {
 339       if (a[0] == '.')
 340         {
 341           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 342             {
 343               a++;
 344               if (a[0])
 345                 a++;
 346               continue;
 347             }
 348           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 349             {
 350               a += 2;
 351               if (a[0])
 352                 a++;
 353               if (d <= u->buf + 1)
 354                 {
 355                   /*
 356                    * RFC 1808 says we should leave ".." as a path segment, but
 357                    * we intentionally break the rule and refuse the URL.
 358                    */
 359                   if (!url_ignore_underflow)
 360                     return URL_PATH_UNDERFLOW;
 361                 }
 362               else
 363                 {
 364                   d--;                  /* Discard trailing slash */
 365                   while (d[-1] != '/')
 366                     d--;
 367                 }
 368               continue;
 369             }
 370         }
 371       while (a[0] && a[0] != '/')
 372         {
 373           if (d >= e)
 374             return URL_ERR_TOO_LONG;
 375           *d++ = *a++;
 376         }
 377       if (a[0])
 378         *d++ = *a++;
 379     }
 380
 381 okay:
 382   *d++ = 0;
 383   u->buf = d;
 384   return 0;
 385
 386 copy:                                   /* Combine part of old URL with the new one */
 387   while (o < p)
 388     if (d < e)
 389       *d++ = *o++;
 390     else
 391       return URL_ERR_TOO_LONG;
 392   while (*a)
 393     if (d < e)
 394       *d++ = *a++;
 395     else
 396       return URL_ERR_TOO_LONG;
 397   goto okay;
 398 }
 399
 400 int
 401 url_normalize(struct url *u, struct url *b)
 402 {
 403   int err;
 404
 405   /* Basic checks */
 406   if (url_proto_path_flags[u->protoid] && (!u->host || !*u->host) ||
 407       !u->host && u->user ||
 408       !u->user && u->pass ||
 409       !u->rest)
 410     return URL_SYNTAX_ERROR;
 411
 412   if (!u->protocol)
 413     {
 414       /* Now we know it's a relative URL. Do we have any base? */
 415       if (!b || !url_proto_path_flags[b->protoid])
 416         return URL_ERR_REL_NOTHING;
 417       u->protocol = b->protocol;
 418       u->protoid = b->protoid;
 419
 420       /* Reference to the same host */
 421       if (!u->host)
 422         {
 423           u->host = b->host;
 424           u->user = b->user;
 425           u->pass = b->pass;
 426           u->port = b->port;
 427           if (err = relpath_merge(u, b))
 428             return err;
 429         }
 430     }
 431
 432   /* Change path "?" to "/?" because it's the true meaning */
 433   if (u->rest[0] == '?')
 434     {
 435       int l = strlen(u->rest);
 436       if (u->bufend - u->buf < l+1)
 437         return URL_ERR_TOO_LONG;
 438       u->buf[0] = '/';
 439       memcpy(u->buf+1, u->rest, l+1);
 440       u->rest = u->buf;
 441       u->buf += l+2;
 442     }
 443
 444   /* Fill in missing info */
 445   if (u->port == ~0U)
 446     u->port = std_ports[u->protoid];
 447
 448   return 0;
 449 }
 450
 451 /* Name canonicalization */
 452
 /* Convert the ASCII letters A-Z of the string `b` to lower case in place; NULL is accepted and ignored. */
 static void
 lowercase(char *b)
 {
   if (!b)
     return;
   for (; *b; b++)
     if ('A' <= *b && *b <= 'Z')
       *b += 'a' - 'A';
 }
 464
 /*
  * Strip trailing dots from the string `b` in place.  The first character is
  * always kept, so "..." becomes "."; NULL is accepted and ignored.
  *
  * Works with a length instead of a pointer to the last character, which
  * would point before the buffer when the string is empty.
  */
 static void
 kill_end_dot(char *b)
 {
   size_t len;

   if (!b)
     return;
   len = strlen(b);
   while (len > 1 && b[len - 1] == '.')
     b[--len] = 0;
 }
 477
 478 int
 479 url_canonicalize(struct url *u)
 480 {
 481   char *c;
 482
 483   lowercase(u->protocol);
 484   lowercase(u->host);
 485   kill_end_dot(u->host);
 486   if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
 487     u->rest = "/";
 488   if (u->rest && (c = strchr(u->rest, '#')))    /* Kill fragment reference */
 489     *c = 0;
 490   return 0;
 491 }
 492
 493 /* Pack a broken-down URL */
 494
 /*
  * Copy the string `s` (without its NUL) to `d`, never writing at or behind
  * `e`.  Returns the new end of the output, or NULL if the string did not
  * fit.  A NULL `d` is passed through, so calls can be chained and the
  * overflow checked only once at the end.
  */
 static char *
 append(char *d, const char *s, char *e)
 {
   if (!d)
     return NULL;
   for (; *s; s++)
     {
       if (d >= e)
         return NULL;
       *d++ = *s;
     }
   return d;
 }
 507
 508 int
 509 url_pack(struct url *u, char *d)
 510 {
 511   char *e = d + MAX_URL_SIZE - 10;
 512
 513   if (u->protocol)
 514     {
 515       d = append(d, u->protocol, e);
 516       d = append(d, ":", e);
 517       u->protoid = url_identify_protocol(u->protocol);
 518     }
 519   if (u->host)
 520     {
 521       d = append(d, "//", e);
 522       if (u->user)
 523         {
 524           d = append(d, u->user, e);
 525           if (u->pass)
 526             {
 527               d = append(d, ":", e);
 528               d = append(d, u->pass, e);
 529             }
 530           d = append(d, "@", e);
 531         }
 532       d = append(d, u->host, e);
 533       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 534         {
 535           char z[10];
 536           sprintf(z, "%d", u->port);
 537           d = append(d, ":", e);
 538           d = append(d, z, e);
 539         }
 540     }
 541   if (u->rest)
 542     d = append(d, u->rest, e);
 543   if (!d)
 544     return URL_ERR_TOO_LONG;
 545   *d = 0;
 546   return 0;
 547 }
 548
 549 /* Error messages */
 550
 /* Messages returned by url_error(), indexed by error code; entry 0 doubles as the fallback. */
 static char *errmsg[] = {
   /* 0 */ "Something is wrong",
   /* 1 */ "Too long",
   /* 2 */ "Invalid character",
   /* 3 */ "Invalid escape",
   /* 4 */ "Invalid escaped character",
   /* 5 */ "Invalid port number",
   /* 6 */ "Relative URL not allowed",
   /* 7 */ "Unknown protocol",
   /* 8 */ "Syntax error",
   /* 9 */ "Path underflow"
 };
 563
 564 char *
 565 url_error(uint err)
 566 {
 567   if (err >= sizeof(errmsg) / sizeof(char *))
 568     err = 0;
 569   return errmsg[err];
 570 }
 571
 572 /* Standard cookbook recipes */
 573
 /*
  * Recipe: de-escape `u` into `buf1`, split it into `url` (using `buf2` as
  * scratch space), normalize it relative to `base` and canonicalize it.
  * Stops at the first failing step and returns its error code; 0 on success.
  */
 int
 url_canon_split_rel(const char *u, char *buf1, char *buf2, struct url *url, struct url *base)
 {
   int err = url_deescape(u, buf1);

   if (!err)
     err = url_split(buf1, url, buf2);
   if (!err)
     err = url_normalize(url, base);
   if (!err)
     err = url_canonicalize(url);
   return err;
 }
 587
 588 int
 589 url_auto_canonicalize_rel(const char *src, char *dst, struct url *base)
 590 {
 591   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
 592   int err;
 593   struct url ur;
 594
 595   (void)((err = url_canon_split_rel(src, buf1, buf2, &ur, base)) ||
 596    (err = url_pack(&ur, buf3)) ||
 597    (err = url_enescape(buf3, dst)));
 598   return err;
 599 }
 600
 601 /* Testing */
 602
 #ifdef TEST

 /*
  * Test driver.  argv[1] is the URL to process, the optional argv[2] replaces
  * the built-in base URL.  The URL is taken through de-escaping, splitting,
  * normalization against the base, canonicalization, packing and escaping;
  * the result of every stage is printed.  Exits with 1 on wrong usage or at
  * the first stage which fails.
  */
 int main(int argc, char **argv)
 {
   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
   struct url url, url0;
   char *base = "http://mj@www.hell.org/123/sub_dir;param/index.html;param?query&zzz/sub;query+#fragment?";
   int err;

   if (argc < 2 || argc > 3)
     return 1;
   if (argc == 3)
     base = argv[2];

   err = url_deescape(argv[1], buf1);
   if (err)
     {
       printf("deesc: error %d\n", err);
       return 1;
     }
   printf("deesc: %s\n", buf1);

   err = url_split(buf1, &url, buf2);
   if (err)
     {
       printf("split: error %d\n", err);
       return 1;
     }
   printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);

   /* The base is split and normalized on its own (it has no base itself) */
   err = url_split(base, &url0, buf3);
   if (err)
     {
       printf("split base: error %d\n", err);
       return 1;
     }
   err = url_normalize(&url0, NULL);
   if (err)
     {
       printf("normalize base: error %d\n", err);
       return 1;
     }
   printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest);

   err = url_normalize(&url, &url0);
   if (err)
     {
       printf("normalize: error %d\n", err);
       return 1;
     }
   printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);

   err = url_canonicalize(&url);
   if (err)
     {
       printf("canonicalize: error %d\n", err);
       return 1;
     }
   printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);

   err = url_pack(&url, buf4);
   if (err)
     {
       printf("pack: error %d\n", err);
       return 1;
     }
   printf("pack: %s\n", buf4);

   err = url_enescape(buf4, buf2);
   if (err)
     {
       printf("enesc: error %d\n", err);
       return 1;
     }
   printf("enesc: %s\n", buf2);
   return 0;
 }

 #endif
 667
 668 struct component {
 669         const char *start;
 670         int length;
 671         uint count;
 672         u32 hash;
 673 };
 674
 675 static inline u32
 676 hashf(const char *start, int length)
 677 {
 678         u32 hf = length;
 679         while (length-- > 0)
 680                 hf = (hf << 8 | hf >> 24) ^ *start++;
 681         return hf;
 682 }
 683
 684 static inline uint
 685 repeat_count(struct component *comp, uint count, uint len)
 686 {
 687         struct component *orig_comp = comp;
 688         uint found = 0;
 689         while (1)
 690         {
 691                 uint i;
 692                 comp += len;
 693                 count -= len;
 694                 found++;
 695                 if (count < len)
 696                         return found;
 697                 for (i=0; i<len; i++)
 698                         if (comp[i].hash != orig_comp[i].hash
 699                         || comp[i].length != orig_comp[i].length
 700                         || memcmp(comp[i].start, orig_comp[i].start, comp[i].length))
 701                                 return found;
 702         }
 703 }
 704
 705 int
 706 url_has_repeated_component(const char *url)
 707 {
 708         struct component *comp;
 709         uint comps, comp_len, rep_prefix, hash_size, *hash, *next;
 710         const char *c;
 711         uint i, j, k;
 712
 713         for (comps=0, c=url; c; comps++)
 714         {
 715                 c = strpbrk(c, url_component_separators);
 716                 if (c)
 717                         c++;
 718         }
 719         if (comps < url_min_repeat_count && comps <= url_max_occurences)
 720                 return 0;
 721         comp = alloca(comps * sizeof(*comp));
 722         for (i=0, c=url; c; i++)
 723         {
 724                 comp[i].start = c;
 725                 c = strpbrk(c, url_component_separators);
 726                 if (c)
 727                 {
 728                         comp[i].length = c - comp[i].start;
 729                         c++;
 730                 }
 731                 else
 732                         comp[i].length = strlen(comp[i].start);
 733         }
 734         ASSERT(i == comps);
 735         for (i=0; i<comps; i++)
 736                 comp[i].hash = hashf(comp[i].start, comp[i].length);
 737         if (comps > url_max_occurences)
 738         {
 739                 hash_size = next_table_prime(comps);
 740                 hash = alloca(hash_size * sizeof(*hash));
 741                 next = alloca(comps * sizeof(*next));
 742                 memset(hash, 255, hash_size * sizeof(*hash));
 743                 for (i=0; i<comps; i++)
 744                 {
 745                         j = comp[i].hash % hash_size;
 746                         for (k = hash[j]; ~k && (comp[i].hash != comp[k].hash || comp[i].length != comp[k].length ||
 747                             memcmp(comp[k].start, comp[i].start, comp[i].length)); k = next[k]);
 748                         if (!~k)
 749                         {
 750                                 next[i] = hash[j];
 751                                 hash[j] = i;
 752                                 comp[i].count = 1;
 753                         }
 754                         else
 755                         {
 756                                 if (comp[k].count++ >= url_max_occurences)
 757                                         return 1;
 758                         }
 759                 }
 760         }
 761         for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
 762                 for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
 763                         if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)
 764                                 return comp_len;
 765         return 0;
 766 }