lib/url.c

   1 /*
   2  *      UCW Library -- URL Functions
   3  *
   4  *      (c) 1997--2004 Martin Mares <mj@ucw.cz>
   5  *      (c) 2001--2005 Robert Spalek <robert@ucw.cz>
   6  *
   7  *      This software may be freely distributed and used according to the terms
   8  *      of the GNU Lesser General Public License.
   9  *
  10  *      The URL syntax corresponds to RFC 2396 with several exceptions:
  11  *
  12  *         o  Escaping of special characters still follows RFC 1738.
  13  *         o  Interpretation of path parameters follows RFC 1808.
  14  *
  15  *      XXX: The buffer handling in this module is really horrible, but it works.
  16  */
  17
  18 #include "lib/lib.h"
  19 #include "lib/url.h"
  20 #include "lib/chartype.h"
  21 #include "lib/conf.h"
  22
  23 #include <string.h>
  24 #include <stdlib.h>
  25 #include <stdio.h>
  26 #include <alloca.h>
  27
  28 /* Configuration */
  29
  30 static uns url_ignore_spaces;		/* Nonzero: url_deescape() drops leading and trailing whitespace */
  31 static uns url_ignore_underflow;		/* Nonzero: relpath_merge() tolerates ".." climbing above the root */
  32 static byte *url_component_separators = "";	/* Separator set given to strpbrk() by url_has_repeated_component() */
  33 static uns url_min_repeat_count = 0x7fffffff;	/* Repetitions needed to report a repeat; URLs with fewer components are skipped */
  34 static uns url_max_repeat_length = 0;	/* Longest repeated pattern (in components) searched for; 0 means no search */
  35
  36 static struct cfitem url_config[] = {	/* Table handed to cf_register() by url_init_config() */
  37   { "URL",                              CT_SECTION,     NULL },	/* presumably the name of the config section -- confirm in lib/conf.h */
  38   { "IgnoreSpaces",                     CT_INT,         &url_ignore_spaces },
  39   { "IgnoreUnderflow",                  CT_INT,         &url_ignore_underflow },
  40   { "ComponentSeparators",              CT_STRING,      &url_component_separators },
  41   { "MinRepeatCount",                   CT_INT,         &url_min_repeat_count },
  42   { "MaxRepeatLength",                  CT_INT,         &url_max_repeat_length },
  43   { NULL,                               CT_STOP,        NULL }	/* End-of-table marker */
  44 };
  45
  46 static void CONSTRUCTOR url_init_config(void)
  47 {
  48   cf_register(url_config);
  49 }
  50
  51 /* Escaping and de-escaping */
  52
  53 static uns
  54 enhex(uns x)
  55 {
  56   return (x<10) ? (x + '0') : (x - 10 + 'A');
  57 }
  58
  59 int
  60 url_deescape(byte *s, byte *d)
  61 {
  62   byte *dstart = d;
  63   byte *end = d + MAX_URL_SIZE - 10;
  64   while (*s)
  65     {
  66       if (d >= end)
  67         return URL_ERR_TOO_LONG;
  68       if (*s == '%')
  69         {
  70           unsigned int val;
  71           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  72             return URL_ERR_INVALID_ESCAPE;
  73           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  74           if (val < 0x20)
  75             return URL_ERR_INVALID_ESCAPED_CHAR;
  76           switch (val)
  77             {
  78             case ';':
  79               val = NCC_SEMICOLON; break;
  80             case '/':
  81               val = NCC_SLASH; break;
  82             case '?':
  83               val = NCC_QUEST; break;
  84             case ':':
  85               val = NCC_COLON; break;
  86             case '@':
  87               val = NCC_AT; break;
  88             case '=':
  89               val = NCC_EQUAL; break;
  90             case '&':
  91               val = NCC_AND; break;
  92             case '#':
  93               val = NCC_HASH; break;
  94             }
  95           *d++ = val;
  96           s += 3;
  97         }
  98       else if (*s > 0x20)
  99         *d++ = *s++;
 100       else if (Cspace(*s))
 101         {
 102           byte *s0 = s;
 103           while (Cspace(*s))
 104             s++;
 105           if (!url_ignore_spaces || !(!*s || d == dstart))
 106             {
 107               while (Cspace(*s0))
 108                 {
 109                   if (d >= end)
 110                     return URL_ERR_TOO_LONG;
 111                   *d++ = *s0++;
 112                 }
 113             }
 114         }
 115       else
 116         return URL_ERR_INVALID_CHAR;
 117     }
 118   *d = 0;
 119   return 0;
 120 }
 121
 122 int
 123 url_enescape(byte *s, byte *d)
 124 {
 125   byte *end = d + MAX_URL_SIZE - 10;
 126   unsigned int c;
 127
 128   while (c = *s)
 129     {
 130       if (d >= end)
 131         return URL_ERR_TOO_LONG;
 132       if (Calnum(c) ||                                                  /* RFC 1738(2.2): Only alphanumerics ... */
 133           c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||   /* ... and several other exceptions ... */
 134           c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
 135           c == ',' ||
 136           c == '/' || c == '?' || c == ':' || c == '@' ||               /* ... and reserved chars used for reserved purpose */
 137           c == '=' || c == '&' || c == '#' || c == ';')
 138         *d++ = *s++;
 139       else
 140         {
 141           uns val = (*s < NCC_MAX) ? NCC_CHARS[*s] : *s;
 142           *d++ = '%';
 143           *d++ = enhex(val >> 4);
 144           *d++ = enhex(val & 0x0f);
 145           s++;
 146         }
 147     }
 148   *d = 0;
 149   return 0;
 150 }
 151
 152 int
 153 url_enescape_friendly(byte *src, byte *dest)
 154 {
 155   byte *end = dest + MAX_URL_SIZE - 10;
 156   while (*src)
 157     {
 158       if (dest >= end)
 159         return URL_ERR_TOO_LONG;
 160       if (*src < NCC_MAX)
 161         *dest++ = NCC_CHARS[*src++];
 162       else if (*src < 0x80)
 163         *dest++ = *src++;
 164       else
 165         {
 166           *dest++ = '%';
 167           *dest++ = enhex(*src >> 4);
 168           *dest++ = enhex(*src++ & 0x0f);
 169         }
 170     }
 171   *dest = 0;
 172   return 0;
 173 }
 174
 175 /* Split an URL (several parts may be copied to the destination buffer) */
 176
 177 byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;	/* Protocol names indexed by protocol ID; identify_protocol() searches from index 1 */
 178 static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;	/* Nonzero: protocol needs a "//host" part (checked by url_split() and url_normalize()) */
 179
 180 uns
 181 identify_protocol(byte *p)
 182 {
 183   uns i;
 184
 185   for(i=1; i<URL_PROTO_MAX; i++)
 186     if (!strcasecmp(p, url_proto_names[i]))
 187       return i;
 188   return URL_PROTO_UNKNOWN;
 189 }
 190
 191 int
 192 url_split(byte *s, struct url *u, byte *d)
 193 {
 194   bzero(u, sizeof(struct url));
 195   u->port = ~0;
 196   u->bufend = d + MAX_URL_SIZE - 10;
 197
 198   if (s[0] != '/')                      /* Seek for "protocol:" */
 199     {
 200       byte *p = s;
 201       while (*p && Calnum(*p))
 202         p++;
 203       if (p != s && *p == ':')
 204         {
 205           u->protocol = d;
 206           while (s < p)
 207             *d++ = *s++;
 208           *d++ = 0;
 209           u->protoid = identify_protocol(u->protocol);
 210           s++;
 211           if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
 212             {
 213               /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
 214               int len = d - u->protocol;
 215               d -= len;
 216               s -= len;
 217               u->protocol = NULL;
 218               u->protoid = 0;
 219             }
 220         }
 221     }
 222
 223   if (s[0] == '/')                      /* Host spec or absolute path */
 224     {
 225       if (s[1] == '/')                  /* Host spec */
 226         {
 227           byte *q, *w, *e;
 228           char *ep;
 229
 230           s += 2;
 231           q = d;
 232           while (*s && *s != '/' && *s != '?')  /* Copy user:passwd@host:port */
 233             *d++ = *s++;
 234           *d++ = 0;
 235           w = strchr(q, '@');
 236           if (w)                        /* user:passwd present */
 237             {
 238               *w++ = 0;
 239               u->user = q;
 240               if (e = strchr(q, ':'))
 241                 {
 242                   *e++ = 0;
 243                   u->pass = e;
 244                 }
 245             }
 246           else
 247             w = q;
 248           e = strchr(w, ':');
 249           if (e)                        /* host:port present */
 250             {
 251               uns p;
 252               *e++ = 0;
 253               p = strtoul(e, &ep, 10);
 254               if (ep && *ep || p > 65535)
 255                 return URL_ERR_INVALID_PORT;
 256               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 257                 u->port = p;
 258             }
 259           u->host = w;
 260         }
 261     }
 262
 263   u->rest = s;
 264   u->buf = d;
 265   return 0;
 266 }
 267
 268 /* Normalization according to given base URL */
 269
 270 static uns std_ports[] = URL_DEFPORTS;  /* Default port numbers, indexed by protocol ID; used by url_normalize() and url_pack() */
 271
 272 static int
 273 relpath_merge(struct url *u, struct url *b)
 274 {
       /*
        * Resolve the relative path u->rest against the path of the base URL
        * b->rest.  An absolute path is left untouched and an empty one is
        * replaced by the base path itself.  In all other cases the merged
        * path is built at u->buf (never beyond u->bufend), u->rest is set
        * to point to it and u->buf is advanced past its terminating NUL.
        *
        * Returns 0 on success, otherwise URL_PATH_UNDERFLOW,
        * URL_ERR_TOO_LONG or URL_ERR_REL_NOTHING.
        */
 275   byte *a = u->rest;			/* Relative reference being resolved */
 276   byte *o = b->rest;			/* Path of the base URL */
 277   byte *d = u->buf;			/* Output cursor */
 278   byte *e = u->bufend;			/* Output limit */
 279   byte *p;				/* End of the part of the base we keep */
 280
 281   if (a[0] == '/')                      /* Absolute path => OK */
 282     return 0;
 283   if (o[0] != '/' && o[0] != '?')	/* The base has no path we could merge with */
 284     return URL_PATH_UNDERFLOW;
 285
 286   if (!a[0])                            /* Empty URL -> inherit everything */
 287     {
 288       u->rest = b->rest;
 289       return 0;
 290     }
 291
 292   u->rest = d;                          /* We know we'll need to copy the path somewhere else */
 293
       /*
        * References consisting only of a fragment, query or parameters
        * keep the base up to the corresponding delimiter and append the
        * reference as it is.
        */
 294   if (a[0] == '#')                      /* Another fragment */
 295     {
 296       for(p=o; *p && *p != '#'; p++)
 297         ;
 298       goto copy;
 299     }
 300   if (a[0] == '?')                      /* New query */
 301     {
 302       for(p=o; *p && *p != '#' && *p != '?'; p++)
 303         ;
 304       goto copy;
 305     }
 306   if (a[0] == ';')                      /* Change parameters */
 307     {
 308       for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++)
 309         ;
 310       goto copy;
 311     }
 312
 313   p = NULL;                             /* Copy original path and find the last slash */
 314   while (*o && *o != ';' && *o != '?' && *o != '#')
 315     {
 316       if (d >= e)
 317         return URL_ERR_TOO_LONG;
 318       if ((*d++ = *o++) == '/')
 319         p = d;				/* Points just behind the slash in the output */
 320     }
 321   if (!p)				/* No slash in the base path (e.g. it starts with '?') */
 322     return URL_ERR_REL_NOTHING;
 323   d = p;				/* Drop the last segment of the base path */
 324
       /* Append the reference segment by segment, interpreting "." and ".." */
 325   while (*a)
 326     {
 327       if (a[0] == '.')
 328         {
 329           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 330             {
 331               a++;
 332               if (a[0])
 333                 a++;
 334               continue;
 335             }
 336           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 337             {
 338               a += 2;
 339               if (a[0])
 340                 a++;
 341               if (d <= u->buf + 1)	/* Only the leading slash is left */
 342                 {
 343                   /*
 344                    * RFC 1808 says we should leave ".." as a path segment, but
 345                    * we intentionally break the rule and refuse the URL.
 346                    */
 347                   if (!url_ignore_underflow)
 348                     return URL_PATH_UNDERFLOW;
 349                 }
 350               else
 351                 {
 352                   d--;                  /* Discard trailing slash */
 353                   while (d[-1] != '/')	/* ... and the segment in front of it */
 354                     d--;
 355                 }
 356               continue;
 357             }
 358         }
 359       while (a[0] && a[0] != '/')	/* Ordinary segment: copy it */
 360         {
 361           if (d >= e)
 362             return URL_ERR_TOO_LONG;
 363           *d++ = *a++;
 364         }
 365       if (a[0])				/* ... including the slash which ends it */
 366         *d++ = *a++;
 367     }
 368
 369 okay:
 370   *d++ = 0;
 371   u->buf = d;				/* The merged path now occupies the buffer */
 372   return 0;
 373
 374 copy:                                   /* Combine part of old URL with the new one */
 375   while (o < p)
 376     if (d < e)
 377       *d++ = *o++;
 378     else
 379       return URL_ERR_TOO_LONG;
 380   while (*a)
 381     if (d < e)
 382       *d++ = *a++;
 383     else
 384       return URL_ERR_TOO_LONG;
 385   goto okay;
 386 }
 387
 388 int
 389 url_normalize(struct url *u, struct url *b)
 390 {
 391   int err;
 392
 393   /* Basic checks */
 394   if (url_proto_path_flags[u->protoid] && (!u->host || !*u->host) ||
 395       !u->host && u->user ||
 396       !u->user && u->pass ||
 397       !u->rest)
 398     return URL_SYNTAX_ERROR;
 399
 400   if (!u->protocol)
 401     {
 402       /* Now we know it's a relative URL. Do we have any base? */
 403       if (!b || !url_proto_path_flags[b->protoid])
 404         return URL_ERR_REL_NOTHING;
 405       u->protocol = b->protocol;
 406       u->protoid = b->protoid;
 407
 408       /* Reference to the same host */
 409       if (!u->host)
 410         {
 411           u->host = b->host;
 412           u->user = b->user;
 413           u->pass = b->pass;
 414           u->port = b->port;
 415           if (err = relpath_merge(u, b))
 416             return err;
 417         }
 418     }
 419
 420   /* Change path "?" to "/?" because it's the true meaning */
 421   if (u->rest[0] == '?')
 422     {
 423       int l = strlen(u->rest);
 424       if (u->bufend - u->buf < l+1)
 425         return URL_ERR_TOO_LONG;
 426       u->buf[0] = '/';
 427       memcpy(u->buf+1, u->rest, l+1);
 428       u->rest = u->buf;
 429       u->buf += l+2;
 430     }
 431
 432   /* Fill in missing info */
 433   if (u->port == ~0U)
 434     u->port = std_ports[u->protoid];
 435
 436   return 0;
 437 }
 438
 439 /* Name canonicalization */
 440
 441 static void
 442 lowercase(byte *b)
 443 {
 444   if (b)
 445     while (*b)
 446       {
 447         if (*b >= 'A' && *b <= 'Z')
 448           *b = *b + 0x20;
 449         b++;
 450       }
 451 }
 452
 453 static void
 454 kill_end_dot(byte *b)
 455 {
 456   byte *k;
 457
 458   if (b)
 459     {
 460       k = b + strlen(b) - 1;
 461       while (k > b && *k == '.')
 462         *k-- = 0;
 463     }
 464 }
 465
 466 int
 467 url_canonicalize(struct url *u)
 468 {
 469   char *c;
 470
 471   lowercase(u->protocol);
 472   lowercase(u->host);
 473   kill_end_dot(u->host);
 474   if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
 475     u->rest = "/";
 476   if (u->rest && (c = strchr(u->rest, '#')))    /* Kill fragment reference */
 477     *c = 0;
 478   return 0;
 479 }
 480
 481 /* Pack a broken-down URL */
 482
 483 static byte *
 484 append(byte *d, byte *s, byte *e)
 485 {
 486   if (d)
 487     while (*s)
 488       {
 489         if (d >= e)
 490           return NULL;
 491         *d++ = *s++;
 492       }
 493   return d;
 494 }
 495
 496 int
 497 url_pack(struct url *u, byte *d)
 498 {
 499   byte *e = d + MAX_URL_SIZE - 10;
 500
 501   if (u->protocol)
 502     {
 503       d = append(d, u->protocol, e);
 504       d = append(d, ":", e);
 505       u->protoid = identify_protocol(u->protocol);
 506     }
 507   if (u->host)
 508     {
 509       d = append(d, "//", e);
 510       if (u->user)
 511         {
 512           d = append(d, u->user, e);
 513           if (u->pass)
 514             {
 515               d = append(d, ":", e);
 516               d = append(d, u->pass, e);
 517             }
 518           d = append(d, "@", e);
 519         }
 520       d = append(d, u->host, e);
 521       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 522         {
 523           char z[10];
 524           sprintf(z, "%d", u->port);
 525           d = append(d, ":", e);
 526           d = append(d, z, e);
 527         }
 528     }
 529   if (u->rest)
 530     d = append(d, u->rest, e);
 531   if (!d)
 532     return URL_ERR_TOO_LONG;
 533   *d = 0;
 534   return 0;
 535 }
 536
 537 /* Error messages */
 538
 539 static char *errmsg[] = {		/* Messages returned by url_error(), indexed by error code */
 540   "Something is wrong",		/* Entry 0 is also what url_error() returns for out-of-range codes */
 541   "Too long",
 542   "Invalid character",
 543   "Invalid escape",
 544   "Invalid escaped character",
 545   "Invalid port number",
 546   "Relative URL not allowed",
 547   "Unknown protocol",
 548   "Syntax error",
 549   "Path underflow"			/* NOTE(review): the order presumably follows the error codes in lib/url.h -- confirm */
 550 };
 551
 552 char *
 553 url_error(uns err)
 554 {
 555   if (err >= sizeof(errmsg) / sizeof(char *))
 556     err = 0;
 557   return errmsg[err];
 558 }
 559
 560 /* Standard cookbook recipes */
 561
 562 int
 563 url_canon_split_rel(byte *u, byte *buf1, byte *buf2, struct url *url, struct url *base)
 564 {
 565   int err;
 566
 567   if (err = url_deescape(u, buf1))
 568     return err;
 569   if (err = url_split(buf1, url, buf2))
 570     return err;
 571   if (err = url_normalize(url, base))
 572     return err;
 573   return url_canonicalize(url);
 574 }
 575
 576 int
 577 url_auto_canonicalize_rel(byte *src, byte *dst, struct url *base)
 578 {
 579   byte buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
 580   int err;
 581   struct url ur;
 582
 583   (void)((err = url_canon_split_rel(src, buf1, buf2, &ur, base)) ||
 584    (err = url_pack(&ur, buf3)) ||
 585    (err = url_enescape(buf3, dst)));
 586   return err;
 587 }
 588
 589 /* Testing */
 590
 591 #ifdef TEST
 592
 593 int main(int argc, char **argv)
 594 {
 595   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
 596   int err;
 597   struct url url, url0;
 598   char *base = "http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment";
 599
 600   if (argc != 2 && argc != 3)
 601     return 1;
 602   if (argc == 3)
 603     base = argv[2];
 604   if (err = url_deescape(argv[1], buf1))
 605     {
 606       printf("deesc: error %d\n", err);
 607       return 1;
 608     }
 609   printf("deesc: %s\n", buf1);
 610   if (err = url_split(buf1, &url, buf2))
 611     {
 612       printf("split: error %d\n", err);
 613       return 1;
 614     }
 615   printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 616   if (err = url_split(base, &url0, buf3))
 617     {
 618       printf("split base: error %d\n", err);
 619       return 1;
 620     }
 621   if (err = url_normalize(&url0, NULL))
 622     {
 623       printf("normalize base: error %d\n", err);
 624       return 1;
 625     }
 626   printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest);
 627   if (err = url_normalize(&url, &url0))
 628     {
 629       printf("normalize: error %d\n", err);
 630       return 1;
 631     }
 632   printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 633   if (err = url_canonicalize(&url))
 634     {
 635       printf("canonicalize: error %d\n", err);
 636       return 1;
 637     }
 638   printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 639   if (err = url_pack(&url, buf4))
 640     {
 641       printf("pack: error %d\n", err);
 642       return 1;
 643     }
 644   printf("pack: %s\n", buf4);
 645   if (err = url_enescape(buf4, buf2))
 646     {
 647       printf("enesc: error %d\n", err);
 648       return 1;
 649     }
 650   printf("enesc: %s\n", buf2);
 651   return 0;
 652 }
 653
 654 #endif
 655
 656 struct component {			/* One separator-delimited piece of a URL, see url_has_repeated_component() */
 657         byte *start;			/* First byte of the piece inside the URL string (not NUL-terminated) */
 658         int length;			/* Number of bytes, separator excluded */
 659         u32 hash;			/* hashf(start, length), lets repeat_count() reject mismatches quickly */
 660 };
 661
 662 static inline u32
 663 hashf(byte *start, int length)
 664 {
 665         u32 hf = length;
 666         while (length-- > 0)
 667                 hf = (hf << 8 | hf >> 24) ^ *start++;
 668         return hf;
 669 }
 670
 671 static inline uns
 672 repeat_count(struct component *comp, uns count, uns len)
 673 {
 674         struct component *orig_comp = comp;
 675         uns found = 0;
 676         while (1)
 677         {
 678                 uns i;
 679                 comp += len;
 680                 count -= len;
 681                 found++;
 682                 if (count < len)
 683                         return found;
 684                 for (i=0; i<len; i++)
 685                         if (comp[i].hash != orig_comp[i].hash
 686                         || comp[i].length != orig_comp[i].length
 687                         || memcmp(comp[i].start, orig_comp[i].start, comp[i].length))
 688                                 return found;
 689         }
 690 }
 691
 692 int
 693 url_has_repeated_component(byte *url)
 694 {
 695         struct component *comp;
 696         uns comps, comp_len, rep_prefix;
 697         byte *c;
 698         uns i;
 699
 700         for (comps=0, c=url; c; comps++)
 701         {
 702                 c = strpbrk(c, url_component_separators);
 703                 if (c)
 704                         c++;
 705         }
 706         if (comps < url_min_repeat_count)
 707                 return 0;
 708         comp = alloca(comps * sizeof(struct component));
 709         for (i=0, c=url; c; i++)
 710         {
 711                 comp[i].start = c;
 712                 c = strpbrk(c, url_component_separators);
 713                 if (c)
 714                 {
 715                         comp[i].length = c - comp[i].start;
 716                         c++;
 717                 }
 718                 else
 719                         comp[i].length = strlen(comp[i].start);
 720         }
 721         ASSERT(i == comps);
 722         for (i=0; i<comps; i++)
 723                 comp[i].hash = hashf(comp[i].start, comp[i].length);
 724         for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
 725                 for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
 726                         if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)
 727                                 return comp_len;
 728         return 0;
 729 }