lib/url.c

   1 /*
   2  *      Sherlock Library -- URL Functions
   3  *
   4  *      (c) 1997--2004 Martin Mares <mj@ucw.cz>
   5  *      (c) 2001 Robert Spalek <robert@ucw.cz>
   6  *
   7  *      This software may be freely distributed and used according to the terms
   8  *      of the GNU Lesser General Public License.
   9  *
  10  *      The URL syntax corresponds to RFC 2396 with several exceptions:
  11  *
  12  *         o  Escaping of special characters still follows RFC 1738.
  13  *         o  Interpretation of path parameters follows RFC 1808.
  14  *
  15  *      XXX: The buffer handling in this module is really horrible, but it works.
  16  */
  17
  18 #include "lib/lib.h"
  19 #include "lib/url.h"
  20 #include "lib/chartype.h"
  21 #include "lib/conf.h"
  22
  23 #include <string.h>
  24 #include <stdlib.h>
  25 #include <stdio.h>
  26 #include <alloca.h>
  27
  28 /* Configuration */
  29
  30 static uns url_ignore_spaces;
  31 static uns url_ignore_underflow;
  32 static byte *url_component_separators = "";
  33 static uns url_min_repeat_count = 0x7fffffff;
  34 static uns url_max_repeat_length = 0;
  35
  36 static struct cfitem url_config[] = {
  37   { "URL",                              CT_SECTION,     NULL },
  38   { "IgnoreSpaces",                     CT_INT,         &url_ignore_spaces },
  39   { "IgnoreUnderflow",                  CT_INT,         &url_ignore_underflow },
  40   { "ComponentSeparators",              CT_STRING,      &url_component_separators },
  41   { "MinRepeatCount",                   CT_INT,         &url_min_repeat_count },
  42   { "MaxRepeatLength",                  CT_INT,         &url_max_repeat_length },
  43   { NULL,                               CT_STOP,        NULL }
  44 };
  45
  46 static void CONSTRUCTOR url_init_config(void)
  47 {
  48   cf_register(url_config);
  49 }
  50
  51 /* Escaping and de-escaping */
  52
  53 static uns
  54 enhex(uns x)
  55 {
  56   return (x<10) ? (x + '0') : (x - 10 + 'A');
  57 }
  58
  59 int
  60 url_deescape(byte *s, byte *d)
  61 {
  62   byte *dstart = d;
  63   byte *end = d + MAX_URL_SIZE - 10;
  64   while (*s)
  65     {
  66       if (d >= end)
  67         return URL_ERR_TOO_LONG;
  68       if (*s == '%')
  69         {
  70           unsigned int val;
  71           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  72             return URL_ERR_INVALID_ESCAPE;
  73           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  74           if (val < 0x20)
  75             return URL_ERR_INVALID_ESCAPED_CHAR;
  76           switch (val)
  77             {
  78             case ';':
  79               val = NCC_SEMICOLON; break;
  80             case '/':
  81               val = NCC_SLASH; break;
  82             case '?':
  83               val = NCC_QUEST; break;
  84             case ':':
  85               val = NCC_COLON; break;
  86             case '@':
  87               val = NCC_AT; break;
  88             case '=':
  89               val = NCC_EQUAL; break;
  90             case '&':
  91               val = NCC_AND; break;
  92             case '#':
  93               val = NCC_HASH; break;
  94             }
  95           *d++ = val;
  96           s += 3;
  97         }
  98       else if (*s > 0x20)
  99         *d++ = *s++;
 100       else if (Cspace(*s))
 101         {
 102           byte *s0 = s;
 103           while (Cspace(*s))
 104             s++;
 105           if (!url_ignore_spaces || !(!*s || d == dstart))
 106             {
 107               while (Cspace(*s0))
 108                 {
 109                   if (d >= end)
 110                     return URL_ERR_TOO_LONG;
 111                   *d++ = *s0++;
 112                 }
 113             }
 114         }
 115       else
 116         return URL_ERR_INVALID_CHAR;
 117     }
 118   *d = 0;
 119   return 0;
 120 }
 121
 122 int
 123 url_enescape(byte *s, byte *d)
 124 {
 125   byte *end = d + MAX_URL_SIZE - 10;
 126   unsigned int c;
 127
 128   while (c = *s)
 129     {
 130       if (d >= end)
 131         return URL_ERR_TOO_LONG;
 132       if (Calnum(c) ||                                                  /* RFC 1738(2.2): Only alphanumerics ... */
 133           c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||   /* ... and several other exceptions ... */
 134           c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
 135           c == ',' ||
 136           c == '/' || c == '?' || c == ':' || c == '@' ||               /* ... and reserved chars used for reserved purpose */
 137           c == '=' || c == '&' || c == '#' || c == ';')
 138         *d++ = *s++;
 139       else
 140         {
 141           uns val = (*s < NCC_MAX) ? NCC_CHARS[*s] : *s;
 142           *d++ = '%';
 143           *d++ = enhex(val >> 4);
 144           *d++ = enhex(val & 0x0f);
 145           s++;
 146         }
 147     }
 148   *d = 0;
 149   return 0;
 150 }
 151
 152 /* Split an URL (several parts may be copied to the destination buffer) */
 153
 154 byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
 155 static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
 156
 157 uns
 158 identify_protocol(byte *p)
 159 {
 160   uns i;
 161
 162   for(i=1; i<URL_PROTO_MAX; i++)
 163     if (!strcasecmp(p, url_proto_names[i]))
 164       return i;
 165   return URL_PROTO_UNKNOWN;
 166 }
 167
 168 int
 169 url_split(byte *s, struct url *u, byte *d)
 170 {
 171   bzero(u, sizeof(struct url));
 172   u->port = ~0;
 173   u->bufend = d + MAX_URL_SIZE - 10;
 174
 175   if (s[0] != '/')                      /* Seek for "protocol:" */
 176     {
 177       byte *p = s;
 178       while (*p && Calnum(*p))
 179         p++;
 180       if (p != s && *p == ':')
 181         {
 182           u->protocol = d;
 183           while (s < p)
 184             *d++ = *s++;
 185           *d++ = 0;
 186           u->protoid = identify_protocol(u->protocol);
 187           s++;
 188           if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
 189             {
 190               /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
 191               int len = d - u->protocol;
 192               d -= len;
 193               s -= len;
 194               u->protocol = NULL;
 195               u->protoid = 0;
 196             }
 197         }
 198     }
 199
 200   if (s[0] == '/')                      /* Host spec or absolute path */
 201     {
 202       if (s[1] == '/')                  /* Host spec */
 203         {
 204           byte *q, *w, *e;
 205           char *ep;
 206
 207           s += 2;
 208           q = d;
 209           while (*s && *s != '/' && *s != '?')  /* Copy user:passwd@host:port */
 210             *d++ = *s++;
 211           *d++ = 0;
 212           w = strchr(q, '@');
 213           if (w)                        /* user:passwd present */
 214             {
 215               *w++ = 0;
 216               u->user = q;
 217               if (e = strchr(q, ':'))
 218                 {
 219                   *e++ = 0;
 220                   u->pass = e;
 221                 }
 222             }
 223           else
 224             w = q;
 225           e = strchr(w, ':');
 226           if (e)                        /* host:port present */
 227             {
 228               uns p;
 229               *e++ = 0;
 230               p = strtoul(e, &ep, 10);
 231               if (ep && *ep || p > 65535)
 232                 return URL_ERR_INVALID_PORT;
 233               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 234                 u->port = p;
 235             }
 236           u->host = w;
 237         }
 238     }
 239
 240   u->rest = s;
 241   u->buf = d;
 242   return 0;
 243 }
 244
 245 /* Normalization according to given base URL */
 246
 247 static uns std_ports[] = URL_DEFPORTS;  /* Default port numbers */
 248
 249 static int
 250 relpath_merge(struct url *u, struct url *b)
 251 {
 252   byte *a = u->rest;
 253   byte *o = b->rest;
 254   byte *d = u->buf;
 255   byte *e = u->bufend;
 256   byte *p;
 257
 258   if (a[0] == '/')                      /* Absolute path => OK */
 259     return 0;
 260   if (o[0] != '/' && o[0] != '?')
 261     return URL_PATH_UNDERFLOW;
 262
 263   if (!a[0])                            /* Empty URL -> inherit everything */
 264     {
 265       u->rest = b->rest;
 266       return 0;
 267     }
 268
 269   u->rest = d;                          /* We know we'll need to copy the path somewhere else */
 270
 271   if (a[0] == '#')                      /* Another fragment */
 272     {
 273       for(p=o; *p && *p != '#'; p++)
 274         ;
 275       goto copy;
 276     }
 277   if (a[0] == '?')                      /* New query */
 278     {
 279       for(p=o; *p && *p != '#' && *p != '?'; p++)
 280         ;
 281       goto copy;
 282     }
 283   if (a[0] == ';')                      /* Change parameters */
 284     {
 285       for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++)
 286         ;
 287       goto copy;
 288     }
 289
 290   p = NULL;                             /* Copy original path and find the last slash */
 291   while (*o && *o != ';' && *o != '?' && *o != '#')
 292     {
 293       if (d >= e)
 294         return URL_ERR_TOO_LONG;
 295       if ((*d++ = *o++) == '/')
 296         p = d;
 297     }
 298   if (!p)
 299     return URL_ERR_REL_NOTHING;
 300   d = p;
 301
 302   while (*a)
 303     {
 304       if (a[0] == '.')
 305         {
 306           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 307             {
 308               a++;
 309               if (a[0])
 310                 a++;
 311               continue;
 312             }
 313           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 314             {
 315               a += 2;
 316               if (a[0])
 317                 a++;
 318               if (d <= u->buf + 1)
 319                 {
 320                   /*
 321                    * RFC 1808 says we should leave ".." as a path segment, but
 322                    * we intentionally break the rule and refuse the URL.
 323                    */
 324                   if (!url_ignore_underflow)
 325                     return URL_PATH_UNDERFLOW;
 326                 }
 327               else
 328                 {
 329                   d--;                  /* Discard trailing slash */
 330                   while (d[-1] != '/')
 331                     d--;
 332                 }
 333               continue;
 334             }
 335         }
 336       while (a[0] && a[0] != '/')
 337         {
 338           if (d >= e)
 339             return URL_ERR_TOO_LONG;
 340           *d++ = *a++;
 341         }
 342       if (a[0])
 343         *d++ = *a++;
 344     }
 345
 346 okay:
 347   *d++ = 0;
 348   u->buf = d;
 349   return 0;
 350
 351 copy:                                   /* Combine part of old URL with the new one */
 352   while (o < p)
 353     if (d < e)
 354       *d++ = *o++;
 355     else
 356       return URL_ERR_TOO_LONG;
 357   while (*a)
 358     if (d < e)
 359       *d++ = *a++;
 360     else
 361       return URL_ERR_TOO_LONG;
 362   goto okay;
 363 }
 364
 365 int
 366 url_normalize(struct url *u, struct url *b)
 367 {
 368   int err;
 369
 370   /* Basic checks */
 371   if (url_proto_path_flags[u->protoid] && !u->host ||
 372       u->host && !*u->host ||
 373       !u->host && u->user ||
 374       !u->user && u->pass ||
 375       !u->rest)
 376     return URL_SYNTAX_ERROR;
 377
 378   if (!u->protocol)
 379     {
 380       /* Now we know it's a relative URL. Do we have any base? */
 381       if (!b || !url_proto_path_flags[b->protoid])
 382         return URL_ERR_REL_NOTHING;
 383       u->protocol = b->protocol;
 384       u->protoid = b->protoid;
 385
 386       /* Reference to the same host */
 387       if (!u->host)
 388         {
 389           u->host = b->host;
 390           u->user = b->user;
 391           u->pass = b->pass;
 392           u->port = b->port;
 393           if (err = relpath_merge(u, b))
 394             return err;
 395         }
 396     }
 397
 398   /* Change path "?" to "/?" because it's the true meaning */
 399   if (u->rest[0] == '?')
 400     {
 401       int l = strlen(u->rest);
 402       if (u->bufend - u->buf < l+1)
 403         return URL_ERR_TOO_LONG;
 404       u->buf[0] = '/';
 405       memcpy(u->buf+1, u->rest, l+1);
 406       u->rest = u->buf;
 407       u->buf += l+2;
 408     }
 409
 410   /* Fill in missing info */
 411   if (u->port == ~0U)
 412     u->port = std_ports[u->protoid];
 413
 414   return 0;
 415 }
 416
 417 /* Name canonicalization */
 418
 419 static void
 420 lowercase(byte *b)
 421 {
 422   if (b)
 423     while (*b)
 424       {
 425         if (*b >= 'A' && *b <= 'Z')
 426           *b = *b + 0x20;
 427         b++;
 428       }
 429 }
 430
 431 static void
 432 kill_end_dot(byte *b)
 433 {
 434   byte *k;
 435
 436   if (b)
 437     {
 438       k = b + strlen(b) - 1;
 439       while (k > b && *k == '.')
 440         *k-- = 0;
 441     }
 442 }
 443
 444 int
 445 url_canonicalize(struct url *u)
 446 {
 447   char *c;
 448
 449   lowercase(u->protocol);
 450   lowercase(u->host);
 451   kill_end_dot(u->host);
 452   if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
 453     u->rest = "/";
 454   if (u->rest && (c = strchr(u->rest, '#')))    /* Kill fragment reference */
 455     *c = 0;
 456   return 0;
 457 }
 458
 459 /* Pack a broken-down URL */
 460
 461 static byte *
 462 append(byte *d, byte *s, byte *e)
 463 {
 464   if (d)
 465     while (*s)
 466       {
 467         if (d >= e)
 468           return NULL;
 469         *d++ = *s++;
 470       }
 471   return d;
 472 }
 473
 474 int
 475 url_pack(struct url *u, byte *d)
 476 {
 477   byte *e = d + MAX_URL_SIZE - 10;
 478
 479   if (u->protocol)
 480     {
 481       d = append(d, u->protocol, e);
 482       d = append(d, ":", e);
 483       u->protoid = identify_protocol(u->protocol);
 484     }
 485   if (u->host)
 486     {
 487       d = append(d, "//", e);
 488       if (u->user)
 489         {
 490           d = append(d, u->user, e);
 491           if (u->pass)
 492             {
 493               d = append(d, ":", e);
 494               d = append(d, u->pass, e);
 495             }
 496           d = append(d, "@", e);
 497         }
 498       d = append(d, u->host, e);
 499       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 500         {
 501           char z[10];
 502           sprintf(z, "%d", u->port);
 503           d = append(d, ":", e);
 504           d = append(d, z, e);
 505         }
 506     }
 507   if (u->rest)
 508     d = append(d, u->rest, e);
 509   if (!d)
 510     return URL_ERR_TOO_LONG;
 511   *d = 0;
 512   return 0;
 513 }
 514
 515 /* Error messages */
 516
 517 static char *errmsg[] = {
 518   "Something is wrong",
 519   "Too long",
 520   "Invalid character",
 521   "Invalid escape",
 522   "Invalid escaped character",
 523   "Invalid port number",
 524   "Relative URL not allowed",
 525   "Unknown protocol",
 526   "Syntax error",
 527   "Path underflow"
 528 };
 529
 530 char *
 531 url_error(uns err)
 532 {
 533   if (err >= sizeof(errmsg) / sizeof(char *))
 534     err = 0;
 535   return errmsg[err];
 536 }
 537
 538 /* Standard cookbook recipes */
 539
 540 int
 541 url_canon_split(byte *u, byte *buf1, byte *buf2, struct url *url)
 542 {
 543   int err;
 544
 545   if (err = url_deescape(u, buf1))
 546     return err;
 547   if (err = url_split(buf1, url, buf2))
 548     return err;
 549   if (err = url_normalize(url, NULL))
 550     return err;
 551   return url_canonicalize(url);
 552 }
 553
 554 int
 555 url_auto_canonicalize(byte *src, byte *dst)
 556 {
 557   byte buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
 558   int err;
 559   struct url ur;
 560
 561   (void)((err = url_canon_split(src, buf1, buf2, &ur)) ||
 562    (err = url_pack(&ur, buf3)) ||
 563    (err = url_enescape(buf3, dst)));
 564   return err;
 565 }
 566
 567 /* Testing */
 568
 569 #ifdef TEST
 570
 571 int main(int argc, char **argv)
 572 {
 573   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
 574   int err;
 575   struct url url, url0;
 576   char *base = "http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment";
 577
 578   if (argc != 2 && argc != 3)
 579     return 1;
 580   if (argc == 3)
 581     base = argv[2];
 582   if (err = url_deescape(argv[1], buf1))
 583     {
 584       printf("deesc: error %d\n", err);
 585       return 1;
 586     }
 587   printf("deesc: %s\n", buf1);
 588   if (err = url_split(buf1, &url, buf2))
 589     {
 590       printf("split: error %d\n", err);
 591       return 1;
 592     }
 593   printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 594   if (err = url_split(base, &url0, buf3))
 595     {
 596       printf("split base: error %d\n", err);
 597       return 1;
 598     }
 599   if (err = url_normalize(&url0, NULL))
 600     {
 601       printf("normalize base: error %d\n", err);
 602       return 1;
 603     }
 604   printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest);
 605   if (err = url_normalize(&url, &url0))
 606     {
 607       printf("normalize: error %d\n", err);
 608       return 1;
 609     }
 610   printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 611   if (err = url_canonicalize(&url))
 612     {
 613       printf("canonicalize: error %d\n", err);
 614       return 1;
 615     }
 616   printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 617   if (err = url_pack(&url, buf4))
 618     {
 619       printf("pack: error %d\n", err);
 620       return 1;
 621     }
 622   printf("pack: %s\n", buf4);
 623   if (err = url_enescape(buf4, buf2))
 624     {
 625       printf("enesc: error %d\n", err);
 626       return 1;
 627     }
 628   printf("enesc: %s\n", buf2);
 629   return 0;
 630 }
 631
 632 #endif
 633
 634 struct component {
 635         byte *start;
 636         int length;
 637         u32 hash;
 638 };
 639
 640 static inline u32
 641 hashf(byte *start, int length)
 642 {
 643         u32 hf = length;
 644         while (length-- > 0)
 645                 hf = (hf << 8 | hf >> 24) ^ *start++;
 646         return hf;
 647 }
 648
 649 static inline uns
 650 repeat_count(struct component *comp, uns count, uns len)
 651 {
 652         struct component *orig_comp = comp;
 653         uns found = 0;
 654         while (1)
 655         {
 656                 uns i;
 657                 comp += len;
 658                 count -= len;
 659                 found++;
 660                 if (count < len)
 661                         return found;
 662                 for (i=0; i<len; i++)
 663                         if (comp[i].hash != orig_comp[i].hash
 664                         || comp[i].length != orig_comp[i].length
 665                         || memcmp(comp[i].start, orig_comp[i].start, comp[i].length))
 666                                 return found;
 667         }
 668 }
 669
 670 int
 671 url_has_repeated_component(byte *url)
 672 {
 673         struct component *comp;
 674         uns comps, comp_len, rep_prefix;
 675         byte *c;
 676         uns i;
 677
 678         for (comps=0, c=url; c; comps++)
 679         {
 680                 c = strpbrk(c, url_component_separators);
 681                 if (c)
 682                         c++;
 683         }
 684         if (comps < url_min_repeat_count)
 685                 return 0;
 686         comp = alloca(comps * sizeof(struct component));
 687         for (i=0, c=url; c; i++)
 688         {
 689                 comp[i].start = c;
 690                 c = strpbrk(c, url_component_separators);
 691                 if (c)
 692                 {
 693                         comp[i].length = c - comp[i].start;
 694                         c++;
 695                 }
 696                 else
 697                         comp[i].length = strlen(comp[i].start);
 698         }
 699         ASSERT(i == comps);
 700         for (i=0; i<comps; i++)
 701                 comp[i].hash = hashf(comp[i].start, comp[i].length);
 702         for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
 703                 for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
 704                         if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)
 705                                 return comp_len;
 706         return 0;
 707 }