lib/url.c

   1 /*
   2  *      Sherlock Library -- URL Functions
   3  *
   4  *      (c) 1997--2004 Martin Mares <mj@ucw.cz>
   5  *      (c) 2001 Robert Spalek <robert@ucw.cz>
   6  *
   7  *      This software may be freely distributed and used according to the terms
   8  *      of the GNU Lesser General Public License.
   9  *
  10  *      The URL syntax corresponds to RFC 2396 with several exceptions:
  11  *
  12  *         o  Escaping of special characters still follows RFC 1738.
  13  *         o  Interpretation of path parameters follows RFC 1808.
  14  *
  15  *      XXX: The buffer handling in this module is really horrible, but it works.
  16  */
  17
  18 #include "lib/lib.h"
  19 #include "lib/url.h"
  20 #include "lib/chartype.h"
  21 #include "lib/conf.h"
  22
  23 #include <string.h>
  24 #include <stdlib.h>
  25 #include <stdio.h>
  26 #include <alloca.h>
  27
  28 /* Configuration */
  29
/*
 * Tunables of the URL module, settable from the "URL" section of the
 * configuration file (see url_config below).
 */

/* Nonzero: url_deescape() drops whitespace runs at the very start of the output or at the end of the input */
static uns url_ignore_spaces;
/* Nonzero: relpath_merge() silently ignores ".." segments climbing above the root instead of failing */
static uns url_ignore_underflow;
/* Characters which delimit components for url_has_repeated_component() */
static byte *url_component_separators = "";
/* Minimum number of consecutive repetitions reported by url_has_repeated_component() */
static uns url_min_repeat_count = 0x7fffffff;
/* Longest repeated sequence (in components) searched for; the default 0 means no sequence length is ever tried */
static uns url_max_repeat_length = 0;

/* Mapping of configuration item names to the variables above */
static struct cfitem url_config[] = {
  { "URL",                              CT_SECTION,     NULL },
  { "IgnoreSpaces",                     CT_INT,         &url_ignore_spaces },
  { "IgnoreUnderflow",                  CT_INT,         &url_ignore_underflow },
  { "ComponentSeparators",              CT_STRING,      &url_component_separators },
  { "MinRepeatCount",                   CT_INT,         &url_min_repeat_count },
  { "MaxRepeatLength",                  CT_INT,         &url_max_repeat_length },
  { NULL,                               CT_STOP,        NULL }
};

/* Hands the table over to cf_register(); marked CONSTRUCTOR (presumably run automatically at start-up -- confirm in lib/lib.h) */
static void CONSTRUCTOR url_init_config(void)
{
  cf_register(url_config);
}
  50
  51 /* Escaping and de-escaping */
  52
  53 static uns
  54 enhex(uns x)
  55 {
  56   return (x<10) ? (x + '0') : (x - 10 + 'A');
  57 }
  58
  59 int
  60 url_deescape(byte *s, byte *d)
  61 {
  62   byte *dstart = d;
  63   byte *end = d + MAX_URL_SIZE - 10;
  64   while (*s)
  65     {
  66       if (d >= end)
  67         return URL_ERR_TOO_LONG;
  68       if (*s == '%')
  69         {
  70           unsigned int val;
  71           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  72             return URL_ERR_INVALID_ESCAPE;
  73           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  74           if (val < 0x20)
  75             return URL_ERR_INVALID_ESCAPED_CHAR;
  76           switch (val)
  77             {
  78             case ';':
  79               val = NCC_SEMICOLON; break;
  80             case '/':
  81               val = NCC_SLASH; break;
  82             case '?':
  83               val = NCC_QUEST; break;
  84             case ':':
  85               val = NCC_COLON; break;
  86             case '@':
  87               val = NCC_AT; break;
  88             case '=':
  89               val = NCC_EQUAL; break;
  90             case '&':
  91               val = NCC_AND; break;
  92             case '#':
  93               val = NCC_HASH; break;
  94             }
  95           *d++ = val;
  96           s += 3;
  97         }
  98       else if (*s > 0x20)
  99         *d++ = *s++;
 100       else if (Cspace(*s))
 101         {
 102           byte *s0 = s;
 103           while (Cspace(*s))
 104             s++;
 105           if (!url_ignore_spaces || !(!*s || d == dstart))
 106             {
 107               while (Cspace(*s0))
 108                 {
 109                   if (d >= end)
 110                     return URL_ERR_TOO_LONG;
 111                   *d++ = *s0++;
 112                 }
 113             }
 114         }
 115       else
 116         return URL_ERR_INVALID_CHAR;
 117     }
 118   *d = 0;
 119   return 0;
 120 }
 121
 122 int
 123 url_enescape(byte *s, byte *d)
 124 {
 125   byte *end = d + MAX_URL_SIZE - 10;
 126   unsigned int c;
 127
 128   while (c = *s)
 129     {
 130       if (d >= end)
 131         return URL_ERR_TOO_LONG;
 132       if (Calnum(c) ||                                                  /* RFC 1738(2.2): Only alphanumerics ... */
 133           c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||   /* ... and several other exceptions ... */
 134           c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
 135           c == ',' ||
 136           c == '/' || c == '?' || c == ':' || c == '@' ||               /* ... and reserved chars used for reserved purpose */
 137           c == '=' || c == '&' || c == '#' || c == ';')
 138         *d++ = *s++;
 139       else
 140         {
 141           uns val = (*s < NCC_MAX) ? NCC_CHARS[*s] : *s;
 142           *d++ = '%';
 143           *d++ = enhex(val >> 4);
 144           *d++ = enhex(val & 0x0f);
 145           s++;
 146         }
 147     }
 148   *d = 0;
 149   return 0;
 150 }
 151
 152 /* Split an URL (several parts may be copied to the destination buffer) */
 153
/* Names of known protocols, indexed by protocol ID (searched by identify_protocol()) */
byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
/*
 * Per-protocol flag, indexed by protocol ID. When nonzero, URLs of the
 * protocol must carry a non-empty host spec, the protocol can serve as
 * a base for relative URLs and an empty path is canonicalized to "/".
 */
static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
 156
 157 uns
 158 identify_protocol(byte *p)
 159 {
 160   uns i;
 161
 162   for(i=1; i<URL_PROTO_MAX; i++)
 163     if (!strcasecmp(p, url_proto_names[i]))
 164       return i;
 165   return URL_PROTO_UNKNOWN;
 166 }
 167
 168 int
 169 url_split(byte *s, struct url *u, byte *d)
 170 {
 171   bzero(u, sizeof(struct url));
 172   u->port = ~0;
 173   u->bufend = d + MAX_URL_SIZE - 10;
 174
 175   if (s[0] != '/')                      /* Seek for "protocol:" */
 176     {
 177       byte *p = s;
 178       while (*p && Calnum(*p))
 179         p++;
 180       if (p != s && *p == ':')
 181         {
 182           u->protocol = d;
 183           while (s < p)
 184             *d++ = *s++;
 185           *d++ = 0;
 186           u->protoid = identify_protocol(u->protocol);
 187           s++;
 188           if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
 189             {
 190               /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
 191               int len = d - u->protocol;
 192               d -= len;
 193               s -= len;
 194               u->protocol = NULL;
 195               u->protoid = 0;
 196             }
 197         }
 198     }
 199
 200   if (s[0] == '/')                      /* Host spec or absolute path */
 201     {
 202       if (s[1] == '/')                  /* Host spec */
 203         {
 204           byte *q, *w, *e;
 205           char *ep;
 206
 207           s += 2;
 208           q = d;
 209           while (*s && *s != '/' && *s != '?')  /* Copy user:passwd@host:port */
 210             *d++ = *s++;
 211           *d++ = 0;
 212           w = strchr(q, '@');
 213           if (w)                        /* user:passwd present */
 214             {
 215               *w++ = 0;
 216               u->user = q;
 217               if (e = strchr(q, ':'))
 218                 {
 219                   *e++ = 0;
 220                   u->pass = e;
 221                 }
 222             }
 223           else
 224             w = q;
 225           e = strchr(w, ':');
 226           if (e)                        /* host:port present */
 227             {
 228               uns p;
 229               *e++ = 0;
 230               p = strtoul(e, &ep, 10);
 231               if (ep && *ep || p > 65535)
 232                 return URL_ERR_INVALID_PORT;
 233               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 234                 u->port = p;
 235             }
 236           u->host = w;
 237         }
 238     }
 239
 240   u->rest = s;
 241   u->buf = d;
 242   return 0;
 243 }
 244
 245 /* Normalization according to given base URL */
 246
/* Default port numbers, indexed by protocol ID; used to fill in a missing port and to omit the default one when packing */
static uns std_ports[] = URL_DEFPORTS;  /* Default port numbers */
 248
 249 static int
 250 relpath_merge(struct url *u, struct url *b)
 251 {
 252   byte *a = u->rest;
 253   byte *o = b->rest;
 254   byte *d = u->buf;
 255   byte *e = u->bufend;
 256   byte *p;
 257
 258   if (a[0] == '/')                      /* Absolute path => OK */
 259     return 0;
 260   if (o[0] != '/' && o[0] != '?')
 261     return URL_PATH_UNDERFLOW;
 262
 263   if (!a[0])                            /* Empty URL -> inherit everything */
 264     {
 265       u->rest = b->rest;
 266       return 0;
 267     }
 268
 269   u->rest = d;                          /* We know we'll need to copy the path somewhere else */
 270
 271   if (a[0] == '#')                      /* Another fragment */
 272     {
 273       for(p=o; *p && *p != '#'; p++)
 274         ;
 275       goto copy;
 276     }
 277   if (a[0] == '?')                      /* New query */
 278     {
 279       for(p=o; *p && *p != '#' && *p != '?'; p++)
 280         ;
 281       goto copy;
 282     }
 283   if (a[0] == ';')                      /* Change parameters */
 284     {
 285       for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++)
 286         ;
 287       goto copy;
 288     }
 289
 290   p = NULL;                             /* Copy original path and find the last slash */
 291   while (*o && *o != ';' && *o != '?' && *o != '#')
 292     {
 293       if (d >= e)
 294         return URL_ERR_TOO_LONG;
 295       if ((*d++ = *o++) == '/')
 296         p = d;
 297     }
 298   if (!p)
 299     return URL_ERR_REL_NOTHING;
 300   d = p;
 301
 302   while (*a)
 303     {
 304       if (a[0] == '.')
 305         {
 306           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 307             {
 308               a++;
 309               if (a[0])
 310                 a++;
 311               continue;
 312             }
 313           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 314             {
 315               a += 2;
 316               if (a[0])
 317                 a++;
 318               if (d <= u->buf + 1)
 319                 {
 320                   /*
 321                    * RFC 1808 says we should leave ".." as a path segment, but
 322                    * we intentionally break the rule and refuse the URL.
 323                    */
 324                   if (!url_ignore_underflow)
 325                     return URL_PATH_UNDERFLOW;
 326                 }
 327               else
 328                 {
 329                   d--;                  /* Discard trailing slash */
 330                   while (d[-1] != '/')
 331                     d--;
 332                 }
 333               continue;
 334             }
 335         }
 336       while (a[0] && a[0] != '/')
 337         {
 338           if (d >= e)
 339             return URL_ERR_TOO_LONG;
 340           *d++ = *a++;
 341         }
 342       if (a[0])
 343         *d++ = *a++;
 344     }
 345
 346 okay:
 347   *d++ = 0;
 348   u->buf = d;
 349   return 0;
 350
 351 copy:                                   /* Combine part of old URL with the new one */
 352   while (o < p)
 353     if (d < e)
 354       *d++ = *o++;
 355     else
 356       return URL_ERR_TOO_LONG;
 357   while (*a)
 358     if (d < e)
 359       *d++ = *a++;
 360     else
 361       return URL_ERR_TOO_LONG;
 362   goto okay;
 363 }
 364
 365 int
 366 url_normalize(struct url *u, struct url *b)
 367 {
 368   int err;
 369
 370   /* Basic checks */
 371   if (url_proto_path_flags[u->protoid] && (!u->host || !*u->host) ||
 372       !u->host && u->user ||
 373       !u->user && u->pass ||
 374       !u->rest)
 375     return URL_SYNTAX_ERROR;
 376
 377   if (!u->protocol)
 378     {
 379       /* Now we know it's a relative URL. Do we have any base? */
 380       if (!b || !url_proto_path_flags[b->protoid])
 381         return URL_ERR_REL_NOTHING;
 382       u->protocol = b->protocol;
 383       u->protoid = b->protoid;
 384
 385       /* Reference to the same host */
 386       if (!u->host)
 387         {
 388           u->host = b->host;
 389           u->user = b->user;
 390           u->pass = b->pass;
 391           u->port = b->port;
 392           if (err = relpath_merge(u, b))
 393             return err;
 394         }
 395     }
 396
 397   /* Change path "?" to "/?" because it's the true meaning */
 398   if (u->rest[0] == '?')
 399     {
 400       int l = strlen(u->rest);
 401       if (u->bufend - u->buf < l+1)
 402         return URL_ERR_TOO_LONG;
 403       u->buf[0] = '/';
 404       memcpy(u->buf+1, u->rest, l+1);
 405       u->rest = u->buf;
 406       u->buf += l+2;
 407     }
 408
 409   /* Fill in missing info */
 410   if (u->port == ~0U)
 411     u->port = std_ports[u->protoid];
 412
 413   return 0;
 414 }
 415
 416 /* Name canonicalization */
 417
 418 static void
 419 lowercase(byte *b)
 420 {
 421   if (b)
 422     while (*b)
 423       {
 424         if (*b >= 'A' && *b <= 'Z')
 425           *b = *b + 0x20;
 426         b++;
 427       }
 428 }
 429
 430 static void
 431 kill_end_dot(byte *b)
 432 {
 433   byte *k;
 434
 435   if (b)
 436     {
 437       k = b + strlen(b) - 1;
 438       while (k > b && *k == '.')
 439         *k-- = 0;
 440     }
 441 }
 442
 443 int
 444 url_canonicalize(struct url *u)
 445 {
 446   char *c;
 447
 448   lowercase(u->protocol);
 449   lowercase(u->host);
 450   kill_end_dot(u->host);
 451   if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
 452     u->rest = "/";
 453   if (u->rest && (c = strchr(u->rest, '#')))    /* Kill fragment reference */
 454     *c = 0;
 455   return 0;
 456 }
 457
 458 /* Pack a broken-down URL */
 459
 460 static byte *
 461 append(byte *d, byte *s, byte *e)
 462 {
 463   if (d)
 464     while (*s)
 465       {
 466         if (d >= e)
 467           return NULL;
 468         *d++ = *s++;
 469       }
 470   return d;
 471 }
 472
 473 int
 474 url_pack(struct url *u, byte *d)
 475 {
 476   byte *e = d + MAX_URL_SIZE - 10;
 477
 478   if (u->protocol)
 479     {
 480       d = append(d, u->protocol, e);
 481       d = append(d, ":", e);
 482       u->protoid = identify_protocol(u->protocol);
 483     }
 484   if (u->host)
 485     {
 486       d = append(d, "//", e);
 487       if (u->user)
 488         {
 489           d = append(d, u->user, e);
 490           if (u->pass)
 491             {
 492               d = append(d, ":", e);
 493               d = append(d, u->pass, e);
 494             }
 495           d = append(d, "@", e);
 496         }
 497       d = append(d, u->host, e);
 498       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 499         {
 500           char z[10];
 501           sprintf(z, "%d", u->port);
 502           d = append(d, ":", e);
 503           d = append(d, z, e);
 504         }
 505     }
 506   if (u->rest)
 507     d = append(d, u->rest, e);
 508   if (!d)
 509     return URL_ERR_TOO_LONG;
 510   *d = 0;
 511   return 0;
 512 }
 513
 514 /* Error messages */
 515
 516 static char *errmsg[] = {
 517   "Something is wrong",
 518   "Too long",
 519   "Invalid character",
 520   "Invalid escape",
 521   "Invalid escaped character",
 522   "Invalid port number",
 523   "Relative URL not allowed",
 524   "Unknown protocol",
 525   "Syntax error",
 526   "Path underflow"
 527 };
 528
 529 char *
 530 url_error(uns err)
 531 {
 532   if (err >= sizeof(errmsg) / sizeof(char *))
 533     err = 0;
 534   return errmsg[err];
 535 }
 536
 537 /* Standard cookbook recipes */
 538
 539 int
 540 url_canon_split(byte *u, byte *buf1, byte *buf2, struct url *url)
 541 {
 542   int err;
 543
 544   if (err = url_deescape(u, buf1))
 545     return err;
 546   if (err = url_split(buf1, url, buf2))
 547     return err;
 548   if (err = url_normalize(url, NULL))
 549     return err;
 550   return url_canonicalize(url);
 551 }
 552
 553 int
 554 url_auto_canonicalize(byte *src, byte *dst)
 555 {
 556   byte buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
 557   int err;
 558   struct url ur;
 559
 560   (void)((err = url_canon_split(src, buf1, buf2, &ur)) ||
 561    (err = url_pack(&ur, buf3)) ||
 562    (err = url_enescape(buf3, dst)));
 563   return err;
 564 }
 565
 566 /* Testing */
 567
 568 #ifdef TEST
 569
 570 int main(int argc, char **argv)
 571 {
 572   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
 573   int err;
 574   struct url url, url0;
 575   char *base = "http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment";
 576
 577   if (argc != 2 && argc != 3)
 578     return 1;
 579   if (argc == 3)
 580     base = argv[2];
 581   if (err = url_deescape(argv[1], buf1))
 582     {
 583       printf("deesc: error %d\n", err);
 584       return 1;
 585     }
 586   printf("deesc: %s\n", buf1);
 587   if (err = url_split(buf1, &url, buf2))
 588     {
 589       printf("split: error %d\n", err);
 590       return 1;
 591     }
 592   printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 593   if (err = url_split(base, &url0, buf3))
 594     {
 595       printf("split base: error %d\n", err);
 596       return 1;
 597     }
 598   if (err = url_normalize(&url0, NULL))
 599     {
 600       printf("normalize base: error %d\n", err);
 601       return 1;
 602     }
 603   printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest);
 604   if (err = url_normalize(&url, &url0))
 605     {
 606       printf("normalize: error %d\n", err);
 607       return 1;
 608     }
 609   printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 610   if (err = url_canonicalize(&url))
 611     {
 612       printf("canonicalize: error %d\n", err);
 613       return 1;
 614     }
 615   printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 616   if (err = url_pack(&url, buf4))
 617     {
 618       printf("pack: error %d\n", err);
 619       return 1;
 620     }
 621   printf("pack: %s\n", buf4);
 622   if (err = url_enescape(buf4, buf2))
 623     {
 624       printf("enesc: error %d\n", err);
 625       return 1;
 626     }
 627   printf("enesc: %s\n", buf2);
 628   return 0;
 629 }
 630
 631 #endif
 632
/* A single component of a URL, as delimited by url_component_separators (see url_has_repeated_component()) */
struct component {
        byte *start;            /* First character of the component (points into the URL, not zero-terminated) */
        int length;             /* Number of characters, the separator is not included */
        u32 hash;               /* hashf(start, length), used to speed up comparisons */
};
 638
 639 static inline u32
 640 hashf(byte *start, int length)
 641 {
 642         u32 hf = length;
 643         while (length-- > 0)
 644                 hf = (hf << 8 | hf >> 24) ^ *start++;
 645         return hf;
 646 }
 647
 648 static inline uns
 649 repeat_count(struct component *comp, uns count, uns len)
 650 {
 651         struct component *orig_comp = comp;
 652         uns found = 0;
 653         while (1)
 654         {
 655                 uns i;
 656                 comp += len;
 657                 count -= len;
 658                 found++;
 659                 if (count < len)
 660                         return found;
 661                 for (i=0; i<len; i++)
 662                         if (comp[i].hash != orig_comp[i].hash
 663                         || comp[i].length != orig_comp[i].length
 664                         || memcmp(comp[i].start, orig_comp[i].start, comp[i].length))
 665                                 return found;
 666         }
 667 }
 668
 669 int
 670 url_has_repeated_component(byte *url)
 671 {
 672         struct component *comp;
 673         uns comps, comp_len, rep_prefix;
 674         byte *c;
 675         uns i;
 676
 677         for (comps=0, c=url; c; comps++)
 678         {
 679                 c = strpbrk(c, url_component_separators);
 680                 if (c)
 681                         c++;
 682         }
 683         if (comps < url_min_repeat_count)
 684                 return 0;
 685         comp = alloca(comps * sizeof(struct component));
 686         for (i=0, c=url; c; i++)
 687         {
 688                 comp[i].start = c;
 689                 c = strpbrk(c, url_component_separators);
 690                 if (c)
 691                 {
 692                         comp[i].length = c - comp[i].start;
 693                         c++;
 694                 }
 695                 else
 696                         comp[i].length = strlen(comp[i].start);
 697         }
 698         ASSERT(i == comps);
 699         for (i=0; i<comps; i++)
 700                 comp[i].hash = hashf(comp[i].start, comp[i].length);
 701         for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
 702                 for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
 703                         if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)
 704                                 return comp_len;
 705         return 0;
 706 }