lib/url.c

   1 /*
   2  *      Sherlock Library -- URL Functions (according to RFC 1738 and 1808)
   3  *
   4  *      (c) 1997--2002 Martin Mares <mj@ucw.cz>
   5  *      (c) 2001 Robert Spalek <robert@ucw.cz>
   6  *
   7  *      This software may be freely distributed and used according to the terms
   8  *      of the GNU Lesser General Public License.
   9  */
  10
  11 #include "lib/lib.h"
  12 #include "lib/url.h"
  13 #include "lib/chartype.h"
  14 #include "lib/conf.h"
  15
  16 #include <string.h>
  17 #include <stdlib.h>
  18 #include <stdio.h>
  19
  20 /* Configuration */
  21
  22 static uns url_ignore_spaces;
  23 static uns url_ignore_underflow;
  24 static byte *url_component_separators = "";
  25 static uns url_min_repeat_count = 0x7fffffff;
  26 static uns url_max_repeat_length = 0;
  27
  28 static struct cfitem url_config[] = {
  29   { "URL",              CT_SECTION,     NULL },
  30   { "IgnoreSpaces",     CT_INT,         &url_ignore_spaces },
  31   { "IgnoreUnderflow",  CT_INT,         &url_ignore_underflow },
  32   { "ComponentSeparators",      CT_STRING,      &url_component_separators },
  33   { "MinRepeatCount",           CT_INT,         &url_min_repeat_count },
  34   { "MaxRepeatLength",          CT_INT,         &url_max_repeat_length },
  35   { NULL,               CT_STOP,        NULL }
  36 };
  37
  38 static void CONSTRUCTOR url_init_config(void)
  39 {
  40   cf_register(url_config);
  41 }
  42
  43 /* Escaping and de-escaping */
  44
  45 static uns
  46 enhex(uns x)
  47 {
  48   return (x<10) ? (x + '0') : (x - 10 + 'A');
  49 }
  50
  51 int
  52 url_deescape(byte *s, byte *d)
  53 {
  54   byte *dstart = d;
  55   byte *end = d + MAX_URL_SIZE - 10;
  56   while (*s)
  57     {
  58       if (d >= end)
  59         return URL_ERR_TOO_LONG;
  60       if (*s == '%')
  61         {
  62           unsigned int val;
  63           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  64             return URL_ERR_INVALID_ESCAPE;
  65           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  66           if (val < 0x20)
  67             return URL_ERR_INVALID_ESCAPED_CHAR;
  68           switch (val)
  69             {
  70             case ';':
  71               val = NCC_SEMICOLON; break;
  72             case '/':
  73               val = NCC_SLASH; break;
  74             case '?':
  75               val = NCC_QUEST; break;
  76             case ':':
  77               val = NCC_COLON; break;
  78             case '@':
  79               val = NCC_AT; break;
  80             case '=':
  81               val = NCC_EQUAL; break;
  82             case '&':
  83               val = NCC_AND; break;
  84             case '#':
  85               val = NCC_HASH; break;
  86             }
  87           *d++ = val;
  88           s += 3;
  89         }
  90       else if (*s > 0x20)
  91         *d++ = *s++;
  92       else if (Cspace(*s))
  93         {
  94           byte *s0 = s;
  95           while (Cspace(*s))
  96             s++;
  97           if (!url_ignore_spaces || !(!*s || d == dstart))
  98             {
  99               while (Cspace(*s0))
 100                 {
 101                   if (d >= end)
 102                     return URL_ERR_TOO_LONG;
 103                   *d++ = *s0++;
 104                 }
 105             }
 106         }
 107       else
 108         return URL_ERR_INVALID_CHAR;
 109     }
 110   *d = 0;
 111   return 0;
 112 }
 113
 114 int
 115 url_enescape(byte *s, byte *d)
 116 {
 117   byte *end = d + MAX_URL_SIZE - 10;
 118   unsigned int c;
 119
 120   while (c = *s)
 121     {
 122       if (d >= end)
 123         return URL_ERR_TOO_LONG;
 124       if (Calnum(c) ||                                                  /* RFC 1738(2.2): Only alphanumerics ... */
 125           c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||   /* ... and several other exceptions ... */
 126           c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
 127           c == ',' ||
 128           c == '/' || c == '?' || c == ':' || c == '@' ||               /* ... and reserved chars used for reserved purpose */
 129           c == '=' || c == '&' || c == '#' || c == ';')
 130         *d++ = *s++;
 131       else
 132         {
 133           uns val = (*s < NCC_MAX) ? NCC_CHARS[*s] : *s;
 134           *d++ = '%';
 135           *d++ = enhex(val >> 4);
 136           *d++ = enhex(val & 0x0f);
 137           s++;
 138         }
 139     }
 140   *d = 0;
 141   return 0;
 142 }
 143
 144 /* Split an URL (several parts may be copied to the destination buffer) */
 145
/* Protocol names indexed by protocol ID; searched by identify_protocol() */
byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
/*
 * Per-protocol flag indexed by protocol ID.  Nonzero marks protocols whose
 * URLs must carry a "//host" part: url_split() refuses to recognize the
 * protocol without it and url_normalize() reports a syntax error.
 */
static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
 148
 149 uns
 150 identify_protocol(byte *p)
 151 {
 152   uns i;
 153
 154   for(i=1; i<URL_PROTO_MAX; i++)
 155     if (!strcasecmp(p, url_proto_names[i]))
 156       return i;
 157   return URL_PROTO_UNKNOWN;
 158 }
 159
 160 int
 161 url_split(byte *s, struct url *u, byte *d)
 162 {
 163   bzero(u, sizeof(struct url));
 164   u->port = ~0;
 165   u->bufend = d + MAX_URL_SIZE - 10;
 166
 167   if (s[0] != '/')                      /* Seek for "protocol:" */
 168     {
 169       byte *p = s;
 170       while (*p && Calnum(*p))
 171         p++;
 172       if (p != s && *p == ':')
 173         {
 174           u->protocol = d;
 175           while (s < p)
 176             *d++ = *s++;
 177           *d++ = 0;
 178           u->protoid = identify_protocol(u->protocol);
 179           s++;
 180           if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
 181             {
 182               /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
 183               int len = d - u->protocol;
 184               d -= len;
 185               s -= len;
 186               u->protocol = NULL;
 187               u->protoid = 0;
 188             }
 189         }
 190     }
 191
 192   if (s[0] == '/')                      /* Host spec or absolute path */
 193     {
 194       if (s[1] == '/')                  /* Host spec */
 195         {
 196           byte *q, *w, *e;
 197           char *ep;
 198
 199           s += 2;
 200           q = d;
 201           while (*s && *s != '/')       /* Copy user:passwd@host:port */
 202             *d++ = *s++;
 203           *d++ = 0;
 204           w = strchr(q, '@');
 205           if (w)                        /* user:passwd present */
 206             {
 207               *w++ = 0;
 208               u->user = q;
 209               if (e = strchr(q, ':'))
 210                 {
 211                   *e++ = 0;
 212                   u->pass = e;
 213                 }
 214             }
 215           else
 216             w = q;
 217           e = strchr(w, ':');
 218           if (e)                        /* host:port present */
 219             {
 220               uns p;
 221               *e++ = 0;
 222               p = strtoul(e, &ep, 10);
 223               if (ep && *ep || p > 65535)
 224                 return URL_ERR_INVALID_PORT;
 225               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 226                 u->port = p;
 227             }
 228           u->host = w;
 229         }
 230     }
 231
 232   u->rest = s;
 233   u->buf = d;
 234   return 0;
 235 }
 236
 237 /* Normalization according to given base URL */
 238
/* Indexed by protocol ID; used by url_normalize() to fill in a missing port and by url_pack() to omit a default one */
static uns std_ports[] = URL_DEFPORTS;  /* Default port numbers */
 240
 241 static int
 242 relpath_merge(struct url *u, struct url *b)
 243 {
 244   byte *a = u->rest;
 245   byte *o = b->rest;
 246   byte *d = u->buf;
 247   byte *e = u->bufend;
 248   byte *p;
 249
 250   if (a[0] == '/')                      /* Absolute path => OK */
 251     return 0;
 252   if (o[0] != '/')
 253     return URL_PATH_UNDERFLOW;
 254
 255   if (!a[0])                            /* Empty URL -> inherit everything */
 256     {
 257       u->rest = b->rest;
 258       return 0;
 259     }
 260
 261   u->rest = d;                          /* We know we'll need to copy the path somewhere else */
 262
 263   if (a[0] == '#')                      /* Another fragment */
 264     {
 265       for(p=o; *p && *p != '#'; p++)
 266         ;
 267       goto copy;
 268     }
 269   if (a[0] == '?')                      /* New query */
 270     {
 271       for(p=o; *p && *p != '#' && *p != '?'; p++)
 272         ;
 273       goto copy;
 274     }
 275   if (a[0] == ';')                      /* Change parameters */
 276     {
 277       for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++)
 278         ;
 279       goto copy;
 280     }
 281
 282   p = NULL;                             /* Copy original path and find the last slash */
 283   while (*o && *o != ';' && *o != '?' && *o != '#')
 284     {
 285       if (d >= e)
 286         return URL_ERR_TOO_LONG;
 287       if ((*d++ = *o++) == '/')
 288         p = d;
 289     }
 290   if (!p)
 291     return URL_ERR_REL_NOTHING;
 292   d = p;
 293
 294   while (*a)
 295     {
 296       if (a[0] == '.')
 297         {
 298           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 299             {
 300               a++;
 301               if (a[0])
 302                 a++;
 303               continue;
 304             }
 305           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 306             {
 307               a += 2;
 308               if (a[0])
 309                 a++;
 310               if (d <= u->buf + 1)
 311                 {
 312                   /*
 313                    * RFC 1808 says we should leave ".." as a path segment, but
 314                    * we intentionally break the rule and refuse the URL.
 315                    */
 316                   if (!url_ignore_underflow)
 317                     return URL_PATH_UNDERFLOW;
 318                 }
 319               else
 320                 {
 321                   d--;                  /* Discard trailing slash */
 322                   while (d[-1] != '/')
 323                     d--;
 324                 }
 325               continue;
 326             }
 327         }
 328       while (a[0] && a[0] != '/')
 329         {
 330           if (d >= e)
 331             return URL_ERR_TOO_LONG;
 332           *d++ = *a++;
 333         }
 334       if (a[0])
 335         *d++ = *a++;
 336     }
 337
 338 okay:
 339   *d++ = 0;
 340   u->buf = d;
 341   return 0;
 342
 343 copy:                                   /* Combine part of old URL with the new one */
 344   while (o < p)
 345     if (d < e)
 346       *d++ = *o++;
 347     else
 348       return URL_ERR_TOO_LONG;
 349   while (*a)
 350     if (d < e)
 351       *d++ = *a++;
 352     else
 353       return URL_ERR_TOO_LONG;
 354   goto okay;
 355 }
 356
 357 int
 358 url_normalize(struct url *u, struct url *b)
 359 {
 360   int err;
 361
 362   /* Basic checks */
 363   if (url_proto_path_flags[u->protoid] && !u->host ||
 364       u->host && !*u->host ||
 365       !u->host && u->user ||
 366       !u->user && u->pass ||
 367       !u->rest)
 368     return URL_SYNTAX_ERROR;
 369
 370   if (!u->protocol)
 371     {
 372       /* Now we know it's a relative URL. Do we have any base? */
 373       if (!b || !url_proto_path_flags[b->protoid])
 374         return URL_ERR_REL_NOTHING;
 375       u->protocol = b->protocol;
 376       u->protoid = b->protoid;
 377
 378       /* Reference to the same host */
 379       if (!u->host)
 380         {
 381           u->host = b->host;
 382           u->user = b->user;
 383           u->pass = b->pass;
 384           u->port = b->port;
 385           if (err = relpath_merge(u, b))
 386             return err;
 387         }
 388     }
 389
 390   /* Fill in missing info */
 391   if (u->port == ~0U)
 392     u->port = std_ports[u->protoid];
 393
 394   return 0;
 395 }
 396
 397 /* Name canonicalization */
 398
 399 static void
 400 lowercase(byte *b)
 401 {
 402   if (b)
 403     while (*b)
 404       {
 405         if (*b >= 'A' && *b <= 'Z')
 406           *b = *b + 0x20;
 407         b++;
 408       }
 409 }
 410
 411 static void
 412 kill_end_dot(byte *b)
 413 {
 414   byte *k;
 415
 416   if (b)
 417     {
 418       k = b + strlen(b) - 1;
 419       while (k > b && *k == '.')
 420         *k-- = 0;
 421     }
 422 }
 423
 424 int
 425 url_canonicalize(struct url *u)
 426 {
 427   char *c;
 428
 429   lowercase(u->protocol);
 430   lowercase(u->host);
 431   kill_end_dot(u->host);
 432   if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
 433     u->rest = "/";
 434   if (u->rest && (c = strchr(u->rest, '#')))    /* Kill fragment reference */
 435     *c = 0;
 436   return 0;
 437 }
 438
 439 /* Pack a broken-down URL */
 440
 441 static byte *
 442 append(byte *d, byte *s, byte *e)
 443 {
 444   if (d)
 445     while (*s)
 446       {
 447         if (d >= e)
 448           return NULL;
 449         *d++ = *s++;
 450       }
 451   return d;
 452 }
 453
 454 int
 455 url_pack(struct url *u, byte *d)
 456 {
 457   byte *e = d + MAX_URL_SIZE - 10;
 458
 459   if (u->protocol)
 460     {
 461       d = append(d, u->protocol, e);
 462       d = append(d, ":", e);
 463       u->protoid = identify_protocol(u->protocol);
 464     }
 465   if (u->host)
 466     {
 467       d = append(d, "//", e);
 468       if (u->user)
 469         {
 470           d = append(d, u->user, e);
 471           if (u->pass)
 472             {
 473               d = append(d, ":", e);
 474               d = append(d, u->pass, e);
 475             }
 476           d = append(d, "@", e);
 477         }
 478       d = append(d, u->host, e);
 479       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 480         {
 481           char z[10];
 482           sprintf(z, "%d", u->port);
 483           d = append(d, ":", e);
 484           d = append(d, z, e);
 485         }
 486     }
 487   if (u->rest)
 488     d = append(d, u->rest, e);
 489   if (!d)
 490     return URL_ERR_TOO_LONG;
 491   *d = 0;
 492   return 0;
 493 }
 494
 495 /* Error messages */
 496
 497 static char *errmsg[] = {
 498   "Something is wrong",
 499   "Too long",
 500   "Invalid character",
 501   "Invalid escape",
 502   "Invalid escaped character",
 503   "Invalid port number",
 504   "Relative URL not allowed",
 505   "Unknown protocol",
 506   "Syntax error",
 507   "Path underflow"
 508 };
 509
 510 char *
 511 url_error(uns err)
 512 {
 513   if (err >= sizeof(errmsg) / sizeof(char *))
 514     err = 0;
 515   return errmsg[err];
 516 }
 517
 518 /* Standard cookbook recipes */
 519
 520 int
 521 url_canon_split(byte *u, byte *buf1, byte *buf2, struct url *url)
 522 {
 523   int err;
 524
 525   if (err = url_deescape(u, buf1))
 526     return err;
 527   if (err = url_split(buf1, url, buf2))
 528     return err;
 529   if (err = url_normalize(url, NULL))
 530     return err;
 531   return url_canonicalize(url);
 532 }
 533
 534 int
 535 url_auto_canonicalize(byte *src, byte *dst)
 536 {
 537   byte buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
 538   int err;
 539   struct url ur;
 540
 541   (void)((err = url_canon_split(src, buf1, buf2, &ur)) ||
 542    (err = url_pack(&ur, buf3)) ||
 543    (err = url_enescape(buf3, dst)));
 544   return err;
 545 }
 546
 547 /* Testing */
 548
 549 #ifdef TEST
 550
 551 int main(int argc, char **argv)
 552 {
 553   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
 554   int err;
 555   struct url url, url0;
 556
 557   if (argc != 2)
 558     return 1;
 559   if (err = url_deescape(argv[1], buf1))
 560     {
 561       printf("deesc: error %d\n", err);
 562       return 1;
 563     }
 564   printf("deesc: %s\n", buf1);
 565   if (err = url_split(buf1, &url, buf2))
 566     {
 567       printf("split: error %d\n", err);
 568       return 1;
 569     }
 570   printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 571   if (err = url_split("http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment", &url0, buf3))
 572     {
 573       printf("split base: error %d\n", err);
 574       return 1;
 575     }
 576   if (err = url_normalize(&url0, NULL))
 577     {
 578       printf("normalize base: error %d\n", err);
 579       return 1;
 580     }
 581   printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest);
 582   if (err = url_normalize(&url, &url0))
 583     {
 584       printf("normalize: error %d\n", err);
 585       return 1;
 586     }
 587   printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 588   if (err = url_canonicalize(&url))
 589     {
 590       printf("canonicalize: error %d\n", err);
 591       return 1;
 592     }
 593   printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 594   if (err = url_pack(&url, buf4))
 595     {
 596       printf("pack: error %d\n", err);
 597       return 1;
 598     }
 599   printf("pack: %s\n", buf4);
 600   if (err = url_enescape(buf4, buf2))
 601     {
 602       printf("enesc: error %d\n", err);
 603       return 1;
 604     }
 605   printf("enesc: %s\n", buf2);
 606   return 0;
 607 }
 608
 609 #endif
 610
/* One component of an URL, as delimited by url_component_separators */
struct component {
        byte *start;            /* First byte of the component inside the URL string (not NUL-terminated) */
        int length;             /* Length of the component in bytes, the separator is not included */
        u32 hash;               /* hashf(start, length); lets repeat_count() reject most mismatches cheaply */
};
 616
 617 static inline u32
 618 hashf(byte *start, int length)
 619 {
 620         u32 hf = length;
 621         while (length-- > 0)
 622                 hf = (hf << 8 | hf >> 24) ^ *start++;
 623         return hf;
 624 }
 625
 626 static inline uns
 627 repeat_count(struct component *comp, uns count, uns len)
 628 {
 629         struct component *orig_comp = comp;
 630         uns found = 0;
 631         while (1)
 632         {
 633                 uns i;
 634                 comp += len;
 635                 count -= len;
 636                 found++;
 637                 if (count < len)
 638                         return found;
 639                 for (i=0; i<len; i++)
 640                         if (comp[i].hash != orig_comp[i].hash
 641                         || comp[i].length != orig_comp[i].length
 642                         || memcmp(comp[i].start, orig_comp[i].start, comp[i].length))
 643                                 return found;
 644         }
 645 }
 646
 647 int
 648 url_has_repeated_component(byte *url)
 649 {
 650         struct component *comp;
 651         uns comps, comp_len, rep_prefix;
 652         byte *c;
 653         uns i;
 654
 655         for (comps=0, c=url; c; comps++)
 656         {
 657                 c = strpbrk(c, url_component_separators);
 658                 if (c)
 659                         c++;
 660         }
 661         if (comps < url_min_repeat_count)
 662                 return 0;
 663         comp = alloca(comps * sizeof(struct component));
 664         for (i=0, c=url; c; i++)
 665         {
 666                 comp[i].start = c;
 667                 c = strpbrk(c, url_component_separators);
 668                 if (c)
 669                 {
 670                         comp[i].length = c - comp[i].start;
 671                         c++;
 672                 }
 673                 else
 674                         comp[i].length = strlen(comp[i].start);
 675         }
 676         ASSERT(i == comps);
 677         for (i=0; i<comps; i++)
 678                 comp[i].hash = hashf(comp[i].start, comp[i].length);
 679         for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
 680                 for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
 681                         if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)
 682                                 return comp_len;
 683         return 0;
 684 }