lib/url.c

   1 /*
   2  *      Sherlock Library -- URL Functions (according to RFC 1738 and 1808)
   3  *
   4  *      (c) 1997--2002 Martin Mares <mj@ucw.cz>
   5  *      (c) 2001 Robert Spalek <robert@ucw.cz>
   6  *
   7  *      This software may be freely distributed and used according to the terms
   8  *      of the GNU Lesser General Public License.
   9  */
  10
  11 #include "lib/lib.h"
  12 #include "lib/url.h"
  13 #include "lib/chartype.h"
  14 #include "lib/conf.h"
  15
  16 #include <string.h>
  17 #include <stdlib.h>
  18 #include <stdio.h>
  19 #include <alloca.h>
  20
  21 /* Configuration */
  22
/* Nonzero: url_deescape() drops whitespace runs at the very start and the very end of the URL. */
static uns url_ignore_spaces;
/* Nonzero: relpath_merge() silently ignores ".." segments that would climb above the root. */
static uns url_ignore_underflow;
/* Characters on which url_has_repeated_component() splits an URL (empty set => one component). */
static byte *url_component_separators = "";
/* Minimum number of consecutive repetitions that url_has_repeated_component() reports. */
static uns url_min_repeat_count = 0x7fffffff;
/* Longest repeated sequence (in components) url_has_repeated_component() tries; 0 => nothing is tried. */
static uns url_max_repeat_length = 0;
  28
/*
 * Description of the "URL" configuration section: each entry binds an item
 * name to the static variable it sets.  The table is handed to cf_register()
 * by url_init_config().
 * NOTE(review): entry layout { name, type, target } and the CT_* semantics
 * come from lib/conf.h, which is not visible here -- confirm there.
 */
static struct cfitem url_config[] = {
  { "URL",              CT_SECTION,     NULL },
  { "IgnoreSpaces",     CT_INT,         &url_ignore_spaces },
  { "IgnoreUnderflow",  CT_INT,         &url_ignore_underflow },
  { "ComponentSeparators",      CT_STRING,      &url_component_separators },
  { "MinRepeatCount",           CT_INT,         &url_min_repeat_count },
  { "MaxRepeatLength",          CT_INT,         &url_max_repeat_length },
  { NULL,               CT_STOP,        NULL }          /* End-of-table marker */
};
  38
  39 static void CONSTRUCTOR url_init_config(void)
  40 {
  41   cf_register(url_config);
  42 }
  43
  44 /* Escaping and de-escaping */
  45
  46 static uns
  47 enhex(uns x)
  48 {
  49   return (x<10) ? (x + '0') : (x - 10 + 'A');
  50 }
  51
  52 int
  53 url_deescape(byte *s, byte *d)
  54 {
  55   byte *dstart = d;
  56   byte *end = d + MAX_URL_SIZE - 10;
  57   while (*s)
  58     {
  59       if (d >= end)
  60         return URL_ERR_TOO_LONG;
  61       if (*s == '%')
  62         {
  63           unsigned int val;
  64           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  65             return URL_ERR_INVALID_ESCAPE;
  66           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  67           if (val < 0x20)
  68             return URL_ERR_INVALID_ESCAPED_CHAR;
  69           switch (val)
  70             {
  71             case ';':
  72               val = NCC_SEMICOLON; break;
  73             case '/':
  74               val = NCC_SLASH; break;
  75             case '?':
  76               val = NCC_QUEST; break;
  77             case ':':
  78               val = NCC_COLON; break;
  79             case '@':
  80               val = NCC_AT; break;
  81             case '=':
  82               val = NCC_EQUAL; break;
  83             case '&':
  84               val = NCC_AND; break;
  85             case '#':
  86               val = NCC_HASH; break;
  87             }
  88           *d++ = val;
  89           s += 3;
  90         }
  91       else if (*s > 0x20)
  92         *d++ = *s++;
  93       else if (Cspace(*s))
  94         {
  95           byte *s0 = s;
  96           while (Cspace(*s))
  97             s++;
  98           if (!url_ignore_spaces || !(!*s || d == dstart))
  99             {
 100               while (Cspace(*s0))
 101                 {
 102                   if (d >= end)
 103                     return URL_ERR_TOO_LONG;
 104                   *d++ = *s0++;
 105                 }
 106             }
 107         }
 108       else
 109         return URL_ERR_INVALID_CHAR;
 110     }
 111   *d = 0;
 112   return 0;
 113 }
 114
 115 int
 116 url_enescape(byte *s, byte *d)
 117 {
 118   byte *end = d + MAX_URL_SIZE - 10;
 119   unsigned int c;
 120
 121   while (c = *s)
 122     {
 123       if (d >= end)
 124         return URL_ERR_TOO_LONG;
 125       if (Calnum(c) ||                                                  /* RFC 1738(2.2): Only alphanumerics ... */
 126           c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||   /* ... and several other exceptions ... */
 127           c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
 128           c == ',' ||
 129           c == '/' || c == '?' || c == ':' || c == '@' ||               /* ... and reserved chars used for reserved purpose */
 130           c == '=' || c == '&' || c == '#' || c == ';')
 131         *d++ = *s++;
 132       else
 133         {
 134           uns val = (*s < NCC_MAX) ? NCC_CHARS[*s] : *s;
 135           *d++ = '%';
 136           *d++ = enhex(val >> 4);
 137           *d++ = enhex(val & 0x0f);
 138           s++;
 139         }
 140     }
 141   *d = 0;
 142   return 0;
 143 }
 144
 145 /* Split an URL (several parts may be copied to the destination buffer) */
 146
/* Protocol names indexed by protocol ID; identify_protocol() searches entries 1..URL_PROTO_MAX-1. */
byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
/* Per-protocol flag; nonzero means url_split() and url_normalize() insist on a "//host" part. */
static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
 149
 150 uns
 151 identify_protocol(byte *p)
 152 {
 153   uns i;
 154
 155   for(i=1; i<URL_PROTO_MAX; i++)
 156     if (!strcasecmp(p, url_proto_names[i]))
 157       return i;
 158   return URL_PROTO_UNKNOWN;
 159 }
 160
 161 int
 162 url_split(byte *s, struct url *u, byte *d)
 163 {
 164   bzero(u, sizeof(struct url));
 165   u->port = ~0;
 166   u->bufend = d + MAX_URL_SIZE - 10;
 167
 168   if (s[0] != '/')                      /* Seek for "protocol:" */
 169     {
 170       byte *p = s;
 171       while (*p && Calnum(*p))
 172         p++;
 173       if (p != s && *p == ':')
 174         {
 175           u->protocol = d;
 176           while (s < p)
 177             *d++ = *s++;
 178           *d++ = 0;
 179           u->protoid = identify_protocol(u->protocol);
 180           s++;
 181           if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
 182             {
 183               /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
 184               int len = d - u->protocol;
 185               d -= len;
 186               s -= len;
 187               u->protocol = NULL;
 188               u->protoid = 0;
 189             }
 190         }
 191     }
 192
 193   if (s[0] == '/')                      /* Host spec or absolute path */
 194     {
 195       if (s[1] == '/')                  /* Host spec */
 196         {
 197           byte *q, *w, *e;
 198           char *ep;
 199
 200           s += 2;
 201           q = d;
 202           while (*s && *s != '/')       /* Copy user:passwd@host:port */
 203             *d++ = *s++;
 204           *d++ = 0;
 205           w = strchr(q, '@');
 206           if (w)                        /* user:passwd present */
 207             {
 208               *w++ = 0;
 209               u->user = q;
 210               if (e = strchr(q, ':'))
 211                 {
 212                   *e++ = 0;
 213                   u->pass = e;
 214                 }
 215             }
 216           else
 217             w = q;
 218           e = strchr(w, ':');
 219           if (e)                        /* host:port present */
 220             {
 221               uns p;
 222               *e++ = 0;
 223               p = strtoul(e, &ep, 10);
 224               if (ep && *ep || p > 65535)
 225                 return URL_ERR_INVALID_PORT;
 226               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 227                 u->port = p;
 228             }
 229           u->host = w;
 230         }
 231     }
 232
 233   u->rest = s;
 234   u->buf = d;
 235   return 0;
 236 }
 237
 238 /* Normalization according to given base URL */
 239
/* Default port numbers indexed by protocol ID: url_normalize() fills them in, url_pack() leaves them out */
static uns std_ports[] = URL_DEFPORTS;
 241
 242 static int
 243 relpath_merge(struct url *u, struct url *b)
 244 {
 245   byte *a = u->rest;
 246   byte *o = b->rest;
 247   byte *d = u->buf;
 248   byte *e = u->bufend;
 249   byte *p;
 250
 251   if (a[0] == '/')                      /* Absolute path => OK */
 252     return 0;
 253   if (o[0] != '/')
 254     return URL_PATH_UNDERFLOW;
 255
 256   if (!a[0])                            /* Empty URL -> inherit everything */
 257     {
 258       u->rest = b->rest;
 259       return 0;
 260     }
 261
 262   u->rest = d;                          /* We know we'll need to copy the path somewhere else */
 263
 264   if (a[0] == '#')                      /* Another fragment */
 265     {
 266       for(p=o; *p && *p != '#'; p++)
 267         ;
 268       goto copy;
 269     }
 270   if (a[0] == '?')                      /* New query */
 271     {
 272       for(p=o; *p && *p != '#' && *p != '?'; p++)
 273         ;
 274       goto copy;
 275     }
 276   if (a[0] == ';')                      /* Change parameters */
 277     {
 278       for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++)
 279         ;
 280       goto copy;
 281     }
 282
 283   p = NULL;                             /* Copy original path and find the last slash */
 284   while (*o && *o != ';' && *o != '?' && *o != '#')
 285     {
 286       if (d >= e)
 287         return URL_ERR_TOO_LONG;
 288       if ((*d++ = *o++) == '/')
 289         p = d;
 290     }
 291   if (!p)
 292     return URL_ERR_REL_NOTHING;
 293   d = p;
 294
 295   while (*a)
 296     {
 297       if (a[0] == '.')
 298         {
 299           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 300             {
 301               a++;
 302               if (a[0])
 303                 a++;
 304               continue;
 305             }
 306           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 307             {
 308               a += 2;
 309               if (a[0])
 310                 a++;
 311               if (d <= u->buf + 1)
 312                 {
 313                   /*
 314                    * RFC 1808 says we should leave ".." as a path segment, but
 315                    * we intentionally break the rule and refuse the URL.
 316                    */
 317                   if (!url_ignore_underflow)
 318                     return URL_PATH_UNDERFLOW;
 319                 }
 320               else
 321                 {
 322                   d--;                  /* Discard trailing slash */
 323                   while (d[-1] != '/')
 324                     d--;
 325                 }
 326               continue;
 327             }
 328         }
 329       while (a[0] && a[0] != '/')
 330         {
 331           if (d >= e)
 332             return URL_ERR_TOO_LONG;
 333           *d++ = *a++;
 334         }
 335       if (a[0])
 336         *d++ = *a++;
 337     }
 338
 339 okay:
 340   *d++ = 0;
 341   u->buf = d;
 342   return 0;
 343
 344 copy:                                   /* Combine part of old URL with the new one */
 345   while (o < p)
 346     if (d < e)
 347       *d++ = *o++;
 348     else
 349       return URL_ERR_TOO_LONG;
 350   while (*a)
 351     if (d < e)
 352       *d++ = *a++;
 353     else
 354       return URL_ERR_TOO_LONG;
 355   goto okay;
 356 }
 357
 358 int
 359 url_normalize(struct url *u, struct url *b)
 360 {
 361   int err;
 362
 363   /* Basic checks */
 364   if (url_proto_path_flags[u->protoid] && !u->host ||
 365       u->host && !*u->host ||
 366       !u->host && u->user ||
 367       !u->user && u->pass ||
 368       !u->rest)
 369     return URL_SYNTAX_ERROR;
 370
 371   if (!u->protocol)
 372     {
 373       /* Now we know it's a relative URL. Do we have any base? */
 374       if (!b || !url_proto_path_flags[b->protoid])
 375         return URL_ERR_REL_NOTHING;
 376       u->protocol = b->protocol;
 377       u->protoid = b->protoid;
 378
 379       /* Reference to the same host */
 380       if (!u->host)
 381         {
 382           u->host = b->host;
 383           u->user = b->user;
 384           u->pass = b->pass;
 385           u->port = b->port;
 386           if (err = relpath_merge(u, b))
 387             return err;
 388         }
 389     }
 390
 391   /* Fill in missing info */
 392   if (u->port == ~0U)
 393     u->port = std_ports[u->protoid];
 394
 395   return 0;
 396 }
 397
 398 /* Name canonicalization */
 399
 400 static void
 401 lowercase(byte *b)
 402 {
 403   if (b)
 404     while (*b)
 405       {
 406         if (*b >= 'A' && *b <= 'Z')
 407           *b = *b + 0x20;
 408         b++;
 409       }
 410 }
 411
 412 static void
 413 kill_end_dot(byte *b)
 414 {
 415   byte *k;
 416
 417   if (b)
 418     {
 419       k = b + strlen(b) - 1;
 420       while (k > b && *k == '.')
 421         *k-- = 0;
 422     }
 423 }
 424
 425 int
 426 url_canonicalize(struct url *u)
 427 {
 428   char *c;
 429
 430   lowercase(u->protocol);
 431   lowercase(u->host);
 432   kill_end_dot(u->host);
 433   if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
 434     u->rest = "/";
 435   if (u->rest && (c = strchr(u->rest, '#')))    /* Kill fragment reference */
 436     *c = 0;
 437   return 0;
 438 }
 439
 440 /* Pack a broken-down URL */
 441
 442 static byte *
 443 append(byte *d, byte *s, byte *e)
 444 {
 445   if (d)
 446     while (*s)
 447       {
 448         if (d >= e)
 449           return NULL;
 450         *d++ = *s++;
 451       }
 452   return d;
 453 }
 454
 455 int
 456 url_pack(struct url *u, byte *d)
 457 {
 458   byte *e = d + MAX_URL_SIZE - 10;
 459
 460   if (u->protocol)
 461     {
 462       d = append(d, u->protocol, e);
 463       d = append(d, ":", e);
 464       u->protoid = identify_protocol(u->protocol);
 465     }
 466   if (u->host)
 467     {
 468       d = append(d, "//", e);
 469       if (u->user)
 470         {
 471           d = append(d, u->user, e);
 472           if (u->pass)
 473             {
 474               d = append(d, ":", e);
 475               d = append(d, u->pass, e);
 476             }
 477           d = append(d, "@", e);
 478         }
 479       d = append(d, u->host, e);
 480       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 481         {
 482           char z[10];
 483           sprintf(z, "%d", u->port);
 484           d = append(d, ":", e);
 485           d = append(d, z, e);
 486         }
 487     }
 488   if (u->rest)
 489     d = append(d, u->rest, e);
 490   if (!d)
 491     return URL_ERR_TOO_LONG;
 492   *d = 0;
 493   return 0;
 494 }
 495
 496 /* Error messages */
 497
/*
 * Error messages indexed by error code; url_error() falls back to entry 0
 * for codes outside the table.
 * NOTE(review): the order presumably follows the URL_ERR_* codes declared in
 * lib/url.h -- confirm against the header.
 */
static char *errmsg[] = {
  "Something is wrong",
  "Too long",
  "Invalid character",
  "Invalid escape",
  "Invalid escaped character",
  "Invalid port number",
  "Relative URL not allowed",
  "Unknown protocol",
  "Syntax error",
  "Path underflow"
};
 510
 511 char *
 512 url_error(uns err)
 513 {
 514   if (err >= sizeof(errmsg) / sizeof(char *))
 515     err = 0;
 516   return errmsg[err];
 517 }
 518
 519 /* Standard cookbook recipes */
 520
 521 int
 522 url_canon_split(byte *u, byte *buf1, byte *buf2, struct url *url)
 523 {
 524   int err;
 525
 526   if (err = url_deescape(u, buf1))
 527     return err;
 528   if (err = url_split(buf1, url, buf2))
 529     return err;
 530   if (err = url_normalize(url, NULL))
 531     return err;
 532   return url_canonicalize(url);
 533 }
 534
 535 int
 536 url_auto_canonicalize(byte *src, byte *dst)
 537 {
 538   byte buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
 539   int err;
 540   struct url ur;
 541
 542   (void)((err = url_canon_split(src, buf1, buf2, &ur)) ||
 543    (err = url_pack(&ur, buf3)) ||
 544    (err = url_enescape(buf3, dst)));
 545   return err;
 546 }
 547
 548 /* Testing */
 549
 550 #ifdef TEST
 551
 552 int main(int argc, char **argv)
 553 {
 554   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
 555   int err;
 556   struct url url, url0;
 557
 558   if (argc != 2)
 559     return 1;
 560   if (err = url_deescape(argv[1], buf1))
 561     {
 562       printf("deesc: error %d\n", err);
 563       return 1;
 564     }
 565   printf("deesc: %s\n", buf1);
 566   if (err = url_split(buf1, &url, buf2))
 567     {
 568       printf("split: error %d\n", err);
 569       return 1;
 570     }
 571   printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 572   if (err = url_split("http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment", &url0, buf3))
 573     {
 574       printf("split base: error %d\n", err);
 575       return 1;
 576     }
 577   if (err = url_normalize(&url0, NULL))
 578     {
 579       printf("normalize base: error %d\n", err);
 580       return 1;
 581     }
 582   printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest);
 583   if (err = url_normalize(&url, &url0))
 584     {
 585       printf("normalize: error %d\n", err);
 586       return 1;
 587     }
 588   printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 589   if (err = url_canonicalize(&url))
 590     {
 591       printf("canonicalize: error %d\n", err);
 592       return 1;
 593     }
 594   printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 595   if (err = url_pack(&url, buf4))
 596     {
 597       printf("pack: error %d\n", err);
 598       return 1;
 599     }
 600   printf("pack: %s\n", buf4);
 601   if (err = url_enescape(buf4, buf2))
 602     {
 603       printf("enesc: error %d\n", err);
 604       return 1;
 605     }
 606   printf("enesc: %s\n", buf2);
 607   return 0;
 608 }
 609
 610 #endif
 611
/* One component of an URL, as cut out by url_has_repeated_component() */
struct component {
        byte *start;            /* First character of the component (points into the URL, not NUL-terminated) */
        int length;             /* Number of characters, the separator is not counted */
        u32 hash;               /* hashf(start, length), compared first to avoid most of the memcmp() calls */
};
 617
 618 static inline u32
 619 hashf(byte *start, int length)
 620 {
 621         u32 hf = length;
 622         while (length-- > 0)
 623                 hf = (hf << 8 | hf >> 24) ^ *start++;
 624         return hf;
 625 }
 626
 627 static inline uns
 628 repeat_count(struct component *comp, uns count, uns len)
 629 {
 630         struct component *orig_comp = comp;
 631         uns found = 0;
 632         while (1)
 633         {
 634                 uns i;
 635                 comp += len;
 636                 count -= len;
 637                 found++;
 638                 if (count < len)
 639                         return found;
 640                 for (i=0; i<len; i++)
 641                         if (comp[i].hash != orig_comp[i].hash
 642                         || comp[i].length != orig_comp[i].length
 643                         || memcmp(comp[i].start, orig_comp[i].start, comp[i].length))
 644                                 return found;
 645         }
 646 }
 647
 648 int
 649 url_has_repeated_component(byte *url)
 650 {
 651         struct component *comp;
 652         uns comps, comp_len, rep_prefix;
 653         byte *c;
 654         uns i;
 655
 656         for (comps=0, c=url; c; comps++)
 657         {
 658                 c = strpbrk(c, url_component_separators);
 659                 if (c)
 660                         c++;
 661         }
 662         if (comps < url_min_repeat_count)
 663                 return 0;
 664         comp = alloca(comps * sizeof(struct component));
 665         for (i=0, c=url; c; i++)
 666         {
 667                 comp[i].start = c;
 668                 c = strpbrk(c, url_component_separators);
 669                 if (c)
 670                 {
 671                         comp[i].length = c - comp[i].start;
 672                         c++;
 673                 }
 674                 else
 675                         comp[i].length = strlen(comp[i].start);
 676         }
 677         ASSERT(i == comps);
 678         for (i=0; i<comps; i++)
 679                 comp[i].hash = hashf(comp[i].start, comp[i].length);
 680         for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
 681                 for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
 682                         if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)
 683                                 return comp_len;
 684         return 0;
 685 }