lib/url.c

   1 /*
   2  *      Sherlock Library -- URL Functions
   3  *
   4  *      (c) 1997--2004 Martin Mares <mj@ucw.cz>
   5  *      (c) 2001 Robert Spalek <robert@ucw.cz>
   6  *
   7  *      This software may be freely distributed and used according to the terms
   8  *      of the GNU Lesser General Public License.
   9  *
  10  *      The URL syntax corresponds to RFC 2396 with several exceptions:
  11  *
  12  *         o  Escaping of special characters still follows RFC 1738.
  13  *         o  Interpretation of path parameters follows RFC 1808.
  14  *         o  Parsing a relative URL "x" wrt. base "http://hell.org?y"
  15  *            gives an error, which might be wrong. However, I failed
  16  *            to find any rule applying to this case in the RFC.
  17  */
  18
  19 #include "lib/lib.h"
  20 #include "lib/url.h"
  21 #include "lib/chartype.h"
  22 #include "lib/conf.h"
  23
  24 #include <string.h>
  25 #include <stdlib.h>
  26 #include <stdio.h>
  27 #include <alloca.h>
  28
  29 /* Configuration */
  30
  31 static uns url_ignore_spaces;
  32 static uns url_ignore_underflow;
  33 static byte *url_component_separators = "";
  34 static uns url_min_repeat_count = 0x7fffffff;
  35 static uns url_max_repeat_length = 0;
  36
  37 static struct cfitem url_config[] = {
  38   { "URL",                              CT_SECTION,     NULL },
  39   { "IgnoreSpaces",                     CT_INT,         &url_ignore_spaces },
  40   { "IgnoreUnderflow",                  CT_INT,         &url_ignore_underflow },
  41   { "ComponentSeparators",              CT_STRING,      &url_component_separators },
  42   { "MinRepeatCount",                   CT_INT,         &url_min_repeat_count },
  43   { "MaxRepeatLength",                  CT_INT,         &url_max_repeat_length },
  44   { NULL,                               CT_STOP,        NULL }
  45 };
  46
  47 static void CONSTRUCTOR url_init_config(void)
  48 {
  49   cf_register(url_config);
  50 }
  51
  52 /* Escaping and de-escaping */
  53
  54 static uns
  55 enhex(uns x)
  56 {
  57   return (x<10) ? (x + '0') : (x - 10 + 'A');
  58 }
  59
  60 int
  61 url_deescape(byte *s, byte *d)
  62 {
  63   byte *dstart = d;
  64   byte *end = d + MAX_URL_SIZE - 10;
  65   while (*s)
  66     {
  67       if (d >= end)
  68         return URL_ERR_TOO_LONG;
  69       if (*s == '%')
  70         {
  71           unsigned int val;
  72           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  73             return URL_ERR_INVALID_ESCAPE;
  74           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  75           if (val < 0x20)
  76             return URL_ERR_INVALID_ESCAPED_CHAR;
  77           switch (val)
  78             {
  79             case ';':
  80               val = NCC_SEMICOLON; break;
  81             case '/':
  82               val = NCC_SLASH; break;
  83             case '?':
  84               val = NCC_QUEST; break;
  85             case ':':
  86               val = NCC_COLON; break;
  87             case '@':
  88               val = NCC_AT; break;
  89             case '=':
  90               val = NCC_EQUAL; break;
  91             case '&':
  92               val = NCC_AND; break;
  93             case '#':
  94               val = NCC_HASH; break;
  95             }
  96           *d++ = val;
  97           s += 3;
  98         }
  99       else if (*s > 0x20)
 100         *d++ = *s++;
 101       else if (Cspace(*s))
 102         {
 103           byte *s0 = s;
 104           while (Cspace(*s))
 105             s++;
 106           if (!url_ignore_spaces || !(!*s || d == dstart))
 107             {
 108               while (Cspace(*s0))
 109                 {
 110                   if (d >= end)
 111                     return URL_ERR_TOO_LONG;
 112                   *d++ = *s0++;
 113                 }
 114             }
 115         }
 116       else
 117         return URL_ERR_INVALID_CHAR;
 118     }
 119   *d = 0;
 120   return 0;
 121 }
 122
 123 int
 124 url_enescape(byte *s, byte *d)
 125 {
 126   byte *end = d + MAX_URL_SIZE - 10;
 127   unsigned int c;
 128
 129   while (c = *s)
 130     {
 131       if (d >= end)
 132         return URL_ERR_TOO_LONG;
 133       if (Calnum(c) ||                                                  /* RFC 1738(2.2): Only alphanumerics ... */
 134           c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||   /* ... and several other exceptions ... */
 135           c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
 136           c == ',' ||
 137           c == '/' || c == '?' || c == ':' || c == '@' ||               /* ... and reserved chars used for reserved purpose */
 138           c == '=' || c == '&' || c == '#' || c == ';')
 139         *d++ = *s++;
 140       else
 141         {
 142           uns val = (*s < NCC_MAX) ? NCC_CHARS[*s] : *s;
 143           *d++ = '%';
 144           *d++ = enhex(val >> 4);
 145           *d++ = enhex(val & 0x0f);
 146           s++;
 147         }
 148     }
 149   *d = 0;
 150   return 0;
 151 }
 152
 153 /* Split an URL (several parts may be copied to the destination buffer) */
 154
 155 byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
 156 static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
 157
 158 uns
 159 identify_protocol(byte *p)
 160 {
 161   uns i;
 162
 163   for(i=1; i<URL_PROTO_MAX; i++)
 164     if (!strcasecmp(p, url_proto_names[i]))
 165       return i;
 166   return URL_PROTO_UNKNOWN;
 167 }
 168
 169 int
 170 url_split(byte *s, struct url *u, byte *d)
 171 {
 172   bzero(u, sizeof(struct url));
 173   u->port = ~0;
 174   u->bufend = d + MAX_URL_SIZE - 10;
 175
 176   if (s[0] != '/')                      /* Seek for "protocol:" */
 177     {
 178       byte *p = s;
 179       while (*p && Calnum(*p))
 180         p++;
 181       if (p != s && *p == ':')
 182         {
 183           u->protocol = d;
 184           while (s < p)
 185             *d++ = *s++;
 186           *d++ = 0;
 187           u->protoid = identify_protocol(u->protocol);
 188           s++;
 189           if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
 190             {
 191               /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
 192               int len = d - u->protocol;
 193               d -= len;
 194               s -= len;
 195               u->protocol = NULL;
 196               u->protoid = 0;
 197             }
 198         }
 199     }
 200
 201   if (s[0] == '/')                      /* Host spec or absolute path */
 202     {
 203       if (s[1] == '/')                  /* Host spec */
 204         {
 205           byte *q, *w, *e;
 206           char *ep;
 207
 208           s += 2;
 209           q = d;
 210           while (*s && *s != '/' && *s != '?')  /* Copy user:passwd@host:port */
 211             *d++ = *s++;
 212           *d++ = 0;
 213           w = strchr(q, '@');
 214           if (w)                        /* user:passwd present */
 215             {
 216               *w++ = 0;
 217               u->user = q;
 218               if (e = strchr(q, ':'))
 219                 {
 220                   *e++ = 0;
 221                   u->pass = e;
 222                 }
 223             }
 224           else
 225             w = q;
 226           e = strchr(w, ':');
 227           if (e)                        /* host:port present */
 228             {
 229               uns p;
 230               *e++ = 0;
 231               p = strtoul(e, &ep, 10);
 232               if (ep && *ep || p > 65535)
 233                 return URL_ERR_INVALID_PORT;
 234               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 235                 u->port = p;
 236             }
 237           u->host = w;
 238         }
 239     }
 240
 241   u->rest = s;
 242   u->buf = d;
 243   return 0;
 244 }
 245
 246 /* Normalization according to given base URL */
 247
 248 static uns std_ports[] = URL_DEFPORTS;  /* Default port numbers */
 249
 250 static int
 251 relpath_merge(struct url *u, struct url *b)
 252 {
 253   byte *a = u->rest;
 254   byte *o = b->rest;
 255   byte *d = u->buf;
 256   byte *e = u->bufend;
 257   byte *p;
 258
 259   if (a[0] == '/')                      /* Absolute path => OK */
 260     return 0;
 261   if (o[0] != '/' && o[0] != '?')
 262     return URL_PATH_UNDERFLOW;
 263
 264   if (!a[0])                            /* Empty URL -> inherit everything */
 265     {
 266       u->rest = b->rest;
 267       return 0;
 268     }
 269
 270   u->rest = d;                          /* We know we'll need to copy the path somewhere else */
 271
 272   if (a[0] == '#')                      /* Another fragment */
 273     {
 274       for(p=o; *p && *p != '#'; p++)
 275         ;
 276       goto copy;
 277     }
 278   if (a[0] == '?')                      /* New query */
 279     {
 280       for(p=o; *p && *p != '#' && *p != '?'; p++)
 281         ;
 282       goto copy;
 283     }
 284   if (a[0] == ';')                      /* Change parameters */
 285     {
 286       for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++)
 287         ;
 288       goto copy;
 289     }
 290
 291   p = NULL;                             /* Copy original path and find the last slash */
 292   while (*o && *o != ';' && *o != '?' && *o != '#')
 293     {
 294       if (d >= e)
 295         return URL_ERR_TOO_LONG;
 296       if ((*d++ = *o++) == '/')
 297         p = d;
 298     }
 299   if (!p)
 300     return URL_ERR_REL_NOTHING;
 301   d = p;
 302
 303   while (*a)
 304     {
 305       if (a[0] == '.')
 306         {
 307           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 308             {
 309               a++;
 310               if (a[0])
 311                 a++;
 312               continue;
 313             }
 314           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 315             {
 316               a += 2;
 317               if (a[0])
 318                 a++;
 319               if (d <= u->buf + 1)
 320                 {
 321                   /*
 322                    * RFC 1808 says we should leave ".." as a path segment, but
 323                    * we intentionally break the rule and refuse the URL.
 324                    */
 325                   if (!url_ignore_underflow)
 326                     return URL_PATH_UNDERFLOW;
 327                 }
 328               else
 329                 {
 330                   d--;                  /* Discard trailing slash */
 331                   while (d[-1] != '/')
 332                     d--;
 333                 }
 334               continue;
 335             }
 336         }
 337       while (a[0] && a[0] != '/')
 338         {
 339           if (d >= e)
 340             return URL_ERR_TOO_LONG;
 341           *d++ = *a++;
 342         }
 343       if (a[0])
 344         *d++ = *a++;
 345     }
 346
 347 okay:
 348   *d++ = 0;
 349   u->buf = d;
 350   return 0;
 351
 352 copy:                                   /* Combine part of old URL with the new one */
 353   while (o < p)
 354     if (d < e)
 355       *d++ = *o++;
 356     else
 357       return URL_ERR_TOO_LONG;
 358   while (*a)
 359     if (d < e)
 360       *d++ = *a++;
 361     else
 362       return URL_ERR_TOO_LONG;
 363   goto okay;
 364 }
 365
 366 int
 367 url_normalize(struct url *u, struct url *b)
 368 {
 369   int err;
 370
 371   /* Basic checks */
 372   if (url_proto_path_flags[u->protoid] && !u->host ||
 373       u->host && !*u->host ||
 374       !u->host && u->user ||
 375       !u->user && u->pass ||
 376       !u->rest)
 377     return URL_SYNTAX_ERROR;
 378
 379   if (!u->protocol)
 380     {
 381       /* Now we know it's a relative URL. Do we have any base? */
 382       if (!b || !url_proto_path_flags[b->protoid])
 383         return URL_ERR_REL_NOTHING;
 384       u->protocol = b->protocol;
 385       u->protoid = b->protoid;
 386
 387       /* Reference to the same host */
 388       if (!u->host)
 389         {
 390           u->host = b->host;
 391           u->user = b->user;
 392           u->pass = b->pass;
 393           u->port = b->port;
 394           if (err = relpath_merge(u, b))
 395             return err;
 396         }
 397     }
 398
 399   /* Fill in missing info */
 400   if (u->port == ~0U)
 401     u->port = std_ports[u->protoid];
 402
 403   return 0;
 404 }
 405
 406 /* Name canonicalization */
 407
 408 static void
 409 lowercase(byte *b)
 410 {
 411   if (b)
 412     while (*b)
 413       {
 414         if (*b >= 'A' && *b <= 'Z')
 415           *b = *b + 0x20;
 416         b++;
 417       }
 418 }
 419
 420 static void
 421 kill_end_dot(byte *b)
 422 {
 423   byte *k;
 424
 425   if (b)
 426     {
 427       k = b + strlen(b) - 1;
 428       while (k > b && *k == '.')
 429         *k-- = 0;
 430     }
 431 }
 432
 433 int
 434 url_canonicalize(struct url *u)
 435 {
 436   char *c;
 437
 438   lowercase(u->protocol);
 439   lowercase(u->host);
 440   kill_end_dot(u->host);
 441   if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
 442     u->rest = "/";
 443   if (u->rest && (c = strchr(u->rest, '#')))    /* Kill fragment reference */
 444     *c = 0;
 445   return 0;
 446 }
 447
 448 /* Pack a broken-down URL */
 449
 450 static byte *
 451 append(byte *d, byte *s, byte *e)
 452 {
 453   if (d)
 454     while (*s)
 455       {
 456         if (d >= e)
 457           return NULL;
 458         *d++ = *s++;
 459       }
 460   return d;
 461 }
 462
 463 int
 464 url_pack(struct url *u, byte *d)
 465 {
 466   byte *e = d + MAX_URL_SIZE - 10;
 467
 468   if (u->protocol)
 469     {
 470       d = append(d, u->protocol, e);
 471       d = append(d, ":", e);
 472       u->protoid = identify_protocol(u->protocol);
 473     }
 474   if (u->host)
 475     {
 476       d = append(d, "//", e);
 477       if (u->user)
 478         {
 479           d = append(d, u->user, e);
 480           if (u->pass)
 481             {
 482               d = append(d, ":", e);
 483               d = append(d, u->pass, e);
 484             }
 485           d = append(d, "@", e);
 486         }
 487       d = append(d, u->host, e);
 488       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 489         {
 490           char z[10];
 491           sprintf(z, "%d", u->port);
 492           d = append(d, ":", e);
 493           d = append(d, z, e);
 494         }
 495     }
 496   if (u->rest)
 497     d = append(d, u->rest, e);
 498   if (!d)
 499     return URL_ERR_TOO_LONG;
 500   *d = 0;
 501   return 0;
 502 }
 503
 504 /* Error messages */
 505
 506 static char *errmsg[] = {
 507   "Something is wrong",
 508   "Too long",
 509   "Invalid character",
 510   "Invalid escape",
 511   "Invalid escaped character",
 512   "Invalid port number",
 513   "Relative URL not allowed",
 514   "Unknown protocol",
 515   "Syntax error",
 516   "Path underflow"
 517 };
 518
 519 char *
 520 url_error(uns err)
 521 {
 522   if (err >= sizeof(errmsg) / sizeof(char *))
 523     err = 0;
 524   return errmsg[err];
 525 }
 526
 527 /* Standard cookbook recipes */
 528
 529 int
 530 url_canon_split(byte *u, byte *buf1, byte *buf2, struct url *url)
 531 {
 532   int err;
 533
 534   if (err = url_deescape(u, buf1))
 535     return err;
 536   if (err = url_split(buf1, url, buf2))
 537     return err;
 538   if (err = url_normalize(url, NULL))
 539     return err;
 540   return url_canonicalize(url);
 541 }
 542
 543 int
 544 url_auto_canonicalize(byte *src, byte *dst)
 545 {
 546   byte buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
 547   int err;
 548   struct url ur;
 549
 550   (void)((err = url_canon_split(src, buf1, buf2, &ur)) ||
 551    (err = url_pack(&ur, buf3)) ||
 552    (err = url_enescape(buf3, dst)));
 553   return err;
 554 }
 555
 556 /* Testing */
 557
 #ifdef TEST

 /*
  * Test driver: `url [base]'.  Runs the URL given as the first argument
  * through every processing stage (relative to the base, which defaults to a
  * built-in one) and prints the result of each stage.  Returns 1 on wrong
  * usage or when any stage fails.
  */
 int main(int argc, char **argv)
 {
   char deesc[MAX_URL_SIZE], parts[MAX_URL_SIZE], base_parts[MAX_URL_SIZE], packed[MAX_URL_SIZE];
   struct url url, url0;
   char *base = "http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment";
   int err;

   if (argc != 2 && argc != 3)
     return 1;
   if (argc == 3)
     base = argv[2];

   err = url_deescape(argv[1], deesc);
   if (err)
     {
       printf("deesc: error %d\n", err);
       return 1;
     }
   printf("deesc: %s\n", deesc);

   err = url_split(deesc, &url, parts);
   if (err)
     {
       printf("split: error %d\n", err);
       return 1;
     }
   printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);

   /* The base is split and normalized on its own (without de-escaping). */
   err = url_split(base, &url0, base_parts);
   if (err)
     {
       printf("split base: error %d\n", err);
       return 1;
     }
   err = url_normalize(&url0, NULL);
   if (err)
     {
       printf("normalize base: error %d\n", err);
       return 1;
     }
   printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest);

   err = url_normalize(&url, &url0);
   if (err)
     {
       printf("normalize: error %d\n", err);
       return 1;
     }
   printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);

   err = url_canonicalize(&url);
   if (err)
     {
       printf("canonicalize: error %d\n", err);
       return 1;
     }
   printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);

   err = url_pack(&url, packed);
   if (err)
     {
       printf("pack: error %d\n", err);
       return 1;
     }
   printf("pack: %s\n", packed);

   /* The split buffer is reused for the escaped output, as nothing reads the parts any more. */
   err = url_enescape(packed, parts);
   if (err)
     {
       printf("enesc: error %d\n", err);
       return 1;
     }
   printf("enesc: %s\n", parts);
   return 0;
 }

 #endif
 622
 623 struct component {
 624         byte *start;
 625         int length;
 626         u32 hash;
 627 };
 628
 629 static inline u32
 630 hashf(byte *start, int length)
 631 {
 632         u32 hf = length;
 633         while (length-- > 0)
 634                 hf = (hf << 8 | hf >> 24) ^ *start++;
 635         return hf;
 636 }
 637
 638 static inline uns
 639 repeat_count(struct component *comp, uns count, uns len)
 640 {
 641         struct component *orig_comp = comp;
 642         uns found = 0;
 643         while (1)
 644         {
 645                 uns i;
 646                 comp += len;
 647                 count -= len;
 648                 found++;
 649                 if (count < len)
 650                         return found;
 651                 for (i=0; i<len; i++)
 652                         if (comp[i].hash != orig_comp[i].hash
 653                         || comp[i].length != orig_comp[i].length
 654                         || memcmp(comp[i].start, orig_comp[i].start, comp[i].length))
 655                                 return found;
 656         }
 657 }
 658
 659 int
 660 url_has_repeated_component(byte *url)
 661 {
 662         struct component *comp;
 663         uns comps, comp_len, rep_prefix;
 664         byte *c;
 665         uns i;
 666
 667         for (comps=0, c=url; c; comps++)
 668         {
 669                 c = strpbrk(c, url_component_separators);
 670                 if (c)
 671                         c++;
 672         }
 673         if (comps < url_min_repeat_count)
 674                 return 0;
 675         comp = alloca(comps * sizeof(struct component));
 676         for (i=0, c=url; c; i++)
 677         {
 678                 comp[i].start = c;
 679                 c = strpbrk(c, url_component_separators);
 680                 if (c)
 681                 {
 682                         comp[i].length = c - comp[i].start;
 683                         c++;
 684                 }
 685                 else
 686                         comp[i].length = strlen(comp[i].start);
 687         }
 688         ASSERT(i == comps);
 689         for (i=0; i<comps; i++)
 690                 comp[i].hash = hashf(comp[i].start, comp[i].length);
 691         for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
 692                 for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
 693                         if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)
 694                                 return comp_len;
 695         return 0;
 696 }