lib/url.c

   1 /*
   2  *      Sherlock Library -- URL Functions (according to RFC 1738 and 1808)
   3  *
   4  *      (c) 1997--2001 Martin Mares <mj@ucw.cz>
   5  *      (c) 2001 Robert Spalek <robert@ucw.cz>
   6  *
   7  *      This software may be freely distributed and used according to the terms
   8  *      of the GNU Lesser General Public License.
   9  */
  10
  11 #include "lib/lib.h"
  12 #include "lib/url.h"
  13 #include "lib/chartype.h"
  14 #include "lib/conf.h"
  15
  16 #include <string.h>
  17 #include <stdlib.h>
  18 #include <stdio.h>
  19
  20 /* Configuration */
  21
/* Nonzero: url_deescape() drops whitespace runs at the very start or end of the URL. */
static uns url_ignore_spaces;
/* Nonzero: relpath_merge() ignores ".." segments that would climb above the root instead of failing. */
static uns url_ignore_underflow;
/* Set of characters at which url_has_repeated_component() splits the URL (handed to strpbrk). */
static byte *url_component_separators = "";
/* Minimum number of consecutive repetitions url_has_repeated_component() reports. */
static uns url_min_repeat_count = 0x7fffffff;
/* Longest repeated sequence (in components) searched for; with 0 the search loop never runs. */
static uns url_max_repeat_length = 0;
  27
/*
 * Items of the "URL" configuration section; each entry binds an option name
 * to one of the static variables above. Registered by url_init_config().
 */
static struct cfitem url_config[] = {
  { "URL",              CT_SECTION,     NULL },
  { "IgnoreSpaces",     CT_INT,         &url_ignore_spaces },
  { "IgnoreUnderflow",  CT_INT,         &url_ignore_underflow },
  { "ComponentSeparators",      CT_STRING,      &url_component_separators },
  { "MinRepeatCount",           CT_INT,         &url_min_repeat_count },
  { "MaxRepeatLength",          CT_INT,         &url_max_repeat_length },
  { NULL,               CT_STOP,        NULL }
};
  37
  38 static void CONSTRUCTOR url_init_config(void)
  39 {
  40   cf_register(url_config);
  41 }
  42
  43 /* Escaping and de-escaping */
  44
  45 static uns
  46 enhex(uns x)
  47 {
  48   return (x<10) ? (x + '0') : (x - 10 + 'A');
  49 }
  50
  51 int
  52 url_deescape(byte *s, byte *d)
  53 {
  54   byte *dstart = d;
  55   byte *end = d + MAX_URL_SIZE - 10;
  56   while (*s)
  57     {
  58       if (d >= end)
  59         return URL_ERR_TOO_LONG;
  60       if (*s == '%')
  61         {
  62           unsigned int val;
  63           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  64             return URL_ERR_INVALID_ESCAPE;
  65           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  66           if (val < 0x20)
  67             return URL_ERR_INVALID_ESCAPED_CHAR;
  68           switch (val)
  69             {
  70             case ';':
  71               val = NCC_SEMICOLON; break;
  72             case '/':
  73               val = NCC_SLASH; break;
  74             case '?':
  75               val = NCC_QUEST; break;
  76             case ':':
  77               val = NCC_COLON; break;
  78             case '@':
  79               val = NCC_AT; break;
  80             case '=':
  81               val = NCC_EQUAL; break;
  82             case '&':
  83               val = NCC_AND; break;
  84             case '#':
  85               val = NCC_HASH; break;
  86             }
  87           *d++ = val;
  88           s += 3;
  89         }
  90       else if (*s > 0x20)
  91         *d++ = *s++;
  92       else if (Cspace(*s))
  93         {
  94           byte *s0 = s;
  95           while (Cspace(*s))
  96             s++;
  97           if (!url_ignore_spaces || !(!*s || d == dstart))
  98             {
  99               while (Cspace(*s0))
 100                 {
 101                   if (d >= end)
 102                     return URL_ERR_TOO_LONG;
 103                   *d++ = *s0++;
 104                 }
 105             }
 106         }
 107       else
 108         return URL_ERR_INVALID_CHAR;
 109     }
 110   *d = 0;
 111   return 0;
 112 }
 113
 114 int
 115 url_enescape(byte *s, byte *d)
 116 {
 117   byte *end = d + MAX_URL_SIZE - 10;
 118   unsigned int c;
 119
 120   while (c = *s)
 121     {
 122       if (d >= end)
 123         return URL_ERR_TOO_LONG;
 124       if (Calnum(c) ||                                                  /* RFC 1738(2.2): Only alphanumerics ... */
 125           c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||   /* ... and several other exceptions ... */
 126           c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
 127           c == ',' ||
 128           c == '/' || c == '?' || c == ':' || c == '@' ||               /* ... and reserved chars used for reserved purpose */
 129           c == '=' || c == '&' || c == '#' || c == ';')
 130         *d++ = *s++;
 131       else
 132         {
 133           uns val = (*s < NCC_MAX) ? NCC_CHARS[*s] : *s;
 134           *d++ = '%';
 135           *d++ = enhex(val >> 4);
 136           *d++ = enhex(val & 0x0f);
 137           s++;
 138         }
 139     }
 140   *d = 0;
 141   return 0;
 142 }
 143
 144 /* Split an URL (several parts may be copied to the destination buffer) */
 145
/* Protocol names indexed by protocol ID; identify_protocol() searches entries 1..URL_PROTO_MAX-1. */
byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
/* Per-protocol flag, indexed by protocol ID: nonzero if the protocol requires a "//host" part. */
static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
 148
 149 uns
 150 identify_protocol(byte *p)
 151 {
 152   uns i;
 153
 154   for(i=1; i<URL_PROTO_MAX; i++)
 155     if (!strcasecmp(p, url_proto_names[i]))
 156       return i;
 157   return URL_PROTO_UNKNOWN;
 158 }
 159
 160 int
 161 url_split(byte *s, struct url *u, byte *d)
 162 {
 163   bzero(u, sizeof(struct url));
 164   u->port = ~0;
 165   u->bufend = d + MAX_URL_SIZE - 10;
 166
 167   if (s[0] != '/')                      /* Seek for "protocol:" */
 168     {
 169       byte *p = s;
 170       while (*p && Calnum(*p))
 171         p++;
 172       if (p != s && *p == ':')
 173         {
 174           u->protocol = d;
 175           while (s < p)
 176             *d++ = *s++;
 177           *d++ = 0;
 178           u->protoid = identify_protocol(u->protocol);
 179           s++;
 180           if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
 181             {
 182               /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
 183               int len = d - u->protocol;
 184               d -= len;
 185               s -= len;
 186               u->protocol = NULL;
 187               u->protoid = 0;
 188             }
 189         }
 190     }
 191
 192   if (s[0] == '/')                      /* Host spec or absolute path */
 193     {
 194       if (s[1] == '/')                  /* Host spec */
 195         {
 196           byte *q, *w, *e;
 197           char *ep;
 198
 199           s += 2;
 200           q = d;
 201           while (*s && *s != '/')       /* Copy user:passwd@host:port */
 202             *d++ = *s++;
 203           *d++ = 0;
 204           w = strchr(q, '@');
 205           if (w)                        /* user:passwd present */
 206             {
 207               *w++ = 0;
 208               u->user = q;
 209             }
 210           else
 211             w = q;
 212           e = strchr(w, ':');
 213           if (e)                        /* host:port present */
 214             {
 215               uns p;
 216               *e++ = 0;
 217               p = strtoul(e, &ep, 10);
 218               if (ep && *ep || p > 65535)
 219                 return URL_ERR_INVALID_PORT;
 220               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 221                 u->port = p;
 222             }
 223           u->host = w;
 224         }
 225     }
 226
 227   u->rest = s;
 228   u->buf = d;
 229   return 0;
 230 }
 231
 232 /* Normalization according to given base URL */
 233
/* Default port numbers, indexed by protocol ID */
static uns std_ports[] = URL_DEFPORTS;
 235
 236 static int
 237 relpath_merge(struct url *u, struct url *b)
 238 {
 239   byte *a = u->rest;
 240   byte *o = b->rest;
 241   byte *d = u->buf;
 242   byte *e = u->bufend;
 243   byte *p;
 244
 245   if (a[0] == '/')                      /* Absolute path => OK */
 246     return 0;
 247   if (o[0] != '/')
 248     return URL_PATH_UNDERFLOW;
 249
 250   if (!a[0])                            /* Empty URL -> inherit everything */
 251     {
 252       u->rest = b->rest;
 253       return 0;
 254     }
 255
 256   u->rest = d;                          /* We know we'll need to copy the path somewhere else */
 257
 258   if (a[0] == '#')                      /* Another fragment */
 259     {
 260       for(p=o; *p && *p != '#'; p++)
 261         ;
 262       goto copy;
 263     }
 264   if (a[0] == '?')                      /* New query */
 265     {
 266       for(p=o; *p && *p != '#' && *p != '?'; p++)
 267         ;
 268       goto copy;
 269     }
 270   if (a[0] == ';')                      /* Change parameters */
 271     {
 272       for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++)
 273         ;
 274       goto copy;
 275     }
 276
 277   p = NULL;                             /* Copy original path and find the last slash */
 278   while (*o && *o != ';' && *o != '?' && *o != '#')
 279     {
 280       if (d >= e)
 281         return URL_ERR_TOO_LONG;
 282       if ((*d++ = *o++) == '/')
 283         p = d;
 284     }
 285   if (!p)
 286     return URL_ERR_REL_NOTHING;
 287   d = p;
 288
 289   while (*a)
 290     {
 291       if (a[0] == '.')
 292         {
 293           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 294             {
 295               a++;
 296               if (a[0])
 297                 a++;
 298               continue;
 299             }
 300           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 301             {
 302               a += 2;
 303               if (a[0])
 304                 a++;
 305               if (d <= u->buf + 1)
 306                 {
 307                   /*
 308                    * RFC 1808 says we should leave ".." as a path segment, but
 309                    * we intentionally break the rule and refuse the URL.
 310                    */
 311                   if (!url_ignore_underflow)
 312                     return URL_PATH_UNDERFLOW;
 313                 }
 314               else
 315                 {
 316                   d--;                  /* Discard trailing slash */
 317                   while (d[-1] != '/')
 318                     d--;
 319                 }
 320               continue;
 321             }
 322         }
 323       while (a[0] && a[0] != '/')
 324         {
 325           if (d >= e)
 326             return URL_ERR_TOO_LONG;
 327           *d++ = *a++;
 328         }
 329       if (a[0])
 330         *d++ = *a++;
 331     }
 332
 333 okay:
 334   *d++ = 0;
 335   u->buf = d;
 336   return 0;
 337
 338 copy:                                   /* Combine part of old URL with the new one */
 339   while (o < p)
 340     if (d < e)
 341       *d++ = *o++;
 342     else
 343       return URL_ERR_TOO_LONG;
 344   while (*a)
 345     if (d < e)
 346       *d++ = *a++;
 347     else
 348       return URL_ERR_TOO_LONG;
 349   goto okay;
 350 }
 351
 352 int
 353 url_normalize(struct url *u, struct url *b)
 354 {
 355   int err;
 356
 357   /* Basic checks */
 358   if (url_proto_path_flags[u->protoid] && !u->host ||
 359       u->host && !*u->host ||
 360       !u->host && u->user ||
 361       !u->rest)
 362     return URL_SYNTAX_ERROR;
 363
 364   if (!u->protocol)
 365     {
 366       /* Now we know it's a relative URL. Do we have any base? */
 367       if (!b || !url_proto_path_flags[b->protoid])
 368         return URL_ERR_REL_NOTHING;
 369       u->protocol = b->protocol;
 370       u->protoid = b->protoid;
 371
 372       /* Reference to the same host */
 373       if (!u->host)
 374         {
 375           u->host = b->host;
 376           u->user = b->user;
 377           u->port = b->port;
 378           if (err = relpath_merge(u, b))
 379             return err;
 380         }
 381     }
 382
 383   /* Fill in missing info */
 384   if (u->port == ~0U)
 385     u->port = std_ports[u->protoid];
 386
 387   return 0;
 388 }
 389
 390 /* Name canonicalization */
 391
 392 static void
 393 lowercase(byte *b)
 394 {
 395   if (b)
 396     while (*b)
 397       {
 398         if (*b >= 'A' && *b <= 'Z')
 399           *b = *b + 0x20;
 400         b++;
 401       }
 402 }
 403
 404 static void
 405 kill_end_dot(byte *b)
 406 {
 407   byte *k;
 408
 409   if (b)
 410     {
 411       k = b + strlen(b) - 1;
 412       if (k > b && *k == '.')
 413         *k = 0;
 414     }
 415 }
 416
 417 int
 418 url_canonicalize(struct url *u)
 419 {
 420   char *c;
 421
 422   lowercase(u->protocol);
 423   lowercase(u->host);
 424   kill_end_dot(u->host);
 425   if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
 426     u->rest = "/";
 427   if (u->rest && (c = strchr(u->rest, '#')))    /* Kill fragment reference */
 428     *c = 0;
 429   return 0;
 430 }
 431
 432 /* Pack a broken-down URL */
 433
 434 static byte *
 435 append(byte *d, byte *s, byte *e)
 436 {
 437   if (d)
 438     while (*s)
 439       {
 440         if (d >= e)
 441           return NULL;
 442         *d++ = *s++;
 443       }
 444   return d;
 445 }
 446
 447 int
 448 url_pack(struct url *u, byte *d)
 449 {
 450   byte *e = d + MAX_URL_SIZE - 10;
 451
 452   if (u->protocol)
 453     {
 454       d = append(d, u->protocol, e);
 455       d = append(d, ":", e);
 456       u->protoid = identify_protocol(u->protocol);
 457     }
 458   if (u->host)
 459     {
 460       d = append(d, "//", e);
 461       if (u->user)
 462         {
 463           d = append(d, u->user, e);
 464           d = append(d, "@", e);
 465         }
 466       d = append(d, u->host, e);
 467       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 468         {
 469           char z[10];
 470           sprintf(z, "%d", u->port);
 471           d = append(d, ":", e);
 472           d = append(d, z, e);
 473         }
 474     }
 475   if (u->rest)
 476     d = append(d, u->rest, e);
 477   if (!d)
 478     return URL_ERR_TOO_LONG;
 479   *d = 0;
 480   return 0;
 481 }
 482
 483 /* Error messages */
 484
/*
 * Human-readable messages returned by url_error(), indexed by error code.
 * Entry 0 doubles as the fallback for codes outside the table.
 * NOTE(review): the order must match the error code values defined in
 * lib/url.h -- confirm when adding new codes.
 */
static char *errmsg[] = {
  "Something is wrong",
  "Too long",
  "Invalid character",
  "Invalid escape",
  "Invalid escaped character",
  "Invalid port number",
  "Relative URL not allowed",
  "Unknown protocol",
  "Syntax error",
  "Path underflow"
};
 497
 498 char *
 499 url_error(uns err)
 500 {
 501   if (err >= sizeof(errmsg) / sizeof(char *))
 502     err = 0;
 503   return errmsg[err];
 504 }
 505
 506 /* Standard cookbook recipes */
 507
 508 int
 509 url_canon_split(byte *u, byte *buf1, byte *buf2, struct url *url)
 510 {
 511   int err;
 512
 513   if (err = url_deescape(u, buf1))
 514     return err;
 515   if (err = url_split(buf1, url, buf2))
 516     return err;
 517   if (err = url_normalize(url, NULL))
 518     return err;
 519   return url_canonicalize(url);
 520 }
 521
 522 int
 523 url_auto_canonicalize(byte *src, byte *dst)
 524 {
 525   byte buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
 526   int err;
 527   struct url ur;
 528
 529   (void)((err = url_canon_split(src, buf1, buf2, &ur)) ||
 530    (err = url_pack(&ur, buf3)) ||
 531    (err = url_enescape(buf3, dst)));
 532   return err;
 533 }
 534
 535 /* Testing */
 536
 537 #ifdef TEST
 538
 539 int main(int argc, char **argv)
 540 {
 541   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
 542   int err;
 543   struct url url, url0;
 544
 545   if (argc != 2)
 546     return 1;
 547   if (err = url_deescape(argv[1], buf1))
 548     {
 549       printf("deesc: error %d\n", err);
 550       return 1;
 551     }
 552   printf("deesc: %s\n", buf1);
 553   if (err = url_split(buf1, &url, buf2))
 554     {
 555       printf("split: error %d\n", err);
 556       return 1;
 557     }
 558   printf("split: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 559   if (err = url_split("http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment", &url0, buf3))
 560     {
 561       printf("split base: error %d\n", err);
 562       return 1;
 563     }
 564   if (err = url_normalize(&url0, NULL))
 565     {
 566       printf("normalize base: error %d\n", err);
 567       return 1;
 568     }
 569   printf("base: @%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.host, url0.port, url0.rest);
 570   if (err = url_normalize(&url, &url0))
 571     {
 572       printf("normalize: error %d\n", err);
 573       return 1;
 574     }
 575   printf("normalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 576   if (err = url_canonicalize(&url))
 577     {
 578       printf("canonicalize: error %d\n", err);
 579       return 1;
 580     }
 581   printf("canonicalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 582   if (err = url_pack(&url, buf4))
 583     {
 584       printf("pack: error %d\n", err);
 585       return 1;
 586     }
 587   printf("pack: %s\n", buf4);
 588   if (err = url_enescape(buf4, buf2))
 589     {
 590       printf("enesc: error %d\n", err);
 591       return 1;
 592     }
 593   printf("enesc: %s\n", buf2);
 594   return 0;
 595 }
 596
 597 #endif
 598
/* One separator-delimited piece of a URL, as examined by url_has_repeated_component() */
struct component {
        byte *start;            /* First character of the component (points into the URL, not NUL-terminated) */
        int length;             /* Number of characters, the separator excluded */
        u32 hash;               /* hashf() of the component, used to reject mismatches quickly */
};
 604
 605 static inline u32
 606 hashf(byte *start, int length)
 607 {
 608         u32 hf = length;
 609         while (length-- > 0)
 610                 hf = (hf << 8 | hf >> 24) ^ *start++;
 611         return hf;
 612 }
 613
 614 static inline uns
 615 repeat_count(struct component *comp, uns count, uns len)
 616 {
 617         struct component *orig_comp = comp;
 618         uns found = 0;
 619         while (1)
 620         {
 621                 uns i;
 622                 comp += len;
 623                 count -= len;
 624                 found++;
 625                 if (count < len)
 626                         return found;
 627                 for (i=0; i<len; i++)
 628                         if (comp[i].hash != orig_comp[i].hash
 629                         || comp[i].length != orig_comp[i].length
 630                         || memcmp(comp[i].start, orig_comp[i].start, comp[i].length))
 631                                 return found;
 632         }
 633 }
 634
 635 int
 636 url_has_repeated_component(byte *url)
 637 {
 638         struct component *comp;
 639         uns comps, comp_len, rep_prefix;
 640         byte *c;
 641         uns i;
 642
 643         for (comps=0, c=url; c; comps++)
 644         {
 645                 c = strpbrk(c, url_component_separators);
 646                 if (c)
 647                         c++;
 648         }
 649         if (comps < url_min_repeat_count)
 650                 return 0;
 651         comp = alloca(comps * sizeof(struct component));
 652         for (i=0, c=url; c; i++)
 653         {
 654                 comp[i].start = c;
 655                 c = strpbrk(c, url_component_separators);
 656                 if (c)
 657                 {
 658                         comp[i].length = c - comp[i].start;
 659                         c++;
 660                 }
 661                 else
 662                         comp[i].length = strlen(comp[i].start);
 663         }
 664         ASSERT(i == comps);
 665         for (i=0; i<comps; i++)
 666                 comp[i].hash = hashf(comp[i].start, comp[i].length);
 667         for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
 668                 for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
 669                         if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)
 670                                 return comp_len;
 671         return 0;
 672 }