lib/url.c

   1 /*
   2  *      Sherlock Library -- URL Functions (according to RFC 1738 and 1808)
   3  *
   4  *      (c) 1997--2001 Martin Mares <mj@ucw.cz>
   5  *      (c) 2001 Robert Spalek <robert@ucw.cz>
   6  */
   7
   8 #include "lib/lib.h"
   9 #include "lib/url.h"
  10 #include "lib/chartype.h"
  11 #include "lib/conf.h"
  12
  13 #include <string.h>
  14 #include <stdlib.h>
  15 #include <stdio.h>
  16
  17 /* Configuration */
  18
  19 static uns url_ignore_spaces;
  20 static uns url_ignore_underflow;
  21 static byte *url_component_separators = "";
  22 static uns url_min_repeat_count = 0x7fffffff;
  23 static uns url_max_repeat_length = 0;
  24
  25 static struct cfitem url_config[] = {
  26   { "URL",              CT_SECTION,     NULL },
  27   { "IgnoreSpaces",     CT_INT,         &url_ignore_spaces },
  28   { "IgnoreUnderflow",  CT_INT,         &url_ignore_underflow },
  29   { "ComponentSeparators",      CT_STRING,      &url_component_separators },
  30   { "MinRepeatCount",           CT_INT,         &url_min_repeat_count },
  31   { "MaxRepeatLength",          CT_INT,         &url_max_repeat_length },
  32   { NULL,               CT_STOP,        NULL }
  33 };
  34
  35 static void CONSTRUCTOR url_init_config(void)
  36 {
  37   cf_register(url_config);
  38 }
  39
  40 /* Escaping and de-escaping */
  41
  42 static uns
  43 enhex(uns x)
  44 {
  45   return (x<10) ? (x + '0') : (x - 10 + 'A');
  46 }
  47
  48 int
  49 url_deescape(byte *s, byte *d)
  50 {
  51   byte *dstart = d;
  52   byte *end = d + MAX_URL_SIZE - 10;
  53   while (*s)
  54     {
  55       if (d >= end)
  56         return URL_ERR_TOO_LONG;
  57       if (*s == '%')
  58         {
  59           unsigned int val;
  60           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  61             return URL_ERR_INVALID_ESCAPE;
  62           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  63           if (val < 0x20)
  64             return URL_ERR_INVALID_ESCAPED_CHAR;
  65           switch (val)
  66             {
  67             case ';':
  68               val = NCC_SEMICOLON; break;
  69             case '/':
  70               val = NCC_SLASH; break;
  71             case '?':
  72               val = NCC_QUEST; break;
  73             case ':':
  74               val = NCC_COLON; break;
  75             case '@':
  76               val = NCC_AT; break;
  77             case '=':
  78               val = NCC_EQUAL; break;
  79             case '&':
  80               val = NCC_AND; break;
  81             case '#':
  82               val = NCC_HASH; break;
  83             }
  84           *d++ = val;
  85           s += 3;
  86         }
  87       else if (*s > 0x20)
  88         *d++ = *s++;
  89       else if (Cspace(*s))
  90         {
  91           byte *s0 = s;
  92           while (Cspace(*s))
  93             s++;
  94           if (!url_ignore_spaces || !(!*s || d == dstart))
  95             {
  96               while (Cspace(*s0))
  97                 {
  98                   if (d >= end)
  99                     return URL_ERR_TOO_LONG;
 100                   *d++ = *s0++;
 101                 }
 102             }
 103         }
 104       else
 105         return URL_ERR_INVALID_CHAR;
 106     }
 107   *d = 0;
 108   return 0;
 109 }
 110
 111 int
 112 url_enescape(byte *s, byte *d)
 113 {
 114   byte *end = d + MAX_URL_SIZE - 10;
 115   unsigned int c;
 116
 117   while (c = *s)
 118     {
 119       if (d >= end)
 120         return URL_ERR_TOO_LONG;
 121       if (Calnum(c) ||                                                  /* RFC 1738(2.2): Only alphanumerics ... */
 122           c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||   /* ... and several other exceptions ... */
 123           c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
 124           c == ',' ||
 125           c == '/' || c == '?' || c == ':' || c == '@' ||               /* ... and reserved chars used for reserved purpose */
 126           c == '=' || c == '&' || c == '#' || c == ';')
 127         *d++ = *s++;
 128       else
 129         {
 130           uns val = (*s < NCC_MAX) ? NCC_CHARS[*s] : *s;
 131           *d++ = '%';
 132           *d++ = enhex(val >> 4);
 133           *d++ = enhex(val & 0x0f);
 134           s++;
 135         }
 136     }
 137   *d = 0;
 138   return 0;
 139 }
 140
 141 /* Split an URL (several parts may be copied to the destination buffer) */
 142
 143 byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
 144 static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
 145
 146 uns
 147 identify_protocol(byte *p)
 148 {
 149   uns i;
 150
 151   for(i=1; i<URL_PROTO_MAX; i++)
 152     if (!strcasecmp(p, url_proto_names[i]))
 153       return i;
 154   return URL_PROTO_UNKNOWN;
 155 }
 156
 157 int
 158 url_split(byte *s, struct url *u, byte *d)
 159 {
 160   bzero(u, sizeof(struct url));
 161   u->port = ~0;
 162   u->bufend = d + MAX_URL_SIZE - 10;
 163
 164   if (s[0] != '/')                      /* Seek for "protocol:" */
 165     {
 166       byte *p = s;
 167       while (*p && Calnum(*p))
 168         p++;
 169       if (p != s && *p == ':')
 170         {
 171           u->protocol = d;
 172           while (s < p)
 173             *d++ = *s++;
 174           *d++ = 0;
 175           u->protoid = identify_protocol(u->protocol);
 176           s++;
 177           if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
 178             {
 179               /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
 180               int len = d - u->protocol;
 181               d -= len;
 182               s -= len;
 183               u->protocol = NULL;
 184               u->protoid = 0;
 185             }
 186         }
 187     }
 188
 189   if (s[0] == '/')                      /* Host spec or absolute path */
 190     {
 191       if (s[1] == '/')                  /* Host spec */
 192         {
 193           byte *q, *w, *e;
 194           char *ep;
 195
 196           s += 2;
 197           q = d;
 198           while (*s && *s != '/')       /* Copy user:passwd@host:port */
 199             *d++ = *s++;
 200           *d++ = 0;
 201           w = strchr(q, '@');
 202           if (w)                        /* user:passwd present */
 203             {
 204               *w++ = 0;
 205               u->user = q;
 206             }
 207           else
 208             w = q;
 209           e = strchr(w, ':');
 210           if (e)                        /* host:port present */
 211             {
 212               uns p;
 213               *e++ = 0;
 214               p = strtoul(e, &ep, 10);
 215               if (ep && *ep || p > 65535)
 216                 return URL_ERR_INVALID_PORT;
 217               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 218                 u->port = p;
 219             }
 220           u->host = w;
 221         }
 222     }
 223
 224   u->rest = s;
 225   u->buf = d;
 226   return 0;
 227 }
 228
 229 /* Normalization according to given base URL */
 230
 231 static uns std_ports[] = URL_DEFPORTS;  /* Default port numbers */
 232
 233 static int
 234 relpath_merge(struct url *u, struct url *b)
 235 {
 236   byte *a = u->rest;
 237   byte *o = b->rest;
 238   byte *d = u->buf;
 239   byte *e = u->bufend;
 240   byte *p;
 241
 242   if (a[0] == '/')                      /* Absolute path => OK */
 243     return 0;
 244   if (o[0] != '/')
 245     return URL_PATH_UNDERFLOW;
 246
 247   if (!a[0])                            /* Empty URL -> inherit everything */
 248     {
 249       u->rest = b->rest;
 250       return 0;
 251     }
 252
 253   u->rest = d;                          /* We know we'll need to copy the path somewhere else */
 254
 255   if (a[0] == '#')                      /* Another fragment */
 256     {
 257       for(p=o; *p && *p != '#'; p++)
 258         ;
 259       goto copy;
 260     }
 261   if (a[0] == '?')                      /* New query */
 262     {
 263       for(p=o; *p && *p != '#' && *p != '?'; p++)
 264         ;
 265       goto copy;
 266     }
 267   if (a[0] == ';')                      /* Change parameters */
 268     {
 269       for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++)
 270         ;
 271       goto copy;
 272     }
 273
 274   p = NULL;                             /* Copy original path and find the last slash */
 275   while (*o && *o != ';' && *o != '?' && *o != '#')
 276     {
 277       if (d >= e)
 278         return URL_ERR_TOO_LONG;
 279       if ((*d++ = *o++) == '/')
 280         p = d;
 281     }
 282   if (!p)
 283     return URL_ERR_REL_NOTHING;
 284   d = p;
 285
 286   while (*a)
 287     {
 288       if (a[0] == '.')
 289         {
 290           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 291             {
 292               a++;
 293               if (a[0])
 294                 a++;
 295               continue;
 296             }
 297           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 298             {
 299               a += 2;
 300               if (a[0])
 301                 a++;
 302               if (d <= u->buf + 1)
 303                 {
 304                   /*
 305                    * RFC 1808 says we should leave ".." as a path segment, but
 306                    * we intentionally break the rule and refuse the URL.
 307                    */
 308                   if (!url_ignore_underflow)
 309                     return URL_PATH_UNDERFLOW;
 310                 }
 311               else
 312                 {
 313                   d--;                  /* Discard trailing slash */
 314                   while (d[-1] != '/')
 315                     d--;
 316                 }
 317               continue;
 318             }
 319         }
 320       while (a[0] && a[0] != '/')
 321         {
 322           if (d >= e)
 323             return URL_ERR_TOO_LONG;
 324           *d++ = *a++;
 325         }
 326       if (a[0])
 327         *d++ = *a++;
 328     }
 329
 330 okay:
 331   *d++ = 0;
 332   u->buf = d;
 333   return 0;
 334
 335 copy:                                   /* Combine part of old URL with the new one */
 336   while (o < p)
 337     if (d < e)
 338       *d++ = *o++;
 339     else
 340       return URL_ERR_TOO_LONG;
 341   while (*a)
 342     if (d < e)
 343       *d++ = *a++;
 344     else
 345       return URL_ERR_TOO_LONG;
 346   goto okay;
 347 }
 348
 349 int
 350 url_normalize(struct url *u, struct url *b)
 351 {
 352   int err;
 353
 354   /* Basic checks */
 355   if (url_proto_path_flags[u->protoid] && !u->host ||
 356       u->host && !*u->host ||
 357       !u->host && u->user ||
 358       !u->rest)
 359     return URL_SYNTAX_ERROR;
 360
 361   if (!u->protocol)
 362     {
 363       /* Now we know it's a relative URL. Do we have any base? */
 364       if (!b || !url_proto_path_flags[b->protoid])
 365         return URL_ERR_REL_NOTHING;
 366       u->protocol = b->protocol;
 367       u->protoid = b->protoid;
 368
 369       /* Reference to the same host */
 370       if (!u->host)
 371         {
 372           u->host = b->host;
 373           u->user = b->user;
 374           u->port = b->port;
 375           if (err = relpath_merge(u, b))
 376             return err;
 377         }
 378     }
 379
 380   /* Fill in missing info */
 381   if (u->port == ~0U)
 382     u->port = std_ports[u->protoid];
 383
 384   return 0;
 385 }
 386
 387 /* Name canonicalization */
 388
 389 static void
 390 lowercase(byte *b)
 391 {
 392   if (b)
 393     while (*b)
 394       {
 395         if (*b >= 'A' && *b <= 'Z')
 396           *b = *b + 0x20;
 397         b++;
 398       }
 399 }
 400
 401 static void
 402 kill_end_dot(byte *b)
 403 {
 404   byte *k;
 405
 406   if (b)
 407     {
 408       k = b + strlen(b) - 1;
 409       if (k > b && *k == '.')
 410         *k = 0;
 411     }
 412 }
 413
 414 int
 415 url_canonicalize(struct url *u)
 416 {
 417   char *c;
 418
 419   lowercase(u->protocol);
 420   lowercase(u->host);
 421   kill_end_dot(u->host);
 422   if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
 423     u->rest = "/";
 424   if (u->rest && (c = strchr(u->rest, '#')))    /* Kill fragment reference */
 425     *c = 0;
 426   return 0;
 427 }
 428
 429 /* Pack a broken-down URL */
 430
 431 static byte *
 432 append(byte *d, byte *s, byte *e)
 433 {
 434   if (d)
 435     while (*s)
 436       {
 437         if (d >= e)
 438           return NULL;
 439         *d++ = *s++;
 440       }
 441   return d;
 442 }
 443
 444 int
 445 url_pack(struct url *u, byte *d)
 446 {
 447   byte *e = d + MAX_URL_SIZE - 10;
 448
 449   if (u->protocol)
 450     {
 451       d = append(d, u->protocol, e);
 452       d = append(d, ":", e);
 453       u->protoid = identify_protocol(u->protocol);
 454     }
 455   if (u->host)
 456     {
 457       d = append(d, "//", e);
 458       if (u->user)
 459         {
 460           d = append(d, u->user, e);
 461           d = append(d, "@", e);
 462         }
 463       d = append(d, u->host, e);
 464       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 465         {
 466           char z[10];
 467           sprintf(z, "%d", u->port);
 468           d = append(d, ":", e);
 469           d = append(d, z, e);
 470         }
 471     }
 472   if (u->rest)
 473     d = append(d, u->rest, e);
 474   if (!d)
 475     return URL_ERR_TOO_LONG;
 476   *d = 0;
 477   return 0;
 478 }
 479
 480 /* Error messages */
 481
 482 static char *errmsg[] = {
 483   "Something is wrong",
 484   "Too long",
 485   "Invalid character",
 486   "Invalid escape",
 487   "Invalid escaped character",
 488   "Invalid port number",
 489   "Relative URL not allowed",
 490   "Unknown protocol",
 491   "Syntax error",
 492   "Path underflow"
 493 };
 494
 495 char *
 496 url_error(uns err)
 497 {
 498   if (err >= sizeof(errmsg) / sizeof(char *))
 499     err = 0;
 500   return errmsg[err];
 501 }
 502
 503 /* Standard cookbook recipes */
 504
 505 int
 506 url_canon_split(byte *u, byte *buf1, byte *buf2, struct url *url)
 507 {
 508   int err;
 509
 510   if (err = url_deescape(u, buf1))
 511     return err;
 512   if (err = url_split(buf1, url, buf2))
 513     return err;
 514   if (err = url_normalize(url, NULL))
 515     return err;
 516   return url_canonicalize(url);
 517 }
 518
 519 int
 520 url_auto_canonicalize(byte *src, byte *dst)
 521 {
 522   byte buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
 523   int err;
 524   struct url ur;
 525
 526   (void)((err = url_canon_split(src, buf1, buf2, &ur)) ||
 527    (err = url_pack(&ur, buf3)) ||
 528    (err = url_enescape(buf3, dst)));
 529   return err;
 530 }
 531
 532 /* Testing */
 533
 534 #ifdef TEST
 535
 536 int main(int argc, char **argv)
 537 {
 538   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
 539   int err;
 540   struct url url, url0;
 541
 542   if (argc != 2)
 543     return 1;
 544   if (err = url_deescape(argv[1], buf1))
 545     {
 546       printf("deesc: error %d\n", err);
 547       return 1;
 548     }
 549   printf("deesc: %s\n", buf1);
 550   if (err = url_split(buf1, &url, buf2))
 551     {
 552       printf("split: error %d\n", err);
 553       return 1;
 554     }
 555   printf("split: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 556   if (err = url_split("http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment", &url0, buf3))
 557     {
 558       printf("split base: error %d\n", err);
 559       return 1;
 560     }
 561   if (err = url_normalize(&url0, NULL))
 562     {
 563       printf("normalize base: error %d\n", err);
 564       return 1;
 565     }
 566   printf("base: @%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.host, url0.port, url0.rest);
 567   if (err = url_normalize(&url, &url0))
 568     {
 569       printf("normalize: error %d\n", err);
 570       return 1;
 571     }
 572   printf("normalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 573   if (err = url_canonicalize(&url))
 574     {
 575       printf("canonicalize: error %d\n", err);
 576       return 1;
 577     }
 578   printf("canonicalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 579   if (err = url_pack(&url, buf4))
 580     {
 581       printf("pack: error %d\n", err);
 582       return 1;
 583     }
 584   printf("pack: %s\n", buf4);
 585   if (err = url_enescape(buf4, buf2))
 586     {
 587       printf("enesc: error %d\n", err);
 588       return 1;
 589     }
 590   printf("enesc: %s\n", buf2);
 591   return 0;
 592 }
 593
 594 #endif
 595
 596 struct component {
 597         byte *start;
 598         int length;
 599         u32 hash;
 600 };
 601
 602 static inline u32
 603 hashf(byte *start, int length)
 604 {
 605         u32 hf = length;
 606         while (length-- > 0)
 607                 hf = (hf << 8 | hf >> 24) ^ *start++;
 608         return hf;
 609 }
 610
 611 static inline uns
 612 repeat_count(struct component *comp, uns count, uns len)
 613 {
 614         struct component *orig_comp = comp;
 615         uns found = 0;
 616         while (1)
 617         {
 618                 uns i;
 619                 comp += len;
 620                 count -= len;
 621                 found++;
 622                 if (count < len)
 623                         return found;
 624                 for (i=0; i<len; i++)
 625                         if (comp[i].hash != orig_comp[i].hash
 626                         || comp[i].length != orig_comp[i].length
 627                         || memcmp(comp[i].start, orig_comp[i].start, comp[i].length))
 628                                 return found;
 629         }
 630 }
 631
 632 int
 633 url_has_repeated_component(byte *url)
 634 {
 635         struct component *comp;
 636         uns comps, comp_len, rep_prefix;
 637         byte *c;
 638         uns i;
 639
 640         for (comps=0, c=url; c; comps++)
 641         {
 642                 c = strpbrk(c, url_component_separators);
 643                 if (c)
 644                         c++;
 645         }
 646         if (comps < url_min_repeat_count)
 647                 return 0;
 648         comp = alloca(comps * sizeof(struct component));
 649         for (i=0, c=url; c; i++)
 650         {
 651                 comp[i].start = c;
 652                 c = strpbrk(c, url_component_separators);
 653                 if (c)
 654                 {
 655                         comp[i].length = c - comp[i].start;
 656                         c++;
 657                 }
 658                 else
 659                         comp[i].length = strlen(comp[i].start);
 660         }
 661         ASSERT(i == comps);
 662         for (i=0; i<comps; i++)
 663                 comp[i].hash = hashf(comp[i].start, comp[i].length);
 664         for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
 665                 for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
 666                         if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)
 667                                 return comp_len;
 668         return 0;
 669 }