ucw/url.c

   1 /*
   2  *      UCW Library -- URL Functions
   3  *
   4  *      (c) 1997--2004 Martin Mares <mj@ucw.cz>
   5  *      (c) 2001--2005 Robert Spalek <robert@ucw.cz>
   6  *
   7  *      This software may be freely distributed and used according to the terms
   8  *      of the GNU Lesser General Public License.
   9  *
  10  *      XXX: The buffer handling in this module is really horrible, but it works.
  11  */
  12
  13 #include "ucw/lib.h"
  14 #include "ucw/url.h"
  15 #include "ucw/chartype.h"
  16 #include "ucw/conf.h"
  17 #include "ucw/prime.h"
  18
  19 #include <string.h>
  20 #include <stdlib.h>
  21 #include <stdio.h>
  22 #include <alloca.h>
  23
  24 /* Configuration */
  25
  26 static uns url_ignore_spaces;
  27 static uns url_ignore_underflow;
  28 static char *url_component_separators = "";
  29 static uns url_min_repeat_count = 0x7fffffff;
  30 static uns url_max_repeat_length = 0;
  31 static uns url_max_occurences = ~0U;
  32
  33 #ifndef TEST
  34 static struct cf_section url_config = {
  35   CF_ITEMS {
  36     CF_UNS("IgnoreSpaces", &url_ignore_spaces),
  37     CF_UNS("IgnoreUnderflow", &url_ignore_underflow),
  38     CF_STRING("ComponentSeparators", &url_component_separators),
  39     CF_UNS("MinRepeatCount", &url_min_repeat_count),
  40     CF_UNS("MaxRepeatLength", &url_max_repeat_length),
  41     CF_UNS("MaxOccurences", &url_max_occurences),
  42     CF_END
  43   }
  44 };
  45
  46 static void CONSTRUCTOR url_init_config(void)
  47 {
  48   cf_declare_section("URL", &url_config, 0);
  49 }
  50 #endif
  51
  52 /* Escaping and de-escaping */
  53
  54 static uns
  55 enhex(uns x)
  56 {
  57   return (x<10) ? (x + '0') : (x - 10 + 'A');
  58 }
  59
  60 int
  61 url_deescape(const char *s, char *d)
  62 {
  63   char *dstart = d;
  64   char *end = d + MAX_URL_SIZE - 10;
  65   while (*s)
  66     {
  67       if (d >= end)
  68         return URL_ERR_TOO_LONG;
  69       if (*s == '%')
  70         {
  71           unsigned int val;
  72           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  73             return URL_ERR_INVALID_ESCAPE;
  74           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  75           if (val < 0x20)
  76             return URL_ERR_INVALID_ESCAPED_CHAR;
  77           switch (val)
  78             {
  79             case ';':
  80               val = NCC_SEMICOLON; break;
  81             case '/':
  82               val = NCC_SLASH; break;
  83             case '?':
  84               val = NCC_QUEST; break;
  85             case ':':
  86               val = NCC_COLON; break;
  87             case '@':
  88               val = NCC_AT; break;
  89             case '=':
  90               val = NCC_EQUAL; break;
  91             case '&':
  92               val = NCC_AND; break;
  93             case '#':
  94               val = NCC_HASH; break;
  95 #ifndef CONFIG_URL_ESCAPE_COMPAT
  96             case '$':
  97               val = NCC_DOLLAR; break;
  98             case '+':
  99               val = NCC_PLUS; break;
 100             case ',':
 101               val = NCC_COMMA; break;
 102 #endif
 103             }
 104           *d++ = val;
 105           s += 3;
 106         }
 107       else if ((byte) *s > 0x20)
 108         *d++ = *s++;
 109       else if (Cspace(*s))
 110         {
 111           const char *s0 = s;
 112           while (Cspace(*s))
 113             s++;
 114           if (!url_ignore_spaces || !(!*s || d == dstart))
 115             {
 116               while (Cspace(*s0))
 117                 {
 118                   if (d >= end)
 119                     return URL_ERR_TOO_LONG;
 120                   *d++ = *s0++;
 121                 }
 122             }
 123         }
 124       else
 125         return URL_ERR_INVALID_CHAR;
 126     }
 127   *d = 0;
 128   return 0;
 129 }
 130
 131 int
 132 url_enescape(const char *s, char *d)
 133 {
 134   char *end = d + MAX_URL_SIZE - 10;
 135   unsigned int c;
 136
 137   while (c = *s)
 138     {
 139       if (d >= end)
 140         return URL_ERR_TOO_LONG;
 141       if (Calnum(c) ||                                                  /* RFC 2396 (2.1-2.3): Only alphanumerics ... */
 142           c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||  /* ... and some exceptions and reserved chars */
 143           c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||
 144           c == ',' || c == '=' || c == '&' || c == '#' || c == ';' ||
 145           c == '/' || c == '?' || c == ':' || c == '@'
 146 #ifndef CONFIG_URL_ESCAPE_COMPAT
 147           || c == '~'
 148 #endif
 149         )
 150         *d++ = *s++;
 151       else
 152         {
 153           uns val = (byte)(((byte)*s < NCC_MAX) ? NCC_CHARS[(byte)*s] : *s);
 154           *d++ = '%';
 155           *d++ = enhex(val >> 4);
 156           *d++ = enhex(val & 0x0f);
 157           s++;
 158         }
 159     }
 160   *d = 0;
 161   return 0;
 162 }
 163
 164 int
 165 url_enescape_friendly(const char *src, char *dest)
 166 {
 167   char *end = dest + MAX_URL_SIZE - 10;
 168   const byte *srcb = src;
 169   while (*srcb)
 170     {
 171       if (dest >= end)
 172         return URL_ERR_TOO_LONG;
 173       if ((byte)*srcb < NCC_MAX)
 174         *dest++ = NCC_CHARS[*srcb++];
 175       else if (*srcb >= 0x20 && *srcb < 0x7f)
 176         *dest++ = *srcb++;
 177       else
 178         {
 179           *dest++ = '%';
 180           *dest++ = enhex((byte)*srcb >> 4);
 181           *dest++ = enhex(*srcb++ & 0x0f);
 182         }
 183     }
 184   *dest = 0;
 185   return 0;
 186 }
 187
 188 /* Split an URL (several parts may be copied to the destination buffer) */
 189
 190 char *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
 191 static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
 192
 193 uns
 194 url_identify_protocol(const char *p)
 195 {
 196   uns i;
 197
 198   for(i=1; i<URL_PROTO_MAX; i++)
 199     if (!strcasecmp(p, url_proto_names[i]))
 200       return i;
 201   return URL_PROTO_UNKNOWN;
 202 }
 203
 204 int
 205 url_split(char *s, struct url *u, char *d)
 206 {
 207   bzero(u, sizeof(struct url));
 208   u->port = ~0;
 209   u->bufend = d + MAX_URL_SIZE - 10;
 210
 211   if (s[0] != '/')                      /* Seek for "protocol:" */
 212     {
 213       char *p = s;
 214       while (*p && Calnum(*p))
 215         p++;
 216       if (p != s && *p == ':')
 217         {
 218           u->protocol = d;
 219           while (s < p)
 220             *d++ = *s++;
 221           *d++ = 0;
 222           u->protoid = url_identify_protocol(u->protocol);
 223           s++;
 224           if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
 225             {
 226               /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
 227               int len = d - u->protocol;
 228               d -= len;
 229               s -= len;
 230               u->protocol = NULL;
 231               u->protoid = 0;
 232             }
 233         }
 234     }
 235
 236   if (s[0] == '/')                      /* Host spec or absolute path */
 237     {
 238       if (s[1] == '/')                  /* Host spec */
 239         {
 240           char *q, *e;
 241           char *at = NULL;
 242           char *ep;
 243
 244           s += 2;
 245           q = d;
 246           while (*s && *s != '/' && *s != '?')  /* Copy user:passwd@host:port */
 247             {
 248               if (*s != '@')
 249                 *d++ = *s;
 250               else if (!at)
 251                 {
 252                   *d++ = 0;
 253                   at = d;
 254                 }
 255               else                      /* This shouldn't happen with sane URL's, but we need to be sure */
 256                 *d++ = NCC_AT;
 257               s++;
 258             }
 259           *d++ = 0;
 260           if (at)                       /* user:passwd present */
 261             {
 262               u->user = q;
 263               if (e = strchr(q, ':'))
 264                 {
 265                   *e++ = 0;
 266                   u->pass = e;
 267                 }
 268             }
 269           else
 270             at = q;
 271           e = strchr(at, ':');
 272           if (e)                        /* host:port present */
 273             {
 274               uns p;
 275               *e++ = 0;
 276               p = strtoul(e, &ep, 10);
 277               if (ep && *ep || p > 65535)
 278                 return URL_ERR_INVALID_PORT;
 279               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 280                 u->port = p;
 281             }
 282           u->host = at;
 283         }
 284     }
 285
 286   u->rest = s;
 287   u->buf = d;
 288   return 0;
 289 }
 290
 291 /* Normalization according to given base URL */
 292
 293 static uns std_ports[] = URL_DEFPORTS;  /* Default port numbers */
 294
 295 static int
 296 relpath_merge(struct url *u, struct url *b)
 297 {
 298   char *a = u->rest;
 299   char *o = b->rest;
 300   char *d = u->buf;
 301   char *e = u->bufend;
 302   char *p;
 303
 304   if (a[0] == '/')                      /* Absolute path => OK */
 305     return 0;
 306   if (o[0] != '/' && o[0] != '?')
 307     return URL_PATH_UNDERFLOW;
 308
 309   if (!a[0])                            /* Empty URL -> inherit everything */
 310     {
 311       u->rest = b->rest;
 312       return 0;
 313     }
 314
 315   u->rest = d;                          /* We know we'll need to copy the path somewhere else */
 316
 317   if (a[0] == '#')                      /* Another fragment */
 318     {
 319       for(p=o; *p && *p != '#'; p++)
 320         ;
 321       goto copy;
 322     }
 323   if (a[0] == '?')                      /* New query */
 324     {
 325       for(p=o; *p && *p != '#' && *p != '?'; p++)
 326         ;
 327       goto copy;
 328     }
 329
 330   p = NULL;                             /* Copy original path and find the last slash */
 331   while (*o && *o != '?' && *o != '#')
 332     {
 333       if (d >= e)
 334         return URL_ERR_TOO_LONG;
 335       if ((*d++ = *o++) == '/')
 336         p = d;
 337     }
 338   if (!p)
 339     return URL_ERR_REL_NOTHING;
 340   d = p;
 341
 342   while (*a)
 343     {
 344       if (a[0] == '.')
 345         {
 346           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 347             {
 348               a++;
 349               if (a[0])
 350                 a++;
 351               continue;
 352             }
 353           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 354             {
 355               a += 2;
 356               if (a[0])
 357                 a++;
 358               if (d <= u->buf + 1)
 359                 {
 360                   /*
 361                    * RFC 1808 says we should leave ".." as a path segment, but
 362                    * we intentionally break the rule and refuse the URL.
 363                    */
 364                   if (!url_ignore_underflow)
 365                     return URL_PATH_UNDERFLOW;
 366                 }
 367               else
 368                 {
 369                   d--;                  /* Discard trailing slash */
 370                   while (d[-1] != '/')
 371                     d--;
 372                 }
 373               continue;
 374             }
 375         }
 376       while (a[0] && a[0] != '/')
 377         {
 378           if (d >= e)
 379             return URL_ERR_TOO_LONG;
 380           *d++ = *a++;
 381         }
 382       if (a[0])
 383         *d++ = *a++;
 384     }
 385
 386 okay:
 387   *d++ = 0;
 388   u->buf = d;
 389   return 0;
 390
 391 copy:                                   /* Combine part of old URL with the new one */
 392   while (o < p)
 393     if (d < e)
 394       *d++ = *o++;
 395     else
 396       return URL_ERR_TOO_LONG;
 397   while (*a)
 398     if (d < e)
 399       *d++ = *a++;
 400     else
 401       return URL_ERR_TOO_LONG;
 402   goto okay;
 403 }
 404
 405 int
 406 url_normalize(struct url *u, struct url *b)
 407 {
 408   int err;
 409
 410   /* Basic checks */
 411   if (url_proto_path_flags[u->protoid] && (!u->host || !*u->host) ||
 412       !u->host && u->user ||
 413       !u->user && u->pass ||
 414       !u->rest)
 415     return URL_SYNTAX_ERROR;
 416
 417   if (!u->protocol)
 418     {
 419       /* Now we know it's a relative URL. Do we have any base? */
 420       if (!b || !url_proto_path_flags[b->protoid])
 421         return URL_ERR_REL_NOTHING;
 422       u->protocol = b->protocol;
 423       u->protoid = b->protoid;
 424
 425       /* Reference to the same host */
 426       if (!u->host)
 427         {
 428           u->host = b->host;
 429           u->user = b->user;
 430           u->pass = b->pass;
 431           u->port = b->port;
 432           if (err = relpath_merge(u, b))
 433             return err;
 434         }
 435     }
 436
 437   /* Change path "?" to "/?" because it's the true meaning */
 438   if (u->rest[0] == '?')
 439     {
 440       int l = strlen(u->rest);
 441       if (u->bufend - u->buf < l+1)
 442         return URL_ERR_TOO_LONG;
 443       u->buf[0] = '/';
 444       memcpy(u->buf+1, u->rest, l+1);
 445       u->rest = u->buf;
 446       u->buf += l+2;
 447     }
 448
 449   /* Fill in missing info */
 450   if (u->port == ~0U)
 451     u->port = std_ports[u->protoid];
 452
 453   return 0;
 454 }
 455
 456 /* Name canonicalization */
 457
 458 static void
 459 lowercase(char *b)
 460 {
 461   if (b)
 462     while (*b)
 463       {
 464         if (*b >= 'A' && *b <= 'Z')
 465           *b = *b + 0x20;
 466         b++;
 467       }
 468 }
 469
 470 static void
 471 kill_end_dot(char *b)
 472 {
 473   char *k;
 474
 475   if (b)
 476     {
 477       k = b + strlen(b) - 1;
 478       while (k > b && *k == '.')
 479         *k-- = 0;
 480     }
 481 }
 482
 483 int
 484 url_canonicalize(struct url *u)
 485 {
 486   char *c;
 487
 488   lowercase(u->protocol);
 489   lowercase(u->host);
 490   kill_end_dot(u->host);
 491   if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
 492     u->rest = "/";
 493   if (u->rest && (c = strchr(u->rest, '#')))    /* Kill fragment reference */
 494     *c = 0;
 495   return 0;
 496 }
 497
 498 /* Pack a broken-down URL */
 499
 500 static char *
 501 append(char *d, const char *s, char *e)
 502 {
 503   if (d)
 504     while (*s)
 505       {
 506         if (d >= e)
 507           return NULL;
 508         *d++ = *s++;
 509       }
 510   return d;
 511 }
 512
 513 int
 514 url_pack(struct url *u, char *d)
 515 {
 516   char *e = d + MAX_URL_SIZE - 10;
 517
 518   if (u->protocol)
 519     {
 520       d = append(d, u->protocol, e);
 521       d = append(d, ":", e);
 522       u->protoid = url_identify_protocol(u->protocol);
 523     }
 524   if (u->host)
 525     {
 526       d = append(d, "//", e);
 527       if (u->user)
 528         {
 529           d = append(d, u->user, e);
 530           if (u->pass)
 531             {
 532               d = append(d, ":", e);
 533               d = append(d, u->pass, e);
 534             }
 535           d = append(d, "@", e);
 536         }
 537       d = append(d, u->host, e);
 538       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 539         {
 540           char z[10];
 541           sprintf(z, "%d", u->port);
 542           d = append(d, ":", e);
 543           d = append(d, z, e);
 544         }
 545     }
 546   if (u->rest)
 547     d = append(d, u->rest, e);
 548   if (!d)
 549     return URL_ERR_TOO_LONG;
 550   *d = 0;
 551   return 0;
 552 }
 553
 554 /* Error messages */
 555
 556 static char *errmsg[] = {
 557   "Something is wrong",
 558   "Too long",
 559   "Invalid character",
 560   "Invalid escape",
 561   "Invalid escaped character",
 562   "Invalid port number",
 563   "Relative URL not allowed",
 564   "Unknown protocol",
 565   "Syntax error",
 566   "Path underflow"
 567 };
 568
 569 char *
 570 url_error(uns err)
 571 {
 572   if (err >= sizeof(errmsg) / sizeof(char *))
 573     err = 0;
 574   return errmsg[err];
 575 }
 576
 577 /* Standard cookbook recipes */
 578
 579 int
 580 url_canon_split_rel(const char *u, char *buf1, char *buf2, struct url *url, struct url *base)
 581 {
 582   int err;
 583
 584   if (err = url_deescape(u, buf1))
 585     return err;
 586   if (err = url_split(buf1, url, buf2))
 587     return err;
 588   if (err = url_normalize(url, base))
 589     return err;
 590   return url_canonicalize(url);
 591 }
 592
 593 int
 594 url_auto_canonicalize_rel(const char *src, char *dst, struct url *base)
 595 {
 596   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
 597   int err;
 598   struct url ur;
 599
 600   (void)((err = url_canon_split_rel(src, buf1, buf2, &ur, base)) ||
 601    (err = url_pack(&ur, buf3)) ||
 602    (err = url_enescape(buf3, dst)));
 603   return err;
 604 }
 605
 606 /* Testing */
 607
 608 #ifdef TEST
 609
 610 int main(int argc, char **argv)
 611 {
 612   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
 613   int err;
 614   struct url url, url0;
 615   char *base = "http://mj@www.hell.org/123/sub_dir;param/index.html;param?query&zzz/sub;query+#fragment?";
 616
 617   if (argc != 2 && argc != 3)
 618     return 1;
 619   if (argc == 3)
 620     base = argv[2];
 621   if (err = url_deescape(argv[1], buf1))
 622     {
 623       printf("deesc: error %d\n", err);
 624       return 1;
 625     }
 626   printf("deesc: %s\n", buf1);
 627   if (err = url_split(buf1, &url, buf2))
 628     {
 629       printf("split: error %d\n", err);
 630       return 1;
 631     }
 632   printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 633   if (err = url_split(base, &url0, buf3))
 634     {
 635       printf("split base: error %d\n", err);
 636       return 1;
 637     }
 638   if (err = url_normalize(&url0, NULL))
 639     {
 640       printf("normalize base: error %d\n", err);
 641       return 1;
 642     }
 643   printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest);
 644   if (err = url_normalize(&url, &url0))
 645     {
 646       printf("normalize: error %d\n", err);
 647       return 1;
 648     }
 649   printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 650   if (err = url_canonicalize(&url))
 651     {
 652       printf("canonicalize: error %d\n", err);
 653       return 1;
 654     }
 655   printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
 656   if (err = url_pack(&url, buf4))
 657     {
 658       printf("pack: error %d\n", err);
 659       return 1;
 660     }
 661   printf("pack: %s\n", buf4);
 662   if (err = url_enescape(buf4, buf2))
 663     {
 664       printf("enesc: error %d\n", err);
 665       return 1;
 666     }
 667   printf("enesc: %s\n", buf2);
 668   return 0;
 669 }
 670
 671 #endif
 672
 673 struct component {
 674         const char *start;
 675         int length;
 676         uns count;
 677         u32 hash;
 678 };
 679
 680 static inline u32
 681 hashf(const char *start, int length)
 682 {
 683         u32 hf = length;
 684         while (length-- > 0)
 685                 hf = (hf << 8 | hf >> 24) ^ *start++;
 686         return hf;
 687 }
 688
 689 static inline uns
 690 repeat_count(struct component *comp, uns count, uns len)
 691 {
 692         struct component *orig_comp = comp;
 693         uns found = 0;
 694         while (1)
 695         {
 696                 uns i;
 697                 comp += len;
 698                 count -= len;
 699                 found++;
 700                 if (count < len)
 701                         return found;
 702                 for (i=0; i<len; i++)
 703                         if (comp[i].hash != orig_comp[i].hash
 704                         || comp[i].length != orig_comp[i].length
 705                         || memcmp(comp[i].start, orig_comp[i].start, comp[i].length))
 706                                 return found;
 707         }
 708 }
 709
 710 int
 711 url_has_repeated_component(const char *url)
 712 {
 713         struct component *comp;
 714         uns comps, comp_len, rep_prefix, hash_size, *hash, *next;
 715         const char *c;
 716         uns i, j, k;
 717
 718         for (comps=0, c=url; c; comps++)
 719         {
 720                 c = strpbrk(c, url_component_separators);
 721                 if (c)
 722                         c++;
 723         }
 724         if (comps < url_min_repeat_count && comps <= url_max_occurences)
 725                 return 0;
 726         comp = alloca(comps * sizeof(*comp));
 727         for (i=0, c=url; c; i++)
 728         {
 729                 comp[i].start = c;
 730                 c = strpbrk(c, url_component_separators);
 731                 if (c)
 732                 {
 733                         comp[i].length = c - comp[i].start;
 734                         c++;
 735                 }
 736                 else
 737                         comp[i].length = strlen(comp[i].start);
 738         }
 739         ASSERT(i == comps);
 740         for (i=0; i<comps; i++)
 741                 comp[i].hash = hashf(comp[i].start, comp[i].length);
 742         if (comps > url_max_occurences)
 743         {
 744                 hash_size = next_table_prime(comps);
 745                 hash = alloca(hash_size * sizeof(*hash));
 746                 next = alloca(comps * sizeof(*next));
 747                 memset(hash, 255, hash_size * sizeof(*hash));
 748                 for (i=0; i<comps; i++)
 749                 {
 750                         j = comp[i].hash % hash_size;
 751                         for (k = hash[j]; ~k && (comp[i].hash != comp[k].hash || comp[i].length != comp[k].length ||
 752                             memcmp(comp[k].start, comp[i].start, comp[i].length)); k = next[k]);
 753                         if (!~k)
 754                         {
 755                                 next[i] = hash[j];
 756                                 hash[j] = i;
 757                                 comp[i].count = 1;
 758                         }
 759                         else
 760                         {
 761                                 if (comp[k].count++ >= url_max_occurences)
 762                                         return 1;
 763                         }
 764                 }
 765         }
 766         for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
 767                 for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
 768                         if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)
 769                                 return comp_len;
 770         return 0;
 771 }