lib/url.c

   1 /*
   2  *      Sherlock Library -- URL Functions (according to RFC 1738 and 1808)
   3  *
   4  *      (c) 1997--2001 Martin Mares <mj@ucw.cz>
   5  */
   6
   7 #include "lib/lib.h"
   8 #include "lib/url.h"
   9 #include "lib/chartype.h"
  10 #include "lib/conf.h"
  11
  12 #include <string.h>
  13 #include <stdlib.h>
  14 #include <stdio.h>
  15
  16 /* Configuration */
  17
  18 static uns url_ignore_spaces;
  19 static uns url_ignore_underflow;
  20
  21 static struct cfitem url_config[] = {
  22   { "URL",              CT_SECTION,     NULL },
  23   { "IgnoreSpaces",     CT_INT,         &url_ignore_spaces },
  24   { "IgnoreUnderflow",  CT_INT,         &url_ignore_underflow },
  25   { NULL,               CT_STOP,        NULL }
  26 };
  27
  28 static void CONSTRUCTOR url_init_config(void)
  29 {
  30   cf_register(url_config);
  31 }
  32
  33 /* Escaping and de-escaping */
  34
  35 static uns
  36 enhex(uns x)
  37 {
  38   return (x<10) ? (x + '0') : (x - 10 + 'A');
  39 }
  40
  41 int
  42 url_deescape(byte *s, byte *d)
  43 {
  44   byte *dstart = d;
  45   byte *end = d + MAX_URL_SIZE - 10;
  46   while (*s)
  47     {
  48       if (d >= end)
  49         return URL_ERR_TOO_LONG;
  50       if (*s == '%')
  51         {
  52           unsigned int val;
  53           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  54             return URL_ERR_INVALID_ESCAPE;
  55           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  56           if (val < 0x20)
  57             return URL_ERR_INVALID_ESCAPED_CHAR;
  58           switch (val)
  59             {
  60             case ';':
  61               val = NCC_SEMICOLON; break;
  62             case '/':
  63               val = NCC_SLASH; break;
  64             case '?':
  65               val = NCC_QUEST; break;
  66             case ':':
  67               val = NCC_COLON; break;
  68             case '@':
  69               val = NCC_AT; break;
  70             case '=':
  71               val = NCC_EQUAL; break;
  72             case '&':
  73               val = NCC_AND; break;
  74             case '#':
  75               val = NCC_HASH; break;
  76             }
  77           *d++ = val;
  78           s += 3;
  79         }
  80       else if (*s > 0x20)
  81         *d++ = *s++;
  82       else if (Cspace(*s))
  83         {
  84           byte *s0 = s;
  85           while (Cspace(*s))
  86             s++;
  87           if (!url_ignore_spaces || !(!*s || d == dstart))
  88             {
  89               while (Cspace(*s0))
  90                 {
  91                   if (d >= end)
  92                     return URL_ERR_TOO_LONG;
  93                   *d++ = *s0++;
  94                 }
  95             }
  96         }
  97       else
  98         return URL_ERR_INVALID_CHAR;
  99     }
 100   *d = 0;
 101   return 0;
 102 }
 103
 104 int
 105 url_enescape(byte *s, byte *d)
 106 {
 107   byte *end = d + MAX_URL_SIZE - 10;
 108   unsigned int c;
 109
 110   while (c = *s)
 111     {
 112       if (d >= end)
 113         return URL_ERR_TOO_LONG;
 114       if (Calnum(c) ||                                                  /* RFC 1738(2.2): Only alphanumerics ... */
 115           c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||   /* ... and several other exceptions ... */
 116           c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||
 117           c == ',' ||
 118           c == '/' || c == '?' || c == ':' || c == '@' ||               /* ... and reserved chars used for reserved purpose */
 119           c == '=' || c == '&' || c == '#' || c == ';')
 120         *d++ = *s++;
 121       else
 122         {
 123           uns val = (*s < NCC_MAX) ? NCC_CHARS[*s] : *s;
 124           *d++ = '%';
 125           *d++ = enhex(val >> 4);
 126           *d++ = enhex(val & 0x0f);
 127           s++;
 128         }
 129     }
 130   *d = 0;
 131   return 0;
 132 }
 133
 134 /* Split an URL (several parts may be copied to the destination buffer) */
 135
 136 byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
 137 static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
 138
 139 uns
 140 identify_protocol(byte *p)
 141 {
 142   uns i;
 143
 144   for(i=1; i<URL_PROTO_MAX; i++)
 145     if (!strcasecmp(p, url_proto_names[i]))
 146       return i;
 147   return URL_PROTO_UNKNOWN;
 148 }
 149
 150 int
 151 url_split(byte *s, struct url *u, byte *d)
 152 {
 153   bzero(u, sizeof(struct url));
 154   u->port = ~0;
 155   u->bufend = d + MAX_URL_SIZE - 10;
 156
 157   if (s[0] != '/')                      /* Seek for "protocol:" */
 158     {
 159       byte *p = s;
 160       while (*p && Calnum(*p))
 161         p++;
 162       if (p != s && *p == ':')
 163         {
 164           u->protocol = d;
 165           while (s < p)
 166             *d++ = *s++;
 167           *d++ = 0;
 168           u->protoid = identify_protocol(u->protocol);
 169           s++;
 170           if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
 171             {
 172               /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
 173               int len = d - u->protocol;
 174               d -= len;
 175               s -= len;
 176               u->protocol = NULL;
 177               u->protoid = 0;
 178             }
 179         }
 180     }
 181
 182   if (s[0] == '/')                      /* Host spec or absolute path */
 183     {
 184       if (s[1] == '/')                  /* Host spec */
 185         {
 186           byte *q, *w, *e;
 187           char *ep;
 188
 189           s += 2;
 190           q = d;
 191           while (*s && *s != '/')       /* Copy user:passwd@host:port */
 192             *d++ = *s++;
 193           *d++ = 0;
 194           w = strchr(q, '@');
 195           if (w)                        /* user:passwd present */
 196             {
 197               *w++ = 0;
 198               u->user = q;
 199             }
 200           else
 201             w = q;
 202           e = strchr(w, ':');
 203           if (e)                        /* host:port present */
 204             {
 205               uns p;
 206               *e++ = 0;
 207               p = strtoul(e, &ep, 10);
 208               if (ep && *ep || p > 65535)
 209                 return URL_ERR_INVALID_PORT;
 210               else if (p)               /* Port 0 (e.g. in :/) is treated as default port */
 211                 u->port = p;
 212             }
 213           u->host = w;
 214         }
 215     }
 216
 217   u->rest = s;
 218   u->buf = d;
 219   return 0;
 220 }
 221
 222 /* Normalization according to given base URL */
 223
 224 static uns std_ports[] = URL_DEFPORTS;  /* Default port numbers */
 225
 226 static int
 227 relpath_merge(struct url *u, struct url *b)
 228 {
 229   byte *a = u->rest;
 230   byte *o = b->rest;
 231   byte *d = u->buf;
 232   byte *e = u->bufend;
 233   byte *p;
 234
 235   if (a[0] == '/')                      /* Absolute path => OK */
 236     return 0;
 237   if (o[0] != '/')
 238     return URL_PATH_UNDERFLOW;
 239
 240   if (!a[0])                            /* Empty URL -> inherit everything */
 241     {
 242       u->rest = b->rest;
 243       return 0;
 244     }
 245
 246   u->rest = d;                          /* We know we'll need to copy the path somewhere else */
 247
 248   if (a[0] == '#')                      /* Another fragment */
 249     {
 250       for(p=o; *p && *p != '#'; p++)
 251         ;
 252       goto copy;
 253     }
 254   if (a[0] == '?')                      /* New query */
 255     {
 256       for(p=o; *p && *p != '#' && *p != '?'; p++)
 257         ;
 258       goto copy;
 259     }
 260   if (a[0] == ';')                      /* Change parameters */
 261     {
 262       for(p=o; *p && *p != ';' && *p != '?' && *p != '#'; p++)
 263         ;
 264       goto copy;
 265     }
 266
 267   p = NULL;                             /* Copy original path and find the last slash */
 268   while (*o && *o != ';' && *o != '?' && *o != '#')
 269     {
 270       if (d >= e)
 271         return URL_ERR_TOO_LONG;
 272       if ((*d++ = *o++) == '/')
 273         p = d;
 274     }
 275   if (!p)
 276     return URL_ERR_REL_NOTHING;
 277   d = p;
 278
 279   while (*a)
 280     {
 281       if (a[0] == '.')
 282         {
 283           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 284             {
 285               a++;
 286               if (a[0])
 287                 a++;
 288               continue;
 289             }
 290           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 291             {
 292               a += 2;
 293               if (a[0])
 294                 a++;
 295               if (d <= u->buf + 1)
 296                 {
 297                   /*
 298                    * RFC 1808 says we should leave ".." as a path segment, but
 299                    * we intentionally break the rule and refuse the URL.
 300                    */
 301                   if (!url_ignore_underflow)
 302                     return URL_PATH_UNDERFLOW;
 303                 }
 304               else
 305                 {
 306                   d--;                  /* Discard trailing slash */
 307                   while (d[-1] != '/')
 308                     d--;
 309                 }
 310               continue;
 311             }
 312         }
 313       while (a[0] && a[0] != '/')
 314         {
 315           if (d >= e)
 316             return URL_ERR_TOO_LONG;
 317           *d++ = *a++;
 318         }
 319       if (a[0])
 320         *d++ = *a++;
 321     }
 322
 323 okay:
 324   *d++ = 0;
 325   u->buf = d;
 326   return 0;
 327
 328 copy:                                   /* Combine part of old URL with the new one */
 329   while (o < p)
 330     if (d < e)
 331       *d++ = *o++;
 332     else
 333       return URL_ERR_TOO_LONG;
 334   while (*a)
 335     if (d < e)
 336       *d++ = *a++;
 337     else
 338       return URL_ERR_TOO_LONG;
 339   goto okay;
 340 }
 341
 342 int
 343 url_normalize(struct url *u, struct url *b)
 344 {
 345   int err;
 346
 347   /* Basic checks */
 348   if (url_proto_path_flags[u->protoid] && !u->host ||
 349       u->host && !*u->host ||
 350       !u->host && u->user ||
 351       !u->rest)
 352     return URL_SYNTAX_ERROR;
 353
 354   if (!u->protocol)
 355     {
 356       /* Now we know it's a relative URL. Do we have any base? */
 357       if (!b || !url_proto_path_flags[b->protoid])
 358         return URL_ERR_REL_NOTHING;
 359       u->protocol = b->protocol;
 360       u->protoid = b->protoid;
 361
 362       /* Reference to the same host */
 363       if (!u->host)
 364         {
 365           u->host = b->host;
 366           u->user = b->user;
 367           u->port = b->port;
 368           if (err = relpath_merge(u, b))
 369             return err;
 370         }
 371     }
 372
 373   /* Fill in missing info */
 374   if (u->port == ~0U)
 375     u->port = std_ports[u->protoid];
 376
 377   return 0;
 378 }
 379
 380 /* Name canonicalization */
 381
 382 static void
 383 lowercase(byte *b)
 384 {
 385   if (b)
 386     while (*b)
 387       {
 388         if (*b >= 'A' && *b <= 'Z')
 389           *b = *b + 0x20;
 390         b++;
 391       }
 392 }
 393
 394 static void
 395 kill_end_dot(byte *b)
 396 {
 397   byte *k;
 398
 399   if (b)
 400     {
 401       k = b + strlen(b) - 1;
 402       if (k > b && *k == '.')
 403         *k = 0;
 404     }
 405 }
 406
 407 int
 408 url_canonicalize(struct url *u)
 409 {
 410   char *c;
 411
 412   lowercase(u->protocol);
 413   lowercase(u->host);
 414   kill_end_dot(u->host);
 415   if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
 416     u->rest = "/";
 417   if (u->rest && (c = strchr(u->rest, '#')))    /* Kill fragment reference */
 418     *c = 0;
 419   return 0;
 420 }
 421
 422 /* Pack a broken-down URL */
 423
 424 static byte *
 425 append(byte *d, byte *s, byte *e)
 426 {
 427   if (d)
 428     while (*s)
 429       {
 430         if (d >= e)
 431           return NULL;
 432         *d++ = *s++;
 433       }
 434   return d;
 435 }
 436
 437 int
 438 url_pack(struct url *u, byte *d)
 439 {
 440   byte *e = d + MAX_URL_SIZE - 10;
 441
 442   if (u->protocol)
 443     {
 444       d = append(d, u->protocol, e);
 445       d = append(d, ":", e);
 446       u->protoid = identify_protocol(u->protocol);
 447     }
 448   if (u->host)
 449     {
 450       d = append(d, "//", e);
 451       if (u->user)
 452         {
 453           d = append(d, u->user, e);
 454           d = append(d, "@", e);
 455         }
 456       d = append(d, u->host, e);
 457       if (u->port != std_ports[u->protoid] && u->port != ~0U)
 458         {
 459           char z[10];
 460           sprintf(z, "%d", u->port);
 461           d = append(d, ":", e);
 462           d = append(d, z, e);
 463         }
 464     }
 465   if (u->rest)
 466     d = append(d, u->rest, e);
 467   if (!d)
 468     return URL_ERR_TOO_LONG;
 469   *d = 0;
 470   return 0;
 471 }
 472
 473 /* Error messages */
 474
 475 static char *errmsg[] = {
 476   "Something is wrong",
 477   "Too long",
 478   "Invalid character",
 479   "Invalid escape",
 480   "Invalid escaped character",
 481   "Invalid port number",
 482   "Relative URL not allowed",
 483   "Unknown protocol",
 484   "Syntax error",
 485   "Path underflow"
 486 };
 487
 488 char *
 489 url_error(uns err)
 490 {
 491   if (err >= sizeof(errmsg) / sizeof(char *))
 492     err = 0;
 493   return errmsg[err];
 494 }
 495
 496 /* A "macro" for canonical split */
 497
 498 int
 499 url_canon_split(byte *u, byte *buf1, byte *buf2, struct url *url)
 500 {
 501   int err;
 502
 503   if (err = url_deescape(u, buf1))
 504     return err;
 505   if (err = url_split(buf1, url, buf2))
 506     return err;
 507   if (err = url_normalize(url, NULL))
 508     return err;
 509   return url_canonicalize(url);
 510 }
 511
 512 /* Testing */
 513
 514 #ifdef TEST
 515
 516 int main(int argc, char **argv)
 517 {
 518   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
 519   int err;
 520   struct url url, url0;
 521
 522   if (argc != 2)
 523     return 1;
 524   if (err = url_deescape(argv[1], buf1))
 525     {
 526       printf("deesc: error %d\n", err);
 527       return 1;
 528     }
 529   printf("deesc: %s\n", buf1);
 530   if (err = url_split(buf1, &url, buf2))
 531     {
 532       printf("split: error %d\n", err);
 533       return 1;
 534     }
 535   printf("split: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 536   if (err = url_split("http://mj@www.hell.org/123/sub_dir/index.html;param?query&zzz/subquery#fragment", &url0, buf3))
 537     {
 538       printf("split base: error %d\n", err);
 539       return 1;
 540     }
 541   if (err = url_normalize(&url0, NULL))
 542     {
 543       printf("normalize base: error %d\n", err);
 544       return 1;
 545     }
 546   printf("base: @%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.host, url0.port, url0.rest);
 547   if (err = url_normalize(&url, &url0))
 548     {
 549       printf("normalize: error %d\n", err);
 550       return 1;
 551     }
 552   printf("normalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 553   if (err = url_canonicalize(&url))
 554     {
 555       printf("canonicalize: error %d\n", err);
 556       return 1;
 557     }
 558   printf("canonicalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 559   if (err = url_pack(&url, buf4))
 560     {
 561       printf("pack: error %d\n", err);
 562       return 1;
 563     }
 564   printf("pack: %s\n", buf4);
 565   if (err = url_enescape(buf4, buf2))
 566     {
 567       printf("enesc: error %d\n", err);
 568       return 1;
 569     }
 570   printf("enesc: %s\n", buf2);
 571   return 0;
 572 }
 573
 574 #endif