lib/url.c

   1 /*
   2  *      Sherlock Library -- URL Functions (according to RFC 1738 and 1808)
   3  *
   4  *      (c) 1997 Martin Mares, <mj@atrey.karlin.mff.cuni.cz>
   5  */
   6
   7 #include <string.h>
   8 #include <stdlib.h>
   9 #include <stdio.h>
  10
  11 #include "lib.h"
  12 #include "url.h"
  13 #include "string.h"
  14
  15 /* Escaping and de-escaping */
  16
  17 static uns
  18 enhex(uns x)
  19 {
  20   return (x<10) ? (x + '0') : (x - 10 + 'A');
  21 }
  22
  23 int
  24 url_deescape(byte *s, byte *d)
  25 {
  26   byte *end = d + MAX_URL_SIZE - 10;
  27   while (*s)
  28     {
  29       if (d >= end)
  30         return URL_ERR_TOO_LONG;
  31       if (*s == '%')
  32         {
  33           unsigned int val;
  34           if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
  35             return URL_ERR_INVALID_ESCAPE;
  36           val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
  37           if (!Cprint(val))
  38             return URL_ERR_INVALID_ESCAPED_CHAR;
  39           switch (val)
  40             {
  41             case ';':
  42               val = NCC_SEMICOLON; break;
  43             case '/':
  44               val = NCC_SLASH; break;
  45             case '?':
  46               val = NCC_QUEST; break;
  47             case ':':
  48               val = NCC_COLON; break;
  49             case '@':
  50               val = NCC_AT; break;
  51             case '=':
  52               val = NCC_EQUAL; break;
  53             case '&':
  54               val = NCC_AND; break;
  55             }
  56           *d++ = val;
  57           s += 3;
  58         }
  59       else if (*s >= 0x20 && *s <= 0x7e)
  60         *d++ = *s++;
  61       else
  62         return URL_ERR_INVALID_CHAR;
  63     }
  64   *d = 0;
  65   return 0;
  66 }
  67
  68 int
  69 url_enescape(byte *s, byte *d)
  70 {
  71   byte *end = d + MAX_URL_SIZE - 10;
  72
  73   while (*s)
  74     {
  75       if (d >= end)
  76         return URL_ERR_TOO_LONG;
  77       if (   *s >= 'A' && *s <= 'Z'
  78              || *s >= 'a' && *s <= 'z'
  79              || *s >= '0' && *s <= '9'
  80              || *s == '$' || *s == '-' || *s == '.' || *s == '+'
  81              || *s == '!' || *s == '*' || *s == '\'' || *s == '('
  82              || *s == ')' || *s == '_' || *s == ';' || *s == '/'
  83              || *s == '?' || *s == ':' || *s == '@' || *s == '='
  84              || *s == '&')
  85         *d++ = *s++;
  86       else
  87         {
  88           uns val = (*s < NCC_MAX) ? ";/?:@=&"[*s] : *s;
  89           *d++ = '%';
  90           *d++ = enhex(val >> 4);
  91           *d++ = enhex(val & 0x0f);
  92           s++;
  93         }
  94     }
  95   *d = 0;
  96   return 0;
  97 }
  98
  99 /* Split an URL (several parts may be copied to the destination buffer) */
 100
 101 uns
 102 identify_protocol(byte *p)
 103 {
 104   if (!strcasecmp(p, "http"))
 105     return URL_PROTO_HTTP;
 106   if (!strcasecmp(p, "ftp"))
 107     return URL_PROTO_FTP;
 108   return 0;
 109 }
 110
 111 int
 112 url_split(byte *s, struct url *u, byte *d)
 113 {
 114   bzero(u, sizeof(struct url));
 115   u->port = ~0;
 116   u->bufend = d + MAX_URL_SIZE - 10;
 117
 118   if (s[0] != '/')                      /* Seek for "protocol:" */
 119     {
 120       byte *p = s;
 121       while (*p && Calnum(*p))
 122         p++;
 123       if (p != s && *p == ':')
 124         {
 125           u->protocol = d;
 126           while (s < p)
 127             *d++ = *s++;
 128           *d++ = 0;
 129           u->protoid = identify_protocol(u->protocol);
 130           s++;
 131         }
 132     }
 133
 134   if (s[0] == '/')                      /* Host spec or absolute path */
 135     {
 136       if (s[1] == '/')                  /* Host spec */
 137         {
 138           byte *q, *w, *e;
 139           char *ep;
 140
 141           s += 2;
 142           q = d;
 143           while (*s && *s != '/')       /* Copy user:passwd@host:port */
 144             *d++ = *s++;
 145           *d++ = 0;
 146           w = strchr(q, '@');
 147           if (w)                        /* user:passwd present */
 148             {
 149               *w++ = 0;
 150               u->user = q;
 151             }
 152           else
 153             w = q;
 154           e = strchr(w, ':');
 155           if (e)                        /* host:port present */
 156             {
 157               *e++ = 0;
 158               u->port = strtoul(e, &ep, 10);
 159               if (ep && *ep || u->port > 65535 || !u->port)
 160                 return URL_ERR_INVALID_PORT;
 161             }
 162           u->host = w;
 163         }
 164     }
 165
 166   u->rest = s;
 167   u->buf = d;
 168   return 0;
 169 }
 170
 171 /* Normalization according to given base URL */
 172
 173 static uns std_ports[] = { ~0, 80, 21 }; /* Default port numbers */
 174
 175 static int
 176 relpath_merge(struct url *u, struct url *b)
 177 {
 178   byte *a = u->rest;
 179   byte *o = b->rest;
 180   byte *d = u->buf;
 181   byte *e = u->bufend;
 182   byte *p;
 183
 184   if (a[0] == '/')                      /* Absolute path => OK */
 185     return 0;
 186   if (o[0] != '/')
 187     return URL_PATH_UNDERFLOW;
 188
 189   if (!a[0])                            /* Empty relative URL is a special case */
 190     {
 191       u->rest = b->rest;
 192       return 0;
 193     }
 194
 195   u->rest = d;
 196   p = strrchr(o, '/');                  /* Must be found! */
 197   while (o <= p)                        /* Copy original path */
 198     {
 199       if (d >= e)
 200         return URL_ERR_TOO_LONG;
 201       *d++ = *o++;
 202     }
 203
 204   while (*a)
 205     {
 206       if (a[0] == '.')
 207         {
 208           if (a[1] == '/' || !a[1])     /* Skip "./" and ".$" */
 209             {
 210               a++;
 211               if (a[0])
 212                 a++;
 213               continue;
 214             }
 215           else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
 216             {
 217               a += 2;
 218               if (d <= u->buf + 1)
 219                 return URL_PATH_UNDERFLOW;
 220               d--;                      /* Discard trailing slash */
 221               while (d[-1] != '/')
 222                 d--;
 223               if (a[0])
 224                 a++;
 225               continue;
 226             }
 227         }
 228       while (a[0] && a[0] != '/')
 229         {
 230           if (d >= e)
 231             return URL_ERR_TOO_LONG;
 232           *d++ = *a++;
 233         }
 234       if (a[0])
 235         *d++ = *a++;
 236     }
 237
 238   *d++ = 0;
 239   u->buf = d;
 240   return 0;
 241 }
 242
 243 int
 244 url_normalize(struct url *u, struct url *b)
 245 {
 246   byte *k;
 247
 248   if (u->protocol && !u->protoid)
 249     return 0;
 250
 251   if ((u->protoid == URL_PROTO_HTTP || (!u->protoid && b && b->protoid == URL_PROTO_HTTP))
 252       && u->rest && (k = strchr(u->rest, '#')))
 253     *k = 0;                             /* Kill fragment reference */
 254
 255   if (u->port == ~0)
 256     u->port = std_ports[u->protoid];
 257
 258   if (   u->protocol && !u->host
 259          || u->host && !*u->host
 260          || !u->host && u->user
 261          || !u->rest)
 262     return URL_SYNTAX_ERROR;
 263
 264   if (u->protocol)                      /* Absolute URL */
 265     return 0;
 266
 267   if (!b)                               /* Relative to something? */
 268     return URL_ERR_REL_NOTHING;
 269   if (!b->protoid)
 270     return URL_ERR_UNKNOWN_PROTOCOL;
 271
 272   if (!u->protocol)
 273     {
 274       u->protocol = b->protocol;
 275       u->protoid = b->protoid;
 276     }
 277
 278   if (!u->host)
 279     {
 280       u->host = b->host;
 281       u->user = b->user;
 282       u->port = b->port;
 283       return relpath_merge(u, b);
 284     }
 285
 286   return 0;
 287 }
 288
 289 /* Name canonicalization */
 290
 291 static void
 292 lowercase(byte *b)
 293 {
 294   if (b)
 295     while (*b)
 296       {
 297         if (*b >= 'A' && *b <= 'Z')
 298           *b = *b + 0x20;
 299         b++;
 300       }
 301 }
 302
 303 static void
 304 kill_end_dot(byte *b)
 305 {
 306   byte *k;
 307
 308   if (b)
 309     {
 310       k = b + strlen(b) - 1;
 311       if (k > b && *k == '.')
 312         *k = 0;
 313     }
 314 }
 315
 316 int
 317 url_canonicalize(struct url *u)
 318 {
 319   lowercase(u->protocol);
 320   lowercase(u->host);
 321   kill_end_dot(u->host);
 322   if ((!u->rest || !*u->rest) && (u->protoid == URL_PROTO_HTTP || u->protoid == URL_PROTO_FTP))
 323     u->rest = "/";
 324   return 0;
 325 }
 326
 327 /* Pack a broken-down URL */
 328
 329 byte *
 330 append(byte *d, byte *s, byte *e)
 331 {
 332   if (d)
 333     while (*s)
 334       {
 335         if (d >= e)
 336           return NULL;
 337         *d++ = *s++;
 338       }
 339   return d;
 340 }
 341
 342 int
 343 url_pack(struct url *u, byte *d)
 344 {
 345   byte *e = d + MAX_URL_SIZE - 10;
 346
 347   if (u->protocol)
 348     {
 349       d = append(d, u->protocol, e);
 350       d = append(d, ":", e);
 351       u->protoid = identify_protocol(u->protocol);
 352     }
 353   if (u->host)
 354     {
 355       d = append(d, "//", e);
 356       if (u->user)
 357         {
 358           d = append(d, u->user, e);
 359           d = append(d, "@", e);
 360         }
 361       d = append(d, u->host, e);
 362       if (u->port != std_ports[u->protoid] && u->port != ~0)
 363         {
 364           char z[10];
 365           sprintf(z, "%d", u->port);
 366           d = append(d, ":", e);
 367           d = append(d, z, e);
 368         }
 369     }
 370   if (u->rest)
 371     d = append(d, u->rest, e);
 372   if (!d)
 373     return URL_ERR_TOO_LONG;
 374   *d = 0;
 375   return 0;
 376 }
 377
 378 /* Error messages */
 379
 380 static char *errmsg[] = {
 381   "Something is wrong",
 382   "Too long",
 383   "Invalid character",
 384   "Invalid escape",
 385   "Invalid escaped character",
 386   "Invalid port number",
 387   "Relative URL not allowed",
 388   "Unknown protocol",
 389   "Syntax error",
 390   "Path underflow"
 391 };
 392
 393 char *
 394 url_error(uns err)
 395 {
 396   if (err >= sizeof(errmsg) / sizeof(char *))
 397     err = 0;
 398   return errmsg[err];
 399 }
 400
 401 /* A "macro" for canonical split */
 402
 403 int
 404 url_canon_split(byte *u, byte *buf1, byte *buf2, struct url *url)
 405 {
 406   int err;
 407
 408   if (err = url_deescape(u, buf1))
 409     return err;
 410   if (err = url_split(buf1, url, buf2))
 411     return err;
 412   if (err = url_normalize(url, NULL))
 413     return err;
 414   return url_canonicalize(url);
 415 }
 416
 417 /* Testing */
 418
 419 #ifdef TEST
 420
 421 int main(int argc, char **argv)
 422 {
 423   char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
 424   int err;
 425   struct url url, url0;
 426
 427   if (argc != 2)
 428     return 1;
 429   if (err = url_deescape(argv[1], buf1))
 430     {
 431       printf("deesc: error %d\n", err);
 432       return 1;
 433     }
 434   printf("deesc: %s\n", buf1);
 435   if (err = url_split(buf1, &url, buf2))
 436     {
 437       printf("split: error %d\n", err);
 438       return 1;
 439     }
 440   printf("split: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 441   if (err = url_split("http://mj@www.hell.org/123/sub_dir/index.html", &url0, buf3))
 442     {
 443       printf("split base: error %d\n", err);
 444       return 1;
 445     }
 446   if (err = url_normalize(&url0, NULL))
 447     {
 448       printf("normalize base: error %d\n", err);
 449       return 1;
 450     }
 451   printf("base: @%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.host, url0.port, url0.rest);
 452   if (err = url_normalize(&url, &url0))
 453     {
 454       printf("normalize: error %d\n", err);
 455       return 1;
 456     }
 457   printf("normalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 458   if (err = url_canonicalize(&url))
 459     {
 460       printf("canonicalize: error %d\n", err);
 461       return 1;
 462     }
 463   printf("canonicalize: @%s@%s@%s@%d@%s\n", url.protocol, url.user, url.host, url.port, url.rest);
 464   if (err = url_pack(&url, buf4))
 465     {
 466       printf("pack: error %d\n", err);
 467       return 1;
 468     }
 469   printf("pack: %s\n", buf1);
 470   if (err = url_enescape(buf4, buf2))
 471     {
 472       printf("enesc: error %d\n", err);
 473       return 1;
 474     }
 475   printf("enesc: %s\n", buf2);
 476   return 0;
 477 }
 478
 479 #endif