/*
- * Sherlock Library -- URL Functions
+ * UCW Library -- URL Functions
*
* (c) 1997--2004 Martin Mares <mj@ucw.cz>
- * (c) 2001 Robert Spalek <robert@ucw.cz>
+ * (c) 2001--2005 Robert Spalek <robert@ucw.cz>
*
* This software may be freely distributed and used according to the terms
* of the GNU Lesser General Public License.
static uns url_ignore_spaces;
static uns url_ignore_underflow;
-static byte *url_component_separators = "";
+static char *url_component_separators = "";	/* now char * instead of byte *; its address is handed to CF_STRING below */
static uns url_min_repeat_count = 0x7fffffff;
static uns url_max_repeat_length = 0;
-
-static struct cfitem url_config[] = {
- { "URL", CT_SECTION, NULL },
- { "IgnoreSpaces", CT_INT, &url_ignore_spaces },
- { "IgnoreUnderflow", CT_INT, &url_ignore_underflow },
- { "ComponentSeparators", CT_STRING, &url_component_separators },
- { "MinRepeatCount", CT_INT, &url_min_repeat_count },
- { "MaxRepeatLength", CT_INT, &url_max_repeat_length },
- { NULL, CT_STOP, NULL }
+static uns url_max_occurences = ~0U;	/* default ~0U: no unsigned component count can exceed it, so the occurrence check in url_has_repeated_component() is skipped */
+
+static struct cf_section url_config = {	/* each item pairs a config key name with the address of one of the variables above */
+ CF_ITEMS {
+ CF_UNS("IgnoreSpaces", &url_ignore_spaces),
+ CF_UNS("IgnoreUnderflow", &url_ignore_underflow),
+ CF_STRING("ComponentSeparators", &url_component_separators),
+ CF_UNS("MinRepeatCount", &url_min_repeat_count),
+ CF_UNS("MaxRepeatLength", &url_max_repeat_length),
+ CF_UNS("MaxOccurences", &url_max_occurences),	/* the only key with no counterpart in the removed cfitem table */
+ CF_END
+ }
};
static void CONSTRUCTOR url_init_config(void)
{
- cf_register(url_config);
+ cf_declare_section("URL", &url_config, 0);	/* passes the url_config section under the name "URL", which the removed table carried as its CT_SECTION entry; NOTE(review): meaning of the final 0 argument is not visible here, confirm against the config API */
}
/* Escaping and de-escaping */
}
int
-url_deescape(byte *s, byte *d)
+url_deescape(const byte *s, byte *d)
{
byte *dstart = d;
byte *end = d + MAX_URL_SIZE - 10;
*d++ = *s++;
else if (Cspace(*s))
{
- byte *s0 = s;
+ const byte *s0 = s;
while (Cspace(*s))
s++;
if (!url_ignore_spaces || !(!*s || d == dstart))
}
int
-url_enescape(byte *s, byte *d)
+url_enescape(const byte *s, byte *d)
{
byte *end = d + MAX_URL_SIZE - 10;
unsigned int c;
return 0;
}
+int
+url_enescape_friendly(const byte *src, byte *dest)	/* writes an escaped, 0-terminated copy of src to dest; returns 0, or URL_ERR_TOO_LONG */
+{
+ byte *end = dest + MAX_URL_SIZE - 10;	/* output limit, 10 bytes short of MAX_URL_SIZE; one loop iteration writes at most 3 bytes */
+ while (*src)
+ {
+ if (dest >= end)
+ return URL_ERR_TOO_LONG;	/* dest is left without a terminating 0 on this path */
+ if (*src < NCC_MAX)
+ *dest++ = NCC_CHARS[*src++];	/* byte values below NCC_MAX are replaced through the NCC_CHARS table */
+ else if (*src >= 0x20 && *src < 0x7f)
+ *dest++ = *src++;	/* remaining bytes in 0x20..0x7e are copied unchanged */
+ else
+ {
+ *dest++ = '%';	/* any other byte becomes '%' followed by two enhex() results: high nibble first, then low nibble */
+ *dest++ = enhex(*src >> 4);
+ *dest++ = enhex(*src++ & 0x0f);
+ }
+ }
+ *dest = 0;
+ return 0;
+}
+
/* Split an URL (several parts may be copied to the destination buffer) */
byte *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
uns
-identify_protocol(byte *p)
+identify_protocol(const byte *p)
{
uns i;
{
if (s[1] == '/') /* Host spec */
{
- byte *q, *w, *e;
+ byte *q, *e;
+ byte *at = NULL;
char *ep;
s += 2;
q = d;
while (*s && *s != '/' && *s != '?') /* Copy user:passwd@host:port */
- *d++ = *s++;
+ {
+ if (*s != '@')
+ *d++ = *s;
+ else if (!at)
+ {
+ *d++ = 0;
+ at = d;
+ }
+ else /* This shouldn't happen with sane URL's, but we need to be sure */
+ *d++ = NCC_AT;
+ s++;
+ }
*d++ = 0;
- w = strchr(q, '@');
- if (w) /* user:passwd present */
+ if (at) /* user:passwd present */
{
- *w++ = 0;
u->user = q;
if (e = strchr(q, ':'))
{
}
}
else
- w = q;
- e = strchr(w, ':');
+ at = q;
+ e = strchr(at, ':');
if (e) /* host:port present */
{
uns p;
else if (p) /* Port 0 (e.g. in :/) is treated as default port */
u->port = p;
}
- u->host = w;
+ u->host = at;
}
}
/* Pack a broken-down URL */
static byte *
-append(byte *d, byte *s, byte *e)
+append(byte *d, const byte *s, byte *e)
{
if (d)
while (*s)
/* Standard cookbook recipes */
int
-url_canon_split(byte *u, byte *buf1, byte *buf2, struct url *url)
+url_canon_split_rel(const byte *u, byte *buf1, byte *buf2, struct url *url, struct url *base)
{
int err;
return err;
if (err = url_split(buf1, url, buf2))
return err;
- if (err = url_normalize(url, NULL))
+ if (err = url_normalize(url, base))
return err;
return url_canonicalize(url);
}
int
-url_auto_canonicalize(byte *src, byte *dst)
+url_auto_canonicalize_rel(const byte *src, byte *dst, struct url *base)
{
byte buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
int err;
struct url ur;
- (void)((err = url_canon_split(src, buf1, buf2, &ur)) ||
+ (void)((err = url_canon_split_rel(src, buf1, buf2, &ur, base)) ||
(err = url_pack(&ur, buf3)) ||
(err = url_enescape(buf3, dst)));
return err;
#endif
struct component {
- byte *start;
+ const byte *start;	/* points at the component inside the URL passed to url_has_repeated_component(); not a copy */
int length;
+ uns count;	/* written only by the occurrence check: set to 1 when the component is first hashed, incremented on each later identical component */
u32 hash;
};
static inline u32
-hashf(byte *start, int length)
+hashf(const byte *start, int length)
{
u32 hf = length;
while (length-- > 0)
}
int
-url_has_repeated_component(byte *url)
+url_has_repeated_component(const byte *url)
{
struct component *comp;
- uns comps, comp_len, rep_prefix;
- byte *c;
- uns i;
+ uns comps, comp_len, rep_prefix, hash_size, *hash, *next;
+ const byte *c;
+ uns i, j, k;
for (comps=0, c=url; c; comps++)
{
if (c)
c++;
}
- if (comps < url_min_repeat_count)
+ if (comps < url_min_repeat_count && comps <= url_max_occurences)
return 0;
- comp = alloca(comps * sizeof(struct component));
+ comp = alloca(comps * sizeof(*comp));
for (i=0, c=url; c; i++)
{
comp[i].start = c;
ASSERT(i == comps);
for (i=0; i<comps; i++)
comp[i].hash = hashf(comp[i].start, comp[i].length);
+ if (comps > url_max_occurences)
+ {
+ hash_size = next_table_prime(comps);
+ hash = alloca(hash_size * sizeof(*hash));
+ next = alloca(comps * sizeof(*next));
+ memset(hash, 255, hash_size * sizeof(*hash));
+ for (i=0; i<comps; i++)
+ {
+ j = comp[i].hash % hash_size;
+ for (k = hash[j]; ~k && (comp[i].hash != comp[k].hash || comp[i].length != comp[k].length ||
+ memcmp(comp[k].start, comp[i].start, comp[i].length)); k = next[k]);
+ if (!~k)
+ {
+ next[i] = hash[j];
+ hash[j] = i;
+ comp[i].count = 1;
+ }
+ else
+ {
+ if (comp[k].count++ >= url_max_occurences)
+ return 1;
+ }
+ }
+ }
for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)