From e16fbdbeaeefc2b01ad1ed71d63ed98530bcbae5 Mon Sep 17 00:00:00 2001 From: Martin Mares Date: Fri, 13 Feb 2015 00:24:26 +0100 Subject: [PATCH] XML: Changed internal representation of attributes As most elements have only a couple of attributes, I switched the main attribute data structure from a hash table to a list. String comparisons are hopefully avoided in most cases, as we keep a hash of the lookup key in each entry. More importantly, the structure is populated in two steps: first, original (i.e., qualified) names of all attributes are entered; second, namespaces are resolved and attributes are checked for uniqueness. If linear time complexity of attribute lookups turns out to be too slow, we can add a secondary hash table for elements with many attributes. This table will be populated from the list in the 2nd step above. --- ucw-xml/common.c | 10 --- ucw-xml/internals.h | 5 -- ucw-xml/parse.c | 144 +++++++++++++++++++++++--------------------- ucw-xml/xml.h | 5 +- 4 files changed, 78 insertions(+), 86 deletions(-) diff --git a/ucw-xml/common.c b/ucw-xml/common.c index d6614496..5cfd8513 100644 --- a/ucw-xml/common.c +++ b/ucw-xml/common.c @@ -96,19 +96,12 @@ static struct xml_context xml_defaults = { }, }; -static void -xml_do_init(struct xml_context *ctx) -{ - xml_attrs_table_init(ctx); -} - void xml_init(struct xml_context *ctx) { *ctx = xml_defaults; ctx->pool = mp_new(65536); ctx->stack = mp_new(65536); - xml_do_init(ctx); TRACE(ctx, "init"); } @@ -116,7 +109,6 @@ void xml_cleanup(struct xml_context *ctx) { TRACE(ctx, "cleanup"); - xml_attrs_table_cleanup(ctx); xml_dtd_cleanup(ctx); xml_sources_cleanup(ctx); xml_ns_cleanup(ctx); @@ -129,7 +121,6 @@ xml_reset(struct xml_context *ctx) { TRACE(ctx, "reset"); struct mempool *pool = ctx->pool, *stack = ctx->stack; - xml_attrs_table_cleanup(ctx); xml_dtd_cleanup(ctx); xml_sources_cleanup(ctx); mp_flush(pool); @@ -138,5 +129,4 @@ xml_reset(struct xml_context *ctx) ctx->pool = pool; ctx->stack = stack; xml_ns_reset(ctx); - xml_do_init(ctx); } diff --git a/ucw-xml/internals.h b/ucw-xml/internals.h index 54101d69..0a81ecad 100644 --- a/ucw-xml/internals.h +++ b/ucw-xml/internals.h @@ -14,8 +14,6 @@ #include #ifdef CONFIG_UCW_CLEAN_ABI -#define xml_attrs_table_cleanup ucw_xml_attrs_table_cleanup -#define xml_attrs_table_init ucw_xml_attrs_table_init #define xml_fatal_expected ucw_xml_fatal_expected #define xml_fatal_expected_quot ucw_xml_fatal_expected_quot #define xml_fatal_expected_white ucw_xml_fatal_expected_white @@ -318,9 +316,6 @@ void xml_push_pi(struct xml_context *ctx); void xml_pop_pi(struct xml_context *ctx); void xml_skip_pi(struct xml_context *ctx); -void xml_attrs_table_init(struct xml_context *ctx); -void xml_attrs_table_cleanup(struct xml_context *ctx); - void xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value); /*** Namespaces ***/ diff --git a/ucw-xml/parse.c b/ucw-xml/parse.c index 48ad7021..2ddfa2b8 100644 --- a/ucw-xml/parse.c +++ b/ucw-xml/parse.c @@ -586,46 +586,25 @@ xml_normalize_white(struct xml_context *ctx UNUSED, char *text) /*** Attributes ***/ -struct xml_attrs_table; - -static inline uint -xml_attrs_hash(struct xml_attrs_table *t UNUSED, struct xml_node *e, uint ns, char *n) -{ - return hash_pointer(e) ^ hash_string(n) ^ hash_u32(ns); -} - -static inline int -xml_attrs_eq(struct xml_attrs_table *t UNUSED, struct xml_node *e1, uint ns1, char *n1, struct xml_node *e2, uint ns2, char *n2) -{ - return (e1 == e2) && (ns1 == ns2) && !strcmp(n1, n2); -} - -static inline void -xml_attrs_init_key(struct xml_attrs_table *t UNUSED, struct xml_attr *a, struct xml_node *e, uint ns, char *name) +static void +xml_raw_add_attr(struct xml_context *ctx, struct xml_node *e, char *name, char *value) { + struct xml_attr *a = mp_alloc(ctx->pool, sizeof(*a)); a->elem = e; - a->ns = ns; + a->ns = 0; /* Namespaces will be resolved later */ a->name = name; - a->val = NULL; + a->val = value; + a->dtd = NULL; a->user = NULL; + /* a->hash will be calculated later */ slist_add_tail(&e->attrs, &a->n); } -#define HASH_PREFIX(x) xml_attrs_##x -#define HASH_NODE struct xml_attr -#define HASH_KEY_COMPLEX(x) x elem, x ns, x name -#define HASH_KEY_DECL struct xml_node *elem, uint ns, char *name -#define HASH_TABLE_DYNAMIC -#define HASH_GIVE_EQ -#define HASH_GIVE_HASHFN -#define HASH_GIVE_INIT_KEY -#define HASH_WANT_CLEANUP -#define HASH_WANT_REMOVE -#define HASH_WANT_LOOKUP -#define HASH_WANT_FIND -#define HASH_GIVE_ALLOC -XML_HASH_GIVE_ALLOC -#include +static inline uint +xml_attr_hash(uint ns, char *name) +{ + return hash_string(name) ^ hash_u32(ns); +} static void xml_parse_attr(struct xml_context *ctx) @@ -634,58 +613,61 @@ xml_parse_attr(struct xml_context *ctx) /* Attribute ::= Name Eq AttValue */ struct xml_node *e = ctx->node; char *n = xml_parse_name(ctx, ctx->pool); - // FIXME: This is wrong! This way, we never find attributes in a non-default NS. - struct xml_attr *a = xml_attrs_lookup(ctx->tab_attrs, e, 0, n); xml_parse_eq(ctx); char *v = xml_parse_attr_value(ctx, NULL); - if (a->val) + xml_raw_add_attr(ctx, e, n, v); +} + +static void +xml_process_attr(struct xml_context *ctx, struct xml_attr *a) +{ + struct xml_node *e = a->elem; + a->hash = xml_attr_hash(a->ns, a->name); + + XML_ATTR_FOR_EACH(a2, e) { - xml_error(ctx, "Attribute %s is not unique in element <%s>", n, e->name); - return; + if (a2 == a) + break; + if (a2->hash == a->hash && a2->ns == a->ns && !strcmp(a2->name, a->name)) + xml_error(ctx, "Attribute %s is not unique in element <%s>", xml_attr_qname(ctx, a), xml_node_qname(ctx, e)); } - a->val = v; - if (!e->dtd) - a->dtd = NULL; - else if (!(a->dtd = xml_dtd_find_attr(ctx, e->dtd, a->name))) - xml_error(ctx, "Undefined attribute %s in element <%s>", n, e->name); - else - xml_validate_attr(ctx, a->dtd, a->val); } struct xml_attr * xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name) { - return xml_attrs_find(ctx->tab_attrs, node, 0, name); + return xml_attr_find_ns(ctx, node, 0, name); } struct xml_attr * -xml_attr_find_ns(struct xml_context *ctx, struct xml_node *node, uint ns, char *name) +xml_attr_find_ns(struct xml_context *ctx UNUSED, struct xml_node *node, uint ns, char *name) { - return xml_attrs_find(ctx->tab_attrs, node, ns, name); + ASSERT(node->type == XML_NODE_ELEM); + uint hash = xml_attr_hash(ns, name); + XML_ATTR_FOR_EACH(a, node) + if (a->hash == hash && a->ns == ns && !strcmp(a->name, name)) + return a; + return NULL; } char * -xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name) +xml_attr_value_ns(struct xml_context *ctx, struct xml_node *node, uint ns, char *name) { - struct xml_attr *attr = xml_attrs_find(ctx->tab_attrs, node, 0, name); + struct xml_attr *attr = xml_attr_find_ns(ctx, node, ns, name); if (attr) return attr->val; if (!node->dtd) return NULL; + if (ns) /* So far, our DTD support is not namespace-aware */ + return NULL; struct xml_dtd_attr *dtd = xml_dtd_find_attr(ctx, node->dtd, name); return dtd ? dtd->default_value : NULL; } -void -xml_attrs_table_init(struct xml_context *ctx) -{ - xml_attrs_init(ctx->tab_attrs = xml_hash_new(ctx->pool, sizeof(struct xml_attrs_table))); -} - -void -xml_attrs_table_cleanup(struct xml_context *ctx) +char * +xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name) { - xml_attrs_cleanup(ctx->tab_attrs); + return xml_attr_value_ns(ctx, node, 0, name); } char * @@ -754,6 +736,7 @@ xml_push_element(struct xml_context *ctx) } } + /* Parse attributes */ while (1) { uint white = xml_parse_white(ctx, 0); @@ -772,22 +755,38 @@ xml_push_element(struct xml_context *ctx) xml_parse_attr(ctx); } + /* Resolve namespaces */ xml_ns_push_element(ctx); + /* Once we have namespaces, hash attribute names */ + XML_ATTR_FOR_EACH(a, e) + xml_process_attr(ctx, a); + /* FIXME: DTD logic is not namespace-aware */ if (e->dtd) - SLIST_FOR_EACH(struct xml_dtd_attr *, a, e->dtd->attrs) - if (a->default_mode == XML_ATTR_REQUIRED) - { - if (!xml_attrs_find(ctx->tab_attrs, e, 0, a->name)) - xml_error(ctx, "Missing required attribute %s in element <%s>", a->name, e->name); + { + XML_ATTR_FOR_EACH(a, e) + { + if (!(a->dtd = xml_dtd_find_attr(ctx, e->dtd, a->name))) + xml_error(ctx, "Undefined attribute %s in element <%s>", a->name, e->name); + else + xml_validate_attr(ctx, a->dtd, a->val); } - else if (a->default_mode != XML_ATTR_IMPLIED && ctx->flags & XML_ALLOC_DEFAULT_ATTRS) - { - struct xml_attr *attr = xml_attrs_lookup(ctx->tab_attrs, e, 0, a->name); - if (!attr->val) - attr->val = a->default_value; + SLIST_FOR_EACH(struct xml_dtd_attr *, a, e->dtd->attrs) + { + if (a->default_mode == XML_ATTR_REQUIRED) + { + if (!xml_attr_find(ctx, e, a->name)) + xml_error(ctx, "Missing required attribute %s in element <%s>", a->name, e->name); + } + else if (a->default_mode != XML_ATTR_IMPLIED && ctx->flags & XML_ALLOC_DEFAULT_ATTRS) + { + if (!xml_attr_find(ctx, e, a->name)) + xml_raw_add_attr(ctx, e, a->name, a->default_value); + } } + } + if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_stag) ctx->h_stag(ctx); } @@ -807,7 +806,11 @@ xml_pop_element(struct xml_context *ctx) { if (!e->parent) ctx->dom = NULL; - /* Restore hash table of attributes */ +#if 0 + /* + * With the current data structures, freeing of attributes is not necessary, + * but it might be if we switch to a global hash table of large elements. + */ SLIST_FOR_EACH(struct xml_attr *, a, e->attrs) xml_attrs_remove(ctx->tab_attrs, a); struct xml_node *n; @@ -821,6 +824,7 @@ xml_pop_element(struct xml_context *ctx) } clist_remove(&n->n); } +#endif } xml_pop_dom(ctx, free); diff --git a/ucw-xml/xml.h b/ucw-xml/xml.h index 0c335c65..9fe06f3b 100644 --- a/ucw-xml/xml.h +++ b/ucw-xml/xml.h @@ -159,6 +159,7 @@ struct xml_node { struct xml_attr { snode n; /* Node for elem->attrs */ + uint hash; /* Internal hash of ns + name */ struct xml_node *elem; /* Parent element */ struct xml_dtd_attr *dtd; /* Attribute DTD */ uint ns; /* Namespace ID */ @@ -207,7 +208,6 @@ struct xml_context { struct fastbuf chars; /* Character data / attribute value */ struct mempool_state chars_state; /* Mempool state before the current character block has started */ char *chars_trivial; /* If not empty, it will be appended to chars */ - void *tab_attrs; /* Hash table of element attributes */ /* Input */ struct xml_source *src; /* Current source */ @@ -300,6 +300,9 @@ struct xml_attr *xml_attr_find_ns(struct xml_context *ctx, struct xml_node *node /* Similar to xml_attr_find, but it deals also with default values */ char *xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name); +/* The same, but namespace-aware */ +char *xml_attr_value_ns(struct xml_context *ctx, struct xml_node *node, uint ns, char *name); + /* The default value of h_find_entity(), knows <, >, &, ' and " */ struct xml_dtd_entity *xml_def_find_entity(struct xml_context *ctx, char *name); -- 2.39.2