XML: Implementation of XML namespaces

author Martin Mares <mj@ucw.cz>

Thu, 12 Feb 2015 22:12:03 +0000 (23:12 +0100)

committer Martin Mares <mj@ucw.cz>

Thu, 12 Feb 2015 22:12:03 +0000 (23:12 +0100)
author Martin Mares <mj@ucw.cz>
Thu, 12 Feb 2015 22:12:03 +0000 (23:12 +0100)
committer Martin Mares <mj@ucw.cz>
Thu, 12 Feb 2015 22:12:03 +0000 (23:12 +0100)
diff --git a/ucw-xml/Makefile b/ucw-xml/Makefile

index 28083ccdc5e4e012fc79a19997d317b29fc15720..5b3918890e8791a90dde4aaa9051ea6f22ed7c1e 100644 (file)
--- a/ucw-xml/Makefile
+++ b/ucw-xml/Makefile
@@ -4,7 +4,7 @@
  DIRS+=ucw-xml
  PROGS+=$(o)/ucw-xml/xml-test
  
-LIBXML_MODS=common source parse dtd
+LIBXML_MODS=common source parse dtd ns
  LIBXML_MOD_PATHS=$(addprefix $(o)/ucw-xml/,$(LIBXML_MODS))
  LIBXML_INCLUDES=xml.h dtd.h
  LIBXML_DEPS=$(LIBUCW) $(LIBCHARSET)
@@ -18,14 +18,8 @@ ifdef CONFIG_INSTALL_API
  $(o)/ucw-xml/libucw-xml.pc: $(addprefix $(o)/ucw-xml/libucw-xml$(LV),.a .so)
  endif
  
-$(o)/ucw-xml/common.o: $(o)/ucw-xml/unicat.h
-$(o)/ucw-xml/common.oo: $(o)/ucw-xml/unicat.h
-$(o)/ucw-xml/source.o: $(o)/ucw-xml/unicat.h
-$(o)/ucw-xml/source.oo: $(o)/ucw-xml/unicat.h
-$(o)/ucw-xml/dtd.o: $(o)/ucw-xml/unicat.h
-$(o)/ucw-xml/dtd.oo: $(o)/ucw-xml/unicat.h
-$(o)/ucw-xml/parse.o: $(o)/ucw-xml/unicat.h
-$(o)/ucw-xml/parse.oo: $(o)/ucw-xml/unicat.h
+$(addsuffix .o,$(LIBXML_MOD_PATHS)): $(o)/ucw-xml/unicat.h
+$(addsuffix .oo,$(LIBXML_MOD_PATHS)): $(o)/ucw-xml/unicat.h
  $(o)/ucw-xml/unicat.h: $(s)/ucw-xml/unicat.pl
         $(M)GEN $(addprefix $(o)/ucw-xml/unicat,.h .c)
         $(Q)$< $(addprefix $(o)/ucw-xml/unicat,.h .c)
diff --git a/ucw-xml/common.c b/ucw-xml/common.c

index 8b37d597ad5ab1ef9fba0aa9c29476b3bbf25d4d..d6614496a61ff8404010977614f3498e708a3f6c 100644 (file)
--- a/ucw-xml/common.c
+++ b/ucw-xml/common.c
@@ -119,6 +119,7 @@ xml_cleanup(struct xml_context *ctx)
    xml_attrs_table_cleanup(ctx);
    xml_dtd_cleanup(ctx);
    xml_sources_cleanup(ctx);
+  xml_ns_cleanup(ctx);
    mp_delete(ctx->pool);
    mp_delete(ctx->stack);
  }
@@ -136,5 +137,6 @@ xml_reset(struct xml_context *ctx)
    *ctx = xml_defaults;
    ctx->pool = pool;
    ctx->stack = stack;
+  xml_ns_reset(ctx);
    xml_do_init(ctx);
  }
diff --git a/ucw-xml/internals.h b/ucw-xml/internals.h

index a67cd8eb4a7393e7cecf3d351b3ad3aa9be86189..54101d69c97e7e82cac2b76d367d38cf5e078dcc 100644 (file)
--- a/ucw-xml/internals.h
+++ b/ucw-xml/internals.h
@@ -323,4 +323,11 @@ void xml_attrs_table_cleanup(struct xml_context *ctx);
  
  void xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value);
  
+/*** Namespaces ***/
+
+void xml_ns_cleanup(struct xml_context *ctx);
+void xml_ns_reset(struct xml_context *ctx);
+void xml_ns_push_element(struct xml_context *ctx);
+void xml_ns_pop_element(struct xml_context *ctx);
+
  #endif
diff --git a/ucw-xml/ns.c b/ucw-xml/ns.c

new file mode 100644 (file)

index 0000000..8774590
--- /dev/null
+++ b/ucw-xml/ns.c
@@ -0,0 +1,240 @@
+/*
+ *     UCW Library -- A simple XML parser -- Namespaces
+ *
+ *     (c) 2015 Martin Mares <mj@ucw.cz>
+ *
+ *     This software may be freely distributed and used according to the terms
+ *     of the GNU Lesser General Public License.
+ */
+
+#undef LOCAL_DEBUG
+
+#include <ucw/lib.h>
+#include <ucw/gary.h>
+#include <ucw-xml/xml.h>
+#include <ucw-xml/internals.h>
+
+/*
+ *  This is an implementation of XML namespaces according to
+ *  http://www.w3.org/TR/REC-xml-names/.
+ *
+ *  Currently, we assume that the document does not contain a plethora
+ *  of namespaces and prefixes. So we keep them in memory until the
+ *  document ends.
+ */
+
+struct ns_hash_entry {                 /* Node type of both the ns_by_name and the ns_by_prefix hash table */
+  uint ns;                             /* Namespace ID associated with the key */
+  char name[1];                        /* Key string (namespace name or prefix), used via HASH_KEY_ENDSTRING */
+};
+
+#define HASH_NODE struct ns_hash_entry
+#define HASH_PREFIX(x) ns_hash_##x
+#define HASH_KEY_ENDSTRING name
+#define HASH_WANT_CLEANUP
+#define HASH_WANT_FIND
+#define HASH_WANT_LOOKUP
+#define HASH_TABLE_DYNAMIC
+#define HASH_LOOKUP_DETECT_NEW
+#define HASH_GIVE_ALLOC
+XML_HASH_GIVE_ALLOC
+#include <ucw/hashtable.h>
+
+struct xml_ns_prefix {                 /* One record of ctx->ns_prefix_stack; undone by xml_ns_pop_element() */
+  struct xml_ns_prefix *prev;          /* Next older record on the stack (NULL at the bottom) */
+  struct xml_node *e;                  /* Which element defined this prefix */
+  struct ns_hash_entry *he;            /* NULL if changing default NS */
+  uint prev_ns;                                /* Previous NS ID assigned to this prefix */
+};
+
+/* Tells whether the XML_NAMESPACES flag is set in the context, i.e. whether namespaces are processed. */
+static bool
+ns_enabled(struct xml_context *ctx)
+{
+  return ctx->flags & XML_NAMESPACES;
+}
+
+/* Creates one empty hash table of ns_hash_entry nodes, allocated via xml_hash_new() on ctx->ns_pool. */
+static void *
+ns_table_new(struct xml_context *ctx)
+{
+  void *table = xml_hash_new(ctx->ns_pool, sizeof(struct ns_hash_table));
+  ns_hash_init(table);
+  return table;
+}
+
+/*
+ *  Switches on namespace processing for the given context. Does nothing
+ *  if it is already on. Otherwise it sets XML_NAMESPACES, creates the
+ *  pool and the ID array if they do not exist yet, builds both hash
+ *  tables and interns the well-known namespaces and prefixes, so that
+ *  they get the fixed IDs XML_NS_NONE, XML_NS_XMLNS and XML_NS_XML.
+ */
+void
+xml_ns_enable(struct xml_context *ctx)
+{
+  if (ns_enabled(ctx))
+    return;
+
+  TRACE(ctx, "NS: Enabling");
+  ctx->flags |= XML_NAMESPACES;
+
+  /* The pool and the ID array are created only once */
+  if (!ctx->ns_pool)
+    {
+      TRACE(ctx, "NS: Allocating data structures");
+      ctx->ns_pool = mp_new(4096);
+      GARY_INIT(ctx->ns_by_id, 16);
+    }
+
+  ctx->ns_by_name = ns_table_new(ctx);
+  ctx->ns_by_prefix = ns_table_new(ctx);
+
+  /* Intern well-known namespaces; the order determines their IDs */
+  GARY_RESIZE(ctx->ns_by_id, 0);
+  uint id_none = xml_ns_by_name(ctx, "");
+  uint id_xmlns = xml_ns_by_name(ctx, "http://www.w3.org/2000/xmlns/");
+  uint id_xml = xml_ns_by_name(ctx, "http://www.w3.org/XML/1998/namespace");
+  ASSERT(id_none == XML_NS_NONE && id_xmlns == XML_NS_XMLNS && id_xml == XML_NS_XML);
+
+  /* Intern standard prefixes; both must be new in the fresh table */
+  int fresh_xmlns, fresh_xml;
+  struct ns_hash_entry *pfx_xmlns = ns_hash_lookup(ctx->ns_by_prefix, "xmlns", &fresh_xmlns);
+  pfx_xmlns->ns = id_xmlns;
+  struct ns_hash_entry *pfx_xml = ns_hash_lookup(ctx->ns_by_prefix, "xml", &fresh_xml);
+  pfx_xml->ns = id_xml;
+  ASSERT(fresh_xmlns && fresh_xml);
+}
+
+/*
+ *  Releases everything owned by the namespace machinery: both hash
+ *  tables, the ID array and the pool. A context whose ns_pool was never
+ *  created (see xml_ns_enable()) is left untouched.
+ */
+void
+xml_ns_cleanup(struct xml_context *ctx)
+{
+  if (ctx->ns_pool)
+    {
+      TRACE(ctx, "NS: Cleanup");
+      ns_hash_cleanup(ctx->ns_by_prefix);
+      ns_hash_cleanup(ctx->ns_by_name);
+      GARY_FREE(ctx->ns_by_id);
+      mp_delete(ctx->ns_pool);
+    }
+}
+
+/*
+ *  Forgets all namespaces known so far: the ID array is shrunk to the
+ *  single entry "" and the namespace pool is flushed. Does nothing when
+ *  namespaces are not enabled.
+ *
+ *  NOTE(review): the hash tables are created on ns_pool by
+ *  xml_ns_enable(), yet ns_by_name and ns_by_prefix keep their old
+ *  values here -- confirm that they are re-created before the next use.
+ */
+void
+xml_ns_reset(struct xml_context *ctx)
+{
+  if (ns_enabled(ctx))
+    {
+      TRACE(ctx, "NS: Reset");
+      GARY_RESIZE(ctx->ns_by_id, 1);
+      ctx->ns_by_id[0] = "";
+      mp_flush(ctx->ns_pool);
+    }
+}
+
+/* Translates a namespace ID to the namespace name; an ID outside the ns_by_id array trips the ASSERT. */
+const char *
+xml_ns_by_id(struct xml_context *ctx, uint ns)
+{
+  const char **names = ctx->ns_by_id;
+  ASSERT(ns < GARY_SIZE(names));
+  return names[ns];
+}
+
+/*
+ *  Translates a namespace name to its ID. A name seen for the first
+ *  time is appended to the ns_by_id array and gets the index of the new
+ *  slot as its ID.
+ */
+uint
+xml_ns_by_name(struct xml_context *ctx, const char *name)
+{
+  int is_new;
+  struct ns_hash_entry *entry = ns_hash_lookup(ctx->ns_by_name, (char *) name, &is_new);
+  if (!is_new)
+    return entry->ns;
+
+  /* The next free slot of the array becomes the ID */
+  entry->ns = GARY_SIZE(ctx->ns_by_id);
+  ASSERT(entry->ns < ~0U);
+  *GARY_PUSH(ctx->ns_by_id) = entry->name;
+  TRACE(ctx, "NS: New namespace <%s> with ID %u", entry->name, entry->ns);
+  return entry->ns;
+}
+
+/*
+ *  Allocates a new record from ctx->stack, ties it to the current
+ *  element and puts it on top of the prefix stack.
+ *
+ *  The record starts out as a harmless "default NS" entry (he == NULL,
+ *  prev_ns == the current default), so that xml_ns_pop_element() never
+ *  reads uninitialized memory when the caller does not fill in the
+ *  remaining fields (mp_alloc() does not clear the block).
+ */
+static struct xml_ns_prefix *
+ns_push_prefix(struct xml_context *ctx)
+{
+  struct xml_ns_prefix *px = mp_alloc(ctx->stack, sizeof(*px));
+  px->prev = ctx->ns_prefix_stack;
+  px->e = ctx->node;
+  px->he = NULL;
+  px->prev_ns = ctx->ns_default;
+  ctx->ns_prefix_stack = px;
+  return px;
+}
+
+/*
+ *  Resolves the namespace of a possibly prefixed name.
+ *
+ *  Without a colon, the name is left alone and default_ns is returned.
+ *  With a known prefix, *namep is advanced past the colon to the local
+ *  part and the namespace ID bound to the prefix is returned. An
+ *  unknown prefix (or one currently bound to ID 0) is reported by
+ *  xml_error(); the name is left alone and 0 is returned.
+ */
+static uint
+ns_resolve(struct xml_context *ctx, char **namep, uint default_ns)
+{
+  char *qname = *namep;
+  char *sep = strchr(qname, ':');
+  if (!sep)
+    return default_ns;
+
+  /* Cut the string at the colon just for the duration of the lookup */
+  *sep = 0;
+  struct ns_hash_entry *he = ns_hash_find(ctx->ns_by_prefix, qname);
+  *sep = ':';
+
+  if (!he || !he->ns)
+    {
+      xml_error(ctx, "Unknown namespace prefix for %s", qname);
+      return 0;
+    }
+
+  *namep = sep + 1;
+  return he->ns;
+}
+
+/*
+ *  Called for the current element (ctx->node) once all its attributes
+ *  have been parsed. With namespaces disabled, it only sets e->ns to 0.
+ *
+ *  Otherwise it first processes namespace declarations among the
+ *  attributes: xmlns="..." changes the default namespace, xmlns:p="..."
+ *  binds the prefix p. Every change is recorded on the prefix stack, so
+ *  that xml_ns_pop_element() can undo it. Then the names of the element
+ *  and of all its attributes are resolved; unprefixed attributes get
+ *  namespace 0, an unprefixed element gets the default namespace.
+ *
+ *  Only the exact name "xmlns" and names of the form "xmlns:<prefix>"
+ *  count as declarations. Other attributes whose name merely starts
+ *  with "xmlns" are ordinary attributes, and "xmlns:" with an empty
+ *  prefix is reported as an error and otherwise ignored.
+ */
+void xml_ns_push_element(struct xml_context *ctx)
+{
+  struct xml_node *e = ctx->node;
+  if (!ns_enabled(ctx))
+    {
+      e->ns = 0;
+      return;
+    }
+
+  /* Scan attributes for prefix definitions */
+  XML_ATTR_FOR_EACH(a, e)
+    if (!strncmp(a->name, "xmlns", 5))        /* strncmp() never reads past the end of a short name */
+      {
+        char *rest = a->name + 5;
+        if (!rest[0])
+          {
+            /* New default NS */
+            struct xml_ns_prefix *px = ns_push_prefix(ctx);
+            uint ns = xml_ns_by_name(ctx, a->val);
+            px->he = NULL;
+            px->prev_ns = ctx->ns_default;
+            ctx->ns_default = ns;
+            TRACE(ctx, "New default NS -> ID %u", ns);
+          }
+        else if (rest[0] == ':' && rest[1])
+          {
+            /* New NS prefix */
+            struct xml_ns_prefix *px = ns_push_prefix(ctx);
+            uint ns = xml_ns_by_name(ctx, a->val);
+            int new_p;
+            struct ns_hash_entry *he = ns_hash_lookup(ctx->ns_by_prefix, rest + 1, &new_p);
+            if (new_p)
+              he->ns = 0;
+            px->he = he;
+            px->prev_ns = he->ns;
+            he->ns = ns;
+            TRACE(ctx, "NS: New prefix <%s> -> ID %u", he->name, he->ns);
+          }
+        else if (rest[0] == ':')
+          {
+            /* Nothing is pushed, so there is nothing to undo later */
+            xml_error(ctx, "Empty namespace prefix in attribute %s", a->name);
+          }
+        /* Otherwise the name only starts with "xmlns" and declares nothing */
+      }
+
+  /* Resolve namespaces */
+  e->ns = ns_resolve(ctx, &e->name, ctx->ns_default);
+  XML_ATTR_FOR_EACH(a, e)
+    a->ns = ns_resolve(ctx, &a->name, 0);
+}
+
+/*
+ *  Undoes all namespace declarations made by the current element
+ *  (ctx->node): records belonging to it are taken from the top of the
+ *  prefix stack one by one and the previous binding of the prefix, or
+ *  the previous default namespace, is put back. Does nothing when
+ *  namespaces are disabled.
+ */
+void xml_ns_pop_element(struct xml_context *ctx)
+{
+  if (!ns_enabled(ctx))
+    return;
+
+  struct xml_ns_prefix *top = ctx->ns_prefix_stack;
+  while (top && top->e == ctx->node)
+    {
+      struct ns_hash_entry *he = top->he;
+      if (!he)
+        {
+          TRACE(ctx, "NS: Restoring default NS -> ID %u", top->prev_ns);
+          ctx->ns_default = top->prev_ns;
+        }
+      else
+        {
+          TRACE(ctx, "NS: Restoring prefix <%s> -> ID %u", he->name, top->prev_ns);
+          he->ns = top->prev_ns;
+        }
+      top = top->prev;
+      ctx->ns_prefix_stack = top;
+    }
+}
diff --git a/ucw-xml/parse.c b/ucw-xml/parse.c

index 3402b7324f0a959ed1357a8b6c003302174daf69..48ad70218ce93d9ac6205d4dc6aec0aba9224272 100644 (file)
--- a/ucw-xml/parse.c
+++ b/ucw-xml/parse.c
@@ -2,6 +2,7 @@
   *     UCW Library -- A simple XML parser
   *
   *     (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
+ *     (c) 2015 Martin Mares <mj@ucw.cz>
   *
   *     This software may be freely distributed and used according to the terms
   *     of the GNU Lesser General Public License.
@@ -58,7 +59,8 @@ xml_parse_eq(struct xml_context *ctx)
  static char *
  xml_parse_string(struct xml_context *ctx, struct mempool *pool, uint first_cat, uint next_cat, char *err)
  {
-  char *p = mp_start_noalign(pool, 1);
+  char *p = mp_start_noalign(pool, 2);
+  *p++ = '<';          /* We always prepend a '<', so we can seek backwards in the string */
    if (unlikely(!(xml_peek_cat(ctx) & first_cat)))
      xml_fatal(ctx, "%s", err);
    do
@@ -68,7 +70,7 @@ xml_parse_string(struct xml_context *ctx, struct mempool *pool, uint first_cat,
      }
    while (xml_peek_cat(ctx) & next_cat);
    *p++ = 0;
-  return mp_end(pool, p);
+  return mp_end(pool, p) + 1;
  }
  
  static void
@@ -587,21 +589,22 @@ xml_normalize_white(struct xml_context *ctx UNUSED, char *text)
  struct xml_attrs_table;
  
  static inline uint
-xml_attrs_hash(struct xml_attrs_table *t UNUSED, struct xml_node *e, char *n)
+xml_attrs_hash(struct xml_attrs_table *t UNUSED, struct xml_node *e, uint ns, char *n)
  {
-  return hash_pointer(e) ^ hash_string(n);
+  return hash_pointer(e) ^ hash_string(n) ^ hash_u32(ns);
  }
  
  static inline int
-xml_attrs_eq(struct xml_attrs_table *t UNUSED, struct xml_node *e1, char *n1, struct xml_node *e2, char *n2)
+xml_attrs_eq(struct xml_attrs_table *t UNUSED, struct xml_node *e1, uint ns1, char *n1, struct xml_node *e2, uint ns2, char *n2)
  {
-  return (e1 == e2) && !strcmp(n1, n2);
+  return (e1 == e2) && (ns1 == ns2) && !strcmp(n1, n2);
  }
  
  static inline void
-xml_attrs_init_key(struct xml_attrs_table *t UNUSED, struct xml_attr *a, struct xml_node *e, char *name)
+xml_attrs_init_key(struct xml_attrs_table *t UNUSED, struct xml_attr *a, struct xml_node *e, uint ns, char *name)
  {
    a->elem = e;
+  a->ns = ns;
    a->name = name;
    a->val = NULL;
    a->user = NULL;
@@ -610,8 +613,8 @@ xml_attrs_init_key(struct xml_attrs_table *t UNUSED, struct xml_attr *a, struct
  
  #define HASH_PREFIX(x) xml_attrs_##x
  #define HASH_NODE struct xml_attr
-#define HASH_KEY_COMPLEX(x) x elem, x name
-#define HASH_KEY_DECL struct xml_node *elem, char *name
+#define HASH_KEY_COMPLEX(x) x elem, x ns, x name
+#define HASH_KEY_DECL struct xml_node *elem, uint ns, char *name
  #define HASH_TABLE_DYNAMIC
  #define HASH_GIVE_EQ
  #define HASH_GIVE_HASHFN
@@ -631,7 +634,8 @@ xml_parse_attr(struct xml_context *ctx)
    /* Attribute ::= Name Eq AttValue */
    struct xml_node *e = ctx->node;
    char *n = xml_parse_name(ctx, ctx->pool);
-  struct xml_attr *a = xml_attrs_lookup(ctx->tab_attrs, e, n);
+  // FIXME: This is wrong! This way, we never find attributes in a non-default NS.
+  struct xml_attr *a = xml_attrs_lookup(ctx->tab_attrs, e, 0, n);
    xml_parse_eq(ctx);
    char *v = xml_parse_attr_value(ctx, NULL);
    if (a->val)
@@ -651,13 +655,19 @@ xml_parse_attr(struct xml_context *ctx)
  struct xml_attr *
  xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name)
  {
-  return xml_attrs_find(ctx->tab_attrs, node, name);
+  return xml_attrs_find(ctx->tab_attrs, node, 0, name);
+}
+
+struct xml_attr *
+xml_attr_find_ns(struct xml_context *ctx, struct xml_node *node, uint ns, char *name)
+{
+  return xml_attrs_find(ctx->tab_attrs, node, ns, name);
  }
  
  char *
  xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name)
  {
-  struct xml_attr *attr = xml_attrs_find(ctx->tab_attrs, node, name);
+  struct xml_attr *attr = xml_attrs_find(ctx->tab_attrs, node, 0, name);
    if (attr)
      return attr->val;
    if (!node->dtd)
@@ -678,6 +688,15 @@ xml_attrs_table_cleanup(struct xml_context *ctx)
    xml_attrs_cleanup(ctx->tab_attrs);
  }
  
+/*
+ *  Returns the qualified name of an attribute, i.e. including the
+ *  namespace prefix if there is any. It walks backwards from the local
+ *  name to the '<' which xml_parse_string() stores in front of every
+ *  string it produces.
+ */
+char *
+xml_attr_qname(struct xml_context *ctx UNUSED, struct xml_attr *attr)
+{
+  char *start;
+  for (start = attr->name; start[-1] != '<'; start--)
+    ;
+  return start;
+}
+
  /*** Elements ***/
  
  static uint
@@ -705,12 +724,14 @@ xml_push_element(struct xml_context *ctx)
    e->type = XML_NODE_ELEM;
    e->name = xml_parse_name(ctx, ctx->pool);
    slist_init(&e->attrs);
+
    if (!e->parent)
      {
        ctx->dom = e;
        if (ctx->doctype && strcmp(e->name, ctx->doctype))
         xml_error(ctx, "The root element <%s> does not match the document type <%s>", e->name, ctx->doctype);
      }
+
    if (!ctx->dtd)
      e->dtd = NULL;
    else if (!(e->dtd = xml_dtd_find_elem(ctx, e->name)))
@@ -732,6 +753,7 @@ xml_push_element(struct xml_context *ctx)
               xml_error(ctx, "Unexpected element <%s>", e->name);
           }
      }
+
    while (1)
      {
        uint white = xml_parse_white(ctx, 0);
@@ -749,16 +771,20 @@ xml_push_element(struct xml_context *ctx)
        xml_unget_char(ctx);
        xml_parse_attr(ctx);
      }
+
+  xml_ns_push_element(ctx);
+
+  /* FIXME: DTD logic is not namespace-aware */
    if (e->dtd)
      SLIST_FOR_EACH(struct xml_dtd_attr *, a, e->dtd->attrs)
        if (a->default_mode == XML_ATTR_REQUIRED)
          {
-         if (!xml_attrs_find(ctx->tab_attrs, e, a->name))
+         if (!xml_attrs_find(ctx->tab_attrs, e, 0, a->name))
             xml_error(ctx, "Missing required attribute %s in element <%s>", a->name, e->name);
         }
        else if (a->default_mode != XML_ATTR_IMPLIED && ctx->flags & XML_ALLOC_DEFAULT_ATTRS)
          {
-         struct xml_attr *attr = xml_attrs_lookup(ctx->tab_attrs, e, a->name);
+         struct xml_attr *attr = xml_attrs_lookup(ctx->tab_attrs, e, 0, a->name);
           if (!attr->val)
             attr->val = a->default_value;
         }
@@ -772,6 +798,9 @@ xml_pop_element(struct xml_context *ctx)
    TRACE(ctx, "pop_element");
    if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_etag)
      ctx->h_etag(ctx);
+
+  xml_ns_pop_element(ctx);
+
    struct xml_node *e = ctx->node;
    uint free = !(ctx->flags & XML_ALLOC_TAGS);
    if (free)
@@ -793,6 +822,7 @@ xml_pop_element(struct xml_context *ctx)
           clist_remove(&n->n);
         }
      }
+
    xml_pop_dom(ctx, free);
    xml_dec(ctx);
  }
@@ -804,7 +834,7 @@ xml_parse_etag(struct xml_context *ctx)
    * Already parsed: '<' */
    struct xml_node *e = ctx->node;
    ASSERT(e);
-  char *n = e->name;
+  char *n = xml_node_qname(ctx, e);
    while (*n)
      {
        uint c;
@@ -822,6 +852,16 @@ recover:
    xml_dec(ctx);
  }
  
+/*
+ *  Returns the qualified name of an element node, i.e. including the
+ *  namespace prefix if there is any. It walks backwards from the local
+ *  name to the '<' which xml_parse_string() stores in front of every
+ *  string it produces. Asserts that the node is an element.
+ */
+char *
+xml_node_qname(struct xml_context *ctx UNUSED, struct xml_node *node)
+{
+  ASSERT(node->type == XML_NODE_ELEM);
+  char *start;
+  for (start = node->name; start[-1] != '<'; start--)
+    ;
+  return start;
+}
+
  /*** Document type declaration ***/
  
  static void
diff --git a/ucw-xml/xml-test.c b/ucw-xml/xml-test.c

index 758b037370e2d2930cea8bef47bb76457c452679..7b7fcd1c641e8d218503876cfc841fb9587334af 100644 (file)
--- a/ucw-xml/xml-test.c
+++ b/ucw-xml/xml-test.c
@@ -12,6 +12,7 @@
  #include <ucw-xml/dtd.h>
  #include <ucw/getopt.h>
  #include <ucw/fastbuf.h>
+#include <ucw/gary.h>
  
  #include <stdio.h>
  #include <stdlib.h>
@@ -25,21 +26,24 @@ enum {
    WANT_REPORT_BLOCKS,
    WANT_REPORT_IGNORABLE,
    WANT_FILE_ENTITIES,
+  WANT_QNAMES,
  };
  
-static char *shortopts = "spdt" CF_SHORT_OPTS;
+static char *shortopts = "spdtn" CF_SHORT_OPTS;
  static struct option longopts[] = {
    CF_LONG_OPTS
    { "sax",             0, 0, 's' },
    { "pull",            0, 0, 'p' },
    { "dom",             0, 0, 't' },
    { "dtd",             0, 0, 'd' },
+  { "namespaces",      0, 0, 'n' },
    { "hide-errors",     0, 0, WANT_HIDE_ERRORS },
    { "ignore-comments", 0, 0, WANT_IGNORE_COMMENTS },
    { "ignore-pis",      0, 0, WANT_IGNORE_PIS },
    { "report-blocks",   0, 0, WANT_REPORT_BLOCKS },
    { "report-ignorable",        0, 0, WANT_REPORT_IGNORABLE },
    { "file-entities",   0, 0, WANT_FILE_ENTITIES },
+  { "qnames",          0, 0, WANT_QNAMES },
    { NULL,              0, 0, 0 }
  };
  
@@ -56,12 +60,14 @@ CF_USAGE
  -s, --sax               Test SAX interface\n\
  -t, --dom               Test DOM interface\n\
  -d, --dtd               Enable parsing of DTD\n\
+-n, --namespaces       Resolve namespaces\n\
      --hide-errors       Hide warnings and error messages\n\
      --ignore-comments   Ignore comments\n\
      --ignore-pis        Ignore processing instructions\n\
      --report-blocks    Report blocks or characters and CDATA sections\n\
      --report-ignorable  Report ignorable whitespace\n\
      --file-entities     Resolve file external entities (not fully normative)\n\
+    --qnames           Display qualified names including namespace prefixes\n\
  \n", stderr);
    exit(1);
  }
@@ -69,6 +75,7 @@ CF_USAGE
  static uint want_sax;
  static uint want_pull;
  static uint want_dom;
+static uint want_ns;
  static uint want_parse_dtd;
  static uint want_hide_errors;
  static uint want_ignore_comments;
@@ -76,6 +83,7 @@ static uint want_ignore_pis;
  static uint want_report_blocks;
  static uint want_report_ignorable;
  static uint want_file_entities;
+static uint want_qnames;
  
  static struct fastbuf *out;
  
@@ -93,14 +101,20 @@ node_type(struct xml_node *node)
  }
  
  static void
-show_node(struct xml_node *node)
+show_node(struct xml_context *ctx, struct xml_node *node)
  {
    switch (node->type)
      {
        case XML_NODE_ELEM:
-       bprintf(out, " <%s>", node->name);
+       if (want_ns)
+         bprintf(out, " (ns%u)<%s>", node->ns, (want_qnames ? xml_node_qname(ctx, node) : node->name));
+       else
+         bprintf(out, " <%s>", node->name);
          XML_ATTR_FOR_EACH(a, node)
-          bprintf(out, " %s='%s'", a->name, a->val);
+         if (want_ns)
+           bprintf(out, " (ns%u)%s='%s'", a->ns, (want_qnames ? xml_attr_qname(ctx, a) : a->name), a->val);
+         else
+           bprintf(out, " %s='%s'", a->name, a->val);
         bputc(out, '\n');
         break;
        case XML_NODE_COMMENT:
@@ -118,7 +132,7 @@ show_node(struct xml_node *node)
  }
  
  static void
-show_tree(struct xml_node *node, uint level)
+show_tree(struct xml_context *ctx, struct xml_node *node, uint level)
  {
    if (!node)
      return;
@@ -126,10 +140,10 @@ show_tree(struct xml_node *node, uint level)
    for (uint i = 0; i < level; i++)
      bputs(out, "    ");
    bputs(out, node_type(node));
-  show_node(node);
+  show_node(ctx, node);
    if (node->type == XML_NODE_ELEM)
      XML_NODE_FOR_EACH(son, node)
-      show_tree(son, level + 1);
+      show_tree(ctx, son, level + 1);
  }
  
  static void
@@ -168,21 +182,21 @@ static void
  h_comment(struct xml_context *ctx)
  {
    bputs(out, "SAX:  comment");
-  show_node(ctx->node);
+  show_node(ctx, ctx->node);
  }
  
  static void
  h_pi(struct xml_context *ctx)
  {
    bputs(out, "SAX:  pi");
-  show_node(ctx->node);
+  show_node(ctx, ctx->node);
  }
  
  static void
  h_stag(struct xml_context *ctx)
  {
    bputs(out, "SAX:  stag");
-  show_node(ctx->node);
+  show_node(ctx, ctx->node);
  }
  
  static void
@@ -195,7 +209,7 @@ static void
  h_chars(struct xml_context *ctx)
  {
    bputs(out, "SAX:  chars");
-  show_node(ctx->node);
+  show_node(ctx, ctx->node);
  }
  
  static void
@@ -255,6 +269,9 @@ main(int argc, char **argv)
         case 'd':
           want_parse_dtd++;
           break;
+       case 'n':
+         want_ns++;
+         break;
         case WANT_HIDE_ERRORS:
           want_hide_errors++;
           break;
@@ -273,6 +290,9 @@ main(int argc, char **argv)
         case WANT_FILE_ENTITIES:
           want_file_entities++;
           break;
+       case WANT_QNAMES:
+         want_qnames++;
+         break;
         default:
           usage();
        }
@@ -315,6 +335,8 @@ main(int argc, char **argv)
      ctx.flags &= ~(XML_REPORT_PIS | XML_ALLOC_PIS);
    if (want_file_entities)
      ctx.h_resolve_entity = h_resolve_entity;
+  if (want_ns)
+    xml_ns_enable(&ctx);
    xml_push_fastbuf(&ctx, bfdopen_shared(0, 4096));
    bputs(out, "PULL: start\n");
    if (want_pull)
@@ -326,22 +348,22 @@ main(int argc, char **argv)
           {
             case XML_STATE_CHARS:
               bputs(out, "PULL: chars");
-             show_node(ctx.node);
+             show_node(&ctx, ctx.node);
               break;
             case XML_STATE_STAG:
               bputs(out, "PULL: stag");
-             show_node(ctx.node);
+             show_node(&ctx, ctx.node);
               break;
             case XML_STATE_ETAG:
               bprintf(out, "PULL: etag </%s>\n", ctx.node->name);
               break;
             case XML_STATE_COMMENT:
               bputs(out, "PULL: comment");
-             show_node(ctx.node);
+             show_node(&ctx, ctx.node);
               break;
             case XML_STATE_PI:
               bputs(out, "PULL: pi");
-             show_node(ctx.node);
+             show_node(&ctx, ctx.node);
               break;
             default:
               bputs(out, "PULL: unknown\n");
@@ -356,7 +378,14 @@ main(int argc, char **argv)
      {
        bputs(out, "PULL: eof\n");
        if (want_dom)
-       show_tree(ctx.dom, 0);
+       show_tree(&ctx, ctx.dom, 0);
+    }
+
+  if (want_ns)
+    {
+      bputs(out, "Known namespaces:\n");
+      for (uns i=0; i < GARY_SIZE(ctx.ns_by_id); i++)
+       bprintf(out, "%u\t%s\n", i, ctx.ns_by_id[i]);
      }
  
    xml_cleanup(&ctx);
diff --git a/ucw-xml/xml.h b/ucw-xml/xml.h

index c048f56cab8a44acb09a0c43af03a2d542f5184a..0c335c65205196860521b4cbbdc5fc147292117a 100644 (file)
--- a/ucw-xml/xml.h
+++ b/ucw-xml/xml.h
@@ -2,6 +2,7 @@
   *     UCW Library -- A simple XML parser
   *
   *     (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
+ *     (c) 2015 Martin Mares <mj@ucw.cz>
   *
   *     This software may be freely distributed and used according to the terms
   *     of the GNU Lesser General Public License.
@@ -104,6 +105,7 @@ enum xml_flags {
    XML_PARSE_DTD =                      0x00000200,     /* Enable parsing of DTD */
    XML_NO_CHARS =                       0x00000400,     /* The current element must not contain character data (filled automaticaly if using DTD) */
    XML_ALLOC_DEFAULT_ATTRS =            0x00000800,     /* Allocate default attribute values so they can be found by XML_ATTR_FOR_EACH */
+  XML_NAMESPACES =                     0x00001000,     /* Parse namespaces, use xml_ns_enable() to set this */
  
    /* Internals, do not change! */
    XML_EMPTY_ELEM_TAG =                 0x00010000,     /* The current element match EmptyElemTag */
@@ -131,6 +133,14 @@ struct xml_node {
    cnode n;                                             /* Node for list of parent's sons */
    uint type;                                           /* XML_NODE_x */
    struct xml_node *parent;                             /* Parent node */
+  /*
+   *  If namespaces are enabled, node->name points to the local part of the name
+   *  and node->ns is the resolved namespace ID.
+   *
+   *  However, the namespace prefix is kept in memory just before the local part,
+   *  so you can use xml_node_qname() to find out the full qualified name.
+   *  The same applies to attributes, but the function is xml_attr_qname().
+   */
    char *name;                                          /* Element name / PI target */
    clist sons;                                          /* Children nodes */
    union {
@@ -139,6 +149,7 @@ struct xml_node {
        uint len;                                                /* Text length in bytes */
      };
      struct {
+      uint ns;                                         /* Namespace ID */
        struct xml_dtd_elem *dtd;                                /* Element DTD */
        slist attrs;                                     /* Link list of element attributes */
      };
@@ -150,7 +161,8 @@ struct xml_attr {
    snode n;                                             /* Node for elem->attrs */
    struct xml_node *elem;                               /* Parent element */
    struct xml_dtd_attr *dtd;                            /* Attribute DTD */
-  char *name;                                          /* Attribute name */
+  uint ns;                                             /* Namespace ID */
+  char *name;                                          /* Attribute name without NS prefix */
    char *val;                                           /* Attribute value */
    void *user;                                          /* User-defined (initialized to NULL) */
  };
@@ -228,6 +240,14 @@ struct xml_context {
    struct xml_node *dom;                                        /* DOM root */
    struct xml_node *node;                               /* Current DOM node */
  
+  /* Namespaces */
+  struct mempool *ns_pool;                             /* Memory pool for NS definitions */
+  const char **ns_by_id;                               /* A growing array translating NS IDs to their names */
+  void *ns_by_name;                                    /* Hash table translating NS names to their IDs */
+  void *ns_by_prefix;                                  /* Hash table translating current prefixes to NS IDs, allocated from ns_pool */
+  struct xml_ns_prefix *ns_prefix_stack;               /* A stack of prefix definitions, allocated from xml->stack */
+  uint ns_default;                                     /* Current default namespace */
+
    char *version_str;
    uint standalone;
    char *doctype;                                       /* The document type (or NULL if unknown) */
@@ -265,9 +285,18 @@ uint xml_skip_element(struct xml_context *ctx);
  /* Returns the current row number in the document entity */
  uint xml_row(struct xml_context *ctx);
  
+/* Finds a qualified name (including namespace prefix) of a given element node. */
+char *xml_node_qname(struct xml_context *ctx, struct xml_node *node);
+
+/* Finds a qualified name (including namespace prefix) of a given attribute. */
+char *xml_attr_qname(struct xml_context *ctx, struct xml_attr *node);
+
  /* Finds a given attribute value in a XML_NODE_ELEM node */
  struct xml_attr *xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name);
  
+/* The same, but namespace-aware */
+struct xml_attr *xml_attr_find_ns(struct xml_context *ctx, struct xml_node *node, uint ns, char *name);
+
  /* Similar to xml_attr_find, but it deals also with default values */
  char *xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name);
  
@@ -291,4 +320,18 @@ void xml_warn(struct xml_context *ctx, const char *format, ...);
  void xml_error(struct xml_context *ctx, const char *format, ...);
  void NONRET xml_fatal(struct xml_context *ctx, const char *format, ...);
  
+/* Request processing of namespaces */
+void xml_ns_enable(struct xml_context *ctx);
+
+/* Looks up namespace by its ID, dies on an invalid ID */
+const char *xml_ns_by_id(struct xml_context *ctx, uint ns);
+
+/* Looks up namespace by its name and returns its ID. Creates a new ID if necessary. */
+uint xml_ns_by_name(struct xml_context *ctx, const char *name);
+
+/* Well-known namespaces */
+#define XML_NS_NONE            0       /* This element has no namespace */
+#define XML_NS_XMLNS           1       /* xmlns: */
+#define XML_NS_XML             2       /* xml: */
+
  #endif
author	Martin Mares <mj@ucw.cz>
	Thu, 12 Feb 2015 22:12:03 +0000 (23:12 +0100)
committer	Martin Mares <mj@ucw.cz>
	Thu, 12 Feb 2015 22:12:03 +0000 (23:12 +0100)
ucw-xml/Makefile		patch \| blob \| history
ucw-xml/common.c		patch \| blob \| history
ucw-xml/internals.h		patch \| blob \| history
ucw-xml/ns.c	[new file with mode: 0644]	patch \| blob
ucw-xml/parse.c		patch \| blob \| history
ucw-xml/xml-test.c		patch \| blob \| history
ucw-xml/xml.h		patch \| blob \| history