+++ /dev/null
-# Makefile for the XML parser
-# (c) 2007 Pavel Charvat <pchar@ucw.cz>
-
-# Add this directory and its xml-test program to the DIRS and PROGS lists
-DIRS+=sherlock/xml
-PROGS+=$(o)/sherlock/xml/xml-test
-
-# Modules that make up libshxml and the headers listed as its includes
-LIBSHXML_MODS=common source parse dtd
-LIBSHXML_INCLUDES=xml.h dtd.h
-
-# Suffix-less module paths under the $(o) directory
-LIBSHXML_MOD_PATHS=$(addprefix $(o)/sherlock/xml/,$(LIBSHXML_MODS))
-
-# The .a archive depends on the .o objects, the .so on the .oo objects
-$(o)/sherlock/xml/libshxml.a: $(addsuffix .o,$(LIBSHXML_MOD_PATHS))
-$(o)/sherlock/xml/libshxml.so: $(addsuffix .oo,$(LIBSHXML_MOD_PATHS))
-$(o)/sherlock/xml/libshxml.pc: $(LIBSH) $(LIBCHARSET)
-
-# Every library object (.o and .oo) depends on the generated unicat.h.
-# The target list is derived from LIBSHXML_MOD_PATHS, so a module added to
-# LIBSHXML_MODS picks up this dependency without another hand-written line.
-$(addsuffix .o,$(LIBSHXML_MOD_PATHS)) $(addsuffix .oo,$(LIBSHXML_MOD_PATHS)): $(o)/sherlock/xml/unicat.h
-
-# unicat.pl is invoked with the names of both unicat.h and unicat.c;
-# only the header is the rule's target and it is touched afterwards.
-$(o)/sherlock/xml/unicat.h: $(s)/sherlock/xml/unicat.pl
- $(M)GEN $(addprefix $(o)/sherlock/xml/unicat,.h .c)
- $(Q)$< $(addprefix $(o)/sherlock/xml/unicat,.h .c)
- $(Q)touch $@
-
-# Test entry for xml-test; the program depends on xml-test.o and $(LIBSHXML)
-TESTS+=$(o)/sherlock/xml/xml-test.test
-$(o)/sherlock/xml/xml-test: $(o)/sherlock/xml/xml-test.o $(LIBSHXML)
-$(o)/sherlock/xml/xml-test.test: $(o)/sherlock/xml/xml-test
-
-# API registration: library name, include stamp and pkg-config file.
-# IDST is set only for the include stamp target (presumably the include
-# destination subdirectory -- TODO confirm against the rule that uses it).
-API_LIBS+=libshxml
-API_INCLUDES+=$(o)/sherlock/xml/.include-stamp
-$(o)/sherlock/xml/.include-stamp: $(addprefix $(s)/sherlock/xml/,$(LIBSHXML_INCLUDES))
-$(o)/sherlock/xml/.include-stamp: IDST=sherlock/xml
-run/lib/pkgconfig/libshxml.pc: $(o)/sherlock/xml/libshxml.pc
-
-# Install the headers, the pkg-config file and the library from run/ into
-# the INSTALL_*_DIR directories below $(DESTDIR); directories are created first.
-INSTALL_TARGETS+=install-sh-xml
-install-sh-xml:
- install -d -m 755 $(DESTDIR)$(INSTALL_INCLUDE_DIR)/sherlock/xml $(DESTDIR)$(INSTALL_LIB_DIR) $(DESTDIR)$(INSTALL_PKGCONFIG_DIR)
- install -m 644 $(addprefix run/include/sherlock/xml/,$(LIBSHXML_INCLUDES)) $(DESTDIR)$(INSTALL_INCLUDE_DIR)/sherlock/xml
- install -m 644 run/lib/pkgconfig/libshxml.pc $(DESTDIR)$(INSTALL_PKGCONFIG_DIR)
- install -m 644 run/lib/libshxml.$(LS) $(DESTDIR)$(INSTALL_LIB_DIR)
-
-# install-sh-xml names a command, not a file
-.PHONY: install-sh-xml
+++ /dev/null
-Non-normative / not-implemented:
--- introduce numeric error codes
--- cycle detection in internal entities (and possibly external?)
--- conditional sections in DTD
--- validation of elements (regular expressions, non-cdata)
--- validation of attributes (unfinished)
--- notations
--- URI normalization
--- support for xml:space
--- support for xml:lang
--- full support for standalone documents
--- Unicode normalization
-
-Optimizations:
--- detect definitions of trivial entities
+++ /dev/null
-/*
- * Sherlock Library -- A simple XML parser
- *
- * (c) 2007 Pavel Charvat <pchar@ucw.cz>
- *
- * This software may be freely distributed and used according to the terms
- * of the GNU Lesser General Public License.
- */
-
-#undef LOCAL_DEBUG
-
-#include "sherlock/sherlock.h"
-#include "sherlock/xml/xml.h"
-#include "sherlock/xml/dtd.h"
-#include "sherlock/xml/internals.h"
-#include "ucw/stkstring.h"
-#include "ucw/ff-unicode.h"
-
-#include <setjmp.h>
-
-/*** Error handling ***/
-
-/* Jump to the handler stored in ctx->throw_buf, passing ctx->err_code as the
- * longjmp() value.  Both an error code and a jump buffer must be set. */
-void NONRET
-xml_throw(struct xml_context *ctx)
-{
-  jmp_buf *env = (jmp_buf *)ctx->throw_buf;
-  ASSERT(ctx->err_code && ctx->throw_buf);
-  longjmp(*env, ctx->err_code);
-}
-
-/* Report a warning through the h_warn callback; does nothing when no callback
- * is installed.  The formatted message and XML_ERR_WARN are stored in
- * ctx->err_msg / ctx->err_code for the duration of the callback and are
- * reset afterwards. */
-void
-xml_warn(struct xml_context *ctx, const char *format, ...)
-{
-  if (!ctx->h_warn)
-    return;
-  va_list ap;
-  va_start(ap, format);
-  ctx->err_msg = stk_vprintf(format, ap);
-  ctx->err_code = XML_ERR_WARN;
-  va_end(ap);
-  ctx->h_warn(ctx);
-  ctx->err_code = XML_ERR_OK;
-  ctx->err_msg = NULL;
-}
-
-void
-xml_error(struct xml_context *ctx, const char *format, ...)
-{
- if (ctx->h_error)
- {
- va_list args;
- va_start(args, format);
- ctx->err_msg = stk_vprintf(format, args);
- ctx->err_code = XML_ERR_ERROR;
- va_end(args);
- ctx->h_error(ctx);
- ctx->err_msg = NULL;
- ctx->err_code = XML_ERR_OK;
- }
-}
-
-/*
- * Report a fatal error and leave through xml_throw(); never returns.
- * The message is formatted into ctx->stack, the error code is set to
- * XML_ERR_FATAL and the state to XML_STATE_EOF.  Unlike xml_warn() and
- * xml_error(), the message and code are left set in the context.
- */
-void NONRET
-xml_fatal(struct xml_context *ctx, const char *format, ...)
-{
- va_list args;
- va_start(args, format);
- ctx->err_msg = mp_vprintf(ctx->stack, format, args);
- ctx->err_code = XML_ERR_FATAL;
- ctx->state = XML_STATE_EOF;
- va_end(args);
- /* The handler is optional; the throw happens regardless */
- if (ctx->h_fatal)
- ctx->h_fatal(ctx);
- xml_throw(ctx);
-}
-
-/*** Memory management ***/
-
-/*
- * Allocate a zeroed hash table structure of the given size from the pool.
- * The allocation is preceded by a header of XML_HASH_HDR_SIZE bytes whose
- * first word stores the pool pointer; the returned pointer points just past
- * that header.
- */
-void *
-xml_hash_new(struct mempool *pool, uns size)
-{
-  /* Use a byte pointer: arithmetic on void * is a GNU extension */
-  char *tab = mp_alloc_zero(pool, size + XML_HASH_HDR_SIZE);
-  *(void **)tab = pool;
-  return tab + XML_HASH_HDR_SIZE;
-}
-
-/*** Initialization ***/
-
-/* Initial contents of a context; copied into *ctx by xml_init() and xml_reset() */
-static struct xml_context xml_defaults = {
- .flags = XML_SRC_EOF | XML_REPORT_ALL,
- .state = XML_STATE_START,
- .h_resolve_entity = xml_def_resolve_entity,
- .chars = {
- .name = "<xml_chars>",
- .spout = xml_spout_chars,
- .can_overwrite_buffer = 1,
- },
-};
-
-/* Initialization shared by xml_init() and xml_reset(): sets up the attributes table */
-static void
-xml_do_init(struct xml_context *ctx)
-{
- xml_attrs_table_init(ctx);
-}
-
-/*
- * Initialize a context: copy xml_defaults into it and create its two
- * memory pools (ctx->pool and ctx->stack, each via mp_new(65536)).
- * Any previous contents of *ctx are overwritten.
- */
-void
-xml_init(struct xml_context *ctx)
-{
- *ctx = xml_defaults;
- ctx->pool = mp_new(65536);
- ctx->stack = mp_new(65536);
- xml_do_init(ctx);
- TRACE(ctx, "init");
-}
-
-/*
- * Release everything owned by the context: the attributes table, the DTD,
- * the sources and finally both memory pools.  The context structure itself
- * is not freed here.
- */
-void
-xml_cleanup(struct xml_context *ctx)
-{
- TRACE(ctx, "cleanup");
- xml_attrs_table_cleanup(ctx);
- xml_dtd_cleanup(ctx);
- xml_sources_cleanup(ctx);
- mp_delete(ctx->pool);
- mp_delete(ctx->stack);
-}
-
-/*
- * Return the context to its freshly initialized state while reusing its
- * two memory pools: they are flushed instead of deleted and re-attached
- * after the context is overwritten with xml_defaults.
- */
-void
-xml_reset(struct xml_context *ctx)
-{
- TRACE(ctx, "reset");
- /* Remember the pools, the assignment from xml_defaults below clears them */
- struct mempool *pool = ctx->pool, *stack = ctx->stack;
- xml_attrs_table_cleanup(ctx);
- xml_dtd_cleanup(ctx);
- xml_sources_cleanup(ctx);
- mp_flush(pool);
- mp_flush(stack);
- *ctx = xml_defaults;
- ctx->pool = pool;
- ctx->stack = stack;
- xml_do_init(ctx);
-}
+++ /dev/null
-/*
- * Sherlock Library -- A simple XML parser
- *
- * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
- *
- * This software may be freely distributed and used according to the terms
- * of the GNU Lesser General Public License.
- */
-
-#undef LOCAL_DEBUG
-
-#include "sherlock/sherlock.h"
-#include "sherlock/xml/xml.h"
-#include "sherlock/xml/dtd.h"
-#include "sherlock/xml/internals.h"
-#include "ucw/fastbuf.h"
-#include "ucw/ff-unicode.h"
-#include "ucw/unicode.h"
-
-/* Notations */
-
-/* Instantiate ucw/hashtable.h as xml_dtd_notns_*: nodes are struct xml_dtd_notn
- * keyed by the string member `name', new nodes are zero-filled, and the
- * find and lookup operations are requested.  Allocation is supplied by the
- * XML_HASH_GIVE_ALLOC macro. */
-#define HASH_PREFIX(x) xml_dtd_notns_##x
-#define HASH_NODE struct xml_dtd_notn
-#define HASH_KEY_STRING name
-#define HASH_ZERO_FILL
-#define HASH_TABLE_DYNAMIC
-#define HASH_WANT_LOOKUP
-#define HASH_WANT_FIND
-#define HASH_GIVE_ALLOC
-#define HASH_TABLE_ALLOC
-XML_HASH_GIVE_ALLOC
-#include "ucw/hashtable.h"
-
-/*
- * Find a declared notation by name.
- * Returns NULL when the context has no DTD, when the name is not in the
- * notation table, or when the entry exists but lacks XML_DTD_NOTN_DECLARED.
- */
-struct xml_dtd_notn *
-xml_dtd_find_notn(struct xml_context *ctx, char *name)
-{
-  struct xml_dtd *dtd = ctx->dtd;
-  /* Same guard as xml_dtd_find_elem() and xml_dtd_find_attr() */
-  if (!dtd)
-    return NULL;
-  struct xml_dtd_notn *notn = xml_dtd_notns_find(dtd->tab_notns, name);
-  return (notn && (notn->flags & XML_DTD_NOTN_DECLARED)) ? notn : NULL;
-}
-
-/* General entities */
-
-/* Instantiate ucw/hashtable.h as xml_dtd_ents_*: nodes are struct xml_dtd_entity
- * keyed by the string member `name', new nodes are zero-filled, and the
- * find and lookup operations are requested.  xml_dtd_init() uses this one
- * table type for both general (tab_ents) and parameter (tab_pents) entities. */
-#define HASH_PREFIX(x) xml_dtd_ents_##x
-#define HASH_NODE struct xml_dtd_entity
-#define HASH_KEY_STRING name
-#define HASH_ZERO_FILL
-#define HASH_TABLE_DYNAMIC
-#define HASH_WANT_FIND
-#define HASH_WANT_LOOKUP
-#define HASH_GIVE_ALLOC
-#define HASH_TABLE_ALLOC
-XML_HASH_GIVE_ALLOC
-#include "ucw/hashtable.h"
-
-/*
- * Declare a general entity with a fixed replacement text.
- * The entity is looked up in the table of general entities; if it is not
- * declared yet, it is appended to dtd->ents, flagged DECLARED | TRIVIAL and
- * returned.  A repeated declaration only produces a warning and yields NULL.
- */
-static struct xml_dtd_entity *
-xml_dtd_declare_trivial_entity(struct xml_context *ctx, char *name, char *text)
-{
-  struct xml_dtd_entity *e = xml_dtd_ents_lookup(ctx->dtd->tab_ents, name);
-  if (!(e->flags & XML_DTD_ENTITY_DECLARED))
-    {
-      slist_add_tail(&ctx->dtd->ents, &e->n);
-      e->flags = XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_TRIVIAL;
-      e->text = text;
-      return e;
-    }
-  xml_warn(ctx, "Entity &%s; already declared", name);
-  return NULL;
-}
-
-/* Declare the five predefined entities (lt, gt, amp, apos, quot) in the DTD,
- * in that order. */
-static void
-xml_dtd_declare_default_entities(struct xml_context *ctx)
-{
-  static char *const predefined[][2] = {
-    { "lt", "<" },
-    { "gt", ">" },
-    { "amp", "&" },
-    { "apos", "'" },
-    { "quot", "\"" },
-  };
-  for (uns i = 0; i < sizeof(predefined) / sizeof(predefined[0]); i++)
-    xml_dtd_declare_trivial_entity(ctx, predefined[i][0], predefined[i][1]);
-}
-
-/*
- * Resolve a name against the five predefined entities without any DTD.
- * Returns a pointer to a static entity (flagged DECLARED | TRIVIAL) for
- * lt, gt, amp, apos and quot, or NULL for any other name.
- */
-struct xml_dtd_entity *
-xml_def_find_entity(struct xml_context *ctx UNUSED, char *name)
-{
-#define ENT(n, t) { .name = #n, .text = t, .flags = XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_TRIVIAL }
-  static struct xml_dtd_entity builtin[] = {
-    ENT(lt, "<"), ENT(gt, ">"), ENT(amp, "&"), ENT(apos, "'"), ENT(quot, "\""),
-  };
-#undef ENT
-  for (uns i = 0; i < sizeof(builtin) / sizeof(builtin[0]); i++)
-    if (!strcmp(name, builtin[i].name))
-      return &builtin[i];
-  return NULL;
-}
-
-/*
- * Find a general entity by name.  Resolution order:
- *  1. the user callback h_find_entity, if installed (its result is final),
- *  2. the DTD table of general entities (only declared entries count),
- *  3. the predefined entities when there is no DTD at all.
- */
-struct xml_dtd_entity *
-xml_dtd_find_entity(struct xml_context *ctx, char *name)
-{
-  if (ctx->h_find_entity)
-    return ctx->h_find_entity(ctx, name);
-  struct xml_dtd *dtd = ctx->dtd;
-  if (!dtd)
-    return xml_def_find_entity(ctx, name);
-  struct xml_dtd_entity *ent = xml_dtd_ents_find(dtd->tab_ents, name);
-  if (ent && (ent->flags & XML_DTD_ENTITY_DECLARED))
-    return ent;
-  return NULL;
-}
-
-/* Parameter entities */
-
-/* Find a declared parameter entity by name; NULL if missing or undeclared.
- * The context's DTD is dereferenced unconditionally. */
-static struct xml_dtd_entity *
-xml_dtd_find_pentity(struct xml_context *ctx, char *name)
-{
-  struct xml_dtd_entity *ent = xml_dtd_ents_find(ctx->dtd->tab_pents, name);
-  if (ent && (ent->flags & XML_DTD_ENTITY_DECLARED))
-    return ent;
-  return NULL;
-}
-
-/* Elements */
-
-/* Forward declaration, the table type itself comes from the template below */
-struct xml_dtd_elems_table;
-
-/* Per-node initializer handed to the hash table (HASH_GIVE_INIT_DATA):
- * starts every element with an empty list of attributes */
-static void
-xml_dtd_elems_init_data(struct xml_dtd_elems_table *tab UNUSED, struct xml_dtd_elem *e)
-{
- slist_init(&e->attrs);
-}
-
-/* Instantiate ucw/hashtable.h as xml_dtd_elems_*: nodes are struct xml_dtd_elem
- * keyed by the string member `name'; find and lookup are requested */
-#define HASH_PREFIX(x) xml_dtd_elems_##x
-#define HASH_NODE struct xml_dtd_elem
-#define HASH_KEY_STRING name
-#define HASH_TABLE_DYNAMIC
-#define HASH_ZERO_FILL
-#define HASH_WANT_FIND
-#define HASH_WANT_LOOKUP
-#define HASH_GIVE_ALLOC
-#define HASH_GIVE_INIT_DATA
-#define HASH_TABLE_ALLOC
-XML_HASH_GIVE_ALLOC
-#include "ucw/hashtable.h"
-
-/* Find an element in the DTD by name; NULL when there is no DTD or no such
- * entry.  The entry's declared flag is not checked here. */
-struct xml_dtd_elem *
-xml_dtd_find_elem(struct xml_context *ctx, char *name)
-{
-  if (!ctx->dtd)
-    return NULL;
-  return xml_dtd_elems_find(ctx->dtd->tab_elems, name);
-}
-
-/* Element sons */
-
-/* Forward declaration, the table type itself comes from the template below */
-struct xml_dtd_enodes_table;
-
-/* Hash of the composite key (parent node, element): XOR of both pointer hashes */
-static inline uns
-xml_dtd_enodes_hash(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem)
-{
- return hash_pointer(parent) ^ hash_pointer(elem);
-}
-
-/* Two keys are equal when both pointers are identical */
-static inline int
-xml_dtd_enodes_eq(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *parent1, struct xml_dtd_elem *elem1, struct xml_dtd_elem_node *parent2, struct xml_dtd_elem *elem2)
-{
- return (parent1 == parent2) && (elem1 == elem2);
-}
-
-/* Store the key into a newly created node */
-static inline void
-xml_dtd_enodes_init_key(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *node, struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem)
-{
- node->parent = parent;
- node->elem = elem;
-}
-
-/* Instantiate ucw/hashtable.h as xml_dtd_enodes_*: nodes are
- * struct xml_dtd_elem_node keyed by (parent, elem) using the three
- * functions above; find and new are requested */
-#define HASH_PREFIX(x) xml_dtd_enodes_##x
-#define HASH_NODE struct xml_dtd_elem_node
-#define HASH_KEY_COMPLEX(x) x parent, x elem
-#define HASH_KEY_DECL struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem
-#define HASH_GIVE_HASHFN
-#define HASH_GIVE_EQ
-#define HASH_GIVE_INIT_KEY
-#define HASH_TABLE_DYNAMIC
-#define HASH_ZERO_FILL
-#define HASH_WANT_FIND
-#define HASH_WANT_NEW
-#define HASH_GIVE_ALLOC
-#define HASH_TABLE_ALLOC
-XML_HASH_GIVE_ALLOC
-#include "ucw/hashtable.h"
-
-/* Element attributes */
-
-/* Forward declaration, the table type itself comes from the template below */
-struct xml_dtd_attrs_table;
-
-/* Hash of the composite key (element, attribute name) */
-static inline uns
-xml_dtd_attrs_hash(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem, char *name)
-{
- return hash_pointer(elem) ^ hash_string(name);
-}
-
-/* Keys are equal when the element pointers match and the names compare equal */
-static inline int
-xml_dtd_attrs_eq(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem1, char *name1, struct xml_dtd_elem *elem2, char *name2)
-{
- return (elem1 == elem2) && !strcmp(name1, name2);
-}
-
-/* Store the key into a newly created attribute and, as a side effect,
- * append the attribute to the owning element's list of attributes.
- * The name pointer is stored as given, not copied. */
-static inline void
-xml_dtd_attrs_init_key(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_elem *elem, char *name)
-{
- attr->elem = elem;
- attr->name = name;
- slist_add_tail(&elem->attrs, &attr->n);
-}
-
-/* Instantiate ucw/hashtable.h as xml_dtd_attrs_*: nodes are struct xml_dtd_attr
- * keyed by (elem, name) using the three functions above; find and new are
- * requested */
-#define HASH_PREFIX(x) xml_dtd_attrs_##x
-#define HASH_NODE struct xml_dtd_attr
-#define HASH_ZERO_FILL
-#define HASH_TABLE_DYNAMIC
-#define HASH_KEY_COMPLEX(x) x elem, x name
-#define HASH_KEY_DECL struct xml_dtd_elem *elem, char *name
-#define HASH_GIVE_HASHFN
-#define HASH_GIVE_EQ
-#define HASH_GIVE_INIT_KEY
-#define HASH_WANT_FIND
-#define HASH_WANT_NEW
-#define HASH_GIVE_ALLOC
-#define HASH_TABLE_ALLOC
-XML_HASH_GIVE_ALLOC
-#include "ucw/hashtable.h"
-
-/* Find the definition of attribute `name' of the given element; NULL when
- * there is no DTD or no such definition. */
-struct xml_dtd_attr *
-xml_dtd_find_attr(struct xml_context *ctx, struct xml_dtd_elem *elem, char *name)
-{
-  if (!ctx->dtd)
-    return NULL;
-  return xml_dtd_attrs_find(ctx->dtd->tab_attrs, elem, name);
-}
-
-/* Enumerated attribute values */
-
-/* Forward declaration, the table type itself comes from the template below */
-struct xml_dtd_evals_table;
-
-/* Hash of the composite key (attribute, enumerated value) */
-static inline uns
-xml_dtd_evals_hash(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr, char *val)
-{
- return hash_pointer(attr) ^ hash_string(val);
-}
-
-/* Keys are equal when the attribute pointers match and the values compare equal */
-static inline int
-xml_dtd_evals_eq(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr1, char *val1, struct xml_dtd_attr *attr2, char *val2)
-{
- return (attr1 == attr2) && !strcmp(val1, val2);
-}
-
-/* Store the key into a newly created node; the value pointer is stored as given */
-static inline void
-xml_dtd_evals_init_key(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_eval *eval, struct xml_dtd_attr *attr, char *val)
-{
- eval->attr = attr;
- eval->val = val;
-}
-
-/* Instantiate ucw/hashtable.h as xml_dtd_evals_*: nodes are struct xml_dtd_eval
- * keyed by (attr, val) using the three functions above; find and new are
- * requested.  Unlike the other tables here, HASH_ZERO_FILL is not defined. */
-#define HASH_PREFIX(x) xml_dtd_evals_##x
-#define HASH_NODE struct xml_dtd_eval
-#define HASH_TABLE_DYNAMIC
-#define HASH_KEY_COMPLEX(x) x attr, x val
-#define HASH_KEY_DECL struct xml_dtd_attr *attr, char *val
-#define HASH_GIVE_HASHFN
-#define HASH_GIVE_EQ
-#define HASH_GIVE_INIT_KEY
-#define HASH_WANT_FIND
-#define HASH_WANT_NEW
-#define HASH_GIVE_ALLOC
-#define HASH_TABLE_ALLOC
-XML_HASH_GIVE_ALLOC
-#include "ucw/hashtable.h"
-
-/* Enumerated attribute notations */
-
-/* Forward declaration, the table type itself comes from the template below */
-struct xml_dtd_enotns_table;
-
-/* Hash of the composite key (attribute, notation): XOR of both pointer hashes */
-static inline uns
-xml_dtd_enotns_hash(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn)
-{
- return hash_pointer(attr) ^ hash_pointer(notn);
-}
-
-/* Two keys are equal when both pointers are identical */
-static inline int
-xml_dtd_enotns_eq(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr1, struct xml_dtd_notn *notn1, struct xml_dtd_attr *attr2, struct xml_dtd_notn *notn2)
-{
- return (attr1 == attr2) && (notn1 == notn2);
-}
-
-/* Store the key into a newly created node */
-static inline void
-xml_dtd_enotns_init_key(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_enotn *enotn, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn)
-{
- enotn->attr = attr;
- enotn->notn = notn;
-}
-
-/* Instantiate ucw/hashtable.h as xml_dtd_enotns_*: nodes are struct xml_dtd_enotn
- * keyed by (attr, notn) using the three functions above; find and new are
- * requested.  HASH_ZERO_FILL is not defined for this table. */
-#define HASH_PREFIX(x) xml_dtd_enotns_##x
-#define HASH_NODE struct xml_dtd_enotn
-#define HASH_TABLE_DYNAMIC
-#define HASH_KEY_COMPLEX(x) x attr, x notn
-#define HASH_KEY_DECL struct xml_dtd_attr *attr, struct xml_dtd_notn *notn
-#define HASH_GIVE_HASHFN
-#define HASH_GIVE_EQ
-#define HASH_GIVE_INIT_KEY
-#define HASH_WANT_FIND
-#define HASH_WANT_NEW
-#define HASH_GIVE_ALLOC
-#define HASH_TABLE_ALLOC
-XML_HASH_GIVE_ALLOC
-#include "ucw/hashtable.h"
-
-/* DTD initialization/cleanup */
-
-/*
- * Create an empty DTD for the context; a no-op if the context already has one.
- * The DTD lives in its own memory pool (created with mp_new(4096)), which also
- * holds the DTD structure itself and all eight hash tables.  The five
- * predefined entities are declared at the end.
- */
-void
-xml_dtd_init(struct xml_context *ctx)
-{
- if (ctx->dtd)
- return;
- struct mempool *pool = mp_new(4096);
- struct xml_dtd *dtd = ctx->dtd = mp_alloc_zero(pool, sizeof(*ctx->dtd));
- dtd->pool = pool;
- /* General and parameter entities are two instances of the same table type */
- xml_dtd_ents_init(dtd->tab_ents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table)));
- xml_dtd_ents_init(dtd->tab_pents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table)));
- xml_dtd_notns_init(dtd->tab_notns = xml_hash_new(pool, sizeof(struct xml_dtd_notns_table)));
- xml_dtd_elems_init(dtd->tab_elems = xml_hash_new(pool, sizeof(struct xml_dtd_elems_table)));
- xml_dtd_enodes_init(dtd->tab_enodes = xml_hash_new(pool, sizeof(struct xml_dtd_enodes_table)));
- xml_dtd_attrs_init(dtd->tab_attrs = xml_hash_new(pool, sizeof(struct xml_dtd_attrs_table)));
- xml_dtd_evals_init(dtd->tab_evals = xml_hash_new(pool, sizeof(struct xml_dtd_evals_table)));
- xml_dtd_enotns_init(dtd->tab_enotns = xml_hash_new(pool, sizeof(struct xml_dtd_enotns_table)));
- xml_dtd_declare_default_entities(ctx);
-}
-
-/* Destroy the context's DTD, if any, by deleting its memory pool, and clear
- * the ctx->dtd pointer. */
-void
-xml_dtd_cleanup(struct xml_context *ctx)
-{
-  if (ctx->dtd)
-    {
-      mp_delete(ctx->dtd->pool);
-      ctx->dtd = NULL;
-    }
-}
-
-/* Hook for finishing the DTD.  Currently a placeholder: it returns
- * immediately without a DTD and performs no checks with one. */
-void
-xml_dtd_finish(struct xml_context *ctx)
-{
- if (!ctx->dtd)
- return;
- // FIXME: validity checks
-}
-
-/*** Parsing functions ***/
-
-/* References to parameter entities */
-
-/*
- * Parse a reference to a parameter entity and push the entity on success.
- * The name is parsed into ctx->stack, which is restored to its saved state
- * before returning.  An unknown entity is reported with xml_error() and the
- * reference is otherwise ignored.
- */
-void
-xml_parse_pe_ref(struct xml_context *ctx)
-{
-  /* PEReference ::= '%' Name ';'
-   * Already parsed: '%' */
-  struct mempool_state saved;
-  mp_save(ctx->stack, &saved);
-  char *name = xml_parse_name(ctx, ctx->stack);
-  xml_parse_char(ctx, ';');
-  struct xml_dtd_entity *ent = xml_dtd_find_pentity(ctx, name);
-  /* The name lives on the stack, so report before restoring it */
-  if (ent)
-    {
-      TRACE(ctx, "Pushed entity %%%s;", name);
-    }
-  else
-    xml_error(ctx, "Unknown entity %%%s;", name);
-  mp_restore(ctx->stack, &saved);
-  xml_dec(ctx);
-  if (ent)
-    xml_push_entity(ctx, ent);
-}
-
-/*
- * Process one or more consecutive parameter entity references inside a DTD
- * declaration; whitespace after each reference is skipped.
- * With entity_decl == ~0U, a '%' followed by whitespace is not treated as a
- * reference: the function returns ~0U at once (the caller uses this to
- * recognize a parameter entity declaration).  Otherwise it returns 1.
- */
-static uns
-xml_parse_dtd_pe(struct xml_context *ctx, uns entity_decl)
-{
- /* Already parsed: '%' */
- do
- {
- xml_inc(ctx);
- /* !~entity_decl is true only for entity_decl == ~0U */
- if (!~entity_decl && (xml_peek_cat(ctx) & XML_CHAR_WHITE))
- {
- xml_dec(ctx);
- return ~0U;
- }
- xml_parse_pe_ref(ctx);
- while (xml_peek_cat(ctx) & XML_CHAR_WHITE)
- xml_skip_char(ctx);
- }
- while (xml_get_char(ctx) == '%');
- /* The character that ended the loop was not a '%', put it back */
- xml_unget_char(ctx);
- return 1;
-}
-
-/*
- * Skip whitespace and/or parameter entity references inside a DTD declaration.
- * Returns 0 when nothing was skipped, 1 when whitespace was skipped, or the
- * result of xml_parse_dtd_pe() when a '%' follows.  If `mandatory' is
- * non-zero and neither whitespace nor '%' is present, a fatal error is raised.
- */
-static inline uns
-xml_parse_dtd_white(struct xml_context *ctx, uns mandatory)
-{
- /* Whitespace or parameter entity,
- * mandatory==~0U has a special meaning of the whitespace before the '%' character in a parameter entity declaration */
- uns cnt = 0;
- while (xml_peek_cat(ctx) & XML_CHAR_WHITE)
- {
- xml_skip_char(ctx);
- cnt = 1;
- }
- if (xml_peek_char(ctx) == '%')
- {
- xml_skip_char(ctx);
- return xml_parse_dtd_pe(ctx, mandatory);
- }
- else if (unlikely(mandatory && !cnt))
- xml_fatal_expected_white(ctx);
- return cnt;
-}
-
-/*
- * Parse an external identifier introduced by SYSTEM or PUBLIC.
- * The literals are allocated from the DTD pool and stored to *system_id and
- * *public_id; an identifier that is not present is set to NULL.
- * With allow_public == 0 the system literal after PUBLIC is required
- * (preceded by mandatory whitespace); otherwise it is parsed only if a quote
- * follows the whitespace.  Anything else raises a fatal error.
- */
-static void
-xml_dtd_parse_external_id(struct xml_context *ctx, char **system_id, char **public_id, uns allow_public)
-{
- struct xml_dtd *dtd = ctx->dtd;
- /* Only the first character is checked here, xml_parse_seq() verifies the rest */
- uns c = xml_peek_char(ctx);
- if (c == 'S')
- {
- xml_parse_seq(ctx, "SYSTEM");
- xml_parse_dtd_white(ctx, 1);
- *public_id = NULL;
- *system_id = xml_parse_system_literal(ctx, dtd->pool);
- }
- else if (c == 'P')
- {
- xml_parse_seq(ctx, "PUBLIC");
- xml_parse_dtd_white(ctx, 1);
- *system_id = NULL;
- *public_id = xml_parse_pubid_literal(ctx, dtd->pool);
- if (xml_parse_dtd_white(ctx, !allow_public))
- if ((c = xml_peek_char(ctx)) == '\'' || c == '"' || !allow_public)
- *system_id = xml_parse_system_literal(ctx, dtd->pool);
- }
- else
- xml_fatal(ctx, "Expected an external ID");
-}
-
-/* DTD: <!NOTATION ...> */
-
-/*
- * Parse a notation declaration and record it in the DTD.
- * The notation is looked up (and created if missing) in the table of
- * notations.  A first declaration stores the identifiers and appends the
- * notation to dtd->notns; a repeated one only produces a warning.
- */
-void
-xml_parse_notation_decl(struct xml_context *ctx)
-{
-  /* NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>'
-   * Already parsed: '<!NOTATION' */
-  TRACE(ctx, "parse_notation_decl");
-  struct xml_dtd *dtd = ctx->dtd;
-  xml_parse_dtd_white(ctx, 1);
-
-  struct xml_dtd_notn *notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx, dtd->pool));
-  xml_parse_dtd_white(ctx, 1);
-  char *system_id, *public_id;
-  xml_dtd_parse_external_id(ctx, &system_id, &public_id, 1);
-  xml_parse_dtd_white(ctx, 0);
-  xml_parse_char(ctx, '>');
-
-  if (notn->flags & XML_DTD_NOTN_DECLARED)
-    xml_warn(ctx, "Notation %s already declared", notn->name);
-  else
-    {
-      notn->flags = XML_DTD_NOTN_DECLARED;
-      notn->system_id = system_id;
-      notn->public_id = public_id;
-      slist_add_tail(&dtd->notns, &notn->n);
-    }
-  xml_dec(ctx);
-}
-
-/* DTD: <!ENTITY ...> */
-
-/*
- * Parse a declaration of a general or parameter entity and record it in the
- * DTD.  Internal entities get their replacement text stored in the DTD pool
- * (character references are expanded, references to general entities are
- * copied verbatim); external entities get their system/public identifiers
- * and, for unparsed general entities, the notation named after NDATA.
- * A redeclaration and a parameter entity reference inside an entity value
- * are not implemented and raise a fatal error.
- */
-void
-xml_parse_entity_decl(struct xml_context *ctx)
-{
-  /* Already parsed: '<!ENTITY' */
-  TRACE(ctx, "parse_entity_decl");
-  struct xml_dtd *dtd = ctx->dtd;
-  /* xml_parse_dtd_white(ctx, ~0U) returns ~0U iff it met '%' followed by
-   * whitespace, i.e. this is a declaration of a parameter entity */
-  uns flags = ~xml_parse_dtd_white(ctx, ~0U) ? 0 : XML_DTD_ENTITY_PARAMETER;
-  if (flags)
-    xml_parse_dtd_white(ctx, 1);
-  struct xml_dtd_entity *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_ents, xml_parse_name(ctx, dtd->pool));
-  xml_parse_dtd_white(ctx, 1);
-  slist *list = flags ? &dtd->pents : &dtd->ents;
-  if (ent->flags & XML_DTD_ENTITY_DECLARED)
-    {
-      xml_fatal(ctx, "Entity &%s; already declared, skipping not implemented", ent->name);
-      // FIXME: should be only warning
-    }
-  uns c, sep = xml_get_char(ctx);
-  if (sep == '\'' || sep == '"')
-    {
-      /* Internal entity:
-       * EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&'] | PEReference | Reference)* "'" */
-      char *p = mp_start_noalign(dtd->pool, 1);
-      while (1)
-        {
-          if ((c = xml_get_char(ctx)) == sep)
-            break;
-          if (c == '%')
-            {
-              // FIXME: expand the reference (xml_parse_parameter_ref)
-              /* Report malformed/unsupported input as a parse error instead of aborting */
-              xml_fatal(ctx, "References to parameter entities in entity values are not implemented");
-            }
-          if (c == '&')
-            {
-              xml_inc(ctx);
-              if (xml_peek_char(ctx) != '#')
-                {
-                  /* Bypass references to general entities */
-                  struct mempool_state state;
-                  mp_save(ctx->stack, &state);
-                  char *n = xml_parse_name(ctx, ctx->stack);
-                  xml_parse_char(ctx, ';');
-                  xml_dec(ctx);
-                  uns l = strlen(n);
-                  /* '&' + name + ';' and one spare byte for the final terminator */
-                  p = mp_spread(dtd->pool, p, 3 + l);
-                  *p++ = '&';
-                  memcpy(p, n, l);
-                  p += l;
-                  *p++ = ';';
-                  mp_restore(ctx->stack, &state);
-                  continue;
-                }
-              else
-                {
-                  xml_skip_char(ctx);
-                  c = xml_parse_char_ref(ctx);
-                }
-            }
-          p = mp_spread(dtd->pool, p, 5);
-          p = utf8_32_put(p, c);
-        }
-      *p = 0;
-      ent->len = p - (char *)mp_ptr(dtd->pool);
-      ent->text = mp_end(dtd->pool, p + 1);
-      slist_add_tail(list, &ent->n);
-      ent->flags = flags | XML_DTD_ENTITY_DECLARED;
-    }
-  else
-    {
-      /* External entity */
-      struct xml_dtd_notn *notn = NULL;
-      char *system_id, *public_id;
-      xml_unget_char(ctx);
-      xml_dtd_parse_external_id(ctx, &system_id, &public_id, 0);
-      /* NDataDecl is allowed for general entities only (flags == 0):
-       * EntityDef ::= EntityValue | (ExternalID NDataDecl?), PEDef ::= EntityValue | ExternalID */
-      if (xml_parse_dtd_white(ctx, 0) && !flags && xml_peek_char(ctx) != '>')
-        {
-          /* General external unparsed entity */
-          flags |= XML_DTD_ENTITY_UNPARSED;
-          xml_parse_seq(ctx, "NDATA");
-          xml_parse_dtd_white(ctx, 1);
-          notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx, dtd->pool));
-        }
-      slist_add_tail(list, &ent->n);
-      ent->flags = flags | XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_EXTERNAL;
-      ent->system_id = system_id;
-      ent->public_id = public_id;
-      ent->notn = notn;
-    }
-  xml_parse_dtd_white(ctx, 0);
-  xml_parse_char(ctx, '>');
-  xml_dec(ctx);
-}
-
-/* DTD: <!ELEMENT ...> */
-
-/*
- * Parse an element declaration and store its content model in the DTD.
- * The element is looked up (and created if missing) by name; a repeated
- * declaration is a fatal error.  The content specification is EMPTY, ANY,
- * mixed content or a tree of struct xml_dtd_elem_node allocated from the
- * DTD pool and rooted at elem->node.
- *
- * NOTE(review): nothing visible in this function sets a "declared" flag on
- * the element, so the redeclaration check below may never trigger -- verify.
- * NOTE(review): parsing of nested groups in the `children' branch looks
- * unfinished, see the notes inside the loop.
- */
-void
-xml_parse_element_decl(struct xml_context *ctx)
-{
- /* Elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
- * Already parsed: '<!ELEMENT' */
- struct xml_dtd *dtd = ctx->dtd;
- xml_parse_dtd_white(ctx, 1);
- char *name = xml_parse_name(ctx, dtd->pool);
- xml_parse_dtd_white(ctx, 1);
- struct xml_dtd_elem *elem = xml_dtd_elems_lookup(dtd->tab_elems, name);
- if (elem->flags & XML_DTD_ELEM_DECLARED)
- xml_fatal(ctx, "Element <%s> already declared", name);
-
- /* contentspec ::= 'EMPTY' | 'ANY' | Mixed | children */
- uns c = xml_peek_char(ctx);
- if (c == 'E')
- {
- xml_parse_seq(ctx, "EMPTY");
- elem->type = XML_DTD_ELEM_EMPTY;
- }
- else if (c == 'A')
- {
- xml_parse_seq(ctx, "ANY");
- elem->type = XML_DTD_ELEM_ANY;
- }
- else if (c == '(')
- {
- xml_skip_char(ctx);
- xml_inc(ctx);
- xml_parse_dtd_white(ctx, 0);
- /* Root node of the content model */
- struct xml_dtd_elem_node *parent = elem->node = mp_alloc_zero(dtd->pool, sizeof(*parent));
- if (xml_peek_char(ctx) == '#')
- {
- /* Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | '(' S? '#PCDATA' S? ')' */
- xml_skip_char(ctx);
- xml_parse_seq(ctx, "PCDATA");
- elem->type = XML_DTD_ELEM_MIXED;
- parent->type = XML_DTD_ELEM_PCDATA;
- while (1)
- {
- xml_parse_dtd_white(ctx, 0);
- if ((c = xml_get_char(ctx)) == ')')
- break;
- else if (c != '|')
- xml_fatal_expected(ctx, ')');
- xml_parse_dtd_white(ctx, 0);
- struct xml_dtd_elem *son_elem = xml_dtd_elems_lookup(dtd->tab_elems, xml_parse_name(ctx, dtd->pool));
- /* The (parent, element) hash table detects a name listed twice */
- if (xml_dtd_enodes_find(dtd->tab_enodes, parent, son_elem))
- xml_error(ctx, "Duplicate content '%s'", son_elem->name);
- else
- {
- struct xml_dtd_elem_node *son = xml_dtd_enodes_new(dtd->tab_enodes, parent, son_elem);
- slist_add_tail(&parent->sons, &son->n);
- }
- }
- xml_dec(ctx);
- /* The trailing '*' is optional only when no element names were listed */
- if (xml_peek_char(ctx) == '*')
- {
- xml_skip_char(ctx);
- parent->occur = XML_DTD_ELEM_OCCUR_MULT;
- }
- else if (!slist_head(&parent->sons))
- parent->occur = XML_DTD_ELEM_OCCUR_ONCE;
- else
- xml_fatal_expected(ctx, '*');
- }
- else
- {
- /* children ::= (choice | seq) ('?' | '*' | '+')?
- * cp ::= (Name | choice | seq) ('?' | '*' | '+')?
- * choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')'
- * seq ::= '(' S? cp ( S? ',' S? cp )* S? ')' */
-
- elem->type = XML_DTD_ELEM_CHILDREN;
- /* XML_DTD_ELEM_PCDATA serves here as "operator not decided yet";
- * it is replaced by OR or SEQ once '|', ',' or ')' is seen */
- parent->type = XML_DTD_ELEM_PCDATA;
- uns c;
- /* Enter the loop in its second half: the first item is a name */
- goto first;
-
- while (1)
- {
- /* After name */
- xml_parse_dtd_white(ctx, 0);
- if ((c = xml_get_char(ctx)) == ')')
- {
- xml_dec(ctx);
- if (parent->type == XML_DTD_ELEM_PCDATA)
- parent->type = XML_DTD_ELEM_SEQ;
- if ((c = xml_get_char(ctx)) == '?')
- parent->occur = XML_DTD_ELEM_OCCUR_OPT;
- else if (c == '*')
- parent->occur = XML_DTD_ELEM_OCCUR_MULT;
- else if (c == '+')
- parent->occur = XML_DTD_ELEM_OCCUR_PLUS;
- else
- {
- xml_unget_char(ctx);
- parent->occur = XML_DTD_ELEM_OCCUR_ONCE;
- }
- /* Closing the root group ends the content model */
- if (!parent->parent)
- break;
- parent = parent->parent;
- continue;
- }
- else if (c == '|')
- {
- if (parent->type == XML_DTD_ELEM_PCDATA)
- parent->type = XML_DTD_ELEM_OR;
- else if (parent->type != XML_DTD_ELEM_OR)
- xml_fatal(ctx, "Mixed operators in the list of element children");
- }
- else if (c == ',')
- {
- if (parent->type == XML_DTD_ELEM_PCDATA)
- parent->type = XML_DTD_ELEM_SEQ;
- else if (parent->type != XML_DTD_ELEM_SEQ)
- xml_fatal(ctx, "Mixed operators in the list of element children");
- }
- else if (c == '(')
- {
- /* NOTE(review): this branch is reached only when '(' directly follows
- * a name; a '(' after '|' or ',' goes to xml_parse_name() below instead.
- * Also, `parent = son->parent' leaves parent unchanged (son->parent was
- * just set to parent), so the parser never descends into the new group;
- * `parent = son' was probably intended -- confirm before changing. */
- xml_inc(ctx);
- struct xml_dtd_elem_node *son = mp_alloc_zero(dtd->pool, sizeof(*son));
- son->parent = parent;
- slist_add_tail(&parent->sons, &son->n);
- parent = son->parent;
- son->type = XML_DTD_ELEM_MIXED;
- }
- else
- xml_unget_char(ctx);
-
- /* Before name */
- xml_parse_dtd_white(ctx, 0);
-first:;
- struct xml_dtd_elem *son_elem = xml_dtd_elems_lookup(dtd->tab_elems, xml_parse_name(ctx, dtd->pool));
- // FIXME: duplicates, occurrence
- //struct xml_dtd_elem_node *son = xml_dtd_enodes_new(dtd->tab_enodes, parent, son_elem);
- struct xml_dtd_elem_node *son = mp_alloc_zero(dtd->pool, sizeof(*son));
- son->parent = parent;
- son->elem = son_elem;
- slist_add_tail(&parent->sons, &son->n);
- }
- }
- }
- else
- xml_fatal(ctx, "Expected element content specification");
-
- xml_parse_dtd_white(ctx, 0);
- xml_parse_char(ctx, '>');
- xml_dec(ctx);
-}
-
-/*
- * Parse an attribute-list declaration and record the attribute definitions
- * of the named element in the DTD.
- * A definition of an attribute that already exists for the element is
- * parsed but ignored (with a warning): the existing definition is kept and
- * no enumerated values or notations are added.
- * The declaration must be terminated by '>'; any other character is a
- * fatal error.
- */
-void
-xml_parse_attr_list_decl(struct xml_context *ctx)
-{
-  /* AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
-   * AttDef ::= S Name S AttType S DefaultDecl
-   * Already parsed: '<!ATTLIST' */
-  struct xml_dtd *dtd = ctx->dtd;
-  xml_parse_dtd_white(ctx, 1);
-  struct xml_dtd_elem *elem = xml_dtd_elems_lookup(ctx->dtd->tab_elems, xml_parse_name(ctx, dtd->pool));
-
-  while (xml_parse_dtd_white(ctx, 0) && xml_peek_char(ctx) != '>')
-    {
-      char *name = xml_parse_name(ctx, dtd->pool);
-      struct xml_dtd_attr *attr = xml_dtd_attrs_find(dtd->tab_attrs, elem, name);
-      uns ignored = 0;
-      if (attr)
-        {
-          xml_warn(ctx, "Duplicate attribute definition");
-          ignored++;
-        }
-      else
-        attr = xml_dtd_attrs_new(ctx->dtd->tab_attrs, elem, name);
-      xml_parse_dtd_white(ctx, 1);
-      if (xml_peek_char(ctx) == '(')
-        {
-          /* Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' */
-          xml_skip_char(ctx); // FIXME: xml_inc/dec ?
-          if (!ignored)
-            attr->type = XML_ATTR_ENUM;
-          do
-            {
-              xml_parse_dtd_white(ctx, 0);
-              char *value = xml_parse_nmtoken(ctx, dtd->pool);
-              if (!ignored)
-                {
-                  if (xml_dtd_evals_find(ctx->dtd->tab_evals, attr, value))
-                    xml_error(ctx, "Duplicate enumeration value");
-                  else
-                    xml_dtd_evals_new(ctx->dtd->tab_evals, attr, value);
-                }
-              xml_parse_dtd_white(ctx, 0);
-            }
-          while (xml_get_char(ctx) == '|');
-          xml_unget_char(ctx);
-          xml_parse_char(ctx, ')');
-        }
-      else
-        {
-          char *type = xml_parse_name(ctx, dtd->pool);
-          enum xml_dtd_attr_type t = XML_ATTR_CDATA;
-          if (!strcmp(type, "CDATA"))
-            t = XML_ATTR_CDATA;
-          else if (!strcmp(type, "ID"))
-            t = XML_ATTR_ID;
-          else if (!strcmp(type, "IDREF"))
-            t = XML_ATTR_IDREF;
-          else if (!strcmp(type, "IDREFS"))
-            t = XML_ATTR_IDREFS;
-          else if (!strcmp(type, "ENTITY"))
-            t = XML_ATTR_ENTITY;
-          else if (!strcmp(type, "ENTITIES"))
-            t = XML_ATTR_ENTITIES;
-          else if (!strcmp(type, "NMTOKEN"))
-            t = XML_ATTR_NMTOKEN;
-          else if (!strcmp(type, "NMTOKENS"))
-            t = XML_ATTR_NMTOKENS;
-          else if (!strcmp(type, "NOTATION"))
-            {
-              if (elem->type == XML_DTD_ELEM_EMPTY)
-                xml_fatal(ctx, "Empty element must not have notation attribute");
-              // FIXME: An element type MUST NOT have more than one NOTATION attribute specified.
-              t = XML_ATTR_NOTATION;
-              xml_parse_dtd_white(ctx, 1);
-              xml_parse_char(ctx, '(');
-              do
-                {
-                  xml_parse_dtd_white(ctx, 0);
-                  struct xml_dtd_notn *n = xml_dtd_notns_lookup(ctx->dtd->tab_notns, xml_parse_name(ctx, dtd->pool));
-                  if (!ignored)
-                    {
-                      if (xml_dtd_enotns_find(ctx->dtd->tab_enotns, attr, n))
-                        xml_error(ctx, "Duplicate enumerated notation");
-                      else
-                        xml_dtd_enotns_new(ctx->dtd->tab_enotns, attr, n);
-                    }
-                  xml_parse_dtd_white(ctx, 0);
-                }
-              while (xml_get_char(ctx) == '|');
-              xml_unget_char(ctx);
-              xml_parse_char(ctx, ')');
-            }
-          else
-            xml_fatal(ctx, "Unknown attribute type");
-          if (!ignored)
-            attr->type = t;
-        }
-      xml_parse_dtd_white(ctx, 1);
-      /* DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue) */
-      enum xml_dtd_attr_default def = XML_ATTR_NONE;
-      if (xml_get_char(ctx) == '#')
-        switch (xml_peek_char(ctx))
-          {
-            case 'R':
-              xml_parse_seq(ctx, "REQUIRED");
-              def = XML_ATTR_REQUIRED;
-              break;
-            case 'I':
-              xml_parse_seq(ctx, "IMPLIED");
-              def = XML_ATTR_IMPLIED;
-              break;
-            case 'F':
-              xml_parse_seq(ctx, "FIXED");
-              def = XML_ATTR_FIXED;
-              xml_parse_dtd_white(ctx, 1);
-              break;
-            default:
-              xml_fatal(ctx, "Expected a modifier for default attribute value");
-          }
-      else
-        xml_unget_char(ctx);
-      /* Only #REQUIRED and #IMPLIED come without a default value */
-      if (def != XML_ATTR_REQUIRED && def != XML_ATTR_IMPLIED)
-        {
-          char *v = xml_parse_attr_value(ctx, attr);
-          if (!ignored)
-            attr->default_value = v;
-        }
-      if (!ignored)
-        attr->default_mode = def;
-    }
-  /* The loop also ends when no whitespace follows, so the next character
-   * need not be '>' -- check it the same way the other declarations do */
-  xml_parse_char(ctx, '>');
-  xml_dec(ctx);
-}
-
-/*
- * Skip the internal DTD subset without interpreting it, up to and including
- * the closing ']'.  Processing instructions and comments are skipped by
- * their dedicated routines; in any other '<!...>' declaration quoted strings
- * are skipped as a whole so that a '>' inside quotes does not end it.
- */
-void
-xml_skip_internal_subset(struct xml_context *ctx)
-{
- TRACE(ctx, "skip_internal_subset");
- /* AlreadyParsed: '[' */
- uns c;
- while ((c = xml_get_char(ctx)) != ']')
- {
- if (c != '<')
- continue;
- if ((c = xml_get_char(ctx)) == '?')
- {
- xml_inc(ctx);
- xml_skip_pi(ctx);
- }
- /* NOTE(review): xml_dec() without a visible matching xml_inc() when '<'
- * is followed by something other than '?' or '!' -- confirm the intent */
- else if (c != '!')
- xml_dec(ctx);
- /* NOTE(review): only one '-' is checked after '<!' here; presumably
- * xml_skip_comment() handles the second one -- verify */
- else if (xml_get_char(ctx) == '-')
- {
- xml_inc(ctx);
- xml_skip_comment(ctx);
- }
- else
- while ((c = xml_get_char(ctx)) != '>')
- if (c == '\'' || c == '"')
- while (xml_get_char(ctx) != c);
- }
- xml_dec(ctx);
-}
-
-/*** Validation of attribute values ***/
-
-/*
- * Check that `value' consists of tokens separated by single spaces.
- * The first character of each token must belong to a category in first_cat,
- * all following ones to a category in next_cat.  With seq == 0 exactly one
- * token is accepted; with seq != 0 a sequence of tokens is accepted.
- * Returns 1 if the value matches, 0 otherwise.
- */
-static uns
-xml_check_tokens(char *value, uns first_cat, uns next_cat, uns seq)
-{
- char *p = value;
- uns u;
- while (1)
- {
- p = utf8_32_get(p, &u);
- if (!(xml_char_cat(u) & first_cat))
- return 0;
- /* (*p & ~0x20) is zero only for the bytes 0x00 and 0x20,
- * i.e. the loop runs up to the end of the string or the next space */
- while (*p & ~0x20)
- {
- p = utf8_32_get(p, &u);
- if (!(xml_char_cat(u) & next_cat))
- return 0;
- }
- if (!*p)
- return 1;
- /* A space follows: allowed only when a sequence of tokens is expected */
- if (!seq)
- return 0;
- p++;
- }
-}
-
-/* Return non-zero if `value' is a single Name; the character categories
- * come from ctx->cat_sname (first character) and ctx->cat_name (the rest) */
-static uns
-xml_is_name(struct xml_context *ctx, char *value)
-{
- /* Name ::= NameStartChar (NameChar)* */
- return xml_check_tokens(value, ctx->cat_sname, ctx->cat_name, 0);
-}
-
-/* Return non-zero if `value' is a space-separated list of Names */
-static uns
-xml_is_names(struct xml_context *ctx, char *value)
-{
- /* Names ::= Name (#x20 Name)* */
- return xml_check_tokens(value, ctx->cat_sname, ctx->cat_name, 1);
-}
-
-/* Return non-zero if `value' is a single Nmtoken (every character, including
- * the first one, is checked against ctx->cat_name) */
-static uns
-xml_is_nmtoken(struct xml_context *ctx, char *value)
-{
- /* Nmtoken ::= (NameChar)+ */
- return xml_check_tokens(value, ctx->cat_name, ctx->cat_name, 0);
-}
-
-/* Return non-zero if `value' is a space-separated list of Nmtokens */
-static uns
-xml_is_nmtokens(struct xml_context *ctx, char *value)
-{
- /* Nmtokens ::= Nmtoken (#x20 Nmtoken)* */
- return xml_check_tokens(value, ctx->cat_name, ctx->cat_name, 1);
-}
-
-/* Report (via xml_error) that the value of attribute `dtd' of its element
- * does not match the grammar production named by `type' */
-static void
-xml_err_attr_format(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *type)
-{
- xml_error(ctx, "Attribute %s in <%s> does not match the production of %s", dtd->name, dtd->elem->name, type);
-}
-
-/*
- * Validate an attribute value against its DTD definition `dtd'.
- * CDATA attributes are accepted as they are.  For all other types
- * xml_normalize_white() is applied to `value' first and then the value is
- * checked according to the type; violations are reported with xml_error().
- * Several checks are not implemented yet, see the FIXME notes below.
- */
-void
-xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value)
-{
- if (dtd->type == XML_ATTR_CDATA)
- return;
- xml_normalize_white(ctx, value);
- switch (dtd->type)
- {
- case XML_ATTR_ID:
- if (!xml_is_name(ctx, value))
- xml_err_attr_format(ctx, dtd, "NAME");
- //FIXME: add to a hash table
- break;
- case XML_ATTR_IDREF:
- if (!xml_is_name(ctx, value))
- xml_err_attr_format(ctx, dtd, "NAME");
- // FIXME: find in hash table (beware forward references)
- break;
- case XML_ATTR_IDREFS:
- if (!xml_is_names(ctx, value))
- xml_err_attr_format(ctx, dtd, "NAMES");
- // FIXME: find
- break;
- case XML_ATTR_ENTITY:
- // FIXME
- break;
- case XML_ATTR_ENTITIES:
- // FIXME
- break;
- case XML_ATTR_NMTOKEN:
- if (!xml_is_nmtoken(ctx, value))
- xml_err_attr_format(ctx, dtd, "NMTOKEN");
- break;
- case XML_ATTR_NMTOKENS:
- if (!xml_is_nmtokens(ctx, value))
- xml_err_attr_format(ctx, dtd, "NMTOKENS");
- break;
- case XML_ATTR_ENUM:
- /* The value must be one of those enumerated for this attribute */
- if (!xml_dtd_evals_find(ctx->dtd->tab_evals, dtd, value))
- xml_error(ctx, "Attribute %s in <%s> contains an undefined enumeration value", dtd->name, dtd->elem->name);
- break;
- case XML_ATTR_NOTATION:
- /* NOTE(review): this accepts any declared notation; the notations
- * enumerated for this attribute (tab_enotns) are not consulted -- confirm
- * whether that is intended */
- if (!xml_dtd_find_notn(ctx, value))
- xml_error(ctx, "Attribute %s in <%s> contains an undefined notation", dtd->name, dtd->elem->name);
- break;
- }
-}
+++ /dev/null
-/*
- * Sherlock Library -- A simple XML parser
- *
- * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
- *
- * This software may be freely distributed and used according to the terms
- * of the GNU Lesser General Public License.
- */
-
-#ifndef _SHERLOCK_XML_DTD_H
-#define _SHERLOCK_XML_DTD_H
-
-#include "sherlock/xml/xml.h"
-
-struct xml_dtd {
- struct mempool *pool; /* Memory pool where to allocate DTD */
- slist ents; /* Link list of general entities */
- slist pents; /* Link list of parameter entities */
- slist notns; /* Link list of notations */
- slist elems; /* Link list of elements */
- void *tab_ents; /* Hash table of general entities */
- void *tab_pents; /* Hash table of parameter entities */
- void *tab_notns; /* Hash table of notations */
- void *tab_elems; /* Hash table of elements */
- void *tab_enodes; /* Hash table of element sons */
- void *tab_attrs; /* Hash table of element attributes */
- void *tab_evals; /* Hash table of enumerated attribute values */
- void *tab_enotns; /* hash table of enumerated attribute notations */
-};
-
-/* Notations */
-
-enum xml_dtd_notn_flags {
- XML_DTD_NOTN_DECLARED = 0x1, /* The notation has been declared (internal usage) */
-};
-
-struct xml_dtd_notn {
- snode n; /* Node in xml_dtd.notns */
- uns flags; /* XML_DTD_NOTN_x */
- char *name; /* Notation name */
- char *system_id; /* External ID */
- char *public_id;
- void *user; /* User-defined */
-};
-
-struct xml_dtd_notn *xml_dtd_find_notn(struct xml_context *ctx, char *name);
-
-/* Entities */
-
-enum xml_dtd_entity_flags {
- XML_DTD_ENTITY_DECLARED = 0x1, /* The entity has been declared (internal usage) */
- XML_DTD_ENTITY_VISITED = 0x2, /* Cycle detection (internal usage) */
- XML_DTD_ENTITY_PARAMETER = 0x4, /* Parameter entity, general otherwise */
- XML_DTD_ENTITY_EXTERNAL = 0x8, /* External entity, internal otherwise */
- XML_DTD_ENTITY_UNPARSED = 0x10, /* Unparsed entity, parsed otherwise */
- XML_DTD_ENTITY_TRIVIAL = 0x20, /* Replacement text is a sequence of characters and character references */
-};
-
-struct xml_dtd_entity {
- snode n; /* Node in xml_dtd.[gp]ents */
- uns flags; /* XML_DTD_ENT_x */
- char *name; /* Entity name */
- char *text; /* Replacement text / expanded replacement text (XML_DTD_ENT_TRIVIAL) */
- uns len; /* Text length */
- char *system_id; /* External ID */
- char *public_id;
- struct xml_dtd_notn *notn; /* Notation (XML_DTD_ENT_UNPARSED only) */
- void *user; /* User-defined */
-};
-
-struct xml_dtd_entity *xml_dtd_find_entity(struct xml_context *ctx, char *name);
-
-/* Elements */
-
-enum xml_dtd_elem_flags {
- XML_DTD_ELEM_DECLARED = 0x1, /* The element has been declared (internal usage) */
-};
-
-enum xml_dtd_elem_type {
- XML_DTD_ELEM_EMPTY,
- XML_DTD_ELEM_ANY,
- XML_DTD_ELEM_MIXED,
- XML_DTD_ELEM_CHILDREN,
-};
-
-struct xml_dtd_elem {
- snode n;
- uns flags;
- uns type;
- char *name;
- struct xml_dtd_elem_node *node;
- slist attrs;
- void *user; /* User-defined */
-};
-
-struct xml_dtd_elem_node {
- snode n;
- struct xml_dtd_elem_node *parent;
- struct xml_dtd_elem *elem;
- slist sons;
- uns type;
- uns occur;
- void *user; /* User-defined */
-};
-
-enum xml_dtd_elem_node_type {
- XML_DTD_ELEM_PCDATA,
- XML_DTD_ELEM_SEQ,
- XML_DTD_ELEM_OR,
-};
-
-enum xml_dtd_elem_node_occur {
- XML_DTD_ELEM_OCCUR_ONCE,
- XML_DTD_ELEM_OCCUR_OPT,
- XML_DTD_ELEM_OCCUR_MULT,
- XML_DTD_ELEM_OCCUR_PLUS,
-};
-
-struct xml_dtd_elem *xml_dtd_find_elem(struct xml_context *ctx, char *name);
-
-/* Attributes */
-
-enum xml_dtd_attr_default {
- XML_ATTR_NONE,
- XML_ATTR_REQUIRED,
- XML_ATTR_IMPLIED,
- XML_ATTR_FIXED,
-};
-
-enum xml_dtd_attr_type {
- XML_ATTR_CDATA,
- XML_ATTR_ID,
- XML_ATTR_IDREF,
- XML_ATTR_IDREFS,
- XML_ATTR_ENTITY,
- XML_ATTR_ENTITIES,
- XML_ATTR_NMTOKEN,
- XML_ATTR_NMTOKENS,
- XML_ATTR_ENUM,
- XML_ATTR_NOTATION,
-};
-
-struct xml_dtd_attr {
- snode n;
- char *name; /* Attribute name */
- struct xml_dtd_elem *elem; /* Owner element */
- uns type; /* See enum xml_dtd_attr_type */
- uns default_mode; /* See enum xml_dtd_attr_default */
- char *default_value; /* The default value defined in DTD (or NULL) */
-};
-
-struct xml_dtd_eval {
- struct xml_dtd_attr *attr;
- char *val;
-};
-
-struct xml_dtd_enotn {
- struct xml_dtd_attr *attr;
- struct xml_dtd_notn *notn;
-};
-
-void xml_dtd_init(struct xml_context *ctx);
-void xml_dtd_cleanup(struct xml_context *ctx);
-void xml_dtd_finish(struct xml_context *ctx);
-
-struct xml_dtd_attr *xml_dtd_find_attr(struct xml_context *ctx, struct xml_dtd_elem *elem, char *name);
-
-#endif
+++ /dev/null
-/*
- * Sherlock Library -- A simple XML parser
- *
- * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
- *
- * This software may be freely distributed and used according to the terms
- * of the GNU Lesser General Public License.
- */
-
-#ifndef _SHERLOCK_XML_INTERNALS_H
-#define _SHERLOCK_XML_INTERNALS_H
-
-#include "sherlock/xml/xml.h"
-#include "sherlock/xml/dtd.h"
-
-/*** Debugging ***/
-
-#ifdef LOCAL_DEBUG
-#define TRACE(c, f, p...) do { DBG("XML %u: " f, xml_row(c), ##p); } while(0)
-#else
-#define TRACE(c, f, p...) do {} while(0)
-#endif
-
-/*** Error handling ***/
-
-void NONRET xml_throw(struct xml_context *ctx);
-
-/*** Memory management ***/
-
-struct xml_stack {
- struct xml_stack *next;
- struct mempool_state state;
- uns flags;
-};
-
-static inline void *
-xml_do_push(struct xml_context *ctx, uns size)
-{
- /* Saves ctx->stack and ctx->flags state */
- struct mempool_state state;
- mp_save(ctx->stack, &state);
- struct xml_stack *s = mp_alloc(ctx->stack, size);
- s->state = state;
- s->flags = ctx->flags;
- s->next = ctx->stack_list;
- ctx->stack_list = s;
- return s;
-}
-
-static inline void
-xml_do_pop(struct xml_context *ctx, struct xml_stack *s)
-{
- /* Restore ctx->stack and ctx->flags state */
- ctx->stack_list = s->next;
- ctx->flags = s->flags;
- mp_restore(ctx->stack, &s->state);
-}
-
-static inline void
-xml_push(struct xml_context *ctx)
-{
- TRACE(ctx, "push");
- xml_do_push(ctx, sizeof(struct xml_stack));
-}
-
-static inline void
-xml_pop(struct xml_context *ctx)
-{
- TRACE(ctx, "pop");
- ASSERT(ctx->stack_list);
- xml_do_pop(ctx, ctx->stack_list);
-}
-
-struct xml_dom_stack {
- struct xml_stack stack;
- struct mempool_state state;
-};
-
-static inline struct xml_node *
-xml_push_dom(struct xml_context *ctx, struct mempool_state *state)
-{
- /* Create a new DOM node */
- TRACE(ctx, "push_dom");
- struct xml_dom_stack *s = xml_do_push(ctx, sizeof(*s));
- if (state)
- s->state = *state;
- else
- mp_save(ctx->pool, &s->state);
- struct xml_node *n = mp_alloc(ctx->pool, sizeof(*n));
- n->user = NULL;
- if (n->parent = ctx->node)
- clist_add_tail(&n->parent->sons, &n->n);
- return ctx->node = n;
-}
-
-static inline void
-xml_pop_dom(struct xml_context *ctx, uns free)
-{
- /* Leave DOM subtree */
- TRACE(ctx, "pop_dom");
- ASSERT(ctx->node);
- struct xml_node *p = ctx->node->parent;
- struct xml_dom_stack *s = (void *)ctx->stack_list;
- if (free)
- {
- /* See xml_pop_element() for cleanup of attribute hash table */
- if (p)
- clist_remove(&ctx->node->n);
- mp_restore(ctx->pool, &s->state);
- }
- ctx->node = p;
- xml_do_pop(ctx, &s->stack);
-}
-
-#define XML_HASH_HDR_SIZE ALIGN_TO(sizeof(void *), CPU_STRUCT_ALIGN)
-#define XML_HASH_GIVE_ALLOC struct HASH_PREFIX(table); \
- static inline void *HASH_PREFIX(alloc)(struct HASH_PREFIX(table) *t, uns size) \
- { return mp_alloc(*(void **)((void *)t - XML_HASH_HDR_SIZE), size); } \
- static inline void HASH_PREFIX(free)(struct HASH_PREFIX(table) *t UNUSED, void *p UNUSED) {}
-
-void *xml_hash_new(struct mempool *pool, uns size);
-
-void xml_spout_chars(struct fastbuf *fb);
-
-/*** Reading of document/external entities ***/
-
-void NONRET xml_fatal_nested(struct xml_context *ctx);
-
-static inline void
-xml_inc(struct xml_context *ctx)
-{
- /* Called after the first character of a block */
- TRACE(ctx, "inc");
- ctx->depth++;
-}
-
-static inline void
-xml_dec(struct xml_context *ctx)
-{
- /* Called after the last character of a block */
- TRACE(ctx, "dec");
- if (unlikely(!ctx->depth--))
- xml_fatal_nested(ctx);
-}
-
-#include "obj/sherlock/xml/unicat.h"
-
-static inline uns
-xml_char_cat(uns c)
-{
- if (c < 0x10000)
- return 1U << xml_char_tab1[(c & 0xff) + xml_char_tab2[c >> 8]];
- else if (likely(c < 0x110000))
- return 1U << xml_char_tab3[c >> 16];
- else
- return 1;
-}
-
-static inline uns
-xml_ascii_cat(uns c)
-{
- return xml_char_tab1[c];
-}
-
-struct xml_source *xml_push_source(struct xml_context *ctx);
-void xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent);
-
-void xml_refill(struct xml_context *ctx);
-
-static inline uns
-xml_peek_char(struct xml_context *ctx)
-{
- if (ctx->bptr == ctx->bstop)
- xml_refill(ctx);
- return ctx->bptr[0];
-}
-
-static inline uns
-xml_peek_cat(struct xml_context *ctx)
-{
- if (ctx->bptr == ctx->bstop)
- xml_refill(ctx);
- return ctx->bptr[1];
-}
-
-static inline uns
-xml_get_char(struct xml_context *ctx)
-{
- uns c = xml_peek_char(ctx);
- ctx->bptr += 2;
- return c;
-}
-
-static inline uns
-xml_get_cat(struct xml_context *ctx)
-{
- uns c = xml_peek_cat(ctx);
- ctx->bptr += 2;
- return c;
-}
-
-static inline uns
-xml_last_char(struct xml_context *ctx)
-{
- return ctx->bptr[-2];
-}
-
-static inline uns
-xml_last_cat(struct xml_context *ctx)
-{
- return ctx->bptr[-1];
-}
-
-static inline uns
-xml_skip_char(struct xml_context *ctx)
-{
- uns c = ctx->bptr[0];
- ctx->bptr += 2;
- return c;
-}
-
-static inline uns
-xml_unget_char(struct xml_context *ctx)
-{
- return *(ctx->bptr -= 2);
-}
-
-void xml_sources_cleanup(struct xml_context *ctx);
-
-/*** Parsing ***/
-
-void NONRET xml_fatal_expected(struct xml_context *ctx, uns c);
-void NONRET xml_fatal_expected_white(struct xml_context *ctx);
-void NONRET xml_fatal_expected_quot(struct xml_context *ctx);
-
-static inline uns
-xml_parse_white(struct xml_context *ctx, uns mandatory)
-{
- /* mandatory=1 -> S ::= (#x20 | #x9 | #xD | #xA)+
- * mandatory=0 -> S? */
- uns cnt = 0;
- while (xml_peek_cat(ctx) & XML_CHAR_WHITE)
- {
- xml_skip_char(ctx);
- cnt++;
- }
- if (unlikely(mandatory && !cnt))
- xml_fatal_expected_white(ctx);
- return cnt;
-}
-
-static inline void
-xml_parse_char(struct xml_context *ctx, uns c)
-{
- /* Consumes a given Unicode character */
- if (unlikely(c != xml_get_char(ctx)))
- xml_fatal_expected(ctx, c);
-}
-
-static inline void
-xml_parse_seq(struct xml_context *ctx, const char *seq)
-{
- /* Consumes a given sequence of ASCII characters */
- while (*seq)
- xml_parse_char(ctx, *seq++);
-}
-
-void xml_parse_eq(struct xml_context *ctx);
-
-static inline uns
-xml_parse_quote(struct xml_context *ctx)
-{
- /* "'" | '"' */
- uns c = xml_get_char(ctx);
- if (unlikely(c != '\'' && c != '\"'))
- xml_fatal_expected_quot(ctx);
- return c;
-}
-
-char *xml_parse_name(struct xml_context *ctx, struct mempool *pool);
-void xml_skip_name(struct xml_context *ctx);
-char *xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool);
-
-char *xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool);
-char *xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool);
-
-uns xml_parse_char_ref(struct xml_context *ctx);
-void xml_parse_pe_ref(struct xml_context *ctx);
-
-char *xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr);
-
-void xml_skip_internal_subset(struct xml_context *ctx);
-void xml_parse_notation_decl(struct xml_context *ctx);
-void xml_parse_entity_decl(struct xml_context *ctx);
-void xml_parse_element_decl(struct xml_context *ctx);
-void xml_parse_attr_list_decl(struct xml_context *ctx);
-
-void xml_push_comment(struct xml_context *ctx);
-void xml_pop_comment(struct xml_context *ctx);
-void xml_skip_comment(struct xml_context *ctx);
-
-void xml_push_pi(struct xml_context *ctx);
-void xml_pop_pi(struct xml_context *ctx);
-void xml_skip_pi(struct xml_context *ctx);
-
-void xml_attrs_table_init(struct xml_context *ctx);
-void xml_attrs_table_cleanup(struct xml_context *ctx);
-
-void xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value);
-
-#endif
+++ /dev/null
-# pkg-config metadata for libshxml
-
-libdir=@LIBDIR@
-incdir=.
-
-Name: libshxml
-Description: XML parser for Sherlock project
-Version: @SHERLOCK_VERSION@
-Cflags: -I${incdir}
-Libs: -L${libdir} -lshxml
-Requires: @DEPS@
+++ /dev/null
-/*
- * Sherlock Library -- A simple XML parser
- *
- * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
- *
- * This software may be freely distributed and used according to the terms
- * of the GNU Lesser General Public License.
- */
-
-#undef LOCAL_DEBUG
-
-#include "sherlock/sherlock.h"
-#include "sherlock/xml/xml.h"
-#include "sherlock/xml/dtd.h"
-#include "sherlock/xml/internals.h"
-#include "ucw/fastbuf.h"
-#include "ucw/ff-unicode.h"
-#include "ucw/unicode.h"
-#include "ucw/chartype.h"
-#include "ucw/hashfunc.h"
-
-#include <setjmp.h>
-
-/*** Basic parsing ***/
-
-void NONRET
-xml_fatal_expected(struct xml_context *ctx, uns c)
-{
- if (c >= 32 && c < 128)
- xml_fatal(ctx, "Expected '%c'", c);
- else
- xml_fatal(ctx, "Expected U+%04x", c);
-}
-
-void NONRET
-xml_fatal_expected_white(struct xml_context *ctx)
-{
- xml_fatal(ctx, "Expected a white space");
-}
-
-void NONRET
-xml_fatal_expected_quot(struct xml_context *ctx)
-{
- xml_fatal(ctx, "Expected a quotation mark");
-}
-
-void
-xml_parse_eq(struct xml_context *ctx)
-{
- /* Eq ::= S? '=' S? */
- xml_parse_white(ctx, 0);
- xml_parse_char(ctx, '=');
- xml_parse_white(ctx, 0);
-}
-
-/*** Names and nmtokens ***/
-
-static char *
-xml_parse_string(struct xml_context *ctx, struct mempool *pool, uns first_cat, uns next_cat, char *err)
-{
- char *p = mp_start_noalign(pool, 1);
- if (unlikely(!(xml_peek_cat(ctx) & first_cat)))
- xml_fatal(ctx, "%s", err);
- do
- {
- p = mp_spread(pool, p, 5);
- p = utf8_32_put(p, xml_skip_char(ctx));
- }
- while (xml_peek_cat(ctx) & next_cat);
- *p++ = 0;
- return mp_end(pool, p);
-}
-
-static void
-xml_skip_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err)
-{
- if (unlikely(!(xml_get_cat(ctx) & first_cat)))
- xml_fatal(ctx, "%s", err);
- while (xml_peek_cat(ctx) & next_cat)
- xml_skip_char(ctx);
-}
-
-char *
-xml_parse_name(struct xml_context *ctx, struct mempool *pool)
-{
- /* Name ::= NameStartChar (NameChar)* */
- return xml_parse_string(ctx, pool, ctx->cat_sname, ctx->cat_name, "Expected a name");
-}
-
-void
-xml_skip_name(struct xml_context *ctx)
-{
- xml_skip_string(ctx, ctx->cat_sname, ctx->cat_name, "Expected a name");
-}
-
-char *
-xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool)
-{
- /* Nmtoken ::= (NameChar)+ */
- return xml_parse_string(ctx, pool, ctx->cat_name, ctx->cat_name, "Expected a nmtoken");
-}
-
-/*** Simple literals ***/
-
-char *
-xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool)
-{
- /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */
- char *p = mp_start_noalign(pool, 1);
- uns q = xml_parse_quote(ctx), c;
- while ((c = xml_get_char(ctx)) != q)
- {
- p = mp_spread(pool, p, 5);
- p = utf8_32_put(p, c);
- }
- *p++ = 0;
- return mp_end(pool, p);
-}
-
-char *
-xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool)
-{
- /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */
- char *p = mp_start_noalign(pool, 1);
- uns q = xml_parse_quote(ctx), c;
- while ((c = xml_get_char(ctx)) != q)
- {
- if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID)))
- xml_fatal(ctx, "Expected a pubid character");
- p = mp_spread(pool, p, 2);
- *p++ = c;
- }
- *p++ = 0;
- return mp_end(pool, p);
-}
-
-/*** Comments ***/
-
-void
-xml_push_comment(struct xml_context *ctx)
-{
- TRACE(ctx, "push_comment");
- /* Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
- * Already parsed: '<!-' */
- xml_parse_char(ctx, '-');
- struct xml_node *n = xml_push_dom(ctx, NULL);
- n->type = XML_NODE_COMMENT;
- char *p = mp_start_noalign(ctx->pool, 6);
- while (1)
- {
- if (xml_get_char(ctx) == '-')
- if (xml_get_char(ctx) == '-')
- break;
- else
- *p++ = '-';
- p = utf8_32_put(p, xml_last_char(ctx));
- p = mp_spread(ctx->pool, p, 6);
- }
- xml_parse_char(ctx, '>');
- *p = 0;
- n->len = p - (char *)mp_ptr(ctx->pool);
- n->text = mp_end(ctx->pool, p + 1);
- if ((ctx->flags & XML_REPORT_COMMENTS) && ctx->h_comment)
- ctx->h_comment(ctx);
-}
-
-void
-xml_pop_comment(struct xml_context *ctx)
-{
- xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_COMMENTS));
- xml_dec(ctx);
- TRACE(ctx, "pop_comment");
-}
-
-void
-xml_skip_comment(struct xml_context *ctx)
-{
- TRACE(ctx, "skip_comment");
- xml_parse_char(ctx, '-');
- while (xml_get_char(ctx) != '-' || xml_get_char(ctx) != '-');
- xml_parse_char(ctx, '>');
- xml_dec(ctx);
-}
-
-/*** Processing instructions ***/
-
-void
-xml_push_pi(struct xml_context *ctx)
-{
- TRACE(ctx, "push_pi");
- /* Parses a PI to ctx->value and ctx->name:
- * PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
- * PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
- * Already parsed: '<?' */
- struct xml_node *n = xml_push_dom(ctx, NULL);
- n->type = XML_NODE_PI;
- n->name = xml_parse_name(ctx, ctx->pool);
- if (unlikely(!strcasecmp(n->name, "xml")))
- xml_error(ctx, "Reserved PI target");
- char *p = mp_start_noalign(ctx->pool, 5);
- if (!xml_parse_white(ctx, 0))
- xml_parse_seq(ctx, "?>");
- else
- while (1)
- {
- if (xml_get_char(ctx) == '?')
- if (xml_peek_char(ctx) == '>')
- {
- xml_skip_char(ctx);
- break;
- }
- else
- *p++ = '?';
- else
- p = utf8_32_put(p, xml_last_char(ctx));
- p = mp_spread(ctx->pool, p, 5);
- }
- *p = 0;
- n->len = p - (char *)mp_ptr(ctx->pool);
- n->text = mp_end(ctx->pool, p + 1);
- if ((ctx->flags & XML_REPORT_PIS) && ctx->h_pi)
- ctx->h_pi(ctx);
-}
-
-void
-xml_pop_pi(struct xml_context *ctx)
-{
- xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_PIS));
- xml_dec(ctx);
- TRACE(ctx, "pop_pi");
-}
-
-void
-xml_skip_pi(struct xml_context *ctx)
-{
- TRACE(ctx, "skip_pi");
- if (ctx->flags & XML_VALIDATING)
- {
- struct mempool_state state;
- mp_save(ctx->stack, &state);
- if (unlikely(!strcasecmp(xml_parse_name(ctx, ctx->stack), "xml")))
- xml_error(ctx, "Reserved PI target");
- mp_restore(ctx->stack, &state);
- if (!xml_parse_white(ctx, 0))
- {
- xml_parse_seq(ctx, "?>");
- xml_dec(ctx);
- return;
- }
- }
- while (1)
- if (xml_get_char(ctx) == '?')
- if (xml_peek_char(ctx) == '>')
- break;
- xml_skip_char(ctx);
- xml_dec(ctx);
-}
-
-/*** Character references ***/
-
-uns
-xml_parse_char_ref(struct xml_context *ctx)
-{
- TRACE(ctx, "parse_char_ref");
- /* CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
- * Already parsed: '&#' */
- uns v = 0;
- if (xml_get_char(ctx) == 'x')
- {
- if (!(xml_get_cat(ctx) & XML_CHAR_XDIGIT))
- {
- xml_error(ctx, "Expected a hexadecimal value of character reference");
- goto recover;
- }
- do
- {
- v = (v << 4) + Cxvalue(xml_last_char(ctx));
- }
- while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_XDIGIT));
- }
- else
- {
- if (!(xml_last_cat(ctx) & XML_CHAR_DIGIT))
- {
- xml_error(ctx, "Expected a numeric value of character reference");
- goto recover;
- }
- do
- {
- v = v * 10 + xml_last_char(ctx) - '0';
- }
- while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT));
- }
- uns cat = xml_char_cat(v);
- if (!(cat & ctx->cat_unrestricted))
- {
- xml_error(ctx, "Character reference out of range");
- goto recover;
- }
- if (xml_last_char(ctx) == ';')
- {
- xml_dec(ctx);
- return v;
- }
- xml_error(ctx, "Expected ';'");
-recover:
- while (xml_last_char(ctx) != ';')
- xml_get_char(ctx);
- xml_dec(ctx);
- return UNI_REPLACEMENT;
-}
-
-/*** References to general entities ***/
-
-static void
-xml_parse_ref(struct xml_context *ctx)
-{
- /* Reference ::= EntityRef | CharRef
- * EntityRef ::= '&' Name ';'
- * Already parsed: '&' */
- struct fastbuf *out = &ctx->chars;
- if (xml_peek_char(ctx) == '#')
- {
- xml_skip_char(ctx);
- bput_utf8_32(out, xml_parse_char_ref(ctx));
- }
- else
- {
- TRACE(ctx, "parse_ge_ref");
- struct mempool_state state;
- mp_save(ctx->stack, &state);
- char *name = xml_parse_name(ctx, ctx->stack);
- xml_parse_char(ctx, ';');
- struct xml_dtd_entity *ent = xml_dtd_find_entity(ctx, name);
- if (!ent)
- {
- xml_error(ctx, "Unknown entity &%s;", name);
- bputc(out, '&');
- bputs(out, name);
- bputc(out, ';');
- }
- else if (ent->flags & XML_DTD_ENTITY_TRIVIAL)
- {
- TRACE(ctx, "Trivial entity &%s;", name);
- bputs(out, ent->text);
- }
- else
- {
- TRACE(ctx, "Pushed entity &%s;", name);
- mp_restore(ctx->stack, &state);
- xml_dec(ctx);
- xml_push_entity(ctx, ent);
- return;
- }
- mp_restore(ctx->stack, &state);
- xml_dec(ctx);
- }
-}
-
-/*** Character data ***/
-
-void
-xml_spout_chars(struct fastbuf *fb)
-{
- if (fb->bptr < fb->bufend)
- return;
- struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb);
- struct mempool *pool = ctx->pool;
- if (fb->bufend != fb->buffer)
- {
- TRACE(ctx, "growing chars");
- uns len = fb->bufend - fb->buffer;
- uns reported = fb->bstop - fb->buffer;
- fb->buffer = mp_expand(pool);
- fb->bufend = fb->buffer + mp_avail(pool);
- fb->bptr = fb->buffer + len;
- fb->bstop = fb->buffer + reported;
- }
- else
- {
- TRACE(ctx, "starting chars");
- mp_save(pool, &ctx->chars_state);
- fb->bptr = fb->buffer = fb->bstop = mp_start_noalign(pool, 2);
- fb->bufend = fb->buffer + mp_avail(pool) - 1;
- }
-}
-
-static inline uns
-xml_end_chars(struct xml_context *ctx, char **out)
-{
- struct fastbuf *fb = &ctx->chars;
- uns len = fb->bptr - fb->buffer;
- if (len)
- {
- TRACE(ctx, "ending chars");
- *fb->bptr = 0;
- *out = mp_end(ctx->pool, fb->bptr + 1);
- fb->bufend = fb->bstop = fb->bptr = fb->buffer;
- }
- return len;
-}
-
-static inline uns
-xml_report_chars(struct xml_context *ctx, char **out)
-{
- struct fastbuf *fb = &ctx->chars;
- uns len = fb->bptr - fb->buffer;
- if (len)
- {
- *fb->bptr = 0;
- *out = fb->bstop;
- fb->bstop = fb->bptr;
- }
- return len;
-}
-
-static inline uns
-xml_flush_chars(struct xml_context *ctx)
-{
- char *text, *rtext;
- uns len = xml_end_chars(ctx, &text), rlen;
- if (len)
- {
- if (ctx->flags & XML_NO_CHARS)
- {
- if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_ignorable)
- ctx->h_ignorable(ctx, text, len);
- mp_restore(ctx->pool, &ctx->chars_state);
- return 0;
- }
- if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext)))
- ctx->h_block(ctx, rtext, rlen);
- if (!(ctx->flags & XML_ALLOC_CHARS) && !(ctx->flags & XML_REPORT_CHARS))
- {
- mp_restore(ctx->pool, &ctx->chars_state);
- return 0;
- }
- struct xml_node *n = xml_push_dom(ctx, &ctx->chars_state);
- n->type = XML_NODE_CHARS;
- n->text = text;
- n->len = len;
- if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_chars)
- ctx->h_chars(ctx);
- }
- return len;
-}
-
-static inline void
-xml_pop_chars(struct xml_context *ctx)
-{
- xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS));
- TRACE(ctx, "pop_chars");
-}
-
-static inline void
-xml_append_chars(struct xml_context *ctx)
-{
- TRACE(ctx, "append_chars");
- struct fastbuf *out = &ctx->chars;
- if (ctx->flags & XML_NO_CHARS)
- while (xml_get_char(ctx) != '<')
- if (xml_last_cat(ctx) & XML_CHAR_WHITE)
- bput_utf8_32(out, xml_last_char(ctx));
- else
- {
- xml_error(ctx, "This element must not contain character data");
- while (xml_get_char(ctx) != '<');
- break;
- }
- else
- while (xml_get_char(ctx) != '<')
- if (xml_last_char(ctx) == '&')
- {
- xml_inc(ctx);
- xml_parse_ref(ctx);
- }
- else
- bput_utf8_32(out, xml_last_char(ctx));
- xml_unget_char(ctx);
-}
-
-/*** CDATA sections ***/
-
-static void
-xml_skip_cdata(struct xml_context *ctx)
-{
- TRACE(ctx, "skip_cdata");
- xml_parse_seq(ctx, "CDATA[");
- while (xml_get_char(ctx) != ']' || xml_get_char(ctx) != ']' || xml_get_char(ctx) != '>');
- xml_dec(ctx);
-}
-
-static void
-xml_append_cdata(struct xml_context *ctx)
-{
- /* CDSect :== '<![CDATA[' (Char* - (Char* ']]>' Char*)) ']]>'
- * Already parsed: '<![' */
- TRACE(ctx, "append_cdata");
- if (ctx->flags & XML_NO_CHARS)
- {
- xml_error(ctx, "This element must not contain CDATA");
- xml_skip_cdata(ctx);
- return;
- }
- xml_parse_seq(ctx, "CDATA[");
- struct fastbuf *out = &ctx->chars;
- uns rlen;
- char *rtext;
- if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext)))
- ctx->h_block(ctx, rtext, rlen);
- while (1)
- {
- if (xml_get_char(ctx) == ']')
- {
- if (xml_get_char(ctx) == ']')
- if (xml_get_char(ctx) == '>')
- break;
- else
- bputc(out, ']');
- bputc(out, ']');
- }
- bput_utf8_32(out, xml_last_char(ctx));
- }
- if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_cdata && (rlen = xml_report_chars(ctx, &rtext)))
- ctx->h_cdata(ctx, rtext, rlen);
- xml_dec(ctx);
-}
-
-/*** Attribute values ***/
-
-char *
-xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr UNUSED)
-{
- TRACE(ctx, "parse_attr_value");
- /* AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'" */
- /* FIXME: -- check value constrains / normalize leading/trailing WS and repeated WS */
- struct mempool_state state;
- uns quote = xml_parse_quote(ctx);
- mp_save(ctx->stack, &state);
- struct fastbuf *out = &ctx->chars;
- struct xml_source *src = ctx->src;
- while (1)
- {
- uns c = xml_get_char(ctx);
- if (c == '&')
- {
- xml_inc(ctx);
- xml_parse_ref(ctx);
- }
- else if (c == quote && src == ctx->src)
- break;
- else if (c == '<')
- xml_error(ctx, "Attribute value must not contain '<'");
- else if (xml_last_cat(ctx) & XML_CHAR_WHITE)
- bputc(out, ' ');
- else
- bput_utf8_32(out, c);
- }
- mp_restore(ctx->stack, &state);
- char *text;
- return xml_end_chars(ctx, &text) ? text : "";
-}
-
-uns
-xml_normalize_white(struct xml_context *ctx UNUSED, char *text)
-{
- char *s = text, *d = text;
- while (*s == 0x20)
- s++;
- while (1)
- {
- while (*s & ~0x20)
- *d++ = *s++;
- if (!*s)
- break;
- while (*++s == 0x20);
- *d++ = 0x20;
- }
- if (d != text && d[-1] == 0x20)
- d--;
- *d = 0;
- return d - text;
-}
-
-/*** Attributes ***/
-
-struct xml_attrs_table;
-
-static inline uns
-xml_attrs_hash(struct xml_attrs_table *t UNUSED, struct xml_node *e, char *n)
-{
- return hash_pointer(e) ^ hash_string(n);
-}
-
-static inline int
-xml_attrs_eq(struct xml_attrs_table *t UNUSED, struct xml_node *e1, char *n1, struct xml_node *e2, char *n2)
-{
- return (e1 == e2) && !strcmp(n1, n2);
-}
-
-static inline void
-xml_attrs_init_key(struct xml_attrs_table *t UNUSED, struct xml_attr *a, struct xml_node *e, char *name)
-{
- a->elem = e;
- a->name = name;
- a->val = NULL;
- a->user = NULL;
- slist_add_tail(&e->attrs, &a->n);
-}
-
-#define HASH_PREFIX(x) xml_attrs_##x
-#define HASH_NODE struct xml_attr
-#define HASH_KEY_COMPLEX(x) x elem, x name
-#define HASH_KEY_DECL struct xml_node *elem, char *name
-#define HASH_TABLE_DYNAMIC
-#define HASH_GIVE_EQ
-#define HASH_GIVE_HASHFN
-#define HASH_GIVE_INIT_KEY
-#define HASH_WANT_CLEANUP
-#define HASH_WANT_REMOVE
-#define HASH_WANT_LOOKUP
-#define HASH_WANT_FIND
-#define HASH_GIVE_ALLOC
-XML_HASH_GIVE_ALLOC
-#include "ucw/hashtable.h"
-
-static void
-xml_parse_attr(struct xml_context *ctx)
-{
- TRACE(ctx, "parse_attr");
- /* Attribute ::= Name Eq AttValue */
- struct xml_node *e = ctx->node;
- char *n = xml_parse_name(ctx, ctx->pool);
- struct xml_attr *a = xml_attrs_lookup(ctx->tab_attrs, e, n);
- xml_parse_eq(ctx);
- char *v = xml_parse_attr_value(ctx, NULL);
- if (a->val)
- {
- xml_error(ctx, "Attribute %s is not unique in element <%s>", n, e->name);
- return;
- }
- a->val = v;
- if (!e->dtd)
- a->dtd = NULL;
- else if (!(a->dtd = xml_dtd_find_attr(ctx, e->dtd, a->name)))
- xml_error(ctx, "Undefined attribute %s in element <%s>", n, e->name);
- else
- xml_validate_attr(ctx, a->dtd, a->val);
-}
-
-struct xml_attr *
-xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name)
-{
- return xml_attrs_find(ctx->tab_attrs, node, name);
-}
-
-char *
-xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name)
-{
- struct xml_attr *attr = xml_attrs_find(ctx->tab_attrs, node, name);
- if (attr)
- return attr->val;
- if (!node->dtd)
- return NULL;
- struct xml_dtd_attr *dtd = xml_dtd_find_attr(ctx, node->dtd, name);
- return dtd ? dtd->default_value : NULL;
-}
-
-void
-xml_attrs_table_init(struct xml_context *ctx)
-{
- xml_attrs_init(ctx->tab_attrs = xml_hash_new(ctx->pool, sizeof(struct xml_attrs_table)));
-}
-
-void
-xml_attrs_table_cleanup(struct xml_context *ctx)
-{
- xml_attrs_cleanup(ctx->tab_attrs);
-}
-
-/*** Elements ***/
-
-static uns
-xml_validate_element(struct xml_dtd_elem_node *root, struct xml_dtd_elem *elem)
-{
- if (root->elem)
- return elem == root->elem;
- else
- SLIST_FOR_EACH(struct xml_dtd_elem_node *, son, root->sons)
- if (xml_validate_element(son, elem))
- return 1;
- return 0;
-}
-
-static void
-xml_push_element(struct xml_context *ctx)
-{
- TRACE(ctx, "push_element");
- /* EmptyElemTag | STag
- * EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
- * STag ::= '<' Name (S Attribute)* S? '>'
- * Already parsed: '<' */
- struct xml_node *e = xml_push_dom(ctx, NULL);
- clist_init(&e->sons);
- e->type = XML_NODE_ELEM;
- e->name = xml_parse_name(ctx, ctx->pool);
- slist_init(&e->attrs);
- if (!e->parent)
- {
- ctx->dom = e;
- if (ctx->doctype && strcmp(e->name, ctx->doctype))
- xml_error(ctx, "The root element <%s> does not match the document type <%s>", e->name, ctx->doctype);
- }
- if (!ctx->dtd)
- e->dtd = NULL;
- else if (!(e->dtd = xml_dtd_find_elem(ctx, e->name)))
- xml_error(ctx, "Undefined element <%s>", e->name);
- else
- {
- struct xml_dtd_elem *dtd = e->dtd, *parent_dtd = e->parent ? e->parent->dtd : NULL;
- if (dtd->type == XML_DTD_ELEM_MIXED)
- ctx->flags &= ~XML_NO_CHARS;
- else
- ctx->flags |= XML_NO_CHARS;
- if (parent_dtd)
- if (parent_dtd->type == XML_DTD_ELEM_EMPTY)
- xml_error(ctx, "Empty element must not contain children");
- else if (parent_dtd->type != XML_DTD_ELEM_ANY)
- {
- // FIXME: validate regular expressions
- if (!xml_validate_element(parent_dtd->node, dtd))
- xml_error(ctx, "Unexpected element <%s>", e->name);
- }
- }
- while (1)
- {
- uns white = xml_parse_white(ctx, 0);
- uns c = xml_get_char(ctx);
- if (c == '/')
- {
- xml_parse_char(ctx, '>');
- ctx->flags |= XML_EMPTY_ELEM_TAG;
- break;
- }
- else if (c == '>')
- break;
- else if (!white)
- xml_fatal_expected_white(ctx);
- xml_unget_char(ctx);
- xml_parse_attr(ctx);
- }
- if (e->dtd)
- SLIST_FOR_EACH(struct xml_dtd_attr *, a, e->dtd->attrs)
- if (a->default_mode == XML_ATTR_REQUIRED)
- {
- if (!xml_attrs_find(ctx->tab_attrs, e, a->name))
- xml_error(ctx, "Missing required attribute %s in element <%s>", a->name, e->name);
- }
- else if (a->default_mode != XML_ATTR_IMPLIED && ctx->flags & XML_ALLOC_DEFAULT_ATTRS)
- {
- struct xml_attr *attr = xml_attrs_lookup(ctx->tab_attrs, e, a->name);
- if (!attr->val)
- attr->val = a->default_value;
- }
- if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_stag)
- ctx->h_stag(ctx);
-}
-
-/*
- * Leave the current element (ctx->node): report the end tag to the h_etag
- * handler when tag reporting is enabled, then pop the node from the DOM stack.
- * When XML_ALLOC_TAGS is not set, the subtree is being discarded, so all
- * attributes of the element and of its descendant elements are first removed
- * from the attribute hash table.
- */
-static void
-xml_pop_element(struct xml_context *ctx)
-{
- TRACE(ctx, "pop_element");
- if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_etag)
- ctx->h_etag(ctx);
- struct xml_node *e = ctx->node;
- /* Non-zero when the element is not kept in the DOM */
- uns free = !(ctx->flags & XML_ALLOC_TAGS);
- if (free)
- {
- /* Popping the root element: no DOM remains */
- if (!e->parent)
- ctx->dom = NULL;
- /* Restore hash table of attributes */
- SLIST_FOR_EACH(struct xml_attr *, a, e->attrs)
- xml_attrs_remove(ctx->tab_attrs, a);
- struct xml_node *n;
- /* Consume the list of sons from its head; the sons of each element are
- * spliced right after it, so the whole subtree is visited without recursion */
- while (n = clist_head(&e->sons))
- {
- if (n->type == XML_NODE_ELEM)
- {
- SLIST_FOR_EACH(struct xml_attr *, a, n->attrs)
- xml_attrs_remove(ctx->tab_attrs, a);
- clist_insert_list_after(&n->sons, &n->n);
- }
- clist_remove(&n->n);
- }
- }
- xml_pop_dom(ctx, free);
- xml_dec(ctx);
-}
-
-/*
- * Parse an end tag and check that its name matches the name of the currently
- * open element (ctx->node). On a mismatch an error is reported and the input
- * is skipped up to and including the next '>'.
- */
-static void
-xml_parse_etag(struct xml_context *ctx)
-{
- /* ETag ::= '</' Name S? '>'
- * Already parsed: '<' */
- struct xml_node *e = ctx->node;
- ASSERT(e);
- char *n = e->name;
- /* Compare the input with the expected name, one UTF-8 decoded character at a time */
- while (*n)
- {
- uns c;
- n = utf8_32_get(n, &c);
- if (xml_get_char(ctx) != c)
- goto recover;
- }
- xml_parse_white(ctx, 0);
- if (xml_get_char(ctx) != '>')
- {
-recover:
- /* Also reached by the goto above when the name does not match */
- xml_error(ctx, "Invalid ETag, expected </%s>", e->name);
- while (xml_get_char(ctx) != '>');
- }
- xml_dec(ctx);
-}
-
-/*** Document type declaration ***/
-
-/*
- * Parse the head of a document type declaration: the document type name and
- * the optional external identifier (SYSTEM or PUBLIC). Sets ctx->doctype,
- * ctx->system_id and ctx->public_id, records the presence of the external
- * and internal subsets in ctx->flags and finally calls the h_doctype_decl
- * handler (if any). A second declaration in one document is a fatal error.
- */
-static void
-xml_parse_doctype_decl(struct xml_context *ctx)
-{
- TRACE(ctx, "parse_doctype_decl");
- /* doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
- * Already parsed: '<!'
- * Terminated before '[' or '>' */
- if (ctx->doctype)
- xml_fatal(ctx, "Multiple document types not allowed");
- xml_parse_seq(ctx, "DOCTYPE");
- xml_parse_white(ctx, 1);
- ctx->doctype = xml_parse_name(ctx, ctx->pool);
- TRACE(ctx, "doctype=%s", ctx->doctype);
- uns c;
- /* An ExternalID can follow only after whitespace and starts with 'S' or 'P' */
- if (xml_parse_white(ctx, 0) && ((c = xml_peek_char(ctx)) == 'S' || c == 'P'))
- {
- if (c == 'S')
- {
- xml_parse_seq(ctx, "SYSTEM");
- xml_parse_white(ctx, 1);
- ctx->system_id = xml_parse_system_literal(ctx, ctx->pool);
- }
- else
- {
- xml_parse_seq(ctx, "PUBLIC");
- xml_parse_white(ctx, 1);
- ctx->public_id = xml_parse_pubid_literal(ctx, ctx->pool);
- xml_parse_white(ctx, 1);
- ctx->system_id = xml_parse_system_literal(ctx, ctx->pool);
- }
- xml_parse_white(ctx, 0);
- ctx->flags |= XML_HAS_EXTERNAL_SUBSET;
- }
- /* Consume only the opening '[' of the internal subset; its contents are left in the input */
- if (xml_peek_char(ctx) == '[')
- {
- ctx->flags |= XML_HAS_INTERNAL_SUBSET;
- xml_skip_char(ctx);
- xml_inc(ctx);
- }
- if (ctx->h_doctype_decl)
- ctx->h_doctype_decl(ctx);
-}
-
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/* DTD: Internal subset */
-
-/*
- * Parse the declarations of a DTD subset. With external == 0 the loop ends
- * at the closing ']' of the internal subset, with external != 0 it ends at
- * a '>' character. Any unrecognized markup raises a fatal error.
- */
-static void
-xml_parse_subset(struct xml_context *ctx, uns external)
-{
- // FIXME:
- // -- comments/pi have no parent
- // -- conditional sections in external subset
- // -- check correctness of parameter entities
-
- /* '[' intSubset ']'
- * intSubset ::= (markupdecl | DeclSep)*
- * Already parsed: '['
- *
- * extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)*
- */
- while (1)
- {
- xml_parse_white(ctx, 0);
- uns c = xml_get_char(ctx);
- xml_inc(ctx);
- if (c == '<')
- if ((c = xml_get_char(ctx)) == '!')
- /* Dispatch on the first letter(s) of the declaration keyword */
- switch (c = xml_get_char(ctx))
- {
- case '-':
- xml_push_comment(ctx);
- xml_pop_comment(ctx);
- break;
- case 'N':
- xml_parse_seq(ctx, "OTATION");
- xml_parse_notation_decl(ctx);
- break;
- case 'E':
- /* Either ENTITY or ELEMENT */
- if ((c = xml_get_char(ctx)) == 'N')
- {
- xml_parse_seq(ctx, "TITY");
- xml_parse_entity_decl(ctx);
- }
- else if (c == 'L')
- {
- xml_parse_seq(ctx, "EMENT");
- xml_parse_element_decl(ctx);
- }
- else
- goto invalid_markup;
- break;
- case 'A':
- xml_parse_seq(ctx, "TTLIST");
- xml_parse_attr_list_decl(ctx);
- break;
- default:
- goto invalid_markup;
- }
- else if (c == '?')
- {
- xml_push_pi(ctx);
- xml_pop_pi(ctx);
- }
- else
- goto invalid_markup;
- else if (c == '%')
- xml_parse_pe_ref(ctx);
- else if (c == ']' && !external)
- {
- break;
- }
- else if (c == '>' && external)
- {
- break;
- }
- else
- goto invalid_markup;
- }
- xml_dec(ctx);
- return;
-invalid_markup: ;
- xml_fatal(ctx, "Invalid markup in the %s subset", external ? "external" : "internal");
-}
-
-/*** The State Machine ***/
-
-/*
- * Advance the parser to the next reported state and return that state.
- *
- * The function is written as a resumable state machine: the PULL() and
- * PULL_STATE() macros return to the caller when the corresponding bit of
- * ctx->pull is set, storing the resume point in ctx->state; the next call
- * jumps back right behind the macro through the switch on ctx->state.
- * Fatal errors arrive through longjmp() to the local throw_buf and end
- * with XML_STATE_EOF.
- */
-uns
-xml_next(struct xml_context *ctx)
-{
- /* A nasty state machine */
-
-#define PULL(x) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##x; case XML_STATE_##x: ; } while (0)
-#define PULL_STATE(x, s) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##s, XML_STATE_##x; case XML_STATE_##s: ; } while (0)
-
- TRACE(ctx, "xml_next (state=%u)", ctx->state);
- jmp_buf throw_buf;
- ctx->throw_buf = &throw_buf;
- if (setjmp(throw_buf))
- {
-error:
- /* The fatal handler is invoked here only for an unexpected EOF */
- if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal)
- ctx->h_fatal(ctx);
- TRACE(ctx, "raised fatal error");
- return ctx->state = XML_STATE_EOF;
- }
- uns c;
- switch (ctx->state)
- {
- case XML_STATE_START:
- TRACE(ctx, "entering prolog");
- ctx->flags |= XML_SRC_DOCUMENT | XML_SRC_EXPECTED_DECL;
- if (ctx->h_document_start)
- ctx->h_document_start(ctx);
- /* XMLDecl */
- xml_refill(ctx);
- if (ctx->h_xml_decl)
- ctx->h_xml_decl(ctx);
- PULL(XML_DECL);
-
- /* Misc* (doctypedecl Misc*)? */
- while (1)
- {
- xml_parse_white(ctx, 0);
- xml_parse_char(ctx, '<');
- xml_inc(ctx);
- if ((c = xml_get_char(ctx)) == '?')
- /* Processing instruction */
- if (!(ctx->flags & XML_REPORT_PIS))
- xml_skip_pi(ctx);
- else
- {
- xml_push_pi(ctx);
- PULL_STATE(PI, PROLOG_PI);
- xml_pop_pi(ctx);
- }
- else if (c != '!')
- {
- /* Found the root tag */
- xml_unget_char(ctx);
- goto first_tag;
- }
- else if (xml_get_char(ctx) == '-')
- if (!(ctx->flags & XML_REPORT_COMMENTS))
- xml_skip_comment(ctx);
- else
- {
- xml_push_comment(ctx);
- PULL_STATE(COMMENT, PROLOG_COMMENT);
- xml_pop_comment(ctx);
- }
- else
- {
- /* DocTypeDecl */
- xml_unget_char(ctx);
- xml_parse_doctype_decl(ctx);
- PULL(DOCTYPE_DECL);
- if (ctx->flags & XML_HAS_DTD)
- if (ctx->flags & XML_PARSE_DTD)
- {
- xml_dtd_init(ctx);
- if (ctx->h_dtd_start)
- ctx->h_dtd_start(ctx);
- if (ctx->flags & XML_HAS_INTERNAL_SUBSET)
- {
- xml_parse_subset(ctx, 0);
- xml_dec(ctx);
- }
- if (ctx->flags & XML_HAS_EXTERNAL_SUBSET)
- {
- /* Temporary entity describing the external subset for the resolver */
- struct xml_dtd_entity ent = {
- .system_id = ctx->system_id,
- .public_id = ctx->public_id,
- };
- xml_parse_white(ctx, 0);
- xml_parse_char(ctx, '>');
- xml_unget_char(ctx);
- ASSERT(ctx->h_resolve_entity);
- ctx->h_resolve_entity(ctx, &ent);
- ctx->flags |= XML_SRC_EXPECTED_DECL;
- xml_parse_subset(ctx, 1);
- xml_unget_char(ctx);;
- }
- if (ctx->h_dtd_end)
- ctx->h_dtd_end(ctx);
- }
- else if (ctx->flags & XML_HAS_INTERNAL_SUBSET)
- xml_skip_internal_subset(ctx);
- xml_parse_white(ctx, 0);
- xml_parse_char(ctx, '>');
- xml_dec(ctx);
- }
- }
-
- case XML_STATE_CHARS:
-
- /* Content of the root element */
- while (1)
- {
- if (xml_peek_char(ctx) != '<')
- {
- /* CharData */
- xml_append_chars(ctx);
- continue;
- }
- else
- xml_skip_char(ctx);
- xml_inc(ctx);
-first_tag:
-
- if ((c = xml_get_char(ctx)) == '?')
- {
- /* PI */
- if (!(ctx->flags & (XML_REPORT_PIS | XML_ALLOC_PIS)))
- xml_skip_pi(ctx);
- else
- {
- /* Pending character data is flushed (and possibly reported) before the PI */
- if (xml_flush_chars(ctx))
- {
- PULL_STATE(CHARS, CHARS_BEFORE_PI);
- xml_pop_chars(ctx);
- }
- xml_push_pi(ctx);
- PULL(PI);
- xml_pop_pi(ctx);
- }
- }
-
- else if (c == '!')
- if ((c = xml_get_char(ctx)) == '-')
- {
- /* Comment */
- if (!(ctx->flags & (XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS)))
- xml_skip_comment(ctx);
- else
- {
- if (xml_flush_chars(ctx))
- {
- PULL_STATE(CHARS, CHARS_BEFORE_COMMENT);
- xml_pop_chars(ctx);
- }
- xml_push_comment(ctx);
- PULL(COMMENT);
- xml_pop_comment(ctx);
- }
- }
- else if (c == '[')
- {
- /* CDATA */
- xml_append_cdata(ctx);
- }
- else
- xml_fatal(ctx, "Unexpected character after '<!'");
-
- else if (c != '/')
- {
- /* STag | EmptyElemTag */
- xml_unget_char(ctx);
- if (xml_flush_chars(ctx))
- {
- PULL_STATE(CHARS, CHARS_BEFORE_STAG);
- xml_pop_chars(ctx);
- }
-
- xml_push_element(ctx);
- PULL(STAG);
- /* An empty-element tag is closed immediately */
- if (ctx->flags & XML_EMPTY_ELEM_TAG)
- goto pop_element;
- }
-
- else
- {
- /* ETag */
- if (xml_flush_chars(ctx))
- {
- PULL_STATE(CHARS, CHARS_BEFORE_ETAG);
- xml_pop_chars(ctx);
- }
-
- xml_parse_etag(ctx);
-pop_element:
- PULL(ETAG);
- xml_pop_element(ctx);
- /* No current node left: the root element has been closed */
- if (!ctx->node)
- goto epilog;
- }
- }
-
-epilog:
- /* Misc* */
- TRACE(ctx, "entering epilog");
- while (1)
- {
- /* Epilog whitespace is the only place, where a valid document can reach EOF */
- if (setjmp(throw_buf))
- if (ctx->err_code == XML_ERR_EOF)
- {
- TRACE(ctx, "reached EOF");
- ctx->state = XML_STATE_EOF;
- if (ctx->h_document_end)
- ctx->h_document_end(ctx);
- case XML_STATE_EOF:
- ctx->err_code = 0;
- ctx->err_msg = NULL;
- return XML_STATE_EOF;
- }
- else
- goto error;
- xml_parse_white(ctx, 0);
- /* From now on, any thrown error (including EOF) is fatal again */
- if (setjmp(throw_buf))
- goto error;
-
- /* Misc */
- xml_parse_char(ctx, '<');
- xml_inc(ctx);
- if ((c = xml_get_char(ctx)) == '?')
- /* Processing instruction */
- if (!(ctx->flags & XML_REPORT_PIS))
- xml_skip_pi(ctx);
- else
- {
- xml_push_pi(ctx);
- PULL_STATE(PI, EPILOG_PI);
- xml_pop_pi(ctx);
- }
- else if (c == '!')
- {
- xml_parse_char(ctx, '-');
- /* Comment */
- if (!(ctx->flags & XML_REPORT_COMMENTS))
- xml_skip_comment(ctx);
- else
- {
- xml_push_comment(ctx);
- PULL_STATE(COMMENT, EPILOG_COMMENT);
- xml_pop_comment(ctx);
- }
- }
- else
- xml_fatal(ctx, "Syntax error in the epilog");
- }
-
- }
- ASSERT(0);
-}
-
-/*
- * Run xml_next() with a temporary pull mask; the mask stored in the
- * context is put back before returning. Returns the result of xml_next().
- */
-uns
-xml_next_state(struct xml_context *ctx, uns pull)
-{
-  const uns orig_pull = ctx->pull;
-  uns state;
-
-  ctx->pull = pull;
-  state = xml_next(ctx);
-  ctx->pull = orig_pull;
-  return state;
-}
-
-/*
- * Skip the rest of the element whose start tag has just been reported.
- * Only end tags are pulled; parsing continues until xml_next() returns zero
- * or stops with the original node current again. The caller's pull mask is
- * restored and the last state returned by xml_next() is passed back.
- */
-uns
-xml_skip_element(struct xml_context *ctx)
-{
-  ASSERT(ctx->state == XML_STATE_STAG);
-  struct xml_node *target = ctx->node;
-  uns orig_pull = ctx->pull;
-  uns state;
-
-  ctx->pull = XML_PULL_ETAG;
-  do
-    state = xml_next(ctx);
-  while (state && ctx->node != target);
-  ctx->pull = orig_pull;
-  return state;
-}
-
-/*
- * Parse the whole input without pulling any state and return ctx->err_code.
- */
-uns
-xml_parse(struct xml_context *ctx)
-{
-  /* A single pass is expected; more passes happen only when a SAX handler
-   * overrides ctx->pull, which makes xml_next() return a non-zero state */
-  for (;;)
-    {
-      ctx->pull = 0;
-      if (!xml_next(ctx))
-        break;
-    }
-  return ctx->err_code;
-}
-
-/*
- * Concatenate the text of all character-data sons of the given element
- * (direct sons only) into a zero-terminated string allocated from the pool.
- */
-char *
-xml_merge_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool)
-{
-  ASSERT(node->type == XML_NODE_ELEM);
-  char *dst = mp_start_noalign(pool, 1);
-  XML_NODE_FOR_EACH(child, node)
-    {
-      if (child->type == XML_NODE_CHARS)
-        {
-          /* Always keep one spare byte for the terminator */
-          dst = mp_spread(pool, dst, child->len + 1);
-          memcpy(dst, child->text, child->len);
-          dst += child->len;
-        }
-    }
-  *dst = 0;
-  return mp_end(pool, dst + 1);
-}
-
-/*
- * Recursive helper: append the text of all character-data nodes in the
- * subtree of the given node to the growing pool buffer and return the new
- * write position.
- */
-static char *
-xml_append_dom_chars(char *p, struct mempool *pool, struct xml_node *node)
-{
-  XML_NODE_FOR_EACH(child, node)
-    {
-      if (child->type == XML_NODE_ELEM)
-        p = xml_append_dom_chars(p, pool, child);
-      else if (child->type == XML_NODE_CHARS)
-        {
-          /* One spare byte stays reserved for the final terminator */
-          p = mp_spread(pool, p, child->len + 1);
-          memcpy(p, child->text, child->len);
-          p += child->len;
-        }
-    }
-  return p;
-}
-
-/*
- * Concatenate the text of all character-data nodes in the whole subtree of
- * the given element into a zero-terminated string allocated from the pool.
- */
-char *
-xml_merge_dom_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool)
-{
-  ASSERT(node->type == XML_NODE_ELEM);
-  char *end = xml_append_dom_chars(mp_start_noalign(pool, 1), pool, node);
-  *end = 0;
-  return mp_end(pool, end + 1);
-}
+++ /dev/null
-/*
- * Sherlock Library -- A simple XML parser
- *
- * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
- *
- * This software may be freely distributed and used according to the terms
- * of the GNU Lesser General Public License.
- */
-
-#undef LOCAL_DEBUG
-
-#include "sherlock/sherlock.h"
-#include "sherlock/xml/xml.h"
-#include "sherlock/xml/dtd.h"
-#include "sherlock/xml/internals.h"
-#include "ucw/unicode.h"
-#include "ucw/ff-unicode.h"
-#include "charset/charconv.h"
-#include "charset/fb-charconv.h"
-
-/*** Character categorization ***/
-
-#include "obj/sherlock/xml/unicat.c"
-
-/*
- * Select the character category masks matching the XML version of the
- * document (XML_VERSION_1_1 flag set or not).
- */
-static void
-xml_init_cats(struct xml_context *ctx)
-{
-  if (ctx->flags & XML_VERSION_1_1)
-    {
-      /* XML 1.1 */
-      ctx->cat_chars = XML_CHAR_VALID_1_1;
-      ctx->cat_unrestricted = XML_CHAR_UNRESTRICTED_1_1;
-      ctx->cat_new_line = XML_CHAR_NEW_LINE_1_1;
-      ctx->cat_name = XML_CHAR_NAME_1_1;
-      ctx->cat_sname = XML_CHAR_SNAME_1_1;
-      return;
-    }
-
-  /* XML 1.0: every valid character is unrestricted */
-  ctx->cat_chars = XML_CHAR_VALID_1_0;
-  ctx->cat_unrestricted = XML_CHAR_VALID_1_0;
-  ctx->cat_new_line = XML_CHAR_NEW_LINE_1_0;
-  ctx->cat_name = XML_CHAR_NAME_1_0;
-  ctx->cat_sname = XML_CHAR_SNAME_1_0;
-}
-
-/*** Reading of document/external entities ***/
-
-static void NONRET
-xml_eof(struct xml_context *ctx)
-{
- ctx->err_msg = "Unexpected EOF";
- ctx->err_code = XML_ERR_EOF;
- xml_throw(ctx);
-}
-
-/* Raise the fatal error reported for incorrectly nested entities; never returns. */
-void NONRET
-xml_fatal_nested(struct xml_context *ctx)
-{
- xml_fatal(ctx, "Entity is not nested correctly");
-}
-
-/*
- * Append one character to the refill buffer. Each character occupies two
- * consecutive u32 slots: the code point followed by its category mask.
- */
-static inline void
-xml_add_char(u32 **bstop, uns c)
-{
-  u32 *slot = *bstop;
-
-  slot[0] = c;
-  slot[1] = xml_char_cat(c);
-  *bstop = slot + 2;
-}
-
-/*
- * Push a new, zero-initialized input source on top of the source stack and
- * make it current. The buffer position of the previous source (if any) and
- * the current nesting depth are saved so that they can be restored when the
- * new source is popped. Returns the new source.
- */
-struct xml_source *
-xml_push_source(struct xml_context *ctx)
-{
-  xml_push(ctx);
-
-  /* Remember where we stopped reading the suspended source */
-  struct xml_source *prev = ctx->src;
-  if (prev)
-    {
-      prev->bptr = ctx->bptr;
-      prev->bstop = ctx->bstop;
-    }
-
-  struct xml_source *src = mp_alloc_zero(ctx->stack, sizeof(*src));
-  src->next = prev;
-  src->saved_depth = ctx->depth;
-
-  /* Switch the context to the new source with an empty buffer */
-  ctx->src = src;
-  ctx->depth = 0;
-  ctx->flags &= ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_DOCUMENT);
-  ctx->bptr = ctx->bstop = src->buf;
-  return src;
-}
-
-/*
- * Push a new source reading from the given fastbuf and return it.
- * The fastbuf is closed by xml_close_source() when the source is released.
- */
-struct xml_source *
-xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb)
-{
- struct xml_source *src = xml_push_source(ctx);
- src->fb = fb;
- return src;
-}
-
-/*
- * Close the fastbuf of a source and, if a charset-converting wrapper was
- * installed by xml_init_charconv(), also the original wrapped fastbuf.
- */
-static void
-xml_close_source(struct xml_source *src)
-{
- bclose(src->fb);
- if (src->wrapped_fb)
- bclose(src->wrapped_fb);
-}
-
-/*
- * Close the current source and resume the previous one, restoring its buffer
- * position and the nesting depth saved by xml_push_source(). A source ending
- * at non-zero depth is a fatal error; when no source remains, an unexpected
- * EOF is thrown via xml_eof().
- */
-static void
-xml_pop_source(struct xml_context *ctx)
-{
- TRACE(ctx, "pop_source");
- if (unlikely(ctx->depth != 0))
- xml_fatal(ctx, "Unexpected end of entity");
- struct xml_source *src = ctx->src;
- if (!src)
- xml_fatal(ctx, "Undefined source");
- xml_close_source(src);
- ctx->depth = src->saved_depth;
- ctx->src = src = src->next;
- if (src)
- {
- ctx->bptr = src->bptr;
- ctx->bstop = src->bstop;
- }
- xml_pop(ctx);
- /* The last source has been exhausted */
- if (unlikely(!src))
- xml_eof(ctx);
-}
-
-/* Close all sources remaining on the source stack, leaving ctx->src NULL. */
-void
-xml_sources_cleanup(struct xml_context *ctx)
-{
-  for (struct xml_source *s = ctx->src; s; s = ctx->src)
-    {
-      /* Unlink first, then close */
-      ctx->src = s->next;
-      xml_close_source(s);
-    }
-}
-
-static void xml_refill_utf8(struct xml_context *ctx);
-
-/*
- * Entity resolver that does not resolve anything: it only reports an error
- * saying that external entities are not supported.
- * NOTE(review): presumably installed as the default h_resolve_entity handler -- confirm.
- */
-void
-xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent UNUSED)
-{
- xml_error(ctx, "References to external entities are not supported");
-}
-
-/*
- * Start reading the replacement text of an entity. External entities are
- * handed to the h_resolve_entity handler and may start with a text
- * declaration; internal entities are read from their in-memory text.
- * In both cases the resulting current source is set up for UTF-8 refills.
- */
-void
-xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent)
-{
- TRACE(ctx, "xml_push_entity");
- struct xml_source *src;
- if (ent->flags & XML_DTD_ENTITY_EXTERNAL)
- {
- ASSERT(ctx->h_resolve_entity);
- ctx->h_resolve_entity(ctx, ent);
- ctx->flags |= XML_SRC_EXPECTED_DECL;
- /* NOTE(review): assumes the handler has pushed a new source -- confirm */
- src = ctx->src;
- }
- else
- {
- src = xml_push_source(ctx);
- /* Read the entity text through the fastbuf embedded in the source */
- fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, strlen(ent->text), 0);
- }
- src->refill = xml_refill_utf8;
- src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line;
- src->refill_cat2 = ctx->cat_new_line;
-}
-
-/*
- * Report a character which must not appear in the input and return the
- * replacement character to store instead. The value ~1U is the marker the
- * refill functions pass to the decoders for undecodable input.
- */
-static uns
-xml_error_restricted(struct xml_context *ctx, uns c)
-{
-  const uns corrupted = ~1U;
-
-  if (c != corrupted)
-    xml_error(ctx, "Restricted char U+%04X", c);
-  else
-    xml_error(ctx, "Corrupted encoding");
-  return UNI_REPLACEMENT;
-}
-
-void xml_parse_decl(struct xml_context *ctx);
-
-/*
- * REFILL(ctx, func, params...) expands to the body of a refill function.
- * It reads characters with func(fb, params) from the fastbuf of the current
- * source and appends them to the source buffer as pairs of u32 values
- * (code point, category mask) until the buffer is full, EOF is reached
- * (XML_SRC_EOF gets set) or a '>' outside of refill_cat1 is read.
- *
- * Characters matching refill_cat1 are stored as they are, characters matching
- * refill_cat2 are new-line characters and are normalized to a single 0xA
- * (incrementing the row counter), anything else except EOF is reported by
- * xml_error_restricted() and replaced. A 0xD seen at the very end of
- * the buffer is remembered in src->pending_0xd for the next refill.
- */
-#define REFILL(ctx, func, params...) \
- struct xml_source *src = ctx->src; \
- struct fastbuf *fb = src->fb; \
- if (ctx->bptr == ctx->bstop) \
- ctx->bptr = ctx->bstop = src->buf; \
- uns c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \
- u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \
- *last_0xd = src->pending_0xd ? bstop : NULL; \
- do \
- { \
- c = func(fb, ##params); \
- uns t = xml_char_cat(c); \
- if (t & t1) \
- /* Typical branch */ \
- *bstop++ = c, *bstop++ = t; \
- else if (t & t2) \
- { \
- /* New line */ \
- /* XML 1.0: 0xA | 0xD | 0xD 0xA */ \
- /* XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ \
- if (c == 0xd) \
- last_0xd = bstop + 2; \
- else if (c != 0x2028 && last_0xd == bstop) \
- { \
- last_0xd = NULL; \
- continue; \
- } \
- xml_add_char(&bstop, 0xa), row++; \
- } \
- else if (c == '>') \
- { \
- /* Used only in XML/TextDecl to switch the encoding */ \
- *bstop++ = c, *bstop++ = t; \
- break; \
- } \
- else if (~c) \
- /* Restricted character */ \
- xml_add_char(&bstop, xml_error_restricted(ctx, c)); \
- else \
- { \
- /* EOF */ \
- ctx->flags |= XML_SRC_EOF; \
- break; \
- } \
- } \
- while (bstop < bend); \
- src->pending_0xd = (last_0xd == bstop); \
- ctx->bstop = bstop; \
- src->row = row;
-
-/* Refill the buffer from an UTF-8 stream; undecodable input yields ~1U */
-static void
-xml_refill_utf8(struct xml_context *ctx)
-{
- REFILL(ctx, bget_utf8_repl, ~1U);
-}
-
-/* Refill the buffer from a little-endian UTF-16 stream */
-static void
-xml_refill_utf16_le(struct xml_context *ctx)
-{
- REFILL(ctx, bget_utf16_le_repl, ~1U);
-}
-
-/* Refill the buffer from a big-endian UTF-16 stream */
-static void
-xml_refill_utf16_be(struct xml_context *ctx)
-{
- REFILL(ctx, bget_utf16_be_repl, ~1U);
-}
-
-#undef REFILL
-
-/*
- * Make at least one character available in the buffer of the current source.
- * Depending on the source flags this pops an exhausted source, parses a
- * pending XML/text declaration or calls the refill function of the source;
- * the steps are repeated until the buffer is non-empty.
- */
-void
-xml_refill(struct xml_context *ctx)
-{
- do
- {
- if (ctx->flags & XML_SRC_EOF)
- xml_pop_source(ctx);
- else if (ctx->flags & XML_SRC_EXPECTED_DECL)
- xml_parse_decl(ctx);
- else
- {
- ctx->src->refill(ctx);
- /* The buffer holds two u32 values per character */
- TRACE(ctx, "refilled %u characters", (uns)((ctx->bstop - ctx->bptr) / 2));
- }
- }
- while (ctx->bptr == ctx->bstop);
-}
-
-/*
- * Compute the 1-based row of the current read position. src->row counts
- * the new lines already stored in the buffer, so the new lines lying between
- * the read position and the end of the buffer are subtracted again.
- */
-static uns
-xml_source_row(struct xml_context *ctx, struct xml_source *src)
-{
-  uns unread_newlines = 0;
-  u32 *p = ctx->bstop;
-
-  /* Walk back over the unread (char, category) pairs */
-  while (p != ctx->bptr)
-    {
-      if (p[-1] & src->refill_cat2)
-        unread_newlines++;
-      p -= 2;
-    }
-  return src->row - unread_newlines + 1;
-}
-
-/* Return the current row in the current source, or 0 when there is no source. */
-uns
-xml_row(struct xml_context *ctx)
-{
-  if (!ctx->src)
-    return 0;
-  return xml_source_row(ctx, ctx->src);
-}
-
-/* Document/external entity header */
-
-/*
- * Parse a quoted encoding name and return it as a zero-terminated string
- * allocated from ctx->pool. An invalid character is a fatal error.
- */
-static char *
-xml_parse_encoding_name(struct xml_context *ctx)
-{
- /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" */
- char *p = mp_start_noalign(ctx->pool, 1);
- uns q = xml_parse_quote(ctx);
- /* The first character has a more restricted category than the rest */
- if (unlikely(!(xml_get_cat(ctx) & XML_CHAR_ENC_SNAME)))
- xml_fatal(ctx, "Invalid character in the encoding name");
- while (1)
- {
- /* Room for this character and the final terminator */
- p = mp_spread(ctx->pool, p, 2);
- *p++ = xml_last_char(ctx);
- if (xml_get_char(ctx) == q)
- break;
- if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME)))
- xml_fatal(ctx, "Invalid character in the encoding name");
- }
- *p++ = 0;
- return mp_end(ctx->pool, p);
-}
-
-/*
- * Wrap the fastbuf of the current source in a charset converter from the
- * charset cs to UTF-8. The original fastbuf is kept in src->wrapped_fb, so
- * that xml_close_source() can close both.
- */
-static void
-xml_init_charconv(struct xml_context *ctx, int cs)
-{
- // XXX: with a direct access to libcharset tables could be faster
- struct xml_source *src = ctx->src;
- TRACE(ctx, "wrapping charset %s", charset_name(cs));
- src->wrapped_fb = src->fb;
- src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8);
-}
-
-/*
- * Parse the header of the current source: the XMLDecl of the document entity
- * or the optional TextDecl of an external entity. Along the way the input
- * decoder is selected (UTF-8, UTF-16 or a libcharset conversion), a byte
- * order mark is skipped and the version, encoding and standalone fields are
- * processed.
- *
- * The initial encoding comes from src->fb_encoding (if set) or is guessed
- * from the first byte; the encoding declared in the header is compared with
- * src->expected_encoding (falling back to the initial encoding) and a
- * mismatch is reported as an error. A missing XMLDecl in the document
- * entity is fatal.
- */
-void
-xml_parse_decl(struct xml_context *ctx)
-{
-  TRACE(ctx, "xml_parse_decl");
-  struct xml_source *src = ctx->src;
-  ctx->flags &= ~XML_SRC_EXPECTED_DECL;
-  uns doc = ctx->flags & XML_SRC_DOCUMENT;
-
-  /* Setup valid Unicode ranges and force the reader to abort refill() after each '>', where we can switch encoding or XML version */
-  if (doc)
-    xml_init_cats(ctx);
-  src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line & ~XML_CHAR_GT;
-  src->refill_cat2 = ctx->cat_new_line;
-
-  /* Initialize the supplied charset (if any) or try to guess it */
-  char *expected_encoding = src->expected_encoding;
-  src->refill = xml_refill_utf8;
-  int bom = bpeekc(src->fb);
-  if (bom < 0)
-    ctx->flags |= XML_SRC_EOF;
-  if (!src->fb_encoding)
-    {
-      /* Guess UTF-16 from the first byte of a possible byte order mark */
-      if (bom == 0xfe)
-        src->refill = xml_refill_utf16_be;
-      else if (bom == 0xff)
-        src->refill = xml_refill_utf16_le;
-    }
-  else
-    {
-      int cs = find_charset_by_name(src->fb_encoding);
-      if (cs == CONV_CHARSET_UTF8)
-        {}
-      else if (cs >= 0)
-        {
-          xml_init_charconv(ctx, cs);
-          bom = 0;
-        }
-      /* strcasecmp() returns zero on a match, hence the negations */
-      else if (!strcasecmp(src->fb_encoding, "UTF-16"))
-        {
-          /* Byte order not given explicitly: big endian unless the BOM says otherwise */
-          src->refill = xml_refill_utf16_be;
-          if (bom == 0xff)
-            src->refill = xml_refill_utf16_le;
-        }
-      else if (!strcasecmp(src->fb_encoding, "UTF-16BE"))
-        src->refill = xml_refill_utf16_be;
-      else if (!strcasecmp(src->fb_encoding, "UTF-16LE"))
-        src->refill = xml_refill_utf16_le;
-      else
-        {
-          xml_error(ctx, "Unknown encoding '%s'", src->fb_encoding);
-          expected_encoding = NULL;
-        }
-    }
-  uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be;
-  if (utf16)
-    src->fb_encoding = (src->refill == xml_refill_utf16_be) ? "UTF-16BE" : "UTF-16LE";
-  if (!expected_encoding)
-    expected_encoding = src->fb_encoding;
-  if (bom > 0 && xml_peek_char(ctx) == 0xfeff)
-    xml_skip_char(ctx);
-  else if (utf16)
-    xml_error(ctx, "Missing or corrupted BOM");
-  TRACE(ctx, "Initial encoding=%s", src->fb_encoding ? : "?");
-
-  /* Look ahead for presence of XMLDecl or optional TextDecl */
-  if (!(ctx->flags & XML_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf))
-    xml_refill(ctx);
-  u32 *bptr = ctx->bptr;
-  /* The buffer holds (char, category) pairs: "<?xml" followed by a white space is 6 pairs */
-  uns have_decl = (12 <= ctx->bstop - ctx->bptr && (bptr[11] & XML_CHAR_WHITE) &&
-    bptr[0] == '<' && bptr[2] == '?' && (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L');
-  if (!have_decl)
-    {
-      if (doc)
-        xml_fatal(ctx, "Missing or corrupted XML header");
-      /* Test the local copy: src->expected_encoding may be NULL even if an encoding is expected */
-      else if (expected_encoding && strcasecmp(expected_encoding, "UTF-8") && !utf16)
-        xml_error(ctx, "Missing or corrupted entity header");
-      goto exit;
-    }
-  ctx->bptr = bptr + 12;
-  xml_parse_white(ctx, 0);
-
-  /* Parse version string (mandatory in XMLDecl, optional in TextDecl) */
-  if (xml_peek_char(ctx) == 'v')
-    {
-      xml_parse_seq(ctx, "version");
-      xml_parse_eq(ctx);
-      char *version = xml_parse_pubid_literal(ctx, ctx->pool);
-      TRACE(ctx, "version=%s", version);
-      uns v = 0;
-      if (!strcmp(version, "1.1"))
-        v = XML_VERSION_1_1;
-      else if (strcmp(version, "1.0"))
-        {
-          xml_error(ctx, "Unknown XML version string '%s'", version);
-          version = "1.0";
-        }
-      if (doc)
-        {
-          ctx->version_str = version;
-          ctx->flags |= v;
-        }
-      else if (v > (ctx->flags & XML_VERSION_1_1))
-        xml_error(ctx, "XML 1.1 external entity included from XML 1.0 document");
-      if (!xml_parse_white(ctx, !doc))
-        goto end;
-    }
-  else if (doc)
-    {
-      xml_error(ctx, "Expected XML version");
-      ctx->version_str = "1.0";
-    }
-
-  /* Parse encoding string (optional in XMLDecl, mandatory in TextDecl) */
-  if (xml_peek_char(ctx) == 'e')
-    {
-      xml_parse_seq(ctx, "encoding");
-      xml_parse_eq(ctx);
-      src->decl_encoding = xml_parse_encoding_name(ctx);
-      TRACE(ctx, "encoding=%s", src->decl_encoding);
-      if (!xml_parse_white(ctx, 0))
-        goto end;
-    }
-  else if (!doc)
-    xml_error(ctx, "Expected XML encoding");
-
-  /* Parse whether the document is standalone (optional in XMLDecl) */
-  if (doc && xml_peek_char(ctx) == 's')
-    {
-      xml_parse_seq(ctx, "standalone");
-      xml_parse_eq(ctx);
-      uns c = xml_parse_quote(ctx);
-      if (ctx->standalone = (xml_peek_char(ctx) == 'y'))
-        xml_parse_seq(ctx, "yes");
-      else
-        xml_parse_seq(ctx, "no");
-      xml_parse_char(ctx, c);
-      TRACE(ctx, "standalone=%d", ctx->standalone);
-      xml_parse_white(ctx, 0);
-    }
-end:
-  xml_parse_seq(ctx, "?>");
-
-  /* Switch to the final encoding */
-  if (src->decl_encoding)
-    {
-      int cs = find_charset_by_name(src->decl_encoding);
-      if (cs < 0 && !expected_encoding)
-        xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding);
-      else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8)
-        {
-          xml_init_charconv(ctx, cs);
-          src->fb_encoding = src->decl_encoding;
-        }
-      /* For UTF-16 input, a declared "UTF-16" or the matching endianness variant is accepted */
-      else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 ||
-        !(!strcasecmp(src->decl_encoding, "UTF-16") ||
-         (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) ||
-         (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE")))))
-        xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding);
-    }
-  if (!src->fb_encoding)
-    src->fb_encoding = "UTF-8";
-  TRACE(ctx, "Final encoding=%s", src->fb_encoding);
-
-exit:
-  /* Update valid Unicode ranges */
-  if (doc)
-    xml_init_cats(ctx);
-  src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line;
-  src->refill_cat2 = ctx->cat_new_line;
-}
+++ /dev/null
-#!/usr/bin/perl
-#
-# UCW Library -- Character map for the XML parser
-#
-# (c) 2007 Pavel Charvat <pchar@ucw.cz>
-#
-# This software may be freely distributed and used according to the terms
-# of the GNU Lesser General Public License.
-#
-
-# @cat: category bit mask of every code point below 0x10000
-# @lcat: category bit mask of each of the 0x11 blocks of 0x10000 code points
-# %ids: category name -> bit number (assigned by set())
-# %cls: distinct category mask -> class number (assigned by find_cls())
-my @cat = ();
-my @lcat = ();
-my %ids = ();
-my %cls = ();
-for (my $i = 0; $i < 0x10000; $i++) { $cat[$i] = 0; }
-for (my $i = 0; $i < 0x11; $i++) { $lcat[$i] = 0; }
-
-my @white = (0x9, 0xA, 0xD, 0x20);
-my @base_char_1_0 = (
- [0x0041,0x005A], [0x0061,0x007A], [0x00C0,0x00D6], [0x00D8,0x00F6], [0x00F8,0x00FF], [0x0100,0x0131],
- [0x0134,0x013E], [0x0141,0x0148], [0x014A,0x017E], [0x0180,0x01C3], [0x01CD,0x01F0], [0x01F4,0x01F5],
- [0x01FA,0x0217], [0x0250,0x02A8], [0x02BB,0x02C1], 0x0386, [0x0388,0x038A], 0x038C, [0x038E,0x03A1],
- [0x03A3,0x03CE], [0x03D0,0x03D6], 0x03DA, 0x03DC, 0x03DE, 0x03E0, [0x03E2,0x03F3], [0x0401,0x040C],
- [0x040E,0x044F], [0x0451,0x045C], [0x045E,0x0481], [0x0490,0x04C4], [0x04C7,0x04C8], [0x04CB,0x04CC],
- [0x04D0,0x04EB], [0x04EE,0x04F5], [0x04F8,0x04F9], [0x0531,0x0556], 0x0559, [0x0561,0x0586], [0x05D0,0x05EA],
- [0x05F0,0x05F2], [0x0621,0x063A], [0x0641,0x064A], [0x0671,0x06B7], [0x06BA,0x06BE], [0x06C0,0x06CE],
- [0x06D0,0x06D3], 0x06D5, [0x06E5,0x06E6], [0x0905,0x0939], 0x093D, [0x0958,0x0961], [0x0985,0x098C],
- [0x098F,0x0990], [0x0993,0x09A8], [0x09AA,0x09B0], 0x09B2, [0x09B6,0x09B9], [0x09DC,0x09DD], [0x09DF,0x09E1],
- [0x09F0,0x09F1], [0x0A05,0x0A0A], [0x0A0F,0x0A10], [0x0A13,0x0A28], [0x0A2A,0x0A30], [0x0A32,0x0A33],
- [0x0A35,0x0A36], [0x0A38,0x0A39], [0x0A59,0x0A5C], 0x0A5E, [0x0A72,0x0A74], [0x0A85,0x0A8B], 0x0A8D,
- [0x0A8F,0x0A91], [0x0A93,0x0AA8], [0x0AAA,0x0AB0], [0x0AB2,0x0AB3], [0x0AB5,0x0AB9], 0x0ABD, 0x0AE0,
- [0x0B05,0x0B0C], [0x0B0F,0x0B10], [0x0B13,0x0B28], [0x0B2A,0x0B30], [0x0B32,0x0B33], [0x0B36,0x0B39],
- 0x0B3D, [0x0B5C,0x0B5D], [0x0B5F,0x0B61], [0x0B85,0x0B8A], [0x0B8E,0x0B90], [0x0B92,0x0B95], [0x0B99,0x0B9A],
- 0x0B9C, [0x0B9E,0x0B9F], [0x0BA3,0x0BA4], [0x0BA8,0x0BAA], [0x0BAE,0x0BB5], [0x0BB7,0x0BB9], [0x0C05,0x0C0C],
- [0x0C0E,0x0C10], [0x0C12,0x0C28], [0x0C2A,0x0C33], [0x0C35,0x0C39], [0x0C60,0x0C61], [0x0C85,0x0C8C],
- [0x0C8E,0x0C90], [0x0C92,0x0CA8], [0x0CAA,0x0CB3], [0x0CB5,0x0CB9], 0x0CDE, [0x0CE0,0x0CE1], [0x0D05,0x0D0C],
- [0x0D0E,0x0D10], [0x0D12,0x0D28], [0x0D2A,0x0D39], [0x0D60,0x0D61], [0x0E01,0x0E2E], 0x0E30, [0x0E32,0x0E33],
- [0x0E40,0x0E45], [0x0E81,0x0E82], 0x0E84, [0x0E87,0x0E88], 0x0E8A, 0x0E8D, [0x0E94,0x0E97], [0x0E99,0x0E9F],
- [0x0EA1,0x0EA3], 0x0EA5, 0x0EA7, [0x0EAA,0x0EAB], [0x0EAD,0x0EAE], 0x0EB0, [0x0EB2,0x0EB3], 0x0EBD,
- [0x0EC0,0x0EC4], [0x0F40,0x0F47], [0x0F49,0x0F69], [0x10A0,0x10C5], [0x10D0,0x10F6], 0x1100, [0x1102,0x1103],
- [0x1105,0x1107], 0x1109, [0x110B,0x110C], [0x110E,0x1112], 0x113C, 0x113E, 0x1140, 0x114C, 0x114E, 0x1150,
- [0x1154,0x1155], 0x1159, [0x115F,0x1161], 0x1163, 0x1165, 0x1167, 0x1169, [0x116D,0x116E], [0x1172,0x1173],
- 0x1175, 0x119E, 0x11A8, 0x11AB, [0x11AE,0x11AF], [0x11B7,0x11B8], 0x11BA, [0x11BC,0x11C2], 0x11EB, 0x11F0,
- 0x11F9, [0x1E00,0x1E9B], [0x1EA0,0x1EF9], [0x1F00,0x1F15], [0x1F18,0x1F1D], [0x1F20,0x1F45], [0x1F48,0x1F4D],
- [0x1F50,0x1F57], 0x1F59, 0x1F5B, 0x1F5D, [0x1F5F,0x1F7D], [0x1F80,0x1FB4], [0x1FB6,0x1FBC], 0x1FBE,
- [0x1FC2,0x1FC4], [0x1FC6,0x1FCC], [0x1FD0,0x1FD3], [0x1FD6,0x1FDB], [0x1FE0,0x1FEC], [0x1FF2,0x1FF4],
- [0x1FF6,0x1FFC], 0x2126, [0x212A,0x212B], 0x212E, [0x2180,0x2182], [0x3041,0x3094], [0x30A1,0x30FA],
- [0x3105,0x312C], [0xAC00,0xD7A3]);
-my @ideographic_1_0 = ([0x4E00,0x9FA5], 0x3007, [0x3021,0x3029]);
-my @combining_char_1_0 = (
- [0x0300,0x0345], [0x0360,0x0361], [0x0483,0x0486], [0x0591,0x05A1], [0x05A3,0x05B9], [0x05BB,0x05BD],
- 0x05BF, [0x05C1,0x05C2], 0x05C4, [0x064B,0x0652], 0x0670, [0x06D6,0x06DC], [0x06DD,0x06DF], [0x06E0,0x06E4],
- [0x06E7,0x06E8], [0x06EA,0x06ED], [0x0901,0x0903], 0x093C, [0x093E,0x094C], 0x094D, [0x0951,0x0954],
- [0x0962,0x0963], [0x0981,0x0983], 0x09BC, 0x09BE, 0x09BF, [0x09C0,0x09C4], [0x09C7,0x09C8], [0x09CB,0x09CD],
- 0x09D7, [0x09E2,0x09E3], 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, [0x0A40,0x0A42], [0x0A47,0x0A48], [0x0A4B,0x0A4D],
- [0x0A70,0x0A71], [0x0A81,0x0A83], 0x0ABC, [0x0ABE,0x0AC5], [0x0AC7,0x0AC9], [0x0ACB,0x0ACD], [0x0B01,0x0B03],
- 0x0B3C, [0x0B3E,0x0B43], [0x0B47,0x0B48], [0x0B4B,0x0B4D], [0x0B56,0x0B57], [0x0B82,0x0B83], [0x0BBE,0x0BC2],
- [0x0BC6,0x0BC8], [0x0BCA,0x0BCD], 0x0BD7, [0x0C01,0x0C03], [0x0C3E,0x0C44], [0x0C46,0x0C48], [0x0C4A,0x0C4D],
- [0x0C55,0x0C56], [0x0C82,0x0C83], [0x0CBE,0x0CC4], [0x0CC6,0x0CC8], [0x0CCA,0x0CCD], [0x0CD5,0x0CD6],
- [0x0D02,0x0D03], [0x0D3E,0x0D43], [0x0D46,0x0D48], [0x0D4A,0x0D4D], 0x0D57, 0x0E31, [0x0E34,0x0E3A],
- [0x0E47,0x0E4E], 0x0EB1, [0x0EB4,0x0EB9], [0x0EBB,0x0EBC], [0x0EC8,0x0ECD], [0x0F18,0x0F19], 0x0F35,
- 0x0F37, 0x0F39, 0x0F3E, 0x0F3F, [0x0F71,0x0F84], [0x0F86,0x0F8B], [0x0F90,0x0F95], 0x0F97, [0x0F99,0x0FAD],
- [0x0FB1,0x0FB7], 0x0FB9, [0x20D0,0x20DC], 0x20E1, [0x302A,0x302F], 0x3099, 0x309A);
-my @digit_1_0 = (
- [0x0030,0x0039], [0x0660,0x0669], [0x06F0,0x06F9], [0x0966,0x096F], [0x09E6,0x09EF], [0x0A66,0x0A6F],
- [0x0AE6,0x0AEF], [0x0B66,0x0B6F], [0x0BE7,0x0BEF], [0x0C66,0x0C6F], [0x0CE6,0x0CEF], [0x0D66,0x0D6F],
- [0x0E50,0x0E59], [0x0ED0,0x0ED9], [0x0F20,0x0F29]);
-my @extender_1_0 = (
- 0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005, [0x3031,0x3035], [0x309D,0x309E], [0x30FC,0x30FE]);
-my @sname_1_1 = (
- "[:A-Z_a-z]", [0xC0,0xD6], [0xD8,0xF6], [0xF8,0x2FF], [0x370,0x37D], [0x37F,0x1FFF],
- [0x200C,0x200D], [0x2070,0x218F], [0x2C00,0x2FEF], [0x3001,0xD7FF], [0xF900,0xFDCF], [0xFDF0,0xFFFD], [0x10000,0xEFFFF]);
-
-# Define all character categories. Each argument after the name is a single
-# code point, a [first,last] range or a "[...]" character class (see set()).
-set("WHITE", @white);
-set("NEW_LINE_1_0", 0xA, 0xD);
-set("NEW_LINE_1_1", 0xA, 0xD, 0x85, 0x2028);
-set("DIGIT", "[0-9]");
-set("XDIGIT", "[0-9a-fA-F]");
-set("VALID_1_0", @white, [0x20,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]);
-set("VALID_1_1", [0x1,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]);
-set("UNRESTRICTED_1_1", @white, [0x20,0x7E], 0x85, [0xA0,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]);
-set("PUBID", 0x20, 0xD, 0xA, "[-a-zA-Z0-9'()+,./:=?:!*#@\$_%]");
-set("ENC_SNAME", "[a-zA-Z]");
-set("ENC_NAME", "[-a-zA-Z0-9._]");
-set("SNAME_1_0", "[_:]", @base_char_1_0, @ideographic_1_0);
-set("NAME_1_0", "[-_:.]", @base_char_1_0, @ideographic_1_0, @combining_char_1_0, @digit_1_0, @extender_1_0);
-set("SNAME_1_1", @sname_1_1);
-set("NAME_1_1", @sname_1_1, "[-.0-9]", 0xB7, [0x0300,0x036F], [0x203F,0x2040]);
-set("GT", "[>]");
-
-# Usage: unicat.pl <output.h> <output.c>
-($ARGV[0] eq "" || $ARGV[1] eq "") && die("Invalid usage");
-find_cls();
-open(H, ">", $ARGV[0]) or die("Cannot create $ARGV[0]");
-open(C, ">", $ARGV[1]) or die("Cannot create $ARGV[1]");
-gen_enum();
-gen_tabs();
-close(H);
-close(C);
-
-# set($id, @items): add the given characters to the category $id.
-# The category gets the next free bit number on its first use. Each item is
-# either a [first,last] range, a "[...]" character class matched against the
-# characters 0..127, or a single code point below 0x10000.
-sub set {
- my $id = shift;
- $ids{$id} = scalar keys(%ids) if !defined($ids{$id});
- my $mask = 1 << $ids{$id};
- foreach my $i (@_) {
- if (ref($i) eq "ARRAY") {
- my $j = $i->[0];
- # Code points below 0x10000 are marked one by one ...
- for (; $j <= $i->[1] && $j < 0x10000; $j++) { $cat[$j] |= $mask; }
- # ... the rest of the range is marked per block of 0x10000 code points
- for (; $j <= $i->[1]; $j += 0x10000) { $lcat[$j >> 16] |= $mask; }
- }
- elsif ($i =~ /^\[/) { for (my $j=0; $j < 128; $j++) { if (chr($j) =~ /$i/) { $cat[$j] |= $mask; } } }
- else { $cat[$i] |= $mask; }
- }
-}
-
-# Assign a class number to every distinct category mask, numbering the masks
-# in the order of their first occurrence in @cat and then in @lcat.
-sub find_cls {
-	foreach my $m (@cat, @lcat) {
-		next if defined($cls{$m});
-		$cls{$m} = scalar keys(%cls);
-	}
-}
-
-# Write enum xml_char_type to the header file H. The value of each
-# XML_CHAR_<category> constant is a bit mask over class numbers: bit N is set
-# if the characters of class N belong to the category. Class 0 never
-# contributes a bit.
-sub gen_enum {
- print H "enum xml_char_type {\n";
- foreach my $id (sort keys %ids) {
- my $mask = 0;
- # The keys of %cls are the category masks of the individual classes
- foreach my $i (keys %cls) {
- $mask |= 1 << $cls{$i} if $cls{$i} && ($i & (1 << $ids{$id}));
- }
- printf H " XML_CHAR_%-20s = 0x%08x,\n", $id, $mask;
- }
- print H "};\n\n";
-}
-
-# Write the lookup tables to the C file and their declarations to the header:
-#   xml_char_tab2 -- for each of the 256 pages of 256 code points, the offset
-#                    of the page data in xml_char_tab1
-#   xml_char_tab1 -- class numbers of the code points, page by page; pages
-#                    with identical contents are stored only once
-#   xml_char_tab3 -- class numbers of the 0x11 blocks of 0x10000 code points
-sub gen_tabs {
- my @tab = ();
- my %hash = ();
-
- print H "extern const byte xml_char_tab1[];\n";
- print H "extern const uns xml_char_tab2[];\n";
- print H "extern const byte xml_char_tab3[];\n";
-
- print C "const uns xml_char_tab2[] = {\n ";
- for (my $t=0; $t<256; $t++) {
- my $i = $t * 256;
- my @x = ();
- for (my $j=0; $j<256; $j += 32) {
- push @x, join(",", map($cls{$_}, @cat[$i+$j..$i+$j+31]));
- }
- # The formatted text of the page serves as the key for sharing equal pages
- my $sub = " " . join(",\n ", @x);
- if (!defined($hash{$sub})) {
- $hash{$sub} = 256 * scalar @tab;
- push @tab, $sub;
- }
- printf C "0x%x", $hash{$sub};
- # Break the output line after every 16 entries and close the table after the last one
- print C ((~$t & 15) ? "," : ($t < 255) ? ",\n " : "\n};\n\n");
- }
-
- print C "const byte xml_char_tab1[] = {\n";
- print C join(",\n\n", @tab);
- print C "\n};\n\n";
-
- my @l = ();
- for (my $i=0; $i<0x11; $i++) {
- push @l, sprintf("%d", $cls{$lcat[$i]});
- }
- print C "const byte xml_char_tab3[] = {" . join(",", @l) . "};\n";
-}
+++ /dev/null
-/*
- * Sherlock Library -- A simple XML parser
- *
- * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
- *
- * This software may be freely distributed and used according to the terms
- * of the GNU Lesser General Public License.
- */
-
-#include "sherlock/sherlock.h"
-#include "sherlock/xml/xml.h"
-#include "sherlock/xml/dtd.h"
-#include "ucw/getopt.h"
-#include "ucw/fastbuf.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <fcntl.h>
-
-enum {
- WANT_FIRST = 0x100,
- WANT_HIDE_ERRORS,
- WANT_IGNORE_COMMENTS,
- WANT_IGNORE_PIS,
- WANT_REPORT_BLOCKS,
- WANT_REPORT_IGNORABLE,
- WANT_FILE_ENTITIES,
-};
-
-static char *shortopts = "spdt" CF_SHORT_OPTS;
-static struct option longopts[] = {
- CF_LONG_OPTS
- { "sax", 0, 0, 's' },
- { "pull", 0, 0, 'p' },
- { "dom", 0, 0, 't' },
- { "dtd", 0, 0, 'd' },
- { "hide-errors", 0, 0, WANT_HIDE_ERRORS },
- { "ignore-comments", 0, 0, WANT_IGNORE_COMMENTS },
- { "ignore-pis", 0, 0, WANT_IGNORE_PIS },
- { "report-blocks", 0, 0, WANT_REPORT_BLOCKS },
- { "report-ignorable", 0, 0, WANT_REPORT_IGNORABLE },
- { "file-entities", 0, 0, WANT_FILE_ENTITIES },
- { NULL, 0, 0, 0 }
-};
-
-static void NONRET
-usage(void)
-{
- fputs("\
-Usage: xml-test [options] < input.xml\n\
-\n\
-Options:\n"
-CF_USAGE
-"\
--p, --pull Test PULL interface\n\
--s, --sax Test SAX interface\n\
--t, --dom Test DOM interface\n\
--d, --dtd Enable parsing of DTD\n\
- --hide-errors Hide warnings and error messages\n\
- --ignore-comments Ignore comments\n\
- --ignore-pis Ignore processing instructions\n\
- --report-blocks Report blocks or characters and CDATA sections\n\
- --report-ignorable Report ignorable whitespace\n\
- --file-entities Resolve file external entities (not fully normative)\n\
-\n", stderr);
- exit(1);
-}
-
-static uns want_sax;
-static uns want_pull;
-static uns want_dom;
-static uns want_parse_dtd;
-static uns want_hide_errors;
-static uns want_ignore_comments;
-static uns want_ignore_pis;
-static uns want_report_blocks;
-static uns want_report_ignorable;
-static uns want_file_entities;
-
-static struct fastbuf *out;
-
-static char *
-node_type(struct xml_node *node)
-{
- switch (node->type)
- {
- case XML_NODE_ELEM: return "element";
- case XML_NODE_COMMENT: return "comment";
- case XML_NODE_PI: return "pi";
- case XML_NODE_CHARS: return "chars";
- default: return "unknown";
- }
-}
-
-static void
-show_node(struct xml_node *node)
-{
- switch (node->type)
- {
- case XML_NODE_ELEM:
- bprintf(out, " <%s>", node->name);
- XML_ATTR_FOR_EACH(a, node)
- bprintf(out, " %s='%s'", a->name, a->val);
- bputc(out, '\n');
- break;
- case XML_NODE_COMMENT:
- bprintf(out, " text='%s'\n", node->text);
- break;
- case XML_NODE_PI:
- bprintf(out, " target=%s text='%s'\n", node->name, node->text);
- break;
- case XML_NODE_CHARS:
- bprintf(out, " text='%s'\n", node->text);
- break;
- default:
- bputc(out, '\n');
- }
-}
-
-static void
-show_tree(struct xml_node *node, uns level)
-{
- if (!node)
- return;
- bputs(out, "DOM: ");
- for (uns i = 0; i < level; i++)
- bputs(out, " ");
- bputs(out, node_type(node));
- show_node(node);
- if (node->type == XML_NODE_ELEM)
- XML_NODE_FOR_EACH(son, node)
- show_tree(son, level + 1);
-}
-
-static void
-h_error(struct xml_context *ctx)
-{
- bprintf(out, "SAX: %s at %u: %s\n", (ctx->err_code < XML_ERR_ERROR) ? "warn" : "error", xml_row(ctx), ctx->err_msg);
-}
-
-static void
-h_document_start(struct xml_context *ctx UNUSED)
-{
- bputs(out, "SAX: document_start\n");
-}
-
-static void
-h_document_end(struct xml_context *ctx UNUSED)
-{
- bputs(out, "SAX: document_end\n");
-}
-
-static void
-h_xml_decl(struct xml_context *ctx)
-{
- bprintf(out, "SAX: xml_decl version=%s standalone=%d fb_encoding=%s\n", ctx->version_str, ctx->standalone, ctx->src->fb_encoding);
-}
-
-static void
-h_doctype_decl(struct xml_context *ctx)
-{
- bprintf(out, "SAX: doctype_decl type=%s public='%s' system='%s' extsub=%d intsub=%d\n",
- ctx->doctype, ctx->public_id ? : "", ctx->system_id ? : "",
- !!(ctx->flags & XML_HAS_EXTERNAL_SUBSET), !!(ctx->flags & XML_HAS_INTERNAL_SUBSET));
-}
-
-static void
-h_comment(struct xml_context *ctx)
-{
- bputs(out, "SAX: comment");
- show_node(ctx->node);
-}
-
-static void
-h_pi(struct xml_context *ctx)
-{
- bputs(out, "SAX: pi");
- show_node(ctx->node);
-}
-
-static void
-h_stag(struct xml_context *ctx)
-{
- bputs(out, "SAX: stag");
- show_node(ctx->node);
-}
-
-static void
-h_etag(struct xml_context *ctx)
-{
- bprintf(out, "SAX: etag </%s>\n", ctx->node->name);
-}
-
-static void
-h_chars(struct xml_context *ctx)
-{
- bputs(out, "SAX: chars");
- show_node(ctx->node);
-}
-
-static void
-h_block(struct xml_context *ctx UNUSED, char *text, uns len UNUSED)
-{
- bprintf(out, "SAX: block text='%s'\n", text);
-}
-
-static void
-h_cdata(struct xml_context *ctx UNUSED, char *text, uns len UNUSED)
-{
- bprintf(out, "SAX: cdata text='%s'\n", text);
-}
-
-static void
-h_ignorable(struct xml_context *ctx UNUSED, char *text, uns len UNUSED)
-{
- bprintf(out, "SAX: ignorable text='%s'\n", text);
-}
-
-static void
-h_dtd_start(struct xml_context *ctx UNUSED)
-{
- bputs(out, "SAX: dtd_start\n");
-}
-
-static void
-h_dtd_end(struct xml_context *ctx UNUSED)
-{
- bputs(out, "SAX: dtd_end\n");
-}
-
-static void
-h_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *e)
-{
- xml_push_fastbuf(ctx, bopen(e->system_id, O_RDONLY, 4096));
-}
-
-int
-main(int argc, char **argv)
-{
- int opt;
- cf_def_file = NULL;
- log_init(argv[0]);
- while ((opt = cf_getopt(argc, argv, shortopts, longopts, NULL)) >= 0)
- switch (opt)
- {
- case 's':
- want_sax++;
- break;
- case 'p':
- want_pull++;
- break;
- case 't':
- want_dom++;
- break;
- case 'd':
- want_parse_dtd++;
- break;
- case WANT_HIDE_ERRORS:
- want_hide_errors++;
- break;
- case WANT_IGNORE_COMMENTS:
- want_ignore_comments++;
- break;
- case WANT_IGNORE_PIS:
- want_ignore_pis++;
- break;
- case WANT_REPORT_BLOCKS:
- want_report_blocks++;
- break;
- case WANT_REPORT_IGNORABLE:
- want_report_ignorable++;
- break;
- case WANT_FILE_ENTITIES:
- want_file_entities++;
- break;
- default:
- usage();
- }
- if (optind != argc)
- usage();
-
- out = bfdopen_shared(1, 4096);
- struct xml_context ctx;
- xml_init(&ctx);
- if (!want_hide_errors)
- ctx.h_warn = ctx.h_error = ctx.h_fatal = h_error;
- if (want_sax)
- {
- ctx.h_document_start = h_document_start;
- ctx.h_document_end = h_document_end;
- ctx.h_xml_decl = h_xml_decl;
- ctx.h_doctype_decl = h_doctype_decl;
- ctx.h_comment = h_comment;
- ctx.h_pi = h_pi;
- ctx.h_stag = h_stag;
- ctx.h_etag = h_etag;
- ctx.h_chars = h_chars;
- if (want_report_blocks)
- {
- ctx.h_block = h_block;
- ctx.h_cdata = h_cdata;
- }
- if (want_report_ignorable)
- ctx.h_ignorable = h_ignorable;
- ctx.h_dtd_start = h_dtd_start;
- ctx.h_dtd_end = h_dtd_end;
- }
- if (want_dom)
- ctx.flags |= XML_ALLOC_ALL;
- if (want_parse_dtd)
- ctx.flags |= XML_PARSE_DTD;
- if (want_ignore_comments)
- ctx.flags &= ~(XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS);
- if (want_ignore_pis)
- ctx.flags &= ~(XML_REPORT_PIS | XML_ALLOC_PIS);
- if (want_file_entities)
- ctx.h_resolve_entity = h_resolve_entity;
- xml_push_fastbuf(&ctx, bfdopen_shared(0, 4096));
- bputs(out, "PULL: start\n");
- if (want_pull)
- {
- ctx.pull = XML_PULL_CHARS | XML_PULL_STAG | XML_PULL_ETAG | XML_PULL_COMMENT | XML_PULL_PI;
- uns state;
- while (state = xml_next(&ctx))
- switch (state)
- {
- case XML_STATE_CHARS:
- bputs(out, "PULL: chars");
- show_node(ctx.node);
- break;
- case XML_STATE_STAG:
- bputs(out, "PULL: stag");
- show_node(ctx.node);
- break;
- case XML_STATE_ETAG:
- bprintf(out, "PULL: etag </%s>\n", ctx.node->name);
- break;
- case XML_STATE_COMMENT:
- bputs(out, "PULL: comment");
- show_node(ctx.node);
- break;
- case XML_STATE_PI:
- bputs(out, "PULL: pi");
- show_node(ctx.node);
- break;
- default:
- bputs(out, "PULL: unknown\n");
- break;
- }
- }
- else
- xml_parse(&ctx);
- if (ctx.err_code)
- bprintf(out, "PULL: fatal error at %u: %s\n", xml_row(&ctx), ctx.err_msg);
- else
- {
- bputs(out, "PULL: eof\n");
- if (want_dom)
- show_tree(ctx.dom, 0);
- }
-
- xml_cleanup(&ctx);
- bclose(out);
- return 0;
-}
+++ /dev/null
-# Tests for the XML parser
-# (c) 2008 Pavel Charvat <pchar@ucw.cz>
-
-Run: ../obj/sherlock/xml/xml-test
-In: <?xml version="1.0"?>
- <html></html>
-Out: PULL: start
- PULL: eof
-
-Run: ../obj/sherlock/xml/xml-test -s
-In: <?xml version="1.0" encoding="ISO-8859-1"?>
- <html><a a1="val1" a2="val2">text1&amp;<</a>text2</html>
-Out: PULL: start
- SAX: document_start
- SAX: xml_decl version=1.0 standalone=0 fb_encoding=ISO-8859-1
- SAX: stag <html>
- SAX: stag <a> a1='val1' a2='val2'
- SAX: chars text='text1&<'
- SAX: etag </a>
- SAX: chars text='text2'
- SAX: etag </html>
- SAX: document_end
- PULL: eof
-
-Run: ../obj/sherlock/xml/xml-test -sptd
-In: <?xml version="1.0"?>
- <!DOCTYPE root [
- <!ELEMENT root (#PCDATA|a)*>
- <!ENTITY % pe1 "<!ENTITY e1 'text'>">
- %pe1;
- <!ENTITY e2 '<&e1;>'>
- <!ELEMENT a (#PCDATA)*>
- ]>
- <root>&e1;<a>&e2;</a></root>
-Out: PULL: start
- SAX: document_start
- SAX: xml_decl version=1.0 standalone=0 fb_encoding=UTF-8
- SAX: doctype_decl type=root public='' system='' extsub=0 intsub=1
- SAX: dtd_start
- SAX: dtd_end
- SAX: stag <root>
- PULL: stag <root>
- SAX: chars text='text'
- PULL: chars text='text'
- SAX: stag <a>
- PULL: stag <a>
- SAX: chars text='<text>'
- PULL: chars text='<text>'
- PULL: etag </a>
- SAX: etag </a>
- PULL: etag </root>
- SAX: etag </root>
- SAX: document_end
- PULL: eof
- DOM: element <root>
- DOM: chars text='text'
- DOM: element <a>
- DOM: chars text='<text>'
+++ /dev/null
-/*
- * Sherlock Library -- A simple XML parser
- *
- * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
- *
- * This software may be freely distributed and used according to the terms
- * of the GNU Lesser General Public License.
- */
-
-#ifndef _SHERLOCK_XML_XML_H
-#define _SHERLOCK_XML_XML_H
-
-#include "ucw/clists.h"
-#include "ucw/slists.h"
-#include "ucw/mempool.h"
-#include "ucw/fastbuf.h"
-
-struct xml_context;
-struct xml_dtd_entity;
-
-enum xml_error {
- XML_ERR_OK = 0,
- XML_ERR_WARN = 1000, /* Warning */
- XML_ERR_ERROR = 2000, /* Recoverable error */
- XML_ERR_FATAL = 3000, /* Unrecoverable error */
- XML_ERR_EOF,
-};
-
-enum xml_state {
- XML_STATE_EOF, /* EOF or a fatal error */
- XML_STATE_START, /* Initial state */
- XML_STATE_XML_DECL, /* XML_PULL_XML_DECL */
- XML_STATE_DOCTYPE_DECL, /* XML_PULL_DOCTYPE_DECL */
- XML_STATE_CHARS, /* XML_PULL_CHARS */
- XML_STATE_STAG, /* XML_PULL_STAG */
- XML_STATE_ETAG, /* XML_PULL_ETAG */
- XML_STATE_COMMENT, /* XML_PULL_COMMENT */
- XML_STATE_PI, /* XML_PULL_PI */
-
- /* Internal states */
- XML_STATE_CHARS_BEFORE_STAG,
- XML_STATE_CHARS_BEFORE_ETAG,
- XML_STATE_CHARS_BEFORE_CDATA,
- XML_STATE_CHARS_BEFORE_COMMENT,
- XML_STATE_CHARS_BEFORE_PI,
- XML_STATE_PROLOG_COMMENT,
- XML_STATE_PROLOG_PI,
- XML_STATE_EPILOG_COMMENT,
- XML_STATE_EPILOG_PI,
-};
-
-enum xml_pull {
- XML_PULL_XML_DECL = 0x00000001, /* Stop after the XML declaration */
- XML_PULL_DOCTYPE_DECL = 0x00000002, /* Stop in the doctype declaration (before optional internal subset) */
- XML_PULL_CHARS = 0x00000004,
- XML_PULL_STAG = 0x00000008,
- XML_PULL_ETAG = 0x00000010,
- XML_PULL_COMMENT = 0x00000020,
- XML_PULL_PI = 0x00000040,
- XML_PULL_ALL = 0xffffffff,
-};
-
-enum xml_flags {
- /* Enable reporting of various events via SAX and/or PUSH interface */
- XML_REPORT_COMMENTS = 0x00000001, /* Report comments */
- XML_REPORT_PIS = 0x00000002, /* Report processing instructions */
- XML_REPORT_CHARS = 0x00000004, /* Report characters */
- XML_REPORT_TAGS = 0x00000008, /* Report element starts/ends */
- XML_REPORT_MISC = XML_REPORT_COMMENTS | XML_REPORT_PIS,
- XML_REPORT_ALL = XML_REPORT_MISC | XML_REPORT_CHARS | XML_REPORT_TAGS,
-
- /* Enable construction of DOM for these types */
- XML_ALLOC_COMMENTS = 0x00000010, /* Create comment nodes */
- XML_ALLOC_PIS = 0x00000020, /* Create processing instruction nodes */
- XML_ALLOC_CHARS = 0x00000040, /* Create character nodes */
- XML_ALLOC_TAGS = 0x00000080, /* Create element nodes */
- XML_ALLOC_MISC = XML_ALLOC_COMMENTS | XML_ALLOC_PIS,
- XML_ALLOC_ALL = XML_ALLOC_MISC | XML_ALLOC_CHARS | XML_ALLOC_TAGS,
-
- /* Other parameters */
- XML_VALIDATING = 0x00000100, /* Validate everything (not fully implemented!) */
- XML_PARSE_DTD = 0x00000200, /* Enable parsing of DTD */
- XML_NO_CHARS = 0x00000400, /* The current element must not contain character data (filled automaticaly if using DTD) */
- XML_ALLOC_DEFAULT_ATTRS = 0x00000800, /* Allocate default attribute values so they can be found by XML_ATTR_FOR_EACH */
-
- /* Internals, do not change! */
- XML_EMPTY_ELEM_TAG = 0x00010000, /* The current element match EmptyElemTag */
- XML_VERSION_1_1 = 0x00020000, /* XML version is 1.1, otherwise 1.0 */
- XML_HAS_EXTERNAL_SUBSET = 0x00040000, /* The document contains a reference to external DTD subset */
- XML_HAS_INTERNAL_SUBSET = 0x00080000, /* The document contains an internal subset */
- XML_HAS_DTD = XML_HAS_EXTERNAL_SUBSET | XML_HAS_INTERNAL_SUBSET,
- XML_SRC_EOF = 0x00100000, /* EOF reached */
- XML_SRC_EXPECTED_DECL = 0x00200000, /* Just before optional or required XMLDecl/TextDecl */
- XML_SRC_DOCUMENT = 0x00400000, /* The document entity */
- XML_SRC_EXTERNAL = 0x00800000, /* An external entity */
-};
-
-enum xml_node_type {
- XML_NODE_ELEM,
- XML_NODE_COMMENT,
- XML_NODE_CHARS,
- XML_NODE_PI,
-};
-
-#define XML_NODE_FOR_EACH(var, node) CLIST_FOR_EACH(struct xml_node *, var, (node)->sons)
-#define XML_ATTR_FOR_EACH(var, node) SLIST_FOR_EACH(struct xml_attr *, var, (node)->attrs)
-
-struct xml_node {
- cnode n; /* Node for list of parent's sons */
- uns type; /* XML_NODE_x */
- struct xml_node *parent; /* Parent node */
- char *name; /* Element name / PI target */
- clist sons; /* Children nodes */
- union {
- struct {
- char *text; /* PI text / Comment / CDATA */
- uns len; /* Text length in bytes */
- };
- struct {
- struct xml_dtd_elem *dtd; /* Element DTD */
- slist attrs; /* Link list of element attributes */
- };
- };
- void *user; /* User-defined (initialized to NULL) */
-};
-
-struct xml_attr {
- snode n; /* Node for elem->attrs */
- struct xml_node *elem; /* Parent element */
- struct xml_dtd_attr *dtd; /* Attribute DTD */
- char *name; /* Attribute name */
- char *val; /* Attribute value */
- void *user; /* User-defined (initialized to NULL) */
-};
-
-#define XML_BUF_SIZE 32 /* At least 8 -- hardcoded */
-
-struct xml_source {
- struct xml_source *next; /* Link list of pending fastbufs (xml_context.sources) */
- struct fastbuf *fb; /* Source fastbuf */
- struct fastbuf *wrapped_fb; /* Original wrapped fastbuf (needed for cleanup) */
- struct fastbuf wrap_fb; /* Fbmem wrapper */
- u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */
- u32 *bptr, *bstop; /* Current state of the buffer */
- uns row; /* File position */
- char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */
- char *fb_encoding; /* Encoding of the source fastbuf */
- char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */
- uns refill_cat1; /* Character categories, which should be directly passed to the buffer */
- uns refill_cat2; /* Character categories, which should be processed as newlines (possibly in some built-in
- sequences) */
- void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */
- unsigned short *refill_in_to_x; /* Libcharset input table */
- uns saved_depth; /* Saved ctx->depth */
- uns pending_0xd; /* The last read character is 0xD */
-};
-
-struct xml_context {
- /* Error handling */
- char *err_msg; /* Last error message */
- enum xml_error err_code; /* Last error code */
- void *throw_buf; /* Where to jump on error */
- void (*h_warn)(struct xml_context *ctx); /* Warning callback */
- void (*h_error)(struct xml_context *ctx); /* Recoverable error callback */
- void (*h_fatal)(struct xml_context *ctx); /* Unrecoverable error callback */
-
- /* Memory management */
- struct mempool *pool; /* DOM pool */
- struct mempool *stack; /* Stack pool (freed as soon as possible) */
- struct xml_stack *stack_list; /* See xml_push(), xml_pop() */
- uns flags; /* XML_FLAG_x (restored on xml_pop()) */
- uns depth; /* Nesting level (for checking of valid source nesting -> valid pushes/pops on memory pools) */
- struct fastbuf chars; /* Character data / attribute value */
- struct mempool_state chars_state; /* Mempool state before the current character block has started */
- char *chars_trivial; /* If not empty, it will be appended to chars */
- void *tab_attrs; /* Hash table of element attributes */
-
- /* Input */
- struct xml_source *src; /* Current source */
- u32 *bptr, *bstop; /* Buffer with preprocessed characters (validated UCS-4 + category flags) */
- uns cat_chars; /* Unicode range of supported characters (cdata, attribute values, ...) */
- uns cat_unrestricted; /* Unrestricted characters (may appear in document/external entities) */
- uns cat_new_line; /* New line characters */
- uns cat_name; /* Characters that may appear in names */
- uns cat_sname; /* Characters that may begin a name */
-
- /* SAX-like interface */
- void (*h_document_start)(struct xml_context *ctx); /* Called before entering prolog */
- void (*h_document_end)(struct xml_context *ctx); /* Called after leaving epilog */
- void (*h_xml_decl)(struct xml_context *ctx); /* Called after the XML declaration */
- void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration (before optional internal subset) */
- void (*h_comment)(struct xml_context *ctx); /* Called after a comment (only with XML_REPORT_COMMENTS) */
- void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction (only with XML_REPORT_PIS) */
- void (*h_stag)(struct xml_context *ctx); /* Called after STag or EmptyElemTag (only with XML_REPORT_TAGS) */
- void (*h_etag)(struct xml_context *ctx); /* Called before ETag or after EmptyElemTag (only with XML_REPORT_TAGS) */
- void (*h_chars)(struct xml_context *ctx); /* Called after some characters (only with XML_REPORT_CHARS) */
- void (*h_block)(struct xml_context *ctx, char *text, uns len); /* Called for each continuous block of characters not reported by h_cdata() (only with XML_REPORT_CHARS) */
- void (*h_cdata)(struct xml_context *ctx, char *text, uns len); /* Called for each CDATA section (only with XML_REPORT_CHARS) */
- void (*h_ignorable)(struct xml_context *ctx, char *text, uns len); /* Called for ignorable whitespace (content in tags without #PCDATA) */
- void (*h_dtd_start)(struct xml_context *ctx); /* Called just after the DTD structure is initialized */
- void (*h_dtd_end)(struct xml_context *ctx); /* Called after DTD subsets subsets */
- struct xml_dtd_entity *(*h_find_entity)(struct xml_context *ctx, char *name); /* Called when needed to resolve a general entity */
- void (*h_resolve_entity)(struct xml_context *ctx, struct xml_dtd_entity *ent); /* User should push source fastbuf for a parsed external entity (either general or parameter) */
-
- /* DOM */
- struct xml_node *dom; /* DOM root */
- struct xml_node *node; /* Current DOM node */
-
- char *version_str;
- uns standalone;
- char *doctype; /* The document type (or NULL if unknown) */
- char *system_id; /* DTD external id */
- char *public_id; /* DTD public id */
- struct xml_dtd *dtd; /* The DTD structure (or NULL) */
- uns state; /* Current state for the PULL interface (XML_STATE_x) */
- uns pull; /* Parameters for the PULL interface (XML_PULL_x) */
-};
-
-/* Initialize XML context */
-void xml_init(struct xml_context *ctx);
-
-/* Clean up all internal structures */
-void xml_cleanup(struct xml_context *ctx);
-
-/* Reuse XML context, equivalent to xml_cleanup() and xml_init() */
-void xml_reset(struct xml_context *ctx);
-
-/* Add XML source (fastbuf will be automatically closed) */
-struct xml_source *xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb);
-
-/* Parse without the PUSH interface, return XML_ERR_x code (zero on success) */
-uns xml_parse(struct xml_context *ctx);
-
-/* Parse with the PUSH interface, return XML_STATE_x (zero on EOF or fatal error) */
-uns xml_next(struct xml_context *ctx);
-
-/* Equivalent to xml_next, but with temporarily changed ctx->pull value */
-uns xml_next_state(struct xml_context *ctx, uns pull);
-
-/* May be called on XML_STATE_STAG to skip it's content; can return XML_STATE_ETAG or XML_STATE_EOF on fatal error */
-uns xml_skip_element(struct xml_context *ctx);
-
-/* Returns the current row number in the document entity */
-uns xml_row(struct xml_context *ctx);
-
-/* Finds a given attribute value in a XML_NODE_ELEM node */
-struct xml_attr *xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name);
-
-/* Similar to xml_attr_find, but it deals also with default values */
-char *xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name);
-
-/* The default value of h_find_entity(), knows <, >, &, ' and " */
-struct xml_dtd_entity *xml_def_find_entity(struct xml_context *ctx, char *name);
-
-/* The default value of h_resolve_entity(), throws an error */
-void xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent);
-
-/* Remove leading/trailing spaces and replaces sequences of spaces to a single space character (non-CDATA attribute normalization) */
-uns xml_normalize_white(struct xml_context *ctx, char *value);
-
-/* Merge character contents of a given element to a single string (not recursive) */
-char *xml_merge_chars(struct xml_context *ctx, struct xml_node *node, struct mempool *pool);
-
-/* Merge character contents of a given subtree to a single string */
-char *xml_merge_dom_chars(struct xml_context *ctx, struct xml_node *node, struct mempool *pool);
-
-/* Public part of error handling */
-void xml_warn(struct xml_context *ctx, const char *format, ...);
-void xml_error(struct xml_context *ctx, const char *format, ...);
-void NONRET xml_fatal(struct xml_context *ctx, const char *format, ...);
-
-#endif
--- /dev/null
+# Makefile for the XML parser
+# (c) 2007 Pavel Charvat <pchar@ucw.cz>
+
+# Register this directory and its test program in the global lists.
+DIRS+=sherlock/xml
+PROGS+=$(o)/sherlock/xml/xml-test
+
+# Modules compiled into libshxml and the headers exported with it.
+LIBSHXML_MODS=common source parse dtd
+LIBSHXML_INCLUDES=xml.h dtd.h
+
+# Module names with the object directory prepended (no suffix yet).
+LIBSHXML_MOD_PATHS=$(addprefix $(o)/sherlock/xml/,$(LIBSHXML_MODS))
+
+# The static library is built from the .o objects, the shared one from the .oo objects.
+$(o)/sherlock/xml/libshxml.a: $(addsuffix .o,$(LIBSHXML_MOD_PATHS))
+$(o)/sherlock/xml/libshxml.so: $(addsuffix .oo,$(LIBSHXML_MOD_PATHS))
+$(o)/sherlock/xml/libshxml.pc: $(LIBSH) $(LIBCHARSET)
+
+# Every module (both object flavours) depends on the generated unicat.h.
+$(o)/sherlock/xml/common.o: $(o)/sherlock/xml/unicat.h
+$(o)/sherlock/xml/common.oo: $(o)/sherlock/xml/unicat.h
+$(o)/sherlock/xml/source.o: $(o)/sherlock/xml/unicat.h
+$(o)/sherlock/xml/source.oo: $(o)/sherlock/xml/unicat.h
+$(o)/sherlock/xml/dtd.o: $(o)/sherlock/xml/unicat.h
+$(o)/sherlock/xml/dtd.oo: $(o)/sherlock/xml/unicat.h
+$(o)/sherlock/xml/parse.o: $(o)/sherlock/xml/unicat.h
+$(o)/sherlock/xml/parse.oo: $(o)/sherlock/xml/unicat.h
+# unicat.pl is run once with both output paths (unicat.h and unicat.c); only the
+# header is a declared target, and the final touch refreshes its timestamp.
+# $(M) and $(Q) are presumably the build system's message/quiet prefixes -- they
+# are defined outside this file.
+$(o)/sherlock/xml/unicat.h: $(s)/sherlock/xml/unicat.pl
+ $(M)GEN $(addprefix $(o)/sherlock/xml/unicat,.h .c)
+ $(Q)$< $(addprefix $(o)/sherlock/xml/unicat,.h .c)
+ $(Q)touch $@
+
+# Test program: linked against $(LIBSHXML); its .test target depends on the binary.
+TESTS+=$(o)/sherlock/xml/xml-test.test
+$(o)/sherlock/xml/xml-test: $(o)/sherlock/xml/xml-test.o $(LIBSHXML)
+$(o)/sherlock/xml/xml-test.test: $(o)/sherlock/xml/xml-test
+
+# Public API: library name, header stamp (IDST is set per-target to sherlock/xml)
+# and the pkg-config file copied under run/.
+API_LIBS+=libshxml
+API_INCLUDES+=$(o)/sherlock/xml/.include-stamp
+$(o)/sherlock/xml/.include-stamp: $(addprefix $(s)/sherlock/xml/,$(LIBSHXML_INCLUDES))
+$(o)/sherlock/xml/.include-stamp: IDST=sherlock/xml
+run/lib/pkgconfig/libshxml.pc: $(o)/sherlock/xml/libshxml.pc
+
+# Installation: create the destination directories, then copy the headers, the
+# pkg-config file and the library from the run/ tree. The rule has no
+# prerequisites, so the run/ files must already exist when it is invoked.
+INSTALL_TARGETS+=install-sh-xml
+install-sh-xml:
+ install -d -m 755 $(DESTDIR)$(INSTALL_INCLUDE_DIR)/sherlock/xml $(DESTDIR)$(INSTALL_LIB_DIR) $(DESTDIR)$(INSTALL_PKGCONFIG_DIR)
+ install -m 644 $(addprefix run/include/sherlock/xml/,$(LIBSHXML_INCLUDES)) $(DESTDIR)$(INSTALL_INCLUDE_DIR)/sherlock/xml
+ install -m 644 run/lib/pkgconfig/libshxml.pc $(DESTDIR)$(INSTALL_PKGCONFIG_DIR)
+ install -m 644 run/lib/libshxml.$(LS) $(DESTDIR)$(INSTALL_LIB_DIR)
+
+.PHONY: install-sh-xml
--- /dev/null
+Non-normative / not-implemented:
+-- introduce numeric error codes
+-- cycle detection in internal entities (and possibly external?)
+-- conditional sections in DTD
+-- validation of elements (regular expressions, non-cdata)
+-- validation of attributes (unfinished)
+-- notations
+-- URI normalization
+-- support for xml:space
+-- support for xml:lang
+-- full support for standalone documents
+-- Unicode normalization
+
+Optimizations:
+-- detect definitions of trivial entities
--- /dev/null
+/*
+ * Sherlock Library -- A simple XML parser
+ *
+ * (c) 2007 Pavel Charvat <pchar@ucw.cz>
+ *
+ * This software may be freely distributed and used according to the terms
+ * of the GNU Lesser General Public License.
+ */
+
+#undef LOCAL_DEBUG
+
+#include "sherlock/sherlock.h"
+#include "sherlock/xml/xml.h"
+#include "sherlock/xml/dtd.h"
+#include "sherlock/xml/internals.h"
+#include "ucw/stkstring.h"
+#include "ucw/ff-unicode.h"
+
+#include <setjmp.h>
+
+/*** Error handling ***/
+
+/*
+ * Unwind to the recovery point stored in ctx->throw_buf, handing ctx->err_code
+ * to longjmp() as the jump value. Both fields must be non-zero (asserted).
+ * Never returns.
+ */
+void NONRET
+xml_throw(struct xml_context *ctx)
+{
+  ASSERT(ctx->err_code && ctx->throw_buf);
+  jmp_buf *landing = ctx->throw_buf;
+  longjmp(*landing, ctx->err_code);
+}
+
+/*
+ * Report a warning through ctx->h_warn. Does nothing when no warning handler
+ * is installed. While the handler runs, ctx->err_msg holds the formatted
+ * message and ctx->err_code is XML_ERR_WARN; both are reset afterwards.
+ */
+void
+xml_warn(struct xml_context *ctx, const char *format, ...)
+{
+  if (!ctx->h_warn)
+    return;
+
+  va_list ap;
+  va_start(ap, format);
+  /* NOTE(review): stk_vprintf() presumably formats into stack storage, which
+   * would be why err_msg is cleared again before returning -- confirm. */
+  ctx->err_msg = stk_vprintf(format, ap);
+  ctx->err_code = XML_ERR_WARN;
+  va_end(ap);
+
+  ctx->h_warn(ctx);
+
+  ctx->err_msg = NULL;
+  ctx->err_code = XML_ERR_OK;
+}
+
+/*
+ * Report a recoverable error through ctx->h_error. Does nothing when no error
+ * handler is installed. While the handler runs, ctx->err_msg holds the
+ * formatted message and ctx->err_code is XML_ERR_ERROR; both are reset
+ * afterwards, so parsing continues with a clean error state.
+ */
+void
+xml_error(struct xml_context *ctx, const char *format, ...)
+{
+  if (!ctx->h_error)
+    return;
+
+  va_list ap;
+  va_start(ap, format);
+  /* NOTE(review): stk_vprintf() presumably formats into stack storage, which
+   * would be why err_msg is cleared again before returning -- confirm. */
+  ctx->err_msg = stk_vprintf(format, ap);
+  ctx->err_code = XML_ERR_ERROR;
+  va_end(ap);
+
+  ctx->h_error(ctx);
+
+  ctx->err_msg = NULL;
+  ctx->err_code = XML_ERR_OK;
+}
+
+/*
+ * Raise an unrecoverable error. The message is formatted into ctx->stack (so,
+ * unlike in xml_warn()/xml_error(), it is not reset here), the context is
+ * switched to XML_ERR_FATAL / XML_STATE_EOF, the optional ctx->h_fatal
+ * handler is called and finally control is transferred via xml_throw().
+ * Never returns.
+ */
+void NONRET
+xml_fatal(struct xml_context *ctx, const char *format, ...)
+{
+  va_list ap;
+  va_start(ap, format);
+  char *msg = mp_vprintf(ctx->stack, format, ap);
+  va_end(ap);
+
+  ctx->err_msg = msg;
+  ctx->err_code = XML_ERR_FATAL;
+  ctx->state = XML_STATE_EOF;
+
+  if (ctx->h_fatal)
+    ctx->h_fatal(ctx);
+  xml_throw(ctx);
+}
+
+/*** Memory management ***/
+
+/*
+ * Allocate zeroed storage of `size' bytes for a hash table from `pool',
+ * preceded by a header of XML_HASH_HDR_SIZE bytes. The owning pool pointer is
+ * stored at the start of the header; the returned pointer addresses the
+ * storage just past the header.
+ */
+void *
+xml_hash_new(struct mempool *pool, uns size)
+{
+  char *block = mp_alloc_zero(pool, size + XML_HASH_HDR_SIZE);
+  *(void **)block = pool;
+  return block + XML_HASH_HDR_SIZE;
+}
+
+/*** Initialization ***/
+
+/*
+ * Template copied into every context by xml_init() and xml_reset().
+ * All members not listed here start zeroed.
+ */
+static struct xml_context xml_defaults = {
+  .state = XML_STATE_START,
+  .flags = XML_REPORT_ALL | XML_SRC_EOF,
+  .h_resolve_entity = xml_def_resolve_entity,
+  /* Fastbuf used to collect character data and attribute values */
+  .chars.name = "<xml_chars>",
+  .chars.spout = xml_spout_chars,
+  .chars.can_overwrite_buffer = 1,
+};
+
+/* Initialization shared by xml_init() and xml_reset(); run after the pools are in place. */
+static void
+xml_do_init(struct xml_context *ctx)
+{
+  xml_attrs_table_init(ctx);
+}
+
+/*
+ * Initialize a context: copy the defaults, create the two memory pools
+ * (ctx->pool and ctx->stack, 65536 each) and run the shared initialization.
+ */
+void
+xml_init(struct xml_context *ctx)
+{
+  struct mempool *dom_pool = mp_new(65536);
+  struct mempool *stack_pool = mp_new(65536);
+
+  *ctx = xml_defaults;
+  ctx->pool = dom_pool;
+  ctx->stack = stack_pool;
+  xml_do_init(ctx);
+  TRACE(ctx, "init");
+}
+
+/*
+ * Tear down a context: clean up the attribute table, the DTD and the sources,
+ * then delete both memory pools. The sub-structure cleanups run before the
+ * pools are deleted. ctx->pool and ctx->stack are not cleared afterwards.
+ */
+void
+xml_cleanup(struct xml_context *ctx)
+{
+ TRACE(ctx, "cleanup");
+ xml_attrs_table_cleanup(ctx);
+ xml_dtd_cleanup(ctx);
+ xml_sources_cleanup(ctx);
+ mp_delete(ctx->pool);
+ mp_delete(ctx->stack);
+}
+
+/*
+ * Return a context to its default state while keeping its two memory pools:
+ * the sub-structures are cleaned up, both pools are flushed instead of
+ * deleted, the defaults are copied in and the shared initialization is re-run.
+ */
+void
+xml_reset(struct xml_context *ctx)
+{
+ TRACE(ctx, "reset");
+ /* Remember the pools; the struct copy below overwrites ctx->pool and ctx->stack */
+ struct mempool *pool = ctx->pool, *stack = ctx->stack;
+ xml_attrs_table_cleanup(ctx);
+ xml_dtd_cleanup(ctx);
+ xml_sources_cleanup(ctx);
+ mp_flush(pool);
+ mp_flush(stack);
+ *ctx = xml_defaults;
+ ctx->pool = pool;
+ ctx->stack = stack;
+ xml_do_init(ctx);
+}
--- /dev/null
+/*
+ * Sherlock Library -- A simple XML parser
+ *
+ * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
+ *
+ * This software may be freely distributed and used according to the terms
+ * of the GNU Lesser General Public License.
+ */
+
+#undef LOCAL_DEBUG
+
+#include "sherlock/sherlock.h"
+#include "sherlock/xml/xml.h"
+#include "sherlock/xml/dtd.h"
+#include "sherlock/xml/internals.h"
+#include "ucw/fastbuf.h"
+#include "ucw/ff-unicode.h"
+#include "ucw/unicode.h"
+
+/* Notations */
+
+/*
+ * Instantiate the hash table template from ucw/hashtable.h for
+ * struct xml_dtd_notn, keyed by its `name' string. Generated identifiers get
+ * the xml_dtd_notns_ prefix; lookup and find operations are requested.
+ * XML_HASH_GIVE_ALLOC is defined outside this file -- presumably it supplies
+ * the allocator hooks announced by HASH_GIVE_ALLOC / HASH_TABLE_ALLOC.
+ */
+#define HASH_PREFIX(x) xml_dtd_notns_##x
+#define HASH_NODE struct xml_dtd_notn
+#define HASH_KEY_STRING name
+#define HASH_ZERO_FILL
+#define HASH_TABLE_DYNAMIC
+#define HASH_WANT_LOOKUP
+#define HASH_WANT_FIND
+#define HASH_GIVE_ALLOC
+#define HASH_TABLE_ALLOC
+XML_HASH_GIVE_ALLOC
+#include "ucw/hashtable.h"
+
+/*
+ * Find a declared notation by name.
+ *
+ * Returns the notation, or NULL when the context has no DTD, when the name is
+ * not in the notation table, or when the entry exists but does not carry
+ * XML_DTD_NOTN_DECLARED.
+ */
+struct xml_dtd_notn *
+xml_dtd_find_notn(struct xml_context *ctx, char *name)
+{
+  struct xml_dtd *dtd = ctx->dtd;
+  /* ctx->dtd is NULL in a freshly initialized context; behave like
+   * xml_dtd_find_elem() and report "not found" instead of dereferencing it. */
+  if (!dtd)
+    return NULL;
+  struct xml_dtd_notn *notn = xml_dtd_notns_find(dtd->tab_notns, name);
+  return (notn && (notn->flags & XML_DTD_NOTN_DECLARED)) ? notn : NULL;
+}
+
+/* General entities */
+
+/*
+ * Instantiate the hash table template from ucw/hashtable.h for
+ * struct xml_dtd_entity, keyed by its `name' string. Generated identifiers get
+ * the xml_dtd_ents_ prefix; this file uses the instantiation for both the
+ * general (tab_ents) and the parameter (tab_pents) entity tables.
+ * XML_HASH_GIVE_ALLOC is defined outside this file -- presumably it supplies
+ * the allocator hooks announced by HASH_GIVE_ALLOC / HASH_TABLE_ALLOC.
+ */
+#define HASH_PREFIX(x) xml_dtd_ents_##x
+#define HASH_NODE struct xml_dtd_entity
+#define HASH_KEY_STRING name
+#define HASH_ZERO_FILL
+#define HASH_TABLE_DYNAMIC
+#define HASH_WANT_FIND
+#define HASH_WANT_LOOKUP
+#define HASH_GIVE_ALLOC
+#define HASH_TABLE_ALLOC
+XML_HASH_GIVE_ALLOC
+#include "ucw/hashtable.h"
+
+/*
+ * Declare a general entity whose replacement is the literal string `text'.
+ * The entry is looked up in dtd->tab_ents by name. If it is already marked as
+ * declared, a warning is issued and NULL is returned; otherwise it is appended
+ * to dtd->ents, flagged as declared and trivial, and returned. `text' is
+ * stored by pointer, not copied.
+ */
+static struct xml_dtd_entity *
+xml_dtd_declare_trivial_entity(struct xml_context *ctx, char *name, char *text)
+{
+  struct xml_dtd *dtd = ctx->dtd;
+  struct xml_dtd_entity *ent = xml_dtd_ents_lookup(dtd->tab_ents, name);
+
+  if (!(ent->flags & XML_DTD_ENTITY_DECLARED))
+    {
+      ent->text = text;
+      ent->flags = XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_TRIVIAL;
+      slist_add_tail(&dtd->ents, &ent->n);
+      return ent;
+    }
+
+  xml_warn(ctx, "Entity &%s; already declared", name);
+  return NULL;
+}
+
+/* Declare the five entities lt, gt, amp, apos and quot in the context's DTD, in that order. */
+static void
+xml_dtd_declare_default_entities(struct xml_context *ctx)
+{
+  static char * const predefined[][2] = {
+    { "lt", "<" },
+    { "gt", ">" },
+    { "amp", "&" },
+    { "apos", "'" },
+    { "quot", "\"" },
+  };
+
+  for (uns i = 0; i < sizeof(predefined) / sizeof(predefined[0]); i++)
+    xml_dtd_declare_trivial_entity(ctx, predefined[i][0], predefined[i][1]);
+}
+
+/*
+ * Entity lookup that does not consult any DTD: knows only lt, gt, amp, apos
+ * and quot. Returns a pointer to a static entity record flagged as declared
+ * and trivial, or NULL for any other name. `ctx' is unused.
+ */
+struct xml_dtd_entity *
+xml_def_find_entity(struct xml_context *ctx UNUSED, char *name)
+{
+#define ENT(n, t) { .name = #n, .text = t, .flags = XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_TRIVIAL }
+  static struct xml_dtd_entity predefined[] = {
+    ENT(lt, "<"), ENT(gt, ">"), ENT(amp, "&"), ENT(apos, "'"), ENT(quot, "\""),
+  };
+#undef ENT
+
+  for (uns i = 0; i < sizeof(predefined) / sizeof(predefined[0]); i++)
+    if (!strcmp(name, predefined[i].name))
+      return &predefined[i];
+  return NULL;
+}
+
+/*
+ * Resolve a general entity by name. Sources are tried in this order:
+ *   1. the user callback ctx->h_find_entity, if set (its result is final),
+ *   2. the DTD's entity table, if a DTD exists (only declared entries count),
+ *   3. xml_def_find_entity().
+ * Returns NULL when the chosen source does not know the entity.
+ */
+struct xml_dtd_entity *
+xml_dtd_find_entity(struct xml_context *ctx, char *name)
+{
+  if (ctx->h_find_entity)
+    return ctx->h_find_entity(ctx, name);
+
+  struct xml_dtd *dtd = ctx->dtd;
+  if (!dtd)
+    return xml_def_find_entity(ctx, name);
+
+  struct xml_dtd_entity *ent = xml_dtd_ents_find(dtd->tab_ents, name);
+  if (!ent || !(ent->flags & XML_DTD_ENTITY_DECLARED))
+    return NULL;
+  return ent;
+}
+
+/* Parameter entities */
+
+/*
+ * Find a declared parameter entity in dtd->tab_pents. Returns NULL when the
+ * name is unknown or the entry is not flagged as declared. ctx->dtd is
+ * dereferenced without a check, so the caller must have a DTD.
+ */
+static struct xml_dtd_entity *
+xml_dtd_find_pentity(struct xml_context *ctx, char *name)
+{
+  struct xml_dtd_entity *pent = xml_dtd_ents_find(ctx->dtd->tab_pents, name);
+  if (pent && (pent->flags & XML_DTD_ENTITY_DECLARED))
+    return pent;
+  return NULL;
+}
+
+/* Elements */
+
+/* Forward declaration so the hook below can name the table type before the template defines it */
+struct xml_dtd_elems_table;
+
+/* Hook announced by HASH_GIVE_INIT_DATA below: initializes the attribute list of an element node */
+static void
+xml_dtd_elems_init_data(struct xml_dtd_elems_table *tab UNUSED, struct xml_dtd_elem *e)
+{
+ slist_init(&e->attrs);
+}
+
+/*
+ * Instantiate the hash table template from ucw/hashtable.h for
+ * struct xml_dtd_elem, keyed by its `name' string. Generated identifiers get
+ * the xml_dtd_elems_ prefix.
+ * XML_HASH_GIVE_ALLOC is defined outside this file -- presumably it supplies
+ * the allocator hooks announced by HASH_GIVE_ALLOC / HASH_TABLE_ALLOC.
+ */
+#define HASH_PREFIX(x) xml_dtd_elems_##x
+#define HASH_NODE struct xml_dtd_elem
+#define HASH_KEY_STRING name
+#define HASH_TABLE_DYNAMIC
+#define HASH_ZERO_FILL
+#define HASH_WANT_FIND
+#define HASH_WANT_LOOKUP
+#define HASH_GIVE_ALLOC
+#define HASH_GIVE_INIT_DATA
+#define HASH_TABLE_ALLOC
+XML_HASH_GIVE_ALLOC
+#include "ucw/hashtable.h"
+
+/*
+ * Look up an element declaration by name in the DTD's element table.
+ * Returns NULL when the context has no DTD or the name is not in the table.
+ */
+struct xml_dtd_elem *
+xml_dtd_find_elem(struct xml_context *ctx, char *name)
+{
+  if (!ctx->dtd)
+    return NULL;
+  return xml_dtd_elems_find(ctx->dtd->tab_elems, name);
+}
+
+/* Element sons */
+
+/* Forward declaration so the hooks below can name the table type before the template defines it */
+struct xml_dtd_enodes_table;
+
+/* Hash hook: combines the hashes of the two pointers forming the key */
+static inline uns
+xml_dtd_enodes_hash(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem)
+{
+ return hash_pointer(parent) ^ hash_pointer(elem);
+}
+
+/* Equality hook: two keys match when both pointers are identical */
+static inline int
+xml_dtd_enodes_eq(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *parent1, struct xml_dtd_elem *elem1, struct xml_dtd_elem_node *parent2, struct xml_dtd_elem *elem2)
+{
+ return (parent1 == parent2) && (elem1 == elem2);
+}
+
+/* Key-initialization hook: stores the key pair in the node */
+static inline void
+xml_dtd_enodes_init_key(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *node, struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem)
+{
+ node->parent = parent;
+ node->elem = elem;
+}
+
+/*
+ * Instantiate the hash table template from ucw/hashtable.h for
+ * struct xml_dtd_elem_node with the complex key (parent, elem); the hash,
+ * equality and key-initialization hooks are the functions defined above.
+ * Generated identifiers get the xml_dtd_enodes_ prefix.
+ * XML_HASH_GIVE_ALLOC is defined outside this file -- presumably it supplies
+ * the allocator hooks announced by HASH_GIVE_ALLOC / HASH_TABLE_ALLOC.
+ */
+#define HASH_PREFIX(x) xml_dtd_enodes_##x
+#define HASH_NODE struct xml_dtd_elem_node
+#define HASH_KEY_COMPLEX(x) x parent, x elem
+#define HASH_KEY_DECL struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem
+#define HASH_GIVE_HASHFN
+#define HASH_GIVE_EQ
+#define HASH_GIVE_INIT_KEY
+#define HASH_TABLE_DYNAMIC
+#define HASH_ZERO_FILL
+#define HASH_WANT_FIND
+#define HASH_WANT_NEW
+#define HASH_GIVE_ALLOC
+#define HASH_TABLE_ALLOC
+XML_HASH_GIVE_ALLOC
+#include "ucw/hashtable.h"
+
+/* Element attributes */
+
+/* Table type for the hash of attribute definitions, keyed by the pair
+ * (owner element, attribute name). */
+struct xml_dtd_attrs_table;
+
+/* Hash of the composite key: element pointer hash XOR name string hash. */
+static inline uns
+xml_dtd_attrs_hash(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem, char *name)
+{
+ return hash_pointer(elem) ^ hash_string(name);
+}
+
+/* Keys match when they belong to the same element and the names compare equal. */
+static inline int
+xml_dtd_attrs_eq(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem1, char *name1, struct xml_dtd_elem *elem2, char *name2)
+{
+ return (elem1 == elem2) && !strcmp(name1, name2);
+}
+
+/* Store the key into a new attribute node.  Side effect: the attribute is also
+ * appended to its element's `attrs` list.  The name pointer is stored as-is
+ * (not copied). */
+static inline void
+xml_dtd_attrs_init_key(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_elem *elem, char *name)
+{
+ attr->elem = elem;
+ attr->name = name;
+ slist_add_tail(&elem->attrs, &attr->n);
+}
+
+/* Instantiate the generic hash table for struct xml_dtd_attr with the
+ * callbacks above.  This file uses the generated xml_dtd_attrs_init(),
+ * xml_dtd_attrs_find() and xml_dtd_attrs_new(). */
+#define HASH_PREFIX(x) xml_dtd_attrs_##x
+#define HASH_NODE struct xml_dtd_attr
+#define HASH_ZERO_FILL
+#define HASH_TABLE_DYNAMIC
+#define HASH_KEY_COMPLEX(x) x elem, x name
+#define HASH_KEY_DECL struct xml_dtd_elem *elem, char *name
+#define HASH_GIVE_HASHFN
+#define HASH_GIVE_EQ
+#define HASH_GIVE_INIT_KEY
+#define HASH_WANT_FIND
+#define HASH_WANT_NEW
+#define HASH_GIVE_ALLOC
+#define HASH_TABLE_ALLOC
+XML_HASH_GIVE_ALLOC
+#include "ucw/hashtable.h"
+
+/* Find the definition of attribute `name` of element `elem`.
+ * Returns NULL when the context has no DTD or no such attribute is recorded. */
+struct xml_dtd_attr *
+xml_dtd_find_attr(struct xml_context *ctx, struct xml_dtd_elem *elem, char *name)
+{
+  struct xml_dtd *dtd = ctx->dtd;
+  if (!dtd)
+    return NULL;
+  return xml_dtd_attrs_find(dtd->tab_attrs, elem, name);
+}
+
+/* Enumerated attribute values */
+
+/* Table type for the hash of enumerated attribute values, keyed by the pair
+ * (attribute definition, value string). */
+struct xml_dtd_evals_table;
+
+/* Hash of the composite key: attribute pointer hash XOR value string hash. */
+static inline uns
+xml_dtd_evals_hash(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr, char *val)
+{
+ return hash_pointer(attr) ^ hash_string(val);
+}
+
+/* Keys match when they refer to the same attribute and the values compare equal. */
+static inline int
+xml_dtd_evals_eq(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr1, char *val1, struct xml_dtd_attr *attr2, char *val2)
+{
+ return (attr1 == attr2) && !strcmp(val1, val2);
+}
+
+/* Store the key into a new node; the value pointer is stored as-is (not copied). */
+static inline void
+xml_dtd_evals_init_key(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_eval *eval, struct xml_dtd_attr *attr, char *val)
+{
+ eval->attr = attr;
+ eval->val = val;
+}
+
+/* Instantiate the generic hash table for struct xml_dtd_eval with the
+ * callbacks above.  This file uses the generated xml_dtd_evals_init(),
+ * xml_dtd_evals_find() and xml_dtd_evals_new(). */
+#define HASH_PREFIX(x) xml_dtd_evals_##x
+#define HASH_NODE struct xml_dtd_eval
+#define HASH_TABLE_DYNAMIC
+#define HASH_KEY_COMPLEX(x) x attr, x val
+#define HASH_KEY_DECL struct xml_dtd_attr *attr, char *val
+#define HASH_GIVE_HASHFN
+#define HASH_GIVE_EQ
+#define HASH_GIVE_INIT_KEY
+#define HASH_WANT_FIND
+#define HASH_WANT_NEW
+#define HASH_GIVE_ALLOC
+#define HASH_TABLE_ALLOC
+XML_HASH_GIVE_ALLOC
+#include "ucw/hashtable.h"
+
+/* Enumerated attribute notations */
+
+/* Table type for the hash of enumerated attribute notations, keyed by the pair
+ * (attribute definition, notation). */
+struct xml_dtd_enotns_table;
+
+/* Hash of the composite key: both pointer hashes combined by XOR. */
+static inline uns
+xml_dtd_enotns_hash(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn)
+{
+ return hash_pointer(attr) ^ hash_pointer(notn);
+}
+
+/* Two keys are equal when both pointers are identical. */
+static inline int
+xml_dtd_enotns_eq(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr1, struct xml_dtd_notn *notn1, struct xml_dtd_attr *attr2, struct xml_dtd_notn *notn2)
+{
+ return (attr1 == attr2) && (notn1 == notn2);
+}
+
+/* Store the key into a freshly created node. */
+static inline void
+xml_dtd_enotns_init_key(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_enotn *enotn, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn)
+{
+ enotn->attr = attr;
+ enotn->notn = notn;
+}
+
+/* Instantiate the generic hash table for struct xml_dtd_enotn with the
+ * callbacks above.  This file uses the generated xml_dtd_enotns_init(),
+ * xml_dtd_enotns_find() and xml_dtd_enotns_new(). */
+#define HASH_PREFIX(x) xml_dtd_enotns_##x
+#define HASH_NODE struct xml_dtd_enotn
+#define HASH_TABLE_DYNAMIC
+#define HASH_KEY_COMPLEX(x) x attr, x notn
+#define HASH_KEY_DECL struct xml_dtd_attr *attr, struct xml_dtd_notn *notn
+#define HASH_GIVE_HASHFN
+#define HASH_GIVE_EQ
+#define HASH_GIVE_INIT_KEY
+#define HASH_WANT_FIND
+#define HASH_WANT_NEW
+#define HASH_GIVE_ALLOC
+#define HASH_TABLE_ALLOC
+XML_HASH_GIVE_ALLOC
+#include "ucw/hashtable.h"
+
+/* DTD initialization/cleanup */
+
+/* Create the DTD structure of a context unless one already exists.
+ * A dedicated memory pool is created; the DTD itself and all of its hash
+ * tables are allocated from it, and finally the default entities are declared. */
+void
+xml_dtd_init(struct xml_context *ctx)
+{
+  if (ctx->dtd)
+    return;
+
+  struct mempool *pool = mp_new(4096);
+  struct xml_dtd *dtd = mp_alloc_zero(pool, sizeof(*dtd));
+  ctx->dtd = dtd;
+  dtd->pool = pool;
+
+  /* General and parameter entities share one table type */
+  dtd->tab_ents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table));
+  xml_dtd_ents_init(dtd->tab_ents);
+  dtd->tab_pents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table));
+  xml_dtd_ents_init(dtd->tab_pents);
+
+  dtd->tab_notns = xml_hash_new(pool, sizeof(struct xml_dtd_notns_table));
+  xml_dtd_notns_init(dtd->tab_notns);
+  dtd->tab_elems = xml_hash_new(pool, sizeof(struct xml_dtd_elems_table));
+  xml_dtd_elems_init(dtd->tab_elems);
+  dtd->tab_enodes = xml_hash_new(pool, sizeof(struct xml_dtd_enodes_table));
+  xml_dtd_enodes_init(dtd->tab_enodes);
+  dtd->tab_attrs = xml_hash_new(pool, sizeof(struct xml_dtd_attrs_table));
+  xml_dtd_attrs_init(dtd->tab_attrs);
+  dtd->tab_evals = xml_hash_new(pool, sizeof(struct xml_dtd_evals_table));
+  xml_dtd_evals_init(dtd->tab_evals);
+  dtd->tab_enotns = xml_hash_new(pool, sizeof(struct xml_dtd_enotns_table));
+  xml_dtd_enotns_init(dtd->tab_enotns);
+
+  xml_dtd_declare_default_entities(ctx);
+}
+
+/* Release the DTD of a context, if any: its memory pool (which holds the DTD
+ * structure itself, see xml_dtd_init()) is deleted and ctx->dtd is reset. */
+void
+xml_dtd_cleanup(struct xml_context *ctx)
+{
+  struct xml_dtd *dtd = ctx->dtd;
+  if (dtd)
+    {
+      mp_delete(dtd->pool);
+      ctx->dtd = NULL;
+    }
+}
+
+/* Hook called once the DTD is complete.  Currently a placeholder that does
+ * nothing, with or without a DTD. */
+void
+xml_dtd_finish(struct xml_context *ctx)
+{
+  if (ctx->dtd)
+    {
+      // FIXME: validity checks
+    }
+}
+
+/*** Parsing functions ***/
+
+/* References to parameter entities */
+
+/* Parse a reference to a parameter entity and, if the entity is declared,
+ * push it as a new input source.
+ *
+ * PEReference ::= '%' Name ';'
+ * Already parsed: '%' */
+void
+xml_parse_pe_ref(struct xml_context *ctx)
+{
+  struct mempool_state saved;
+  mp_save(ctx->stack, &saved);
+
+  /* The name lives on the temporary stack; use it only before mp_restore() */
+  char *name = xml_parse_name(ctx, ctx->stack);
+  xml_parse_char(ctx, ';');
+  struct xml_dtd_entity *ent = xml_dtd_find_pentity(ctx, name);
+  if (ent)
+    TRACE(ctx, "Pushed entity %%%s;", name);
+  else
+    xml_error(ctx, "Unknown entity %%%s;", name);
+
+  mp_restore(ctx->stack, &saved);
+  xml_dec(ctx);
+  if (ent)
+    xml_push_entity(ctx, ent);
+}
+
+/* Handle one or more consecutive parameter entity references inside a DTD
+ * declaration.  Called after a '%' has been consumed.
+ *
+ * entity_decl == ~0U marks the special position right after '<!ENTITY': there
+ * a '%' followed by whitespace is not a reference but the marker of a
+ * parameter entity declaration, which is reported by returning ~0U.
+ * In all other cases the references are processed and 1 is returned. */
+static uns
+xml_parse_dtd_pe(struct xml_context *ctx, uns entity_decl)
+{
+ /* Already parsed: '%' */
+ do
+ {
+ xml_inc(ctx);
+ /* !~entity_decl is true only for entity_decl == ~0U */
+ if (!~entity_decl && (xml_peek_cat(ctx) & XML_CHAR_WHITE))
+ {
+ /* Undo the xml_inc() above; nothing else was consumed */
+ xml_dec(ctx);
+ return ~0U;
+ }
+ /* xml_parse_pe_ref() calls xml_dec() itself, balancing the xml_inc() above */
+ xml_parse_pe_ref(ctx);
+ while (xml_peek_cat(ctx) & XML_CHAR_WHITE)
+ xml_skip_char(ctx);
+ }
+ while (xml_get_char(ctx) == '%');
+ /* The loop condition consumed one character that was not '%'; put it back */
+ xml_unget_char(ctx);
+ return 1;
+}
+
+/* Skip whitespace and/or parameter entity references between DTD tokens.
+ *
+ * mandatory == 0:   the separator is optional
+ * mandatory != 0:   a fatal error is raised when neither whitespace nor a '%' is found
+ * mandatory == ~0U: special meaning of the whitespace before the '%' character
+ *                   in a parameter entity declaration (see xml_parse_dtd_pe())
+ *
+ * Returns the result of xml_parse_dtd_pe() when a '%' follows the whitespace,
+ * otherwise 1 if any whitespace was skipped and 0 if not. */
+static inline uns
+xml_parse_dtd_white(struct xml_context *ctx, uns mandatory)
+{
+  uns seen = 0;
+  for (; xml_peek_cat(ctx) & XML_CHAR_WHITE; seen = 1)
+    xml_skip_char(ctx);
+
+  if (xml_peek_char(ctx) != '%')
+    {
+      if (unlikely(mandatory && !seen))
+        xml_fatal_expected_white(ctx);
+      return seen;
+    }
+
+  xml_skip_char(ctx);
+  return xml_parse_dtd_pe(ctx, mandatory);
+}
+
+/* Parse an external identifier and store the literals (allocated from the DTD
+ * pool) to *system_id and *public_id; the one that is absent is set to NULL.
+ *
+ *   'SYSTEM' S SystemLiteral
+ *   'PUBLIC' S PubidLiteral S SystemLiteral
+ *   'PUBLIC' S PubidLiteral                   (accepted only if allow_public != 0)
+ *
+ * Anything else is a fatal error. */
+static void
+xml_dtd_parse_external_id(struct xml_context *ctx, char **system_id, char **public_id, uns allow_public)
+{
+  struct xml_dtd *dtd = ctx->dtd;
+  switch (xml_peek_char(ctx))
+    {
+    case 'S':
+      xml_parse_seq(ctx, "SYSTEM");
+      xml_parse_dtd_white(ctx, 1);
+      *public_id = NULL;
+      *system_id = xml_parse_system_literal(ctx, dtd->pool);
+      break;
+    case 'P':
+      xml_parse_seq(ctx, "PUBLIC");
+      xml_parse_dtd_white(ctx, 1);
+      *system_id = NULL;
+      *public_id = xml_parse_pubid_literal(ctx, dtd->pool);
+      /* The separator is mandatory exactly when the system literal is */
+      if (xml_parse_dtd_white(ctx, !allow_public))
+        {
+          uns quote = xml_peek_char(ctx);
+          if (quote == '\'' || quote == '"' || !allow_public)
+            *system_id = xml_parse_system_literal(ctx, dtd->pool);
+        }
+      break;
+    default:
+      xml_fatal(ctx, "Expected an external ID");
+    }
+}
+
+/* DTD: <!NOTATION ...> */
+
+/* Parse a notation declaration and record it in the DTD.
+ *
+ * NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>'
+ * Already parsed: '<!NOTATION'
+ *
+ * The name and both identifiers are allocated from the DTD pool.  A repeated
+ * declaration of the same notation only produces a warning and the first
+ * declaration is kept. */
+void
+xml_parse_notation_decl(struct xml_context *ctx)
+{
+  TRACE(ctx, "parse_notation_decl");
+  struct xml_dtd *dtd = ctx->dtd;
+  xml_parse_dtd_white(ctx, 1);
+
+  /* lookup() finds the notation or creates a new, not yet declared record */
+  struct xml_dtd_notn *notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx, dtd->pool));
+  xml_parse_dtd_white(ctx, 1);
+  char *system_id, *public_id;
+  xml_dtd_parse_external_id(ctx, &system_id, &public_id, 1);
+  xml_parse_dtd_white(ctx, 0);
+  xml_parse_char(ctx, '>');
+
+  if (notn->flags & XML_DTD_NOTN_DECLARED)
+    xml_warn(ctx, "Notation %s already declared", notn->name);
+  else
+    {
+      notn->flags = XML_DTD_NOTN_DECLARED;
+      notn->system_id = system_id;
+      notn->public_id = public_id;
+      /* Fixed: the argument was corrupted to a '¬' character; it is the address of the list node */
+      slist_add_tail(&dtd->notns, &notn->n);
+    }
+  xml_dec(ctx);
+}
+
+/* DTD: <!ENTITY ...> */
+
+/* Parse a declaration of a general or parameter entity and record it in the DTD.
+ *
+ * GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
+ * PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
+ * EntityDef ::= EntityValue | (ExternalID NDataDecl?)
+ * PEDef ::= EntityValue | ExternalID
+ * NDataDecl ::= S 'NDATA' S Name
+ * Already parsed: '<!ENTITY'
+ *
+ * Names, replacement texts and identifiers are allocated from the DTD pool.
+ * Redeclaration of an entity is currently a fatal error. */
+void
+xml_parse_entity_decl(struct xml_context *ctx)
+{
+  TRACE(ctx, "parse_entity_decl");
+  struct xml_dtd *dtd = ctx->dtd;
+
+  /* xml_parse_dtd_white(~0U) returns ~0U iff it found the '%' of a parameter entity declaration */
+  uns flags = ~xml_parse_dtd_white(ctx, ~0U) ? 0 : XML_DTD_ENTITY_PARAMETER;
+  if (flags)
+    xml_parse_dtd_white(ctx, 1);
+  struct xml_dtd_entity *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_ents, xml_parse_name(ctx, dtd->pool));
+  xml_parse_dtd_white(ctx, 1);
+  slist *list = flags ? &dtd->pents : &dtd->ents;
+  if (ent->flags & XML_DTD_ENTITY_DECLARED)
+    {
+      xml_fatal(ctx, "Entity &%s; already declared, skipping not implemented", ent->name);
+      // FIXME: should be only warning
+    }
+  uns c, sep = xml_get_char(ctx);
+  if (sep == '\'' || sep == '"')
+    {
+      /* Internal entity:
+       * EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&'] | PEReference | Reference)* "'" */
+      char *p = mp_start_noalign(dtd->pool, 1);
+      while (1)
+        {
+          if ((c = xml_get_char(ctx)) == sep)
+            break;
+          if (c == '%')
+            {
+              // FIXME
+              ASSERT(0);
+              //xml_parse_parameter_ref(ctx);
+              continue;
+            }
+          if (c == '&')
+            {
+              xml_inc(ctx);
+              if (xml_peek_char(ctx) != '#')
+                {
+                  /* Bypass references to general entities: copy "&name;" verbatim */
+                  struct mempool_state state;
+                  mp_save(ctx->stack, &state);
+                  char *n = xml_parse_name(ctx, ctx->stack);
+                  xml_parse_char(ctx, ';');
+                  xml_dec(ctx);
+                  uns l = strlen(n);
+                  /* Room for '&', the name, ';' and the final terminator */
+                  p = mp_spread(dtd->pool, p, 3 + l);
+                  *p++ = '&';
+                  memcpy(p, n, l);
+                  p += l;
+                  *p++ = ';';
+                  mp_restore(ctx->stack, &state);
+                  continue;
+                }
+              else
+                {
+                  /* Character reference: expand it and store the character below */
+                  xml_skip_char(ctx);
+                  c = xml_parse_char_ref(ctx);
+                }
+            }
+          p = mp_spread(dtd->pool, p, 5);
+          p = utf8_32_put(p, c);
+        }
+      *p = 0;
+      ent->len = p - (char *)mp_ptr(dtd->pool);
+      ent->text = mp_end(dtd->pool, p + 1);
+      slist_add_tail(list, &ent->n);
+      ent->flags = flags | XML_DTD_ENTITY_DECLARED;
+    }
+  else
+    {
+      /* External entity */
+      struct xml_dtd_notn *notn = NULL;
+      char *system_id, *public_id;
+      xml_unget_char(ctx);
+      xml_dtd_parse_external_id(ctx, &system_id, &public_id, 0);
+      /* NDataDecl is allowed for general entities only (flags == 0 here).
+       * The test used to be `flags`, which demanded NDATA after parameter
+       * entities and made unparsed general entities impossible to declare. */
+      if (xml_parse_dtd_white(ctx, 0) && !flags && xml_peek_char(ctx) != '>')
+        {
+          /* General external unparsed entity */
+          flags |= XML_DTD_ENTITY_UNPARSED;
+          xml_parse_seq(ctx, "NDATA");
+          xml_parse_dtd_white(ctx, 1);
+          notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx, dtd->pool));
+        }
+      slist_add_tail(list, &ent->n);
+      ent->flags = flags | XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_EXTERNAL;
+      ent->system_id = system_id;
+      ent->public_id = public_id;
+      ent->notn = notn;
+    }
+  xml_parse_dtd_white(ctx, 0);
+  xml_parse_char(ctx, '>');
+  xml_dec(ctx);
+}
+
+/* DTD: <!ELEMENT ...> */
+
+/* Parse an element type declaration and build the content model of the element
+ * as a tree of struct xml_dtd_elem_node allocated from the DTD pool.
+ *
+ * NOTE(review): XML_DTD_ELEM_DECLARED is tested below but never set in this
+ * function, so the duplicate check looks ineffective -- confirm. */
+void
+xml_parse_element_decl(struct xml_context *ctx)
+{
+ /* Elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
+ * Already parsed: '<!ELEMENT' */
+ struct xml_dtd *dtd = ctx->dtd;
+ xml_parse_dtd_white(ctx, 1);
+ char *name = xml_parse_name(ctx, dtd->pool);
+ xml_parse_dtd_white(ctx, 1);
+ struct xml_dtd_elem *elem = xml_dtd_elems_lookup(dtd->tab_elems, name);
+ if (elem->flags & XML_DTD_ELEM_DECLARED)
+ xml_fatal(ctx, "Element <%s> already declared", name);
+
+ /* contentspec ::= 'EMPTY' | 'ANY' | Mixed | children */
+ uns c = xml_peek_char(ctx);
+ if (c == 'E')
+ {
+ xml_parse_seq(ctx, "EMPTY");
+ elem->type = XML_DTD_ELEM_EMPTY;
+ }
+ else if (c == 'A')
+ {
+ xml_parse_seq(ctx, "ANY");
+ elem->type = XML_DTD_ELEM_ANY;
+ }
+ else if (c == '(')
+ {
+ xml_skip_char(ctx);
+ xml_inc(ctx);
+ xml_parse_dtd_white(ctx, 0);
+ /* Root node of the content model */
+ struct xml_dtd_elem_node *parent = elem->node = mp_alloc_zero(dtd->pool, sizeof(*parent));
+ if (xml_peek_char(ctx) == '#')
+ {
+ /* Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | '(' S? '#PCDATA' S? ')' */
+ xml_skip_char(ctx);
+ xml_parse_seq(ctx, "PCDATA");
+ elem->type = XML_DTD_ELEM_MIXED;
+ parent->type = XML_DTD_ELEM_PCDATA;
+ while (1)
+ {
+ xml_parse_dtd_white(ctx, 0);
+ if ((c = xml_get_char(ctx)) == ')')
+ break;
+ else if (c != '|')
+ xml_fatal_expected(ctx, ')');
+ xml_parse_dtd_white(ctx, 0);
+ struct xml_dtd_elem *son_elem = xml_dtd_elems_lookup(dtd->tab_elems, xml_parse_name(ctx, dtd->pool));
+ /* Each element may be listed only once in mixed content */
+ if (xml_dtd_enodes_find(dtd->tab_enodes, parent, son_elem))
+ xml_error(ctx, "Duplicate content '%s'", son_elem->name);
+ else
+ {
+ struct xml_dtd_elem_node *son = xml_dtd_enodes_new(dtd->tab_enodes, parent, son_elem);
+ slist_add_tail(&parent->sons, &son->n);
+ }
+ }
+ xml_dec(ctx);
+ /* The trailing '*' may be omitted only if no element names were listed */
+ if (xml_peek_char(ctx) == '*')
+ {
+ xml_skip_char(ctx);
+ parent->occur = XML_DTD_ELEM_OCCUR_MULT;
+ }
+ else if (!slist_head(&parent->sons))
+ parent->occur = XML_DTD_ELEM_OCCUR_ONCE;
+ else
+ xml_fatal_expected(ctx, '*');
+ }
+ else
+ {
+ /* children ::= (choice | seq) ('?' | '*' | '+')?
+ * cp ::= (Name | choice | seq) ('?' | '*' | '+')?
+ * choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')'
+ * seq ::= '(' S? cp ( S? ',' S? cp )* S? ')' */
+
+ elem->type = XML_DTD_ELEM_CHILDREN;
+ /* XML_DTD_ELEM_PCDATA serves here as "operator of this group not known yet" */
+ parent->type = XML_DTD_ELEM_PCDATA;
+ /* NOTE(review): this declaration shadows the outer `c` */
+ uns c;
+ /* Enter the loop in its second half: the first token is expected to be a name */
+ goto first;
+
+ while (1)
+ {
+ /* After name */
+ xml_parse_dtd_white(ctx, 0);
+ if ((c = xml_get_char(ctx)) == ')')
+ {
+ /* End of a group: fix its type, read the occurrence indicator and go one level up */
+ xml_dec(ctx);
+ if (parent->type == XML_DTD_ELEM_PCDATA)
+ parent->type = XML_DTD_ELEM_SEQ;
+ if ((c = xml_get_char(ctx)) == '?')
+ parent->occur = XML_DTD_ELEM_OCCUR_OPT;
+ else if (c == '*')
+ parent->occur = XML_DTD_ELEM_OCCUR_MULT;
+ else if (c == '+')
+ parent->occur = XML_DTD_ELEM_OCCUR_PLUS;
+ else
+ {
+ xml_unget_char(ctx);
+ parent->occur = XML_DTD_ELEM_OCCUR_ONCE;
+ }
+ /* The root group has no parent: the content model is complete */
+ if (!parent->parent)
+ break;
+ parent = parent->parent;
+ continue;
+ }
+ else if (c == '|')
+ {
+ if (parent->type == XML_DTD_ELEM_PCDATA)
+ parent->type = XML_DTD_ELEM_OR;
+ else if (parent->type != XML_DTD_ELEM_OR)
+ xml_fatal(ctx, "Mixed operators in the list of element children");
+ }
+ else if (c == ',')
+ {
+ if (parent->type == XML_DTD_ELEM_PCDATA)
+ parent->type = XML_DTD_ELEM_SEQ;
+ else if (parent->type != XML_DTD_ELEM_SEQ)
+ xml_fatal(ctx, "Mixed operators in the list of element children");
+ }
+ else if (c == '(')
+ {
+ /* NOTE(review): nested groups look unfinished.  `parent = son->parent`
+ * assigns `parent` to itself (son->parent was just set to it), so the parser
+ * never descends into the new group; XML_DTD_ELEM_MIXED is an element type
+ * constant stored into a node type field; and a '(' is only recognized here,
+ * in the position after a name.  Confirm the intended behavior. */
+ xml_inc(ctx);
+ struct xml_dtd_elem_node *son = mp_alloc_zero(dtd->pool, sizeof(*son));
+ son->parent = parent;
+ slist_add_tail(&parent->sons, &son->n);
+ parent = son->parent;
+ son->type = XML_DTD_ELEM_MIXED;
+ }
+ else
+ xml_unget_char(ctx);
+
+ /* Before name */
+ xml_parse_dtd_white(ctx, 0);
+first:;
+ struct xml_dtd_elem *son_elem = xml_dtd_elems_lookup(dtd->tab_elems, xml_parse_name(ctx, dtd->pool));
+ // FIXME: duplicates, occurrence
+ //struct xml_dtd_elem_node *son = xml_dtd_enodes_new(dtd->tab_enodes, parent, son_elem);
+ /* Leaf node referring to a named child element (not entered into tab_enodes) */
+ struct xml_dtd_elem_node *son = mp_alloc_zero(dtd->pool, sizeof(*son));
+ son->parent = parent;
+ son->elem = son_elem;
+ slist_add_tail(&parent->sons, &son->n);
+ }
+ }
+ }
+ else
+ xml_fatal(ctx, "Expected element content specification");
+
+ xml_parse_dtd_white(ctx, 0);
+ xml_parse_char(ctx, '>');
+ xml_dec(ctx);
+}
+
+/* Parse an attribute-list declaration and record the attribute definitions
+ * (type, enumerated values or notations, default mode and default value) in
+ * the DTD.  A repeated definition of an attribute is parsed but ignored with
+ * a warning, so the first definition stays in effect. */
+void
+xml_parse_attr_list_decl(struct xml_context *ctx)
+{
+ /* AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
+ * AttDef ::= S Name S AttType S DefaultDecl
+ * Already parsed: '<!ATTLIST' */
+ struct xml_dtd *dtd = ctx->dtd;
+ xml_parse_dtd_white(ctx, 1);
+ struct xml_dtd_elem *elem = xml_dtd_elems_lookup(ctx->dtd->tab_elems, xml_parse_name(ctx, dtd->pool));
+
+ /* One iteration per AttDef; the loop ends when no separator follows or '>' is next */
+ while (xml_parse_dtd_white(ctx, 0) && xml_peek_char(ctx) != '>')
+ {
+ char *name = xml_parse_name(ctx, dtd->pool);
+ struct xml_dtd_attr *attr = xml_dtd_attrs_find(dtd->tab_attrs, elem, name);
+ /* Non-zero when this AttDef repeats an existing one: it is parsed, but nothing is stored */
+ uns ignored = 0;
+ if (attr)
+ {
+ xml_warn(ctx, "Duplicate attribute definition");
+ ignored++;
+ }
+ else
+ attr = xml_dtd_attrs_new(ctx->dtd->tab_attrs, elem, name);
+ xml_parse_dtd_white(ctx, 1);
+ if (xml_peek_char(ctx) == '(')
+ {
+ /* Enumerated type: list of name tokens separated by '|' */
+ xml_skip_char(ctx); // FIXME: xml_inc/dec ?
+ if (!ignored)
+ attr->type = XML_ATTR_ENUM;
+ do
+ {
+ xml_parse_dtd_white(ctx, 0);
+ char *value = xml_parse_nmtoken(ctx, dtd->pool);
+ /* The `else` below belongs to the inner `if` */
+ if (!ignored)
+ if (xml_dtd_evals_find(ctx->dtd->tab_evals, attr, value))
+ xml_error(ctx, "Duplicate enumeration value");
+ else
+ xml_dtd_evals_new(ctx->dtd->tab_evals, attr, value);
+ xml_parse_dtd_white(ctx, 0);
+ }
+ while (xml_get_char(ctx) == '|');
+ /* Put back the character that ended the list so that it can be checked */
+ xml_unget_char(ctx);
+ xml_parse_char(ctx, ')');
+ }
+ else
+ {
+ /* Type given by a keyword */
+ char *type = xml_parse_name(ctx, dtd->pool);
+ enum xml_dtd_attr_type t = XML_ATTR_CDATA;
+ if (!strcmp(type, "CDATA"))
+ t = XML_ATTR_CDATA;
+ else if (!strcmp(type, "ID"))
+ t = XML_ATTR_ID;
+ else if (!strcmp(type, "IDREF"))
+ t = XML_ATTR_IDREF;
+ else if (!strcmp(type, "IDREFS"))
+ t = XML_ATTR_IDREFS;
+ else if (!strcmp(type, "ENTITY"))
+ t = XML_ATTR_ENTITY;
+ else if (!strcmp(type, "ENTITIES"))
+ t = XML_ATTR_ENTITIES;
+ else if (!strcmp(type, "NMTOKEN"))
+ t = XML_ATTR_NMTOKEN;
+ else if (!strcmp(type, "NMTOKENS"))
+ t = XML_ATTR_NMTOKENS;
+ else if (!strcmp(type, "NOTATION"))
+ {
+ if (elem->type == XML_DTD_ELEM_EMPTY)
+ xml_fatal(ctx, "Empty element must not have notation attribute");
+ // FIXME: An element type MUST NOT have more than one NOTATION attribute specified.
+ t = XML_ATTR_NOTATION;
+ xml_parse_dtd_white(ctx, 1);
+ xml_parse_char(ctx, '(');
+ /* List of notation names separated by '|' */
+ do
+ {
+ xml_parse_dtd_white(ctx, 0);
+ struct xml_dtd_notn *n = xml_dtd_notns_lookup(ctx->dtd->tab_notns, xml_parse_name(ctx, dtd->pool));
+ /* The `else` below belongs to the inner `if` */
+ if (!ignored)
+ if (xml_dtd_enotns_find(ctx->dtd->tab_enotns, attr, n))
+ xml_error(ctx, "Duplicate enumerated notation");
+ else
+ xml_dtd_enotns_new(ctx->dtd->tab_enotns, attr, n);
+ xml_parse_dtd_white(ctx, 0);
+ }
+ while (xml_get_char(ctx) == '|');
+ xml_unget_char(ctx);
+ xml_parse_char(ctx, ')');
+ }
+ else
+ xml_fatal(ctx, "Unknown attribute type");
+ if (!ignored)
+ attr->type = t;
+ }
+ /* DefaultDecl: '#REQUIRED', '#IMPLIED', or an attribute value optionally preceded by '#FIXED' S */
+ xml_parse_dtd_white(ctx, 1);
+ enum xml_dtd_attr_default def = XML_ATTR_NONE;
+ if (xml_get_char(ctx) == '#')
+ switch (xml_peek_char(ctx))
+ {
+ case 'R':
+ xml_parse_seq(ctx, "REQUIRED");
+ def = XML_ATTR_REQUIRED;
+ break;
+ case 'I':
+ xml_parse_seq(ctx, "IMPLIED");
+ def = XML_ATTR_IMPLIED;
+ break;
+ case 'F':
+ xml_parse_seq(ctx, "FIXED");
+ def = XML_ATTR_FIXED;
+ xml_parse_dtd_white(ctx, 1);
+ break;
+ default:
+ xml_fatal(ctx, "Expected a modifier for default attribute value");
+ }
+ else
+ xml_unget_char(ctx);
+ /* A default value follows unless the attribute is #REQUIRED or #IMPLIED */
+ if (def != XML_ATTR_REQUIRED && def != XML_ATTR_IMPLIED)
+ {
+ char *v = xml_parse_attr_value(ctx, attr);
+ if (!ignored)
+ attr->default_value = v;
+ }
+ if (!ignored)
+ attr->default_mode = def;
+ }
+ /* NOTE(review): the next character is consumed without checking that it is '>' */
+ xml_skip_char(ctx);
+ xml_dec(ctx);
+}
+
+/* Skip the internal DTD subset without interpreting it: everything up to the
+ * closing ']' is consumed.  Processing instructions and comments are skipped
+ * by their dedicated helpers; in other '<!...>' declarations quoted literals
+ * are skipped as a whole, so that a '>' inside quotes does not end the
+ * declaration. */
+void
+xml_skip_internal_subset(struct xml_context *ctx)
+{
+ TRACE(ctx, "skip_internal_subset");
+ /* AlreadyParsed: '[' */
+ uns c;
+ while ((c = xml_get_char(ctx)) != ']')
+ {
+ if (c != '<')
+ continue;
+ if ((c = xml_get_char(ctx)) == '?')
+ {
+ /* '<?' -- processing instruction */
+ xml_inc(ctx);
+ xml_skip_pi(ctx);
+ }
+ /* NOTE(review): '<' followed by neither '?' nor '!' only decrements the
+ * depth without a matching xml_inc() in this function -- confirm the intent */
+ else if (c != '!')
+ xml_dec(ctx);
+ else if (xml_get_char(ctx) == '-')
+ {
+ /* '<!-' -- comment */
+ xml_inc(ctx);
+ xml_skip_comment(ctx);
+ }
+ else
+ /* Other '<!' declaration: scan to '>' and jump over quoted literals */
+ while ((c = xml_get_char(ctx)) != '>')
+ if (c == '\'' || c == '"')
+ while (xml_get_char(ctx) != c);
+ }
+ xml_dec(ctx);
+}
+
+/*** Validation of attribute values ***/
+
+/* Check that a NUL-terminated UTF-8 string consists of well-formed tokens.
+ * The first character of each token must belong to a category in first_cat,
+ * all following ones to next_cat.  With seq == 0 exactly one token is
+ * accepted; otherwise tokens may be separated by single #x20 characters.
+ * Returns 1 on success, 0 otherwise. */
+static uns
+xml_check_tokens(char *value, uns first_cat, uns next_cat, uns seq)
+{
+  uns ch;
+  char *pos = value;
+  for (;;)
+    {
+      /* First character of a token */
+      pos = utf8_32_get(pos, &ch);
+      if (!(xml_char_cat(ch) & first_cat))
+        return 0;
+
+      /* Remaining characters, up to a NUL or #x20 byte */
+      while (*pos & ~0x20)
+        {
+          pos = utf8_32_get(pos, &ch);
+          if (!(xml_char_cat(ch) & next_cat))
+            return 0;
+        }
+
+      if (!*pos)
+        return 1;
+      if (!seq)
+        return 0;
+      /* Step over the separating space */
+      pos++;
+    }
+}
+
+/* Non-zero iff value matches: Name ::= NameStartChar (NameChar)* */
+static uns
+xml_is_name(struct xml_context *ctx, char *value)
+{
+  uns start_cat = ctx->cat_sname;
+  uns rest_cat = ctx->cat_name;
+  return xml_check_tokens(value, start_cat, rest_cat, 0);
+}
+
+/* Non-zero iff value matches: Names ::= Name (#x20 Name)* */
+static uns
+xml_is_names(struct xml_context *ctx, char *value)
+{
+  uns start_cat = ctx->cat_sname;
+  uns rest_cat = ctx->cat_name;
+  return xml_check_tokens(value, start_cat, rest_cat, 1);
+}
+
+/* Non-zero iff value matches: Nmtoken ::= (NameChar)+ */
+static uns
+xml_is_nmtoken(struct xml_context *ctx, char *value)
+{
+  uns any_cat = ctx->cat_name;
+  return xml_check_tokens(value, any_cat, any_cat, 0);
+}
+
+/* Non-zero iff value matches: Nmtokens ::= Nmtoken (#x20 Nmtoken)* */
+static uns
+xml_is_nmtokens(struct xml_context *ctx, char *value)
+{
+  uns any_cat = ctx->cat_name;
+  return xml_check_tokens(value, any_cat, any_cat, 1);
+}
+
+/* Report that the value of an attribute does not match the grammar production
+ * named by `type`; the attribute and its owner element are taken from `dtd`. */
+static void
+xml_err_attr_format(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *type)
+{
+  char *attr_name = dtd->name;
+  char *elem_name = dtd->elem->name;
+  xml_error(ctx, "Attribute %s in <%s> does not match the production of %s", attr_name, elem_name, type);
+}
+
+/* Validate an attribute value against its DTD definition `dtd`.
+ * CDATA values are accepted untouched.  For all other types the value is first
+ * passed to xml_normalize_white() and then checked according to the declared
+ * type; violations are reported by xml_error(). */
+void
+xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value)
+{
+  if (dtd->type == XML_ATTR_CDATA)
+    return;
+  xml_normalize_white(ctx, value);
+
+  /* Name of the violated production, or NULL if the format is fine */
+  char *production = NULL;
+  switch (dtd->type)
+    {
+      case XML_ATTR_ID:
+        if (!xml_is_name(ctx, value))
+          production = "NAME";
+        //FIXME: add to a hash table
+        break;
+      case XML_ATTR_IDREF:
+        if (!xml_is_name(ctx, value))
+          production = "NAME";
+        // FIXME: find in hash table (beware forward references)
+        break;
+      case XML_ATTR_IDREFS:
+        if (!xml_is_names(ctx, value))
+          production = "NAMES";
+        // FIXME: find
+        break;
+      case XML_ATTR_ENTITY:
+      case XML_ATTR_ENTITIES:
+        // FIXME
+        break;
+      case XML_ATTR_NMTOKEN:
+        if (!xml_is_nmtoken(ctx, value))
+          production = "NMTOKEN";
+        break;
+      case XML_ATTR_NMTOKENS:
+        if (!xml_is_nmtokens(ctx, value))
+          production = "NMTOKENS";
+        break;
+      case XML_ATTR_ENUM:
+        if (!xml_dtd_evals_find(ctx->dtd->tab_evals, dtd, value))
+          xml_error(ctx, "Attribute %s in <%s> contains an undefined enumeration value", dtd->name, dtd->elem->name);
+        break;
+      case XML_ATTR_NOTATION:
+        if (!xml_dtd_find_notn(ctx, value))
+          xml_error(ctx, "Attribute %s in <%s> contains an undefined notation", dtd->name, dtd->elem->name);
+        break;
+    }
+
+  if (production)
+    xml_err_attr_format(ctx, dtd, production);
+}
--- /dev/null
+/*
+ * Sherlock Library -- A simple XML parser
+ *
+ * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
+ *
+ * This software may be freely distributed and used according to the terms
+ * of the GNU Lesser General Public License.
+ */
+
+#ifndef _SHERLOCK_XML_DTD_H
+#define _SHERLOCK_XML_DTD_H
+
+#include "sherlock/xml/xml.h"
+
+/* In-memory representation of a document type definition.  The tab_* members
+ * are opaque pointers to hash tables whose types are private to the
+ * implementation. */
+struct xml_dtd {
+ struct mempool *pool; /* Memory pool where to allocate DTD */
+ slist ents; /* Link list of general entities */
+ slist pents; /* Link list of parameter entities */
+ slist notns; /* Link list of notations */
+ slist elems; /* Link list of elements */
+ void *tab_ents; /* Hash table of general entities */
+ void *tab_pents; /* Hash table of parameter entities */
+ void *tab_notns; /* Hash table of notations */
+ void *tab_elems; /* Hash table of elements */
+ void *tab_enodes; /* Hash table of element sons */
+ void *tab_attrs; /* Hash table of element attributes */
+ void *tab_evals; /* Hash table of enumerated attribute values */
+ void *tab_enotns; /* Hash table of enumerated attribute notations */
+};
+
+/* Notations */
+
+/* Bits of xml_dtd_notn.flags */
+enum xml_dtd_notn_flags {
+ XML_DTD_NOTN_DECLARED = 0x1, /* The notation has been declared (internal usage) */
+};
+
+/* A notation; a record may exist before (or without) its declaration, in which
+ * case XML_DTD_NOTN_DECLARED is not set. */
+struct xml_dtd_notn {
+ snode n; /* Node in xml_dtd.notns */
+ uns flags; /* XML_DTD_NOTN_x */
+ char *name; /* Notation name */
+ char *system_id; /* External ID */
+ char *public_id; /* Public ID (or NULL) */
+ void *user; /* User-defined */
+};
+
+struct xml_dtd_notn *xml_dtd_find_notn(struct xml_context *ctx, char *name);
+
+/* Entities */
+
+/* Bits of xml_dtd_entity.flags */
+enum xml_dtd_entity_flags {
+ XML_DTD_ENTITY_DECLARED = 0x1, /* The entity has been declared (internal usage) */
+ XML_DTD_ENTITY_VISITED = 0x2, /* Cycle detection (internal usage) */
+ XML_DTD_ENTITY_PARAMETER = 0x4, /* Parameter entity, general otherwise */
+ XML_DTD_ENTITY_EXTERNAL = 0x8, /* External entity, internal otherwise */
+ XML_DTD_ENTITY_UNPARSED = 0x10, /* Unparsed entity, parsed otherwise */
+ XML_DTD_ENTITY_TRIVIAL = 0x20, /* Replacement text is a sequence of characters and character references */
+};
+
+/* A general or parameter entity */
+struct xml_dtd_entity {
+ snode n; /* Node in xml_dtd.ents or xml_dtd.pents */
+ uns flags; /* XML_DTD_ENTITY_x */
+ char *name; /* Entity name */
+ char *text; /* Replacement text / expanded replacement text (XML_DTD_ENTITY_TRIVIAL) */
+ uns len; /* Text length */
+ char *system_id; /* External ID */
+ char *public_id; /* Public ID (or NULL) */
+ struct xml_dtd_notn *notn; /* Notation (XML_DTD_ENTITY_UNPARSED only) */
+ void *user; /* User-defined */
+};
+
+struct xml_dtd_entity *xml_dtd_find_entity(struct xml_context *ctx, char *name);
+
+/* Elements */
+
+/* Bits of xml_dtd_elem.flags */
+enum xml_dtd_elem_flags {
+ XML_DTD_ELEM_DECLARED = 0x1, /* The element has been declared (internal usage) */
+};
+
+/* Kind of element content (xml_dtd_elem.type) */
+enum xml_dtd_elem_type {
+ XML_DTD_ELEM_EMPTY,
+ XML_DTD_ELEM_ANY,
+ XML_DTD_ELEM_MIXED,
+ XML_DTD_ELEM_CHILDREN,
+};
+
+/* An element type */
+struct xml_dtd_elem {
+ snode n; /* List node; presumably for xml_dtd.elems -- TODO confirm */
+ uns flags; /* XML_DTD_ELEM_x flags */
+ uns type; /* See enum xml_dtd_elem_type */
+ char *name; /* Element name */
+ struct xml_dtd_elem_node *node; /* Root of the content model (mixed or children content) */
+ slist attrs; /* List of attribute definitions (struct xml_dtd_attr) */
+ void *user; /* User-defined */
+};
+
+/* A node of the content model tree of an element */
+struct xml_dtd_elem_node {
+ snode n; /* Node in the parent's list of sons */
+ struct xml_dtd_elem_node *parent; /* Parent node (NULL for the root) */
+ struct xml_dtd_elem *elem; /* Referenced element (nodes standing for a name) */
+ slist sons; /* List of sub-nodes */
+ uns type; /* See enum xml_dtd_elem_node_type */
+ uns occur; /* See enum xml_dtd_elem_node_occur */
+ void *user; /* User-defined */
+};
+
+/* Values of xml_dtd_elem_node.type.
+ * NOTE(review): these share the XML_DTD_ELEM_ prefix with enum xml_dtd_elem_type
+ * and both enums start at zero, so mixing them up goes unnoticed by the compiler. */
+enum xml_dtd_elem_node_type {
+ XML_DTD_ELEM_PCDATA,
+ XML_DTD_ELEM_SEQ,
+ XML_DTD_ELEM_OR,
+};
+
+/* Values of xml_dtd_elem_node.occur; the parser maps no indicator to ONCE,
+ * '?' to OPT, '*' to MULT and '+' to PLUS */
+enum xml_dtd_elem_node_occur {
+ XML_DTD_ELEM_OCCUR_ONCE,
+ XML_DTD_ELEM_OCCUR_OPT,
+ XML_DTD_ELEM_OCCUR_MULT,
+ XML_DTD_ELEM_OCCUR_PLUS,
+};
+
+struct xml_dtd_elem *xml_dtd_find_elem(struct xml_context *ctx, char *name);
+
+/* Attributes */
+
+/* Default declaration of an attribute (xml_dtd_attr.default_mode) */
+enum xml_dtd_attr_default {
+ XML_ATTR_NONE, /* Plain default value */
+ XML_ATTR_REQUIRED, /* #REQUIRED */
+ XML_ATTR_IMPLIED, /* #IMPLIED */
+ XML_ATTR_FIXED, /* #FIXED default value */
+};
+
+/* Declared type of an attribute (xml_dtd_attr.type) */
+enum xml_dtd_attr_type {
+ XML_ATTR_CDATA,
+ XML_ATTR_ID,
+ XML_ATTR_IDREF,
+ XML_ATTR_IDREFS,
+ XML_ATTR_ENTITY,
+ XML_ATTR_ENTITIES,
+ XML_ATTR_NMTOKEN,
+ XML_ATTR_NMTOKENS,
+ XML_ATTR_ENUM, /* Enumeration of name tokens, see struct xml_dtd_eval */
+ XML_ATTR_NOTATION, /* Enumeration of notations, see struct xml_dtd_enotn */
+};
+
+/* Definition of an attribute of an element */
+struct xml_dtd_attr {
+ snode n; /* Node in xml_dtd_elem.attrs */
+ char *name; /* Attribute name */
+ struct xml_dtd_elem *elem; /* Owner element */
+ uns type; /* See enum xml_dtd_attr_type */
+ uns default_mode; /* See enum xml_dtd_attr_default */
+ char *default_value; /* The default value defined in DTD (or NULL) */
+};
+
+/* One allowed value of an enumerated attribute */
+struct xml_dtd_eval {
+ struct xml_dtd_attr *attr; /* Attribute the value belongs to */
+ char *val; /* The value */
+};
+
+/* One allowed notation of a NOTATION attribute */
+struct xml_dtd_enotn {
+ struct xml_dtd_attr *attr; /* Attribute the notation belongs to */
+ struct xml_dtd_notn *notn; /* The notation */
+};
+
+void xml_dtd_init(struct xml_context *ctx);
+void xml_dtd_cleanup(struct xml_context *ctx);
+void xml_dtd_finish(struct xml_context *ctx);
+
+struct xml_dtd_attr *xml_dtd_find_attr(struct xml_context *ctx, struct xml_dtd_elem *elem, char *name);
+
+#endif
--- /dev/null
+/*
+ * Sherlock Library -- A simple XML parser
+ *
+ * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
+ *
+ * This software may be freely distributed and used according to the terms
+ * of the GNU Lesser General Public License.
+ */
+
+#ifndef _SHERLOCK_XML_INTERNALS_H
+#define _SHERLOCK_XML_INTERNALS_H
+
+#include "sherlock/xml/xml.h"
+#include "sherlock/xml/dtd.h"
+
+/*** Debugging ***/
+
+/* TRACE(ctx, fmt, args...) -- debugging message prefixed with the current row
+ * of the parsed document (xml_row()).  Expands to an empty statement unless
+ * LOCAL_DEBUG is defined. */
+#ifdef LOCAL_DEBUG
+#define TRACE(c, f, p...) do { DBG("XML %u: " f, xml_row(c), ##p); } while(0)
+#else
+#define TRACE(c, f, p...) do {} while(0)
+#endif
+
+/*** Error handling ***/
+
+void NONRET xml_throw(struct xml_context *ctx);
+
+/*** Memory management ***/
+
+/* One frame of the parser's state stack, allocated from ctx->stack and linked
+ * through ctx->stack_list (see xml_do_push() and xml_do_pop()). */
+struct xml_stack {
+ struct xml_stack *next; /* Previous top of ctx->stack_list */
+ struct mempool_state state; /* State of ctx->stack saved before this frame was allocated */
+ uns flags; /* Value of ctx->flags at the time of the push */
+};
+
+/* Push a new frame of `size` bytes (beginning with a struct xml_stack) on the
+ * state stack.  The frame remembers ctx->flags and the state of ctx->stack
+ * from before its own allocation.  Returns the new frame. */
+static inline void *
+xml_do_push(struct xml_context *ctx, uns size)
+{
+  struct mempool_state saved;
+  mp_save(ctx->stack, &saved);
+
+  struct xml_stack *frame = mp_alloc(ctx->stack, size);
+  frame->next = ctx->stack_list;
+  frame->flags = ctx->flags;
+  frame->state = saved;
+
+  ctx->stack_list = frame;
+  return frame;
+}
+
+/* Pop frame `s` from the state stack: ctx->flags and the state of ctx->stack
+ * are returned to what they were at the time of the matching xml_do_push(). */
+static inline void
+xml_do_pop(struct xml_context *ctx, struct xml_stack *s)
+{
+  ctx->flags = s->flags;
+  ctx->stack_list = s->next;
+  /* Must come last: the frame was allocated from ctx->stack after the saved state */
+  mp_restore(ctx->stack, &s->state);
+}
+
+/* Save ctx->flags and the state of ctx->stack in a plain stack frame. */
+static inline void
+xml_push(struct xml_context *ctx)
+{
+  TRACE(ctx, "push");
+  (void) xml_do_push(ctx, sizeof(struct xml_stack));
+}
+
+/* Drop the topmost stack frame, restoring the state saved by xml_push().
+ * The stack must not be empty. */
+static inline void
+xml_pop(struct xml_context *ctx)
+{
+  TRACE(ctx, "pop");
+  struct xml_stack *top = ctx->stack_list;
+  ASSERT(top);
+  xml_do_pop(ctx, top);
+}
+
+/* Stack frame used for DOM nodes: a plain frame extended by a saved state of
+ * ctx->pool, which xml_pop_dom() restores when the node is to be freed. */
+struct xml_dom_stack {
+ struct xml_stack stack; /* Must be first: the frame is linked in ctx->stack_list as struct xml_stack */
+ struct mempool_state state; /* State of ctx->pool to return to when freeing the node */
+};
+
+/* Create a new DOM node as a son of the current node and make it current.
+ * A stack frame is pushed which remembers either the given `state` of
+ * ctx->pool or, if state is NULL, the state of the pool right before the node
+ * is allocated.  Only the `user` and `parent` fields of the node are
+ * initialized here.  Returns the new node. */
+static inline struct xml_node *
+xml_push_dom(struct xml_context *ctx, struct mempool_state *state)
+{
+  TRACE(ctx, "push_dom");
+  struct xml_dom_stack *frame = xml_do_push(ctx, sizeof(*frame));
+  if (!state)
+    mp_save(ctx->pool, &frame->state);
+  else
+    frame->state = *state;
+
+  struct xml_node *node = mp_alloc(ctx->pool, sizeof(*node));
+  node->user = NULL;
+  node->parent = ctx->node;
+  if (node->parent)
+    clist_add_tail(&node->parent->sons, &node->n);
+
+  ctx->node = node;
+  return node;
+}
+
+/* Leave the current DOM node: its parent becomes current and the stack frame
+ * created by xml_push_dom() is popped.  With free != 0 the node is also
+ * unlinked from its parent and ctx->pool is returned to the state remembered
+ * in the frame. */
+static inline void
+xml_pop_dom(struct xml_context *ctx, uns free)
+{
+  TRACE(ctx, "pop_dom");
+  ASSERT(ctx->node);
+  struct xml_node *leaving = ctx->node;
+  struct xml_node *up = leaving->parent;
+  struct xml_dom_stack *frame = (void *)ctx->stack_list;
+
+  if (free)
+    {
+      /* See xml_pop_element() for cleanup of attribute hash table */
+      if (up)
+        clist_remove(&leaving->n);
+      mp_restore(ctx->pool, &frame->state);
+    }
+
+  ctx->node = up;
+  xml_do_pop(ctx, &frame->stack);
+}
+
+/* Allocation glue for hash tables placed in memory pools.
+ * XML_HASH_HDR_SIZE is the size of a header located right before the table
+ * structure; the allocator below reads a pointer from that header and uses it
+ * as the memory pool (presumably stored there by xml_hash_new() -- TODO confirm).
+ * XML_HASH_GIVE_ALLOC expands, for the current HASH_PREFIX, to the alloc
+ * callback just described and to a free callback which does nothing. */
+#define XML_HASH_HDR_SIZE ALIGN_TO(sizeof(void *), CPU_STRUCT_ALIGN)
+#define XML_HASH_GIVE_ALLOC struct HASH_PREFIX(table); \
+ static inline void *HASH_PREFIX(alloc)(struct HASH_PREFIX(table) *t, uns size) \
+ { return mp_alloc(*(void **)((void *)t - XML_HASH_HDR_SIZE), size); } \
+ static inline void HASH_PREFIX(free)(struct HASH_PREFIX(table) *t UNUSED, void *p UNUSED) {}
+
+void *xml_hash_new(struct mempool *pool, uns size);
+
+void xml_spout_chars(struct fastbuf *fb);
+
+/*** Reading of document/external entities ***/
+
+void NONRET xml_fatal_nested(struct xml_context *ctx);
+
+/* Enter a nested syntactic block: increases ctx->depth.
+ * Called after the first character of a block. */
+static inline void
+xml_inc(struct xml_context *ctx)
+{
+  TRACE(ctx, "inc");
+  ++ctx->depth;
+}
+
+/* Leave a nested syntactic block: decreases ctx->depth.
+ * Called after the last character of a block.  If the depth was already zero,
+ * xml_fatal_nested() is called (after the decrement has taken place). */
+static inline void
+xml_dec(struct xml_context *ctx)
+{
+  TRACE(ctx, "dec");
+  if (likely(ctx->depth--))
+    return;
+  xml_fatal_nested(ctx);
+}
+
+#include "obj/sherlock/xml/unicat.h"
+
+/* Return the category of character `c` as a one-bit mask.
+ * Code points below 0x10000 use the two-level table xml_char_tab2/xml_char_tab1,
+ * the remaining ones up to 0x10ffff a per-plane table xml_char_tab3; anything
+ * above that yields 1. */
+static inline uns
+xml_char_cat(uns c)
+{
+  if (unlikely(c >= 0x110000))
+    return 1;
+  if (c >= 0x10000)
+    return 1U << xml_char_tab3[c >> 16];
+  return 1U << xml_char_tab1[(c & 0xff) + xml_char_tab2[c >> 8]];
+}
+
+/* Return the raw xml_char_tab1 entry for `c`, which is used directly as the
+ * table index (so it must lie within the first block of the table).
+ * NOTE(review): unlike xml_char_cat(), the value is not converted to a bit
+ * mask with `1U <<` -- confirm that callers expect the raw table value. */
+static inline uns
+xml_ascii_cat(uns c)
+{
+  uns entry = xml_char_tab1[c];
+  return entry;
+}
+
+struct xml_source *xml_push_source(struct xml_context *ctx);
+void xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent);
+
+void xml_refill(struct xml_context *ctx);
+
+/* Return the next input character without consuming it; the buffer is
+ * refilled first when it is exhausted. */
+static inline uns
+xml_peek_char(struct xml_context *ctx)
+{
+  if (ctx->bptr == ctx->bstop)
+    xml_refill(ctx);
+  return *ctx->bptr;
+}
+
+/* Return the category of the next input character without consuming it.
+ * The buffer holds (character, category) pairs, so the category is the item
+ * right after the character. */
+static inline uns
+xml_peek_cat(struct xml_context *ctx)
+{
+  if (ctx->bptr == ctx->bstop)
+    xml_refill(ctx);
+  return *(ctx->bptr + 1);
+}
+
+/* Consume the next input character (refilling the buffer if needed) and return it. */
+static inline uns
+xml_get_char(struct xml_context *ctx)
+{
+  uns ch = xml_peek_char(ctx);
+  /* Step over the whole (character, category) pair */
+  ctx->bptr += 2;
+  return ch;
+}
+
+/* Consume the next input character (refilling the buffer if needed) and
+ * return its category. */
+static inline uns
+xml_get_cat(struct xml_context *ctx)
+{
+  uns cat = xml_peek_cat(ctx);
+  /* Step over the whole (character, category) pair */
+  ctx->bptr += 2;
+  return cat;
+}
+
+/* Return the most recently consumed character (the pair just before bptr). */
+static inline uns
+xml_last_char(struct xml_context *ctx)
+{
+  return *(ctx->bptr - 2);
+}
+
+/* Return the category of the most recently consumed character. */
+static inline uns
+xml_last_cat(struct xml_context *ctx)
+{
+  return *(ctx->bptr - 1);
+}
+
+/* Consume the character at the current buffer position and return it.
+ * No refill is attempted here, so the character must already be in the buffer
+ * (e.g. after a preceding xml_peek_char() or xml_peek_cat()). */
+static inline uns
+xml_skip_char(struct xml_context *ctx)
+{
+  uns ch = *ctx->bptr;
+  ctx->bptr += 2;
+  return ch;
+}
+
+/* Step back by one (character, category) pair and return the character which
+ * thereby becomes the next one to be read. */
+static inline uns
+xml_unget_char(struct xml_context *ctx)
+{
+  ctx->bptr -= 2;
+  return *ctx->bptr;
+}
+
+void xml_sources_cleanup(struct xml_context *ctx);
+
+/*** Parsing ***/
+
+void NONRET xml_fatal_expected(struct xml_context *ctx, uns c);
+void NONRET xml_fatal_expected_white(struct xml_context *ctx);
+void NONRET xml_fatal_expected_quot(struct xml_context *ctx);
+
+/*
+ * Consume a run of white-space characters and return how many were consumed.
+ * With a non-zero `mandatory`, an empty run raises a fatal error.
+ */
+static inline uns
+xml_parse_white(struct xml_context *ctx, uns mandatory)
+{
+  /* mandatory=1 -> S ::= (#x20 | #x9 | #xD | #xA)+
+   * mandatory=0 -> S? */
+  uns count;
+  for (count = 0; xml_peek_cat(ctx) & XML_CHAR_WHITE; count++)
+    xml_skip_char(ctx);
+  if (unlikely(!count && mandatory))
+    xml_fatal_expected_white(ctx);
+  return count;
+}
+
+static inline void
+xml_parse_char(struct xml_context *ctx, uns c)
+{
+ /* Consumes a given Unicode character */
+ if (unlikely(c != xml_get_char(ctx)))
+ xml_fatal_expected(ctx, c);
+}
+
+/* Consume the characters of the NUL-terminated string `seq` one by one; any mismatch is fatal. */
+static inline void
+xml_parse_seq(struct xml_context *ctx, const char *seq)
+{
+  /* The sequence consists of ASCII characters */
+  for (; *seq; seq++)
+    xml_parse_char(ctx, *seq);
+}
+
+void xml_parse_eq(struct xml_context *ctx);
+
+static inline uns
+xml_parse_quote(struct xml_context *ctx)
+{
+ /* "'" | '"' */
+ uns c = xml_get_char(ctx);
+ if (unlikely(c != '\'' && c != '\"'))
+ xml_fatal_expected_quot(ctx);
+ return c;
+}
+
+char *xml_parse_name(struct xml_context *ctx, struct mempool *pool);
+void xml_skip_name(struct xml_context *ctx);
+char *xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool);
+
+char *xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool);
+char *xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool);
+
+uns xml_parse_char_ref(struct xml_context *ctx);
+void xml_parse_pe_ref(struct xml_context *ctx);
+
+char *xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr);
+
+void xml_skip_internal_subset(struct xml_context *ctx);
+void xml_parse_notation_decl(struct xml_context *ctx);
+void xml_parse_entity_decl(struct xml_context *ctx);
+void xml_parse_element_decl(struct xml_context *ctx);
+void xml_parse_attr_list_decl(struct xml_context *ctx);
+
+void xml_push_comment(struct xml_context *ctx);
+void xml_pop_comment(struct xml_context *ctx);
+void xml_skip_comment(struct xml_context *ctx);
+
+void xml_push_pi(struct xml_context *ctx);
+void xml_pop_pi(struct xml_context *ctx);
+void xml_skip_pi(struct xml_context *ctx);
+
+void xml_attrs_table_init(struct xml_context *ctx);
+void xml_attrs_table_cleanup(struct xml_context *ctx);
+
+void xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value);
+
+#endif
--- /dev/null
+# pkg-config metadata for libshxml
+# NOTE(review): the @LIBDIR@, @SHERLOCK_VERSION@ and @DEPS@ tokens look like
+# placeholders filled in when the final .pc file is generated -- confirm against the build rules.
+
+libdir=@LIBDIR@
+incdir=.
+
+Name: libshxml
+Description: XML parser for Sherlock project
+Version: @SHERLOCK_VERSION@
+Cflags: -I${incdir}
+Libs: -L${libdir} -lshxml
+Requires: @DEPS@
--- /dev/null
+/*
+ * Sherlock Library -- A simple XML parser
+ *
+ * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
+ *
+ * This software may be freely distributed and used according to the terms
+ * of the GNU Lesser General Public License.
+ */
+
+#undef LOCAL_DEBUG
+
+#include "sherlock/sherlock.h"
+#include "sherlock/xml/xml.h"
+#include "sherlock/xml/dtd.h"
+#include "sherlock/xml/internals.h"
+#include "ucw/fastbuf.h"
+#include "ucw/ff-unicode.h"
+#include "ucw/unicode.h"
+#include "ucw/chartype.h"
+#include "ucw/hashfunc.h"
+
+#include <setjmp.h>
+
+/*** Basic parsing ***/
+
+/*
+ * Raise a fatal error naming the expected character `c`.
+ * Values in the range 32..127 are shown literally, all others as U+xxxx.
+ */
+void NONRET
+xml_fatal_expected(struct xml_context *ctx, uns c)
+{
+  if (c < 32 || c >= 128)
+    xml_fatal(ctx, "Expected U+%04x", c);
+  else
+    xml_fatal(ctx, "Expected '%c'", c);
+}
+
+/* Raise the fatal error reported when a white space was expected (out-of-line helper of xml_parse_white()). */
+void NONRET
+xml_fatal_expected_white(struct xml_context *ctx)
+{
+ xml_fatal(ctx, "Expected a white space");
+}
+
+/* Raise the fatal error reported when a quotation mark was expected (out-of-line helper of xml_parse_quote()). */
+void NONRET
+xml_fatal_expected_quot(struct xml_context *ctx)
+{
+ xml_fatal(ctx, "Expected a quotation mark");
+}
+
+/* Consume an equals sign together with optional white space on both sides; a missing '=' is fatal. */
+void
+xml_parse_eq(struct xml_context *ctx)
+{
+ /* Eq ::= S? '=' S? */
+ xml_parse_white(ctx, 0);
+ xml_parse_char(ctx, '=');
+ xml_parse_white(ctx, 0);
+}
+
+/*** Names and nmtokens ***/
+
+/*
+ * Read a token into `pool` as a NUL-terminated UTF-8 string.
+ * The first character must belong to `first_cat` (otherwise the fatal error `err`
+ * is raised); following characters are taken while they belong to `next_cat`.
+ */
+static char *
+xml_parse_string(struct xml_context *ctx, struct mempool *pool, uns first_cat, uns next_cat, char *err)
+{
+  char *out = mp_start_noalign(pool, 1);
+  uns cat = xml_peek_cat(ctx);
+  if (unlikely((cat & first_cat) == 0))
+    xml_fatal(ctx, "%s", err);
+  for (;;)
+    {
+      /* Keep room for one encoded character plus the terminator */
+      out = mp_spread(pool, out, 5);
+      out = utf8_32_put(out, xml_skip_char(ctx));
+      if (!(xml_peek_cat(ctx) & next_cat))
+        break;
+    }
+  *out++ = 0;
+  return mp_end(pool, out);
+}
+
+/*
+ * Skip a token without storing it: the first character must belong to `first_cat`
+ * (otherwise the fatal error `err` is raised), the rest is skipped while in `next_cat`.
+ */
+static void
+xml_skip_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err)
+{
+  uns first = xml_get_cat(ctx);
+  if (unlikely((first & first_cat) == 0))
+    xml_fatal(ctx, "%s", err);
+  for (;;)
+    {
+      if (!(xml_peek_cat(ctx) & next_cat))
+        break;
+      xml_skip_char(ctx);
+    }
+}
+
+/*
+ * Parse a Name and return it as a NUL-terminated string allocated from `pool`.
+ * The allowed start/continuation categories come from ctx->cat_sname / ctx->cat_name.
+ */
+char *
+xml_parse_name(struct xml_context *ctx, struct mempool *pool)
+{
+ /* Name ::= NameStartChar (NameChar)* */
+ return xml_parse_string(ctx, pool, ctx->cat_sname, ctx->cat_name, "Expected a name");
+}
+
+/* Skip over a Name without storing it; uses the same character categories as xml_parse_name(). */
+void
+xml_skip_name(struct xml_context *ctx)
+{
+ xml_skip_string(ctx, ctx->cat_sname, ctx->cat_name, "Expected a name");
+}
+
+/* Parse an Nmtoken (every character must be in ctx->cat_name) into a string allocated from `pool`. */
+char *
+xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool)
+{
+ /* Nmtoken ::= (NameChar)+ */
+ return xml_parse_string(ctx, pool, ctx->cat_name, ctx->cat_name, "Expected a nmtoken");
+}
+
+/*** Simple literals ***/
+
+/* Parse a quoted system literal and return its contents (without the quotes) as a string in `pool`. */
+char *
+xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool)
+{
+  /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */
+  char *out = mp_start_noalign(pool, 1);
+  uns quote = xml_parse_quote(ctx);
+  for (;;)
+    {
+      uns c = xml_get_char(ctx);
+      if (c == quote)
+        break;
+      out = mp_spread(pool, out, 5);
+      out = utf8_32_put(out, c);
+    }
+  *out++ = 0;
+  return mp_end(pool, out);
+}
+
+/*
+ * Parse a quoted public identifier literal and return its contents as a string in `pool`.
+ * A character outside the XML_CHAR_PUBID category is a fatal error.
+ */
+char *
+xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool)
+{
+  /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */
+  char *out = mp_start_noalign(pool, 1);
+  uns quote = xml_parse_quote(ctx);
+  for (;;)
+    {
+      uns c = xml_get_char(ctx);
+      if (c == quote)
+        break;
+      if (unlikely((xml_last_cat(ctx) & XML_CHAR_PUBID) == 0))
+        xml_fatal(ctx, "Expected a pubid character");
+      /* Each accepted character is stored as a single byte */
+      out = mp_spread(pool, out, 2);
+      *out++ = c;
+    }
+  *out++ = 0;
+  return mp_end(pool, out);
+}
+
+/*** Comments ***/
+
+/*
+ * Parse the rest of a comment into a new XML_NODE_COMMENT node pushed via xml_push_dom().
+ * The text is built in ctx->pool and stored in n->text / n->len; h_comment is invoked
+ * when XML_REPORT_COMMENTS is set and the handler is defined.
+ */
+void
+xml_push_comment(struct xml_context *ctx)
+{
+ TRACE(ctx, "push_comment");
+ /* Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
+ * Already parsed: '<!-' */
+ xml_parse_char(ctx, '-');
+ struct xml_node *n = xml_push_dom(ctx, NULL);
+ n->type = XML_NODE_COMMENT;
+ char *p = mp_start_noalign(ctx->pool, 6);
+ while (1)
+ {
+ /* A single '-' is data; two in a row end the comment body */
+ if (xml_get_char(ctx) == '-')
+ if (xml_get_char(ctx) == '-')
+ break;
+ else
+ *p++ = '-';
+ p = utf8_32_put(p, xml_last_char(ctx));
+ p = mp_spread(ctx->pool, p, 6);
+ }
+ xml_parse_char(ctx, '>');
+ *p = 0;
+ n->len = p - (char *)mp_ptr(ctx->pool);
+ n->text = mp_end(ctx->pool, p + 1);
+ if ((ctx->flags & XML_REPORT_COMMENTS) && ctx->h_comment)
+ ctx->h_comment(ctx);
+}
+
+/*
+ * Pop the comment node pushed by xml_push_comment(); the second argument of xml_pop_dom()
+ * is non-zero unless XML_ALLOC_COMMENTS is set. Also closes the nesting level of the markup.
+ */
+void
+xml_pop_comment(struct xml_context *ctx)
+{
+ xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_COMMENTS));
+ xml_dec(ctx);
+ TRACE(ctx, "pop_comment");
+}
+
+/* Skip the rest of a comment without storing it and close its nesting level. */
+void
+xml_skip_comment(struct xml_context *ctx)
+{
+  TRACE(ctx, "skip_comment");
+  xml_parse_char(ctx, '-');
+  for (;;)
+    {
+      /* Look for a '-' immediately followed by another '-' */
+      if (xml_get_char(ctx) != '-')
+        continue;
+      if (xml_get_char(ctx) == '-')
+        break;
+    }
+  xml_parse_char(ctx, '>');
+  xml_dec(ctx);
+}
+
+/*** Processing instructions ***/
+
+/*
+ * Parse a processing instruction into a new XML_NODE_PI node pushed via xml_push_dom().
+ * The target goes to n->name, the remaining text to n->text / n->len (both in ctx->pool);
+ * h_pi is invoked when XML_REPORT_PIS is set and the handler is defined.
+ */
+void
+xml_push_pi(struct xml_context *ctx)
+{
+ TRACE(ctx, "push_pi");
+ /* Parses a PI to ctx->value and ctx->name:
+ * PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
+ * PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
+ * Already parsed: '<?' */
+ struct xml_node *n = xml_push_dom(ctx, NULL);
+ n->type = XML_NODE_PI;
+ n->name = xml_parse_name(ctx, ctx->pool);
+ if (unlikely(!strcasecmp(n->name, "xml")))
+ xml_error(ctx, "Reserved PI target");
+ char *p = mp_start_noalign(ctx->pool, 5);
+ /* Without white space after the target, the PI must end immediately */
+ if (!xml_parse_white(ctx, 0))
+ xml_parse_seq(ctx, "?>");
+ else
+ while (1)
+ {
+ /* A '?' not followed by '>' is ordinary data */
+ if (xml_get_char(ctx) == '?')
+ if (xml_peek_char(ctx) == '>')
+ {
+ xml_skip_char(ctx);
+ break;
+ }
+ else
+ *p++ = '?';
+ else
+ p = utf8_32_put(p, xml_last_char(ctx));
+ p = mp_spread(ctx->pool, p, 5);
+ }
+ *p = 0;
+ n->len = p - (char *)mp_ptr(ctx->pool);
+ n->text = mp_end(ctx->pool, p + 1);
+ if ((ctx->flags & XML_REPORT_PIS) && ctx->h_pi)
+ ctx->h_pi(ctx);
+}
+
+/*
+ * Pop the PI node pushed by xml_push_pi(); the second argument of xml_pop_dom()
+ * is non-zero unless XML_ALLOC_PIS is set. Also closes the nesting level of the markup.
+ */
+void
+xml_pop_pi(struct xml_context *ctx)
+{
+ xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_PIS));
+ xml_dec(ctx);
+ TRACE(ctx, "pop_pi");
+}
+
+/*
+ * Skip a processing instruction without storing it and close its nesting level.
+ * Only when XML_VALIDATING is set is the target parsed and checked against the
+ * reserved name "xml".
+ */
+void
+xml_skip_pi(struct xml_context *ctx)
+{
+  TRACE(ctx, "skip_pi");
+  if (ctx->flags & XML_VALIDATING)
+    {
+      /* The target is only needed for the check, so it lives temporarily on the stack pool */
+      struct mempool_state saved;
+      mp_save(ctx->stack, &saved);
+      char *target = xml_parse_name(ctx, ctx->stack);
+      if (unlikely(strcasecmp(target, "xml") == 0))
+        xml_error(ctx, "Reserved PI target");
+      mp_restore(ctx->stack, &saved);
+      uns white = xml_parse_white(ctx, 0);
+      if (white == 0)
+        {
+          xml_parse_seq(ctx, "?>");
+          xml_dec(ctx);
+          return;
+        }
+    }
+  /* Advance until a '?' is directly followed by '>' */
+  while (xml_get_char(ctx) != '?' || xml_peek_char(ctx) != '>')
+    ;
+  xml_skip_char(ctx);
+  xml_dec(ctx);
+}
+
+/*** Character references ***/
+
+/*
+ * Parse a numeric character reference and return the referenced code point.
+ * On a malformed or out-of-range reference an error is reported, input is skipped
+ * up to the next ';' and UNI_REPLACEMENT is returned. Closes one nesting level.
+ */
+uns
+xml_parse_char_ref(struct xml_context *ctx)
+{
+ TRACE(ctx, "parse_char_ref");
+ /* CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
+ * Already parsed: '&#' */
+ uns v = 0;
+ if (xml_get_char(ctx) == 'x')
+ {
+ if (!(xml_get_cat(ctx) & XML_CHAR_XDIGIT))
+ {
+ xml_error(ctx, "Expected a hexadecimal value of character reference");
+ goto recover;
+ }
+ /* Accumulation stops as soon as the value leaves the Unicode range */
+ do
+ {
+ v = (v << 4) + Cxvalue(xml_last_char(ctx));
+ }
+ while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_XDIGIT));
+ }
+ else
+ {
+ /* The character consumed by the 'x' test above is the first decimal digit */
+ if (!(xml_last_cat(ctx) & XML_CHAR_DIGIT))
+ {
+ xml_error(ctx, "Expected a numeric value of character reference");
+ goto recover;
+ }
+ do
+ {
+ v = v * 10 + xml_last_char(ctx) - '0';
+ }
+ while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT));
+ }
+ uns cat = xml_char_cat(v);
+ if (!(cat & ctx->cat_unrestricted))
+ {
+ xml_error(ctx, "Character reference out of range");
+ goto recover;
+ }
+ if (xml_last_char(ctx) == ';')
+ {
+ xml_dec(ctx);
+ return v;
+ }
+ xml_error(ctx, "Expected ';'");
+recover:
+ /* Resynchronize on the terminating ';' */
+ while (xml_last_char(ctx) != ';')
+ xml_get_char(ctx);
+ xml_dec(ctx);
+ return UNI_REPLACEMENT;
+}
+
+/*** References to general entities ***/
+
+/*
+ * Parse a reference following '&'. Character references and trivial entities are
+ * written to ctx->chars; an unknown entity is reported and copied verbatim; any other
+ * entity is pushed as a new input via xml_push_entity(). Closes one nesting level.
+ */
+static void
+xml_parse_ref(struct xml_context *ctx)
+{
+ /* Reference ::= EntityRef | CharRef
+ * EntityRef ::= '&' Name ';'
+ * Already parsed: '&' */
+ struct fastbuf *out = &ctx->chars;
+ if (xml_peek_char(ctx) == '#')
+ {
+ xml_skip_char(ctx);
+ bput_utf8_32(out, xml_parse_char_ref(ctx));
+ }
+ else
+ {
+ TRACE(ctx, "parse_ge_ref");
+ /* The entity name is temporary, so it is allocated on the stack pool */
+ struct mempool_state state;
+ mp_save(ctx->stack, &state);
+ char *name = xml_parse_name(ctx, ctx->stack);
+ xml_parse_char(ctx, ';');
+ struct xml_dtd_entity *ent = xml_dtd_find_entity(ctx, name);
+ if (!ent)
+ {
+ xml_error(ctx, "Unknown entity &%s;", name);
+ bputc(out, '&');
+ bputs(out, name);
+ bputc(out, ';');
+ }
+ else if (ent->flags & XML_DTD_ENTITY_TRIVIAL)
+ {
+ TRACE(ctx, "Trivial entity &%s;", name);
+ bputs(out, ent->text);
+ }
+ else
+ {
+ TRACE(ctx, "Pushed entity &%s;", name);
+ /* The name is released before the entity is pushed */
+ mp_restore(ctx->stack, &state);
+ xml_dec(ctx);
+ xml_push_entity(ctx, ent);
+ return;
+ }
+ mp_restore(ctx->stack, &state);
+ xml_dec(ctx);
+ }
+}
+
+/*** Character data ***/
+
+/*
+ * Make room in the ctx->chars fastbuf, whose buffer is a growing allocation in ctx->pool.
+ * Does nothing while free space remains. Otherwise it either starts a new growing
+ * buffer (remembering the pool state in ctx->chars_state) or expands the existing one,
+ * keeping the write position and the "already reported" mark (bstop) at the same offsets.
+ *
+ * In both cases bufend is placed one byte before the end of the available space:
+ * xml_end_chars() and xml_report_chars() store a terminating zero at *bptr, and bptr
+ * may be equal to bufend at that moment.
+ */
+void
+xml_spout_chars(struct fastbuf *fb)
+{
+  if (fb->bptr < fb->bufend)
+    return;
+  struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb);
+  struct mempool *pool = ctx->pool;
+  if (fb->bufend != fb->buffer)
+    {
+      TRACE(ctx, "growing chars");
+      uns len = fb->bufend - fb->buffer;
+      uns reported = fb->bstop - fb->buffer;
+      fb->buffer = mp_expand(pool);
+      /* Reserve the terminator byte here as well, as the "starting" branch does */
+      fb->bufend = fb->buffer + mp_avail(pool) - 1;
+      fb->bptr = fb->buffer + len;
+      fb->bstop = fb->buffer + reported;
+    }
+  else
+    {
+      TRACE(ctx, "starting chars");
+      mp_save(pool, &ctx->chars_state);
+      fb->bptr = fb->buffer = fb->bstop = mp_start_noalign(pool, 2);
+      fb->bufend = fb->buffer + mp_avail(pool) - 1;
+    }
+}
+
+/*
+ * Finish the text accumulated in ctx->chars.
+ * Returns its length; when it is non-zero, the text is zero-terminated, the growing
+ * buffer is closed with mp_end(), *out receives the result and the fastbuf is reset
+ * to the empty state. When the length is zero, *out is left untouched.
+ */
+static inline uns
+xml_end_chars(struct xml_context *ctx, char **out)
+{
+  struct fastbuf *fb = &ctx->chars;
+  uns len = fb->bptr - fb->buffer;
+  if (!len)
+    return 0;
+  TRACE(ctx, "ending chars");
+  *fb->bptr = 0;
+  *out = mp_end(ctx->pool, fb->bptr + 1);
+  fb->bptr = fb->buffer;
+  fb->bstop = fb->buffer;
+  fb->bufend = fb->buffer;
+  return len;
+}
+
+/*
+ * Hand out the part of ctx->chars that has not been reported yet.
+ * fb->bstop marks the end of the previously reported text. When new text exists,
+ * it is zero-terminated in place, *out is set to its start and the mark is advanced.
+ * Returns the length of the newly reported part (0 when there is nothing new, in
+ * which case *out is left untouched).
+ *
+ * The length is measured from bstop, i.e. from the same position that is returned
+ * in *out; measuring from fb->buffer would over-report on every call after the first.
+ */
+static inline uns
+xml_report_chars(struct xml_context *ctx, char **out)
+{
+  struct fastbuf *fb = &ctx->chars;
+  uns len = fb->bptr - fb->bstop;
+  if (len)
+    {
+      *fb->bptr = 0;
+      *out = fb->bstop;
+      fb->bstop = fb->bptr;
+    }
+  return len;
+}
+
+/*
+ * Finish the pending character data and turn it into an XML_NODE_CHARS node.
+ * Returns 0 when there was no text or when the text was discarded (XML_NO_CHARS set,
+ * or neither XML_ALLOC_CHARS nor XML_REPORT_CHARS set); otherwise returns the text
+ * length after pushing the node and invoking h_chars if requested.
+ */
+static inline uns
+xml_flush_chars(struct xml_context *ctx)
+{
+ char *text, *rtext;
+ uns len = xml_end_chars(ctx, &text), rlen;
+ if (len)
+ {
+ if (ctx->flags & XML_NO_CHARS)
+ {
+ if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_ignorable)
+ ctx->h_ignorable(ctx, text, len);
+ mp_restore(ctx->pool, &ctx->chars_state);
+ return 0;
+ }
+ /* NOTE(review): xml_end_chars() above has already reset fb->bptr to fb->buffer,
+ * so xml_report_chars() appears to see an empty buffer here and h_block is
+ * never invoked from this place -- confirm whether that is intended. */
+ if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext)))
+ ctx->h_block(ctx, rtext, rlen);
+ if (!(ctx->flags & XML_ALLOC_CHARS) && !(ctx->flags & XML_REPORT_CHARS))
+ {
+ mp_restore(ctx->pool, &ctx->chars_state);
+ return 0;
+ }
+ struct xml_node *n = xml_push_dom(ctx, &ctx->chars_state);
+ n->type = XML_NODE_CHARS;
+ n->text = text;
+ n->len = len;
+ if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_chars)
+ ctx->h_chars(ctx);
+ }
+ return len;
+}
+
+/* Pop the node pushed by xml_flush_chars(); the second argument of xml_pop_dom() is non-zero unless XML_ALLOC_CHARS is set. */
+static inline void
+xml_pop_chars(struct xml_context *ctx)
+{
+ xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS));
+ TRACE(ctx, "pop_chars");
+}
+
+/*
+ * Append character data up to (not including) the next '<' to ctx->chars.
+ * With XML_NO_CHARS set only white space is accepted: the first other character
+ * is reported as an error and the rest of the data is skipped. Otherwise
+ * references introduced by '&' are expanded through xml_parse_ref().
+ */
+static inline void
+xml_append_chars(struct xml_context *ctx)
+{
+ TRACE(ctx, "append_chars");
+ struct fastbuf *out = &ctx->chars;
+ if (ctx->flags & XML_NO_CHARS)
+ while (xml_get_char(ctx) != '<')
+ if (xml_last_cat(ctx) & XML_CHAR_WHITE)
+ bput_utf8_32(out, xml_last_char(ctx));
+ else
+ {
+ xml_error(ctx, "This element must not contain character data");
+ while (xml_get_char(ctx) != '<');
+ break;
+ }
+ else
+ while (xml_get_char(ctx) != '<')
+ if (xml_last_char(ctx) == '&')
+ {
+ xml_inc(ctx);
+ xml_parse_ref(ctx);
+ }
+ else
+ bput_utf8_32(out, xml_last_char(ctx));
+ /* Leave the '<' in the input for the caller */
+ xml_unget_char(ctx);
+}
+
+/*** CDATA sections ***/
+
+/*
+ * Skip a CDATA section without storing its contents and close its nesting level.
+ * Already parsed: '<![' ; consumes 'CDATA[' ... ']]>'.
+ */
+static void
+xml_skip_cdata(struct xml_context *ctx)
+{
+  TRACE(ctx, "skip_cdata");
+  xml_parse_seq(ctx, "CDATA[");
+  /* Count the run of ']' just read (capped at 2), so that a terminator preceded
+   * by extra brackets, such as in "a]]]>", is still recognized */
+  uns brackets = 0;
+  while (1)
+    {
+      uns c = xml_get_char(ctx);
+      if (c == ']')
+        {
+          if (brackets < 2)
+            brackets++;
+        }
+      else if (c == '>' && brackets == 2)
+        break;
+      else
+        brackets = 0;
+    }
+  xml_dec(ctx);
+}
+
+/*
+ * Append the contents of a CDATA section to ctx->chars and close its nesting level.
+ * With XML_NO_CHARS set, an error is reported and the section is skipped instead.
+ * When XML_REPORT_CHARS is set, text pending before the section is passed to h_block
+ * and the section contents to h_cdata (if the handlers are defined).
+ */
+static void
+xml_append_cdata(struct xml_context *ctx)
+{
+  /* CDSect :== '<![CDATA[' (Char* - (Char* ']]>' Char*)) ']]>'
+   * Already parsed: '<![' */
+  TRACE(ctx, "append_cdata");
+  if (ctx->flags & XML_NO_CHARS)
+    {
+      xml_error(ctx, "This element must not contain CDATA");
+      xml_skip_cdata(ctx);
+      return;
+    }
+  xml_parse_seq(ctx, "CDATA[");
+  struct fastbuf *out = &ctx->chars;
+  uns rlen;
+  char *rtext;
+  if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext)))
+    ctx->h_block(ctx, rtext, rlen);
+  /* Up to two trailing ']' are held back, because they may start the ']]>' terminator.
+   * This also handles data ending with brackets, e.g. "a]]]>" yields "a]". */
+  uns held = 0;
+  while (1)
+    {
+      uns c = xml_get_char(ctx);
+      if (c == ']')
+        {
+          if (held == 2)
+            bputc(out, ']');	/* A third bracket proves the oldest one to be data */
+          else
+            held++;
+        }
+      else if (c == '>' && held == 2)
+        break;
+      else
+        {
+          /* The held brackets were ordinary data after all */
+          for (; held; held--)
+            bputc(out, ']');
+          bput_utf8_32(out, c);
+        }
+    }
+  if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_cdata && (rlen = xml_report_chars(ctx, &rtext)))
+    ctx->h_cdata(ctx, rtext, rlen);
+  xml_dec(ctx);
+}
+
+/*** Attribute values ***/
+
+/*
+ * Parse a quoted attribute value and return it as a string built through ctx->chars
+ * (or "" when the value is empty). References are expanded, every white-space
+ * character is replaced by a single space and '<' is reported as an error.
+ * The `attr` argument is currently unused.
+ */
+char *
+xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr UNUSED)
+{
+ TRACE(ctx, "parse_attr_value");
+ /* AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'" */
+ /* FIXME: -- check value constraints / normalize leading/trailing WS and repeated WS */
+ struct mempool_state state;
+ uns quote = xml_parse_quote(ctx);
+ mp_save(ctx->stack, &state);
+ struct fastbuf *out = &ctx->chars;
+ struct xml_source *src = ctx->src;
+ while (1)
+ {
+ uns c = xml_get_char(ctx);
+ if (c == '&')
+ {
+ xml_inc(ctx);
+ xml_parse_ref(ctx);
+ }
+ /* The closing quote counts only when read from the source the value started in */
+ else if (c == quote && src == ctx->src)
+ break;
+ else if (c == '<')
+ xml_error(ctx, "Attribute value must not contain '<'");
+ else if (xml_last_cat(ctx) & XML_CHAR_WHITE)
+ bputc(out, ' ');
+ else
+ bput_utf8_32(out, c);
+ }
+ mp_restore(ctx->stack, &state);
+ char *text;
+ return xml_end_chars(ctx, &text) ? text : "";
+}
+
+/*
+ * Normalize spaces (0x20) of a NUL-terminated string in place: leading and trailing
+ * spaces are removed and every inner run of spaces is collapsed to a single one.
+ * Returns the length of the result.
+ */
+uns
+xml_normalize_white(struct xml_context *ctx UNUSED, char *text)
+{
+  char *src = text, *dst = text;
+  /* Drop leading spaces */
+  while (*src == 0x20)
+    src++;
+  while (*src)
+    {
+      if (*src != 0x20)
+        {
+          *dst++ = *src++;
+          continue;
+        }
+      /* Replace a whole run of spaces by one */
+      do
+        src++;
+      while (*src == 0x20);
+      *dst++ = 0x20;
+    }
+  /* A run at the very end has produced one trailing space */
+  if (dst != text && dst[-1] == 0x20)
+    dst--;
+  *dst = 0;
+  return dst - text;
+}
+
+/*** Attributes ***/
+
+struct xml_attrs_table;
+
+/* Hash function of the attribute table: combines the element pointer with the attribute name. */
+static inline uns
+xml_attrs_hash(struct xml_attrs_table *t UNUSED, struct xml_node *e, char *n)
+{
+ return hash_pointer(e) ^ hash_string(n);
+}
+
+/* Key comparison of the attribute table: same element pointer and equal attribute name. */
+static inline int
+xml_attrs_eq(struct xml_attrs_table *t UNUSED, struct xml_node *e1, char *n1, struct xml_node *e2, char *n2)
+{
+  if (e1 != e2)
+    return 0;
+  return strcmp(n1, n2) == 0;
+}
+
+/*
+ * Initialize a newly created attribute table entry: store the key, clear the value
+ * and user pointer, and append the attribute to the element's attribute list.
+ */
+static inline void
+xml_attrs_init_key(struct xml_attrs_table *t UNUSED, struct xml_attr *a, struct xml_node *e, char *name)
+{
+ a->elem = e;
+ a->name = name;
+ a->val = NULL;
+ a->user = NULL;
+ slist_add_tail(&e->attrs, &a->n);
+}
+
+/*
+ * Instantiate the generic hash table from ucw/hashtable.h with the xml_attrs_ prefix.
+ * Nodes are struct xml_attr and the key is the pair (element, attribute name).
+ * NOTE(review): the HASH_GIVE_* switches presumably tell the template to use the
+ * xml_attrs_eq / xml_attrs_hash / xml_attrs_init_key functions defined above and the
+ * allocator from XML_HASH_GIVE_ALLOC -- confirm against the hashtable.h documentation.
+ */
+#define HASH_PREFIX(x) xml_attrs_##x
+#define HASH_NODE struct xml_attr
+#define HASH_KEY_COMPLEX(x) x elem, x name
+#define HASH_KEY_DECL struct xml_node *elem, char *name
+#define HASH_TABLE_DYNAMIC
+#define HASH_GIVE_EQ
+#define HASH_GIVE_HASHFN
+#define HASH_GIVE_INIT_KEY
+#define HASH_WANT_CLEANUP
+#define HASH_WANT_REMOVE
+#define HASH_WANT_LOOKUP
+#define HASH_WANT_FIND
+#define HASH_GIVE_ALLOC
+XML_HASH_GIVE_ALLOC
+#include "ucw/hashtable.h"
+
+/*
+ * Parse one attribute of the current element (ctx->node) and record it in the
+ * attribute table. A repeated attribute is reported and its new value dropped.
+ * When the element has a DTD declaration, the attribute is looked up there and
+ * validated, or reported as undefined.
+ */
+static void
+xml_parse_attr(struct xml_context *ctx)
+{
+ TRACE(ctx, "parse_attr");
+ /* Attribute ::= Name Eq AttValue */
+ struct xml_node *e = ctx->node;
+ char *n = xml_parse_name(ctx, ctx->pool);
+ struct xml_attr *a = xml_attrs_lookup(ctx->tab_attrs, e, n);
+ xml_parse_eq(ctx);
+ char *v = xml_parse_attr_value(ctx, NULL);
+ /* A value already present means the attribute was seen before in this element */
+ if (a->val)
+ {
+ xml_error(ctx, "Attribute %s is not unique in element <%s>", n, e->name);
+ return;
+ }
+ a->val = v;
+ if (!e->dtd)
+ a->dtd = NULL;
+ else if (!(a->dtd = xml_dtd_find_attr(ctx, e->dtd, a->name)))
+ xml_error(ctx, "Undefined attribute %s in element <%s>", n, e->name);
+ else
+ xml_validate_attr(ctx, a->dtd, a->val);
+}
+
+/* Look up the attribute `name` of `node` in the attribute table; returns the result of xml_attrs_find(). */
+struct xml_attr *
+xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name)
+{
+ return xml_attrs_find(ctx->tab_attrs, node, name);
+}
+
+/*
+ * Return the value of the attribute `name` of `node`.
+ * When the attribute is not in the table, the default value from the element's DTD
+ * declaration is returned, or NULL when there is no declaration to consult.
+ */
+char *
+xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name)
+{
+  struct xml_attr *found = xml_attrs_find(ctx->tab_attrs, node, name);
+  if (found)
+    return found->val;
+  if (node->dtd)
+    {
+      struct xml_dtd_attr *decl = xml_dtd_find_attr(ctx, node->dtd, name);
+      if (decl)
+        return decl->default_value;
+    }
+  return NULL;
+}
+
+/* Allocate the attribute hash table through xml_hash_new() on ctx->pool and initialize it. */
+void
+xml_attrs_table_init(struct xml_context *ctx)
+{
+  ctx->tab_attrs = xml_hash_new(ctx->pool, sizeof(struct xml_attrs_table));
+  xml_attrs_init(ctx->tab_attrs);
+}
+
+/* Clean up the attribute hash table created by xml_attrs_table_init(). */
+void
+xml_attrs_table_cleanup(struct xml_context *ctx)
+{
+ xml_attrs_cleanup(ctx->tab_attrs);
+}
+
+/*** Elements ***/
+
+/*
+ * Check whether `elem` occurs anywhere in the content model tree rooted at `root`.
+ * A node with an element set is compared directly; otherwise its sons are searched
+ * recursively. Returns non-zero on a match.
+ */
+static uns
+xml_validate_element(struct xml_dtd_elem_node *root, struct xml_dtd_elem *elem)
+{
+  if (root->elem)
+    return elem == root->elem;
+  SLIST_FOR_EACH(struct xml_dtd_elem_node *, son, root->sons)
+    {
+      if (xml_validate_element(son, elem))
+        return 1;
+    }
+  return 0;
+}
+
+/*
+ * Parse a start tag or an empty-element tag into a new XML_NODE_ELEM node.
+ * Steps: push the node and read its name; check the root element against the
+ * document type; look up the DTD declaration (switching XML_NO_CHARS according to
+ * its type and checking the parent's content model); parse the attributes; check
+ * required attributes and optionally fill in defaults; finally invoke h_stag.
+ * XML_EMPTY_ELEM_TAG is set in ctx->flags for the '/>' form.
+ */
+static void
+xml_push_element(struct xml_context *ctx)
+{
+ TRACE(ctx, "push_element");
+ /* EmptyElemTag | STag
+ * EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
+ * STag ::= '<' Name (S Attribute)* S? '>'
+ * Already parsed: '<' */
+ struct xml_node *e = xml_push_dom(ctx, NULL);
+ clist_init(&e->sons);
+ e->type = XML_NODE_ELEM;
+ e->name = xml_parse_name(ctx, ctx->pool);
+ slist_init(&e->attrs);
+ /* An element without a parent is the document root */
+ if (!e->parent)
+ {
+ ctx->dom = e;
+ if (ctx->doctype && strcmp(e->name, ctx->doctype))
+ xml_error(ctx, "The root element <%s> does not match the document type <%s>", e->name, ctx->doctype);
+ }
+ if (!ctx->dtd)
+ e->dtd = NULL;
+ else if (!(e->dtd = xml_dtd_find_elem(ctx, e->name)))
+ xml_error(ctx, "Undefined element <%s>", e->name);
+ else
+ {
+ struct xml_dtd_elem *dtd = e->dtd, *parent_dtd = e->parent ? e->parent->dtd : NULL;
+ /* Character data is accepted only inside elements declared as mixed */
+ if (dtd->type == XML_DTD_ELEM_MIXED)
+ ctx->flags &= ~XML_NO_CHARS;
+ else
+ ctx->flags |= XML_NO_CHARS;
+ if (parent_dtd)
+ if (parent_dtd->type == XML_DTD_ELEM_EMPTY)
+ xml_error(ctx, "Empty element must not contain children");
+ else if (parent_dtd->type != XML_DTD_ELEM_ANY)
+ {
+ // FIXME: validate regular expressions
+ if (!xml_validate_element(parent_dtd->node, dtd))
+ xml_error(ctx, "Unexpected element <%s>", e->name);
+ }
+ }
+ /* Attributes, each of which must be preceded by white space */
+ while (1)
+ {
+ uns white = xml_parse_white(ctx, 0);
+ uns c = xml_get_char(ctx);
+ if (c == '/')
+ {
+ xml_parse_char(ctx, '>');
+ ctx->flags |= XML_EMPTY_ELEM_TAG;
+ break;
+ }
+ else if (c == '>')
+ break;
+ else if (!white)
+ xml_fatal_expected_white(ctx);
+ xml_unget_char(ctx);
+ xml_parse_attr(ctx);
+ }
+ if (e->dtd)
+ SLIST_FOR_EACH(struct xml_dtd_attr *, a, e->dtd->attrs)
+ if (a->default_mode == XML_ATTR_REQUIRED)
+ {
+ if (!xml_attrs_find(ctx->tab_attrs, e, a->name))
+ xml_error(ctx, "Missing required attribute %s in element <%s>", a->name, e->name);
+ }
+ else if (a->default_mode != XML_ATTR_IMPLIED && ctx->flags & XML_ALLOC_DEFAULT_ATTRS)
+ {
+ /* Create the attribute if missing and give it the declared default value */
+ struct xml_attr *attr = xml_attrs_lookup(ctx->tab_attrs, e, a->name);
+ if (!attr->val)
+ attr->val = a->default_value;
+ }
+ if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_stag)
+ ctx->h_stag(ctx);
+}
+
+/*
+ * Leave the current element: invoke h_etag if requested, and, unless XML_ALLOC_TAGS
+ * is set, remove the attributes of the element and of all its descendant elements
+ * from the attribute table before the node is popped. Closes one nesting level.
+ */
+static void
+xml_pop_element(struct xml_context *ctx)
+{
+ TRACE(ctx, "pop_element");
+ if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_etag)
+ ctx->h_etag(ctx);
+ struct xml_node *e = ctx->node;
+ uns free = !(ctx->flags & XML_ALLOC_TAGS);
+ if (free)
+ {
+ if (!e->parent)
+ ctx->dom = NULL;
+ /* Restore hash table of attributes */
+ SLIST_FOR_EACH(struct xml_attr *, a, e->attrs)
+ xml_attrs_remove(ctx->tab_attrs, a);
+ struct xml_node *n;
+ /* Flatten the subtree: the sons of each visited element are spliced into the
+ * list being walked, so that every descendant is eventually visited */
+ while (n = clist_head(&e->sons))
+ {
+ if (n->type == XML_NODE_ELEM)
+ {
+ SLIST_FOR_EACH(struct xml_attr *, a, n->attrs)
+ xml_attrs_remove(ctx->tab_attrs, a);
+ clist_insert_list_after(&n->sons, &n->n);
+ }
+ clist_remove(&n->n);
+ }
+ }
+ xml_pop_dom(ctx, free);
+ xml_dec(ctx);
+}
+
+/*
+ * Parse an end tag and check that its name matches the current element (ctx->node).
+ * On a mismatch an error is reported and input is skipped up to the next '>'.
+ * Closes one nesting level.
+ */
+static void
+xml_parse_etag(struct xml_context *ctx)
+{
+ /* ETag ::= '</' Name S? '>'
+ * Already parsed: '</' */
+ struct xml_node *e = ctx->node;
+ ASSERT(e);
+ char *n = e->name;
+ /* Compare the input with the UTF-8 encoded element name, character by character */
+ while (*n)
+ {
+ uns c;
+ n = utf8_32_get(n, &c);
+ if (xml_get_char(ctx) != c)
+ goto recover;
+ }
+ xml_parse_white(ctx, 0);
+ if (xml_get_char(ctx) != '>')
+ {
+recover:
+ xml_error(ctx, "Invalid ETag, expected </%s>", e->name);
+ while (xml_get_char(ctx) != '>');
+ }
+ xml_dec(ctx);
+}
+
+/*** Document type declaration ***/
+
+/*
+ * Parse the head of a document type declaration: the document type name and the
+ * optional external identifier (stored in ctx->doctype, ctx->public_id and
+ * ctx->system_id). Sets XML_HAS_EXTERNAL_SUBSET / XML_HAS_INTERNAL_SUBSET accordingly
+ * and invokes h_doctype_decl. A second declaration is a fatal error.
+ */
+static void
+xml_parse_doctype_decl(struct xml_context *ctx)
+{
+ TRACE(ctx, "parse_doctype_decl");
+ /* doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
+ * Already parsed: '<!'
+ * Terminated before '[' or '>' */
+ if (ctx->doctype)
+ xml_fatal(ctx, "Multiple document types not allowed");
+ xml_parse_seq(ctx, "DOCTYPE");
+ xml_parse_white(ctx, 1);
+ ctx->doctype = xml_parse_name(ctx, ctx->pool);
+ TRACE(ctx, "doctype=%s", ctx->doctype);
+ uns c;
+ /* An external identifier starts with either SYSTEM or PUBLIC */
+ if (xml_parse_white(ctx, 0) && ((c = xml_peek_char(ctx)) == 'S' || c == 'P'))
+ {
+ if (c == 'S')
+ {
+ xml_parse_seq(ctx, "SYSTEM");
+ xml_parse_white(ctx, 1);
+ ctx->system_id = xml_parse_system_literal(ctx, ctx->pool);
+ }
+ else
+ {
+ xml_parse_seq(ctx, "PUBLIC");
+ xml_parse_white(ctx, 1);
+ ctx->public_id = xml_parse_pubid_literal(ctx, ctx->pool);
+ xml_parse_white(ctx, 1);
+ ctx->system_id = xml_parse_system_literal(ctx, ctx->pool);
+ }
+ xml_parse_white(ctx, 0);
+ ctx->flags |= XML_HAS_EXTERNAL_SUBSET;
+ }
+ /* The opening '[' of the internal subset is consumed here and opens a nesting level */
+ if (xml_peek_char(ctx) == '[')
+ {
+ ctx->flags |= XML_HAS_INTERNAL_SUBSET;
+ xml_skip_char(ctx);
+ xml_inc(ctx);
+ }
+ if (ctx->h_doctype_decl)
+ ctx->h_doctype_decl(ctx);
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/* DTD: Internal subset */
+
+/*
+ * Parse the declarations of a DTD subset: comments, processing instructions,
+ * NOTATION / ENTITY / ELEMENT / ATTLIST declarations and parameter entity references.
+ * The internal subset (external == 0) ends at ']', the external one at '>'.
+ * Any other markup is a fatal error.
+ */
+static void
+xml_parse_subset(struct xml_context *ctx, uns external)
+{
+ // FIXME:
+ // -- comments/pi have no parent
+ // -- conditional sections in external subset
+ // -- check correctness of parameter entities
+
+ /* '[' intSubset ']'
+ * intSubset :== (markupdecl | DeclSep)
+ * Already parsed: '['
+ *
+ * extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)*
+ */
+ while (1)
+ {
+ xml_parse_white(ctx, 0);
+ uns c = xml_get_char(ctx);
+ /* Every item opens a nesting level; the terminator's level is closed after the loop */
+ xml_inc(ctx);
+ if (c == '<')
+ if ((c = xml_get_char(ctx)) == '!')
+ switch (c = xml_get_char(ctx))
+ {
+ case '-':
+ xml_push_comment(ctx);
+ xml_pop_comment(ctx);
+ break;
+ case 'N':
+ xml_parse_seq(ctx, "OTATION");
+ xml_parse_notation_decl(ctx);
+ break;
+ case 'E':
+ if ((c = xml_get_char(ctx)) == 'N')
+ {
+ xml_parse_seq(ctx, "TITY");
+ xml_parse_entity_decl(ctx);
+ }
+ else if (c == 'L')
+ {
+ xml_parse_seq(ctx, "EMENT");
+ xml_parse_element_decl(ctx);
+ }
+ else
+ goto invalid_markup;
+ break;
+ case 'A':
+ xml_parse_seq(ctx, "TTLIST");
+ xml_parse_attr_list_decl(ctx);
+ break;
+ default:
+ goto invalid_markup;
+ }
+ else if (c == '?')
+ {
+ xml_push_pi(ctx);
+ xml_pop_pi(ctx);
+ }
+ else
+ goto invalid_markup;
+ else if (c == '%')
+ xml_parse_pe_ref(ctx);
+ else if (c == ']' && !external)
+ {
+ break;
+ }
+ else if (c == '>' && external)
+ {
+ break;
+ }
+ else
+ goto invalid_markup;
+ }
+ xml_dec(ctx);
+ return;
+invalid_markup: ;
+ xml_fatal(ctx, "Invalid markup in the %s subset", external ? "external" : "internal");
+}
+
+/*** The State Machine ***/
+
+/*
+ * Advance the parser and return the next reported state.
+ *
+ * The function is a resumable state machine: the PULL() and PULL_STATE() macros
+ * return to the caller whenever the corresponding XML_PULL_* bit is set in ctx->pull
+ * and place a `case` label at the same spot, so that the next call continues there
+ * through the switch on ctx->state.
+ *
+ * Fatal errors arrive through longjmp to throw_buf (registered in ctx->throw_buf);
+ * the function then returns XML_STATE_EOF. In the epilog, an XML_ERR_EOF error is
+ * treated as the regular end of the document: h_document_end is invoked and the
+ * error code and message are cleared.
+ */
+uns
+xml_next(struct xml_context *ctx)
+{
+ /* A nasty state machine */
+
+#define PULL(x) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##x; case XML_STATE_##x: ; } while (0)
+#define PULL_STATE(x, s) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##s, XML_STATE_##x; case XML_STATE_##s: ; } while (0)
+
+ TRACE(ctx, "xml_next (state=%u)", ctx->state);
+ jmp_buf throw_buf;
+ ctx->throw_buf = &throw_buf;
+ if (setjmp(throw_buf))
+ {
+error:
+ if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal)
+ ctx->h_fatal(ctx);
+ TRACE(ctx, "raised fatal error");
+ return ctx->state = XML_STATE_EOF;
+ }
+ uns c;
+ switch (ctx->state)
+ {
+ case XML_STATE_START:
+ TRACE(ctx, "entering prolog");
+ ctx->flags |= XML_SRC_DOCUMENT | XML_SRC_EXPECTED_DECL;
+ if (ctx->h_document_start)
+ ctx->h_document_start(ctx);
+ /* XMLDecl */
+ xml_refill(ctx);
+ if (ctx->h_xml_decl)
+ ctx->h_xml_decl(ctx);
+ PULL(XML_DECL);
+
+ /* Misc* (doctypedecl Misc*)? */
+ while (1)
+ {
+ xml_parse_white(ctx, 0);
+ xml_parse_char(ctx, '<');
+ xml_inc(ctx);
+ if ((c = xml_get_char(ctx)) == '?')
+ /* Processing instruction */
+ if (!(ctx->flags & XML_REPORT_PIS))
+ xml_skip_pi(ctx);
+ else
+ {
+ xml_push_pi(ctx);
+ PULL_STATE(PI, PROLOG_PI);
+ xml_pop_pi(ctx);
+ }
+ else if (c != '!')
+ {
+ /* Found the root tag */
+ xml_unget_char(ctx);
+ goto first_tag;
+ }
+ else if (xml_get_char(ctx) == '-')
+ if (!(ctx->flags & XML_REPORT_COMMENTS))
+ xml_skip_comment(ctx);
+ else
+ {
+ xml_push_comment(ctx);
+ PULL_STATE(COMMENT, PROLOG_COMMENT);
+ xml_pop_comment(ctx);
+ }
+ else
+ {
+ /* DocTypeDecl */
+ xml_unget_char(ctx);
+ xml_parse_doctype_decl(ctx);
+ PULL(DOCTYPE_DECL);
+ if (ctx->flags & XML_HAS_DTD)
+ if (ctx->flags & XML_PARSE_DTD)
+ {
+ xml_dtd_init(ctx);
+ if (ctx->h_dtd_start)
+ ctx->h_dtd_start(ctx);
+ if (ctx->flags & XML_HAS_INTERNAL_SUBSET)
+ {
+ xml_parse_subset(ctx, 0);
+ xml_dec(ctx);
+ }
+ if (ctx->flags & XML_HAS_EXTERNAL_SUBSET)
+ {
+ struct xml_dtd_entity ent = {
+ .system_id = ctx->system_id,
+ .public_id = ctx->public_id,
+ };
+ xml_parse_white(ctx, 0);
+ xml_parse_char(ctx, '>');
+ xml_unget_char(ctx);
+ ASSERT(ctx->h_resolve_entity);
+ ctx->h_resolve_entity(ctx, &ent);
+ ctx->flags |= XML_SRC_EXPECTED_DECL;
+ xml_parse_subset(ctx, 1);
+ xml_unget_char(ctx);;
+ }
+ if (ctx->h_dtd_end)
+ ctx->h_dtd_end(ctx);
+ }
+ else if (ctx->flags & XML_HAS_INTERNAL_SUBSET)
+ xml_skip_internal_subset(ctx);
+ xml_parse_white(ctx, 0);
+ xml_parse_char(ctx, '>');
+ xml_dec(ctx);
+ }
+ }
+
+ case XML_STATE_CHARS:
+
+ while (1)
+ {
+ if (xml_peek_char(ctx) != '<')
+ {
+ /* CharData */
+ xml_append_chars(ctx);
+ continue;
+ }
+ else
+ xml_skip_char(ctx);
+ xml_inc(ctx);
+first_tag:
+
+ if ((c = xml_get_char(ctx)) == '?')
+ {
+ /* PI */
+ if (!(ctx->flags & (XML_REPORT_PIS | XML_ALLOC_PIS)))
+ xml_skip_pi(ctx);
+ else
+ {
+ if (xml_flush_chars(ctx))
+ {
+ PULL_STATE(CHARS, CHARS_BEFORE_PI);
+ xml_pop_chars(ctx);
+ }
+ xml_push_pi(ctx);
+ PULL(PI);
+ xml_pop_pi(ctx);
+ }
+ }
+
+ else if (c == '!')
+ if ((c = xml_get_char(ctx)) == '-')
+ {
+ /* Comment */
+ if (!(ctx->flags & (XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS)))
+ xml_skip_comment(ctx);
+ else
+ {
+ if (xml_flush_chars(ctx))
+ {
+ PULL_STATE(CHARS, CHARS_BEFORE_COMMENT);
+ xml_pop_chars(ctx);
+ }
+ xml_push_comment(ctx);
+ PULL(COMMENT);
+ xml_pop_comment(ctx);
+ }
+ }
+ else if (c == '[')
+ {
+ /* CDATA */
+ xml_append_cdata(ctx);
+ }
+ else
+ xml_fatal(ctx, "Unexpected character after '<!'");
+
+ else if (c != '/')
+ {
+ /* STag | EmptyElemTag */
+ xml_unget_char(ctx);
+ if (xml_flush_chars(ctx))
+ {
+ PULL_STATE(CHARS, CHARS_BEFORE_STAG);
+ xml_pop_chars(ctx);
+ }
+
+ xml_push_element(ctx);
+ PULL(STAG);
+ if (ctx->flags & XML_EMPTY_ELEM_TAG)
+ goto pop_element;
+ }
+
+ else
+ {
+ /* ETag */
+ if (xml_flush_chars(ctx))
+ {
+ PULL_STATE(CHARS, CHARS_BEFORE_ETAG);
+ xml_pop_chars(ctx);
+ }
+
+ xml_parse_etag(ctx);
+pop_element:
+ PULL(ETAG);
+ xml_pop_element(ctx);
+ if (!ctx->node)
+ goto epilog;
+ }
+ }
+
+epilog:
+ /* Misc* */
+ TRACE(ctx, "entering epilog");
+ while (1)
+ {
+ /* Epilog whitespace is the only place, where a valid document can reach EOF */
+ if (setjmp(throw_buf))
+ if (ctx->err_code == XML_ERR_EOF)
+ {
+ TRACE(ctx, "reached EOF");
+ ctx->state = XML_STATE_EOF;
+ if (ctx->h_document_end)
+ ctx->h_document_end(ctx);
+ case XML_STATE_EOF:
+ ctx->err_code = 0;
+ ctx->err_msg = NULL;
+ return XML_STATE_EOF;
+ }
+ else
+ goto error;
+ xml_parse_white(ctx, 0);
+ /* From here on an EOF is an error again */
+ if (setjmp(throw_buf))
+ goto error;
+
+ /* Misc */
+ xml_parse_char(ctx, '<');
+ xml_inc(ctx);
+ if ((c = xml_get_char(ctx)) == '?')
+ /* Processing instruction */
+ if (!(ctx->flags & XML_REPORT_PIS))
+ xml_skip_pi(ctx);
+ else
+ {
+ xml_push_pi(ctx);
+ PULL_STATE(PI, EPILOG_PI);
+ xml_pop_pi(ctx);
+ }
+ else if (c == '!')
+ {
+ xml_parse_char(ctx, '-');
+ /* Comment */
+ if (!(ctx->flags & XML_REPORT_COMMENTS))
+ xml_skip_comment(ctx);
+ else
+ {
+ xml_push_comment(ctx);
+ PULL_STATE(COMMENT, EPILOG_COMMENT);
+ xml_pop_comment(ctx);
+ }
+ }
+ else
+ xml_fatal(ctx, "Syntax error in the epilog");
+ }
+
+ }
+ ASSERT(0);
+}
+
+/*
+ * Run xml_next() with ctx->pull temporarily replaced by `pull`.
+ * The previous mask is put back before the result of xml_next() is returned.
+ */
+uns
+xml_next_state(struct xml_context *ctx, uns pull)
+{
+  uns old_pull = ctx->pull;
+  ctx->pull = pull;
+  uns state = xml_next(ctx);
+  ctx->pull = old_pull;
+  return state;
+}
+
+/*
+ * Skip the rest of the element whose start tag has just been reported.
+ * Must be called in XML_STATE_STAG. xml_next() is called repeatedly, pulling only
+ * end tags, until it returns zero or ctx->node is the starting element again.
+ * Returns the last result of xml_next(); ctx->pull is preserved.
+ */
+uns
+xml_skip_element(struct xml_context *ctx)
+{
+  ASSERT(ctx->state == XML_STATE_STAG);
+  struct xml_node *start = ctx->node;
+  uns old_pull = ctx->pull;
+  uns res;
+  ctx->pull = XML_PULL_ETAG;
+  do
+    res = xml_next(ctx);
+  while (res && ctx->node != start);
+  ctx->pull = old_pull;
+  return res;
+}
+
+/*
+ * Parse the whole input with an empty pull mask, i.e. relying on the handlers only.
+ * Returns ctx->err_code as left by the last xml_next() call.
+ */
+uns
+xml_parse(struct xml_context *ctx)
+{
+ /* This cycle should run only once unless the user overrides the value of ctx->pull in a SAX handler */
+ do
+ {
+ ctx->pull = 0;
+ }
+ while (xml_next(ctx));
+ return ctx->err_code;
+}
+
+/*
+ * Concatenate the texts of all XML_NODE_CHARS sons of the element `node` into
+ * a single NUL-terminated string allocated from `pool`. Only direct sons are used.
+ */
+char *
+xml_merge_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool)
+{
+  ASSERT(node->type == XML_NODE_ELEM);
+  char *dst = mp_start_noalign(pool, 1);
+  XML_NODE_FOR_EACH(son, node)
+    {
+      if (son->type == XML_NODE_CHARS)
+        {
+          /* Always keep one extra byte for the final terminator */
+          dst = mp_spread(pool, dst, son->len + 1);
+          memcpy(dst, son->text, son->len);
+          dst += son->len;
+        }
+    }
+  *dst++ = 0;
+  return mp_end(pool, dst);
+}
+
+/*
+ * Recursive helper of xml_merge_dom_chars(): append the texts of all XML_NODE_CHARS
+ * nodes in the subtree of `node` to the growing buffer in `pool`.
+ * `p` is the current write position; the new write position is returned.
+ */
+static char *
+xml_append_dom_chars(char *p, struct mempool *pool, struct xml_node *node)
+{
+ XML_NODE_FOR_EACH(son, node)
+ if (son->type == XML_NODE_CHARS)
+ {
+ /* One extra byte is kept for the terminator added by the caller */
+ p = mp_spread(pool, p, son->len + 1);
+ memcpy(p, son->text, son->len);
+ p += son->len;
+ }
+ else if (son->type == XML_NODE_ELEM)
+ p = xml_append_dom_chars(p, pool, son);
+ return p;
+}
+
+/*
+ * Concatenate the texts of all XML_NODE_CHARS nodes in the whole subtree of the
+ * element `node` into a single NUL-terminated string allocated from `pool`.
+ */
+char *
+xml_merge_dom_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool)
+{
+ ASSERT(node->type == XML_NODE_ELEM);
+ char *p = mp_start_noalign(pool, 1);
+ p = xml_append_dom_chars(p, pool, node);
+ *p++ = 0;
+ return mp_end(pool, p);
+}
--- /dev/null
+/*
+ * Sherlock Library -- A simple XML parser
+ *
+ * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
+ *
+ * This software may be freely distributed and used according to the terms
+ * of the GNU Lesser General Public License.
+ */
+
+#undef LOCAL_DEBUG
+
+#include "sherlock/sherlock.h"
+#include "sherlock/xml/xml.h"
+#include "sherlock/xml/dtd.h"
+#include "sherlock/xml/internals.h"
+#include "ucw/unicode.h"
+#include "ucw/ff-unicode.h"
+#include "charset/charconv.h"
+#include "charset/fb-charconv.h"
+
+/*** Character categorization ***/
+
+#include "obj/sherlock/xml/unicat.c"
+
+/*
+ * Select the character category masks stored in the context according to
+ * whether the XML_VERSION_1_1 flag is set (XML 1.1) or not (XML 1.0).
+ */
+static void
+xml_init_cats(struct xml_context *ctx)
+{
+  if (ctx->flags & XML_VERSION_1_1)
+    {
+      ctx->cat_chars = XML_CHAR_VALID_1_1;
+      ctx->cat_unrestricted = XML_CHAR_UNRESTRICTED_1_1;
+      ctx->cat_new_line = XML_CHAR_NEW_LINE_1_1;
+      ctx->cat_name = XML_CHAR_NAME_1_1;
+      ctx->cat_sname = XML_CHAR_SNAME_1_1;
+    }
+  else
+    {
+      /* XML 1.0 has no separate set of restricted characters */
+      ctx->cat_chars = XML_CHAR_VALID_1_0;
+      ctx->cat_unrestricted = XML_CHAR_VALID_1_0;
+      ctx->cat_new_line = XML_CHAR_NEW_LINE_1_0;
+      ctx->cat_name = XML_CHAR_NAME_1_0;
+      ctx->cat_sname = XML_CHAR_SNAME_1_0;
+    }
+}
+
+/*** Reading of document/external entities ***/
+
+/* Record the "Unexpected EOF" condition (XML_ERR_EOF) in the context and raise it via xml_throw(). */
+static void NONRET
+xml_eof(struct xml_context *ctx)
+{
+  ctx->err_code = XML_ERR_EOF;
+  ctx->err_msg = "Unexpected EOF";
+  xml_throw(ctx);
+}
+
+/* Report the fatal error "Entity is not nested correctly"; declared NONRET, i.e. it does not return. */
+void NONRET
+xml_fatal_nested(struct xml_context *ctx)
+{
+ xml_fatal(ctx, "Entity is not nested correctly");
+}
+
+/*
+ * Append the character @c followed by its category (xml_char_cat(c)) to the
+ * buffer and advance *@bstop by the two stored words.
+ */
+static inline void
+xml_add_char(u32 **bstop, uns c)
+{
+  u32 *dst = *bstop;
+  dst[0] = c;
+  dst[1] = xml_char_cat(c);
+  *bstop = dst + 2;
+}
+
+/*
+ * Push a new, zero-initialized input source on top of the source stack.
+ * The buffer position of the previous source (if any) and the current depth
+ * are saved so that xml_pop_source() can restore them; the per-source flags
+ * are cleared and the read buffer is reset to the empty buffer of the new source.
+ * Returns the new source.
+ */
+struct xml_source *
+xml_push_source(struct xml_context *ctx)
+{
+  xml_push(ctx);
+
+  /* Remember where we stopped reading the enclosing source */
+  struct xml_source *parent = ctx->src;
+  if (parent)
+    {
+      parent->bptr = ctx->bptr;
+      parent->bstop = ctx->bstop;
+    }
+
+  struct xml_source *fresh = mp_alloc_zero(ctx->stack, sizeof(*fresh));
+  fresh->next = parent;
+  fresh->saved_depth = ctx->depth;
+
+  ctx->src = fresh;
+  ctx->flags &= ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_DOCUMENT);
+  ctx->bptr = fresh->buf;
+  ctx->bstop = fresh->buf;
+  ctx->depth = 0;
+  return fresh;
+}
+
+/*
+ * Push a new source reading from the fastbuf @fb and return it.
+ * The fastbuf is closed by xml_close_source() when the source is released.
+ */
+struct xml_source *
+xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb)
+{
+ struct xml_source *src = xml_push_source(ctx);
+ src->fb = fb;
+ return src;
+}
+
+/*
+ * Close the fastbuf of a source and, if a charset-converting wrapper was
+ * installed by xml_init_charconv(), also the original wrapped fastbuf.
+ */
+static void
+xml_close_source(struct xml_source *src)
+{
+ bclose(src->fb);
+ if (src->wrapped_fb)
+ bclose(src->wrapped_fb);
+}
+
+/*
+ * Leave the current source: close it, restore the saved depth and the buffer
+ * position of the enclosing source. It is a fatal error when the source ends
+ * at non-zero depth or when there is no source at all. Popping the last
+ * source raises the "Unexpected EOF" condition via xml_eof().
+ */
+static void
+xml_pop_source(struct xml_context *ctx)
+{
+ TRACE(ctx, "pop_source");
+ if (unlikely(ctx->depth != 0))
+ xml_fatal(ctx, "Unexpected end of entity");
+ struct xml_source *src = ctx->src;
+ if (!src)
+ xml_fatal(ctx, "Undefined source");
+ xml_close_source(src);
+ ctx->depth = src->saved_depth;
+ ctx->src = src = src->next;
+ if (src)
+ {
+ ctx->bptr = src->bptr;
+ ctx->bstop = src->bstop;
+ }
+ xml_pop(ctx);
+ /* No enclosing source left -> end of the whole input */
+ if (unlikely(!src))
+ xml_eof(ctx);
+}
+
+/* Unlink and close every source that is still on the source stack. */
+void
+xml_sources_cleanup(struct xml_context *ctx)
+{
+  for (struct xml_source *s = ctx->src; s; s = ctx->src)
+    {
+      ctx->src = s->next;
+      xml_close_source(s);
+    }
+}
+
+static void xml_refill_utf8(struct xml_context *ctx);
+
+/*
+ * Entity resolver that only reports an error saying that external entities
+ * are unsupported; it does not push any source.
+ */
+void
+xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent UNUSED)
+{
+ xml_error(ctx, "References to external entities are not supported");
+}
+
+/*
+ * Start reading the replacement text of the entity @ent.
+ * Internal entities are read from ent->text through a new source with an
+ * in-memory fastbuf. External entities are delegated to the h_resolve_entity
+ * hook; afterwards the current source is expected to begin with a declaration
+ * (XML_SRC_EXPECTED_DECL). In both cases the source is set up for UTF-8 refills.
+ */
+void
+xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent)
+{
+  TRACE(ctx, "xml_push_entity");
+  struct xml_source *src;
+  if (!(ent->flags & XML_DTD_ENTITY_EXTERNAL))
+    {
+      src = xml_push_source(ctx);
+      src->fb = &src->wrap_fb;
+      fbbuf_init_read(src->fb, ent->text, strlen(ent->text), 0);
+    }
+  else
+    {
+      ASSERT(ctx->h_resolve_entity);
+      ctx->h_resolve_entity(ctx, ent);
+      ctx->flags |= XML_SRC_EXPECTED_DECL;
+      src = ctx->src;
+    }
+  src->refill = xml_refill_utf8;
+  src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line;
+  src->refill_cat2 = ctx->cat_new_line;
+}
+
+/*
+ * Report a character rejected by the reader: ~1U stands for a corrupted
+ * encoding, anything else is reported as a restricted code point.
+ * Always returns UNI_REPLACEMENT to be stored instead of the character.
+ */
+static uns
+xml_error_restricted(struct xml_context *ctx, uns c)
+{
+  if (c != ~1U)
+    xml_error(ctx, "Restricted char U+%04X", c);
+  else
+    xml_error(ctx, "Corrupted encoding");
+  return UNI_REPLACEMENT;
+}
+
+void xml_parse_decl(struct xml_context *ctx);
+
+/*
+ * REFILL(ctx, func, params...) -- body shared by all refill functions.
+ *
+ * Reads characters with func(fb, params) from the fastbuf of the current
+ * source and appends them to the source buffer as pairs of u32 words
+ * (character, category). If the buffer has been consumed completely
+ * (bptr == bstop), it is restarted from the beginning first.
+ *
+ * - Characters matching refill_cat1 are stored as they are.
+ * - Characters matching refill_cat2 are line breaks: each is stored as a single
+ *   0xa and the row counter is incremented; a 0xa/0x85 immediately following
+ *   a 0xd is dropped. The pending_0xd flag carries a trailing 0xd over to the
+ *   next refill.
+ * - A '>' that does not match refill_cat1 is stored and ends the refill.
+ * - Any other character except ~0U is reported by xml_error_restricted()
+ *   and its return value is stored instead.
+ * - ~0U means end of input: XML_SRC_EOF is set and the refill ends.
+ *
+ * The loop also ends when the buffer is full.
+ */
+#define REFILL(ctx, func, params...) \
+ struct xml_source *src = ctx->src; \
+ struct fastbuf *fb = src->fb; \
+ if (ctx->bptr == ctx->bstop) \
+ ctx->bptr = ctx->bstop = src->buf; \
+ uns c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \
+ u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \
+ *last_0xd = src->pending_0xd ? bstop : NULL; \
+ do \
+ { \
+ c = func(fb, ##params); \
+ uns t = xml_char_cat(c); \
+ if (t & t1) \
+ /* Typical branch */ \
+ *bstop++ = c, *bstop++ = t; \
+ else if (t & t2) \
+ { \
+ /* New line */ \
+ /* XML 1.0: 0xA | 0xD | 0xD 0xA */ \
+ /* XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ \
+ if (c == 0xd) \
+ last_0xd = bstop + 2; \
+ else if (c != 0x2028 && last_0xd == bstop) \
+ { \
+ last_0xd = NULL; \
+ continue; \
+ } \
+ xml_add_char(&bstop, 0xa), row++; \
+ } \
+ else if (c == '>') \
+ { \
+ /* Used only in XML/TextDecl to switch the encoding */ \
+ *bstop++ = c, *bstop++ = t; \
+ break; \
+ } \
+ else if (~c) \
+ /* Restricted character */ \
+ xml_add_char(&bstop, xml_error_restricted(ctx, c)); \
+ else \
+ { \
+ /* EOF */ \
+ ctx->flags |= XML_SRC_EOF; \
+ break; \
+ } \
+ } \
+ while (bstop < bend); \
+ src->pending_0xd = (last_0xd == bstop); \
+ ctx->bstop = bstop; \
+ src->row = row;
+
+/* Refill the buffer from a UTF-8 stream; undecodable input is passed on as ~1U. */
+static void
+xml_refill_utf8(struct xml_context *ctx)
+{
+ REFILL(ctx, bget_utf8_repl, ~1U);
+}
+
+/* Refill the buffer from a little-endian UTF-16 stream; undecodable input is passed on as ~1U. */
+static void
+xml_refill_utf16_le(struct xml_context *ctx)
+{
+ REFILL(ctx, bget_utf16_le_repl, ~1U);
+}
+
+/* Refill the buffer from a big-endian UTF-16 stream; undecodable input is passed on as ~1U. */
+static void
+xml_refill_utf16_be(struct xml_context *ctx)
+{
+ REFILL(ctx, bget_utf16_be_repl, ~1U);
+}
+
+#undef REFILL
+
+/*
+ * Make at least one character available in the read buffer.
+ * Depending on the state of the current source this pops an exhausted
+ * source, parses a pending XML/text declaration, or calls the refill
+ * function of the source; the step is repeated while the buffer is empty.
+ */
+void
+xml_refill(struct xml_context *ctx)
+{
+  do
+    {
+      uns flags = ctx->flags;
+      if (flags & XML_SRC_EOF)
+        xml_pop_source(ctx);
+      else if (flags & XML_SRC_EXPECTED_DECL)
+        xml_parse_decl(ctx);
+      else
+        {
+          ctx->src->refill(ctx);
+          TRACE(ctx, "refilled %u characters", (uns)((ctx->bstop - ctx->bptr) / 2));
+        }
+    }
+  while (ctx->bptr == ctx->bstop);
+}
+
+/*
+ * Compute the row number of the current read position. src->row counts the
+ * line breaks stored by the refill functions, so the line breaks still
+ * waiting unread in the buffer are subtracted again.
+ */
+static uns
+xml_source_row(struct xml_context *ctx, struct xml_source *src)
+{
+  uns row = src->row;
+  u32 *p = ctx->bstop;
+  /* The buffer holds (character, category) pairs; p[-1] is the category of the pair ending at p */
+  while (p != ctx->bptr)
+    {
+      if (p[-1] & src->refill_cat2)
+        row--;
+      p -= 2;
+    }
+  return row + 1;
+}
+
+/* Return the row of the current position in the current source, or 0 when there is no source. */
+uns
+xml_row(struct xml_context *ctx)
+{
+  if (!ctx->src)
+    return 0;
+  return xml_source_row(ctx, ctx->src);
+}
+
+/* Document/external entity header */
+
+/*
+ * Parse a quoted encoding name and return it as a zero-terminated string
+ * allocated from ctx->pool (without the quotes). A character outside of the
+ * allowed categories is a fatal error.
+ */
+static char *
+xml_parse_encoding_name(struct xml_context *ctx)
+{
+ /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" */
+ char *p = mp_start_noalign(ctx->pool, 1);
+ uns q = xml_parse_quote(ctx);
+ /* The first character must belong to the stricter start category */
+ if (unlikely(!(xml_get_cat(ctx) & XML_CHAR_ENC_SNAME)))
+ xml_fatal(ctx, "Invalid character in the encoding name");
+ while (1)
+ {
+ /* Room for the character just accepted and for the final terminator */
+ p = mp_spread(ctx->pool, p, 2);
+ *p++ = xml_last_char(ctx);
+ /* The closing quote (the same as the opening one) ends the name */
+ if (xml_get_char(ctx) == q)
+ break;
+ if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME)))
+ xml_fatal(ctx, "Invalid character in the encoding name");
+ }
+ *p++ = 0;
+ return mp_end(ctx->pool, p);
+}
+
+/*
+ * Wrap the fastbuf of the current source by a converter from the charset @cs
+ * to UTF-8. The original fastbuf is remembered in src->wrapped_fb, so that
+ * xml_close_source() can close both.
+ */
+static void
+xml_init_charconv(struct xml_context *ctx, int cs)
+{
+ // XXX: with a direct access to libcharset tables could be faster
+ struct xml_source *src = ctx->src;
+ TRACE(ctx, "wrapping charset %s", charset_name(cs));
+ src->wrapped_fb = src->fb;
+ src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8);
+}
+
+/*
+ * Parse the header of the current source.
+ *
+ * The function selects the initial encoding (the one supplied in
+ * src->fb_encoding, or a guess based on the first byte of the stream),
+ * skips the byte order mark and parses the XMLDecl of the document entity
+ * (XML_SRC_DOCUMENT set) or the optional TextDecl of any other entity.
+ * Finally it switches the reader to the declared encoding (if possible)
+ * and sets up the character categories used by the refill functions.
+ *
+ * During the parsing, refills are interrupted after each '>', so that no
+ * characters following the declaration are decoded with a wrong encoding
+ * or checked against wrong XML version rules.
+ */
+void
+xml_parse_decl(struct xml_context *ctx)
+{
+  TRACE(ctx, "xml_parse_decl");
+  struct xml_source *src = ctx->src;
+  ctx->flags &= ~XML_SRC_EXPECTED_DECL;
+  uns doc = ctx->flags & XML_SRC_DOCUMENT;
+
+  /* Setup valid Unicode ranges and force the reader to abort refill() after each '>', where we can switch encoding or XML version */
+  if (doc)
+    xml_init_cats(ctx);
+  src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line & ~XML_CHAR_GT;
+  src->refill_cat2 = ctx->cat_new_line;
+
+  /* Initialize the supplied charset (if any) or try to guess it */
+  char *expected_encoding = src->expected_encoding;
+  src->refill = xml_refill_utf8;
+  int bom = bpeekc(src->fb);
+  if (bom < 0)
+    ctx->flags |= XML_SRC_EOF;
+  if (!src->fb_encoding)
+    {
+      /* The first byte of a UTF-16 byte order mark reveals the endianity */
+      if (bom == 0xfe)
+        src->refill = xml_refill_utf16_be;
+      else if (bom == 0xff)
+        src->refill = xml_refill_utf16_le;
+    }
+  else
+    {
+      int cs = find_charset_by_name(src->fb_encoding);
+      if (cs == CONV_CHARSET_UTF8)
+        {}
+      else if (cs >= 0)
+        {
+          xml_init_charconv(ctx, cs);
+          bom = 0;
+        }
+      /* strcasecmp() returns zero for equal strings */
+      else if (!strcasecmp(src->fb_encoding, "UTF-16"))
+        {
+          src->refill = xml_refill_utf16_be;
+          if (bom == 0xff)
+            src->refill = xml_refill_utf16_le;
+        }
+      else if (!strcasecmp(src->fb_encoding, "UTF-16BE"))
+        src->refill = xml_refill_utf16_be;
+      else if (!strcasecmp(src->fb_encoding, "UTF-16LE"))
+        src->refill = xml_refill_utf16_le;
+      else
+        {
+          xml_error(ctx, "Unknown encoding '%s'", src->fb_encoding);
+          expected_encoding = NULL;
+        }
+    }
+  uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be;
+  if (utf16)
+    src->fb_encoding = (src->refill == xml_refill_utf16_be) ? "UTF-16BE" : "UTF-16LE";
+  if (!expected_encoding)
+    expected_encoding = src->fb_encoding;
+  if (bom > 0 && xml_peek_char(ctx) == 0xfeff)
+    xml_skip_char(ctx);
+  else if (utf16)
+    xml_error(ctx, "Missing or corrupted BOM");
+  TRACE(ctx, "Initial encoding=%s", src->fb_encoding ? : "?");
+
+  /* Look ahead for presence of XMLDecl or optional TextDecl */
+  if (!(ctx->flags & XML_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf))
+    xml_refill(ctx);
+  u32 *bptr = ctx->bptr;
+  /* The buffer holds (character, category) pairs: "<?xml" followed by a white space takes 12 words */
+  uns have_decl = (12 <= ctx->bstop - ctx->bptr && (bptr[11] & XML_CHAR_WHITE) &&
+    bptr[0] == '<' && bptr[2] == '?' && (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L');
+  if (!have_decl)
+    {
+      if (doc)
+        xml_fatal(ctx, "Missing or corrupted XML header");
+      /* Test the local expected_encoding: src->expected_encoding may be NULL even when the local one is set */
+      else if (expected_encoding && strcasecmp(expected_encoding, "UTF-8") && !utf16)
+        xml_error(ctx, "Missing or corrupted entity header");
+      goto exit;
+    }
+  ctx->bptr = bptr + 12;
+  xml_parse_white(ctx, 0);
+
+  /* Parse version string (mandatory in XMLDecl, optional in TextDecl) */
+  if (xml_peek_char(ctx) == 'v')
+    {
+      xml_parse_seq(ctx, "version");
+      xml_parse_eq(ctx);
+      char *version = xml_parse_pubid_literal(ctx, ctx->pool);
+      TRACE(ctx, "version=%s", version);
+      uns v = 0;
+      if (!strcmp(version, "1.1"))
+        v = XML_VERSION_1_1;
+      else if (strcmp(version, "1.0"))
+        {
+          xml_error(ctx, "Unknown XML version string '%s'", version);
+          version = "1.0";
+        }
+      if (doc)
+        {
+          ctx->version_str = version;
+          ctx->flags |= v;
+        }
+      else if (v > (ctx->flags & XML_VERSION_1_1))
+        xml_error(ctx, "XML 1.1 external entity included from XML 1.0 document");
+      if (!xml_parse_white(ctx, !doc))
+        goto end;
+    }
+  else if (doc)
+    {
+      xml_error(ctx, "Expected XML version");
+      ctx->version_str = "1.0";
+    }
+
+  /* Parse encoding string (optional in XMLDecl, mandatory in TextDecl) */
+  if (xml_peek_char(ctx) == 'e')
+    {
+      xml_parse_seq(ctx, "encoding");
+      xml_parse_eq(ctx);
+      src->decl_encoding = xml_parse_encoding_name(ctx);
+      TRACE(ctx, "encoding=%s", src->decl_encoding);
+      if (!xml_parse_white(ctx, 0))
+        goto end;
+    }
+  else if (!doc)
+    xml_error(ctx, "Expected XML encoding");
+
+  /* Parse whether the document is standalone (optional in XMLDecl) */
+  if (doc && xml_peek_char(ctx) == 's')
+    {
+      xml_parse_seq(ctx, "standalone");
+      xml_parse_eq(ctx);
+      uns c = xml_parse_quote(ctx);
+      if (ctx->standalone = (xml_peek_char(ctx) == 'y'))
+        xml_parse_seq(ctx, "yes");
+      else
+        xml_parse_seq(ctx, "no");
+      xml_parse_char(ctx, c);
+      TRACE(ctx, "standalone=%d", ctx->standalone);
+      xml_parse_white(ctx, 0);
+    }
+end:
+  xml_parse_seq(ctx, "?>");
+
+  /* Switch to the final encoding */
+  if (src->decl_encoding)
+    {
+      int cs = find_charset_by_name(src->decl_encoding);
+      if (cs < 0 && !expected_encoding)
+        xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding);
+      else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8)
+        {
+          xml_init_charconv(ctx, cs);
+          src->fb_encoding = src->decl_encoding;
+        }
+      /* A declared UTF-16 variant is accepted if it does not contradict the detected endianity */
+      else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 ||
+        !(!strcasecmp(src->decl_encoding, "UTF-16") ||
+         (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) ||
+         (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE")))))
+        xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding);
+    }
+  if (!src->fb_encoding)
+    src->fb_encoding = "UTF-8";
+  TRACE(ctx, "Final encoding=%s", src->fb_encoding);
+
+exit:
+  /* Update valid Unicode ranges */
+  if (doc)
+    xml_init_cats(ctx);
+  src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line;
+  src->refill_cat2 = ctx->cat_new_line;
+}
--- /dev/null
+#!/usr/bin/perl
+#
+# UCW Library -- Character map for the XML parser
+#
+# (c) 2007 Pavel Charvat <pchar@ucw.cz>
+#
+# This software may be freely distributed and used according to the terms
+# of the GNU Lesser General Public License.
+#
+
+# @cat  -- category bitmask of each code point below 0x10000
+# @lcat -- category bitmask of each of the 0x11 planes (indexed by code point >> 16)
+# %ids  -- category name => bit number (assigned by set())
+# %cls  -- distinct bitmask value => class number (assigned by find_cls())
+my @cat = ();
+my @lcat = ();
+my %ids = ();
+my %cls = ();
+for (my $i = 0; $i < 0x10000; $i++) { $cat[$i] = 0; }
+for (my $i = 0; $i < 0x11; $i++) { $lcat[$i] = 0; }
+
+my @white = (0x9, 0xA, 0xD, 0x20);
+my @base_char_1_0 = (
+ [0x0041,0x005A], [0x0061,0x007A], [0x00C0,0x00D6], [0x00D8,0x00F6], [0x00F8,0x00FF], [0x0100,0x0131],
+ [0x0134,0x013E], [0x0141,0x0148], [0x014A,0x017E], [0x0180,0x01C3], [0x01CD,0x01F0], [0x01F4,0x01F5],
+ [0x01FA,0x0217], [0x0250,0x02A8], [0x02BB,0x02C1], 0x0386, [0x0388,0x038A], 0x038C, [0x038E,0x03A1],
+ [0x03A3,0x03CE], [0x03D0,0x03D6], 0x03DA, 0x03DC, 0x03DE, 0x03E0, [0x03E2,0x03F3], [0x0401,0x040C],
+ [0x040E,0x044F], [0x0451,0x045C], [0x045E,0x0481], [0x0490,0x04C4], [0x04C7,0x04C8], [0x04CB,0x04CC],
+ [0x04D0,0x04EB], [0x04EE,0x04F5], [0x04F8,0x04F9], [0x0531,0x0556], 0x0559, [0x0561,0x0586], [0x05D0,0x05EA],
+ [0x05F0,0x05F2], [0x0621,0x063A], [0x0641,0x064A], [0x0671,0x06B7], [0x06BA,0x06BE], [0x06C0,0x06CE],
+ [0x06D0,0x06D3], 0x06D5, [0x06E5,0x06E6], [0x0905,0x0939], 0x093D, [0x0958,0x0961], [0x0985,0x098C],
+ [0x098F,0x0990], [0x0993,0x09A8], [0x09AA,0x09B0], 0x09B2, [0x09B6,0x09B9], [0x09DC,0x09DD], [0x09DF,0x09E1],
+ [0x09F0,0x09F1], [0x0A05,0x0A0A], [0x0A0F,0x0A10], [0x0A13,0x0A28], [0x0A2A,0x0A30], [0x0A32,0x0A33],
+ [0x0A35,0x0A36], [0x0A38,0x0A39], [0x0A59,0x0A5C], 0x0A5E, [0x0A72,0x0A74], [0x0A85,0x0A8B], 0x0A8D,
+ [0x0A8F,0x0A91], [0x0A93,0x0AA8], [0x0AAA,0x0AB0], [0x0AB2,0x0AB3], [0x0AB5,0x0AB9], 0x0ABD, 0x0AE0,
+ [0x0B05,0x0B0C], [0x0B0F,0x0B10], [0x0B13,0x0B28], [0x0B2A,0x0B30], [0x0B32,0x0B33], [0x0B36,0x0B39],
+ 0x0B3D, [0x0B5C,0x0B5D], [0x0B5F,0x0B61], [0x0B85,0x0B8A], [0x0B8E,0x0B90], [0x0B92,0x0B95], [0x0B99,0x0B9A],
+ 0x0B9C, [0x0B9E,0x0B9F], [0x0BA3,0x0BA4], [0x0BA8,0x0BAA], [0x0BAE,0x0BB5], [0x0BB7,0x0BB9], [0x0C05,0x0C0C],
+ [0x0C0E,0x0C10], [0x0C12,0x0C28], [0x0C2A,0x0C33], [0x0C35,0x0C39], [0x0C60,0x0C61], [0x0C85,0x0C8C],
+ [0x0C8E,0x0C90], [0x0C92,0x0CA8], [0x0CAA,0x0CB3], [0x0CB5,0x0CB9], 0x0CDE, [0x0CE0,0x0CE1], [0x0D05,0x0D0C],
+ [0x0D0E,0x0D10], [0x0D12,0x0D28], [0x0D2A,0x0D39], [0x0D60,0x0D61], [0x0E01,0x0E2E], 0x0E30, [0x0E32,0x0E33],
+ [0x0E40,0x0E45], [0x0E81,0x0E82], 0x0E84, [0x0E87,0x0E88], 0x0E8A, 0x0E8D, [0x0E94,0x0E97], [0x0E99,0x0E9F],
+ [0x0EA1,0x0EA3], 0x0EA5, 0x0EA7, [0x0EAA,0x0EAB], [0x0EAD,0x0EAE], 0x0EB0, [0x0EB2,0x0EB3], 0x0EBD,
+ [0x0EC0,0x0EC4], [0x0F40,0x0F47], [0x0F49,0x0F69], [0x10A0,0x10C5], [0x10D0,0x10F6], 0x1100, [0x1102,0x1103],
+ [0x1105,0x1107], 0x1109, [0x110B,0x110C], [0x110E,0x1112], 0x113C, 0x113E, 0x1140, 0x114C, 0x114E, 0x1150,
+ [0x1154,0x1155], 0x1159, [0x115F,0x1161], 0x1163, 0x1165, 0x1167, 0x1169, [0x116D,0x116E], [0x1172,0x1173],
+ 0x1175, 0x119E, 0x11A8, 0x11AB, [0x11AE,0x11AF], [0x11B7,0x11B8], 0x11BA, [0x11BC,0x11C2], 0x11EB, 0x11F0,
+ 0x11F9, [0x1E00,0x1E9B], [0x1EA0,0x1EF9], [0x1F00,0x1F15], [0x1F18,0x1F1D], [0x1F20,0x1F45], [0x1F48,0x1F4D],
+ [0x1F50,0x1F57], 0x1F59, 0x1F5B, 0x1F5D, [0x1F5F,0x1F7D], [0x1F80,0x1FB4], [0x1FB6,0x1FBC], 0x1FBE,
+ [0x1FC2,0x1FC4], [0x1FC6,0x1FCC], [0x1FD0,0x1FD3], [0x1FD6,0x1FDB], [0x1FE0,0x1FEC], [0x1FF2,0x1FF4],
+ [0x1FF6,0x1FFC], 0x2126, [0x212A,0x212B], 0x212E, [0x2180,0x2182], [0x3041,0x3094], [0x30A1,0x30FA],
+ [0x3105,0x312C], [0xAC00,0xD7A3]);
+my @ideographic_1_0 = ([0x4E00,0x9FA5], 0x3007, [0x3021,0x3029]);
+my @combining_char_1_0 = (
+ [0x0300,0x0345], [0x0360,0x0361], [0x0483,0x0486], [0x0591,0x05A1], [0x05A3,0x05B9], [0x05BB,0x05BD],
+ 0x05BF, [0x05C1,0x05C2], 0x05C4, [0x064B,0x0652], 0x0670, [0x06D6,0x06DC], [0x06DD,0x06DF], [0x06E0,0x06E4],
+ [0x06E7,0x06E8], [0x06EA,0x06ED], [0x0901,0x0903], 0x093C, [0x093E,0x094C], 0x094D, [0x0951,0x0954],
+ [0x0962,0x0963], [0x0981,0x0983], 0x09BC, 0x09BE, 0x09BF, [0x09C0,0x09C4], [0x09C7,0x09C8], [0x09CB,0x09CD],
+ 0x09D7, [0x09E2,0x09E3], 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, [0x0A40,0x0A42], [0x0A47,0x0A48], [0x0A4B,0x0A4D],
+ [0x0A70,0x0A71], [0x0A81,0x0A83], 0x0ABC, [0x0ABE,0x0AC5], [0x0AC7,0x0AC9], [0x0ACB,0x0ACD], [0x0B01,0x0B03],
+ 0x0B3C, [0x0B3E,0x0B43], [0x0B47,0x0B48], [0x0B4B,0x0B4D], [0x0B56,0x0B57], [0x0B82,0x0B83], [0x0BBE,0x0BC2],
+ [0x0BC6,0x0BC8], [0x0BCA,0x0BCD], 0x0BD7, [0x0C01,0x0C03], [0x0C3E,0x0C44], [0x0C46,0x0C48], [0x0C4A,0x0C4D],
+ [0x0C55,0x0C56], [0x0C82,0x0C83], [0x0CBE,0x0CC4], [0x0CC6,0x0CC8], [0x0CCA,0x0CCD], [0x0CD5,0x0CD6],
+ [0x0D02,0x0D03], [0x0D3E,0x0D43], [0x0D46,0x0D48], [0x0D4A,0x0D4D], 0x0D57, 0x0E31, [0x0E34,0x0E3A],
+ [0x0E47,0x0E4E], 0x0EB1, [0x0EB4,0x0EB9], [0x0EBB,0x0EBC], [0x0EC8,0x0ECD], [0x0F18,0x0F19], 0x0F35,
+ 0x0F37, 0x0F39, 0x0F3E, 0x0F3F, [0x0F71,0x0F84], [0x0F86,0x0F8B], [0x0F90,0x0F95], 0x0F97, [0x0F99,0x0FAD],
+ [0x0FB1,0x0FB7], 0x0FB9, [0x20D0,0x20DC], 0x20E1, [0x302A,0x302F], 0x3099, 0x309A);
+my @digit_1_0 = (
+ [0x0030,0x0039], [0x0660,0x0669], [0x06F0,0x06F9], [0x0966,0x096F], [0x09E6,0x09EF], [0x0A66,0x0A6F],
+ [0x0AE6,0x0AEF], [0x0B66,0x0B6F], [0x0BE7,0x0BEF], [0x0C66,0x0C6F], [0x0CE6,0x0CEF], [0x0D66,0x0D6F],
+ [0x0E50,0x0E59], [0x0ED0,0x0ED9], [0x0F20,0x0F29]);
+my @extender_1_0 = (
+ 0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005, [0x3031,0x3035], [0x309D,0x309E], [0x30FC,0x30FE]);
+my @sname_1_1 = (
+ "[:A-Z_a-z]", [0xC0,0xD6], [0xD8,0xF6], [0xF8,0x2FF], [0x370,0x37D], [0x37F,0x1FFF],
+ [0x200C,0x200D], [0x2070,0x218F], [0x2C00,0x2FEF], [0x3001,0xD7FF], [0xF900,0xFDCF], [0xFDF0,0xFFFD], [0x10000,0xEFFFF]);
+
+# Definitions of the character categories. Every set() call marks the listed
+# code points, [from,to] ranges and ASCII character classes with the bit of
+# the named category.
+set("WHITE", @white);
+set("NEW_LINE_1_0", 0xA, 0xD);
+set("NEW_LINE_1_1", 0xA, 0xD, 0x85, 0x2028);
+set("DIGIT", "[0-9]");
+set("XDIGIT", "[0-9a-fA-F]");
+set("VALID_1_0", @white, [0x20,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]);
+set("VALID_1_1", [0x1,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]);
+set("UNRESTRICTED_1_1", @white, [0x20,0x7E], 0x85, [0xA0,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]);
+# PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]   (note the ';')
+set("PUBID", 0x20, 0xD, 0xA, "[-a-zA-Z0-9'()+,./:=?;!*#@\$_%]");
+set("ENC_SNAME", "[a-zA-Z]");
+set("ENC_NAME", "[-a-zA-Z0-9._]");
+set("SNAME_1_0", "[_:]", @base_char_1_0, @ideographic_1_0);
+set("NAME_1_0", "[-_:.]", @base_char_1_0, @ideographic_1_0, @combining_char_1_0, @digit_1_0, @extender_1_0);
+set("SNAME_1_1", @sname_1_1);
+set("NAME_1_1", @sname_1_1, "[-.0-9]", 0xB7, [0x0300,0x036F], [0x203F,0x2040]);
+set("GT", "[>]");
+
+# Main program: the first argument is the header file to generate,
+# the second one is the C file with the tables.
+($ARGV[0] eq "" || $ARGV[1] eq "") && die("Invalid usage");
+find_cls();
+open(H, ">", $ARGV[0]) or die("Cannot create $ARGV[0]");
+open(C, ">", $ARGV[1]) or die("Cannot create $ARGV[1]");
+gen_enum();
+gen_tabs();
+close(H);
+close(C);
+
+# set($id, @items) -- add characters to the category $id.
+# The category gets the next free bit number on its first use. Each item is
+# either a [from,to] range of code points, a string starting with '[' which is
+# used as a regex matched against the characters 0..127, or a single code point.
+sub set {
+ my $id = shift;
+ $ids{$id} = scalar keys(%ids) if !defined($ids{$id});
+ my $mask = 1 << $ids{$id};
+ foreach my $i (@_) {
+ if (ref($i) eq "ARRAY") {
+ my $j = $i->[0];
+ # Code points below 0x10000 are marked one by one ...
+ for (; $j <= $i->[1] && $j < 0x10000; $j++) { $cat[$j] |= $mask; }
+ # ... the rest of the range is marked in steps of 0x10000, one @lcat entry each
+ for (; $j <= $i->[1]; $j += 0x10000) { $lcat[$j >> 16] |= $mask; }
+ }
+ elsif ($i =~ /^\[/) { for (my $j=0; $j < 128; $j++) { if (chr($j) =~ /$i/) { $cat[$j] |= $mask; } } }
+ else { $cat[$i] |= $mask; }
+ }
+}
+
+# Assign a class number to every distinct category bitmask found in @cat and
+# @lcat, numbered from 0 in the order of the first occurrence.
+sub find_cls {
+ foreach (my $i=0; $i<@cat; $i++) { $cls{$cat[$i]} = scalar keys(%cls) if !defined($cls{$cat[$i]}); }
+ foreach (my $i=0; $i<@lcat; $i++) { $cls{$lcat[$i]} = scalar keys(%cls) if !defined($cls{$lcat[$i]}); }
+}
+
+# Write "enum xml_char_type" to the header file. The value of XML_CHAR_<id>
+# has a bit set for each class whose category bitmask contains the bit of
+# the category <id>; the class number 0 is skipped, so it never gets a bit.
+sub gen_enum {
+ print H "enum xml_char_type {\n";
+ foreach my $id (sort keys %ids) {
+ my $mask = 0;
+ foreach my $i (keys %cls) {
+ $mask |= 1 << $cls{$i} if $cls{$i} && ($i & (1 << $ids{$id}));
+ }
+ printf H " XML_CHAR_%-20s = 0x%08x,\n", $id, $mask;
+ }
+ print H "};\n\n";
+}
+
+# Write the lookup tables:
+#   xml_char_tab2 -- for each of the 256 blocks of 256 code points below
+#                    0x10000, the offset of the block's data in xml_char_tab1
+#                    (blocks with identical contents share the same data)
+#   xml_char_tab1 -- class numbers of the characters of all distinct blocks
+#   xml_char_tab3 -- class number of each of the 0x11 planes (from @lcat)
+# The extern declarations go to the header file, the tables to the C file.
+sub gen_tabs {
+ my @tab = ();
+ my %hash = ();
+
+ print H "extern const byte xml_char_tab1[];\n";
+ print H "extern const uns xml_char_tab2[];\n";
+ print H "extern const byte xml_char_tab3[];\n";
+
+ print C "const uns xml_char_tab2[] = {\n ";
+ for (my $t=0; $t<256; $t++) {
+ my $i = $t * 256;
+ my @x = ();
+ for (my $j=0; $j<256; $j += 32) {
+ push @x, join(",", map($cls{$_}, @cat[$i+$j..$i+$j+31]));
+ }
+ my $sub = " " . join(",\n ", @x);
+ # The text of the block serves as the key for detection of duplicates
+ if (!defined($hash{$sub})) {
+ $hash{$sub} = 256 * scalar @tab;
+ push @tab, $sub;
+ }
+ printf C "0x%x", $hash{$sub};
+ # 16 offsets per line; the last one closes the table
+ print C ((~$t & 15) ? "," : ($t < 255) ? ",\n " : "\n};\n\n");
+ }
+
+ print C "const byte xml_char_tab1[] = {\n";
+ print C join(",\n\n", @tab);
+ print C "\n};\n\n";
+
+ my @l = ();
+ for (my $i=0; $i<0x11; $i++) {
+ push @l, sprintf("%d", $cls{$lcat[$i]});
+ }
+ print C "const byte xml_char_tab3[] = {" . join(",", @l) . "};\n";
+}
--- /dev/null
+/*
+ * Sherlock Library -- A simple XML parser
+ *
+ * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
+ *
+ * This software may be freely distributed and used according to the terms
+ * of the GNU Lesser General Public License.
+ */
+
+#include "sherlock/sherlock.h"
+#include "sherlock/xml/xml.h"
+#include "sherlock/xml/dtd.h"
+#include "ucw/getopt.h"
+#include "ucw/fastbuf.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+
+/*
+ * Codes returned by cf_getopt() for the options which have only a long form.
+ * They are numbered from 0x100 (presumably to stay clear of the character
+ * codes of the short options).
+ */
+enum {
+ WANT_FIRST = 0x100,
+ WANT_HIDE_ERRORS,
+ WANT_IGNORE_COMMENTS,
+ WANT_IGNORE_PIS,
+ WANT_REPORT_BLOCKS,
+ WANT_REPORT_IGNORABLE,
+ WANT_FILE_ENTITIES,
+};
+
+/* Command-line options accepted by main(): the short option letters and the table of long options */
+static char *shortopts = "spdt" CF_SHORT_OPTS;
+static struct option longopts[] = {
+ CF_LONG_OPTS
+ { "sax", 0, 0, 's' },
+ { "pull", 0, 0, 'p' },
+ { "dom", 0, 0, 't' },
+ { "dtd", 0, 0, 'd' },
+ { "hide-errors", 0, 0, WANT_HIDE_ERRORS },
+ { "ignore-comments", 0, 0, WANT_IGNORE_COMMENTS },
+ { "ignore-pis", 0, 0, WANT_IGNORE_PIS },
+ { "report-blocks", 0, 0, WANT_REPORT_BLOCKS },
+ { "report-ignorable", 0, 0, WANT_REPORT_IGNORABLE },
+ { "file-entities", 0, 0, WANT_FILE_ENTITIES },
+ { NULL, 0, 0, 0 }
+};
+
+/* Print the summary of command-line options to stderr and exit with status 1. */
+static void NONRET
+usage(void)
+{
+ fputs("\
+Usage: xml-test [options] < input.xml\n\
+\n\
+Options:\n"
+CF_USAGE
+"\
+-p, --pull Test PULL interface\n\
+-s, --sax Test SAX interface\n\
+-t, --dom Test DOM interface\n\
+-d, --dtd Enable parsing of DTD\n\
+ --hide-errors Hide warnings and error messages\n\
+ --ignore-comments Ignore comments\n\
+ --ignore-pis Ignore processing instructions\n\
+ --report-blocks Report blocks of characters and CDATA sections\n\
+ --report-ignorable Report ignorable whitespace\n\
+ --file-entities Resolve file external entities (not fully normative)\n\
+\n", stderr);
+ exit(1);
+}
+
+/* Counters of occurrences of the individual command-line options, filled in by main() */
+static uns want_sax;
+static uns want_pull;
+static uns want_dom;
+static uns want_parse_dtd;
+static uns want_hide_errors;
+static uns want_ignore_comments;
+static uns want_ignore_pis;
+static uns want_report_blocks;
+static uns want_report_ignorable;
+static uns want_file_entities;
+
+/* Output stream of the test program (opened on file descriptor 1 in main()) */
+static struct fastbuf *out;
+
+/* Return a printable name of the type of @node ("unknown" for unexpected types). */
+static char *
+node_type(struct xml_node *node)
+{
+  if (node->type == XML_NODE_ELEM)
+    return "element";
+  if (node->type == XML_NODE_COMMENT)
+    return "comment";
+  if (node->type == XML_NODE_PI)
+    return "pi";
+  if (node->type == XML_NODE_CHARS)
+    return "chars";
+  return "unknown";
+}
+
+/*
+ * Print the contents of @node to the output and finish the line:
+ * name and attributes of an element, target and text of a processing
+ * instruction, text of a comment or of character data.
+ */
+static void
+show_node(struct xml_node *node)
+{
+  switch (node->type)
+    {
+    case XML_NODE_ELEM:
+      bprintf(out, " <%s>", node->name);
+      XML_ATTR_FOR_EACH(a, node)
+        bprintf(out, " %s='%s'", a->name, a->val);
+      bputc(out, '\n');
+      return;
+    case XML_NODE_PI:
+      bprintf(out, " target=%s text='%s'\n", node->name, node->text);
+      return;
+    case XML_NODE_COMMENT:
+    case XML_NODE_CHARS:
+      /* Comments and character data are printed in the same way */
+      bprintf(out, " text='%s'\n", node->text);
+      return;
+    default:
+      bputc(out, '\n');
+    }
+}
+
+/*
+ * Recursively print the DOM subtree rooted at @node, one node per line,
+ * indented by @level steps. A NULL @node prints nothing.
+ */
+static void
+show_tree(struct xml_node *node, uns level)
+{
+  if (!node)
+    return;
+  bputs(out, "DOM: ");
+  uns indent = level;
+  while (indent--)
+    bputs(out, " ");
+  bputs(out, node_type(node));
+  show_node(node);
+  /* Only elements have sons */
+  if (node->type != XML_NODE_ELEM)
+    return;
+  XML_NODE_FOR_EACH(son, node)
+    show_tree(son, level + 1);
+}
+
+/*
+ * Handler of warnings and errors: prints the severity ("warn" for error codes
+ * below XML_ERR_ERROR, "error" otherwise), the current row and the message.
+ */
+static void
+h_error(struct xml_context *ctx)
+{
+  char *severity = (ctx->err_code < XML_ERR_ERROR) ? "warn" : "error";
+  uns row = xml_row(ctx);
+  bprintf(out, "SAX: %s at %u: %s\n", severity, row, ctx->err_msg);
+}
+
+/* SAX handler: log the start of the document. */
+static void
+h_document_start(struct xml_context *ctx UNUSED)
+{
+ bputs(out, "SAX: document_start\n");
+}
+
+/* SAX handler: log the end of the document. */
+static void
+h_document_end(struct xml_context *ctx UNUSED)
+{
+ bputs(out, "SAX: document_end\n");
+}
+
+/* SAX handler: log the XML version, the standalone flag and the encoding of the current source. */
+static void
+h_xml_decl(struct xml_context *ctx)
+{
+ bprintf(out, "SAX: xml_decl version=%s standalone=%d fb_encoding=%s\n", ctx->version_str, ctx->standalone, ctx->src->fb_encoding);
+}
+
+/*
+ * SAX handler: log the document type, its public and system identifiers
+ * (empty strings when missing) and whether the external and internal subset flags are set.
+ */
+static void
+h_doctype_decl(struct xml_context *ctx)
+{
+ bprintf(out, "SAX: doctype_decl type=%s public='%s' system='%s' extsub=%d intsub=%d\n",
+ ctx->doctype, ctx->public_id ? : "", ctx->system_id ? : "",
+ !!(ctx->flags & XML_HAS_EXTERNAL_SUBSET), !!(ctx->flags & XML_HAS_INTERNAL_SUBSET));
+}
+
+/* SAX handler: log a comment (the current node). */
+static void
+h_comment(struct xml_context *ctx)
+{
+ bputs(out, "SAX: comment");
+ show_node(ctx->node);
+}
+
+/* SAX handler: log a processing instruction (the current node). */
+static void
+h_pi(struct xml_context *ctx)
+{
+ bputs(out, "SAX: pi");
+ show_node(ctx->node);
+}
+
+/* SAX handler: log a start tag including the attributes of the current node. */
+static void
+h_stag(struct xml_context *ctx)
+{
+ bputs(out, "SAX: stag");
+ show_node(ctx->node);
+}
+
+/* SAX handler: log an end tag with the name of the current node. */
+static void
+h_etag(struct xml_context *ctx)
+{
+ bprintf(out, "SAX: etag </%s>\n", ctx->node->name);
+}
+
+/* SAX handler: log character data (the current node). */
+static void
+h_chars(struct xml_context *ctx)
+{
+ bputs(out, "SAX: chars");
+ show_node(ctx->node);
+}
+
+/* SAX handler: log a block of characters; @text is printed as a zero-terminated string, @len is not used. */
+static void
+h_block(struct xml_context *ctx UNUSED, char *text, uns len UNUSED)
+{
+ bprintf(out, "SAX: block text='%s'\n", text);
+}
+
+/* SAX handler: log a CDATA section; @text is printed as a zero-terminated string, @len is not used. */
+static void
+h_cdata(struct xml_context *ctx UNUSED, char *text, uns len UNUSED)
+{
+ bprintf(out, "SAX: cdata text='%s'\n", text);
+}
+
+/* SAX handler: log ignorable whitespace; @text is printed as a zero-terminated string, @len is not used. */
+static void
+h_ignorable(struct xml_context *ctx UNUSED, char *text, uns len UNUSED)
+{
+ bprintf(out, "SAX: ignorable text='%s'\n", text);
+}
+
+/* SAX handler: log the start of the DTD. */
+static void
+h_dtd_start(struct xml_context *ctx UNUSED)
+{
+ bputs(out, "SAX: dtd_start\n");
+}
+
+/* SAX handler: log the end of the DTD. */
+static void
+h_dtd_end(struct xml_context *ctx UNUSED)
+{
+ bputs(out, "SAX: dtd_end\n");
+}
+
+/*
+ * Entity resolver used with --file-entities: the system identifier of the
+ * entity is taken as a name of a local file, which is opened and pushed
+ * as a new source.
+ * NOTE(review): the result of bopen() is not checked here -- confirm that
+ * bopen() never returns NULL on failure.
+ */
+static void
+h_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *e)
+{
+ xml_push_fastbuf(ctx, bopen(e->system_id, O_RDONLY, 4096));
+}
+
+/*
+ * Test program of the XML parser: parses standard input and writes a log of
+ * the SAX events, PULL states and/or the resulting DOM tree (as selected by
+ * the command-line options) to standard output.
+ */
+int
+main(int argc, char **argv)
+{
+ int opt;
+ cf_def_file = NULL;
+ log_init(argv[0]);
+ /* Parse the command line; each option just increments its counter */
+ while ((opt = cf_getopt(argc, argv, shortopts, longopts, NULL)) >= 0)
+ switch (opt)
+ {
+ case 's':
+ want_sax++;
+ break;
+ case 'p':
+ want_pull++;
+ break;
+ case 't':
+ want_dom++;
+ break;
+ case 'd':
+ want_parse_dtd++;
+ break;
+ case WANT_HIDE_ERRORS:
+ want_hide_errors++;
+ break;
+ case WANT_IGNORE_COMMENTS:
+ want_ignore_comments++;
+ break;
+ case WANT_IGNORE_PIS:
+ want_ignore_pis++;
+ break;
+ case WANT_REPORT_BLOCKS:
+ want_report_blocks++;
+ break;
+ case WANT_REPORT_IGNORABLE:
+ want_report_ignorable++;
+ break;
+ case WANT_FILE_ENTITIES:
+ want_file_entities++;
+ break;
+ default:
+ usage();
+ }
+ /* No non-option arguments are accepted */
+ if (optind != argc)
+ usage();
+
+ /* Set up the parser context: handlers and flags according to the options */
+ out = bfdopen_shared(1, 4096);
+ struct xml_context ctx;
+ xml_init(&ctx);
+ if (!want_hide_errors)
+ ctx.h_warn = ctx.h_error = ctx.h_fatal = h_error;
+ if (want_sax)
+ {
+ ctx.h_document_start = h_document_start;
+ ctx.h_document_end = h_document_end;
+ ctx.h_xml_decl = h_xml_decl;
+ ctx.h_doctype_decl = h_doctype_decl;
+ ctx.h_comment = h_comment;
+ ctx.h_pi = h_pi;
+ ctx.h_stag = h_stag;
+ ctx.h_etag = h_etag;
+ ctx.h_chars = h_chars;
+ if (want_report_blocks)
+ {
+ ctx.h_block = h_block;
+ ctx.h_cdata = h_cdata;
+ }
+ if (want_report_ignorable)
+ ctx.h_ignorable = h_ignorable;
+ ctx.h_dtd_start = h_dtd_start;
+ ctx.h_dtd_end = h_dtd_end;
+ }
+ if (want_dom)
+ ctx.flags |= XML_ALLOC_ALL;
+ if (want_parse_dtd)
+ ctx.flags |= XML_PARSE_DTD;
+ if (want_ignore_comments)
+ ctx.flags &= ~(XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS);
+ if (want_ignore_pis)
+ ctx.flags &= ~(XML_REPORT_PIS | XML_ALLOC_PIS);
+ if (want_file_entities)
+ ctx.h_resolve_entity = h_resolve_entity;
+ /* The document is read from the standard input */
+ xml_push_fastbuf(&ctx, bfdopen_shared(0, 4096));
+ bputs(out, "PULL: start\n");
+ if (want_pull)
+ {
+ /* PULL interface: ask for all kinds of nodes and log each state returned by xml_next() */
+ ctx.pull = XML_PULL_CHARS | XML_PULL_STAG | XML_PULL_ETAG | XML_PULL_COMMENT | XML_PULL_PI;
+ uns state;
+ while (state = xml_next(&ctx))
+ switch (state)
+ {
+ case XML_STATE_CHARS:
+ bputs(out, "PULL: chars");
+ show_node(ctx.node);
+ break;
+ case XML_STATE_STAG:
+ bputs(out, "PULL: stag");
+ show_node(ctx.node);
+ break;
+ case XML_STATE_ETAG:
+ bprintf(out, "PULL: etag </%s>\n", ctx.node->name);
+ break;
+ case XML_STATE_COMMENT:
+ bputs(out, "PULL: comment");
+ show_node(ctx.node);
+ break;
+ case XML_STATE_PI:
+ bputs(out, "PULL: pi");
+ show_node(ctx.node);
+ break;
+ default:
+ bputs(out, "PULL: unknown\n");
+ break;
+ }
+ }
+ else
+ /* Otherwise parse the whole document in a single call */
+ xml_parse(&ctx);
+ /* Report the result; the DOM tree is printed only after a successful parse */
+ if (ctx.err_code)
+ bprintf(out, "PULL: fatal error at %u: %s\n", xml_row(&ctx), ctx.err_msg);
+ else
+ {
+ bputs(out, "PULL: eof\n");
+ if (want_dom)
+ show_tree(ctx.dom, 0);
+ }
+
+ xml_cleanup(&ctx);
+ bclose(out);
+ return 0;
+}
--- /dev/null
+# Tests for the XML parser
+# (c) 2008 Pavel Charvat <pchar@ucw.cz>
+
+Run: ../obj/sherlock/xml/xml-test
+In: <?xml version="1.0"?>
+ <html></html>
+Out: PULL: start
+ PULL: eof
+
+Run: ../obj/sherlock/xml/xml-test -s
+In: <?xml version="1.0" encoding="ISO-8859-1"?>
+ <html><a a1="val1" a2="val2">text1&amp;<</a>text2</html>
+Out: PULL: start
+ SAX: document_start
+ SAX: xml_decl version=1.0 standalone=0 fb_encoding=ISO-8859-1
+ SAX: stag <html>
+ SAX: stag <a> a1='val1' a2='val2'
+ SAX: chars text='text1&<'
+ SAX: etag </a>
+ SAX: chars text='text2'
+ SAX: etag </html>
+ SAX: document_end
+ PULL: eof
+
+Run: ../obj/sherlock/xml/xml-test -sptd
+In: <?xml version="1.0"?>
+ <!DOCTYPE root [
+ <!ELEMENT root (#PCDATA|a)*>
+ <!ENTITY % pe1 "<!ENTITY e1 'text'>">
+ %pe1;
+ <!ENTITY e2 '<&e1;>'>
+ <!ELEMENT a (#PCDATA)*>
+ ]>
+ <root>&e1;<a>&e2;</a></root>
+Out: PULL: start
+ SAX: document_start
+ SAX: xml_decl version=1.0 standalone=0 fb_encoding=UTF-8
+ SAX: doctype_decl type=root public='' system='' extsub=0 intsub=1
+ SAX: dtd_start
+ SAX: dtd_end
+ SAX: stag <root>
+ PULL: stag <root>
+ SAX: chars text='text'
+ PULL: chars text='text'
+ SAX: stag <a>
+ PULL: stag <a>
+ SAX: chars text='<text>'
+ PULL: chars text='<text>'
+ PULL: etag </a>
+ SAX: etag </a>
+ PULL: etag </root>
+ SAX: etag </root>
+ SAX: document_end
+ PULL: eof
+ DOM: element <root>
+ DOM: chars text='text'
+ DOM: element <a>
+ DOM: chars text='<text>'
--- /dev/null
+/*
+ * Sherlock Library -- A simple XML parser
+ *
+ * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
+ *
+ * This software may be freely distributed and used according to the terms
+ * of the GNU Lesser General Public License.
+ */
+
+#ifndef _SHERLOCK_XML_XML_H
+#define _SHERLOCK_XML_XML_H
+
+#include "ucw/clists.h"
+#include "ucw/slists.h"
+#include "ucw/mempool.h"
+#include "ucw/fastbuf.h"
+
+struct xml_context;
+struct xml_dtd_entity;
+
+/*
+ * Error codes. The named values are spaced by 1000 in the order of growing
+ * severity; XML_ERR_EOF takes the value right after XML_ERR_FATAL.
+ */
+enum xml_error {
+ XML_ERR_OK = 0,
+ XML_ERR_WARN = 1000, /* Warning */
+ XML_ERR_ERROR = 2000, /* Recoverable error */
+ XML_ERR_FATAL = 3000, /* Unrecoverable error */
+ XML_ERR_EOF, /* Unexpected end of input */
+};
+
+/*
+ * States of the parser. For the states reported through the PULL interface,
+ * the comment gives the matching flag of enum xml_pull.
+ */
+enum xml_state {
+ XML_STATE_EOF, /* EOF or a fatal error */
+ XML_STATE_START, /* Initial state */
+ XML_STATE_XML_DECL, /* XML_PULL_XML_DECL */
+ XML_STATE_DOCTYPE_DECL, /* XML_PULL_DOCTYPE_DECL */
+ XML_STATE_CHARS, /* XML_PULL_CHARS */
+ XML_STATE_STAG, /* XML_PULL_STAG */
+ XML_STATE_ETAG, /* XML_PULL_ETAG */
+ XML_STATE_COMMENT, /* XML_PULL_COMMENT */
+ XML_STATE_PI, /* XML_PULL_PI */
+
+ /* Internal states */
+ XML_STATE_CHARS_BEFORE_STAG,
+ XML_STATE_CHARS_BEFORE_ETAG,
+ XML_STATE_CHARS_BEFORE_CDATA,
+ XML_STATE_CHARS_BEFORE_COMMENT,
+ XML_STATE_CHARS_BEFORE_PI,
+ XML_STATE_PROLOG_COMMENT,
+ XML_STATE_PROLOG_PI,
+ XML_STATE_EPILOG_COMMENT,
+ XML_STATE_EPILOG_PI,
+};
+
+enum xml_pull { /* Bit flags for xml_context.pull and for the pull argument of xml_next_state() */
+ XML_PULL_XML_DECL = 0x00000001, /* Stop after the XML declaration */
+ XML_PULL_DOCTYPE_DECL = 0x00000002, /* Stop in the doctype declaration (before optional internal subset) */
+ XML_PULL_CHARS = 0x00000004, /* See XML_STATE_CHARS */
+ XML_PULL_STAG = 0x00000008, /* See XML_STATE_STAG */
+ XML_PULL_ETAG = 0x00000010, /* See XML_STATE_ETAG */
+ XML_PULL_COMMENT = 0x00000020, /* See XML_STATE_COMMENT */
+ XML_PULL_PI = 0x00000040, /* See XML_STATE_PI */
+ XML_PULL_ALL = 0xffffffff, /* All bits set; NOTE(review): does not fit in a 32-bit int, which ISO C requires of enumerators -- confirm the compilers in use accept it */
+};
+
+enum xml_flags { /* Bit flags; the three groups below occupy disjoint bit ranges */
+ /* Enable reporting of various events via SAX and/or PULL interface */
+ XML_REPORT_COMMENTS = 0x00000001, /* Report comments */
+ XML_REPORT_PIS = 0x00000002, /* Report processing instructions */
+ XML_REPORT_CHARS = 0x00000004, /* Report characters */
+ XML_REPORT_TAGS = 0x00000008, /* Report element starts/ends */
+ XML_REPORT_MISC = XML_REPORT_COMMENTS | XML_REPORT_PIS,
+ XML_REPORT_ALL = XML_REPORT_MISC | XML_REPORT_CHARS | XML_REPORT_TAGS,
+
+ /* Enable construction of DOM for these types */
+ XML_ALLOC_COMMENTS = 0x00000010, /* Create comment nodes */
+ XML_ALLOC_PIS = 0x00000020, /* Create processing instruction nodes */
+ XML_ALLOC_CHARS = 0x00000040, /* Create character nodes */
+ XML_ALLOC_TAGS = 0x00000080, /* Create element nodes */
+ XML_ALLOC_MISC = XML_ALLOC_COMMENTS | XML_ALLOC_PIS,
+ XML_ALLOC_ALL = XML_ALLOC_MISC | XML_ALLOC_CHARS | XML_ALLOC_TAGS,
+
+ /* Other parameters */
+ XML_VALIDATING = 0x00000100, /* Validate everything (not fully implemented!) */
+ XML_PARSE_DTD = 0x00000200, /* Enable parsing of DTD */
+ XML_NO_CHARS = 0x00000400, /* The current element must not contain character data (filled automatically if using DTD) */
+ XML_ALLOC_DEFAULT_ATTRS = 0x00000800, /* Allocate default attribute values so they can be found by XML_ATTR_FOR_EACH */
+
+ /* Internals, do not change! */
+ XML_EMPTY_ELEM_TAG = 0x00010000, /* The current element matches EmptyElemTag */
+ XML_VERSION_1_1 = 0x00020000, /* XML version is 1.1, otherwise 1.0 */
+ XML_HAS_EXTERNAL_SUBSET = 0x00040000, /* The document contains a reference to external DTD subset */
+ XML_HAS_INTERNAL_SUBSET = 0x00080000, /* The document contains an internal subset */
+ XML_HAS_DTD = XML_HAS_EXTERNAL_SUBSET | XML_HAS_INTERNAL_SUBSET,
+ XML_SRC_EOF = 0x00100000, /* EOF reached */
+ XML_SRC_EXPECTED_DECL = 0x00200000, /* Just before optional or required XMLDecl/TextDecl */
+ XML_SRC_DOCUMENT = 0x00400000, /* The document entity */
+ XML_SRC_EXTERNAL = 0x00800000, /* An external entity */
+};
+
+enum xml_node_type { /* Values of xml_node.type */
+ XML_NODE_ELEM, /* Element */
+ XML_NODE_COMMENT, /* Comment */
+ XML_NODE_CHARS, /* Character data */
+ XML_NODE_PI, /* Processing instruction */
+};
+
+#define XML_NODE_FOR_EACH(var, node) CLIST_FOR_EACH(struct xml_node *, var, (node)->sons) /* Iterate var (struct xml_node *) over the sons of node */
+#define XML_ATTR_FOR_EACH(var, node) SLIST_FOR_EACH(struct xml_attr *, var, (node)->attrs) /* Iterate var (struct xml_attr *) over the attrs list of node */
+
+struct xml_node { /* DOM node (see xml_context.dom and xml_context.node) */
+ cnode n; /* Node for list of parent's sons */
+ uns type; /* XML_NODE_x */
+ struct xml_node *parent; /* Parent node */
+ char *name; /* Element name / PI target */
+ clist sons; /* Children nodes */
+ union { /* The two members share storage; presumably selected by type -- verify */
+ struct { /* Text payload */
+ char *text; /* PI text / Comment / CDATA */
+ uns len; /* Text length in bytes */
+ };
+ struct { /* Element payload */
+ struct xml_dtd_elem *dtd; /* Element DTD */
+ slist attrs; /* Link list of element attributes */
+ };
+ };
+ void *user; /* User-defined (initialized to NULL) */
+};
+
+struct xml_attr { /* Element attribute, an item of xml_node.attrs */
+ snode n; /* Node for elem->attrs */
+ struct xml_node *elem; /* Parent element */
+ struct xml_dtd_attr *dtd; /* Attribute DTD */
+ char *name; /* Attribute name */
+ char *val; /* Attribute value */
+ void *user; /* User-defined (initialized to NULL) */
+};
+
+#define XML_BUF_SIZE 32 /* xml_source.buf holds 2 * XML_BUF_SIZE items; at least 8 -- hardcoded */
+
+struct xml_source { /* Input source, as returned by xml_push_fastbuf() */
+ struct xml_source *next; /* Link list of pending fastbufs (presumably headed by xml_context.src -- verify) */
+ struct fastbuf *fb; /* Source fastbuf */
+ struct fastbuf *wrapped_fb; /* Original wrapped fastbuf (needed for cleanup) */
+ struct fastbuf wrap_fb; /* Fbmem wrapper */
+ u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */
+ u32 *bptr, *bstop; /* Current state of the buffer */
+ uns row; /* File position */
+ char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */
+ char *fb_encoding; /* Encoding of the source fastbuf */
+ char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */
+ uns refill_cat1; /* Character categories, which should be directly passed to the buffer */
+ uns refill_cat2; /* Character categories, which should be processed as newlines (possibly in some built-in
+ sequences) */
+ void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */
+ unsigned short *refill_in_to_x; /* Libcharset input table */
+ uns saved_depth; /* Saved ctx->depth */
+ uns pending_0xd; /* The last read character is 0xD */
+};
+
+struct xml_context { /* Parser context passed to every xml_*() function and callback */
+ /* Error handling */
+ char *err_msg; /* Last error message */
+ enum xml_error err_code; /* Last error code */
+ void *throw_buf; /* Where to jump on error */
+ void (*h_warn)(struct xml_context *ctx); /* Warning callback */
+ void (*h_error)(struct xml_context *ctx); /* Recoverable error callback */
+ void (*h_fatal)(struct xml_context *ctx); /* Unrecoverable error callback */
+
+ /* Memory management */
+ struct mempool *pool; /* DOM pool */
+ struct mempool *stack; /* Stack pool (freed as soon as possible) */
+ struct xml_stack *stack_list; /* See xml_push(), xml_pop() */
+ uns flags; /* Flags, see enum xml_flags (restored on xml_pop()) */
+ uns depth; /* Nesting level (for checking of valid source nesting -> valid pushes/pops on memory pools) */
+ struct fastbuf chars; /* Character data / attribute value */
+ struct mempool_state chars_state; /* Mempool state before the current character block has started */
+ char *chars_trivial; /* If not empty, it will be appended to chars */
+ void *tab_attrs; /* Hash table of element attributes */
+
+ /* Input */
+ struct xml_source *src; /* Current source */
+ u32 *bptr, *bstop; /* Buffer with preprocessed characters (validated UCS-4 + category flags) */
+ uns cat_chars; /* Unicode range of supported characters (cdata, attribute values, ...) */
+ uns cat_unrestricted; /* Unrestricted characters (may appear in document/external entities) */
+ uns cat_new_line; /* New line characters */
+ uns cat_name; /* Characters that may appear in names */
+ uns cat_sname; /* Characters that may begin a name */
+
+ /* SAX-like interface */
+ void (*h_document_start)(struct xml_context *ctx); /* Called before entering prolog */
+ void (*h_document_end)(struct xml_context *ctx); /* Called after leaving epilog */
+ void (*h_xml_decl)(struct xml_context *ctx); /* Called after the XML declaration */
+ void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration (before optional internal subset) */
+ void (*h_comment)(struct xml_context *ctx); /* Called after a comment (only with XML_REPORT_COMMENTS) */
+ void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction (only with XML_REPORT_PIS) */
+ void (*h_stag)(struct xml_context *ctx); /* Called after STag or EmptyElemTag (only with XML_REPORT_TAGS) */
+ void (*h_etag)(struct xml_context *ctx); /* Called before ETag or after EmptyElemTag (only with XML_REPORT_TAGS) */
+ void (*h_chars)(struct xml_context *ctx); /* Called after some characters (only with XML_REPORT_CHARS) */
+ void (*h_block)(struct xml_context *ctx, char *text, uns len); /* Called for each continuous block of characters not reported by h_cdata() (only with XML_REPORT_CHARS) */
+ void (*h_cdata)(struct xml_context *ctx, char *text, uns len); /* Called for each CDATA section (only with XML_REPORT_CHARS) */
+ void (*h_ignorable)(struct xml_context *ctx, char *text, uns len); /* Called for ignorable whitespace (content in tags without #PCDATA) */
+ void (*h_dtd_start)(struct xml_context *ctx); /* Called just after the DTD structure is initialized */
+ void (*h_dtd_end)(struct xml_context *ctx); /* Called after the DTD subsets */
+ struct xml_dtd_entity *(*h_find_entity)(struct xml_context *ctx, char *name); /* Called when needed to resolve a general entity */
+ void (*h_resolve_entity)(struct xml_context *ctx, struct xml_dtd_entity *ent); /* User should push source fastbuf for a parsed external entity (either general or parameter) */
+
+ /* DOM */
+ struct xml_node *dom; /* DOM root */
+ struct xml_node *node; /* Current DOM node */
+
+ char *version_str; /* XML version string (presumably taken from the XML declaration -- verify) */
+ uns standalone; /* Standalone flag (presumably taken from the XML declaration -- verify) */
+ char *doctype; /* The document type (or NULL if unknown) */
+ char *system_id; /* DTD external id */
+ char *public_id; /* DTD public id */
+ struct xml_dtd *dtd; /* The DTD structure (or NULL) */
+ uns state; /* Current state for the PULL interface (XML_STATE_x) */
+ uns pull; /* Parameters for the PULL interface (XML_PULL_x) */
+};
+
+/* Initialize an XML context */
+void xml_init(struct xml_context *ctx);
+
+/* Clean up all internal structures of the context */
+void xml_cleanup(struct xml_context *ctx);
+
+/* Reuse an XML context, equivalent to xml_cleanup() followed by xml_init() */
+void xml_reset(struct xml_context *ctx);
+
+/* Add an XML source reading from fb (the fastbuf will be automatically closed); returns the source structure */
+struct xml_source *xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb);
+
+/* Parse without the PULL interface, return XML_ERR_x code (zero on success) */
+uns xml_parse(struct xml_context *ctx);
+
+/* Parse with the PULL interface, return XML_STATE_x (zero on EOF or fatal error) */
+uns xml_next(struct xml_context *ctx);
+
+/* Equivalent to xml_next(), but with a temporarily changed ctx->pull value (XML_PULL_x flags) */
+uns xml_next_state(struct xml_context *ctx, uns pull);
+
+/* May be called on XML_STATE_STAG to skip its content; can return XML_STATE_ETAG or XML_STATE_EOF on fatal error */
+uns xml_skip_element(struct xml_context *ctx);
+
+/* Returns the current row number in the document entity */
+uns xml_row(struct xml_context *ctx);
+
+/* Finds a given attribute (by name) in an XML_NODE_ELEM node */
+struct xml_attr *xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name);
+
+/* Similar to xml_attr_find(), but returns the value and also deals with default values */
+char *xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name);
+
+/* The default value of h_find_entity(), knows the entities for <, >, &, ' and " */
+struct xml_dtd_entity *xml_def_find_entity(struct xml_context *ctx, char *name);
+
+/* The default value of h_resolve_entity(), throws an error */
+void xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent);
+
+/* Removes leading/trailing spaces and replaces each sequence of spaces with a single space character (non-CDATA attribute normalization) */
+uns xml_normalize_white(struct xml_context *ctx, char *value);
+
+/* Merges the character contents of a given element into a single string (not recursive) */
+char *xml_merge_chars(struct xml_context *ctx, struct xml_node *node, struct mempool *pool);
+
+/* Merges the character contents of a given subtree into a single string */
+char *xml_merge_dom_chars(struct xml_context *ctx, struct xml_node *node, struct mempool *pool);
+
+/* Public part of error handling; one function per severity (warning, recoverable error, fatal error), xml_fatal() is declared NONRET */
+void xml_warn(struct xml_context *ctx, const char *format, ...);
+void xml_error(struct xml_context *ctx, const char *format, ...);
+void NONRET xml_fatal(struct xml_context *ctx, const char *format, ...);
+
+#endif