From: Pavel Charvat Date: Fri, 6 Dec 2013 14:34:22 +0000 (+0100) Subject: Renamed shxml/* to xml/*. X-Git-Tag: v5.99~63 X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;h=94b06fc32d62e644ebda4c2148c76d65759702e6;p=libucw.git Renamed shxml/* to xml/*. --- diff --git a/Makefile b/Makefile index 53fdd562..68ea310d 100644 --- a/Makefile +++ b/Makefile @@ -50,9 +50,9 @@ LIBIMAGES=$(o)/images/libucw-images.pc include $(s)/images/Makefile endif -ifdef CONFIG_SHXML -LIBSHXML=$(o)/shxml/libucw-xml.pc -include $(s)/shxml/Makefile +ifdef CONFIG_XML +LIBXML=$(o)/xml/libucw-xml.pc +include $(s)/xml/Makefile endif # Build documentation by default? @@ -60,7 +60,7 @@ ifdef CONFIG_DOC all: docs endif -libs: $(LIBUCW) $(LIBSHXML) $(LIBIMAGES) $(LIBCHARSET) +libs: $(LIBUCW) $(LIBXML) $(LIBIMAGES) $(LIBCHARSET) # And finally the default rules of the build system include $(BUILDSYS)/Makebottom diff --git a/debug/default.cfg b/debug/default.cfg index 114ad3a4..cbd208ae 100644 --- a/debug/default.cfg +++ b/debug/default.cfg @@ -14,4 +14,4 @@ Set("CONFIG_IMAGES_DUP"); Set("CONFIG_IMAGES_SIM"); Set("CONFIG_CHARSET"); -Set("CONFIG_SHXML"); +Set("CONFIG_XML"); diff --git a/default.cfg b/default.cfg index 3ec89117..53f7a90d 100644 --- a/default.cfg +++ b/default.cfg @@ -36,7 +36,7 @@ UnSet("CONFIG_CHARSET"); Set("CONFIG_CHARSET_UTILS"); # Libucw-xml -UnSet("CONFIG_UCW_XML"); +UnSet("CONFIG_XML"); # Return success 1; diff --git a/shxml/Makefile b/shxml/Makefile deleted file mode 100644 index 598a1c66..00000000 --- a/shxml/Makefile +++ /dev/null @@ -1,62 +0,0 @@ -# Makefile for the XML parser -# (c) 2007 Pavel Charvat - -DIRS+=shxml -PROGS+=$(o)/shxml/xml-test - -LIBSHXML_MODS=common source parse dtd -LIBSHXML_INCLUDES=xml.h dtd.h - -LIBSHXML_MOD_PATHS=$(addprefix $(o)/shxml/,$(LIBSHXML_MODS)) - -$(o)/shxml/libucw-xml.a: $(addsuffix .o,$(LIBSHXML_MOD_PATHS)) -$(o)/shxml/libucw-xml-pic.a: $(addsuffix .oo,$(LIBSHXML_MOD_PATHS)) -$(o)/shxml/libucw-xml.so: $(addsuffix 
.oo,$(LIBSHXML_MOD_PATHS)) -$(o)/shxml/libucw-xml.so: SONAME_SUFFIX=.$(UCW_ABI_VERSION) -$(o)/shxml/libucw-xml.pc: $(LIBCHARSET) - -ifdef CONFIG_STATIC_PIC -$(o)/shxml/libucw-xml.pc: $(o)/shxml/libucw-xml-pic.a -endif -ifdef CONFIG_INSTALL_API -$(o)/shxml/libucw-xml.pc: $(o)/shxml/libucw-xml.a $(o)/shxml/libucw-xml-pic.a $(o)/shxml/libucw-xml.so -endif - -$(o)/shxml/common.o: $(o)/shxml/unicat.h -$(o)/shxml/common.oo: $(o)/shxml/unicat.h -$(o)/shxml/source.o: $(o)/shxml/unicat.h -$(o)/shxml/source.oo: $(o)/shxml/unicat.h -$(o)/shxml/dtd.o: $(o)/shxml/unicat.h -$(o)/shxml/dtd.oo: $(o)/shxml/unicat.h -$(o)/shxml/parse.o: $(o)/shxml/unicat.h -$(o)/shxml/parse.oo: $(o)/shxml/unicat.h -$(o)/shxml/unicat.h: $(s)/shxml/unicat.pl - $(M)GEN $(addprefix $(o)/shxml/unicat,.h .c) - $(Q)$< $(addprefix $(o)/shxml/unicat,.h .c) - $(Q)touch $@ - -TESTS+=$(o)/shxml/xml-test.test -$(o)/shxml/xml-test: $(o)/shxml/xml-test.o $(LIBSHXML) -$(o)/shxml/xml-test.test: $(o)/shxml/xml-test - -API_LIBS+=libucw-xml -API_INCLUDES+=$(o)/shxml/.include-stamp -$(o)/shxml/.include-stamp: $(addprefix $(s)/shxml/,$(LIBSHXML_INCLUDES)) -$(o)/shxml/.include-stamp: IDST=shxml -run/lib/pkgconfig/libucw-xml.pc: $(o)/shxml/libucw-xml.pc - -INSTALL_TARGETS+=install-libucw-xml install-libucw-xml-api - -install-libucw-xml: - install -d -m 755 $(DESTDIR)$(INSTALL_LIB_DIR) - install -m 644 run/lib/libucw-xml.so.$(UCW_ABI_VERSION) $(DESTDIR)$(INSTALL_LIB_DIR) - -install-libucw-xml-api: - install -d -m 755 $(DESTDIR)$(INSTALL_INCLUDE_DIR)/shxml $(DESTDIR)$(INSTALL_LIB_DIR) $(DESTDIR)$(INSTALL_PKGCONFIG_DIR) - install -m 644 run/lib/pkgconfig/libucw-xml.pc $(DESTDIR)$(INSTALL_PKGCONFIG_DIR) - install -m 644 $(addprefix run/include/shxml/,$(LIBSHXML_INCLUDES)) $(DESTDIR)$(INSTALL_INCLUDE_DIR)/shxml - ln -sf libucw-xml.so.$(UCW_ABI_VERSION) $(DESTDIR)$(INSTALL_LIB_DIR)/libucw-xml.so - install -m 644 run/lib/libucw-xml.a $(DESTDIR)$(INSTALL_LIB_DIR) - install -m 644 run/lib/libucw-xml-pic.a 
$(DESTDIR)$(INSTALL_LIB_DIR) - -.PHONY: install-libucw-xml install-libucw-xml-api diff --git a/shxml/TODO b/shxml/TODO deleted file mode 100644 index b8dbc29c..00000000 --- a/shxml/TODO +++ /dev/null @@ -1,15 +0,0 @@ -Non-normative / not-implemented: --- introduce numeric error codes --- cycle detection in internal entities (and possibly external?) --- conditional sections in DTD --- validation of elements (regular expressions, non-cdata) --- validation of attributes (unfinished) --- notations --- URI normalization --- support for xml:space --- support for xml:lang --- full support for standalone documents --- Unicode normalization - -Optimizations: --- detect definitions of trivial entities diff --git a/shxml/common.c b/shxml/common.c deleted file mode 100644 index cfccbf97..00000000 --- a/shxml/common.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Sherlock Library -- A simple XML parser - * - * (c) 2007 Pavel Charvat - * - * This software may be freely distributed and used according to the terms - * of the GNU Lesser General Public License. - */ - -#undef LOCAL_DEBUG - -#include -#include -#include -#include -#include -#include - -#include - -/*** Error handling ***/ - -void NONRET -xml_throw(struct xml_context *ctx) -{ - ASSERT(ctx->err_code && ctx->throw_buf); - longjmp(*(jmp_buf *)ctx->throw_buf, ctx->err_code); -} - -void -xml_warn(struct xml_context *ctx, const char *format, ...) -{ - if (ctx->h_warn) - { - va_list args; - va_start(args, format); - ctx->err_msg = stk_vprintf(format, args); - ctx->err_code = XML_ERR_WARN; - va_end(args); - ctx->h_warn(ctx); - ctx->err_msg = NULL; - ctx->err_code = XML_ERR_OK; - } -} - -void -xml_error(struct xml_context *ctx, const char *format, ...) 
-{ - if (ctx->h_error) - { - va_list args; - va_start(args, format); - ctx->err_msg = stk_vprintf(format, args); - ctx->err_code = XML_ERR_ERROR; - va_end(args); - ctx->h_error(ctx); - ctx->err_msg = NULL; - ctx->err_code = XML_ERR_OK; - } -} - -void NONRET -xml_fatal(struct xml_context *ctx, const char *format, ...) -{ - va_list args; - va_start(args, format); - ctx->err_msg = mp_vprintf(ctx->stack, format, args); - ctx->err_code = XML_ERR_FATAL; - ctx->state = XML_STATE_EOF; - va_end(args); - if (ctx->h_fatal) - ctx->h_fatal(ctx); - xml_throw(ctx); -} - -/*** Memory management ***/ - -void * -xml_hash_new(struct mempool *pool, uns size) -{ - void *tab = mp_alloc_zero(pool, size + XML_HASH_HDR_SIZE); - *(void **)tab = pool; - return tab + XML_HASH_HDR_SIZE; -} - -/*** Initialization ***/ - -static struct xml_context xml_defaults = { - .flags = XML_SRC_EOF | XML_REPORT_ALL, - .state = XML_STATE_START, - .h_resolve_entity = xml_def_resolve_entity, - .chars = { - .name = "", - .spout = xml_spout_chars, - .can_overwrite_buffer = 1, - }, -}; - -static void -xml_do_init(struct xml_context *ctx) -{ - xml_attrs_table_init(ctx); -} - -void -xml_init(struct xml_context *ctx) -{ - *ctx = xml_defaults; - ctx->pool = mp_new(65536); - ctx->stack = mp_new(65536); - xml_do_init(ctx); - TRACE(ctx, "init"); -} - -void -xml_cleanup(struct xml_context *ctx) -{ - TRACE(ctx, "cleanup"); - xml_attrs_table_cleanup(ctx); - xml_dtd_cleanup(ctx); - xml_sources_cleanup(ctx); - mp_delete(ctx->pool); - mp_delete(ctx->stack); -} - -void -xml_reset(struct xml_context *ctx) -{ - TRACE(ctx, "reset"); - struct mempool *pool = ctx->pool, *stack = ctx->stack; - xml_attrs_table_cleanup(ctx); - xml_dtd_cleanup(ctx); - xml_sources_cleanup(ctx); - mp_flush(pool); - mp_flush(stack); - *ctx = xml_defaults; - ctx->pool = pool; - ctx->stack = stack; - xml_do_init(ctx); -} diff --git a/shxml/dtd.c b/shxml/dtd.c deleted file mode 100644 index 9c3402d2..00000000 --- a/shxml/dtd.c +++ /dev/null @@ -1,1003 +0,0 
@@ -/* - * Sherlock Library -- A simple XML parser - * - * (c) 2007--2008 Pavel Charvat - * - * This software may be freely distributed and used according to the terms - * of the GNU Lesser General Public License. - */ - -#undef LOCAL_DEBUG - -#include -#include -#include -#include -#include -#include -#include - -/* Notations */ - -#define HASH_PREFIX(x) xml_dtd_notns_##x -#define HASH_NODE struct xml_dtd_notn -#define HASH_KEY_STRING name -#define HASH_ZERO_FILL -#define HASH_TABLE_DYNAMIC -#define HASH_WANT_LOOKUP -#define HASH_WANT_FIND -#define HASH_GIVE_ALLOC -#define HASH_TABLE_ALLOC -XML_HASH_GIVE_ALLOC -#include - -struct xml_dtd_notn * -xml_dtd_find_notn(struct xml_context *ctx, char *name) -{ - struct xml_dtd *dtd = ctx->dtd; - struct xml_dtd_notn *notn = xml_dtd_notns_find(dtd->tab_notns, name); - return !notn ? NULL : (notn->flags & XML_DTD_NOTN_DECLARED) ? notn : NULL; -} - -/* General entities */ - -#define HASH_PREFIX(x) xml_dtd_ents_##x -#define HASH_NODE struct xml_dtd_entity -#define HASH_KEY_STRING name -#define HASH_ZERO_FILL -#define HASH_TABLE_DYNAMIC -#define HASH_WANT_FIND -#define HASH_WANT_LOOKUP -#define HASH_GIVE_ALLOC -#define HASH_TABLE_ALLOC -XML_HASH_GIVE_ALLOC -#include - -static struct xml_dtd_entity * -xml_dtd_declare_trivial_entity(struct xml_context *ctx, char *name, char *text) -{ - struct xml_dtd *dtd = ctx->dtd; - struct xml_dtd_entity *ent = xml_dtd_ents_lookup(dtd->tab_ents, name); - if (ent->flags & XML_DTD_ENTITY_DECLARED) - { - xml_warn(ctx, "Entity &%s; already declared", name); - return NULL; - } - slist_add_tail(&dtd->ents, &ent->n); - ent->flags = XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_TRIVIAL; - ent->text = text; - return ent; -} - -static void -xml_dtd_declare_default_entities(struct xml_context *ctx) -{ - xml_dtd_declare_trivial_entity(ctx, "lt", "<"); - xml_dtd_declare_trivial_entity(ctx, "gt", ">"); - xml_dtd_declare_trivial_entity(ctx, "amp", "&"); - xml_dtd_declare_trivial_entity(ctx, "apos", "'"); - 
xml_dtd_declare_trivial_entity(ctx, "quot", "\""); -} - -struct xml_dtd_entity * -xml_def_find_entity(struct xml_context *ctx UNUSED, char *name) -{ -#define ENT(n, t) ent_##n = { .name = #n, .text = t, .flags = XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_TRIVIAL } - static struct xml_dtd_entity ENT(lt, "<"), ENT(gt, ">"), ENT(amp, "&"), ENT(apos, "'"), ENT(quot, "\""); -#undef ENT - switch (name[0]) - { - case 'l': - if (!strcmp(name, "lt")) - return &ent_lt; - break; - case 'g': - if (!strcmp(name, "gt")) - return &ent_gt; - break; - case 'a': - if (!strcmp(name, "amp")) - return &ent_amp; - if (!strcmp(name, "apos")) - return &ent_apos; - break; - case 'q': - if (!strcmp(name, "quot")) - return &ent_quot; - break; - } - return NULL; -} - -struct xml_dtd_entity * -xml_dtd_find_entity(struct xml_context *ctx, char *name) -{ - struct xml_dtd *dtd = ctx->dtd; - if (ctx->h_find_entity) - return ctx->h_find_entity(ctx, name); - else if (dtd) - { - struct xml_dtd_entity *ent = xml_dtd_ents_find(dtd->tab_ents, name); - return !ent ? NULL : (ent->flags & XML_DTD_ENTITY_DECLARED) ? ent : NULL; - } - else - return xml_def_find_entity(ctx, name); -} - -/* Parameter entities */ - -static struct xml_dtd_entity * -xml_dtd_find_pentity(struct xml_context *ctx, char *name) -{ - struct xml_dtd *dtd = ctx->dtd; - struct xml_dtd_entity *ent = xml_dtd_ents_find(dtd->tab_pents, name); - return !ent ? NULL : (ent->flags & XML_DTD_ENTITY_DECLARED) ? 
ent : NULL; -} - -/* Elements */ - -struct xml_dtd_elems_table; - -static void -xml_dtd_elems_init_data(struct xml_dtd_elems_table *tab UNUSED, struct xml_dtd_elem *e) -{ - slist_init(&e->attrs); -} - -#define HASH_PREFIX(x) xml_dtd_elems_##x -#define HASH_NODE struct xml_dtd_elem -#define HASH_KEY_STRING name -#define HASH_TABLE_DYNAMIC -#define HASH_ZERO_FILL -#define HASH_WANT_FIND -#define HASH_WANT_LOOKUP -#define HASH_GIVE_ALLOC -#define HASH_GIVE_INIT_DATA -#define HASH_TABLE_ALLOC -XML_HASH_GIVE_ALLOC -#include - -struct xml_dtd_elem * -xml_dtd_find_elem(struct xml_context *ctx, char *name) -{ - return ctx->dtd ? xml_dtd_elems_find(ctx->dtd->tab_elems, name) : NULL; -} - -/* Element sons */ - -struct xml_dtd_enodes_table; - -static inline uns -xml_dtd_enodes_hash(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem) -{ - return hash_pointer(parent) ^ hash_pointer(elem); -} - -static inline int -xml_dtd_enodes_eq(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *parent1, struct xml_dtd_elem *elem1, struct xml_dtd_elem_node *parent2, struct xml_dtd_elem *elem2) -{ - return (parent1 == parent2) && (elem1 == elem2); -} - -static inline void -xml_dtd_enodes_init_key(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *node, struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem) -{ - node->parent = parent; - node->elem = elem; -} - -#define HASH_PREFIX(x) xml_dtd_enodes_##x -#define HASH_NODE struct xml_dtd_elem_node -#define HASH_KEY_COMPLEX(x) x parent, x elem -#define HASH_KEY_DECL struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem -#define HASH_GIVE_HASHFN -#define HASH_GIVE_EQ -#define HASH_GIVE_INIT_KEY -#define HASH_TABLE_DYNAMIC -#define HASH_ZERO_FILL -#define HASH_WANT_FIND -#define HASH_WANT_NEW -#define HASH_GIVE_ALLOC -#define HASH_TABLE_ALLOC -XML_HASH_GIVE_ALLOC -#include - -/* Element attributes */ - -struct xml_dtd_attrs_table; - -static inline uns 
-xml_dtd_attrs_hash(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem, char *name) -{ - return hash_pointer(elem) ^ hash_string(name); -} - -static inline int -xml_dtd_attrs_eq(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem1, char *name1, struct xml_dtd_elem *elem2, char *name2) -{ - return (elem1 == elem2) && !strcmp(name1, name2); -} - -static inline void -xml_dtd_attrs_init_key(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_elem *elem, char *name) -{ - attr->elem = elem; - attr->name = name; - slist_add_tail(&elem->attrs, &attr->n); -} - -#define HASH_PREFIX(x) xml_dtd_attrs_##x -#define HASH_NODE struct xml_dtd_attr -#define HASH_ZERO_FILL -#define HASH_TABLE_DYNAMIC -#define HASH_KEY_COMPLEX(x) x elem, x name -#define HASH_KEY_DECL struct xml_dtd_elem *elem, char *name -#define HASH_GIVE_HASHFN -#define HASH_GIVE_EQ -#define HASH_GIVE_INIT_KEY -#define HASH_WANT_FIND -#define HASH_WANT_NEW -#define HASH_GIVE_ALLOC -#define HASH_TABLE_ALLOC -XML_HASH_GIVE_ALLOC -#include - -struct xml_dtd_attr * -xml_dtd_find_attr(struct xml_context *ctx, struct xml_dtd_elem *elem, char *name) -{ - return ctx->dtd ? 
xml_dtd_attrs_find(ctx->dtd->tab_attrs, elem, name) : NULL; -} - -/* Enumerated attribute values */ - -struct xml_dtd_evals_table; - -static inline uns -xml_dtd_evals_hash(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr, char *val) -{ - return hash_pointer(attr) ^ hash_string(val); -} - -static inline int -xml_dtd_evals_eq(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr1, char *val1, struct xml_dtd_attr *attr2, char *val2) -{ - return (attr1 == attr2) && !strcmp(val1, val2); -} - -static inline void -xml_dtd_evals_init_key(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_eval *eval, struct xml_dtd_attr *attr, char *val) -{ - eval->attr = attr; - eval->val = val; -} - -#define HASH_PREFIX(x) xml_dtd_evals_##x -#define HASH_NODE struct xml_dtd_eval -#define HASH_TABLE_DYNAMIC -#define HASH_KEY_COMPLEX(x) x attr, x val -#define HASH_KEY_DECL struct xml_dtd_attr *attr, char *val -#define HASH_GIVE_HASHFN -#define HASH_GIVE_EQ -#define HASH_GIVE_INIT_KEY -#define HASH_WANT_FIND -#define HASH_WANT_NEW -#define HASH_GIVE_ALLOC -#define HASH_TABLE_ALLOC -XML_HASH_GIVE_ALLOC -#include - -/* Enumerated attribute notations */ - -struct xml_dtd_enotns_table; - -static inline uns -xml_dtd_enotns_hash(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn) -{ - return hash_pointer(attr) ^ hash_pointer(notn); -} - -static inline int -xml_dtd_enotns_eq(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr1, struct xml_dtd_notn *notn1, struct xml_dtd_attr *attr2, struct xml_dtd_notn *notn2) -{ - return (attr1 == attr2) && (notn1 == notn2); -} - -static inline void -xml_dtd_enotns_init_key(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_enotn *enotn, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn) -{ - enotn->attr = attr; - enotn->notn = notn; -} - -#define HASH_PREFIX(x) xml_dtd_enotns_##x -#define HASH_NODE struct xml_dtd_enotn -#define HASH_TABLE_DYNAMIC -#define 
HASH_KEY_COMPLEX(x) x attr, x notn -#define HASH_KEY_DECL struct xml_dtd_attr *attr, struct xml_dtd_notn *notn -#define HASH_GIVE_HASHFN -#define HASH_GIVE_EQ -#define HASH_GIVE_INIT_KEY -#define HASH_WANT_FIND -#define HASH_WANT_NEW -#define HASH_GIVE_ALLOC -#define HASH_TABLE_ALLOC -XML_HASH_GIVE_ALLOC -#include - -/* DTD initialization/cleanup */ - -void -xml_dtd_init(struct xml_context *ctx) -{ - if (ctx->dtd) - return; - struct mempool *pool = mp_new(4096); - struct xml_dtd *dtd = ctx->dtd = mp_alloc_zero(pool, sizeof(*ctx->dtd)); - dtd->pool = pool; - xml_dtd_ents_init(dtd->tab_ents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table))); - xml_dtd_ents_init(dtd->tab_pents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table))); - xml_dtd_notns_init(dtd->tab_notns = xml_hash_new(pool, sizeof(struct xml_dtd_notns_table))); - xml_dtd_elems_init(dtd->tab_elems = xml_hash_new(pool, sizeof(struct xml_dtd_elems_table))); - xml_dtd_enodes_init(dtd->tab_enodes = xml_hash_new(pool, sizeof(struct xml_dtd_enodes_table))); - xml_dtd_attrs_init(dtd->tab_attrs = xml_hash_new(pool, sizeof(struct xml_dtd_attrs_table))); - xml_dtd_evals_init(dtd->tab_evals = xml_hash_new(pool, sizeof(struct xml_dtd_evals_table))); - xml_dtd_enotns_init(dtd->tab_enotns = xml_hash_new(pool, sizeof(struct xml_dtd_enotns_table))); - xml_dtd_declare_default_entities(ctx); -} - -void -xml_dtd_cleanup(struct xml_context *ctx) -{ - if (!ctx->dtd) - return; - mp_delete(ctx->dtd->pool); - ctx->dtd = NULL; -} - -void -xml_dtd_finish(struct xml_context *ctx) -{ - if (!ctx->dtd) - return; - // FIXME: validity checks -} - -/*** Parsing functions ***/ - -/* References to parameter entities */ - -void -xml_parse_pe_ref(struct xml_context *ctx) -{ - /* PEReference ::= '%' Name ';' - * Already parsed: '%' */ - struct mempool_state state; - mp_save(ctx->stack, &state); - char *name = xml_parse_name(ctx, ctx->stack); - xml_parse_char(ctx, ';'); - struct xml_dtd_entity *ent = xml_dtd_find_pentity(ctx, name); - 
if (!ent) - xml_error(ctx, "Unknown entity %%%s;", name); - else - { - TRACE(ctx, "Pushed entity %%%s;", name); - mp_restore(ctx->stack, &state); - xml_dec(ctx); - xml_push_entity(ctx, ent); - return; - } - mp_restore(ctx->stack, &state); - xml_dec(ctx); -} - -static uns -xml_parse_dtd_pe(struct xml_context *ctx, uns entity_decl) -{ - /* Already parsed: '%' */ - do - { - xml_inc(ctx); - if (!~entity_decl && (xml_peek_cat(ctx) & XML_CHAR_WHITE)) - { - xml_dec(ctx); - return ~0U; - } - xml_parse_pe_ref(ctx); - while (xml_peek_cat(ctx) & XML_CHAR_WHITE) - xml_skip_char(ctx); - } - while (xml_get_char(ctx) == '%'); - xml_unget_char(ctx); - return 1; -} - -static inline uns -xml_parse_dtd_white(struct xml_context *ctx, uns mandatory) -{ - /* Whitespace or parameter entity, - * mandatory==~0U has a special meaning of the whitespace before the '%' character in a parameter entity declaration */ - uns cnt = 0; - while (xml_peek_cat(ctx) & XML_CHAR_WHITE) - { - xml_skip_char(ctx); - cnt = 1; - } - if (xml_peek_char(ctx) == '%') - { - xml_skip_char(ctx); - return xml_parse_dtd_pe(ctx, mandatory); - } - else if (unlikely(mandatory && !cnt)) - xml_fatal_expected_white(ctx); - return cnt; -} - -static void -xml_dtd_parse_external_id(struct xml_context *ctx, char **system_id, char **public_id, uns allow_public) -{ - struct xml_dtd *dtd = ctx->dtd; - uns c = xml_peek_char(ctx); - if (c == 'S') - { - xml_parse_seq(ctx, "SYSTEM"); - xml_parse_dtd_white(ctx, 1); - *public_id = NULL; - *system_id = xml_parse_system_literal(ctx, dtd->pool); - } - else if (c == 'P') - { - xml_parse_seq(ctx, "PUBLIC"); - xml_parse_dtd_white(ctx, 1); - *system_id = NULL; - *public_id = xml_parse_pubid_literal(ctx, dtd->pool); - if (xml_parse_dtd_white(ctx, !allow_public)) - if ((c = xml_peek_char(ctx)) == '\'' || c == '"' || !allow_public) - *system_id = xml_parse_system_literal(ctx, dtd->pool); - } - else - xml_fatal(ctx, "Expected an external ID"); -} - -/* DTD: */ - -void
-xml_parse_notation_decl(struct xml_context *ctx) -{ - /* NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>' - * Already parsed: '<!NOTATION' */ - struct xml_dtd *dtd = ctx->dtd; - xml_parse_dtd_white(ctx, 1); - - struct xml_dtd_notn *notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx, dtd->pool)); - xml_parse_dtd_white(ctx, 1); - char *system_id, *public_id; - xml_dtd_parse_external_id(ctx, &system_id, &public_id, 1); - xml_parse_dtd_white(ctx, 0); - xml_parse_char(ctx, '>'); - - if (notn->flags & XML_DTD_NOTN_DECLARED) - xml_warn(ctx, "Notation %s already declared", notn->name); - else - { - notn->flags = XML_DTD_NOTN_DECLARED; - notn->system_id = system_id; - notn->public_id = public_id; - slist_add_tail(&dtd->notns, &notn->n); - } - xml_dec(ctx); -} - -/* DTD: */ - -void -xml_parse_entity_decl(struct xml_context *ctx) -{ - /* Already parsed: '<!ENTITY' */ - struct xml_dtd *dtd = ctx->dtd; - uns flags = ~xml_parse_dtd_white(ctx, ~0U) ? 0 : XML_DTD_ENTITY_PARAMETER; - if (flags) - xml_parse_dtd_white(ctx, 1); - struct xml_dtd_entity *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_ents, xml_parse_name(ctx, dtd->pool)); - xml_parse_dtd_white(ctx, 1); - slist *list = flags ?
&dtd->pents : &dtd->ents; - if (ent->flags & XML_DTD_ENTITY_DECLARED) - { - xml_fatal(ctx, "Entity &%s; already declared, skipping not implemented", ent->name); - // FIXME: should be only warning - } - uns c, sep = xml_get_char(ctx); - if (sep == '\'' || sep == '"') - { - /* Internal entity: - * EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&'] | PEReference | Reference)* "'" */ - char *p = mp_start_noalign(dtd->pool, 1); - while (1) - { - if ((c = xml_get_char(ctx)) == sep) - break; - if (c == '%') - { - // FIXME - ASSERT(0); - //xml_parse_parameter_ref(ctx); - continue; - } - if (c == '&') - { - xml_inc(ctx); - if (xml_peek_char(ctx) != '#') - { - /* Bypass references to general entities */ - struct mempool_state state; - mp_save(ctx->stack, &state); - char *n = xml_parse_name(ctx, ctx->stack); - xml_parse_char(ctx, ';'); - xml_dec(ctx); - uns l = strlen(n); - p = mp_spread(dtd->pool, p, 3 + l); - *p++ = '&'; - memcpy(p, n, l); - p += l; - *p++ = ';';; - mp_restore(ctx->stack, &state); - continue; - } - else - { - xml_skip_char(ctx); - c = xml_parse_char_ref(ctx); - } - } - p = mp_spread(dtd->pool, p, 5); - p = utf8_32_put(p, c); - } - *p = 0; - ent->len = p - (char *)mp_ptr(dtd->pool); - ent->text = mp_end(dtd->pool, p + 1); - slist_add_tail(list, &ent->n); - ent->flags = flags | XML_DTD_ENTITY_DECLARED; - } - else - { - /* External entity */ - struct xml_dtd_notn *notn = NULL; - char *system_id, *public_id; - xml_unget_char(ctx); - xml_dtd_parse_external_id(ctx, &system_id, &public_id, 0); - if (xml_parse_dtd_white(ctx, 0) && flags && xml_peek_char(ctx) != '>') - { - /* General external unparsed entity */ - flags |= XML_DTD_ENTITY_UNPARSED; - xml_parse_seq(ctx, "NDATA"); - xml_parse_dtd_white(ctx, 1); - notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx, dtd->pool)); - } - slist_add_tail(list, &ent->n); - ent->flags = flags | XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_EXTERNAL; - ent->system_id = system_id; - ent->public_id = 
public_id; - ent->notn = notn; - } - xml_parse_dtd_white(ctx, 0); - xml_parse_char(ctx, '>'); - xml_dec(ctx); -} - -/* DTD: */ - -void -xml_parse_element_decl(struct xml_context *ctx) -{ - /* Elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' - * Already parsed: '<!ELEMENT' */ - struct xml_dtd *dtd = ctx->dtd; - xml_parse_dtd_white(ctx, 1); - char *name = xml_parse_name(ctx, dtd->pool); - xml_parse_dtd_white(ctx, 1); - struct xml_dtd_elem *elem = xml_dtd_elems_lookup(dtd->tab_elems, name); - if (elem->flags & XML_DTD_ELEM_DECLARED) - xml_fatal(ctx, "Element <%s> already declared", name); - - /* contentspec ::= 'EMPTY' | 'ANY' | Mixed | children */ - uns c = xml_peek_char(ctx); - if (c == 'E') - { - xml_parse_seq(ctx, "EMPTY"); - elem->type = XML_DTD_ELEM_EMPTY; - } - else if (c == 'A') - { - xml_parse_seq(ctx, "ANY"); - elem->type = XML_DTD_ELEM_ANY; - } - else if (c == '(') - { - xml_skip_char(ctx); - xml_inc(ctx); - xml_parse_dtd_white(ctx, 0); - struct xml_dtd_elem_node *parent = elem->node = mp_alloc_zero(dtd->pool, sizeof(*parent)); - if (xml_peek_char(ctx) == '#') - { - /* Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | '(' S? '#PCDATA' S?
')' */ - xml_skip_char(ctx); - xml_parse_seq(ctx, "PCDATA"); - elem->type = XML_DTD_ELEM_MIXED; - parent->type = XML_DTD_ELEM_PCDATA; - while (1) - { - xml_parse_dtd_white(ctx, 0); - if ((c = xml_get_char(ctx)) == ')') - break; - else if (c != '|') - xml_fatal_expected(ctx, ')'); - xml_parse_dtd_white(ctx, 0); - struct xml_dtd_elem *son_elem = xml_dtd_elems_lookup(dtd->tab_elems, xml_parse_name(ctx, dtd->pool)); - if (xml_dtd_enodes_find(dtd->tab_enodes, parent, son_elem)) - xml_error(ctx, "Duplicate content '%s'", son_elem->name); - else - { - struct xml_dtd_elem_node *son = xml_dtd_enodes_new(dtd->tab_enodes, parent, son_elem); - slist_add_tail(&parent->sons, &son->n); - } - } - xml_dec(ctx); - if (xml_peek_char(ctx) == '*') - { - xml_skip_char(ctx); - parent->occur = XML_DTD_ELEM_OCCUR_MULT; - } - else if (!slist_head(&parent->sons)) - parent->occur = XML_DTD_ELEM_OCCUR_ONCE; - else - xml_fatal_expected(ctx, '*'); - } - else - { - /* children ::= (choice | seq) ('?' | '*' | '+')? - * cp ::= (Name | choice | seq) ('?' | '*' | '+')? - * choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')' - * seq ::= '(' S? cp ( S? ',' S? cp )* S? 
')' */ - - elem->type = XML_DTD_ELEM_CHILDREN; - parent->type = XML_DTD_ELEM_PCDATA; - uns c; - goto first; - - while (1) - { - /* After name */ - xml_parse_dtd_white(ctx, 0); - if ((c = xml_get_char(ctx)) == ')') - { - xml_dec(ctx); - if (parent->type == XML_DTD_ELEM_PCDATA) - parent->type = XML_DTD_ELEM_SEQ; - if ((c = xml_get_char(ctx)) == '?') - parent->occur = XML_DTD_ELEM_OCCUR_OPT; - else if (c == '*') - parent->occur = XML_DTD_ELEM_OCCUR_MULT; - else if (c == '+') - parent->occur = XML_DTD_ELEM_OCCUR_PLUS; - else - { - xml_unget_char(ctx); - parent->occur = XML_DTD_ELEM_OCCUR_ONCE; - } - if (!parent->parent) - break; - parent = parent->parent; - continue; - } - else if (c == '|') - { - if (parent->type == XML_DTD_ELEM_PCDATA) - parent->type = XML_DTD_ELEM_OR; - else if (parent->type != XML_DTD_ELEM_OR) - xml_fatal(ctx, "Mixed operators in the list of element children"); - } - else if (c == ',') - { - if (parent->type == XML_DTD_ELEM_PCDATA) - parent->type = XML_DTD_ELEM_SEQ; - else if (parent->type != XML_DTD_ELEM_SEQ) - xml_fatal(ctx, "Mixed operators in the list of element children"); - } - else if (c == '(') - { - xml_inc(ctx); - struct xml_dtd_elem_node *son = mp_alloc_zero(dtd->pool, sizeof(*son)); - son->parent = parent; - slist_add_tail(&parent->sons, &son->n); - parent = son->parent; - son->type = XML_DTD_ELEM_MIXED; - } - else - xml_unget_char(ctx); - - /* Before name */ - xml_parse_dtd_white(ctx, 0); -first:; - struct xml_dtd_elem *son_elem = xml_dtd_elems_lookup(dtd->tab_elems, xml_parse_name(ctx, dtd->pool)); - // FIXME: duplicates, occurrence - //struct xml_dtd_elem_node *son = xml_dtd_enodes_new(dtd->tab_enodes, parent, son_elem); - struct xml_dtd_elem_node *son = mp_alloc_zero(dtd->pool, sizeof(*son)); - son->parent = parent; - son->elem = son_elem; - slist_add_tail(&parent->sons, &son->n); - } - } - } - else - xml_fatal(ctx, "Expected element content specification"); - - xml_parse_dtd_white(ctx, 0); - xml_parse_char(ctx, '>'); - xml_dec(ctx);
-} - -void -xml_parse_attr_list_decl(struct xml_context *ctx) -{ - /* AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>' - * AttDef ::= S Name S AttType S DefaultDecl - * Already parsed: '<!ATTLIST' */ - struct xml_dtd *dtd = ctx->dtd; - xml_parse_dtd_white(ctx, 1); - struct xml_dtd_elem *elem = xml_dtd_elems_lookup(ctx->dtd->tab_elems, xml_parse_name(ctx, dtd->pool)); - - while (xml_parse_dtd_white(ctx, 0) && xml_peek_char(ctx) != '>') - { - char *name = xml_parse_name(ctx, dtd->pool); - struct xml_dtd_attr *attr = xml_dtd_attrs_find(dtd->tab_attrs, elem, name); - uns ignored = 0; - if (attr) - { - xml_warn(ctx, "Duplicate attribute definition"); - ignored++; - } - else - attr = xml_dtd_attrs_new(ctx->dtd->tab_attrs, elem, name); - xml_parse_dtd_white(ctx, 1); - if (xml_peek_char(ctx) == '(') - { - xml_skip_char(ctx); // FIXME: xml_inc/dec ? - if (!ignored) - attr->type = XML_ATTR_ENUM; - do - { - xml_parse_dtd_white(ctx, 0); - char *value = xml_parse_nmtoken(ctx, dtd->pool); - if (!ignored) - if (xml_dtd_evals_find(ctx->dtd->tab_evals, attr, value)) - xml_error(ctx, "Duplicate enumeration value"); - else - xml_dtd_evals_new(ctx->dtd->tab_evals, attr, value); - xml_parse_dtd_white(ctx, 0); - } - while (xml_get_char(ctx) == '|'); - xml_unget_char(ctx); - xml_parse_char(ctx, ')'); - } - else - { - char *type = xml_parse_name(ctx, dtd->pool); - enum xml_dtd_attr_type t = XML_ATTR_CDATA; - if (!strcmp(type, "CDATA")) - t = XML_ATTR_CDATA; - else if (!strcmp(type, "ID")) - t = XML_ATTR_ID; - else if (!strcmp(type, "IDREF")) - t = XML_ATTR_IDREF; - else if (!strcmp(type, "IDREFS")) - t = XML_ATTR_IDREFS; - else if (!strcmp(type, "ENTITY")) - t = XML_ATTR_ENTITY; - else if (!strcmp(type, "ENTITIES")) - t = XML_ATTR_ENTITIES; - else if (!strcmp(type, "NMTOKEN")) - t = XML_ATTR_NMTOKEN; - else if (!strcmp(type, "NMTOKENS")) - t = XML_ATTR_NMTOKENS; - else if (!strcmp(type, "NOTATION")) - { - if (elem->type == XML_DTD_ELEM_EMPTY) - xml_fatal(ctx, "Empty element must not have notation attribute"); - // FIXME: An element type MUST NOT have more than
one NOTATION attribute specified. - t = XML_ATTR_NOTATION; - xml_parse_dtd_white(ctx, 1); - xml_parse_char(ctx, '('); - do - { - xml_parse_dtd_white(ctx, 0); - struct xml_dtd_notn *n = xml_dtd_notns_lookup(ctx->dtd->tab_notns, xml_parse_name(ctx, dtd->pool)); - if (!ignored) - if (xml_dtd_enotns_find(ctx->dtd->tab_enotns, attr, n)) - xml_error(ctx, "Duplicate enumerated notation"); - else - xml_dtd_enotns_new(ctx->dtd->tab_enotns, attr, n); - xml_parse_dtd_white(ctx, 0); - } - while (xml_get_char(ctx) == '|'); - xml_unget_char(ctx); - xml_parse_char(ctx, ')'); - } - else - xml_fatal(ctx, "Unknown attribute type"); - if (!ignored) - attr->type = t; - } - xml_parse_dtd_white(ctx, 1); - enum xml_dtd_attr_default def = XML_ATTR_NONE; - if (xml_get_char(ctx) == '#') - switch (xml_peek_char(ctx)) - { - case 'R': - xml_parse_seq(ctx, "REQUIRED"); - def = XML_ATTR_REQUIRED; - break; - case 'I': - xml_parse_seq(ctx, "IMPLIED"); - def = XML_ATTR_IMPLIED; - break; - case 'F': - xml_parse_seq(ctx, "FIXED"); - def = XML_ATTR_FIXED; - xml_parse_dtd_white(ctx, 1); - break; - default: - xml_fatal(ctx, "Expected a modifier for default attribute value"); - } - else - xml_unget_char(ctx); - if (def != XML_ATTR_REQUIRED && def != XML_ATTR_IMPLIED) - { - char *v = xml_parse_attr_value(ctx, attr); - if (!ignored) - attr->default_value = v; - } - if (!ignored) - attr->default_mode = def; - } - xml_skip_char(ctx); - xml_dec(ctx); -} - -void -xml_skip_internal_subset(struct xml_context *ctx) -{ - TRACE(ctx, "skip_internal_subset"); - /* AlreadyParsed: '[' */ - uns c; - while ((c = xml_get_char(ctx)) != ']') - { - if (c != '<') - continue; - if ((c = xml_get_char(ctx)) == '?') - { - xml_inc(ctx); - xml_skip_pi(ctx); - } - else if (c != '!') - xml_dec(ctx); - else if (xml_get_char(ctx) == '-') - { - xml_inc(ctx); - xml_skip_comment(ctx); - } - else - while ((c = xml_get_char(ctx)) != '>') - if (c == '\'' || c == '"') - while (xml_get_char(ctx) != c); - } - xml_dec(ctx); -} - -/*** Validation 
of attribute values ***/ - -static uns -xml_check_tokens(char *value, uns first_cat, uns next_cat, uns seq) -{ - char *p = value; - uns u; - while (1) - { - p = utf8_32_get(p, &u); - if (!(xml_char_cat(u) & first_cat)) - return 0; - while (*p & ~0x20) - { - p = utf8_32_get(p, &u); - if (!(xml_char_cat(u) & next_cat)) - return 0; - } - if (!*p) - return 1; - if (!seq) - return 0; - p++; - } -} - -static uns -xml_is_name(struct xml_context *ctx, char *value) -{ - /* Name ::= NameStartChar (NameChar)* */ - return xml_check_tokens(value, ctx->cat_sname, ctx->cat_name, 0); -} - -static uns -xml_is_names(struct xml_context *ctx, char *value) -{ - /* Names ::= Name (#x20 Name)* */ - return xml_check_tokens(value, ctx->cat_sname, ctx->cat_name, 1); -} - -static uns -xml_is_nmtoken(struct xml_context *ctx, char *value) -{ - /* Nmtoken ::= (NameChar)+ */ - return xml_check_tokens(value, ctx->cat_name, ctx->cat_name, 0); -} - -static uns -xml_is_nmtokens(struct xml_context *ctx, char *value) -{ - /* Nmtokens ::= Nmtoken (#x20 Nmtoken)* */ - return xml_check_tokens(value, ctx->cat_name, ctx->cat_name, 1); -} - -static void -xml_err_attr_format(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *type) -{ - xml_error(ctx, "Attribute %s in <%s> does not match the production of %s", dtd->name, dtd->elem->name, type); -} - -void -xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value) -{ - if (dtd->type == XML_ATTR_CDATA) - return; - xml_normalize_white(ctx, value); - switch (dtd->type) - { - case XML_ATTR_ID: - if (!xml_is_name(ctx, value)) - xml_err_attr_format(ctx, dtd, "NAME"); - //FIXME: add to a hash table - break; - case XML_ATTR_IDREF: - if (!xml_is_name(ctx, value)) - xml_err_attr_format(ctx, dtd, "NAME"); - // FIXME: find in hash table (beware forward references) - break; - case XML_ATTR_IDREFS: - if (!xml_is_names(ctx, value)) - xml_err_attr_format(ctx, dtd, "NAMES"); - // FIXME: find - break; - case XML_ATTR_ENTITY: - // FIXME - break; - 
case XML_ATTR_ENTITIES: - // FIXME - break; - case XML_ATTR_NMTOKEN: - if (!xml_is_nmtoken(ctx, value)) - xml_err_attr_format(ctx, dtd, "NMTOKEN"); - break; - case XML_ATTR_NMTOKENS: - if (!xml_is_nmtokens(ctx, value)) - xml_err_attr_format(ctx, dtd, "NMTOKENS"); - break; - case XML_ATTR_ENUM: - if (!xml_dtd_evals_find(ctx->dtd->tab_evals, dtd, value)) - xml_error(ctx, "Attribute %s in <%s> contains an undefined enumeration value", dtd->name, dtd->elem->name); - break; - case XML_ATTR_NOTATION: - if (!xml_dtd_find_notn(ctx, value)) - xml_error(ctx, "Attribute %s in <%s> contains an undefined notation", dtd->name, dtd->elem->name); - break; - } -} diff --git a/shxml/dtd.h b/shxml/dtd.h deleted file mode 100644 index 493a5f66..00000000 --- a/shxml/dtd.h +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Sherlock Library -- A simple XML parser - * - * (c) 2007--2008 Pavel Charvat - * - * This software may be freely distributed and used according to the terms - * of the GNU Lesser General Public License. 
- */ - -#ifndef _SHERLOCK_XML_DTD_H -#define _SHERLOCK_XML_DTD_H - -#include - -struct xml_dtd { - struct mempool *pool; /* Memory pool where to allocate DTD */ - slist ents; /* Link list of general entities */ - slist pents; /* Link list of parameter entities */ - slist notns; /* Link list of notations */ - slist elems; /* Link list of elements */ - void *tab_ents; /* Hash table of general entities */ - void *tab_pents; /* Hash table of parameter entities */ - void *tab_notns; /* Hash table of notations */ - void *tab_elems; /* Hash table of elements */ - void *tab_enodes; /* Hash table of element sons */ - void *tab_attrs; /* Hash table of element attributes */ - void *tab_evals; /* Hash table of enumerated attribute values */ - void *tab_enotns; /* hash table of enumerated attribute notations */ -}; - -/* Notations */ - -enum xml_dtd_notn_flags { - XML_DTD_NOTN_DECLARED = 0x1, /* The notation has been declared (internal usage) */ -}; - -struct xml_dtd_notn { - snode n; /* Node in xml_dtd.notns */ - uns flags; /* XML_DTD_NOTN_x */ - char *name; /* Notation name */ - char *system_id; /* External ID */ - char *public_id; - void *user; /* User-defined */ -}; - -struct xml_dtd_notn *xml_dtd_find_notn(struct xml_context *ctx, char *name); - -/* Entities */ - -enum xml_dtd_entity_flags { - XML_DTD_ENTITY_DECLARED = 0x1, /* The entity has been declared (internal usage) */ - XML_DTD_ENTITY_VISITED = 0x2, /* Cycle detection (internal usage) */ - XML_DTD_ENTITY_PARAMETER = 0x4, /* Parameter entity, general otherwise */ - XML_DTD_ENTITY_EXTERNAL = 0x8, /* External entity, internal otherwise */ - XML_DTD_ENTITY_UNPARSED = 0x10, /* Unparsed entity, parsed otherwise */ - XML_DTD_ENTITY_TRIVIAL = 0x20, /* Replacement text is a sequence of characters and character references */ -}; - -struct xml_dtd_entity { - snode n; /* Node in xml_dtd.[gp]ents */ - uns flags; /* XML_DTD_ENT_x */ - char *name; /* Entity name */ - char *text; /* Replacement text / expanded replacement text 
(XML_DTD_ENT_TRIVIAL) */ - uns len; /* Text length */ - char *system_id; /* External ID */ - char *public_id; - struct xml_dtd_notn *notn; /* Notation (XML_DTD_ENT_UNPARSED only) */ - void *user; /* User-defined */ -}; - -struct xml_dtd_entity *xml_dtd_find_entity(struct xml_context *ctx, char *name); - -/* Elements */ - -enum xml_dtd_elem_flags { - XML_DTD_ELEM_DECLARED = 0x1, /* The element has been declared (internal usage) */ -}; - -enum xml_dtd_elem_type { - XML_DTD_ELEM_EMPTY, - XML_DTD_ELEM_ANY, - XML_DTD_ELEM_MIXED, - XML_DTD_ELEM_CHILDREN, -}; - -struct xml_dtd_elem { - snode n; - uns flags; - uns type; - char *name; - struct xml_dtd_elem_node *node; - slist attrs; - void *user; /* User-defined */ -}; - -struct xml_dtd_elem_node { - snode n; - struct xml_dtd_elem_node *parent; - struct xml_dtd_elem *elem; - slist sons; - uns type; - uns occur; - void *user; /* User-defined */ -}; - -enum xml_dtd_elem_node_type { - XML_DTD_ELEM_PCDATA, - XML_DTD_ELEM_SEQ, - XML_DTD_ELEM_OR, -}; - -enum xml_dtd_elem_node_occur { - XML_DTD_ELEM_OCCUR_ONCE, - XML_DTD_ELEM_OCCUR_OPT, - XML_DTD_ELEM_OCCUR_MULT, - XML_DTD_ELEM_OCCUR_PLUS, -}; - -struct xml_dtd_elem *xml_dtd_find_elem(struct xml_context *ctx, char *name); - -/* Attributes */ - -enum xml_dtd_attr_default { - XML_ATTR_NONE, - XML_ATTR_REQUIRED, - XML_ATTR_IMPLIED, - XML_ATTR_FIXED, -}; - -enum xml_dtd_attr_type { - XML_ATTR_CDATA, - XML_ATTR_ID, - XML_ATTR_IDREF, - XML_ATTR_IDREFS, - XML_ATTR_ENTITY, - XML_ATTR_ENTITIES, - XML_ATTR_NMTOKEN, - XML_ATTR_NMTOKENS, - XML_ATTR_ENUM, - XML_ATTR_NOTATION, -}; - -struct xml_dtd_attr { - snode n; - char *name; /* Attribute name */ - struct xml_dtd_elem *elem; /* Owner element */ - uns type; /* See enum xml_dtd_attr_type */ - uns default_mode; /* See enum xml_dtd_attr_default */ - char *default_value; /* The default value defined in DTD (or NULL) */ -}; - -struct xml_dtd_eval { - struct xml_dtd_attr *attr; - char *val; -}; - -struct xml_dtd_enotn { - struct xml_dtd_attr 
*attr; - struct xml_dtd_notn *notn; -}; - -void xml_dtd_init(struct xml_context *ctx); -void xml_dtd_cleanup(struct xml_context *ctx); -void xml_dtd_finish(struct xml_context *ctx); - -struct xml_dtd_attr *xml_dtd_find_attr(struct xml_context *ctx, struct xml_dtd_elem *elem, char *name); - -#endif diff --git a/shxml/internals.h b/shxml/internals.h deleted file mode 100644 index ad2c3a8a..00000000 --- a/shxml/internals.h +++ /dev/null @@ -1,311 +0,0 @@ -/* - * Sherlock Library -- A simple XML parser - * - * (c) 2007--2008 Pavel Charvat - * - * This software may be freely distributed and used according to the terms - * of the GNU Lesser General Public License. - */ - -#ifndef _SHERLOCK_XML_INTERNALS_H -#define _SHERLOCK_XML_INTERNALS_H - -#include -#include - -/*** Debugging ***/ - -#ifdef LOCAL_DEBUG -#define TRACE(c, f, p...) do { DBG("XML %u: " f, xml_row(c), ##p); } while(0) -#else -#define TRACE(c, f, p...) do {} while(0) -#endif - -/*** Error handling ***/ - -void NONRET xml_throw(struct xml_context *ctx); - -/*** Memory management ***/ - -struct xml_stack { - struct xml_stack *next; - struct mempool_state state; - uns flags; -}; - -static inline void * -xml_do_push(struct xml_context *ctx, uns size) -{ - /* Saves ctx->stack and ctx->flags state */ - struct mempool_state state; - mp_save(ctx->stack, &state); - struct xml_stack *s = mp_alloc(ctx->stack, size); - s->state = state; - s->flags = ctx->flags; - s->next = ctx->stack_list; - ctx->stack_list = s; - return s; -} - -static inline void -xml_do_pop(struct xml_context *ctx, struct xml_stack *s) -{ - /* Restore ctx->stack and ctx->flags state */ - ctx->stack_list = s->next; - ctx->flags = s->flags; - mp_restore(ctx->stack, &s->state); -} - -static inline void -xml_push(struct xml_context *ctx) -{ - TRACE(ctx, "push"); - xml_do_push(ctx, sizeof(struct xml_stack)); -} - -static inline void -xml_pop(struct xml_context *ctx) -{ - TRACE(ctx, "pop"); - ASSERT(ctx->stack_list); - xml_do_pop(ctx, ctx->stack_list); -} 
- -struct xml_dom_stack { - struct xml_stack stack; - struct mempool_state state; -}; - -static inline struct xml_node * -xml_push_dom(struct xml_context *ctx, struct mempool_state *state) -{ - /* Create a new DOM node */ - TRACE(ctx, "push_dom"); - struct xml_dom_stack *s = xml_do_push(ctx, sizeof(*s)); - if (state) - s->state = *state; - else - mp_save(ctx->pool, &s->state); - struct xml_node *n = mp_alloc(ctx->pool, sizeof(*n)); - n->user = NULL; - if (n->parent = ctx->node) - clist_add_tail(&n->parent->sons, &n->n); - return ctx->node = n; -} - -static inline void -xml_pop_dom(struct xml_context *ctx, uns free) -{ - /* Leave DOM subtree */ - TRACE(ctx, "pop_dom"); - ASSERT(ctx->node); - struct xml_node *p = ctx->node->parent; - struct xml_dom_stack *s = (void *)ctx->stack_list; - if (free) - { - /* See xml_pop_element() for cleanup of attribute hash table */ - if (p) - clist_remove(&ctx->node->n); - mp_restore(ctx->pool, &s->state); - } - ctx->node = p; - xml_do_pop(ctx, &s->stack); -} - -#define XML_HASH_HDR_SIZE ALIGN_TO(sizeof(void *), CPU_STRUCT_ALIGN) -#define XML_HASH_GIVE_ALLOC struct HASH_PREFIX(table); \ - static inline void *HASH_PREFIX(alloc)(struct HASH_PREFIX(table) *t, uns size) \ - { return mp_alloc(*(void **)((void *)t - XML_HASH_HDR_SIZE), size); } \ - static inline void HASH_PREFIX(free)(struct HASH_PREFIX(table) *t UNUSED, void *p UNUSED) {} - -void *xml_hash_new(struct mempool *pool, uns size); - -void xml_spout_chars(struct fastbuf *fb); - -/*** Reading of document/external entities ***/ - -void NONRET xml_fatal_nested(struct xml_context *ctx); - -static inline void -xml_inc(struct xml_context *ctx) -{ - /* Called after the first character of a block */ - TRACE(ctx, "inc"); - ctx->depth++; -} - -static inline void -xml_dec(struct xml_context *ctx) -{ - /* Called after the last character of a block */ - TRACE(ctx, "dec"); - if (unlikely(!ctx->depth--)) - xml_fatal_nested(ctx); -} - -#include "obj/shxml/unicat.h" - -static inline uns 
-xml_char_cat(uns c) -{ - if (c < 0x10000) - return 1U << xml_char_tab1[(c & 0xff) + xml_char_tab2[c >> 8]]; - else if (likely(c < 0x110000)) - return 1U << xml_char_tab3[c >> 16]; - else - return 1; -} - -static inline uns -xml_ascii_cat(uns c) -{ - return xml_char_tab1[c]; -} - -struct xml_source *xml_push_source(struct xml_context *ctx); -void xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent); - -void xml_refill(struct xml_context *ctx); - -static inline uns -xml_peek_char(struct xml_context *ctx) -{ - if (ctx->bptr == ctx->bstop) - xml_refill(ctx); - return ctx->bptr[0]; -} - -static inline uns -xml_peek_cat(struct xml_context *ctx) -{ - if (ctx->bptr == ctx->bstop) - xml_refill(ctx); - return ctx->bptr[1]; -} - -static inline uns -xml_get_char(struct xml_context *ctx) -{ - uns c = xml_peek_char(ctx); - ctx->bptr += 2; - return c; -} - -static inline uns -xml_get_cat(struct xml_context *ctx) -{ - uns c = xml_peek_cat(ctx); - ctx->bptr += 2; - return c; -} - -static inline uns -xml_last_char(struct xml_context *ctx) -{ - return ctx->bptr[-2]; -} - -static inline uns -xml_last_cat(struct xml_context *ctx) -{ - return ctx->bptr[-1]; -} - -static inline uns -xml_skip_char(struct xml_context *ctx) -{ - uns c = ctx->bptr[0]; - ctx->bptr += 2; - return c; -} - -static inline uns -xml_unget_char(struct xml_context *ctx) -{ - return *(ctx->bptr -= 2); -} - -void xml_sources_cleanup(struct xml_context *ctx); - -/*** Parsing ***/ - -void NONRET xml_fatal_expected(struct xml_context *ctx, uns c); -void NONRET xml_fatal_expected_white(struct xml_context *ctx); -void NONRET xml_fatal_expected_quot(struct xml_context *ctx); - -static inline uns -xml_parse_white(struct xml_context *ctx, uns mandatory) -{ - /* mandatory=1 -> S ::= (#x20 | #x9 | #xD | #xA)+ - * mandatory=0 -> S? 
*/ - uns cnt = 0; - while (xml_peek_cat(ctx) & XML_CHAR_WHITE) - { - xml_skip_char(ctx); - cnt++; - } - if (unlikely(mandatory && !cnt)) - xml_fatal_expected_white(ctx); - return cnt; -} - -static inline void -xml_parse_char(struct xml_context *ctx, uns c) -{ - /* Consumes a given Unicode character */ - if (unlikely(c != xml_get_char(ctx))) - xml_fatal_expected(ctx, c); -} - -static inline void -xml_parse_seq(struct xml_context *ctx, const char *seq) -{ - /* Consumes a given sequence of ASCII characters */ - while (*seq) - xml_parse_char(ctx, *seq++); -} - -void xml_parse_eq(struct xml_context *ctx); - -static inline uns -xml_parse_quote(struct xml_context *ctx) -{ - /* "'" | '"' */ - uns c = xml_get_char(ctx); - if (unlikely(c != '\'' && c != '\"')) - xml_fatal_expected_quot(ctx); - return c; -} - -char *xml_parse_name(struct xml_context *ctx, struct mempool *pool); -void xml_skip_name(struct xml_context *ctx); -char *xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool); - -char *xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool); -char *xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool); - -uns xml_parse_char_ref(struct xml_context *ctx); -void xml_parse_pe_ref(struct xml_context *ctx); - -char *xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr); - -void xml_skip_internal_subset(struct xml_context *ctx); -void xml_parse_notation_decl(struct xml_context *ctx); -void xml_parse_entity_decl(struct xml_context *ctx); -void xml_parse_element_decl(struct xml_context *ctx); -void xml_parse_attr_list_decl(struct xml_context *ctx); - -void xml_push_comment(struct xml_context *ctx); -void xml_pop_comment(struct xml_context *ctx); -void xml_skip_comment(struct xml_context *ctx); - -void xml_push_pi(struct xml_context *ctx); -void xml_pop_pi(struct xml_context *ctx); -void xml_skip_pi(struct xml_context *ctx); - -void xml_attrs_table_init(struct xml_context *ctx); -void xml_attrs_table_cleanup(struct 
xml_context *ctx); - -void xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value); - -#endif diff --git a/shxml/libucw-xml.pc b/shxml/libucw-xml.pc deleted file mode 100644 index 2115af95..00000000 --- a/shxml/libucw-xml.pc +++ /dev/null @@ -1,14 +0,0 @@ -# pkg-config metadata for libucw-xml - -libdir=@LIBDIR@ -incdir=. - -# Override if you want to use the -pic version -picsuffix= - -Name: libucw-xml -Description: XML parser for LibUCW project -Version: @UCW_ABI_VERSION@ -Cflags: -I${incdir} -Libs: -L${libdir} -lucw-xml${picsuffix} -Requires: @DEPS@ diff --git a/shxml/parse.c b/shxml/parse.c deleted file mode 100644 index 0865dca9..00000000 --- a/shxml/parse.c +++ /dev/null @@ -1,1287 +0,0 @@ -/* - * Sherlock Library -- A simple XML parser - * - * (c) 2007--2008 Pavel Charvat - * - * This software may be freely distributed and used according to the terms - * of the GNU Lesser General Public License. - */ - -#undef LOCAL_DEBUG - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/*** Basic parsing ***/ - -void NONRET -xml_fatal_expected(struct xml_context *ctx, uns c) -{ - if (c >= 32 && c < 128) - xml_fatal(ctx, "Expected '%c'", c); - else - xml_fatal(ctx, "Expected U+%04x", c); -} - -void NONRET -xml_fatal_expected_white(struct xml_context *ctx) -{ - xml_fatal(ctx, "Expected a white space"); -} - -void NONRET -xml_fatal_expected_quot(struct xml_context *ctx) -{ - xml_fatal(ctx, "Expected a quotation mark"); -} - -void -xml_parse_eq(struct xml_context *ctx) -{ - /* Eq ::= S? '=' S? 
*/ - xml_parse_white(ctx, 0); - xml_parse_char(ctx, '='); - xml_parse_white(ctx, 0); -} - -/*** Names and nmtokens ***/ - -static char * -xml_parse_string(struct xml_context *ctx, struct mempool *pool, uns first_cat, uns next_cat, char *err) -{ - char *p = mp_start_noalign(pool, 1); - if (unlikely(!(xml_peek_cat(ctx) & first_cat))) - xml_fatal(ctx, "%s", err); - do - { - p = mp_spread(pool, p, 5); - p = utf8_32_put(p, xml_skip_char(ctx)); - } - while (xml_peek_cat(ctx) & next_cat); - *p++ = 0; - return mp_end(pool, p); -} - -static void -xml_skip_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err) -{ - if (unlikely(!(xml_get_cat(ctx) & first_cat))) - xml_fatal(ctx, "%s", err); - while (xml_peek_cat(ctx) & next_cat) - xml_skip_char(ctx); -} - -char * -xml_parse_name(struct xml_context *ctx, struct mempool *pool) -{ - /* Name ::= NameStartChar (NameChar)* */ - return xml_parse_string(ctx, pool, ctx->cat_sname, ctx->cat_name, "Expected a name"); -} - -void -xml_skip_name(struct xml_context *ctx) -{ - xml_skip_string(ctx, ctx->cat_sname, ctx->cat_name, "Expected a name"); -} - -char * -xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool) -{ - /* Nmtoken ::= (NameChar)+ */ - return xml_parse_string(ctx, pool, ctx->cat_name, ctx->cat_name, "Expected a nmtoken"); -} - -/*** Simple literals ***/ - -char * -xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool) -{ - /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */ - char *p = mp_start_noalign(pool, 1); - uns q = xml_parse_quote(ctx), c; - while ((c = xml_get_char(ctx)) != q) - { - p = mp_spread(pool, p, 5); - p = utf8_32_put(p, c); - } - *p++ = 0; - return mp_end(pool, p); -} - -char * -xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool) -{ - /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */ - char *p = mp_start_noalign(pool, 1); - uns q = xml_parse_quote(ctx), c; - while ((c = xml_get_char(ctx)) != q) - { - if 
(unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID))) - xml_fatal(ctx, "Expected a pubid character"); - p = mp_spread(pool, p, 2); - *p++ = c; - } - *p++ = 0; - return mp_end(pool, p); -} - -/*** Comments ***/ - -void -xml_push_comment(struct xml_context *ctx) -{ - TRACE(ctx, "push_comment"); - /* Comment ::= '' - * Already parsed: 'type = XML_NODE_COMMENT; - char *p = mp_start_noalign(ctx->pool, 6); - while (1) - { - if (xml_get_char(ctx) == '-') - if (xml_get_char(ctx) == '-') - break; - else - *p++ = '-'; - p = utf8_32_put(p, xml_last_char(ctx)); - p = mp_spread(ctx->pool, p, 6); - } - xml_parse_char(ctx, '>'); - *p = 0; - n->len = p - (char *)mp_ptr(ctx->pool); - n->text = mp_end(ctx->pool, p + 1); - if ((ctx->flags & XML_REPORT_COMMENTS) && ctx->h_comment) - ctx->h_comment(ctx); -} - -void -xml_pop_comment(struct xml_context *ctx) -{ - xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_COMMENTS)); - xml_dec(ctx); - TRACE(ctx, "pop_comment"); -} - -void -xml_skip_comment(struct xml_context *ctx) -{ - TRACE(ctx, "skip_comment"); - xml_parse_char(ctx, '-'); - while (xml_get_char(ctx) != '-' || xml_get_char(ctx) != '-'); - xml_parse_char(ctx, '>'); - xml_dec(ctx); -} - -/*** Processing instructions ***/ - -void -xml_push_pi(struct xml_context *ctx) -{ - TRACE(ctx, "push_pi"); - /* Parses a PI to ctx->value and ctx->name: - * PI ::= '' Char*)))? 
'?>' - * PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) - * Already parsed: 'type = XML_NODE_PI; - n->name = xml_parse_name(ctx, ctx->pool); - if (unlikely(!strcasecmp(n->name, "xml"))) - xml_error(ctx, "Reserved PI target"); - char *p = mp_start_noalign(ctx->pool, 5); - if (!xml_parse_white(ctx, 0)) - xml_parse_seq(ctx, "?>"); - else - while (1) - { - if (xml_get_char(ctx) == '?') - if (xml_peek_char(ctx) == '>') - { - xml_skip_char(ctx); - break; - } - else - *p++ = '?'; - else - p = utf8_32_put(p, xml_last_char(ctx)); - p = mp_spread(ctx->pool, p, 5); - } - *p = 0; - n->len = p - (char *)mp_ptr(ctx->pool); - n->text = mp_end(ctx->pool, p + 1); - if ((ctx->flags & XML_REPORT_PIS) && ctx->h_pi) - ctx->h_pi(ctx); -} - -void -xml_pop_pi(struct xml_context *ctx) -{ - xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_PIS)); - xml_dec(ctx); - TRACE(ctx, "pop_pi"); -} - -void -xml_skip_pi(struct xml_context *ctx) -{ - TRACE(ctx, "skip_pi"); - if (ctx->flags & XML_VALIDATING) - { - struct mempool_state state; - mp_save(ctx->stack, &state); - if (unlikely(!strcasecmp(xml_parse_name(ctx, ctx->stack), "xml"))) - xml_error(ctx, "Reserved PI target"); - mp_restore(ctx->stack, &state); - if (!xml_parse_white(ctx, 0)) - { - xml_parse_seq(ctx, "?>"); - xml_dec(ctx); - return; - } - } - while (1) - if (xml_get_char(ctx) == '?') - if (xml_peek_char(ctx) == '>') - break; - xml_skip_char(ctx); - xml_dec(ctx); -} - -/*** Character references ***/ - -uns -xml_parse_char_ref(struct xml_context *ctx) -{ - TRACE(ctx, "parse_char_ref"); - /* CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' - * Already parsed: '&#' */ - uns v = 0; - if (xml_get_char(ctx) == 'x') - { - if (!(xml_get_cat(ctx) & XML_CHAR_XDIGIT)) - { - xml_error(ctx, "Expected a hexadecimal value of character reference"); - goto recover; - } - do - { - v = (v << 4) + Cxvalue(xml_last_char(ctx)); - } - while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_XDIGIT)); - } - else - { - if (!(xml_last_cat(ctx) & 
XML_CHAR_DIGIT)) - { - xml_error(ctx, "Expected a numeric value of character reference"); - goto recover; - } - do - { - v = v * 10 + xml_last_char(ctx) - '0'; - } - while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT)); - } - uns cat = xml_char_cat(v); - if (!(cat & ctx->cat_unrestricted)) - { - xml_error(ctx, "Character reference out of range"); - goto recover; - } - if (xml_last_char(ctx) == ';') - { - xml_dec(ctx); - return v; - } - xml_error(ctx, "Expected ';'"); -recover: - while (xml_last_char(ctx) != ';') - xml_get_char(ctx); - xml_dec(ctx); - return UNI_REPLACEMENT; -} - -/*** References to general entities ***/ - -static void -xml_parse_ref(struct xml_context *ctx) -{ - /* Reference ::= EntityRef | CharRef - * EntityRef ::= '&' Name ';' - * Already parsed: '&' */ - struct fastbuf *out = &ctx->chars; - if (xml_peek_char(ctx) == '#') - { - xml_skip_char(ctx); - bput_utf8_32(out, xml_parse_char_ref(ctx)); - } - else - { - TRACE(ctx, "parse_ge_ref"); - struct mempool_state state; - mp_save(ctx->stack, &state); - char *name = xml_parse_name(ctx, ctx->stack); - xml_parse_char(ctx, ';'); - struct xml_dtd_entity *ent = xml_dtd_find_entity(ctx, name); - if (!ent) - { - xml_error(ctx, "Unknown entity &%s;", name); - bputc(out, '&'); - bputs(out, name); - bputc(out, ';'); - } - else if (ent->flags & XML_DTD_ENTITY_TRIVIAL) - { - TRACE(ctx, "Trivial entity &%s;", name); - bputs(out, ent->text); - } - else - { - TRACE(ctx, "Pushed entity &%s;", name); - mp_restore(ctx->stack, &state); - xml_dec(ctx); - xml_push_entity(ctx, ent); - return; - } - mp_restore(ctx->stack, &state); - xml_dec(ctx); - } -} - -/*** Character data ***/ - -void -xml_spout_chars(struct fastbuf *fb) -{ - if (fb->bptr < fb->bufend) - return; - struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb); - struct mempool *pool = ctx->pool; - if (fb->bufend != fb->buffer) - { - TRACE(ctx, "growing chars"); - uns len = fb->bufend - fb->buffer; - uns reported = fb->bstop - fb->buffer; - 
fb->buffer = mp_expand(pool); - fb->bufend = fb->buffer + mp_avail(pool); - fb->bptr = fb->buffer + len; - fb->bstop = fb->buffer + reported; - } - else - { - TRACE(ctx, "starting chars"); - mp_save(pool, &ctx->chars_state); - fb->bptr = fb->buffer = fb->bstop = mp_start_noalign(pool, 2); - fb->bufend = fb->buffer + mp_avail(pool) - 1; - } -} - -static inline uns -xml_end_chars(struct xml_context *ctx, char **out) -{ - struct fastbuf *fb = &ctx->chars; - uns len = fb->bptr - fb->buffer; - if (len) - { - TRACE(ctx, "ending chars"); - *fb->bptr = 0; - *out = mp_end(ctx->pool, fb->bptr + 1); - fb->bufend = fb->bstop = fb->bptr = fb->buffer; - } - return len; -} - -static inline uns -xml_report_chars(struct xml_context *ctx, char **out) -{ - struct fastbuf *fb = &ctx->chars; - uns len = fb->bptr - fb->buffer; - if (len) - { - *fb->bptr = 0; - *out = fb->bstop; - fb->bstop = fb->bptr; - } - return len; -} - -static inline uns -xml_flush_chars(struct xml_context *ctx) -{ - char *text, *rtext; - uns len = xml_end_chars(ctx, &text), rlen; - if (len) - { - if (ctx->flags & XML_NO_CHARS) - { - if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_ignorable) - ctx->h_ignorable(ctx, text, len); - mp_restore(ctx->pool, &ctx->chars_state); - return 0; - } - if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext))) - ctx->h_block(ctx, rtext, rlen); - if (!(ctx->flags & XML_ALLOC_CHARS) && !(ctx->flags & XML_REPORT_CHARS)) - { - mp_restore(ctx->pool, &ctx->chars_state); - return 0; - } - struct xml_node *n = xml_push_dom(ctx, &ctx->chars_state); - n->type = XML_NODE_CHARS; - n->text = text; - n->len = len; - if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_chars) - ctx->h_chars(ctx); - } - return len; -} - -static inline void -xml_pop_chars(struct xml_context *ctx) -{ - xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS)); - TRACE(ctx, "pop_chars"); -} - -static inline void -xml_append_chars(struct xml_context *ctx) -{ - TRACE(ctx, "append_chars"); - struct 
fastbuf *out = &ctx->chars; - if (ctx->flags & XML_NO_CHARS) - while (xml_get_char(ctx) != '<') - if (xml_last_cat(ctx) & XML_CHAR_WHITE) - bput_utf8_32(out, xml_last_char(ctx)); - else - { - xml_error(ctx, "This element must not contain character data"); - while (xml_get_char(ctx) != '<'); - break; - } - else - while (xml_get_char(ctx) != '<') - if (xml_last_char(ctx) == '&') - { - xml_inc(ctx); - xml_parse_ref(ctx); - } - else - bput_utf8_32(out, xml_last_char(ctx)); - xml_unget_char(ctx); -} - -/*** CDATA sections ***/ - -static void -xml_skip_cdata(struct xml_context *ctx) -{ - TRACE(ctx, "skip_cdata"); - xml_parse_seq(ctx, "CDATA["); - while (xml_get_char(ctx) != ']' || xml_get_char(ctx) != ']' || xml_get_char(ctx) != '>'); - xml_dec(ctx); -} - -static void -xml_append_cdata(struct xml_context *ctx) -{ - /* CDSect :== '' Char*)) ']]>' - * Already parsed: 'flags & XML_NO_CHARS) - { - xml_error(ctx, "This element must not contain CDATA"); - xml_skip_cdata(ctx); - return; - } - xml_parse_seq(ctx, "CDATA["); - struct fastbuf *out = &ctx->chars; - uns rlen; - char *rtext; - if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext))) - ctx->h_block(ctx, rtext, rlen); - while (1) - { - if (xml_get_char(ctx) == ']') - { - if (xml_get_char(ctx) == ']') - if (xml_get_char(ctx) == '>') - break; - else - bputc(out, ']'); - bputc(out, ']'); - } - bput_utf8_32(out, xml_last_char(ctx)); - } - if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_cdata && (rlen = xml_report_chars(ctx, &rtext))) - ctx->h_cdata(ctx, rtext, rlen); - xml_dec(ctx); -} - -/*** Attribute values ***/ - -char * -xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr UNUSED) -{ - TRACE(ctx, "parse_attr_value"); - /* AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'" */ - /* FIXME: -- check value constrains / normalize leading/trailing WS and repeated WS */ - struct mempool_state state; - uns quote = xml_parse_quote(ctx); - 
mp_save(ctx->stack, &state); - struct fastbuf *out = &ctx->chars; - struct xml_source *src = ctx->src; - while (1) - { - uns c = xml_get_char(ctx); - if (c == '&') - { - xml_inc(ctx); - xml_parse_ref(ctx); - } - else if (c == quote && src == ctx->src) - break; - else if (c == '<') - xml_error(ctx, "Attribute value must not contain '<'"); - else if (xml_last_cat(ctx) & XML_CHAR_WHITE) - bputc(out, ' '); - else - bput_utf8_32(out, c); - } - mp_restore(ctx->stack, &state); - char *text; - return xml_end_chars(ctx, &text) ? text : ""; -} - -uns -xml_normalize_white(struct xml_context *ctx UNUSED, char *text) -{ - char *s = text, *d = text; - while (*s == 0x20) - s++; - while (1) - { - while (*s & ~0x20) - *d++ = *s++; - if (!*s) - break; - while (*++s == 0x20); - *d++ = 0x20; - } - if (d != text && d[-1] == 0x20) - d--; - *d = 0; - return d - text; -} - -/*** Attributes ***/ - -struct xml_attrs_table; - -static inline uns -xml_attrs_hash(struct xml_attrs_table *t UNUSED, struct xml_node *e, char *n) -{ - return hash_pointer(e) ^ hash_string(n); -} - -static inline int -xml_attrs_eq(struct xml_attrs_table *t UNUSED, struct xml_node *e1, char *n1, struct xml_node *e2, char *n2) -{ - return (e1 == e2) && !strcmp(n1, n2); -} - -static inline void -xml_attrs_init_key(struct xml_attrs_table *t UNUSED, struct xml_attr *a, struct xml_node *e, char *name) -{ - a->elem = e; - a->name = name; - a->val = NULL; - a->user = NULL; - slist_add_tail(&e->attrs, &a->n); -} - -#define HASH_PREFIX(x) xml_attrs_##x -#define HASH_NODE struct xml_attr -#define HASH_KEY_COMPLEX(x) x elem, x name -#define HASH_KEY_DECL struct xml_node *elem, char *name -#define HASH_TABLE_DYNAMIC -#define HASH_GIVE_EQ -#define HASH_GIVE_HASHFN -#define HASH_GIVE_INIT_KEY -#define HASH_WANT_CLEANUP -#define HASH_WANT_REMOVE -#define HASH_WANT_LOOKUP -#define HASH_WANT_FIND -#define HASH_GIVE_ALLOC -XML_HASH_GIVE_ALLOC -#include - -static void -xml_parse_attr(struct xml_context *ctx) -{ - TRACE(ctx, 
"parse_attr"); - /* Attribute ::= Name Eq AttValue */ - struct xml_node *e = ctx->node; - char *n = xml_parse_name(ctx, ctx->pool); - struct xml_attr *a = xml_attrs_lookup(ctx->tab_attrs, e, n); - xml_parse_eq(ctx); - char *v = xml_parse_attr_value(ctx, NULL); - if (a->val) - { - xml_error(ctx, "Attribute %s is not unique in element <%s>", n, e->name); - return; - } - a->val = v; - if (!e->dtd) - a->dtd = NULL; - else if (!(a->dtd = xml_dtd_find_attr(ctx, e->dtd, a->name))) - xml_error(ctx, "Undefined attribute %s in element <%s>", n, e->name); - else - xml_validate_attr(ctx, a->dtd, a->val); -} - -struct xml_attr * -xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name) -{ - return xml_attrs_find(ctx->tab_attrs, node, name); -} - -char * -xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name) -{ - struct xml_attr *attr = xml_attrs_find(ctx->tab_attrs, node, name); - if (attr) - return attr->val; - if (!node->dtd) - return NULL; - struct xml_dtd_attr *dtd = xml_dtd_find_attr(ctx, node->dtd, name); - return dtd ? dtd->default_value : NULL; -} - -void -xml_attrs_table_init(struct xml_context *ctx) -{ - xml_attrs_init(ctx->tab_attrs = xml_hash_new(ctx->pool, sizeof(struct xml_attrs_table))); -} - -void -xml_attrs_table_cleanup(struct xml_context *ctx) -{ - xml_attrs_cleanup(ctx->tab_attrs); -} - -/*** Elements ***/ - -static uns -xml_validate_element(struct xml_dtd_elem_node *root, struct xml_dtd_elem *elem) -{ - if (root->elem) - return elem == root->elem; - else - SLIST_FOR_EACH(struct xml_dtd_elem_node *, son, root->sons) - if (xml_validate_element(son, elem)) - return 1; - return 0; -} - -static void -xml_push_element(struct xml_context *ctx) -{ - TRACE(ctx, "push_element"); - /* EmptyElemTag | STag - * EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' - * STag ::= '<' Name (S Attribute)* S? 
'>' - * Already parsed: '<' */ - struct xml_node *e = xml_push_dom(ctx, NULL); - clist_init(&e->sons); - e->type = XML_NODE_ELEM; - e->name = xml_parse_name(ctx, ctx->pool); - slist_init(&e->attrs); - if (!e->parent) - { - ctx->dom = e; - if (ctx->doctype && strcmp(e->name, ctx->doctype)) - xml_error(ctx, "The root element <%s> does not match the document type <%s>", e->name, ctx->doctype); - } - if (!ctx->dtd) - e->dtd = NULL; - else if (!(e->dtd = xml_dtd_find_elem(ctx, e->name))) - xml_error(ctx, "Undefined element <%s>", e->name); - else - { - struct xml_dtd_elem *dtd = e->dtd, *parent_dtd = e->parent ? e->parent->dtd : NULL; - if (dtd->type == XML_DTD_ELEM_MIXED) - ctx->flags &= ~XML_NO_CHARS; - else - ctx->flags |= XML_NO_CHARS; - if (parent_dtd) - if (parent_dtd->type == XML_DTD_ELEM_EMPTY) - xml_error(ctx, "Empty element must not contain children"); - else if (parent_dtd->type != XML_DTD_ELEM_ANY) - { - // FIXME: validate regular expressions - if (!xml_validate_element(parent_dtd->node, dtd)) - xml_error(ctx, "Unexpected element <%s>", e->name); - } - } - while (1) - { - uns white = xml_parse_white(ctx, 0); - uns c = xml_get_char(ctx); - if (c == '/') - { - xml_parse_char(ctx, '>'); - ctx->flags |= XML_EMPTY_ELEM_TAG; - break; - } - else if (c == '>') - break; - else if (!white) - xml_fatal_expected_white(ctx); - xml_unget_char(ctx); - xml_parse_attr(ctx); - } - if (e->dtd) - SLIST_FOR_EACH(struct xml_dtd_attr *, a, e->dtd->attrs) - if (a->default_mode == XML_ATTR_REQUIRED) - { - if (!xml_attrs_find(ctx->tab_attrs, e, a->name)) - xml_error(ctx, "Missing required attribute %s in element <%s>", a->name, e->name); - } - else if (a->default_mode != XML_ATTR_IMPLIED && ctx->flags & XML_ALLOC_DEFAULT_ATTRS) - { - struct xml_attr *attr = xml_attrs_lookup(ctx->tab_attrs, e, a->name); - if (!attr->val) - attr->val = a->default_value; - } - if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_stag) - ctx->h_stag(ctx); -} - -static void -xml_pop_element(struct xml_context 
*ctx) -{ - TRACE(ctx, "pop_element"); - if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_etag) - ctx->h_etag(ctx); - struct xml_node *e = ctx->node; - uns free = !(ctx->flags & XML_ALLOC_TAGS); - if (free) - { - if (!e->parent) - ctx->dom = NULL; - /* Restore hash table of attributes */ - SLIST_FOR_EACH(struct xml_attr *, a, e->attrs) - xml_attrs_remove(ctx->tab_attrs, a); - struct xml_node *n; - while (n = clist_head(&e->sons)) - { - if (n->type == XML_NODE_ELEM) - { - SLIST_FOR_EACH(struct xml_attr *, a, n->attrs) - xml_attrs_remove(ctx->tab_attrs, a); - clist_insert_list_after(&n->sons, &n->n); - } - clist_remove(&n->n); - } - } - xml_pop_dom(ctx, free); - xml_dec(ctx); -} - -static void -xml_parse_etag(struct xml_context *ctx) -{ - /* ETag ::= '' - * Already parsed: '<' */ - struct xml_node *e = ctx->node; - ASSERT(e); - char *n = e->name; - while (*n) - { - uns c; - n = utf8_32_get(n, &c); - if (xml_get_char(ctx) != c) - goto recover; - } - xml_parse_white(ctx, 0); - if (xml_get_char(ctx) != '>') - { -recover: - xml_error(ctx, "Invalid ETag, expected ", e->name); - while (xml_get_char(ctx) != '>'); - } - xml_dec(ctx); -} - -/*** Document type declaration ***/ - -static void -xml_parse_doctype_decl(struct xml_context *ctx) -{ - TRACE(ctx, "parse_doctype_decl"); - /* doctypedecl ::= '' - * Already parsed: '' */ - if (ctx->doctype) - xml_fatal(ctx, "Multiple document types not allowed"); - xml_parse_seq(ctx, "DOCTYPE"); - xml_parse_white(ctx, 1); - ctx->doctype = xml_parse_name(ctx, ctx->pool); - TRACE(ctx, "doctype=%s", ctx->doctype); - uns c; - if (xml_parse_white(ctx, 0) && ((c = xml_peek_char(ctx)) == 'S' || c == 'P')) - { - if (c == 'S') - { - xml_parse_seq(ctx, "SYSTEM"); - xml_parse_white(ctx, 1); - ctx->system_id = xml_parse_system_literal(ctx, ctx->pool); - } - else - { - xml_parse_seq(ctx, "PUBLIC"); - xml_parse_white(ctx, 1); - ctx->public_id = xml_parse_pubid_literal(ctx, ctx->pool); - xml_parse_white(ctx, 1); - ctx->system_id = 
xml_parse_system_literal(ctx, ctx->pool); - } - xml_parse_white(ctx, 0); - ctx->flags |= XML_HAS_EXTERNAL_SUBSET; - } - if (xml_peek_char(ctx) == '[') - { - ctx->flags |= XML_HAS_INTERNAL_SUBSET; - xml_skip_char(ctx); - xml_inc(ctx); - } - if (ctx->h_doctype_decl) - ctx->h_doctype_decl(ctx); -} - - - -/////////////////////////////////////////////////////////////////////////////////////////////////////////// - -/* DTD: Internal subset */ - -static void -xml_parse_subset(struct xml_context *ctx, uns external) -{ - // FIXME: - // -- comments/pi have no parent - // -- conditional sections in external subset - // -- check corectness of parameter entities - - /* '[' intSubset ']' - * intSubset :== (markupdecl | DeclSep) - * Already parsed: '[' - * - * extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)* - */ - while (1) - { - xml_parse_white(ctx, 0); - uns c = xml_get_char(ctx); - xml_inc(ctx); - if (c == '<') - if ((c = xml_get_char(ctx)) == '!') - switch (c = xml_get_char(ctx)) - { - case '-': - xml_push_comment(ctx); - xml_pop_comment(ctx); - break; - case 'N': - xml_parse_seq(ctx, "OTATION"); - xml_parse_notation_decl(ctx); - break; - case 'E': - if ((c = xml_get_char(ctx)) == 'N') - { - xml_parse_seq(ctx, "TITY"); - xml_parse_entity_decl(ctx); - } - else if (c == 'L') - { - xml_parse_seq(ctx, "EMENT"); - xml_parse_element_decl(ctx); - } - else - goto invalid_markup; - break; - case 'A': - xml_parse_seq(ctx, "TTLIST"); - xml_parse_attr_list_decl(ctx); - break; - default: - goto invalid_markup; - } - else if (c == '?') - { - xml_push_pi(ctx); - xml_pop_pi(ctx); - } - else - goto invalid_markup; - else if (c == '%') - xml_parse_pe_ref(ctx); - else if (c == ']' && !external) - { - break; - } - else if (c == '>' && external) - { - break; - } - else - goto invalid_markup; - } - xml_dec(ctx); - return; -invalid_markup: ; - xml_fatal(ctx, "Invalid markup in the %s subset", external ? 
"external" : "internal"); -} - -/*** The State Machine ***/ - -uns -xml_next(struct xml_context *ctx) -{ - /* A nasty state machine */ - -#define PULL(x) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##x; case XML_STATE_##x: ; } while (0) -#define PULL_STATE(x, s) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##s, XML_STATE_##x; case XML_STATE_##s: ; } while (0) - - TRACE(ctx, "xml_next (state=%u)", ctx->state); - jmp_buf throw_buf; - ctx->throw_buf = &throw_buf; - if (setjmp(throw_buf)) - { -error: - if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal) - ctx->h_fatal(ctx); - TRACE(ctx, "raised fatal error"); - return ctx->state = XML_STATE_EOF; - } - uns c; - switch (ctx->state) - { - case XML_STATE_START: - TRACE(ctx, "entering prolog"); - ctx->flags |= XML_SRC_DOCUMENT | XML_SRC_EXPECTED_DECL; - if (ctx->h_document_start) - ctx->h_document_start(ctx); - /* XMLDecl */ - xml_refill(ctx); - if (ctx->h_xml_decl) - ctx->h_xml_decl(ctx); - PULL(XML_DECL); - - /* Misc* (doctypedecl Misc*)? 
*/ - while (1) - { - xml_parse_white(ctx, 0); - xml_parse_char(ctx, '<'); - xml_inc(ctx); - if ((c = xml_get_char(ctx)) == '?') - /* Processing intruction */ - if (!(ctx->flags & XML_REPORT_PIS)) - xml_skip_pi(ctx); - else - { - xml_push_pi(ctx); - PULL_STATE(PI, PROLOG_PI); - xml_pop_pi(ctx); - } - else if (c != '!') - { - /* Found the root tag */ - xml_unget_char(ctx); - goto first_tag; - } - else if (xml_get_char(ctx) == '-') - if (!(ctx->flags & XML_REPORT_COMMENTS)) - xml_skip_comment(ctx); - else - { - xml_push_comment(ctx); - PULL_STATE(COMMENT, PROLOG_COMMENT); - xml_pop_comment(ctx); - } - else - { - /* DocTypeDecl */ - xml_unget_char(ctx); - xml_parse_doctype_decl(ctx); - PULL(DOCTYPE_DECL); - if (ctx->flags & XML_HAS_DTD) - if (ctx->flags & XML_PARSE_DTD) - { - xml_dtd_init(ctx); - if (ctx->h_dtd_start) - ctx->h_dtd_start(ctx); - if (ctx->flags & XML_HAS_INTERNAL_SUBSET) - { - xml_parse_subset(ctx, 0); - xml_dec(ctx); - } - if (ctx->flags & XML_HAS_EXTERNAL_SUBSET) - { - struct xml_dtd_entity ent = { - .system_id = ctx->system_id, - .public_id = ctx->public_id, - }; - xml_parse_white(ctx, 0); - xml_parse_char(ctx, '>'); - xml_unget_char(ctx); - ASSERT(ctx->h_resolve_entity); - ctx->h_resolve_entity(ctx, &ent); - ctx->flags |= XML_SRC_EXPECTED_DECL; - xml_parse_subset(ctx, 1); - xml_unget_char(ctx);; - } - if (ctx->h_dtd_end) - ctx->h_dtd_end(ctx); - } - else if (ctx->flags & XML_HAS_INTERNAL_SUBSET) - xml_skip_internal_subset(ctx); - xml_parse_white(ctx, 0); - xml_parse_char(ctx, '>'); - xml_dec(ctx); - } - } - - case XML_STATE_CHARS: - - while (1) - { - if (xml_peek_char(ctx) != '<') - { - /* CharData */ - xml_append_chars(ctx); - continue; - } - else - xml_skip_char(ctx); - xml_inc(ctx); -first_tag: - - if ((c = xml_get_char(ctx)) == '?') - { - /* PI */ - if (!(ctx->flags & (XML_REPORT_PIS | XML_ALLOC_PIS))) - xml_skip_pi(ctx); - else - { - if (xml_flush_chars(ctx)) - { - PULL_STATE(CHARS, CHARS_BEFORE_PI); - xml_pop_chars(ctx); - } - xml_push_pi(ctx); 
- PULL(PI); - xml_pop_pi(ctx); - } - } - - else if (c == '!') - if ((c = xml_get_char(ctx)) == '-') - { - /* Comment */ - if (!(ctx->flags & (XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS))) - xml_skip_comment(ctx); - else - { - if (xml_flush_chars(ctx)) - { - PULL_STATE(CHARS, CHARS_BEFORE_COMMENT); - xml_pop_chars(ctx); - } - xml_push_comment(ctx); - PULL(COMMENT); - xml_pop_comment(ctx); - } - } - else if (c == '[') - { - /* CDATA */ - xml_append_cdata(ctx); - } - else - xml_fatal(ctx, "Unexpected character after 'flags & XML_EMPTY_ELEM_TAG) - goto pop_element; - } - - else - { - /* ETag */ - if (xml_flush_chars(ctx)) - { - PULL_STATE(CHARS, CHARS_BEFORE_ETAG); - xml_pop_chars(ctx); - } - - xml_parse_etag(ctx); -pop_element: - PULL(ETAG); - xml_pop_element(ctx); - if (!ctx->node) - goto epilog; - } - } - -epilog: - /* Misc* */ - TRACE(ctx, "entering epilog"); - while (1) - { - /* Epilog whitespace is the only place, where a valid document can reach EOF */ - if (setjmp(throw_buf)) - if (ctx->err_code == XML_ERR_EOF) - { - TRACE(ctx, "reached EOF"); - ctx->state = XML_STATE_EOF; - if (ctx->h_document_end) - ctx->h_document_end(ctx); - case XML_STATE_EOF: - ctx->err_code = 0; - ctx->err_msg = NULL; - return XML_STATE_EOF; - } - else - goto error; - xml_parse_white(ctx, 0); - if (setjmp(throw_buf)) - goto error; - - /* Misc */ - xml_parse_char(ctx, '<'); - xml_inc(ctx); - if ((c = xml_get_char(ctx)) == '?') - /* Processing instruction */ - if (!(ctx->flags & XML_REPORT_PIS)) - xml_skip_pi(ctx); - else - { - xml_push_pi(ctx); - PULL_STATE(PI, EPILOG_PI); - xml_pop_pi(ctx); - } - else if (c == '!') - { - xml_parse_char(ctx, '-'); - /* Comment */ - if (!(ctx->flags & XML_REPORT_COMMENTS)) - xml_skip_comment(ctx); - else - { - xml_push_comment(ctx); - PULL_STATE(COMMENT, EPILOG_COMMENT); - xml_pop_comment(ctx); - } - } - else - xml_fatal(ctx, "Syntax error in the epilog"); - } - - } - ASSERT(0); -} - -uns -xml_next_state(struct xml_context *ctx, uns pull) -{ - uns saved = 
ctx->pull; - ctx->pull = pull; - uns res = xml_next(ctx); - ctx->pull = saved; - return res; -} - -uns -xml_skip_element(struct xml_context *ctx) -{ - ASSERT(ctx->state == XML_STATE_STAG); - struct xml_node *node = ctx->node; - uns saved = ctx->pull, res; - ctx->pull = XML_PULL_ETAG; - while ((res = xml_next(ctx)) && ctx->node != node); - ctx->pull = saved; - return res; -} - -uns -xml_parse(struct xml_context *ctx) -{ - /* This cycle should run only once unless the user overrides the value of ctx->pull in a SAX handler */ - do - { - ctx->pull = 0; - } - while (xml_next(ctx)); - return ctx->err_code; -} - -char * -xml_merge_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool) -{ - ASSERT(node->type == XML_NODE_ELEM); - char *p = mp_start_noalign(pool, 1); - XML_NODE_FOR_EACH(son, node) - if (son->type == XML_NODE_CHARS) - { - p = mp_spread(pool, p, son->len + 1); - memcpy(p, son->text, son->len); - p += son->len; - } - *p++ = 0; - return mp_end(pool, p); -} - -static char * -xml_append_dom_chars(char *p, struct mempool *pool, struct xml_node *node) -{ - XML_NODE_FOR_EACH(son, node) - if (son->type == XML_NODE_CHARS) - { - p = mp_spread(pool, p, son->len + 1); - memcpy(p, son->text, son->len); - p += son->len; - } - else if (son->type == XML_NODE_ELEM) - p = xml_append_dom_chars(p, pool, son); - return p; -} - -char * -xml_merge_dom_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool) -{ - ASSERT(node->type == XML_NODE_ELEM); - char *p = mp_start_noalign(pool, 1); - p = xml_append_dom_chars(p, pool, node); - *p++ = 0; - return mp_end(pool, p); -} diff --git a/shxml/source.c b/shxml/source.c deleted file mode 100644 index d6d1f3b3..00000000 --- a/shxml/source.c +++ /dev/null @@ -1,486 +0,0 @@ -/* - * Sherlock Library -- A simple XML parser - * - * (c) 2007--2008 Pavel Charvat - * - * This software may be freely distributed and used according to the terms - * of the GNU Lesser General Public License. 
- */ - -#undef LOCAL_DEBUG - -#include -#include -#include -#include -#include -#include -#include -#include - -/*** Charecter categorization ***/ - -#include "obj/shxml/unicat.c" - -static void -xml_init_cats(struct xml_context *ctx) -{ - if (!(ctx->flags & XML_VERSION_1_1)) - { - ctx->cat_chars = XML_CHAR_VALID_1_0; - ctx->cat_unrestricted = XML_CHAR_VALID_1_0; - ctx->cat_new_line = XML_CHAR_NEW_LINE_1_0; - ctx->cat_name = XML_CHAR_NAME_1_0; - ctx->cat_sname = XML_CHAR_SNAME_1_0; - } - else - { - ctx->cat_chars = XML_CHAR_VALID_1_1; - ctx->cat_unrestricted = XML_CHAR_UNRESTRICTED_1_1; - ctx->cat_new_line = XML_CHAR_NEW_LINE_1_1; - ctx->cat_name = XML_CHAR_NAME_1_1; - ctx->cat_sname = XML_CHAR_SNAME_1_1; - } -} - -/*** Reading of document/external entities ***/ - -static void NONRET -xml_eof(struct xml_context *ctx) -{ - ctx->err_msg = "Unexpected EOF"; - ctx->err_code = XML_ERR_EOF; - xml_throw(ctx); -} - -void NONRET -xml_fatal_nested(struct xml_context *ctx) -{ - xml_fatal(ctx, "Entity is not nested correctly"); -} - -static inline void -xml_add_char(u32 **bstop, uns c) -{ - *(*bstop)++ = c; - *(*bstop)++ = xml_char_cat(c); -} - -struct xml_source * -xml_push_source(struct xml_context *ctx) -{ - xml_push(ctx); - struct xml_source *src = ctx->src; - if (src) - { - src->bptr = ctx->bptr; - src->bstop = ctx->bstop; - } - src = mp_alloc_zero(ctx->stack, sizeof(*src)); - src->next = ctx->src; - src->saved_depth = ctx->depth; - ctx->src = src; - ctx->flags &= ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_DOCUMENT); - ctx->bstop = ctx->bptr = src->buf; - ctx->depth = 0; - return src; -} - -struct xml_source * -xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb) -{ - struct xml_source *src = xml_push_source(ctx); - src->fb = fb; - return src; -} - -static void -xml_close_source(struct xml_source *src) -{ - bclose(src->fb); - if (src->wrapped_fb) - bclose(src->wrapped_fb); -} - -static void -xml_pop_source(struct xml_context *ctx) -{ - TRACE(ctx, 
"pop_source"); - if (unlikely(ctx->depth != 0)) - xml_fatal(ctx, "Unexpected end of entity"); - struct xml_source *src = ctx->src; - if (!src) - xml_fatal(ctx, "Undefined source"); - xml_close_source(src); - ctx->depth = src->saved_depth; - ctx->src = src = src->next; - if (src) - { - ctx->bptr = src->bptr; - ctx->bstop = src->bstop; - } - xml_pop(ctx); - if (unlikely(!src)) - xml_eof(ctx); -} - -void -xml_sources_cleanup(struct xml_context *ctx) -{ - struct xml_source *s; - while (s = ctx->src) - { - ctx->src = s->next; - xml_close_source(s); - } -} - -static void xml_refill_utf8(struct xml_context *ctx); - -void -xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent UNUSED) -{ - xml_error(ctx, "References to external entities are not supported"); -} - -void -xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent) -{ - TRACE(ctx, "xml_push_entity"); - struct xml_source *src; - if (ent->flags & XML_DTD_ENTITY_EXTERNAL) - { - ASSERT(ctx->h_resolve_entity); - ctx->h_resolve_entity(ctx, ent); - ctx->flags |= XML_SRC_EXPECTED_DECL; - src = ctx->src; - } - else - { - src = xml_push_source(ctx); - fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, strlen(ent->text), 0); - } - src->refill = xml_refill_utf8; - src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line; - src->refill_cat2 = ctx->cat_new_line; -} - -static uns -xml_error_restricted(struct xml_context *ctx, uns c) -{ - if (c == ~1U) - xml_error(ctx, "Corrupted encoding"); - else - xml_error(ctx, "Restricted char U+%04X", c); - return UNI_REPLACEMENT; -} - -void xml_parse_decl(struct xml_context *ctx); - -#define REFILL(ctx, func, params...) \ - struct xml_source *src = ctx->src; \ - struct fastbuf *fb = src->fb; \ - if (ctx->bptr == ctx->bstop) \ - ctx->bptr = ctx->bstop = src->buf; \ - uns c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \ - u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \ - *last_0xd = src->pending_0xd ? 
bstop : NULL; \ - do \ - { \ - c = func(fb, ##params); \ - uns t = xml_char_cat(c); \ - if (t & t1) \ - /* Typical branch */ \ - *bstop++ = c, *bstop++ = t; \ - else if (t & t2) \ - { \ - /* New line */ \ - /* XML 1.0: 0xA | 0xD | 0xD 0xA */ \ - /* XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ \ - if (c == 0xd) \ - last_0xd = bstop + 2; \ - else if (c != 0x2028 && last_0xd == bstop) \ - { \ - last_0xd = NULL; \ - continue; \ - } \ - xml_add_char(&bstop, 0xa), row++; \ - } \ - else if (c == '>') \ - { \ - /* Used only in XML/TextDecl to switch the encoding */ \ - *bstop++ = c, *bstop++ = t; \ - break; \ - } \ - else if (~c) \ - /* Restricted character */ \ - xml_add_char(&bstop, xml_error_restricted(ctx, c)); \ - else \ - { \ - /* EOF */ \ - ctx->flags |= XML_SRC_EOF; \ - break; \ - } \ - } \ - while (bstop < bend); \ - src->pending_0xd = (last_0xd == bstop); \ - ctx->bstop = bstop; \ - src->row = row; - -static void -xml_refill_utf8(struct xml_context *ctx) -{ - REFILL(ctx, bget_utf8_repl, ~1U); -} - -static void -xml_refill_utf16_le(struct xml_context *ctx) -{ - REFILL(ctx, bget_utf16_le_repl, ~1U); -} - -static void -xml_refill_utf16_be(struct xml_context *ctx) -{ - REFILL(ctx, bget_utf16_be_repl, ~1U); -} - -#undef REFILL - -void -xml_refill(struct xml_context *ctx) -{ - do - { - if (ctx->flags & XML_SRC_EOF) - xml_pop_source(ctx); - else if (ctx->flags & XML_SRC_EXPECTED_DECL) - xml_parse_decl(ctx); - else - { - ctx->src->refill(ctx); - TRACE(ctx, "refilled %u characters", (uns)((ctx->bstop - ctx->bptr) / 2)); - } - } - while (ctx->bptr == ctx->bstop); -} - -static uns -xml_source_row(struct xml_context *ctx, struct xml_source *src) -{ - uns row = src->row; - for (u32 *p = ctx->bstop; p != ctx->bptr; p -= 2) - if (p[-1] & src->refill_cat2) - row--; - return row + 1; -} - -uns -xml_row(struct xml_context *ctx) -{ - return ctx->src ? 
xml_source_row(ctx, ctx->src) : 0; -} - -/* Document/external entity header */ - -static char * -xml_parse_encoding_name(struct xml_context *ctx) -{ - /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" */ - char *p = mp_start_noalign(ctx->pool, 1); - uns q = xml_parse_quote(ctx); - if (unlikely(!(xml_get_cat(ctx) & XML_CHAR_ENC_SNAME))) - xml_fatal(ctx, "Invalid character in the encoding name"); - while (1) - { - p = mp_spread(ctx->pool, p, 2); - *p++ = xml_last_char(ctx); - if (xml_get_char(ctx) == q) - break; - if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME))) - xml_fatal(ctx, "Invalid character in the encoding name"); - } - *p++ = 0; - return mp_end(ctx->pool, p); -} - -static void -xml_init_charconv(struct xml_context *ctx, int cs) -{ - // XXX: with a direct access to libucw-charset tables could be faster - struct xml_source *src = ctx->src; - TRACE(ctx, "wrapping charset %s", charset_name(cs)); - src->wrapped_fb = src->fb; - src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8); -} - -void -xml_parse_decl(struct xml_context *ctx) -{ - TRACE(ctx, "xml_parse_decl"); - struct xml_source *src = ctx->src; - ctx->flags &= ~XML_SRC_EXPECTED_DECL; - uns doc = ctx->flags & XML_SRC_DOCUMENT; - - /* Setup valid Unicode ranges and force the reader to abort refill() after each '>', where we can switch encoding or XML version */ - if (doc) - xml_init_cats(ctx); - src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line & ~XML_CHAR_GT; - src->refill_cat2 = ctx->cat_new_line; - - /* Initialize the supplied charset (if any) or try to guess it */ - char *expected_encoding = src->expected_encoding; - src->refill = xml_refill_utf8; - int bom = bpeekc(src->fb); - if (bom < 0) - ctx->flags |= XML_SRC_EOF; - if (!src->fb_encoding) - { - if (bom == 0xfe) - src->refill = xml_refill_utf16_be; - else if (bom == 0xff) - src->refill = xml_refill_utf16_le; - } - else - { - int cs = find_charset_by_name(src->fb_encoding); - if 
(cs == CONV_CHARSET_UTF8) - {} - else if (cs >= 0) - { - xml_init_charconv(ctx, cs); - bom = 0; - } - else if (strcasecmp(src->fb_encoding, "UTF-16")) - { - src->refill = xml_refill_utf16_be; - if (bom == 0xff) - src->refill = xml_refill_utf16_le; - } - else if (strcasecmp(src->fb_encoding, "UTF-16BE")) - src->refill = xml_refill_utf16_be; - else if (strcasecmp(src->fb_encoding, "UTF-16LE")) - src->refill = xml_refill_utf16_le; - else - { - xml_error(ctx, "Unknown encoding '%s'", src->fb_encoding); - expected_encoding = NULL; - } - } - uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be; - if (utf16) - src->fb_encoding = (src->refill == xml_refill_utf16_be) ? "UTF-16BE" : "UTF-16LE"; - if (!expected_encoding) - expected_encoding = src->fb_encoding; - if (bom > 0 && xml_peek_char(ctx) == 0xfeff) - xml_skip_char(ctx); - else if (utf16) - xml_error(ctx, "Missing or corrupted BOM"); - TRACE(ctx, "Initial encoding=%s", src->fb_encoding ? : "?"); - - /* Look ahead for presence of XMLDecl or optional TextDecl */ - if (!(ctx->flags & XML_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf)) - xml_refill(ctx); - u32 *bptr = ctx->bptr; - uns have_decl = (12 <= ctx->bstop - ctx->bptr && (bptr[11] & XML_CHAR_WHITE) && - bptr[0] == '<' && bptr[2] == '?' 
&& (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L'); - if (!have_decl) - { - if (doc) - xml_fatal(ctx, "Missing or corrupted XML header"); - else if (expected_encoding && strcasecmp(src->expected_encoding, "UTF-8") && !utf16) - xml_error(ctx, "Missing or corrupted entity header"); - goto exit; - } - ctx->bptr = bptr + 12; - xml_parse_white(ctx, 0); - - /* Parse version string (mandatory in XMLDecl, optional in TextDecl) */ - if (xml_peek_char(ctx) == 'v') - { - xml_parse_seq(ctx, "version"); - xml_parse_eq(ctx); - char *version = xml_parse_pubid_literal(ctx, ctx->pool); - TRACE(ctx, "version=%s", version); - uns v = 0; - if (!strcmp(version, "1.1")) - v = XML_VERSION_1_1; - else if (strcmp(version, "1.0")) - { - xml_error(ctx, "Unknown XML version string '%s'", version); - version = "1.0"; - } - if (doc) - { - ctx->version_str = version; - ctx->flags |= v; - } - else if (v > (ctx->flags & XML_VERSION_1_1)) - xml_error(ctx, "XML 1.1 external entity included from XML 1.0 document"); - if (!xml_parse_white(ctx, !doc)) - goto end; - } - else if (doc) - { - xml_error(ctx, "Expected XML version"); - ctx->version_str = "1.0"; - } - - /* Parse encoding string (optional in XMLDecl, mandatory in TextDecl) */ - if (xml_peek_char(ctx) == 'e') - { - xml_parse_seq(ctx, "encoding"); - xml_parse_eq(ctx); - src->decl_encoding = xml_parse_encoding_name(ctx); - TRACE(ctx, "encoding=%s", src->decl_encoding); - if (!xml_parse_white(ctx, 0)) - goto end; - } - else if (!doc) - xml_error(ctx, "Expected XML encoding"); - - /* Parse whether the document is standalone (optional in XMLDecl) */ - if (doc && xml_peek_char(ctx) == 's') - { - xml_parse_seq(ctx, "standalone"); - xml_parse_eq(ctx); - uns c = xml_parse_quote(ctx); - if (ctx->standalone = (xml_peek_char(ctx) == 'y')) - xml_parse_seq(ctx, "yes"); - else - xml_parse_seq(ctx, "no"); - xml_parse_char(ctx, c); - TRACE(ctx, "standalone=%d", ctx->standalone); - xml_parse_white(ctx, 0); - } -end: - 
xml_parse_seq(ctx, "?>"); - - /* Switch to the final encoding */ - if (src->decl_encoding) - { - int cs = find_charset_by_name(src->decl_encoding); - if (cs < 0 && !expected_encoding) - xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding); - else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8) - { - xml_init_charconv(ctx, cs); - src->fb_encoding = src->decl_encoding; - } - else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 || - !(!strcasecmp(src->decl_encoding, "UTF-16") || - (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) || - (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE"))))) - xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding); - } - if (!src->fb_encoding) - src->fb_encoding = "UTF-8"; - TRACE(ctx, "Final encoding=%s", src->fb_encoding); - -exit: - /* Update valid Unicode ranges */ - if (doc) - xml_init_cats(ctx); - src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line; - src->refill_cat2 = ctx->cat_new_line; -} diff --git a/shxml/unicat.pl b/shxml/unicat.pl deleted file mode 100755 index b86106f2..00000000 --- a/shxml/unicat.pl +++ /dev/null @@ -1,165 +0,0 @@ -#!/usr/bin/perl -# -# UCW Library -- Character map for the XML parser -# -# (c) 2007 Pavel Charvat -# -# This software may be freely distributed and used according to the terms -# of the GNU Lesser General Public License. 
-# - -my @cat = (); -my @lcat = (); -my %ids = (); -my %cls = (); -for (my $i = 0; $i < 0x10000; $i++) { $cat[$i] = 0; } -for (my $i = 0; $i < 0x11; $i++) { $lcat[$i] = 0; } - -my @white = (0x9, 0xA, 0xD, 0x20); -my @base_char_1_0 = ( - [0x0041,0x005A], [0x0061,0x007A], [0x00C0,0x00D6], [0x00D8,0x00F6], [0x00F8,0x00FF], [0x0100,0x0131], - [0x0134,0x013E], [0x0141,0x0148], [0x014A,0x017E], [0x0180,0x01C3], [0x01CD,0x01F0], [0x01F4,0x01F5], - [0x01FA,0x0217], [0x0250,0x02A8], [0x02BB,0x02C1], 0x0386, [0x0388,0x038A], 0x038C, [0x038E,0x03A1], - [0x03A3,0x03CE], [0x03D0,0x03D6], 0x03DA, 0x03DC, 0x03DE, 0x03E0, [0x03E2,0x03F3], [0x0401,0x040C], - [0x040E,0x044F], [0x0451,0x045C], [0x045E,0x0481], [0x0490,0x04C4], [0x04C7,0x04C8], [0x04CB,0x04CC], - [0x04D0,0x04EB], [0x04EE,0x04F5], [0x04F8,0x04F9], [0x0531,0x0556], 0x0559, [0x0561,0x0586], [0x05D0,0x05EA], - [0x05F0,0x05F2], [0x0621,0x063A], [0x0641,0x064A], [0x0671,0x06B7], [0x06BA,0x06BE], [0x06C0,0x06CE], - [0x06D0,0x06D3], 0x06D5, [0x06E5,0x06E6], [0x0905,0x0939], 0x093D, [0x0958,0x0961], [0x0985,0x098C], - [0x098F,0x0990], [0x0993,0x09A8], [0x09AA,0x09B0], 0x09B2, [0x09B6,0x09B9], [0x09DC,0x09DD], [0x09DF,0x09E1], - [0x09F0,0x09F1], [0x0A05,0x0A0A], [0x0A0F,0x0A10], [0x0A13,0x0A28], [0x0A2A,0x0A30], [0x0A32,0x0A33], - [0x0A35,0x0A36], [0x0A38,0x0A39], [0x0A59,0x0A5C], 0x0A5E, [0x0A72,0x0A74], [0x0A85,0x0A8B], 0x0A8D, - [0x0A8F,0x0A91], [0x0A93,0x0AA8], [0x0AAA,0x0AB0], [0x0AB2,0x0AB3], [0x0AB5,0x0AB9], 0x0ABD, 0x0AE0, - [0x0B05,0x0B0C], [0x0B0F,0x0B10], [0x0B13,0x0B28], [0x0B2A,0x0B30], [0x0B32,0x0B33], [0x0B36,0x0B39], - 0x0B3D, [0x0B5C,0x0B5D], [0x0B5F,0x0B61], [0x0B85,0x0B8A], [0x0B8E,0x0B90], [0x0B92,0x0B95], [0x0B99,0x0B9A], - 0x0B9C, [0x0B9E,0x0B9F], [0x0BA3,0x0BA4], [0x0BA8,0x0BAA], [0x0BAE,0x0BB5], [0x0BB7,0x0BB9], [0x0C05,0x0C0C], - [0x0C0E,0x0C10], [0x0C12,0x0C28], [0x0C2A,0x0C33], [0x0C35,0x0C39], [0x0C60,0x0C61], [0x0C85,0x0C8C], - [0x0C8E,0x0C90], [0x0C92,0x0CA8], [0x0CAA,0x0CB3], [0x0CB5,0x0CB9], 
0x0CDE, [0x0CE0,0x0CE1], [0x0D05,0x0D0C], - [0x0D0E,0x0D10], [0x0D12,0x0D28], [0x0D2A,0x0D39], [0x0D60,0x0D61], [0x0E01,0x0E2E], 0x0E30, [0x0E32,0x0E33], - [0x0E40,0x0E45], [0x0E81,0x0E82], 0x0E84, [0x0E87,0x0E88], 0x0E8A, 0x0E8D, [0x0E94,0x0E97], [0x0E99,0x0E9F], - [0x0EA1,0x0EA3], 0x0EA5, 0x0EA7, [0x0EAA,0x0EAB], [0x0EAD,0x0EAE], 0x0EB0, [0x0EB2,0x0EB3], 0x0EBD, - [0x0EC0,0x0EC4], [0x0F40,0x0F47], [0x0F49,0x0F69], [0x10A0,0x10C5], [0x10D0,0x10F6], 0x1100, [0x1102,0x1103], - [0x1105,0x1107], 0x1109, [0x110B,0x110C], [0x110E,0x1112], 0x113C, 0x113E, 0x1140, 0x114C, 0x114E, 0x1150, - [0x1154,0x1155], 0x1159, [0x115F,0x1161], 0x1163, 0x1165, 0x1167, 0x1169, [0x116D,0x116E], [0x1172,0x1173], - 0x1175, 0x119E, 0x11A8, 0x11AB, [0x11AE,0x11AF], [0x11B7,0x11B8], 0x11BA, [0x11BC,0x11C2], 0x11EB, 0x11F0, - 0x11F9, [0x1E00,0x1E9B], [0x1EA0,0x1EF9], [0x1F00,0x1F15], [0x1F18,0x1F1D], [0x1F20,0x1F45], [0x1F48,0x1F4D], - [0x1F50,0x1F57], 0x1F59, 0x1F5B, 0x1F5D, [0x1F5F,0x1F7D], [0x1F80,0x1FB4], [0x1FB6,0x1FBC], 0x1FBE, - [0x1FC2,0x1FC4], [0x1FC6,0x1FCC], [0x1FD0,0x1FD3], [0x1FD6,0x1FDB], [0x1FE0,0x1FEC], [0x1FF2,0x1FF4], - [0x1FF6,0x1FFC], 0x2126, [0x212A,0x212B], 0x212E, [0x2180,0x2182], [0x3041,0x3094], [0x30A1,0x30FA], - [0x3105,0x312C], [0xAC00,0xD7A3]); -my @ideographic_1_0 = ([0x4E00,0x9FA5], 0x3007, [0x3021,0x3029]); -my @combining_char_1_0 = ( - [0x0300,0x0345], [0x0360,0x0361], [0x0483,0x0486], [0x0591,0x05A1], [0x05A3,0x05B9], [0x05BB,0x05BD], - 0x05BF, [0x05C1,0x05C2], 0x05C4, [0x064B,0x0652], 0x0670, [0x06D6,0x06DC], [0x06DD,0x06DF], [0x06E0,0x06E4], - [0x06E7,0x06E8], [0x06EA,0x06ED], [0x0901,0x0903], 0x093C, [0x093E,0x094C], 0x094D, [0x0951,0x0954], - [0x0962,0x0963], [0x0981,0x0983], 0x09BC, 0x09BE, 0x09BF, [0x09C0,0x09C4], [0x09C7,0x09C8], [0x09CB,0x09CD], - 0x09D7, [0x09E2,0x09E3], 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, [0x0A40,0x0A42], [0x0A47,0x0A48], [0x0A4B,0x0A4D], - [0x0A70,0x0A71], [0x0A81,0x0A83], 0x0ABC, [0x0ABE,0x0AC5], [0x0AC7,0x0AC9], [0x0ACB,0x0ACD], 
[0x0B01,0x0B03], - 0x0B3C, [0x0B3E,0x0B43], [0x0B47,0x0B48], [0x0B4B,0x0B4D], [0x0B56,0x0B57], [0x0B82,0x0B83], [0x0BBE,0x0BC2], - [0x0BC6,0x0BC8], [0x0BCA,0x0BCD], 0x0BD7, [0x0C01,0x0C03], [0x0C3E,0x0C44], [0x0C46,0x0C48], [0x0C4A,0x0C4D], - [0x0C55,0x0C56], [0x0C82,0x0C83], [0x0CBE,0x0CC4], [0x0CC6,0x0CC8], [0x0CCA,0x0CCD], [0x0CD5,0x0CD6], - [0x0D02,0x0D03], [0x0D3E,0x0D43], [0x0D46,0x0D48], [0x0D4A,0x0D4D], 0x0D57, 0x0E31, [0x0E34,0x0E3A], - [0x0E47,0x0E4E], 0x0EB1, [0x0EB4,0x0EB9], [0x0EBB,0x0EBC], [0x0EC8,0x0ECD], [0x0F18,0x0F19], 0x0F35, - 0x0F37, 0x0F39, 0x0F3E, 0x0F3F, [0x0F71,0x0F84], [0x0F86,0x0F8B], [0x0F90,0x0F95], 0x0F97, [0x0F99,0x0FAD], - [0x0FB1,0x0FB7], 0x0FB9, [0x20D0,0x20DC], 0x20E1, [0x302A,0x302F], 0x3099, 0x309A); -my @digit_1_0 = ( - [0x0030,0x0039], [0x0660,0x0669], [0x06F0,0x06F9], [0x0966,0x096F], [0x09E6,0x09EF], [0x0A66,0x0A6F], - [0x0AE6,0x0AEF], [0x0B66,0x0B6F], [0x0BE7,0x0BEF], [0x0C66,0x0C6F], [0x0CE6,0x0CEF], [0x0D66,0x0D6F], - [0x0E50,0x0E59], [0x0ED0,0x0ED9], [0x0F20,0x0F29]); -my @extender_1_0 = ( - 0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005, [0x3031,0x3035], [0x309D,0x309E], [0x30FC,0x30FE]); -my @sname_1_1 = ( - "[:A-Z_a-z]", [0xC0,0xD6], [0xD8,0xF6], [0xF8,0x2FF], [0x370,0x37D], [0x37F,0x1FFF], - [0x200C,0x200D], [0x2070,0x218F], [0x2C00,0x2FEF], [0x3001,0xD7FF], [0xF900,0xFDCF], [0xFDF0,0xFFFD], [0x10000,0xEFFFF]); - -set("WHITE", @white); -set("NEW_LINE_1_0", 0xA, 0xD); -set("NEW_LINE_1_1", 0xA, 0xD, 0x85, 0x2028); -set("DIGIT", "[0-9]"); -set("XDIGIT", "[0-9a-fA-F]"); -set("VALID_1_0", @white, [0x20,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); -set("VALID_1_1", [0x1,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); -set("UNRESTRICTED_1_1", @white, [0x20,0x7E], 0x85, [0xA0,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); -set("PUBID", 0x20, 0xD, 0xA, "[-a-zA-Z0-9'()+,./:=?:!*#@\$_%]"); -set("ENC_SNAME", "[a-zA-Z]"); -set("ENC_NAME", "[-a-zA-Z0-9._]"); -set("SNAME_1_0", "[_:]", @base_char_1_0, 
@ideographic_1_0); -set("NAME_1_0", "[-_:.]", @base_char_1_0, @ideographic_1_0, @combining_char_1_0, @digit_1_0, @extender_1_0); -set("SNAME_1_1", @sname_1_1); -set("NAME_1_1", @sname_1_1, "[-.0-9]", 0xB7, [0x0300,0x036F], [0x203F,0x2040]); -set("GT", "[>]"); - -($ARGV[0] eq "" || $ARGV[1] eq "") && die("Invalid usage"); -find_cls(); -open(H, ">", $ARGV[0]) or die("Cannot create $ARGV[0]"); -open(C, ">", $ARGV[1]) or die("Cannot create $ARGV[1]"); -gen_enum(); -gen_tabs(); -close(H); -close(C); - -sub set { - my $id = shift; - $ids{$id} = scalar keys(%ids) if !defined($ids{$id}); - my $mask = 1 << $ids{$id}; - foreach my $i (@_) { - if (ref($i) eq "ARRAY") { - my $j = $i->[0]; - for (; $j <= $i->[1] && $j < 0x10000; $j++) { $cat[$j] |= $mask; } - for (; $j <= $i->[1]; $j += 0x10000) { $lcat[$j >> 16] |= $mask; } - } - elsif ($i =~ /^\[/) { for (my $j=0; $j < 128; $j++) { if (chr($j) =~ /$i/) { $cat[$j] |= $mask; } } } - else { $cat[$i] |= $mask; } - } -} - -sub find_cls { - foreach (my $i=0; $i<@cat; $i++) { $cls{$cat[$i]} = scalar keys(%cls) if !defined($cls{$cat[$i]}); } - foreach (my $i=0; $i<@lcat; $i++) { $cls{$lcat[$i]} = scalar keys(%cls) if !defined($cls{$lcat[$i]}); } -} - -sub gen_enum { - print H "enum xml_char_type {\n"; - foreach my $id (sort keys %ids) { - my $mask = 0; - foreach my $i (keys %cls) { - $mask |= 1 << $cls{$i} if $cls{$i} && ($i & (1 << $ids{$id})); - } - printf H " XML_CHAR_%-20s = 0x%08x,\n", $id, $mask; - } - print H "};\n\n"; -} - -sub gen_tabs { - my @tab = (); - my %hash = (); - - print H "extern const byte xml_char_tab1[];\n"; - print H "extern const uns xml_char_tab2[];\n"; - print H "extern const byte xml_char_tab3[];\n"; - - print C "const uns xml_char_tab2[] = {\n "; - for (my $t=0; $t<256; $t++) { - my $i = $t * 256; - my @x = (); - for (my $j=0; $j<256; $j += 32) { - push @x, join(",", map($cls{$_}, @cat[$i+$j..$i+$j+31])); - } - my $sub = " " . 
join(",\n ", @x); - if (!defined($hash{$sub})) { - $hash{$sub} = 256 * scalar @tab; - push @tab, $sub; - } - printf C "0x%x", $hash{$sub}; - print C ((~$t & 15) ? "," : ($t < 255) ? ",\n " : "\n};\n\n"); - } - - print C "const byte xml_char_tab1[] = {\n"; - print C join(",\n\n", @tab); - print C "\n};\n\n"; - - my @l = (); - for (my $i=0; $i<0x11; $i++) { - push @l, sprintf("%d", $cls{$lcat[$i]}); - } - print C "const byte xml_char_tab3[] = {" . join(",", @l) . "};\n"; -} diff --git a/shxml/xml-test.c b/shxml/xml-test.c deleted file mode 100644 index f86547a0..00000000 --- a/shxml/xml-test.c +++ /dev/null @@ -1,365 +0,0 @@ -/* - * Sherlock Library -- A simple XML parser - * - * (c) 2007--2008 Pavel Charvat - * - * This software may be freely distributed and used according to the terms - * of the GNU Lesser General Public License. - */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -enum { - WANT_FIRST = 0x100, - WANT_HIDE_ERRORS, - WANT_IGNORE_COMMENTS, - WANT_IGNORE_PIS, - WANT_REPORT_BLOCKS, - WANT_REPORT_IGNORABLE, - WANT_FILE_ENTITIES, -}; - -static char *shortopts = "spdt" CF_SHORT_OPTS; -static struct option longopts[] = { - CF_LONG_OPTS - { "sax", 0, 0, 's' }, - { "pull", 0, 0, 'p' }, - { "dom", 0, 0, 't' }, - { "dtd", 0, 0, 'd' }, - { "hide-errors", 0, 0, WANT_HIDE_ERRORS }, - { "ignore-comments", 0, 0, WANT_IGNORE_COMMENTS }, - { "ignore-pis", 0, 0, WANT_IGNORE_PIS }, - { "report-blocks", 0, 0, WANT_REPORT_BLOCKS }, - { "report-ignorable", 0, 0, WANT_REPORT_IGNORABLE }, - { "file-entities", 0, 0, WANT_FILE_ENTITIES }, - { NULL, 0, 0, 0 } -}; - -static void NONRET -usage(void) -{ - fputs("\ -Usage: xml-test [options] < input.xml\n\ -\n\ -Options:\n" -CF_USAGE -"\ --p, --pull Test PULL interface\n\ --s, --sax Test SAX interface\n\ --t, --dom Test DOM interface\n\ --d, --dtd Enable parsing of DTD\n\ - --hide-errors Hide warnings and error messages\n\ - --ignore-comments Ignore comments\n\ - --ignore-pis Ignore processing 
instructions\n\ - --report-blocks Report blocks or characters and CDATA sections\n\ - --report-ignorable Report ignorable whitespace\n\ - --file-entities Resolve file external entities (not fully normative)\n\ -\n", stderr); - exit(1); -} - -static uns want_sax; -static uns want_pull; -static uns want_dom; -static uns want_parse_dtd; -static uns want_hide_errors; -static uns want_ignore_comments; -static uns want_ignore_pis; -static uns want_report_blocks; -static uns want_report_ignorable; -static uns want_file_entities; - -static struct fastbuf *out; - -static char * -node_type(struct xml_node *node) -{ - switch (node->type) - { - case XML_NODE_ELEM: return "element"; - case XML_NODE_COMMENT: return "comment"; - case XML_NODE_PI: return "pi"; - case XML_NODE_CHARS: return "chars"; - default: return "unknown"; - } -} - -static void -show_node(struct xml_node *node) -{ - switch (node->type) - { - case XML_NODE_ELEM: - bprintf(out, " <%s>", node->name); - XML_ATTR_FOR_EACH(a, node) - bprintf(out, " %s='%s'", a->name, a->val); - bputc(out, '\n'); - break; - case XML_NODE_COMMENT: - bprintf(out, " text='%s'\n", node->text); - break; - case XML_NODE_PI: - bprintf(out, " target=%s text='%s'\n", node->name, node->text); - break; - case XML_NODE_CHARS: - bprintf(out, " text='%s'\n", node->text); - break; - default: - bputc(out, '\n'); - } -} - -static void -show_tree(struct xml_node *node, uns level) -{ - if (!node) - return; - bputs(out, "DOM: "); - for (uns i = 0; i < level; i++) - bputs(out, " "); - bputs(out, node_type(node)); - show_node(node); - if (node->type == XML_NODE_ELEM) - XML_NODE_FOR_EACH(son, node) - show_tree(son, level + 1); -} - -static void -h_error(struct xml_context *ctx) -{ - bprintf(out, "SAX: %s at %u: %s\n", (ctx->err_code < XML_ERR_ERROR) ? 
"warn" : "error", xml_row(ctx), ctx->err_msg); -} - -static void -h_document_start(struct xml_context *ctx UNUSED) -{ - bputs(out, "SAX: document_start\n"); -} - -static void -h_document_end(struct xml_context *ctx UNUSED) -{ - bputs(out, "SAX: document_end\n"); -} - -static void -h_xml_decl(struct xml_context *ctx) -{ - bprintf(out, "SAX: xml_decl version=%s standalone=%d fb_encoding=%s\n", ctx->version_str, ctx->standalone, ctx->src->fb_encoding); -} - -static void -h_doctype_decl(struct xml_context *ctx) -{ - bprintf(out, "SAX: doctype_decl type=%s public='%s' system='%s' extsub=%d intsub=%d\n", - ctx->doctype, ctx->public_id ? : "", ctx->system_id ? : "", - !!(ctx->flags & XML_HAS_EXTERNAL_SUBSET), !!(ctx->flags & XML_HAS_INTERNAL_SUBSET)); -} - -static void -h_comment(struct xml_context *ctx) -{ - bputs(out, "SAX: comment"); - show_node(ctx->node); -} - -static void -h_pi(struct xml_context *ctx) -{ - bputs(out, "SAX: pi"); - show_node(ctx->node); -} - -static void -h_stag(struct xml_context *ctx) -{ - bputs(out, "SAX: stag"); - show_node(ctx->node); -} - -static void -h_etag(struct xml_context *ctx) -{ - bprintf(out, "SAX: etag \n", ctx->node->name); -} - -static void -h_chars(struct xml_context *ctx) -{ - bputs(out, "SAX: chars"); - show_node(ctx->node); -} - -static void -h_block(struct xml_context *ctx UNUSED, char *text, uns len UNUSED) -{ - bprintf(out, "SAX: block text='%s'\n", text); -} - -static void -h_cdata(struct xml_context *ctx UNUSED, char *text, uns len UNUSED) -{ - bprintf(out, "SAX: cdata text='%s'\n", text); -} - -static void -h_ignorable(struct xml_context *ctx UNUSED, char *text, uns len UNUSED) -{ - bprintf(out, "SAX: ignorable text='%s'\n", text); -} - -static void -h_dtd_start(struct xml_context *ctx UNUSED) -{ - bputs(out, "SAX: dtd_start\n"); -} - -static void -h_dtd_end(struct xml_context *ctx UNUSED) -{ - bputs(out, "SAX: dtd_end\n"); -} - -static void -h_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *e) -{ - 
xml_push_fastbuf(ctx, bopen(e->system_id, O_RDONLY, 4096)); -} - -int -main(int argc, char **argv) -{ - int opt; - cf_def_file = NULL; - log_init(argv[0]); - while ((opt = cf_getopt(argc, argv, shortopts, longopts, NULL)) >= 0) - switch (opt) - { - case 's': - want_sax++; - break; - case 'p': - want_pull++; - break; - case 't': - want_dom++; - break; - case 'd': - want_parse_dtd++; - break; - case WANT_HIDE_ERRORS: - want_hide_errors++; - break; - case WANT_IGNORE_COMMENTS: - want_ignore_comments++; - break; - case WANT_IGNORE_PIS: - want_ignore_pis++; - break; - case WANT_REPORT_BLOCKS: - want_report_blocks++; - break; - case WANT_REPORT_IGNORABLE: - want_report_ignorable++; - break; - case WANT_FILE_ENTITIES: - want_file_entities++; - break; - default: - usage(); - } - if (optind != argc) - usage(); - - out = bfdopen_shared(1, 4096); - struct xml_context ctx; - xml_init(&ctx); - if (!want_hide_errors) - ctx.h_warn = ctx.h_error = ctx.h_fatal = h_error; - if (want_sax) - { - ctx.h_document_start = h_document_start; - ctx.h_document_end = h_document_end; - ctx.h_xml_decl = h_xml_decl; - ctx.h_doctype_decl = h_doctype_decl; - ctx.h_comment = h_comment; - ctx.h_pi = h_pi; - ctx.h_stag = h_stag; - ctx.h_etag = h_etag; - ctx.h_chars = h_chars; - if (want_report_blocks) - { - ctx.h_block = h_block; - ctx.h_cdata = h_cdata; - } - if (want_report_ignorable) - ctx.h_ignorable = h_ignorable; - ctx.h_dtd_start = h_dtd_start; - ctx.h_dtd_end = h_dtd_end; - } - if (want_dom) - ctx.flags |= XML_ALLOC_ALL; - if (want_parse_dtd) - ctx.flags |= XML_PARSE_DTD; - if (want_ignore_comments) - ctx.flags &= ~(XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS); - if (want_ignore_pis) - ctx.flags &= ~(XML_REPORT_PIS | XML_ALLOC_PIS); - if (want_file_entities) - ctx.h_resolve_entity = h_resolve_entity; - xml_push_fastbuf(&ctx, bfdopen_shared(0, 4096)); - bputs(out, "PULL: start\n"); - if (want_pull) - { - ctx.pull = XML_PULL_CHARS | XML_PULL_STAG | XML_PULL_ETAG | XML_PULL_COMMENT | XML_PULL_PI; - 
uns state; - while (state = xml_next(&ctx)) - switch (state) - { - case XML_STATE_CHARS: - bputs(out, "PULL: chars"); - show_node(ctx.node); - break; - case XML_STATE_STAG: - bputs(out, "PULL: stag"); - show_node(ctx.node); - break; - case XML_STATE_ETAG: - bprintf(out, "PULL: etag \n", ctx.node->name); - break; - case XML_STATE_COMMENT: - bputs(out, "PULL: comment"); - show_node(ctx.node); - break; - case XML_STATE_PI: - bputs(out, "PULL: pi"); - show_node(ctx.node); - break; - default: - bputs(out, "PULL: unknown\n"); - break; - } - } - else - xml_parse(&ctx); - if (ctx.err_code) - bprintf(out, "PULL: fatal error at %u: %s\n", xml_row(&ctx), ctx.err_msg); - else - { - bputs(out, "PULL: eof\n"); - if (want_dom) - show_tree(ctx.dom, 0); - } - - xml_cleanup(&ctx); - bclose(out); - return 0; -} diff --git a/shxml/xml-test.t b/shxml/xml-test.t deleted file mode 100644 index 7a04fb1b..00000000 --- a/shxml/xml-test.t +++ /dev/null @@ -1,58 +0,0 @@ -# Tests for the XML parser -# (c) 2008 Pavel Charvat - -Run: ../obj/shxml/xml-test -In: - -Out: PULL: start - PULL: eof - -Run: ../obj/shxml/xml-test -s -In: - text1&amp;<text2 -Out: PULL: start - SAX: document_start - SAX: xml_decl version=1.0 standalone=0 fb_encoding=ISO-8859-1 - SAX: stag - SAX: stag a1='val1' a2='val2' - SAX: chars text='text1&<' - SAX: etag - SAX: chars text='text2' - SAX: etag - SAX: document_end - PULL: eof - -Run: ../obj/shxml/xml-test -sptd -In: - - "> - %pe1; - - - ]> - &e1;&e2; -Out: PULL: start - SAX: document_start - SAX: xml_decl version=1.0 standalone=0 fb_encoding=UTF-8 - SAX: doctype_decl type=root public='' system='' extsub=0 intsub=1 - SAX: dtd_start - SAX: dtd_end - SAX: stag - PULL: stag - SAX: chars text='text' - PULL: chars text='text' - SAX: stag - PULL: stag - SAX: chars text='' - PULL: chars text='' - PULL: etag - SAX: etag - PULL: etag - SAX: etag - SAX: document_end - PULL: eof - DOM: element - DOM: chars text='text' - DOM: element - DOM: chars text='' diff --git a/shxml/xml.h 
b/shxml/xml.h deleted file mode 100644 index f17b1d79..00000000 --- a/shxml/xml.h +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Sherlock Library -- A simple XML parser - * - * (c) 2007--2008 Pavel Charvat - * - * This software may be freely distributed and used according to the terms - * of the GNU Lesser General Public License. - */ - -#ifndef _SHERLOCK_XML_XML_H -#define _SHERLOCK_XML_XML_H - -#include -#include -#include -#include - -struct xml_context; -struct xml_dtd_entity; - -enum xml_error { - XML_ERR_OK = 0, - XML_ERR_WARN = 1000, /* Warning */ - XML_ERR_ERROR = 2000, /* Recoverable error */ - XML_ERR_FATAL = 3000, /* Unrecoverable error */ - XML_ERR_EOF, -}; - -enum xml_state { - XML_STATE_EOF, /* EOF or a fatal error */ - XML_STATE_START, /* Initial state */ - XML_STATE_XML_DECL, /* XML_PULL_XML_DECL */ - XML_STATE_DOCTYPE_DECL, /* XML_PULL_DOCTYPE_DECL */ - XML_STATE_CHARS, /* XML_PULL_CHARS */ - XML_STATE_STAG, /* XML_PULL_STAG */ - XML_STATE_ETAG, /* XML_PULL_ETAG */ - XML_STATE_COMMENT, /* XML_PULL_COMMENT */ - XML_STATE_PI, /* XML_PULL_PI */ - - /* Internal states */ - XML_STATE_CHARS_BEFORE_STAG, - XML_STATE_CHARS_BEFORE_ETAG, - XML_STATE_CHARS_BEFORE_CDATA, - XML_STATE_CHARS_BEFORE_COMMENT, - XML_STATE_CHARS_BEFORE_PI, - XML_STATE_PROLOG_COMMENT, - XML_STATE_PROLOG_PI, - XML_STATE_EPILOG_COMMENT, - XML_STATE_EPILOG_PI, -}; - -enum xml_pull { - XML_PULL_XML_DECL = 0x00000001, /* Stop after the XML declaration */ - XML_PULL_DOCTYPE_DECL = 0x00000002, /* Stop in the doctype declaration (before optional internal subset) */ - XML_PULL_CHARS = 0x00000004, - XML_PULL_STAG = 0x00000008, - XML_PULL_ETAG = 0x00000010, - XML_PULL_COMMENT = 0x00000020, - XML_PULL_PI = 0x00000040, - XML_PULL_ALL = 0xffffffff, -}; - -enum xml_flags { - /* Enable reporting of various events via SAX and/or PUSH interface */ - XML_REPORT_COMMENTS = 0x00000001, /* Report comments */ - XML_REPORT_PIS = 0x00000002, /* Report processing instructions */ - XML_REPORT_CHARS = 0x00000004, /* 
Report characters */ - XML_REPORT_TAGS = 0x00000008, /* Report element starts/ends */ - XML_REPORT_MISC = XML_REPORT_COMMENTS | XML_REPORT_PIS, - XML_REPORT_ALL = XML_REPORT_MISC | XML_REPORT_CHARS | XML_REPORT_TAGS, - - /* Enable construction of DOM for these types */ - XML_ALLOC_COMMENTS = 0x00000010, /* Create comment nodes */ - XML_ALLOC_PIS = 0x00000020, /* Create processing instruction nodes */ - XML_ALLOC_CHARS = 0x00000040, /* Create character nodes */ - XML_ALLOC_TAGS = 0x00000080, /* Create element nodes */ - XML_ALLOC_MISC = XML_ALLOC_COMMENTS | XML_ALLOC_PIS, - XML_ALLOC_ALL = XML_ALLOC_MISC | XML_ALLOC_CHARS | XML_ALLOC_TAGS, - - /* Other parameters */ - XML_VALIDATING = 0x00000100, /* Validate everything (not fully implemented!) */ - XML_PARSE_DTD = 0x00000200, /* Enable parsing of DTD */ - XML_NO_CHARS = 0x00000400, /* The current element must not contain character data (filled automaticaly if using DTD) */ - XML_ALLOC_DEFAULT_ATTRS = 0x00000800, /* Allocate default attribute values so they can be found by XML_ATTR_FOR_EACH */ - - /* Internals, do not change! 
*/ - XML_EMPTY_ELEM_TAG = 0x00010000, /* The current element match EmptyElemTag */ - XML_VERSION_1_1 = 0x00020000, /* XML version is 1.1, otherwise 1.0 */ - XML_HAS_EXTERNAL_SUBSET = 0x00040000, /* The document contains a reference to external DTD subset */ - XML_HAS_INTERNAL_SUBSET = 0x00080000, /* The document contains an internal subset */ - XML_HAS_DTD = XML_HAS_EXTERNAL_SUBSET | XML_HAS_INTERNAL_SUBSET, - XML_SRC_EOF = 0x00100000, /* EOF reached */ - XML_SRC_EXPECTED_DECL = 0x00200000, /* Just before optional or required XMLDecl/TextDecl */ - XML_SRC_DOCUMENT = 0x00400000, /* The document entity */ - XML_SRC_EXTERNAL = 0x00800000, /* An external entity */ -}; - -enum xml_node_type { - XML_NODE_ELEM, - XML_NODE_COMMENT, - XML_NODE_CHARS, - XML_NODE_PI, -}; - -#define XML_NODE_FOR_EACH(var, node) CLIST_FOR_EACH(struct xml_node *, var, (node)->sons) -#define XML_ATTR_FOR_EACH(var, node) SLIST_FOR_EACH(struct xml_attr *, var, (node)->attrs) - -struct xml_node { - cnode n; /* Node for list of parent's sons */ - uns type; /* XML_NODE_x */ - struct xml_node *parent; /* Parent node */ - char *name; /* Element name / PI target */ - clist sons; /* Children nodes */ - union { - struct { - char *text; /* PI text / Comment / CDATA */ - uns len; /* Text length in bytes */ - }; - struct { - struct xml_dtd_elem *dtd; /* Element DTD */ - slist attrs; /* Link list of element attributes */ - }; - }; - void *user; /* User-defined (initialized to NULL) */ -}; - -struct xml_attr { - snode n; /* Node for elem->attrs */ - struct xml_node *elem; /* Parent element */ - struct xml_dtd_attr *dtd; /* Attribute DTD */ - char *name; /* Attribute name */ - char *val; /* Attribute value */ - void *user; /* User-defined (initialized to NULL) */ -}; - -#define XML_BUF_SIZE 32 /* At least 8 -- hardcoded */ - -struct xml_source { - struct xml_source *next; /* Link list of pending fastbufs (xml_context.sources) */ - struct fastbuf *fb; /* Source fastbuf */ - struct fastbuf *wrapped_fb; /* Original 
wrapped fastbuf (needed for cleanup) */ - struct fastbuf wrap_fb; /* Fbmem wrapper */ - u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */ - u32 *bptr, *bstop; /* Current state of the buffer */ - uns row; /* File position */ - char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */ - char *fb_encoding; /* Encoding of the source fastbuf */ - char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */ - uns refill_cat1; /* Character categories, which should be directly passed to the buffer */ - uns refill_cat2; /* Character categories, which should be processed as newlines (possibly in some built-in - sequences) */ - void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */ - unsigned short *refill_in_to_x; /* Libucw-charset input table */ - uns saved_depth; /* Saved ctx->depth */ - uns pending_0xd; /* The last read character is 0xD */ -}; - -struct xml_context { - /* Error handling */ - char *err_msg; /* Last error message */ - enum xml_error err_code; /* Last error code */ - void *throw_buf; /* Where to jump on error */ - void (*h_warn)(struct xml_context *ctx); /* Warning callback */ - void (*h_error)(struct xml_context *ctx); /* Recoverable error callback */ - void (*h_fatal)(struct xml_context *ctx); /* Unrecoverable error callback */ - - /* Memory management */ - struct mempool *pool; /* DOM pool */ - struct mempool *stack; /* Stack pool (freed as soon as possible) */ - struct xml_stack *stack_list; /* See xml_push(), xml_pop() */ - uns flags; /* XML_FLAG_x (restored on xml_pop()) */ - uns depth; /* Nesting level (for checking of valid source nesting -> valid pushes/pops on memory pools) */ - struct fastbuf chars; /* Character data / attribute value */ - struct mempool_state chars_state; /* Mempool state before the current character block has started */ - char *chars_trivial; /* If not empty, it will be appended to chars */ 
- void *tab_attrs; /* Hash table of element attributes */ - - /* Input */ - struct xml_source *src; /* Current source */ - u32 *bptr, *bstop; /* Buffer with preprocessed characters (validated UCS-4 + category flags) */ - uns cat_chars; /* Unicode range of supported characters (cdata, attribute values, ...) */ - uns cat_unrestricted; /* Unrestricted characters (may appear in document/external entities) */ - uns cat_new_line; /* New line characters */ - uns cat_name; /* Characters that may appear in names */ - uns cat_sname; /* Characters that may begin a name */ - - /* SAX-like interface */ - void (*h_document_start)(struct xml_context *ctx); /* Called before entering prolog */ - void (*h_document_end)(struct xml_context *ctx); /* Called after leaving epilog */ - void (*h_xml_decl)(struct xml_context *ctx); /* Called after the XML declaration */ - void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration (before optional internal subset) */ - void (*h_comment)(struct xml_context *ctx); /* Called after a comment (only with XML_REPORT_COMMENTS) */ - void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction (only with XML_REPORT_PIS) */ - void (*h_stag)(struct xml_context *ctx); /* Called after STag or EmptyElemTag (only with XML_REPORT_TAGS) */ - void (*h_etag)(struct xml_context *ctx); /* Called before ETag or after EmptyElemTag (only with XML_REPORT_TAGS) */ - void (*h_chars)(struct xml_context *ctx); /* Called after some characters (only with XML_REPORT_CHARS) */ - void (*h_block)(struct xml_context *ctx, char *text, uns len); /* Called for each continuous block of characters not reported by h_cdata() (only with XML_REPORT_CHARS) */ - void (*h_cdata)(struct xml_context *ctx, char *text, uns len); /* Called for each CDATA section (only with XML_REPORT_CHARS) */ - void (*h_ignorable)(struct xml_context *ctx, char *text, uns len); /* Called for ignorable whitespace (content in tags without #PCDATA) */ - void 
(*h_dtd_start)(struct xml_context *ctx); /* Called just after the DTD structure is initialized */ - void (*h_dtd_end)(struct xml_context *ctx); /* Called after DTD subsets subsets */ - struct xml_dtd_entity *(*h_find_entity)(struct xml_context *ctx, char *name); /* Called when needed to resolve a general entity */ - void (*h_resolve_entity)(struct xml_context *ctx, struct xml_dtd_entity *ent); /* User should push source fastbuf for a parsed external entity (either general or parameter) */ - - /* DOM */ - struct xml_node *dom; /* DOM root */ - struct xml_node *node; /* Current DOM node */ - - char *version_str; - uns standalone; - char *doctype; /* The document type (or NULL if unknown) */ - char *system_id; /* DTD external id */ - char *public_id; /* DTD public id */ - struct xml_dtd *dtd; /* The DTD structure (or NULL) */ - uns state; /* Current state for the PULL interface (XML_STATE_x) */ - uns pull; /* Parameters for the PULL interface (XML_PULL_x) */ -}; - -/* Initialize XML context */ -void xml_init(struct xml_context *ctx); - -/* Clean up all internal structures */ -void xml_cleanup(struct xml_context *ctx); - -/* Reuse XML context, equivalent to xml_cleanup() and xml_init() */ -void xml_reset(struct xml_context *ctx); - -/* Add XML source (fastbuf will be automatically closed) */ -struct xml_source *xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb); - -/* Parse without the PUSH interface, return XML_ERR_x code (zero on success) */ -uns xml_parse(struct xml_context *ctx); - -/* Parse with the PUSH interface, return XML_STATE_x (zero on EOF or fatal error) */ -uns xml_next(struct xml_context *ctx); - -/* Equivalent to xml_next, but with temporarily changed ctx->pull value */ -uns xml_next_state(struct xml_context *ctx, uns pull); - -/* May be called on XML_STATE_STAG to skip it's content; can return XML_STATE_ETAG or XML_STATE_EOF on fatal error */ -uns xml_skip_element(struct xml_context *ctx); - -/* Returns the current row number in the document 
entity */ -uns xml_row(struct xml_context *ctx); - -/* Finds a given attribute value in a XML_NODE_ELEM node */ -struct xml_attr *xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name); - -/* Similar to xml_attr_find, but it deals also with default values */ -char *xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name); - -/* The default value of h_find_entity(), knows <, >, &, ' and " */ -struct xml_dtd_entity *xml_def_find_entity(struct xml_context *ctx, char *name); - -/* The default value of h_resolve_entity(), throws an error */ -void xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent); - -/* Remove leading/trailing spaces and replaces sequences of spaces to a single space character (non-CDATA attribute normalization) */ -uns xml_normalize_white(struct xml_context *ctx, char *value); - -/* Merge character contents of a given element to a single string (not recursive) */ -char *xml_merge_chars(struct xml_context *ctx, struct xml_node *node, struct mempool *pool); - -/* Merge character contents of a given subtree to a single string */ -char *xml_merge_dom_chars(struct xml_context *ctx, struct xml_node *node, struct mempool *pool); - -/* Public part of error handling */ -void xml_warn(struct xml_context *ctx, const char *format, ...); -void xml_error(struct xml_context *ctx, const char *format, ...); -void NONRET xml_fatal(struct xml_context *ctx, const char *format, ...); - -#endif diff --git a/xml/Makefile b/xml/Makefile new file mode 100644 index 00000000..4a1b2e6b --- /dev/null +++ b/xml/Makefile @@ -0,0 +1,62 @@ +# Makefile for the XML parser +# (c) 2007 Pavel Charvat + +DIRS+=xml +PROGS+=$(o)/xml/xml-test + +LIBXML_MODS=common source parse dtd +LIBXML_INCLUDES=xml.h dtd.h + +LIBXML_MOD_PATHS=$(addprefix $(o)/xml/,$(LIBXML_MODS)) + +$(o)/xml/libucw-xml.a: $(addsuffix .o,$(LIBXML_MOD_PATHS)) +$(o)/xml/libucw-xml-pic.a: $(addsuffix .oo,$(LIBXML_MOD_PATHS)) +$(o)/xml/libucw-xml.so: $(addsuffix 
.oo,$(LIBXML_MOD_PATHS)) +$(o)/xml/libucw-xml.so: SONAME_SUFFIX=.$(UCW_ABI_VERSION) +$(o)/xml/libucw-xml.pc: $(LIBCHARSET) + +ifdef CONFIG_STATIC_PIC +$(o)/xml/libucw-xml.pc: $(o)/xml/libucw-xml-pic.a +endif +ifdef CONFIG_INSTALL_API +$(o)/xml/libucw-xml.pc: $(o)/xml/libucw-xml.a $(o)/xml/libucw-xml-pic.a $(o)/xml/libucw-xml.so +endif + +$(o)/xml/common.o: $(o)/xml/unicat.h +$(o)/xml/common.oo: $(o)/xml/unicat.h +$(o)/xml/source.o: $(o)/xml/unicat.h +$(o)/xml/source.oo: $(o)/xml/unicat.h +$(o)/xml/dtd.o: $(o)/xml/unicat.h +$(o)/xml/dtd.oo: $(o)/xml/unicat.h +$(o)/xml/parse.o: $(o)/xml/unicat.h +$(o)/xml/parse.oo: $(o)/xml/unicat.h +$(o)/xml/unicat.h: $(s)/xml/unicat.pl + $(M)GEN $(addprefix $(o)/xml/unicat,.h .c) + $(Q)$< $(addprefix $(o)/xml/unicat,.h .c) + $(Q)touch $@ + +TESTS+=$(o)/xml/xml-test.test +$(o)/xml/xml-test: $(o)/xml/xml-test.o $(LIBXML) +$(o)/xml/xml-test.test: $(o)/xml/xml-test + +API_LIBS+=libucw-xml +API_INCLUDES+=$(o)/xml/.include-stamp +$(o)/xml/.include-stamp: $(addprefix $(s)/xml/,$(LIBXML_INCLUDES)) +$(o)/xml/.include-stamp: IDST=xml +run/lib/pkgconfig/libucw-xml.pc: $(o)/xml/libucw-xml.pc + +INSTALL_TARGETS+=install-libucw-xml install-libucw-xml-api + +install-libucw-xml: + install -d -m 755 $(DESTDIR)$(INSTALL_LIB_DIR) + install -m 644 run/lib/libucw-xml.so.$(UCW_ABI_VERSION) $(DESTDIR)$(INSTALL_LIB_DIR) + +install-libucw-xml-api: + install -d -m 755 $(DESTDIR)$(INSTALL_INCLUDE_DIR)/xml $(DESTDIR)$(INSTALL_LIB_DIR) $(DESTDIR)$(INSTALL_PKGCONFIG_DIR) + install -m 644 run/lib/pkgconfig/libucw-xml.pc $(DESTDIR)$(INSTALL_PKGCONFIG_DIR) + install -m 644 $(addprefix run/include/xml/,$(LIBXML_INCLUDES)) $(DESTDIR)$(INSTALL_INCLUDE_DIR)/xml + ln -sf libucw-xml.so.$(UCW_ABI_VERSION) $(DESTDIR)$(INSTALL_LIB_DIR)/libucw-xml.so + install -m 644 run/lib/libucw-xml.a $(DESTDIR)$(INSTALL_LIB_DIR) + install -m 644 run/lib/libucw-xml-pic.a $(DESTDIR)$(INSTALL_LIB_DIR) + +.PHONY: install-libucw-xml install-libucw-xml-api diff --git a/xml/TODO b/xml/TODO new 
file mode 100644 index 00000000..b8dbc29c --- /dev/null +++ b/xml/TODO @@ -0,0 +1,15 @@ +Non-normative / not-implemented: +-- introduce numeric error codes +-- cycle detection in internal entities (and possibly external?) +-- conditional sections in DTD +-- validation of elements (regular expressions, non-cdata) +-- validation of attributes (unfinished) +-- notations +-- URI normalization +-- support for xml:space +-- support for xml:lang +-- full support for standalone documents +-- Unicode normalization + +Optimizations: +-- detect definitions of trivial entities diff --git a/xml/common.c b/xml/common.c new file mode 100644 index 00000000..bd95b7ea --- /dev/null +++ b/xml/common.c @@ -0,0 +1,140 @@ +/* + * Sherlock Library -- A simple XML parser + * + * (c) 2007 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#undef LOCAL_DEBUG + +#include +#include +#include +#include +#include +#include + +#include + +/*** Error handling ***/ + +void NONRET +xml_throw(struct xml_context *ctx) +{ + ASSERT(ctx->err_code && ctx->throw_buf); + longjmp(*(jmp_buf *)ctx->throw_buf, ctx->err_code); +} + +void +xml_warn(struct xml_context *ctx, const char *format, ...) +{ + if (ctx->h_warn) + { + va_list args; + va_start(args, format); + ctx->err_msg = stk_vprintf(format, args); + ctx->err_code = XML_ERR_WARN; + va_end(args); + ctx->h_warn(ctx); + ctx->err_msg = NULL; + ctx->err_code = XML_ERR_OK; + } +} + +void +xml_error(struct xml_context *ctx, const char *format, ...) +{ + if (ctx->h_error) + { + va_list args; + va_start(args, format); + ctx->err_msg = stk_vprintf(format, args); + ctx->err_code = XML_ERR_ERROR; + va_end(args); + ctx->h_error(ctx); + ctx->err_msg = NULL; + ctx->err_code = XML_ERR_OK; + } +} + +void NONRET +xml_fatal(struct xml_context *ctx, const char *format, ...) 
+{ + va_list args; + va_start(args, format); + ctx->err_msg = mp_vprintf(ctx->stack, format, args); + ctx->err_code = XML_ERR_FATAL; + ctx->state = XML_STATE_EOF; + va_end(args); + if (ctx->h_fatal) + ctx->h_fatal(ctx); + xml_throw(ctx); +} + +/*** Memory management ***/ + +void * +xml_hash_new(struct mempool *pool, uns size) +{ + void *tab = mp_alloc_zero(pool, size + XML_HASH_HDR_SIZE); + *(void **)tab = pool; + return tab + XML_HASH_HDR_SIZE; +} + +/*** Initialization ***/ + +static struct xml_context xml_defaults = { + .flags = XML_SRC_EOF | XML_REPORT_ALL, + .state = XML_STATE_START, + .h_resolve_entity = xml_def_resolve_entity, + .chars = { + .name = "", + .spout = xml_spout_chars, + .can_overwrite_buffer = 1, + }, +}; + +static void +xml_do_init(struct xml_context *ctx) +{ + xml_attrs_table_init(ctx); +} + +void +xml_init(struct xml_context *ctx) +{ + *ctx = xml_defaults; + ctx->pool = mp_new(65536); + ctx->stack = mp_new(65536); + xml_do_init(ctx); + TRACE(ctx, "init"); +} + +void +xml_cleanup(struct xml_context *ctx) +{ + TRACE(ctx, "cleanup"); + xml_attrs_table_cleanup(ctx); + xml_dtd_cleanup(ctx); + xml_sources_cleanup(ctx); + mp_delete(ctx->pool); + mp_delete(ctx->stack); +} + +void +xml_reset(struct xml_context *ctx) +{ + TRACE(ctx, "reset"); + struct mempool *pool = ctx->pool, *stack = ctx->stack; + xml_attrs_table_cleanup(ctx); + xml_dtd_cleanup(ctx); + xml_sources_cleanup(ctx); + mp_flush(pool); + mp_flush(stack); + *ctx = xml_defaults; + ctx->pool = pool; + ctx->stack = stack; + xml_do_init(ctx); +} diff --git a/xml/dtd.c b/xml/dtd.c new file mode 100644 index 00000000..27bc9c8e --- /dev/null +++ b/xml/dtd.c @@ -0,0 +1,1003 @@ +/* + * Sherlock Library -- A simple XML parser + * + * (c) 2007--2008 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. 
+ */ + +#undef LOCAL_DEBUG + +#include +#include +#include +#include +#include +#include +#include + +/* Notations */ + +#define HASH_PREFIX(x) xml_dtd_notns_##x +#define HASH_NODE struct xml_dtd_notn +#define HASH_KEY_STRING name +#define HASH_ZERO_FILL +#define HASH_TABLE_DYNAMIC +#define HASH_WANT_LOOKUP +#define HASH_WANT_FIND +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include + +struct xml_dtd_notn * +xml_dtd_find_notn(struct xml_context *ctx, char *name) +{ + struct xml_dtd *dtd = ctx->dtd; + struct xml_dtd_notn *notn = xml_dtd_notns_find(dtd->tab_notns, name); + return !notn ? NULL : (notn->flags & XML_DTD_NOTN_DECLARED) ? notn : NULL; +} + +/* General entities */ + +#define HASH_PREFIX(x) xml_dtd_ents_##x +#define HASH_NODE struct xml_dtd_entity +#define HASH_KEY_STRING name +#define HASH_ZERO_FILL +#define HASH_TABLE_DYNAMIC +#define HASH_WANT_FIND +#define HASH_WANT_LOOKUP +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include + +static struct xml_dtd_entity * +xml_dtd_declare_trivial_entity(struct xml_context *ctx, char *name, char *text) +{ + struct xml_dtd *dtd = ctx->dtd; + struct xml_dtd_entity *ent = xml_dtd_ents_lookup(dtd->tab_ents, name); + if (ent->flags & XML_DTD_ENTITY_DECLARED) + { + xml_warn(ctx, "Entity &%s; already declared", name); + return NULL; + } + slist_add_tail(&dtd->ents, &ent->n); + ent->flags = XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_TRIVIAL; + ent->text = text; + return ent; +} + +static void +xml_dtd_declare_default_entities(struct xml_context *ctx) +{ + xml_dtd_declare_trivial_entity(ctx, "lt", "<"); + xml_dtd_declare_trivial_entity(ctx, "gt", ">"); + xml_dtd_declare_trivial_entity(ctx, "amp", "&"); + xml_dtd_declare_trivial_entity(ctx, "apos", "'"); + xml_dtd_declare_trivial_entity(ctx, "quot", "\""); +} + +struct xml_dtd_entity * +xml_def_find_entity(struct xml_context *ctx UNUSED, char *name) +{ +#define ENT(n, t) ent_##n = { .name = #n, .text = t, .flags = 
XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_TRIVIAL } + static struct xml_dtd_entity ENT(lt, "<"), ENT(gt, ">"), ENT(amp, "&"), ENT(apos, "'"), ENT(quot, "\""); +#undef ENT + switch (name[0]) + { + case 'l': + if (!strcmp(name, "lt")) + return &ent_lt; + break; + case 'g': + if (!strcmp(name, "gt")) + return &ent_gt; + break; + case 'a': + if (!strcmp(name, "amp")) + return &ent_amp; + if (!strcmp(name, "apos")) + return &ent_apos; + break; + case 'q': + if (!strcmp(name, "quot")) + return &ent_quot; + break; + } + return NULL; +} + +struct xml_dtd_entity * +xml_dtd_find_entity(struct xml_context *ctx, char *name) +{ + struct xml_dtd *dtd = ctx->dtd; + if (ctx->h_find_entity) + return ctx->h_find_entity(ctx, name); + else if (dtd) + { + struct xml_dtd_entity *ent = xml_dtd_ents_find(dtd->tab_ents, name); + return !ent ? NULL : (ent->flags & XML_DTD_ENTITY_DECLARED) ? ent : NULL; + } + else + return xml_def_find_entity(ctx, name); +} + +/* Parameter entities */ + +static struct xml_dtd_entity * +xml_dtd_find_pentity(struct xml_context *ctx, char *name) +{ + struct xml_dtd *dtd = ctx->dtd; + struct xml_dtd_entity *ent = xml_dtd_ents_find(dtd->tab_pents, name); + return !ent ? NULL : (ent->flags & XML_DTD_ENTITY_DECLARED) ? ent : NULL; +} + +/* Elements */ + +struct xml_dtd_elems_table; + +static void +xml_dtd_elems_init_data(struct xml_dtd_elems_table *tab UNUSED, struct xml_dtd_elem *e) +{ + slist_init(&e->attrs); +} + +#define HASH_PREFIX(x) xml_dtd_elems_##x +#define HASH_NODE struct xml_dtd_elem +#define HASH_KEY_STRING name +#define HASH_TABLE_DYNAMIC +#define HASH_ZERO_FILL +#define HASH_WANT_FIND +#define HASH_WANT_LOOKUP +#define HASH_GIVE_ALLOC +#define HASH_GIVE_INIT_DATA +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include + +struct xml_dtd_elem * +xml_dtd_find_elem(struct xml_context *ctx, char *name) +{ + return ctx->dtd ? 
xml_dtd_elems_find(ctx->dtd->tab_elems, name) : NULL; +} + +/* Element sons */ + +struct xml_dtd_enodes_table; + +static inline uns +xml_dtd_enodes_hash(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem) +{ + return hash_pointer(parent) ^ hash_pointer(elem); +} + +static inline int +xml_dtd_enodes_eq(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *parent1, struct xml_dtd_elem *elem1, struct xml_dtd_elem_node *parent2, struct xml_dtd_elem *elem2) +{ + return (parent1 == parent2) && (elem1 == elem2); +} + +static inline void +xml_dtd_enodes_init_key(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *node, struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem) +{ + node->parent = parent; + node->elem = elem; +} + +#define HASH_PREFIX(x) xml_dtd_enodes_##x +#define HASH_NODE struct xml_dtd_elem_node +#define HASH_KEY_COMPLEX(x) x parent, x elem +#define HASH_KEY_DECL struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem +#define HASH_GIVE_HASHFN +#define HASH_GIVE_EQ +#define HASH_GIVE_INIT_KEY +#define HASH_TABLE_DYNAMIC +#define HASH_ZERO_FILL +#define HASH_WANT_FIND +#define HASH_WANT_NEW +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include + +/* Element attributes */ + +struct xml_dtd_attrs_table; + +static inline uns +xml_dtd_attrs_hash(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem, char *name) +{ + return hash_pointer(elem) ^ hash_string(name); +} + +static inline int +xml_dtd_attrs_eq(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem1, char *name1, struct xml_dtd_elem *elem2, char *name2) +{ + return (elem1 == elem2) && !strcmp(name1, name2); +} + +static inline void +xml_dtd_attrs_init_key(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_elem *elem, char *name) +{ + attr->elem = elem; + attr->name = name; + slist_add_tail(&elem->attrs, &attr->n); +} + +#define 
HASH_PREFIX(x) xml_dtd_attrs_##x +#define HASH_NODE struct xml_dtd_attr +#define HASH_ZERO_FILL +#define HASH_TABLE_DYNAMIC +#define HASH_KEY_COMPLEX(x) x elem, x name +#define HASH_KEY_DECL struct xml_dtd_elem *elem, char *name +#define HASH_GIVE_HASHFN +#define HASH_GIVE_EQ +#define HASH_GIVE_INIT_KEY +#define HASH_WANT_FIND +#define HASH_WANT_NEW +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include + +struct xml_dtd_attr * +xml_dtd_find_attr(struct xml_context *ctx, struct xml_dtd_elem *elem, char *name) +{ + return ctx->dtd ? xml_dtd_attrs_find(ctx->dtd->tab_attrs, elem, name) : NULL; +} + +/* Enumerated attribute values */ + +struct xml_dtd_evals_table; + +static inline uns +xml_dtd_evals_hash(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr, char *val) +{ + return hash_pointer(attr) ^ hash_string(val); +} + +static inline int +xml_dtd_evals_eq(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr1, char *val1, struct xml_dtd_attr *attr2, char *val2) +{ + return (attr1 == attr2) && !strcmp(val1, val2); +} + +static inline void +xml_dtd_evals_init_key(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_eval *eval, struct xml_dtd_attr *attr, char *val) +{ + eval->attr = attr; + eval->val = val; +} + +#define HASH_PREFIX(x) xml_dtd_evals_##x +#define HASH_NODE struct xml_dtd_eval +#define HASH_TABLE_DYNAMIC +#define HASH_KEY_COMPLEX(x) x attr, x val +#define HASH_KEY_DECL struct xml_dtd_attr *attr, char *val +#define HASH_GIVE_HASHFN +#define HASH_GIVE_EQ +#define HASH_GIVE_INIT_KEY +#define HASH_WANT_FIND +#define HASH_WANT_NEW +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include + +/* Enumerated attribute notations */ + +struct xml_dtd_enotns_table; + +static inline uns +xml_dtd_enotns_hash(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn) +{ + return hash_pointer(attr) ^ hash_pointer(notn); +} + +static inline int 
+xml_dtd_enotns_eq(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr1, struct xml_dtd_notn *notn1, struct xml_dtd_attr *attr2, struct xml_dtd_notn *notn2) +{ + return (attr1 == attr2) && (notn1 == notn2); +} + +static inline void +xml_dtd_enotns_init_key(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_enotn *enotn, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn) +{ + enotn->attr = attr; + enotn->notn = notn; +} + +#define HASH_PREFIX(x) xml_dtd_enotns_##x +#define HASH_NODE struct xml_dtd_enotn +#define HASH_TABLE_DYNAMIC +#define HASH_KEY_COMPLEX(x) x attr, x notn +#define HASH_KEY_DECL struct xml_dtd_attr *attr, struct xml_dtd_notn *notn +#define HASH_GIVE_HASHFN +#define HASH_GIVE_EQ +#define HASH_GIVE_INIT_KEY +#define HASH_WANT_FIND +#define HASH_WANT_NEW +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include + +/* DTD initialization/cleanup */ + +void +xml_dtd_init(struct xml_context *ctx) +{ + if (ctx->dtd) + return; + struct mempool *pool = mp_new(4096); + struct xml_dtd *dtd = ctx->dtd = mp_alloc_zero(pool, sizeof(*ctx->dtd)); + dtd->pool = pool; + xml_dtd_ents_init(dtd->tab_ents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table))); + xml_dtd_ents_init(dtd->tab_pents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table))); + xml_dtd_notns_init(dtd->tab_notns = xml_hash_new(pool, sizeof(struct xml_dtd_notns_table))); + xml_dtd_elems_init(dtd->tab_elems = xml_hash_new(pool, sizeof(struct xml_dtd_elems_table))); + xml_dtd_enodes_init(dtd->tab_enodes = xml_hash_new(pool, sizeof(struct xml_dtd_enodes_table))); + xml_dtd_attrs_init(dtd->tab_attrs = xml_hash_new(pool, sizeof(struct xml_dtd_attrs_table))); + xml_dtd_evals_init(dtd->tab_evals = xml_hash_new(pool, sizeof(struct xml_dtd_evals_table))); + xml_dtd_enotns_init(dtd->tab_enotns = xml_hash_new(pool, sizeof(struct xml_dtd_enotns_table))); + xml_dtd_declare_default_entities(ctx); +} + +void +xml_dtd_cleanup(struct xml_context *ctx) +{ + if 
(!ctx->dtd) + return; + mp_delete(ctx->dtd->pool); + ctx->dtd = NULL; +} + +void +xml_dtd_finish(struct xml_context *ctx) +{ + if (!ctx->dtd) + return; + // FIXME: validity checks +} + +/*** Parsing functions ***/ + +/* References to parameter entities */ + +void +xml_parse_pe_ref(struct xml_context *ctx) +{ + /* PEReference ::= '%' Name ';' + * Already parsed: '%' */ + struct mempool_state state; + mp_save(ctx->stack, &state); + char *name = xml_parse_name(ctx, ctx->stack); + xml_parse_char(ctx, ';'); + struct xml_dtd_entity *ent = xml_dtd_find_pentity(ctx, name); + if (!ent) + xml_error(ctx, "Unknown entity %%%s;", name); + else + { + TRACE(ctx, "Pushed entity %%%s;", name); + mp_restore(ctx->stack, &state); + xml_dec(ctx); + xml_push_entity(ctx, ent); + return; + } + mp_restore(ctx->stack, &state); + xml_dec(ctx); +} + +static uns +xml_parse_dtd_pe(struct xml_context *ctx, uns entity_decl) +{ + /* Already parsed: '%'; returns ~0U when entity_decl==~0U and the '%' is followed by whitespace, otherwise parses the reference(s) and returns 1 */ + do + { + xml_inc(ctx); + if (!~entity_decl && (xml_peek_cat(ctx) & XML_CHAR_WHITE)) + { + xml_dec(ctx); + return ~0U; + } + xml_parse_pe_ref(ctx); + while (xml_peek_cat(ctx) & XML_CHAR_WHITE) + xml_skip_char(ctx); + } + while (xml_get_char(ctx) == '%'); + xml_unget_char(ctx); + return 1; +} + +static inline uns +xml_parse_dtd_white(struct xml_context *ctx, uns mandatory) +{ + /* Whitespace or parameter entity, + * mandatory==~0U has a special meaning of the whitespace before the '%' character in a parameter entity declaration */ + uns cnt = 0; + while (xml_peek_cat(ctx) & XML_CHAR_WHITE) + { + xml_skip_char(ctx); + cnt = 1; + } + if (xml_peek_char(ctx) == '%') + { + xml_skip_char(ctx); + return xml_parse_dtd_pe(ctx, mandatory); + } + else if (unlikely(mandatory && !cnt)) + xml_fatal_expected_white(ctx); + return cnt; +} + +static void +xml_dtd_parse_external_id(struct xml_context *ctx, char **system_id, char **public_id, uns allow_public) +{ + struct xml_dtd *dtd = ctx->dtd; + uns c = xml_peek_char(ctx); + if (c == 'S') + { +
xml_parse_seq(ctx, "SYSTEM"); + xml_parse_dtd_white(ctx, 1); + *public_id = NULL; + *system_id = xml_parse_system_literal(ctx, dtd->pool); + } + else if (c == 'P') + { + xml_parse_seq(ctx, "PUBLIC"); + xml_parse_dtd_white(ctx, 1); + *system_id = NULL; + *public_id = xml_parse_pubid_literal(ctx, dtd->pool); + if (xml_parse_dtd_white(ctx, !allow_public)) + if ((c = xml_peek_char(ctx)) == '\'' || c == '"' || !allow_public) + *system_id = xml_parse_system_literal(ctx, dtd->pool); + } + else + xml_fatal(ctx, "Expected an external ID"); +} + +/* DTD: <!NOTATION ...> -- parses one notation declaration and links it into dtd->notns; a redeclaration only produces a warning and the first definition is kept */ + +void +xml_parse_notation_decl(struct xml_context *ctx) +{ + /* NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>' + * Already parsed: '<!NOTATION' */ + struct xml_dtd *dtd = ctx->dtd; + xml_parse_dtd_white(ctx, 1); + + struct xml_dtd_notn *notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx, dtd->pool)); + xml_parse_dtd_white(ctx, 1); + char *system_id, *public_id; + xml_dtd_parse_external_id(ctx, &system_id, &public_id, 1); + xml_parse_dtd_white(ctx, 0); + xml_parse_char(ctx, '>'); + + if (notn->flags & XML_DTD_NOTN_DECLARED) + xml_warn(ctx, "Notation %s already declared", notn->name); + else + { + notn->flags = XML_DTD_NOTN_DECLARED; + notn->system_id = system_id; + notn->public_id = public_id; + slist_add_tail(&dtd->notns, &notn->n); + } + xml_dec(ctx); +} + +/* DTD: <!ENTITY ...> */ + +void +xml_parse_entity_decl(struct xml_context *ctx) +{ + /* Already parsed: '<!ENTITY' */ + struct xml_dtd *dtd = ctx->dtd; + uns flags = ~xml_parse_dtd_white(ctx, ~0U) ? 0 : XML_DTD_ENTITY_PARAMETER; + if (flags) + xml_parse_dtd_white(ctx, 1); + struct xml_dtd_entity *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_ents, xml_parse_name(ctx, dtd->pool)); + xml_parse_dtd_white(ctx, 1); + slist *list = flags ?
&dtd->pents : &dtd->ents; + if (ent->flags & XML_DTD_ENTITY_DECLARED) + { + xml_fatal(ctx, "Entity &%s; already declared, skipping not implemented", ent->name); + // FIXME: should be only warning + } + uns c, sep = xml_get_char(ctx); + if (sep == '\'' || sep == '"') + { + /* Internal entity: + * EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&'] | PEReference | Reference)* "'" */ + char *p = mp_start_noalign(dtd->pool, 1); + while (1) + { + if ((c = xml_get_char(ctx)) == sep) + break; + if (c == '%') + { + // FIXME + ASSERT(0); + //xml_parse_parameter_ref(ctx); + continue; + } + if (c == '&') + { + xml_inc(ctx); + if (xml_peek_char(ctx) != '#') + { + /* Bypass references to general entities: they are copied verbatim as "&name;" into the replacement text */ + struct mempool_state state; + mp_save(ctx->stack, &state); + char *n = xml_parse_name(ctx, ctx->stack); + xml_parse_char(ctx, ';'); + xml_dec(ctx); + uns l = strlen(n); + p = mp_spread(dtd->pool, p, 3 + l); + *p++ = '&'; + memcpy(p, n, l); + p += l; + *p++ = ';'; + mp_restore(ctx->stack, &state); + continue; + } + else + { + xml_skip_char(ctx); + c = xml_parse_char_ref(ctx); + } + } + p = mp_spread(dtd->pool, p, 5); + p = utf8_32_put(p, c); + } + *p = 0; + ent->len = p - (char *)mp_ptr(dtd->pool); + ent->text = mp_end(dtd->pool, p + 1); + slist_add_tail(list, &ent->n); + ent->flags = flags | XML_DTD_ENTITY_DECLARED; + } + else + { + /* External entity */ + struct xml_dtd_notn *notn = NULL; + char *system_id, *public_id; + xml_unget_char(ctx); + xml_dtd_parse_external_id(ctx, &system_id, &public_id, 0); + if (xml_parse_dtd_white(ctx, 0) && !flags && xml_peek_char(ctx) != '>') + { + /* General external unparsed entity; NDataDecl is permitted only for general entities, i.e. when flags carries no XML_DTD_ENTITY_PARAMETER */ + flags |= XML_DTD_ENTITY_UNPARSED; + xml_parse_seq(ctx, "NDATA"); + xml_parse_dtd_white(ctx, 1); + notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx, dtd->pool)); + } + slist_add_tail(list, &ent->n); + ent->flags = flags | XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_EXTERNAL; + ent->system_id = system_id; + ent->public_id =
public_id; + ent->notn = notn; + } + xml_parse_dtd_white(ctx, 0); + xml_parse_char(ctx, '>'); + xml_dec(ctx); +} + +/* DTD: <!ELEMENT ...> */ + +void +xml_parse_element_decl(struct xml_context *ctx) +{ + /* Elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' + * Already parsed: '<!ELEMENT' */ + struct xml_dtd *dtd = ctx->dtd; + xml_parse_dtd_white(ctx, 1); + char *name = xml_parse_name(ctx, dtd->pool); + xml_parse_dtd_white(ctx, 1); + struct xml_dtd_elem *elem = xml_dtd_elems_lookup(dtd->tab_elems, name); + if (elem->flags & XML_DTD_ELEM_DECLARED) + xml_fatal(ctx, "Element <%s> already declared", name); + + /* contentspec ::= 'EMPTY' | 'ANY' | Mixed | children */ + uns c = xml_peek_char(ctx); + if (c == 'E') + { + xml_parse_seq(ctx, "EMPTY"); + elem->type = XML_DTD_ELEM_EMPTY; + } + else if (c == 'A') + { + xml_parse_seq(ctx, "ANY"); + elem->type = XML_DTD_ELEM_ANY; + } + else if (c == '(') + { + xml_skip_char(ctx); + xml_inc(ctx); + xml_parse_dtd_white(ctx, 0); + struct xml_dtd_elem_node *parent = elem->node = mp_alloc_zero(dtd->pool, sizeof(*parent)); + if (xml_peek_char(ctx) == '#') + { + /* Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | '(' S? '#PCDATA' S?
')' */ + xml_skip_char(ctx); + xml_parse_seq(ctx, "PCDATA"); + elem->type = XML_DTD_ELEM_MIXED; + parent->type = XML_DTD_ELEM_PCDATA; + while (1) + { + xml_parse_dtd_white(ctx, 0); + if ((c = xml_get_char(ctx)) == ')') + break; + else if (c != '|') + xml_fatal_expected(ctx, ')'); + xml_parse_dtd_white(ctx, 0); + struct xml_dtd_elem *son_elem = xml_dtd_elems_lookup(dtd->tab_elems, xml_parse_name(ctx, dtd->pool)); + if (xml_dtd_enodes_find(dtd->tab_enodes, parent, son_elem)) + xml_error(ctx, "Duplicate content '%s'", son_elem->name); + else + { + struct xml_dtd_elem_node *son = xml_dtd_enodes_new(dtd->tab_enodes, parent, son_elem); + slist_add_tail(&parent->sons, &son->n); + } + } + xml_dec(ctx); + if (xml_peek_char(ctx) == '*') + { + xml_skip_char(ctx); + parent->occur = XML_DTD_ELEM_OCCUR_MULT; + } + else if (!slist_head(&parent->sons)) + parent->occur = XML_DTD_ELEM_OCCUR_ONCE; + else + xml_fatal_expected(ctx, '*'); + } + else + { + /* children ::= (choice | seq) ('?' | '*' | '+')? + * cp ::= (Name | choice | seq) ('?' | '*' | '+')? + * choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')' + * seq ::= '(' S? cp ( S? ',' S? cp )* S? 
')' */ + + elem->type = XML_DTD_ELEM_CHILDREN; + parent->type = XML_DTD_ELEM_PCDATA; + uns c; + goto first; + + while (1) + { + /* After name */ + xml_parse_dtd_white(ctx, 0); + if ((c = xml_get_char(ctx)) == ')') + { + xml_dec(ctx); + if (parent->type == XML_DTD_ELEM_PCDATA) + parent->type = XML_DTD_ELEM_SEQ; + if ((c = xml_get_char(ctx)) == '?') + parent->occur = XML_DTD_ELEM_OCCUR_OPT; + else if (c == '*') + parent->occur = XML_DTD_ELEM_OCCUR_MULT; + else if (c == '+') + parent->occur = XML_DTD_ELEM_OCCUR_PLUS; + else + { + xml_unget_char(ctx); + parent->occur = XML_DTD_ELEM_OCCUR_ONCE; + } + if (!parent->parent) + break; + parent = parent->parent; + continue; + } + else if (c == '|') + { + if (parent->type == XML_DTD_ELEM_PCDATA) + parent->type = XML_DTD_ELEM_OR; + else if (parent->type != XML_DTD_ELEM_OR) + xml_fatal(ctx, "Mixed operators in the list of element children"); + } + else if (c == ',') + { + if (parent->type == XML_DTD_ELEM_PCDATA) + parent->type = XML_DTD_ELEM_SEQ; + else if (parent->type != XML_DTD_ELEM_SEQ) + xml_fatal(ctx, "Mixed operators in the list of element children"); + } + else if (c == '(') + { + xml_inc(ctx); + struct xml_dtd_elem_node *son = mp_alloc_zero(dtd->pool, sizeof(*son)); + son->parent = parent; + slist_add_tail(&parent->sons, &son->n); + parent = son->parent; + son->type = XML_DTD_ELEM_MIXED; + } + else + xml_unget_char(ctx); + + /* Before name */ + xml_parse_dtd_white(ctx, 0); +first:; + struct xml_dtd_elem *son_elem = xml_dtd_elems_lookup(dtd->tab_elems, xml_parse_name(ctx, dtd->pool)); + // FIXME: duplicates, occurance + //struct xml_dtd_elem_node *son = xml_dtd_enodes_new(dtd->tab_enodes, parent, son_elem); + struct xml_dtd_elem_node *son = mp_alloc_zero(dtd->pool, sizeof(*son)); + son->parent = parent; + son->elem = son_elem; + slist_add_tail(&parent->sons, &son->n); + } + } + } + else + xml_fatal(ctx, "Expected element content specification"); + + xml_parse_dtd_white(ctx, 0); + xml_parse_char(ctx, '>'); + xml_dec(ctx); 
+} + +void +xml_parse_attr_list_decl(struct xml_context *ctx) +{ + /* AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>' + * AttDef ::= S Name S AttType S DefaultDecl + * Already parsed: '<!ATTLIST' */ + struct xml_dtd *dtd = ctx->dtd; + xml_parse_dtd_white(ctx, 1); + struct xml_dtd_elem *elem = xml_dtd_elems_lookup(ctx->dtd->tab_elems, xml_parse_name(ctx, dtd->pool)); + + while (xml_parse_dtd_white(ctx, 0) && xml_peek_char(ctx) != '>') + { + char *name = xml_parse_name(ctx, dtd->pool); + struct xml_dtd_attr *attr = xml_dtd_attrs_find(dtd->tab_attrs, elem, name); + uns ignored = 0; + if (attr) + { + xml_warn(ctx, "Duplicate attribute definition"); + ignored++; + } + else + attr = xml_dtd_attrs_new(ctx->dtd->tab_attrs, elem, name); + xml_parse_dtd_white(ctx, 1); + if (xml_peek_char(ctx) == '(') + { + xml_skip_char(ctx); // FIXME: xml_inc/dec ? + if (!ignored) + attr->type = XML_ATTR_ENUM; + do + { + xml_parse_dtd_white(ctx, 0); + char *value = xml_parse_nmtoken(ctx, dtd->pool); + if (!ignored) + if (xml_dtd_evals_find(ctx->dtd->tab_evals, attr, value)) + xml_error(ctx, "Duplicate enumeration value"); + else + xml_dtd_evals_new(ctx->dtd->tab_evals, attr, value); + xml_parse_dtd_white(ctx, 0); + } + while (xml_get_char(ctx) == '|'); + xml_unget_char(ctx); + xml_parse_char(ctx, ')'); + } + else + { + char *type = xml_parse_name(ctx, dtd->pool); + enum xml_dtd_attr_type t = XML_ATTR_CDATA; + if (!strcmp(type, "CDATA")) + t = XML_ATTR_CDATA; + else if (!strcmp(type, "ID")) + t = XML_ATTR_ID; + else if (!strcmp(type, "IDREF")) + t = XML_ATTR_IDREF; + else if (!strcmp(type, "IDREFS")) + t = XML_ATTR_IDREFS; + else if (!strcmp(type, "ENTITY")) + t = XML_ATTR_ENTITY; + else if (!strcmp(type, "ENTITIES")) + t = XML_ATTR_ENTITIES; + else if (!strcmp(type, "NMTOKEN")) + t = XML_ATTR_NMTOKEN; + else if (!strcmp(type, "NMTOKENS")) + t = XML_ATTR_NMTOKENS; + else if (!strcmp(type, "NOTATION")) + { + if (elem->type == XML_DTD_ELEM_EMPTY) + xml_fatal(ctx, "Empty element must not have notation attribute"); + // FIXME: An element type MUST NOT have more than
one NOTATION attribute specified. + t = XML_ATTR_NOTATION; + xml_parse_dtd_white(ctx, 1); + xml_parse_char(ctx, '('); + do + { + xml_parse_dtd_white(ctx, 0); + struct xml_dtd_notn *n = xml_dtd_notns_lookup(ctx->dtd->tab_notns, xml_parse_name(ctx, dtd->pool)); + if (!ignored) + if (xml_dtd_enotns_find(ctx->dtd->tab_enotns, attr, n)) + xml_error(ctx, "Duplicate enumerated notation"); + else + xml_dtd_enotns_new(ctx->dtd->tab_enotns, attr, n); + xml_parse_dtd_white(ctx, 0); + } + while (xml_get_char(ctx) == '|'); + xml_unget_char(ctx); + xml_parse_char(ctx, ')'); + } + else + xml_fatal(ctx, "Unknown attribute type"); + if (!ignored) + attr->type = t; + } + xml_parse_dtd_white(ctx, 1); + enum xml_dtd_attr_default def = XML_ATTR_NONE; + if (xml_get_char(ctx) == '#') + switch (xml_peek_char(ctx)) + { + case 'R': + xml_parse_seq(ctx, "REQUIRED"); + def = XML_ATTR_REQUIRED; + break; + case 'I': + xml_parse_seq(ctx, "IMPLIED"); + def = XML_ATTR_IMPLIED; + break; + case 'F': + xml_parse_seq(ctx, "FIXED"); + def = XML_ATTR_FIXED; + xml_parse_dtd_white(ctx, 1); + break; + default: + xml_fatal(ctx, "Expected a modifier for default attribute value"); + } + else + xml_unget_char(ctx); + if (def != XML_ATTR_REQUIRED && def != XML_ATTR_IMPLIED) + { + char *v = xml_parse_attr_value(ctx, attr); + if (!ignored) + attr->default_value = v; + } + if (!ignored) + attr->default_mode = def; + } + xml_skip_char(ctx); + xml_dec(ctx); +} + +void +xml_skip_internal_subset(struct xml_context *ctx) +{ + TRACE(ctx, "skip_internal_subset"); + /* AlreadyParsed: '[' */ + uns c; + while ((c = xml_get_char(ctx)) != ']') + { + if (c != '<') + continue; + if ((c = xml_get_char(ctx)) == '?') + { + xml_inc(ctx); + xml_skip_pi(ctx); + } + else if (c != '!') + xml_dec(ctx); + else if (xml_get_char(ctx) == '-') + { + xml_inc(ctx); + xml_skip_comment(ctx); + } + else + while ((c = xml_get_char(ctx)) != '>') + if (c == '\'' || c == '"') + while (xml_get_char(ctx) != c); + } + xml_dec(ctx); +} + +/*** Validation 
of attribute values ***/ + +static uns +xml_check_tokens(char *value, uns first_cat, uns next_cat, uns seq) +{ + char *p = value; + uns u; + while (1) + { + p = utf8_32_get(p, &u); + if (!(xml_char_cat(u) & first_cat)) + return 0; + while (*p & ~0x20) + { + p = utf8_32_get(p, &u); + if (!(xml_char_cat(u) & next_cat)) + return 0; + } + if (!*p) + return 1; + if (!seq) + return 0; + p++; + } +} + +static uns +xml_is_name(struct xml_context *ctx, char *value) +{ + /* Name ::= NameStartChar (NameChar)* */ + return xml_check_tokens(value, ctx->cat_sname, ctx->cat_name, 0); +} + +static uns +xml_is_names(struct xml_context *ctx, char *value) +{ + /* Names ::= Name (#x20 Name)* */ + return xml_check_tokens(value, ctx->cat_sname, ctx->cat_name, 1); +} + +static uns +xml_is_nmtoken(struct xml_context *ctx, char *value) +{ + /* Nmtoken ::= (NameChar)+ */ + return xml_check_tokens(value, ctx->cat_name, ctx->cat_name, 0); +} + +static uns +xml_is_nmtokens(struct xml_context *ctx, char *value) +{ + /* Nmtokens ::= Nmtoken (#x20 Nmtoken)* */ + return xml_check_tokens(value, ctx->cat_name, ctx->cat_name, 1); +} + +static void +xml_err_attr_format(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *type) +{ + xml_error(ctx, "Attribute %s in <%s> does not match the production of %s", dtd->name, dtd->elem->name, type); +} + +void +xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value) +{ + if (dtd->type == XML_ATTR_CDATA) + return; + xml_normalize_white(ctx, value); + switch (dtd->type) + { + case XML_ATTR_ID: + if (!xml_is_name(ctx, value)) + xml_err_attr_format(ctx, dtd, "NAME"); + //FIXME: add to a hash table + break; + case XML_ATTR_IDREF: + if (!xml_is_name(ctx, value)) + xml_err_attr_format(ctx, dtd, "NAME"); + // FIXME: find in hash table (beware forward references) + break; + case XML_ATTR_IDREFS: + if (!xml_is_names(ctx, value)) + xml_err_attr_format(ctx, dtd, "NAMES"); + // FIXME: find + break; + case XML_ATTR_ENTITY: + // FIXME + break; + 
case XML_ATTR_ENTITIES: + // FIXME + break; + case XML_ATTR_NMTOKEN: + if (!xml_is_nmtoken(ctx, value)) + xml_err_attr_format(ctx, dtd, "NMTOKEN"); + break; + case XML_ATTR_NMTOKENS: + if (!xml_is_nmtokens(ctx, value)) + xml_err_attr_format(ctx, dtd, "NMTOKENS"); + break; + case XML_ATTR_ENUM: + if (!xml_dtd_evals_find(ctx->dtd->tab_evals, dtd, value)) + xml_error(ctx, "Attribute %s in <%s> contains an undefined enumeration value", dtd->name, dtd->elem->name); + break; + case XML_ATTR_NOTATION: + if (!xml_dtd_find_notn(ctx, value)) + xml_error(ctx, "Attribute %s in <%s> contains an undefined notation", dtd->name, dtd->elem->name); + break; + } +} diff --git a/xml/dtd.h b/xml/dtd.h new file mode 100644 index 00000000..4546e097 --- /dev/null +++ b/xml/dtd.h @@ -0,0 +1,168 @@ +/* + * Sherlock Library -- A simple XML parser + * + * (c) 2007--2008 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#ifndef _SHERLOCK_XML_DTD_H +#define _SHERLOCK_XML_DTD_H + +#include + +struct xml_dtd { + struct mempool *pool; /* Memory pool where to allocate DTD */ + slist ents; /* Link list of general entities */ + slist pents; /* Link list of parameter entities */ + slist notns; /* Link list of notations */ + slist elems; /* Link list of elements */ + void *tab_ents; /* Hash table of general entities */ + void *tab_pents; /* Hash table of parameter entities */ + void *tab_notns; /* Hash table of notations */ + void *tab_elems; /* Hash table of elements */ + void *tab_enodes; /* Hash table of element sons */ + void *tab_attrs; /* Hash table of element attributes */ + void *tab_evals; /* Hash table of enumerated attribute values */ + void *tab_enotns; /* hash table of enumerated attribute notations */ +}; + +/* Notations */ + +enum xml_dtd_notn_flags { + XML_DTD_NOTN_DECLARED = 0x1, /* The notation has been declared (internal usage) */ +}; + +struct xml_dtd_notn { + snode n; /* Node in 
xml_dtd.notns */ + uns flags; /* XML_DTD_NOTN_x */ + char *name; /* Notation name */ + char *system_id; /* External ID */ + char *public_id; + void *user; /* User-defined */ +}; + +struct xml_dtd_notn *xml_dtd_find_notn(struct xml_context *ctx, char *name); + +/* Entities */ + +enum xml_dtd_entity_flags { + XML_DTD_ENTITY_DECLARED = 0x1, /* The entity has been declared (internal usage) */ + XML_DTD_ENTITY_VISITED = 0x2, /* Cycle detection (internal usage) */ + XML_DTD_ENTITY_PARAMETER = 0x4, /* Parameter entity, general otherwise */ + XML_DTD_ENTITY_EXTERNAL = 0x8, /* External entity, internal otherwise */ + XML_DTD_ENTITY_UNPARSED = 0x10, /* Unparsed entity, parsed otherwise */ + XML_DTD_ENTITY_TRIVIAL = 0x20, /* Replacement text is a sequence of characters and character references */ +}; + +struct xml_dtd_entity { + snode n; /* Node in xml_dtd.[gp]ents */ + uns flags; /* XML_DTD_ENT_x */ + char *name; /* Entity name */ + char *text; /* Replacement text / expanded replacement text (XML_DTD_ENT_TRIVIAL) */ + uns len; /* Text length */ + char *system_id; /* External ID */ + char *public_id; + struct xml_dtd_notn *notn; /* Notation (XML_DTD_ENT_UNPARSED only) */ + void *user; /* User-defined */ +}; + +struct xml_dtd_entity *xml_dtd_find_entity(struct xml_context *ctx, char *name); + +/* Elements */ + +enum xml_dtd_elem_flags { + XML_DTD_ELEM_DECLARED = 0x1, /* The element has been declared (internal usage) */ +}; + +enum xml_dtd_elem_type { + XML_DTD_ELEM_EMPTY, + XML_DTD_ELEM_ANY, + XML_DTD_ELEM_MIXED, + XML_DTD_ELEM_CHILDREN, +}; + +struct xml_dtd_elem { + snode n; + uns flags; + uns type; + char *name; + struct xml_dtd_elem_node *node; + slist attrs; + void *user; /* User-defined */ +}; + +struct xml_dtd_elem_node { + snode n; + struct xml_dtd_elem_node *parent; + struct xml_dtd_elem *elem; + slist sons; + uns type; + uns occur; + void *user; /* User-defined */ +}; + +enum xml_dtd_elem_node_type { + XML_DTD_ELEM_PCDATA, + XML_DTD_ELEM_SEQ, + XML_DTD_ELEM_OR, +}; + 
+enum xml_dtd_elem_node_occur { + XML_DTD_ELEM_OCCUR_ONCE, + XML_DTD_ELEM_OCCUR_OPT, + XML_DTD_ELEM_OCCUR_MULT, + XML_DTD_ELEM_OCCUR_PLUS, +}; + +struct xml_dtd_elem *xml_dtd_find_elem(struct xml_context *ctx, char *name); + +/* Attributes */ + +enum xml_dtd_attr_default { + XML_ATTR_NONE, + XML_ATTR_REQUIRED, + XML_ATTR_IMPLIED, + XML_ATTR_FIXED, +}; + +enum xml_dtd_attr_type { + XML_ATTR_CDATA, + XML_ATTR_ID, + XML_ATTR_IDREF, + XML_ATTR_IDREFS, + XML_ATTR_ENTITY, + XML_ATTR_ENTITIES, + XML_ATTR_NMTOKEN, + XML_ATTR_NMTOKENS, + XML_ATTR_ENUM, + XML_ATTR_NOTATION, +}; + +struct xml_dtd_attr { + snode n; + char *name; /* Attribute name */ + struct xml_dtd_elem *elem; /* Owner element */ + uns type; /* See enum xml_dtd_attr_type */ + uns default_mode; /* See enum xml_dtd_attr_default */ + char *default_value; /* The default value defined in DTD (or NULL) */ +}; + +struct xml_dtd_eval { + struct xml_dtd_attr *attr; + char *val; +}; + +struct xml_dtd_enotn { + struct xml_dtd_attr *attr; + struct xml_dtd_notn *notn; +}; + +void xml_dtd_init(struct xml_context *ctx); +void xml_dtd_cleanup(struct xml_context *ctx); +void xml_dtd_finish(struct xml_context *ctx); + +struct xml_dtd_attr *xml_dtd_find_attr(struct xml_context *ctx, struct xml_dtd_elem *elem, char *name); + +#endif diff --git a/xml/internals.h b/xml/internals.h new file mode 100644 index 00000000..a3ca04c6 --- /dev/null +++ b/xml/internals.h @@ -0,0 +1,311 @@ +/* + * Sherlock Library -- A simple XML parser + * + * (c) 2007--2008 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#ifndef _SHERLOCK_XML_INTERNALS_H +#define _SHERLOCK_XML_INTERNALS_H + +#include +#include + +/*** Debugging ***/ + +#ifdef LOCAL_DEBUG +#define TRACE(c, f, p...) do { DBG("XML %u: " f, xml_row(c), ##p); } while(0) +#else +#define TRACE(c, f, p...) 
do {} while(0) +#endif + +/*** Error handling ***/ + +void NONRET xml_throw(struct xml_context *ctx); + +/*** Memory management ***/ + +struct xml_stack { + struct xml_stack *next; + struct mempool_state state; + uns flags; +}; + +static inline void * +xml_do_push(struct xml_context *ctx, uns size) +{ + /* Saves ctx->stack and ctx->flags state */ + struct mempool_state state; + mp_save(ctx->stack, &state); + struct xml_stack *s = mp_alloc(ctx->stack, size); + s->state = state; + s->flags = ctx->flags; + s->next = ctx->stack_list; + ctx->stack_list = s; + return s; +} + +static inline void +xml_do_pop(struct xml_context *ctx, struct xml_stack *s) +{ + /* Restore ctx->stack and ctx->flags state */ + ctx->stack_list = s->next; + ctx->flags = s->flags; + mp_restore(ctx->stack, &s->state); +} + +static inline void +xml_push(struct xml_context *ctx) +{ + TRACE(ctx, "push"); + xml_do_push(ctx, sizeof(struct xml_stack)); +} + +static inline void +xml_pop(struct xml_context *ctx) +{ + TRACE(ctx, "pop"); + ASSERT(ctx->stack_list); + xml_do_pop(ctx, ctx->stack_list); +} + +struct xml_dom_stack { + struct xml_stack stack; + struct mempool_state state; +}; + +static inline struct xml_node * +xml_push_dom(struct xml_context *ctx, struct mempool_state *state) +{ + /* Create a new DOM node */ + TRACE(ctx, "push_dom"); + struct xml_dom_stack *s = xml_do_push(ctx, sizeof(*s)); + if (state) + s->state = *state; + else + mp_save(ctx->pool, &s->state); + struct xml_node *n = mp_alloc(ctx->pool, sizeof(*n)); + n->user = NULL; + if (n->parent = ctx->node) + clist_add_tail(&n->parent->sons, &n->n); + return ctx->node = n; +} + +static inline void +xml_pop_dom(struct xml_context *ctx, uns free) +{ + /* Leave DOM subtree */ + TRACE(ctx, "pop_dom"); + ASSERT(ctx->node); + struct xml_node *p = ctx->node->parent; + struct xml_dom_stack *s = (void *)ctx->stack_list; + if (free) + { + /* See xml_pop_element() for cleanup of attribute hash table */ + if (p) + clist_remove(&ctx->node->n); + 
mp_restore(ctx->pool, &s->state); + } + ctx->node = p; + xml_do_pop(ctx, &s->stack); +} + +#define XML_HASH_HDR_SIZE ALIGN_TO(sizeof(void *), CPU_STRUCT_ALIGN) +#define XML_HASH_GIVE_ALLOC struct HASH_PREFIX(table); \ + static inline void *HASH_PREFIX(alloc)(struct HASH_PREFIX(table) *t, uns size) \ + { return mp_alloc(*(void **)((void *)t - XML_HASH_HDR_SIZE), size); } \ + static inline void HASH_PREFIX(free)(struct HASH_PREFIX(table) *t UNUSED, void *p UNUSED) {} + +void *xml_hash_new(struct mempool *pool, uns size); + +void xml_spout_chars(struct fastbuf *fb); + +/*** Reading of document/external entities ***/ + +void NONRET xml_fatal_nested(struct xml_context *ctx); + +static inline void +xml_inc(struct xml_context *ctx) +{ + /* Called after the first character of a block */ + TRACE(ctx, "inc"); + ctx->depth++; +} + +static inline void +xml_dec(struct xml_context *ctx) +{ + /* Called after the last character of a block */ + TRACE(ctx, "dec"); + if (unlikely(!ctx->depth--)) + xml_fatal_nested(ctx); +} + +#include "obj/xml/unicat.h" + +static inline uns +xml_char_cat(uns c) +{ + if (c < 0x10000) + return 1U << xml_char_tab1[(c & 0xff) + xml_char_tab2[c >> 8]]; + else if (likely(c < 0x110000)) + return 1U << xml_char_tab3[c >> 16]; + else + return 1; +} + +static inline uns +xml_ascii_cat(uns c) +{ + return xml_char_tab1[c]; +} + +struct xml_source *xml_push_source(struct xml_context *ctx); +void xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent); + +void xml_refill(struct xml_context *ctx); + +static inline uns +xml_peek_char(struct xml_context *ctx) +{ + if (ctx->bptr == ctx->bstop) + xml_refill(ctx); + return ctx->bptr[0]; +} + +static inline uns +xml_peek_cat(struct xml_context *ctx) +{ + if (ctx->bptr == ctx->bstop) + xml_refill(ctx); + return ctx->bptr[1]; +} + +static inline uns +xml_get_char(struct xml_context *ctx) +{ + uns c = xml_peek_char(ctx); + ctx->bptr += 2; + return c; +} + +static inline uns +xml_get_cat(struct xml_context *ctx) 
+{ + uns c = xml_peek_cat(ctx); + ctx->bptr += 2; + return c; +} + +static inline uns +xml_last_char(struct xml_context *ctx) +{ + return ctx->bptr[-2]; +} + +static inline uns +xml_last_cat(struct xml_context *ctx) +{ + return ctx->bptr[-1]; +} + +static inline uns +xml_skip_char(struct xml_context *ctx) +{ + uns c = ctx->bptr[0]; + ctx->bptr += 2; + return c; +} + +static inline uns +xml_unget_char(struct xml_context *ctx) +{ + return *(ctx->bptr -= 2); +} + +void xml_sources_cleanup(struct xml_context *ctx); + +/*** Parsing ***/ + +void NONRET xml_fatal_expected(struct xml_context *ctx, uns c); +void NONRET xml_fatal_expected_white(struct xml_context *ctx); +void NONRET xml_fatal_expected_quot(struct xml_context *ctx); + +static inline uns +xml_parse_white(struct xml_context *ctx, uns mandatory) +{ + /* mandatory=1 -> S ::= (#x20 | #x9 | #xD | #xA)+ + * mandatory=0 -> S? */ + uns cnt = 0; + while (xml_peek_cat(ctx) & XML_CHAR_WHITE) + { + xml_skip_char(ctx); + cnt++; + } + if (unlikely(mandatory && !cnt)) + xml_fatal_expected_white(ctx); + return cnt; +} + +static inline void +xml_parse_char(struct xml_context *ctx, uns c) +{ + /* Consumes a given Unicode character */ + if (unlikely(c != xml_get_char(ctx))) + xml_fatal_expected(ctx, c); +} + +static inline void +xml_parse_seq(struct xml_context *ctx, const char *seq) +{ + /* Consumes a given sequence of ASCII characters */ + while (*seq) + xml_parse_char(ctx, *seq++); +} + +void xml_parse_eq(struct xml_context *ctx); + +static inline uns +xml_parse_quote(struct xml_context *ctx) +{ + /* "'" | '"' */ + uns c = xml_get_char(ctx); + if (unlikely(c != '\'' && c != '\"')) + xml_fatal_expected_quot(ctx); + return c; +} + +char *xml_parse_name(struct xml_context *ctx, struct mempool *pool); +void xml_skip_name(struct xml_context *ctx); +char *xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool); + +char *xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool); +char 
*xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool); + +uns xml_parse_char_ref(struct xml_context *ctx); +void xml_parse_pe_ref(struct xml_context *ctx); + +char *xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr); + +void xml_skip_internal_subset(struct xml_context *ctx); +void xml_parse_notation_decl(struct xml_context *ctx); +void xml_parse_entity_decl(struct xml_context *ctx); +void xml_parse_element_decl(struct xml_context *ctx); +void xml_parse_attr_list_decl(struct xml_context *ctx); + +void xml_push_comment(struct xml_context *ctx); +void xml_pop_comment(struct xml_context *ctx); +void xml_skip_comment(struct xml_context *ctx); + +void xml_push_pi(struct xml_context *ctx); +void xml_pop_pi(struct xml_context *ctx); +void xml_skip_pi(struct xml_context *ctx); + +void xml_attrs_table_init(struct xml_context *ctx); +void xml_attrs_table_cleanup(struct xml_context *ctx); + +void xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value); + +#endif diff --git a/xml/libucw-xml.pc b/xml/libucw-xml.pc new file mode 100644 index 00000000..2115af95 --- /dev/null +++ b/xml/libucw-xml.pc @@ -0,0 +1,14 @@ +# pkg-config metadata for libucw-xml + +libdir=@LIBDIR@ +incdir=. + +# Override if you want to use the -pic version +picsuffix= + +Name: libucw-xml +Description: XML parser for LibUCW project +Version: @UCW_ABI_VERSION@ +Cflags: -I${incdir} +Libs: -L${libdir} -lucw-xml${picsuffix} +Requires: @DEPS@ diff --git a/xml/parse.c b/xml/parse.c new file mode 100644 index 00000000..1d7fe6fb --- /dev/null +++ b/xml/parse.c @@ -0,0 +1,1287 @@ +/* + * Sherlock Library -- A simple XML parser + * + * (c) 2007--2008 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. 
+ */ + +#undef LOCAL_DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/*** Basic parsing ***/ + +void NONRET +xml_fatal_expected(struct xml_context *ctx, uns c) +{ + if (c >= 32 && c < 128) + xml_fatal(ctx, "Expected '%c'", c); + else + xml_fatal(ctx, "Expected U+%04x", c); +} + +void NONRET +xml_fatal_expected_white(struct xml_context *ctx) +{ + xml_fatal(ctx, "Expected a white space"); +} + +void NONRET +xml_fatal_expected_quot(struct xml_context *ctx) +{ + xml_fatal(ctx, "Expected a quotation mark"); +} + +void +xml_parse_eq(struct xml_context *ctx) +{ + /* Eq ::= S? '=' S? */ + xml_parse_white(ctx, 0); + xml_parse_char(ctx, '='); + xml_parse_white(ctx, 0); +} + +/*** Names and nmtokens ***/ + +static char * +xml_parse_string(struct xml_context *ctx, struct mempool *pool, uns first_cat, uns next_cat, char *err) +{ + char *p = mp_start_noalign(pool, 1); + if (unlikely(!(xml_peek_cat(ctx) & first_cat))) + xml_fatal(ctx, "%s", err); + do + { + p = mp_spread(pool, p, 5); + p = utf8_32_put(p, xml_skip_char(ctx)); + } + while (xml_peek_cat(ctx) & next_cat); + *p++ = 0; + return mp_end(pool, p); +} + +static void +xml_skip_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err) +{ + if (unlikely(!(xml_get_cat(ctx) & first_cat))) + xml_fatal(ctx, "%s", err); + while (xml_peek_cat(ctx) & next_cat) + xml_skip_char(ctx); +} + +char * +xml_parse_name(struct xml_context *ctx, struct mempool *pool) +{ + /* Name ::= NameStartChar (NameChar)* */ + return xml_parse_string(ctx, pool, ctx->cat_sname, ctx->cat_name, "Expected a name"); +} + +void +xml_skip_name(struct xml_context *ctx) +{ + xml_skip_string(ctx, ctx->cat_sname, ctx->cat_name, "Expected a name"); +} + +char * +xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool) +{ + /* Nmtoken ::= (NameChar)+ */ + return xml_parse_string(ctx, pool, ctx->cat_name, ctx->cat_name, "Expected a nmtoken"); +} + +/*** Simple literals ***/ + +char * 
+xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool) +{ + /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */ + char *p = mp_start_noalign(pool, 1); + uns q = xml_parse_quote(ctx), c; + while ((c = xml_get_char(ctx)) != q) + { + p = mp_spread(pool, p, 5); + p = utf8_32_put(p, c); + } + *p++ = 0; + return mp_end(pool, p); +} + +char * +xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool) +{ + /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */ + char *p = mp_start_noalign(pool, 1); + uns q = xml_parse_quote(ctx), c; + while ((c = xml_get_char(ctx)) != q) + { + if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID))) + xml_fatal(ctx, "Expected a pubid character"); + p = mp_spread(pool, p, 2); + *p++ = c; + } + *p++ = 0; + return mp_end(pool, p); +} + +/*** Comments ***/ + +void +xml_push_comment(struct xml_context *ctx) +{ + TRACE(ctx, "push_comment"); + /* Comment ::= '' + * Already parsed: 'type = XML_NODE_COMMENT; + char *p = mp_start_noalign(ctx->pool, 6); + while (1) + { + if (xml_get_char(ctx) == '-') + if (xml_get_char(ctx) == '-') + break; + else + *p++ = '-'; + p = utf8_32_put(p, xml_last_char(ctx)); + p = mp_spread(ctx->pool, p, 6); + } + xml_parse_char(ctx, '>'); + *p = 0; + n->len = p - (char *)mp_ptr(ctx->pool); + n->text = mp_end(ctx->pool, p + 1); + if ((ctx->flags & XML_REPORT_COMMENTS) && ctx->h_comment) + ctx->h_comment(ctx); +} + +void +xml_pop_comment(struct xml_context *ctx) +{ + xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_COMMENTS)); + xml_dec(ctx); + TRACE(ctx, "pop_comment"); +} + +void +xml_skip_comment(struct xml_context *ctx) +{ + TRACE(ctx, "skip_comment"); + xml_parse_char(ctx, '-'); + while (xml_get_char(ctx) != '-' || xml_get_char(ctx) != '-'); + xml_parse_char(ctx, '>'); + xml_dec(ctx); +} + +/*** Processing instructions ***/ + +void +xml_push_pi(struct xml_context *ctx) +{ + TRACE(ctx, "push_pi"); + /* Parses a PI to ctx->value and ctx->name: + * PI ::= '' Char*)))? 
'?>' + * PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) + * Already parsed: 'type = XML_NODE_PI; + n->name = xml_parse_name(ctx, ctx->pool); + if (unlikely(!strcasecmp(n->name, "xml"))) + xml_error(ctx, "Reserved PI target"); + char *p = mp_start_noalign(ctx->pool, 5); + if (!xml_parse_white(ctx, 0)) + xml_parse_seq(ctx, "?>"); + else + while (1) + { + if (xml_get_char(ctx) == '?') + if (xml_peek_char(ctx) == '>') + { + xml_skip_char(ctx); + break; + } + else + *p++ = '?'; + else + p = utf8_32_put(p, xml_last_char(ctx)); + p = mp_spread(ctx->pool, p, 5); + } + *p = 0; + n->len = p - (char *)mp_ptr(ctx->pool); + n->text = mp_end(ctx->pool, p + 1); + if ((ctx->flags & XML_REPORT_PIS) && ctx->h_pi) + ctx->h_pi(ctx); +} + +void +xml_pop_pi(struct xml_context *ctx) +{ + xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_PIS)); + xml_dec(ctx); + TRACE(ctx, "pop_pi"); +} + +void +xml_skip_pi(struct xml_context *ctx) +{ + TRACE(ctx, "skip_pi"); + if (ctx->flags & XML_VALIDATING) + { + struct mempool_state state; + mp_save(ctx->stack, &state); + if (unlikely(!strcasecmp(xml_parse_name(ctx, ctx->stack), "xml"))) + xml_error(ctx, "Reserved PI target"); + mp_restore(ctx->stack, &state); + if (!xml_parse_white(ctx, 0)) + { + xml_parse_seq(ctx, "?>"); + xml_dec(ctx); + return; + } + } + while (1) + if (xml_get_char(ctx) == '?') + if (xml_peek_char(ctx) == '>') + break; + xml_skip_char(ctx); + xml_dec(ctx); +} + +/*** Character references ***/ + +uns +xml_parse_char_ref(struct xml_context *ctx) +{ + TRACE(ctx, "parse_char_ref"); + /* CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' + * Already parsed: '&#' */ + uns v = 0; + if (xml_get_char(ctx) == 'x') + { + if (!(xml_get_cat(ctx) & XML_CHAR_XDIGIT)) + { + xml_error(ctx, "Expected a hexadecimal value of character reference"); + goto recover; + } + do + { + v = (v << 4) + Cxvalue(xml_last_char(ctx)); + } + while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_XDIGIT)); + } + else + { + if (!(xml_last_cat(ctx) & 
XML_CHAR_DIGIT)) + { + xml_error(ctx, "Expected a numeric value of character reference"); + goto recover; + } + do + { + v = v * 10 + xml_last_char(ctx) - '0'; + } + while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT)); + } + uns cat = xml_char_cat(v); + if (!(cat & ctx->cat_unrestricted)) + { + xml_error(ctx, "Character reference out of range"); + goto recover; + } + if (xml_last_char(ctx) == ';') + { + xml_dec(ctx); + return v; + } + xml_error(ctx, "Expected ';'"); +recover: + while (xml_last_char(ctx) != ';') + xml_get_char(ctx); + xml_dec(ctx); + return UNI_REPLACEMENT; +} + +/*** References to general entities ***/ + +static void +xml_parse_ref(struct xml_context *ctx) +{ + /* Reference ::= EntityRef | CharRef + * EntityRef ::= '&' Name ';' + * Already parsed: '&' */ + struct fastbuf *out = &ctx->chars; + if (xml_peek_char(ctx) == '#') + { + xml_skip_char(ctx); + bput_utf8_32(out, xml_parse_char_ref(ctx)); + } + else + { + TRACE(ctx, "parse_ge_ref"); + struct mempool_state state; + mp_save(ctx->stack, &state); + char *name = xml_parse_name(ctx, ctx->stack); + xml_parse_char(ctx, ';'); + struct xml_dtd_entity *ent = xml_dtd_find_entity(ctx, name); + if (!ent) + { + xml_error(ctx, "Unknown entity &%s;", name); + bputc(out, '&'); + bputs(out, name); + bputc(out, ';'); + } + else if (ent->flags & XML_DTD_ENTITY_TRIVIAL) + { + TRACE(ctx, "Trivial entity &%s;", name); + bputs(out, ent->text); + } + else + { + TRACE(ctx, "Pushed entity &%s;", name); + mp_restore(ctx->stack, &state); + xml_dec(ctx); + xml_push_entity(ctx, ent); + return; + } + mp_restore(ctx->stack, &state); + xml_dec(ctx); + } +} + +/*** Character data ***/ + +void +xml_spout_chars(struct fastbuf *fb) +{ + if (fb->bptr < fb->bufend) + return; + struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb); + struct mempool *pool = ctx->pool; + if (fb->bufend != fb->buffer) + { + TRACE(ctx, "growing chars"); + uns len = fb->bufend - fb->buffer; + uns reported = fb->bstop - fb->buffer; + 
fb->buffer = mp_expand(pool); + fb->bufend = fb->buffer + mp_avail(pool); + fb->bptr = fb->buffer + len; + fb->bstop = fb->buffer + reported; + } + else + { + TRACE(ctx, "starting chars"); + mp_save(pool, &ctx->chars_state); + fb->bptr = fb->buffer = fb->bstop = mp_start_noalign(pool, 2); + fb->bufend = fb->buffer + mp_avail(pool) - 1; + } +} + +static inline uns +xml_end_chars(struct xml_context *ctx, char **out) +{ + struct fastbuf *fb = &ctx->chars; + uns len = fb->bptr - fb->buffer; + if (len) + { + TRACE(ctx, "ending chars"); + *fb->bptr = 0; + *out = mp_end(ctx->pool, fb->bptr + 1); + fb->bufend = fb->bstop = fb->bptr = fb->buffer; + } + return len; +} + +static inline uns +xml_report_chars(struct xml_context *ctx, char **out) +{ + struct fastbuf *fb = &ctx->chars; + uns len = fb->bptr - fb->buffer; + if (len) + { + *fb->bptr = 0; + *out = fb->bstop; + fb->bstop = fb->bptr; + } + return len; +} + +static inline uns +xml_flush_chars(struct xml_context *ctx) +{ + char *text, *rtext; + uns len = xml_end_chars(ctx, &text), rlen; + if (len) + { + if (ctx->flags & XML_NO_CHARS) + { + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_ignorable) + ctx->h_ignorable(ctx, text, len); + mp_restore(ctx->pool, &ctx->chars_state); + return 0; + } + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext))) + ctx->h_block(ctx, rtext, rlen); + if (!(ctx->flags & XML_ALLOC_CHARS) && !(ctx->flags & XML_REPORT_CHARS)) + { + mp_restore(ctx->pool, &ctx->chars_state); + return 0; + } + struct xml_node *n = xml_push_dom(ctx, &ctx->chars_state); + n->type = XML_NODE_CHARS; + n->text = text; + n->len = len; + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_chars) + ctx->h_chars(ctx); + } + return len; +} + +static inline void +xml_pop_chars(struct xml_context *ctx) +{ + xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS)); + TRACE(ctx, "pop_chars"); +} + +static inline void +xml_append_chars(struct xml_context *ctx) +{ + TRACE(ctx, "append_chars"); + struct 
fastbuf *out = &ctx->chars; + if (ctx->flags & XML_NO_CHARS) + while (xml_get_char(ctx) != '<') + if (xml_last_cat(ctx) & XML_CHAR_WHITE) + bput_utf8_32(out, xml_last_char(ctx)); + else + { + xml_error(ctx, "This element must not contain character data"); + while (xml_get_char(ctx) != '<'); + break; + } + else + while (xml_get_char(ctx) != '<') + if (xml_last_char(ctx) == '&') + { + xml_inc(ctx); + xml_parse_ref(ctx); + } + else + bput_utf8_32(out, xml_last_char(ctx)); + xml_unget_char(ctx); +} + +/*** CDATA sections ***/ + +static void +xml_skip_cdata(struct xml_context *ctx) +{ + TRACE(ctx, "skip_cdata"); + xml_parse_seq(ctx, "CDATA["); + while (xml_get_char(ctx) != ']' || xml_get_char(ctx) != ']' || xml_get_char(ctx) != '>'); + xml_dec(ctx); +} + +static void +xml_append_cdata(struct xml_context *ctx) +{ + /* CDSect :== '' Char*)) ']]>' + * Already parsed: 'flags & XML_NO_CHARS) + { + xml_error(ctx, "This element must not contain CDATA"); + xml_skip_cdata(ctx); + return; + } + xml_parse_seq(ctx, "CDATA["); + struct fastbuf *out = &ctx->chars; + uns rlen; + char *rtext; + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext))) + ctx->h_block(ctx, rtext, rlen); + while (1) + { + if (xml_get_char(ctx) == ']') + { + if (xml_get_char(ctx) == ']') + if (xml_get_char(ctx) == '>') + break; + else + bputc(out, ']'); + bputc(out, ']'); + } + bput_utf8_32(out, xml_last_char(ctx)); + } + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_cdata && (rlen = xml_report_chars(ctx, &rtext))) + ctx->h_cdata(ctx, rtext, rlen); + xml_dec(ctx); +} + +/*** Attribute values ***/ + +char * +xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr UNUSED) +{ + TRACE(ctx, "parse_attr_value"); + /* AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'" */ + /* FIXME: -- check value constrains / normalize leading/trailing WS and repeated WS */ + struct mempool_state state; + uns quote = xml_parse_quote(ctx); + 
mp_save(ctx->stack, &state); + struct fastbuf *out = &ctx->chars; + struct xml_source *src = ctx->src; + while (1) + { + uns c = xml_get_char(ctx); + if (c == '&') + { + xml_inc(ctx); + xml_parse_ref(ctx); + } + else if (c == quote && src == ctx->src) + break; + else if (c == '<') + xml_error(ctx, "Attribute value must not contain '<'"); + else if (xml_last_cat(ctx) & XML_CHAR_WHITE) + bputc(out, ' '); + else + bput_utf8_32(out, c); + } + mp_restore(ctx->stack, &state); + char *text; + return xml_end_chars(ctx, &text) ? text : ""; +} + +uns +xml_normalize_white(struct xml_context *ctx UNUSED, char *text) +{ + char *s = text, *d = text; + while (*s == 0x20) + s++; + while (1) + { + while (*s & ~0x20) + *d++ = *s++; + if (!*s) + break; + while (*++s == 0x20); + *d++ = 0x20; + } + if (d != text && d[-1] == 0x20) + d--; + *d = 0; + return d - text; +} + +/*** Attributes ***/ + +struct xml_attrs_table; + +static inline uns +xml_attrs_hash(struct xml_attrs_table *t UNUSED, struct xml_node *e, char *n) +{ + return hash_pointer(e) ^ hash_string(n); +} + +static inline int +xml_attrs_eq(struct xml_attrs_table *t UNUSED, struct xml_node *e1, char *n1, struct xml_node *e2, char *n2) +{ + return (e1 == e2) && !strcmp(n1, n2); +} + +static inline void +xml_attrs_init_key(struct xml_attrs_table *t UNUSED, struct xml_attr *a, struct xml_node *e, char *name) +{ + a->elem = e; + a->name = name; + a->val = NULL; + a->user = NULL; + slist_add_tail(&e->attrs, &a->n); +} + +#define HASH_PREFIX(x) xml_attrs_##x +#define HASH_NODE struct xml_attr +#define HASH_KEY_COMPLEX(x) x elem, x name +#define HASH_KEY_DECL struct xml_node *elem, char *name +#define HASH_TABLE_DYNAMIC +#define HASH_GIVE_EQ +#define HASH_GIVE_HASHFN +#define HASH_GIVE_INIT_KEY +#define HASH_WANT_CLEANUP +#define HASH_WANT_REMOVE +#define HASH_WANT_LOOKUP +#define HASH_WANT_FIND +#define HASH_GIVE_ALLOC +XML_HASH_GIVE_ALLOC +#include + +static void +xml_parse_attr(struct xml_context *ctx) +{ + TRACE(ctx, 
"parse_attr"); + /* Attribute ::= Name Eq AttValue */ + struct xml_node *e = ctx->node; + char *n = xml_parse_name(ctx, ctx->pool); + struct xml_attr *a = xml_attrs_lookup(ctx->tab_attrs, e, n); + xml_parse_eq(ctx); + char *v = xml_parse_attr_value(ctx, NULL); + if (a->val) + { + xml_error(ctx, "Attribute %s is not unique in element <%s>", n, e->name); + return; + } + a->val = v; + if (!e->dtd) + a->dtd = NULL; + else if (!(a->dtd = xml_dtd_find_attr(ctx, e->dtd, a->name))) + xml_error(ctx, "Undefined attribute %s in element <%s>", n, e->name); + else + xml_validate_attr(ctx, a->dtd, a->val); +} + +struct xml_attr * +xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name) +{ + return xml_attrs_find(ctx->tab_attrs, node, name); +} + +char * +xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name) +{ + struct xml_attr *attr = xml_attrs_find(ctx->tab_attrs, node, name); + if (attr) + return attr->val; + if (!node->dtd) + return NULL; + struct xml_dtd_attr *dtd = xml_dtd_find_attr(ctx, node->dtd, name); + return dtd ? dtd->default_value : NULL; +} + +void +xml_attrs_table_init(struct xml_context *ctx) +{ + xml_attrs_init(ctx->tab_attrs = xml_hash_new(ctx->pool, sizeof(struct xml_attrs_table))); +} + +void +xml_attrs_table_cleanup(struct xml_context *ctx) +{ + xml_attrs_cleanup(ctx->tab_attrs); +} + +/*** Elements ***/ + +static uns +xml_validate_element(struct xml_dtd_elem_node *root, struct xml_dtd_elem *elem) +{ + if (root->elem) + return elem == root->elem; + else + SLIST_FOR_EACH(struct xml_dtd_elem_node *, son, root->sons) + if (xml_validate_element(son, elem)) + return 1; + return 0; +} + +static void +xml_push_element(struct xml_context *ctx) +{ + TRACE(ctx, "push_element"); + /* EmptyElemTag | STag + * EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' + * STag ::= '<' Name (S Attribute)* S? 
'>' + * Already parsed: '<' */ + struct xml_node *e = xml_push_dom(ctx, NULL); + clist_init(&e->sons); + e->type = XML_NODE_ELEM; + e->name = xml_parse_name(ctx, ctx->pool); + slist_init(&e->attrs); + if (!e->parent) + { + ctx->dom = e; + if (ctx->doctype && strcmp(e->name, ctx->doctype)) + xml_error(ctx, "The root element <%s> does not match the document type <%s>", e->name, ctx->doctype); + } + if (!ctx->dtd) + e->dtd = NULL; + else if (!(e->dtd = xml_dtd_find_elem(ctx, e->name))) + xml_error(ctx, "Undefined element <%s>", e->name); + else + { + struct xml_dtd_elem *dtd = e->dtd, *parent_dtd = e->parent ? e->parent->dtd : NULL; + if (dtd->type == XML_DTD_ELEM_MIXED) + ctx->flags &= ~XML_NO_CHARS; + else + ctx->flags |= XML_NO_CHARS; + if (parent_dtd) + if (parent_dtd->type == XML_DTD_ELEM_EMPTY) + xml_error(ctx, "Empty element must not contain children"); + else if (parent_dtd->type != XML_DTD_ELEM_ANY) + { + // FIXME: validate regular expressions + if (!xml_validate_element(parent_dtd->node, dtd)) + xml_error(ctx, "Unexpected element <%s>", e->name); + } + } + while (1) + { + uns white = xml_parse_white(ctx, 0); + uns c = xml_get_char(ctx); + if (c == '/') + { + xml_parse_char(ctx, '>'); + ctx->flags |= XML_EMPTY_ELEM_TAG; + break; + } + else if (c == '>') + break; + else if (!white) + xml_fatal_expected_white(ctx); + xml_unget_char(ctx); + xml_parse_attr(ctx); + } + if (e->dtd) + SLIST_FOR_EACH(struct xml_dtd_attr *, a, e->dtd->attrs) + if (a->default_mode == XML_ATTR_REQUIRED) + { + if (!xml_attrs_find(ctx->tab_attrs, e, a->name)) + xml_error(ctx, "Missing required attribute %s in element <%s>", a->name, e->name); + } + else if (a->default_mode != XML_ATTR_IMPLIED && ctx->flags & XML_ALLOC_DEFAULT_ATTRS) + { + struct xml_attr *attr = xml_attrs_lookup(ctx->tab_attrs, e, a->name); + if (!attr->val) + attr->val = a->default_value; + } + if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_stag) + ctx->h_stag(ctx); +} + +static void +xml_pop_element(struct xml_context 
*ctx) +{ + TRACE(ctx, "pop_element"); + if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_etag) + ctx->h_etag(ctx); + struct xml_node *e = ctx->node; + uns free = !(ctx->flags & XML_ALLOC_TAGS); + if (free) + { + if (!e->parent) + ctx->dom = NULL; + /* Restore hash table of attributes */ + SLIST_FOR_EACH(struct xml_attr *, a, e->attrs) + xml_attrs_remove(ctx->tab_attrs, a); + struct xml_node *n; + while (n = clist_head(&e->sons)) + { + if (n->type == XML_NODE_ELEM) + { + SLIST_FOR_EACH(struct xml_attr *, a, n->attrs) + xml_attrs_remove(ctx->tab_attrs, a); + clist_insert_list_after(&n->sons, &n->n); + } + clist_remove(&n->n); + } + } + xml_pop_dom(ctx, free); + xml_dec(ctx); +} + +static void +xml_parse_etag(struct xml_context *ctx) +{ + /* ETag ::= '' + * Already parsed: '<' */ + struct xml_node *e = ctx->node; + ASSERT(e); + char *n = e->name; + while (*n) + { + uns c; + n = utf8_32_get(n, &c); + if (xml_get_char(ctx) != c) + goto recover; + } + xml_parse_white(ctx, 0); + if (xml_get_char(ctx) != '>') + { +recover: + xml_error(ctx, "Invalid ETag, expected ", e->name); + while (xml_get_char(ctx) != '>'); + } + xml_dec(ctx); +} + +/*** Document type declaration ***/ + +static void +xml_parse_doctype_decl(struct xml_context *ctx) +{ + TRACE(ctx, "parse_doctype_decl"); + /* doctypedecl ::= '' + * Already parsed: '' */ + if (ctx->doctype) + xml_fatal(ctx, "Multiple document types not allowed"); + xml_parse_seq(ctx, "DOCTYPE"); + xml_parse_white(ctx, 1); + ctx->doctype = xml_parse_name(ctx, ctx->pool); + TRACE(ctx, "doctype=%s", ctx->doctype); + uns c; + if (xml_parse_white(ctx, 0) && ((c = xml_peek_char(ctx)) == 'S' || c == 'P')) + { + if (c == 'S') + { + xml_parse_seq(ctx, "SYSTEM"); + xml_parse_white(ctx, 1); + ctx->system_id = xml_parse_system_literal(ctx, ctx->pool); + } + else + { + xml_parse_seq(ctx, "PUBLIC"); + xml_parse_white(ctx, 1); + ctx->public_id = xml_parse_pubid_literal(ctx, ctx->pool); + xml_parse_white(ctx, 1); + ctx->system_id = 
xml_parse_system_literal(ctx, ctx->pool); + } + xml_parse_white(ctx, 0); + ctx->flags |= XML_HAS_EXTERNAL_SUBSET; + } + if (xml_peek_char(ctx) == '[') + { + ctx->flags |= XML_HAS_INTERNAL_SUBSET; + xml_skip_char(ctx); + xml_inc(ctx); + } + if (ctx->h_doctype_decl) + ctx->h_doctype_decl(ctx); +} + + + +/////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/* DTD: Internal subset */ + +static void +xml_parse_subset(struct xml_context *ctx, uns external) +{ + // FIXME: + // -- comments/pi have no parent + // -- conditional sections in external subset + // -- check corectness of parameter entities + + /* '[' intSubset ']' + * intSubset :== (markupdecl | DeclSep) + * Already parsed: '[' + * + * extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)* + */ + while (1) + { + xml_parse_white(ctx, 0); + uns c = xml_get_char(ctx); + xml_inc(ctx); + if (c == '<') + if ((c = xml_get_char(ctx)) == '!') + switch (c = xml_get_char(ctx)) + { + case '-': + xml_push_comment(ctx); + xml_pop_comment(ctx); + break; + case 'N': + xml_parse_seq(ctx, "OTATION"); + xml_parse_notation_decl(ctx); + break; + case 'E': + if ((c = xml_get_char(ctx)) == 'N') + { + xml_parse_seq(ctx, "TITY"); + xml_parse_entity_decl(ctx); + } + else if (c == 'L') + { + xml_parse_seq(ctx, "EMENT"); + xml_parse_element_decl(ctx); + } + else + goto invalid_markup; + break; + case 'A': + xml_parse_seq(ctx, "TTLIST"); + xml_parse_attr_list_decl(ctx); + break; + default: + goto invalid_markup; + } + else if (c == '?') + { + xml_push_pi(ctx); + xml_pop_pi(ctx); + } + else + goto invalid_markup; + else if (c == '%') + xml_parse_pe_ref(ctx); + else if (c == ']' && !external) + { + break; + } + else if (c == '>' && external) + { + break; + } + else + goto invalid_markup; + } + xml_dec(ctx); + return; +invalid_markup: ; + xml_fatal(ctx, "Invalid markup in the %s subset", external ? 
"external" : "internal"); +} + +/*** The State Machine ***/ + +uns +xml_next(struct xml_context *ctx) +{ + /* A nasty state machine */ + +#define PULL(x) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##x; case XML_STATE_##x: ; } while (0) +#define PULL_STATE(x, s) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##s, XML_STATE_##x; case XML_STATE_##s: ; } while (0) + + TRACE(ctx, "xml_next (state=%u)", ctx->state); + jmp_buf throw_buf; + ctx->throw_buf = &throw_buf; + if (setjmp(throw_buf)) + { +error: + if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal) + ctx->h_fatal(ctx); + TRACE(ctx, "raised fatal error"); + return ctx->state = XML_STATE_EOF; + } + uns c; + switch (ctx->state) + { + case XML_STATE_START: + TRACE(ctx, "entering prolog"); + ctx->flags |= XML_SRC_DOCUMENT | XML_SRC_EXPECTED_DECL; + if (ctx->h_document_start) + ctx->h_document_start(ctx); + /* XMLDecl */ + xml_refill(ctx); + if (ctx->h_xml_decl) + ctx->h_xml_decl(ctx); + PULL(XML_DECL); + + /* Misc* (doctypedecl Misc*)? 
*/ + while (1) + { + xml_parse_white(ctx, 0); + xml_parse_char(ctx, '<'); + xml_inc(ctx); + if ((c = xml_get_char(ctx)) == '?') + /* Processing intruction */ + if (!(ctx->flags & XML_REPORT_PIS)) + xml_skip_pi(ctx); + else + { + xml_push_pi(ctx); + PULL_STATE(PI, PROLOG_PI); + xml_pop_pi(ctx); + } + else if (c != '!') + { + /* Found the root tag */ + xml_unget_char(ctx); + goto first_tag; + } + else if (xml_get_char(ctx) == '-') + if (!(ctx->flags & XML_REPORT_COMMENTS)) + xml_skip_comment(ctx); + else + { + xml_push_comment(ctx); + PULL_STATE(COMMENT, PROLOG_COMMENT); + xml_pop_comment(ctx); + } + else + { + /* DocTypeDecl */ + xml_unget_char(ctx); + xml_parse_doctype_decl(ctx); + PULL(DOCTYPE_DECL); + if (ctx->flags & XML_HAS_DTD) + if (ctx->flags & XML_PARSE_DTD) + { + xml_dtd_init(ctx); + if (ctx->h_dtd_start) + ctx->h_dtd_start(ctx); + if (ctx->flags & XML_HAS_INTERNAL_SUBSET) + { + xml_parse_subset(ctx, 0); + xml_dec(ctx); + } + if (ctx->flags & XML_HAS_EXTERNAL_SUBSET) + { + struct xml_dtd_entity ent = { + .system_id = ctx->system_id, + .public_id = ctx->public_id, + }; + xml_parse_white(ctx, 0); + xml_parse_char(ctx, '>'); + xml_unget_char(ctx); + ASSERT(ctx->h_resolve_entity); + ctx->h_resolve_entity(ctx, &ent); + ctx->flags |= XML_SRC_EXPECTED_DECL; + xml_parse_subset(ctx, 1); + xml_unget_char(ctx);; + } + if (ctx->h_dtd_end) + ctx->h_dtd_end(ctx); + } + else if (ctx->flags & XML_HAS_INTERNAL_SUBSET) + xml_skip_internal_subset(ctx); + xml_parse_white(ctx, 0); + xml_parse_char(ctx, '>'); + xml_dec(ctx); + } + } + + case XML_STATE_CHARS: + + while (1) + { + if (xml_peek_char(ctx) != '<') + { + /* CharData */ + xml_append_chars(ctx); + continue; + } + else + xml_skip_char(ctx); + xml_inc(ctx); +first_tag: + + if ((c = xml_get_char(ctx)) == '?') + { + /* PI */ + if (!(ctx->flags & (XML_REPORT_PIS | XML_ALLOC_PIS))) + xml_skip_pi(ctx); + else + { + if (xml_flush_chars(ctx)) + { + PULL_STATE(CHARS, CHARS_BEFORE_PI); + xml_pop_chars(ctx); + } + xml_push_pi(ctx); 
+ PULL(PI); + xml_pop_pi(ctx); + } + } + + else if (c == '!') + if ((c = xml_get_char(ctx)) == '-') + { + /* Comment */ + if (!(ctx->flags & (XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS))) + xml_skip_comment(ctx); + else + { + if (xml_flush_chars(ctx)) + { + PULL_STATE(CHARS, CHARS_BEFORE_COMMENT); + xml_pop_chars(ctx); + } + xml_push_comment(ctx); + PULL(COMMENT); + xml_pop_comment(ctx); + } + } + else if (c == '[') + { + /* CDATA */ + xml_append_cdata(ctx); + } + else + xml_fatal(ctx, "Unexpected character after 'flags & XML_EMPTY_ELEM_TAG) + goto pop_element; + } + + else + { + /* ETag */ + if (xml_flush_chars(ctx)) + { + PULL_STATE(CHARS, CHARS_BEFORE_ETAG); + xml_pop_chars(ctx); + } + + xml_parse_etag(ctx); +pop_element: + PULL(ETAG); + xml_pop_element(ctx); + if (!ctx->node) + goto epilog; + } + } + +epilog: + /* Misc* */ + TRACE(ctx, "entering epilog"); + while (1) + { + /* Epilog whitespace is the only place, where a valid document can reach EOF */ + if (setjmp(throw_buf)) + if (ctx->err_code == XML_ERR_EOF) + { + TRACE(ctx, "reached EOF"); + ctx->state = XML_STATE_EOF; + if (ctx->h_document_end) + ctx->h_document_end(ctx); + case XML_STATE_EOF: + ctx->err_code = 0; + ctx->err_msg = NULL; + return XML_STATE_EOF; + } + else + goto error; + xml_parse_white(ctx, 0); + if (setjmp(throw_buf)) + goto error; + + /* Misc */ + xml_parse_char(ctx, '<'); + xml_inc(ctx); + if ((c = xml_get_char(ctx)) == '?') + /* Processing instruction */ + if (!(ctx->flags & XML_REPORT_PIS)) + xml_skip_pi(ctx); + else + { + xml_push_pi(ctx); + PULL_STATE(PI, EPILOG_PI); + xml_pop_pi(ctx); + } + else if (c == '!') + { + xml_parse_char(ctx, '-'); + /* Comment */ + if (!(ctx->flags & XML_REPORT_COMMENTS)) + xml_skip_comment(ctx); + else + { + xml_push_comment(ctx); + PULL_STATE(COMMENT, EPILOG_COMMENT); + xml_pop_comment(ctx); + } + } + else + xml_fatal(ctx, "Syntax error in the epilog"); + } + + } + ASSERT(0); +} + +uns +xml_next_state(struct xml_context *ctx, uns pull) +{ + uns saved = 
ctx->pull; + ctx->pull = pull; + uns res = xml_next(ctx); + ctx->pull = saved; + return res; +} + +uns +xml_skip_element(struct xml_context *ctx) +{ + ASSERT(ctx->state == XML_STATE_STAG); + struct xml_node *node = ctx->node; + uns saved = ctx->pull, res; + ctx->pull = XML_PULL_ETAG; + while ((res = xml_next(ctx)) && ctx->node != node); + ctx->pull = saved; + return res; +} + +uns +xml_parse(struct xml_context *ctx) +{ + /* This cycle should run only once unless the user overrides the value of ctx->pull in a SAX handler */ + do + { + ctx->pull = 0; + } + while (xml_next(ctx)); + return ctx->err_code; +} + +char * +xml_merge_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool) +{ + ASSERT(node->type == XML_NODE_ELEM); + char *p = mp_start_noalign(pool, 1); + XML_NODE_FOR_EACH(son, node) + if (son->type == XML_NODE_CHARS) + { + p = mp_spread(pool, p, son->len + 1); + memcpy(p, son->text, son->len); + p += son->len; + } + *p++ = 0; + return mp_end(pool, p); +} + +static char * +xml_append_dom_chars(char *p, struct mempool *pool, struct xml_node *node) +{ + XML_NODE_FOR_EACH(son, node) + if (son->type == XML_NODE_CHARS) + { + p = mp_spread(pool, p, son->len + 1); + memcpy(p, son->text, son->len); + p += son->len; + } + else if (son->type == XML_NODE_ELEM) + p = xml_append_dom_chars(p, pool, son); + return p; +} + +char * +xml_merge_dom_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool) +{ + ASSERT(node->type == XML_NODE_ELEM); + char *p = mp_start_noalign(pool, 1); + p = xml_append_dom_chars(p, pool, node); + *p++ = 0; + return mp_end(pool, p); +} diff --git a/xml/source.c b/xml/source.c new file mode 100644 index 00000000..f0d0cdb0 --- /dev/null +++ b/xml/source.c @@ -0,0 +1,486 @@ +/* + * Sherlock Library -- A simple XML parser + * + * (c) 2007--2008 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. 
+ */ + +#undef LOCAL_DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include + +/*** Charecter categorization ***/ + +#include "obj/xml/unicat.c" + +static void +xml_init_cats(struct xml_context *ctx) +{ + if (!(ctx->flags & XML_VERSION_1_1)) + { + ctx->cat_chars = XML_CHAR_VALID_1_0; + ctx->cat_unrestricted = XML_CHAR_VALID_1_0; + ctx->cat_new_line = XML_CHAR_NEW_LINE_1_0; + ctx->cat_name = XML_CHAR_NAME_1_0; + ctx->cat_sname = XML_CHAR_SNAME_1_0; + } + else + { + ctx->cat_chars = XML_CHAR_VALID_1_1; + ctx->cat_unrestricted = XML_CHAR_UNRESTRICTED_1_1; + ctx->cat_new_line = XML_CHAR_NEW_LINE_1_1; + ctx->cat_name = XML_CHAR_NAME_1_1; + ctx->cat_sname = XML_CHAR_SNAME_1_1; + } +} + +/*** Reading of document/external entities ***/ + +static void NONRET +xml_eof(struct xml_context *ctx) +{ + ctx->err_msg = "Unexpected EOF"; + ctx->err_code = XML_ERR_EOF; + xml_throw(ctx); +} + +void NONRET +xml_fatal_nested(struct xml_context *ctx) +{ + xml_fatal(ctx, "Entity is not nested correctly"); +} + +static inline void +xml_add_char(u32 **bstop, uns c) +{ + *(*bstop)++ = c; + *(*bstop)++ = xml_char_cat(c); +} + +struct xml_source * +xml_push_source(struct xml_context *ctx) +{ + xml_push(ctx); + struct xml_source *src = ctx->src; + if (src) + { + src->bptr = ctx->bptr; + src->bstop = ctx->bstop; + } + src = mp_alloc_zero(ctx->stack, sizeof(*src)); + src->next = ctx->src; + src->saved_depth = ctx->depth; + ctx->src = src; + ctx->flags &= ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_DOCUMENT); + ctx->bstop = ctx->bptr = src->buf; + ctx->depth = 0; + return src; +} + +struct xml_source * +xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb) +{ + struct xml_source *src = xml_push_source(ctx); + src->fb = fb; + return src; +} + +static void +xml_close_source(struct xml_source *src) +{ + bclose(src->fb); + if (src->wrapped_fb) + bclose(src->wrapped_fb); +} + +static void +xml_pop_source(struct xml_context *ctx) +{ + TRACE(ctx, 
"pop_source"); + if (unlikely(ctx->depth != 0)) + xml_fatal(ctx, "Unexpected end of entity"); + struct xml_source *src = ctx->src; + if (!src) + xml_fatal(ctx, "Undefined source"); + xml_close_source(src); + ctx->depth = src->saved_depth; + ctx->src = src = src->next; + if (src) + { + ctx->bptr = src->bptr; + ctx->bstop = src->bstop; + } + xml_pop(ctx); + if (unlikely(!src)) + xml_eof(ctx); +} + +void +xml_sources_cleanup(struct xml_context *ctx) +{ + struct xml_source *s; + while (s = ctx->src) + { + ctx->src = s->next; + xml_close_source(s); + } +} + +static void xml_refill_utf8(struct xml_context *ctx); + +void +xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent UNUSED) +{ + xml_error(ctx, "References to external entities are not supported"); +} + +void +xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent) +{ + TRACE(ctx, "xml_push_entity"); + struct xml_source *src; + if (ent->flags & XML_DTD_ENTITY_EXTERNAL) + { + ASSERT(ctx->h_resolve_entity); + ctx->h_resolve_entity(ctx, ent); + ctx->flags |= XML_SRC_EXPECTED_DECL; + src = ctx->src; + } + else + { + src = xml_push_source(ctx); + fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, strlen(ent->text), 0); + } + src->refill = xml_refill_utf8; + src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line; + src->refill_cat2 = ctx->cat_new_line; +} + +static uns +xml_error_restricted(struct xml_context *ctx, uns c) +{ + if (c == ~1U) + xml_error(ctx, "Corrupted encoding"); + else + xml_error(ctx, "Restricted char U+%04X", c); + return UNI_REPLACEMENT; +} + +void xml_parse_decl(struct xml_context *ctx); + +#define REFILL(ctx, func, params...) \ + struct xml_source *src = ctx->src; \ + struct fastbuf *fb = src->fb; \ + if (ctx->bptr == ctx->bstop) \ + ctx->bptr = ctx->bstop = src->buf; \ + uns c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \ + u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \ + *last_0xd = src->pending_0xd ? 
bstop : NULL; \ + do \ + { \ + c = func(fb, ##params); \ + uns t = xml_char_cat(c); \ + if (t & t1) \ + /* Typical branch */ \ + *bstop++ = c, *bstop++ = t; \ + else if (t & t2) \ + { \ + /* New line */ \ + /* XML 1.0: 0xA | 0xD | 0xD 0xA */ \ + /* XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ \ + if (c == 0xd) \ + last_0xd = bstop + 2; \ + else if (c != 0x2028 && last_0xd == bstop) \ + { \ + last_0xd = NULL; \ + continue; \ + } \ + xml_add_char(&bstop, 0xa), row++; \ + } \ + else if (c == '>') \ + { \ + /* Used only in XML/TextDecl to switch the encoding */ \ + *bstop++ = c, *bstop++ = t; \ + break; \ + } \ + else if (~c) \ + /* Restricted character */ \ + xml_add_char(&bstop, xml_error_restricted(ctx, c)); \ + else \ + { \ + /* EOF */ \ + ctx->flags |= XML_SRC_EOF; \ + break; \ + } \ + } \ + while (bstop < bend); \ + src->pending_0xd = (last_0xd == bstop); \ + ctx->bstop = bstop; \ + src->row = row; + +static void +xml_refill_utf8(struct xml_context *ctx) +{ + REFILL(ctx, bget_utf8_repl, ~1U); +} + +static void +xml_refill_utf16_le(struct xml_context *ctx) +{ + REFILL(ctx, bget_utf16_le_repl, ~1U); +} + +static void +xml_refill_utf16_be(struct xml_context *ctx) +{ + REFILL(ctx, bget_utf16_be_repl, ~1U); +} + +#undef REFILL + +void +xml_refill(struct xml_context *ctx) +{ + do + { + if (ctx->flags & XML_SRC_EOF) + xml_pop_source(ctx); + else if (ctx->flags & XML_SRC_EXPECTED_DECL) + xml_parse_decl(ctx); + else + { + ctx->src->refill(ctx); + TRACE(ctx, "refilled %u characters", (uns)((ctx->bstop - ctx->bptr) / 2)); + } + } + while (ctx->bptr == ctx->bstop); +} + +static uns +xml_source_row(struct xml_context *ctx, struct xml_source *src) +{ + uns row = src->row; + for (u32 *p = ctx->bstop; p != ctx->bptr; p -= 2) + if (p[-1] & src->refill_cat2) + row--; + return row + 1; +} + +uns +xml_row(struct xml_context *ctx) +{ + return ctx->src ? 
xml_source_row(ctx, ctx->src) : 0; +} + +/* Document/external entity header */ + +static char * +xml_parse_encoding_name(struct xml_context *ctx) +{ + /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" */ + char *p = mp_start_noalign(ctx->pool, 1); + uns q = xml_parse_quote(ctx); + if (unlikely(!(xml_get_cat(ctx) & XML_CHAR_ENC_SNAME))) + xml_fatal(ctx, "Invalid character in the encoding name"); + while (1) + { + p = mp_spread(ctx->pool, p, 2); + *p++ = xml_last_char(ctx); + if (xml_get_char(ctx) == q) + break; + if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME))) + xml_fatal(ctx, "Invalid character in the encoding name"); + } + *p++ = 0; + return mp_end(ctx->pool, p); +} + +static void +xml_init_charconv(struct xml_context *ctx, int cs) +{ + // XXX: with a direct access to libucw-charset tables could be faster + struct xml_source *src = ctx->src; + TRACE(ctx, "wrapping charset %s", charset_name(cs)); + src->wrapped_fb = src->fb; + src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8); +} + +void +xml_parse_decl(struct xml_context *ctx) +{ + TRACE(ctx, "xml_parse_decl"); + struct xml_source *src = ctx->src; + ctx->flags &= ~XML_SRC_EXPECTED_DECL; + uns doc = ctx->flags & XML_SRC_DOCUMENT; + + /* Setup valid Unicode ranges and force the reader to abort refill() after each '>', where we can switch encoding or XML version */ + if (doc) + xml_init_cats(ctx); + src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line & ~XML_CHAR_GT; + src->refill_cat2 = ctx->cat_new_line; + + /* Initialize the supplied charset (if any) or try to guess it */ + char *expected_encoding = src->expected_encoding; + src->refill = xml_refill_utf8; + int bom = bpeekc(src->fb); + if (bom < 0) + ctx->flags |= XML_SRC_EOF; + if (!src->fb_encoding) + { + if (bom == 0xfe) + src->refill = xml_refill_utf16_be; + else if (bom == 0xff) + src->refill = xml_refill_utf16_le; + } + else + { + int cs = find_charset_by_name(src->fb_encoding); + if 
(cs == CONV_CHARSET_UTF8) + {} + else if (cs >= 0) + { + xml_init_charconv(ctx, cs); + bom = 0; + } + else if (strcasecmp(src->fb_encoding, "UTF-16")) + { + src->refill = xml_refill_utf16_be; + if (bom == 0xff) + src->refill = xml_refill_utf16_le; + } + else if (strcasecmp(src->fb_encoding, "UTF-16BE")) + src->refill = xml_refill_utf16_be; + else if (strcasecmp(src->fb_encoding, "UTF-16LE")) + src->refill = xml_refill_utf16_le; + else + { + xml_error(ctx, "Unknown encoding '%s'", src->fb_encoding); + expected_encoding = NULL; + } + } + uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be; + if (utf16) + src->fb_encoding = (src->refill == xml_refill_utf16_be) ? "UTF-16BE" : "UTF-16LE"; + if (!expected_encoding) + expected_encoding = src->fb_encoding; + if (bom > 0 && xml_peek_char(ctx) == 0xfeff) + xml_skip_char(ctx); + else if (utf16) + xml_error(ctx, "Missing or corrupted BOM"); + TRACE(ctx, "Initial encoding=%s", src->fb_encoding ? : "?"); + + /* Look ahead for presence of XMLDecl or optional TextDecl */ + if (!(ctx->flags & XML_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf)) + xml_refill(ctx); + u32 *bptr = ctx->bptr; + uns have_decl = (12 <= ctx->bstop - ctx->bptr && (bptr[11] & XML_CHAR_WHITE) && + bptr[0] == '<' && bptr[2] == '?' 
&& (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L'); + if (!have_decl) + { + if (doc) + xml_fatal(ctx, "Missing or corrupted XML header"); + else if (expected_encoding && strcasecmp(src->expected_encoding, "UTF-8") && !utf16) + xml_error(ctx, "Missing or corrupted entity header"); + goto exit; + } + ctx->bptr = bptr + 12; + xml_parse_white(ctx, 0); + + /* Parse version string (mandatory in XMLDecl, optional in TextDecl) */ + if (xml_peek_char(ctx) == 'v') + { + xml_parse_seq(ctx, "version"); + xml_parse_eq(ctx); + char *version = xml_parse_pubid_literal(ctx, ctx->pool); + TRACE(ctx, "version=%s", version); + uns v = 0; + if (!strcmp(version, "1.1")) + v = XML_VERSION_1_1; + else if (strcmp(version, "1.0")) + { + xml_error(ctx, "Unknown XML version string '%s'", version); + version = "1.0"; + } + if (doc) + { + ctx->version_str = version; + ctx->flags |= v; + } + else if (v > (ctx->flags & XML_VERSION_1_1)) + xml_error(ctx, "XML 1.1 external entity included from XML 1.0 document"); + if (!xml_parse_white(ctx, !doc)) + goto end; + } + else if (doc) + { + xml_error(ctx, "Expected XML version"); + ctx->version_str = "1.0"; + } + + /* Parse encoding string (optional in XMLDecl, mandatory in TextDecl) */ + if (xml_peek_char(ctx) == 'e') + { + xml_parse_seq(ctx, "encoding"); + xml_parse_eq(ctx); + src->decl_encoding = xml_parse_encoding_name(ctx); + TRACE(ctx, "encoding=%s", src->decl_encoding); + if (!xml_parse_white(ctx, 0)) + goto end; + } + else if (!doc) + xml_error(ctx, "Expected XML encoding"); + + /* Parse whether the document is standalone (optional in XMLDecl) */ + if (doc && xml_peek_char(ctx) == 's') + { + xml_parse_seq(ctx, "standalone"); + xml_parse_eq(ctx); + uns c = xml_parse_quote(ctx); + if (ctx->standalone = (xml_peek_char(ctx) == 'y')) + xml_parse_seq(ctx, "yes"); + else + xml_parse_seq(ctx, "no"); + xml_parse_char(ctx, c); + TRACE(ctx, "standalone=%d", ctx->standalone); + xml_parse_white(ctx, 0); + } +end: + 
xml_parse_seq(ctx, "?>"); + + /* Switch to the final encoding */ + if (src->decl_encoding) + { + int cs = find_charset_by_name(src->decl_encoding); + if (cs < 0 && !expected_encoding) + xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding); + else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8) + { + xml_init_charconv(ctx, cs); + src->fb_encoding = src->decl_encoding; + } + else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 || + !(!strcasecmp(src->decl_encoding, "UTF-16") || + (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) || + (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE"))))) + xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding); + } + if (!src->fb_encoding) + src->fb_encoding = "UTF-8"; + TRACE(ctx, "Final encoding=%s", src->fb_encoding); + +exit: + /* Update valid Unicode ranges */ + if (doc) + xml_init_cats(ctx); + src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line; + src->refill_cat2 = ctx->cat_new_line; +} diff --git a/xml/unicat.pl b/xml/unicat.pl new file mode 100755 index 00000000..b86106f2 --- /dev/null +++ b/xml/unicat.pl @@ -0,0 +1,165 @@ +#!/usr/bin/perl +# +# UCW Library -- Character map for the XML parser +# +# (c) 2007 Pavel Charvat +# +# This software may be freely distributed and used according to the terms +# of the GNU Lesser General Public License. 
+# + +my @cat = (); +my @lcat = (); +my %ids = (); +my %cls = (); +for (my $i = 0; $i < 0x10000; $i++) { $cat[$i] = 0; } +for (my $i = 0; $i < 0x11; $i++) { $lcat[$i] = 0; } + +my @white = (0x9, 0xA, 0xD, 0x20); +my @base_char_1_0 = ( + [0x0041,0x005A], [0x0061,0x007A], [0x00C0,0x00D6], [0x00D8,0x00F6], [0x00F8,0x00FF], [0x0100,0x0131], + [0x0134,0x013E], [0x0141,0x0148], [0x014A,0x017E], [0x0180,0x01C3], [0x01CD,0x01F0], [0x01F4,0x01F5], + [0x01FA,0x0217], [0x0250,0x02A8], [0x02BB,0x02C1], 0x0386, [0x0388,0x038A], 0x038C, [0x038E,0x03A1], + [0x03A3,0x03CE], [0x03D0,0x03D6], 0x03DA, 0x03DC, 0x03DE, 0x03E0, [0x03E2,0x03F3], [0x0401,0x040C], + [0x040E,0x044F], [0x0451,0x045C], [0x045E,0x0481], [0x0490,0x04C4], [0x04C7,0x04C8], [0x04CB,0x04CC], + [0x04D0,0x04EB], [0x04EE,0x04F5], [0x04F8,0x04F9], [0x0531,0x0556], 0x0559, [0x0561,0x0586], [0x05D0,0x05EA], + [0x05F0,0x05F2], [0x0621,0x063A], [0x0641,0x064A], [0x0671,0x06B7], [0x06BA,0x06BE], [0x06C0,0x06CE], + [0x06D0,0x06D3], 0x06D5, [0x06E5,0x06E6], [0x0905,0x0939], 0x093D, [0x0958,0x0961], [0x0985,0x098C], + [0x098F,0x0990], [0x0993,0x09A8], [0x09AA,0x09B0], 0x09B2, [0x09B6,0x09B9], [0x09DC,0x09DD], [0x09DF,0x09E1], + [0x09F0,0x09F1], [0x0A05,0x0A0A], [0x0A0F,0x0A10], [0x0A13,0x0A28], [0x0A2A,0x0A30], [0x0A32,0x0A33], + [0x0A35,0x0A36], [0x0A38,0x0A39], [0x0A59,0x0A5C], 0x0A5E, [0x0A72,0x0A74], [0x0A85,0x0A8B], 0x0A8D, + [0x0A8F,0x0A91], [0x0A93,0x0AA8], [0x0AAA,0x0AB0], [0x0AB2,0x0AB3], [0x0AB5,0x0AB9], 0x0ABD, 0x0AE0, + [0x0B05,0x0B0C], [0x0B0F,0x0B10], [0x0B13,0x0B28], [0x0B2A,0x0B30], [0x0B32,0x0B33], [0x0B36,0x0B39], + 0x0B3D, [0x0B5C,0x0B5D], [0x0B5F,0x0B61], [0x0B85,0x0B8A], [0x0B8E,0x0B90], [0x0B92,0x0B95], [0x0B99,0x0B9A], + 0x0B9C, [0x0B9E,0x0B9F], [0x0BA3,0x0BA4], [0x0BA8,0x0BAA], [0x0BAE,0x0BB5], [0x0BB7,0x0BB9], [0x0C05,0x0C0C], + [0x0C0E,0x0C10], [0x0C12,0x0C28], [0x0C2A,0x0C33], [0x0C35,0x0C39], [0x0C60,0x0C61], [0x0C85,0x0C8C], + [0x0C8E,0x0C90], [0x0C92,0x0CA8], [0x0CAA,0x0CB3], [0x0CB5,0x0CB9], 
0x0CDE, [0x0CE0,0x0CE1], [0x0D05,0x0D0C], + [0x0D0E,0x0D10], [0x0D12,0x0D28], [0x0D2A,0x0D39], [0x0D60,0x0D61], [0x0E01,0x0E2E], 0x0E30, [0x0E32,0x0E33], + [0x0E40,0x0E45], [0x0E81,0x0E82], 0x0E84, [0x0E87,0x0E88], 0x0E8A, 0x0E8D, [0x0E94,0x0E97], [0x0E99,0x0E9F], + [0x0EA1,0x0EA3], 0x0EA5, 0x0EA7, [0x0EAA,0x0EAB], [0x0EAD,0x0EAE], 0x0EB0, [0x0EB2,0x0EB3], 0x0EBD, + [0x0EC0,0x0EC4], [0x0F40,0x0F47], [0x0F49,0x0F69], [0x10A0,0x10C5], [0x10D0,0x10F6], 0x1100, [0x1102,0x1103], + [0x1105,0x1107], 0x1109, [0x110B,0x110C], [0x110E,0x1112], 0x113C, 0x113E, 0x1140, 0x114C, 0x114E, 0x1150, + [0x1154,0x1155], 0x1159, [0x115F,0x1161], 0x1163, 0x1165, 0x1167, 0x1169, [0x116D,0x116E], [0x1172,0x1173], + 0x1175, 0x119E, 0x11A8, 0x11AB, [0x11AE,0x11AF], [0x11B7,0x11B8], 0x11BA, [0x11BC,0x11C2], 0x11EB, 0x11F0, + 0x11F9, [0x1E00,0x1E9B], [0x1EA0,0x1EF9], [0x1F00,0x1F15], [0x1F18,0x1F1D], [0x1F20,0x1F45], [0x1F48,0x1F4D], + [0x1F50,0x1F57], 0x1F59, 0x1F5B, 0x1F5D, [0x1F5F,0x1F7D], [0x1F80,0x1FB4], [0x1FB6,0x1FBC], 0x1FBE, + [0x1FC2,0x1FC4], [0x1FC6,0x1FCC], [0x1FD0,0x1FD3], [0x1FD6,0x1FDB], [0x1FE0,0x1FEC], [0x1FF2,0x1FF4], + [0x1FF6,0x1FFC], 0x2126, [0x212A,0x212B], 0x212E, [0x2180,0x2182], [0x3041,0x3094], [0x30A1,0x30FA], + [0x3105,0x312C], [0xAC00,0xD7A3]); +my @ideographic_1_0 = ([0x4E00,0x9FA5], 0x3007, [0x3021,0x3029]); +my @combining_char_1_0 = ( + [0x0300,0x0345], [0x0360,0x0361], [0x0483,0x0486], [0x0591,0x05A1], [0x05A3,0x05B9], [0x05BB,0x05BD], + 0x05BF, [0x05C1,0x05C2], 0x05C4, [0x064B,0x0652], 0x0670, [0x06D6,0x06DC], [0x06DD,0x06DF], [0x06E0,0x06E4], + [0x06E7,0x06E8], [0x06EA,0x06ED], [0x0901,0x0903], 0x093C, [0x093E,0x094C], 0x094D, [0x0951,0x0954], + [0x0962,0x0963], [0x0981,0x0983], 0x09BC, 0x09BE, 0x09BF, [0x09C0,0x09C4], [0x09C7,0x09C8], [0x09CB,0x09CD], + 0x09D7, [0x09E2,0x09E3], 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, [0x0A40,0x0A42], [0x0A47,0x0A48], [0x0A4B,0x0A4D], + [0x0A70,0x0A71], [0x0A81,0x0A83], 0x0ABC, [0x0ABE,0x0AC5], [0x0AC7,0x0AC9], [0x0ACB,0x0ACD], 
[0x0B01,0x0B03], + 0x0B3C, [0x0B3E,0x0B43], [0x0B47,0x0B48], [0x0B4B,0x0B4D], [0x0B56,0x0B57], [0x0B82,0x0B83], [0x0BBE,0x0BC2], + [0x0BC6,0x0BC8], [0x0BCA,0x0BCD], 0x0BD7, [0x0C01,0x0C03], [0x0C3E,0x0C44], [0x0C46,0x0C48], [0x0C4A,0x0C4D], + [0x0C55,0x0C56], [0x0C82,0x0C83], [0x0CBE,0x0CC4], [0x0CC6,0x0CC8], [0x0CCA,0x0CCD], [0x0CD5,0x0CD6], + [0x0D02,0x0D03], [0x0D3E,0x0D43], [0x0D46,0x0D48], [0x0D4A,0x0D4D], 0x0D57, 0x0E31, [0x0E34,0x0E3A], + [0x0E47,0x0E4E], 0x0EB1, [0x0EB4,0x0EB9], [0x0EBB,0x0EBC], [0x0EC8,0x0ECD], [0x0F18,0x0F19], 0x0F35, + 0x0F37, 0x0F39, 0x0F3E, 0x0F3F, [0x0F71,0x0F84], [0x0F86,0x0F8B], [0x0F90,0x0F95], 0x0F97, [0x0F99,0x0FAD], + [0x0FB1,0x0FB7], 0x0FB9, [0x20D0,0x20DC], 0x20E1, [0x302A,0x302F], 0x3099, 0x309A); +my @digit_1_0 = ( + [0x0030,0x0039], [0x0660,0x0669], [0x06F0,0x06F9], [0x0966,0x096F], [0x09E6,0x09EF], [0x0A66,0x0A6F], + [0x0AE6,0x0AEF], [0x0B66,0x0B6F], [0x0BE7,0x0BEF], [0x0C66,0x0C6F], [0x0CE6,0x0CEF], [0x0D66,0x0D6F], + [0x0E50,0x0E59], [0x0ED0,0x0ED9], [0x0F20,0x0F29]); +my @extender_1_0 = ( + 0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005, [0x3031,0x3035], [0x309D,0x309E], [0x30FC,0x30FE]); +my @sname_1_1 = ( + "[:A-Z_a-z]", [0xC0,0xD6], [0xD8,0xF6], [0xF8,0x2FF], [0x370,0x37D], [0x37F,0x1FFF], + [0x200C,0x200D], [0x2070,0x218F], [0x2C00,0x2FEF], [0x3001,0xD7FF], [0xF900,0xFDCF], [0xFDF0,0xFFFD], [0x10000,0xEFFFF]); + +set("WHITE", @white); +set("NEW_LINE_1_0", 0xA, 0xD); +set("NEW_LINE_1_1", 0xA, 0xD, 0x85, 0x2028); +set("DIGIT", "[0-9]"); +set("XDIGIT", "[0-9a-fA-F]"); +set("VALID_1_0", @white, [0x20,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); +set("VALID_1_1", [0x1,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); +set("UNRESTRICTED_1_1", @white, [0x20,0x7E], 0x85, [0xA0,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); +set("PUBID", 0x20, 0xD, 0xA, "[-a-zA-Z0-9'()+,./:=?:!*#@\$_%]"); +set("ENC_SNAME", "[a-zA-Z]"); +set("ENC_NAME", "[-a-zA-Z0-9._]"); +set("SNAME_1_0", "[_:]", @base_char_1_0, 
# Assign a small class number to every distinct category mask.
# The masks of @cat are visited first, then those of @lcat; each mask not
# yet present in %cls gets the number of classes known so far.
sub find_cls {
  foreach my $mask (@cat, @lcat) {
    next if defined($cls{$mask});
    $cls{$mask} = scalar keys(%cls);
  }
}
join(",\n ", @x); + if (!defined($hash{$sub})) { + $hash{$sub} = 256 * scalar @tab; + push @tab, $sub; + } + printf C "0x%x", $hash{$sub}; + print C ((~$t & 15) ? "," : ($t < 255) ? ",\n " : "\n};\n\n"); + } + + print C "const byte xml_char_tab1[] = {\n"; + print C join(",\n\n", @tab); + print C "\n};\n\n"; + + my @l = (); + for (my $i=0; $i<0x11; $i++) { + push @l, sprintf("%d", $cls{$lcat[$i]}); + } + print C "const byte xml_char_tab3[] = {" . join(",", @l) . "};\n"; +} diff --git a/xml/xml-test.c b/xml/xml-test.c new file mode 100644 index 00000000..d35aaece --- /dev/null +++ b/xml/xml-test.c @@ -0,0 +1,365 @@ +/* + * Sherlock Library -- A simple XML parser + * + * (c) 2007--2008 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +enum { + WANT_FIRST = 0x100, + WANT_HIDE_ERRORS, + WANT_IGNORE_COMMENTS, + WANT_IGNORE_PIS, + WANT_REPORT_BLOCKS, + WANT_REPORT_IGNORABLE, + WANT_FILE_ENTITIES, +}; + +static char *shortopts = "spdt" CF_SHORT_OPTS; +static struct option longopts[] = { + CF_LONG_OPTS + { "sax", 0, 0, 's' }, + { "pull", 0, 0, 'p' }, + { "dom", 0, 0, 't' }, + { "dtd", 0, 0, 'd' }, + { "hide-errors", 0, 0, WANT_HIDE_ERRORS }, + { "ignore-comments", 0, 0, WANT_IGNORE_COMMENTS }, + { "ignore-pis", 0, 0, WANT_IGNORE_PIS }, + { "report-blocks", 0, 0, WANT_REPORT_BLOCKS }, + { "report-ignorable", 0, 0, WANT_REPORT_IGNORABLE }, + { "file-entities", 0, 0, WANT_FILE_ENTITIES }, + { NULL, 0, 0, 0 } +}; + +static void NONRET +usage(void) +{ + fputs("\ +Usage: xml-test [options] < input.xml\n\ +\n\ +Options:\n" +CF_USAGE +"\ +-p, --pull Test PULL interface\n\ +-s, --sax Test SAX interface\n\ +-t, --dom Test DOM interface\n\ +-d, --dtd Enable parsing of DTD\n\ + --hide-errors Hide warnings and error messages\n\ + --ignore-comments Ignore comments\n\ + --ignore-pis Ignore processing 
instructions\n\ + --report-blocks Report blocks or characters and CDATA sections\n\ + --report-ignorable Report ignorable whitespace\n\ + --file-entities Resolve file external entities (not fully normative)\n\ +\n", stderr); + exit(1); +} + +static uns want_sax; +static uns want_pull; +static uns want_dom; +static uns want_parse_dtd; +static uns want_hide_errors; +static uns want_ignore_comments; +static uns want_ignore_pis; +static uns want_report_blocks; +static uns want_report_ignorable; +static uns want_file_entities; + +static struct fastbuf *out; + +static char * +node_type(struct xml_node *node) +{ + switch (node->type) + { + case XML_NODE_ELEM: return "element"; + case XML_NODE_COMMENT: return "comment"; + case XML_NODE_PI: return "pi"; + case XML_NODE_CHARS: return "chars"; + default: return "unknown"; + } +} + +static void +show_node(struct xml_node *node) +{ + switch (node->type) + { + case XML_NODE_ELEM: + bprintf(out, " <%s>", node->name); + XML_ATTR_FOR_EACH(a, node) + bprintf(out, " %s='%s'", a->name, a->val); + bputc(out, '\n'); + break; + case XML_NODE_COMMENT: + bprintf(out, " text='%s'\n", node->text); + break; + case XML_NODE_PI: + bprintf(out, " target=%s text='%s'\n", node->name, node->text); + break; + case XML_NODE_CHARS: + bprintf(out, " text='%s'\n", node->text); + break; + default: + bputc(out, '\n'); + } +} + +static void +show_tree(struct xml_node *node, uns level) +{ + if (!node) + return; + bputs(out, "DOM: "); + for (uns i = 0; i < level; i++) + bputs(out, " "); + bputs(out, node_type(node)); + show_node(node); + if (node->type == XML_NODE_ELEM) + XML_NODE_FOR_EACH(son, node) + show_tree(son, level + 1); +} + +static void +h_error(struct xml_context *ctx) +{ + bprintf(out, "SAX: %s at %u: %s\n", (ctx->err_code < XML_ERR_ERROR) ? 
"warn" : "error", xml_row(ctx), ctx->err_msg); +} + +static void +h_document_start(struct xml_context *ctx UNUSED) +{ + bputs(out, "SAX: document_start\n"); +} + +static void +h_document_end(struct xml_context *ctx UNUSED) +{ + bputs(out, "SAX: document_end\n"); +} + +static void +h_xml_decl(struct xml_context *ctx) +{ + bprintf(out, "SAX: xml_decl version=%s standalone=%d fb_encoding=%s\n", ctx->version_str, ctx->standalone, ctx->src->fb_encoding); +} + +static void +h_doctype_decl(struct xml_context *ctx) +{ + bprintf(out, "SAX: doctype_decl type=%s public='%s' system='%s' extsub=%d intsub=%d\n", + ctx->doctype, ctx->public_id ? : "", ctx->system_id ? : "", + !!(ctx->flags & XML_HAS_EXTERNAL_SUBSET), !!(ctx->flags & XML_HAS_INTERNAL_SUBSET)); +} + +static void +h_comment(struct xml_context *ctx) +{ + bputs(out, "SAX: comment"); + show_node(ctx->node); +} + +static void +h_pi(struct xml_context *ctx) +{ + bputs(out, "SAX: pi"); + show_node(ctx->node); +} + +static void +h_stag(struct xml_context *ctx) +{ + bputs(out, "SAX: stag"); + show_node(ctx->node); +} + +static void +h_etag(struct xml_context *ctx) +{ + bprintf(out, "SAX: etag \n", ctx->node->name); +} + +static void +h_chars(struct xml_context *ctx) +{ + bputs(out, "SAX: chars"); + show_node(ctx->node); +} + +static void +h_block(struct xml_context *ctx UNUSED, char *text, uns len UNUSED) +{ + bprintf(out, "SAX: block text='%s'\n", text); +} + +static void +h_cdata(struct xml_context *ctx UNUSED, char *text, uns len UNUSED) +{ + bprintf(out, "SAX: cdata text='%s'\n", text); +} + +static void +h_ignorable(struct xml_context *ctx UNUSED, char *text, uns len UNUSED) +{ + bprintf(out, "SAX: ignorable text='%s'\n", text); +} + +static void +h_dtd_start(struct xml_context *ctx UNUSED) +{ + bputs(out, "SAX: dtd_start\n"); +} + +static void +h_dtd_end(struct xml_context *ctx UNUSED) +{ + bputs(out, "SAX: dtd_end\n"); +} + +static void +h_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *e) +{ + 
xml_push_fastbuf(ctx, bopen(e->system_id, O_RDONLY, 4096)); +} + +int +main(int argc, char **argv) +{ + int opt; + cf_def_file = NULL; + log_init(argv[0]); + while ((opt = cf_getopt(argc, argv, shortopts, longopts, NULL)) >= 0) + switch (opt) + { + case 's': + want_sax++; + break; + case 'p': + want_pull++; + break; + case 't': + want_dom++; + break; + case 'd': + want_parse_dtd++; + break; + case WANT_HIDE_ERRORS: + want_hide_errors++; + break; + case WANT_IGNORE_COMMENTS: + want_ignore_comments++; + break; + case WANT_IGNORE_PIS: + want_ignore_pis++; + break; + case WANT_REPORT_BLOCKS: + want_report_blocks++; + break; + case WANT_REPORT_IGNORABLE: + want_report_ignorable++; + break; + case WANT_FILE_ENTITIES: + want_file_entities++; + break; + default: + usage(); + } + if (optind != argc) + usage(); + + out = bfdopen_shared(1, 4096); + struct xml_context ctx; + xml_init(&ctx); + if (!want_hide_errors) + ctx.h_warn = ctx.h_error = ctx.h_fatal = h_error; + if (want_sax) + { + ctx.h_document_start = h_document_start; + ctx.h_document_end = h_document_end; + ctx.h_xml_decl = h_xml_decl; + ctx.h_doctype_decl = h_doctype_decl; + ctx.h_comment = h_comment; + ctx.h_pi = h_pi; + ctx.h_stag = h_stag; + ctx.h_etag = h_etag; + ctx.h_chars = h_chars; + if (want_report_blocks) + { + ctx.h_block = h_block; + ctx.h_cdata = h_cdata; + } + if (want_report_ignorable) + ctx.h_ignorable = h_ignorable; + ctx.h_dtd_start = h_dtd_start; + ctx.h_dtd_end = h_dtd_end; + } + if (want_dom) + ctx.flags |= XML_ALLOC_ALL; + if (want_parse_dtd) + ctx.flags |= XML_PARSE_DTD; + if (want_ignore_comments) + ctx.flags &= ~(XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS); + if (want_ignore_pis) + ctx.flags &= ~(XML_REPORT_PIS | XML_ALLOC_PIS); + if (want_file_entities) + ctx.h_resolve_entity = h_resolve_entity; + xml_push_fastbuf(&ctx, bfdopen_shared(0, 4096)); + bputs(out, "PULL: start\n"); + if (want_pull) + { + ctx.pull = XML_PULL_CHARS | XML_PULL_STAG | XML_PULL_ETAG | XML_PULL_COMMENT | XML_PULL_PI; + 
uns state; + while (state = xml_next(&ctx)) + switch (state) + { + case XML_STATE_CHARS: + bputs(out, "PULL: chars"); + show_node(ctx.node); + break; + case XML_STATE_STAG: + bputs(out, "PULL: stag"); + show_node(ctx.node); + break; + case XML_STATE_ETAG: + bprintf(out, "PULL: etag \n", ctx.node->name); + break; + case XML_STATE_COMMENT: + bputs(out, "PULL: comment"); + show_node(ctx.node); + break; + case XML_STATE_PI: + bputs(out, "PULL: pi"); + show_node(ctx.node); + break; + default: + bputs(out, "PULL: unknown\n"); + break; + } + } + else + xml_parse(&ctx); + if (ctx.err_code) + bprintf(out, "PULL: fatal error at %u: %s\n", xml_row(&ctx), ctx.err_msg); + else + { + bputs(out, "PULL: eof\n"); + if (want_dom) + show_tree(ctx.dom, 0); + } + + xml_cleanup(&ctx); + bclose(out); + return 0; +} diff --git a/xml/xml-test.t b/xml/xml-test.t new file mode 100644 index 00000000..8d0f9bb1 --- /dev/null +++ b/xml/xml-test.t @@ -0,0 +1,58 @@ +# Tests for the XML parser +# (c) 2008 Pavel Charvat + +Run: ../obj/xml/xml-test +In: + +Out: PULL: start + PULL: eof + +Run: ../obj/xml/xml-test -s +In: + text1&amp;<text2 +Out: PULL: start + SAX: document_start + SAX: xml_decl version=1.0 standalone=0 fb_encoding=ISO-8859-1 + SAX: stag + SAX: stag a1='val1' a2='val2' + SAX: chars text='text1&<' + SAX: etag + SAX: chars text='text2' + SAX: etag + SAX: document_end + PULL: eof + +Run: ../obj/xml/xml-test -sptd +In: + + "> + %pe1; + + + ]> + &e1;&e2; +Out: PULL: start + SAX: document_start + SAX: xml_decl version=1.0 standalone=0 fb_encoding=UTF-8 + SAX: doctype_decl type=root public='' system='' extsub=0 intsub=1 + SAX: dtd_start + SAX: dtd_end + SAX: stag + PULL: stag + SAX: chars text='text' + PULL: chars text='text' + SAX: stag + PULL: stag + SAX: chars text='' + PULL: chars text='' + PULL: etag + SAX: etag + PULL: etag + SAX: etag + SAX: document_end + PULL: eof + DOM: element + DOM: chars text='text' + DOM: element + DOM: chars text='' diff --git a/xml/xml.h b/xml/xml.h new file 
mode 100644 index 00000000..f17b1d79 --- /dev/null +++ b/xml/xml.h @@ -0,0 +1,272 @@ +/* + * Sherlock Library -- A simple XML parser + * + * (c) 2007--2008 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#ifndef _SHERLOCK_XML_XML_H +#define _SHERLOCK_XML_XML_H + +#include +#include +#include +#include + +struct xml_context; +struct xml_dtd_entity; + +enum xml_error { + XML_ERR_OK = 0, + XML_ERR_WARN = 1000, /* Warning */ + XML_ERR_ERROR = 2000, /* Recoverable error */ + XML_ERR_FATAL = 3000, /* Unrecoverable error */ + XML_ERR_EOF, +}; + +enum xml_state { + XML_STATE_EOF, /* EOF or a fatal error */ + XML_STATE_START, /* Initial state */ + XML_STATE_XML_DECL, /* XML_PULL_XML_DECL */ + XML_STATE_DOCTYPE_DECL, /* XML_PULL_DOCTYPE_DECL */ + XML_STATE_CHARS, /* XML_PULL_CHARS */ + XML_STATE_STAG, /* XML_PULL_STAG */ + XML_STATE_ETAG, /* XML_PULL_ETAG */ + XML_STATE_COMMENT, /* XML_PULL_COMMENT */ + XML_STATE_PI, /* XML_PULL_PI */ + + /* Internal states */ + XML_STATE_CHARS_BEFORE_STAG, + XML_STATE_CHARS_BEFORE_ETAG, + XML_STATE_CHARS_BEFORE_CDATA, + XML_STATE_CHARS_BEFORE_COMMENT, + XML_STATE_CHARS_BEFORE_PI, + XML_STATE_PROLOG_COMMENT, + XML_STATE_PROLOG_PI, + XML_STATE_EPILOG_COMMENT, + XML_STATE_EPILOG_PI, +}; + +enum xml_pull { + XML_PULL_XML_DECL = 0x00000001, /* Stop after the XML declaration */ + XML_PULL_DOCTYPE_DECL = 0x00000002, /* Stop in the doctype declaration (before optional internal subset) */ + XML_PULL_CHARS = 0x00000004, + XML_PULL_STAG = 0x00000008, + XML_PULL_ETAG = 0x00000010, + XML_PULL_COMMENT = 0x00000020, + XML_PULL_PI = 0x00000040, + XML_PULL_ALL = 0xffffffff, +}; + +enum xml_flags { + /* Enable reporting of various events via SAX and/or PULL interface */ + XML_REPORT_COMMENTS = 0x00000001, /* Report comments */ + XML_REPORT_PIS = 0x00000002, /* Report processing instructions */ + XML_REPORT_CHARS = 0x00000004, /* Report characters */ + 
XML_REPORT_TAGS = 0x00000008, /* Report element starts/ends */ + XML_REPORT_MISC = XML_REPORT_COMMENTS | XML_REPORT_PIS, + XML_REPORT_ALL = XML_REPORT_MISC | XML_REPORT_CHARS | XML_REPORT_TAGS, + + /* Enable construction of DOM for these types */ + XML_ALLOC_COMMENTS = 0x00000010, /* Create comment nodes */ + XML_ALLOC_PIS = 0x00000020, /* Create processing instruction nodes */ + XML_ALLOC_CHARS = 0x00000040, /* Create character nodes */ + XML_ALLOC_TAGS = 0x00000080, /* Create element nodes */ + XML_ALLOC_MISC = XML_ALLOC_COMMENTS | XML_ALLOC_PIS, + XML_ALLOC_ALL = XML_ALLOC_MISC | XML_ALLOC_CHARS | XML_ALLOC_TAGS, + + /* Other parameters */ + XML_VALIDATING = 0x00000100, /* Validate everything (not fully implemented!) */ + XML_PARSE_DTD = 0x00000200, /* Enable parsing of DTD */ + XML_NO_CHARS = 0x00000400, /* The current element must not contain character data (filled automatically if using DTD) */ + XML_ALLOC_DEFAULT_ATTRS = 0x00000800, /* Allocate default attribute values so they can be found by XML_ATTR_FOR_EACH */ + + /* Internals, do not change! 
*/ + XML_EMPTY_ELEM_TAG = 0x00010000, /* The current element matches EmptyElemTag */ + XML_VERSION_1_1 = 0x00020000, /* XML version is 1.1, otherwise 1.0 */ + XML_HAS_EXTERNAL_SUBSET = 0x00040000, /* The document contains a reference to an external DTD subset */ + XML_HAS_INTERNAL_SUBSET = 0x00080000, /* The document contains an internal subset */ + XML_HAS_DTD = XML_HAS_EXTERNAL_SUBSET | XML_HAS_INTERNAL_SUBSET, + XML_SRC_EOF = 0x00100000, /* EOF reached */ + XML_SRC_EXPECTED_DECL = 0x00200000, /* Just before optional or required XMLDecl/TextDecl */ + XML_SRC_DOCUMENT = 0x00400000, /* The document entity */ + XML_SRC_EXTERNAL = 0x00800000, /* An external entity */ +}; + +enum xml_node_type { + XML_NODE_ELEM, + XML_NODE_COMMENT, + XML_NODE_CHARS, + XML_NODE_PI, +}; + +#define XML_NODE_FOR_EACH(var, node) CLIST_FOR_EACH(struct xml_node *, var, (node)->sons) +#define XML_ATTR_FOR_EACH(var, node) SLIST_FOR_EACH(struct xml_attr *, var, (node)->attrs) + +struct xml_node { + cnode n; /* Node for list of parent's sons */ + uns type; /* XML_NODE_x */ + struct xml_node *parent; /* Parent node */ + char *name; /* Element name / PI target */ + clist sons; /* Children nodes */ + union { + struct { + char *text; /* PI text / Comment / CDATA */ + uns len; /* Text length in bytes */ + }; + struct { + struct xml_dtd_elem *dtd; /* Element DTD */ + slist attrs; /* Linked list of element attributes */ + }; + }; + void *user; /* User-defined (initialized to NULL) */ +}; + +struct xml_attr { + snode n; /* Node for elem->attrs */ + struct xml_node *elem; /* Parent element */ + struct xml_dtd_attr *dtd; /* Attribute DTD */ + char *name; /* Attribute name */ + char *val; /* Attribute value */ + void *user; /* User-defined (initialized to NULL) */ +}; + +#define XML_BUF_SIZE 32 /* At least 8 -- hardcoded */ + +struct xml_source { + struct xml_source *next; /* Linked list of pending fastbufs (xml_context.sources) */ + struct fastbuf *fb; /* Source fastbuf */ + struct fastbuf *wrapped_fb; /* Original 
wrapped fastbuf (needed for cleanup) */ + struct fastbuf wrap_fb; /* Fbmem wrapper */ + u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */ + u32 *bptr, *bstop; /* Current state of the buffer */ + uns row; /* File position */ + char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */ + char *fb_encoding; /* Encoding of the source fastbuf */ + char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */ + uns refill_cat1; /* Character categories that should be directly passed to the buffer */ + uns refill_cat2; /* Character categories that should be processed as newlines (possibly in some built-in + sequences) */ + void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */ + unsigned short *refill_in_to_x; /* Libucw-charset input table */ + uns saved_depth; /* Saved ctx->depth */ + uns pending_0xd; /* The last read character is 0xD */ +}; + +struct xml_context { + /* Error handling */ + char *err_msg; /* Last error message */ + enum xml_error err_code; /* Last error code */ + void *throw_buf; /* Where to jump on error */ + void (*h_warn)(struct xml_context *ctx); /* Warning callback */ + void (*h_error)(struct xml_context *ctx); /* Recoverable error callback */ + void (*h_fatal)(struct xml_context *ctx); /* Unrecoverable error callback */ + + /* Memory management */ + struct mempool *pool; /* DOM pool */ + struct mempool *stack; /* Stack pool (freed as soon as possible) */ + struct xml_stack *stack_list; /* See xml_push(), xml_pop() */ + uns flags; /* XML_FLAG_x (restored on xml_pop()) */ + uns depth; /* Nesting level (for checking of valid source nesting -> valid pushes/pops on memory pools) */ + struct fastbuf chars; /* Character data / attribute value */ + struct mempool_state chars_state; /* Mempool state before the current character block has started */ + char *chars_trivial; /* If not empty, it will be appended to chars */ 
+ void *tab_attrs; /* Hash table of element attributes */ + + /* Input */ + struct xml_source *src; /* Current source */ + u32 *bptr, *bstop; /* Buffer with preprocessed characters (validated UCS-4 + category flags) */ + uns cat_chars; /* Unicode range of supported characters (cdata, attribute values, ...) */ + uns cat_unrestricted; /* Unrestricted characters (may appear in document/external entities) */ + uns cat_new_line; /* New line characters */ + uns cat_name; /* Characters that may appear in names */ + uns cat_sname; /* Characters that may begin a name */ + + /* SAX-like interface */ + void (*h_document_start)(struct xml_context *ctx); /* Called before entering prolog */ + void (*h_document_end)(struct xml_context *ctx); /* Called after leaving epilog */ + void (*h_xml_decl)(struct xml_context *ctx); /* Called after the XML declaration */ + void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration (before optional internal subset) */ + void (*h_comment)(struct xml_context *ctx); /* Called after a comment (only with XML_REPORT_COMMENTS) */ + void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction (only with XML_REPORT_PIS) */ + void (*h_stag)(struct xml_context *ctx); /* Called after STag or EmptyElemTag (only with XML_REPORT_TAGS) */ + void (*h_etag)(struct xml_context *ctx); /* Called before ETag or after EmptyElemTag (only with XML_REPORT_TAGS) */ + void (*h_chars)(struct xml_context *ctx); /* Called after some characters (only with XML_REPORT_CHARS) */ + void (*h_block)(struct xml_context *ctx, char *text, uns len); /* Called for each continuous block of characters not reported by h_cdata() (only with XML_REPORT_CHARS) */ + void (*h_cdata)(struct xml_context *ctx, char *text, uns len); /* Called for each CDATA section (only with XML_REPORT_CHARS) */ + void (*h_ignorable)(struct xml_context *ctx, char *text, uns len); /* Called for ignorable whitespace (content in tags without #PCDATA) */ + void 
(*h_dtd_start)(struct xml_context *ctx); /* Called just after the DTD structure is initialized */ + void (*h_dtd_end)(struct xml_context *ctx); /* Called after the DTD subsets */ + struct xml_dtd_entity *(*h_find_entity)(struct xml_context *ctx, char *name); /* Called when needed to resolve a general entity */ + void (*h_resolve_entity)(struct xml_context *ctx, struct xml_dtd_entity *ent); /* User should push source fastbuf for a parsed external entity (either general or parameter) */ + + /* DOM */ + struct xml_node *dom; /* DOM root */ + struct xml_node *node; /* Current DOM node */ + + char *version_str; + uns standalone; + char *doctype; /* The document type (or NULL if unknown) */ + char *system_id; /* DTD external id */ + char *public_id; /* DTD public id */ + struct xml_dtd *dtd; /* The DTD structure (or NULL) */ + uns state; /* Current state for the PULL interface (XML_STATE_x) */ + uns pull; /* Parameters for the PULL interface (XML_PULL_x) */ +}; + +/* Initialize XML context */ +void xml_init(struct xml_context *ctx); + +/* Clean up all internal structures */ +void xml_cleanup(struct xml_context *ctx); + +/* Reuse XML context, equivalent to xml_cleanup() and xml_init() */ +void xml_reset(struct xml_context *ctx); + +/* Add XML source (fastbuf will be automatically closed) */ +struct xml_source *xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb); + +/* Parse without the PULL interface, return XML_ERR_x code (zero on success) */ +uns xml_parse(struct xml_context *ctx); + +/* Parse with the PULL interface, return XML_STATE_x (zero on EOF or fatal error) */ +uns xml_next(struct xml_context *ctx); + +/* Equivalent to xml_next, but with temporarily changed ctx->pull value */ +uns xml_next_state(struct xml_context *ctx, uns pull); + +/* May be called on XML_STATE_STAG to skip its content; can return XML_STATE_ETAG or XML_STATE_EOF on fatal error */ +uns xml_skip_element(struct xml_context *ctx); + +/* Returns the current row number in the document 
entity */ +uns xml_row(struct xml_context *ctx); + +/* Finds a given attribute in an XML_NODE_ELEM node */ +struct xml_attr *xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name); + +/* Similar to xml_attr_find, but it also deals with default values */ +char *xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name); + +/* The default value of h_find_entity(), knows <, >, &, ' and " */ +struct xml_dtd_entity *xml_def_find_entity(struct xml_context *ctx, char *name); + +/* The default value of h_resolve_entity(), throws an error */ +void xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent); + +/* Remove leading/trailing spaces and replace sequences of spaces with a single space character (non-CDATA attribute normalization) */ +uns xml_normalize_white(struct xml_context *ctx, char *value); + +/* Merge character contents of a given element to a single string (not recursive) */ +char *xml_merge_chars(struct xml_context *ctx, struct xml_node *node, struct mempool *pool); + +/* Merge character contents of a given subtree to a single string */ +char *xml_merge_dom_chars(struct xml_context *ctx, struct xml_node *node, struct mempool *pool); + +/* Public part of error handling */ +void xml_warn(struct xml_context *ctx, const char *format, ...); +void xml_error(struct xml_context *ctx, const char *format, ...); +void NONRET xml_fatal(struct xml_context *ctx, const char *format, ...); + +#endif