From: Martin Mares Date: Thu, 12 Feb 2015 18:44:04 +0000 (+0100) Subject: XML: Directory renamed to ucw-xml X-Git-Tag: v6.3~5^2 X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;h=254d3b8291246adb2dbcae5c13bed3928f71c986;p=libucw.git XML: Directory renamed to ucw-xml Instead of , we now include , which is less likely to clash with other libraries. --- diff --git a/Makefile b/Makefile index 68ea310d..51ffa3ed 100644 --- a/Makefile +++ b/Makefile @@ -51,8 +51,8 @@ include $(s)/images/Makefile endif ifdef CONFIG_XML -LIBXML=$(o)/xml/libucw-xml.pc -include $(s)/xml/Makefile +LIBXML=$(o)/ucw-xml/libucw-xml.pc +include $(s)/ucw-xml/Makefile endif # Build documentation by default? diff --git a/maint/abi-map-symbols b/maint/abi-map-symbols index 0a5e2f0f..298ad0f4 100755 --- a/maint/abi-map-symbols +++ b/maint/abi-map-symbols @@ -79,7 +79,7 @@ my %blacklist = map { $_ => 1 } qw( images/scale-gen.h ); -for my $f (, , , , ) { +for my $f (, , , , ) { next if $blacklist{$f}; parse($f); } diff --git a/maint/libucw.abi b/maint/libucw.abi index 90ab410e..1b6376e9 100644 --- a/maint/libucw.abi +++ b/maint/libucw.abi @@ -803,7 +803,7 @@ image_sig_segmentation image_sig_detect_textured image_signatures_dist image_signatures_dist_explain -# xml/dtd.h +# ucw-xml/dtd.h xml_dtd_find_notn xml_dtd_find_entity xml_dtd_find_elem @@ -811,7 +811,7 @@ xml_dtd_init xml_dtd_cleanup xml_dtd_finish xml_dtd_find_attr -# xml/internals.h +# ucw-xml/internals.h xml_throw xml_hash_new xml_spout_chars @@ -846,7 +846,7 @@ xml_skip_pi xml_attrs_table_init xml_attrs_table_cleanup xml_validate_attr -# xml/xml.h +# ucw-xml/xml.h xml_init xml_cleanup xml_reset diff --git a/ucw-xml/Makefile b/ucw-xml/Makefile new file mode 100644 index 00000000..28083ccd --- /dev/null +++ b/ucw-xml/Makefile @@ -0,0 +1,58 @@ +# Makefile for the XML parser +# (c) 2007 Pavel Charvat + +DIRS+=ucw-xml +PROGS+=$(o)/ucw-xml/xml-test + +LIBXML_MODS=common source parse dtd +LIBXML_MOD_PATHS=$(addprefix 
$(o)/ucw-xml/,$(LIBXML_MODS)) +LIBXML_INCLUDES=xml.h dtd.h +LIBXML_DEPS=$(LIBUCW) $(LIBCHARSET) + +$(o)/ucw-xml/libucw-xml$(LV).a: $(addsuffix .o,$(LIBXML_MOD_PATHS)) +$(o)/ucw-xml/libucw-xml$(LV).so: $(addsuffix .oo,$(LIBXML_MOD_PATHS)) $(LIBXML_DEPS) +$(o)/ucw-xml/libucw-xml$(LV).so: SONAME_SUFFIX=.0 +$(o)/ucw-xml/libucw-xml.pc: $(LIBXML_DEPS) + +ifdef CONFIG_INSTALL_API +$(o)/ucw-xml/libucw-xml.pc: $(addprefix $(o)/ucw-xml/libucw-xml$(LV),.a .so) +endif + +$(o)/ucw-xml/common.o: $(o)/ucw-xml/unicat.h +$(o)/ucw-xml/common.oo: $(o)/ucw-xml/unicat.h +$(o)/ucw-xml/source.o: $(o)/ucw-xml/unicat.h +$(o)/ucw-xml/source.oo: $(o)/ucw-xml/unicat.h +$(o)/ucw-xml/dtd.o: $(o)/ucw-xml/unicat.h +$(o)/ucw-xml/dtd.oo: $(o)/ucw-xml/unicat.h +$(o)/ucw-xml/parse.o: $(o)/ucw-xml/unicat.h +$(o)/ucw-xml/parse.oo: $(o)/ucw-xml/unicat.h +$(o)/ucw-xml/unicat.h: $(s)/ucw-xml/unicat.pl + $(M)GEN $(addprefix $(o)/ucw-xml/unicat,.h .c) + $(Q)$< $(addprefix $(o)/ucw-xml/unicat,.h .c) + $(Q)touch $@ + +TESTS+=$(o)/ucw-xml/xml-test.test +$(o)/ucw-xml/xml-test: $(o)/ucw-xml/xml-test.o $(LIBXML) $(LIBCHARSET) $(LIBUCW) +$(o)/ucw-xml/xml-test.test: $(o)/ucw-xml/xml-test + +API_LIBS+=libucw-xml +API_INCLUDES+=$(o)/ucw-xml/.include-stamp +$(o)/ucw-xml/.include-stamp: $(addprefix $(s)/ucw-xml/,$(LIBXML_INCLUDES)) +$(o)/ucw-xml/.include-stamp: IDST=ucw-xml +run/lib/pkgconfig/libucw-xml.pc: $(o)/ucw-xml/libucw-xml.pc + +INSTALL_TARGETS+=install-libucw-xml-lib +install-libucw-xml-lib: + install -d -m 755 $(DESTDIR)$(INSTALL_LIB_DIR) + install -m 644 run/lib/libucw-xml$(LV).so.0 $(DESTDIR)$(INSTALL_LIB_DIR)/libucw-xml$(LV).so.0.0 + ln -sf libucw-xml$(LV).so.0.0 $(DESTDIR)$(INSTALL_LIB_DIR)/libucw-xml$(LV).so.0 +.PHONY: install-libucw-xml-lib + +INSTALL_TARGETS+=install-libucw-xml-api +install-libucw-xml-api: + install -d -m 755 $(DESTDIR)$(INSTALL_INCLUDE_DIR)/ucw-xml $(DESTDIR)$(INSTALL_LIB_DIR) $(DESTDIR)$(INSTALL_PKGCONFIG_DIR) + install -m 644 run/lib/pkgconfig/libucw-xml.pc
$(DESTDIR)$(INSTALL_PKGCONFIG_DIR) + install -m 644 $(addprefix run/include/ucw-xml/,$(LIBXML_INCLUDES)) $(DESTDIR)$(INSTALL_INCLUDE_DIR)/ucw-xml + ln -sf libucw-xml$(LV).so.0.0 $(DESTDIR)$(INSTALL_LIB_DIR)/libucw-xml$(LV).so + install -m 644 run/lib/libucw-xml$(LV).a $(DESTDIR)$(INSTALL_LIB_DIR) +.PHONY: install-libucw-xml-api diff --git a/ucw-xml/TODO b/ucw-xml/TODO new file mode 100644 index 00000000..b8dbc29c --- /dev/null +++ b/ucw-xml/TODO @@ -0,0 +1,15 @@ +Non-normative / not-implemented: +-- introduce numeric error codes +-- cycle detection in internal entities (and possibly external?) +-- conditional sections in DTD +-- validation of elements (regular expressions, non-cdata) +-- validation of attributes (unfinished) +-- notations +-- URI normalization +-- support for xml:space +-- support for xml:lang +-- full support for standalone documents +-- Unicode normalization + +Optimizations: +-- detect definitions of trivial entities diff --git a/ucw-xml/common.c b/ucw-xml/common.c new file mode 100644 index 00000000..8b37d597 --- /dev/null +++ b/ucw-xml/common.c @@ -0,0 +1,140 @@ +/* + * UCW Library -- A simple XML parser + * + * (c) 2007 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#undef LOCAL_DEBUG + +#include +#include +#include +#include +#include +#include + +#include + +/*** Error handling ***/ + +void NONRET +xml_throw(struct xml_context *ctx) +{ + ASSERT(ctx->err_code && ctx->throw_buf); + longjmp(*(jmp_buf *)ctx->throw_buf, ctx->err_code); +} + +void +xml_warn(struct xml_context *ctx, const char *format, ...) +{ + if (ctx->h_warn) + { + va_list args; + va_start(args, format); + ctx->err_msg = stk_vprintf(format, args); + ctx->err_code = XML_ERR_WARN; + va_end(args); + ctx->h_warn(ctx); + ctx->err_msg = NULL; + ctx->err_code = XML_ERR_OK; + } +} + +void +xml_error(struct xml_context *ctx, const char *format, ...)
+{ + if (ctx->h_error) + { + va_list args; + va_start(args, format); + ctx->err_msg = stk_vprintf(format, args); + ctx->err_code = XML_ERR_ERROR; + va_end(args); + ctx->h_error(ctx); + ctx->err_msg = NULL; + ctx->err_code = XML_ERR_OK; + } +} + +void NONRET +xml_fatal(struct xml_context *ctx, const char *format, ...) +{ + va_list args; + va_start(args, format); + ctx->err_msg = mp_vprintf(ctx->stack, format, args); + ctx->err_code = XML_ERR_FATAL; + ctx->state = XML_STATE_EOF; + va_end(args); + if (ctx->h_fatal) + ctx->h_fatal(ctx); + xml_throw(ctx); +} + +/*** Memory management ***/ + +void * +xml_hash_new(struct mempool *pool, uint size) +{ + void *tab = mp_alloc_zero(pool, size + XML_HASH_HDR_SIZE); + *(void **)tab = pool; + return tab + XML_HASH_HDR_SIZE; +} + +/*** Initialization ***/ + +static struct xml_context xml_defaults = { + .flags = XML_SRC_EOF | XML_REPORT_ALL, + .state = XML_STATE_START, + .h_resolve_entity = xml_def_resolve_entity, + .chars = { + .name = "", + .spout = xml_spout_chars, + .can_overwrite_buffer = 1, + }, +}; + +static void +xml_do_init(struct xml_context *ctx) +{ + xml_attrs_table_init(ctx); +} + +void +xml_init(struct xml_context *ctx) +{ + *ctx = xml_defaults; + ctx->pool = mp_new(65536); + ctx->stack = mp_new(65536); + xml_do_init(ctx); + TRACE(ctx, "init"); +} + +void +xml_cleanup(struct xml_context *ctx) +{ + TRACE(ctx, "cleanup"); + xml_attrs_table_cleanup(ctx); + xml_dtd_cleanup(ctx); + xml_sources_cleanup(ctx); + mp_delete(ctx->pool); + mp_delete(ctx->stack); +} + +void +xml_reset(struct xml_context *ctx) +{ + TRACE(ctx, "reset"); + struct mempool *pool = ctx->pool, *stack = ctx->stack; + xml_attrs_table_cleanup(ctx); + xml_dtd_cleanup(ctx); + xml_sources_cleanup(ctx); + mp_flush(pool); + mp_flush(stack); + *ctx = xml_defaults; + ctx->pool = pool; + ctx->stack = stack; + xml_do_init(ctx); +} diff --git a/ucw-xml/dtd.c b/ucw-xml/dtd.c new file mode 100644 index 00000000..7c06af84 --- /dev/null +++ b/ucw-xml/dtd.c @@ -0,0 
+1,1003 @@ +/* + * UCW Library -- A simple XML parser + * + * (c) 2007--2008 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#undef LOCAL_DEBUG + +#include +#include +#include +#include +#include +#include +#include + +/* Notations */ + +#define HASH_PREFIX(x) xml_dtd_notns_##x +#define HASH_NODE struct xml_dtd_notn +#define HASH_KEY_STRING name +#define HASH_ZERO_FILL +#define HASH_TABLE_DYNAMIC +#define HASH_WANT_LOOKUP +#define HASH_WANT_FIND +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include + +struct xml_dtd_notn * +xml_dtd_find_notn(struct xml_context *ctx, char *name) +{ + struct xml_dtd *dtd = ctx->dtd; + struct xml_dtd_notn *notn = xml_dtd_notns_find(dtd->tab_notns, name); + return !notn ? NULL : (notn->flags & XML_DTD_NOTN_DECLARED) ? notn : NULL; +} + +/* General entities */ + +#define HASH_PREFIX(x) xml_dtd_ents_##x +#define HASH_NODE struct xml_dtd_entity +#define HASH_KEY_STRING name +#define HASH_ZERO_FILL +#define HASH_TABLE_DYNAMIC +#define HASH_WANT_FIND +#define HASH_WANT_LOOKUP +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include + +static struct xml_dtd_entity * +xml_dtd_declare_trivial_entity(struct xml_context *ctx, char *name, char *text) +{ + struct xml_dtd *dtd = ctx->dtd; + struct xml_dtd_entity *ent = xml_dtd_ents_lookup(dtd->tab_ents, name); + if (ent->flags & XML_DTD_ENTITY_DECLARED) + { + xml_warn(ctx, "Entity &%s; already declared", name); + return NULL; + } + slist_add_tail(&dtd->ents, &ent->n); + ent->flags = XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_TRIVIAL; + ent->text = text; + return ent; +} + +static void +xml_dtd_declare_default_entities(struct xml_context *ctx) +{ + xml_dtd_declare_trivial_entity(ctx, "lt", "<"); + xml_dtd_declare_trivial_entity(ctx, "gt", ">"); + xml_dtd_declare_trivial_entity(ctx, "amp", "&"); + xml_dtd_declare_trivial_entity(ctx, "apos", "'"); + 
xml_dtd_declare_trivial_entity(ctx, "quot", "\""); +} + +struct xml_dtd_entity * +xml_def_find_entity(struct xml_context *ctx UNUSED, char *name) +{ +#define ENT(n, t) ent_##n = { .name = #n, .text = t, .flags = XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_TRIVIAL } + static struct xml_dtd_entity ENT(lt, "<"), ENT(gt, ">"), ENT(amp, "&"), ENT(apos, "'"), ENT(quot, "\""); +#undef ENT + switch (name[0]) + { + case 'l': + if (!strcmp(name, "lt")) + return &ent_lt; + break; + case 'g': + if (!strcmp(name, "gt")) + return &ent_gt; + break; + case 'a': + if (!strcmp(name, "amp")) + return &ent_amp; + if (!strcmp(name, "apos")) + return &ent_apos; + break; + case 'q': + if (!strcmp(name, "quot")) + return &ent_quot; + break; + } + return NULL; +} + +struct xml_dtd_entity * +xml_dtd_find_entity(struct xml_context *ctx, char *name) +{ + struct xml_dtd *dtd = ctx->dtd; + if (ctx->h_find_entity) + return ctx->h_find_entity(ctx, name); + else if (dtd) + { + struct xml_dtd_entity *ent = xml_dtd_ents_find(dtd->tab_ents, name); + return !ent ? NULL : (ent->flags & XML_DTD_ENTITY_DECLARED) ? ent : NULL; + } + else + return xml_def_find_entity(ctx, name); +} + +/* Parameter entities */ + +static struct xml_dtd_entity * +xml_dtd_find_pentity(struct xml_context *ctx, char *name) +{ + struct xml_dtd *dtd = ctx->dtd; + struct xml_dtd_entity *ent = xml_dtd_ents_find(dtd->tab_pents, name); + return !ent ? NULL : (ent->flags & XML_DTD_ENTITY_DECLARED) ? 
ent : NULL; +} + +/* Elements */ + +struct xml_dtd_elems_table; + +static void +xml_dtd_elems_init_data(struct xml_dtd_elems_table *tab UNUSED, struct xml_dtd_elem *e) +{ + slist_init(&e->attrs); +} + +#define HASH_PREFIX(x) xml_dtd_elems_##x +#define HASH_NODE struct xml_dtd_elem +#define HASH_KEY_STRING name +#define HASH_TABLE_DYNAMIC +#define HASH_ZERO_FILL +#define HASH_WANT_FIND +#define HASH_WANT_LOOKUP +#define HASH_GIVE_ALLOC +#define HASH_GIVE_INIT_DATA +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include + +struct xml_dtd_elem * +xml_dtd_find_elem(struct xml_context *ctx, char *name) +{ + return ctx->dtd ? xml_dtd_elems_find(ctx->dtd->tab_elems, name) : NULL; +} + +/* Element sons */ + +struct xml_dtd_enodes_table; + +static inline uint +xml_dtd_enodes_hash(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem) +{ + return hash_pointer(parent) ^ hash_pointer(elem); +} + +static inline int +xml_dtd_enodes_eq(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *parent1, struct xml_dtd_elem *elem1, struct xml_dtd_elem_node *parent2, struct xml_dtd_elem *elem2) +{ + return (parent1 == parent2) && (elem1 == elem2); +} + +static inline void +xml_dtd_enodes_init_key(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *node, struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem) +{ + node->parent = parent; + node->elem = elem; +} + +#define HASH_PREFIX(x) xml_dtd_enodes_##x +#define HASH_NODE struct xml_dtd_elem_node +#define HASH_KEY_COMPLEX(x) x parent, x elem +#define HASH_KEY_DECL struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem +#define HASH_GIVE_HASHFN +#define HASH_GIVE_EQ +#define HASH_GIVE_INIT_KEY +#define HASH_TABLE_DYNAMIC +#define HASH_ZERO_FILL +#define HASH_WANT_FIND +#define HASH_WANT_NEW +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include + +/* Element attributes */ + +struct xml_dtd_attrs_table; + +static inline uint 
+xml_dtd_attrs_hash(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem, char *name) +{ + return hash_pointer(elem) ^ hash_string(name); +} + +static inline int +xml_dtd_attrs_eq(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem1, char *name1, struct xml_dtd_elem *elem2, char *name2) +{ + return (elem1 == elem2) && !strcmp(name1, name2); +} + +static inline void +xml_dtd_attrs_init_key(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_elem *elem, char *name) +{ + attr->elem = elem; + attr->name = name; + slist_add_tail(&elem->attrs, &attr->n); +} + +#define HASH_PREFIX(x) xml_dtd_attrs_##x +#define HASH_NODE struct xml_dtd_attr +#define HASH_ZERO_FILL +#define HASH_TABLE_DYNAMIC +#define HASH_KEY_COMPLEX(x) x elem, x name +#define HASH_KEY_DECL struct xml_dtd_elem *elem, char *name +#define HASH_GIVE_HASHFN +#define HASH_GIVE_EQ +#define HASH_GIVE_INIT_KEY +#define HASH_WANT_FIND +#define HASH_WANT_NEW +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include + +struct xml_dtd_attr * +xml_dtd_find_attr(struct xml_context *ctx, struct xml_dtd_elem *elem, char *name) +{ + return ctx->dtd ? 
xml_dtd_attrs_find(ctx->dtd->tab_attrs, elem, name) : NULL; +} + +/* Enumerated attribute values */ + +struct xml_dtd_evals_table; + +static inline uint +xml_dtd_evals_hash(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr, char *val) +{ + return hash_pointer(attr) ^ hash_string(val); +} + +static inline int +xml_dtd_evals_eq(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr1, char *val1, struct xml_dtd_attr *attr2, char *val2) +{ + return (attr1 == attr2) && !strcmp(val1, val2); +} + +static inline void +xml_dtd_evals_init_key(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_eval *eval, struct xml_dtd_attr *attr, char *val) +{ + eval->attr = attr; + eval->val = val; +} + +#define HASH_PREFIX(x) xml_dtd_evals_##x +#define HASH_NODE struct xml_dtd_eval +#define HASH_TABLE_DYNAMIC +#define HASH_KEY_COMPLEX(x) x attr, x val +#define HASH_KEY_DECL struct xml_dtd_attr *attr, char *val +#define HASH_GIVE_HASHFN +#define HASH_GIVE_EQ +#define HASH_GIVE_INIT_KEY +#define HASH_WANT_FIND +#define HASH_WANT_NEW +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include + +/* Enumerated attribute notations */ + +struct xml_dtd_enotns_table; + +static inline uint +xml_dtd_enotns_hash(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn) +{ + return hash_pointer(attr) ^ hash_pointer(notn); +} + +static inline int +xml_dtd_enotns_eq(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr1, struct xml_dtd_notn *notn1, struct xml_dtd_attr *attr2, struct xml_dtd_notn *notn2) +{ + return (attr1 == attr2) && (notn1 == notn2); +} + +static inline void +xml_dtd_enotns_init_key(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_enotn *enotn, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn) +{ + enotn->attr = attr; + enotn->notn = notn; +} + +#define HASH_PREFIX(x) xml_dtd_enotns_##x +#define HASH_NODE struct xml_dtd_enotn +#define HASH_TABLE_DYNAMIC +#define 
HASH_KEY_COMPLEX(x) x attr, x notn +#define HASH_KEY_DECL struct xml_dtd_attr *attr, struct xml_dtd_notn *notn +#define HASH_GIVE_HASHFN +#define HASH_GIVE_EQ +#define HASH_GIVE_INIT_KEY +#define HASH_WANT_FIND +#define HASH_WANT_NEW +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include + +/* DTD initialization/cleanup */ + +void +xml_dtd_init(struct xml_context *ctx) +{ + if (ctx->dtd) + return; + struct mempool *pool = mp_new(4096); + struct xml_dtd *dtd = ctx->dtd = mp_alloc_zero(pool, sizeof(*ctx->dtd)); + dtd->pool = pool; + xml_dtd_ents_init(dtd->tab_ents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table))); + xml_dtd_ents_init(dtd->tab_pents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table))); + xml_dtd_notns_init(dtd->tab_notns = xml_hash_new(pool, sizeof(struct xml_dtd_notns_table))); + xml_dtd_elems_init(dtd->tab_elems = xml_hash_new(pool, sizeof(struct xml_dtd_elems_table))); + xml_dtd_enodes_init(dtd->tab_enodes = xml_hash_new(pool, sizeof(struct xml_dtd_enodes_table))); + xml_dtd_attrs_init(dtd->tab_attrs = xml_hash_new(pool, sizeof(struct xml_dtd_attrs_table))); + xml_dtd_evals_init(dtd->tab_evals = xml_hash_new(pool, sizeof(struct xml_dtd_evals_table))); + xml_dtd_enotns_init(dtd->tab_enotns = xml_hash_new(pool, sizeof(struct xml_dtd_enotns_table))); + xml_dtd_declare_default_entities(ctx); +} + +void +xml_dtd_cleanup(struct xml_context *ctx) +{ + if (!ctx->dtd) + return; + mp_delete(ctx->dtd->pool); + ctx->dtd = NULL; +} + +void +xml_dtd_finish(struct xml_context *ctx) +{ + if (!ctx->dtd) + return; + // FIXME: validity checks +} + +/*** Parsing functions ***/ + +/* References to parameter entities */ + +void +xml_parse_pe_ref(struct xml_context *ctx) +{ + /* PEReference ::= '%' Name ';' + * Already parsed: '%' */ + struct mempool_state state; + mp_save(ctx->stack, &state); + char *name = xml_parse_name(ctx, ctx->stack); + xml_parse_char(ctx, ';'); + struct xml_dtd_entity *ent = xml_dtd_find_pentity(ctx, name); + 
if (!ent) + xml_error(ctx, "Unknown entity %%%s;", name); + else + { + TRACE(ctx, "Pushed entity %%%s;", name); + mp_restore(ctx->stack, &state); + xml_dec(ctx); + xml_push_entity(ctx, ent); + return; + } + mp_restore(ctx->stack, &state); + xml_dec(ctx); +} + +static uint +xml_parse_dtd_pe(struct xml_context *ctx, uint entity_decl) +{ + /* Already parsed: '%'. Returns ~0U when entity_decl == ~0U and the next character is whitespace; otherwise parses parameter entity references (skipping whitespace after each one) for as long as another '%' follows and returns 1. */ + do + { + xml_inc(ctx); + if (!~entity_decl && (xml_peek_cat(ctx) & XML_CHAR_WHITE)) + { + xml_dec(ctx); + return ~0U; + } + xml_parse_pe_ref(ctx); + while (xml_peek_cat(ctx) & XML_CHAR_WHITE) + xml_skip_char(ctx); + } + while (xml_get_char(ctx) == '%'); + xml_unget_char(ctx); + return 1; +} + +static inline uint +xml_parse_dtd_white(struct xml_context *ctx, uint mandatory) +{ + /* Whitespace or parameter entity, + * mandatory==~0U has a special meaning of the whitespace before the '%' character in a parameter entity declaration. Returns 1 if any whitespace was skipped and 0 otherwise, or the result of xml_parse_dtd_pe() when a '%' follows; a non-zero mandatory with no whitespace and no '%' is a fatal error. */ + uint cnt = 0; + while (xml_peek_cat(ctx) & XML_CHAR_WHITE) + { + xml_skip_char(ctx); + cnt = 1; + } + if (xml_peek_char(ctx) == '%') + { + xml_skip_char(ctx); + return xml_parse_dtd_pe(ctx, mandatory); + } + else if (unlikely(mandatory && !cnt)) + xml_fatal_expected_white(ctx); + return cnt; +} + +static void +xml_dtd_parse_external_id(struct xml_context *ctx, char **system_id, char **public_id, uint allow_public) +{ /* Parses "SYSTEM" S literal or "PUBLIC" S literal [S literal]; the ID that is not present is set to NULL. With allow_public == 0 the system literal after PUBLIC is mandatory. */ + struct xml_dtd *dtd = ctx->dtd; + uint c = xml_peek_char(ctx); + if (c == 'S') + { + xml_parse_seq(ctx, "SYSTEM"); + xml_parse_dtd_white(ctx, 1); + *public_id = NULL; + *system_id = xml_parse_system_literal(ctx, dtd->pool); + } + else if (c == 'P') + { + xml_parse_seq(ctx, "PUBLIC"); + xml_parse_dtd_white(ctx, 1); + *system_id = NULL; + *public_id = xml_parse_pubid_literal(ctx, dtd->pool); + if (xml_parse_dtd_white(ctx, !allow_public)) + if ((c = xml_peek_char(ctx)) == '\'' || c == '"' || !allow_public) + *system_id = xml_parse_system_literal(ctx, dtd->pool); + } + else + xml_fatal(ctx, "Expected an external ID"); +} + +/* DTD: <!NOTATION ...> */ + +void
+xml_parse_notation_decl(struct xml_context *ctx) +{ + /* NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>' + * Already parsed: '<!NOTATION' */ + struct xml_dtd *dtd = ctx->dtd; + xml_parse_dtd_white(ctx, 1); + + struct xml_dtd_notn *notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx, dtd->pool)); + xml_parse_dtd_white(ctx, 1); + char *system_id, *public_id; + xml_dtd_parse_external_id(ctx, &system_id, &public_id, 1); + xml_parse_dtd_white(ctx, 0); + xml_parse_char(ctx, '>'); + + if (notn->flags & XML_DTD_NOTN_DECLARED) + xml_warn(ctx, "Notation %s already declared", notn->name); /* the first declaration stays in effect */ + else + { + notn->flags = XML_DTD_NOTN_DECLARED; + notn->system_id = system_id; + notn->public_id = public_id; + slist_add_tail(&dtd->notns, &notn->n); + } + xml_dec(ctx); +} + +/* DTD: <!ENTITY ...> */ + +void +xml_parse_entity_decl(struct xml_context *ctx) +{ + /* Already parsed: '<!ENTITY' */ + struct xml_dtd *dtd = ctx->dtd; + uint flags = ~xml_parse_dtd_white(ctx, ~0U) ? 0 : XML_DTD_ENTITY_PARAMETER; /* ~0U is returned only for the "% S" that introduces a parameter entity declaration */ + if (flags) + xml_parse_dtd_white(ctx, 1); + struct xml_dtd_entity *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_ents, xml_parse_name(ctx, dtd->pool)); + xml_parse_dtd_white(ctx, 1); + slist *list = flags ?
&dtd->pents : &dtd->ents; + if (ent->flags & XML_DTD_ENTITY_DECLARED) + { + xml_fatal(ctx, "Entity &%s; already declared, skipping not implemented", ent->name); + // FIXME: should be only warning + } + uint c, sep = xml_get_char(ctx); + if (sep == '\'' || sep == '"') + { + /* Internal entity: + * EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&'] | PEReference | Reference)* "'" */ + char *p = mp_start_noalign(dtd->pool, 1); + while (1) + { + if ((c = xml_get_char(ctx)) == sep) + break; + if (c == '%') + { + // FIXME + ASSERT(0); + //xml_parse_parameter_ref(ctx); + continue; + } + if (c == '&') + { + xml_inc(ctx); + if (xml_peek_char(ctx) != '#') + { + /* Bypass references to general entities */ + struct mempool_state state; + mp_save(ctx->stack, &state); + char *n = xml_parse_name(ctx, ctx->stack); + xml_parse_char(ctx, ';'); + xml_dec(ctx); + uint l = strlen(n); + p = mp_spread(dtd->pool, p, 3 + l); + *p++ = '&'; + memcpy(p, n, l); + p += l; + *p++ = ';'; + mp_restore(ctx->stack, &state); + continue; + } + else + { + xml_skip_char(ctx); + c = xml_parse_char_ref(ctx); + } + } + p = mp_spread(dtd->pool, p, 5); + p = utf8_32_put(p, c); + } + *p = 0; + ent->len = p - (char *)mp_ptr(dtd->pool); + ent->text = mp_end(dtd->pool, p + 1); + slist_add_tail(list, &ent->n); + ent->flags = flags | XML_DTD_ENTITY_DECLARED; + } + else + { + /* External entity */ + struct xml_dtd_notn *notn = NULL; + char *system_id, *public_id; + xml_unget_char(ctx); + xml_dtd_parse_external_id(ctx, &system_id, &public_id, 0); + if (xml_parse_dtd_white(ctx, 0) && !flags && xml_peek_char(ctx) != '>') + { + /* General external unparsed entity: NDataDecl is allowed only for general entities, i.e. when flags == 0 */ + flags |= XML_DTD_ENTITY_UNPARSED; + xml_parse_seq(ctx, "NDATA"); + xml_parse_dtd_white(ctx, 1); + notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx, dtd->pool)); + } + slist_add_tail(list, &ent->n); + ent->flags = flags | XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_EXTERNAL; + ent->system_id = system_id; + ent->public_id =
public_id; + ent->notn = notn; + } + xml_parse_dtd_white(ctx, 0); + xml_parse_char(ctx, '>'); + xml_dec(ctx); +} + +/* DTD: <!ELEMENT ...> */ + +void +xml_parse_element_decl(struct xml_context *ctx) +{ + /* Elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' + * Already parsed: '<!ELEMENT' */ + struct xml_dtd *dtd = ctx->dtd; + xml_parse_dtd_white(ctx, 1); + char *name = xml_parse_name(ctx, dtd->pool); + xml_parse_dtd_white(ctx, 1); + struct xml_dtd_elem *elem = xml_dtd_elems_lookup(dtd->tab_elems, name); + if (elem->flags & XML_DTD_ELEM_DECLARED) + xml_fatal(ctx, "Element <%s> already declared", name); + + /* contentspec ::= 'EMPTY' | 'ANY' | Mixed | children */ + uint c = xml_peek_char(ctx); + if (c == 'E') + { + xml_parse_seq(ctx, "EMPTY"); + elem->type = XML_DTD_ELEM_EMPTY; + } + else if (c == 'A') + { + xml_parse_seq(ctx, "ANY"); + elem->type = XML_DTD_ELEM_ANY; + } + else if (c == '(') + { + xml_skip_char(ctx); + xml_inc(ctx); + xml_parse_dtd_white(ctx, 0); + struct xml_dtd_elem_node *parent = elem->node = mp_alloc_zero(dtd->pool, sizeof(*parent)); /* root node of the content model */ + if (xml_peek_char(ctx) == '#') + { + /* Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | '(' S? '#PCDATA' S?
')' */ + xml_skip_char(ctx); + xml_parse_seq(ctx, "PCDATA"); + elem->type = XML_DTD_ELEM_MIXED; + parent->type = XML_DTD_ELEM_PCDATA; + while (1) + { + xml_parse_dtd_white(ctx, 0); + if ((c = xml_get_char(ctx)) == ')') + break; + else if (c != '|') + xml_fatal_expected(ctx, ')'); + xml_parse_dtd_white(ctx, 0); + struct xml_dtd_elem *son_elem = xml_dtd_elems_lookup(dtd->tab_elems, xml_parse_name(ctx, dtd->pool)); + if (xml_dtd_enodes_find(dtd->tab_enodes, parent, son_elem)) + xml_error(ctx, "Duplicate content '%s'", son_elem->name); + else + { + struct xml_dtd_elem_node *son = xml_dtd_enodes_new(dtd->tab_enodes, parent, son_elem); + slist_add_tail(&parent->sons, &son->n); + } + } + xml_dec(ctx); + if (xml_peek_char(ctx) == '*') + { + xml_skip_char(ctx); + parent->occur = XML_DTD_ELEM_OCCUR_MULT; + } + else if (!slist_head(&parent->sons)) + parent->occur = XML_DTD_ELEM_OCCUR_ONCE; + else + xml_fatal_expected(ctx, '*'); + } + else + { + /* children ::= (choice | seq) ('?' | '*' | '+')? + * cp ::= (Name | choice | seq) ('?' | '*' | '+')? + * choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')' + * seq ::= '(' S? cp ( S? ',' S? cp )* S? 
')' */ + + elem->type = XML_DTD_ELEM_CHILDREN; + parent->type = XML_DTD_ELEM_PCDATA; + uint c; + goto first; + + while (1) + { + /* After name */ + xml_parse_dtd_white(ctx, 0); + if ((c = xml_get_char(ctx)) == ')') + { + xml_dec(ctx); + if (parent->type == XML_DTD_ELEM_PCDATA) + parent->type = XML_DTD_ELEM_SEQ; + if ((c = xml_get_char(ctx)) == '?') + parent->occur = XML_DTD_ELEM_OCCUR_OPT; + else if (c == '*') + parent->occur = XML_DTD_ELEM_OCCUR_MULT; + else if (c == '+') + parent->occur = XML_DTD_ELEM_OCCUR_PLUS; + else + { + xml_unget_char(ctx); + parent->occur = XML_DTD_ELEM_OCCUR_ONCE; + } + if (!parent->parent) + break; + parent = parent->parent; + continue; + } + else if (c == '|') + { + if (parent->type == XML_DTD_ELEM_PCDATA) + parent->type = XML_DTD_ELEM_OR; + else if (parent->type != XML_DTD_ELEM_OR) + xml_fatal(ctx, "Mixed operators in the list of element children"); + } + else if (c == ',') + { + if (parent->type == XML_DTD_ELEM_PCDATA) + parent->type = XML_DTD_ELEM_SEQ; + else if (parent->type != XML_DTD_ELEM_SEQ) + xml_fatal(ctx, "Mixed operators in the list of element children"); + } + else if (c == '(') + { + xml_inc(ctx); + struct xml_dtd_elem_node *son = mp_alloc_zero(dtd->pool, sizeof(*son)); + son->parent = parent; + slist_add_tail(&parent->sons, &son->n); + parent = son->parent; + son->type = XML_DTD_ELEM_MIXED; + } + else + xml_unget_char(ctx); + + /* Before name */ + xml_parse_dtd_white(ctx, 0); +first:; + struct xml_dtd_elem *son_elem = xml_dtd_elems_lookup(dtd->tab_elems, xml_parse_name(ctx, dtd->pool)); + // FIXME: duplicates, occurance + //struct xml_dtd_elem_node *son = xml_dtd_enodes_new(dtd->tab_enodes, parent, son_elem); + struct xml_dtd_elem_node *son = mp_alloc_zero(dtd->pool, sizeof(*son)); + son->parent = parent; + son->elem = son_elem; + slist_add_tail(&parent->sons, &son->n); + } + } + } + else + xml_fatal(ctx, "Expected element content specification"); + + xml_parse_dtd_white(ctx, 0); + xml_parse_char(ctx, '>'); + 
xml_dec(ctx); +} + +void +xml_parse_attr_list_decl(struct xml_context *ctx) +{ + /* AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>' + * AttDef ::= S Name S AttType S DefaultDecl + * Already parsed: '<!ATTLIST' */ + struct xml_dtd *dtd = ctx->dtd; + xml_parse_dtd_white(ctx, 1); + struct xml_dtd_elem *elem = xml_dtd_elems_lookup(ctx->dtd->tab_elems, xml_parse_name(ctx, dtd->pool)); + + while (xml_parse_dtd_white(ctx, 0) && xml_peek_char(ctx) != '>') + { + char *name = xml_parse_name(ctx, dtd->pool); + struct xml_dtd_attr *attr = xml_dtd_attrs_find(dtd->tab_attrs, elem, name); + uint ignored = 0; /* non-zero: a duplicate definition is still parsed, but nothing is stored */ + if (attr) + { + xml_warn(ctx, "Duplicate attribute definition"); + ignored++; + } + else + attr = xml_dtd_attrs_new(ctx->dtd->tab_attrs, elem, name); + xml_parse_dtd_white(ctx, 1); + if (xml_peek_char(ctx) == '(') + { + xml_skip_char(ctx); // FIXME: xml_inc/dec ? + if (!ignored) + attr->type = XML_ATTR_ENUM; + do + { + xml_parse_dtd_white(ctx, 0); + char *value = xml_parse_nmtoken(ctx, dtd->pool); + if (!ignored) + if (xml_dtd_evals_find(ctx->dtd->tab_evals, attr, value)) + xml_error(ctx, "Duplicate enumeration value"); + else + xml_dtd_evals_new(ctx->dtd->tab_evals, attr, value); + xml_parse_dtd_white(ctx, 0); + } + while (xml_get_char(ctx) == '|'); + xml_unget_char(ctx); + xml_parse_char(ctx, ')'); + } + else + { + char *type = xml_parse_name(ctx, dtd->pool); + enum xml_dtd_attr_type t = XML_ATTR_CDATA; + if (!strcmp(type, "CDATA")) + t = XML_ATTR_CDATA; + else if (!strcmp(type, "ID")) + t = XML_ATTR_ID; + else if (!strcmp(type, "IDREF")) + t = XML_ATTR_IDREF; + else if (!strcmp(type, "IDREFS")) + t = XML_ATTR_IDREFS; + else if (!strcmp(type, "ENTITY")) + t = XML_ATTR_ENTITY; + else if (!strcmp(type, "ENTITIES")) + t = XML_ATTR_ENTITIES; + else if (!strcmp(type, "NMTOKEN")) + t = XML_ATTR_NMTOKEN; + else if (!strcmp(type, "NMTOKENS")) + t = XML_ATTR_NMTOKENS; + else if (!strcmp(type, "NOTATION")) + { + if (elem->type == XML_DTD_ELEM_EMPTY) + xml_fatal(ctx, "Empty element must not have notation attribute"); + // FIXME: An element type MUST NOT
have more than one NOTATION attribute specified. + t = XML_ATTR_NOTATION; + xml_parse_dtd_white(ctx, 1); + xml_parse_char(ctx, '('); + do + { + xml_parse_dtd_white(ctx, 0); + struct xml_dtd_notn *n = xml_dtd_notns_lookup(ctx->dtd->tab_notns, xml_parse_name(ctx, dtd->pool)); + if (!ignored) + if (xml_dtd_enotns_find(ctx->dtd->tab_enotns, attr, n)) + xml_error(ctx, "Duplicate enumerated notation"); + else + xml_dtd_enotns_new(ctx->dtd->tab_enotns, attr, n); + xml_parse_dtd_white(ctx, 0); + } + while (xml_get_char(ctx) == '|'); + xml_unget_char(ctx); + xml_parse_char(ctx, ')'); + } + else + xml_fatal(ctx, "Unknown attribute type"); + if (!ignored) + attr->type = t; + } + xml_parse_dtd_white(ctx, 1); + enum xml_dtd_attr_default def = XML_ATTR_NONE; + if (xml_get_char(ctx) == '#') + switch (xml_peek_char(ctx)) + { + case 'R': + xml_parse_seq(ctx, "REQUIRED"); + def = XML_ATTR_REQUIRED; + break; + case 'I': + xml_parse_seq(ctx, "IMPLIED"); + def = XML_ATTR_IMPLIED; + break; + case 'F': + xml_parse_seq(ctx, "FIXED"); + def = XML_ATTR_FIXED; + xml_parse_dtd_white(ctx, 1); + break; + default: + xml_fatal(ctx, "Expected a modifier for default attribute value"); + } + else + xml_unget_char(ctx); + if (def != XML_ATTR_REQUIRED && def != XML_ATTR_IMPLIED) + { + char *v = xml_parse_attr_value(ctx, attr); + if (!ignored) + attr->default_value = v; + } + if (!ignored) + attr->default_mode = def; + } + xml_skip_char(ctx); + xml_dec(ctx); +} + +void +xml_skip_internal_subset(struct xml_context *ctx) +{ + TRACE(ctx, "skip_internal_subset"); + /* AlreadyParsed: '[' */ + uint c; + while ((c = xml_get_char(ctx)) != ']') + { + if (c != '<') + continue; + if ((c = xml_get_char(ctx)) == '?') + { + xml_inc(ctx); + xml_skip_pi(ctx); + } + else if (c != '!') + xml_dec(ctx); + else if (xml_get_char(ctx) == '-') + { + xml_inc(ctx); + xml_skip_comment(ctx); + } + else + while ((c = xml_get_char(ctx)) != '>') + if (c == '\'' || c == '"') + while (xml_get_char(ctx) != c); + } + xml_dec(ctx); +} + 
+/*** Validation of attribute values ***/ + +static uint +xml_check_tokens(char *value, uint first_cat, uint next_cat, uint seq) +{ + char *p = value; + uint u; + while (1) + { + p = utf8_32_get(p, &u); + if (!(xml_char_cat(u) & first_cat)) + return 0; + while (*p & ~0x20) + { + p = utf8_32_get(p, &u); + if (!(xml_char_cat(u) & next_cat)) + return 0; + } + if (!*p) + return 1; + if (!seq) + return 0; + p++; + } +} + +static uint +xml_is_name(struct xml_context *ctx, char *value) +{ + /* Name ::= NameStartChar (NameChar)* */ + return xml_check_tokens(value, ctx->cat_sname, ctx->cat_name, 0); +} + +static uint +xml_is_names(struct xml_context *ctx, char *value) +{ + /* Names ::= Name (#x20 Name)* */ + return xml_check_tokens(value, ctx->cat_sname, ctx->cat_name, 1); +} + +static uint +xml_is_nmtoken(struct xml_context *ctx, char *value) +{ + /* Nmtoken ::= (NameChar)+ */ + return xml_check_tokens(value, ctx->cat_name, ctx->cat_name, 0); +} + +static uint +xml_is_nmtokens(struct xml_context *ctx, char *value) +{ + /* Nmtokens ::= Nmtoken (#x20 Nmtoken)* */ + return xml_check_tokens(value, ctx->cat_name, ctx->cat_name, 1); +} + +static void +xml_err_attr_format(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *type) +{ + xml_error(ctx, "Attribute %s in <%s> does not match the production of %s", dtd->name, dtd->elem->name, type); +} + +void +xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value) +{ + if (dtd->type == XML_ATTR_CDATA) + return; + xml_normalize_white(ctx, value); + switch (dtd->type) + { + case XML_ATTR_ID: + if (!xml_is_name(ctx, value)) + xml_err_attr_format(ctx, dtd, "NAME"); + //FIXME: add to a hash table + break; + case XML_ATTR_IDREF: + if (!xml_is_name(ctx, value)) + xml_err_attr_format(ctx, dtd, "NAME"); + // FIXME: find in hash table (beware forward references) + break; + case XML_ATTR_IDREFS: + if (!xml_is_names(ctx, value)) + xml_err_attr_format(ctx, dtd, "NAMES"); + // FIXME: find + break; + case 
XML_ATTR_ENTITY: + // FIXME + break; + case XML_ATTR_ENTITIES: + // FIXME + break; + case XML_ATTR_NMTOKEN: + if (!xml_is_nmtoken(ctx, value)) + xml_err_attr_format(ctx, dtd, "NMTOKEN"); + break; + case XML_ATTR_NMTOKENS: + if (!xml_is_nmtokens(ctx, value)) + xml_err_attr_format(ctx, dtd, "NMTOKENS"); + break; + case XML_ATTR_ENUM: + if (!xml_dtd_evals_find(ctx->dtd->tab_evals, dtd, value)) + xml_error(ctx, "Attribute %s in <%s> contains an undefined enumeration value", dtd->name, dtd->elem->name); + break; + case XML_ATTR_NOTATION: + if (!xml_dtd_find_notn(ctx, value)) + xml_error(ctx, "Attribute %s in <%s> contains an undefined notation", dtd->name, dtd->elem->name); + break; + } +} diff --git a/ucw-xml/dtd.h b/ucw-xml/dtd.h new file mode 100644 index 00000000..c3e07f69 --- /dev/null +++ b/ucw-xml/dtd.h @@ -0,0 +1,178 @@ +/* + * UCW Library -- A simple XML parser + * + * (c) 2007--2008 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. 
 */

#ifndef _UCW_XML_DTD_H
#define _UCW_XML_DTD_H

#include /* NOTE(review): the header name is missing here -- apparently lost when this text was extracted; restore it from the repository */

#ifdef CONFIG_UCW_CLEAN_ABI
/* Rename the exported xml_dtd_* symbols to their ucw_-prefixed variants */
#define xml_dtd_cleanup ucw_xml_dtd_cleanup
#define xml_dtd_find_attr ucw_xml_dtd_find_attr
#define xml_dtd_find_elem ucw_xml_dtd_find_elem
#define xml_dtd_find_entity ucw_xml_dtd_find_entity
#define xml_dtd_find_notn ucw_xml_dtd_find_notn
#define xml_dtd_finish ucw_xml_dtd_finish
#define xml_dtd_init ucw_xml_dtd_init
#endif

/* A parsed document type definition: lists of declared objects plus hash tables for lookups */
struct xml_dtd {
  struct mempool *pool;                 /* Memory pool where to allocate DTD */
  slist ents;                           /* Link list of general entities */
  slist pents;                          /* Link list of parameter entities */
  slist notns;                          /* Link list of notations */
  slist elems;                          /* Link list of elements */
  void *tab_ents;                       /* Hash table of general entities */
  void *tab_pents;                      /* Hash table of parameter entities */
  void *tab_notns;                      /* Hash table of notations */
  void *tab_elems;                      /* Hash table of elements */
  void *tab_enodes;                     /* Hash table of element sons */
  void *tab_attrs;                      /* Hash table of element attributes */
  void *tab_evals;                      /* Hash table of enumerated attribute values */
  void *tab_enotns;                     /* Hash table of enumerated attribute notations */
};

/* Notations */

enum xml_dtd_notn_flags {
  XML_DTD_NOTN_DECLARED = 0x1,          /* The notation has been declared (internal usage) */
};

struct xml_dtd_notn {
  snode n;                              /* Node in xml_dtd.notns */
  uint flags;                           /* XML_DTD_NOTN_x */
  char *name;                           /* Notation name */
  char *system_id;                      /* External ID */
  char *public_id;
  void *user;                           /* User-defined */
};

/* Look up a notation by name (declared in this header, defined elsewhere) */
struct xml_dtd_notn *xml_dtd_find_notn(struct xml_context *ctx, char *name);

/* Entities */

enum xml_dtd_entity_flags {
  XML_DTD_ENTITY_DECLARED = 0x1,        /* The entity has been declared (internal usage) */
  XML_DTD_ENTITY_VISITED = 0x2,         /* Cycle detection (internal usage) */
  XML_DTD_ENTITY_PARAMETER = 0x4,       /* Parameter entity, general otherwise */
  XML_DTD_ENTITY_EXTERNAL = 0x8,        /* External entity, internal otherwise */
  XML_DTD_ENTITY_UNPARSED = 0x10,       /* Unparsed entity, parsed otherwise */
  XML_DTD_ENTITY_TRIVIAL = 0x20,        /* Replacement text is a sequence of characters and character references */
};

struct xml_dtd_entity {
  snode n;                              /* Node in xml_dtd.ents or xml_dtd.pents */
  uint flags;                           /* XML_DTD_ENTITY_x */
  char *name;                           /* Entity name */
  char *text;                           /* Replacement text / expanded replacement text (XML_DTD_ENTITY_TRIVIAL) */
  uint len;                             /* Text length */
  char *system_id;                      /* External ID */
  char *public_id;
  struct xml_dtd_notn *notn;            /* Notation (XML_DTD_ENTITY_UNPARSED only) */
  void *user;                           /* User-defined */
};

/* Look up an entity by name (declared in this header, defined elsewhere) */
struct xml_dtd_entity *xml_dtd_find_entity(struct xml_context *ctx, char *name);

/* Elements */

enum xml_dtd_elem_flags {
  XML_DTD_ELEM_DECLARED = 0x1,          /* The element has been declared (internal usage) */
};

/* Kind of content an element is declared with */
enum xml_dtd_elem_type {
  XML_DTD_ELEM_EMPTY,
  XML_DTD_ELEM_ANY,
  XML_DTD_ELEM_MIXED,
  XML_DTD_ELEM_CHILDREN,
};

struct xml_dtd_elem {
  snode n;
  uint flags;                           /* XML_DTD_ELEM_x flags (see enum xml_dtd_elem_flags) */
  uint type;                            /* See enum xml_dtd_elem_type */
  char *name;
  struct xml_dtd_elem_node *node;       /* Root of the tree of xml_dtd_elem_node describing allowed sons */
  slist attrs;                          /* List of struct xml_dtd_attr declared for this element */
  void *user;                           /* User-defined */
};

/* One node of an element's content description tree */
struct xml_dtd_elem_node {
  snode n;
  struct xml_dtd_elem_node *parent;
  struct xml_dtd_elem *elem;            /* Non-NULL when the node refers to a concrete element, otherwise see `sons' */
  slist sons;
  uint type;                            /* Presumably enum xml_dtd_elem_node_type -- TODO confirm */
  uint occur;                           /* Presumably enum xml_dtd_elem_node_occur -- TODO confirm */
  void *user;                           /* User-defined */
};

enum xml_dtd_elem_node_type {
  XML_DTD_ELEM_PCDATA,
  XML_DTD_ELEM_SEQ,
  XML_DTD_ELEM_OR,
};

enum xml_dtd_elem_node_occur {
  XML_DTD_ELEM_OCCUR_ONCE,
  XML_DTD_ELEM_OCCUR_OPT,
  XML_DTD_ELEM_OCCUR_MULT,
  XML_DTD_ELEM_OCCUR_PLUS,
};

/* Look up an element declaration by name (declared in this header, defined elsewhere) */
struct xml_dtd_elem *xml_dtd_find_elem(struct xml_context *ctx, char *name);

/* Attributes */

/* Default-value mode of an attribute declaration */
enum xml_dtd_attr_default {
  XML_ATTR_NONE,
  XML_ATTR_REQUIRED,
  XML_ATTR_IMPLIED,
  XML_ATTR_FIXED,
};

/* Declared type of an attribute */
enum xml_dtd_attr_type {
  XML_ATTR_CDATA,
  XML_ATTR_ID,
  XML_ATTR_IDREF,
  XML_ATTR_IDREFS,
  XML_ATTR_ENTITY,
  XML_ATTR_ENTITIES,
  XML_ATTR_NMTOKEN,
  XML_ATTR_NMTOKENS,
  XML_ATTR_ENUM,
  XML_ATTR_NOTATION,
};

struct xml_dtd_attr {
  snode n;
  char *name;                           /* Attribute name */
  struct xml_dtd_elem *elem;            /* Owner element */
  uint type;                            /* See enum xml_dtd_attr_type */
  uint default_mode;                    /* See enum xml_dtd_attr_default */
  char *default_value;                  /* The default value defined in DTD (or NULL) */
};

/* An (attribute, value) pair: one allowed value of an enumerated attribute */
struct xml_dtd_eval {
  struct xml_dtd_attr *attr;
  char *val;
};

/* An (attribute, notation) pair: one notation listed for a NOTATION attribute */
struct xml_dtd_enotn {
  struct xml_dtd_attr *attr;
  struct xml_dtd_notn *notn;
};

void xml_dtd_init(struct xml_context *ctx);
void xml_dtd_cleanup(struct xml_context *ctx);
void xml_dtd_finish(struct xml_context *ctx);

/* Look up the declaration of attribute `name' of element `elem' */
struct xml_dtd_attr *xml_dtd_find_attr(struct xml_context *ctx, struct xml_dtd_elem *elem, char *name);

#endif
diff --git a/ucw-xml/internals.h b/ucw-xml/internals.h new file mode 100644 index 00000000..a67cd8eb --- /dev/null +++ b/ucw-xml/internals.h @@ -0,0 +1,326 @@ +/* + * UCW Library -- A simple XML parser + * + * (c) 2007--2008 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. 
 */

#ifndef _UCW_XML_INTERNALS_H
#define _UCW_XML_INTERNALS_H

/* NOTE(review): the two header names below are missing -- apparently lost when this text was extracted; restore them from the repository */
#include
#include

#ifdef CONFIG_UCW_CLEAN_ABI
/* Rename the library-internal xml_* symbols to their ucw_-prefixed variants */
#define xml_attrs_table_cleanup ucw_xml_attrs_table_cleanup
#define xml_attrs_table_init ucw_xml_attrs_table_init
#define xml_fatal_expected ucw_xml_fatal_expected
#define xml_fatal_expected_quot ucw_xml_fatal_expected_quot
#define xml_fatal_expected_white ucw_xml_fatal_expected_white
#define xml_fatal_nested ucw_xml_fatal_nested
#define xml_hash_new ucw_xml_hash_new
#define xml_parse_attr_list_decl ucw_xml_parse_attr_list_decl
#define xml_parse_attr_value ucw_xml_parse_attr_value
#define xml_parse_char_ref ucw_xml_parse_char_ref
#define xml_parse_element_decl ucw_xml_parse_element_decl
#define xml_parse_entity_decl ucw_xml_parse_entity_decl
#define xml_parse_eq ucw_xml_parse_eq
#define xml_parse_name ucw_xml_parse_name
#define xml_parse_nmtoken ucw_xml_parse_nmtoken
#define xml_parse_notation_decl ucw_xml_parse_notation_decl
#define xml_parse_pe_ref ucw_xml_parse_pe_ref
#define xml_parse_pubid_literal ucw_xml_parse_pubid_literal
#define xml_parse_system_literal ucw_xml_parse_system_literal
#define xml_pop_comment ucw_xml_pop_comment
#define xml_pop_pi ucw_xml_pop_pi
#define xml_push_comment ucw_xml_push_comment
#define xml_push_entity ucw_xml_push_entity
#define xml_push_pi ucw_xml_push_pi
#define xml_push_source ucw_xml_push_source
#define xml_refill ucw_xml_refill
#define xml_skip_comment ucw_xml_skip_comment
#define xml_skip_internal_subset ucw_xml_skip_internal_subset
#define xml_skip_name ucw_xml_skip_name
#define xml_skip_pi ucw_xml_skip_pi
#define xml_sources_cleanup ucw_xml_sources_cleanup
#define xml_spout_chars ucw_xml_spout_chars
#define xml_throw ucw_xml_throw
#define xml_validate_attr ucw_xml_validate_attr
#endif

/*** Debugging ***/

/* TRACE() forwards to DBG() with the current row prepended when LOCAL_DEBUG is defined, and expands to an empty statement otherwise */
#ifdef LOCAL_DEBUG
#define TRACE(c, f, p...) do { DBG("XML %u: " f, xml_row(c), ##p); } while(0)
#else
#define TRACE(c, f, p...) do {} while(0)
#endif

/*** Error handling ***/

void NONRET xml_throw(struct xml_context *ctx);

/*** Memory management ***/

/* One saved parser state; records form a singly linked list headed by ctx->stack_list */
struct xml_stack {
  struct xml_stack *next;
  struct mempool_state state;
  uint flags;
};

/* Allocate a `size'-byte record from ctx->stack, store the pool state taken
 * BEFORE the allocation together with ctx->flags, and link the record at the
 * head of ctx->stack_list.  Returns the new record. */
static inline void *xml_do_push(struct xml_context *ctx, uint size)
{
  /* Saves ctx->stack and ctx->flags state */
  struct mempool_state state;
  mp_save(ctx->stack, &state);
  struct xml_stack *s = mp_alloc(ctx->stack, size);
  s->state = state;
  s->flags = ctx->flags;
  s->next = ctx->stack_list;
  ctx->stack_list = s;
  return s;
}

/* Undo xml_do_push() for record `s': unlink it, restore ctx->flags and rewind ctx->stack */
static inline void xml_do_pop(struct xml_context *ctx, struct xml_stack *s)
{
  /* Restore ctx->stack and ctx->flags state */
  ctx->stack_list = s->next;
  ctx->flags = s->flags;
  mp_restore(ctx->stack, &s->state);
}

/* Push a plain state record */
static inline void xml_push(struct xml_context *ctx)
{
  TRACE(ctx, "push");
  xml_do_push(ctx, sizeof(struct xml_stack));
}

/* Pop the most recently pushed state record */
static inline void xml_pop(struct xml_context *ctx)
{
  TRACE(ctx, "pop");
  ASSERT(ctx->stack_list);
  xml_do_pop(ctx, ctx->stack_list);
}

/* State record used for DOM nodes: additionally remembers a state of ctx->pool */
struct xml_dom_stack {
  struct xml_stack stack;
  struct mempool_state state;
};

/* Allocate a new node from ctx->pool, append it to the sons of the current
 * node (if there is one) and make it the current node.  When `state' is
 * non-NULL it is stored as the ctx->pool state to rewind to later, otherwise
 * the current state of ctx->pool is saved. */
static inline struct xml_node *xml_push_dom(struct xml_context *ctx, struct mempool_state *state)
{
  /* Create a new DOM node */
  TRACE(ctx, "push_dom");
  struct xml_dom_stack *s = xml_do_push(ctx, sizeof(*s));
  if (state)
    s->state = *state;
  else
    mp_save(ctx->pool, &s->state);
  struct xml_node *n = mp_alloc(ctx->pool, sizeof(*n));
  n->user = NULL;
  /* Assignment is intended: remember the parent and test it for NULL */
  if (n->parent = ctx->node)
    clist_add_tail(&n->parent->sons, &n->n);
  return ctx->node = n;
}

/* Make the parent of the current node current again and pop the matching
 * state record.  With non-zero `free' the node is also unlinked from its
 * parent and ctx->pool is rewound to the state stored by xml_push_dom(). */
static inline void xml_pop_dom(struct xml_context *ctx, uint free)
{
  /* Leave DOM subtree */
  TRACE(ctx, "pop_dom");
  ASSERT(ctx->node);
  struct xml_node *p = ctx->node->parent;
  struct xml_dom_stack *s = (void *)ctx->stack_list;
  if (free)
    {
      /* See xml_pop_element() for cleanup of attribute hash table */
      if (p)
        clist_remove(&ctx->node->n);
      mp_restore(ctx->pool, &s->state);
    }
  ctx->node = p;
  xml_do_pop(ctx, &s->stack);
}

/* Allocator hooks for hash tables: alloc reads a pointer stored
 * XML_HASH_HDR_SIZE bytes in front of the table and passes it to mp_alloc();
 * free is a no-op.  Presumably that pointer is the mempool given to
 * xml_hash_new() -- TODO confirm. */
#define XML_HASH_HDR_SIZE ALIGN_TO(sizeof(void *), CPU_STRUCT_ALIGN)
#define XML_HASH_GIVE_ALLOC struct HASH_PREFIX(table); \
  static inline void *HASH_PREFIX(alloc)(struct HASH_PREFIX(table) *t, uint size) \
  { return mp_alloc(*(void **)((void *)t - XML_HASH_HDR_SIZE), size); } \
  static inline void HASH_PREFIX(free)(struct HASH_PREFIX(table) *t UNUSED, void *p UNUSED) {}

void *xml_hash_new(struct mempool *pool, uint size);

void xml_spout_chars(struct fastbuf *fb);

/*** Reading of document/external entities ***/

void NONRET xml_fatal_nested(struct xml_context *ctx);

static inline void xml_inc(struct xml_context *ctx)
{
  /* Called after the first character of a block */
  TRACE(ctx, "inc");
  ctx->depth++;
}

static inline void xml_dec(struct xml_context *ctx)
{
  /* Called after the last character of a block */
  TRACE(ctx, "dec");
  /* Post-decrement: the fatal error fires when depth was already zero */
  if (unlikely(!ctx->depth--))
    xml_fatal_nested(ctx);
}

#include "obj/ucw-xml/unicat.h"

/* Category of code point `c' as a single-bit mask looked up in the generated
 * tables; values of 0x110000 and above yield 1 */
static inline uint xml_char_cat(uint c)
{
  if (c < 0x10000)
    return 1U << ucw_xml_char_tab1[(c & 0xff) + ucw_xml_char_tab2[c >> 8]];
  else if (likely(c < 0x110000))
    return 1U << ucw_xml_char_tab3[c >> 16];
  else
    return 1;
}

/* NOTE(review): unlike xml_char_cat() this returns the raw table entry, not 1U << entry -- confirm that callers expect that */
static inline uint xml_ascii_cat(uint c)
{
  return ucw_xml_char_tab1[c];
}

struct xml_source *xml_push_source(struct xml_context *ctx);
void xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent);

void xml_refill(struct xml_context *ctx);

/* The helpers below treat the input buffer as pairs of slots: bptr[0] is a
 * character, bptr[1] its category, and the read position moves by 2. */

/* Return the next character without consuming it; refills when bptr has reached bstop */
static inline uint xml_peek_char(struct xml_context *ctx)
{
  if (ctx->bptr == ctx->bstop)
    xml_refill(ctx);
  return ctx->bptr[0];
}

/* Return the category of the next character without consuming it */
static inline uint xml_peek_cat(struct xml_context *ctx)
{
  if (ctx->bptr == ctx->bstop)
    xml_refill(ctx);
  return ctx->bptr[1];
}

/* Consume the next character and return it */
static inline uint xml_get_char(struct xml_context *ctx)
{
  uint c = xml_peek_char(ctx);
  ctx->bptr += 2;
  return c;
}

/* Consume the next character and return its category */
static inline uint xml_get_cat(struct xml_context *ctx)
{
  uint c = xml_peek_cat(ctx);
  ctx->bptr += 2;
  return c;
}

/* Character stored just before the read position */
static inline uint xml_last_char(struct xml_context *ctx)
{
  return ctx->bptr[-2];
}

/* Category stored just before the read position */
static inline uint xml_last_cat(struct xml_context *ctx)
{
  return ctx->bptr[-1];
}

/* Like xml_get_char(), but without the refill check */
static inline uint xml_skip_char(struct xml_context *ctx)
{
  uint c = ctx->bptr[0];
  ctx->bptr += 2;
  return c;
}

/* Step the read position back by one pair and return the character found there */
static inline uint xml_unget_char(struct xml_context *ctx)
{
  return *(ctx->bptr -= 2);
}

void xml_sources_cleanup(struct xml_context *ctx);

/*** Parsing ***/

void NONRET xml_fatal_expected(struct xml_context *ctx, uint c);
void NONRET xml_fatal_expected_white(struct xml_context *ctx);
void NONRET xml_fatal_expected_quot(struct xml_context *ctx);

/* Skip white space and return the number of characters skipped; with
 * non-zero `mandatory' a missing white space is a fatal error */
static inline uint xml_parse_white(struct xml_context *ctx, uint mandatory)
{
  /* mandatory=1 -> S ::= (#x20 | #x9 | #xD | #xA)+
   * mandatory=0 -> S? */
  uint cnt = 0;
  while (xml_peek_cat(ctx) & XML_CHAR_WHITE)
    {
      xml_skip_char(ctx);
      cnt++;
    }
  if (unlikely(mandatory && !cnt))
    xml_fatal_expected_white(ctx);
  return cnt;
}

static inline void xml_parse_char(struct xml_context *ctx, uint c)
{
  /* Consumes a given Unicode character */
  if (unlikely(c != xml_get_char(ctx)))
    xml_fatal_expected(ctx, c);
}

static inline void xml_parse_seq(struct xml_context *ctx, const char *seq)
{
  /* Consumes a given sequence of ASCII characters */
  while (*seq)
    xml_parse_char(ctx, *seq++);
}

void xml_parse_eq(struct xml_context *ctx);

/* Consume a quotation mark and return which one it was */
static inline uint xml_parse_quote(struct xml_context *ctx)
{
  /* "'" | '"' */
  uint c = xml_get_char(ctx);
  if (unlikely(c != '\'' && c != '\"'))
    xml_fatal_expected_quot(ctx);
  return c;
}

char *xml_parse_name(struct xml_context *ctx, struct mempool *pool);
void xml_skip_name(struct xml_context *ctx);
char *xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool);

char *xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool);
char *xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool);

uint xml_parse_char_ref(struct xml_context *ctx);
void xml_parse_pe_ref(struct xml_context *ctx);

char *xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr);

void xml_skip_internal_subset(struct xml_context *ctx);
void xml_parse_notation_decl(struct xml_context *ctx);
void xml_parse_entity_decl(struct xml_context *ctx);
void xml_parse_element_decl(struct xml_context *ctx);
void xml_parse_attr_list_decl(struct xml_context *ctx);

void xml_push_comment(struct xml_context *ctx);
void xml_pop_comment(struct xml_context *ctx);
void xml_skip_comment(struct xml_context *ctx);

void xml_push_pi(struct xml_context *ctx);
void xml_pop_pi(struct xml_context *ctx);
void xml_skip_pi(struct xml_context *ctx);

void xml_attrs_table_init(struct xml_context *ctx);
void xml_attrs_table_cleanup(struct xml_context *ctx);

void xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value);

#endif
diff --git a/ucw-xml/libucw-xml.pc b/ucw-xml/libucw-xml.pc new file mode 100644 index 00000000..5c02e996 --- /dev/null +++ b/ucw-xml/libucw-xml.pc @@ -0,0 +1,11 @@ +# pkg-config metadata for libucw-xml + +libdir=@LIBDIR@ +incdir=. + +Name: libucw-xml +Description: XML parser for LibUCW project +Version: @UCW_VERSION@ +Cflags: -I${incdir} +Libs: -L${libdir} @SO_LINK_PATH@ -lucw-xml@UCW_ABI_SUFFIX@ +Requires.private: @DEPS@ diff --git a/ucw-xml/parse.c b/ucw-xml/parse.c new file mode 100644 index 00000000..3402b732 --- /dev/null +++ b/ucw-xml/parse.c @@ -0,0 +1,1287 @@ +/* + * UCW Library -- A simple XML parser + * + * (c) 2007--2008 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. 
+ */ + +#undef LOCAL_DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/*** Basic parsing ***/ + +void NONRET +xml_fatal_expected(struct xml_context *ctx, uint c) +{ + if (c >= 32 && c < 127) + xml_fatal(ctx, "Expected '%c'", c); + else + xml_fatal(ctx, "Expected U+%04x", c); +} + +void NONRET +xml_fatal_expected_white(struct xml_context *ctx) +{ + xml_fatal(ctx, "Expected a white space"); +} + +void NONRET +xml_fatal_expected_quot(struct xml_context *ctx) +{ + xml_fatal(ctx, "Expected a quotation mark"); +} + +void +xml_parse_eq(struct xml_context *ctx) +{ + /* Eq ::= S? '=' S? */ + xml_parse_white(ctx, 0); + xml_parse_char(ctx, '='); + xml_parse_white(ctx, 0); +} + +/*** Names and nmtokens ***/ + +static char * +xml_parse_string(struct xml_context *ctx, struct mempool *pool, uint first_cat, uint next_cat, char *err) +{ + char *p = mp_start_noalign(pool, 1); + if (unlikely(!(xml_peek_cat(ctx) & first_cat))) + xml_fatal(ctx, "%s", err); + do + { + p = mp_spread(pool, p, 5); + p = utf8_32_put(p, xml_skip_char(ctx)); + } + while (xml_peek_cat(ctx) & next_cat); + *p++ = 0; + return mp_end(pool, p); +} + +static void +xml_skip_string(struct xml_context *ctx, uint first_cat, uint next_cat, char *err) +{ + if (unlikely(!(xml_get_cat(ctx) & first_cat))) + xml_fatal(ctx, "%s", err); + while (xml_peek_cat(ctx) & next_cat) + xml_skip_char(ctx); +} + +char * +xml_parse_name(struct xml_context *ctx, struct mempool *pool) +{ + /* Name ::= NameStartChar (NameChar)* */ + return xml_parse_string(ctx, pool, ctx->cat_sname, ctx->cat_name, "Expected a name"); +} + +void +xml_skip_name(struct xml_context *ctx) +{ + xml_skip_string(ctx, ctx->cat_sname, ctx->cat_name, "Expected a name"); +} + +char * +xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool) +{ + /* Nmtoken ::= (NameChar)+ */ + return xml_parse_string(ctx, pool, ctx->cat_name, ctx->cat_name, "Expected a nmtoken"); +} + +/*** Simple literals ***/ + 
+char * +xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool) +{ + /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */ + char *p = mp_start_noalign(pool, 1); + uint q = xml_parse_quote(ctx), c; + while ((c = xml_get_char(ctx)) != q) + { + p = mp_spread(pool, p, 5); + p = utf8_32_put(p, c); + } + *p++ = 0; + return mp_end(pool, p); +} + +char * +xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool) +{ + /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */ + char *p = mp_start_noalign(pool, 1); + uint q = xml_parse_quote(ctx), c; + while ((c = xml_get_char(ctx)) != q) + { + if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID))) + xml_fatal(ctx, "Expected a pubid character"); + p = mp_spread(pool, p, 2); + *p++ = c; + } + *p++ = 0; + return mp_end(pool, p); +} + +/*** Comments ***/ + +void +xml_push_comment(struct xml_context *ctx) +{ + TRACE(ctx, "push_comment"); + /* Comment ::= '' + * Already parsed: 'type = XML_NODE_COMMENT; + char *p = mp_start_noalign(ctx->pool, 6); + while (1) + { + if (xml_get_char(ctx) == '-') + if (xml_get_char(ctx) == '-') + break; + else + *p++ = '-'; + p = utf8_32_put(p, xml_last_char(ctx)); + p = mp_spread(ctx->pool, p, 6); + } + xml_parse_char(ctx, '>'); + *p = 0; + n->len = p - (char *)mp_ptr(ctx->pool); + n->text = mp_end(ctx->pool, p + 1); + if ((ctx->flags & XML_REPORT_COMMENTS) && ctx->h_comment) + ctx->h_comment(ctx); +} + +void +xml_pop_comment(struct xml_context *ctx) +{ + xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_COMMENTS)); + xml_dec(ctx); + TRACE(ctx, "pop_comment"); +} + +void +xml_skip_comment(struct xml_context *ctx) +{ + TRACE(ctx, "skip_comment"); + xml_parse_char(ctx, '-'); + while (xml_get_char(ctx) != '-' || xml_get_char(ctx) != '-'); + xml_parse_char(ctx, '>'); + xml_dec(ctx); +} + +/*** Processing instructions ***/ + +void +xml_push_pi(struct xml_context *ctx) +{ + TRACE(ctx, "push_pi"); + /* Parses a PI to ctx->value and ctx->name: + * PI ::= '' Char*)))? 
'?>' + * PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) + * Already parsed: 'type = XML_NODE_PI; + n->name = xml_parse_name(ctx, ctx->pool); + if (unlikely(!strcasecmp(n->name, "xml"))) + xml_error(ctx, "Reserved PI target"); + char *p = mp_start_noalign(ctx->pool, 5); + if (!xml_parse_white(ctx, 0)) + xml_parse_seq(ctx, "?>"); + else + while (1) + { + if (xml_get_char(ctx) == '?') + if (xml_peek_char(ctx) == '>') + { + xml_skip_char(ctx); + break; + } + else + *p++ = '?'; + else + p = utf8_32_put(p, xml_last_char(ctx)); + p = mp_spread(ctx->pool, p, 5); + } + *p = 0; + n->len = p - (char *)mp_ptr(ctx->pool); + n->text = mp_end(ctx->pool, p + 1); + if ((ctx->flags & XML_REPORT_PIS) && ctx->h_pi) + ctx->h_pi(ctx); +} + +void +xml_pop_pi(struct xml_context *ctx) +{ + xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_PIS)); + xml_dec(ctx); + TRACE(ctx, "pop_pi"); +} + +void +xml_skip_pi(struct xml_context *ctx) +{ + TRACE(ctx, "skip_pi"); + if (ctx->flags & XML_VALIDATING) + { + struct mempool_state state; + mp_save(ctx->stack, &state); + if (unlikely(!strcasecmp(xml_parse_name(ctx, ctx->stack), "xml"))) + xml_error(ctx, "Reserved PI target"); + mp_restore(ctx->stack, &state); + if (!xml_parse_white(ctx, 0)) + { + xml_parse_seq(ctx, "?>"); + xml_dec(ctx); + return; + } + } + while (1) + if (xml_get_char(ctx) == '?') + if (xml_peek_char(ctx) == '>') + break; + xml_skip_char(ctx); + xml_dec(ctx); +} + +/*** Character references ***/ + +uint +xml_parse_char_ref(struct xml_context *ctx) +{ + TRACE(ctx, "parse_char_ref"); + /* CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' + * Already parsed: '&#' */ + uint v = 0; + if (xml_get_char(ctx) == 'x') + { + if (!(xml_get_cat(ctx) & XML_CHAR_XDIGIT)) + { + xml_error(ctx, "Expected a hexadecimal value of character reference"); + goto recover; + } + do + { + v = (v << 4) + Cxvalue(xml_last_char(ctx)); + } + while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_XDIGIT)); + } + else + { + if (!(xml_last_cat(ctx) & 
XML_CHAR_DIGIT)) + { + xml_error(ctx, "Expected a numeric value of character reference"); + goto recover; + } + do + { + v = v * 10 + xml_last_char(ctx) - '0'; + } + while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT)); + } + uint cat = xml_char_cat(v); + if (!(cat & ctx->cat_unrestricted)) + { + xml_error(ctx, "Character reference out of range"); + goto recover; + } + if (xml_last_char(ctx) == ';') + { + xml_dec(ctx); + return v; + } + xml_error(ctx, "Expected ';'"); +recover: + while (xml_last_char(ctx) != ';') + xml_get_char(ctx); + xml_dec(ctx); + return UNI_REPLACEMENT; +} + +/*** References to general entities ***/ + +static void +xml_parse_ref(struct xml_context *ctx) +{ + /* Reference ::= EntityRef | CharRef + * EntityRef ::= '&' Name ';' + * Already parsed: '&' */ + struct fastbuf *out = &ctx->chars; + if (xml_peek_char(ctx) == '#') + { + xml_skip_char(ctx); + bput_utf8_32(out, xml_parse_char_ref(ctx)); + } + else + { + TRACE(ctx, "parse_ge_ref"); + struct mempool_state state; + mp_save(ctx->stack, &state); + char *name = xml_parse_name(ctx, ctx->stack); + xml_parse_char(ctx, ';'); + struct xml_dtd_entity *ent = xml_dtd_find_entity(ctx, name); + if (!ent) + { + xml_error(ctx, "Unknown entity &%s;", name); + bputc(out, '&'); + bputs(out, name); + bputc(out, ';'); + } + else if (ent->flags & XML_DTD_ENTITY_TRIVIAL) + { + TRACE(ctx, "Trivial entity &%s;", name); + bputs(out, ent->text); + } + else + { + TRACE(ctx, "Pushed entity &%s;", name); + mp_restore(ctx->stack, &state); + xml_dec(ctx); + xml_push_entity(ctx, ent); + return; + } + mp_restore(ctx->stack, &state); + xml_dec(ctx); + } +} + +/*** Character data ***/ + +void +xml_spout_chars(struct fastbuf *fb) +{ + if (fb->bptr < fb->bufend) + return; + struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb); + struct mempool *pool = ctx->pool; + if (fb->bufend != fb->buffer) + { + TRACE(ctx, "growing chars"); + uint len = fb->bufend - fb->buffer; + uint reported = fb->bstop - fb->buffer; + 
fb->buffer = mp_expand(pool); + fb->bufend = fb->buffer + mp_avail(pool); + fb->bptr = fb->buffer + len; + fb->bstop = fb->buffer + reported; + } + else + { + TRACE(ctx, "starting chars"); + mp_save(pool, &ctx->chars_state); + fb->bptr = fb->buffer = fb->bstop = mp_start_noalign(pool, 2); + fb->bufend = fb->buffer + mp_avail(pool) - 1; + } +} + +static inline uint +xml_end_chars(struct xml_context *ctx, char **out) +{ + struct fastbuf *fb = &ctx->chars; + uint len = fb->bptr - fb->buffer; + if (len) + { + TRACE(ctx, "ending chars"); + *fb->bptr = 0; + *out = mp_end(ctx->pool, fb->bptr + 1); + fb->bufend = fb->bstop = fb->bptr = fb->buffer; + } + return len; +} + +static inline uint +xml_report_chars(struct xml_context *ctx, char **out) +{ + struct fastbuf *fb = &ctx->chars; + uint len = fb->bptr - fb->buffer; + if (len) + { + *fb->bptr = 0; + *out = fb->bstop; + fb->bstop = fb->bptr; + } + return len; +} + +static inline uint +xml_flush_chars(struct xml_context *ctx) +{ + char *text, *rtext; + uint len = xml_end_chars(ctx, &text), rlen; + if (len) + { + if (ctx->flags & XML_NO_CHARS) + { + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_ignorable) + ctx->h_ignorable(ctx, text, len); + mp_restore(ctx->pool, &ctx->chars_state); + return 0; + } + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext))) + ctx->h_block(ctx, rtext, rlen); + if (!(ctx->flags & XML_ALLOC_CHARS) && !(ctx->flags & XML_REPORT_CHARS)) + { + mp_restore(ctx->pool, &ctx->chars_state); + return 0; + } + struct xml_node *n = xml_push_dom(ctx, &ctx->chars_state); + n->type = XML_NODE_CHARS; + n->text = text; + n->len = len; + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_chars) + ctx->h_chars(ctx); + } + return len; +} + +static inline void +xml_pop_chars(struct xml_context *ctx) +{ + xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS)); + TRACE(ctx, "pop_chars"); +} + +static inline void +xml_append_chars(struct xml_context *ctx) +{ + TRACE(ctx, "append_chars"); + 
struct fastbuf *out = &ctx->chars; + if (ctx->flags & XML_NO_CHARS) + while (xml_get_char(ctx) != '<') + if (xml_last_cat(ctx) & XML_CHAR_WHITE) + bput_utf8_32(out, xml_last_char(ctx)); + else + { + xml_error(ctx, "This element must not contain character data"); + while (xml_get_char(ctx) != '<'); + break; + } + else + while (xml_get_char(ctx) != '<') + if (xml_last_char(ctx) == '&') + { + xml_inc(ctx); + xml_parse_ref(ctx); + } + else + bput_utf8_32(out, xml_last_char(ctx)); + xml_unget_char(ctx); +} + +/*** CDATA sections ***/ + +static void +xml_skip_cdata(struct xml_context *ctx) +{ + TRACE(ctx, "skip_cdata"); + xml_parse_seq(ctx, "CDATA["); + while (xml_get_char(ctx) != ']' || xml_get_char(ctx) != ']' || xml_get_char(ctx) != '>'); + xml_dec(ctx); +} + +static void +xml_append_cdata(struct xml_context *ctx) +{ + /* CDSect :== '' Char*)) ']]>' + * Already parsed: 'flags & XML_NO_CHARS) + { + xml_error(ctx, "This element must not contain CDATA"); + xml_skip_cdata(ctx); + return; + } + xml_parse_seq(ctx, "CDATA["); + struct fastbuf *out = &ctx->chars; + uint rlen; + char *rtext; + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext))) + ctx->h_block(ctx, rtext, rlen); + while (1) + { + if (xml_get_char(ctx) == ']') + { + if (xml_get_char(ctx) == ']') + if (xml_get_char(ctx) == '>') + break; + else + bputc(out, ']'); + bputc(out, ']'); + } + bput_utf8_32(out, xml_last_char(ctx)); + } + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_cdata && (rlen = xml_report_chars(ctx, &rtext))) + ctx->h_cdata(ctx, rtext, rlen); + xml_dec(ctx); +} + +/*** Attribute values ***/ + +char * +xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr UNUSED) +{ + TRACE(ctx, "parse_attr_value"); + /* AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'" */ + /* FIXME: -- check value constrains / normalize leading/trailing WS and repeated WS */ + struct mempool_state state; + uint quote = xml_parse_quote(ctx); + 
mp_save(ctx->stack, &state); + struct fastbuf *out = &ctx->chars; + struct xml_source *src = ctx->src; + while (1) + { + uint c = xml_get_char(ctx); + if (c == '&') + { + xml_inc(ctx); + xml_parse_ref(ctx); + } + else if (c == quote && src == ctx->src) + break; + else if (c == '<') + xml_error(ctx, "Attribute value must not contain '<'"); + else if (xml_last_cat(ctx) & XML_CHAR_WHITE) + bputc(out, ' '); + else + bput_utf8_32(out, c); + } + mp_restore(ctx->stack, &state); + char *text; + return xml_end_chars(ctx, &text) ? text : ""; +} + +uint +xml_normalize_white(struct xml_context *ctx UNUSED, char *text) +{ + char *s = text, *d = text; + while (*s == 0x20) + s++; + while (1) + { + while (*s & ~0x20) + *d++ = *s++; + if (!*s) + break; + while (*++s == 0x20); + *d++ = 0x20; + } + if (d != text && d[-1] == 0x20) + d--; + *d = 0; + return d - text; +} + +/*** Attributes ***/ + +struct xml_attrs_table; + +static inline uint +xml_attrs_hash(struct xml_attrs_table *t UNUSED, struct xml_node *e, char *n) +{ + return hash_pointer(e) ^ hash_string(n); +} + +static inline int +xml_attrs_eq(struct xml_attrs_table *t UNUSED, struct xml_node *e1, char *n1, struct xml_node *e2, char *n2) +{ + return (e1 == e2) && !strcmp(n1, n2); +} + +static inline void +xml_attrs_init_key(struct xml_attrs_table *t UNUSED, struct xml_attr *a, struct xml_node *e, char *name) +{ + a->elem = e; + a->name = name; + a->val = NULL; + a->user = NULL; + slist_add_tail(&e->attrs, &a->n); +} + +#define HASH_PREFIX(x) xml_attrs_##x +#define HASH_NODE struct xml_attr +#define HASH_KEY_COMPLEX(x) x elem, x name +#define HASH_KEY_DECL struct xml_node *elem, char *name +#define HASH_TABLE_DYNAMIC +#define HASH_GIVE_EQ +#define HASH_GIVE_HASHFN +#define HASH_GIVE_INIT_KEY +#define HASH_WANT_CLEANUP +#define HASH_WANT_REMOVE +#define HASH_WANT_LOOKUP +#define HASH_WANT_FIND +#define HASH_GIVE_ALLOC +XML_HASH_GIVE_ALLOC +#include + +static void +xml_parse_attr(struct xml_context *ctx) +{ + TRACE(ctx, 
"parse_attr"); + /* Attribute ::= Name Eq AttValue */ + struct xml_node *e = ctx->node; + char *n = xml_parse_name(ctx, ctx->pool); + struct xml_attr *a = xml_attrs_lookup(ctx->tab_attrs, e, n); + xml_parse_eq(ctx); + char *v = xml_parse_attr_value(ctx, NULL); + if (a->val) + { + xml_error(ctx, "Attribute %s is not unique in element <%s>", n, e->name); + return; + } + a->val = v; + if (!e->dtd) + a->dtd = NULL; + else if (!(a->dtd = xml_dtd_find_attr(ctx, e->dtd, a->name))) + xml_error(ctx, "Undefined attribute %s in element <%s>", n, e->name); + else + xml_validate_attr(ctx, a->dtd, a->val); +} + +struct xml_attr * +xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name) +{ + return xml_attrs_find(ctx->tab_attrs, node, name); +} + +char * +xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name) +{ + struct xml_attr *attr = xml_attrs_find(ctx->tab_attrs, node, name); + if (attr) + return attr->val; + if (!node->dtd) + return NULL; + struct xml_dtd_attr *dtd = xml_dtd_find_attr(ctx, node->dtd, name); + return dtd ? dtd->default_value : NULL; +} + +void +xml_attrs_table_init(struct xml_context *ctx) +{ + xml_attrs_init(ctx->tab_attrs = xml_hash_new(ctx->pool, sizeof(struct xml_attrs_table))); +} + +void +xml_attrs_table_cleanup(struct xml_context *ctx) +{ + xml_attrs_cleanup(ctx->tab_attrs); +} + +/*** Elements ***/ + +static uint +xml_validate_element(struct xml_dtd_elem_node *root, struct xml_dtd_elem *elem) +{ + if (root->elem) + return elem == root->elem; + else + SLIST_FOR_EACH(struct xml_dtd_elem_node *, son, root->sons) + if (xml_validate_element(son, elem)) + return 1; + return 0; +} + +static void +xml_push_element(struct xml_context *ctx) +{ + TRACE(ctx, "push_element"); + /* EmptyElemTag | STag + * EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' + * STag ::= '<' Name (S Attribute)* S? 
'>' + * Already parsed: '<' */ + struct xml_node *e = xml_push_dom(ctx, NULL); + clist_init(&e->sons); + e->type = XML_NODE_ELEM; + e->name = xml_parse_name(ctx, ctx->pool); + slist_init(&e->attrs); + if (!e->parent) + { + ctx->dom = e; + if (ctx->doctype && strcmp(e->name, ctx->doctype)) + xml_error(ctx, "The root element <%s> does not match the document type <%s>", e->name, ctx->doctype); + } + if (!ctx->dtd) + e->dtd = NULL; + else if (!(e->dtd = xml_dtd_find_elem(ctx, e->name))) + xml_error(ctx, "Undefined element <%s>", e->name); + else + { + struct xml_dtd_elem *dtd = e->dtd, *parent_dtd = e->parent ? e->parent->dtd : NULL; + if (dtd->type == XML_DTD_ELEM_MIXED) + ctx->flags &= ~XML_NO_CHARS; + else + ctx->flags |= XML_NO_CHARS; + if (parent_dtd) + if (parent_dtd->type == XML_DTD_ELEM_EMPTY) + xml_error(ctx, "Empty element must not contain children"); + else if (parent_dtd->type != XML_DTD_ELEM_ANY) + { + // FIXME: validate regular expressions + if (!xml_validate_element(parent_dtd->node, dtd)) + xml_error(ctx, "Unexpected element <%s>", e->name); + } + } + while (1) + { + uint white = xml_parse_white(ctx, 0); + uint c = xml_get_char(ctx); + if (c == '/') + { + xml_parse_char(ctx, '>'); + ctx->flags |= XML_EMPTY_ELEM_TAG; + break; + } + else if (c == '>') + break; + else if (!white) + xml_fatal_expected_white(ctx); + xml_unget_char(ctx); + xml_parse_attr(ctx); + } + if (e->dtd) + SLIST_FOR_EACH(struct xml_dtd_attr *, a, e->dtd->attrs) + if (a->default_mode == XML_ATTR_REQUIRED) + { + if (!xml_attrs_find(ctx->tab_attrs, e, a->name)) + xml_error(ctx, "Missing required attribute %s in element <%s>", a->name, e->name); + } + else if (a->default_mode != XML_ATTR_IMPLIED && ctx->flags & XML_ALLOC_DEFAULT_ATTRS) + { + struct xml_attr *attr = xml_attrs_lookup(ctx->tab_attrs, e, a->name); + if (!attr->val) + attr->val = a->default_value; + } + if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_stag) + ctx->h_stag(ctx); +} + +static void +xml_pop_element(struct xml_context 
*ctx) +{ + TRACE(ctx, "pop_element"); + if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_etag) + ctx->h_etag(ctx); + struct xml_node *e = ctx->node; + uint free = !(ctx->flags & XML_ALLOC_TAGS); + if (free) + { + if (!e->parent) + ctx->dom = NULL; + /* Restore hash table of attributes */ + SLIST_FOR_EACH(struct xml_attr *, a, e->attrs) + xml_attrs_remove(ctx->tab_attrs, a); + struct xml_node *n; + while (n = clist_head(&e->sons)) + { + if (n->type == XML_NODE_ELEM) + { + SLIST_FOR_EACH(struct xml_attr *, a, n->attrs) + xml_attrs_remove(ctx->tab_attrs, a); + clist_insert_list_after(&n->sons, &n->n); + } + clist_remove(&n->n); + } + } + xml_pop_dom(ctx, free); + xml_dec(ctx); +} + +static void +xml_parse_etag(struct xml_context *ctx) +{ + /* ETag ::= '</' Name S? '>' + * Already parsed: '<' */ + struct xml_node *e = ctx->node; + ASSERT(e); + char *n = e->name; + while (*n) + { + uint c; + n = utf8_32_get(n, &c); + if (xml_get_char(ctx) != c) + goto recover; + } + xml_parse_white(ctx, 0); + if (xml_get_char(ctx) != '>') + { +recover: + /* Report the end tag we expected, then resynchronize on the next '>' */ + xml_error(ctx, "Invalid ETag, expected </%s>", e->name); + while (xml_get_char(ctx) != '>'); + } + xml_dec(ctx); +} + +/*** Document type declaration ***/ + +static void +xml_parse_doctype_decl(struct xml_context *ctx) +{ + TRACE(ctx, "parse_doctype_decl"); + /* doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>' + * Already parsed: '<!' */ + if (ctx->doctype) + xml_fatal(ctx, "Multiple document types not allowed"); + xml_parse_seq(ctx, "DOCTYPE"); + xml_parse_white(ctx, 1); + ctx->doctype = xml_parse_name(ctx, ctx->pool); + TRACE(ctx, "doctype=%s", ctx->doctype); + uint c; + if (xml_parse_white(ctx, 0) && ((c = xml_peek_char(ctx)) == 'S' || c == 'P')) + { + if (c == 'S') + { + xml_parse_seq(ctx, "SYSTEM"); + xml_parse_white(ctx, 1); + ctx->system_id = xml_parse_system_literal(ctx, ctx->pool); + } + else + { + xml_parse_seq(ctx, "PUBLIC"); + xml_parse_white(ctx, 1); + ctx->public_id = xml_parse_pubid_literal(ctx, ctx->pool); + xml_parse_white(ctx, 1); + ctx->system_id =
xml_parse_system_literal(ctx, ctx->pool); + } + xml_parse_white(ctx, 0); + ctx->flags |= XML_HAS_EXTERNAL_SUBSET; + } + if (xml_peek_char(ctx) == '[') + { + ctx->flags |= XML_HAS_INTERNAL_SUBSET; + xml_skip_char(ctx); + xml_inc(ctx); + } + if (ctx->h_doctype_decl) + ctx->h_doctype_decl(ctx); +} + + + +/////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/* DTD: Internal subset */ + +static void +xml_parse_subset(struct xml_context *ctx, uint external) +{ + // FIXME: + // -- comments/pi have no parent + // -- conditional sections in external subset + // -- check corectness of parameter entities + + /* '[' intSubset ']' + * intSubset :== (markupdecl | DeclSep) + * Already parsed: '[' + * + * extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)* + */ + while (1) + { + xml_parse_white(ctx, 0); + uint c = xml_get_char(ctx); + xml_inc(ctx); + if (c == '<') + if ((c = xml_get_char(ctx)) == '!') + switch (c = xml_get_char(ctx)) + { + case '-': + xml_push_comment(ctx); + xml_pop_comment(ctx); + break; + case 'N': + xml_parse_seq(ctx, "OTATION"); + xml_parse_notation_decl(ctx); + break; + case 'E': + if ((c = xml_get_char(ctx)) == 'N') + { + xml_parse_seq(ctx, "TITY"); + xml_parse_entity_decl(ctx); + } + else if (c == 'L') + { + xml_parse_seq(ctx, "EMENT"); + xml_parse_element_decl(ctx); + } + else + goto invalid_markup; + break; + case 'A': + xml_parse_seq(ctx, "TTLIST"); + xml_parse_attr_list_decl(ctx); + break; + default: + goto invalid_markup; + } + else if (c == '?') + { + xml_push_pi(ctx); + xml_pop_pi(ctx); + } + else + goto invalid_markup; + else if (c == '%') + xml_parse_pe_ref(ctx); + else if (c == ']' && !external) + { + break; + } + else if (c == '>' && external) + { + break; + } + else + goto invalid_markup; + } + xml_dec(ctx); + return; +invalid_markup: ; + xml_fatal(ctx, "Invalid markup in the %s subset", external ? 
"external" : "internal"); +} + +/*** The State Machine ***/ + +uint +xml_next(struct xml_context *ctx) +{ + /* A nasty state machine */ + +#define PULL(x) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##x; case XML_STATE_##x: ; } while (0) +#define PULL_STATE(x, s) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##s, XML_STATE_##x; case XML_STATE_##s: ; } while (0) + + TRACE(ctx, "xml_next (state=%u)", ctx->state); + jmp_buf throw_buf; + ctx->throw_buf = &throw_buf; + if (setjmp(throw_buf)) + { +error: + if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal) + ctx->h_fatal(ctx); + TRACE(ctx, "raised fatal error"); + return ctx->state = XML_STATE_EOF; + } + uint c; + switch (ctx->state) + { + case XML_STATE_START: + TRACE(ctx, "entering prolog"); + ctx->flags |= XML_SRC_DOCUMENT | XML_SRC_EXPECTED_DECL; + if (ctx->h_document_start) + ctx->h_document_start(ctx); + /* XMLDecl */ + xml_refill(ctx); + if (ctx->h_xml_decl) + ctx->h_xml_decl(ctx); + PULL(XML_DECL); + + /* Misc* (doctypedecl Misc*)? 
*/ + while (1) + { + xml_parse_white(ctx, 0); + xml_parse_char(ctx, '<'); + xml_inc(ctx); + if ((c = xml_get_char(ctx)) == '?') + /* Processing instruction */ + if (!(ctx->flags & XML_REPORT_PIS)) + xml_skip_pi(ctx); + else + { + xml_push_pi(ctx); + PULL_STATE(PI, PROLOG_PI); + xml_pop_pi(ctx); + } + else if (c != '!') + { + /* Found the root tag */ + xml_unget_char(ctx); + goto first_tag; + } + else if (xml_get_char(ctx) == '-') + if (!(ctx->flags & XML_REPORT_COMMENTS)) + xml_skip_comment(ctx); + else + { + xml_push_comment(ctx); + PULL_STATE(COMMENT, PROLOG_COMMENT); + xml_pop_comment(ctx); + } + else + { + /* DocTypeDecl: '<!' is consumed; return the character read by the '-' test above */ + xml_unget_char(ctx); + xml_parse_doctype_decl(ctx); + PULL(DOCTYPE_DECL); + if (ctx->flags & XML_HAS_DTD) + if (ctx->flags & XML_PARSE_DTD) + { + xml_dtd_init(ctx); + if (ctx->h_dtd_start) + ctx->h_dtd_start(ctx); + if (ctx->flags & XML_HAS_INTERNAL_SUBSET) + { + xml_parse_subset(ctx, 0); + xml_dec(ctx); + } + if (ctx->flags & XML_HAS_EXTERNAL_SUBSET) + { + struct xml_dtd_entity ent = { + .system_id = ctx->system_id, + .public_id = ctx->public_id, + }; + xml_parse_white(ctx, 0); + xml_parse_char(ctx, '>'); + xml_unget_char(ctx); + ASSERT(ctx->h_resolve_entity); + ctx->h_resolve_entity(ctx, &ent); + ctx->flags |= XML_SRC_EXPECTED_DECL; + xml_parse_subset(ctx, 1); + xml_unget_char(ctx);; + } + if (ctx->h_dtd_end) + ctx->h_dtd_end(ctx); + } + else if (ctx->flags & XML_HAS_INTERNAL_SUBSET) + xml_skip_internal_subset(ctx); + xml_parse_white(ctx, 0); + xml_parse_char(ctx, '>'); + xml_dec(ctx); + } + } + + case XML_STATE_CHARS: + + while (1) + { + if (xml_peek_char(ctx) != '<') + { + /* CharData */ + xml_append_chars(ctx); + continue; + } + else + xml_skip_char(ctx); + xml_inc(ctx); +first_tag: + + if ((c = xml_get_char(ctx)) == '?') + { + /* PI */ + if (!(ctx->flags & (XML_REPORT_PIS | XML_ALLOC_PIS))) + xml_skip_pi(ctx); + else + { + if (xml_flush_chars(ctx)) + { + PULL_STATE(CHARS, CHARS_BEFORE_PI); + xml_pop_chars(ctx); + } + xml_push_pi(ctx);
+ PULL(PI); + xml_pop_pi(ctx); + } + } + + else if (c == '!') + if ((c = xml_get_char(ctx)) == '-') + { + /* Comment */ + if (!(ctx->flags & (XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS))) + xml_skip_comment(ctx); + else + { + if (xml_flush_chars(ctx)) + { + PULL_STATE(CHARS, CHARS_BEFORE_COMMENT); + xml_pop_chars(ctx); + } + xml_push_comment(ctx); + PULL(COMMENT); + xml_pop_comment(ctx); + } + } + else if (c == '[') + { + /* CDATA */ + xml_append_cdata(ctx); + } + else + xml_fatal(ctx, "Unexpected character after 'flags & XML_EMPTY_ELEM_TAG) + goto pop_element; + } + + else + { + /* ETag */ + if (xml_flush_chars(ctx)) + { + PULL_STATE(CHARS, CHARS_BEFORE_ETAG); + xml_pop_chars(ctx); + } + + xml_parse_etag(ctx); +pop_element: + PULL(ETAG); + xml_pop_element(ctx); + if (!ctx->node) + goto epilog; + } + } + +epilog: + /* Misc* */ + TRACE(ctx, "entering epilog"); + while (1) + { + /* Epilog whitespace is the only place, where a valid document can reach EOF */ + if (setjmp(throw_buf)) + if (ctx->err_code == XML_ERR_EOF) + { + TRACE(ctx, "reached EOF"); + ctx->state = XML_STATE_EOF; + if (ctx->h_document_end) + ctx->h_document_end(ctx); + case XML_STATE_EOF: + ctx->err_code = 0; + ctx->err_msg = NULL; + return XML_STATE_EOF; + } + else + goto error; + xml_parse_white(ctx, 0); + if (setjmp(throw_buf)) + goto error; + + /* Misc */ + xml_parse_char(ctx, '<'); + xml_inc(ctx); + if ((c = xml_get_char(ctx)) == '?') + /* Processing instruction */ + if (!(ctx->flags & XML_REPORT_PIS)) + xml_skip_pi(ctx); + else + { + xml_push_pi(ctx); + PULL_STATE(PI, EPILOG_PI); + xml_pop_pi(ctx); + } + else if (c == '!') + { + xml_parse_char(ctx, '-'); + /* Comment */ + if (!(ctx->flags & XML_REPORT_COMMENTS)) + xml_skip_comment(ctx); + else + { + xml_push_comment(ctx); + PULL_STATE(COMMENT, EPILOG_COMMENT); + xml_pop_comment(ctx); + } + } + else + xml_fatal(ctx, "Syntax error in the epilog"); + } + + } + ASSERT(0); +} + +uint +xml_next_state(struct xml_context *ctx, uint pull) +{ + uint saved = 
ctx->pull; + ctx->pull = pull; + uint res = xml_next(ctx); + ctx->pull = saved; + return res; +} + +uint +xml_skip_element(struct xml_context *ctx) +{ + ASSERT(ctx->state == XML_STATE_STAG); + struct xml_node *node = ctx->node; + uint saved = ctx->pull, res; + ctx->pull = XML_PULL_ETAG; + while ((res = xml_next(ctx)) && ctx->node != node); + ctx->pull = saved; + return res; +} + +uint +xml_parse(struct xml_context *ctx) +{ + /* This cycle should run only once unless the user overrides the value of ctx->pull in a SAX handler */ + do + { + ctx->pull = 0; + } + while (xml_next(ctx)); + return ctx->err_code; +} + +char * +xml_merge_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool) +{ + ASSERT(node->type == XML_NODE_ELEM); + char *p = mp_start_noalign(pool, 1); + XML_NODE_FOR_EACH(son, node) + if (son->type == XML_NODE_CHARS) + { + p = mp_spread(pool, p, son->len + 1); + memcpy(p, son->text, son->len); + p += son->len; + } + *p++ = 0; + return mp_end(pool, p); +} + +static char * +xml_append_dom_chars(char *p, struct mempool *pool, struct xml_node *node) +{ + XML_NODE_FOR_EACH(son, node) + if (son->type == XML_NODE_CHARS) + { + p = mp_spread(pool, p, son->len + 1); + memcpy(p, son->text, son->len); + p += son->len; + } + else if (son->type == XML_NODE_ELEM) + p = xml_append_dom_chars(p, pool, son); + return p; +} + +char * +xml_merge_dom_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool) +{ + ASSERT(node->type == XML_NODE_ELEM); + char *p = mp_start_noalign(pool, 1); + p = xml_append_dom_chars(p, pool, node); + *p++ = 0; + return mp_end(pool, p); +} diff --git a/ucw-xml/source.c b/ucw-xml/source.c new file mode 100644 index 00000000..5396c50d --- /dev/null +++ b/ucw-xml/source.c @@ -0,0 +1,486 @@ +/* + * UCW Library -- A simple XML parser + * + * (c) 2007--2008 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. 
+ */ + +#undef LOCAL_DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include + +/*** Character categorization ***/ + +#include "obj/ucw-xml/unicat.c" + +static void +xml_init_cats(struct xml_context *ctx) +{ + if (!(ctx->flags & XML_VERSION_1_1)) + { + ctx->cat_chars = XML_CHAR_VALID_1_0; + ctx->cat_unrestricted = XML_CHAR_VALID_1_0; + ctx->cat_new_line = XML_CHAR_NEW_LINE_1_0; + ctx->cat_name = XML_CHAR_NAME_1_0; + ctx->cat_sname = XML_CHAR_SNAME_1_0; + } + else + { + ctx->cat_chars = XML_CHAR_VALID_1_1; + ctx->cat_unrestricted = XML_CHAR_UNRESTRICTED_1_1; + ctx->cat_new_line = XML_CHAR_NEW_LINE_1_1; + ctx->cat_name = XML_CHAR_NAME_1_1; + ctx->cat_sname = XML_CHAR_SNAME_1_1; + } +} + +/*** Reading of document/external entities ***/ + +static void NONRET +xml_eof(struct xml_context *ctx) +{ + ctx->err_msg = "Unexpected EOF"; + ctx->err_code = XML_ERR_EOF; + xml_throw(ctx); +} + +void NONRET +xml_fatal_nested(struct xml_context *ctx) +{ + xml_fatal(ctx, "Entity is not nested correctly"); +} + +static inline void +xml_add_char(u32 **bstop, uint c) +{ + *(*bstop)++ = c; + *(*bstop)++ = xml_char_cat(c); +} + +struct xml_source * +xml_push_source(struct xml_context *ctx) +{ + xml_push(ctx); + struct xml_source *src = ctx->src; + if (src) + { + src->bptr = ctx->bptr; + src->bstop = ctx->bstop; + } + src = mp_alloc_zero(ctx->stack, sizeof(*src)); + src->next = ctx->src; + src->saved_depth = ctx->depth; + ctx->src = src; + ctx->flags &= ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_DOCUMENT); + ctx->bstop = ctx->bptr = src->buf; + ctx->depth = 0; + return src; +} + +struct xml_source * +xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb) +{ + struct xml_source *src = xml_push_source(ctx); + src->fb = fb; + return src; +} + +static void +xml_close_source(struct xml_source *src) +{ + bclose(src->fb); + if (src->wrapped_fb) + bclose(src->wrapped_fb); +} + +static void +xml_pop_source(struct xml_context *ctx) +{ + TRACE(ctx, 
"pop_source"); + if (unlikely(ctx->depth != 0)) + xml_fatal(ctx, "Unexpected end of entity"); + struct xml_source *src = ctx->src; + if (!src) + xml_fatal(ctx, "Undefined source"); + xml_close_source(src); + ctx->depth = src->saved_depth; + ctx->src = src = src->next; + if (src) + { + ctx->bptr = src->bptr; + ctx->bstop = src->bstop; + } + xml_pop(ctx); + if (unlikely(!src)) + xml_eof(ctx); +} + +void +xml_sources_cleanup(struct xml_context *ctx) +{ + struct xml_source *s; + while (s = ctx->src) + { + ctx->src = s->next; + xml_close_source(s); + } +} + +static void xml_refill_utf8(struct xml_context *ctx); + +void +xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent UNUSED) +{ + xml_error(ctx, "References to external entities are not supported"); +} + +void +xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent) +{ + TRACE(ctx, "xml_push_entity"); + struct xml_source *src; + if (ent->flags & XML_DTD_ENTITY_EXTERNAL) + { + ASSERT(ctx->h_resolve_entity); + ctx->h_resolve_entity(ctx, ent); + ctx->flags |= XML_SRC_EXPECTED_DECL; + src = ctx->src; + } + else + { + src = xml_push_source(ctx); + fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, strlen(ent->text), 0); + } + src->refill = xml_refill_utf8; + src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line; + src->refill_cat2 = ctx->cat_new_line; +} + +static uint +xml_error_restricted(struct xml_context *ctx, uint c) +{ + if (c == ~1U) + xml_error(ctx, "Corrupted encoding"); + else + xml_error(ctx, "Restricted char U+%04X", c); + return UNI_REPLACEMENT; +} + +static void xml_parse_decl(struct xml_context *ctx); + +#define REFILL(ctx, func, params...) \ + struct xml_source *src = ctx->src; \ + struct fastbuf *fb = src->fb; \ + if (ctx->bptr == ctx->bstop) \ + ctx->bptr = ctx->bstop = src->buf; \ + uint c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \ + u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \ + *last_0xd = src->pending_0xd ? 
bstop : NULL; \ + do \ + { \ + c = func(fb, ##params); \ + uint t = xml_char_cat(c); \ + if (t & t1) \ + /* Typical branch */ \ + *bstop++ = c, *bstop++ = t; \ + else if (t & t2) \ + { \ + /* New line */ \ + /* XML 1.0: 0xA | 0xD | 0xD 0xA */ \ + /* XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ \ + if (c == 0xd) \ + last_0xd = bstop + 2; \ + else if (c != 0x2028 && last_0xd == bstop) \ + { \ + last_0xd = NULL; \ + continue; \ + } \ + xml_add_char(&bstop, 0xa), row++; \ + } \ + else if (c == '>') \ + { \ + /* Used only in XML/TextDecl to switch the encoding */ \ + *bstop++ = c, *bstop++ = t; \ + break; \ + } \ + else if (~c) \ + /* Restricted character */ \ + xml_add_char(&bstop, xml_error_restricted(ctx, c)); \ + else \ + { \ + /* EOF */ \ + ctx->flags |= XML_SRC_EOF; \ + break; \ + } \ + } \ + while (bstop < bend); \ + src->pending_0xd = (last_0xd == bstop); \ + ctx->bstop = bstop; \ + src->row = row; + +static void +xml_refill_utf8(struct xml_context *ctx) +{ + REFILL(ctx, bget_utf8_repl, ~1U); +} + +static void +xml_refill_utf16_le(struct xml_context *ctx) +{ + REFILL(ctx, bget_utf16_le_repl, ~1U); +} + +static void +xml_refill_utf16_be(struct xml_context *ctx) +{ + REFILL(ctx, bget_utf16_be_repl, ~1U); +} + +#undef REFILL + +void +xml_refill(struct xml_context *ctx) +{ + do + { + if (ctx->flags & XML_SRC_EOF) + xml_pop_source(ctx); + else if (ctx->flags & XML_SRC_EXPECTED_DECL) + xml_parse_decl(ctx); + else + { + ctx->src->refill(ctx); + TRACE(ctx, "refilled %u characters", (uint)((ctx->bstop - ctx->bptr) / 2)); + } + } + while (ctx->bptr == ctx->bstop); +} + +static uint +xml_source_row(struct xml_context *ctx, struct xml_source *src) +{ + uint row = src->row; + for (u32 *p = ctx->bstop; p != ctx->bptr; p -= 2) + if (p[-1] & src->refill_cat2) + row--; + return row + 1; +} + +uint +xml_row(struct xml_context *ctx) +{ + return ctx->src ? 
xml_source_row(ctx, ctx->src) : 0; +} + +/* Document/external entity header */ + +static char * +xml_parse_encoding_name(struct xml_context *ctx) +{ + /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" */ + char *p = mp_start_noalign(ctx->pool, 1); + uint q = xml_parse_quote(ctx); + if (unlikely(!(xml_get_cat(ctx) & XML_CHAR_ENC_SNAME))) + xml_fatal(ctx, "Invalid character in the encoding name"); + while (1) + { + p = mp_spread(ctx->pool, p, 2); + *p++ = xml_last_char(ctx); + if (xml_get_char(ctx) == q) + break; + if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME))) + xml_fatal(ctx, "Invalid character in the encoding name"); + } + *p++ = 0; + return mp_end(ctx->pool, p); +} + +static void +xml_init_charconv(struct xml_context *ctx, int cs) +{ + // XXX: with a direct access to libucw-charset tables could be faster + struct xml_source *src = ctx->src; + TRACE(ctx, "wrapping charset %s", charset_name(cs)); + src->wrapped_fb = src->fb; + src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8); +} + +static void +xml_parse_decl(struct xml_context *ctx) +{ + TRACE(ctx, "xml_parse_decl"); + struct xml_source *src = ctx->src; + ctx->flags &= ~XML_SRC_EXPECTED_DECL; + uint doc = ctx->flags & XML_SRC_DOCUMENT; + + /* Setup valid Unicode ranges and force the reader to abort refill() after each '>', where we can switch encoding or XML version */ + if (doc) + xml_init_cats(ctx); + src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line & ~XML_CHAR_GT; + src->refill_cat2 = ctx->cat_new_line; + + /* Initialize the supplied charset (if any) or try to guess it */ + char *expected_encoding = src->expected_encoding; + src->refill = xml_refill_utf8; + int bom = bpeekc(src->fb); + if (bom < 0) + ctx->flags |= XML_SRC_EOF; + if (!src->fb_encoding) + { + if (bom == 0xfe) + src->refill = xml_refill_utf16_be; + else if (bom == 0xff) + src->refill = xml_refill_utf16_le; + } + else + { + int cs = 
find_charset_by_name(src->fb_encoding); + if (cs == CONV_CHARSET_UTF8) + {} + else if (cs >= 0) + { + xml_init_charconv(ctx, cs); + bom = 0; + } + /* strcasecmp() returns 0 on a match, so every name test below is negated */ + else if (!strcasecmp(src->fb_encoding, "UTF-16")) + { + /* Endianness not stated: default to big-endian unless the first byte is 0xff */ + src->refill = xml_refill_utf16_be; + if (bom == 0xff) + src->refill = xml_refill_utf16_le; + } + else if (!strcasecmp(src->fb_encoding, "UTF-16BE")) + src->refill = xml_refill_utf16_be; + else if (!strcasecmp(src->fb_encoding, "UTF-16LE")) + src->refill = xml_refill_utf16_le; + else + { + xml_error(ctx, "Unknown encoding '%s'", src->fb_encoding); + expected_encoding = NULL; + } + } + uint utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be; + if (utf16) + src->fb_encoding = (src->refill == xml_refill_utf16_be) ? "UTF-16BE" : "UTF-16LE"; + if (!expected_encoding) + expected_encoding = src->fb_encoding; + if (bom > 0 && xml_peek_char(ctx) == 0xfeff) + xml_skip_char(ctx); + else if (utf16) + xml_error(ctx, "Missing or corrupted BOM"); + TRACE(ctx, "Initial encoding=%s", src->fb_encoding ? : "?"); + + /* Look ahead for presence of XMLDecl or optional TextDecl */ + if (!(ctx->flags & XML_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf)) + xml_refill(ctx); + u32 *bptr = ctx->bptr; + uint have_decl = (12 <= ctx->bstop - ctx->bptr && (bptr[11] & XML_CHAR_WHITE) && + bptr[0] == '<' && bptr[2] == '?'
&& (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L'); + if (!have_decl) + { + if (doc) + xml_fatal(ctx, "Missing or corrupted XML header"); + /* Compare the pointer that was NULL-checked: src->expected_encoding may be NULL when the local was filled from fb_encoding */ + else if (expected_encoding && strcasecmp(expected_encoding, "UTF-8") && !utf16) + xml_error(ctx, "Missing or corrupted entity header"); + goto exit; + } + ctx->bptr = bptr + 12; + xml_parse_white(ctx, 0); + + /* Parse version string (mandatory in XMLDecl, optional in TextDecl) */ + if (xml_peek_char(ctx) == 'v') + { + xml_parse_seq(ctx, "version"); + xml_parse_eq(ctx); + char *version = xml_parse_pubid_literal(ctx, ctx->pool); + TRACE(ctx, "version=%s", version); + uint v = 0; + if (!strcmp(version, "1.1")) + v = XML_VERSION_1_1; + else if (strcmp(version, "1.0")) + { + xml_error(ctx, "Unknown XML version string '%s'", version); + version = "1.0"; + } + if (doc) + { + ctx->version_str = version; + ctx->flags |= v; + } + else if (v > (ctx->flags & XML_VERSION_1_1)) + xml_error(ctx, "XML 1.1 external entity included from XML 1.0 document"); + if (!xml_parse_white(ctx, !doc)) + goto end; + } + else if (doc) + { + xml_error(ctx, "Expected XML version"); + ctx->version_str = "1.0"; + } + + /* Parse encoding string (optional in XMLDecl, mandatory in TextDecl) */ + if (xml_peek_char(ctx) == 'e') + { + xml_parse_seq(ctx, "encoding"); + xml_parse_eq(ctx); + src->decl_encoding = xml_parse_encoding_name(ctx); + TRACE(ctx, "encoding=%s", src->decl_encoding); + if (!xml_parse_white(ctx, 0)) + goto end; + } + else if (!doc) + xml_error(ctx, "Expected XML encoding"); + + /* Parse whether the document is standalone (optional in XMLDecl) */ + if (doc && xml_peek_char(ctx) == 's') + { + xml_parse_seq(ctx, "standalone"); + xml_parse_eq(ctx); + uint c = xml_parse_quote(ctx); + if (ctx->standalone = (xml_peek_char(ctx) == 'y')) + xml_parse_seq(ctx, "yes"); + else + xml_parse_seq(ctx, "no"); + xml_parse_char(ctx, c); + TRACE(ctx, "standalone=%d", ctx->standalone); + xml_parse_white(ctx, 0); + } +end: +
xml_parse_seq(ctx, "?>"); + + /* Switch to the final encoding */ + if (src->decl_encoding) + { + int cs = find_charset_by_name(src->decl_encoding); + if (cs < 0 && !expected_encoding) + xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding); + else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8) + { + xml_init_charconv(ctx, cs); + src->fb_encoding = src->decl_encoding; + } + else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 || + !(!strcasecmp(src->decl_encoding, "UTF-16") || + (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) || + (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE"))))) + xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding); + } + if (!src->fb_encoding) + src->fb_encoding = "UTF-8"; + TRACE(ctx, "Final encoding=%s", src->fb_encoding); + +exit: + /* Update valid Unicode ranges */ + if (doc) + xml_init_cats(ctx); + src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line; + src->refill_cat2 = ctx->cat_new_line; +} diff --git a/ucw-xml/unicat.pl b/ucw-xml/unicat.pl new file mode 100755 index 00000000..c1bc442b --- /dev/null +++ b/ucw-xml/unicat.pl @@ -0,0 +1,165 @@ +#!/usr/bin/perl +# +# UCW Library -- Character map for the XML parser +# +# (c) 2007 Pavel Charvat +# +# This software may be freely distributed and used according to the terms +# of the GNU Lesser General Public License. 
+# + +my @cat = (); +my @lcat = (); +my %ids = (); +my %cls = (); +for (my $i = 0; $i < 0x10000; $i++) { $cat[$i] = 0; } +for (my $i = 0; $i < 0x11; $i++) { $lcat[$i] = 0; } + +my @white = (0x9, 0xA, 0xD, 0x20); +my @base_char_1_0 = ( + [0x0041,0x005A], [0x0061,0x007A], [0x00C0,0x00D6], [0x00D8,0x00F6], [0x00F8,0x00FF], [0x0100,0x0131], + [0x0134,0x013E], [0x0141,0x0148], [0x014A,0x017E], [0x0180,0x01C3], [0x01CD,0x01F0], [0x01F4,0x01F5], + [0x01FA,0x0217], [0x0250,0x02A8], [0x02BB,0x02C1], 0x0386, [0x0388,0x038A], 0x038C, [0x038E,0x03A1], + [0x03A3,0x03CE], [0x03D0,0x03D6], 0x03DA, 0x03DC, 0x03DE, 0x03E0, [0x03E2,0x03F3], [0x0401,0x040C], + [0x040E,0x044F], [0x0451,0x045C], [0x045E,0x0481], [0x0490,0x04C4], [0x04C7,0x04C8], [0x04CB,0x04CC], + [0x04D0,0x04EB], [0x04EE,0x04F5], [0x04F8,0x04F9], [0x0531,0x0556], 0x0559, [0x0561,0x0586], [0x05D0,0x05EA], + [0x05F0,0x05F2], [0x0621,0x063A], [0x0641,0x064A], [0x0671,0x06B7], [0x06BA,0x06BE], [0x06C0,0x06CE], + [0x06D0,0x06D3], 0x06D5, [0x06E5,0x06E6], [0x0905,0x0939], 0x093D, [0x0958,0x0961], [0x0985,0x098C], + [0x098F,0x0990], [0x0993,0x09A8], [0x09AA,0x09B0], 0x09B2, [0x09B6,0x09B9], [0x09DC,0x09DD], [0x09DF,0x09E1], + [0x09F0,0x09F1], [0x0A05,0x0A0A], [0x0A0F,0x0A10], [0x0A13,0x0A28], [0x0A2A,0x0A30], [0x0A32,0x0A33], + [0x0A35,0x0A36], [0x0A38,0x0A39], [0x0A59,0x0A5C], 0x0A5E, [0x0A72,0x0A74], [0x0A85,0x0A8B], 0x0A8D, + [0x0A8F,0x0A91], [0x0A93,0x0AA8], [0x0AAA,0x0AB0], [0x0AB2,0x0AB3], [0x0AB5,0x0AB9], 0x0ABD, 0x0AE0, + [0x0B05,0x0B0C], [0x0B0F,0x0B10], [0x0B13,0x0B28], [0x0B2A,0x0B30], [0x0B32,0x0B33], [0x0B36,0x0B39], + 0x0B3D, [0x0B5C,0x0B5D], [0x0B5F,0x0B61], [0x0B85,0x0B8A], [0x0B8E,0x0B90], [0x0B92,0x0B95], [0x0B99,0x0B9A], + 0x0B9C, [0x0B9E,0x0B9F], [0x0BA3,0x0BA4], [0x0BA8,0x0BAA], [0x0BAE,0x0BB5], [0x0BB7,0x0BB9], [0x0C05,0x0C0C], + [0x0C0E,0x0C10], [0x0C12,0x0C28], [0x0C2A,0x0C33], [0x0C35,0x0C39], [0x0C60,0x0C61], [0x0C85,0x0C8C], + [0x0C8E,0x0C90], [0x0C92,0x0CA8], [0x0CAA,0x0CB3], [0x0CB5,0x0CB9], 
0x0CDE, [0x0CE0,0x0CE1], [0x0D05,0x0D0C], + [0x0D0E,0x0D10], [0x0D12,0x0D28], [0x0D2A,0x0D39], [0x0D60,0x0D61], [0x0E01,0x0E2E], 0x0E30, [0x0E32,0x0E33], + [0x0E40,0x0E45], [0x0E81,0x0E82], 0x0E84, [0x0E87,0x0E88], 0x0E8A, 0x0E8D, [0x0E94,0x0E97], [0x0E99,0x0E9F], + [0x0EA1,0x0EA3], 0x0EA5, 0x0EA7, [0x0EAA,0x0EAB], [0x0EAD,0x0EAE], 0x0EB0, [0x0EB2,0x0EB3], 0x0EBD, + [0x0EC0,0x0EC4], [0x0F40,0x0F47], [0x0F49,0x0F69], [0x10A0,0x10C5], [0x10D0,0x10F6], 0x1100, [0x1102,0x1103], + [0x1105,0x1107], 0x1109, [0x110B,0x110C], [0x110E,0x1112], 0x113C, 0x113E, 0x1140, 0x114C, 0x114E, 0x1150, + [0x1154,0x1155], 0x1159, [0x115F,0x1161], 0x1163, 0x1165, 0x1167, 0x1169, [0x116D,0x116E], [0x1172,0x1173], + 0x1175, 0x119E, 0x11A8, 0x11AB, [0x11AE,0x11AF], [0x11B7,0x11B8], 0x11BA, [0x11BC,0x11C2], 0x11EB, 0x11F0, + 0x11F9, [0x1E00,0x1E9B], [0x1EA0,0x1EF9], [0x1F00,0x1F15], [0x1F18,0x1F1D], [0x1F20,0x1F45], [0x1F48,0x1F4D], + [0x1F50,0x1F57], 0x1F59, 0x1F5B, 0x1F5D, [0x1F5F,0x1F7D], [0x1F80,0x1FB4], [0x1FB6,0x1FBC], 0x1FBE, + [0x1FC2,0x1FC4], [0x1FC6,0x1FCC], [0x1FD0,0x1FD3], [0x1FD6,0x1FDB], [0x1FE0,0x1FEC], [0x1FF2,0x1FF4], + [0x1FF6,0x1FFC], 0x2126, [0x212A,0x212B], 0x212E, [0x2180,0x2182], [0x3041,0x3094], [0x30A1,0x30FA], + [0x3105,0x312C], [0xAC00,0xD7A3]); +my @ideographic_1_0 = ([0x4E00,0x9FA5], 0x3007, [0x3021,0x3029]); +my @combining_char_1_0 = ( + [0x0300,0x0345], [0x0360,0x0361], [0x0483,0x0486], [0x0591,0x05A1], [0x05A3,0x05B9], [0x05BB,0x05BD], + 0x05BF, [0x05C1,0x05C2], 0x05C4, [0x064B,0x0652], 0x0670, [0x06D6,0x06DC], [0x06DD,0x06DF], [0x06E0,0x06E4], + [0x06E7,0x06E8], [0x06EA,0x06ED], [0x0901,0x0903], 0x093C, [0x093E,0x094C], 0x094D, [0x0951,0x0954], + [0x0962,0x0963], [0x0981,0x0983], 0x09BC, 0x09BE, 0x09BF, [0x09C0,0x09C4], [0x09C7,0x09C8], [0x09CB,0x09CD], + 0x09D7, [0x09E2,0x09E3], 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, [0x0A40,0x0A42], [0x0A47,0x0A48], [0x0A4B,0x0A4D], + [0x0A70,0x0A71], [0x0A81,0x0A83], 0x0ABC, [0x0ABE,0x0AC5], [0x0AC7,0x0AC9], [0x0ACB,0x0ACD], 
[0x0B01,0x0B03], + 0x0B3C, [0x0B3E,0x0B43], [0x0B47,0x0B48], [0x0B4B,0x0B4D], [0x0B56,0x0B57], [0x0B82,0x0B83], [0x0BBE,0x0BC2], + [0x0BC6,0x0BC8], [0x0BCA,0x0BCD], 0x0BD7, [0x0C01,0x0C03], [0x0C3E,0x0C44], [0x0C46,0x0C48], [0x0C4A,0x0C4D], + [0x0C55,0x0C56], [0x0C82,0x0C83], [0x0CBE,0x0CC4], [0x0CC6,0x0CC8], [0x0CCA,0x0CCD], [0x0CD5,0x0CD6], + [0x0D02,0x0D03], [0x0D3E,0x0D43], [0x0D46,0x0D48], [0x0D4A,0x0D4D], 0x0D57, 0x0E31, [0x0E34,0x0E3A], + [0x0E47,0x0E4E], 0x0EB1, [0x0EB4,0x0EB9], [0x0EBB,0x0EBC], [0x0EC8,0x0ECD], [0x0F18,0x0F19], 0x0F35, + 0x0F37, 0x0F39, 0x0F3E, 0x0F3F, [0x0F71,0x0F84], [0x0F86,0x0F8B], [0x0F90,0x0F95], 0x0F97, [0x0F99,0x0FAD], + [0x0FB1,0x0FB7], 0x0FB9, [0x20D0,0x20DC], 0x20E1, [0x302A,0x302F], 0x3099, 0x309A); +my @digit_1_0 = ( + [0x0030,0x0039], [0x0660,0x0669], [0x06F0,0x06F9], [0x0966,0x096F], [0x09E6,0x09EF], [0x0A66,0x0A6F], + [0x0AE6,0x0AEF], [0x0B66,0x0B6F], [0x0BE7,0x0BEF], [0x0C66,0x0C6F], [0x0CE6,0x0CEF], [0x0D66,0x0D6F], + [0x0E50,0x0E59], [0x0ED0,0x0ED9], [0x0F20,0x0F29]); +my @extender_1_0 = ( + 0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005, [0x3031,0x3035], [0x309D,0x309E], [0x30FC,0x30FE]); +my @sname_1_1 = ( + "[:A-Z_a-z]", [0xC0,0xD6], [0xD8,0xF6], [0xF8,0x2FF], [0x370,0x37D], [0x37F,0x1FFF], + [0x200C,0x200D], [0x2070,0x218F], [0x2C00,0x2FEF], [0x3001,0xD7FF], [0xF900,0xFDCF], [0xFDF0,0xFFFD], [0x10000,0xEFFFF]); + +set("WHITE", @white); +set("NEW_LINE_1_0", 0xA, 0xD); +set("NEW_LINE_1_1", 0xA, 0xD, 0x85, 0x2028); +set("DIGIT", "[0-9]"); +set("XDIGIT", "[0-9a-fA-F]"); +set("VALID_1_0", @white, [0x20,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); +set("VALID_1_1", [0x1,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); +set("UNRESTRICTED_1_1", @white, [0x20,0x7E], 0x85, [0xA0,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); +set("PUBID", 0x20, 0xD, 0xA, "[-a-zA-Z0-9'()+,./:=?:!*#@\$_%]"); +set("ENC_SNAME", "[a-zA-Z]"); +set("ENC_NAME", "[-a-zA-Z0-9._]"); +set("SNAME_1_0", "[_:]", @base_char_1_0, 
@ideographic_1_0); +set("NAME_1_0", "[-_:.]", @base_char_1_0, @ideographic_1_0, @combining_char_1_0, @digit_1_0, @extender_1_0); +set("SNAME_1_1", @sname_1_1); +set("NAME_1_1", @sname_1_1, "[-.0-9]", 0xB7, [0x0300,0x036F], [0x203F,0x2040]); +set("GT", "[>]"); + +($ARGV[0] eq "" || $ARGV[1] eq "") && die("Invalid usage"); +find_cls(); +open(H, ">", $ARGV[0]) or die("Cannot create $ARGV[0]"); +open(C, ">", $ARGV[1]) or die("Cannot create $ARGV[1]"); +gen_enum(); +gen_tabs(); +close(H); +close(C); + +sub set { + my $id = shift; + $ids{$id} = scalar keys(%ids) if !defined($ids{$id}); + my $mask = 1 << $ids{$id}; + foreach my $i (@_) { + if (ref($i) eq "ARRAY") { + my $j = $i->[0]; + for (; $j <= $i->[1] && $j < 0x10000; $j++) { $cat[$j] |= $mask; } + for (; $j <= $i->[1]; $j += 0x10000) { $lcat[$j >> 16] |= $mask; } + } + elsif ($i =~ /^\[/) { for (my $j=0; $j < 128; $j++) { if (chr($j) =~ /$i/) { $cat[$j] |= $mask; } } } + else { $cat[$i] |= $mask; } + } +} + +sub find_cls { + foreach (my $i=0; $i<@cat; $i++) { $cls{$cat[$i]} = scalar keys(%cls) if !defined($cls{$cat[$i]}); } + foreach (my $i=0; $i<@lcat; $i++) { $cls{$lcat[$i]} = scalar keys(%cls) if !defined($cls{$lcat[$i]}); } +} + +sub gen_enum { + print H "enum xml_char_type {\n"; + foreach my $id (sort keys %ids) { + my $mask = 0; + foreach my $i (keys %cls) { + $mask |= 1 << $cls{$i} if $cls{$i} && ($i & (1 << $ids{$id})); + } + printf H " XML_CHAR_%-20s = 0x%08x,\n", $id, $mask; + } + print H "};\n\n"; +} + +sub gen_tabs { + my @tab = (); + my %hash = (); + + print H "extern const byte ucw_xml_char_tab1[];\n"; + print H "extern const uint ucw_xml_char_tab2[];\n"; + print H "extern const byte ucw_xml_char_tab3[];\n"; + + print C "const uint ucw_xml_char_tab2[] = {\n "; + for (my $t=0; $t<256; $t++) { + my $i = $t * 256; + my @x = (); + for (my $j=0; $j<256; $j += 32) { + push @x, join(",", map($cls{$_}, @cat[$i+$j..$i+$j+31])); + } + my $sub = " " . 
join(",\n ", @x); + if (!defined($hash{$sub})) { + $hash{$sub} = 256 * scalar @tab; + push @tab, $sub; + } + printf C "0x%x", $hash{$sub}; + print C ((~$t & 15) ? "," : ($t < 255) ? ",\n " : "\n};\n\n"); + } + + print C "const byte ucw_xml_char_tab1[] = {\n"; + print C join(",\n\n", @tab); + print C "\n};\n\n"; + + my @l = (); + for (my $i=0; $i<0x11; $i++) { + push @l, sprintf("%d", $cls{$lcat[$i]}); + } + print C "const byte ucw_xml_char_tab3[] = {" . join(",", @l) . "};\n"; +} diff --git a/ucw-xml/xml-test.c b/ucw-xml/xml-test.c new file mode 100644 index 00000000..758b0373 --- /dev/null +++ b/ucw-xml/xml-test.c @@ -0,0 +1,365 @@ +/* + * UCW Library -- A simple XML parser + * + * (c) 2007--2008 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +enum { + WANT_FIRST = 0x100, + WANT_HIDE_ERRORS, + WANT_IGNORE_COMMENTS, + WANT_IGNORE_PIS, + WANT_REPORT_BLOCKS, + WANT_REPORT_IGNORABLE, + WANT_FILE_ENTITIES, +}; + +static char *shortopts = "spdt" CF_SHORT_OPTS; +static struct option longopts[] = { + CF_LONG_OPTS + { "sax", 0, 0, 's' }, + { "pull", 0, 0, 'p' }, + { "dom", 0, 0, 't' }, + { "dtd", 0, 0, 'd' }, + { "hide-errors", 0, 0, WANT_HIDE_ERRORS }, + { "ignore-comments", 0, 0, WANT_IGNORE_COMMENTS }, + { "ignore-pis", 0, 0, WANT_IGNORE_PIS }, + { "report-blocks", 0, 0, WANT_REPORT_BLOCKS }, + { "report-ignorable", 0, 0, WANT_REPORT_IGNORABLE }, + { "file-entities", 0, 0, WANT_FILE_ENTITIES }, + { NULL, 0, 0, 0 } +}; + +static void NONRET +usage(void) +{ + fputs("\ +Usage: xml-test [options] < input.xml\n\ +\n\ +Options:\n" +CF_USAGE +"\ +-p, --pull Test PULL interface\n\ +-s, --sax Test SAX interface\n\ +-t, --dom Test DOM interface\n\ +-d, --dtd Enable parsing of DTD\n\ + --hide-errors Hide warnings and error messages\n\ + --ignore-comments Ignore comments\n\ + --ignore-pis Ignore 
processing instructions\n\ + --report-blocks Report blocks or characters and CDATA sections\n\ + --report-ignorable Report ignorable whitespace\n\ + --file-entities Resolve file external entities (not fully normative)\n\ +\n", stderr); + exit(1); +} + +static uint want_sax; +static uint want_pull; +static uint want_dom; +static uint want_parse_dtd; +static uint want_hide_errors; +static uint want_ignore_comments; +static uint want_ignore_pis; +static uint want_report_blocks; +static uint want_report_ignorable; +static uint want_file_entities; + +static struct fastbuf *out; + +static char * +node_type(struct xml_node *node) +{ + switch (node->type) + { + case XML_NODE_ELEM: return "element"; + case XML_NODE_COMMENT: return "comment"; + case XML_NODE_PI: return "pi"; + case XML_NODE_CHARS: return "chars"; + default: return "unknown"; + } +} + +static void +show_node(struct xml_node *node) +{ + switch (node->type) + { + case XML_NODE_ELEM: + bprintf(out, " <%s>", node->name); + XML_ATTR_FOR_EACH(a, node) + bprintf(out, " %s='%s'", a->name, a->val); + bputc(out, '\n'); + break; + case XML_NODE_COMMENT: + bprintf(out, " text='%s'\n", node->text); + break; + case XML_NODE_PI: + bprintf(out, " target=%s text='%s'\n", node->name, node->text); + break; + case XML_NODE_CHARS: + bprintf(out, " text='%s'\n", node->text); + break; + default: + bputc(out, '\n'); + } +} + +static void +show_tree(struct xml_node *node, uint level) +{ + if (!node) + return; + bputs(out, "DOM: "); + for (uint i = 0; i < level; i++) + bputs(out, " "); + bputs(out, node_type(node)); + show_node(node); + if (node->type == XML_NODE_ELEM) + XML_NODE_FOR_EACH(son, node) + show_tree(son, level + 1); +} + +static void +h_error(struct xml_context *ctx) +{ + bprintf(out, "SAX: %s at %u: %s\n", (ctx->err_code < XML_ERR_ERROR) ? 
"warn" : "error", xml_row(ctx), ctx->err_msg); +} + +static void +h_document_start(struct xml_context *ctx UNUSED) +{ + bputs(out, "SAX: document_start\n"); +} + +static void +h_document_end(struct xml_context *ctx UNUSED) +{ + bputs(out, "SAX: document_end\n"); +} + +static void +h_xml_decl(struct xml_context *ctx) +{ + bprintf(out, "SAX: xml_decl version=%s standalone=%d fb_encoding=%s\n", ctx->version_str, ctx->standalone, ctx->src->fb_encoding); +} + +static void +h_doctype_decl(struct xml_context *ctx) +{ + bprintf(out, "SAX: doctype_decl type=%s public='%s' system='%s' extsub=%d intsub=%d\n", + ctx->doctype, ctx->public_id ? : "", ctx->system_id ? : "", + !!(ctx->flags & XML_HAS_EXTERNAL_SUBSET), !!(ctx->flags & XML_HAS_INTERNAL_SUBSET)); +} + +static void +h_comment(struct xml_context *ctx) +{ + bputs(out, "SAX: comment"); + show_node(ctx->node); +} + +static void +h_pi(struct xml_context *ctx) +{ + bputs(out, "SAX: pi"); + show_node(ctx->node); +} + +static void +h_stag(struct xml_context *ctx) +{ + bputs(out, "SAX: stag"); + show_node(ctx->node); +} + +static void +h_etag(struct xml_context *ctx) +{ + bprintf(out, "SAX: etag \n", ctx->node->name); +} + +static void +h_chars(struct xml_context *ctx) +{ + bputs(out, "SAX: chars"); + show_node(ctx->node); +} + +static void +h_block(struct xml_context *ctx UNUSED, char *text, uint len UNUSED) +{ + bprintf(out, "SAX: block text='%s'\n", text); +} + +static void +h_cdata(struct xml_context *ctx UNUSED, char *text, uint len UNUSED) +{ + bprintf(out, "SAX: cdata text='%s'\n", text); +} + +static void +h_ignorable(struct xml_context *ctx UNUSED, char *text, uint len UNUSED) +{ + bprintf(out, "SAX: ignorable text='%s'\n", text); +} + +static void +h_dtd_start(struct xml_context *ctx UNUSED) +{ + bputs(out, "SAX: dtd_start\n"); +} + +static void +h_dtd_end(struct xml_context *ctx UNUSED) +{ + bputs(out, "SAX: dtd_end\n"); +} + +static void +h_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *e) +{ + 
xml_push_fastbuf(ctx, bopen(e->system_id, O_RDONLY, 4096)); +} + +int +main(int argc, char **argv) +{ + int opt; + cf_def_file = NULL; + log_init(argv[0]); + while ((opt = cf_getopt(argc, argv, shortopts, longopts, NULL)) >= 0) + switch (opt) + { + case 's': + want_sax++; + break; + case 'p': + want_pull++; + break; + case 't': + want_dom++; + break; + case 'd': + want_parse_dtd++; + break; + case WANT_HIDE_ERRORS: + want_hide_errors++; + break; + case WANT_IGNORE_COMMENTS: + want_ignore_comments++; + break; + case WANT_IGNORE_PIS: + want_ignore_pis++; + break; + case WANT_REPORT_BLOCKS: + want_report_blocks++; + break; + case WANT_REPORT_IGNORABLE: + want_report_ignorable++; + break; + case WANT_FILE_ENTITIES: + want_file_entities++; + break; + default: + usage(); + } + if (optind != argc) + usage(); + + out = bfdopen_shared(1, 4096); + struct xml_context ctx; + xml_init(&ctx); + if (!want_hide_errors) + ctx.h_warn = ctx.h_error = ctx.h_fatal = h_error; + if (want_sax) + { + ctx.h_document_start = h_document_start; + ctx.h_document_end = h_document_end; + ctx.h_xml_decl = h_xml_decl; + ctx.h_doctype_decl = h_doctype_decl; + ctx.h_comment = h_comment; + ctx.h_pi = h_pi; + ctx.h_stag = h_stag; + ctx.h_etag = h_etag; + ctx.h_chars = h_chars; + if (want_report_blocks) + { + ctx.h_block = h_block; + ctx.h_cdata = h_cdata; + } + if (want_report_ignorable) + ctx.h_ignorable = h_ignorable; + ctx.h_dtd_start = h_dtd_start; + ctx.h_dtd_end = h_dtd_end; + } + if (want_dom) + ctx.flags |= XML_ALLOC_ALL; + if (want_parse_dtd) + ctx.flags |= XML_PARSE_DTD; + if (want_ignore_comments) + ctx.flags &= ~(XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS); + if (want_ignore_pis) + ctx.flags &= ~(XML_REPORT_PIS | XML_ALLOC_PIS); + if (want_file_entities) + ctx.h_resolve_entity = h_resolve_entity; + xml_push_fastbuf(&ctx, bfdopen_shared(0, 4096)); + bputs(out, "PULL: start\n"); + if (want_pull) + { + ctx.pull = XML_PULL_CHARS | XML_PULL_STAG | XML_PULL_ETAG | XML_PULL_COMMENT | XML_PULL_PI; + 
uint state; + while (state = xml_next(&ctx)) + switch (state) + { + case XML_STATE_CHARS: + bputs(out, "PULL: chars"); + show_node(ctx.node); + break; + case XML_STATE_STAG: + bputs(out, "PULL: stag"); + show_node(ctx.node); + break; + case XML_STATE_ETAG: + bprintf(out, "PULL: etag \n", ctx.node->name); + break; + case XML_STATE_COMMENT: + bputs(out, "PULL: comment"); + show_node(ctx.node); + break; + case XML_STATE_PI: + bputs(out, "PULL: pi"); + show_node(ctx.node); + break; + default: + bputs(out, "PULL: unknown\n"); + break; + } + } + else + xml_parse(&ctx); + if (ctx.err_code) + bprintf(out, "PULL: fatal error at %u: %s\n", xml_row(&ctx), ctx.err_msg); + else + { + bputs(out, "PULL: eof\n"); + if (want_dom) + show_tree(ctx.dom, 0); + } + + xml_cleanup(&ctx); + bclose(out); + return 0; +} diff --git a/ucw-xml/xml-test.t b/ucw-xml/xml-test.t new file mode 100644 index 00000000..d48fd409 --- /dev/null +++ b/ucw-xml/xml-test.t @@ -0,0 +1,58 @@ +# Tests for the XML parser +# (c) 2008 Pavel Charvat + +Run: ../obj/ucw-xml/xml-test +In: + +Out: PULL: start + PULL: eof + +Run: ../obj/ucw-xml/xml-test -s +In: + text1&amp;<text2 +Out: PULL: start + SAX: document_start + SAX: xml_decl version=1.0 standalone=0 fb_encoding=ISO-8859-1 + SAX: stag + SAX: stag a1='val1' a2='val2' + SAX: chars text='text1&<' + SAX: etag + SAX: chars text='text2' + SAX: etag + SAX: document_end + PULL: eof + +Run: ../obj/ucw-xml/xml-test -sptd +In: + + "> + %pe1; + + + ]> + &e1;&e2; +Out: PULL: start + SAX: document_start + SAX: xml_decl version=1.0 standalone=0 fb_encoding=UTF-8 + SAX: doctype_decl type=root public='' system='' extsub=0 intsub=1 + SAX: dtd_start + SAX: dtd_end + SAX: stag + PULL: stag + SAX: chars text='text' + PULL: chars text='text' + SAX: stag + PULL: stag + SAX: chars text='' + PULL: chars text='' + PULL: etag + SAX: etag + PULL: etag + SAX: etag + SAX: document_end + PULL: eof + DOM: element + DOM: chars text='text' + DOM: element + DOM: chars text='' diff --git 
a/ucw-xml/xml.h b/ucw-xml/xml.h new file mode 100644 index 00000000..c048f56c --- /dev/null +++ b/ucw-xml/xml.h @@ -0,0 +1,294 @@ +/* + * UCW Library -- A simple XML parser + * + * (c) 2007--2008 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#ifndef _UCW_XML_XML_H +#define _UCW_XML_XML_H + +#include +#include +#include +#include + +#ifdef CONFIG_UCW_CLEAN_ABI +#define xml_attr_find ucw_xml_attr_find +#define xml_attr_value ucw_xml_attr_value +#define xml_cleanup ucw_xml_cleanup +#define xml_def_find_entity ucw_xml_def_find_entity +#define xml_def_resolve_entity ucw_xml_def_resolve_entity +#define xml_error ucw_xml_error +#define xml_fatal ucw_xml_fatal +#define xml_init ucw_xml_init +#define xml_merge_chars ucw_xml_merge_chars +#define xml_merge_dom_chars ucw_xml_merge_dom_chars +#define xml_next ucw_xml_next +#define xml_next_state ucw_xml_next_state +#define xml_normalize_white ucw_xml_normalize_white +#define xml_parse ucw_xml_parse +#define xml_push_fastbuf ucw_xml_push_fastbuf +#define xml_reset ucw_xml_reset +#define xml_row ucw_xml_row +#define xml_skip_element ucw_xml_skip_element +#define xml_warn ucw_xml_warn +#endif + +struct xml_context; +struct xml_dtd_entity; + +enum xml_error { + XML_ERR_OK = 0, + XML_ERR_WARN = 1000, /* Warning */ + XML_ERR_ERROR = 2000, /* Recoverable error */ + XML_ERR_FATAL = 3000, /* Unrecoverable error */ + XML_ERR_EOF, +}; + +enum xml_state { + XML_STATE_EOF, /* EOF or a fatal error */ + XML_STATE_START, /* Initial state */ + XML_STATE_XML_DECL, /* XML_PULL_XML_DECL */ + XML_STATE_DOCTYPE_DECL, /* XML_PULL_DOCTYPE_DECL */ + XML_STATE_CHARS, /* XML_PULL_CHARS */ + XML_STATE_STAG, /* XML_PULL_STAG */ + XML_STATE_ETAG, /* XML_PULL_ETAG */ + XML_STATE_COMMENT, /* XML_PULL_COMMENT */ + XML_STATE_PI, /* XML_PULL_PI */ + + /* Internal states */ + XML_STATE_CHARS_BEFORE_STAG, + XML_STATE_CHARS_BEFORE_ETAG, + 
XML_STATE_CHARS_BEFORE_CDATA, + XML_STATE_CHARS_BEFORE_COMMENT, + XML_STATE_CHARS_BEFORE_PI, + XML_STATE_PROLOG_COMMENT, + XML_STATE_PROLOG_PI, + XML_STATE_EPILOG_COMMENT, + XML_STATE_EPILOG_PI, +}; + +enum xml_pull { + XML_PULL_XML_DECL = 0x00000001, /* Stop after the XML declaration */ + XML_PULL_DOCTYPE_DECL = 0x00000002, /* Stop in the doctype declaration (before optional internal subset) */ + XML_PULL_CHARS = 0x00000004, + XML_PULL_STAG = 0x00000008, + XML_PULL_ETAG = 0x00000010, + XML_PULL_COMMENT = 0x00000020, + XML_PULL_PI = 0x00000040, + XML_PULL_ALL = 0xffffffff, +}; + +enum xml_flags { + /* Enable reporting of various events via SAX and/or PULL interface */ + XML_REPORT_COMMENTS = 0x00000001, /* Report comments */ + XML_REPORT_PIS = 0x00000002, /* Report processing instructions */ + XML_REPORT_CHARS = 0x00000004, /* Report characters */ + XML_REPORT_TAGS = 0x00000008, /* Report element starts/ends */ + XML_REPORT_MISC = XML_REPORT_COMMENTS | XML_REPORT_PIS, + XML_REPORT_ALL = XML_REPORT_MISC | XML_REPORT_CHARS | XML_REPORT_TAGS, + + /* Enable construction of DOM for these types */ + XML_ALLOC_COMMENTS = 0x00000010, /* Create comment nodes */ + XML_ALLOC_PIS = 0x00000020, /* Create processing instruction nodes */ + XML_ALLOC_CHARS = 0x00000040, /* Create character nodes */ + XML_ALLOC_TAGS = 0x00000080, /* Create element nodes */ + XML_ALLOC_MISC = XML_ALLOC_COMMENTS | XML_ALLOC_PIS, + XML_ALLOC_ALL = XML_ALLOC_MISC | XML_ALLOC_CHARS | XML_ALLOC_TAGS, + + /* Other parameters */ + XML_VALIDATING = 0x00000100, /* Validate everything (not fully implemented!) */ + XML_PARSE_DTD = 0x00000200, /* Enable parsing of DTD */ + XML_NO_CHARS = 0x00000400, /* The current element must not contain character data (filled automaticaly if using DTD) */ + XML_ALLOC_DEFAULT_ATTRS = 0x00000800, /* Allocate default attribute values so they can be found by XML_ATTR_FOR_EACH */ + + /* Internals, do not change! 
*/ + XML_EMPTY_ELEM_TAG = 0x00010000, /* The current element match EmptyElemTag */ + XML_VERSION_1_1 = 0x00020000, /* XML version is 1.1, otherwise 1.0 */ + XML_HAS_EXTERNAL_SUBSET = 0x00040000, /* The document contains a reference to external DTD subset */ + XML_HAS_INTERNAL_SUBSET = 0x00080000, /* The document contains an internal subset */ + XML_HAS_DTD = XML_HAS_EXTERNAL_SUBSET | XML_HAS_INTERNAL_SUBSET, + XML_SRC_EOF = 0x00100000, /* EOF reached */ + XML_SRC_EXPECTED_DECL = 0x00200000, /* Just before optional or required XMLDecl/TextDecl */ + XML_SRC_DOCUMENT = 0x00400000, /* The document entity */ + XML_SRC_EXTERNAL = 0x00800000, /* An external entity */ +}; + +enum xml_node_type { + XML_NODE_ELEM, + XML_NODE_COMMENT, + XML_NODE_CHARS, + XML_NODE_PI, +}; + +#define XML_NODE_FOR_EACH(var, node) CLIST_FOR_EACH(struct xml_node *, var, (node)->sons) +#define XML_ATTR_FOR_EACH(var, node) SLIST_FOR_EACH(struct xml_attr *, var, (node)->attrs) + +struct xml_node { + cnode n; /* Node for list of parent's sons */ + uint type; /* XML_NODE_x */ + struct xml_node *parent; /* Parent node */ + char *name; /* Element name / PI target */ + clist sons; /* Children nodes */ + union { + struct { + char *text; /* PI text / Comment / CDATA */ + uint len; /* Text length in bytes */ + }; + struct { + struct xml_dtd_elem *dtd; /* Element DTD */ + slist attrs; /* Link list of element attributes */ + }; + }; + void *user; /* User-defined (initialized to NULL) */ +}; + +struct xml_attr { + snode n; /* Node for elem->attrs */ + struct xml_node *elem; /* Parent element */ + struct xml_dtd_attr *dtd; /* Attribute DTD */ + char *name; /* Attribute name */ + char *val; /* Attribute value */ + void *user; /* User-defined (initialized to NULL) */ +}; + +#define XML_BUF_SIZE 32 /* At least 8 -- hardcoded */ + +struct xml_source { + struct xml_source *next; /* Link list of pending fastbufs (xml_context.sources) */ + struct fastbuf *fb; /* Source fastbuf */ + struct fastbuf *wrapped_fb; /* 
Original wrapped fastbuf (needed for cleanup) */ + struct fastbuf wrap_fb; /* Fbmem wrapper */ + u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */ + u32 *bptr, *bstop; /* Current state of the buffer */ + uint row; /* File position */ + char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */ + char *fb_encoding; /* Encoding of the source fastbuf */ + char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */ + uint refill_cat1; /* Character categories, which should be directly passed to the buffer */ + uint refill_cat2; /* Character categories, which should be processed as newlines (possibly in some built-in + sequences) */ + void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */ + unsigned short *refill_in_to_x; /* Libucw-charset input table */ + uint saved_depth; /* Saved ctx->depth */ + uint pending_0xd; /* The last read character is 0xD */ +}; + +struct xml_context { + /* Error handling */ + char *err_msg; /* Last error message */ + enum xml_error err_code; /* Last error code */ + void *throw_buf; /* Where to jump on error */ + void (*h_warn)(struct xml_context *ctx); /* Warning callback */ + void (*h_error)(struct xml_context *ctx); /* Recoverable error callback */ + void (*h_fatal)(struct xml_context *ctx); /* Unrecoverable error callback */ + + /* Memory management */ + struct mempool *pool; /* DOM pool */ + struct mempool *stack; /* Stack pool (freed as soon as possible) */ + struct xml_stack *stack_list; /* See xml_push(), xml_pop() */ + uint flags; /* XML_FLAG_x (restored on xml_pop()) */ + uint depth; /* Nesting level (for checking of valid source nesting -> valid pushes/pops on memory pools) */ + struct fastbuf chars; /* Character data / attribute value */ + struct mempool_state chars_state; /* Mempool state before the current character block has started */ + char *chars_trivial; /* If not empty, it will be 
appended to chars */ + void *tab_attrs; /* Hash table of element attributes */ + + /* Input */ + struct xml_source *src; /* Current source */ + u32 *bptr, *bstop; /* Buffer with preprocessed characters (validated UCS-4 + category flags) */ + uint cat_chars; /* Unicode range of supported characters (cdata, attribute values, ...) */ + uint cat_unrestricted; /* Unrestricted characters (may appear in document/external entities) */ + uint cat_new_line; /* New line characters */ + uint cat_name; /* Characters that may appear in names */ + uint cat_sname; /* Characters that may begin a name */ + + /* SAX-like interface */ + void (*h_document_start)(struct xml_context *ctx); /* Called before entering prolog */ + void (*h_document_end)(struct xml_context *ctx); /* Called after leaving epilog */ + void (*h_xml_decl)(struct xml_context *ctx); /* Called after the XML declaration */ + void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration (before optional internal subset) */ + void (*h_comment)(struct xml_context *ctx); /* Called after a comment (only with XML_REPORT_COMMENTS) */ + void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction (only with XML_REPORT_PIS) */ + void (*h_stag)(struct xml_context *ctx); /* Called after STag or EmptyElemTag (only with XML_REPORT_TAGS) */ + void (*h_etag)(struct xml_context *ctx); /* Called before ETag or after EmptyElemTag (only with XML_REPORT_TAGS) */ + void (*h_chars)(struct xml_context *ctx); /* Called after some characters (only with XML_REPORT_CHARS) */ + void (*h_block)(struct xml_context *ctx, char *text, uint len); /* Called for each continuous block of characters not reported by h_cdata() (only with XML_REPORT_CHARS) */ + void (*h_cdata)(struct xml_context *ctx, char *text, uint len); /* Called for each CDATA section (only with XML_REPORT_CHARS) */ + void (*h_ignorable)(struct xml_context *ctx, char *text, uint len); /* Called for ignorable whitespace (content in tags without 
#PCDATA) */ + void (*h_dtd_start)(struct xml_context *ctx); /* Called just after the DTD structure is initialized */ + void (*h_dtd_end)(struct xml_context *ctx); /* Called after DTD subsets subsets */ + struct xml_dtd_entity *(*h_find_entity)(struct xml_context *ctx, char *name); /* Called when needed to resolve a general entity */ + void (*h_resolve_entity)(struct xml_context *ctx, struct xml_dtd_entity *ent); /* User should push source fastbuf for a parsed external entity (either general or parameter) */ + + /* DOM */ + struct xml_node *dom; /* DOM root */ + struct xml_node *node; /* Current DOM node */ + + char *version_str; + uint standalone; + char *doctype; /* The document type (or NULL if unknown) */ + char *system_id; /* DTD external id */ + char *public_id; /* DTD public id */ + struct xml_dtd *dtd; /* The DTD structure (or NULL) */ + uint state; /* Current state for the PULL interface (XML_STATE_x) */ + uint pull; /* Parameters for the PULL interface (XML_PULL_x) */ +}; + +/* Initialize XML context */ +void xml_init(struct xml_context *ctx); + +/* Clean up all internal structures */ +void xml_cleanup(struct xml_context *ctx); + +/* Reuse XML context, equivalent to xml_cleanup() and xml_init() */ +void xml_reset(struct xml_context *ctx); + +/* Add XML source (fastbuf will be automatically closed) */ +struct xml_source *xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb); + +/* Parse without the PULL interface, return XML_ERR_x code (zero on success) */ +uint xml_parse(struct xml_context *ctx); + +/* Parse with the PULL interface, return XML_STATE_x (zero on EOF or fatal error) */ +uint xml_next(struct xml_context *ctx); + +/* Equivalent to xml_next, but with temporarily changed ctx->pull value */ +uint xml_next_state(struct xml_context *ctx, uint pull); + +/* May be called on XML_STATE_STAG to skip it's content; can return XML_STATE_ETAG or XML_STATE_EOF on fatal error */ +uint xml_skip_element(struct xml_context *ctx); + +/* Returns the current 
row number in the document entity */ +uint xml_row(struct xml_context *ctx); + +/* Finds a given attribute value in a XML_NODE_ELEM node */ +struct xml_attr *xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name); + +/* Similar to xml_attr_find, but it deals also with default values */ +char *xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name); + +/* The default value of h_find_entity(), knows <, >, &, ' and " */ +struct xml_dtd_entity *xml_def_find_entity(struct xml_context *ctx, char *name); + +/* The default value of h_resolve_entity(), throws an error */ +void xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent); + +/* Remove leading/trailing spaces and replaces sequences of spaces to a single space character (non-CDATA attribute normalization) */ +uint xml_normalize_white(struct xml_context *ctx, char *value); + +/* Merge character contents of a given element to a single string (not recursive) */ +char *xml_merge_chars(struct xml_context *ctx, struct xml_node *node, struct mempool *pool); + +/* Merge character contents of a given subtree to a single string */ +char *xml_merge_dom_chars(struct xml_context *ctx, struct xml_node *node, struct mempool *pool); + +/* Public part of error handling */ +void xml_warn(struct xml_context *ctx, const char *format, ...); +void xml_error(struct xml_context *ctx, const char *format, ...); +void NONRET xml_fatal(struct xml_context *ctx, const char *format, ...); + +#endif diff --git a/xml/Makefile b/xml/Makefile deleted file mode 100644 index 4df8de25..00000000 --- a/xml/Makefile +++ /dev/null @@ -1,58 +0,0 @@ -# Makefile for the XML parser -# (c) 2007 Pavel Charvat - -DIRS+=xml -PROGS+=$(o)/xml/xml-test - -LIBXML_MODS=common source parse dtd -LIBXML_MOD_PATHS=$(addprefix $(o)/xml/,$(LIBXML_MODS)) -LIBXML_INCLUDES=xml.h dtd.h -LIBXML_DEPS=$(LIBUCW) $(LIBCHARSET) - -$(o)/xml/libucw-xml$(LV).a: $(addsuffix .o,$(LIBXML_MOD_PATHS)) -$(o)/xml/libucw-xml$(LV).so: 
$(addsuffix .oo,$(LIBXML_MOD_PATHS)) $(LIBXML_DEPS) -$(o)/xml/libucw-xml$(LV).so: SONAME_SUFFIX=.0 -$(o)/xml/libucw-xml.pc: $(LIBXML_DEPS) - -ifdef CONFIG_INSTALL_API -$(o)/xml/libucw-xml.pc: $(addprefix $(o)/xml/libucw-xml$(LV),.a .so) -endif - -$(o)/xml/common.o: $(o)/xml/unicat.h -$(o)/xml/common.oo: $(o)/xml/unicat.h -$(o)/xml/source.o: $(o)/xml/unicat.h -$(o)/xml/source.oo: $(o)/xml/unicat.h -$(o)/xml/dtd.o: $(o)/xml/unicat.h -$(o)/xml/dtd.oo: $(o)/xml/unicat.h -$(o)/xml/parse.o: $(o)/xml/unicat.h -$(o)/xml/parse.oo: $(o)/xml/unicat.h -$(o)/xml/unicat.h: $(s)/xml/unicat.pl - $(M)GEN $(addprefix $(o)/xml/unicat,.h .c) - $(Q)$< $(addprefix $(o)/xml/unicat,.h .c) - $(Q)touch $@ - -TESTS+=$(o)/xml/xml-test.test -$(o)/xml/xml-test: $(o)/xml/xml-test.o $(LIBXML) $(LIBCHARSET) $(LIBUCW) -$(o)/xml/xml-test.test: $(o)/xml/xml-test - -API_LIBS+=libucw-xml -API_INCLUDES+=$(o)/xml/.include-stamp -$(o)/xml/.include-stamp: $(addprefix $(s)/xml/,$(LIBXML_INCLUDES)) -$(o)/xml/.include-stamp: IDST=xml -run/lib/pkgconfig/libucw-xml.pc: $(o)/xml/libucw-xml.pc - -INSTALL_TARGETS+=install-libucw-xml-lib -install-libucw-xml-lib: - install -d -m 755 $(DESTDIR)$(INSTALL_LIB_DIR) - install -m 644 run/lib/libucw-xml$(LV).so.0 $(DESTDIR)$(INSTALL_LIB_DIR)/libucw-xml$(LV).so.0.0 - ln -sf libucw-xml$(LV).so.0.0 $(DESTDIR)$(INSTALL_LIB_DIR)/libucw-xml$(LV).so.0 -.PHONY: install-libucw-xml-lib - -INSTALL_TARGETS+=install-libucw-xml-api -install-libucw-xml-api: - install -d -m 755 $(DESTDIR)$(INSTALL_INCLUDE_DIR)/xml $(DESTDIR)$(INSTALL_LIB_DIR) $(DESTDIR)$(INSTALL_PKGCONFIG_DIR) - install -m 644 run/lib/pkgconfig/libucw-xml.pc $(DESTDIR)$(INSTALL_PKGCONFIG_DIR) - install -m 644 $(addprefix run/include/xml/,$(LIBXML_INCLUDES)) $(DESTDIR)$(INSTALL_INCLUDE_DIR)/xml - ln -sf libucw-xml$(LV).so.0.0 $(DESTDIR)$(INSTALL_LIB_DIR)/libucw-xml$(LV).so - install -m 644 run/lib/libucw-xml$(LV).a $(DESTDIR)$(INSTALL_LIB_DIR) -.PHONY: install-libucw-xml-api diff --git a/xml/TODO b/xml/TODO deleted file 
mode 100644 index b8dbc29c..00000000 --- a/xml/TODO +++ /dev/null @@ -1,15 +0,0 @@ -Non-normative / not-implemented: --- introduce numeric error codes --- cycle detection in internal entities (and possibly external?) --- conditional sections in DTD --- validation of elements (regular expressions, non-cdata) --- validation of attributes (unfinished) --- notations --- URI normalization --- support for xml:space --- support for xml:lang --- full support for standalone documents --- Unicode normalization - -Optimizations: --- detect definitions of trivial entities diff --git a/xml/common.c b/xml/common.c deleted file mode 100644 index 7f120d85..00000000 --- a/xml/common.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * UCW Library -- A simple XML parser - * - * (c) 2007 Pavel Charvat - * - * This software may be freely distributed and used according to the terms - * of the GNU Lesser General Public License. - */ - -#undef LOCAL_DEBUG - -#include -#include -#include -#include -#include -#include - -#include - -/*** Error handling ***/ - -void NONRET -xml_throw(struct xml_context *ctx) -{ - ASSERT(ctx->err_code && ctx->throw_buf); - longjmp(*(jmp_buf *)ctx->throw_buf, ctx->err_code); -} - -void -xml_warn(struct xml_context *ctx, const char *format, ...) -{ - if (ctx->h_warn) - { - va_list args; - va_start(args, format); - ctx->err_msg = stk_vprintf(format, args); - ctx->err_code = XML_ERR_WARN; - va_end(args); - ctx->h_warn(ctx); - ctx->err_msg = NULL; - ctx->err_code = XML_ERR_OK; - } -} - -void -xml_error(struct xml_context *ctx, const char *format, ...) -{ - if (ctx->h_error) - { - va_list args; - va_start(args, format); - ctx->err_msg = stk_vprintf(format, args); - ctx->err_code = XML_ERR_ERROR; - va_end(args); - ctx->h_error(ctx); - ctx->err_msg = NULL; - ctx->err_code = XML_ERR_OK; - } -} - -void NONRET -xml_fatal(struct xml_context *ctx, const char *format, ...) 
-{ - va_list args; - va_start(args, format); - ctx->err_msg = mp_vprintf(ctx->stack, format, args); - ctx->err_code = XML_ERR_FATAL; - ctx->state = XML_STATE_EOF; - va_end(args); - if (ctx->h_fatal) - ctx->h_fatal(ctx); - xml_throw(ctx); -} - -/*** Memory management ***/ - -void * -xml_hash_new(struct mempool *pool, uint size) -{ - void *tab = mp_alloc_zero(pool, size + XML_HASH_HDR_SIZE); - *(void **)tab = pool; - return tab + XML_HASH_HDR_SIZE; -} - -/*** Initialization ***/ - -static struct xml_context xml_defaults = { - .flags = XML_SRC_EOF | XML_REPORT_ALL, - .state = XML_STATE_START, - .h_resolve_entity = xml_def_resolve_entity, - .chars = { - .name = "", - .spout = xml_spout_chars, - .can_overwrite_buffer = 1, - }, -}; - -static void -xml_do_init(struct xml_context *ctx) -{ - xml_attrs_table_init(ctx); -} - -void -xml_init(struct xml_context *ctx) -{ - *ctx = xml_defaults; - ctx->pool = mp_new(65536); - ctx->stack = mp_new(65536); - xml_do_init(ctx); - TRACE(ctx, "init"); -} - -void -xml_cleanup(struct xml_context *ctx) -{ - TRACE(ctx, "cleanup"); - xml_attrs_table_cleanup(ctx); - xml_dtd_cleanup(ctx); - xml_sources_cleanup(ctx); - mp_delete(ctx->pool); - mp_delete(ctx->stack); -} - -void -xml_reset(struct xml_context *ctx) -{ - TRACE(ctx, "reset"); - struct mempool *pool = ctx->pool, *stack = ctx->stack; - xml_attrs_table_cleanup(ctx); - xml_dtd_cleanup(ctx); - xml_sources_cleanup(ctx); - mp_flush(pool); - mp_flush(stack); - *ctx = xml_defaults; - ctx->pool = pool; - ctx->stack = stack; - xml_do_init(ctx); -} diff --git a/xml/dtd.c b/xml/dtd.c deleted file mode 100644 index 407511f7..00000000 --- a/xml/dtd.c +++ /dev/null @@ -1,1003 +0,0 @@ -/* - * UCW Library -- A simple XML parser - * - * (c) 2007--2008 Pavel Charvat - * - * This software may be freely distributed and used according to the terms - * of the GNU Lesser General Public License. 
- */ - -#undef LOCAL_DEBUG - -#include -#include -#include -#include -#include -#include -#include - -/* Notations */ - -#define HASH_PREFIX(x) xml_dtd_notns_##x -#define HASH_NODE struct xml_dtd_notn -#define HASH_KEY_STRING name -#define HASH_ZERO_FILL -#define HASH_TABLE_DYNAMIC -#define HASH_WANT_LOOKUP -#define HASH_WANT_FIND -#define HASH_GIVE_ALLOC -#define HASH_TABLE_ALLOC -XML_HASH_GIVE_ALLOC -#include - -struct xml_dtd_notn * -xml_dtd_find_notn(struct xml_context *ctx, char *name) -{ - struct xml_dtd *dtd = ctx->dtd; - struct xml_dtd_notn *notn = xml_dtd_notns_find(dtd->tab_notns, name); - return !notn ? NULL : (notn->flags & XML_DTD_NOTN_DECLARED) ? notn : NULL; -} - -/* General entities */ - -#define HASH_PREFIX(x) xml_dtd_ents_##x -#define HASH_NODE struct xml_dtd_entity -#define HASH_KEY_STRING name -#define HASH_ZERO_FILL -#define HASH_TABLE_DYNAMIC -#define HASH_WANT_FIND -#define HASH_WANT_LOOKUP -#define HASH_GIVE_ALLOC -#define HASH_TABLE_ALLOC -XML_HASH_GIVE_ALLOC -#include - -static struct xml_dtd_entity * -xml_dtd_declare_trivial_entity(struct xml_context *ctx, char *name, char *text) -{ - struct xml_dtd *dtd = ctx->dtd; - struct xml_dtd_entity *ent = xml_dtd_ents_lookup(dtd->tab_ents, name); - if (ent->flags & XML_DTD_ENTITY_DECLARED) - { - xml_warn(ctx, "Entity &%s; already declared", name); - return NULL; - } - slist_add_tail(&dtd->ents, &ent->n); - ent->flags = XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_TRIVIAL; - ent->text = text; - return ent; -} - -static void -xml_dtd_declare_default_entities(struct xml_context *ctx) -{ - xml_dtd_declare_trivial_entity(ctx, "lt", "<"); - xml_dtd_declare_trivial_entity(ctx, "gt", ">"); - xml_dtd_declare_trivial_entity(ctx, "amp", "&"); - xml_dtd_declare_trivial_entity(ctx, "apos", "'"); - xml_dtd_declare_trivial_entity(ctx, "quot", "\""); -} - -struct xml_dtd_entity * -xml_def_find_entity(struct xml_context *ctx UNUSED, char *name) -{ -#define ENT(n, t) ent_##n = { .name = #n, .text = t, .flags = 
XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_TRIVIAL } - static struct xml_dtd_entity ENT(lt, "<"), ENT(gt, ">"), ENT(amp, "&"), ENT(apos, "'"), ENT(quot, "\""); -#undef ENT - switch (name[0]) - { - case 'l': - if (!strcmp(name, "lt")) - return &ent_lt; - break; - case 'g': - if (!strcmp(name, "gt")) - return &ent_gt; - break; - case 'a': - if (!strcmp(name, "amp")) - return &ent_amp; - if (!strcmp(name, "apos")) - return &ent_apos; - break; - case 'q': - if (!strcmp(name, "quot")) - return &ent_quot; - break; - } - return NULL; -} - -struct xml_dtd_entity * -xml_dtd_find_entity(struct xml_context *ctx, char *name) -{ - struct xml_dtd *dtd = ctx->dtd; - if (ctx->h_find_entity) - return ctx->h_find_entity(ctx, name); - else if (dtd) - { - struct xml_dtd_entity *ent = xml_dtd_ents_find(dtd->tab_ents, name); - return !ent ? NULL : (ent->flags & XML_DTD_ENTITY_DECLARED) ? ent : NULL; - } - else - return xml_def_find_entity(ctx, name); -} - -/* Parameter entities */ - -static struct xml_dtd_entity * -xml_dtd_find_pentity(struct xml_context *ctx, char *name) -{ - struct xml_dtd *dtd = ctx->dtd; - struct xml_dtd_entity *ent = xml_dtd_ents_find(dtd->tab_pents, name); - return !ent ? NULL : (ent->flags & XML_DTD_ENTITY_DECLARED) ? ent : NULL; -} - -/* Elements */ - -struct xml_dtd_elems_table; - -static void -xml_dtd_elems_init_data(struct xml_dtd_elems_table *tab UNUSED, struct xml_dtd_elem *e) -{ - slist_init(&e->attrs); -} - -#define HASH_PREFIX(x) xml_dtd_elems_##x -#define HASH_NODE struct xml_dtd_elem -#define HASH_KEY_STRING name -#define HASH_TABLE_DYNAMIC -#define HASH_ZERO_FILL -#define HASH_WANT_FIND -#define HASH_WANT_LOOKUP -#define HASH_GIVE_ALLOC -#define HASH_GIVE_INIT_DATA -#define HASH_TABLE_ALLOC -XML_HASH_GIVE_ALLOC -#include - -struct xml_dtd_elem * -xml_dtd_find_elem(struct xml_context *ctx, char *name) -{ - return ctx->dtd ? 
xml_dtd_elems_find(ctx->dtd->tab_elems, name) : NULL; -} - -/* Element sons */ - -struct xml_dtd_enodes_table; - -static inline uint -xml_dtd_enodes_hash(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem) -{ - return hash_pointer(parent) ^ hash_pointer(elem); -} - -static inline int -xml_dtd_enodes_eq(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *parent1, struct xml_dtd_elem *elem1, struct xml_dtd_elem_node *parent2, struct xml_dtd_elem *elem2) -{ - return (parent1 == parent2) && (elem1 == elem2); -} - -static inline void -xml_dtd_enodes_init_key(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *node, struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem) -{ - node->parent = parent; - node->elem = elem; -} - -#define HASH_PREFIX(x) xml_dtd_enodes_##x -#define HASH_NODE struct xml_dtd_elem_node -#define HASH_KEY_COMPLEX(x) x parent, x elem -#define HASH_KEY_DECL struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem -#define HASH_GIVE_HASHFN -#define HASH_GIVE_EQ -#define HASH_GIVE_INIT_KEY -#define HASH_TABLE_DYNAMIC -#define HASH_ZERO_FILL -#define HASH_WANT_FIND -#define HASH_WANT_NEW -#define HASH_GIVE_ALLOC -#define HASH_TABLE_ALLOC -XML_HASH_GIVE_ALLOC -#include - -/* Element attributes */ - -struct xml_dtd_attrs_table; - -static inline uint -xml_dtd_attrs_hash(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem, char *name) -{ - return hash_pointer(elem) ^ hash_string(name); -} - -static inline int -xml_dtd_attrs_eq(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem1, char *name1, struct xml_dtd_elem *elem2, char *name2) -{ - return (elem1 == elem2) && !strcmp(name1, name2); -} - -static inline void -xml_dtd_attrs_init_key(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_elem *elem, char *name) -{ - attr->elem = elem; - attr->name = name; - slist_add_tail(&elem->attrs, &attr->n); -} - -#define 
HASH_PREFIX(x) xml_dtd_attrs_##x -#define HASH_NODE struct xml_dtd_attr -#define HASH_ZERO_FILL -#define HASH_TABLE_DYNAMIC -#define HASH_KEY_COMPLEX(x) x elem, x name -#define HASH_KEY_DECL struct xml_dtd_elem *elem, char *name -#define HASH_GIVE_HASHFN -#define HASH_GIVE_EQ -#define HASH_GIVE_INIT_KEY -#define HASH_WANT_FIND -#define HASH_WANT_NEW -#define HASH_GIVE_ALLOC -#define HASH_TABLE_ALLOC -XML_HASH_GIVE_ALLOC -#include - -struct xml_dtd_attr * -xml_dtd_find_attr(struct xml_context *ctx, struct xml_dtd_elem *elem, char *name) -{ - return ctx->dtd ? xml_dtd_attrs_find(ctx->dtd->tab_attrs, elem, name) : NULL; -} - -/* Enumerated attribute values */ - -struct xml_dtd_evals_table; - -static inline uint -xml_dtd_evals_hash(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr, char *val) -{ - return hash_pointer(attr) ^ hash_string(val); -} - -static inline int -xml_dtd_evals_eq(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr1, char *val1, struct xml_dtd_attr *attr2, char *val2) -{ - return (attr1 == attr2) && !strcmp(val1, val2); -} - -static inline void -xml_dtd_evals_init_key(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_eval *eval, struct xml_dtd_attr *attr, char *val) -{ - eval->attr = attr; - eval->val = val; -} - -#define HASH_PREFIX(x) xml_dtd_evals_##x -#define HASH_NODE struct xml_dtd_eval -#define HASH_TABLE_DYNAMIC -#define HASH_KEY_COMPLEX(x) x attr, x val -#define HASH_KEY_DECL struct xml_dtd_attr *attr, char *val -#define HASH_GIVE_HASHFN -#define HASH_GIVE_EQ -#define HASH_GIVE_INIT_KEY -#define HASH_WANT_FIND -#define HASH_WANT_NEW -#define HASH_GIVE_ALLOC -#define HASH_TABLE_ALLOC -XML_HASH_GIVE_ALLOC -#include - -/* Enumerated attribute notations */ - -struct xml_dtd_enotns_table; - -static inline uint -xml_dtd_enotns_hash(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn) -{ - return hash_pointer(attr) ^ hash_pointer(notn); -} - -static inline int 
-xml_dtd_enotns_eq(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr1, struct xml_dtd_notn *notn1, struct xml_dtd_attr *attr2, struct xml_dtd_notn *notn2) -{ - return (attr1 == attr2) && (notn1 == notn2); -} - -static inline void -xml_dtd_enotns_init_key(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_enotn *enotn, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn) -{ - enotn->attr = attr; - enotn->notn = notn; -} - -#define HASH_PREFIX(x) xml_dtd_enotns_##x -#define HASH_NODE struct xml_dtd_enotn -#define HASH_TABLE_DYNAMIC -#define HASH_KEY_COMPLEX(x) x attr, x notn -#define HASH_KEY_DECL struct xml_dtd_attr *attr, struct xml_dtd_notn *notn -#define HASH_GIVE_HASHFN -#define HASH_GIVE_EQ -#define HASH_GIVE_INIT_KEY -#define HASH_WANT_FIND -#define HASH_WANT_NEW -#define HASH_GIVE_ALLOC -#define HASH_TABLE_ALLOC -XML_HASH_GIVE_ALLOC -#include - -/* DTD initialization/cleanup */ - -void -xml_dtd_init(struct xml_context *ctx) -{ - if (ctx->dtd) - return; - struct mempool *pool = mp_new(4096); - struct xml_dtd *dtd = ctx->dtd = mp_alloc_zero(pool, sizeof(*ctx->dtd)); - dtd->pool = pool; - xml_dtd_ents_init(dtd->tab_ents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table))); - xml_dtd_ents_init(dtd->tab_pents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table))); - xml_dtd_notns_init(dtd->tab_notns = xml_hash_new(pool, sizeof(struct xml_dtd_notns_table))); - xml_dtd_elems_init(dtd->tab_elems = xml_hash_new(pool, sizeof(struct xml_dtd_elems_table))); - xml_dtd_enodes_init(dtd->tab_enodes = xml_hash_new(pool, sizeof(struct xml_dtd_enodes_table))); - xml_dtd_attrs_init(dtd->tab_attrs = xml_hash_new(pool, sizeof(struct xml_dtd_attrs_table))); - xml_dtd_evals_init(dtd->tab_evals = xml_hash_new(pool, sizeof(struct xml_dtd_evals_table))); - xml_dtd_enotns_init(dtd->tab_enotns = xml_hash_new(pool, sizeof(struct xml_dtd_enotns_table))); - xml_dtd_declare_default_entities(ctx); -} - -void -xml_dtd_cleanup(struct xml_context *ctx) -{ - if 
(!ctx->dtd) - return; - mp_delete(ctx->dtd->pool); - ctx->dtd = NULL; -} - -void -xml_dtd_finish(struct xml_context *ctx) -{ - if (!ctx->dtd) - return; - // FIXME: validity checks -} - -/*** Parsing functions ***/ - -/* References to parameter entities */ - -void -xml_parse_pe_ref(struct xml_context *ctx) -{ - /* PEReference ::= '%' Name ';' - * Already parsed: '%' */ - struct mempool_state state; - mp_save(ctx->stack, &state); - char *name = xml_parse_name(ctx, ctx->stack); - xml_parse_char(ctx, ';'); - struct xml_dtd_entity *ent = xml_dtd_find_pentity(ctx, name); - if (!ent) - xml_error(ctx, "Unknown entity %%%s;", name); - else - { - TRACE(ctx, "Pushed entity %%%s;", name); - mp_restore(ctx->stack, &state); - xml_dec(ctx); - xml_push_entity(ctx, ent); - return; - } - mp_restore(ctx->stack, &state); - xml_dec(ctx); -} - -static uint -xml_parse_dtd_pe(struct xml_context *ctx, uint entity_decl) -{ - /* Already parsed: '%' */ - do - { - xml_inc(ctx); - if (!~entity_decl && (xml_peek_cat(ctx) & XML_CHAR_WHITE)) - { - xml_dec(ctx); - return ~0U; - } - xml_parse_pe_ref(ctx); - while (xml_peek_cat(ctx) & XML_CHAR_WHITE) - xml_skip_char(ctx); - } - while (xml_get_char(ctx) == '%'); - xml_unget_char(ctx); - return 1; -} - -static inline uint -xml_parse_dtd_white(struct xml_context *ctx, uint mandatory) -{ - /* Whitespace or parameter entity, - * mandatory==~0U has a special maening of the whitespace before the '%' character in an parameter entity declaration */ - uint cnt = 0; - while (xml_peek_cat(ctx) & XML_CHAR_WHITE) - { - xml_skip_char(ctx); - cnt = 1; - } - if (xml_peek_char(ctx) == '%') - { - xml_skip_char(ctx); - return xml_parse_dtd_pe(ctx, mandatory); - } - else if (unlikely(mandatory && !cnt)) - xml_fatal_expected_white(ctx); - return cnt; -} - -static void -xml_dtd_parse_external_id(struct xml_context *ctx, char **system_id, char **public_id, uint allow_public) -{ - struct xml_dtd *dtd = ctx->dtd; - uint c = xml_peek_char(ctx); - if (c == 'S') - { - 
xml_parse_seq(ctx, "SYSTEM"); - xml_parse_dtd_white(ctx, 1); - *public_id = NULL; - *system_id = xml_parse_system_literal(ctx, dtd->pool); - } - else if (c == 'P') - { - xml_parse_seq(ctx, "PUBLIC"); - xml_parse_dtd_white(ctx, 1); - *system_id = NULL; - *public_id = xml_parse_pubid_literal(ctx, dtd->pool); - if (xml_parse_dtd_white(ctx, !allow_public)) - if ((c = xml_peek_char(ctx)) == '\'' || c == '"' || !allow_public) - *system_id = xml_parse_system_literal(ctx, dtd->pool); - } - else - xml_fatal(ctx, "Expected an external ID"); -} - -/* DTD: */ - -void -xml_parse_notation_decl(struct xml_context *ctx) -{ - /* NotationDecl ::= '' - * Already parsed: 'dtd; - xml_parse_dtd_white(ctx, 1); - - struct xml_dtd_notn *notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx, dtd->pool)); - xml_parse_dtd_white(ctx, 1); - char *system_id, *public_id; - xml_dtd_parse_external_id(ctx, &system_id, &public_id, 1); - xml_parse_dtd_white(ctx, 0); - xml_parse_char(ctx, '>'); - - if (notn->flags & XML_DTD_NOTN_DECLARED) - xml_warn(ctx, "Notation %s already declared", notn->name); - else - { - notn->flags = XML_DTD_NOTN_DECLARED; - notn->system_id = system_id; - notn->public_id = public_id; - slist_add_tail(&dtd->notns, ¬n->n); - } - xml_dec(ctx); -} - -/* DTD: */ - -void -xml_parse_entity_decl(struct xml_context *ctx) -{ - /* Already parsed: 'dtd; - uint flags = ~xml_parse_dtd_white(ctx, ~0U) ? 0 : XML_DTD_ENTITY_PARAMETER; - if (flags) - xml_parse_dtd_white(ctx, 1); - struct xml_dtd_entity *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_ents, xml_parse_name(ctx, dtd->pool)); - xml_parse_dtd_white(ctx, 1); - slist *list = flags ? 
&dtd->pents : &dtd->ents; - if (ent->flags & XML_DTD_ENTITY_DECLARED) - { - xml_fatal(ctx, "Entity &%s; already declared, skipping not implemented", ent->name); - // FIXME: should be only warning - } - uint c, sep = xml_get_char(ctx); - if (sep == '\'' || sep == '"') - { - /* Internal entity: - * EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&'] | PEReference | Reference)* "'" */ - char *p = mp_start_noalign(dtd->pool, 1); - while (1) - { - if ((c = xml_get_char(ctx)) == sep) - break; - if (c == '%') - { - // FIXME - ASSERT(0); - //xml_parse_parameter_ref(ctx); - continue; - } - if (c == '&') - { - xml_inc(ctx); - if (xml_peek_char(ctx) != '#') - { - /* Bypass references to general entities */ - struct mempool_state state; - mp_save(ctx->stack, &state); - char *n = xml_parse_name(ctx, ctx->stack); - xml_parse_char(ctx, ';'); - xml_dec(ctx); - uint l = strlen(n); - p = mp_spread(dtd->pool, p, 3 + l); - *p++ = '&'; - memcpy(p, n, l); - p += l; - *p++ = ';';; - mp_restore(ctx->stack, &state); - continue; - } - else - { - xml_skip_char(ctx); - c = xml_parse_char_ref(ctx); - } - } - p = mp_spread(dtd->pool, p, 5); - p = utf8_32_put(p, c); - } - *p = 0; - ent->len = p - (char *)mp_ptr(dtd->pool); - ent->text = mp_end(dtd->pool, p + 1); - slist_add_tail(list, &ent->n); - ent->flags = flags | XML_DTD_ENTITY_DECLARED; - } - else - { - /* External entity */ - struct xml_dtd_notn *notn = NULL; - char *system_id, *public_id; - xml_unget_char(ctx); - xml_dtd_parse_external_id(ctx, &system_id, &public_id, 0); - if (xml_parse_dtd_white(ctx, 0) && flags && xml_peek_char(ctx) != '>') - { - /* General external unparsed entity */ - flags |= XML_DTD_ENTITY_UNPARSED; - xml_parse_seq(ctx, "NDATA"); - xml_parse_dtd_white(ctx, 1); - notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx, dtd->pool)); - } - slist_add_tail(list, &ent->n); - ent->flags = flags | XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_EXTERNAL; - ent->system_id = system_id; - ent->public_id = 
public_id; - ent->notn = notn; - } - xml_parse_dtd_white(ctx, 0); - xml_parse_char(ctx, '>'); - xml_dec(ctx); -} - -/* DTD: */ - -void -xml_parse_element_decl(struct xml_context *ctx) -{ - /* Elementdecl ::= '' - * Already parsed: 'dtd; - xml_parse_dtd_white(ctx, 1); - char *name = xml_parse_name(ctx, dtd->pool); - xml_parse_dtd_white(ctx, 1); - struct xml_dtd_elem *elem = xml_dtd_elems_lookup(dtd->tab_elems, name); - if (elem->flags & XML_DTD_ELEM_DECLARED) - xml_fatal(ctx, "Element <%s> already declared", name); - - /* contentspec ::= 'EMPTY' | 'ANY' | Mixed | children */ - uint c = xml_peek_char(ctx); - if (c == 'E') - { - xml_parse_seq(ctx, "EMPTY"); - elem->type = XML_DTD_ELEM_EMPTY; - } - else if (c == 'A') - { - xml_parse_seq(ctx, "ANY"); - elem->type = XML_DTD_ELEM_ANY; - } - else if (c == '(') - { - xml_skip_char(ctx); - xml_inc(ctx); - xml_parse_dtd_white(ctx, 0); - struct xml_dtd_elem_node *parent = elem->node = mp_alloc_zero(dtd->pool, sizeof(*parent)); - if (xml_peek_char(ctx) == '#') - { - /* Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | '(' S? '#PCDATA' S? 
')' */ - xml_skip_char(ctx); - xml_parse_seq(ctx, "PCDATA"); - elem->type = XML_DTD_ELEM_MIXED; - parent->type = XML_DTD_ELEM_PCDATA; - while (1) - { - xml_parse_dtd_white(ctx, 0); - if ((c = xml_get_char(ctx)) == ')') - break; - else if (c != '|') - xml_fatal_expected(ctx, ')'); - xml_parse_dtd_white(ctx, 0); - struct xml_dtd_elem *son_elem = xml_dtd_elems_lookup(dtd->tab_elems, xml_parse_name(ctx, dtd->pool)); - if (xml_dtd_enodes_find(dtd->tab_enodes, parent, son_elem)) - xml_error(ctx, "Duplicate content '%s'", son_elem->name); - else - { - struct xml_dtd_elem_node *son = xml_dtd_enodes_new(dtd->tab_enodes, parent, son_elem); - slist_add_tail(&parent->sons, &son->n); - } - } - xml_dec(ctx); - if (xml_peek_char(ctx) == '*') - { - xml_skip_char(ctx); - parent->occur = XML_DTD_ELEM_OCCUR_MULT; - } - else if (!slist_head(&parent->sons)) - parent->occur = XML_DTD_ELEM_OCCUR_ONCE; - else - xml_fatal_expected(ctx, '*'); - } - else - { - /* children ::= (choice | seq) ('?' | '*' | '+')? - * cp ::= (Name | choice | seq) ('?' | '*' | '+')? - * choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')' - * seq ::= '(' S? cp ( S? ',' S? cp )* S? 
')' */ - - elem->type = XML_DTD_ELEM_CHILDREN; - parent->type = XML_DTD_ELEM_PCDATA; - uint c; - goto first; - - while (1) - { - /* After name */ - xml_parse_dtd_white(ctx, 0); - if ((c = xml_get_char(ctx)) == ')') - { - xml_dec(ctx); - if (parent->type == XML_DTD_ELEM_PCDATA) - parent->type = XML_DTD_ELEM_SEQ; - if ((c = xml_get_char(ctx)) == '?') - parent->occur = XML_DTD_ELEM_OCCUR_OPT; - else if (c == '*') - parent->occur = XML_DTD_ELEM_OCCUR_MULT; - else if (c == '+') - parent->occur = XML_DTD_ELEM_OCCUR_PLUS; - else - { - xml_unget_char(ctx); - parent->occur = XML_DTD_ELEM_OCCUR_ONCE; - } - if (!parent->parent) - break; - parent = parent->parent; - continue; - } - else if (c == '|') - { - if (parent->type == XML_DTD_ELEM_PCDATA) - parent->type = XML_DTD_ELEM_OR; - else if (parent->type != XML_DTD_ELEM_OR) - xml_fatal(ctx, "Mixed operators in the list of element children"); - } - else if (c == ',') - { - if (parent->type == XML_DTD_ELEM_PCDATA) - parent->type = XML_DTD_ELEM_SEQ; - else if (parent->type != XML_DTD_ELEM_SEQ) - xml_fatal(ctx, "Mixed operators in the list of element children"); - } - else if (c == '(') - { - xml_inc(ctx); - struct xml_dtd_elem_node *son = mp_alloc_zero(dtd->pool, sizeof(*son)); - son->parent = parent; - slist_add_tail(&parent->sons, &son->n); - parent = son->parent; - son->type = XML_DTD_ELEM_MIXED; - } - else - xml_unget_char(ctx); - - /* Before name */ - xml_parse_dtd_white(ctx, 0); -first:; - struct xml_dtd_elem *son_elem = xml_dtd_elems_lookup(dtd->tab_elems, xml_parse_name(ctx, dtd->pool)); - // FIXME: duplicates, occurance - //struct xml_dtd_elem_node *son = xml_dtd_enodes_new(dtd->tab_enodes, parent, son_elem); - struct xml_dtd_elem_node *son = mp_alloc_zero(dtd->pool, sizeof(*son)); - son->parent = parent; - son->elem = son_elem; - slist_add_tail(&parent->sons, &son->n); - } - } - } - else - xml_fatal(ctx, "Expected element content specification"); - - xml_parse_dtd_white(ctx, 0); - xml_parse_char(ctx, '>'); - 
xml_dec(ctx); -} - -void -xml_parse_attr_list_decl(struct xml_context *ctx) -{ - /* AttlistDecl ::= '' - * AttDef ::= S Name S AttType S DefaultDecl - * Already parsed: 'dtd; - xml_parse_dtd_white(ctx, 1); - struct xml_dtd_elem *elem = xml_dtd_elems_lookup(ctx->dtd->tab_elems, xml_parse_name(ctx, dtd->pool)); - - while (xml_parse_dtd_white(ctx, 0) && xml_peek_char(ctx) != '>') - { - char *name = xml_parse_name(ctx, dtd->pool); - struct xml_dtd_attr *attr = xml_dtd_attrs_find(dtd->tab_attrs, elem, name); - uint ignored = 0; - if (attr) - { - xml_warn(ctx, "Duplicate attribute definition"); - ignored++; - } - else - attr = xml_dtd_attrs_new(ctx->dtd->tab_attrs, elem, name); - xml_parse_dtd_white(ctx, 1); - if (xml_peek_char(ctx) == '(') - { - xml_skip_char(ctx); // FIXME: xml_inc/dec ? - if (!ignored) - attr->type = XML_ATTR_ENUM; - do - { - xml_parse_dtd_white(ctx, 0); - char *value = xml_parse_nmtoken(ctx, dtd->pool); - if (!ignored) - if (xml_dtd_evals_find(ctx->dtd->tab_evals, attr, value)) - xml_error(ctx, "Duplicate enumeration value"); - else - xml_dtd_evals_new(ctx->dtd->tab_evals, attr, value); - xml_parse_dtd_white(ctx, 0); - } - while (xml_get_char(ctx) == '|'); - xml_unget_char(ctx); - xml_parse_char(ctx, ')'); - } - else - { - char *type = xml_parse_name(ctx, dtd->pool); - enum xml_dtd_attr_type t = XML_ATTR_CDATA; - if (!strcmp(type, "CDATA")) - t = XML_ATTR_CDATA; - else if (!strcmp(type, "ID")) - t = XML_ATTR_ID; - else if (!strcmp(type, "IDREF")) - t = XML_ATTR_IDREF; - else if (!strcmp(type, "IDREFS")) - t = XML_ATTR_IDREFS; - else if (!strcmp(type, "ENTITY")) - t = XML_ATTR_ENTITY; - else if (!strcmp(type, "ENTITIES")) - t = XML_ATTR_ENTITIES; - else if (!strcmp(type, "NMTOKEN")) - t = XML_ATTR_NMTOKEN; - else if (!strcmp(type, "NMTOKENS")) - t = XML_ATTR_NMTOKENS; - else if (!strcmp(type, "NOTATION")) - { - if (elem->type == XML_DTD_ELEM_EMPTY) - xml_fatal(ctx, "Empty element must not have notation attribute"); - // FIXME: An element type MUST NOT 
have more than one NOTATION attribute specified. - t = XML_ATTR_NOTATION; - xml_parse_dtd_white(ctx, 1); - xml_parse_char(ctx, '('); - do - { - xml_parse_dtd_white(ctx, 0); - struct xml_dtd_notn *n = xml_dtd_notns_lookup(ctx->dtd->tab_notns, xml_parse_name(ctx, dtd->pool)); - if (!ignored) - if (xml_dtd_enotns_find(ctx->dtd->tab_enotns, attr, n)) - xml_error(ctx, "Duplicate enumerated notation"); - else - xml_dtd_enotns_new(ctx->dtd->tab_enotns, attr, n); - xml_parse_dtd_white(ctx, 0); - } - while (xml_get_char(ctx) == '|'); - xml_unget_char(ctx); - xml_parse_char(ctx, ')'); - } - else - xml_fatal(ctx, "Unknown attribute type"); - if (!ignored) - attr->type = t; - } - xml_parse_dtd_white(ctx, 1); - enum xml_dtd_attr_default def = XML_ATTR_NONE; - if (xml_get_char(ctx) == '#') - switch (xml_peek_char(ctx)) - { - case 'R': - xml_parse_seq(ctx, "REQUIRED"); - def = XML_ATTR_REQUIRED; - break; - case 'I': - xml_parse_seq(ctx, "IMPLIED"); - def = XML_ATTR_IMPLIED; - break; - case 'F': - xml_parse_seq(ctx, "FIXED"); - def = XML_ATTR_FIXED; - xml_parse_dtd_white(ctx, 1); - break; - default: - xml_fatal(ctx, "Expected a modifier for default attribute value"); - } - else - xml_unget_char(ctx); - if (def != XML_ATTR_REQUIRED && def != XML_ATTR_IMPLIED) - { - char *v = xml_parse_attr_value(ctx, attr); - if (!ignored) - attr->default_value = v; - } - if (!ignored) - attr->default_mode = def; - } - xml_skip_char(ctx); - xml_dec(ctx); -} - -void -xml_skip_internal_subset(struct xml_context *ctx) -{ - TRACE(ctx, "skip_internal_subset"); - /* AlreadyParsed: '[' */ - uint c; - while ((c = xml_get_char(ctx)) != ']') - { - if (c != '<') - continue; - if ((c = xml_get_char(ctx)) == '?') - { - xml_inc(ctx); - xml_skip_pi(ctx); - } - else if (c != '!') - xml_dec(ctx); - else if (xml_get_char(ctx) == '-') - { - xml_inc(ctx); - xml_skip_comment(ctx); - } - else - while ((c = xml_get_char(ctx)) != '>') - if (c == '\'' || c == '"') - while (xml_get_char(ctx) != c); - } - xml_dec(ctx); -} - 
-/*** Validation of attribute values ***/ - -static uint -xml_check_tokens(char *value, uint first_cat, uint next_cat, uint seq) -{ - char *p = value; - uint u; - while (1) - { - p = utf8_32_get(p, &u); - if (!(xml_char_cat(u) & first_cat)) - return 0; - while (*p & ~0x20) - { - p = utf8_32_get(p, &u); - if (!(xml_char_cat(u) & next_cat)) - return 0; - } - if (!*p) - return 1; - if (!seq) - return 0; - p++; - } -} - -static uint -xml_is_name(struct xml_context *ctx, char *value) -{ - /* Name ::= NameStartChar (NameChar)* */ - return xml_check_tokens(value, ctx->cat_sname, ctx->cat_name, 0); -} - -static uint -xml_is_names(struct xml_context *ctx, char *value) -{ - /* Names ::= Name (#x20 Name)* */ - return xml_check_tokens(value, ctx->cat_sname, ctx->cat_name, 1); -} - -static uint -xml_is_nmtoken(struct xml_context *ctx, char *value) -{ - /* Nmtoken ::= (NameChar)+ */ - return xml_check_tokens(value, ctx->cat_name, ctx->cat_name, 0); -} - -static uint -xml_is_nmtokens(struct xml_context *ctx, char *value) -{ - /* Nmtokens ::= Nmtoken (#x20 Nmtoken)* */ - return xml_check_tokens(value, ctx->cat_name, ctx->cat_name, 1); -} - -static void -xml_err_attr_format(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *type) -{ - xml_error(ctx, "Attribute %s in <%s> does not match the production of %s", dtd->name, dtd->elem->name, type); -} - -void -xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value) -{ - if (dtd->type == XML_ATTR_CDATA) - return; - xml_normalize_white(ctx, value); - switch (dtd->type) - { - case XML_ATTR_ID: - if (!xml_is_name(ctx, value)) - xml_err_attr_format(ctx, dtd, "NAME"); - //FIXME: add to a hash table - break; - case XML_ATTR_IDREF: - if (!xml_is_name(ctx, value)) - xml_err_attr_format(ctx, dtd, "NAME"); - // FIXME: find in hash table (beware forward references) - break; - case XML_ATTR_IDREFS: - if (!xml_is_names(ctx, value)) - xml_err_attr_format(ctx, dtd, "NAMES"); - // FIXME: find - break; - case 
XML_ATTR_ENTITY: - // FIXME - break; - case XML_ATTR_ENTITIES: - // FIXME - break; - case XML_ATTR_NMTOKEN: - if (!xml_is_nmtoken(ctx, value)) - xml_err_attr_format(ctx, dtd, "NMTOKEN"); - break; - case XML_ATTR_NMTOKENS: - if (!xml_is_nmtokens(ctx, value)) - xml_err_attr_format(ctx, dtd, "NMTOKENS"); - break; - case XML_ATTR_ENUM: - if (!xml_dtd_evals_find(ctx->dtd->tab_evals, dtd, value)) - xml_error(ctx, "Attribute %s in <%s> contains an undefined enumeration value", dtd->name, dtd->elem->name); - break; - case XML_ATTR_NOTATION: - if (!xml_dtd_find_notn(ctx, value)) - xml_error(ctx, "Attribute %s in <%s> contains an undefined notation", dtd->name, dtd->elem->name); - break; - } -} diff --git a/xml/dtd.h b/xml/dtd.h deleted file mode 100644 index d3524ad1..00000000 --- a/xml/dtd.h +++ /dev/null @@ -1,178 +0,0 @@ -/* - * UCW Library -- A simple XML parser - * - * (c) 2007--2008 Pavel Charvat - * - * This software may be freely distributed and used according to the terms - * of the GNU Lesser General Public License. 
- */ - -#ifndef _UCW_XML_DTD_H -#define _UCW_XML_DTD_H - -#include - -#ifdef CONFIG_UCW_CLEAN_ABI -#define xml_dtd_cleanup ucw_xml_dtd_cleanup -#define xml_dtd_find_attr ucw_xml_dtd_find_attr -#define xml_dtd_find_elem ucw_xml_dtd_find_elem -#define xml_dtd_find_entity ucw_xml_dtd_find_entity -#define xml_dtd_find_notn ucw_xml_dtd_find_notn -#define xml_dtd_finish ucw_xml_dtd_finish -#define xml_dtd_init ucw_xml_dtd_init -#endif - -struct xml_dtd { - struct mempool *pool; /* Memory pool where to allocate DTD */ - slist ents; /* Link list of general entities */ - slist pents; /* Link list of parameter entities */ - slist notns; /* Link list of notations */ - slist elems; /* Link list of elements */ - void *tab_ents; /* Hash table of general entities */ - void *tab_pents; /* Hash table of parameter entities */ - void *tab_notns; /* Hash table of notations */ - void *tab_elems; /* Hash table of elements */ - void *tab_enodes; /* Hash table of element sons */ - void *tab_attrs; /* Hash table of element attributes */ - void *tab_evals; /* Hash table of enumerated attribute values */ - void *tab_enotns; /* hash table of enumerated attribute notations */ -}; - -/* Notations */ - -enum xml_dtd_notn_flags { - XML_DTD_NOTN_DECLARED = 0x1, /* The notation has been declared (internal usage) */ -}; - -struct xml_dtd_notn { - snode n; /* Node in xml_dtd.notns */ - uint flags; /* XML_DTD_NOTN_x */ - char *name; /* Notation name */ - char *system_id; /* External ID */ - char *public_id; - void *user; /* User-defined */ -}; - -struct xml_dtd_notn *xml_dtd_find_notn(struct xml_context *ctx, char *name); - -/* Entities */ - -enum xml_dtd_entity_flags { - XML_DTD_ENTITY_DECLARED = 0x1, /* The entity has been declared (internal usage) */ - XML_DTD_ENTITY_VISITED = 0x2, /* Cycle detection (internal usage) */ - XML_DTD_ENTITY_PARAMETER = 0x4, /* Parameter entity, general otherwise */ - XML_DTD_ENTITY_EXTERNAL = 0x8, /* External entity, internal otherwise */ - XML_DTD_ENTITY_UNPARSED = 
0x10, /* Unparsed entity, parsed otherwise */ - XML_DTD_ENTITY_TRIVIAL = 0x20, /* Replacement text is a sequence of characters and character references */ -}; - -struct xml_dtd_entity { - snode n; /* Node in xml_dtd.[gp]ents */ - uint flags; /* XML_DTD_ENT_x */ - char *name; /* Entity name */ - char *text; /* Replacement text / expanded replacement text (XML_DTD_ENT_TRIVIAL) */ - uint len; /* Text length */ - char *system_id; /* External ID */ - char *public_id; - struct xml_dtd_notn *notn; /* Notation (XML_DTD_ENT_UNPARSED only) */ - void *user; /* User-defined */ -}; - -struct xml_dtd_entity *xml_dtd_find_entity(struct xml_context *ctx, char *name); - -/* Elements */ - -enum xml_dtd_elem_flags { - XML_DTD_ELEM_DECLARED = 0x1, /* The element has been declared (internal usage) */ -}; - -enum xml_dtd_elem_type { - XML_DTD_ELEM_EMPTY, - XML_DTD_ELEM_ANY, - XML_DTD_ELEM_MIXED, - XML_DTD_ELEM_CHILDREN, -}; - -struct xml_dtd_elem { - snode n; - uint flags; - uint type; - char *name; - struct xml_dtd_elem_node *node; - slist attrs; - void *user; /* User-defined */ -}; - -struct xml_dtd_elem_node { - snode n; - struct xml_dtd_elem_node *parent; - struct xml_dtd_elem *elem; - slist sons; - uint type; - uint occur; - void *user; /* User-defined */ -}; - -enum xml_dtd_elem_node_type { - XML_DTD_ELEM_PCDATA, - XML_DTD_ELEM_SEQ, - XML_DTD_ELEM_OR, -}; - -enum xml_dtd_elem_node_occur { - XML_DTD_ELEM_OCCUR_ONCE, - XML_DTD_ELEM_OCCUR_OPT, - XML_DTD_ELEM_OCCUR_MULT, - XML_DTD_ELEM_OCCUR_PLUS, -}; - -struct xml_dtd_elem *xml_dtd_find_elem(struct xml_context *ctx, char *name); - -/* Attributes */ - -enum xml_dtd_attr_default { - XML_ATTR_NONE, - XML_ATTR_REQUIRED, - XML_ATTR_IMPLIED, - XML_ATTR_FIXED, -}; - -enum xml_dtd_attr_type { - XML_ATTR_CDATA, - XML_ATTR_ID, - XML_ATTR_IDREF, - XML_ATTR_IDREFS, - XML_ATTR_ENTITY, - XML_ATTR_ENTITIES, - XML_ATTR_NMTOKEN, - XML_ATTR_NMTOKENS, - XML_ATTR_ENUM, - XML_ATTR_NOTATION, -}; - -struct xml_dtd_attr { - snode n; - char *name; /* 
Attribute name */ - struct xml_dtd_elem *elem; /* Owner element */ - uint type; /* See enum xml_dtd_attr_type */ - uint default_mode; /* See enum xml_dtd_attr_default */ - char *default_value; /* The default value defined in DTD (or NULL) */ -}; - -struct xml_dtd_eval { - struct xml_dtd_attr *attr; - char *val; -}; - -struct xml_dtd_enotn { - struct xml_dtd_attr *attr; - struct xml_dtd_notn *notn; -}; - -void xml_dtd_init(struct xml_context *ctx); -void xml_dtd_cleanup(struct xml_context *ctx); -void xml_dtd_finish(struct xml_context *ctx); - -struct xml_dtd_attr *xml_dtd_find_attr(struct xml_context *ctx, struct xml_dtd_elem *elem, char *name); - -#endif diff --git a/xml/internals.h b/xml/internals.h deleted file mode 100644 index a605733b..00000000 --- a/xml/internals.h +++ /dev/null @@ -1,326 +0,0 @@ -/* - * UCW Library -- A simple XML parser - * - * (c) 2007--2008 Pavel Charvat - * - * This software may be freely distributed and used according to the terms - * of the GNU Lesser General Public License. 
- */ - -#ifndef _UCW_XML_INTERNALS_H -#define _UCW_XML_INTERNALS_H - -#include -#include - -#ifdef CONFIG_UCW_CLEAN_ABI -#define xml_attrs_table_cleanup ucw_xml_attrs_table_cleanup -#define xml_attrs_table_init ucw_xml_attrs_table_init -#define xml_fatal_expected ucw_xml_fatal_expected -#define xml_fatal_expected_quot ucw_xml_fatal_expected_quot -#define xml_fatal_expected_white ucw_xml_fatal_expected_white -#define xml_fatal_nested ucw_xml_fatal_nested -#define xml_hash_new ucw_xml_hash_new -#define xml_parse_attr_list_decl ucw_xml_parse_attr_list_decl -#define xml_parse_attr_value ucw_xml_parse_attr_value -#define xml_parse_char_ref ucw_xml_parse_char_ref -#define xml_parse_element_decl ucw_xml_parse_element_decl -#define xml_parse_entity_decl ucw_xml_parse_entity_decl -#define xml_parse_eq ucw_xml_parse_eq -#define xml_parse_name ucw_xml_parse_name -#define xml_parse_nmtoken ucw_xml_parse_nmtoken -#define xml_parse_notation_decl ucw_xml_parse_notation_decl -#define xml_parse_pe_ref ucw_xml_parse_pe_ref -#define xml_parse_pubid_literal ucw_xml_parse_pubid_literal -#define xml_parse_system_literal ucw_xml_parse_system_literal -#define xml_pop_comment ucw_xml_pop_comment -#define xml_pop_pi ucw_xml_pop_pi -#define xml_push_comment ucw_xml_push_comment -#define xml_push_entity ucw_xml_push_entity -#define xml_push_pi ucw_xml_push_pi -#define xml_push_source ucw_xml_push_source -#define xml_refill ucw_xml_refill -#define xml_skip_comment ucw_xml_skip_comment -#define xml_skip_internal_subset ucw_xml_skip_internal_subset -#define xml_skip_name ucw_xml_skip_name -#define xml_skip_pi ucw_xml_skip_pi -#define xml_sources_cleanup ucw_xml_sources_cleanup -#define xml_spout_chars ucw_xml_spout_chars -#define xml_throw ucw_xml_throw -#define xml_validate_attr ucw_xml_validate_attr -#endif - -/*** Debugging ***/ - -#ifdef LOCAL_DEBUG -#define TRACE(c, f, p...) do { DBG("XML %u: " f, xml_row(c), ##p); } while(0) -#else -#define TRACE(c, f, p...) 
do {} while(0) -#endif - -/*** Error handling ***/ - -void NONRET xml_throw(struct xml_context *ctx); - -/*** Memory management ***/ - -struct xml_stack { - struct xml_stack *next; - struct mempool_state state; - uint flags; -}; - -static inline void *xml_do_push(struct xml_context *ctx, uint size) -{ - /* Saves ctx->stack and ctx->flags state */ - struct mempool_state state; - mp_save(ctx->stack, &state); - struct xml_stack *s = mp_alloc(ctx->stack, size); - s->state = state; - s->flags = ctx->flags; - s->next = ctx->stack_list; - ctx->stack_list = s; - return s; -} - -static inline void xml_do_pop(struct xml_context *ctx, struct xml_stack *s) -{ - /* Restore ctx->stack and ctx->flags state */ - ctx->stack_list = s->next; - ctx->flags = s->flags; - mp_restore(ctx->stack, &s->state); -} - -static inline void xml_push(struct xml_context *ctx) -{ - TRACE(ctx, "push"); - xml_do_push(ctx, sizeof(struct xml_stack)); -} - -static inline void xml_pop(struct xml_context *ctx) -{ - TRACE(ctx, "pop"); - ASSERT(ctx->stack_list); - xml_do_pop(ctx, ctx->stack_list); -} - -struct xml_dom_stack { - struct xml_stack stack; - struct mempool_state state; -}; - -static inline struct xml_node *xml_push_dom(struct xml_context *ctx, struct mempool_state *state) -{ - /* Create a new DOM node */ - TRACE(ctx, "push_dom"); - struct xml_dom_stack *s = xml_do_push(ctx, sizeof(*s)); - if (state) - s->state = *state; - else - mp_save(ctx->pool, &s->state); - struct xml_node *n = mp_alloc(ctx->pool, sizeof(*n)); - n->user = NULL; - if (n->parent = ctx->node) - clist_add_tail(&n->parent->sons, &n->n); - return ctx->node = n; -} - -static inline void xml_pop_dom(struct xml_context *ctx, uint free) -{ - /* Leave DOM subtree */ - TRACE(ctx, "pop_dom"); - ASSERT(ctx->node); - struct xml_node *p = ctx->node->parent; - struct xml_dom_stack *s = (void *)ctx->stack_list; - if (free) - { - /* See xml_pop_element() for cleanup of attribute hash table */ - if (p) - clist_remove(&ctx->node->n); - 
mp_restore(ctx->pool, &s->state); - } - ctx->node = p; - xml_do_pop(ctx, &s->stack); -} - -#define XML_HASH_HDR_SIZE ALIGN_TO(sizeof(void *), CPU_STRUCT_ALIGN) -#define XML_HASH_GIVE_ALLOC struct HASH_PREFIX(table); \ - static inline void *HASH_PREFIX(alloc)(struct HASH_PREFIX(table) *t, uint size) \ - { return mp_alloc(*(void **)((void *)t - XML_HASH_HDR_SIZE), size); } \ - static inline void HASH_PREFIX(free)(struct HASH_PREFIX(table) *t UNUSED, void *p UNUSED) {} - -void *xml_hash_new(struct mempool *pool, uint size); - -void xml_spout_chars(struct fastbuf *fb); - -/*** Reading of document/external entities ***/ - -void NONRET xml_fatal_nested(struct xml_context *ctx); - -static inline void xml_inc(struct xml_context *ctx) -{ - /* Called after the first character of a block */ - TRACE(ctx, "inc"); - ctx->depth++; -} - -static inline void xml_dec(struct xml_context *ctx) -{ - /* Called after the last character of a block */ - TRACE(ctx, "dec"); - if (unlikely(!ctx->depth--)) - xml_fatal_nested(ctx); -} - -#include "obj/xml/unicat.h" - -static inline uint xml_char_cat(uint c) -{ - if (c < 0x10000) - return 1U << ucw_xml_char_tab1[(c & 0xff) + ucw_xml_char_tab2[c >> 8]]; - else if (likely(c < 0x110000)) - return 1U << ucw_xml_char_tab3[c >> 16]; - else - return 1; -} - -static inline uint xml_ascii_cat(uint c) -{ - return ucw_xml_char_tab1[c]; -} - -struct xml_source *xml_push_source(struct xml_context *ctx); -void xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent); - -void xml_refill(struct xml_context *ctx); - -static inline uint xml_peek_char(struct xml_context *ctx) -{ - if (ctx->bptr == ctx->bstop) - xml_refill(ctx); - return ctx->bptr[0]; -} - -static inline uint xml_peek_cat(struct xml_context *ctx) -{ - if (ctx->bptr == ctx->bstop) - xml_refill(ctx); - return ctx->bptr[1]; -} - -static inline uint xml_get_char(struct xml_context *ctx) -{ - uint c = xml_peek_char(ctx); - ctx->bptr += 2; - return c; -} - -static inline uint 
xml_get_cat(struct xml_context *ctx) -{ - uint c = xml_peek_cat(ctx); - ctx->bptr += 2; - return c; -} - -static inline uint xml_last_char(struct xml_context *ctx) -{ - return ctx->bptr[-2]; -} - -static inline uint xml_last_cat(struct xml_context *ctx) -{ - return ctx->bptr[-1]; -} - -static inline uint xml_skip_char(struct xml_context *ctx) -{ - uint c = ctx->bptr[0]; - ctx->bptr += 2; - return c; -} - -static inline uint xml_unget_char(struct xml_context *ctx) -{ - return *(ctx->bptr -= 2); -} - -void xml_sources_cleanup(struct xml_context *ctx); - -/*** Parsing ***/ - -void NONRET xml_fatal_expected(struct xml_context *ctx, uint c); -void NONRET xml_fatal_expected_white(struct xml_context *ctx); -void NONRET xml_fatal_expected_quot(struct xml_context *ctx); - -static inline uint xml_parse_white(struct xml_context *ctx, uint mandatory) -{ - /* mandatory=1 -> S ::= (#x20 | #x9 | #xD | #xA)+ - * mandatory=0 -> S? */ - uint cnt = 0; - while (xml_peek_cat(ctx) & XML_CHAR_WHITE) - { - xml_skip_char(ctx); - cnt++; - } - if (unlikely(mandatory && !cnt)) - xml_fatal_expected_white(ctx); - return cnt; -} - -static inline void xml_parse_char(struct xml_context *ctx, uint c) -{ - /* Consumes a given Unicode character */ - if (unlikely(c != xml_get_char(ctx))) - xml_fatal_expected(ctx, c); -} - -static inline void xml_parse_seq(struct xml_context *ctx, const char *seq) -{ - /* Consumes a given sequence of ASCII characters */ - while (*seq) - xml_parse_char(ctx, *seq++); -} - -void xml_parse_eq(struct xml_context *ctx); - -static inline uint xml_parse_quote(struct xml_context *ctx) -{ - /* "'" | '"' */ - uint c = xml_get_char(ctx); - if (unlikely(c != '\'' && c != '\"')) - xml_fatal_expected_quot(ctx); - return c; -} - -char *xml_parse_name(struct xml_context *ctx, struct mempool *pool); -void xml_skip_name(struct xml_context *ctx); -char *xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool); - -char *xml_parse_system_literal(struct xml_context *ctx, struct 
mempool *pool); -char *xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool); - -uint xml_parse_char_ref(struct xml_context *ctx); -void xml_parse_pe_ref(struct xml_context *ctx); - -char *xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr); - -void xml_skip_internal_subset(struct xml_context *ctx); -void xml_parse_notation_decl(struct xml_context *ctx); -void xml_parse_entity_decl(struct xml_context *ctx); -void xml_parse_element_decl(struct xml_context *ctx); -void xml_parse_attr_list_decl(struct xml_context *ctx); - -void xml_push_comment(struct xml_context *ctx); -void xml_pop_comment(struct xml_context *ctx); -void xml_skip_comment(struct xml_context *ctx); - -void xml_push_pi(struct xml_context *ctx); -void xml_pop_pi(struct xml_context *ctx); -void xml_skip_pi(struct xml_context *ctx); - -void xml_attrs_table_init(struct xml_context *ctx); -void xml_attrs_table_cleanup(struct xml_context *ctx); - -void xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value); - -#endif diff --git a/xml/libucw-xml.pc b/xml/libucw-xml.pc deleted file mode 100644 index 5c02e996..00000000 --- a/xml/libucw-xml.pc +++ /dev/null @@ -1,11 +0,0 @@ -# pkg-config metadata for libucw-xml - -libdir=@LIBDIR@ -incdir=. - -Name: libucw-xml -Description: XML parser for LibUCW project -Version: @UCW_VERSION@ -Cflags: -I${incdir} -Libs: -L${libdir} @SO_LINK_PATH@ -lucw-xml@UCW_ABI_SUFFIX@ -Requires.private: @DEPS@ diff --git a/xml/parse.c b/xml/parse.c deleted file mode 100644 index 1187d8be..00000000 --- a/xml/parse.c +++ /dev/null @@ -1,1287 +0,0 @@ -/* - * UCW Library -- A simple XML parser - * - * (c) 2007--2008 Pavel Charvat - * - * This software may be freely distributed and used according to the terms - * of the GNU Lesser General Public License. 
- */ - -#undef LOCAL_DEBUG - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/*** Basic parsing ***/ - -void NONRET -xml_fatal_expected(struct xml_context *ctx, uint c) -{ - if (c >= 32 && c < 127) - xml_fatal(ctx, "Expected '%c'", c); - else - xml_fatal(ctx, "Expected U+%04x", c); -} - -void NONRET -xml_fatal_expected_white(struct xml_context *ctx) -{ - xml_fatal(ctx, "Expected a white space"); -} - -void NONRET -xml_fatal_expected_quot(struct xml_context *ctx) -{ - xml_fatal(ctx, "Expected a quotation mark"); -} - -void -xml_parse_eq(struct xml_context *ctx) -{ - /* Eq ::= S? '=' S? */ - xml_parse_white(ctx, 0); - xml_parse_char(ctx, '='); - xml_parse_white(ctx, 0); -} - -/*** Names and nmtokens ***/ - -static char * -xml_parse_string(struct xml_context *ctx, struct mempool *pool, uint first_cat, uint next_cat, char *err) -{ - char *p = mp_start_noalign(pool, 1); - if (unlikely(!(xml_peek_cat(ctx) & first_cat))) - xml_fatal(ctx, "%s", err); - do - { - p = mp_spread(pool, p, 5); - p = utf8_32_put(p, xml_skip_char(ctx)); - } - while (xml_peek_cat(ctx) & next_cat); - *p++ = 0; - return mp_end(pool, p); -} - -static void -xml_skip_string(struct xml_context *ctx, uint first_cat, uint next_cat, char *err) -{ - if (unlikely(!(xml_get_cat(ctx) & first_cat))) - xml_fatal(ctx, "%s", err); - while (xml_peek_cat(ctx) & next_cat) - xml_skip_char(ctx); -} - -char * -xml_parse_name(struct xml_context *ctx, struct mempool *pool) -{ - /* Name ::= NameStartChar (NameChar)* */ - return xml_parse_string(ctx, pool, ctx->cat_sname, ctx->cat_name, "Expected a name"); -} - -void -xml_skip_name(struct xml_context *ctx) -{ - xml_skip_string(ctx, ctx->cat_sname, ctx->cat_name, "Expected a name"); -} - -char * -xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool) -{ - /* Nmtoken ::= (NameChar)+ */ - return xml_parse_string(ctx, pool, ctx->cat_name, ctx->cat_name, "Expected a nmtoken"); -} - -/*** Simple literals ***/ - 
-char * -xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool) -{ - /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */ - char *p = mp_start_noalign(pool, 1); - uint q = xml_parse_quote(ctx), c; - while ((c = xml_get_char(ctx)) != q) - { - p = mp_spread(pool, p, 5); - p = utf8_32_put(p, c); - } - *p++ = 0; - return mp_end(pool, p); -} - -char * -xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool) -{ - /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */ - char *p = mp_start_noalign(pool, 1); - uint q = xml_parse_quote(ctx), c; - while ((c = xml_get_char(ctx)) != q) - { - if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID))) - xml_fatal(ctx, "Expected a pubid character"); - p = mp_spread(pool, p, 2); - *p++ = c; - } - *p++ = 0; - return mp_end(pool, p); -} - -/*** Comments ***/ - -void -xml_push_comment(struct xml_context *ctx) -{ - TRACE(ctx, "push_comment"); - /* Comment ::= '' - * Already parsed: 'type = XML_NODE_COMMENT; - char *p = mp_start_noalign(ctx->pool, 6); - while (1) - { - if (xml_get_char(ctx) == '-') - if (xml_get_char(ctx) == '-') - break; - else - *p++ = '-'; - p = utf8_32_put(p, xml_last_char(ctx)); - p = mp_spread(ctx->pool, p, 6); - } - xml_parse_char(ctx, '>'); - *p = 0; - n->len = p - (char *)mp_ptr(ctx->pool); - n->text = mp_end(ctx->pool, p + 1); - if ((ctx->flags & XML_REPORT_COMMENTS) && ctx->h_comment) - ctx->h_comment(ctx); -} - -void -xml_pop_comment(struct xml_context *ctx) -{ - xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_COMMENTS)); - xml_dec(ctx); - TRACE(ctx, "pop_comment"); -} - -void -xml_skip_comment(struct xml_context *ctx) -{ - TRACE(ctx, "skip_comment"); - xml_parse_char(ctx, '-'); - while (xml_get_char(ctx) != '-' || xml_get_char(ctx) != '-'); - xml_parse_char(ctx, '>'); - xml_dec(ctx); -} - -/*** Processing instructions ***/ - -void -xml_push_pi(struct xml_context *ctx) -{ - TRACE(ctx, "push_pi"); - /* Parses a PI to ctx->value and ctx->name: - * PI ::= '' Char*)))? 
'?>' - * PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) - * Already parsed: 'type = XML_NODE_PI; - n->name = xml_parse_name(ctx, ctx->pool); - if (unlikely(!strcasecmp(n->name, "xml"))) - xml_error(ctx, "Reserved PI target"); - char *p = mp_start_noalign(ctx->pool, 5); - if (!xml_parse_white(ctx, 0)) - xml_parse_seq(ctx, "?>"); - else - while (1) - { - if (xml_get_char(ctx) == '?') - if (xml_peek_char(ctx) == '>') - { - xml_skip_char(ctx); - break; - } - else - *p++ = '?'; - else - p = utf8_32_put(p, xml_last_char(ctx)); - p = mp_spread(ctx->pool, p, 5); - } - *p = 0; - n->len = p - (char *)mp_ptr(ctx->pool); - n->text = mp_end(ctx->pool, p + 1); - if ((ctx->flags & XML_REPORT_PIS) && ctx->h_pi) - ctx->h_pi(ctx); -} - -void -xml_pop_pi(struct xml_context *ctx) -{ - xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_PIS)); - xml_dec(ctx); - TRACE(ctx, "pop_pi"); -} - -void -xml_skip_pi(struct xml_context *ctx) -{ - TRACE(ctx, "skip_pi"); - if (ctx->flags & XML_VALIDATING) - { - struct mempool_state state; - mp_save(ctx->stack, &state); - if (unlikely(!strcasecmp(xml_parse_name(ctx, ctx->stack), "xml"))) - xml_error(ctx, "Reserved PI target"); - mp_restore(ctx->stack, &state); - if (!xml_parse_white(ctx, 0)) - { - xml_parse_seq(ctx, "?>"); - xml_dec(ctx); - return; - } - } - while (1) - if (xml_get_char(ctx) == '?') - if (xml_peek_char(ctx) == '>') - break; - xml_skip_char(ctx); - xml_dec(ctx); -} - -/*** Character references ***/ - -uint -xml_parse_char_ref(struct xml_context *ctx) -{ - TRACE(ctx, "parse_char_ref"); - /* CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' - * Already parsed: '&#' */ - uint v = 0; - if (xml_get_char(ctx) == 'x') - { - if (!(xml_get_cat(ctx) & XML_CHAR_XDIGIT)) - { - xml_error(ctx, "Expected a hexadecimal value of character reference"); - goto recover; - } - do - { - v = (v << 4) + Cxvalue(xml_last_char(ctx)); - } - while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_XDIGIT)); - } - else - { - if (!(xml_last_cat(ctx) & 
XML_CHAR_DIGIT)) - { - xml_error(ctx, "Expected a numeric value of character reference"); - goto recover; - } - do - { - v = v * 10 + xml_last_char(ctx) - '0'; - } - while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT)); - } - uint cat = xml_char_cat(v); - if (!(cat & ctx->cat_unrestricted)) - { - xml_error(ctx, "Character reference out of range"); - goto recover; - } - if (xml_last_char(ctx) == ';') - { - xml_dec(ctx); - return v; - } - xml_error(ctx, "Expected ';'"); -recover: - while (xml_last_char(ctx) != ';') - xml_get_char(ctx); - xml_dec(ctx); - return UNI_REPLACEMENT; -} - -/*** References to general entities ***/ - -static void -xml_parse_ref(struct xml_context *ctx) -{ - /* Reference ::= EntityRef | CharRef - * EntityRef ::= '&' Name ';' - * Already parsed: '&' */ - struct fastbuf *out = &ctx->chars; - if (xml_peek_char(ctx) == '#') - { - xml_skip_char(ctx); - bput_utf8_32(out, xml_parse_char_ref(ctx)); - } - else - { - TRACE(ctx, "parse_ge_ref"); - struct mempool_state state; - mp_save(ctx->stack, &state); - char *name = xml_parse_name(ctx, ctx->stack); - xml_parse_char(ctx, ';'); - struct xml_dtd_entity *ent = xml_dtd_find_entity(ctx, name); - if (!ent) - { - xml_error(ctx, "Unknown entity &%s;", name); - bputc(out, '&'); - bputs(out, name); - bputc(out, ';'); - } - else if (ent->flags & XML_DTD_ENTITY_TRIVIAL) - { - TRACE(ctx, "Trivial entity &%s;", name); - bputs(out, ent->text); - } - else - { - TRACE(ctx, "Pushed entity &%s;", name); - mp_restore(ctx->stack, &state); - xml_dec(ctx); - xml_push_entity(ctx, ent); - return; - } - mp_restore(ctx->stack, &state); - xml_dec(ctx); - } -} - -/*** Character data ***/ - -void -xml_spout_chars(struct fastbuf *fb) -{ - if (fb->bptr < fb->bufend) - return; - struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb); - struct mempool *pool = ctx->pool; - if (fb->bufend != fb->buffer) - { - TRACE(ctx, "growing chars"); - uint len = fb->bufend - fb->buffer; - uint reported = fb->bstop - fb->buffer; - 
fb->buffer = mp_expand(pool); - fb->bufend = fb->buffer + mp_avail(pool); - fb->bptr = fb->buffer + len; - fb->bstop = fb->buffer + reported; - } - else - { - TRACE(ctx, "starting chars"); - mp_save(pool, &ctx->chars_state); - fb->bptr = fb->buffer = fb->bstop = mp_start_noalign(pool, 2); - fb->bufend = fb->buffer + mp_avail(pool) - 1; - } -} - -static inline uint -xml_end_chars(struct xml_context *ctx, char **out) -{ - struct fastbuf *fb = &ctx->chars; - uint len = fb->bptr - fb->buffer; - if (len) - { - TRACE(ctx, "ending chars"); - *fb->bptr = 0; - *out = mp_end(ctx->pool, fb->bptr + 1); - fb->bufend = fb->bstop = fb->bptr = fb->buffer; - } - return len; -} - -static inline uint -xml_report_chars(struct xml_context *ctx, char **out) -{ - struct fastbuf *fb = &ctx->chars; - uint len = fb->bptr - fb->buffer; - if (len) - { - *fb->bptr = 0; - *out = fb->bstop; - fb->bstop = fb->bptr; - } - return len; -} - -static inline uint -xml_flush_chars(struct xml_context *ctx) -{ - char *text, *rtext; - uint len = xml_end_chars(ctx, &text), rlen; - if (len) - { - if (ctx->flags & XML_NO_CHARS) - { - if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_ignorable) - ctx->h_ignorable(ctx, text, len); - mp_restore(ctx->pool, &ctx->chars_state); - return 0; - } - if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext))) - ctx->h_block(ctx, rtext, rlen); - if (!(ctx->flags & XML_ALLOC_CHARS) && !(ctx->flags & XML_REPORT_CHARS)) - { - mp_restore(ctx->pool, &ctx->chars_state); - return 0; - } - struct xml_node *n = xml_push_dom(ctx, &ctx->chars_state); - n->type = XML_NODE_CHARS; - n->text = text; - n->len = len; - if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_chars) - ctx->h_chars(ctx); - } - return len; -} - -static inline void -xml_pop_chars(struct xml_context *ctx) -{ - xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS)); - TRACE(ctx, "pop_chars"); -} - -static inline void -xml_append_chars(struct xml_context *ctx) -{ - TRACE(ctx, "append_chars"); - 
struct fastbuf *out = &ctx->chars; - if (ctx->flags & XML_NO_CHARS) - while (xml_get_char(ctx) != '<') - if (xml_last_cat(ctx) & XML_CHAR_WHITE) - bput_utf8_32(out, xml_last_char(ctx)); - else - { - xml_error(ctx, "This element must not contain character data"); - while (xml_get_char(ctx) != '<'); - break; - } - else - while (xml_get_char(ctx) != '<') - if (xml_last_char(ctx) == '&') - { - xml_inc(ctx); - xml_parse_ref(ctx); - } - else - bput_utf8_32(out, xml_last_char(ctx)); - xml_unget_char(ctx); -} - -/*** CDATA sections ***/ - -static void -xml_skip_cdata(struct xml_context *ctx) -{ - TRACE(ctx, "skip_cdata"); - xml_parse_seq(ctx, "CDATA["); - while (xml_get_char(ctx) != ']' || xml_get_char(ctx) != ']' || xml_get_char(ctx) != '>'); - xml_dec(ctx); -} - -static void -xml_append_cdata(struct xml_context *ctx) -{ - /* CDSect :== '' Char*)) ']]>' - * Already parsed: 'flags & XML_NO_CHARS) - { - xml_error(ctx, "This element must not contain CDATA"); - xml_skip_cdata(ctx); - return; - } - xml_parse_seq(ctx, "CDATA["); - struct fastbuf *out = &ctx->chars; - uint rlen; - char *rtext; - if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext))) - ctx->h_block(ctx, rtext, rlen); - while (1) - { - if (xml_get_char(ctx) == ']') - { - if (xml_get_char(ctx) == ']') - if (xml_get_char(ctx) == '>') - break; - else - bputc(out, ']'); - bputc(out, ']'); - } - bput_utf8_32(out, xml_last_char(ctx)); - } - if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_cdata && (rlen = xml_report_chars(ctx, &rtext))) - ctx->h_cdata(ctx, rtext, rlen); - xml_dec(ctx); -} - -/*** Attribute values ***/ - -char * -xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr UNUSED) -{ - TRACE(ctx, "parse_attr_value"); - /* AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'" */ - /* FIXME: -- check value constrains / normalize leading/trailing WS and repeated WS */ - struct mempool_state state; - uint quote = xml_parse_quote(ctx); - 
mp_save(ctx->stack, &state); - struct fastbuf *out = &ctx->chars; - struct xml_source *src = ctx->src; - while (1) - { - uint c = xml_get_char(ctx); - if (c == '&') - { - xml_inc(ctx); - xml_parse_ref(ctx); - } - else if (c == quote && src == ctx->src) - break; - else if (c == '<') - xml_error(ctx, "Attribute value must not contain '<'"); - else if (xml_last_cat(ctx) & XML_CHAR_WHITE) - bputc(out, ' '); - else - bput_utf8_32(out, c); - } - mp_restore(ctx->stack, &state); - char *text; - return xml_end_chars(ctx, &text) ? text : ""; -} - -uint -xml_normalize_white(struct xml_context *ctx UNUSED, char *text) -{ - char *s = text, *d = text; - while (*s == 0x20) - s++; - while (1) - { - while (*s & ~0x20) - *d++ = *s++; - if (!*s) - break; - while (*++s == 0x20); - *d++ = 0x20; - } - if (d != text && d[-1] == 0x20) - d--; - *d = 0; - return d - text; -} - -/*** Attributes ***/ - -struct xml_attrs_table; - -static inline uint -xml_attrs_hash(struct xml_attrs_table *t UNUSED, struct xml_node *e, char *n) -{ - return hash_pointer(e) ^ hash_string(n); -} - -static inline int -xml_attrs_eq(struct xml_attrs_table *t UNUSED, struct xml_node *e1, char *n1, struct xml_node *e2, char *n2) -{ - return (e1 == e2) && !strcmp(n1, n2); -} - -static inline void -xml_attrs_init_key(struct xml_attrs_table *t UNUSED, struct xml_attr *a, struct xml_node *e, char *name) -{ - a->elem = e; - a->name = name; - a->val = NULL; - a->user = NULL; - slist_add_tail(&e->attrs, &a->n); -} - -#define HASH_PREFIX(x) xml_attrs_##x -#define HASH_NODE struct xml_attr -#define HASH_KEY_COMPLEX(x) x elem, x name -#define HASH_KEY_DECL struct xml_node *elem, char *name -#define HASH_TABLE_DYNAMIC -#define HASH_GIVE_EQ -#define HASH_GIVE_HASHFN -#define HASH_GIVE_INIT_KEY -#define HASH_WANT_CLEANUP -#define HASH_WANT_REMOVE -#define HASH_WANT_LOOKUP -#define HASH_WANT_FIND -#define HASH_GIVE_ALLOC -XML_HASH_GIVE_ALLOC -#include - -static void -xml_parse_attr(struct xml_context *ctx) -{ - TRACE(ctx, 
"parse_attr"); - /* Attribute ::= Name Eq AttValue */ - struct xml_node *e = ctx->node; - char *n = xml_parse_name(ctx, ctx->pool); - struct xml_attr *a = xml_attrs_lookup(ctx->tab_attrs, e, n); - xml_parse_eq(ctx); - char *v = xml_parse_attr_value(ctx, NULL); - if (a->val) - { - xml_error(ctx, "Attribute %s is not unique in element <%s>", n, e->name); - return; - } - a->val = v; - if (!e->dtd) - a->dtd = NULL; - else if (!(a->dtd = xml_dtd_find_attr(ctx, e->dtd, a->name))) - xml_error(ctx, "Undefined attribute %s in element <%s>", n, e->name); - else - xml_validate_attr(ctx, a->dtd, a->val); -} - -struct xml_attr * -xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name) -{ - return xml_attrs_find(ctx->tab_attrs, node, name); -} - -char * -xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name) -{ - struct xml_attr *attr = xml_attrs_find(ctx->tab_attrs, node, name); - if (attr) - return attr->val; - if (!node->dtd) - return NULL; - struct xml_dtd_attr *dtd = xml_dtd_find_attr(ctx, node->dtd, name); - return dtd ? dtd->default_value : NULL; -} - -void -xml_attrs_table_init(struct xml_context *ctx) -{ - xml_attrs_init(ctx->tab_attrs = xml_hash_new(ctx->pool, sizeof(struct xml_attrs_table))); -} - -void -xml_attrs_table_cleanup(struct xml_context *ctx) -{ - xml_attrs_cleanup(ctx->tab_attrs); -} - -/*** Elements ***/ - -static uint -xml_validate_element(struct xml_dtd_elem_node *root, struct xml_dtd_elem *elem) -{ - if (root->elem) - return elem == root->elem; - else - SLIST_FOR_EACH(struct xml_dtd_elem_node *, son, root->sons) - if (xml_validate_element(son, elem)) - return 1; - return 0; -} - -static void -xml_push_element(struct xml_context *ctx) -{ - TRACE(ctx, "push_element"); - /* EmptyElemTag | STag - * EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' - * STag ::= '<' Name (S Attribute)* S? 
'>' - * Already parsed: '<' */ - struct xml_node *e = xml_push_dom(ctx, NULL); - clist_init(&e->sons); - e->type = XML_NODE_ELEM; - e->name = xml_parse_name(ctx, ctx->pool); - slist_init(&e->attrs); - if (!e->parent) - { - ctx->dom = e; - if (ctx->doctype && strcmp(e->name, ctx->doctype)) - xml_error(ctx, "The root element <%s> does not match the document type <%s>", e->name, ctx->doctype); - } - if (!ctx->dtd) - e->dtd = NULL; - else if (!(e->dtd = xml_dtd_find_elem(ctx, e->name))) - xml_error(ctx, "Undefined element <%s>", e->name); - else - { - struct xml_dtd_elem *dtd = e->dtd, *parent_dtd = e->parent ? e->parent->dtd : NULL; - if (dtd->type == XML_DTD_ELEM_MIXED) - ctx->flags &= ~XML_NO_CHARS; - else - ctx->flags |= XML_NO_CHARS; - if (parent_dtd) - if (parent_dtd->type == XML_DTD_ELEM_EMPTY) - xml_error(ctx, "Empty element must not contain children"); - else if (parent_dtd->type != XML_DTD_ELEM_ANY) - { - // FIXME: validate regular expressions - if (!xml_validate_element(parent_dtd->node, dtd)) - xml_error(ctx, "Unexpected element <%s>", e->name); - } - } - while (1) - { - uint white = xml_parse_white(ctx, 0); - uint c = xml_get_char(ctx); - if (c == '/') - { - xml_parse_char(ctx, '>'); - ctx->flags |= XML_EMPTY_ELEM_TAG; - break; - } - else if (c == '>') - break; - else if (!white) - xml_fatal_expected_white(ctx); - xml_unget_char(ctx); - xml_parse_attr(ctx); - } - if (e->dtd) - SLIST_FOR_EACH(struct xml_dtd_attr *, a, e->dtd->attrs) - if (a->default_mode == XML_ATTR_REQUIRED) - { - if (!xml_attrs_find(ctx->tab_attrs, e, a->name)) - xml_error(ctx, "Missing required attribute %s in element <%s>", a->name, e->name); - } - else if (a->default_mode != XML_ATTR_IMPLIED && ctx->flags & XML_ALLOC_DEFAULT_ATTRS) - { - struct xml_attr *attr = xml_attrs_lookup(ctx->tab_attrs, e, a->name); - if (!attr->val) - attr->val = a->default_value; - } - if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_stag) - ctx->h_stag(ctx); -} - -static void -xml_pop_element(struct xml_context 
*ctx) -{ - TRACE(ctx, "pop_element"); - if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_etag) - ctx->h_etag(ctx); - struct xml_node *e = ctx->node; - uint free = !(ctx->flags & XML_ALLOC_TAGS); - if (free) - { - if (!e->parent) - ctx->dom = NULL; - /* Restore hash table of attributes */ - SLIST_FOR_EACH(struct xml_attr *, a, e->attrs) - xml_attrs_remove(ctx->tab_attrs, a); - struct xml_node *n; - while (n = clist_head(&e->sons)) - { - if (n->type == XML_NODE_ELEM) - { - SLIST_FOR_EACH(struct xml_attr *, a, n->attrs) - xml_attrs_remove(ctx->tab_attrs, a); - clist_insert_list_after(&n->sons, &n->n); - } - clist_remove(&n->n); - } - } - xml_pop_dom(ctx, free); - xml_dec(ctx); -} - -static void -xml_parse_etag(struct xml_context *ctx) -{ - /* ETag ::= '' - * Already parsed: '<' */ - struct xml_node *e = ctx->node; - ASSERT(e); - char *n = e->name; - while (*n) - { - uint c; - n = utf8_32_get(n, &c); - if (xml_get_char(ctx) != c) - goto recover; - } - xml_parse_white(ctx, 0); - if (xml_get_char(ctx) != '>') - { -recover: - xml_error(ctx, "Invalid ETag, expected ", e->name); - while (xml_get_char(ctx) != '>'); - } - xml_dec(ctx); -} - -/*** Document type declaration ***/ - -static void -xml_parse_doctype_decl(struct xml_context *ctx) -{ - TRACE(ctx, "parse_doctype_decl"); - /* doctypedecl ::= '' - * Already parsed: '' */ - if (ctx->doctype) - xml_fatal(ctx, "Multiple document types not allowed"); - xml_parse_seq(ctx, "DOCTYPE"); - xml_parse_white(ctx, 1); - ctx->doctype = xml_parse_name(ctx, ctx->pool); - TRACE(ctx, "doctype=%s", ctx->doctype); - uint c; - if (xml_parse_white(ctx, 0) && ((c = xml_peek_char(ctx)) == 'S' || c == 'P')) - { - if (c == 'S') - { - xml_parse_seq(ctx, "SYSTEM"); - xml_parse_white(ctx, 1); - ctx->system_id = xml_parse_system_literal(ctx, ctx->pool); - } - else - { - xml_parse_seq(ctx, "PUBLIC"); - xml_parse_white(ctx, 1); - ctx->public_id = xml_parse_pubid_literal(ctx, ctx->pool); - xml_parse_white(ctx, 1); - ctx->system_id = 
xml_parse_system_literal(ctx, ctx->pool); - } - xml_parse_white(ctx, 0); - ctx->flags |= XML_HAS_EXTERNAL_SUBSET; - } - if (xml_peek_char(ctx) == '[') - { - ctx->flags |= XML_HAS_INTERNAL_SUBSET; - xml_skip_char(ctx); - xml_inc(ctx); - } - if (ctx->h_doctype_decl) - ctx->h_doctype_decl(ctx); -} - - - -/////////////////////////////////////////////////////////////////////////////////////////////////////////// - -/* DTD: Internal subset */ - -static void -xml_parse_subset(struct xml_context *ctx, uint external) -{ - // FIXME: - // -- comments/pi have no parent - // -- conditional sections in external subset - // -- check corectness of parameter entities - - /* '[' intSubset ']' - * intSubset :== (markupdecl | DeclSep) - * Already parsed: '[' - * - * extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)* - */ - while (1) - { - xml_parse_white(ctx, 0); - uint c = xml_get_char(ctx); - xml_inc(ctx); - if (c == '<') - if ((c = xml_get_char(ctx)) == '!') - switch (c = xml_get_char(ctx)) - { - case '-': - xml_push_comment(ctx); - xml_pop_comment(ctx); - break; - case 'N': - xml_parse_seq(ctx, "OTATION"); - xml_parse_notation_decl(ctx); - break; - case 'E': - if ((c = xml_get_char(ctx)) == 'N') - { - xml_parse_seq(ctx, "TITY"); - xml_parse_entity_decl(ctx); - } - else if (c == 'L') - { - xml_parse_seq(ctx, "EMENT"); - xml_parse_element_decl(ctx); - } - else - goto invalid_markup; - break; - case 'A': - xml_parse_seq(ctx, "TTLIST"); - xml_parse_attr_list_decl(ctx); - break; - default: - goto invalid_markup; - } - else if (c == '?') - { - xml_push_pi(ctx); - xml_pop_pi(ctx); - } - else - goto invalid_markup; - else if (c == '%') - xml_parse_pe_ref(ctx); - else if (c == ']' && !external) - { - break; - } - else if (c == '>' && external) - { - break; - } - else - goto invalid_markup; - } - xml_dec(ctx); - return; -invalid_markup: ; - xml_fatal(ctx, "Invalid markup in the %s subset", external ? 
"external" : "internal"); -} - -/*** The State Machine ***/ - -uint -xml_next(struct xml_context *ctx) -{ - /* A nasty state machine */ - -#define PULL(x) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##x; case XML_STATE_##x: ; } while (0) -#define PULL_STATE(x, s) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##s, XML_STATE_##x; case XML_STATE_##s: ; } while (0) - - TRACE(ctx, "xml_next (state=%u)", ctx->state); - jmp_buf throw_buf; - ctx->throw_buf = &throw_buf; - if (setjmp(throw_buf)) - { -error: - if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal) - ctx->h_fatal(ctx); - TRACE(ctx, "raised fatal error"); - return ctx->state = XML_STATE_EOF; - } - uint c; - switch (ctx->state) - { - case XML_STATE_START: - TRACE(ctx, "entering prolog"); - ctx->flags |= XML_SRC_DOCUMENT | XML_SRC_EXPECTED_DECL; - if (ctx->h_document_start) - ctx->h_document_start(ctx); - /* XMLDecl */ - xml_refill(ctx); - if (ctx->h_xml_decl) - ctx->h_xml_decl(ctx); - PULL(XML_DECL); - - /* Misc* (doctypedecl Misc*)? 
*/ - while (1) - { - xml_parse_white(ctx, 0); - xml_parse_char(ctx, '<'); - xml_inc(ctx); - if ((c = xml_get_char(ctx)) == '?') - /* Processing intruction */ - if (!(ctx->flags & XML_REPORT_PIS)) - xml_skip_pi(ctx); - else - { - xml_push_pi(ctx); - PULL_STATE(PI, PROLOG_PI); - xml_pop_pi(ctx); - } - else if (c != '!') - { - /* Found the root tag */ - xml_unget_char(ctx); - goto first_tag; - } - else if (xml_get_char(ctx) == '-') - if (!(ctx->flags & XML_REPORT_COMMENTS)) - xml_skip_comment(ctx); - else - { - xml_push_comment(ctx); - PULL_STATE(COMMENT, PROLOG_COMMENT); - xml_pop_comment(ctx); - } - else - { - /* DocTypeDecl */ - xml_unget_char(ctx); - xml_parse_doctype_decl(ctx); - PULL(DOCTYPE_DECL); - if (ctx->flags & XML_HAS_DTD) - if (ctx->flags & XML_PARSE_DTD) - { - xml_dtd_init(ctx); - if (ctx->h_dtd_start) - ctx->h_dtd_start(ctx); - if (ctx->flags & XML_HAS_INTERNAL_SUBSET) - { - xml_parse_subset(ctx, 0); - xml_dec(ctx); - } - if (ctx->flags & XML_HAS_EXTERNAL_SUBSET) - { - struct xml_dtd_entity ent = { - .system_id = ctx->system_id, - .public_id = ctx->public_id, - }; - xml_parse_white(ctx, 0); - xml_parse_char(ctx, '>'); - xml_unget_char(ctx); - ASSERT(ctx->h_resolve_entity); - ctx->h_resolve_entity(ctx, &ent); - ctx->flags |= XML_SRC_EXPECTED_DECL; - xml_parse_subset(ctx, 1); - xml_unget_char(ctx);; - } - if (ctx->h_dtd_end) - ctx->h_dtd_end(ctx); - } - else if (ctx->flags & XML_HAS_INTERNAL_SUBSET) - xml_skip_internal_subset(ctx); - xml_parse_white(ctx, 0); - xml_parse_char(ctx, '>'); - xml_dec(ctx); - } - } - - case XML_STATE_CHARS: - - while (1) - { - if (xml_peek_char(ctx) != '<') - { - /* CharData */ - xml_append_chars(ctx); - continue; - } - else - xml_skip_char(ctx); - xml_inc(ctx); -first_tag: - - if ((c = xml_get_char(ctx)) == '?') - { - /* PI */ - if (!(ctx->flags & (XML_REPORT_PIS | XML_ALLOC_PIS))) - xml_skip_pi(ctx); - else - { - if (xml_flush_chars(ctx)) - { - PULL_STATE(CHARS, CHARS_BEFORE_PI); - xml_pop_chars(ctx); - } - xml_push_pi(ctx); 
- PULL(PI); - xml_pop_pi(ctx); - } - } - - else if (c == '!') - if ((c = xml_get_char(ctx)) == '-') - { - /* Comment */ - if (!(ctx->flags & (XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS))) - xml_skip_comment(ctx); - else - { - if (xml_flush_chars(ctx)) - { - PULL_STATE(CHARS, CHARS_BEFORE_COMMENT); - xml_pop_chars(ctx); - } - xml_push_comment(ctx); - PULL(COMMENT); - xml_pop_comment(ctx); - } - } - else if (c == '[') - { - /* CDATA */ - xml_append_cdata(ctx); - } - else - xml_fatal(ctx, "Unexpected character after 'flags & XML_EMPTY_ELEM_TAG) - goto pop_element; - } - - else - { - /* ETag */ - if (xml_flush_chars(ctx)) - { - PULL_STATE(CHARS, CHARS_BEFORE_ETAG); - xml_pop_chars(ctx); - } - - xml_parse_etag(ctx); -pop_element: - PULL(ETAG); - xml_pop_element(ctx); - if (!ctx->node) - goto epilog; - } - } - -epilog: - /* Misc* */ - TRACE(ctx, "entering epilog"); - while (1) - { - /* Epilog whitespace is the only place, where a valid document can reach EOF */ - if (setjmp(throw_buf)) - if (ctx->err_code == XML_ERR_EOF) - { - TRACE(ctx, "reached EOF"); - ctx->state = XML_STATE_EOF; - if (ctx->h_document_end) - ctx->h_document_end(ctx); - case XML_STATE_EOF: - ctx->err_code = 0; - ctx->err_msg = NULL; - return XML_STATE_EOF; - } - else - goto error; - xml_parse_white(ctx, 0); - if (setjmp(throw_buf)) - goto error; - - /* Misc */ - xml_parse_char(ctx, '<'); - xml_inc(ctx); - if ((c = xml_get_char(ctx)) == '?') - /* Processing instruction */ - if (!(ctx->flags & XML_REPORT_PIS)) - xml_skip_pi(ctx); - else - { - xml_push_pi(ctx); - PULL_STATE(PI, EPILOG_PI); - xml_pop_pi(ctx); - } - else if (c == '!') - { - xml_parse_char(ctx, '-'); - /* Comment */ - if (!(ctx->flags & XML_REPORT_COMMENTS)) - xml_skip_comment(ctx); - else - { - xml_push_comment(ctx); - PULL_STATE(COMMENT, EPILOG_COMMENT); - xml_pop_comment(ctx); - } - } - else - xml_fatal(ctx, "Syntax error in the epilog"); - } - - } - ASSERT(0); -} - -uint -xml_next_state(struct xml_context *ctx, uint pull) -{ - uint saved = 
ctx->pull; - ctx->pull = pull; - uint res = xml_next(ctx); - ctx->pull = saved; - return res; -} - -uint -xml_skip_element(struct xml_context *ctx) -{ - ASSERT(ctx->state == XML_STATE_STAG); - struct xml_node *node = ctx->node; - uint saved = ctx->pull, res; - ctx->pull = XML_PULL_ETAG; - while ((res = xml_next(ctx)) && ctx->node != node); - ctx->pull = saved; - return res; -} - -uint -xml_parse(struct xml_context *ctx) -{ - /* This cycle should run only once unless the user overrides the value of ctx->pull in a SAX handler */ - do - { - ctx->pull = 0; - } - while (xml_next(ctx)); - return ctx->err_code; -} - -char * -xml_merge_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool) -{ - ASSERT(node->type == XML_NODE_ELEM); - char *p = mp_start_noalign(pool, 1); - XML_NODE_FOR_EACH(son, node) - if (son->type == XML_NODE_CHARS) - { - p = mp_spread(pool, p, son->len + 1); - memcpy(p, son->text, son->len); - p += son->len; - } - *p++ = 0; - return mp_end(pool, p); -} - -static char * -xml_append_dom_chars(char *p, struct mempool *pool, struct xml_node *node) -{ - XML_NODE_FOR_EACH(son, node) - if (son->type == XML_NODE_CHARS) - { - p = mp_spread(pool, p, son->len + 1); - memcpy(p, son->text, son->len); - p += son->len; - } - else if (son->type == XML_NODE_ELEM) - p = xml_append_dom_chars(p, pool, son); - return p; -} - -char * -xml_merge_dom_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool) -{ - ASSERT(node->type == XML_NODE_ELEM); - char *p = mp_start_noalign(pool, 1); - p = xml_append_dom_chars(p, pool, node); - *p++ = 0; - return mp_end(pool, p); -} diff --git a/xml/source.c b/xml/source.c deleted file mode 100644 index d369c319..00000000 --- a/xml/source.c +++ /dev/null @@ -1,486 +0,0 @@ -/* - * UCW Library -- A simple XML parser - * - * (c) 2007--2008 Pavel Charvat - * - * This software may be freely distributed and used according to the terms - * of the GNU Lesser General Public License. 
- */ - -#undef LOCAL_DEBUG - -#include -#include -#include -#include -#include -#include -#include -#include - -/*** Character categorization ***/ - -#include "obj/xml/unicat.c" - -static void -xml_init_cats(struct xml_context *ctx) -{ - if (!(ctx->flags & XML_VERSION_1_1)) - { - ctx->cat_chars = XML_CHAR_VALID_1_0; - ctx->cat_unrestricted = XML_CHAR_VALID_1_0; - ctx->cat_new_line = XML_CHAR_NEW_LINE_1_0; - ctx->cat_name = XML_CHAR_NAME_1_0; - ctx->cat_sname = XML_CHAR_SNAME_1_0; - } - else - { - ctx->cat_chars = XML_CHAR_VALID_1_1; - ctx->cat_unrestricted = XML_CHAR_UNRESTRICTED_1_1; - ctx->cat_new_line = XML_CHAR_NEW_LINE_1_1; - ctx->cat_name = XML_CHAR_NAME_1_1; - ctx->cat_sname = XML_CHAR_SNAME_1_1; - } -} - -/*** Reading of document/external entities ***/ - -static void NONRET -xml_eof(struct xml_context *ctx) -{ - ctx->err_msg = "Unexpected EOF"; - ctx->err_code = XML_ERR_EOF; - xml_throw(ctx); -} - -void NONRET -xml_fatal_nested(struct xml_context *ctx) -{ - xml_fatal(ctx, "Entity is not nested correctly"); -} - -static inline void -xml_add_char(u32 **bstop, uint c) -{ - *(*bstop)++ = c; - *(*bstop)++ = xml_char_cat(c); -} - -struct xml_source * -xml_push_source(struct xml_context *ctx) -{ - xml_push(ctx); - struct xml_source *src = ctx->src; - if (src) - { - src->bptr = ctx->bptr; - src->bstop = ctx->bstop; - } - src = mp_alloc_zero(ctx->stack, sizeof(*src)); - src->next = ctx->src; - src->saved_depth = ctx->depth; - ctx->src = src; - ctx->flags &= ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_DOCUMENT); - ctx->bstop = ctx->bptr = src->buf; - ctx->depth = 0; - return src; -} - -struct xml_source * -xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb) -{ - struct xml_source *src = xml_push_source(ctx); - src->fb = fb; - return src; -} - -static void -xml_close_source(struct xml_source *src) -{ - bclose(src->fb); - if (src->wrapped_fb) - bclose(src->wrapped_fb); -} - -static void -xml_pop_source(struct xml_context *ctx) -{ - TRACE(ctx, 
"pop_source"); - if (unlikely(ctx->depth != 0)) - xml_fatal(ctx, "Unexpected end of entity"); - struct xml_source *src = ctx->src; - if (!src) - xml_fatal(ctx, "Undefined source"); - xml_close_source(src); - ctx->depth = src->saved_depth; - ctx->src = src = src->next; - if (src) - { - ctx->bptr = src->bptr; - ctx->bstop = src->bstop; - } - xml_pop(ctx); - if (unlikely(!src)) - xml_eof(ctx); -} - -void -xml_sources_cleanup(struct xml_context *ctx) -{ - struct xml_source *s; - while (s = ctx->src) - { - ctx->src = s->next; - xml_close_source(s); - } -} - -static void xml_refill_utf8(struct xml_context *ctx); - -void -xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent UNUSED) -{ - xml_error(ctx, "References to external entities are not supported"); -} - -void -xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent) -{ - TRACE(ctx, "xml_push_entity"); - struct xml_source *src; - if (ent->flags & XML_DTD_ENTITY_EXTERNAL) - { - ASSERT(ctx->h_resolve_entity); - ctx->h_resolve_entity(ctx, ent); - ctx->flags |= XML_SRC_EXPECTED_DECL; - src = ctx->src; - } - else - { - src = xml_push_source(ctx); - fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, strlen(ent->text), 0); - } - src->refill = xml_refill_utf8; - src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line; - src->refill_cat2 = ctx->cat_new_line; -} - -static uint -xml_error_restricted(struct xml_context *ctx, uint c) -{ - if (c == ~1U) - xml_error(ctx, "Corrupted encoding"); - else - xml_error(ctx, "Restricted char U+%04X", c); - return UNI_REPLACEMENT; -} - -static void xml_parse_decl(struct xml_context *ctx); - -#define REFILL(ctx, func, params...) \ - struct xml_source *src = ctx->src; \ - struct fastbuf *fb = src->fb; \ - if (ctx->bptr == ctx->bstop) \ - ctx->bptr = ctx->bstop = src->buf; \ - uint c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \ - u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \ - *last_0xd = src->pending_0xd ? 
bstop : NULL; \ - do \ - { \ - c = func(fb, ##params); \ - uint t = xml_char_cat(c); \ - if (t & t1) \ - /* Typical branch */ \ - *bstop++ = c, *bstop++ = t; \ - else if (t & t2) \ - { \ - /* New line */ \ - /* XML 1.0: 0xA | 0xD | 0xD 0xA */ \ - /* XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ \ - if (c == 0xd) \ - last_0xd = bstop + 2; \ - else if (c != 0x2028 && last_0xd == bstop) \ - { \ - last_0xd = NULL; \ - continue; \ - } \ - xml_add_char(&bstop, 0xa), row++; \ - } \ - else if (c == '>') \ - { \ - /* Used only in XML/TextDecl to switch the encoding */ \ - *bstop++ = c, *bstop++ = t; \ - break; \ - } \ - else if (~c) \ - /* Restricted character */ \ - xml_add_char(&bstop, xml_error_restricted(ctx, c)); \ - else \ - { \ - /* EOF */ \ - ctx->flags |= XML_SRC_EOF; \ - break; \ - } \ - } \ - while (bstop < bend); \ - src->pending_0xd = (last_0xd == bstop); \ - ctx->bstop = bstop; \ - src->row = row; - -static void -xml_refill_utf8(struct xml_context *ctx) -{ - REFILL(ctx, bget_utf8_repl, ~1U); -} - -static void -xml_refill_utf16_le(struct xml_context *ctx) -{ - REFILL(ctx, bget_utf16_le_repl, ~1U); -} - -static void -xml_refill_utf16_be(struct xml_context *ctx) -{ - REFILL(ctx, bget_utf16_be_repl, ~1U); -} - -#undef REFILL - -void -xml_refill(struct xml_context *ctx) -{ - do - { - if (ctx->flags & XML_SRC_EOF) - xml_pop_source(ctx); - else if (ctx->flags & XML_SRC_EXPECTED_DECL) - xml_parse_decl(ctx); - else - { - ctx->src->refill(ctx); - TRACE(ctx, "refilled %u characters", (uint)((ctx->bstop - ctx->bptr) / 2)); - } - } - while (ctx->bptr == ctx->bstop); -} - -static uint -xml_source_row(struct xml_context *ctx, struct xml_source *src) -{ - uint row = src->row; - for (u32 *p = ctx->bstop; p != ctx->bptr; p -= 2) - if (p[-1] & src->refill_cat2) - row--; - return row + 1; -} - -uint -xml_row(struct xml_context *ctx) -{ - return ctx->src ? 
xml_source_row(ctx, ctx->src) : 0; -} - -/* Document/external entity header */ - -static char * -xml_parse_encoding_name(struct xml_context *ctx) -{ - /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" */ - char *p = mp_start_noalign(ctx->pool, 1); - uint q = xml_parse_quote(ctx); - if (unlikely(!(xml_get_cat(ctx) & XML_CHAR_ENC_SNAME))) - xml_fatal(ctx, "Invalid character in the encoding name"); - while (1) - { - p = mp_spread(ctx->pool, p, 2); - *p++ = xml_last_char(ctx); - if (xml_get_char(ctx) == q) - break; - if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME))) - xml_fatal(ctx, "Invalid character in the encoding name"); - } - *p++ = 0; - return mp_end(ctx->pool, p); -} - -static void -xml_init_charconv(struct xml_context *ctx, int cs) -{ - // XXX: with a direct access to libucw-charset tables could be faster - struct xml_source *src = ctx->src; - TRACE(ctx, "wrapping charset %s", charset_name(cs)); - src->wrapped_fb = src->fb; - src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8); -} - -static void -xml_parse_decl(struct xml_context *ctx) -{ - TRACE(ctx, "xml_parse_decl"); - struct xml_source *src = ctx->src; - ctx->flags &= ~XML_SRC_EXPECTED_DECL; - uint doc = ctx->flags & XML_SRC_DOCUMENT; - - /* Setup valid Unicode ranges and force the reader to abort refill() after each '>', where we can switch encoding or XML version */ - if (doc) - xml_init_cats(ctx); - src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line & ~XML_CHAR_GT; - src->refill_cat2 = ctx->cat_new_line; - - /* Initialize the supplied charset (if any) or try to guess it */ - char *expected_encoding = src->expected_encoding; - src->refill = xml_refill_utf8; - int bom = bpeekc(src->fb); - if (bom < 0) - ctx->flags |= XML_SRC_EOF; - if (!src->fb_encoding) - { - if (bom == 0xfe) - src->refill = xml_refill_utf16_be; - else if (bom == 0xff) - src->refill = xml_refill_utf16_le; - } - else - { - int cs = 
find_charset_by_name(src->fb_encoding); - if (cs == CONV_CHARSET_UTF8) - {} - else if (cs >= 0) - { - xml_init_charconv(ctx, cs); - bom = 0; - } - else if (strcasecmp(src->fb_encoding, "UTF-16")) - { - src->refill = xml_refill_utf16_be; - if (bom == 0xff) - src->refill = xml_refill_utf16_le; - } - else if (strcasecmp(src->fb_encoding, "UTF-16BE")) - src->refill = xml_refill_utf16_be; - else if (strcasecmp(src->fb_encoding, "UTF-16LE")) - src->refill = xml_refill_utf16_le; - else - { - xml_error(ctx, "Unknown encoding '%s'", src->fb_encoding); - expected_encoding = NULL; - } - } - uint utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be; - if (utf16) - src->fb_encoding = (src->refill == xml_refill_utf16_be) ? "UTF-16BE" : "UTF-16LE"; - if (!expected_encoding) - expected_encoding = src->fb_encoding; - if (bom > 0 && xml_peek_char(ctx) == 0xfeff) - xml_skip_char(ctx); - else if (utf16) - xml_error(ctx, "Missing or corrupted BOM"); - TRACE(ctx, "Initial encoding=%s", src->fb_encoding ? : "?"); - - /* Look ahead for presence of XMLDecl or optional TextDecl */ - if (!(ctx->flags & XML_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf)) - xml_refill(ctx); - u32 *bptr = ctx->bptr; - uint have_decl = (12 <= ctx->bstop - ctx->bptr && (bptr[11] & XML_CHAR_WHITE) && - bptr[0] == '<' && bptr[2] == '?' 
&& (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L'); - if (!have_decl) - { - if (doc) - xml_fatal(ctx, "Missing or corrupted XML header"); - else if (expected_encoding && strcasecmp(src->expected_encoding, "UTF-8") && !utf16) - xml_error(ctx, "Missing or corrupted entity header"); - goto exit; - } - ctx->bptr = bptr + 12; - xml_parse_white(ctx, 0); - - /* Parse version string (mandatory in XMLDecl, optional in TextDecl) */ - if (xml_peek_char(ctx) == 'v') - { - xml_parse_seq(ctx, "version"); - xml_parse_eq(ctx); - char *version = xml_parse_pubid_literal(ctx, ctx->pool); - TRACE(ctx, "version=%s", version); - uint v = 0; - if (!strcmp(version, "1.1")) - v = XML_VERSION_1_1; - else if (strcmp(version, "1.0")) - { - xml_error(ctx, "Unknown XML version string '%s'", version); - version = "1.0"; - } - if (doc) - { - ctx->version_str = version; - ctx->flags |= v; - } - else if (v > (ctx->flags & XML_VERSION_1_1)) - xml_error(ctx, "XML 1.1 external entity included from XML 1.0 document"); - if (!xml_parse_white(ctx, !doc)) - goto end; - } - else if (doc) - { - xml_error(ctx, "Expected XML version"); - ctx->version_str = "1.0"; - } - - /* Parse encoding string (optional in XMLDecl, mandatory in TextDecl) */ - if (xml_peek_char(ctx) == 'e') - { - xml_parse_seq(ctx, "encoding"); - xml_parse_eq(ctx); - src->decl_encoding = xml_parse_encoding_name(ctx); - TRACE(ctx, "encoding=%s", src->decl_encoding); - if (!xml_parse_white(ctx, 0)) - goto end; - } - else if (!doc) - xml_error(ctx, "Expected XML encoding"); - - /* Parse whether the document is standalone (optional in XMLDecl) */ - if (doc && xml_peek_char(ctx) == 's') - { - xml_parse_seq(ctx, "standalone"); - xml_parse_eq(ctx); - uint c = xml_parse_quote(ctx); - if (ctx->standalone = (xml_peek_char(ctx) == 'y')) - xml_parse_seq(ctx, "yes"); - else - xml_parse_seq(ctx, "no"); - xml_parse_char(ctx, c); - TRACE(ctx, "standalone=%d", ctx->standalone); - xml_parse_white(ctx, 0); - } -end: - 
xml_parse_seq(ctx, "?>"); - - /* Switch to the final encoding */ - if (src->decl_encoding) - { - int cs = find_charset_by_name(src->decl_encoding); - if (cs < 0 && !expected_encoding) - xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding); - else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8) - { - xml_init_charconv(ctx, cs); - src->fb_encoding = src->decl_encoding; - } - else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 || - !(!strcasecmp(src->decl_encoding, "UTF-16") || - (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) || - (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE"))))) - xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding); - } - if (!src->fb_encoding) - src->fb_encoding = "UTF-8"; - TRACE(ctx, "Final encoding=%s", src->fb_encoding); - -exit: - /* Update valid Unicode ranges */ - if (doc) - xml_init_cats(ctx); - src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line; - src->refill_cat2 = ctx->cat_new_line; -} diff --git a/xml/unicat.pl b/xml/unicat.pl deleted file mode 100755 index c1bc442b..00000000 --- a/xml/unicat.pl +++ /dev/null @@ -1,165 +0,0 @@ -#!/usr/bin/perl -# -# UCW Library -- Character map for the XML parser -# -# (c) 2007 Pavel Charvat -# -# This software may be freely distributed and used according to the terms -# of the GNU Lesser General Public License. 
-# - -my @cat = (); -my @lcat = (); -my %ids = (); -my %cls = (); -for (my $i = 0; $i < 0x10000; $i++) { $cat[$i] = 0; } -for (my $i = 0; $i < 0x11; $i++) { $lcat[$i] = 0; } - -my @white = (0x9, 0xA, 0xD, 0x20); -my @base_char_1_0 = ( - [0x0041,0x005A], [0x0061,0x007A], [0x00C0,0x00D6], [0x00D8,0x00F6], [0x00F8,0x00FF], [0x0100,0x0131], - [0x0134,0x013E], [0x0141,0x0148], [0x014A,0x017E], [0x0180,0x01C3], [0x01CD,0x01F0], [0x01F4,0x01F5], - [0x01FA,0x0217], [0x0250,0x02A8], [0x02BB,0x02C1], 0x0386, [0x0388,0x038A], 0x038C, [0x038E,0x03A1], - [0x03A3,0x03CE], [0x03D0,0x03D6], 0x03DA, 0x03DC, 0x03DE, 0x03E0, [0x03E2,0x03F3], [0x0401,0x040C], - [0x040E,0x044F], [0x0451,0x045C], [0x045E,0x0481], [0x0490,0x04C4], [0x04C7,0x04C8], [0x04CB,0x04CC], - [0x04D0,0x04EB], [0x04EE,0x04F5], [0x04F8,0x04F9], [0x0531,0x0556], 0x0559, [0x0561,0x0586], [0x05D0,0x05EA], - [0x05F0,0x05F2], [0x0621,0x063A], [0x0641,0x064A], [0x0671,0x06B7], [0x06BA,0x06BE], [0x06C0,0x06CE], - [0x06D0,0x06D3], 0x06D5, [0x06E5,0x06E6], [0x0905,0x0939], 0x093D, [0x0958,0x0961], [0x0985,0x098C], - [0x098F,0x0990], [0x0993,0x09A8], [0x09AA,0x09B0], 0x09B2, [0x09B6,0x09B9], [0x09DC,0x09DD], [0x09DF,0x09E1], - [0x09F0,0x09F1], [0x0A05,0x0A0A], [0x0A0F,0x0A10], [0x0A13,0x0A28], [0x0A2A,0x0A30], [0x0A32,0x0A33], - [0x0A35,0x0A36], [0x0A38,0x0A39], [0x0A59,0x0A5C], 0x0A5E, [0x0A72,0x0A74], [0x0A85,0x0A8B], 0x0A8D, - [0x0A8F,0x0A91], [0x0A93,0x0AA8], [0x0AAA,0x0AB0], [0x0AB2,0x0AB3], [0x0AB5,0x0AB9], 0x0ABD, 0x0AE0, - [0x0B05,0x0B0C], [0x0B0F,0x0B10], [0x0B13,0x0B28], [0x0B2A,0x0B30], [0x0B32,0x0B33], [0x0B36,0x0B39], - 0x0B3D, [0x0B5C,0x0B5D], [0x0B5F,0x0B61], [0x0B85,0x0B8A], [0x0B8E,0x0B90], [0x0B92,0x0B95], [0x0B99,0x0B9A], - 0x0B9C, [0x0B9E,0x0B9F], [0x0BA3,0x0BA4], [0x0BA8,0x0BAA], [0x0BAE,0x0BB5], [0x0BB7,0x0BB9], [0x0C05,0x0C0C], - [0x0C0E,0x0C10], [0x0C12,0x0C28], [0x0C2A,0x0C33], [0x0C35,0x0C39], [0x0C60,0x0C61], [0x0C85,0x0C8C], - [0x0C8E,0x0C90], [0x0C92,0x0CA8], [0x0CAA,0x0CB3], [0x0CB5,0x0CB9], 
0x0CDE, [0x0CE0,0x0CE1], [0x0D05,0x0D0C], - [0x0D0E,0x0D10], [0x0D12,0x0D28], [0x0D2A,0x0D39], [0x0D60,0x0D61], [0x0E01,0x0E2E], 0x0E30, [0x0E32,0x0E33], - [0x0E40,0x0E45], [0x0E81,0x0E82], 0x0E84, [0x0E87,0x0E88], 0x0E8A, 0x0E8D, [0x0E94,0x0E97], [0x0E99,0x0E9F], - [0x0EA1,0x0EA3], 0x0EA5, 0x0EA7, [0x0EAA,0x0EAB], [0x0EAD,0x0EAE], 0x0EB0, [0x0EB2,0x0EB3], 0x0EBD, - [0x0EC0,0x0EC4], [0x0F40,0x0F47], [0x0F49,0x0F69], [0x10A0,0x10C5], [0x10D0,0x10F6], 0x1100, [0x1102,0x1103], - [0x1105,0x1107], 0x1109, [0x110B,0x110C], [0x110E,0x1112], 0x113C, 0x113E, 0x1140, 0x114C, 0x114E, 0x1150, - [0x1154,0x1155], 0x1159, [0x115F,0x1161], 0x1163, 0x1165, 0x1167, 0x1169, [0x116D,0x116E], [0x1172,0x1173], - 0x1175, 0x119E, 0x11A8, 0x11AB, [0x11AE,0x11AF], [0x11B7,0x11B8], 0x11BA, [0x11BC,0x11C2], 0x11EB, 0x11F0, - 0x11F9, [0x1E00,0x1E9B], [0x1EA0,0x1EF9], [0x1F00,0x1F15], [0x1F18,0x1F1D], [0x1F20,0x1F45], [0x1F48,0x1F4D], - [0x1F50,0x1F57], 0x1F59, 0x1F5B, 0x1F5D, [0x1F5F,0x1F7D], [0x1F80,0x1FB4], [0x1FB6,0x1FBC], 0x1FBE, - [0x1FC2,0x1FC4], [0x1FC6,0x1FCC], [0x1FD0,0x1FD3], [0x1FD6,0x1FDB], [0x1FE0,0x1FEC], [0x1FF2,0x1FF4], - [0x1FF6,0x1FFC], 0x2126, [0x212A,0x212B], 0x212E, [0x2180,0x2182], [0x3041,0x3094], [0x30A1,0x30FA], - [0x3105,0x312C], [0xAC00,0xD7A3]); -my @ideographic_1_0 = ([0x4E00,0x9FA5], 0x3007, [0x3021,0x3029]); -my @combining_char_1_0 = ( - [0x0300,0x0345], [0x0360,0x0361], [0x0483,0x0486], [0x0591,0x05A1], [0x05A3,0x05B9], [0x05BB,0x05BD], - 0x05BF, [0x05C1,0x05C2], 0x05C4, [0x064B,0x0652], 0x0670, [0x06D6,0x06DC], [0x06DD,0x06DF], [0x06E0,0x06E4], - [0x06E7,0x06E8], [0x06EA,0x06ED], [0x0901,0x0903], 0x093C, [0x093E,0x094C], 0x094D, [0x0951,0x0954], - [0x0962,0x0963], [0x0981,0x0983], 0x09BC, 0x09BE, 0x09BF, [0x09C0,0x09C4], [0x09C7,0x09C8], [0x09CB,0x09CD], - 0x09D7, [0x09E2,0x09E3], 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, [0x0A40,0x0A42], [0x0A47,0x0A48], [0x0A4B,0x0A4D], - [0x0A70,0x0A71], [0x0A81,0x0A83], 0x0ABC, [0x0ABE,0x0AC5], [0x0AC7,0x0AC9], [0x0ACB,0x0ACD], 
[0x0B01,0x0B03], - 0x0B3C, [0x0B3E,0x0B43], [0x0B47,0x0B48], [0x0B4B,0x0B4D], [0x0B56,0x0B57], [0x0B82,0x0B83], [0x0BBE,0x0BC2], - [0x0BC6,0x0BC8], [0x0BCA,0x0BCD], 0x0BD7, [0x0C01,0x0C03], [0x0C3E,0x0C44], [0x0C46,0x0C48], [0x0C4A,0x0C4D], - [0x0C55,0x0C56], [0x0C82,0x0C83], [0x0CBE,0x0CC4], [0x0CC6,0x0CC8], [0x0CCA,0x0CCD], [0x0CD5,0x0CD6], - [0x0D02,0x0D03], [0x0D3E,0x0D43], [0x0D46,0x0D48], [0x0D4A,0x0D4D], 0x0D57, 0x0E31, [0x0E34,0x0E3A], - [0x0E47,0x0E4E], 0x0EB1, [0x0EB4,0x0EB9], [0x0EBB,0x0EBC], [0x0EC8,0x0ECD], [0x0F18,0x0F19], 0x0F35, - 0x0F37, 0x0F39, 0x0F3E, 0x0F3F, [0x0F71,0x0F84], [0x0F86,0x0F8B], [0x0F90,0x0F95], 0x0F97, [0x0F99,0x0FAD], - [0x0FB1,0x0FB7], 0x0FB9, [0x20D0,0x20DC], 0x20E1, [0x302A,0x302F], 0x3099, 0x309A); -my @digit_1_0 = ( - [0x0030,0x0039], [0x0660,0x0669], [0x06F0,0x06F9], [0x0966,0x096F], [0x09E6,0x09EF], [0x0A66,0x0A6F], - [0x0AE6,0x0AEF], [0x0B66,0x0B6F], [0x0BE7,0x0BEF], [0x0C66,0x0C6F], [0x0CE6,0x0CEF], [0x0D66,0x0D6F], - [0x0E50,0x0E59], [0x0ED0,0x0ED9], [0x0F20,0x0F29]); -my @extender_1_0 = ( - 0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005, [0x3031,0x3035], [0x309D,0x309E], [0x30FC,0x30FE]); -my @sname_1_1 = ( - "[:A-Z_a-z]", [0xC0,0xD6], [0xD8,0xF6], [0xF8,0x2FF], [0x370,0x37D], [0x37F,0x1FFF], - [0x200C,0x200D], [0x2070,0x218F], [0x2C00,0x2FEF], [0x3001,0xD7FF], [0xF900,0xFDCF], [0xFDF0,0xFFFD], [0x10000,0xEFFFF]); - -set("WHITE", @white); -set("NEW_LINE_1_0", 0xA, 0xD); -set("NEW_LINE_1_1", 0xA, 0xD, 0x85, 0x2028); -set("DIGIT", "[0-9]"); -set("XDIGIT", "[0-9a-fA-F]"); -set("VALID_1_0", @white, [0x20,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); -set("VALID_1_1", [0x1,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); -set("UNRESTRICTED_1_1", @white, [0x20,0x7E], 0x85, [0xA0,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); -set("PUBID", 0x20, 0xD, 0xA, "[-a-zA-Z0-9'()+,./:=?:!*#@\$_%]"); -set("ENC_SNAME", "[a-zA-Z]"); -set("ENC_NAME", "[-a-zA-Z0-9._]"); -set("SNAME_1_0", "[_:]", @base_char_1_0, 
@ideographic_1_0); -set("NAME_1_0", "[-_:.]", @base_char_1_0, @ideographic_1_0, @combining_char_1_0, @digit_1_0, @extender_1_0); -set("SNAME_1_1", @sname_1_1); -set("NAME_1_1", @sname_1_1, "[-.0-9]", 0xB7, [0x0300,0x036F], [0x203F,0x2040]); -set("GT", "[>]"); - -($ARGV[0] eq "" || $ARGV[1] eq "") && die("Invalid usage"); -find_cls(); -open(H, ">", $ARGV[0]) or die("Cannot create $ARGV[0]"); -open(C, ">", $ARGV[1]) or die("Cannot create $ARGV[1]"); -gen_enum(); -gen_tabs(); -close(H); -close(C); - -sub set { - my $id = shift; - $ids{$id} = scalar keys(%ids) if !defined($ids{$id}); - my $mask = 1 << $ids{$id}; - foreach my $i (@_) { - if (ref($i) eq "ARRAY") { - my $j = $i->[0]; - for (; $j <= $i->[1] && $j < 0x10000; $j++) { $cat[$j] |= $mask; } - for (; $j <= $i->[1]; $j += 0x10000) { $lcat[$j >> 16] |= $mask; } - } - elsif ($i =~ /^\[/) { for (my $j=0; $j < 128; $j++) { if (chr($j) =~ /$i/) { $cat[$j] |= $mask; } } } - else { $cat[$i] |= $mask; } - } -} - -sub find_cls { - foreach (my $i=0; $i<@cat; $i++) { $cls{$cat[$i]} = scalar keys(%cls) if !defined($cls{$cat[$i]}); } - foreach (my $i=0; $i<@lcat; $i++) { $cls{$lcat[$i]} = scalar keys(%cls) if !defined($cls{$lcat[$i]}); } -} - -sub gen_enum { - print H "enum xml_char_type {\n"; - foreach my $id (sort keys %ids) { - my $mask = 0; - foreach my $i (keys %cls) { - $mask |= 1 << $cls{$i} if $cls{$i} && ($i & (1 << $ids{$id})); - } - printf H " XML_CHAR_%-20s = 0x%08x,\n", $id, $mask; - } - print H "};\n\n"; -} - -sub gen_tabs { - my @tab = (); - my %hash = (); - - print H "extern const byte ucw_xml_char_tab1[];\n"; - print H "extern const uint ucw_xml_char_tab2[];\n"; - print H "extern const byte ucw_xml_char_tab3[];\n"; - - print C "const uint ucw_xml_char_tab2[] = {\n "; - for (my $t=0; $t<256; $t++) { - my $i = $t * 256; - my @x = (); - for (my $j=0; $j<256; $j += 32) { - push @x, join(",", map($cls{$_}, @cat[$i+$j..$i+$j+31])); - } - my $sub = " " . 
join(",\n ", @x); - if (!defined($hash{$sub})) { - $hash{$sub} = 256 * scalar @tab; - push @tab, $sub; - } - printf C "0x%x", $hash{$sub}; - print C ((~$t & 15) ? "," : ($t < 255) ? ",\n " : "\n};\n\n"); - } - - print C "const byte ucw_xml_char_tab1[] = {\n"; - print C join(",\n\n", @tab); - print C "\n};\n\n"; - - my @l = (); - for (my $i=0; $i<0x11; $i++) { - push @l, sprintf("%d", $cls{$lcat[$i]}); - } - print C "const byte ucw_xml_char_tab3[] = {" . join(",", @l) . "};\n"; -} diff --git a/xml/xml-test.c b/xml/xml-test.c deleted file mode 100644 index 6821c17f..00000000 --- a/xml/xml-test.c +++ /dev/null @@ -1,365 +0,0 @@ -/* - * UCW Library -- A simple XML parser - * - * (c) 2007--2008 Pavel Charvat - * - * This software may be freely distributed and used according to the terms - * of the GNU Lesser General Public License. - */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -enum { - WANT_FIRST = 0x100, - WANT_HIDE_ERRORS, - WANT_IGNORE_COMMENTS, - WANT_IGNORE_PIS, - WANT_REPORT_BLOCKS, - WANT_REPORT_IGNORABLE, - WANT_FILE_ENTITIES, -}; - -static char *shortopts = "spdt" CF_SHORT_OPTS; -static struct option longopts[] = { - CF_LONG_OPTS - { "sax", 0, 0, 's' }, - { "pull", 0, 0, 'p' }, - { "dom", 0, 0, 't' }, - { "dtd", 0, 0, 'd' }, - { "hide-errors", 0, 0, WANT_HIDE_ERRORS }, - { "ignore-comments", 0, 0, WANT_IGNORE_COMMENTS }, - { "ignore-pis", 0, 0, WANT_IGNORE_PIS }, - { "report-blocks", 0, 0, WANT_REPORT_BLOCKS }, - { "report-ignorable", 0, 0, WANT_REPORT_IGNORABLE }, - { "file-entities", 0, 0, WANT_FILE_ENTITIES }, - { NULL, 0, 0, 0 } -}; - -static void NONRET -usage(void) -{ - fputs("\ -Usage: xml-test [options] < input.xml\n\ -\n\ -Options:\n" -CF_USAGE -"\ --p, --pull Test PULL interface\n\ --s, --sax Test SAX interface\n\ --t, --dom Test DOM interface\n\ --d, --dtd Enable parsing of DTD\n\ - --hide-errors Hide warnings and error messages\n\ - --ignore-comments Ignore comments\n\ - --ignore-pis Ignore processing 
instructions\n\ - --report-blocks Report blocks or characters and CDATA sections\n\ - --report-ignorable Report ignorable whitespace\n\ - --file-entities Resolve file external entities (not fully normative)\n\ -\n", stderr); - exit(1); -} - -static uint want_sax; -static uint want_pull; -static uint want_dom; -static uint want_parse_dtd; -static uint want_hide_errors; -static uint want_ignore_comments; -static uint want_ignore_pis; -static uint want_report_blocks; -static uint want_report_ignorable; -static uint want_file_entities; - -static struct fastbuf *out; - -static char * -node_type(struct xml_node *node) -{ - switch (node->type) - { - case XML_NODE_ELEM: return "element"; - case XML_NODE_COMMENT: return "comment"; - case XML_NODE_PI: return "pi"; - case XML_NODE_CHARS: return "chars"; - default: return "unknown"; - } -} - -static void -show_node(struct xml_node *node) -{ - switch (node->type) - { - case XML_NODE_ELEM: - bprintf(out, " <%s>", node->name); - XML_ATTR_FOR_EACH(a, node) - bprintf(out, " %s='%s'", a->name, a->val); - bputc(out, '\n'); - break; - case XML_NODE_COMMENT: - bprintf(out, " text='%s'\n", node->text); - break; - case XML_NODE_PI: - bprintf(out, " target=%s text='%s'\n", node->name, node->text); - break; - case XML_NODE_CHARS: - bprintf(out, " text='%s'\n", node->text); - break; - default: - bputc(out, '\n'); - } -} - -static void -show_tree(struct xml_node *node, uint level) -{ - if (!node) - return; - bputs(out, "DOM: "); - for (uint i = 0; i < level; i++) - bputs(out, " "); - bputs(out, node_type(node)); - show_node(node); - if (node->type == XML_NODE_ELEM) - XML_NODE_FOR_EACH(son, node) - show_tree(son, level + 1); -} - -static void -h_error(struct xml_context *ctx) -{ - bprintf(out, "SAX: %s at %u: %s\n", (ctx->err_code < XML_ERR_ERROR) ? 
"warn" : "error", xml_row(ctx), ctx->err_msg); -} - -static void -h_document_start(struct xml_context *ctx UNUSED) -{ - bputs(out, "SAX: document_start\n"); -} - -static void -h_document_end(struct xml_context *ctx UNUSED) -{ - bputs(out, "SAX: document_end\n"); -} - -static void -h_xml_decl(struct xml_context *ctx) -{ - bprintf(out, "SAX: xml_decl version=%s standalone=%d fb_encoding=%s\n", ctx->version_str, ctx->standalone, ctx->src->fb_encoding); -} - -static void -h_doctype_decl(struct xml_context *ctx) -{ - bprintf(out, "SAX: doctype_decl type=%s public='%s' system='%s' extsub=%d intsub=%d\n", - ctx->doctype, ctx->public_id ? : "", ctx->system_id ? : "", - !!(ctx->flags & XML_HAS_EXTERNAL_SUBSET), !!(ctx->flags & XML_HAS_INTERNAL_SUBSET)); -} - -static void -h_comment(struct xml_context *ctx) -{ - bputs(out, "SAX: comment"); - show_node(ctx->node); -} - -static void -h_pi(struct xml_context *ctx) -{ - bputs(out, "SAX: pi"); - show_node(ctx->node); -} - -static void -h_stag(struct xml_context *ctx) -{ - bputs(out, "SAX: stag"); - show_node(ctx->node); -} - -static void -h_etag(struct xml_context *ctx) -{ - bprintf(out, "SAX: etag \n", ctx->node->name); -} - -static void -h_chars(struct xml_context *ctx) -{ - bputs(out, "SAX: chars"); - show_node(ctx->node); -} - -static void -h_block(struct xml_context *ctx UNUSED, char *text, uint len UNUSED) -{ - bprintf(out, "SAX: block text='%s'\n", text); -} - -static void -h_cdata(struct xml_context *ctx UNUSED, char *text, uint len UNUSED) -{ - bprintf(out, "SAX: cdata text='%s'\n", text); -} - -static void -h_ignorable(struct xml_context *ctx UNUSED, char *text, uint len UNUSED) -{ - bprintf(out, "SAX: ignorable text='%s'\n", text); -} - -static void -h_dtd_start(struct xml_context *ctx UNUSED) -{ - bputs(out, "SAX: dtd_start\n"); -} - -static void -h_dtd_end(struct xml_context *ctx UNUSED) -{ - bputs(out, "SAX: dtd_end\n"); -} - -static void -h_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *e) -{ - 
xml_push_fastbuf(ctx, bopen(e->system_id, O_RDONLY, 4096)); -} - -int -main(int argc, char **argv) -{ - int opt; - cf_def_file = NULL; - log_init(argv[0]); - while ((opt = cf_getopt(argc, argv, shortopts, longopts, NULL)) >= 0) - switch (opt) - { - case 's': - want_sax++; - break; - case 'p': - want_pull++; - break; - case 't': - want_dom++; - break; - case 'd': - want_parse_dtd++; - break; - case WANT_HIDE_ERRORS: - want_hide_errors++; - break; - case WANT_IGNORE_COMMENTS: - want_ignore_comments++; - break; - case WANT_IGNORE_PIS: - want_ignore_pis++; - break; - case WANT_REPORT_BLOCKS: - want_report_blocks++; - break; - case WANT_REPORT_IGNORABLE: - want_report_ignorable++; - break; - case WANT_FILE_ENTITIES: - want_file_entities++; - break; - default: - usage(); - } - if (optind != argc) - usage(); - - out = bfdopen_shared(1, 4096); - struct xml_context ctx; - xml_init(&ctx); - if (!want_hide_errors) - ctx.h_warn = ctx.h_error = ctx.h_fatal = h_error; - if (want_sax) - { - ctx.h_document_start = h_document_start; - ctx.h_document_end = h_document_end; - ctx.h_xml_decl = h_xml_decl; - ctx.h_doctype_decl = h_doctype_decl; - ctx.h_comment = h_comment; - ctx.h_pi = h_pi; - ctx.h_stag = h_stag; - ctx.h_etag = h_etag; - ctx.h_chars = h_chars; - if (want_report_blocks) - { - ctx.h_block = h_block; - ctx.h_cdata = h_cdata; - } - if (want_report_ignorable) - ctx.h_ignorable = h_ignorable; - ctx.h_dtd_start = h_dtd_start; - ctx.h_dtd_end = h_dtd_end; - } - if (want_dom) - ctx.flags |= XML_ALLOC_ALL; - if (want_parse_dtd) - ctx.flags |= XML_PARSE_DTD; - if (want_ignore_comments) - ctx.flags &= ~(XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS); - if (want_ignore_pis) - ctx.flags &= ~(XML_REPORT_PIS | XML_ALLOC_PIS); - if (want_file_entities) - ctx.h_resolve_entity = h_resolve_entity; - xml_push_fastbuf(&ctx, bfdopen_shared(0, 4096)); - bputs(out, "PULL: start\n"); - if (want_pull) - { - ctx.pull = XML_PULL_CHARS | XML_PULL_STAG | XML_PULL_ETAG | XML_PULL_COMMENT | XML_PULL_PI; - 
uint state; - while (state = xml_next(&ctx)) - switch (state) - { - case XML_STATE_CHARS: - bputs(out, "PULL: chars"); - show_node(ctx.node); - break; - case XML_STATE_STAG: - bputs(out, "PULL: stag"); - show_node(ctx.node); - break; - case XML_STATE_ETAG: - bprintf(out, "PULL: etag \n", ctx.node->name); - break; - case XML_STATE_COMMENT: - bputs(out, "PULL: comment"); - show_node(ctx.node); - break; - case XML_STATE_PI: - bputs(out, "PULL: pi"); - show_node(ctx.node); - break; - default: - bputs(out, "PULL: unknown\n"); - break; - } - } - else - xml_parse(&ctx); - if (ctx.err_code) - bprintf(out, "PULL: fatal error at %u: %s\n", xml_row(&ctx), ctx.err_msg); - else - { - bputs(out, "PULL: eof\n"); - if (want_dom) - show_tree(ctx.dom, 0); - } - - xml_cleanup(&ctx); - bclose(out); - return 0; -} diff --git a/xml/xml-test.t b/xml/xml-test.t deleted file mode 100644 index 8d0f9bb1..00000000 --- a/xml/xml-test.t +++ /dev/null @@ -1,58 +0,0 @@ -# Tests for the XML parser -# (c) 2008 Pavel Charvat - -Run: ../obj/xml/xml-test -In: - -Out: PULL: start - PULL: eof - -Run: ../obj/xml/xml-test -s -In: - text1&amp;<text2 -Out: PULL: start - SAX: document_start - SAX: xml_decl version=1.0 standalone=0 fb_encoding=ISO-8859-1 - SAX: stag - SAX: stag a1='val1' a2='val2' - SAX: chars text='text1&<' - SAX: etag - SAX: chars text='text2' - SAX: etag - SAX: document_end - PULL: eof - -Run: ../obj/xml/xml-test -sptd -In: - - "> - %pe1; - - - ]> - &e1;&e2; -Out: PULL: start - SAX: document_start - SAX: xml_decl version=1.0 standalone=0 fb_encoding=UTF-8 - SAX: doctype_decl type=root public='' system='' extsub=0 intsub=1 - SAX: dtd_start - SAX: dtd_end - SAX: stag - PULL: stag - SAX: chars text='text' - PULL: chars text='text' - SAX: stag - PULL: stag - SAX: chars text='' - PULL: chars text='' - PULL: etag - SAX: etag - PULL: etag - SAX: etag - SAX: document_end - PULL: eof - DOM: element - DOM: chars text='text' - DOM: element - DOM: chars text='' diff --git a/xml/xml.h b/xml/xml.h 
deleted file mode 100644 index c048f56c..00000000 --- a/xml/xml.h +++ /dev/null @@ -1,294 +0,0 @@ -/* - * UCW Library -- A simple XML parser - * - * (c) 2007--2008 Pavel Charvat - * - * This software may be freely distributed and used according to the terms - * of the GNU Lesser General Public License. - */ - -#ifndef _UCW_XML_XML_H -#define _UCW_XML_XML_H - -#include -#include -#include -#include - -#ifdef CONFIG_UCW_CLEAN_ABI -#define xml_attr_find ucw_xml_attr_find -#define xml_attr_value ucw_xml_attr_value -#define xml_cleanup ucw_xml_cleanup -#define xml_def_find_entity ucw_xml_def_find_entity -#define xml_def_resolve_entity ucw_xml_def_resolve_entity -#define xml_error ucw_xml_error -#define xml_fatal ucw_xml_fatal -#define xml_init ucw_xml_init -#define xml_merge_chars ucw_xml_merge_chars -#define xml_merge_dom_chars ucw_xml_merge_dom_chars -#define xml_next ucw_xml_next -#define xml_next_state ucw_xml_next_state -#define xml_normalize_white ucw_xml_normalize_white -#define xml_parse ucw_xml_parse -#define xml_push_fastbuf ucw_xml_push_fastbuf -#define xml_reset ucw_xml_reset -#define xml_row ucw_xml_row -#define xml_skip_element ucw_xml_skip_element -#define xml_warn ucw_xml_warn -#endif - -struct xml_context; -struct xml_dtd_entity; - -enum xml_error { - XML_ERR_OK = 0, - XML_ERR_WARN = 1000, /* Warning */ - XML_ERR_ERROR = 2000, /* Recoverable error */ - XML_ERR_FATAL = 3000, /* Unrecoverable error */ - XML_ERR_EOF, -}; - -enum xml_state { - XML_STATE_EOF, /* EOF or a fatal error */ - XML_STATE_START, /* Initial state */ - XML_STATE_XML_DECL, /* XML_PULL_XML_DECL */ - XML_STATE_DOCTYPE_DECL, /* XML_PULL_DOCTYPE_DECL */ - XML_STATE_CHARS, /* XML_PULL_CHARS */ - XML_STATE_STAG, /* XML_PULL_STAG */ - XML_STATE_ETAG, /* XML_PULL_ETAG */ - XML_STATE_COMMENT, /* XML_PULL_COMMENT */ - XML_STATE_PI, /* XML_PULL_PI */ - - /* Internal states */ - XML_STATE_CHARS_BEFORE_STAG, - XML_STATE_CHARS_BEFORE_ETAG, - XML_STATE_CHARS_BEFORE_CDATA, - 
XML_STATE_CHARS_BEFORE_COMMENT, - XML_STATE_CHARS_BEFORE_PI, - XML_STATE_PROLOG_COMMENT, - XML_STATE_PROLOG_PI, - XML_STATE_EPILOG_COMMENT, - XML_STATE_EPILOG_PI, -}; - -enum xml_pull { - XML_PULL_XML_DECL = 0x00000001, /* Stop after the XML declaration */ - XML_PULL_DOCTYPE_DECL = 0x00000002, /* Stop in the doctype declaration (before optional internal subset) */ - XML_PULL_CHARS = 0x00000004, - XML_PULL_STAG = 0x00000008, - XML_PULL_ETAG = 0x00000010, - XML_PULL_COMMENT = 0x00000020, - XML_PULL_PI = 0x00000040, - XML_PULL_ALL = 0xffffffff, -}; - -enum xml_flags { - /* Enable reporting of various events via SAX and/or PULL interface */ - XML_REPORT_COMMENTS = 0x00000001, /* Report comments */ - XML_REPORT_PIS = 0x00000002, /* Report processing instructions */ - XML_REPORT_CHARS = 0x00000004, /* Report characters */ - XML_REPORT_TAGS = 0x00000008, /* Report element starts/ends */ - XML_REPORT_MISC = XML_REPORT_COMMENTS | XML_REPORT_PIS, - XML_REPORT_ALL = XML_REPORT_MISC | XML_REPORT_CHARS | XML_REPORT_TAGS, - - /* Enable construction of DOM for these types */ - XML_ALLOC_COMMENTS = 0x00000010, /* Create comment nodes */ - XML_ALLOC_PIS = 0x00000020, /* Create processing instruction nodes */ - XML_ALLOC_CHARS = 0x00000040, /* Create character nodes */ - XML_ALLOC_TAGS = 0x00000080, /* Create element nodes */ - XML_ALLOC_MISC = XML_ALLOC_COMMENTS | XML_ALLOC_PIS, - XML_ALLOC_ALL = XML_ALLOC_MISC | XML_ALLOC_CHARS | XML_ALLOC_TAGS, - - /* Other parameters */ - XML_VALIDATING = 0x00000100, /* Validate everything (not fully implemented!) */ - XML_PARSE_DTD = 0x00000200, /* Enable parsing of DTD */ - XML_NO_CHARS = 0x00000400, /* The current element must not contain character data (filled automaticaly if using DTD) */ - XML_ALLOC_DEFAULT_ATTRS = 0x00000800, /* Allocate default attribute values so they can be found by XML_ATTR_FOR_EACH */ - - /* Internals, do not change! 
*/ - XML_EMPTY_ELEM_TAG = 0x00010000, /* The current element match EmptyElemTag */ - XML_VERSION_1_1 = 0x00020000, /* XML version is 1.1, otherwise 1.0 */ - XML_HAS_EXTERNAL_SUBSET = 0x00040000, /* The document contains a reference to external DTD subset */ - XML_HAS_INTERNAL_SUBSET = 0x00080000, /* The document contains an internal subset */ - XML_HAS_DTD = XML_HAS_EXTERNAL_SUBSET | XML_HAS_INTERNAL_SUBSET, - XML_SRC_EOF = 0x00100000, /* EOF reached */ - XML_SRC_EXPECTED_DECL = 0x00200000, /* Just before optional or required XMLDecl/TextDecl */ - XML_SRC_DOCUMENT = 0x00400000, /* The document entity */ - XML_SRC_EXTERNAL = 0x00800000, /* An external entity */ -}; - -enum xml_node_type { - XML_NODE_ELEM, - XML_NODE_COMMENT, - XML_NODE_CHARS, - XML_NODE_PI, -}; - -#define XML_NODE_FOR_EACH(var, node) CLIST_FOR_EACH(struct xml_node *, var, (node)->sons) -#define XML_ATTR_FOR_EACH(var, node) SLIST_FOR_EACH(struct xml_attr *, var, (node)->attrs) - -struct xml_node { - cnode n; /* Node for list of parent's sons */ - uint type; /* XML_NODE_x */ - struct xml_node *parent; /* Parent node */ - char *name; /* Element name / PI target */ - clist sons; /* Children nodes */ - union { - struct { - char *text; /* PI text / Comment / CDATA */ - uint len; /* Text length in bytes */ - }; - struct { - struct xml_dtd_elem *dtd; /* Element DTD */ - slist attrs; /* Link list of element attributes */ - }; - }; - void *user; /* User-defined (initialized to NULL) */ -}; - -struct xml_attr { - snode n; /* Node for elem->attrs */ - struct xml_node *elem; /* Parent element */ - struct xml_dtd_attr *dtd; /* Attribute DTD */ - char *name; /* Attribute name */ - char *val; /* Attribute value */ - void *user; /* User-defined (initialized to NULL) */ -}; - -#define XML_BUF_SIZE 32 /* At least 8 -- hardcoded */ - -struct xml_source { - struct xml_source *next; /* Link list of pending fastbufs (xml_context.sources) */ - struct fastbuf *fb; /* Source fastbuf */ - struct fastbuf *wrapped_fb; /* 
Original wrapped fastbuf (needed for cleanup) */ - struct fastbuf wrap_fb; /* Fbmem wrapper */ - u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */ - u32 *bptr, *bstop; /* Current state of the buffer */ - uint row; /* File position */ - char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */ - char *fb_encoding; /* Encoding of the source fastbuf */ - char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */ - uint refill_cat1; /* Character categories, which should be directly passed to the buffer */ - uint refill_cat2; /* Character categories, which should be processed as newlines (possibly in some built-in - sequences) */ - void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */ - unsigned short *refill_in_to_x; /* Libucw-charset input table */ - uint saved_depth; /* Saved ctx->depth */ - uint pending_0xd; /* The last read character is 0xD */ -}; - -struct xml_context { - /* Error handling */ - char *err_msg; /* Last error message */ - enum xml_error err_code; /* Last error code */ - void *throw_buf; /* Where to jump on error */ - void (*h_warn)(struct xml_context *ctx); /* Warning callback */ - void (*h_error)(struct xml_context *ctx); /* Recoverable error callback */ - void (*h_fatal)(struct xml_context *ctx); /* Unrecoverable error callback */ - - /* Memory management */ - struct mempool *pool; /* DOM pool */ - struct mempool *stack; /* Stack pool (freed as soon as possible) */ - struct xml_stack *stack_list; /* See xml_push(), xml_pop() */ - uint flags; /* XML_FLAG_x (restored on xml_pop()) */ - uint depth; /* Nesting level (for checking of valid source nesting -> valid pushes/pops on memory pools) */ - struct fastbuf chars; /* Character data / attribute value */ - struct mempool_state chars_state; /* Mempool state before the current character block has started */ - char *chars_trivial; /* If not empty, it will be 
appended to chars */ - void *tab_attrs; /* Hash table of element attributes */ - - /* Input */ - struct xml_source *src; /* Current source */ - u32 *bptr, *bstop; /* Buffer with preprocessed characters (validated UCS-4 + category flags) */ - uint cat_chars; /* Unicode range of supported characters (cdata, attribute values, ...) */ - uint cat_unrestricted; /* Unrestricted characters (may appear in document/external entities) */ - uint cat_new_line; /* New line characters */ - uint cat_name; /* Characters that may appear in names */ - uint cat_sname; /* Characters that may begin a name */ - - /* SAX-like interface */ - void (*h_document_start)(struct xml_context *ctx); /* Called before entering prolog */ - void (*h_document_end)(struct xml_context *ctx); /* Called after leaving epilog */ - void (*h_xml_decl)(struct xml_context *ctx); /* Called after the XML declaration */ - void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration (before optional internal subset) */ - void (*h_comment)(struct xml_context *ctx); /* Called after a comment (only with XML_REPORT_COMMENTS) */ - void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction (only with XML_REPORT_PIS) */ - void (*h_stag)(struct xml_context *ctx); /* Called after STag or EmptyElemTag (only with XML_REPORT_TAGS) */ - void (*h_etag)(struct xml_context *ctx); /* Called before ETag or after EmptyElemTag (only with XML_REPORT_TAGS) */ - void (*h_chars)(struct xml_context *ctx); /* Called after some characters (only with XML_REPORT_CHARS) */ - void (*h_block)(struct xml_context *ctx, char *text, uint len); /* Called for each continuous block of characters not reported by h_cdata() (only with XML_REPORT_CHARS) */ - void (*h_cdata)(struct xml_context *ctx, char *text, uint len); /* Called for each CDATA section (only with XML_REPORT_CHARS) */ - void (*h_ignorable)(struct xml_context *ctx, char *text, uint len); /* Called for ignorable whitespace (content in tags without 
#PCDATA) */ - void (*h_dtd_start)(struct xml_context *ctx); /* Called just after the DTD structure is initialized */ - void (*h_dtd_end)(struct xml_context *ctx); /* Called after DTD subsets subsets */ - struct xml_dtd_entity *(*h_find_entity)(struct xml_context *ctx, char *name); /* Called when needed to resolve a general entity */ - void (*h_resolve_entity)(struct xml_context *ctx, struct xml_dtd_entity *ent); /* User should push source fastbuf for a parsed external entity (either general or parameter) */ - - /* DOM */ - struct xml_node *dom; /* DOM root */ - struct xml_node *node; /* Current DOM node */ - - char *version_str; - uint standalone; - char *doctype; /* The document type (or NULL if unknown) */ - char *system_id; /* DTD external id */ - char *public_id; /* DTD public id */ - struct xml_dtd *dtd; /* The DTD structure (or NULL) */ - uint state; /* Current state for the PULL interface (XML_STATE_x) */ - uint pull; /* Parameters for the PULL interface (XML_PULL_x) */ -}; - -/* Initialize XML context */ -void xml_init(struct xml_context *ctx); - -/* Clean up all internal structures */ -void xml_cleanup(struct xml_context *ctx); - -/* Reuse XML context, equivalent to xml_cleanup() and xml_init() */ -void xml_reset(struct xml_context *ctx); - -/* Add XML source (fastbuf will be automatically closed) */ -struct xml_source *xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb); - -/* Parse without the PULL interface, return XML_ERR_x code (zero on success) */ -uint xml_parse(struct xml_context *ctx); - -/* Parse with the PULL interface, return XML_STATE_x (zero on EOF or fatal error) */ -uint xml_next(struct xml_context *ctx); - -/* Equivalent to xml_next, but with temporarily changed ctx->pull value */ -uint xml_next_state(struct xml_context *ctx, uint pull); - -/* May be called on XML_STATE_STAG to skip it's content; can return XML_STATE_ETAG or XML_STATE_EOF on fatal error */ -uint xml_skip_element(struct xml_context *ctx); - -/* Returns the current 
row number in the document entity */ -uint xml_row(struct xml_context *ctx); - -/* Finds a given attribute value in a XML_NODE_ELEM node */ -struct xml_attr *xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name); - -/* Similar to xml_attr_find, but it deals also with default values */ -char *xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name); - -/* The default value of h_find_entity(), knows <, >, &, ' and " */ -struct xml_dtd_entity *xml_def_find_entity(struct xml_context *ctx, char *name); - -/* The default value of h_resolve_entity(), throws an error */ -void xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent); - -/* Remove leading/trailing spaces and replaces sequences of spaces to a single space character (non-CDATA attribute normalization) */ -uint xml_normalize_white(struct xml_context *ctx, char *value); - -/* Merge character contents of a given element to a single string (not recursive) */ -char *xml_merge_chars(struct xml_context *ctx, struct xml_node *node, struct mempool *pool); - -/* Merge character contents of a given subtree to a single string */ -char *xml_merge_dom_chars(struct xml_context *ctx, struct xml_node *node, struct mempool *pool); - -/* Public part of error handling */ -void xml_warn(struct xml_context *ctx, const char *format, ...); -void xml_error(struct xml_context *ctx, const char *format, ...); -void NONRET xml_fatal(struct xml_context *ctx, const char *format, ...); - -#endif