From: Pavel Charvat Date: Wed, 12 Dec 2007 00:57:04 +0000 (+0100) Subject: XML: Split to several files, revised part of iface and X-Git-Tag: holmes-import~481 X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;h=d998b1961061c93132531d6d9cd2772a0c51ea1a;hp=637533a60b2201eaadedcb00fc66ef1e20237432;p=libucw.git XML: Split to several files, revised part of iface and memory handling, added a testing utility. --- diff --git a/sherlock/xml/Makefile b/sherlock/xml/Makefile index f721b500..cc9fda50 100644 --- a/sherlock/xml/Makefile +++ b/sherlock/xml/Makefile @@ -2,22 +2,27 @@ # (c) 2007 Pavel Charvat DIRS+=sherlock/xml +PROGS+=$(o)/sherlock/xml/xml-test -LIBSHXML_MODS=xml +LIBSHXML_MODS=common parse dtd LIBSHXML_INCLUDES=xml.h dtd.h LIBSHXML_MOD_PATHS=$(addprefix $(o)/sherlock/xml/,$(LIBSHXML_MODS)) $(o)/sherlock/xml/libshxml.a: $(addsuffix .o,$(LIBSHXML_MOD_PATHS)) $(o)/sherlock/xml/libshxml.so: $(addsuffix .oo,$(LIBSHXML_MOD_PATHS)) -$(o)/sherlock/xml/libshxml.pc: $(LIBUCW) $(LIBCHARSET) +$(o)/sherlock/xml/libshxml.pc: $(LIBSH) $(LIBCHARSET) -$(o)/sherlock/xml/xml-t: $(LIBSHXML) -$(o)/sherlock/xml/xml.o: $(o)/sherlock/xml/unicat.h -$(o)/sherlock/xml/unicat.h: $(s)/sherlock/xml/unicat.pl - $(M)GEN $@ - $(Q)$< >$@ +$(o)/sherlock/xml/common.o $(o)/sherlock/xml/unicat.h: $(o)/sherlock/xml/unicat.stamp +$(o)/sherlock/xml/unicat.stamp: $(s)/sherlock/xml/unicat.pl + $(M)GEN $(addprefix $(o)/sherlock/xml/unicat,.h .c) + $(Q)$< $(addprefix $(o)/sherlock/xml/unicat,.h .c) + $(Q)touch $@ +$(o)/sherlock/xml/xml-test: $(o)/sherlock/xml/xml-test.o $(LIBSHXML) + +API_LIBS+=libshxml API_INCLUDES+=$(o)/sherlock/xml/.include-stamp $(o)/sherlock/xml/.include-stamp: $(addprefix $(s)/sherlock/xml/,$(LIBSHXML_INCLUDES)) $(o)/sherlock/xml/.include-stamp: IDST=sherlock/xml +run/lib/pkgconfig/libshxml.pc: $(o)/sherlock/xml/libshxml.pc diff --git a/sherlock/xml/common.c b/sherlock/xml/common.c new file mode 100644 index 00000000..4d96cecc --- /dev/null +++ b/sherlock/xml/common.c @@ -0,0 
+1,652 @@ +/* + * Sherlock Library -- A simple XML parser + * + * (c) 2007 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#define LOCAL_DEBUG + +#include "lib/lib.h" +#include "lib/mempool.h" +#include "lib/fastbuf.h" +#include "lib/ff-unicode.h" +#include "lib/ff-binary.h" +#include "lib/chartype.h" +#include "lib/unicode.h" +#include "lib/hashfunc.h" +#include "lib/stkstring.h" +#include "lib/unaligned.h" +#include "charset/charconv.h" +#include "charset/fb-charconv.h" +#include "sherlock/xml/xml.h" +#include "sherlock/xml/dtd.h" +#include "sherlock/xml/common.h" + +#include <setjmp.h> + +/*** Error handling ***/ + +void NONRET +xml_throw(struct xml_context *ctx) +{ + ASSERT(ctx->err_code && ctx->throw_buf); + longjmp(*(jmp_buf *)ctx->throw_buf, ctx->err_code); +} + +void +xml_warn(struct xml_context *ctx, const char *format, ...) +{ + if (ctx->h_warn) + { + va_list args; + va_start(args, format); + ctx->err_msg = stk_vprintf(format, args); + ctx->err_code = XML_ERR_WARN; + va_end(args); + ctx->h_warn(ctx); + ctx->err_msg = NULL; + ctx->err_code = XML_ERR_OK; + } +} + +void +xml_error(struct xml_context *ctx, const char *format, ...) +{ + if (ctx->h_error) + { + va_list args; + va_start(args, format); + ctx->err_msg = stk_vprintf(format, args); + ctx->err_code = XML_ERR_ERROR; + va_end(args); + ctx->h_error(ctx); + ctx->err_msg = NULL; + ctx->err_code = XML_ERR_OK; + } +} + +void NONRET +xml_fatal(struct xml_context *ctx, const char *format, ...) 
+{ + va_list args; + va_start(args, format); + ctx->err_msg = mp_vprintf(ctx->stack, format, args); + ctx->err_code = XML_ERR_FATAL; + ctx->state = XML_STATE_FATAL; + va_end(args); + if (ctx->h_fatal) + ctx->h_fatal(ctx); + xml_throw(ctx); +} + +/*** Character categorization ***/ + +#include "obj/sherlock/xml/unicat.c" + +/*** Memory management ***/ + +void NONRET +xml_fatal_nested(struct xml_context *ctx) +{ + xml_fatal(ctx, "Entity not nested correctly"); +} + +void * +xml_hash_new(struct mempool *pool, uns size) +{ + void *tab = mp_alloc_zero(pool, size + XML_HASH_HDR_SIZE); + *(void **)tab = pool; + return tab + XML_HASH_HDR_SIZE; +} + +/*** Reading of document/external entities ***/ + +static void NONRET +xml_eof(struct xml_context *ctx) +{ + ctx->err_msg = "Unexpected EOF"; + ctx->err_code = XML_ERR_EOF; + xml_throw(ctx); +} + +static inline void +xml_add_char(u32 **bstop, uns c) +{ + *(*bstop)++ = c; + *(*bstop)++ = xml_char_cat(c); +} + +struct xml_source * +xml_push_source(struct xml_context *ctx, uns flags) +{ + xml_push(ctx); + struct xml_source *src = ctx->src; + if (src) + { + src->bptr = ctx->bptr; + src->bstop = ctx->bstop; + } + src = mp_alloc_zero(ctx->stack, sizeof(*src)); + src->next = ctx->src; + src->saved_depth = ctx->depth; + ctx->src = src; + ctx->flags = (ctx->flags & ~(XML_FLAG_SRC_EOF | XML_FLAG_SRC_EXPECTED_DECL | XML_FLAG_SRC_NEW_LINE | XML_FLAG_SRC_SURROUND | XML_FLAG_SRC_DOCUMENT)) | flags; + ctx->bstop = ctx->bptr = src->buf; + ctx->depth = 0; + if (flags & XML_FLAG_SRC_SURROUND) + xml_add_char(&ctx->bstop, 0x20); + return src; +} + +static void +xml_pop_source(struct xml_context *ctx) +{ + TRACE(ctx, "pop_source"); + if (unlikely(ctx->depth != 0)) + xml_fatal_nested(ctx); + struct xml_source *src = ctx->src; + ASSERT(src); + bclose(src->fb); + ctx->depth = src->saved_depth; + ctx->src = src = src->next; + if (src) + { + ctx->bptr = src->bptr; + ctx->bstop = src->bstop; + } + xml_pop(ctx); + if (unlikely(!src)) + xml_eof(ctx); +} + 
+static void xml_refill_utf8(struct xml_context *ctx); + +void +xml_push_entity(struct xml_context *ctx, struct xml_dtd_ent *ent) +{ + TRACE(ctx, "xml_push_entity"); + uns cat1 = ctx->src->refill_cat1; + uns cat2 = ctx->src->refill_cat2; + struct xml_source *src = xml_push_source(ctx, 0); + src->refill_cat1 = cat1; + src->refill_cat2 = cat2; + if (ent->flags & XML_DTD_ENT_EXTERNAL) + xml_fatal(ctx, "External entities not implemented"); // FIXME + else + { + fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, ent->len, 0); + src->refill = xml_refill_utf8; + } +} + +void +xml_set_source(struct xml_context *ctx, struct fastbuf *fb) +{ + TRACE(ctx, "xml_set_source"); + ASSERT(!ctx->src); + struct xml_source *src = xml_push_source(ctx, XML_FLAG_SRC_DOCUMENT | XML_FLAG_SRC_EXPECTED_DECL); + src->fb = fb; +} + +static uns +xml_error_restricted(struct xml_context *ctx, uns c) +{ + if (c == ~1U) + xml_error(ctx, "Corrupted encoding"); + else + xml_error(ctx, "Restricted char U+%04X", c); + return UNI_REPLACEMENT; +} + +void xml_parse_decl(struct xml_context *ctx); + +#define REFILL(ctx, func, params...) \ + struct xml_source *src = ctx->src; \ + struct fastbuf *fb = src->fb; \ + if (ctx->bptr == ctx->bstop) \ + ctx->bptr = ctx->bstop = src->buf; \ + uns f = ctx->flags, c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \ + u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \ + *last_0xd = (f & XML_FLAG_SRC_NEW_LINE) ? 
bstop : bend; \ + do \ + { \ + c = func(fb, ##params); \ + uns t = xml_char_cat(c); \ + if (t & t1) \ + /* Typical branch */ \ + *bstop++ = c, *bstop++ = t; \ + else if (t & t2) \ + { \ + /* New line */ \ + /* XML 1.0: 0xA | 0xD | 0xD 0xA */ \ + /* XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ \ + if (c == 0xd) \ + last_0xd = bstop + 2; \ + else if (c != 0x2028 && last_0xd == bstop) \ + { \ + last_0xd = bend; \ + continue; \ + } \ + xml_add_char(&bstop, 0xa), row++; \ + } \ + else if (c == '>') \ + { \ + /* Used only in XML/TextDecl to switch the encoding */ \ + *bstop++ = c, *bstop++ = t; \ + break; \ + } \ + else if (~c) \ + /* Restricted character */ \ + xml_add_char(&bstop, xml_error_restricted(ctx, c)); \ + else \ + { \ + /* EOF */ \ + if (f & XML_FLAG_SRC_SURROUND) \ + xml_add_char(&bstop, 0x20); \ + f |= XML_FLAG_SRC_EOF; \ + break; \ + } \ + } \ + while (bstop < bend); \ + ctx->flags = (last_0xd == bstop) ? f | XML_FLAG_SRC_NEW_LINE : f & ~XML_FLAG_SRC_NEW_LINE; \ + ctx->bstop = bstop; \ + src->row = row; + +static void +xml_refill_utf8(struct xml_context *ctx) +{ + REFILL(ctx, bget_utf8_repl, ~1U); +} + +static void +xml_refill_utf16_le(struct xml_context *ctx) +{ + REFILL(ctx, bget_utf16_le_repl, ~1U); +} + +static void +xml_refill_utf16_be(struct xml_context *ctx) +{ + REFILL(ctx, bget_utf16_be_repl, ~1U); +} + +#if 0 +static inline uns +xml_refill_libcharset_bget(struct fastbuf *fb, unsigned short int *in_to_x) +{ + // FIXME: slow + int c; + return (unlikely(c = bgetc(fb) < 0)) ? 
c : (int)conv_x_to_ucs(in_to_x[c]); +} + +static void +xml_refill_libcharset(struct xml_context *ctx) +{ + unsigned short int *in_to_x = ctx->src->refill_in_to_x; + REFILL(ctx, xml_refill_libcharset_bget, in_to_x); +} +#endif + +#undef REFILL + +void +xml_refill(struct xml_context *ctx) +{ + do + { + if (ctx->flags & XML_FLAG_SRC_EOF) + xml_pop_source(ctx); + else if (ctx->flags & XML_FLAG_SRC_EXPECTED_DECL) + xml_parse_decl(ctx); + else + { + ctx->src->refill(ctx); + TRACE(ctx, "refilled %u characters", (uns)((ctx->bstop - ctx->bptr) / 2)); + } + } + while (ctx->bptr == ctx->bstop); +} + +uns +xml_row(struct xml_context *ctx) +{ + struct xml_source *src = ctx->src; + if (!src) + return 0; + uns row = src->row; + for (u32 *p = ctx->bstop; p != ctx->bptr; p -= 2) + if (p[-1] & src->refill_cat2) + row--; + return row + 1; +} + +/*** Basic parsing ***/ + +void NONRET +xml_fatal_expected(struct xml_context *ctx, uns c) +{ + xml_fatal(ctx, "Expected '%c'", c); +} + +void NONRET +xml_fatal_expected_white(struct xml_context *ctx) +{ + xml_fatal(ctx, "Expected a white space"); +} + +void NONRET +xml_fatal_expected_quot(struct xml_context *ctx) +{ + xml_fatal(ctx, "Expected a quotation mark"); +} + +void +xml_parse_eq(struct xml_context *ctx) +{ + /* Eq ::= S? '=' S? 
*/ + xml_parse_white(ctx, 0); + xml_parse_char(ctx, '='); + xml_parse_white(ctx, 0); +} + +/* Names and nmtokens */ + +static char * +xml_parse_string(struct xml_context *ctx, struct mempool *pool, uns first_cat, uns next_cat, char *err) +{ + char *p = mp_start_noalign(pool, 1); + if (unlikely(!(xml_peek_cat(ctx) & first_cat))) + xml_fatal(ctx, "%s", err); + do + { + p = mp_spread(pool, p, 5); + p = utf8_32_put(p, xml_skip_char(ctx)); + } + while (xml_peek_cat(ctx) & next_cat); + *p++ = 0; + return mp_end(pool, p); +} + +static void +xml_skip_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err) +{ + if (unlikely(!(xml_get_cat(ctx) & first_cat))) + xml_fatal(ctx, "%s", err); + while (xml_peek_cat(ctx) & next_cat) + xml_skip_char(ctx); +} + +char * +xml_parse_name(struct xml_context *ctx, struct mempool *pool) +{ + /* Name ::= NameStartChar (NameChar)* */ + return xml_parse_string(ctx, pool, + !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1, + !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1, + "Expected a name"); +} + +void +xml_skip_name(struct xml_context *ctx) +{ + xml_skip_string(ctx, + !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1, + !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1, + "Expected a name"); +} + +char * +xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool) +{ + /* Nmtoken ::= (NameChar)+ */ + uns cat = !(ctx->flags & XML_FLAG_VERSION_1_1) ? 
XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1; + return xml_parse_string(ctx, pool, cat, cat, "Expected a nmtoken"); +} + +/* Simple literals */ + +char * +xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool) +{ + /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */ + char *p = mp_start_noalign(pool, 1); + uns q = xml_parse_quote(ctx), c; + while ((c = xml_get_char(ctx)) != q) + { + p = mp_spread(pool, p, 5); + p = utf8_32_put(p, c); + } + *p++ = 0; + return mp_end(pool, p); +} + +char * +xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool) +{ + /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */ + char *p = mp_start_noalign(pool, 1); + uns q = xml_parse_quote(ctx), c; + while ((c = xml_get_char(ctx)) != q) + { + if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID))) + xml_fatal(ctx, "Expected a pubid character"); + p = mp_spread(pool, p, 2); + *p++ = c; + } + *p++ = 0; + return mp_end(pool, p); +} + +static char * +xml_parse_encoding_name(struct xml_context *ctx) +{ + /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" */ + char *p = mp_start_noalign(ctx->pool, 1); + uns q = xml_parse_quote(ctx); + if (unlikely(!(xml_get_cat(ctx) & XML_CHAR_ENC_SNAME))) + xml_fatal(ctx, "Invalid character in the encoding name"); + while (1) + { + p = mp_spread(ctx->pool, p, 2); + *p++ = xml_last_char(ctx); + if (xml_get_char(ctx) == q) + break; + if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME))) + xml_fatal(ctx, "Invalid character in the encoding name"); + } + *p++ = 0; + return mp_end(ctx->pool, p); +} + +/* Document/external entity header */ + +static inline void +xml_init_cats(struct xml_context *ctx, uns mask) +{ + if (!(ctx->flags & XML_FLAG_VERSION_1_1)) + { + ctx->src->refill_cat1 = XML_CHAR_VALID_1_0 & ~XML_CHAR_NEW_LINE_1_0 & ~mask; + ctx->src->refill_cat2 = XML_CHAR_NEW_LINE_1_0; + } + else + { + ctx->src->refill_cat1 = XML_CHAR_UNRESTRICTED_1_1 & ~XML_CHAR_NEW_LINE_1_1 
& ~mask; + ctx->src->refill_cat2 = XML_CHAR_NEW_LINE_1_1; + } +} + +static void +xml_init_charconv(struct xml_context *ctx, int cs) +{ + // FIXME: hack + struct xml_source *src = ctx->src; + TRACE(ctx, "wrapping charset %s", charset_name(cs)); +#if 0 + struct conv_context conv; + conv_set_charset(&conv, cs, CONV_CHARSET_UTF8); + src->refill = xml_refill_libcharset; + src->refill_in_to_x = conv.in_to_x; +#else + src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8); + // FIXME: memory leak +#endif +} + +void +xml_parse_decl(struct xml_context *ctx) +{ + TRACE(ctx, "xml_parse_decl"); + struct xml_source *src = ctx->src; + ctx->flags &= ~XML_FLAG_SRC_EXPECTED_DECL; + + /* Setup valid Unicode ranges and force the reader to abort refill() after each '>', where we can switch encoding or XML version */ + xml_init_cats(ctx, XML_CHAR_GT); + + /* Initialize the supplied charset (if any) or try to guess it */ + char *expected_encoding = src->expected_encoding ? : src->fb_encoding; + src->refill = xml_refill_utf8; + int bom = bpeekc(src->fb); + if (bom < 0) + ctx->flags |= XML_FLAG_SRC_EOF; + if (!src->fb_encoding) + { + if (bom == 0xfe) + src->refill = xml_refill_utf16_be; + else if (bom == 0xff) + src->refill = xml_refill_utf16_le; + } + else + { + int cs = find_charset_by_name(src->fb_encoding); + if (cs == CONV_CHARSET_UTF8) + {} + else if (cs >= 0) + { + xml_init_charconv(ctx, cs); + bom = 0; + } + else if (strcasecmp(src->fb_encoding, "UTF-16")) + { + src->refill = xml_refill_utf16_be; + if (bom == 0xff) + src->refill = xml_refill_utf16_le; + if (!src->expected_encoding) + expected_encoding = (bom == 0xff) ? 
"UTF-16LE" : "UTF-16BE"; + } + else if (strcasecmp(src->fb_encoding, "UTF-16BE")) + src->refill = xml_refill_utf16_be; + else if (strcasecmp(src->fb_encoding, "UTF-16LE")) + src->refill = xml_refill_utf16_le; + else + { + xml_error(ctx, "Unknown encoding '%s'", src->fb_encoding); + expected_encoding = NULL; + } + } + uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be; + if (bom > 0 && xml_peek_char(ctx) == 0xfeff) + xml_skip_char(ctx); + else if (utf16) + xml_error(ctx, "Missing or corrupted BOM"); + + /* Look ahead for presence of XMLDecl or optional TextDecl */ + if (!(ctx->flags & XML_FLAG_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf)) + xml_refill(ctx); + uns doc = ctx->flags & XML_FLAG_SRC_DOCUMENT; + u32 *bptr = ctx->bptr; + uns have_decl = (12 <= ctx->bstop - ctx->bptr && (bptr[11] & XML_CHAR_WHITE) && + bptr[0] == '<' && bptr[2] == '?' && (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L'); + if (!have_decl) + { + if (doc) + xml_fatal(ctx, "Missing or corrupted XML header"); + else if (expected_encoding && strcasecmp(src->expected_encoding, "UTF-8") && !utf16) + xml_error(ctx, "Missing or corrupted entity header"); + goto exit; + } + ctx->bptr = bptr + 12; + xml_parse_white(ctx, 0); + + /* Parse version string (mandatory in XMLDecl, optional in TextDecl) */ + if (xml_peek_char(ctx) == 'v') + { + xml_parse_seq(ctx, "version"); + xml_parse_eq(ctx); + char *version = xml_parse_pubid_literal(ctx, ctx->pool); + TRACE(ctx, "version=%s", version); + uns v = 0; + if (!strcmp(version, "1.1")) + v = XML_FLAG_VERSION_1_1; + else if (strcmp(version, "1.0")) + { + xml_error(ctx, "Unknown XML version string '%s'", version); + version = "1.0"; + } + if (doc) + { + ctx->version_str = version; + ctx->flags |= v; + } + else if (v > (ctx->flags & XML_FLAG_VERSION_1_1)) + xml_error(ctx, "XML 1.1 external entity included from XML 1.0 document"); + if (!xml_parse_white(ctx, !doc)) + goto end; + } + else if 
(doc) + { + xml_error(ctx, "Expected XML version"); + ctx->version_str = "1.0"; + } + + /* Parse encoding string (optional in XMLDecl, mandatory in TextDecl) */ + if (xml_peek_char(ctx) == 'e') + { + xml_parse_seq(ctx, "encoding"); + xml_parse_eq(ctx); + src->decl_encoding = xml_parse_encoding_name(ctx); + TRACE(ctx, "encoding=%s", src->decl_encoding); + if (!xml_parse_white(ctx, 0)) + goto end; + } + else if (!doc) + xml_error(ctx, "Expected XML encoding"); + + /* Parse whether the document is standalone (optional in XMLDecl) */ + if (doc && xml_peek_char(ctx) == 's') + { + xml_parse_seq(ctx, "standalone"); + xml_parse_eq(ctx); + uns c = xml_parse_quote(ctx); + if (ctx->standalone = (xml_peek_char(ctx) == 'y')) + xml_parse_seq(ctx, "yes"); + else + xml_parse_seq(ctx, "no"); + xml_parse_char(ctx, c); + TRACE(ctx, "standalone=%d", ctx->standalone); + xml_parse_white(ctx, 0); + } +end: + xml_parse_seq(ctx, "?>"); + + /* Switch to the final encoding */ + if (src->decl_encoding) + { + int cs = find_charset_by_name(src->decl_encoding); + if (cs < 0 && !expected_encoding) + xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding); + else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8) + xml_init_charconv(ctx, cs); + else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 || + !(!strcasecmp(src->decl_encoding, "UTF-16") || + (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) || + (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE"))))) + xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding); + } + +exit: + /* Update valid Unicode ranges */ + xml_init_cats(ctx, 0); +} diff --git a/sherlock/xml/common.h b/sherlock/xml/common.h new file mode 100644 index 00000000..ed18e8af --- /dev/null +++ b/sherlock/xml/common.h @@ -0,0 +1,327 @@ +/* + * Sherlock Library -- A simple XML parser + * + * 
(c) 2007 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#ifndef _SHERLOCK_XML_COMMON_H +#define _SHERLOCK_XML_COMMON_H + +#include "sherlock/xml/xml.h" +#include "sherlock/xml/dtd.h" + +/*** Debugging ***/ + +#ifdef LOCAL_DEBUG +#define TRACE(c, f, p...) do { DBG("XML %u: " f, xml_row(c), ##p); } while(0) +#else +#define TRACE(c, f, p...) do {} while(0) +#endif + +/*** Error handling ***/ + +void NONRET xml_throw(struct xml_context *ctx); +void xml_warn(struct xml_context *ctx, const char *format, ...); +void xml_error(struct xml_context *ctx, const char *format, ...); +void xml_fatal(struct xml_context *ctx, const char *format, ...); + +/*** Character categorization ***/ + +#include "obj/sherlock/xml/unicat.h" + +static inline uns +xml_char_cat(uns c) +{ + if (c < 0x10000) + return 1U << xml_char_tab1[(c & 0xff) + xml_char_tab2[c >> 8]]; + else if (likely(c < 0x110000)) + return 1U << xml_char_tab3[c >> 16]; + else + return 1; +} + +static inline uns +xml_ascii_cat(uns c) +{ + return xml_char_tab1[c]; +} + +/*** Memory management ***/ + +void NONRET xml_fatal_nested(struct xml_context *ctx); + +static inline void +xml_inc(struct xml_context *ctx) +{ + /* Called after the first character of a block */ + TRACE(ctx, "inc"); + ctx->depth++; +} + +static inline void +xml_dec(struct xml_context *ctx) +{ + /* Called after the last character of a block */ + TRACE(ctx, "dec"); + if (unlikely(!ctx->depth--)) + xml_fatal_nested(ctx); +} + +struct xml_stack { + struct xml_stack *next; + struct mempool_state state; + uns flags; +}; + +static inline void * +xml_do_push(struct xml_context *ctx, uns size) +{ + /* Saves ctx->stack and ctx->flags state */ + struct mempool_state state; + mp_save(ctx->stack, &state); + struct xml_stack *s = mp_alloc(ctx->stack, size); + s->state = state; + s->flags = ctx->flags; + s->next = ctx->stack_list; + ctx->stack_list = s; + return s; +} + 
+static inline void +xml_do_pop(struct xml_context *ctx, struct xml_stack *s) +{ + /* Restore ctx->stack and ctx->flags state */ + ctx->stack_list = s->next; + ctx->flags = s->flags; + mp_restore(ctx->stack, &s->state); +} + +static inline void +xml_push(struct xml_context *ctx) +{ + TRACE(ctx, "push"); + xml_do_push(ctx, sizeof(struct xml_stack)); +} + +static inline void +xml_pop(struct xml_context *ctx) +{ + TRACE(ctx, "pop"); + ASSERT(ctx->stack_list); + xml_do_pop(ctx, ctx->stack_list); +} + +struct xml_dom_stack { + struct xml_stack stack; + struct mempool_state state; +}; + +static inline struct xml_node * +xml_push_dom(struct xml_context *ctx) +{ + /* Create a new DOM node */ + TRACE(ctx, "push_dom"); + struct xml_dom_stack *s = xml_do_push(ctx, sizeof(*s)); + mp_save(ctx->pool, &s->state); + struct xml_node *n = mp_alloc(ctx->pool, sizeof(*n)); + if (n->parent = ctx->node) + clist_add_tail(&n->parent->sons, &n->n); + return ctx->node = n; +} + +static inline void +xml_pop_dom(struct xml_context *ctx) +{ + /* Leave DOM subtree */ + TRACE(ctx, "pop_dom"); + ASSERT(ctx->node); + struct xml_node *p = ctx->node->parent; + struct xml_dom_stack *s = (void *)ctx->stack_list; + if (ctx->flags & XML_DOM_FREE) + { + /* See xml_pop_element() for cleanup of attribute hash table */ + if (p) + clist_remove(&ctx->node->n); + mp_restore(ctx->pool, &s->state); + } + ctx->node = p; + xml_do_pop(ctx, &s->stack); +} + +#define XML_HASH_HDR_SIZE ALIGN_TO(sizeof(void *), CPU_STRUCT_ALIGN) +#define XML_HASH_GIVE_ALLOC struct HASH_PREFIX(table); \ + static inline void *HASH_PREFIX(alloc)(struct HASH_PREFIX(table) *t, uns size) \ + { return mp_alloc(*(void **)((void *)t - XML_HASH_HDR_SIZE), size); } \ + static inline void HASH_PREFIX(free)(struct HASH_PREFIX(table) *t UNUSED, void *p UNUSED) {} + +void *xml_hash_new(struct mempool *pool, uns size); + +static inline void +xml_start_chars(struct xml_context *ctx) +{ + struct fastbuf *fb = &ctx->chars; + fb->bstop = fb->bptr = 
fb->buffer = mp_start_noalign(ctx->pool, 1); + fb->bufend = fb->buffer + mp_avail(ctx->pool); +} + +static inline char * +xml_end_chars(struct xml_context *ctx, uns *len) +{ + struct fastbuf *fb = &ctx->chars; + uns l = fb->bufend - fb->buffer; + if (fb->bptr == fb->bufend) + fb->bptr = mp_expand(ctx->pool) + l; + *fb->bptr = 0; + char *c = mp_end(ctx->pool, fb->bptr + 1); + fb->bptr = fb->bstop = fb->buffer = fb->bufend = NULL; + *len = l; + return c; +} + +/*** Reading of document/external entities ***/ + +struct xml_source *xml_push_source(struct xml_context *ctx, uns flags); +void xml_push_entity(struct xml_context *ctx, struct xml_dtd_ent *ent); + +void xml_refill(struct xml_context *ctx); + +static inline uns +xml_peek_char(struct xml_context *ctx) +{ + if (ctx->bptr == ctx->bstop) + xml_refill(ctx); + return ctx->bptr[0]; +} + +static inline uns +xml_peek_cat(struct xml_context *ctx) +{ + if (ctx->bptr == ctx->bstop) + xml_refill(ctx); + return ctx->bptr[1]; +} + +static inline uns +xml_get_char(struct xml_context *ctx) +{ + uns c = xml_peek_char(ctx); + ctx->bptr += 2; + return c; +} + +static inline uns +xml_get_cat(struct xml_context *ctx) +{ + uns c = xml_peek_cat(ctx); + ctx->bptr += 2; + return c; +} + +static inline uns +xml_last_char(struct xml_context *ctx) +{ + return ctx->bptr[-2]; +} + +static inline uns +xml_last_cat(struct xml_context *ctx) +{ + return ctx->bptr[-1]; +} + +static inline uns +xml_skip_char(struct xml_context *ctx) +{ + uns c = ctx->bptr[0]; + ctx->bptr += 2; + return c; +} + +static inline uns +xml_unget_char(struct xml_context *ctx) +{ + return *(ctx->bptr -= 2); +} + +/*** Basic parsing ***/ + +void NONRET xml_fatal_expected(struct xml_context *ctx, uns c); +void NONRET xml_fatal_expected_white(struct xml_context *ctx); +void NONRET xml_fatal_expected_quot(struct xml_context *ctx); + +static inline uns +xml_parse_white(struct xml_context *ctx, uns mandatory) +{ + /* mandatory=1 -> S ::= (#x20 | #x9 | #xD | #xA)+ + * 
mandatory=0 -> S? */ + uns cnt = 0; + while (xml_peek_cat(ctx) & XML_CHAR_WHITE) + { + xml_skip_char(ctx); + cnt++; + } + if (unlikely(mandatory && !cnt)) + xml_fatal_expected_white(ctx); + return cnt; +} + +static inline void +xml_parse_char(struct xml_context *ctx, uns c) +{ + /* Consumes a given Unicode character */ + if (unlikely(c != xml_get_char(ctx))) + xml_fatal_expected(ctx, c); +} + +static inline void +xml_parse_seq(struct xml_context *ctx, const char *seq) +{ + /* Consumes a given sequence of ASCII characters */ + while (*seq) + xml_parse_char(ctx, *seq++); +} + +void xml_parse_eq(struct xml_context *ctx); + +static inline uns +xml_parse_quote(struct xml_context *ctx) +{ + /* "'" | '"' */ + uns c = xml_get_char(ctx); + if (unlikely(c != '\'' && c != '\"')) + xml_fatal_expected_quot(ctx); + return c; +} + +/* Names and nmtokens */ + +char *xml_parse_name(struct xml_context *ctx, struct mempool *pool); +void xml_skip_name(struct xml_context *ctx); +char *xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool); + +/* Simple literals */ + +char *xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool); +char *xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool); + +/* Parsing */ + +uns xml_parse_char_ref(struct xml_context *ctx); +void xml_parse_ref(struct xml_context *ctx); +void xml_parse_pe_ref(struct xml_context *ctx); +char *xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr); +void xml_parse_notation_decl(struct xml_context *ctx); +void xml_parse_entity_decl(struct xml_context *ctx); +void xml_parse_element_decl(struct xml_context *ctx); +void xml_parse_attr_list_decl(struct xml_context *ctx); +void xml_push_comment(struct xml_context *ctx); +void xml_pop_comment(struct xml_context *ctx); +void xml_skip_comment(struct xml_context *ctx); +void xml_push_pi(struct xml_context *ctx); +void xml_pop_pi(struct xml_context *ctx); +void xml_skip_pi(struct xml_context *ctx); + +#endif diff --git 
a/sherlock/xml/dtd.c b/sherlock/xml/dtd.c new file mode 100644 index 00000000..07f030a4 --- /dev/null +++ b/sherlock/xml/dtd.c @@ -0,0 +1,823 @@ +/* + * Sherlock Library -- A simple XML parser + * + * (c) 2007 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#define LOCAL_DEBUG + +#include "sherlock/sherlock.h" +#include "sherlock/xml/xml.h" +#include "sherlock/xml/dtd.h" +#include "sherlock/xml/common.h" +#include "lib/fastbuf.h" +#include "lib/ff-unicode.h" + +/* Notations */ + +#define HASH_PREFIX(x) xml_dtd_notns_##x +#define HASH_NODE struct xml_dtd_notn +#define HASH_KEY_STRING name +#define HASH_ZERO_FILL +#define HASH_TABLE_DYNAMIC +#define HASH_WANT_FIND +#define HASH_WANT_LOOKUP +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include "lib/hashtable.h" + +/* General entities */ + +#define HASH_PREFIX(x) xml_dtd_ents_##x +#define HASH_NODE struct xml_dtd_ent +#define HASH_KEY_STRING name +#define HASH_ZERO_FILL +#define HASH_TABLE_DYNAMIC +#define HASH_WANT_FIND +#define HASH_WANT_LOOKUP +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include "lib/hashtable.h" + +static struct xml_dtd_ent * +xml_dtd_declare_trivial_gent(struct xml_context *ctx, char *name, char *text) +{ + struct xml_dtd *dtd = ctx->dtd; + struct xml_dtd_ent *ent = xml_dtd_ents_lookup(dtd->tab_gents, name); + if (ent->flags & XML_DTD_ENT_DECLARED) + { + xml_warn(ctx, "Entity &%s; already declared", name); + return NULL; + } + slist_add_tail(&dtd->gents, &ent->n); + ent->flags = XML_DTD_ENT_DECLARED | XML_DTD_ENT_TRIVIAL; + ent->text = text; + ent->len = strlen(text); + return ent; +} + +static void +xml_dtd_declare_default_gents(struct xml_context *ctx) +{ + xml_dtd_declare_trivial_gent(ctx, "lt", "<"); + xml_dtd_declare_trivial_gent(ctx, "gt", ">"); + xml_dtd_declare_trivial_gent(ctx, "amp", "&"); + xml_dtd_declare_trivial_gent(ctx, "apos", 
"'"); + xml_dtd_declare_trivial_gent(ctx, "quot", "\""); +} + +struct xml_dtd_ent * +xml_dtd_find_gent(struct xml_context *ctx, char *name) +{ + struct xml_dtd *dtd = ctx->dtd; + if (dtd) + { + struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_gents, name); + return !ent ? NULL : (ent->flags & XML_DTD_ENT_DECLARED) ? ent : NULL; + } + else + { +#define ENT(n, t) ent_##n = { .name = #n, .text = t, .len = 1, .flags = XML_DTD_ENT_DECLARED | XML_DTD_ENT_TRIVIAL } + static struct xml_dtd_ent ENT(lt, "<"), ENT(gt, ">"), ENT(amp, "&"), ENT(apos, "'"), ENT(quot, "\""); +#undef ENT + switch (name[0]) + { + case 'l': + if (!strcmp(name, "lt")) + return &ent_lt; + break; + case 'g': + if (!strcmp(name, "gt")) + return &ent_gt; + break; + case 'a': + if (!strcmp(name, "amp")) + return &ent_amp; + if (!strcmp(name, "apos")) + return &ent_apos; + break; + case 'q': + if (!strcmp(name, "quot")) + return &ent_quot; + break; + } + return NULL; + } +} + +/* Parameter entities */ + +static struct xml_dtd_ent * +xml_dtd_find_pent(struct xml_context *ctx, char *name) +{ + struct xml_dtd *dtd = ctx->dtd; + struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_pents, name); + return !ent ? NULL : (ent->flags & XML_DTD_ENT_DECLARED) ? 
ent : NULL; +} + +/* Elements */ + +#define HASH_PREFIX(x) xml_dtd_elems_##x +#define HASH_NODE struct xml_dtd_elem +#define HASH_KEY_STRING name +#define HASH_TABLE_DYNAMIC +#define HASH_ZERO_FILL +#define HASH_WANT_LOOKUP +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include "lib/hashtable.h" + +/* Element sons */ + +struct xml_dtd_enodes_table; + +static inline uns +xml_dtd_enodes_hash(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem) +{ + return hash_pointer(parent) ^ hash_pointer(elem); +} + +static inline int +xml_dtd_enodes_eq(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *parent1, struct xml_dtd_elem *elem1, struct xml_dtd_elem_node *parent2, struct xml_dtd_elem *elem2) +{ + return (parent1 == parent2) && (elem1 == elem2); +} + +static inline void +xml_dtd_enodes_init_key(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *node, struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem) +{ + node->parent = parent; + node->elem = elem; +} + +#define HASH_PREFIX(x) xml_dtd_enodes_##x +#define HASH_NODE struct xml_dtd_elem_node +#define HASH_KEY_COMPLEX(x) x parent, x elem +#define HASH_KEY_DECL struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem +#define HASH_GIVE_HASHFN +#define HASH_GIVE_EQ +#define HASH_GIVE_INIT_KEY +#define HASH_TABLE_DYNAMIC +#define HASH_ZERO_FILL +#define HASH_WANT_FIND +#define HASH_WANT_NEW +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include "lib/hashtable.h" + +/* Element attributes */ + +struct xml_dtd_attrs_table; + +static inline uns +xml_dtd_attrs_hash(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem, char *name) +{ + return hash_pointer(elem) ^ hash_string(name); +} + +static inline int +xml_dtd_attrs_eq(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem1, char *name1, struct xml_dtd_elem *elem2, char *name2) +{ + return (elem1 == elem2) && 
!strcmp(name1, name2); +} + +static inline void +xml_dtd_attrs_init_key(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_elem *elem, char *name) +{ + attr->elem = elem; + attr->name = name; +} + +#define HASH_PREFIX(x) xml_dtd_attrs_##x +#define HASH_NODE struct xml_dtd_attr +#define HASH_ZERO_FILL +#define HASH_TABLE_DYNAMIC +#define HASH_KEY_COMPLEX(x) x elem, x name +#define HASH_KEY_DECL struct xml_dtd_elem *elem, char *name +#define HASH_GIVE_HASHFN +#define HASH_GIVE_EQ +#define HASH_GIVE_INIT_KEY +#define HASH_WANT_FIND +#define HASH_WANT_NEW +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include "lib/hashtable.h" + +/* Enumerated attribute values */ + +struct xml_dtd_evals_table; + +static inline uns +xml_dtd_evals_hash(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr, char *val) +{ + return hash_pointer(attr) ^ hash_string(val); +} + +static inline int +xml_dtd_evals_eq(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr1, char *val1, struct xml_dtd_attr *attr2, char *val2) +{ + return (attr1 == attr2) && !strcmp(val1, val2); +} + +static inline void +xml_dtd_evals_init_key(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_eval *eval, struct xml_dtd_attr *attr, char *val) +{ + eval->attr = attr; + eval->val = val; +} + +#define HASH_PREFIX(x) xml_dtd_evals_##x +#define HASH_NODE struct xml_dtd_eval +#define HASH_TABLE_DYNAMIC +#define HASH_KEY_COMPLEX(x) x attr, x val +#define HASH_KEY_DECL struct xml_dtd_attr *attr, char *val +#define HASH_GIVE_HASHFN +#define HASH_GIVE_EQ +#define HASH_GIVE_INIT_KEY +#define HASH_WANT_FIND +#define HASH_WANT_NEW +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include "lib/hashtable.h" + +/* Enumerated attribute notations */ + +struct xml_dtd_enotns_table; + +static inline uns +xml_dtd_enotns_hash(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn) +{ + return 
hash_pointer(attr) ^ hash_pointer(notn); +} + +static inline int +xml_dtd_enotns_eq(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr1, struct xml_dtd_notn *notn1, struct xml_dtd_attr *attr2, struct xml_dtd_notn *notn2) +{ + return (attr1 == attr2) && (notn1 == notn2); +} + +static inline void +xml_dtd_enotns_init_key(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_enotn *enotn, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn) +{ + enotn->attr = attr; + enotn->notn = notn; +} + +#define HASH_PREFIX(x) xml_dtd_enotns_##x +#define HASH_NODE struct xml_dtd_enotn +#define HASH_TABLE_DYNAMIC +#define HASH_KEY_COMPLEX(x) x attr, x notn +#define HASH_KEY_DECL struct xml_dtd_attr *attr, struct xml_dtd_notn *notn +#define HASH_GIVE_HASHFN +#define HASH_GIVE_EQ +#define HASH_GIVE_INIT_KEY +#define HASH_WANT_FIND +#define HASH_WANT_NEW +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include "lib/hashtable.h" + +/* DTD initialization/cleanup */ + +void +xml_dtd_init(struct xml_context *ctx) +{ + if (ctx->dtd) + return; + struct mempool *pool = mp_new(4096); + struct xml_dtd *dtd = ctx->dtd = mp_alloc_zero(pool, sizeof(*ctx->dtd)); + dtd->pool = pool; + xml_dtd_ents_init(dtd->tab_gents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table))); + xml_dtd_ents_init(dtd->tab_pents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table))); + xml_dtd_notns_init(dtd->tab_notns = xml_hash_new(pool, sizeof(struct xml_dtd_notns_table))); + xml_dtd_elems_init(dtd->tab_elems = xml_hash_new(pool, sizeof(struct xml_dtd_elems_table))); + xml_dtd_enodes_init(dtd->tab_enodes = xml_hash_new(pool, sizeof(struct xml_dtd_enodes_table))); + xml_dtd_attrs_init(dtd->tab_attrs = xml_hash_new(pool, sizeof(struct xml_dtd_attrs_table))); + xml_dtd_evals_init(dtd->tab_evals = xml_hash_new(pool, sizeof(struct xml_dtd_evals_table))); + xml_dtd_enotns_init(dtd->tab_enotns = xml_hash_new(pool, sizeof(struct xml_dtd_enotns_table))); + 
xml_dtd_declare_default_gents(ctx); +} + +void +xml_dtd_cleanup(struct xml_context *ctx) +{ + if (!ctx->dtd) + return; + mp_delete(ctx->dtd->pool); + ctx->dtd = NULL; +} + +void +xml_dtd_finish(struct xml_context *ctx) +{ + if (!ctx->dtd) + return; + // FIXME: validity checks +} + +/*** Parsing functions ***/ + +/* References to parameter entities */ + +void +xml_parse_pe_ref(struct xml_context *ctx) +{ + /* PEReference ::= '%' Name ';' + * Already parsed: '%' */ + struct mempool_state state; + mp_save(ctx->stack, &state); + char *name = xml_parse_name(ctx, ctx->stack); + xml_parse_char(ctx, ';'); + struct xml_dtd_ent *ent = xml_dtd_find_pent(ctx, name); + if (!ent) + xml_error(ctx, "Unknown entity %%%s;", name); + else + { + TRACE(ctx, "Pushed entity %%%s;", name); + mp_restore(ctx->stack, &state); + xml_dec(ctx); + xml_push_entity(ctx, ent); + return; + } + mp_restore(ctx->stack, &state); + xml_dec(ctx); +} + +static void +xml_parse_dtd_pe(struct xml_context *ctx) +{ + do + { + xml_skip_char(ctx); + xml_inc(ctx); + while (xml_peek_cat(ctx) & XML_CHAR_WHITE) + xml_skip_char(ctx); + xml_parse_pe_ref(ctx); + } + while (xml_peek_char(ctx) != '%'); +} + +static inline uns +xml_parse_dtd_white(struct xml_context *ctx, uns mandatory) +{ + /* Whitespace or parameter entity */ + uns cnt = 0; + while (xml_peek_cat(ctx) & XML_CHAR_WHITE) + { + xml_skip_char(ctx); + cnt = 1; + } + if (xml_peek_char(ctx) == '%') + { + xml_parse_dtd_pe(ctx); + return 1; + } + else if (unlikely(mandatory && !cnt)) + xml_fatal_expected_white(ctx); + return cnt; +} + +static void +xml_dtd_parse_external_id(struct xml_context *ctx, struct xml_ext_id *eid, uns allow_public) +{ + struct xml_dtd *dtd = ctx->dtd; + bzero(eid, sizeof(*eid)); + uns c = xml_peek_char(ctx); + if (c == 'S') + { + xml_parse_seq(ctx, "SYSTEM"); + xml_parse_dtd_white(ctx, 1); + eid->system_id = xml_parse_system_literal(ctx, dtd->pool); + } + else if (c == 'P') + { + xml_parse_seq(ctx, "PUBLIC"); + xml_parse_dtd_white(ctx, 1); 
+ eid->public_id = xml_parse_pubid_literal(ctx, dtd->pool); + if (xml_parse_dtd_white(ctx, 0)) // FIXME + if ((c = xml_peek_char(ctx)) == '\'' || c == '"' || !allow_public) + eid->system_id = xml_parse_system_literal(ctx, dtd->pool); + } + else + xml_fatal(ctx, "Expected an external ID"); +} + +/* DTD: */ + +void +xml_parse_notation_decl(struct xml_context *ctx) +{ + /* NotationDecl ::= '' + * Already parsed: 'dtd; + xml_parse_dtd_white(ctx, 1); + + struct xml_dtd_notn *notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx, dtd->pool)); + xml_parse_dtd_white(ctx, 1); + struct xml_ext_id eid; + xml_dtd_parse_external_id(ctx, &eid, 1); + xml_parse_dtd_white(ctx, 0); + xml_parse_char(ctx, '>'); + + if (notn->flags & XML_DTD_NOTN_DECLARED) + xml_warn(ctx, "Notation %s already declared", notn->name); + else + { + notn->flags = XML_DTD_NOTN_DECLARED; + notn->eid = eid; + slist_add_tail(&dtd->notns, ¬n->n); + } + xml_dec(ctx); +} + +/* DTD: */ + +void +xml_parse_entity_decl(struct xml_context *ctx) +{ + /* Already parsed: 'dtd; + xml_parse_dtd_white(ctx, 1); + + uns flags = (xml_get_char(ctx) == '%') ? XML_DTD_ENT_PARAMETER : 0; + if (flags) + xml_parse_dtd_white(ctx, 1); + else + xml_unget_char(ctx); + + struct xml_dtd_ent *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_gents, xml_parse_name(ctx, dtd->pool)); + slist *list = flags ? 
&dtd->pents : &dtd->gents;
+  xml_parse_dtd_white(ctx, 1);
+  if (ent->flags & XML_DTD_ENT_DECLARED)
+    {
+      xml_fatal(ctx, "Entity &%s; already declared, skipping not implemented", ent->name);
+      // FIXME: should be only warning
+    }
+
+  /* A quote opens an internal entity value; anything else has to start an external ID. */
+  uns c, sep = xml_get_char(ctx);
+  if (sep == '\'' || sep == '"')
+    {
+      /* Internal entity:
+       * EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&'] | PEReference | Reference)* "'" */
+      char *p = mp_start_noalign(dtd->pool, 1);
+      while (1)
+        {
+          if ((c = xml_get_char(ctx)) == sep)
+            break;
+          if (c == '%')
+            {
+              /* FIXME: parameter entity references inside entity values are not supported yet.
+               * Report a parse error instead of aborting the whole process on valid input. */
+              xml_fatal(ctx, "Parameter entity references in entity values are not implemented");
+            }
+          if (c == '&')
+            {
+              xml_inc(ctx);
+              if (xml_peek_char(ctx) != '#')
+                {
+                  /* Bypass references to general entities: store "&name;" verbatim */
+                  struct mempool_state state;
+                  mp_save(ctx->stack, &state);
+                  char *n = xml_parse_name(ctx, ctx->stack);
+                  xml_parse_char(ctx, ';');
+                  xml_dec(ctx);
+                  uns l = strlen(n);
+                  /* '&' + name + ';' + one byte kept free for the terminating zero */
+                  p = mp_spread(dtd->pool, p, 3 + l);
+                  *p++ = '&';
+                  memcpy(p, n, l);
+                  /* Step over the copied name, otherwise ';' would overwrite its first byte */
+                  p += l;
+                  *p++ = ';';
+                  mp_restore(ctx->stack, &state);
+                  continue;
+                }
+              else
+                {
+                  /* Character reference: expand it right away */
+                  xml_skip_char(ctx);
+                  c = xml_parse_char_ref(ctx);
+                }
+            }
+          p = mp_spread(dtd->pool, p, 5);
+          p = utf8_32_put(p, c);
+        }
+      *p = 0;
+      ent->len = p - (char *)mp_ptr(dtd->pool);
+      ent->text = mp_end(dtd->pool, p + 1);
+      slist_add_tail(list, &ent->n);
+      ent->flags = flags | XML_DTD_ENT_DECLARED;
+    }
+  else
+    {
+      /* External entity:
+       * EntityDef ::= EntityValue | (ExternalID NDataDecl?)
+       * PEDef ::= EntityValue | ExternalID
+       * NDataDecl ::= S 'NDATA' S Name */
+      struct xml_ext_id eid;
+      struct xml_dtd_notn *notn = NULL;
+      /* The character fetched into sep is the first letter of SYSTEM/PUBLIC,
+       * xml_dtd_parse_external_id() peeks at it, so it has to be given back */
+      xml_unget_char(ctx);
+      xml_dtd_parse_external_id(ctx, &eid, 0);
+      /* Only general entities (flags == 0) may carry an NDATA declaration.
+       * The closing '>' is left for the common tail of this function. */
+      if (xml_parse_dtd_white(ctx, 0) && !flags && xml_peek_char(ctx) != '>')
+        {
+          /* General external unparsed entity */
+          flags |= XML_DTD_ENT_UNPARSED;
+          xml_parse_seq(ctx, "NDATA");
+          xml_parse_dtd_white(ctx, 1);
+          notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx, dtd->pool));
+        }
+      slist_add_tail(list, &ent->n);
+      ent->flags = flags | XML_DTD_ENT_DECLARED | XML_DTD_ENT_EXTERNAL;
+      ent->eid = eid;
+      ent->notn = notn;
+    }
+
xml_parse_dtd_white(ctx, 0); + xml_parse_char(ctx, '>'); + xml_dec(ctx); +} + +/* DTD: */ + +void +xml_parse_element_decl(struct xml_context *ctx) +{ + /* Elementdecl ::= '' + * Already parsed: 'dtd; + xml_parse_dtd_white(ctx, 1); + char *name = xml_parse_name(ctx, dtd->pool); + xml_parse_dtd_white(ctx, 1); + struct xml_dtd_elem *elem = xml_dtd_elems_lookup(dtd->tab_elems, name); + if (elem->flags & XML_DTD_ELEM_DECLARED) + xml_fatal(ctx, "Element <%s> already declared", name); + + /* contentspec ::= 'EMPTY' | 'ANY' | Mixed | children */ + uns c = xml_peek_char(ctx); + if (c == 'E') + { + xml_parse_seq(ctx, "EMPTY"); + elem->type = XML_DTD_ELEM_EMPTY; + } + else if (c == 'A') + { + xml_parse_seq(ctx, "ANY"); + elem->type = XML_DTD_ELEM_ANY; + } + else if (c == '(') + { + xml_skip_char(ctx); + xml_inc(ctx); + xml_parse_dtd_white(ctx, 0); + struct xml_dtd_elem_node *parent = elem->node = mp_alloc_zero(dtd->pool, sizeof(*parent)); + if (xml_peek_char(ctx) == '#') + { + /* Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | '(' S? '#PCDATA' S? 
')' */ + xml_skip_char(ctx); + xml_parse_seq(ctx, "PCDATA"); + elem->type = XML_DTD_ELEM_MIXED; + parent->type = XML_DTD_ELEM_PCDATA; + while (1) + { + xml_parse_dtd_white(ctx, 0); + if ((c = xml_get_char(ctx)) == ')') + break; + else if (c != '|') + xml_fatal_expected(ctx, ')'); + xml_parse_dtd_white(ctx, 0); + struct xml_dtd_elem *son_elem = xml_dtd_elems_lookup(dtd->tab_elems, xml_parse_name(ctx, dtd->pool)); + if (xml_dtd_enodes_find(dtd->tab_enodes, parent, son_elem)) + xml_error(ctx, "Duplicate content '%s'", son_elem->name); + else + { + struct xml_dtd_elem_node *son = xml_dtd_enodes_new(dtd->tab_enodes, parent, son_elem); + slist_add_tail(&parent->sons, &son->n); + } + } + xml_dec(ctx); + if (xml_peek_char(ctx) == '*') + { + xml_skip_char(ctx); + parent->occur = XML_DTD_ELEM_OCCUR_MULT; + } + else if (!slist_head(&parent->sons)) + parent->occur = XML_DTD_ELEM_OCCUR_ONCE; + else + xml_fatal_expected(ctx, '*'); + } + else + { + /* children ::= (choice | seq) ('?' | '*' | '+')? + * cp ::= (Name | choice | seq) ('?' | '*' | '+')? + * choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')' + * seq ::= '(' S? cp ( S? ',' S? cp )* S? 
')' */
+
+          elem->type = XML_DTD_ELEM_CHILDREN;
+          /* XML_DTD_ELEM_PCDATA marks a group whose operator ('|' or ',') has not been seen yet */
+          parent->type = XML_DTD_ELEM_PCDATA;
+          uns c;
+          while (1)
+            {
+              /* Before a content particle: either a nested group or an element name */
+              xml_parse_dtd_white(ctx, 0);
+              if (xml_peek_char(ctx) == '(')
+                {
+                  /* Open a nested group and continue parsing inside of it */
+                  xml_skip_char(ctx);
+                  xml_inc(ctx);
+                  struct xml_dtd_elem_node *son = mp_alloc_zero(dtd->pool, sizeof(*son));
+                  son->parent = parent;
+                  son->type = XML_DTD_ELEM_PCDATA;
+                  slist_add_tail(&parent->sons, &son->n);
+                  parent = son;
+                  continue;
+                }
+              struct xml_dtd_elem *son_elem = xml_dtd_elems_lookup(dtd->tab_elems, xml_parse_name(ctx, dtd->pool));
+              /* FIXME: duplicates */
+              struct xml_dtd_elem_node *son = mp_alloc_zero(dtd->pool, sizeof(*son));
+              son->parent = parent;
+              son->elem = son_elem;
+              slist_add_tail(&parent->sons, &son->n);
+              /* Optional occurrence indicator immediately after the name */
+              if ((c = xml_get_char(ctx)) == '?')
+                son->occur = XML_DTD_ELEM_OCCUR_OPT;
+              else if (c == '*')
+                son->occur = XML_DTD_ELEM_OCCUR_MULT;
+              else if (c == '+')
+                son->occur = XML_DTD_ELEM_OCCUR_PLUS;
+              else
+                {
+                  xml_unget_char(ctx);
+                  son->occur = XML_DTD_ELEM_OCCUR_ONCE;
+                }
+
+              /* After a content particle: close all finished groups, then expect a separator */
+              while (1)
+                {
+                  xml_parse_dtd_white(ctx, 0);
+                  if ((c = xml_get_char(ctx)) != ')')
+                    break;
+                  xml_dec(ctx);
+                  /* A group without any separator is a sequence of a single particle */
+                  if (parent->type == XML_DTD_ELEM_PCDATA)
+                    parent->type = XML_DTD_ELEM_SEQ;
+                  if ((c = xml_get_char(ctx)) == '?')
+                    parent->occur = XML_DTD_ELEM_OCCUR_OPT;
+                  else if (c == '*')
+                    parent->occur = XML_DTD_ELEM_OCCUR_MULT;
+                  else if (c == '+')
+                    parent->occur = XML_DTD_ELEM_OCCUR_PLUS;
+                  else
+                    {
+                      xml_unget_char(ctx);
+                      parent->occur = XML_DTD_ELEM_OCCUR_ONCE;
+                    }
+                  /* The outermost group has no parent: the content model is complete */
+                  if (!parent->parent)
+                    goto children_done;
+                  parent = parent->parent;
+                }
+              if (c == '|')
+                {
+                  if (parent->type == XML_DTD_ELEM_PCDATA)
+                    parent->type = XML_DTD_ELEM_OR;
+                  else if (parent->type != XML_DTD_ELEM_OR)
+                    xml_fatal(ctx, "Mixed operators in the list of element children");
+                }
+              else if (c == ',')
+                {
+                  if (parent->type == XML_DTD_ELEM_PCDATA)
+                    parent->type = XML_DTD_ELEM_SEQ;
+                  else if (parent->type != XML_DTD_ELEM_SEQ)
+                    xml_fatal(ctx, "Mixed operators in the list of element children");
+                }
+              else
+                xml_fatal_expected(ctx, ')');
+            }
+children_done:;
+        }
+    }
+  else
+    xml_fatal(ctx, "Expected element content specification");
+
+  xml_parse_dtd_white(ctx, 0);
+  xml_parse_char(ctx, '>');
+  xml_dec(ctx);
+} + +void +xml_parse_attr_list_decl(struct xml_context *ctx) +{ + /* AttlistDecl ::= '' + * AttDef ::= S Name S AttType S DefaultDecl + * Already parsed: 'dtd; + xml_parse_dtd_white(ctx, 1); + struct xml_dtd_elem *elem = xml_dtd_elems_lookup(ctx->dtd->tab_elems, xml_parse_name(ctx, dtd->pool)); + + while (xml_parse_dtd_white(ctx, 0) && xml_peek_char(ctx) != '>') + { + char *name = xml_parse_name(ctx, dtd->pool); + struct xml_dtd_attr *attr = xml_dtd_attrs_find(dtd->tab_attrs, elem, name); + uns ignored = 0; + if (attr) + { + xml_warn(ctx, "Duplicate attribute definition"); + ignored++; + } + else + attr = xml_dtd_attrs_new(ctx->dtd->tab_attrs, elem, name); + xml_parse_dtd_white(ctx, 1); + if (xml_peek_char(ctx) == '(') + { + xml_skip_char(ctx); // FIXME: xml_inc/dec ? + if (!ignored) + attr->type = XML_ATTR_ENUM; + do + { + xml_parse_dtd_white(ctx, 0); + char *value = xml_parse_nmtoken(ctx, dtd->pool); + if (!ignored) + if (xml_dtd_evals_find(ctx->dtd->tab_evals, attr, value)) + xml_error(ctx, "Duplicate enumeration value"); + else + xml_dtd_evals_new(ctx->dtd->tab_evals, attr, value); + xml_parse_dtd_white(ctx, 0); + } + while (xml_get_char(ctx) == '|'); + xml_unget_char(ctx); + xml_parse_char(ctx, ')'); + } + else + { + char *type = xml_parse_name(ctx, dtd->pool); + enum xml_dtd_attribute_type t = XML_ATTR_CDATA; + if (!strcmp(type, "CDATA")) + t = XML_ATTR_CDATA; + else if (!strcmp(type, "ID")) + t = XML_ATTR_ID; + else if (!strcmp(type, "IDREF")) + t = XML_ATTR_IDREF; + else if (!strcmp(type, "IDREFS")) + t = XML_ATTR_IDREFS; + else if (!strcmp(type, "ENTITY")) + t = XML_ATTR_ENTITY; + else if (!strcmp(type, "ENTITIES")) + t = XML_ATTR_ENTITIES; + else if (!strcmp(type, "NMTOKEN")) + t = XML_ATTR_NMTOKEN; + else if (!strcmp(type, "NMTOKENS")) + t = XML_ATTR_NMTOKENS; + else if (!strcmp(type, "NOTATION")) + { + if (elem->type == XML_DTD_ELEM_EMPTY) + xml_fatal(ctx, "Empty element must not have notation attribute"); + // FIXME: An element type MUST NOT have more 
than one NOTATION attribute specified. + t = XML_ATTR_NOTATION; + xml_parse_dtd_white(ctx, 1); + xml_parse_char(ctx, '('); + do + { + xml_parse_dtd_white(ctx, 0); + struct xml_dtd_notn *n = xml_dtd_notns_lookup(ctx->dtd->tab_notns, xml_parse_name(ctx, dtd->pool)); + if (!ignored) + if (xml_dtd_enotns_find(ctx->dtd->tab_enotns, attr, n)) + xml_error(ctx, "Duplicate enumerated notation"); + else + xml_dtd_enotns_new(ctx->dtd->tab_enotns, attr, n); + xml_parse_dtd_white(ctx, 0); + } + while (xml_get_char(ctx) == '|'); + xml_unget_char(ctx); + xml_parse_char(ctx, ')'); + } + else + xml_fatal(ctx, "Unknown attribute type"); + if (!ignored) + attr->type = t; + } + xml_parse_dtd_white(ctx, 1); + enum xml_dtd_attribute_default def = XML_ATTR_NONE; + if (xml_get_char(ctx) == '#') + switch (xml_peek_char(ctx)) + { + case 'R': + xml_parse_seq(ctx, "REQUIRED"); + def = XML_ATTR_REQUIRED; + break; + case 'I': + xml_parse_seq(ctx, "IMPLIED"); + def = XML_ATTR_IMPLIED; + break; + case 'F': + xml_parse_seq(ctx, "FIXED"); + def = XML_ATTR_FIXED; + xml_parse_dtd_white(ctx, 1); + break; + default: + xml_fatal(ctx, "Expected a modifier for default attribute value"); + } + else + xml_unget_char(ctx); + if (def != XML_ATTR_REQUIRED && def != XML_ATTR_IMPLIED) + { + char *v = xml_parse_attr_value(ctx, attr); + if (!ignored) + attr->default_value = v; + } + if (!ignored) + attr->default_mode = def; + } + xml_skip_char(ctx); + xml_dec(ctx); +} diff --git a/sherlock/xml/dtd.h b/sherlock/xml/dtd.h index bf95b872..9b4c6d98 100644 --- a/sherlock/xml/dtd.h +++ b/sherlock/xml/dtd.h @@ -62,6 +62,8 @@ struct xml_dtd_ent { struct xml_dtd_notn *notn; /* Notation (XML_DTD_ENT_UNPARSED only) */ }; +struct xml_dtd_ent *xml_dtd_find_gent(struct xml_context *ctx, char *name); + /* Elements */ enum xml_dtd_elem_flags { @@ -145,4 +147,8 @@ struct xml_dtd_enotn { struct xml_dtd_notn *notn; }; +void xml_dtd_init(struct xml_context *ctx); +void xml_dtd_cleanup(struct xml_context *ctx); +void 
xml_dtd_finish(struct xml_context *ctx); + #endif diff --git a/sherlock/xml/parse.c b/sherlock/xml/parse.c new file mode 100644 index 00000000..6f2e7e00 --- /dev/null +++ b/sherlock/xml/parse.c @@ -0,0 +1,1004 @@ +/* + * Sherlock Library -- A simple XML parser + * + * (c) 2007 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#define LOCAL_DEBUG + +#include "sherlock/sherlock.h" +#include "sherlock/xml/xml.h" +#include "sherlock/xml/dtd.h" +#include "sherlock/xml/common.h" +#include "lib/fastbuf.h" +#include "lib/ff-unicode.h" +#include "lib/unicode.h" +#include "lib/chartype.h" +#include "lib/hashfunc.h" + +#include + +/*** Comments ***/ + +void +xml_push_comment(struct xml_context *ctx) +{ + TRACE(ctx, "push_comment"); + /* Comment ::= '' + * Already parsed: 'type = XML_NODE_COMMENT; + char *p = mp_start_noalign(ctx->pool, 6); + while (1) + { + if (xml_get_char(ctx) == '-') + if (xml_get_char(ctx) == '-') + break; + else + *p++ = '-'; + p = utf8_32_put(p, xml_last_char(ctx)); + p = mp_spread(ctx->pool, p, 6); + } + xml_parse_char(ctx, '>'); + *p = 0; + n->len = p - (char *)mp_ptr(ctx->pool); + n->text = mp_end(ctx->pool, p + 1); + if (ctx->h_comment) + ctx->h_comment(ctx); +} + +void +xml_pop_comment(struct xml_context *ctx) +{ + xml_pop_dom(ctx); + xml_dec(ctx); + TRACE(ctx, "pop_comment"); +} + +void +xml_skip_comment(struct xml_context *ctx) +{ + TRACE(ctx, "skip_comment"); + xml_parse_char(ctx, '-'); + while (xml_get_char(ctx) != '-' || xml_get_char(ctx) != '-'); + xml_parse_char(ctx, '>'); + xml_dec(ctx); +} + +/*** Processing instructions ***/ + +void +xml_push_pi(struct xml_context *ctx) +{ + TRACE(ctx, "push_pi"); + /* Parses a PI to ctx->value and ctx->name: + * PI ::= '' Char*)))? 
'?>' + * PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) + * Already parsed: 'type = XML_NODE_PI; + n->name = xml_parse_name(ctx, ctx->pool); + if (unlikely(!strcasecmp(n->name, "xml"))) + xml_error(ctx, "Reserved PI target"); + char *p = mp_start_noalign(ctx->pool, 5); + if (!xml_parse_white(ctx, 0)) + xml_parse_seq(ctx, "?>"); + else + while (1) + { + if (xml_get_char(ctx) == '?') + if (xml_peek_char(ctx) == '>') + { + xml_skip_char(ctx); + break; + } + else + *p++ = '?'; + else + p = utf8_32_put(p, xml_last_char(ctx)); + p = mp_spread(ctx->pool, p, 5); + } + *p = 0; + n->len = p - (char *)mp_ptr(ctx->pool); + n->text = mp_end(ctx->pool, p + 1); + if (ctx->h_pi) + ctx->h_pi(ctx); +} + +void +xml_pop_pi(struct xml_context *ctx) +{ + xml_pop_dom(ctx); + xml_dec(ctx); + TRACE(ctx, "pop_pi"); +} + +void +xml_skip_pi(struct xml_context *ctx) +{ + TRACE(ctx, "skip_pi"); + if (ctx->flags & XML_FLAG_VALIDATING) + { + struct mempool_state state; + mp_save(ctx->stack, &state); + if (unlikely(!strcasecmp(xml_parse_name(ctx, ctx->stack), "xml"))) + xml_error(ctx, "Reserved PI target"); + mp_restore(ctx->stack, &state); + if (!xml_parse_white(ctx, 0)) + { + xml_parse_seq(ctx, "?>"); + xml_dec(ctx); + return; + } + } + while (1) + if (xml_get_char(ctx) == '?') + if (xml_peek_char(ctx) == '>') + break; + xml_skip_char(ctx); + xml_dec(ctx); +} + +/*** Character data ***/ + +static void +xml_chars_spout(struct fastbuf *fb) +{ + if (fb->bptr >= fb->bufend) + { + struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb); + struct mempool *pool = ctx->pool; + if (fb->bufend != fb->buffer) + { + uns len = fb->bufend - fb->buffer; + TRACE(ctx, "grow_chars"); + fb->buffer = mp_expand(pool); + fb->bufend = fb->buffer + mp_avail(pool); + fb->bstop = fb->buffer; + fb->bptr = fb->buffer + len; + } + else + { + TRACE(ctx, "push_chars"); + struct xml_node *n = xml_push_dom(ctx); + n->type = XML_NODE_CDATA; + xml_start_chars(ctx); + } + } +} + +static void 
+xml_init_chars(struct xml_context *ctx)
+{
+  /* Set up ctx->chars as an empty fastbuf which is refilled by xml_chars_spout() */
+  struct fastbuf *fb = &ctx->chars;
+  fb->name = "";
+  fb->spout = xml_chars_spout;
+  fb->can_overwrite_buffer = 1;
+  fb->bptr = fb->bstop = fb->buffer = fb->bufend = NULL;
+}
+
+/* Finish the pending character data (if any), store it to the current node
+ * and call the h_chars handler. Returns 1 if there was anything to flush. */
+static inline uns
+xml_flush_chars(struct xml_context *ctx)
+{
+  struct fastbuf *fb = &ctx->chars;
+  /* buffer == bufend means that no chunk of characters has been started */
+  if (fb->bufend == fb->buffer)
+    return 0;
+  TRACE(ctx, "flush_chars");
+  struct xml_node *n = ctx->node;
+  /* xml_end_chars() reports the length itself. Recomputing it from the fastbuf
+   * afterwards measured the buffer, not the text, and overwrote the real value. */
+  n->text = xml_end_chars(ctx, &n->len);
+  if (ctx->h_chars)
+    ctx->h_chars(ctx);
+  return 1;
+}
+
+/* Leave the node of character data */
+static inline void
+xml_pop_chars(struct xml_context *ctx)
+{
+  xml_pop_dom(ctx);
+  TRACE(ctx, "pop_chars");
+}
+
+/* Copy characters up to the next '<' to ctx->chars, resolving references on the way.
+ * The '<' itself is returned to the input. */
+static inline void
+xml_append_chars(struct xml_context *ctx)
+{
+  TRACE(ctx, "append_chars");
+  struct fastbuf *out = &ctx->chars;
+  while (xml_get_char(ctx) != '<')
+    if (xml_last_char(ctx) == '&')
+      {
+        xml_inc(ctx);
+        xml_parse_ref(ctx);
+      }
+    else
+      bput_utf8_32(out, xml_last_char(ctx));
+  xml_unget_char(ctx);
+}
+
+/*** CDATA sections ***/
+
+static void
+xml_push_cdata(struct xml_context *ctx)
+{
+  TRACE(ctx, "push_cdata");
+  /* CDSect :== '<![CDATA[' (Char* - (Char* ']]>' Char*)) ']]>'
+   * Already parsed: '<![' */
+  /* NOTE(review): the comment above and the next two statements were mangled in the
+   * scraped patch; they are reconstructed from xml_append_cdata() and xml_chars_spout()
+   * -- confirm against the repository. */
+  xml_parse_seq(ctx, "CDATA[");
+  struct xml_node *n = xml_push_dom(ctx);
+  n->type = XML_NODE_CDATA;
+  char *p = mp_start_noalign(ctx->pool, 7);
+  /* Number of ']' read but not stored yet (0..2); they may belong to the final "]]>" */
+  uns brackets = 0;
+  while (1)
+    {
+      uns c = xml_get_char(ctx);
+      if (c == ']')
+        {
+          if (brackets < 2)
+            brackets++;
+          else
+            /* Run longer than two: the oldest bracket is ordinary data ("]]]>" ends with "]") */
+            *p++ = ']';
+        }
+      else if (c == '>' && brackets == 2)
+        break;
+      else
+        {
+          for (; brackets; brackets--)
+            *p++ = ']';
+          p = utf8_32_put(p, c);
+        }
+      /* At most two brackets and one character were stored; keep room for the next round */
+      p = mp_spread(ctx->pool, p, 7);
+    }
+  *p = 0;
+  n->len = p - (char *)mp_ptr(ctx->pool);
+  n->text = mp_end(ctx->pool, p + 1);
+  if (ctx->h_cdata)
+    ctx->h_cdata(ctx);
+}
+
+static void
+xml_pop_cdata(struct xml_context *ctx)
+{
+  xml_pop_dom(ctx);
+  xml_dec(ctx);
+  TRACE(ctx, "pop_cdata");
+}
+
+/* Append the content of a CDATA section to the pending character data in ctx->chars */
+static void
+xml_append_cdata(struct xml_context *ctx)
+{
+  TRACE(ctx, "append_cdata");
+  xml_parse_seq(ctx, "CDATA[");
+  struct fastbuf *out = &ctx->chars;
+  /* Number of ']' read but not written yet (0..2); they may belong to the final "]]>" */
+  uns brackets = 0;
+  while (1)
+    {
+      uns c = xml_get_char(ctx);
+      if (c == ']')
+        {
+          if (brackets < 2)
+            brackets++;
+          else
+            bputc(out, ']');
+        }
+      else if (c == '>' && brackets == 2)
+        break;
+      else
+        {
+          for (; brackets; brackets--)
+            bputc(out, ']');
+          bput_utf8_32(out, c);
+        }
+    }
+  xml_dec(ctx);
+}
+
+/* Skip a CDATA section without storing its content */
+static void UNUSED
+xml_skip_cdata(struct xml_context *ctx)
+{
+  TRACE(ctx, "skip_cdata");
+  xml_parse_seq(ctx, "CDATA[");
+  /* Count consecutive ']' so that terminators preceded by extra brackets ("]]]>") are found */
+  uns brackets = 0;
+  while (1)
+    {
+      uns c = xml_get_char(ctx);
+      if (c == ']')
+        brackets++;
+      else if (c == '>' && brackets >= 2)
+        break;
+      else
+        brackets = 0;
+    }
+  xml_dec(ctx);
+}
+
+/*** Character references ***/
+
+uns
+xml_parse_char_ref(struct xml_context *ctx)
+{
+  TRACE(ctx, "parse_char_ref");
+  /* CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
+   * Already parsed: '&#' */
+  uns v = 0;
+  if (xml_get_char(ctx) == 'x')
+    {
+      if (!(xml_get_cat(ctx) & XML_CHAR_XDIGIT))
+        {
+          xml_error(ctx, "Expected a hexadecimal value of character reference");
+          goto recover;
+        }
+      /* Stop accumulating as soon as the value leaves the Unicode range */
+      do
+        {
+          v = (v << 4) + Cxvalue(xml_last_char(ctx));
+        }
+      while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_XDIGIT));
+    }
+  else
+    {
+      if (!(xml_last_cat(ctx) & XML_CHAR_DIGIT))
+        {
+          xml_error(ctx, "Expected a numeric value of character reference");
+          goto recover;
+        }
+      do
+        {
+          v = v * 10 + xml_last_char(ctx) - '0';
+        }
+      while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT));
+    }
+  uns cat = xml_char_cat(v);
+  if (!(cat & XML_CHAR_UNRESTRICTED_1_1) && ((ctx->flags & XML_FLAG_VERSION_1_1) || !(cat & XML_CHAR_VALID_1_0)))
+    {
+      xml_error(ctx, "Character reference out of range");
+      goto recover;
+    }
+  if (xml_last_char(ctx) == ';')
+    {
+      xml_dec(ctx);
+      return v;
+    }
+  xml_error(ctx, "Expected ';'");
+recover:
+  /* Skip the rest of the broken reference and substitute the replacement character */
+  while (xml_last_char(ctx) != ';')
+    xml_get_char(ctx);
+  xml_dec(ctx);
+  return UNI_REPLACEMENT;
+}
+
+/*** References to general entities ***/
+
+void
+xml_parse_ref(struct xml_context *ctx)
+{
+  /* Reference ::= EntityRef | CharRef
+   * EntityRef ::= '&' Name ';'
+   * Already parsed: '&' */
+  struct fastbuf *out = &ctx->chars;
+  if (xml_peek_char(ctx) == '#')
+    {
+      xml_skip_char(ctx);
+      bput_utf8_32(out, xml_parse_char_ref(ctx));
+    }
+  else
+    {
+      TRACE(ctx, "parse_ge_ref");
+
struct mempool_state state; + mp_save(ctx->stack, &state); + char *name = xml_parse_name(ctx, ctx->stack); + xml_parse_char(ctx, ';'); + struct xml_dtd_ent *ent = xml_dtd_find_gent(ctx, name); + if (!ent) + { + xml_error(ctx, "Unknown entity &%s;", name); + bputc(out, '&'); + bputs(out, name); + bputc(out, ';'); + } + else if (ent->flags & XML_DTD_ENT_TRIVIAL) + { + TRACE(ctx, "Trivial entity &%s;", name); + bwrite(out, ent->text, ent->len); + } + else + { + TRACE(ctx, "Pushed entity &%s;", name); + mp_restore(ctx->stack, &state); + xml_dec(ctx); + xml_push_entity(ctx, ent); + return; + } + mp_restore(ctx->stack, &state); + xml_dec(ctx); + } +} + +/*** Attribute values ***/ + +char * +xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr UNUSED) +{ + TRACE(ctx, "parse_attr_value"); + /* AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'" */ + /* FIXME: + * -- copying from ctx->chars to ctx->pool is not necessary, we could directly write to ctx->pool + * -- berare quotes inside parased entities + * -- check value constrains / normalize value */ + struct mempool_state state; + uns quote = xml_parse_quote(ctx); + mp_save(ctx->stack, &state); + xml_start_chars(ctx); + struct fastbuf *out = &ctx->chars; + while (1) + { + uns c = xml_get_char(ctx); + if (c == '&') + { + xml_inc(ctx); + xml_parse_ref(ctx); + } + else if (c == quote) // FIXME: beware quotes inside parsed entities + break; + else if (c == '<') + xml_error(ctx, "Attribute value must not contain '<'"); + else if (xml_last_cat(ctx) & XML_CHAR_WHITE) + bputc(out, ' '); + else + bput_utf8_32(out, c); + } + mp_restore(ctx->stack, &state); + uns len; + return xml_end_chars(ctx, &len); +} + +/*** Attributes ***/ + +struct xml_attrs_table; + +static inline uns +xml_attrs_hash(struct xml_attrs_table *t UNUSED, struct xml_node *e, char *n) +{ + return hash_pointer(e) ^ hash_string(n); +} + +static inline int +xml_attrs_eq(struct xml_attrs_table *t UNUSED, struct xml_node *e1, 
char *n1, struct xml_node *e2, char *n2) +{ + return (e1 == e2) && !strcmp(n1, n2); +} + +static inline void +xml_attrs_init_key(struct xml_attrs_table *t UNUSED, struct xml_attr *a, struct xml_node *e, char *name) +{ + a->elem = e; + a->name = name; + a->val = NULL; + slist_add_tail(&e->attrs, &a->n); +} + +#define HASH_PREFIX(x) xml_attrs_##x +#define HASH_NODE struct xml_attr +#define HASH_KEY_COMPLEX(x) x elem, x name +#define HASH_KEY_DECL struct xml_node *elem, char *name +#define HASH_TABLE_DYNAMIC +#define HASH_GIVE_EQ +#define HASH_GIVE_HASHFN +#define HASH_GIVE_INIT_KEY +#define HASH_WANT_CLEANUP +#define HASH_WANT_REMOVE +#define HASH_WANT_LOOKUP +#define HASH_WANT_FIND +#define HASH_GIVE_ALLOC +XML_HASH_GIVE_ALLOC +#include "lib/hashtable.h" + +static void +xml_parse_attr(struct xml_context *ctx) +{ + TRACE(ctx, "parse_attr"); + /* Attribute ::= Name Eq AttValue */ + /* FIXME: + * -- memory management + * -- DTD */ + struct xml_node *e = ctx->node; + char *n = xml_parse_name(ctx, ctx->pool); + struct xml_attr *a = xml_attrs_lookup(ctx->tab_attrs, e, n); + xml_parse_eq(ctx); + char *v = xml_parse_attr_value(ctx, NULL); + if (a->val) + xml_error(ctx, "Attribute %s is not unique", n); + else + a->val = v; +} + +/*** Elements ***/ + +static void +xml_push_element(struct xml_context *ctx) +{ + TRACE(ctx, "push_element"); + /* EmptyElemTag | STag + * EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' + * STag ::= '<' Name (S Attribute)* S? 
'>' + * Already parsed: '<' */ + struct xml_node *e = xml_push_dom(ctx); + clist_init(&e->sons); + e->type = XML_NODE_ELEM; + e->name = xml_parse_name(ctx, ctx->pool); + slist_init(&e->attrs); + if (!e->parent) + { + ctx->root = e; + if (ctx->document_type && strcmp(e->name, ctx->document_type)) + xml_error(ctx, "The root element %s does not match the document type %s", e->name, ctx->document_type); + } + while (1) + { + uns white = xml_parse_white(ctx, 0); + uns c = xml_get_char(ctx); + if (c == '/') + { + xml_parse_char(ctx, '>'); + ctx->flags |= XML_FLAG_EMPTY_ELEM; + break; + } + else if (c == '>') + break; + else if (!white) + xml_fatal_expected_white(ctx); + xml_unget_char(ctx); + xml_parse_attr(ctx); + } + if (ctx->h_element_start) + ctx->h_element_start(ctx); +} + +static void +xml_pop_element(struct xml_context *ctx) +{ + TRACE(ctx, "pop_element"); + if (ctx->h_element_end) + ctx->h_element_end(ctx); + struct xml_node *e = ctx->node; + if (ctx->flags & XML_DOM_FREE) + { + if (!e->parent) + ctx->root = NULL; + /* Restore hash table of attributes */ + SLIST_FOR_EACH(struct xml_attr *, a, e->attrs) + xml_attrs_remove(ctx->tab_attrs, a); + struct xml_node *n; + while (n = clist_head(&e->sons)) + { + if (n->type == XML_NODE_ELEM) + { + SLIST_FOR_EACH(struct xml_attr *, a, n->attrs) + xml_attrs_remove(ctx->tab_attrs, a); + clist_insert_list_after(&n->sons, &n->n); + } + clist_remove(&n->n); + } + } + xml_pop_dom(ctx); + xml_dec(ctx); +} + +static void +xml_parse_etag(struct xml_context *ctx) +{ + /* ETag ::= '' + * Already parsed: '<' */ + struct xml_node *e = ctx->node; + ASSERT(e); + char *n = e->name; + while (*n) + { + uns c; + n = utf8_32_get(n, &c); + if (xml_get_char(ctx) != c) + goto recover; + } + xml_parse_white(ctx, 0); + if (xml_get_char(ctx) != '>') + { +recover: + xml_error(ctx, "Invalid ETag, expected ", e->name); + while (xml_get_char(ctx) != '>'); + } + xml_dec(ctx); +} + +/*** Document type declaration ***/ + +static void 
+xml_parse_doctype_decl(struct xml_context *ctx) +{ + TRACE(ctx, "parse_doctype_decl"); + /* doctypedecl ::= '' + * Already parsed: '' */ + if (ctx->document_type) + xml_fatal(ctx, "Multiple document types not allowed"); + xml_parse_seq(ctx, "DOCTYPE"); + xml_parse_white(ctx, 1); + ctx->document_type = xml_parse_name(ctx, ctx->pool); + TRACE(ctx, "doctyype=%s", ctx->document_type); + uns c; + if (xml_parse_white(ctx, 0) && ((c = xml_peek_char(ctx)) == 'S' || c == 'P')) + { + if (c == 'S') + { + xml_parse_seq(ctx, "SYSTEM"); + xml_parse_white(ctx, 1); + ctx->eid.system_id = xml_parse_system_literal(ctx, ctx->pool); + } + else + { + xml_parse_seq(ctx, "PUBLIC"); + xml_parse_white(ctx, 1); + ctx->eid.public_id = xml_parse_pubid_literal(ctx, ctx->pool); + xml_parse_white(ctx, 1); + ctx->eid.system_id = xml_parse_system_literal(ctx, ctx->pool); + } + xml_parse_white(ctx, 0); + ctx->flags |= XML_FLAG_HAS_EXTERNAL_SUBSET; + } + if (xml_peek_char(ctx) == '[') + ctx->flags |= XML_FLAG_HAS_INTERNAL_SUBSET; + if (ctx->h_doctype_decl) + ctx->h_doctype_decl(ctx); +} + + + +/////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/* DTD: Internal subset */ + +static void +xml_parse_internal_subset(struct xml_context *ctx) +{ + // FIXME: comments/pi have no parent + /* '[' intSubset ']' + * intSubset :== (markupdecl | DeclSep) + * Already parsed: ']' */ + while (1) + { + xml_parse_white(ctx, 0); + uns c = xml_get_char(ctx); + xml_inc(ctx); + if (c == '<') + if ((c = xml_get_char(ctx)) == '!') + switch (c = xml_get_char(ctx)) + { + case '-': + xml_push_comment(ctx); + xml_pop_comment(ctx); + break; + case 'N': + xml_parse_seq(ctx, "OTATION"); + xml_parse_notation_decl(ctx); + break; + case 'E': + if ((c = xml_get_char(ctx)) == 'N') + { + xml_parse_seq(ctx, "TITY"); + xml_parse_entity_decl(ctx); + } + else if (c == 'L') + { + xml_parse_seq(ctx, "EMENT"); + xml_parse_element_decl(ctx); + } + else + goto invalid_markup; + break; + 
case 'A': + xml_parse_seq(ctx, "TTLIST"); + xml_parse_attr_list_decl(ctx); + break; + default: + goto invalid_markup; + } + else if (c == '?') + { + xml_push_pi(ctx); + xml_pop_pi(ctx); + } + else + goto invalid_markup; + else if (c == '%') + xml_parse_pe_ref(ctx); + else if (c == ']') + break; + else + goto invalid_markup; + } + xml_dec(ctx); + xml_dec(ctx); + return; +invalid_markup: + xml_fatal(ctx, "Invalid markup in the internal subset"); +} + + +/*----------------------------------------------*/ + +void +xml_init(struct xml_context *ctx) +{ + bzero(ctx, sizeof(*ctx)); + ctx->pool = mp_new(65536); + ctx->stack = mp_new(65536); + ctx->flags = XML_DOM_FREE; + xml_init_chars(ctx); + xml_dtd_init(ctx); + xml_attrs_init(ctx->tab_attrs = xml_hash_new(ctx->pool, sizeof(struct xml_attrs_table))); +} + +void +xml_cleanup(struct xml_context *ctx) +{ + xml_attrs_cleanup(ctx->tab_attrs); + xml_dtd_cleanup(ctx); + mp_delete(ctx->pool); + mp_delete(ctx->stack); +} + +int +xml_next(struct xml_context *ctx) +{ + /* A nasty state machine */ + + TRACE(ctx, "xml_next (state=%u)", ctx->state); + jmp_buf throw_buf; + ctx->throw_buf = &throw_buf; + if (setjmp(throw_buf)) + { +error: + if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal) + ctx->h_fatal(ctx); + ctx->state = XML_STATE_FATAL; + TRACE(ctx, "raised fatal error"); + return -1; + } + uns c; + switch (ctx->state) + { + case XML_STATE_FATAL: + return -1; + + case XML_STATE_START: + TRACE(ctx, "entering prolog"); + if (ctx->h_document_start) + ctx->h_document_start(ctx); + /* XMLDecl */ + xml_refill(ctx); + if (ctx->h_xml_decl) + ctx->h_xml_decl(ctx); + if (ctx->want & XML_WANT_DECL) + return ctx->state = XML_STATE_DECL; + case XML_STATE_DECL: + + /* Misc* (doctypedecl Misc*)? 
*/ + while (1) + { + xml_parse_white(ctx, 0); + xml_parse_char(ctx, '<'); + if ((c = xml_get_char(ctx)) == '?') + /* Processing intruction */ + if (!(ctx->want & XML_WANT_PI)) + xml_skip_pi(ctx); + else + { + xml_push_pi(ctx); + ctx->state = XML_STATE_PROLOG_PI; + return XML_STATE_PI; + case XML_STATE_PROLOG_PI: + xml_pop_pi(ctx); + } + else if (c != '!') + { + /* Found the root tag */ + xml_unget_char(ctx); + goto first_tag; + } + else if (xml_get_char(ctx) == '-') + if (!(ctx->want & XML_WANT_COMMENT)) + xml_skip_comment(ctx); + else + { + xml_push_comment(ctx); + ctx->state = XML_STATE_PROLOG_COMMENT; + return XML_STATE_COMMENT; + case XML_STATE_PROLOG_COMMENT: + xml_pop_comment(ctx); + } + else + { + /* DocTypeDecl */ + xml_unget_char(ctx); + xml_parse_doctype_decl(ctx); + if (ctx->want & XML_WANT_DOCUMENT_TYPE) + return ctx->state = XML_STATE_DOCUMENT_TYPE; + case XML_STATE_DOCUMENT_TYPE: + if (xml_peek_char(ctx) == '[') + { + xml_skip_char(ctx); + xml_inc(ctx); + xml_parse_internal_subset(ctx); + xml_parse_white(ctx, 0); + } + xml_parse_char(ctx, '>'); + } + } + + case XML_STATE_CHARS: + + while (1) + { + if (xml_peek_char(ctx) != '<') + { + /* CharData */ + xml_append_chars(ctx); + continue; + } + else + xml_skip_char(ctx); +first_tag: ; + + xml_inc(ctx); + if ((c = xml_get_char(ctx)) == '?') + { + /* PI */ + if (!(ctx->want & XML_WANT_PI)) + xml_skip_pi(ctx); + else + { + if (xml_flush_chars(ctx)) + { + if (ctx->want & XML_WANT_CHARS) + { + ctx->state = XML_STATE_CHARS_BEFORE_PI; + return XML_STATE_CHARS; + } + case XML_STATE_CHARS_BEFORE_PI: + xml_pop_chars(ctx); + } + xml_push_pi(ctx); + return ctx->state = XML_STATE_PI; + case XML_STATE_PI: + xml_pop_pi(ctx); + } + } + + else if (c == '!') + if ((c = xml_get_char(ctx)) == '-') + { + /* Comment */ + if (!(ctx->want & XML_WANT_COMMENT)) + xml_skip_comment(ctx); + else + { + if (xml_flush_chars(ctx)) + { + if (ctx->want & XML_WANT_CHARS) + { + ctx->state = XML_STATE_CHARS_BEFORE_COMMENT; + return 
XML_STATE_CHARS; + } + case XML_STATE_CHARS_BEFORE_COMMENT: + xml_pop_chars(ctx); + } + xml_push_comment(ctx); + return ctx->state = XML_STATE_COMMENT; + case XML_STATE_COMMENT: + xml_pop_comment(ctx); + } + } + else if (c == '[') + { + /* CDATA */ + if (!(ctx->want & XML_WANT_CDATA)) + xml_append_cdata(ctx); + else + { + if (xml_flush_chars(ctx)) + { + if (ctx->want & XML_WANT_CHARS) + { + ctx->state = XML_STATE_CHARS_BEFORE_CDATA; + return XML_STATE_CHARS; + } + case XML_STATE_CHARS_BEFORE_CDATA: + xml_pop_chars(ctx); + } + xml_push_cdata(ctx); + return ctx->state = XML_STATE_CDATA; + case XML_STATE_CDATA: + xml_pop_cdata(ctx); + } + } + else + xml_fatal(ctx, "Unexpected character after 'want & XML_WANT_CHARS) + { + ctx->state = XML_STATE_CHARS_BEFORE_STAG; + return XML_STATE_CHARS; + } + case XML_STATE_CHARS_BEFORE_STAG: + xml_pop_chars(ctx); + } + + xml_push_element(ctx); + if (ctx->want & XML_WANT_STAG) + return ctx->state = XML_STATE_STAG; + case XML_STATE_STAG: + if (ctx->flags & XML_FLAG_EMPTY_ELEM) + goto pop_element; + } + + else + { + /* ETag */ + if (xml_flush_chars(ctx)) + { + if (ctx->want & XML_WANT_CHARS) + { + ctx->state = XML_STATE_CHARS_BEFORE_ETAG; + return XML_STATE_CHARS; + } + case XML_STATE_CHARS_BEFORE_ETAG: + xml_pop_chars(ctx); + } + + xml_parse_etag(ctx); +pop_element: + if (ctx->want & XML_WANT_ETAG) + return ctx->state = XML_STATE_ETAG; + case XML_STATE_ETAG: + xml_pop_element(ctx); + if (!ctx->node) + goto epilog; + } + } + +epilog: + /* Misc* */ + TRACE(ctx, "entering epilog"); + while (1) + { + /* Epilog whitespace is the only place, where a valid document can reach EOF */ + if (setjmp(throw_buf)) + if (ctx->err_code == XML_ERR_EOF) + { + TRACE(ctx, "reached EOF"); + ctx->state = XML_STATE_EOF; + if (ctx->h_document_end) + ctx->h_document_end(ctx); + case XML_STATE_EOF: + return XML_STATE_EOF; + } + else + goto error; + xml_parse_white(ctx, 0); + if (setjmp(throw_buf)) + goto error; + + /* Misc */ + xml_parse_char(ctx, '<'); + if 
((c = xml_get_char(ctx)) == '?') + /* Processing instruction */ + if (!(ctx->want & XML_WANT_PI)) + xml_skip_pi(ctx); + else + { + xml_push_pi(ctx); + return ctx->state = XML_STATE_EPILOG_PI, XML_STATE_PI; + case XML_STATE_EPILOG_PI: + xml_pop_pi(ctx); + } + else if (c == '!') + /* Comment */ + if (!(ctx->want & XML_WANT_COMMENT)) + xml_skip_comment(ctx); + else + { + xml_push_comment(ctx); + return ctx->state = XML_STATE_EPILOG_COMMENT, XML_STATE_COMMENT; + case XML_STATE_EPILOG_COMMENT: + xml_pop_comment(ctx); + } + else + xml_fatal(ctx, "Syntax error in the epilog"); + } + + } + return -1; +} diff --git a/sherlock/xml/unicat.pl b/sherlock/xml/unicat.pl index fc39bba7..b86106f2 100755 --- a/sherlock/xml/unicat.pl +++ b/sherlock/xml/unicat.pl @@ -88,9 +88,14 @@ set("SNAME_1_1", @sname_1_1); set("NAME_1_1", @sname_1_1, "[-.0-9]", 0xB7, [0x0300,0x036F], [0x203F,0x2040]); set("GT", "[>]"); +($ARGV[0] eq "" || $ARGV[1] eq "") && die("Invalid usage"); find_cls(); +open(H, ">", $ARGV[0]) or die("Cannot create $ARGV[0]"); +open(C, ">", $ARGV[1]) or die("Cannot create $ARGV[1]"); gen_enum(); gen_tabs(); +close(H); +close(C); sub set { my $id = shift; @@ -113,21 +118,26 @@ sub find_cls { } sub gen_enum { - print "enum xml_char_type {\n"; + print H "enum xml_char_type {\n"; foreach my $id (sort keys %ids) { my $mask = 0; foreach my $i (keys %cls) { $mask |= 1 << $cls{$i} if $cls{$i} && ($i & (1 << $ids{$id})); } - printf " XML_CHAR_%-20s = 0x%08x,\n", $id, $mask; + printf H " XML_CHAR_%-20s = 0x%08x,\n", $id, $mask; } - print "};\n\n"; + print H "};\n\n"; } sub gen_tabs { my @tab = (); my %hash = (); - print "static const uns xml_char_tab1[] = {\n "; + + print H "extern const byte xml_char_tab1[];\n"; + print H "extern const uns xml_char_tab2[];\n"; + print H "extern const byte xml_char_tab3[];\n"; + + print C "const uns xml_char_tab2[] = {\n "; for (my $t=0; $t<256; $t++) { my $i = $t * 256; my @x = (); @@ -139,17 +149,17 @@ sub gen_tabs { $hash{$sub} = 256 * scalar @tab; 
push @tab, $sub; } - printf("0x%x", $hash{$sub}); - print((~$t & 15) ? "," : ($t < 255) ? ",\n " : "\n};\n\n"); + printf C "0x%x", $hash{$sub}; + print C ((~$t & 15) ? "," : ($t < 255) ? ",\n " : "\n};\n\n"); } - print "static const byte xml_char_tab2[] = {\n"; - print join(",\n\n", @tab); - print "\n};\n\n"; + print C "const byte xml_char_tab1[] = {\n"; + print C join(",\n\n", @tab); + print C "\n};\n\n"; my @l = (); for (my $i=0; $i<0x11; $i++) { push @l, sprintf("%d", $cls{$lcat[$i]}); } - print "static const byte xml_char_tab3[] = {" . join(",", @l) . "};\n"; + print C "const byte xml_char_tab3[] = {" . join(",", @l) . "};\n"; } diff --git a/sherlock/xml/xml-test.c b/sherlock/xml/xml-test.c new file mode 100644 index 00000000..cca5ad8a --- /dev/null +++ b/sherlock/xml/xml-test.c @@ -0,0 +1,253 @@ +/* + * Sherlock Library -- A simple XML parser + * + * (c) 2007 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. 
+ */ + +#include "sherlock/sherlock.h" +#include "sherlock/xml/xml.h" +#include "lib/getopt.h" +#include "lib/fastbuf.h" + +#include +#include + +static char *shortopts = "sp" CF_SHORT_OPTS; +static struct option longopts[] = { + CF_LONG_OPTS + { "sax", 0, 0, 's' }, + { "pull", 0, 0, 'p' }, + { "dom", 0, 0, 'd' }, + { NULL, 0, 0, 0 } +}; + +static void NONRET +usage(void) +{ + fputs("\ +Usage: xml-test [options] < in.xml\n\ +\n\ +Options:\n" +CF_USAGE +"\ +-s, --pull Test PULL interface\n\ +-s, --sax Test SAX interface\n\ +-d, --dom Test DOM interface\n\ +\n", stderr); + exit(1); +} + +static uns want_sax; +static uns want_pull; +static uns want_dom; +static struct fastbuf *out; + +static char * +node_type(struct xml_node *node) +{ + switch (node->type) + { + case XML_NODE_ELEM: return "element"; + case XML_NODE_COMMENT: return "comment"; + case XML_NODE_PI: return "pi"; + case XML_NODE_CDATA: return "chars"; + default: return "unknown"; + } +} + +static void +show_node(struct xml_node *node) +{ + switch (node->type) + { + case XML_NODE_ELEM: + bprintf(out, " <%s>", node->name); + SLIST_FOR_EACH(struct xml_attr *, a, node->attrs) + bprintf(out, " %s='%s'", a->name, a->val); + bputc(out, '\n'); + break; + case XML_NODE_COMMENT: + bprintf(out, " text='%s'\n", node->text); + break; + case XML_NODE_PI: + bprintf(out, " target=%s text='%s'\n", node->name, node->text); + break; + case XML_NODE_CDATA: + bprintf(out, " text='%s'\n", node->text); + break; + default: + bputc(out, '\n'); + } +} + +static void +show_tree(struct xml_node *node, uns level) +{ + if (!node) + return; + bputs(out, "DOM: "); + for (uns i = 0; i < level; i++) + bputs(out, " "); + bputs(out, node_type(node)); + show_node(node); + if (node->type == XML_NODE_ELEM) + CLIST_FOR_EACH(struct xml_node *, son, node->sons) + show_tree(son, level + 1); +} + +static void +h_error(struct xml_context *ctx) +{ + bprintf(out, "SAX: %s at %u: %s\n", (ctx->err_code < XML_ERR_ERROR) ? 
"warn" : "error", xml_row(ctx), ctx->err_msg); +} + +static void +h_document_start(struct xml_context *ctx UNUSED) +{ + bputs(out, "SAX: document_start\n"); +} + +static void +h_document_end(struct xml_context *ctx UNUSED) +{ + bputs(out, "SAX: document_end\n"); +} + +static void +h_xml_decl(struct xml_context *ctx) +{ + bprintf(out, "SAX: xml_decl version=%s standalone=%d\n", ctx->version_str, ctx->standalone); +} + +static void +h_doctype_decl(struct xml_context *ctx) +{ + bprintf(out, "SAX: doctype_decl type=%s public='%s' system='%s' extsub=%d intsub=%d\n", + ctx->document_type, ctx->eid.public_id ? : "", ctx->eid.system_id ? : "", + !!(ctx->flags & XML_FLAG_HAS_EXTERNAL_SUBSET), !!(ctx->flags & XML_FLAG_HAS_INTERNAL_SUBSET)); +} + +static void +h_comment(struct xml_context *ctx) +{ + bputs(out, "SAX: comment"); + show_node(ctx->node); +} + +static void +h_pi(struct xml_context *ctx) +{ + bprintf(out, "SAX: pi"); + show_node(ctx->node); +} + +static void +h_element_start(struct xml_context *ctx) +{ + bprintf(out, "SAX: element_start"); + show_node(ctx->node); +} + +static void +h_element_end(struct xml_context *ctx) +{ + bprintf(out, "SAX: element_end \n", ctx->node->name); +} + +static void +h_chars(struct xml_context *ctx) +{ + bprintf(out, "SAX: chars"); + show_node(ctx->node); +} + +int +main(int argc, char **argv) +{ + int opt; + cf_def_file = NULL; // FIXME + log_init(argv[0]); + while ((opt = cf_getopt(argc, argv, shortopts, longopts, NULL)) >= 0) + switch (opt) + { + case 's': + want_sax++; + break; + case 'p': + want_pull++; + break; + case 'd': + want_dom++; + break; + default: + usage(); + } + if (optind != argc) + usage(); + + out = bfdopen_shared(1, 4096); + struct xml_context ctx; + xml_init(&ctx); + ctx.h_warn = ctx.h_error = ctx.h_fatal = h_error; + if (want_sax) + { + ctx.h_document_start = h_document_start; + ctx.h_document_end = h_document_end; + ctx.h_xml_decl = h_xml_decl; + ctx.h_doctype_decl = h_doctype_decl; + ctx.h_comment = h_comment; 
+ ctx.h_pi = h_pi; + ctx.h_element_start = h_element_start; + ctx.h_element_end = h_element_end; + ctx.h_chars = h_chars; + } + if (want_pull) + ctx.want = XML_WANT_CHARS | XML_WANT_STAG | XML_WANT_ETAG | XML_WANT_COMMENT | XML_WANT_PI; + if (want_dom) + ctx.flags &= ~XML_DOM_FREE; + xml_set_source(&ctx, bfdopen_shared(0, 4096)); + int state; + bprintf(out, "PULL: start\n"); + while ((state = xml_next(&ctx)) >= 0 && state != XML_STATE_EOF) + switch (state) + { + case XML_STATE_CHARS: + bprintf(out, "PULL: chars"); + show_node(ctx.node); + break; + case XML_STATE_STAG: + bprintf(out, "PULL: element_start"); + show_node(ctx.node); + break; + case XML_STATE_ETAG: + bprintf(out, "PULL: element_end \n", ctx.node->name); + break; + case XML_STATE_COMMENT: + bprintf(out, "PULL: comment"); + show_node(ctx.node); + break; + case XML_STATE_PI: + bprintf(out, "PULL: pi"); + show_node(ctx.node); + break; +#if 0 + case XML_STATE_CDATA: + bprintf(out, "PULL: cdata [%s]\n", ctx.node->text); + break; +#endif + } + if (state != XML_STATE_EOF) + bprintf(out, "PULL: fatal error\n"); + else + bprintf(out, "PULL: eof\n"); + + if (want_dom) + show_tree(ctx.root, 0); + + xml_cleanup(&ctx); + bclose(out); + return 0; +} diff --git a/sherlock/xml/xml.c b/sherlock/xml/xml.c deleted file mode 100644 index 1d9f0f45..00000000 --- a/sherlock/xml/xml.c +++ /dev/null @@ -1,2524 +0,0 @@ -/* - * Sherlock Library -- A simple XML parser - * - * (c) 2007 Pavel Charvat - * - * This software may be freely distributed and used according to the terms - * of the GNU Lesser General Public License. 
- */ - -/* TODO: - * - iface - * - stack-like memory handling where possible - */ - -#define LOCAL_DEBUG - -#include "lib/lib.h" -#include "lib/mempool.h" -#include "lib/fastbuf.h" -#include "lib/ff-unicode.h" -#include "lib/ff-binary.h" -#include "lib/chartype.h" -#include "lib/unicode.h" -#include "lib/hashfunc.h" -#include "lib/stkstring.h" -#include "lib/unaligned.h" -#include "charset/charconv.h" -#include "charset/fb-charconv.h" -#include "sherlock/xml/xml.h" -#include "sherlock/xml/dtd.h" - -#include - -/*** Debugging ***/ - -#ifdef LOCAL_DEBUG -#define TRACE(c, f, p...) do { DBG("XML %u: " f, xml_row(c), ##p); } while(0) -#else -#define TRACE(c, f, p...) do {} while(0) -#endif - -static uns xml_row(struct xml_context *ctx); - -/*** Error handling ***/ - -static void NONRET -xml_throw(struct xml_context *ctx) -{ - ASSERT(ctx->err_code && ctx->throw_buf); - longjmp(*(jmp_buf *)ctx->throw_buf, ctx->err_code); -} - -static void -xml_warn(struct xml_context *ctx, const char *format, ...) -{ - if (ctx->h_warn) - { - va_list args; - va_start(args, format); - ctx->err_msg = stk_vprintf(format, args); - ctx->err_code = XML_ERR_WARN; - va_end(args); - ctx->h_warn(ctx); - ctx->err_msg = NULL; - ctx->err_code = XML_ERR_OK; - } -} - -static void -xml_error(struct xml_context *ctx, const char *format, ...) -{ - if (ctx->h_error) - { - va_list args; - va_start(args, format); - ctx->err_msg = stk_vprintf(format, args); - ctx->err_code = XML_ERR_ERROR; - va_end(args); - ctx->h_error(ctx); - ctx->err_msg = NULL; - ctx->err_code = XML_ERR_OK; - } -} - -static void NONRET -xml_fatal(struct xml_context *ctx, const char *format, ...) 
-{ - va_list args; - va_start(args, format); - ctx->err_msg = mp_vprintf(ctx->pool, format, args); - ctx->err_code = XML_ERR_FATAL; - ctx->state = XML_STATE_FATAL; - va_end(args); - if (ctx->h_fatal) - ctx->h_fatal(ctx); - xml_throw(ctx); -} - -/*** Charecter categorization ***/ - -#include "obj/sherlock/xml/unicat.h" - -static inline uns -xml_char_cat(uns c) -{ - if (c < 0x10000) - return 1U << xml_char_tab2[(c & 0xff) + xml_char_tab1[c >> 8]]; - else if (likely(c < 0x110000)) - return 1U << xml_char_tab3[c >> 16]; - else - return 1; -} - -/*** Memory management ***/ - -static void NONRET -xml_fatal_nested(struct xml_context *ctx) -{ - xml_fatal(ctx, "Entity not nested correctly"); -} - -static inline void -xml_inc(struct xml_context *ctx) -{ - /* Called after the first character of a block */ - TRACE(ctx, "inc"); - ctx->depth++; -} - -static inline void -xml_dec(struct xml_context *ctx) -{ - /* Called after the last character of a block */ - TRACE(ctx, "dec"); - if (unlikely(!ctx->depth--)) - xml_fatal_nested(ctx); -} - -static inline void -xml_push(struct xml_context *ctx) -{ - TRACE(ctx, "push"); - struct xml_stack *s = mp_alloc(ctx->pool, sizeof(*s)); - mp_save(ctx->pool, &s->saved_pool); - s->saved_flags = ctx->flags; - s->next = ctx->stack; - ctx->stack = s; - xml_inc(ctx); -} - -static inline void -xml_pop(struct xml_context *ctx) -{ - TRACE(ctx, "pop"); - xml_dec(ctx); - struct xml_stack *s = ctx->stack; - ASSERT(s); - ctx->stack = s->next; - ctx->flags = s->saved_flags; - mp_restore(ctx->pool, &s->saved_pool); -} - -#define XML_HASH_HDR_SIZE ALIGN_TO(sizeof(void *), CPU_STRUCT_ALIGN) -#define XML_HASH_GIVE_ALLOC struct HASH_PREFIX(table); \ - static inline void *HASH_PREFIX(alloc)(struct HASH_PREFIX(table) *t, uns size) \ - { return mp_alloc(*(void **)((void *)t - XML_HASH_HDR_SIZE), size); } \ - static inline void HASH_PREFIX(free)(struct HASH_PREFIX(table) *t UNUSED, void *p UNUSED) {} - -static void * -xml_hash_new(struct mempool *pool, uns size) -{ - 
void *tab = mp_alloc_zero(pool, size + XML_HASH_HDR_SIZE); - *(void **)tab = pool; - return tab + XML_HASH_HDR_SIZE; -} - -/*** Reading of document/external entities ***/ - -static void NONRET -xml_eof(struct xml_context *ctx) -{ - ctx->err_msg = "Unexpected EOF"; - ctx->err_code = XML_ERR_EOF; - xml_throw(ctx); -} - -static inline void -xml_add_char(u32 **bstop, uns c) -{ - *(*bstop)++ = c; - *(*bstop)++ = xml_char_cat(c); -} - -static struct xml_source * -xml_push_source(struct xml_context *ctx, uns flags) -{ - xml_push(ctx); - struct xml_source *src = ctx->src; - if (src) - { - src->bptr = ctx->bptr; - src->bstop = ctx->bstop; - } - src = mp_alloc_zero(ctx->pool, sizeof(*src)); - src->next = ctx->src; - src->saved_depth = ctx->depth; - ctx->src = src; - ctx->flags = (ctx->flags & ~(XML_FLAG_SRC_EOF | XML_FLAG_SRC_EXPECTED_DECL | XML_FLAG_SRC_NEW_LINE | XML_FLAG_SRC_SURROUND | XML_FLAG_SRC_DOCUMENT)) | flags; - ctx->bstop = ctx->bptr = src->buf; - ctx->depth = 0; - if (flags & XML_FLAG_SRC_SURROUND) - xml_add_char(&ctx->bstop, 0x20); - return src; -} - -static void -xml_pop_source(struct xml_context *ctx) -{ - TRACE(ctx, "xml_pop_source"); - if (unlikely(ctx->depth != 0)) - xml_fatal_nested(ctx); - struct xml_source *src = ctx->src; - ASSERT(src); - bclose(src->fb); - ctx->depth = src->saved_depth; - ctx->src = src = src->next; - if (src) - { - ctx->bptr = src->bptr; - ctx->bstop = src->bstop; - } - xml_pop(ctx); - if (unlikely(!src)) - xml_eof(ctx); -} - -static void xml_refill_utf8(struct xml_context *ctx); - -static void -xml_push_entity(struct xml_context *ctx, struct xml_dtd_ent *ent) -{ - TRACE(ctx, "xml_push_entity"); - uns cat1 = ctx->src->refill_cat1; - uns cat2 = ctx->src->refill_cat2; - struct xml_source *src = xml_push_source(ctx, 0); - src->refill_cat1 = cat1; - src->refill_cat2 = cat2; - if (ent->flags & XML_DTD_ENT_EXTERNAL) - xml_fatal(ctx, "External entities not implemented"); // FIXME - else - { - fbbuf_init_read(src->fb = &src->wrap_fb, 
ent->text, ent->len, 0); - src->refill = xml_refill_utf8; - } -} - -void -xml_set_source(struct xml_context *ctx, struct fastbuf *fb) -{ - TRACE(ctx, "xml_set_source"); - ASSERT(!ctx->src); - struct xml_source *src = xml_push_source(ctx, XML_FLAG_SRC_DOCUMENT | XML_FLAG_SRC_EXPECTED_DECL); - src->fb = fb; -} - -static uns -xml_error_restricted(struct xml_context *ctx, uns c) -{ - if (c == ~1U) - xml_error(ctx, "Corrupted encoding"); - else - xml_error(ctx, "Restricted char U+%04X", c); - return UNI_REPLACEMENT; -} - -static void xml_parse_decl(struct xml_context *ctx); - -#define REFILL(ctx, func, params...) \ - struct xml_source *src = ctx->src; \ - struct fastbuf *fb = src->fb; \ - if (ctx->bptr == ctx->bstop) \ - ctx->bptr = ctx->bstop = src->buf; \ - uns f = ctx->flags, c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \ - u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \ - *last_0xd = (f & XML_FLAG_SRC_NEW_LINE) ? bstop : bend; \ - do \ - { \ - c = func(fb, ##params); \ - uns t = xml_char_cat(c); \ - if (t & t1) \ - /* Typical branch */ \ - *bstop++ = c, *bstop++ = t; \ - else if (t & t2) \ - { \ - /* New line */ \ - /* XML 1.0: 0xA | 0xD | 0xD 0xA */ \ - /* XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ \ - if (c == 0xd) \ - last_0xd = bstop + 2; \ - else if (c != 0x2028 && last_0xd == bstop) \ - { \ - last_0xd = bend; \ - continue; \ - } \ - xml_add_char(&bstop, 0xa), row++; \ - } \ - else if (c == '>') \ - { \ - /* Used only in XML/TextDecl to switch the encoding */ \ - *bstop++ = c, *bstop++ = t; \ - break; \ - } \ - else if (~c) \ - /* Restricted character */ \ - xml_add_char(&bstop, xml_error_restricted(ctx, c)); \ - else \ - { \ - /* EOF */ \ - if (f & XML_FLAG_SRC_SURROUND) \ - xml_add_char(&bstop, 0x20); \ - f |= XML_FLAG_SRC_EOF; \ - break; \ - } \ - } \ - while (bstop < bend); \ - ctx->flags = (last_0xd == bstop) ? 
f | XML_FLAG_SRC_NEW_LINE : f & ~XML_FLAG_SRC_NEW_LINE; \ - ctx->bstop = bstop; \ - src->row = row; - -static void -xml_refill_utf8(struct xml_context *ctx) -{ - REFILL(ctx, bget_utf8_repl, ~1U); -} - -static void -xml_refill_utf16_le(struct xml_context *ctx) -{ - REFILL(ctx, bget_utf16_le_repl, ~1U); -} - -static void -xml_refill_utf16_be(struct xml_context *ctx) -{ - REFILL(ctx, bget_utf16_be_repl, ~1U); -} - -#if 0 -static inline uns -xml_refill_libcharset_bget(struct fastbuf *fb, unsigned short int *in_to_x) -{ - // FIXME: slow - int c; - return (unlikely(c = bgetc(fb) < 0)) ? c : (int)conv_x_to_ucs(in_to_x[c]); -} - -static void -xml_refill_libcharset(struct xml_context *ctx) -{ - unsigned short int *in_to_x = ctx->src->refill_in_to_x; - REFILL(ctx, xml_refill_libcharset_bget, in_to_x); -} -#endif - -#undef REFILL - -static void -xml_refill(struct xml_context *ctx) -{ - do - { - if (ctx->flags & XML_FLAG_SRC_EOF) - xml_pop_source(ctx); - else if (ctx->flags & XML_FLAG_SRC_EXPECTED_DECL) - xml_parse_decl(ctx); - else - { - ctx->src->refill(ctx); - TRACE(ctx, "refilled %u characters", (uns)((ctx->bstop - ctx->bptr) / 2)); - } - } - while (ctx->bptr == ctx->bstop); -} - -static inline uns -xml_peek_char(struct xml_context *ctx) -{ - if (ctx->bptr == ctx->bstop) - xml_refill(ctx); - return ctx->bptr[0]; -} - -static inline uns -xml_peek_cat(struct xml_context *ctx) -{ - if (ctx->bptr == ctx->bstop) - xml_refill(ctx); - return ctx->bptr[1]; -} - -static inline uns -xml_get_char(struct xml_context *ctx) -{ - uns c = xml_peek_char(ctx); - ctx->bptr += 2; - return c; -} - -static inline uns -xml_get_cat(struct xml_context *ctx) -{ - uns c = xml_peek_cat(ctx); - ctx->bptr += 2; - return c; -} - -static inline uns -xml_last_char(struct xml_context *ctx) -{ - return ctx->bptr[-2]; -} - -static inline uns -xml_last_cat(struct xml_context *ctx) -{ - return ctx->bptr[-1]; -} - -static inline uns -xml_skip_char(struct xml_context *ctx) -{ - uns c = ctx->bptr[0]; - ctx->bptr 
+= 2; - return c; -} - -static inline uns -xml_unget_char(struct xml_context *ctx) -{ - return *(ctx->bptr -= 2); -} - -static uns -xml_row(struct xml_context *ctx) -{ - struct xml_source *src = ctx->src; - if (!src) - return 0; - uns row = src->row; - for (u32 *p = ctx->bstop; p != ctx->bptr; p -= 2) - if (p[-1] & src->refill_cat2) - row--; - return row + 1; -} - -/*** Basic parsing ***/ - -static void NONRET -xml_fatal_expected(struct xml_context *ctx, uns c) -{ - xml_fatal(ctx, "Expected '%c'", c); -} - -static void NONRET -xml_fatal_expected_white(struct xml_context *ctx) -{ - xml_fatal(ctx, "Expected a white space"); -} - -static void NONRET -xml_fatal_expected_quot(struct xml_context *ctx) -{ - xml_fatal(ctx, "Expected a quotation mark"); -} - -static inline uns -xml_parse_white(struct xml_context *ctx, uns mandatory) -{ - /* mandatory=1 -> S ::= (#x20 | #x9 | #xD | #xA)+ - * mandatory=0 -> S? */ - uns cnt = 0; - while (xml_peek_cat(ctx) & XML_CHAR_WHITE) - { - xml_skip_char(ctx); - cnt++; - } - if (unlikely(mandatory && !cnt)) - xml_fatal_expected_white(ctx); - return cnt; -} - -static inline void -xml_parse_char(struct xml_context *ctx, uns c) -{ - /* Consumes a given Unicode character */ - if (unlikely(c != xml_get_char(ctx))) - xml_fatal_expected(ctx, c); -} - -static inline void -xml_parse_seq(struct xml_context *ctx, const char *seq) -{ - /* Consumes a given sequence of ASCII characters */ - while (*seq) - xml_parse_char(ctx, *seq++); -} - -static void -xml_parse_eq(struct xml_context *ctx) -{ - /* Eq ::= S? '=' S? 
*/ - xml_parse_white(ctx, 0); - xml_parse_char(ctx, '='); - xml_parse_white(ctx, 0); -} - -static inline uns -xml_parse_quote(struct xml_context *ctx) -{ - /* "'" | '"' */ - uns c = xml_get_char(ctx); - if (unlikely(c != '\'' && c != '\"')) - xml_fatal_expected_quot(ctx); - return c; -} - -/* Names and nmtokens */ - -static char * -xml_parse_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err) -{ - char *p = mp_start_noalign(ctx->pool, 1); - if (unlikely(!(xml_peek_cat(ctx) & first_cat))) - xml_fatal(ctx, "%s", err); - do - { - p = mp_spread(ctx->pool, p, 5); - p = utf8_32_put(p, xml_skip_char(ctx)); - } - while (xml_peek_cat(ctx) & next_cat); - *p++ = 0; - return mp_end(ctx->pool, p); -} - -static void -xml_skip_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err) -{ - if (unlikely(!(xml_get_cat(ctx) & first_cat))) - xml_fatal(ctx, "%s", err); - while (xml_peek_cat(ctx) & next_cat) - xml_skip_char(ctx); -} - -static char * -xml_parse_name(struct xml_context *ctx) -{ - /* Name ::= NameStartChar (NameChar)* */ - return xml_parse_string(ctx, - !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1, - !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1, - "Expected a name"); -} - -static void -xml_skip_name(struct xml_context *ctx) -{ - xml_skip_string(ctx, - !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1, - !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1, - "Expected a name"); -} - -static char * -xml_parse_nmtoken(struct xml_context *ctx) -{ - /* Nmtoken ::= (NameChar)+ */ - uns cat = !(ctx->flags & XML_FLAG_VERSION_1_1) ? 
XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1; - return xml_parse_string(ctx, cat, cat, "Expected a nmtoken"); -} - -/* Simple literals */ - -static char * -xml_parse_system_literal(struct xml_context *ctx) -{ - /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */ - char *p = mp_start_noalign(ctx->pool, 1); - uns q = xml_parse_quote(ctx), c; - while ((c = xml_get_char(ctx)) != q) - { - p = mp_spread(ctx->pool, p, 5); - p = utf8_32_put(p, c); - } - *p++ = 0; - return mp_end(ctx->pool, p); -} - -static char * -xml_parse_pubid_literal(struct xml_context *ctx) -{ - /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */ - char *p = mp_start_noalign(ctx->pool, 1); - uns q = xml_parse_quote(ctx), c; - while ((c = xml_get_char(ctx)) != q) - { - if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID))) - xml_fatal(ctx, "Expected a pubid character"); - p = mp_spread(ctx->pool, p, 2); - *p++ = c; - } - *p++ = 0; - return mp_end(ctx->pool, p); -} - -static char * -xml_parse_encoding_name(struct xml_context *ctx) -{ - /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" */ - char *p = mp_start_noalign(ctx->pool, 1); - uns q = xml_parse_quote(ctx); - if (unlikely(!(xml_get_cat(ctx) & XML_CHAR_ENC_SNAME))) - xml_fatal(ctx, "Invalid character in the encoding name"); - while (1) - { - p = mp_spread(ctx->pool, p, 2); - *p++ = xml_last_char(ctx); - if (xml_get_char(ctx) == q) - break; - if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME))) - xml_fatal(ctx, "Invalid character in the encoding name"); - } - *p++ = 0; - return mp_end(ctx->pool, p); -} - -/* Document/external entity header */ - -static inline void -xml_init_cats(struct xml_context *ctx, uns mask) -{ - if (!(ctx->flags & XML_FLAG_VERSION_1_1)) - { - ctx->src->refill_cat1 = XML_CHAR_VALID_1_0 & ~XML_CHAR_NEW_LINE_1_0 & ~mask; - ctx->src->refill_cat2 = XML_CHAR_NEW_LINE_1_0; - } - else - { - ctx->src->refill_cat1 = XML_CHAR_UNRESTRICTED_1_1 & ~XML_CHAR_NEW_LINE_1_1 & 
~mask; - ctx->src->refill_cat2 = XML_CHAR_NEW_LINE_1_1; - } -} - -static void -xml_init_charconv(struct xml_context *ctx, int cs) -{ - // FIXME: hack - struct xml_source *src = ctx->src; - TRACE(ctx, "wrapping charset %s", charset_name(cs)); -#if 0 - struct conv_context conv; - conv_set_charset(&conv, cs, CONV_CHARSET_UTF8); - src->refill = xml_refill_libcharset; - src->refill_in_to_x = conv.in_to_x; -#else - src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8); - // FIXME: memory leak -#endif -} - -static void -xml_parse_decl(struct xml_context *ctx) -{ - TRACE(ctx, "xml_parse_decl"); - struct xml_source *src = ctx->src; - ctx->flags &= ~XML_FLAG_SRC_EXPECTED_DECL; - - /* Setup valid Unicode ranges and force the reader to abort refill() after each '>', where we can switch encoding or XML version */ - xml_init_cats(ctx, XML_CHAR_GT); - - /* Initialize the supplied charset (if any) or try to guess it */ - char *expected_encoding = src->expected_encoding ? : src->fb_encoding; - src->refill = xml_refill_utf8; - int bom = bpeekc(src->fb); - if (bom < 0) - ctx->flags |= XML_FLAG_SRC_EOF; - if (!src->fb_encoding) - { - if (bom == 0xfe) - src->refill = xml_refill_utf16_be; - else if (bom == 0xff) - src->refill = xml_refill_utf16_le; - } - else - { - int cs = find_charset_by_name(src->fb_encoding); - if (cs == CONV_CHARSET_UTF8) - {} - else if (cs >= 0) - { - xml_init_charconv(ctx, cs); - bom = 0; - } - else if (strcasecmp(src->fb_encoding, "UTF-16")) - { - src->refill = xml_refill_utf16_be; - if (bom == 0xff) - src->refill = xml_refill_utf16_le; - if (!src->expected_encoding) - expected_encoding = (bom == 0xff) ? 
"UTF-16LE" : "UTF-16BE"; - } - else if (strcasecmp(src->fb_encoding, "UTF-16BE")) - src->refill = xml_refill_utf16_be; - else if (strcasecmp(src->fb_encoding, "UTF-16LE")) - src->refill = xml_refill_utf16_le; - else - { - xml_error(ctx, "Unknown encoding '%s'", src->fb_encoding); - expected_encoding = NULL; - } - } - uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be; - if (bom > 0 && xml_peek_char(ctx) == 0xfeff) - xml_skip_char(ctx); - else if (utf16) - xml_error(ctx, "Missing or corrupted BOM"); - - /* Look ahead for presence of XMLDecl or optional TextDecl */ - if (!(ctx->flags & XML_FLAG_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf)) - xml_refill(ctx); - uns doc = ctx->flags & XML_FLAG_SRC_DOCUMENT; - u32 *bptr = ctx->bptr; - uns have_decl = (12 <= ctx->bstop - ctx->bptr && (bptr[11] & XML_CHAR_WHITE) && - bptr[0] == '<' && bptr[2] == '?' && (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L'); - if (!have_decl) - { - if (doc) - xml_fatal(ctx, "Missing or corrupted XML header"); - else if (expected_encoding && strcasecmp(src->expected_encoding, "UTF-8") && !utf16) - xml_error(ctx, "Missing or corrupted entity header"); - goto exit; - } - ctx->bptr = bptr + 12; - xml_parse_white(ctx, 0); - - /* Parse version string (mandatory in XMLDecl, optional in TextDecl) */ - if (xml_peek_char(ctx) == 'v') - { - xml_parse_seq(ctx, "version"); - xml_parse_eq(ctx); - char *version = xml_parse_pubid_literal(ctx); - TRACE(ctx, "version=%s", version); - uns v = 0; - if (!strcmp(version, "1.1")) - v = XML_FLAG_VERSION_1_1; - else if (strcmp(version, "1.0")) - { - xml_error(ctx, "Unknown XML version string '%s'", version); - version = "1.0"; - } - if (doc) - { - ctx->version_str = version; - ctx->flags |= v; - } - else if (v > (ctx->flags & XML_FLAG_VERSION_1_1)) - xml_error(ctx, "XML 1.1 external entity included from XML 1.0 document"); - if (!xml_parse_white(ctx, !doc)) - goto end; - } - else if (doc) - { - 
xml_error(ctx, "Expected XML version"); - ctx->version_str = "1.0"; - } - - /* Parse encoding string (optional in XMLDecl, mandatory in TextDecl) */ - if (xml_peek_char(ctx) == 'e') - { - xml_parse_seq(ctx, "encoding"); - xml_parse_eq(ctx); - src->decl_encoding = xml_parse_encoding_name(ctx); - TRACE(ctx, "encoding=%s", src->decl_encoding); - if (!xml_parse_white(ctx, 0)) - goto end; - } - else if (!doc) - xml_error(ctx, "Expected XML encoding"); - - /* Parse whether the document is standalone (optional in XMLDecl) */ - if (doc && xml_peek_char(ctx) == 's') - { - xml_parse_seq(ctx, "standalone"); - xml_parse_eq(ctx); - uns c = xml_parse_quote(ctx); - if (ctx->standalone = (xml_peek_char(ctx) == 'y')) - xml_parse_seq(ctx, "yes"); - else - xml_parse_seq(ctx, "no"); - xml_parse_char(ctx, c); - TRACE(ctx, "standalone=%d", ctx->standalone); - xml_parse_white(ctx, 0); - } -end: - xml_parse_seq(ctx, "?>"); - - /* Switch to the final encoding */ - if (src->decl_encoding) - { - int cs = find_charset_by_name(src->decl_encoding); - if (cs < 0 && !expected_encoding) - xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding); - else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8) - xml_init_charconv(ctx, cs); - else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 || - !(!strcasecmp(src->decl_encoding, "UTF-16") || - (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) || - (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE"))))) - xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding); - } - -exit: - /* Update valid Unicode ranges */ - xml_init_cats(ctx, 0); -} - -/*** Document Type Definition (DTD) ***/ - -/* Notations */ - -#define HASH_PREFIX(x) xml_dtd_notns_##x -#define HASH_NODE struct xml_dtd_notn -#define HASH_KEY_STRING name -#define HASH_ZERO_FILL -#define HASH_TABLE_DYNAMIC 
-#define HASH_WANT_FIND -#define HASH_WANT_LOOKUP -#define HASH_GIVE_ALLOC -#define HASH_TABLE_ALLOC -XML_HASH_GIVE_ALLOC -#include "lib/hashtable.h" - -/* General entities */ - -#define HASH_PREFIX(x) xml_dtd_ents_##x -#define HASH_NODE struct xml_dtd_ent -#define HASH_KEY_STRING name -#define HASH_ZERO_FILL -#define HASH_TABLE_DYNAMIC -#define HASH_WANT_FIND -#define HASH_WANT_LOOKUP -#define HASH_GIVE_ALLOC -#define HASH_TABLE_ALLOC -XML_HASH_GIVE_ALLOC -#include "lib/hashtable.h" - -static struct xml_dtd_ent * -xml_dtd_declare_trivial_gent(struct xml_context *ctx, char *name, char *text) -{ - struct xml_dtd *dtd = ctx->dtd; - struct xml_dtd_ent *ent = xml_dtd_ents_lookup(dtd->tab_gents, name); - if (ent->flags & XML_DTD_ENT_DECLARED) - { - xml_warn(ctx, "Entity &%s; already declared", name); - return NULL; - } - slist_add_tail(&dtd->gents, &ent->n); - ent->flags = XML_DTD_ENT_DECLARED | XML_DTD_ENT_TRIVIAL; - ent->text = text; - ent->len = strlen(text); - return ent; -} - -static void -xml_dtd_declare_default_gents(struct xml_context *ctx) -{ - xml_dtd_declare_trivial_gent(ctx, "lt", "<"); - xml_dtd_declare_trivial_gent(ctx, "gt", ">"); - xml_dtd_declare_trivial_gent(ctx, "amp", "&"); - xml_dtd_declare_trivial_gent(ctx, "apos", "'"); - xml_dtd_declare_trivial_gent(ctx, "quot", "\""); -} - -static struct xml_dtd_ent * -xml_dtd_find_gent(struct xml_context *ctx, char *name) -{ - struct xml_dtd *dtd = ctx->dtd; - if (dtd) - { - struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_gents, name); - return !ent ? NULL : (ent->flags & XML_DTD_ENT_DECLARED) ? 
ent : NULL; - } - else - { -#define ENT(n, t) ent_##n = { .name = #n, .text = t, .len = 1, .flags = XML_DTD_ENT_DECLARED | XML_DTD_ENT_TRIVIAL } - static struct xml_dtd_ent ENT(lt, "<"), ENT(gt, ">"), ENT(amp, "&"), ENT(apos, "'"), ENT(quot, "\""); -#undef ENT - switch (name[0]) - { - case 'l': - if (!strcmp(name, "lt")) - return &ent_lt; - break; - case 'g': - if (!strcmp(name, "gt")) - return &ent_gt; - break; - case 'a': - if (!strcmp(name, "amp")) - return &ent_amp; - if (!strcmp(name, "apos")) - return &ent_apos; - break; - case 'q': - if (!strcmp(name, "quot")) - return &ent_quot; - break; - } - return NULL; - } -} - -/* Parameter entities */ - -static struct xml_dtd_ent * -xml_dtd_find_pent(struct xml_context *ctx, char *name) -{ - struct xml_dtd *dtd = ctx->dtd; - struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_pents, name); - return !ent ? NULL : (ent->flags & XML_DTD_ENT_DECLARED) ? ent : NULL; -} - -/* Elements */ - -#define HASH_PREFIX(x) xml_dtd_elems_##x -#define HASH_NODE struct xml_dtd_elem -#define HASH_KEY_STRING name -#define HASH_TABLE_DYNAMIC -#define HASH_ZERO_FILL -#define HASH_WANT_LOOKUP -#define HASH_GIVE_ALLOC -#define HASH_TABLE_ALLOC -XML_HASH_GIVE_ALLOC -#include "lib/hashtable.h" - -/* Element sons */ - -struct xml_dtd_enodes_table; - -static inline uns -xml_dtd_enodes_hash(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem) -{ - return hash_pointer(parent) ^ hash_pointer(elem); -} - -static inline int -xml_dtd_enodes_eq(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *parent1, struct xml_dtd_elem *elem1, struct xml_dtd_elem_node *parent2, struct xml_dtd_elem *elem2) -{ - return (parent1 == parent2) && (elem1 == elem2); -} - -static inline void -xml_dtd_enodes_init_key(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *node, struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem) -{ - node->parent = parent; - node->elem = elem; -} - -#define 
HASH_PREFIX(x) xml_dtd_enodes_##x -#define HASH_NODE struct xml_dtd_elem_node -#define HASH_KEY_COMPLEX(x) x parent, x elem -#define HASH_KEY_DECL struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem -#define HASH_GIVE_HASHFN -#define HASH_GIVE_EQ -#define HASH_GIVE_INIT_KEY -#define HASH_TABLE_DYNAMIC -#define HASH_ZERO_FILL -#define HASH_WANT_FIND -#define HASH_WANT_NEW -#define HASH_GIVE_ALLOC -#define HASH_TABLE_ALLOC -XML_HASH_GIVE_ALLOC -#include "lib/hashtable.h" - -/* Element attributes */ - -struct xml_dtd_attrs_table; - -static inline uns -xml_dtd_attrs_hash(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem, char *name) -{ - return hash_pointer(elem) ^ hash_string(name); -} - -static inline int -xml_dtd_attrs_eq(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem1, char *name1, struct xml_dtd_elem *elem2, char *name2) -{ - return (elem1 == elem2) && !strcmp(name1, name2); -} - -static inline void -xml_dtd_attrs_init_key(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_elem *elem, char *name) -{ - attr->elem = elem; - attr->name = name; -} - -#define HASH_PREFIX(x) xml_dtd_attrs_##x -#define HASH_NODE struct xml_dtd_attr -#define HASH_ZERO_FILL -#define HASH_TABLE_DYNAMIC -#define HASH_KEY_COMPLEX(x) x elem, x name -#define HASH_KEY_DECL struct xml_dtd_elem *elem, char *name -#define HASH_GIVE_HASHFN -#define HASH_GIVE_EQ -#define HASH_GIVE_INIT_KEY -#define HASH_WANT_FIND -#define HASH_WANT_NEW -#define HASH_GIVE_ALLOC -#define HASH_TABLE_ALLOC -XML_HASH_GIVE_ALLOC -#include "lib/hashtable.h" - -/* Enumerated attribute values */ - -struct xml_dtd_evals_table; - -static inline uns -xml_dtd_evals_hash(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr, char *val) -{ - return hash_pointer(attr) ^ hash_string(val); -} - -static inline int -xml_dtd_evals_eq(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr1, char *val1, struct xml_dtd_attr *attr2, char *val2) 
-{ - return (attr1 == attr2) && !strcmp(val1, val2); -} - -static inline void -xml_dtd_evals_init_key(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_eval *eval, struct xml_dtd_attr *attr, char *val) -{ - eval->attr = attr; - eval->val = val; -} - -#define HASH_PREFIX(x) xml_dtd_evals_##x -#define HASH_NODE struct xml_dtd_eval -#define HASH_TABLE_DYNAMIC -#define HASH_KEY_COMPLEX(x) x attr, x val -#define HASH_KEY_DECL struct xml_dtd_attr *attr, char *val -#define HASH_GIVE_HASHFN -#define HASH_GIVE_EQ -#define HASH_GIVE_INIT_KEY -#define HASH_WANT_FIND -#define HASH_WANT_NEW -#define HASH_GIVE_ALLOC -#define HASH_TABLE_ALLOC -XML_HASH_GIVE_ALLOC -#include "lib/hashtable.h" - -/* Enumerated attribute notations */ - -struct xml_dtd_enotns_table; - -static inline uns -xml_dtd_enotns_hash(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn) -{ - return hash_pointer(attr) ^ hash_pointer(notn); -} - -static inline int -xml_dtd_enotns_eq(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr1, struct xml_dtd_notn *notn1, struct xml_dtd_attr *attr2, struct xml_dtd_notn *notn2) -{ - return (attr1 == attr2) && (notn1 == notn2); -} - -static inline void -xml_dtd_enotns_init_key(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_enotn *enotn, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn) -{ - enotn->attr = attr; - enotn->notn = notn; -} - -#define HASH_PREFIX(x) xml_dtd_enotns_##x -#define HASH_NODE struct xml_dtd_enotn -#define HASH_TABLE_DYNAMIC -#define HASH_KEY_COMPLEX(x) x attr, x notn -#define HASH_KEY_DECL struct xml_dtd_attr *attr, struct xml_dtd_notn *notn -#define HASH_GIVE_HASHFN -#define HASH_GIVE_EQ -#define HASH_GIVE_INIT_KEY -#define HASH_WANT_FIND -#define HASH_WANT_NEW -#define HASH_GIVE_ALLOC -#define HASH_TABLE_ALLOC -XML_HASH_GIVE_ALLOC -#include "lib/hashtable.h" - -/* DTD initialization/cleanup */ - -static void -xml_dtd_init(struct xml_context *ctx) -{ - if (ctx->dtd) - return; - 
struct mempool *pool = mp_new(4096); - struct xml_dtd *dtd = ctx->dtd = mp_alloc_zero(pool, sizeof(*ctx->dtd)); - dtd->pool = pool; - xml_dtd_ents_init(dtd->tab_gents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table))); - xml_dtd_ents_init(dtd->tab_pents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table))); - xml_dtd_notns_init(dtd->tab_notns = xml_hash_new(pool, sizeof(struct xml_dtd_notns_table))); - xml_dtd_elems_init(dtd->tab_elems = xml_hash_new(pool, sizeof(struct xml_dtd_elems_table))); - xml_dtd_enodes_init(dtd->tab_enodes = xml_hash_new(pool, sizeof(struct xml_dtd_enodes_table))); - xml_dtd_attrs_init(dtd->tab_attrs = xml_hash_new(pool, sizeof(struct xml_dtd_attrs_table))); - xml_dtd_evals_init(dtd->tab_evals = xml_hash_new(pool, sizeof(struct xml_dtd_evals_table))); - xml_dtd_enotns_init(dtd->tab_enotns = xml_hash_new(pool, sizeof(struct xml_dtd_enotns_table))); - xml_dtd_declare_default_gents(ctx); -} - -static void -xml_dtd_cleanup(struct xml_context *ctx) -{ - if (!ctx->dtd) - return; - mp_delete(ctx->dtd->pool); - ctx->dtd = NULL; -} - -static void -xml_dtd_finish(struct xml_context *ctx) -{ - if (!ctx->dtd) - return; - // FIXME -} - -/*** Parsing functions ***/ - -/* Comments */ - -static void -xml_push_comment(struct xml_context *ctx) -{ - /* Parse a comment to ctx->value: - * Comment ::= '' - * Already parsed: 'value; - uns c; - xml_parse_char(ctx, '-'); - while (1) - { - if ((c = xml_get_char(ctx)) == '-') - if ((c = xml_get_char(ctx)) == '-') - break; - else - bputc(out, '-'); - bput_utf8_32(out, c); - } - xml_parse_char(ctx, '>'); - xml_dec(ctx); - fbgrow_rewind(out); - if (ctx->h_comment) - ctx->h_comment(ctx); -} - -static void -xml_pop_comment(struct xml_context *ctx) -{ - fbgrow_rewind(ctx->value); -} - -static void -xml_skip_comment(struct xml_context *ctx) -{ - xml_parse_char(ctx, '-'); - while (xml_get_char(ctx) != '-' || xml_get_char(ctx) != '-'); - xml_parse_char(ctx, '>'); - xml_dec(ctx); -} - -/* Processing instructions */ - 
-static void -xml_push_pi(struct xml_context *ctx) -{ - /* Parses a PI to ctx->value and ctx->name: - * PI ::= '' Char*)))? '?>' - * PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) - * Already parsed: 'name = xml_parse_name(ctx); - if (unlikely(!strcasecmp(ctx->name, "xml"))) - xml_fatal(ctx, "Reserved PI target"); - struct fastbuf *out = ctx->value; - if (xml_parse_white(ctx, 0)) - xml_parse_seq(ctx, "?>"); - else - { - while (1) - { - uns c; - if ((c = xml_get_char(ctx)) == '?') - if (xml_get_char(ctx) == '>') - break; - else - { - xml_unget_char(ctx); - bputc(out, '?'); - } - else - bput_utf8_32(out, c); - } - fbgrow_rewind(out); - } - xml_dec(ctx); - if (ctx->h_pi) - ctx->h_pi(ctx); -} - -static void -xml_pop_pi(struct xml_context *ctx) -{ - fbgrow_reset(ctx->value); -} - -static void -xml_skip_pi(struct xml_context *ctx) -{ - if (ctx->flags & XML_FLAG_VALIDATING) - { - mp_push(ctx->pool); - if (unlikely(!strcasecmp(xml_parse_name(ctx), "xml"))) - xml_fatal(ctx, "Reserved PI target"); - mp_pop(ctx->pool); - if (!xml_parse_white(ctx, 0)) - { - xml_parse_seq(ctx, "?>"); - xml_dec(ctx); - return; - } - } - while (1) - if (xml_get_char(ctx) == '?') - if (xml_get_char(ctx) == '>') - break; - else - xml_unget_char(ctx); - xml_dec(ctx); -} - -/* Character references */ - -static uns -xml_parse_char_ref(struct xml_context *ctx) -{ - TRACE(ctx, "parse_char_ref"); - /* CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' - * Already parsed: '&#' */ - uns v = 0; - if (xml_get_char(ctx) == 'x') - { - if (!(xml_get_cat(ctx) & XML_CHAR_XDIGIT)) - { - xml_error(ctx, "Expected a hexadecimal value of character reference"); - goto recover; - } - do - { - v = (v << 4) + Cxvalue(xml_last_char(ctx)); - } - while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_XDIGIT)); - } - else - { - if (!(xml_last_cat(ctx) & XML_CHAR_DIGIT)) - { - xml_error(ctx, "Expected a numeric value of character reference"); - goto recover; - } - do - { - v = v * 10 + xml_last_char(ctx) - '0'; - } - 
while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT)); - } - uns cat = xml_char_cat(v); - if (!(cat & XML_CHAR_UNRESTRICTED_1_1) && ((ctx->flags & XML_FLAG_VERSION_1_1) || !(cat & XML_CHAR_VALID_1_0))) - { - xml_error(ctx, "Character reference out of range"); - goto recover; - } - if (xml_last_char(ctx) == ';') - { - xml_dec(ctx); - return v; - } - xml_error(ctx, "Expected ';'"); -recover: - while (xml_last_char(ctx) != ';') - xml_get_char(ctx); - xml_dec(ctx); - return UNI_REPLACEMENT; -} - -/* References to general entities */ - -static void -xml_parse_ge_ref(struct xml_context *ctx, struct fastbuf *out) -{ - /* Reference ::= EntityRef | CharRef - * EntityRef ::= '&' Name ';' - * Already parsed: '&' */ - if (xml_peek_char(ctx) == '#') - { - xml_skip_char(ctx); - uns c = xml_parse_char_ref(ctx); - bput_utf8_32(out, c); - } - else - { - struct mempool_state state; - mp_save(ctx->pool, &state); - char *name = xml_parse_name(ctx); - xml_parse_char(ctx, ';'); - struct xml_dtd_ent *ent = xml_dtd_find_gent(ctx, name); - if (!ent) - { - xml_error(ctx, "Unknown entity &%s;", name); - bputc(out, '&'); - bputs(out, name); - bputc(out, ';'); - } - else if (ent->flags & XML_DTD_ENT_TRIVIAL) - { - TRACE(ctx, "Trivial entity &%s;", name); - bwrite(out, ent->text, ent->len); - } - else - { - TRACE(ctx, "Pushed entity &%s;", name); - mp_restore(ctx->pool, &state); - xml_dec(ctx); - xml_push_entity(ctx, ent); - return; - } - mp_restore(ctx->pool, &state); - xml_dec(ctx); - } -} - -/* References to parameter entities */ - -static void -xml_parse_pe_ref(struct xml_context *ctx) -{ - /* PEReference ::= '%' Name ';' - * Already parsed: '%' */ - struct mempool_state state; - mp_save(ctx->pool, &state); - char *name = xml_parse_name(ctx); - xml_parse_char(ctx, ';'); - struct xml_dtd_ent *ent = xml_dtd_find_pent(ctx, name); - if (!ent) - xml_error(ctx, "Unknown entity %%%s;", name); - else - { - TRACE(ctx, "Pushed entity %%%s;", name); - mp_restore(ctx->pool, &state); - 
xml_dec(ctx); - xml_push_entity(ctx, ent); - return; - } - mp_restore(ctx->pool, &state); - xml_dec(ctx); -} - -static void -xml_parse_dtd_pe(struct xml_context *ctx) -{ - do - { - xml_skip_char(ctx); - xml_inc(ctx); - while (xml_peek_cat(ctx) & XML_CHAR_WHITE) - xml_skip_char(ctx); - xml_parse_pe_ref(ctx); - } - while (xml_peek_char(ctx) != '%'); -} - -static inline uns -xml_parse_dtd_white(struct xml_context *ctx, uns mandatory) -{ - /* Whitespace or parameter entity */ - uns cnt = 0; - while (xml_peek_cat(ctx) & XML_CHAR_WHITE) - { - xml_skip_char(ctx); - cnt = 1; - } - if (xml_peek_char(ctx) == '%') - { - xml_parse_dtd_pe(ctx); - return 1; - } - else if (unlikely(mandatory && !cnt)) - xml_fatal_expected_white(ctx); - return cnt; -} - -static inline uns -xml_check_dtd_pe(struct xml_context *ctx) -{ - if (xml_peek_char(ctx) == '%') - { - xml_parse_dtd_pe(ctx); - return 1; - } - return 0; -} - -/* External ID */ - -static void -xml_parse_external_id(struct xml_context *ctx, struct xml_ext_id *eid, uns allow_public, uns dtd) -{ - bzero(eid, sizeof(*eid)); - if (dtd) - xml_check_dtd_pe(ctx); - uns c = xml_peek_char(ctx); - if (c == 'S') - { - xml_parse_seq(ctx, "SYSTEM"); - if (dtd) - xml_parse_dtd_white(ctx, 1); - else - xml_parse_white(ctx, 1); - eid->system_id = xml_parse_system_literal(ctx); - } - else if (c == 'P') - { - xml_parse_seq(ctx, "PUBLIC"); - if (dtd) - xml_parse_dtd_white(ctx, 1); - else - xml_parse_white(ctx, 1); - eid->public_id = xml_parse_pubid_literal(ctx); - if (dtd ? 
xml_parse_dtd_white(ctx, 0) : xml_parse_white(ctx, 0)) - if ((c = xml_peek_char(ctx)) == '\'' || c == '"' || !allow_public) - eid->system_id = xml_parse_system_literal(ctx); - } - else - xml_fatal(ctx, "Expected an external ID"); -} - -/* DTD: */ - -static void -xml_parse_notation_decl(struct xml_context *ctx) -{ - /* NotationDecl ::= '' - * Already parsed: 'dtd; - xml_parse_dtd_white(ctx, 1); - - struct xml_dtd_notn *notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx)); - xml_parse_dtd_white(ctx, 1); - struct xml_ext_id eid; - xml_parse_external_id(ctx, &eid, 1, 1); - xml_parse_dtd_white(ctx, 0); - xml_parse_char(ctx, '>'); - - if (notn->flags & XML_DTD_NOTN_DECLARED) - xml_warn(ctx, "Notation %s already declared", notn->name); - else - { - notn->flags = XML_DTD_NOTN_DECLARED; - notn->eid = eid; - slist_add_tail(&dtd->notns, ¬n->n); - } - xml_dec(ctx); -} - -/* DTD: */ - -static void -xml_parse_entity_decl(struct xml_context *ctx) -{ - /* Already parsed: 'dtd; - xml_parse_dtd_white(ctx, 1); - - uns flags = (xml_get_char(ctx) == '%') ? XML_DTD_ENT_PARAMETER : 0; - if (flags) - xml_parse_dtd_white(ctx, 1); - else - xml_unget_char(ctx); - - struct xml_dtd_ent *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_gents, xml_parse_name(ctx)); - slist *list = flags ? 
&dtd->pents : &dtd->gents; - xml_parse_white(ctx, 1); - if (ent->flags & XML_DTD_ENT_DECLARED) - { - xml_fatal(ctx, "Entity &%s; already declared, skipping not implemented", ent->name); - // FIXME: should be only warning - } - - uns c, sep = xml_get_char(ctx); - if (sep == '\'' || sep == '"') - { - /* Internal entity: - * EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&'] | PEReference | Reference)* "'" */ - struct fastbuf *out = ctx->value; - while (1) - { - if ((c = xml_get_char(ctx)) == sep) - break; - else if (c == '%') - { - // FIXME - ASSERT(0); - //xml_parse_parameter_ref(ctx); - } - else if (c != '&') - bput_utf8_32(out, c); - else if ((c = xml_get_char(ctx)) == '#') - c = xml_parse_char_ref(ctx); - else - { - /* Bypass references to general entities */ - mp_push(ctx->pool); - bputc(out, '&'); - xml_unget_char(ctx); - bputs(out, xml_parse_name(ctx)); - xml_parse_char(ctx, ';'); - bputc(out, ';'); - mp_pop(ctx->pool); - } - } - bputc(out, 0); - fbgrow_rewind(out); - slist_add_tail(list, &ent->n); - ent->flags = flags | XML_DTD_ENT_DECLARED; - ent->len = out->bstop - out->bptr - 1; - ent->text = mp_memdup(ctx->pool, out->bptr, ent->len + 1); - fbgrow_reset(out); - } - else - { - /* External entity */ - struct xml_ext_id eid; - struct xml_dtd_notn *notn = NULL; - xml_parse_external_id(ctx, &eid, 0, 0); - if (!xml_parse_white(ctx, 0) || !flags) - xml_parse_char(ctx, '>'); - else if (xml_get_char(ctx) != '>') - { - /* General external unparsed entity */ - flags |= XML_DTD_ENT_UNPARSED; - xml_parse_seq(ctx, "NDATA"); - xml_parse_white(ctx, 1); - notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx)); - } - slist_add_tail(list, &ent->n); - ent->flags = flags | XML_DTD_ENT_DECLARED | XML_DTD_ENT_EXTERNAL; - ent->eid = eid; - ent->notn = notn; - } - xml_parse_dtd_white(ctx, 0); - xml_parse_char(ctx, '>'); - xml_dec(ctx); -} - -/* DTD: */ - -static void -xml_parse_element_decl(struct xml_context *ctx) -{ - /* Elementdecl ::= '' - * 
Already parsed: 'dtd; - struct xml_dtd_elem *elem = xml_dtd_elems_lookup(dtd->tab_elems, name); - if (elem->flags & XML_DTD_ELEM_DECLARED) - xml_fatal(ctx, "Element <%s> already declared", name); - - /* contentspec ::= 'EMPTY' | 'ANY' | Mixed | children */ - uns c = xml_peek_char(ctx); - if (c == 'E') - { - xml_parse_seq(ctx, "EMPTY"); - elem->type = XML_DTD_ELEM_EMPTY; - } - else if (c == 'A') - { - xml_parse_seq(ctx, "ANY"); - elem->type = XML_DTD_ELEM_ANY; - } - else if (c == '(') - { - xml_skip_char(ctx); - xml_inc(ctx); - xml_parse_dtd_white(ctx, 0); - struct xml_dtd_elem_node *parent = elem->node = mp_alloc_zero(dtd->pool, sizeof(*parent)); - if (xml_peek_char(ctx) == '#') - { - /* Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | '(' S? '#PCDATA' S? ')' */ - xml_skip_char(ctx); - xml_parse_seq(ctx, "PCDATA"); - elem->type = XML_DTD_ELEM_MIXED; - parent->type = XML_DTD_ELEM_PCDATA; - while (1) - { - xml_parse_dtd_white(ctx, 0); - if ((c = xml_get_char(ctx)) == ')') - break; - else if (c != '|') - xml_fatal_expected(ctx, ')'); - xml_parse_dtd_white(ctx, 0); - struct xml_dtd_elem *son_elem = xml_dtd_elems_lookup(dtd->tab_elems, xml_parse_name(ctx)); - if (xml_dtd_enodes_find(dtd->tab_enodes, parent, son_elem)) - xml_error(ctx, "Duplicate content '%s'", son_elem->name); - else - { - struct xml_dtd_elem_node *son = xml_dtd_enodes_new(dtd->tab_enodes, parent, son_elem); - slist_add_tail(&parent->sons, &son->n); - } - } - xml_dec(ctx); - if (xml_peek_char(ctx) == '*') - { - xml_skip_char(ctx); - parent->occur = XML_DTD_ELEM_OCCUR_MULT; - } - else if (!slist_head(&parent->sons)) - parent->occur = XML_DTD_ELEM_OCCUR_ONCE; - else - xml_fatal_expected(ctx, '*'); - } - else - { - /* children ::= (choice | seq) ('?' | '*' | '+')? - * cp ::= (Name | choice | seq) ('?' | '*' | '+')? - * choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')' - * seq ::= '(' S? cp ( S? ',' S? cp )* S? 
')' */ - - elem->type = XML_DTD_ELEM_CHILDREN; - parent->type = XML_DTD_ELEM_PCDATA; - uns c; - goto first; - - while (1) - { - /* After name */ - xml_parse_dtd_white(ctx, 0); - if ((c = xml_get_char(ctx)) == ')') - { - xml_dec(ctx); - if (parent->type == XML_DTD_ELEM_PCDATA) - parent->type = XML_DTD_ELEM_SEQ; - if ((c = xml_get_char(ctx)) == '?') - parent->occur = XML_DTD_ELEM_OCCUR_OPT; - else if (c == '*') - parent->occur = XML_DTD_ELEM_OCCUR_MULT; - else if (c == '+') - parent->occur = XML_DTD_ELEM_OCCUR_PLUS; - else - { - xml_unget_char(ctx); - parent->occur = XML_DTD_ELEM_OCCUR_ONCE; - } - if (!parent->parent) - break; - parent = parent->parent; - continue; - } - else if (c == '|') - { - if (parent->type == XML_DTD_ELEM_PCDATA) - parent->type = XML_DTD_ELEM_OR; - else if (parent->type != XML_DTD_ELEM_OR) - xml_fatal(ctx, "Mixed operators in the list of element children"); - } - else if (c == ',') - { - if (parent->type == XML_DTD_ELEM_PCDATA) - parent->type = XML_DTD_ELEM_SEQ; - else if (parent->type != XML_DTD_ELEM_SEQ) - xml_fatal(ctx, "Mixed operators in the list of element children"); - } - else if (c == '(') - { - xml_inc(ctx); - struct xml_dtd_elem_node *son = mp_alloc_zero(dtd->pool, sizeof(*son)); - son->parent = parent; - slist_add_tail(&parent->sons, &son->n); - parent = son->parent; - son->type = XML_DTD_ELEM_MIXED; - } - else - xml_unget_char(ctx); - - /* Before name */ - xml_parse_dtd_white(ctx, 0); -first:; - struct xml_dtd_elem *son_elem = xml_dtd_elems_lookup(dtd->tab_elems, xml_parse_name(ctx)); - // FIXME: duplicates, occurance - //struct xml_dtd_elem_node *son = xml_dtd_enodes_new(dtd->tab_enodes, parent, son_elem); - struct xml_dtd_elem_node *son = mp_alloc_zero(dtd->pool, sizeof(*son)); - son->parent = parent; - son->elem = son_elem; - slist_add_tail(&parent->sons, &son->n); - } - } - } - else - xml_fatal(ctx, "Expected element content specification"); - - xml_parse_dtd_white(ctx, 0); - xml_parse_char(ctx, '>'); - xml_dec(ctx); -} - 
-static char * -xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr UNUSED) -{ - uns quote = xml_parse_quote(ctx); - xml_push(ctx); - struct fastbuf *out = ctx->value; - while (1) - { - uns c = xml_get_char(ctx); - if (c == '&') - { - xml_inc(ctx); - xml_parse_ge_ref(ctx, out); - } - else if (c == quote) // FIXME: beware quotes inside parsed - break; - else if (c == '<') - xml_error(ctx, "Attribute value must not contain '<'"); - else - bput_utf8_32(out, c); - } - xml_pop(ctx); - bputc(out, 0); - fbgrow_rewind(out); - char *value = mp_memdup(ctx->pool, out->bptr, out->bstop - out->bptr); - // FIXME: check value constraints / normalize value - fbgrow_reset(out); - return value; -} - -static void -xml_parse_attr_list_decl(struct xml_context *ctx) -{ - /* AttlistDecl ::= '' - * AttDef ::= S Name S AttType S DefaultDecl - * Already parsed: 'dtd->tab_elems, xml_parse_name(ctx)); - - while (xml_parse_dtd_white(ctx, 0) && xml_peek_char(ctx) != '>') - { - char *name = xml_parse_name(ctx); - struct xml_dtd_attr *attr = xml_dtd_attrs_find(ctx->dtd->tab_attrs, elem, name); - uns ignored = 0; - if (attr) - { - xml_warn(ctx, "Duplicate attribute definition"); - ignored++; - } - else - attr = xml_dtd_attrs_new(ctx->dtd->tab_attrs, elem, name); - xml_parse_dtd_white(ctx, 1); - if (xml_peek_char(ctx) == '(') - { - xml_skip_char(ctx); // FIXME: xml_inc/dec ? 
- if (!ignored) - attr->type = XML_ATTR_ENUM; - do - { - xml_parse_dtd_white(ctx, 0); - char *value = xml_parse_nmtoken(ctx); - if (!ignored) - if (xml_dtd_evals_find(ctx->dtd->tab_evals, attr, value)) - xml_error(ctx, "Duplicate enumeration value"); - else - xml_dtd_evals_new(ctx->dtd->tab_evals, attr, value); - xml_parse_dtd_white(ctx, 0); - } - while (xml_get_char(ctx) == '|'); - xml_unget_char(ctx); - xml_parse_char(ctx, ')'); - } - else - { - char *type = xml_parse_name(ctx); - enum xml_dtd_attribute_type t; - if (!strcmp(type, "CDATA")) - t = XML_ATTR_CDATA; - else if (!strcmp(type, "ID")) - t = XML_ATTR_ID; - else if (!strcmp(type, "IDREF")) - t = XML_ATTR_IDREF; - else if (!strcmp(type, "IDREFS")) - t = XML_ATTR_IDREFS; - else if (!strcmp(type, "ENTITY")) - t = XML_ATTR_ENTITY; - else if (!strcmp(type, "ENTITIES")) - t = XML_ATTR_ENTITIES; - else if (!strcmp(type, "NMTOKEN")) - t = XML_ATTR_NMTOKEN; - else if (!strcmp(type, "NMTOKENS")) - t = XML_ATTR_NMTOKENS; - else if (!strcmp(type, "NOTATION")) - { - if (elem->type == XML_DTD_ELEM_EMPTY) - xml_fatal(ctx, "Empty element must not have notation attribute"); - // FIXME: An element type MUST NOT have more than one NOTATION attribute specified. 
- t = XML_ATTR_NOTATION; - xml_parse_dtd_white(ctx, 1); - xml_parse_char(ctx, '('); - do - { - xml_parse_dtd_white(ctx, 0); - struct xml_dtd_notn *n = xml_dtd_notns_lookup(ctx->dtd->tab_notns, xml_parse_name(ctx)); - if (!ignored) - if (xml_dtd_enotns_find(ctx->dtd->tab_enotns, attr, n)) - xml_error(ctx, "Duplicate enumerated notation"); - else - xml_dtd_enotns_new(ctx->dtd->tab_enotns, attr, n); - xml_parse_dtd_white(ctx, 0); - } - while (xml_get_char(ctx) == '|'); - xml_unget_char(ctx); - xml_parse_char(ctx, ')'); - } - else - xml_fatal(ctx, "Unknown attribute type"); - if (!ignored) - attr->type = t; - } - xml_parse_dtd_white(ctx, 1); - enum xml_dtd_attribute_default def = XML_ATTR_NONE; - if (xml_get_char(ctx) == '#') - switch (xml_peek_char(ctx)) - { - case 'R': - xml_parse_seq(ctx, "REQUIRED"); - def = XML_ATTR_REQUIRED; - break; - case 'I': - xml_parse_seq(ctx, "IMPLIED"); - def = XML_ATTR_IMPLIED; - break; - case 'F': - xml_parse_seq(ctx, "FIXED"); - def = XML_ATTR_FIXED; - xml_parse_dtd_white(ctx, 1); - break; - default: - xml_fatal(ctx, "Expected a modifier for default attribute value"); - } - else - xml_unget_char(ctx); - if (def != XML_ATTR_REQUIRED && def != XML_ATTR_IMPLIED) - { - char *v = xml_parse_attr_value(ctx, attr); - if (!ignored) - attr->default_value = v; - } - if (!ignored) - attr->default_mode = def; - } - xml_skip_char(ctx); - xml_dec(ctx); -} - -/* DTD: Internal subset */ - -static void -xml_parse_internal_subset(struct xml_context *ctx) -{ - // FIXME: comments/pi have no parent - /* '[' intSubset ']' - * intSubset :== (markupdecl | DeclSep) - * Already parsed: ']' */ - while (1) - { - xml_parse_white(ctx, 0); - uns c = xml_get_char(ctx); - xml_inc(ctx); - if (c == '<') - if ((c = xml_get_char(ctx)) == '!') - switch (c = xml_get_char(ctx)) - { - case '-': - xml_push_comment(ctx); - xml_pop_comment(ctx); - break; - case 'N': - xml_parse_seq(ctx, "OTATION"); - xml_parse_notation_decl(ctx); - break; - case 'E': - if ((c = xml_get_char(ctx)) 
== 'N') - { - xml_parse_seq(ctx, "TITY"); - xml_parse_entity_decl(ctx); - } - else if (c == 'L') - { - xml_parse_seq(ctx, "EMENT"); - xml_parse_element_decl(ctx); - } - else - goto invalid_markup; - break; - case 'A': - xml_parse_seq(ctx, "TTLIST"); - xml_parse_attr_list_decl(ctx); - break; - default: - goto invalid_markup; - } - else if (c == '?') - { - xml_push_pi(ctx); - xml_pop_pi(ctx); - } - else - goto invalid_markup; - else if (c == '%') - xml_parse_dtd_pe(ctx); - else if (c == ']') - break; - else - goto invalid_markup; - } - xml_dec(ctx); - xml_dec(ctx); - return; -invalid_markup: - xml_fatal(ctx, "Invalid markup in the internal subset"); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////// - -static void -xml_parse_cdata(struct xml_context *ctx) -{ - struct fastbuf *out = ctx->chars; - xml_parse_seq(ctx, "CDATA["); - while (1) - { - uns c; - if ((c = xml_get_char(ctx)) == ']') - { - if ((c = xml_get_char(ctx)) == ']') - if ((c = xml_get_char(ctx)) == '>') - break; - else - bputc(out, ']'); - bputc(out, ']'); - } - bput_utf8_32(out, c); - } -} - -static void -xml_skip_cdata(struct xml_context *ctx) -{ - xml_parse_cdata(ctx); -} - -static void -xml_parse_chars(struct xml_context *ctx) -{ - TRACE(ctx, "parse_chars"); - struct fastbuf *out = ctx->chars; - uns c; - while ((c = xml_get_char(ctx)) != '<') - if (c == '&') - { - xml_inc(ctx); - xml_parse_ge_ref(ctx, out); - } - else - bput_utf8_32(out, c); - xml_unget_char(ctx); -} - -/*----------------------------------------------*/ - -struct xml_attrs_table; - -static inline uns -xml_attrs_hash(struct xml_attrs_table *t UNUSED, struct xml_elem *e, char *n) -{ - return hash_pointer(e) ^ hash_string(n); -} - -static inline int -xml_attrs_eq(struct xml_attrs_table *t UNUSED, struct xml_elem *e1, char *n1, struct xml_elem *e2, char *n2) -{ - return (e1 == e2) && !strcmp(n1, n2); -} - -static inline void -xml_attrs_init_key(struct xml_attrs_table *t 
UNUSED, struct xml_attr *a, struct xml_elem *e, char *name) -{ - a->elem = e; - a->name = name; - a->val = NULL; - slist_add_tail(&e->attrs, &a->n); -} - -#define HASH_PREFIX(x) xml_attrs_##x -#define HASH_NODE struct xml_attr -#define HASH_KEY_COMPLEX(x) x elem, x name -#define HASH_KEY_DECL struct xml_elem *elem, char *name -#define HASH_TABLE_DYNAMIC -#define HASH_GIVE_EQ -#define HASH_GIVE_HASHFN -#define HASH_GIVE_INIT_KEY -#define HASH_WANT_CLEANUP -#define HASH_WANT_REMOVE -#define HASH_WANT_LOOKUP -#define HASH_WANT_FIND -#define HASH_GIVE_ALLOC -XML_HASH_GIVE_ALLOC -#include "lib/hashtable.h" - -void -xml_init(struct xml_context *ctx) -{ - bzero(ctx, sizeof(*ctx)); - ctx->pool = mp_new(65536); - ctx->chars = fbgrow_create(4096); - ctx->value = fbgrow_create(4096); - xml_dtd_init(ctx); - xml_attrs_init(ctx->tab_attrs = xml_hash_new(ctx->pool, sizeof(struct xml_attrs_table))); -} - -void -xml_cleanup(struct xml_context *ctx) -{ - xml_attrs_cleanup(ctx->tab_attrs); - xml_dtd_cleanup(ctx); - bclose(ctx->value); - bclose(ctx->chars); - mp_delete(ctx->pool); -} - -static void -xml_parse_attr(struct xml_context *ctx) -{ - // FIXME: memory management, dtd, literal - TRACE(ctx, "parse_attr"); - struct xml_elem *e = ctx->elem; - char *name = xml_parse_name(ctx); - struct xml_attr *a = xml_attrs_lookup(ctx->tab_attrs, e, name); - xml_parse_eq(ctx); - char *val =xml_parse_system_literal(ctx); - if (a->val) - xml_error(ctx, "Attribute is not unique"); - else - a->val = val; -} - -static void -xml_parse_stag(struct xml_context *ctx) -{ - // FIXME: dtd - TRACE(ctx, "parse_stag"); - xml_push(ctx); - struct xml_elem *e = mp_alloc_zero(ctx->pool, sizeof(*e)); - struct xml_elem *parent = ctx->elem; - clist_init(&e->sons); - e->node.parent = (void *)parent; - ctx->elem = e; - e->name = xml_parse_name(ctx); - if (parent) - clist_add_tail(&parent->sons, &e->node.n); - else - { - ctx->root = e; - if (ctx->document_type && strcmp(e->name, ctx->document_type)) - xml_error(ctx, 
"The root element does not match the document type"); - } - while (1) - { - uns white = xml_parse_white(ctx, 0); - uns c = xml_get_char(ctx); - if (c == '/') - { - xml_parse_char(ctx, '>'); - ctx->flags |= XML_FLAG_EMPTY_ELEM; - break; - } - else if (c == '>') - break; - else if (!white) - xml_fatal_expected_white(ctx); - xml_unget_char(ctx); - xml_parse_attr(ctx); - } - if (ctx->h_element_start) - ctx->h_element_start(ctx); -} - -static void -xml_parse_etag(struct xml_context *ctx) -{ - TRACE(ctx, "parse_etag"); - struct xml_elem *e = ctx->elem; - ASSERT(e); - char *name = xml_parse_name(ctx); - if (strcmp(name, e->name)) - xml_fatal(ctx, "Invalid ETag, expected '%s'", e->name); - xml_parse_white(ctx, 0); - xml_parse_char(ctx, '>'); - xml_dec(ctx); -} - -static void -xml_pop_element(struct xml_context *ctx) -{ - TRACE(ctx, "pop_element"); - if (ctx->h_element_end) - ctx->h_element_end(ctx); - struct xml_elem *e = ctx->elem; - if (ctx->flags & XML_DOM_FREE) - { - if (e->node.parent) - clist_remove(&e->node.n); - else - ctx->root = NULL; - SLIST_FOR_EACH(struct xml_attr *, a, e->attrs) - xml_attrs_remove(ctx->tab_attrs, a); - struct xml_node *n; - while (n = clist_head(&e->sons)) - { - if (n->type == XML_NODE_ELEM) - { - SLIST_FOR_EACH(struct xml_attr *, a, ((struct xml_elem *)n)->attrs) - xml_attrs_remove(ctx->tab_attrs, a); - clist_insert_list_after(&((struct xml_elem *)n)->sons, &n->n); - } - clist_remove(&n->n); - } - } - ctx->node = e->node.parent; - xml_pop(ctx); // FIXME: memory management without XML_DOM_FREE - xml_dec(ctx); -#if 0 - for (struct xml_attribute *a = e->attrs; a; a = a->next) - xml_attribute_remove(ctx->attribute_table, a); -#endif -} - -static void -xml_parse_doctype_decl(struct xml_context *ctx) -{ - if (ctx->document_type) - xml_fatal(ctx, "Multiple document types not allowed"); - xml_parse_seq(ctx, "DOCTYPE"); - xml_parse_white(ctx, 1); - ctx->document_type = xml_parse_name(ctx); - TRACE(ctx, "doctyype=%s", ctx->document_type); - uns white 
= xml_parse_white(ctx, 0); - uns c = xml_peek_char(ctx); - if (c != '>' && c != '[' && white) - { - xml_parse_external_id(ctx, &ctx->eid, 0, 0); - xml_parse_white(ctx, 0); - ctx->flags |= XML_FLAG_HAS_EXTERNAL_SUBSET; - } - if (xml_peek_char(ctx) == '[') - ctx->flags |= XML_FLAG_HAS_INTERNAL_SUBSET; - if (ctx->h_doctype_decl) - ctx->h_doctype_decl(ctx); -} - -int -xml_next(struct xml_context *ctx) -{ - /* A nasty state machine */ - - TRACE(ctx, "xml_next (state=%u)", ctx->state); - jmp_buf throw_buf; - ctx->throw_buf = &throw_buf; - if (setjmp(throw_buf)) - { -error: - if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal) - ctx->h_fatal(ctx); - ctx->state = XML_STATE_FATAL; - TRACE(ctx, "raised fatal error"); - return -1; - } - uns c; - switch (ctx->state) - { - case XML_STATE_FATAL: - return -1; - - case XML_STATE_START: - TRACE(ctx, "entering prolog"); - if (ctx->h_document_start) - ctx->h_document_start(ctx); - /* XMLDecl */ - xml_refill(ctx); - if (ctx->h_xml_decl) - ctx->h_xml_decl(ctx); - if (ctx->want & XML_WANT_DECL) - return ctx->state = XML_STATE_DECL; - case XML_STATE_DECL: - - /* Misc* (doctypedecl Misc*)? 
*/ - while (1) - { - xml_parse_white(ctx, 0); - xml_parse_char(ctx, '<'); - if ((c = xml_get_char(ctx)) == '?') - /* Processing intruction */ - if (!(ctx->want & XML_WANT_PI)) - xml_skip_pi(ctx); - else - { - xml_push_pi(ctx); - ctx->state = XML_STATE_PROLOG_PI; - return XML_STATE_PI; - case XML_STATE_PROLOG_PI: - xml_pop_pi(ctx); - } - else if (c != '!') - { - /* Found the root tag */ - xml_unget_char(ctx); - goto first_tag; - } - else if (xml_get_char(ctx) == '-') - if (!(ctx->want & XML_WANT_COMMENT)) - xml_skip_comment(ctx); - else - { - xml_push_comment(ctx); - ctx->state = XML_STATE_PROLOG_COMMENT; - return XML_STATE_COMMENT; - case XML_STATE_PROLOG_COMMENT: - xml_pop_comment(ctx); - } - else - { - /* DocTypeDecl */ - xml_unget_char(ctx); - xml_parse_doctype_decl(ctx); - if (ctx->want & XML_WANT_DOCUMENT_TYPE) - return ctx->state = XML_STATE_DOCUMENT_TYPE; - case XML_STATE_DOCUMENT_TYPE: - if (xml_peek_char(ctx) == '[') - { - xml_skip_char(ctx); - xml_inc(ctx); - xml_parse_internal_subset(ctx); - xml_parse_white(ctx, 0); - } - xml_parse_char(ctx, '>'); - } - } - - case XML_STATE_PI: - mp_pop(ctx->pool); - case XML_STATE_COMMENT: - fbgrow_reset(ctx->value); - - case XML_STATE_CHARS: - - while (1) - { - if (xml_peek_char(ctx) != '<') - { - /* CharData */ - xml_parse_chars(ctx); - continue; - } - else - xml_skip_char(ctx); -first_tag: ; - - xml_inc(ctx); - if ((c = xml_get_char(ctx)) == '?') - { - /* PI */ - if (!(ctx->want & XML_WANT_PI)) - xml_skip_pi(ctx); - else - { - if (btell(ctx->chars)) - { - fbgrow_rewind(ctx->chars); - ctx->state = XML_STATE_CHARS_BEFORE_PI; - return XML_STATE_PI; - case XML_STATE_CHARS_BEFORE_PI: - fbgrow_reset(ctx->chars); - } - xml_push_pi(ctx); - return ctx->state = XML_STATE_PI; - } - } - - else if (c == '!') - if ((c = xml_get_char(ctx)) == '-') - { - /* Comment */ - if (!(ctx->want & XML_WANT_COMMENT)) - xml_skip_comment(ctx); - else - { - if (btell(ctx->chars)) - { - fbgrow_rewind(ctx->chars); - ctx->state = 
XML_STATE_CHARS_BEFORE_COMMENT; - return XML_STATE_CHARS; - case XML_STATE_CHARS_BEFORE_COMMENT: - fbgrow_reset(ctx->chars); - } - xml_push_comment(ctx); - return ctx->state = XML_STATE_COMMENT; - } - } - else if (c == '[') - { - /* CDATA */ - if (!(ctx->want & XML_WANT_CDATA)) - xml_skip_cdata(ctx); - else - { - if (btell(ctx->chars)) - { - fbgrow_rewind(ctx->chars); - ctx->state = XML_STATE_CHARS_BEFORE_CDATA; - return XML_STATE_CHARS; - case XML_STATE_CHARS_BEFORE_CDATA: - fbgrow_reset(ctx->chars); - } - xml_parse_cdata(ctx); - if (btell(ctx->chars)) - { - fbgrow_rewind(ctx->chars); - return ctx->state = XML_STATE_CDATA; - } - case XML_STATE_CDATA: - fbgrow_reset(ctx->chars); - } - } - else - xml_fatal(ctx, "Unexpected character after 'chars)) - { - fbgrow_rewind(ctx->chars); - ctx->state = XML_STATE_CHARS_BEFORE_STAG; - return XML_STATE_CHARS; - case XML_STATE_CHARS_BEFORE_STAG: - fbgrow_reset(ctx->chars); - } - - xml_parse_stag(ctx); - if (ctx->want & XML_WANT_STAG) - return ctx->state = XML_STATE_STAG; - case XML_STATE_STAG: - if (ctx->flags & XML_FLAG_EMPTY_ELEM) - goto pop_element; - } - - else - { - /* ETag */ - if (btell(ctx->chars)) - { - fbgrow_rewind(ctx->chars); - ctx->state = XML_STATE_CHARS_BEFORE_ETAG; - return XML_STATE_CHARS; - case XML_STATE_CHARS_BEFORE_ETAG: - fbgrow_reset(ctx->chars); - } - - xml_parse_etag(ctx); -pop_element: - if (ctx->want & XML_WANT_ETAG) - return ctx->state = XML_STATE_ETAG; - case XML_STATE_ETAG: - xml_pop_element(ctx); - if (!ctx->elem) - goto epilog; - } - } - -epilog: - /* Misc* */ - TRACE(ctx, "entering epilog"); - while (1) - { - /* Epilog whitespace is the only place, where a valid document can reach EOF */ - if (setjmp(throw_buf)) - if (ctx->err_code == XML_ERR_EOF) - { - TRACE(ctx, "reached EOF"); - ctx->state = XML_STATE_EOF; - if (ctx->h_document_end) - ctx->h_document_end(ctx); - case XML_STATE_EOF: - return XML_STATE_EOF; - } - else - goto error; - xml_parse_white(ctx, 0); - if (setjmp(throw_buf)) - goto 
error; - - /* Misc */ - xml_parse_char(ctx, '<'); - if ((c = xml_get_char(ctx)) == '?') - /* Processing instruction */ - if (!(ctx->want & XML_WANT_PI)) - xml_skip_pi(ctx); - else - { - xml_push_pi(ctx); - return ctx->state = XML_STATE_EPILOG_PI, XML_STATE_PI; - case XML_STATE_EPILOG_PI: - xml_pop_pi(ctx); - } - else if (c == '!') - /* Comment */ - if (!(ctx->want & XML_WANT_COMMENT)) - xml_skip_comment(ctx); - else - { - xml_push_comment(ctx); - return ctx->state = XML_STATE_EPILOG_COMMENT, XML_STATE_COMMENT; - case XML_STATE_EPILOG_COMMENT: - xml_pop_comment(ctx); - } - else - xml_fatal(ctx, "Syntax error in the epilog"); - } - - } - return -1; -} - -#ifdef TEST - -static void -error(struct xml_context *ctx) -{ - msg((ctx->err_code < XML_ERR_ERROR) ? L_WARN_R : L_ERROR_R, "XML %u: %s", xml_row(ctx), ctx->err_msg); -} - -static void -test(struct fastbuf *in, struct fastbuf *out) -{ - struct xml_context ctx; - xml_init(&ctx); - ctx.h_warn = ctx.h_error = ctx.h_fatal = error; - ctx.want = XML_WANT_ALL; - ctx.flags |= XML_DOM_FREE; - xml_set_source(&ctx, in); - int state; - while ((state = xml_next(&ctx)) >= 0) - switch (state) - { - case XML_STATE_CHARS: - bprintf(out, "CHARS [%.*s]\n", (int)(ctx.chars->bstop - ctx.chars->buffer), ctx.chars->buffer); - break; - case XML_STATE_STAG: - bprintf(out, "STAG <%s>\n", ctx.elem->name); - SLIST_FOR_EACH(struct xml_attr *, a, ctx.elem->attrs) - bprintf(out, " ATTR %s=[%s]\n", a->name, a->val); - break; - case XML_STATE_ETAG: - bprintf(out, "ETAG \n", ctx.elem->name); - break; - case XML_STATE_COMMENT: - bprintf(out, "COMMENT [%.*s]\n", (int)(ctx.value->bstop - ctx.value->buffer), ctx.value->buffer); - break; - case XML_STATE_PI: - bprintf(out, "PI [%s] [%.*s]\n", ctx.name, (int)(ctx.value->bstop - ctx.value->buffer), ctx.value->buffer); - break; - case XML_STATE_CDATA: - bprintf(out, "CDATA [%.*s]\n", (int)(ctx.chars->bstop - ctx.chars->buffer), ctx.chars->buffer); - break; - case XML_STATE_EOF: - bprintf(out, "EOF\n"); - 
goto end; - break; - } -end: - xml_cleanup(&ctx); -} - -int -main(void) -{ - struct fastbuf *in = bfdopen_shared(0, 1024); - struct fastbuf *out = bfdopen_shared(1, 1024); - test(in, out); - bclose(out); - return 0; -} - -#endif diff --git a/sherlock/xml/xml.h b/sherlock/xml/xml.h index 7e83f65a..db6ab6c6 100644 --- a/sherlock/xml/xml.h +++ b/sherlock/xml/xml.h @@ -13,6 +13,7 @@ #include "lib/clists.h" #include "lib/slists.h" #include "lib/mempool.h" +#include "lib/fastbuf.h" enum xml_error { XML_ERR_OK = 0, @@ -98,31 +99,29 @@ struct xml_node { cnode n; /* Node for list of parent's sons */ uns type; /* XML_NODE_x */ struct xml_node *parent; /* Parent node */ -}; - -struct xml_elem { - struct xml_node node; - char *name; /* Element name */ - clist sons; /* List of subnodes */ - struct xml_dtd_elem *dtd; /* Element DTD */ - slist attrs; /* Link list of attributes */ + char *name; /* Element name / PI target */ + clist sons; /* Children nodes */ + union { + struct { + char *text; /* PI text / Comment / CDATA */ + uns len; /* Text length in bytes */ + }; + struct { + struct xml_dtd_elem *dtd; /* Element DTD */ + slist attrs; /* Link list of element attributes */ + }; + }; }; struct xml_attr { snode n; - struct xml_elem *elem; + struct xml_node *elem; char *name; char *val; }; struct xml_context; -struct xml_stack { - struct xml_stack *next; /* Link list of stack records */ - uns saved_flags; /* Saved ctx->flags */ - struct mempool_state saved_pool; /* Saved ctx->pool state */ -}; - #define XML_BUF_SIZE 32 /* At least 16 -- hardcoded */ struct xml_source { @@ -152,16 +151,13 @@ struct xml_context { void (*h_fatal)(struct xml_context *ctx); /* Unrecoverable error callback */ /* Memory management */ - struct mempool *pool; /* Most data */ - struct fastbuf *chars; /* Character data */ - struct fastbuf *value; /* Attribute value / comment / processing instruction data */ - char *name; /* Attribute name, processing instruction target */ - void *tab_attrs; - - /* Stack */ - 
struct xml_stack *stack; /* See xml_push(), xml_pop() */ + struct mempool *pool; /* DOM pool */ + struct mempool *stack; /* Stack pool (freed as soon as possible) */ + struct xml_stack *stack_list; /* See xml_push(), xml_pop() */ uns flags; /* XML_FLAG_x (restored on xml_pop()) */ uns depth; /* Nesting level */ + struct fastbuf chars; /* Character data / attribute value */ + void *tab_attrs; /* Input */ struct xml_source *src; /* Current source */ @@ -172,17 +168,16 @@ struct xml_context { void (*h_document_end)(struct xml_context *ctx); /* Called after leaving epilog */ void (*h_xml_decl)(struct xml_context *ctx); /* Called after the XML declaration */ void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration just before internal subset */ - void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction */ void (*h_comment)(struct xml_context *ctx); /* Called after a comment */ + void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction */ void (*h_element_start)(struct xml_context *ctx); /* Called after STag or EmptyElemTag */ void (*h_element_end)(struct xml_context *ctx); /* Called before ETag or after EmptyElemTag */ + void (*h_chars)(struct xml_context *ctx); /* Called after some characters */ + void (*h_cdata)(struct xml_context *ctx); /* Called after a CDATA section */ /* DOM */ - struct xml_elem *root; /* DOM root */ - union { - struct xml_node *node; /* Current DOM node */ - struct xml_elem *elem; /* Current element */ - }; + struct xml_node *root; /* DOM root */ + struct xml_node *node; /* Current DOM node */ char *version_str; uns standalone; @@ -194,11 +189,8 @@ struct xml_context { void (*start_dtd)(struct xml_context *ctx); void (*end_dtd)(struct xml_context *ctx); - void (*start_cdata)(struct xml_context *ctx); - void (*end_cdata)(struct xml_context *ctx); void (*start_entity)(struct xml_context *ctx); void (*end_entity)(struct xml_context *ctx); - void (*chacacters)(struct 
xml_context *ctx); struct fastbuf *(*resolve_entity)(struct xml_context *ctx); void (*notation_decl)(struct xml_context *ctx); void (*unparsed_entity_decl)(struct xml_context *ctx); @@ -208,5 +200,6 @@ void xml_init(struct xml_context *ctx); void xml_cleanup(struct xml_context *ctx); void xml_set_source(struct xml_context *ctx, struct fastbuf *fb); int xml_next(struct xml_context *ctx); +uns xml_row(struct xml_context *ctx); #endif