From 4041eceea24b578a0c5edaf0b82361283e6bbafb Mon Sep 17 00:00:00 2001 From: Pavel Charvat Date: Wed, 12 Dec 2007 23:10:30 +0100 Subject: [PATCH] XML: The great reorganization... several improvements in the iface, see xml-test.c and xml.h for the details. --- sherlock/xml/Makefile | 2 +- sherlock/xml/common.c | 605 ++++------------------------------------ sherlock/xml/common.h | 121 ++++---- sherlock/xml/dtd.c | 46 +-- sherlock/xml/dtd.h | 15 +- sherlock/xml/parse.c | 397 +++++++++++++++----------- sherlock/xml/source.c | 495 ++++++++++++++++++++++++++++++++ sherlock/xml/xml-test.c | 197 ++++++++----- sherlock/xml/xml.h | 221 ++++++++------- 9 files changed, 1143 insertions(+), 956 deletions(-) create mode 100644 sherlock/xml/source.c diff --git a/sherlock/xml/Makefile b/sherlock/xml/Makefile index cc9fda50..a265b96d 100644 --- a/sherlock/xml/Makefile +++ b/sherlock/xml/Makefile @@ -4,7 +4,7 @@ DIRS+=sherlock/xml PROGS+=$(o)/sherlock/xml/xml-test -LIBSHXML_MODS=common parse dtd +LIBSHXML_MODS=common source parse dtd LIBSHXML_INCLUDES=xml.h dtd.h LIBSHXML_MOD_PATHS=$(addprefix $(o)/sherlock/xml/,$(LIBSHXML_MODS)) diff --git a/sherlock/xml/common.c b/sherlock/xml/common.c index 4d96cecc..0c516ea9 100644 --- a/sherlock/xml/common.c +++ b/sherlock/xml/common.c @@ -7,23 +7,14 @@ * of the GNU Lesser General Public License. */ -#define LOCAL_DEBUG +#undef LOCAL_DEBUG -#include "lib/lib.h" -#include "lib/mempool.h" -#include "lib/fastbuf.h" -#include "lib/ff-unicode.h" -#include "lib/ff-binary.h" -#include "lib/chartype.h" -#include "lib/unicode.h" -#include "lib/hashfunc.h" -#include "lib/stkstring.h" -#include "lib/unaligned.h" -#include "charset/charconv.h" -#include "charset/fb-charconv.h" +#include "sherlock/sherlock.h" #include "sherlock/xml/xml.h" #include "sherlock/xml/dtd.h" #include "sherlock/xml/common.h" +#include "lib/stkstring.h" +#include "lib/ff-unicode.h" #include @@ -75,25 +66,15 @@ xml_fatal(struct xml_context *ctx, const char *format, ...) 
va_start(args, format); ctx->err_msg = mp_vprintf(ctx->stack, format, args); ctx->err_code = XML_ERR_FATAL; - ctx->state = XML_STATE_FATAL; + ctx->state = XML_STATE_EOF; va_end(args); if (ctx->h_fatal) ctx->h_fatal(ctx); xml_throw(ctx); } -/*** Charecter categorization ***/ - -#include "obj/sherlock/xml/unicat.c" - /*** Memory management ***/ -void NONRET -xml_fatal_nested(struct xml_context *ctx) -{ - xml_fatal(ctx, "Entity not nested correctly"); -} - void * xml_hash_new(struct mempool *pool, uns size) { @@ -102,551 +83,83 @@ xml_hash_new(struct mempool *pool, uns size) return tab + XML_HASH_HDR_SIZE; } -/*** Reading of document/external entities ***/ - -static void NONRET -xml_eof(struct xml_context *ctx) -{ - ctx->err_msg = "Unexpected EOF"; - ctx->err_code = XML_ERR_EOF; - xml_throw(ctx); -} - -static inline void -xml_add_char(u32 **bstop, uns c) -{ - *(*bstop)++ = c; - *(*bstop)++ = xml_char_cat(c); -} - -struct xml_source * -xml_push_source(struct xml_context *ctx, uns flags) -{ - xml_push(ctx); - struct xml_source *src = ctx->src; - if (src) - { - src->bptr = ctx->bptr; - src->bstop = ctx->bstop; - } - src = mp_alloc_zero(ctx->stack, sizeof(*src)); - src->next = ctx->src; - src->saved_depth = ctx->depth; - ctx->src = src; - ctx->flags = (ctx->flags & ~(XML_FLAG_SRC_EOF | XML_FLAG_SRC_EXPECTED_DECL | XML_FLAG_SRC_NEW_LINE | XML_FLAG_SRC_SURROUND | XML_FLAG_SRC_DOCUMENT)) | flags; - ctx->bstop = ctx->bptr = src->buf; - ctx->depth = 0; - if (flags & XML_FLAG_SRC_SURROUND) - xml_add_char(&ctx->bstop, 0x20); - return src; -} - -static void -xml_pop_source(struct xml_context *ctx) -{ - TRACE(ctx, "pop_source"); - if (unlikely(ctx->depth != 0)) - xml_fatal_nested(ctx); - struct xml_source *src = ctx->src; - ASSERT(src); - bclose(src->fb); - ctx->depth = src->saved_depth; - ctx->src = src = src->next; - if (src) - { - ctx->bptr = src->bptr; - ctx->bstop = src->bstop; - } - xml_pop(ctx); - if (unlikely(!src)) - xml_eof(ctx); -} - -static void xml_refill_utf8(struct 
xml_context *ctx); - -void -xml_push_entity(struct xml_context *ctx, struct xml_dtd_ent *ent) -{ - TRACE(ctx, "xml_push_entity"); - uns cat1 = ctx->src->refill_cat1; - uns cat2 = ctx->src->refill_cat2; - struct xml_source *src = xml_push_source(ctx, 0); - src->refill_cat1 = cat1; - src->refill_cat2 = cat2; - if (ent->flags & XML_DTD_ENT_EXTERNAL) - xml_fatal(ctx, "External entities not implemented"); // FIXME - else - { - fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, ent->len, 0); - src->refill = xml_refill_utf8; - } -} - -void -xml_set_source(struct xml_context *ctx, struct fastbuf *fb) -{ - TRACE(ctx, "xml_set_source"); - ASSERT(!ctx->src); - struct xml_source *src = xml_push_source(ctx, XML_FLAG_SRC_DOCUMENT | XML_FLAG_SRC_EXPECTED_DECL); - src->fb = fb; -} - -static uns -xml_error_restricted(struct xml_context *ctx, uns c) -{ - if (c == ~1U) - xml_error(ctx, "Corrupted encoding"); - else - xml_error(ctx, "Restricted char U+%04X", c); - return UNI_REPLACEMENT; -} - -void xml_parse_decl(struct xml_context *ctx); - -#define REFILL(ctx, func, params...) \ - struct xml_source *src = ctx->src; \ - struct fastbuf *fb = src->fb; \ - if (ctx->bptr == ctx->bstop) \ - ctx->bptr = ctx->bstop = src->buf; \ - uns f = ctx->flags, c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \ - u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \ - *last_0xd = (f & XML_FLAG_SRC_NEW_LINE) ? 
bstop : bend; \ - do \ - { \ - c = func(fb, ##params); \ - uns t = xml_char_cat(c); \ - if (t & t1) \ - /* Typical branch */ \ - *bstop++ = c, *bstop++ = t; \ - else if (t & t2) \ - { \ - /* New line */ \ - /* XML 1.0: 0xA | 0xD | 0xD 0xA */ \ - /* XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ \ - if (c == 0xd) \ - last_0xd = bstop + 2; \ - else if (c != 0x2028 && last_0xd == bstop) \ - { \ - last_0xd = bend; \ - continue; \ - } \ - xml_add_char(&bstop, 0xa), row++; \ - } \ - else if (c == '>') \ - { \ - /* Used only in XML/TextDecl to switch the encoding */ \ - *bstop++ = c, *bstop++ = t; \ - break; \ - } \ - else if (~c) \ - /* Restricted character */ \ - xml_add_char(&bstop, xml_error_restricted(ctx, c)); \ - else \ - { \ - /* EOF */ \ - if (f & XML_FLAG_SRC_SURROUND) \ - xml_add_char(&bstop, 0x20); \ - f |= XML_FLAG_SRC_EOF; \ - break; \ - } \ - } \ - while (bstop < bend); \ - ctx->flags = (last_0xd == bstop) ? f | XML_FLAG_SRC_NEW_LINE : f & ~XML_FLAG_SRC_NEW_LINE; \ - ctx->bstop = bstop; \ - src->row = row; - -static void -xml_refill_utf8(struct xml_context *ctx) -{ - REFILL(ctx, bget_utf8_repl, ~1U); -} - -static void -xml_refill_utf16_le(struct xml_context *ctx) -{ - REFILL(ctx, bget_utf16_le_repl, ~1U); -} - -static void -xml_refill_utf16_be(struct xml_context *ctx) -{ - REFILL(ctx, bget_utf16_be_repl, ~1U); -} - -#if 0 -static inline uns -xml_refill_libcharset_bget(struct fastbuf *fb, unsigned short int *in_to_x) -{ - // FIXME: slow - int c; - return (unlikely(c = bgetc(fb) < 0)) ? 
c : (int)conv_x_to_ucs(in_to_x[c]); -} - static void -xml_refill_libcharset(struct xml_context *ctx) +xml_chars_spout(struct fastbuf *fb) { - unsigned short int *in_to_x = ctx->src->refill_in_to_x; - REFILL(ctx, xml_refill_libcharset_bget, in_to_x); -} -#endif - -#undef REFILL - -void -xml_refill(struct xml_context *ctx) -{ - do + if (fb->bptr >= fb->bufend) { - if (ctx->flags & XML_FLAG_SRC_EOF) - xml_pop_source(ctx); - else if (ctx->flags & XML_FLAG_SRC_EXPECTED_DECL) - xml_parse_decl(ctx); + struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb); + struct mempool *pool = ctx->pool; + if (fb->bufend != fb->buffer) + { + uns len = fb->bufend - fb->buffer; + TRACE(ctx, "grow_chars"); + fb->buffer = mp_expand(pool); + fb->bufend = fb->buffer + mp_avail(pool); + fb->bstop = fb->buffer; + fb->bptr = fb->buffer + len; + } else { - ctx->src->refill(ctx); - TRACE(ctx, "refilled %u characters", (uns)((ctx->bstop - ctx->bptr) / 2)); + TRACE(ctx, "push_chars"); + struct xml_node *n = xml_push_dom(ctx); + n->type = XML_NODE_CHARS; + xml_start_chars(ctx); } } - while (ctx->bptr == ctx->bstop); -} - -uns -xml_row(struct xml_context *ctx) -{ - struct xml_source *src = ctx->src; - if (!src) - return 0; - uns row = src->row; - for (u32 *p = ctx->bstop; p != ctx->bptr; p -= 2) - if (p[-1] & src->refill_cat2) - row--; - return row + 1; } -/*** Basic parsing ***/ - -void NONRET -xml_fatal_expected(struct xml_context *ctx, uns c) -{ - xml_fatal(ctx, "Expected '%c'", c); -} - -void NONRET -xml_fatal_expected_white(struct xml_context *ctx) -{ - xml_fatal(ctx, "Expected a white space"); -} - -void NONRET -xml_fatal_expected_quot(struct xml_context *ctx) -{ - xml_fatal(ctx, "Expected a quotation mark"); -} - -void -xml_parse_eq(struct xml_context *ctx) +static void +xml_init_chars(struct xml_context *ctx) { - /* Eq ::= S? '=' S? 
*/ - xml_parse_white(ctx, 0); - xml_parse_char(ctx, '='); - xml_parse_white(ctx, 0); + struct fastbuf *fb = &ctx->chars; + fb->name = ""; + fb->spout = xml_chars_spout; + fb->can_overwrite_buffer = 1; + fb->bptr = fb->bstop = fb->buffer = fb->bufend = NULL; } -/* Names and nmtokens */ - -static char * -xml_parse_string(struct xml_context *ctx, struct mempool *pool, uns first_cat, uns next_cat, char *err) -{ - char *p = mp_start_noalign(pool, 1); - if (unlikely(!(xml_peek_cat(ctx) & first_cat))) - xml_fatal(ctx, "%s", err); - do - { - p = mp_spread(pool, p, 5); - p = utf8_32_put(p, xml_skip_char(ctx)); - } - while (xml_peek_cat(ctx) & next_cat); - *p++ = 0; - return mp_end(pool, p); -} +/*** Initialization ***/ static void -xml_skip_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err) +xml_do_init(struct xml_context *ctx) { - if (unlikely(!(xml_get_cat(ctx) & first_cat))) - xml_fatal(ctx, "%s", err); - while (xml_peek_cat(ctx) & next_cat) - xml_skip_char(ctx); -} - -char * -xml_parse_name(struct xml_context *ctx, struct mempool *pool) -{ - /* Name ::= NameStartChar (NameChar)* */ - return xml_parse_string(ctx, pool, - !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1, - !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1, - "Expected a name"); + ctx->flags = XML_REPORT_ALL; + xml_init_chars(ctx); + xml_attrs_table_init(ctx); } void -xml_skip_name(struct xml_context *ctx) -{ - xml_skip_string(ctx, - !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1, - !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1, - "Expected a name"); -} - -char * -xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool) -{ - /* Nmtoken ::= (NameChar)+ */ - uns cat = !(ctx->flags & XML_FLAG_VERSION_1_1) ? 
XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1; - return xml_parse_string(ctx, pool, cat, cat, "Expected a nmtoken"); -} - -/* Simple literals */ - -char * -xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool) +xml_init(struct xml_context *ctx) { - /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */ - char *p = mp_start_noalign(pool, 1); - uns q = xml_parse_quote(ctx), c; - while ((c = xml_get_char(ctx)) != q) - { - p = mp_spread(pool, p, 5); - p = utf8_32_put(p, c); - } - *p++ = 0; - return mp_end(pool, p); -} - -char * -xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool) -{ - /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */ - char *p = mp_start_noalign(pool, 1); - uns q = xml_parse_quote(ctx), c; - while ((c = xml_get_char(ctx)) != q) - { - if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID))) - xml_fatal(ctx, "Expected a pubid character"); - p = mp_spread(pool, p, 2); - *p++ = c; - } - *p++ = 0; - return mp_end(pool, p); -} - -static char * -xml_parse_encoding_name(struct xml_context *ctx) -{ - /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" */ - char *p = mp_start_noalign(ctx->pool, 1); - uns q = xml_parse_quote(ctx); - if (unlikely(!(xml_get_cat(ctx) & XML_CHAR_ENC_SNAME))) - xml_fatal(ctx, "Invalid character in the encoding name"); - while (1) - { - p = mp_spread(ctx->pool, p, 2); - *p++ = xml_last_char(ctx); - if (xml_get_char(ctx) == q) - break; - if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME))) - xml_fatal(ctx, "Invalid character in the encoding name"); - } - *p++ = 0; - return mp_end(ctx->pool, p); -} - -/* Document/external entity header */ - -static inline void -xml_init_cats(struct xml_context *ctx, uns mask) -{ - if (!(ctx->flags & XML_FLAG_VERSION_1_1)) - { - ctx->src->refill_cat1 = XML_CHAR_VALID_1_0 & ~XML_CHAR_NEW_LINE_1_0 & ~mask; - ctx->src->refill_cat2 = XML_CHAR_NEW_LINE_1_0; - } - else - { - ctx->src->refill_cat1 = 
XML_CHAR_UNRESTRICTED_1_1 & ~XML_CHAR_NEW_LINE_1_1 & ~mask; - ctx->src->refill_cat2 = XML_CHAR_NEW_LINE_1_1; - } + bzero(ctx, sizeof(*ctx)); + ctx->pool = mp_new(65536); + ctx->stack = mp_new(65536); + xml_do_init(ctx); + TRACE(ctx, "init"); } -static void -xml_init_charconv(struct xml_context *ctx, int cs) +void +xml_cleanup(struct xml_context *ctx) { - // FIXME: hack - struct xml_source *src = ctx->src; - TRACE(ctx, "wrapping charset %s", charset_name(cs)); -#if 0 - struct conv_context conv; - conv_set_charset(&conv, cs, CONV_CHARSET_UTF8); - src->refill = xml_refill_libcharset; - src->refill_in_to_x = conv.in_to_x; -#else - src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8); - // FIXME: memory leak -#endif + TRACE(ctx, "cleanup"); + xml_attrs_table_cleanup(ctx); + xml_dtd_cleanup(ctx); + xml_sources_cleanup(ctx); + mp_delete(ctx->pool); + mp_delete(ctx->stack); } void -xml_parse_decl(struct xml_context *ctx) -{ - TRACE(ctx, "xml_parse_decl"); - struct xml_source *src = ctx->src; - ctx->flags &= ~XML_FLAG_SRC_EXPECTED_DECL; - - /* Setup valid Unicode ranges and force the reader to abort refill() after each '>', where we can switch encoding or XML version */ - xml_init_cats(ctx, XML_CHAR_GT); - - /* Initialize the supplied charset (if any) or try to guess it */ - char *expected_encoding = src->expected_encoding ? 
: src->fb_encoding; - src->refill = xml_refill_utf8; - int bom = bpeekc(src->fb); - if (bom < 0) - ctx->flags |= XML_FLAG_SRC_EOF; - if (!src->fb_encoding) - { - if (bom == 0xfe) - src->refill = xml_refill_utf16_be; - else if (bom == 0xff) - src->refill = xml_refill_utf16_le; - } - else - { - int cs = find_charset_by_name(src->fb_encoding); - if (cs == CONV_CHARSET_UTF8) - {} - else if (cs >= 0) - { - xml_init_charconv(ctx, cs); - bom = 0; - } - else if (strcasecmp(src->fb_encoding, "UTF-16")) - { - src->refill = xml_refill_utf16_be; - if (bom == 0xff) - src->refill = xml_refill_utf16_le; - if (!src->expected_encoding) - expected_encoding = (bom == 0xff) ? "UTF-16LE" : "UTF-16BE"; - } - else if (strcasecmp(src->fb_encoding, "UTF-16BE")) - src->refill = xml_refill_utf16_be; - else if (strcasecmp(src->fb_encoding, "UTF-16LE")) - src->refill = xml_refill_utf16_le; - else - { - xml_error(ctx, "Unknown encoding '%s'", src->fb_encoding); - expected_encoding = NULL; - } - } - uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be; - if (bom > 0 && xml_peek_char(ctx) == 0xfeff) - xml_skip_char(ctx); - else if (utf16) - xml_error(ctx, "Missing or corrupted BOM"); - - /* Look ahead for presence of XMLDecl or optional TextDecl */ - if (!(ctx->flags & XML_FLAG_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf)) - xml_refill(ctx); - uns doc = ctx->flags & XML_FLAG_SRC_DOCUMENT; - u32 *bptr = ctx->bptr; - uns have_decl = (12 <= ctx->bstop - ctx->bptr && (bptr[11] & XML_CHAR_WHITE) && - bptr[0] == '<' && bptr[2] == '?' 
&& (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L'); - if (!have_decl) - { - if (doc) - xml_fatal(ctx, "Missing or corrupted XML header"); - else if (expected_encoding && strcasecmp(src->expected_encoding, "UTF-8") && !utf16) - xml_error(ctx, "Missing or corrupted entity header"); - goto exit; - } - ctx->bptr = bptr + 12; - xml_parse_white(ctx, 0); - - /* Parse version string (mandatory in XMLDecl, optional in TextDecl) */ - if (xml_peek_char(ctx) == 'v') - { - xml_parse_seq(ctx, "version"); - xml_parse_eq(ctx); - char *version = xml_parse_pubid_literal(ctx, ctx->pool); - TRACE(ctx, "version=%s", version); - uns v = 0; - if (!strcmp(version, "1.1")) - v = XML_FLAG_VERSION_1_1; - else if (strcmp(version, "1.0")) - { - xml_error(ctx, "Unknown XML version string '%s'", version); - version = "1.0"; - } - if (doc) - { - ctx->version_str = version; - ctx->flags |= v; - } - else if (v > (ctx->flags & XML_FLAG_VERSION_1_1)) - xml_error(ctx, "XML 1.1 external entity included from XML 1.0 document"); - if (!xml_parse_white(ctx, !doc)) - goto end; - } - else if (doc) - { - xml_error(ctx, "Expected XML version"); - ctx->version_str = "1.0"; - } - - /* Parse encoding string (optional in XMLDecl, mandatory in TextDecl) */ - if (xml_peek_char(ctx) == 'e') - { - xml_parse_seq(ctx, "encoding"); - xml_parse_eq(ctx); - src->decl_encoding = xml_parse_encoding_name(ctx); - TRACE(ctx, "encoding=%s", src->decl_encoding); - if (!xml_parse_white(ctx, 0)) - goto end; - } - else if (!doc) - xml_error(ctx, "Expected XML encoding"); - - /* Parse whether the document is standalone (optional in XMLDecl) */ - if (doc && xml_peek_char(ctx) == 's') - { - xml_parse_seq(ctx, "standalone"); - xml_parse_eq(ctx); - uns c = xml_parse_quote(ctx); - if (ctx->standalone = (xml_peek_char(ctx) == 'y')) - xml_parse_seq(ctx, "yes"); - else - xml_parse_seq(ctx, "no"); - xml_parse_char(ctx, c); - TRACE(ctx, "standalone=%d", ctx->standalone); - xml_parse_white(ctx, 0); - } -end: - 
xml_parse_seq(ctx, "?>"); - - /* Switch to the final encoding */ - if (src->decl_encoding) - { - int cs = find_charset_by_name(src->decl_encoding); - if (cs < 0 && !expected_encoding) - xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding); - else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8) - xml_init_charconv(ctx, cs); - else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 || - !(!strcasecmp(src->decl_encoding, "UTF-16") || - (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) || - (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE"))))) - xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding); - } - -exit: - /* Update valid Unicode ranges */ - xml_init_cats(ctx, 0); +xml_reset(struct xml_context *ctx) +{ + TRACE(ctx, "reset"); + struct mempool *pool = ctx->pool, *stack = ctx->stack; + xml_attrs_table_cleanup(ctx); + xml_dtd_cleanup(ctx); + xml_sources_cleanup(ctx); + mp_flush(pool); + mp_flush(stack); + bzero(ctx, sizeof(*ctx)); + xml_do_init(ctx); } diff --git a/sherlock/xml/common.h b/sherlock/xml/common.h index ed18e8af..cecd6119 100644 --- a/sherlock/xml/common.h +++ b/sherlock/xml/common.h @@ -26,50 +26,10 @@ void NONRET xml_throw(struct xml_context *ctx); void xml_warn(struct xml_context *ctx, const char *format, ...); void xml_error(struct xml_context *ctx, const char *format, ...); -void xml_fatal(struct xml_context *ctx, const char *format, ...); - -/*** Charecter categorization ***/ - -#include "obj/sherlock/xml/unicat.h" - -static inline uns -xml_char_cat(uns c) -{ - if (c < 0x10000) - return 1U << xml_char_tab1[(c & 0xff) + xml_char_tab2[c >> 8]]; - else if (likely(c < 0x110000)) - return 1U << xml_char_tab3[c >> 16]; - else - return 1; -} - -static inline uns -xml_ascii_cat(uns c) -{ - return xml_char_tab1[c]; -} +void NONRET xml_fatal(struct xml_context 
*ctx, const char *format, ...); /*** Memory management ***/ -void NONRET xml_fatal_nested(struct xml_context *ctx); - -static inline void -xml_inc(struct xml_context *ctx) -{ - /* Called after the first character of a block */ - TRACE(ctx, "inc"); - ctx->depth++; -} - -static inline void -xml_dec(struct xml_context *ctx) -{ - /* Called after the last character of a block */ - TRACE(ctx, "dec"); - if (unlikely(!ctx->depth--)) - xml_fatal_nested(ctx); -} - struct xml_stack { struct xml_stack *next; struct mempool_state state; @@ -133,14 +93,14 @@ xml_push_dom(struct xml_context *ctx) } static inline void -xml_pop_dom(struct xml_context *ctx) +xml_pop_dom(struct xml_context *ctx, uns free) { /* Leave DOM subtree */ TRACE(ctx, "pop_dom"); ASSERT(ctx->node); struct xml_node *p = ctx->node->parent; struct xml_dom_stack *s = (void *)ctx->stack_list; - if (ctx->flags & XML_DOM_FREE) + if (free) { /* See xml_pop_element() for cleanup of attribute hash table */ if (p) @@ -183,6 +143,64 @@ xml_end_chars(struct xml_context *ctx, uns *len) /*** Reading of document/external entities ***/ +#define XML_BUF_SIZE 32 /* At least 8 -- hardcoded */ + +struct xml_source { + struct xml_source *next; /* Link list of pending fastbufs (xml_context.sources) */ + struct fastbuf *fb; /* Source fastbuf */ + struct fastbuf *wrapped_fb; /* Original wrapped fastbuf (needed for cleanup) */ + struct fastbuf wrap_fb; /* Fbmem wrapper */ + u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */ + u32 *bptr, *bstop; /* Current state of the buffer */ + uns row; /* File position */ + char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */ + char *fb_encoding; /* Encoding of the source fastbuf */ + char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */ + uns refill_cat1; /* Character categories, which should be directly passed to the buffer */ + uns refill_cat2; /* Character categories, which should be 
processed as newlines (possibly in some built-in sequences) */ + void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */ + unsigned short *refill_in_to_x; /* Libcharset input table */ + uns saved_depth; /* Saved ctx->depth */ +}; + +void NONRET xml_fatal_nested(struct xml_context *ctx); + +static inline void +xml_inc(struct xml_context *ctx) +{ + /* Called after the first character of a block */ + TRACE(ctx, "inc"); + ctx->depth++; +} + +static inline void +xml_dec(struct xml_context *ctx) +{ + /* Called after the last character of a block */ + TRACE(ctx, "dec"); + if (unlikely(!ctx->depth--)) + xml_fatal_nested(ctx); +} + +#include "obj/sherlock/xml/unicat.h" + +static inline uns +xml_char_cat(uns c) +{ + if (c < 0x10000) + return 1U << xml_char_tab1[(c & 0xff) + xml_char_tab2[c >> 8]]; + else if (likely(c < 0x110000)) + return 1U << xml_char_tab3[c >> 16]; + else + return 1; +} + +static inline uns +xml_ascii_cat(uns c) +{ + return xml_char_tab1[c]; +} + struct xml_source *xml_push_source(struct xml_context *ctx, uns flags); void xml_push_entity(struct xml_context *ctx, struct xml_dtd_ent *ent); @@ -246,7 +264,9 @@ xml_unget_char(struct xml_context *ctx) return *(ctx->bptr -= 2); } -/*** Basic parsing ***/ +void xml_sources_cleanup(struct xml_context *ctx); + +/*** Parsing ***/ void NONRET xml_fatal_expected(struct xml_context *ctx, uns c); void NONRET xml_fatal_expected_white(struct xml_context *ctx); @@ -296,32 +316,33 @@ xml_parse_quote(struct xml_context *ctx) return c; } -/* Names and nmtokens */ - char *xml_parse_name(struct xml_context *ctx, struct mempool *pool); void xml_skip_name(struct xml_context *ctx); char *xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool); -/* Simple literals */ - char *xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool); char *xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool); -/* Parsing */ - uns xml_parse_char_ref(struct xml_context 
*ctx); void xml_parse_ref(struct xml_context *ctx); void xml_parse_pe_ref(struct xml_context *ctx); + char *xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr); + void xml_parse_notation_decl(struct xml_context *ctx); void xml_parse_entity_decl(struct xml_context *ctx); void xml_parse_element_decl(struct xml_context *ctx); void xml_parse_attr_list_decl(struct xml_context *ctx); + void xml_push_comment(struct xml_context *ctx); void xml_pop_comment(struct xml_context *ctx); void xml_skip_comment(struct xml_context *ctx); + void xml_push_pi(struct xml_context *ctx); void xml_pop_pi(struct xml_context *ctx); void xml_skip_pi(struct xml_context *ctx); +void xml_attrs_table_init(struct xml_context *ctx); +void xml_attrs_table_cleanup(struct xml_context *ctx); + #endif diff --git a/sherlock/xml/dtd.c b/sherlock/xml/dtd.c index 07f030a4..aa99f3c1 100644 --- a/sherlock/xml/dtd.c +++ b/sherlock/xml/dtd.c @@ -7,7 +7,7 @@ * of the GNU Lesser General Public License. */ -#define LOCAL_DEBUG +#undef LOCAL_DEBUG #include "sherlock/sherlock.h" #include "sherlock/xml/xml.h" @@ -23,7 +23,6 @@ #define HASH_KEY_STRING name #define HASH_ZERO_FILL #define HASH_TABLE_DYNAMIC -#define HASH_WANT_FIND #define HASH_WANT_LOOKUP #define HASH_GIVE_ALLOC #define HASH_TABLE_ALLOC @@ -45,16 +44,16 @@ XML_HASH_GIVE_ALLOC #include "lib/hashtable.h" static struct xml_dtd_ent * -xml_dtd_declare_trivial_gent(struct xml_context *ctx, char *name, char *text) +xml_dtd_declare_trivial_ent(struct xml_context *ctx, char *name, char *text) { struct xml_dtd *dtd = ctx->dtd; - struct xml_dtd_ent *ent = xml_dtd_ents_lookup(dtd->tab_gents, name); + struct xml_dtd_ent *ent = xml_dtd_ents_lookup(dtd->tab_ents, name); if (ent->flags & XML_DTD_ENT_DECLARED) { xml_warn(ctx, "Entity &%s; already declared", name); return NULL; } - slist_add_tail(&dtd->gents, &ent->n); + slist_add_tail(&dtd->ents, &ent->n); ent->flags = XML_DTD_ENT_DECLARED | XML_DTD_ENT_TRIVIAL; ent->text = text; ent->len = 
strlen(text); @@ -62,22 +61,22 @@ xml_dtd_declare_trivial_gent(struct xml_context *ctx, char *name, char *text) } static void -xml_dtd_declare_default_gents(struct xml_context *ctx) +xml_dtd_declare_default_ents(struct xml_context *ctx) { - xml_dtd_declare_trivial_gent(ctx, "lt", "<"); - xml_dtd_declare_trivial_gent(ctx, "gt", ">"); - xml_dtd_declare_trivial_gent(ctx, "amp", "&"); - xml_dtd_declare_trivial_gent(ctx, "apos", "'"); - xml_dtd_declare_trivial_gent(ctx, "quot", "\""); + xml_dtd_declare_trivial_ent(ctx, "lt", "<"); + xml_dtd_declare_trivial_ent(ctx, "gt", ">"); + xml_dtd_declare_trivial_ent(ctx, "amp", "&"); + xml_dtd_declare_trivial_ent(ctx, "apos", "'"); + xml_dtd_declare_trivial_ent(ctx, "quot", "\""); } struct xml_dtd_ent * -xml_dtd_find_gent(struct xml_context *ctx, char *name) +xml_dtd_find_ent(struct xml_context *ctx, char *name) { struct xml_dtd *dtd = ctx->dtd; if (dtd) { - struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_gents, name); + struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_ents, name); return !ent ? NULL : (ent->flags & XML_DTD_ENT_DECLARED) ? ent : NULL; } else @@ -127,12 +126,19 @@ xml_dtd_find_pent(struct xml_context *ctx, char *name) #define HASH_KEY_STRING name #define HASH_TABLE_DYNAMIC #define HASH_ZERO_FILL +#define HASH_WANT_FIND #define HASH_WANT_LOOKUP #define HASH_GIVE_ALLOC #define HASH_TABLE_ALLOC XML_HASH_GIVE_ALLOC #include "lib/hashtable.h" +struct xml_dtd_elem * +xml_dtd_find_elem(struct xml_context *ctx, char *name) +{ + return ctx->dtd ? xml_dtd_elems_find(ctx->dtd->tab_elems, name) : NULL; +} + /* Element sons */ struct xml_dtd_enodes_table; @@ -211,6 +217,12 @@ xml_dtd_attrs_init_key(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_at XML_HASH_GIVE_ALLOC #include "lib/hashtable.h" +struct xml_dtd_attr * +xml_dtd_find_attr(struct xml_context *ctx, struct xml_dtd_elem *elem, char *name) +{ + return ctx->dtd ? 
xml_dtd_attrs_find(ctx->dtd->tab_attrs, elem, name) : NULL; +} + /* Enumerated attribute values */ struct xml_dtd_evals_table; @@ -297,7 +309,7 @@ xml_dtd_init(struct xml_context *ctx) struct mempool *pool = mp_new(4096); struct xml_dtd *dtd = ctx->dtd = mp_alloc_zero(pool, sizeof(*ctx->dtd)); dtd->pool = pool; - xml_dtd_ents_init(dtd->tab_gents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table))); + xml_dtd_ents_init(dtd->tab_ents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table))); xml_dtd_ents_init(dtd->tab_pents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table))); xml_dtd_notns_init(dtd->tab_notns = xml_hash_new(pool, sizeof(struct xml_dtd_notns_table))); xml_dtd_elems_init(dtd->tab_elems = xml_hash_new(pool, sizeof(struct xml_dtd_elems_table))); @@ -305,7 +317,7 @@ xml_dtd_init(struct xml_context *ctx) xml_dtd_attrs_init(dtd->tab_attrs = xml_hash_new(pool, sizeof(struct xml_dtd_attrs_table))); xml_dtd_evals_init(dtd->tab_evals = xml_hash_new(pool, sizeof(struct xml_dtd_evals_table))); xml_dtd_enotns_init(dtd->tab_enotns = xml_hash_new(pool, sizeof(struct xml_dtd_enotns_table))); - xml_dtd_declare_default_gents(ctx); + xml_dtd_declare_default_ents(ctx); } void @@ -457,8 +469,8 @@ xml_parse_entity_decl(struct xml_context *ctx) else xml_unget_char(ctx); - struct xml_dtd_ent *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_gents, xml_parse_name(ctx, dtd->pool)); - slist *list = flags ? &dtd->pents : &dtd->gents; + struct xml_dtd_ent *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_ents, xml_parse_name(ctx, dtd->pool)); + slist *list = flags ? 
&dtd->pents : &dtd->ents; xml_parse_dtd_white(ctx, 1); if (ent->flags & XML_DTD_ENT_DECLARED) { diff --git a/sherlock/xml/dtd.h b/sherlock/xml/dtd.h index 9b4c6d98..549696e1 100644 --- a/sherlock/xml/dtd.h +++ b/sherlock/xml/dtd.h @@ -14,11 +14,11 @@ struct xml_dtd { struct mempool *pool; /* Memory pool where to allocate DTD */ - slist gents; /* Link list of general entities */ + slist ents; /* Link list of general entities */ slist pents; /* Link list of parapeter entities */ slist notns; /* Link list of notations */ slist elems; /* Link list of elements */ - void *tab_gents; /* Hash table of general entities */ + void *tab_ents; /* Hash table of general entities */ void *tab_pents; /* Hash table of parameter entities */ void *tab_notns; /* Hash table of notations */ void *tab_elems; /* Hash table of elements */ @@ -28,6 +28,11 @@ struct xml_dtd { void *tab_enotns; /* hash table of enumerated attribute notations */ }; +struct xml_ext_id { + char *system_id; + char *public_id; +}; + /* Notations */ enum xml_dtd_notn_flags { @@ -62,7 +67,7 @@ struct xml_dtd_ent { struct xml_dtd_notn *notn; /* Notation (XML_DTD_ENT_UNPARSED only) */ }; -struct xml_dtd_ent *xml_dtd_find_gent(struct xml_context *ctx, char *name); +struct xml_dtd_ent *xml_dtd_find_ent(struct xml_context *ctx, char *name); /* Elements */ @@ -107,6 +112,8 @@ enum xml_dtd_elem_node_occur { XML_DTD_ELEM_OCCUR_PLUS, }; +struct xml_dtd_elem *xml_dtd_find_elem(struct xml_context *ctx, char *name); + /* Attributes */ enum xml_dtd_attribute_default { @@ -151,4 +158,6 @@ void xml_dtd_init(struct xml_context *ctx); void xml_dtd_cleanup(struct xml_context *ctx); void xml_dtd_finish(struct xml_context *ctx); +struct xml_dtd_attr *xml_dtd_find_attr(struct xml_context *ctx, struct xml_dtd_elem *elem, char *name); + #endif diff --git a/sherlock/xml/parse.c b/sherlock/xml/parse.c index 6f2e7e00..ea8feab1 100644 --- a/sherlock/xml/parse.c +++ b/sherlock/xml/parse.c @@ -7,7 +7,7 @@ * of the GNU Lesser General Public 
License. */ -#define LOCAL_DEBUG +#undef LOCAL_DEBUG #include "sherlock/sherlock.h" #include "sherlock/xml/xml.h" @@ -21,6 +21,126 @@ #include +/*** Basic parsing ***/ + +void NONRET +xml_fatal_expected(struct xml_context *ctx, uns c) +{ + if (c >= 32 && c < 128) + xml_fatal(ctx, "Expected '%c'", c); + else + xml_fatal(ctx, "Expected U+%04x", c); +} + +void NONRET +xml_fatal_expected_white(struct xml_context *ctx) +{ + xml_fatal(ctx, "Expected a white space"); +} + +void NONRET +xml_fatal_expected_quot(struct xml_context *ctx) +{ + xml_fatal(ctx, "Expected a quotation mark"); +} + +void +xml_parse_eq(struct xml_context *ctx) +{ + /* Eq ::= S? '=' S? */ + xml_parse_white(ctx, 0); + xml_parse_char(ctx, '='); + xml_parse_white(ctx, 0); +} + +/*** Names and nmtokens ***/ + +static char * +xml_parse_string(struct xml_context *ctx, struct mempool *pool, uns first_cat, uns next_cat, char *err) +{ + char *p = mp_start_noalign(pool, 1); + if (unlikely(!(xml_peek_cat(ctx) & first_cat))) + xml_fatal(ctx, "%s", err); + do + { + p = mp_spread(pool, p, 5); + p = utf8_32_put(p, xml_skip_char(ctx)); + } + while (xml_peek_cat(ctx) & next_cat); + *p++ = 0; + return mp_end(pool, p); +} + +static void +xml_skip_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err) +{ + if (unlikely(!(xml_get_cat(ctx) & first_cat))) + xml_fatal(ctx, "%s", err); + while (xml_peek_cat(ctx) & next_cat) + xml_skip_char(ctx); +} + +char * +xml_parse_name(struct xml_context *ctx, struct mempool *pool) +{ + /* Name ::= NameStartChar (NameChar)* */ + return xml_parse_string(ctx, pool, + !(ctx->flags & XML_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1, + !(ctx->flags & XML_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1, + "Expected a name"); +} + +void +xml_skip_name(struct xml_context *ctx) +{ + xml_skip_string(ctx, + !(ctx->flags & XML_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1, + !(ctx->flags & XML_VERSION_1_1) ? 
XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1, + "Expected a name"); +} + +char * +xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool) +{ + /* Nmtoken ::= (NameChar)+ */ + uns cat = !(ctx->flags & XML_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1; + return xml_parse_string(ctx, pool, cat, cat, "Expected a nmtoken"); +} + +/*** Simple literals ***/ + +char * +xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool) +{ + /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */ + char *p = mp_start_noalign(pool, 1); + uns q = xml_parse_quote(ctx), c; + while ((c = xml_get_char(ctx)) != q) + { + p = mp_spread(pool, p, 5); + p = utf8_32_put(p, c); + } + *p++ = 0; + return mp_end(pool, p); +} + +char * +xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool) +{ + /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */ + char *p = mp_start_noalign(pool, 1); + uns q = xml_parse_quote(ctx), c; + while ((c = xml_get_char(ctx)) != q) + { + if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID))) + xml_fatal(ctx, "Expected a pubid character"); + p = mp_spread(pool, p, 2); + *p++ = c; + } + *p++ = 0; + return mp_end(pool, p); +} + /*** Comments ***/ void @@ -47,14 +167,14 @@ xml_push_comment(struct xml_context *ctx) *p = 0; n->len = p - (char *)mp_ptr(ctx->pool); n->text = mp_end(ctx->pool, p + 1); - if (ctx->h_comment) + if ((ctx->flags & XML_REPORT_COMMENTS) && ctx->h_comment) ctx->h_comment(ctx); } void xml_pop_comment(struct xml_context *ctx) { - xml_pop_dom(ctx); + xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_COMMENTS)); xml_dec(ctx); TRACE(ctx, "pop_comment"); } @@ -105,14 +225,14 @@ xml_push_pi(struct xml_context *ctx) *p = 0; n->len = p - (char *)mp_ptr(ctx->pool); n->text = mp_end(ctx->pool, p + 1); - if (ctx->h_pi) + if ((ctx->flags & XML_REPORT_PIS) && ctx->h_pi) ctx->h_pi(ctx); } void xml_pop_pi(struct xml_context *ctx) { - xml_pop_dom(ctx); + xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_PIS)); xml_dec(ctx); TRACE(ctx, 
"pop_pi"); } @@ -121,7 +241,7 @@ void xml_skip_pi(struct xml_context *ctx) { TRACE(ctx, "skip_pi"); - if (ctx->flags & XML_FLAG_VALIDATING) + if (ctx->flags & XML_VALIDATING) { struct mempool_state state; mp_save(ctx->stack, &state); @@ -145,42 +265,6 @@ xml_skip_pi(struct xml_context *ctx) /*** Character data ***/ -static void -xml_chars_spout(struct fastbuf *fb) -{ - if (fb->bptr >= fb->bufend) - { - struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb); - struct mempool *pool = ctx->pool; - if (fb->bufend != fb->buffer) - { - uns len = fb->bufend - fb->buffer; - TRACE(ctx, "grow_chars"); - fb->buffer = mp_expand(pool); - fb->bufend = fb->buffer + mp_avail(pool); - fb->bstop = fb->buffer; - fb->bptr = fb->buffer + len; - } - else - { - TRACE(ctx, "push_chars"); - struct xml_node *n = xml_push_dom(ctx); - n->type = XML_NODE_CDATA; - xml_start_chars(ctx); - } - } -} - -static void -xml_init_chars(struct xml_context *ctx) -{ - struct fastbuf *fb = &ctx->chars; - fb->name = ""; - fb->spout = xml_chars_spout; - fb->can_overwrite_buffer = 1; - fb->bptr = fb->bstop = fb->buffer = fb->bufend = NULL; -} - static inline uns xml_flush_chars(struct xml_context *ctx) { @@ -191,7 +275,7 @@ xml_flush_chars(struct xml_context *ctx) struct xml_node *n = ctx->node; n->text = xml_end_chars(ctx, &n->len); n->len = fb->bufend - fb->buffer; - if (ctx->h_chars) + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_chars) ctx->h_chars(ctx); return 1; } @@ -199,7 +283,7 @@ xml_flush_chars(struct xml_context *ctx) static inline void xml_pop_chars(struct xml_context *ctx) { - xml_pop_dom(ctx); + xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS)); TRACE(ctx, "pop_chars"); } @@ -229,7 +313,7 @@ xml_push_cdata(struct xml_context *ctx) * Already parsed: 'type = XML_NODE_CDATA; + n->type = XML_NODE_CHARS; char *p = mp_start_noalign(ctx->pool, 7); while (1) { @@ -248,14 +332,14 @@ xml_push_cdata(struct xml_context *ctx) *p = 0; n->len = p - (char *)mp_ptr(ctx->pool); n->text = 
mp_end(ctx->pool, p + 1); - if (ctx->h_cdata) + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_cdata) ctx->h_cdata(ctx); } static void xml_pop_cdata(struct xml_context *ctx) { - xml_pop_dom(ctx); + xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS)); xml_dec(ctx); TRACE(ctx, "pop_cdata"); } @@ -327,7 +411,7 @@ xml_parse_char_ref(struct xml_context *ctx) while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT)); } uns cat = xml_char_cat(v); - if (!(cat & XML_CHAR_UNRESTRICTED_1_1) && ((ctx->flags & XML_FLAG_VERSION_1_1) || !(cat & XML_CHAR_VALID_1_0))) + if (!(cat & XML_CHAR_UNRESTRICTED_1_1) && ((ctx->flags & XML_VERSION_1_1) || !(cat & XML_CHAR_VALID_1_0))) { xml_error(ctx, "Character reference out of range"); goto recover; @@ -366,7 +450,7 @@ xml_parse_ref(struct xml_context *ctx) mp_save(ctx->stack, &state); char *name = xml_parse_name(ctx, ctx->stack); xml_parse_char(ctx, ';'); - struct xml_dtd_ent *ent = xml_dtd_find_gent(ctx, name); + struct xml_dtd_ent *ent = xml_dtd_find_ent(ctx, name); if (!ent) { xml_error(ctx, "Unknown entity &%s;", name); @@ -490,6 +574,24 @@ xml_parse_attr(struct xml_context *ctx) a->val = v; } +struct xml_attr * +xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name) +{ + return xml_attrs_find(ctx->tab_attrs, node, name); +} + +void +xml_attrs_table_init(struct xml_context *ctx) +{ + xml_attrs_init(ctx->tab_attrs = xml_hash_new(ctx->pool, sizeof(struct xml_attrs_table))); +} + +void +xml_attrs_table_cleanup(struct xml_context *ctx) +{ + xml_attrs_cleanup(ctx->tab_attrs); +} + /*** Elements ***/ static void @@ -508,8 +610,8 @@ xml_push_element(struct xml_context *ctx) if (!e->parent) { ctx->root = e; - if (ctx->document_type && strcmp(e->name, ctx->document_type)) - xml_error(ctx, "The root element %s does not match the document type %s", e->name, ctx->document_type); + if (ctx->doctype && strcmp(e->name, ctx->doctype)) + xml_error(ctx, "The root element %s does not match the document type %s", e->name, ctx->doctype); 
} while (1) { @@ -518,7 +620,7 @@ xml_push_element(struct xml_context *ctx) if (c == '/') { xml_parse_char(ctx, '>'); - ctx->flags |= XML_FLAG_EMPTY_ELEM; + ctx->flags |= XML_EMPTY_ELEM_TAG; break; } else if (c == '>') @@ -528,18 +630,19 @@ xml_push_element(struct xml_context *ctx) xml_unget_char(ctx); xml_parse_attr(ctx); } - if (ctx->h_element_start) - ctx->h_element_start(ctx); + if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_stag) + ctx->h_stag(ctx); } static void xml_pop_element(struct xml_context *ctx) { TRACE(ctx, "pop_element"); - if (ctx->h_element_end) - ctx->h_element_end(ctx); + if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_etag) + ctx->h_etag(ctx); struct xml_node *e = ctx->node; - if (ctx->flags & XML_DOM_FREE) + uns free = !(ctx->flags & XML_ALLOC_TAGS); + if (free) { if (!e->parent) ctx->root = NULL; @@ -558,7 +661,7 @@ xml_pop_element(struct xml_context *ctx) clist_remove(&n->n); } } - xml_pop_dom(ctx); + xml_pop_dom(ctx, free); xml_dec(ctx); } @@ -596,12 +699,12 @@ xml_parse_doctype_decl(struct xml_context *ctx) /* doctypedecl ::= '' * Already parsed: '' */ - if (ctx->document_type) + if (ctx->doctype) xml_fatal(ctx, "Multiple document types not allowed"); xml_parse_seq(ctx, "DOCTYPE"); xml_parse_white(ctx, 1); - ctx->document_type = xml_parse_name(ctx, ctx->pool); - TRACE(ctx, "doctyype=%s", ctx->document_type); + ctx->doctype = xml_parse_name(ctx, ctx->pool); + TRACE(ctx, "doctype=%s", ctx->doctype); uns c; if (xml_parse_white(ctx, 0) && ((c = xml_peek_char(ctx)) == 'S' || c == 'P')) { @@ -609,21 +712,21 @@ xml_parse_doctype_decl(struct xml_context *ctx) { xml_parse_seq(ctx, "SYSTEM"); xml_parse_white(ctx, 1); - ctx->eid.system_id = xml_parse_system_literal(ctx, ctx->pool); + ctx->system_id = xml_parse_system_literal(ctx, ctx->pool); } else { xml_parse_seq(ctx, "PUBLIC"); xml_parse_white(ctx, 1); - ctx->eid.public_id = xml_parse_pubid_literal(ctx, ctx->pool); + ctx->public_id = xml_parse_pubid_literal(ctx, ctx->pool); xml_parse_white(ctx, 1); - 
ctx->eid.system_id = xml_parse_system_literal(ctx, ctx->pool); + ctx->system_id = xml_parse_system_literal(ctx, ctx->pool); } xml_parse_white(ctx, 0); - ctx->flags |= XML_FLAG_HAS_EXTERNAL_SUBSET; + ctx->flags |= XML_HAS_EXTERNAL_SUBSET; } if (xml_peek_char(ctx) == '[') - ctx->flags |= XML_FLAG_HAS_INTERNAL_SUBSET; + ctx->flags |= XML_HAS_INTERNAL_SUBSET; if (ctx->h_doctype_decl) ctx->h_doctype_decl(ctx); } @@ -700,35 +803,16 @@ invalid_markup: xml_fatal(ctx, "Invalid markup in the internal subset"); } +/*** The State Machine ***/ -/*----------------------------------------------*/ - -void -xml_init(struct xml_context *ctx) -{ - bzero(ctx, sizeof(*ctx)); - ctx->pool = mp_new(65536); - ctx->stack = mp_new(65536); - ctx->flags = XML_DOM_FREE; - xml_init_chars(ctx); - xml_dtd_init(ctx); - xml_attrs_init(ctx->tab_attrs = xml_hash_new(ctx->pool, sizeof(struct xml_attrs_table))); -} - -void -xml_cleanup(struct xml_context *ctx) -{ - xml_attrs_cleanup(ctx->tab_attrs); - xml_dtd_cleanup(ctx); - mp_delete(ctx->pool); - mp_delete(ctx->stack); -} - -int +uns xml_next(struct xml_context *ctx) { /* A nasty state machine */ +#define PULL(x) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##x; case XML_STATE_##x: ; } while (0) +#define PULL_STATE(x, s) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##s, XML_STATE_##x; case XML_STATE_##s: ; } while (0) + TRACE(ctx, "xml_next (state=%u)", ctx->state); jmp_buf throw_buf; ctx->throw_buf = &throw_buf; @@ -737,16 +821,12 @@ xml_next(struct xml_context *ctx) error: if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal) ctx->h_fatal(ctx); - ctx->state = XML_STATE_FATAL; TRACE(ctx, "raised fatal error"); - return -1; + return ctx->state = XML_STATE_EOF; } uns c; switch (ctx->state) { - case XML_STATE_FATAL: - return -1; - case XML_STATE_START: TRACE(ctx, "entering prolog"); if (ctx->h_document_start) @@ -755,9 +835,7 @@ error: xml_refill(ctx); if (ctx->h_xml_decl) ctx->h_xml_decl(ctx); - if (ctx->want & 
XML_WANT_DECL) - return ctx->state = XML_STATE_DECL; - case XML_STATE_DECL: + PULL(XML_DECL); /* Misc* (doctypedecl Misc*)? */ while (1) @@ -766,14 +844,12 @@ error: xml_parse_char(ctx, '<'); if ((c = xml_get_char(ctx)) == '?') /* Processing intruction */ - if (!(ctx->want & XML_WANT_PI)) + if (!(ctx->flags & XML_REPORT_PIS)) xml_skip_pi(ctx); else { xml_push_pi(ctx); - ctx->state = XML_STATE_PROLOG_PI; - return XML_STATE_PI; - case XML_STATE_PROLOG_PI: + PULL_STATE(PI, PROLOG_PI); xml_pop_pi(ctx); } else if (c != '!') @@ -783,14 +859,12 @@ error: goto first_tag; } else if (xml_get_char(ctx) == '-') - if (!(ctx->want & XML_WANT_COMMENT)) + if (!(ctx->flags & XML_REPORT_COMMENTS)) xml_skip_comment(ctx); else { xml_push_comment(ctx); - ctx->state = XML_STATE_PROLOG_COMMENT; - return XML_STATE_COMMENT; - case XML_STATE_PROLOG_COMMENT: + PULL_STATE(COMMENT, PROLOG_COMMENT); xml_pop_comment(ctx); } else @@ -798,14 +872,19 @@ error: /* DocTypeDecl */ xml_unget_char(ctx); xml_parse_doctype_decl(ctx); - if (ctx->want & XML_WANT_DOCUMENT_TYPE) - return ctx->state = XML_STATE_DOCUMENT_TYPE; - case XML_STATE_DOCUMENT_TYPE: + PULL(DOCTYPE_DECL); if (xml_peek_char(ctx) == '[') { + // FIXME: ability to skip the subset xml_skip_char(ctx); xml_inc(ctx); + xml_dtd_init(ctx); + if (ctx->h_dtd_start) + ctx->h_dtd_start(ctx); xml_parse_internal_subset(ctx); + // FIXME: external subset + if (ctx->h_dtd_end) + ctx->h_dtd_end(ctx); xml_parse_white(ctx, 0); } xml_parse_char(ctx, '>'); @@ -830,23 +909,17 @@ first_tag: ; if ((c = xml_get_char(ctx)) == '?') { /* PI */ - if (!(ctx->want & XML_WANT_PI)) + if (!(ctx->flags & (XML_REPORT_PIS | XML_ALLOC_PIS))) xml_skip_pi(ctx); else { if (xml_flush_chars(ctx)) { - if (ctx->want & XML_WANT_CHARS) - { - ctx->state = XML_STATE_CHARS_BEFORE_PI; - return XML_STATE_CHARS; - } - case XML_STATE_CHARS_BEFORE_PI: + PULL_STATE(CHARS, CHARS_BEFORE_PI); xml_pop_chars(ctx); } xml_push_pi(ctx); - return ctx->state = XML_STATE_PI; - case XML_STATE_PI: + 
PULL(PI); xml_pop_pi(ctx); } } @@ -855,46 +928,34 @@ first_tag: ; if ((c = xml_get_char(ctx)) == '-') { /* Comment */ - if (!(ctx->want & XML_WANT_COMMENT)) + if (!(ctx->flags & (XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS))) xml_skip_comment(ctx); else { if (xml_flush_chars(ctx)) { - if (ctx->want & XML_WANT_CHARS) - { - ctx->state = XML_STATE_CHARS_BEFORE_COMMENT; - return XML_STATE_CHARS; - } - case XML_STATE_CHARS_BEFORE_COMMENT: + PULL_STATE(CHARS, CHARS_BEFORE_COMMENT); xml_pop_chars(ctx); } xml_push_comment(ctx); - return ctx->state = XML_STATE_COMMENT; - case XML_STATE_COMMENT: + PULL(COMMENT); xml_pop_comment(ctx); } } else if (c == '[') { /* CDATA */ - if (!(ctx->want & XML_WANT_CDATA)) + if (!(ctx->flags & XML_UNFOLD_CDATA)) xml_append_cdata(ctx); else { if (xml_flush_chars(ctx)) { - if (ctx->want & XML_WANT_CHARS) - { - ctx->state = XML_STATE_CHARS_BEFORE_CDATA; - return XML_STATE_CHARS; - } - case XML_STATE_CHARS_BEFORE_CDATA: + PULL_STATE(CHARS, CHARS_BEFORE_CDATA); xml_pop_chars(ctx); } xml_push_cdata(ctx); - return ctx->state = XML_STATE_CDATA; - case XML_STATE_CDATA: + PULL(CDATA); xml_pop_cdata(ctx); } } @@ -907,20 +968,13 @@ first_tag: ; xml_unget_char(ctx); if (xml_flush_chars(ctx)) { - if (ctx->want & XML_WANT_CHARS) - { - ctx->state = XML_STATE_CHARS_BEFORE_STAG; - return XML_STATE_CHARS; - } - case XML_STATE_CHARS_BEFORE_STAG: + PULL_STATE(CHARS, CHARS_BEFORE_STAG); xml_pop_chars(ctx); } xml_push_element(ctx); - if (ctx->want & XML_WANT_STAG) - return ctx->state = XML_STATE_STAG; - case XML_STATE_STAG: - if (ctx->flags & XML_FLAG_EMPTY_ELEM) + PULL(STAG); + if (ctx->flags & XML_EMPTY_ELEM_TAG) goto pop_element; } @@ -929,20 +983,13 @@ first_tag: ; /* ETag */ if (xml_flush_chars(ctx)) { - if (ctx->want & XML_WANT_CHARS) - { - ctx->state = XML_STATE_CHARS_BEFORE_ETAG; - return XML_STATE_CHARS; - } - case XML_STATE_CHARS_BEFORE_ETAG: + PULL_STATE(CHARS, CHARS_BEFORE_ETAG); xml_pop_chars(ctx); } xml_parse_etag(ctx); pop_element: - if (ctx->want & 
XML_WANT_ETAG) - return ctx->state = XML_STATE_ETAG; - case XML_STATE_ETAG: + PULL(ETAG); xml_pop_element(ctx); if (!ctx->node) goto epilog; @@ -963,6 +1010,8 @@ epilog: if (ctx->h_document_end) ctx->h_document_end(ctx); case XML_STATE_EOF: + ctx->err_code = 0; + ctx->err_msg = NULL; return XML_STATE_EOF; } else @@ -973,32 +1022,42 @@ epilog: /* Misc */ xml_parse_char(ctx, '<'); + xml_inc(ctx); if ((c = xml_get_char(ctx)) == '?') /* Processing instruction */ - if (!(ctx->want & XML_WANT_PI)) + if (!(ctx->flags & XML_REPORT_PIS)) xml_skip_pi(ctx); else { xml_push_pi(ctx); - return ctx->state = XML_STATE_EPILOG_PI, XML_STATE_PI; - case XML_STATE_EPILOG_PI: + PULL_STATE(PI, EPILOG_PI); xml_pop_pi(ctx); } else if (c == '!') - /* Comment */ - if (!(ctx->want & XML_WANT_COMMENT)) - xml_skip_comment(ctx); - else - { - xml_push_comment(ctx); - return ctx->state = XML_STATE_EPILOG_COMMENT, XML_STATE_COMMENT; - case XML_STATE_EPILOG_COMMENT: - xml_pop_comment(ctx); - } + { + xml_parse_char(ctx, '-'); + /* Comment */ + if (!(ctx->flags & XML_REPORT_COMMENTS)) + xml_skip_comment(ctx); + else + { + xml_push_comment(ctx); + PULL_STATE(COMMENT, EPILOG_COMMENT); + xml_pop_comment(ctx); + } + } else xml_fatal(ctx, "Syntax error in the epilog"); } } - return -1; + ASSERT(0); +} + +uns +xml_parse(struct xml_context *ctx) +{ + ctx->pull = 0; + xml_next(ctx); + return ctx->err_code; } diff --git a/sherlock/xml/source.c b/sherlock/xml/source.c new file mode 100644 index 00000000..e77cca3e --- /dev/null +++ b/sherlock/xml/source.c @@ -0,0 +1,495 @@ +/* + * Sherlock Library -- A simple XML parser + * + * (c) 2007 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. 
+ */ + +#undef LOCAL_DEBUG + +#include "sherlock/sherlock.h" +#include "sherlock/xml/xml.h" +#include "sherlock/xml/dtd.h" +#include "sherlock/xml/common.h" +#include "lib/unicode.h" +#include "lib/ff-unicode.h" +#include "charset/charconv.h" +#include "charset/fb-charconv.h" + +/*** Character categorization ***/ + +#include "obj/sherlock/xml/unicat.c" + +static void +xml_init_cats(struct xml_context *ctx) +{ + if (!(ctx->flags & XML_VERSION_1_1)) + { + ctx->cat_chars = XML_CHAR_VALID_1_0; + ctx->cat_unrestricted = XML_CHAR_VALID_1_0; + ctx->cat_new_line = XML_CHAR_NEW_LINE_1_0; + ctx->cat_name = XML_CHAR_NAME_1_0; + ctx->cat_sname = XML_CHAR_SNAME_1_0; + } + else + { + ctx->cat_chars = XML_CHAR_VALID_1_1; + ctx->cat_unrestricted = XML_CHAR_UNRESTRICTED_1_1; + ctx->cat_new_line = XML_CHAR_NEW_LINE_1_1; + ctx->cat_name = XML_CHAR_NAME_1_1; + ctx->cat_sname = XML_CHAR_SNAME_1_1; + } +} + +/*** Reading of document/external entities ***/ + +static void NONRET +xml_eof(struct xml_context *ctx) +{ + ctx->err_msg = "Unexpected EOF"; + ctx->err_code = XML_ERR_EOF; + xml_throw(ctx); +} + +void NONRET +xml_fatal_nested(struct xml_context *ctx) +{ + xml_fatal(ctx, "Entity is not nested correctly"); +} + +static inline void +xml_add_char(u32 **bstop, uns c) +{ + *(*bstop)++ = c; + *(*bstop)++ = xml_char_cat(c); +} + +struct xml_source * +xml_push_source(struct xml_context *ctx, uns flags) +{ + xml_push(ctx); + struct xml_source *src = ctx->src; + if (src) + { + src->bptr = ctx->bptr; + src->bstop = ctx->bstop; + } + src = mp_alloc_zero(ctx->stack, sizeof(*src)); + src->next = ctx->src; + src->saved_depth = ctx->depth; + ctx->src = src; + ctx->flags = (ctx->flags & ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_NEW_LINE | XML_SRC_SURROUND | XML_SRC_DOCUMENT)) | flags; + ctx->bstop = ctx->bptr = src->buf; + ctx->depth = 0; + if (flags & XML_SRC_SURROUND) + xml_add_char(&ctx->bstop, 0x20); + return src; +} + +static void +xml_close_source(struct xml_source *src) +{ +
bclose(src->fb); + if (src->wrapped_fb) + bclose(src->wrapped_fb); +} + +static void +xml_pop_source(struct xml_context *ctx) +{ + TRACE(ctx, "pop_source"); + if (unlikely(ctx->depth != 0)) + { + xml_fatal(ctx, "Unexpected end of entity"); + } + struct xml_source *src = ctx->src; + ASSERT(src); + xml_close_source(src); + ctx->depth = src->saved_depth; + ctx->src = src = src->next; + if (src) + { + ctx->bptr = src->bptr; + ctx->bstop = src->bstop; + } + xml_pop(ctx); + if (unlikely(!src)) + xml_eof(ctx); +} + +void +xml_sources_cleanup(struct xml_context *ctx) +{ + struct xml_source *s; + while (s = ctx->src) + { + ctx->src = s->next; + xml_close_source(s); + } +} + +static void xml_refill_utf8(struct xml_context *ctx); + +void +xml_push_entity(struct xml_context *ctx, struct xml_dtd_ent *ent) +{ + TRACE(ctx, "xml_push_entity"); + uns cat1 = ctx->src->refill_cat1; + uns cat2 = ctx->src->refill_cat2; + struct xml_source *src = xml_push_source(ctx, 0); + src->refill_cat1 = cat1; + src->refill_cat2 = cat2; + if (ent->flags & XML_DTD_ENT_EXTERNAL) + xml_fatal(ctx, "External entities not implemented"); // FIXME + else + { + fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, ent->len, 0); + src->refill = xml_refill_utf8; + } +} + +void +xml_set_source(struct xml_context *ctx, struct fastbuf *fb) +{ + TRACE(ctx, "xml_set_source"); + ASSERT(!ctx->src); + struct xml_source *src = xml_push_source(ctx, XML_SRC_DOCUMENT | XML_SRC_EXPECTED_DECL); + src->fb = fb; + ctx->state = XML_STATE_START; +} + +static uns +xml_error_restricted(struct xml_context *ctx, uns c) +{ + if (c == ~1U) + xml_error(ctx, "Corrupted encoding"); + else + xml_error(ctx, "Restricted char U+%04X", c); + return UNI_REPLACEMENT; +} + +void xml_parse_decl(struct xml_context *ctx); + +#define REFILL(ctx, func, params...) 
\ + struct xml_source *src = ctx->src; \ + struct fastbuf *fb = src->fb; \ + if (ctx->bptr == ctx->bstop) \ + ctx->bptr = ctx->bstop = src->buf; \ + uns f = ctx->flags, c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \ + u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \ + *last_0xd = (f & XML_SRC_NEW_LINE) ? bstop : bend; \ + do \ + { \ + c = func(fb, ##params); \ + uns t = xml_char_cat(c); \ + if (t & t1) \ + /* Typical branch */ \ + *bstop++ = c, *bstop++ = t; \ + else if (t & t2) \ + { \ + /* New line */ \ + /* XML 1.0: 0xA | 0xD | 0xD 0xA */ \ + /* XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ \ + if (c == 0xd) \ + last_0xd = bstop + 2; \ + else if (c != 0x2028 && last_0xd == bstop) \ + { \ + last_0xd = bend; \ + continue; \ + } \ + xml_add_char(&bstop, 0xa), row++; \ + } \ + else if (c == '>') \ + { \ + /* Used only in XML/TextDecl to switch the encoding */ \ + *bstop++ = c, *bstop++ = t; \ + break; \ + } \ + else if (~c) \ + /* Restricted character */ \ + xml_add_char(&bstop, xml_error_restricted(ctx, c)); \ + else \ + { \ + /* EOF */ \ + if (f & XML_SRC_SURROUND) \ + xml_add_char(&bstop, 0x20); \ + f |= XML_SRC_EOF; \ + break; \ + } \ + } \ + while (bstop < bend); \ + ctx->flags = (last_0xd == bstop) ? f | XML_SRC_NEW_LINE : f & ~XML_SRC_NEW_LINE; \ + ctx->bstop = bstop; \ + src->row = row; + +static void +xml_refill_utf8(struct xml_context *ctx) +{ + REFILL(ctx, bget_utf8_repl, ~1U); +} + +static void +xml_refill_utf16_le(struct xml_context *ctx) +{ + REFILL(ctx, bget_utf16_le_repl, ~1U); +} + +static void +xml_refill_utf16_be(struct xml_context *ctx) +{ + REFILL(ctx, bget_utf16_be_repl, ~1U); +} + +#if 0 +static inline uns +xml_refill_libcharset_bget(struct fastbuf *fb, unsigned short int *in_to_x) +{ + // FIXME: slow + int c; + return (unlikely(c = bgetc(fb) < 0)) ? 
c : (int)conv_x_to_ucs(in_to_x[c]); +} + +static void +xml_refill_libcharset(struct xml_context *ctx) +{ + unsigned short int *in_to_x = ctx->src->refill_in_to_x; + REFILL(ctx, xml_refill_libcharset_bget, in_to_x); +} +#endif + +#undef REFILL + +void +xml_refill(struct xml_context *ctx) +{ + do + { + if (ctx->flags & XML_SRC_EOF) + xml_pop_source(ctx); + else if (ctx->flags & XML_SRC_EXPECTED_DECL) + xml_parse_decl(ctx); + else + { + ctx->src->refill(ctx); + TRACE(ctx, "refilled %u characters", (uns)((ctx->bstop - ctx->bptr) / 2)); + } + } + while (ctx->bptr == ctx->bstop); +} + +uns +xml_row(struct xml_context *ctx) +{ + struct xml_source *src = ctx->src; + if (!src) + return 0; + uns row = src->row; + for (u32 *p = ctx->bstop; p != ctx->bptr; p -= 2) + if (p[-1] & src->refill_cat2) + row--; + return row + 1; +} + +/* Document/external entity header */ + +static char * +xml_parse_encoding_name(struct xml_context *ctx) +{ + /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" */ + char *p = mp_start_noalign(ctx->pool, 1); + uns q = xml_parse_quote(ctx); + if (unlikely(!(xml_get_cat(ctx) & XML_CHAR_ENC_SNAME))) + xml_fatal(ctx, "Invalid character in the encoding name"); + while (1) + { + p = mp_spread(ctx->pool, p, 2); + *p++ = xml_last_char(ctx); + if (xml_get_char(ctx) == q) + break; + if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME))) + xml_fatal(ctx, "Invalid character in the encoding name"); + } + *p++ = 0; + return mp_end(ctx->pool, p); +} + +static void +xml_init_charconv(struct xml_context *ctx, int cs) +{ + // FIXME: hack + struct xml_source *src = ctx->src; + TRACE(ctx, "wrapping charset %s", charset_name(cs)); +#if 0 + struct conv_context conv; + conv_set_charset(&conv, cs, CONV_CHARSET_UTF8); + src->refill = xml_refill_libcharset; + src->refill_in_to_x = conv.in_to_x; +#else + src->wrapped_fb = src->fb; + src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8); +#endif +} + +void 
+xml_parse_decl(struct xml_context *ctx) +{ + TRACE(ctx, "xml_parse_decl"); + struct xml_source *src = ctx->src; + ctx->flags &= ~XML_SRC_EXPECTED_DECL; + uns doc = ctx->flags & XML_SRC_DOCUMENT; + + /* Setup valid Unicode ranges and force the reader to abort refill() after each '>', where we can switch encoding or XML version */ + if (doc) + xml_init_cats(ctx); + src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line & ~XML_CHAR_GT; + src->refill_cat2 = ctx->cat_new_line; + + /* Initialize the supplied charset (if any) or try to guess it */ + char *expected_encoding = src->expected_encoding ? : src->fb_encoding; + src->refill = xml_refill_utf8; + int bom = bpeekc(src->fb); + if (bom < 0) + ctx->flags |= XML_SRC_EOF; + if (!src->fb_encoding) + { + if (bom == 0xfe) + src->refill = xml_refill_utf16_be; + else if (bom == 0xff) + src->refill = xml_refill_utf16_le; + } + else + { + int cs = find_charset_by_name(src->fb_encoding); + if (cs == CONV_CHARSET_UTF8) + {} + else if (cs >= 0) + { + xml_init_charconv(ctx, cs); + bom = 0; + } + else if (strcasecmp(src->fb_encoding, "UTF-16")) + { + src->refill = xml_refill_utf16_be; + if (bom == 0xff) + src->refill = xml_refill_utf16_le; + if (!src->expected_encoding) + expected_encoding = (bom == 0xff) ? 
"UTF-16LE" : "UTF-16BE"; + } + else if (strcasecmp(src->fb_encoding, "UTF-16BE")) + src->refill = xml_refill_utf16_be; + else if (strcasecmp(src->fb_encoding, "UTF-16LE")) + src->refill = xml_refill_utf16_le; + else + { + xml_error(ctx, "Unknown encoding '%s'", src->fb_encoding); + expected_encoding = NULL; + } + } + uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be; + if (bom > 0 && xml_peek_char(ctx) == 0xfeff) + xml_skip_char(ctx); + else if (utf16) + xml_error(ctx, "Missing or corrupted BOM"); + + /* Look ahead for presence of XMLDecl or optional TextDecl */ + if (!(ctx->flags & XML_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf)) + xml_refill(ctx); + u32 *bptr = ctx->bptr; + uns have_decl = (12 <= ctx->bstop - ctx->bptr && (bptr[11] & XML_CHAR_WHITE) && + bptr[0] == '<' && bptr[2] == '?' && (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L'); + if (!have_decl) + { + if (doc) + xml_fatal(ctx, "Missing or corrupted XML header"); + else if (expected_encoding && strcasecmp(src->expected_encoding, "UTF-8") && !utf16) + xml_error(ctx, "Missing or corrupted entity header"); + goto exit; + } + ctx->bptr = bptr + 12; + xml_parse_white(ctx, 0); + + /* Parse version string (mandatory in XMLDecl, optional in TextDecl) */ + if (xml_peek_char(ctx) == 'v') + { + xml_parse_seq(ctx, "version"); + xml_parse_eq(ctx); + char *version = xml_parse_pubid_literal(ctx, ctx->pool); + TRACE(ctx, "version=%s", version); + uns v = 0; + if (!strcmp(version, "1.1")) + v = XML_VERSION_1_1; + else if (strcmp(version, "1.0")) + { + xml_error(ctx, "Unknown XML version string '%s'", version); + version = "1.0"; + } + if (doc) + { + ctx->version_str = version; + ctx->flags |= v; + } + else if (v > (ctx->flags & XML_VERSION_1_1)) + xml_error(ctx, "XML 1.1 external entity included from XML 1.0 document"); + if (!xml_parse_white(ctx, !doc)) + goto end; + } + else if (doc) + { + xml_error(ctx, "Expected XML version"); + 
ctx->version_str = "1.0"; + } + + /* Parse encoding string (optional in XMLDecl, mandatory in TextDecl) */ + if (xml_peek_char(ctx) == 'e') + { + xml_parse_seq(ctx, "encoding"); + xml_parse_eq(ctx); + src->decl_encoding = xml_parse_encoding_name(ctx); + TRACE(ctx, "encoding=%s", src->decl_encoding); + if (!xml_parse_white(ctx, 0)) + goto end; + } + else if (!doc) + xml_error(ctx, "Expected XML encoding"); + + /* Parse whether the document is standalone (optional in XMLDecl) */ + if (doc && xml_peek_char(ctx) == 's') + { + xml_parse_seq(ctx, "standalone"); + xml_parse_eq(ctx); + uns c = xml_parse_quote(ctx); + if (ctx->standalone = (xml_peek_char(ctx) == 'y')) + xml_parse_seq(ctx, "yes"); + else + xml_parse_seq(ctx, "no"); + xml_parse_char(ctx, c); + TRACE(ctx, "standalone=%d", ctx->standalone); + xml_parse_white(ctx, 0); + } +end: + xml_parse_seq(ctx, "?>"); + + /* Switch to the final encoding */ + if (src->decl_encoding) + { + int cs = find_charset_by_name(src->decl_encoding); + if (cs < 0 && !expected_encoding) + xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding); + else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8) + xml_init_charconv(ctx, cs); + else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 || + !(!strcasecmp(src->decl_encoding, "UTF-16") || + (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) || + (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE"))))) + xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding); + } + +exit: + /* Update valid Unicode ranges */ + if (doc) + xml_init_cats(ctx); + src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line; + src->refill_cat2 = ctx->cat_new_line; +} diff --git a/sherlock/xml/xml-test.c b/sherlock/xml/xml-test.c index cca5ad8a..a7ecda83 100644 --- a/sherlock/xml/xml-test.c +++ b/sherlock/xml/xml-test.c @@ 
-15,27 +15,43 @@ #include #include -static char *shortopts = "sp" CF_SHORT_OPTS; +enum { + WANT_FIRST = 0x100, + WANT_HIDE_ERRORS, + WANT_UNFOLD_CDATA, + WANT_IGNORE_COMMENTS, + WANT_IGNORE_PIS, +}; + +static char *shortopts = "spd" CF_SHORT_OPTS; static struct option longopts[] = { CF_LONG_OPTS - { "sax", 0, 0, 's' }, - { "pull", 0, 0, 'p' }, - { "dom", 0, 0, 'd' }, - { NULL, 0, 0, 0 } + { "sax", 0, 0, 's' }, + { "pull", 0, 0, 'p' }, + { "dom", 0, 0, 'd' }, + { "hide-errors", 0, 0, WANT_HIDE_ERRORS }, + { "unfold-cdata", 0, 0, WANT_UNFOLD_CDATA }, + { "ignore-comments", 0, 0, WANT_IGNORE_COMMENTS }, + { "ignore-pis", 0, 0, WANT_IGNORE_PIS }, + { NULL, 0, 0, 0 } }; static void NONRET usage(void) { fputs("\ -Usage: xml-test [options] < in.xml\n\ +Usage: xml-test [options] < input.xml\n\ \n\ Options:\n" CF_USAGE "\ --s, --pull Test PULL interface\n\ --s, --sax Test SAX interface\n\ --d, --dom Test DOM interface\n\ +-p, --pull Test PULL interface\n\ +-s, --sax Test SAX interface\n\ +-d, --dom Test DOM interface\n\ + --hide-errors Hide warnings and error messages\n\ + --unfold-cdata Unfold CDATA sections\n\ + --ignore-comments Ignore comments\n\ + --ignore-pis Ignore processing instructions\n\ \n", stderr); exit(1); } @@ -43,6 +59,11 @@ CF_USAGE static uns want_sax; static uns want_pull; static uns want_dom; +static uns want_hide_errors; +static uns want_unfold_cdata; +static uns want_ignore_comments; +static uns want_ignore_pis; + static struct fastbuf *out; static char * @@ -53,7 +74,7 @@ node_type(struct xml_node *node) case XML_NODE_ELEM: return "element"; case XML_NODE_COMMENT: return "comment"; case XML_NODE_PI: return "pi"; - case XML_NODE_CDATA: return "chars"; + case XML_NODE_CHARS: return "chars"; default: return "unknown"; } } @@ -65,7 +86,7 @@ show_node(struct xml_node *node) { case XML_NODE_ELEM: bprintf(out, " <%s>", node->name); - SLIST_FOR_EACH(struct xml_attr *, a, node->attrs) + XML_ATTR_FOR_EACH(a, node) bprintf(out, " %s='%s'", a->name, a->val);
bputc(out, '\n'); break; @@ -75,7 +96,7 @@ show_node(struct xml_node *node) case XML_NODE_PI: bprintf(out, " target=%s text='%s'\n", node->name, node->text); break; - case XML_NODE_CDATA: + case XML_NODE_CHARS: bprintf(out, " text='%s'\n", node->text); break; default: @@ -94,7 +115,7 @@ show_tree(struct xml_node *node, uns level) bputs(out, node_type(node)); show_node(node); if (node->type == XML_NODE_ELEM) - CLIST_FOR_EACH(struct xml_node *, son, node->sons) + XML_NODE_FOR_EACH(son, node) show_tree(son, level + 1); } @@ -126,8 +147,8 @@ static void h_doctype_decl(struct xml_context *ctx) { bprintf(out, "SAX: doctype_decl type=%s public='%s' system='%s' extsub=%d intsub=%d\n", - ctx->document_type, ctx->eid.public_id ? : "", ctx->eid.system_id ? : "", - !!(ctx->flags & XML_FLAG_HAS_EXTERNAL_SUBSET), !!(ctx->flags & XML_FLAG_HAS_INTERNAL_SUBSET)); + ctx->doctype, ctx->public_id ? : "", ctx->system_id ? : "", + !!(ctx->flags & XML_HAS_EXTERNAL_SUBSET), !!(ctx->flags & XML_HAS_INTERNAL_SUBSET)); } static void @@ -140,35 +161,54 @@ h_comment(struct xml_context *ctx) static void h_pi(struct xml_context *ctx) { - bprintf(out, "SAX: pi"); + bputs(out, "SAX: pi"); show_node(ctx->node); } static void -h_element_start(struct xml_context *ctx) +h_stag(struct xml_context *ctx) { - bprintf(out, "SAX: element_start"); + bputs(out, "SAX: stag"); show_node(ctx->node); } static void -h_element_end(struct xml_context *ctx) +h_etag(struct xml_context *ctx) { - bprintf(out, "SAX: element_end \n", ctx->node->name); + bprintf(out, "SAX: etag \n", ctx->node->name); } static void h_chars(struct xml_context *ctx) { - bprintf(out, "SAX: chars"); + bputs(out, "SAX: chars"); + show_node(ctx->node); +} + +static void +h_cdata(struct xml_context *ctx) +{ + bputs(out, "SAX: cdata"); show_node(ctx->node); } +static void +h_dtd_start(struct xml_context *ctx UNUSED) +{ + bputs(out, "SAX: dtd_start\n"); +} + +static void +h_dtd_end(struct xml_context *ctx UNUSED) +{ + bputs(out, "SAX: dtd_end\n"); 
+} + int main(int argc, char **argv) { int opt; - cf_def_file = NULL; // FIXME + cf_def_file = NULL; log_init(argv[0]); while ((opt = cf_getopt(argc, argv, shortopts, longopts, NULL)) >= 0) switch (opt) @@ -182,6 +222,18 @@ main(int argc, char **argv) case 'd': want_dom++; break; + case WANT_HIDE_ERRORS: + want_hide_errors++; + break; + case WANT_UNFOLD_CDATA: + want_unfold_cdata++; + break; + case WANT_IGNORE_COMMENTS: + want_ignore_comments++; + break; + case WANT_IGNORE_PIS: + want_ignore_pis++; + break; default: usage(); } @@ -191,7 +243,8 @@ main(int argc, char **argv) out = bfdopen_shared(1, 4096); struct xml_context ctx; xml_init(&ctx); - ctx.h_warn = ctx.h_error = ctx.h_fatal = h_error; + if (!want_hide_errors) + ctx.h_warn = ctx.h_error = ctx.h_fatal = h_error; if (want_sax) { ctx.h_document_start = h_document_start; @@ -200,52 +253,68 @@ main(int argc, char **argv) ctx.h_doctype_decl = h_doctype_decl; ctx.h_comment = h_comment; ctx.h_pi = h_pi; - ctx.h_element_start = h_element_start; - ctx.h_element_end = h_element_end; + ctx.h_stag = h_stag; + ctx.h_etag = h_etag; ctx.h_chars = h_chars; + ctx.h_cdata = h_cdata; + ctx.h_dtd_start = h_dtd_start; + ctx.h_dtd_end = h_dtd_end; } - if (want_pull) - ctx.want = XML_WANT_CHARS | XML_WANT_STAG | XML_WANT_ETAG | XML_WANT_COMMENT | XML_WANT_PI; if (want_dom) - ctx.flags &= ~XML_DOM_FREE; + ctx.flags |= XML_ALLOC_ALL; + if (want_unfold_cdata) + ctx.flags |= XML_UNFOLD_CDATA; + if (want_ignore_comments) + ctx.flags &= ~(XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS); + if (want_ignore_pis) + ctx.flags &= ~(XML_REPORT_PIS | XML_ALLOC_PIS); xml_set_source(&ctx, bfdopen_shared(0, 4096)); - int state; - bprintf(out, "PULL: start\n"); - while ((state = xml_next(&ctx)) >= 0 && state != XML_STATE_EOF) - switch (state) - { - case XML_STATE_CHARS: - bprintf(out, "PULL: chars"); - show_node(ctx.node); - break; - case XML_STATE_STAG: - bprintf(out, "PULL: element_start"); - show_node(ctx.node); - break; - case XML_STATE_ETAG: - 
bprintf(out, "PULL: element_end \n", ctx.node->name); - break; - case XML_STATE_COMMENT: - bprintf(out, "PULL: comment"); - show_node(ctx.node); - break; - case XML_STATE_PI: - bprintf(out, "PULL: pi"); - show_node(ctx.node); - break; -#if 0 - case XML_STATE_CDATA: - bprintf(out, "PULL: cdata [%s]\n", ctx.node->text); - break; -#endif - } - if (state != XML_STATE_EOF) - bprintf(out, "PULL: fatal error\n"); + bputs(out, "PULL: start\n"); + if (want_pull) + { + ctx.pull = XML_PULL_CHARS | XML_PULL_CDATA | XML_PULL_STAG | XML_PULL_ETAG | XML_PULL_COMMENT | XML_PULL_PI; + uns state; + while (state = xml_next(&ctx)) + switch (state) + { + case XML_STATE_CHARS: + bputs(out, "PULL: chars"); + show_node(ctx.node); + break; + case XML_STATE_CDATA: + bputs(out, "PULL: cdata"); + show_node(ctx.node); + break; + case XML_STATE_STAG: + bputs(out, "PULL: stag"); + show_node(ctx.node); + break; + case XML_STATE_ETAG: + bprintf(out, "PULL: etag \n", ctx.node->name); + break; + case XML_STATE_COMMENT: + bputs(out, "PULL: comment"); + show_node(ctx.node); + break; + case XML_STATE_PI: + bputs(out, "PULL: pi"); + show_node(ctx.node); + break; + default: + bputs(out, "PULL: unknown\n"); + break; + } + } else - bprintf(out, "PULL: eof\n"); - - if (want_dom) - show_tree(ctx.root, 0); + xml_parse(&ctx); + if (ctx.err_code) + bprintf(out, "PULL: fatal error at %u: %s\n", xml_row(&ctx), ctx.err_msg); + else + { + bputs(out, "PULL: eof\n"); + if (want_dom) + show_tree(ctx.root, 0); + } xml_cleanup(&ctx); bclose(out); diff --git a/sherlock/xml/xml.h b/sherlock/xml/xml.h index db6ab6c6..c63d229b 100644 --- a/sherlock/xml/xml.h +++ b/sherlock/xml/xml.h @@ -16,129 +16,117 @@ #include "lib/fastbuf.h" enum xml_error { + // FIXME XML_ERR_OK = 0, - XML_ERR_WARN = 1000, /* Warning */ - XML_ERR_ERROR = 2000, /* Recoverable error */ - XML_ERR_FATAL = 3000, /* Unrecoverable error */ + XML_ERR_WARN = 1000, /* Warning */ + XML_ERR_ERROR = 2000, /* Recoverable error */ + XML_ERR_FATAL = 3000, /* 
Unrecoverable error */ XML_ERR_EOF, }; enum xml_state { - XML_STATE_START = 0, - XML_STATE_DECL, - XML_STATE_DOCUMENT_TYPE, - XML_STATE_CHARS, - XML_STATE_WHITE, - XML_STATE_CDATA, - XML_STATE_STAG, - XML_STATE_ETAG, - XML_STATE_COMMENT, - XML_STATE_PI, - XML_STATE_EOF, - XML_STATE_FATAL, + XML_STATE_EOF, /* EOF or a fatal error */ + XML_STATE_START, /* Initial state */ + XML_STATE_XML_DECL, /* XML_PULL_XML_DECL */ + XML_STATE_DOCTYPE_DECL, /* XML_PULL_DOCTYPE_DECL */ + XML_STATE_CHARS, /* XML_PULL_CHARS */ + XML_STATE_CDATA, /* XML_PULL_CDATA */ + XML_STATE_STAG, /* XML_PULL_STAG */ + XML_STATE_ETAG, /* XML_PULL_ETAG */ + XML_STATE_COMMENT, /* XML_PULL_COMMENT */ + XML_STATE_PI, /* XML_PULL_PI */ /* Internal states */ XML_STATE_CHARS_BEFORE_STAG, XML_STATE_CHARS_BEFORE_ETAG, XML_STATE_CHARS_BEFORE_CDATA, - XML_STATE_CHARS_BEFORE_PI, XML_STATE_CHARS_BEFORE_COMMENT, - XML_STATE_PROLOG_PI, + XML_STATE_CHARS_BEFORE_PI, XML_STATE_PROLOG_COMMENT, - XML_STATE_EPILOG_PI, + XML_STATE_PROLOG_PI, XML_STATE_EPILOG_COMMENT, + XML_STATE_EPILOG_PI, }; -enum xml_want { - XML_WANT_DECL = 1 << XML_STATE_DECL, - XML_WANT_DOCUMENT_TYPE = 1 << XML_STATE_DOCUMENT_TYPE, - XML_WANT_CHARS = 1 << XML_STATE_CHARS, - XML_WANT_WHITE = 1 << XML_STATE_WHITE, - XML_WANT_CDATA = 1 << XML_STATE_CDATA, - XML_WANT_STAG = 1 << XML_STATE_STAG, - XML_WANT_ETAG = 1 << XML_STATE_ETAG, - XML_WANT_COMMENT = 1 << XML_STATE_COMMENT, - XML_WANT_PI = 1 << XML_STATE_PI, - XML_WANT_EOF = 1 << XML_STATE_EOF, - XML_WANT_ALL = ~0U, +enum xml_pull { + XML_PULL_XML_DECL = 0x00000001, /* Stop after the XML declaration */ + XML_PULL_DOCTYPE_DECL = 0x00000002, /* Stop in the doctype declaration (before optional internal subset) */ + XML_PULL_CHARS = 0x00000004, + XML_PULL_CDATA = 0x00000008, + XML_PULL_STAG = 0x00000010, + XML_PULL_ETAG = 0x00000020, + XML_PULL_COMMENT = 0x00000040, + XML_PULL_PI = 0x00000080, + XML_PULL_ALL = 0xffffffff, }; enum xml_flags { - XML_FLAG_VALIDATING = 0x1, - XML_FLAG_VERSION_1_1 = 0x2, /* 
XML version 1.1, otherwise 1.0 */ - XML_FLAG_HAS_EXTERNAL_SUBSET = 0x4, /* The document contains a reference to external DTD subset */ - XML_FLAG_HAS_INTERNAL_SUBSET = 0x8, /* The document contains an internal subset */ - - XML_FLAG_SRC_EOF = 0x10, /* EOF reached */ - XML_FLAG_SRC_EXPECTED_DECL = 0x20, /* Just before optional or required XMLDecl/TextDecl */ - XML_FLAG_SRC_NEW_LINE = 0x40, /* The last read character is 0xD */ - XML_FLAG_SRC_SURROUND = 0x80, /* Surround the text with 0x20 (references to parameter entities) */ - XML_FLAG_SRC_DOCUMENT = 0x100, /* The document entity */ - XML_FLAG_SRC_EXTERNAL = 0x200, /* An external entity */ - - XML_DOM_SKIP = 0x1000, /* Do not report DOM nodes */ - XML_DOM_FREE = 0x2000, /* Free the subtree when leaving */ - XML_DOM_IGNORE = XML_DOM_SKIP | XML_DOM_FREE, /* Completely ignore the subtree */ - - XML_FLAG_EMPTY_ELEM = 0x100000, -}; - -struct xml_ext_id { - char *system_id; - char *public_id; + /* Enable reporting of various events via SAX and/or PULL interface */ + XML_REPORT_COMMENTS = 0x00000001, /* Report comments */ + XML_REPORT_PIS = 0x00000002, /* Report processing instructions */ + XML_REPORT_CHARS = 0x00000004, /* Report characters */ + XML_REPORT_TAGS = 0x00000008, /* Report element starts/ends */ + XML_REPORT_MISC = XML_REPORT_COMMENTS | XML_REPORT_PIS, + XML_REPORT_ALL = XML_REPORT_MISC | XML_REPORT_CHARS | XML_REPORT_TAGS, + + /* Enable construction of DOM for these types */ + XML_ALLOC_COMMENTS = 0x00000010, /* Create comment nodes */ + XML_ALLOC_PIS = 0x00000020, /* Create processing instruction nodes */ + XML_ALLOC_CHARS = 0x00000040, /* Create character nodes */ + XML_ALLOC_TAGS = 0x00000080, /* Create element nodes */ + XML_ALLOC_MISC = XML_ALLOC_COMMENTS | XML_ALLOC_PIS, + XML_ALLOC_ALL = XML_ALLOC_MISC | XML_ALLOC_CHARS | XML_ALLOC_TAGS, + + /* Other parameters */ + XML_UNFOLD_CDATA = 0x00000100, /* Unfold CDATA sections */ + XML_VALIDATING = 0x00000200, /* Validate everything (not fully implemented!) 
*/ + + /* Internals, do not change! */ + XML_EMPTY_ELEM_TAG = 0x00010000, /* The current element matches EmptyElemTag */ + XML_VERSION_1_1 = 0x00020000, /* XML version is 1.1, otherwise 1.0 */ + XML_HAS_EXTERNAL_SUBSET = 0x00040000, /* The document contains a reference to external DTD subset */ + XML_HAS_INTERNAL_SUBSET = 0x00080000, /* The document contains an internal subset */ + XML_SRC_EOF = 0x00100000, /* EOF reached */ + XML_SRC_EXPECTED_DECL = 0x00200000, /* Just before optional or required XMLDecl/TextDecl */ + XML_SRC_NEW_LINE = 0x00400000, /* The last read character is 0xD */ + XML_SRC_SURROUND = 0x00800000, /* Surround the text with 0x20 (references to parameter entities) */ + XML_SRC_DOCUMENT = 0x01000000, /* The document entity */ + XML_SRC_EXTERNAL = 0x02000000, /* An external entity */ }; enum xml_node_type { XML_NODE_ELEM, XML_NODE_COMMENT, - XML_NODE_CDATA, + XML_NODE_CHARS, XML_NODE_PI, }; +#define XML_NODE_FOR_EACH(var, node) CLIST_FOR_EACH(struct xml_node *, var, (node)->sons) +#define XML_ATTR_FOR_EACH(var, node) SLIST_FOR_EACH(struct xml_attr *, var, (node)->attrs) + struct xml_node { - cnode n; /* Node for list of parent's sons */ - uns type; /* XML_NODE_x */ - struct xml_node *parent; /* Parent node */ - char *name; /* Element name / PI target */ - clist sons; /* Children nodes */ + cnode n; /* Node for list of parent's sons */ + uns type; /* XML_NODE_x */ + struct xml_node *parent; /* Parent node */ + char *name; /* Element name / PI target */ + clist sons; /* Children nodes */ union { struct { - char *text; /* PI text / Comment / CDATA */ - uns len; /* Text length in bytes */ + char *text; /* PI text / Comment / CDATA */ + uns len; /* Text length in bytes */ }; struct { - struct xml_dtd_elem *dtd; /* Element DTD */ - slist attrs; /* Link list of element attributes */ + struct xml_dtd_elem *dtd; /* Element DTD */ + slist attrs; /* Link list of element attributes */ }; }; }; struct xml_attr { - snode n; - struct xml_node *elem; - char *name; - 
char *val; -}; - -struct xml_context; - -#define XML_BUF_SIZE 32 /* At least 16 -- hardcoded */ - -struct xml_source { - struct xml_source *next; /* Link list of pending fastbufs (xml_context.sources) */ - struct fastbuf *fb; /* Source fastbuf */ - struct fastbuf wrap_fb; /* Libcharset or fbmem wrapper */ - u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */ - u32 *bptr, *bstop; /* Current state of the buffer */ - uns row; /* File position */ - char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */ - char *fb_encoding; /* Encoding of the source fastbuf */ - char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */ - uns refill_cat1; /* Character categories, which should be directly passed to the buffer */ - uns refill_cat2; /* Character categories, which should be processed as newlines (possibly in some built-in sequences) */ - void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */ - unsigned short *refill_in_to_x; /* Libcharset input table */ - uns saved_depth; /* Saved ctx->depth */ + snode n; /* Node for elem->attrs */ + struct xml_node *elem; /* Parent element */ + char *name; /* Attribute name */ + char *val; /* Attribute value */ }; struct xml_context { @@ -152,28 +140,35 @@ struct xml_context { /* Memory management */ struct mempool *pool; /* DOM pool */ - struct mempool *stack; /* Stack pool (freed as soon as possible) */ + struct mempool *stack; /* Stack pool (free as soon as possible) */ struct xml_stack *stack_list; /* See xml_push(), xml_pop() */ uns flags; /* XML_FLAG_x (restored on xml_pop()) */ uns depth; /* Nesting level */ struct fastbuf chars; /* Character data / attribute value */ - void *tab_attrs; + void *tab_attrs; /* Hash table of element attributes */ /* Input */ struct xml_source *src; /* Current source */ - u32 *bptr, *bstop; /* Character buffer */ + u32 *bptr, *bstop; /* Buffer with preprocessed 
characters (validated UCS-4 + category flags) */ + uns cat_chars; /* Unicode range of supported characters (cdata, attribute values, ...) */ + uns cat_unrestricted; /* Unrestricted characters (may appear in document/external entities) */ + uns cat_new_line; /* New line characters */ + uns cat_name; /* Characters that may appear in names */ + uns cat_sname; /* Characters that may begin a name */ /* SAX-like interface */ void (*h_document_start)(struct xml_context *ctx); /* Called before entering prolog */ void (*h_document_end)(struct xml_context *ctx); /* Called after leaving epilog */ void (*h_xml_decl)(struct xml_context *ctx); /* Called after the XML declaration */ - void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration just before internal subset */ - void (*h_comment)(struct xml_context *ctx); /* Called after a comment */ - void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction */ - void (*h_element_start)(struct xml_context *ctx); /* Called after STag or EmptyElemTag */ - void (*h_element_end)(struct xml_context *ctx); /* Called before ETag or after EmptyElemTag */ - void (*h_chars)(struct xml_context *ctx); /* Called after some characters */ - void (*h_cdata)(struct xml_context *ctx); /* Called after a CDATA section */ + void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration (before optional internal subset) */ + void (*h_comment)(struct xml_context *ctx); /* Called after a comment (only with XML_REPORT_COMMENTS) */ + void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction (only with XML_REPORT_PIS) */ + void (*h_stag)(struct xml_context *ctx); /* Called after STag or EmptyElemTag (only with XML_REPORT_TAGS) */ + void (*h_etag)(struct xml_context *ctx); /* Called before ETag or after EmptyElemTag (only with XML_REPORT_TAGS) */ + void (*h_chars)(struct xml_context *ctx); /* Called after some characters (only with XML_REPORT_CHARS) */ + void 
(*h_cdata)(struct xml_context *ctx); /* Called after a CDATA section (only with XML_REPORT_CHARS and XML_UNFOLD_CDATA) */ + void (*h_dtd_start)(struct xml_context *ctx); /* Called just after the DTD structure is initialized */ + void (*h_dtd_end)(struct xml_context *ctx); /* Called after the DTD subsets */ /* DOM */ struct xml_node *root; /* DOM root */ @@ -181,14 +176,13 @@ struct xml_context { char *version_str; uns standalone; - char *document_type; - struct xml_dtd *dtd; - struct xml_ext_id eid; - uns state; - uns want; - - void (*start_dtd)(struct xml_context *ctx); - void (*end_dtd)(struct xml_context *ctx); + char *doctype; /* The document type (or NULL if unknown) */ + char *system_id; /* DTD external id */ + char *public_id; /* DTD public id */ + struct xml_dtd *dtd; /* The DTD structure (or NULL) */ + uns state; /* Current state for the PULL interface (XML_STATE_x) */ + uns pull; /* Parameters for the PULL interface (XML_PULL_x) */ + void (*start_entity)(struct xml_context *ctx); void (*end_entity)(struct xml_context *ctx); struct fastbuf *(*resolve_entity)(struct xml_context *ctx); @@ -196,10 +190,25 @@ struct xml_context { void (*unparsed_entity_decl)(struct xml_context *ctx); }; +/* Initialize XML context */ void xml_init(struct xml_context *ctx); + +/* Clean up all internal structures */ void xml_cleanup(struct xml_context *ctx); + +/* Reuse XML context */ +void xml_reset(struct xml_context *ctx); + +/* Setup XML source (fastbuf will be automatically closed) */ void xml_set_source(struct xml_context *ctx, struct fastbuf *fb); -int xml_next(struct xml_context *ctx); + +/* Parse without the PULL interface, return XML_ERR_x code (zero on success) */ +uns xml_parse(struct xml_context *ctx); + +/* Parse with the PULL interface, return XML_STATE_x (zero on EOF or fatal error) */ +uns xml_next(struct xml_context *ctx); + uns xml_row(struct xml_context *ctx); +struct xml_attr *xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name); 
#endif -- 2.39.2