From 55449f070299728b5f4b5354b30e4c7e235873f4 Mon Sep 17 00:00:00 2001 From: Pavel Charvat Date: Mon, 10 Dec 2007 10:40:08 +0100 Subject: [PATCH] XML: Backed up incomplete XML parser. Changes mostly from Friday. --- lib/Makefile | 11 +- sherlock/xml/Makefile | 19 + {lib => sherlock/xml}/xml-ucat.pl | 2 + {lib => sherlock/xml}/xml.c | 1435 ++++++++++++++++++----------- {lib => sherlock/xml}/xml.h | 124 +-- 5 files changed, 979 insertions(+), 612 deletions(-) create mode 100644 sherlock/xml/Makefile rename {lib => sherlock/xml}/xml-ucat.pl (99%) rename {lib => sherlock/xml}/xml.c (63%) rename {lib => sherlock/xml}/xml.h (72%) diff --git a/lib/Makefile b/lib/Makefile index 669f356b..52751e11 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -33,8 +33,7 @@ LIBUCW_MODS= \ qache \ string \ bbuf \ - getopt \ - xml + getopt LIBUCW_INCLUDES= \ lib.h config.h threads.h \ @@ -61,8 +60,7 @@ LIBUCW_INCLUDES= \ base64.h base224.h \ qache.h \ kmp.h kmp-search.h binsearch.h \ - partmap.h \ - xml.h + partmap.h ifdef CONFIG_UCW_THREADS # Some modules require threading @@ -88,11 +86,6 @@ $(o)/lib/libucw.so: $(addsuffix .oo,$(LIBUCW_MOD_PATHS)) $(o)/lib/hashfunc.o $(o)/lib/hashfunc.oo: CFLAGS += -funroll-loops $(o)/lib/lizard.o: CFLAGS += $(COPT2) -funroll-loops -$(o)/lib/xml.o: $(o)/lib/xml-ucat.h -$(o)/lib/xml-ucat.h: $(s)/lib/xml-ucat.pl - $(M)GEN $@ - $(Q)$< >$@ - $(o)/lib/db-test: $(o)/lib/db-test.o $(LIBUCW) $(o)/lib/db-tool: $(o)/lib/db-tool.o $(LIBUCW) $(o)/lib/conf-test: $(o)/lib/conf-test.o $(LIBUCW) diff --git a/sherlock/xml/Makefile b/sherlock/xml/Makefile new file mode 100644 index 00000000..3c70a9b0 --- /dev/null +++ b/sherlock/xml/Makefile @@ -0,0 +1,19 @@ +# Makefile for the XML parser +# (c) 2007 Pavel Charvat + +DIRS+=sherlock/xml + +LIBSH_MODS+=xml/xml +LIBSH_XML_INCLUDES=xml/xml.h + +$(o)/sherlock/xml/xml-t: $(LIBSH) $(LIBCHARSET) +$(o)/sherlock/xml/xml.o: $(o)/sherlock/xml/xml-ucat.h +$(o)/sherlock/xml/xml-ucat.h: $(s)/sherlock/xml/xml-ucat.pl + $(M)GEN $@ + $(Q)$< 
>$@ + +API_INCLUDES+=$(o)/sherlock/xml/.include-stamp +$(o)/sherlock/xml/.include-stamp: $(addprefix $(s)/sherlock/xml/,$(LIBSH_XML_INCLUDES)) +$(o)/sherlock/xml/.include-stamp: IDST=sherlock/xml + +include $(s)/sherlock/perl/Makefile diff --git a/lib/xml-ucat.pl b/sherlock/xml/xml-ucat.pl similarity index 99% rename from lib/xml-ucat.pl rename to sherlock/xml/xml-ucat.pl index cbfb8d34..eeb948e6 100755 --- a/lib/xml-ucat.pl +++ b/sherlock/xml/xml-ucat.pl @@ -86,6 +86,8 @@ set("SNAME_1_0", "[_:]", @base_char_1_0, @ideographic_1_0); set("NAME_1_0", "[-_:.]", @base_char_1_0, @ideographic_1_0, @combining_char_1_0, @digit_1_0, @extender_1_0); set("SNAME_1_1", @sname_1_1); set("NAME_1_1", @sname_1_1, "[-.0-9]", 0xB7, [0x0300,0x036F], [0x203F,0x2040]); +set("DECL", @white, [0x20,0x7E]); +set("GT", "[>]"); print "/* Automatically generated by xml-ucat.pl */\n\n"; find_cls(); diff --git a/lib/xml.c b/sherlock/xml/xml.c similarity index 63% rename from lib/xml.c rename to sherlock/xml/xml.c index 828b4c15..27ff8249 100644 --- a/lib/xml.c +++ b/sherlock/xml/xml.c @@ -1,5 +1,5 @@ /* - * UCW Library -- A simple XML parser + * Sherlock Library -- A simple XML parser * * (c) 2007 Pavel Charvat * @@ -8,7 +8,6 @@ */ /* TODO: - * - various character encodings * - iface * - stack-like memory handling where possible */ @@ -19,15 +18,28 @@ #include "lib/mempool.h" #include "lib/fastbuf.h" #include "lib/ff-utf8.h" +#include "lib/ff-binary.h" #include "lib/chartype.h" #include "lib/unicode.h" -#include "lib/xml.h" #include "lib/hashfunc.h" #include "lib/stkstring.h" -#include "charset/unicat.h" +#include "lib/unaligned.h" +#include "charset/charconv.h" +#include "charset/fb-charconv.h" +#include "sherlock/xml/xml.h" #include +/*** Debugging ***/ + +#ifdef LOCAL_DEBUG +#define TRACE(c, f, p...) do { DBG("XML %u: " f, xml_row(c), ##p); } while(0) +#else +#define TRACE(c, f, p...) 
do {} while(0) +#endif + +static uns xml_row(struct xml_context *ctx); + /*** Error handling ***/ static void NONRET @@ -98,81 +110,214 @@ xml_char_cat(uns c) return 1; } -/*** Reading of document/external entities ***/ +/*** Generic UTF decoding ***/ -static void NONRET -xml_eof(struct xml_context *ctx) +static uns +bget_utf16_le_slow(struct fastbuf *fb, uns repl) { - ctx->err_msg = "Unexpected EOF"; - ctx->err_code = XML_ERR_EOF; - xml_throw(ctx); + if ((int)bpeekc(fb) < 0) + return ~0U; + uns u = bgetw_le(fb), x, y; + if ((int)u < 0) + return repl; + if ((x = u - 0xd800) >= 0x800) + return u; + if (x >= 0x400 || (int)bpeekc(fb) < 0 || (y = bgetw_le(fb) - 0xdc00) >= 0x400) + return repl; + return 0x10000 + (x << 10) + y; +} + +static uns +bget_utf16_be_slow(struct fastbuf *fb, uns repl) +{ + if ((int)bpeekc(fb) < 0) + return ~0U; + uns u = bgetw_be(fb), x, y; + if ((int)u < 0) + return repl; + if ((x = u - 0xd800) >= 0x800) + return u; + if (x >= 0x400 || (int)bpeekc(fb) < 0 || (y = bgetw_be(fb) - 0xdc00) >= 0x400) + return repl; + return 0x10000 + (x << 10) + y; +} + +static inline uns +bget_utf16_le_repl(struct fastbuf *fb, uns repl) +{ + uns u; + if (bavailr(fb) >= 4) + { + fb->bptr = utf16_le_get_repl(fb->bptr, &u, repl); + return u; + } + else + return bget_utf16_le_slow(fb, repl); +} + +static inline uns +bget_utf16_be_repl(struct fastbuf *fb, uns repl) +{ + uns u; + if (bavailr(fb) >= 4) + { + fb->bptr = utf16_be_get_repl(fb->bptr, &u, repl); + return u; + } + else + return bget_utf16_be_slow(fb, repl); } +/*** Memory management ***/ + static void NONRET xml_fatal_nested(struct xml_context *ctx) { - xml_fatal(ctx, "Entity is not tested correctly"); + xml_fatal(ctx, "Entity not nested correctly"); } static inline void -xml_inc_depth(struct xml_context *ctx) +xml_inc(struct xml_context *ctx) { + /* Called after the first character of a block */ + TRACE(ctx, "inc"); ctx->depth++; } static inline void -xml_dec_depth(struct xml_context *ctx) +xml_dec(struct 
xml_context *ctx) { - if (unlikely(!ctx->depth)) + /* Called after the last character of a block */ + TRACE(ctx, "dec"); + if (unlikely(!ctx->depth--)) xml_fatal_nested(ctx); - ctx->depth--; } -static void -xml_push_source(struct xml_context *ctx, struct fastbuf *fb, uns flags) +static inline void +xml_push(struct xml_context *ctx) { - DBG("XML: xml_push_source"); - struct xml_source *osrc = ctx->sources; - if (osrc) - { - osrc->bptr = ctx->bptr; - osrc->bstop = ctx->bstop; - osrc->depth = ctx->depth; - } - struct xml_source *src = mp_alloc(ctx->pool, sizeof(*src)); - src->next = osrc; - src->flags = flags; - src->fb = fb; - ctx->depth = 0; - ctx->sources = src; - ctx->bstop = ctx->bptr = src->buf; - if (flags & XML_SRC_SURROUND) - { - *ctx->bptr++ = 0x20; - *ctx->bptr++ = xml_char_cat(0x20); - } + TRACE(ctx, "push"); + struct xml_stack *s = mp_alloc(ctx->pool, sizeof(*s)); + mp_save(ctx->pool, &s->saved_pool); + s->saved_flags = ctx->flags; + s->next = ctx->stack; + ctx->stack = s; + xml_inc(ctx); } -void -xml_set_source(struct xml_context *ctx, struct fastbuf *fb) +static inline void +xml_pop(struct xml_context *ctx) +{ + TRACE(ctx, "pop"); + xml_dec(ctx); + struct xml_stack *s = ctx->stack; + ASSERT(s); + ctx->stack = s->next; + ctx->flags = s->saved_flags; + mp_restore(ctx->pool, &s->saved_pool); +} + +#define XML_HASH_HDR_SIZE ALIGN_TO(sizeof(void *), CPU_STRUCT_ALIGN) +#define XML_HASH_GIVE_ALLOC struct HASH_PREFIX(table); \ + static inline void *HASH_PREFIX(alloc)(struct HASH_PREFIX(table) *t, uns size) \ + { return mp_alloc(*(void **)((void *)t - XML_HASH_HDR_SIZE), size); } \ + static inline void HASH_PREFIX(free)(struct HASH_PREFIX(table) *t UNUSED, void *p UNUSED) {} + +static void * +xml_hash_new(struct mempool *pool, uns size) +{ + void *tab = mp_alloc_zero(pool, size + XML_HASH_HDR_SIZE); + *(void **)tab = pool; + return tab + XML_HASH_HDR_SIZE; +} + +/*** Reading of document/external entities ***/ + +static void NONRET +xml_eof(struct xml_context 
*ctx) +{ + ctx->err_msg = "Unexpected EOF"; + ctx->err_code = XML_ERR_EOF; + xml_throw(ctx); +} + +static inline void +xml_add_char(u32 **bstop, uns c) { - xml_push_source(ctx, fb, XML_SRC_DOCUMENT | XML_SRC_DECL); + *(*bstop)++ = c; + *(*bstop)++ = xml_char_cat(c); +} + +static struct xml_source * +xml_push_source(struct xml_context *ctx, uns flags) +{ + xml_push(ctx); + struct xml_source *src = ctx->src; + if (src) + { + src->bptr = ctx->bptr; + src->bstop = ctx->bstop; + } + src = mp_alloc_zero(ctx->pool, sizeof(*src)); + src->next = ctx->src; + src->saved_depth = ctx->depth; + ctx->src = src; + ctx->flags = (ctx->flags & ~(XML_FLAG_SRC_EOF | XML_FLAG_SRC_EXPECTED_DECL | XML_FLAG_SRC_NEW_LINE | XML_FLAG_SRC_SURROUND | XML_FLAG_SRC_DOCUMENT)) | flags; + ctx->bstop = ctx->bptr = src->buf; + ctx->depth = 0; + if (flags & XML_FLAG_SRC_SURROUND) + xml_add_char(&ctx->bstop, 0x20); + return src; } static void xml_pop_source(struct xml_context *ctx) { - DBG("XML: xml_pop_source"); - if (unlikely(ctx->depth)) - xml_fatal(ctx, "Invalid entity nesting"); - struct xml_source *src = ctx->sources; + TRACE(ctx, "xml_pop_source"); + if (unlikely(ctx->depth != 0)) + xml_fatal_nested(ctx); + struct xml_source *src = ctx->src; + ASSERT(src); bclose(src->fb); - ctx->sources = src = src->next; + ctx->depth = src->saved_depth; + ctx->src = src = src->next; + if (src) + { + ctx->bptr = src->bptr; + ctx->bstop = src->bstop; + } + xml_pop(ctx); if (unlikely(!src)) xml_eof(ctx); - ctx->bptr = src->bptr; - ctx->bstop = src->bstop; - ctx->depth = src->depth; +} + +static void xml_refill_utf8(struct xml_context *ctx); + +static void +xml_push_entity(struct xml_context *ctx, struct xml_dtd_ent *ent) +{ + TRACE(ctx, "xml_push_entity"); + uns cat1 = ctx->src->refill_cat1; + uns cat2 = ctx->src->refill_cat2; + struct xml_source *src = xml_push_source(ctx, 0); + src->refill_cat1 = cat1; + src->refill_cat2 = cat2; + if (ent->flags & XML_DTD_ENT_EXTERNAL) + xml_fatal(ctx, "External entities not 
implemented"); // FIXME + else + { + fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, ent->len, 0); + src->refill = xml_refill_utf8; + } +} + +void +xml_set_source(struct xml_context *ctx, struct fastbuf *fb) +{ + TRACE(ctx, "xml_set_source"); + ASSERT(!ctx->src); + struct xml_source *src = xml_push_source(ctx, XML_FLAG_SRC_DOCUMENT | XML_FLAG_SRC_EXPECTED_DECL); + src->fb = fb; } static uns @@ -184,86 +329,109 @@ xml_error_restricted(struct xml_context *ctx, uns c) static void xml_parse_decl(struct xml_context *ctx); +#define REFILL(ctx, func, params...) \ + struct xml_source *src = ctx->src; \ + struct fastbuf *fb = src->fb; \ + if (ctx->bptr == ctx->bstop) \ + ctx->bptr = ctx->bstop = src->buf; \ + uns f = ctx->flags, c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \ + u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \ + *last_0xd = (f & XML_FLAG_SRC_NEW_LINE) ? bstop : bend; \ + do \ + { \ + c = func(fb, ##params); \ + uns t = xml_char_cat(c); \ + if (t & t1) \ + /* Typical branch */ \ + *bstop++ = c, *bstop++ = t; \ + else if (t & t2) \ + { \ + /* New line */ \ + /* XML 1.0: 0xA | 0xD | 0xD 0xA */ \ + /* XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ \ + if (c == 0xd) \ + last_0xd = bstop + 2; \ + else if (c != 0x2028 && last_0xd == bstop) \ + { \ + last_0xd = bend; \ + continue; \ + } \ + xml_add_char(&bstop, 0xa), row++; \ + } \ + else if (c == '>') \ + { \ + /* Used only in XML/TextDecl to switch the encoding */ \ + *bstop++ = c, *bstop++ = t; \ + break; \ + } \ + else if ((int)c >= 0) \ + /* Restricted character */ \ + xml_add_char(&bstop, xml_error_restricted(ctx, c)); \ + else \ + { \ + /* EOF */ \ + if (f & XML_FLAG_SRC_SURROUND) \ + xml_add_char(&bstop, 0x20); \ + f |= XML_FLAG_SRC_EOF; \ + break; \ + } \ + } \ + while (bstop < bend); \ + ctx->flags = (last_0xd == bstop) ? 
f | XML_FLAG_SRC_NEW_LINE : f & ~XML_FLAG_SRC_NEW_LINE; \ + ctx->bstop = bstop; \ + src->row = row; + +static void +xml_refill_utf8(struct xml_context *ctx) +{ + // FIXME: report corrupted encoding + REFILL(ctx, bget_utf8); +} + +static void +xml_refill_utf16_le(struct xml_context *ctx) +{ + REFILL(ctx, bget_utf16_le_repl, 0); +} + +static void +xml_refill_utf16_be(struct xml_context *ctx) +{ + REFILL(ctx, bget_utf16_be_repl, 0); +} + +#if 0 +static inline uns +xml_refill_libcharset_bget(struct fastbuf *fb, unsigned short int *in_to_x) +{ + // FIXME: slow + int c; + return (unlikely(c = bgetc(fb) < 0)) ? c : (int)conv_x_to_ucs(in_to_x[c]); +} + +static void +xml_refill_libcharset(struct xml_context *ctx) +{ + unsigned short int *in_to_x = ctx->src->refill_in_to_x; + REFILL(ctx, xml_refill_libcharset_bget, in_to_x); +} +#endif + +#undef REFILL + static void xml_refill(struct xml_context *ctx) { - // FIXME: - // -- various encodings, especially UTF-16 - // -- track col/row numbers - // -- report incorrect encoding - // -- deal with forbidden XML 1.1 newlines in xml/text decl do { - struct xml_source *src = ctx->sources; - uns c, t, t1, t2, f = src->flags; - if (f & XML_SRC_EOF) + if (ctx->flags & XML_FLAG_SRC_EOF) xml_pop_source(ctx); - else if (f & XML_SRC_DECL) + else if (ctx->flags & XML_FLAG_SRC_EXPECTED_DECL) xml_parse_decl(ctx); else { - struct fastbuf *fb = src->fb; - if (ctx->bptr == ctx->bstop) - ctx->bptr = ctx->bstop = src->buf; - u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, *last_0xd = (f & XML_SRC_NEW_LINE) ? 
bstop : bend; - if (ctx->flags & XML_FLAG_VERSION_1_1) - { - t2 = XML_CHAR_NEW_LINE_1_1; - t1 = XML_CHAR_UNRESTRICTED_1_1 & ~t2; - } - else - { - t2 = XML_CHAR_NEW_LINE_1_0; - t1 = XML_CHAR_VALID_1_0 & ~t2; - } - while (bstop < bend) - { - c = bget_utf8_32(fb); - t = xml_char_cat(c); - if (t & t1) - { - /* Typical branch */ - *bstop++ = c; - *bstop++ = t; - } - else if (t & t2) - { - /* New line - * XML 1.0: 0xA | 0xD | 0xD 0xA - * XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ - *bstop++ = 0xa; - *bstop++ = xml_char_cat(0xa); - if (c == 0xd) - last_0xd = bstop; - else if (c != 0x2028 && last_0xd != bstop - 2) - bstop -= 2; - } - else if ((int)c >= 0) - { - /* Restricted character */ - c = xml_error_restricted(ctx, c); - *bstop++ = c; - *bstop++ = xml_char_cat(c); - } - else - { - /* EOF */ - if (f & XML_SRC_SURROUND) - { - *bstop++ = 0x20; - *bstop++ = xml_char_cat(0x20); - } - f |= XML_SRC_EOF; - break; - } - } - if (last_0xd == bstop) - f |= XML_SRC_NEW_LINE; - else - f &= ~XML_SRC_NEW_LINE; - ctx->sources->flags = f; - ctx->bstop = bstop; - DBG("XML: refilled %u characters", (uns)(ctx->bstop - ctx->bptr) / 2); + ctx->src->refill(ctx); + TRACE(ctx, "refilled %u characters", (uns)((ctx->bstop - ctx->bptr) / 2)); } } while (ctx->bptr == ctx->bstop); @@ -327,6 +495,19 @@ xml_unget_char(struct xml_context *ctx) return *(ctx->bptr -= 2); } +static uns +xml_row(struct xml_context *ctx) +{ + struct xml_source *src = ctx->src; + if (!src) + return 0; + uns row = src->row; + for (u32 *p = ctx->bstop; p != ctx->bptr; p -= 2) + if (p[-1] & src->refill_cat2) + row--; + return row + 1; +} + /*** Basic parsing ***/ static void NONRET @@ -492,12 +673,12 @@ xml_parse_encoding_name(struct xml_context *ctx) /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" */ char *p = mp_start_noalign(ctx->pool, 1); uns q = xml_parse_quote(ctx); - if (unlikely(!(xml_peek_cat(ctx) & XML_CHAR_ENC_SNAME))) + if (unlikely(!(xml_get_cat(ctx) 
& XML_CHAR_ENC_SNAME))) xml_fatal(ctx, "Invalid character in the encoding name"); - while(1) + while (1) { p = mp_spread(ctx->pool, p, 2); - *p++ = xml_skip_char(ctx); + *p++ = xml_last_char(ctx); if (xml_get_char(ctx) == q) break; if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME))) @@ -509,149 +690,159 @@ xml_parse_encoding_name(struct xml_context *ctx) /* Document/external entity header */ -static void -xml_detect_encoding(struct xml_context *ctx) -{ - DBG("XML: xml_detect_encoding"); - struct xml_source *src = ctx->sources; - struct fastbuf *fb = src->fb; - char *detected_encoding = NULL; - uns x = 0, l = 0, c, z = 1; - while (l < 4) +static inline void +xml_init_cats(struct xml_context *ctx, uns mask) +{ + if (!(ctx->flags & XML_FLAG_VERSION_1_1)) { - if (!~(c = bgetc(fb))) - { - src->flags |= XML_SRC_EOF; - break; - } - else if (!c || c >= 0xfe || c == 0xa7 || c == 0x94) - z = 0; - else if ((c < 0x3c || c > 0x78)) - { - bungetc(fb); - break; - } - x = (x << 8) + c; - l++; + ctx->src->refill_cat1 = XML_CHAR_VALID_1_0 & ~XML_CHAR_NEW_LINE_1_0 & ~mask; + ctx->src->refill_cat2 = XML_CHAR_NEW_LINE_1_0; } - if (z) - z = x; - else if (l == 2) - switch (x) - { - case 0xFEFF: - xml_fatal(ctx, "UTF-16BE encoding not supported"); - case 0xFFFE: - xml_fatal(ctx, "UTF-16LE encoding not supported"); - default: - goto cannot_detect; - } - else if (l == 4) - switch (x) - { - case 0x0000FEFF: - xml_fatal(ctx, "UCS-4BE encoding not supported"); - case 0xFFFE0000: - xml_fatal(ctx, "UCS-4LE encoding not supported"); - case 0x0000FFFE: - xml_fatal(ctx, "UCS-4 encoding (order 2143) not supported"); - case 0xFEFF0000: - xml_fatal(ctx, "UCS-4 encoding (order 3412) not supported"); - case 0x0000003c: - xml_fatal(ctx, "UCS-4BE encoding not supported"); - case 0x3c000000: - xml_fatal(ctx, "UCS-4LE encoding not supported"); - case 0x00003c00: - xml_fatal(ctx, "UCS-4 encoding (order 2143) not supported"); - case 0x003c0000: - xml_fatal(ctx, "UCS-4 encoding (order 3412) not 
supported"); - case 0x003c003F: - xml_fatal(ctx, "UTF-16BE encoding not supported"); - case 0x3C003F00: - xml_fatal(ctx, "UTF-16LE encoding not supported"); - case 0x3C3F786D: - xml_fatal(ctx, "EBCDIC encoding not supported"); - default: - goto cannot_detect; - } else -cannot_detect: - xml_fatal(ctx, "Cannot detect the encoding"); - ctx->bptr = ctx->bstop = src->buf + 8; - while (z) { - c = z & 0xff; - z >>= 8; - *--ctx->bptr = xml_char_cat(c); - *--ctx->bptr = c; + ctx->src->refill_cat1 = XML_CHAR_UNRESTRICTED_1_1 & ~XML_CHAR_NEW_LINE_1_1 & ~mask; + ctx->src->refill_cat2 = XML_CHAR_NEW_LINE_1_1; } - if (!detected_encoding && ctx->bstop == ctx->bptr && xml_peek_char(ctx) == 0xfeff) - xml_skip_char(ctx); - DBG("XML: Detected encoding: %s", detected_encoding ? : "UTF-8"); - if (!(src->flags & XML_SRC_EOF)) - xml_refill(ctx); +} + +static void +xml_init_charconv(struct xml_context *ctx, int cs) +{ + // FIXME: hack + struct xml_source *src = ctx->src; + TRACE(ctx, "wrapping charset %s", charset_name(cs)); +#if 0 + struct conv_context conv; + conv_set_charset(&conv, cs, CONV_CHARSET_UTF8); + src->refill = xml_refill_libcharset; + src->refill_in_to_x = conv.in_to_x; +#else + src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8); + // FIXME: memory leak +#endif } static void xml_parse_decl(struct xml_context *ctx) { - DBG("XML: xml_parse_decl"); - ctx->sources->flags &= ~XML_SRC_DECL; - xml_detect_encoding(ctx); - uns document = ctx->sources->flags & XML_SRC_DOCUMENT; + TRACE(ctx, "xml_parse_decl"); + struct xml_source *src = ctx->src; + ctx->flags &= ~XML_FLAG_SRC_EXPECTED_DECL; + + /* Setup valid Unicode ranges and force the reader to abort refill() after each '>', where we can switch encoding or XML version */ + xml_init_cats(ctx, XML_CHAR_GT); + + /* Initialize the supplied charset (if any) or try to guess it */ + char *expected_encoding = src->expected_encoding ? 
: src->fb_encoding; + src->refill = xml_refill_utf8; + int bom = bpeekc(src->fb); + if (bom < 0) + ctx->flags |= XML_FLAG_SRC_EOF; + if (!src->fb_encoding) + { + if (bom == 0xfe) + src->refill = xml_refill_utf16_be; + else if (bom == 0xff) + src->refill = xml_refill_utf16_le; + } + else + { + int cs = find_charset_by_name(src->fb_encoding); + if (cs == CONV_CHARSET_UTF8) + {} + else if (cs >= 0) + { + xml_init_charconv(ctx, cs); + bom = 0; + } + else if (strcasecmp(src->fb_encoding, "UTF-16")) + { + src->refill = xml_refill_utf16_be; + if (bom == 0xff) + src->refill = xml_refill_utf16_le; + if (!src->expected_encoding) + expected_encoding = (bom == 0xff) ? "UTF-16LE" : "UTF-16BE"; + } + else if (strcasecmp(src->fb_encoding, "UTF-16BE")) + src->refill = xml_refill_utf16_be; + else if (strcasecmp(src->fb_encoding, "UTF-16LE")) + src->refill = xml_refill_utf16_le; + else + { + xml_error(ctx, "Unknown encoding '%s'", src->fb_encoding); + expected_encoding = NULL; + } + } + uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be; + if (bom > 0 && xml_peek_char(ctx) == 0xfeff) + xml_skip_char(ctx); + else if (utf16) + xml_error(ctx, "Missing or corrupted BOM"); + + /* Look ahead for presence of XMLDecl or optional TextDecl */ + if (!(ctx->flags & XML_FLAG_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf)) + xml_refill(ctx); + uns doc = ctx->flags & XML_FLAG_SRC_DOCUMENT; u32 *bptr = ctx->bptr; - uns have_decl = - (12 <= ctx->bstop - ctx->bptr && - bptr[0] == '<' && bptr[2] == '?' && (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L' && - (bptr[11] & XML_CHAR_WHITE)); + uns have_decl = (12 <= ctx->bstop - ctx->bptr && (bptr[11] & XML_CHAR_WHITE) && + bptr[0] == '<' && bptr[2] == '?' 
&& (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L'); if (!have_decl) { - if (document) - xml_fatal(ctx, "Missing or corrupted XML declaration header"); - return; + if (doc) + xml_fatal(ctx, "Missing or corrupted XML header"); + else if (expected_encoding && strcasecmp(src->expected_encoding, "UTF-8") && !utf16) + xml_error(ctx, "Missing or corrupted entity header"); + goto exit; } - ctx->bptr += 12; - - /* FIXME: the header must not contain exotic newlines */ + ctx->bptr = bptr + 12; xml_parse_white(ctx, 0); + /* Parse version string (mandatory in XMLDecl, optional in TextDecl) */ if (xml_peek_char(ctx) == 'v') { xml_parse_seq(ctx, "version"); xml_parse_eq(ctx); char *version = xml_parse_pubid_literal(ctx); - DBG("XML: Version=%s", version); - if (document) + TRACE(ctx, "version=%s", version); + uns v = 0; + if (!strcmp(version, "1.1")) + v = XML_FLAG_VERSION_1_1; + else if (strcmp(version, "1.0")) + { + xml_error(ctx, "Unknown XML version string '%s'", version); + version = "1.0"; + } + if (doc) { ctx->version_str = version; - if (!strcmp(ctx->version_str, "1.0")) - ; - else if (!strcmp(ctx->version_str, "1.1")) - ctx->flags |= XML_FLAG_VERSION_1_1; - else - xml_fatal(ctx, "Unsupported XML version"); + ctx->flags |= v; } - else if (strcmp(version, ctx->version_str)) - xml_error(ctx, "Mixed XML versions"); + else if (v > (ctx->flags & XML_FLAG_VERSION_1_1)) + xml_error(ctx, "XML 1.1 external entity included from XML 1.0 document"); + if (!xml_parse_white(ctx, !doc)) + goto end; + } + else if (doc) + { + xml_error(ctx, "Expected XML version"); + ctx->version_str = "1.0"; } - else if (document) - xml_fatal(ctx, "Missing XML version"); - // FIXME: TextDecl must contain encoding - if (!xml_parse_white(ctx, 0)) - goto end; + /* Parse encoding string (optional in XMLDecl, mandatory in TextDecl) */ if (xml_peek_char(ctx) == 'e') { xml_parse_seq(ctx, "encoding"); xml_parse_eq(ctx); - ctx->encoding = xml_parse_encoding_name(ctx); - 
DBG("encoding=%s", ctx->encoding); - // FIXME: check encoding + src->decl_encoding = xml_parse_encoding_name(ctx); + TRACE(ctx, "encoding=%s", src->decl_encoding); if (!xml_parse_white(ctx, 0)) goto end; } + else if (!doc) + xml_error(ctx, "Expected XML encoding"); - if (document && xml_peek_char(ctx) == 's') + /* Parse whether the document is standalone (optional in XMLDecl) */ + if (doc && xml_peek_char(ctx) == 's') { xml_parse_seq(ctx, "standalone"); xml_parse_eq(ctx); @@ -661,11 +852,30 @@ xml_parse_decl(struct xml_context *ctx) else xml_parse_seq(ctx, "no"); xml_parse_char(ctx, c); - DBG("standalone=%d", ctx->standalone); + TRACE(ctx, "standalone=%d", ctx->standalone); xml_parse_white(ctx, 0); } end: xml_parse_seq(ctx, "?>"); + + /* Switch to the final encoding */ + if (src->decl_encoding) + { + int cs = find_charset_by_name(src->decl_encoding); + if (cs < 0 && !expected_encoding) + xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding); + else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8) + xml_init_charconv(ctx, cs); + else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 || + !(!strcasecmp(src->decl_encoding, "UTF-16") || + (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) || + (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE"))))) + xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding); + } + +exit: + /* Update valid Unicode ranges */ + xml_init_cats(ctx, 0); } /*** Document Type Definition (DTD) ***/ @@ -675,12 +885,13 @@ end: #define HASH_PREFIX(x) xml_dtd_notns_##x #define HASH_NODE struct xml_dtd_notn #define HASH_KEY_STRING name -#define HASH_AUTO_POOL 1024 #define HASH_ZERO_FILL #define HASH_TABLE_DYNAMIC #define HASH_WANT_FIND #define HASH_WANT_LOOKUP -#define HASH_WANT_CLEANUP +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC 
#include "lib/hashtable.h" /* General entities */ @@ -688,15 +899,16 @@ end: #define HASH_PREFIX(x) xml_dtd_ents_##x #define HASH_NODE struct xml_dtd_ent #define HASH_KEY_STRING name -#define HASH_AUTO_POOL 1024 #define HASH_ZERO_FILL #define HASH_TABLE_DYNAMIC #define HASH_WANT_FIND #define HASH_WANT_LOOKUP -#define HASH_WANT_CLEANUP +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC #include "lib/hashtable.h" -static void +static struct xml_dtd_ent * xml_dtd_declare_trivial_gent(struct xml_context *ctx, char *name, char *text) { struct xml_dtd *dtd = ctx->dtd; @@ -704,11 +916,13 @@ xml_dtd_declare_trivial_gent(struct xml_context *ctx, char *name, char *text) if (ent->flags & XML_DTD_ENT_DECLARED) { xml_warn(ctx, "Entity &%s; already declared", name); - return; + return NULL; } slist_add_tail(&dtd->gents, &ent->n); ent->flags = XML_DTD_ENT_DECLARED | XML_DTD_ENT_TRIVIAL; ent->text = text; + ent->len = strlen(text); + return ent; } static void @@ -728,7 +942,7 @@ xml_dtd_find_gent(struct xml_context *ctx, char *name) if (dtd) { struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_gents, name); - return (ent->flags & XML_DTD_ENT_DECLARED) ? ent : NULL; + return !ent ? NULL : (ent->flags & XML_DTD_ENT_DECLARED) ? ent : NULL; } else { @@ -767,7 +981,7 @@ xml_dtd_find_pent(struct xml_context *ctx, char *name) { struct xml_dtd *dtd = ctx->dtd; struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_pents, name); - return (ent->flags & XML_DTD_ENT_DECLARED) ? ent : NULL; + return !ent ? NULL : (ent->flags & XML_DTD_ENT_DECLARED) ? 
ent : NULL; } /* Elements */ @@ -776,10 +990,11 @@ xml_dtd_find_pent(struct xml_context *ctx, char *name) #define HASH_NODE struct xml_dtd_elem #define HASH_KEY_STRING name #define HASH_TABLE_DYNAMIC -#define HASH_AUTO_POOL 1024 #define HASH_ZERO_FILL #define HASH_WANT_LOOKUP -#define HASH_WANT_CLEANUP +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC #include "lib/hashtable.h" /* Element attributes */ @@ -807,7 +1022,6 @@ xml_dtd_attrs_init_key(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_at #define HASH_PREFIX(x) xml_dtd_attrs_##x #define HASH_NODE struct xml_dtd_attr -#define HASH_AUTO_POOL 1024 #define HASH_ZERO_FILL #define HASH_TABLE_DYNAMIC #define HASH_KEY_COMPLEX(x) x elem, x name @@ -817,7 +1031,9 @@ xml_dtd_attrs_init_key(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_at #define HASH_GIVE_INIT_KEY #define HASH_WANT_FIND #define HASH_WANT_NEW -#define HASH_WANT_CLEANUP +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC #include "lib/hashtable.h" /* Enumerated attribute values */ @@ -845,7 +1061,6 @@ xml_dtd_evals_init_key(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_ev #define HASH_PREFIX(x) xml_dtd_evals_##x #define HASH_NODE struct xml_dtd_eval -#define HASH_AUTO_POOL 1024 #define HASH_TABLE_DYNAMIC #define HASH_KEY_COMPLEX(x) x attr, x val #define HASH_KEY_DECL struct xml_dtd_attr *attr, char *val @@ -854,7 +1069,9 @@ xml_dtd_evals_init_key(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_ev #define HASH_GIVE_INIT_KEY #define HASH_WANT_FIND #define HASH_WANT_NEW -#define HASH_WANT_CLEANUP +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC #include "lib/hashtable.h" /* Enumerated attribute notations */ @@ -882,7 +1099,6 @@ xml_dtd_enotns_init_key(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_ #define HASH_PREFIX(x) xml_dtd_enotns_##x #define HASH_NODE struct xml_dtd_enotn -#define HASH_AUTO_POOL 1024 #define HASH_TABLE_DYNAMIC #define 
HASH_KEY_COMPLEX(x) x attr, x notn #define HASH_KEY_DECL struct xml_dtd_attr *attr, struct xml_dtd_notn *notn @@ -891,7 +1107,9 @@ xml_dtd_enotns_init_key(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_ #define HASH_GIVE_INIT_KEY #define HASH_WANT_FIND #define HASH_WANT_NEW -#define HASH_WANT_CLEANUP +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC #include "lib/hashtable.h" /* DTD initialization/cleanup */ @@ -899,14 +1117,18 @@ xml_dtd_enotns_init_key(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_ static void xml_dtd_init(struct xml_context *ctx) { - ctx->dtd = mp_alloc_zero(ctx->pool, sizeof(*ctx->dtd)); - xml_dtd_ents_init(ctx->dtd->tab_gents = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_ents_table))); - xml_dtd_ents_init(ctx->dtd->tab_pents = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_ents_table))); - xml_dtd_notns_init(ctx->dtd->tab_notns = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_notns_table))); - xml_dtd_elems_init(ctx->dtd->tab_elems = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_elems_table))); - xml_dtd_attrs_init(ctx->dtd->tab_attrs = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_attrs_table))); - xml_dtd_evals_init(ctx->dtd->tab_evals = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_evals_table))); - xml_dtd_enotns_init(ctx->dtd->tab_enotns = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_enotns_table))); + if (ctx->dtd) + return; + struct mempool *pool = mp_new(4096); + struct xml_dtd *dtd = ctx->dtd = mp_alloc_zero(pool, sizeof(*ctx->dtd)); + dtd->pool = pool; + xml_dtd_ents_init(dtd->tab_gents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table))); + xml_dtd_ents_init(dtd->tab_pents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table))); + xml_dtd_notns_init(dtd->tab_notns = xml_hash_new(pool, sizeof(struct xml_dtd_notns_table))); + xml_dtd_elems_init(dtd->tab_elems = xml_hash_new(pool, sizeof(struct xml_dtd_elems_table))); + xml_dtd_attrs_init(dtd->tab_attrs = xml_hash_new(pool, 
sizeof(struct xml_dtd_attrs_table))); + xml_dtd_evals_init(dtd->tab_evals = xml_hash_new(pool, sizeof(struct xml_dtd_evals_table))); + xml_dtd_enotns_init(dtd->tab_enotns = xml_hash_new(pool, sizeof(struct xml_dtd_enotns_table))); xml_dtd_declare_default_gents(ctx); } @@ -915,13 +1137,8 @@ xml_dtd_cleanup(struct xml_context *ctx) { if (!ctx->dtd) return; - xml_dtd_ents_cleanup(ctx->dtd->tab_gents); - xml_dtd_ents_cleanup(ctx->dtd->tab_pents); - xml_dtd_notns_cleanup(ctx->dtd->tab_notns); - xml_dtd_elems_cleanup(ctx->dtd->tab_elems); - xml_dtd_attrs_cleanup(ctx->dtd->tab_attrs); - xml_dtd_evals_cleanup(ctx->dtd->tab_evals); - xml_dtd_enotns_cleanup(ctx->dtd->tab_enotns); + mp_delete(ctx->dtd->pool); + ctx->dtd = NULL; } static void @@ -955,6 +1172,7 @@ xml_push_comment(struct xml_context *ctx) bput_utf8_32(out, c); } xml_parse_char(ctx, '>'); + xml_dec(ctx); fbgrow_rewind(out); if (ctx->h_comment) ctx->h_comment(ctx); @@ -972,6 +1190,7 @@ xml_skip_comment(struct xml_context *ctx) xml_parse_char(ctx, '-'); while (xml_get_char(ctx) != '-' || xml_get_char(ctx) != '-'); xml_parse_char(ctx, '>'); + xml_dec(ctx); } /* Processing instructions */ @@ -1008,6 +1227,7 @@ xml_push_pi(struct xml_context *ctx) } fbgrow_rewind(out); } + xml_dec(ctx); if (ctx->h_pi) ctx->h_pi(ctx); } @@ -1030,6 +1250,7 @@ xml_skip_pi(struct xml_context *ctx) if (!xml_parse_white(ctx, 0)) { xml_parse_seq(ctx, "?>"); + xml_dec(ctx); return; } } @@ -1039,6 +1260,7 @@ xml_skip_pi(struct xml_context *ctx) break; else xml_unget_char(ctx); + xml_dec(ctx); } /* Character references */ @@ -1046,6 +1268,7 @@ xml_skip_pi(struct xml_context *ctx) static uns xml_parse_char_ref(struct xml_context *ctx) { + TRACE(ctx, "parse_char_ref"); /* CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' * Already parsed: '&#' */ uns v = 0; @@ -1082,111 +1305,301 @@ xml_parse_char_ref(struct xml_context *ctx) goto recover; } if (xml_last_char(ctx) == ';') - return v; + { + xml_dec(ctx); + return v; + } xml_error(ctx, 
"Expected ';'"); recover: while (xml_last_char(ctx) != ';') xml_get_char(ctx); + xml_dec(ctx); return UNI_REPLACEMENT; } -/////////////////////////////////////////////////////////////////////////////////////////////////////////// +/* References to general entities */ + +static void +xml_parse_ge_ref(struct xml_context *ctx, struct fastbuf *out) +{ + /* Reference ::= EntityRef | CharRef + * EntityRef ::= '&' Name ';' + * Already parsed: '&' */ + if (xml_peek_char(ctx) == '#') + { + xml_skip_char(ctx); + uns c = xml_parse_char_ref(ctx); + bput_utf8_32(out, c); + } + else + { + struct mempool_state state; + mp_save(ctx->pool, &state); + char *name = xml_parse_name(ctx); + xml_parse_char(ctx, ';'); + struct xml_dtd_ent *ent = xml_dtd_find_gent(ctx, name); + if (!ent) + { + xml_error(ctx, "Unknown entity &%s;", name); + bputc(out, '&'); + bputs(out, name); + bputc(out, ';'); + } + else if (ent->flags & XML_DTD_ENT_TRIVIAL) + { + TRACE(ctx, "Trivial entity &%s;", name); + bwrite(out, ent->text, ent->len); + } + else + { + TRACE(ctx, "Pushed entity &%s;", name); + mp_restore(ctx->pool, &state); + xml_dec(ctx); + xml_push_entity(ctx, ent); + return; + } + mp_restore(ctx->pool, &state); + xml_dec(ctx); + } +} + +/* References to parameter entities */ static void -xml_parse_parameter_ref(struct xml_context *ctx) +xml_parse_pe_ref(struct xml_context *ctx) { + /* PEReference ::= '%' Name ';' + * Already parsed: '%' */ + struct mempool_state state; + mp_save(ctx->pool, &state); char *name = xml_parse_name(ctx); xml_parse_char(ctx, ';'); - struct xml_dtd_ent *ent = xml_dtd_ents_find(ctx->dtd->tab_pents, name); - if (!ent || !(ent->flags & XML_DTD_ENT_DECLARED)) + struct xml_dtd_ent *ent = xml_dtd_find_pent(ctx, name); + if (!ent) + xml_error(ctx, "Unknown entity %%%s;", name); + else { - xml_error(ctx, "Reference to unknown parameter entity %%%s", name); + TRACE(ctx, "Pushed entity %%%s;", name); + mp_restore(ctx->pool, &state); + xml_dec(ctx); + xml_push_entity(ctx, ent); 
return; } - if (ent->flags & XML_DTD_ENT_VISITED) + mp_restore(ctx->pool, &state); + xml_dec(ctx); +} + +static void +xml_parse_dtd_pe(struct xml_context *ctx) +{ + do { - xml_error(ctx, "Cycled references to parameter entity %%%s", name); - return; + xml_skip_char(ctx); + xml_inc(ctx); + while (xml_peek_cat(ctx) & XML_CHAR_WHITE) + xml_skip_char(ctx); + xml_parse_pe_ref(ctx); } - if (ent->flags & XML_DTD_ENT_EXTERNAL) + while (xml_peek_char(ctx) != '%'); +} + +static inline uns +xml_parse_dtd_white(struct xml_context *ctx, uns mandatory) +{ + /* Whitespace or parameter entity */ + uns cnt = 0; + while (xml_peek_cat(ctx) & XML_CHAR_WHITE) { - // FIXME: - xml_error(ctx, "Support for external parsed entities not implemented"); - return; + xml_skip_char(ctx); + cnt = 1; + } + if (xml_peek_char(ctx) == '%') + { + xml_parse_dtd_pe(ctx); + return 1; } - ent->flags |= XML_DTD_ENT_VISITED; // FIXME: clear - struct fastbuf *fb = mp_alloc(ctx->pool, sizeof(*fb)); - fbbuf_init_read(fb, ent->text, ent->len, 0); - xml_push_source(ctx, fb, 0); + else if (unlikely(mandatory && !cnt)) + xml_fatal_expected_white(ctx); + return cnt; } -static inline void -xml_check_parameter_ref(struct xml_context *ctx) +static inline uns +xml_check_dtd_pe(struct xml_context *ctx) { - if (xml_get_char(ctx) != '%') + if (xml_peek_char(ctx) == '%') { - xml_unget_char(ctx); - return; + xml_parse_dtd_pe(ctx); + return 1; } - xml_parse_parameter_ref(ctx); + return 0; } +/* External ID */ + static void -xml_parse_external_id(struct xml_context *ctx, struct xml_ext_id *eid, uns allow_public) +xml_parse_external_id(struct xml_context *ctx, struct xml_ext_id *eid, uns allow_public, uns dtd) { bzero(eid, sizeof(*eid)); - uns c = xml_get_char(ctx); + if (dtd) + xml_check_dtd_pe(ctx); + uns c = xml_peek_char(ctx); if (c == 'S') { - xml_parse_seq(ctx, "YSTEM"); - xml_parse_white(ctx, 1); + xml_parse_seq(ctx, "SYSTEM"); + if (dtd) + xml_parse_dtd_white(ctx, 1); + else + xml_parse_white(ctx, 1); eid->system_id = 
xml_parse_system_literal(ctx); } else if (c == 'P') { - xml_parse_seq(ctx, "UBLIC"); - xml_parse_white(ctx, 1); + xml_parse_seq(ctx, "PUBLIC"); + if (dtd) + xml_parse_dtd_white(ctx, 1); + else + xml_parse_white(ctx, 1); eid->public_id = xml_parse_pubid_literal(ctx); - if (xml_parse_white(ctx, 1)) - if ((c = xml_get_char(ctx)) == '\'' || c == '"' || !allow_public) - { - xml_unget_char(ctx); - eid->system_id = xml_parse_system_literal(ctx); - } - else - xml_unget_char(ctx); + if (dtd ? xml_parse_dtd_white(ctx, 0) : xml_parse_white(ctx, 0)) + if ((c = xml_peek_char(ctx)) == '\'' || c == '"' || !allow_public) + eid->system_id = xml_parse_system_literal(ctx); } else xml_fatal(ctx, "Expected an external ID"); } +/* DTD: Notation declaration */ + static void xml_parse_notation_decl(struct xml_context *ctx) { - /* NotationDecl ::= ''*/ - xml_parse_white(ctx, 1); - struct xml_dtd_notn *notn = xml_dtd_notns_lookup(ctx->dtd->tab_notns, xml_parse_name(ctx)); - xml_parse_white(ctx, 1); - struct xml_ext_id eid; - xml_parse_external_id(ctx, &eid, 1); - xml_parse_white(ctx, 0); + /* NotationDecl ::= '' + * Already parsed: 'dtd; + xml_parse_dtd_white(ctx, 1); + + struct xml_dtd_notn *notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx)); + xml_parse_dtd_white(ctx, 1); + struct xml_ext_id eid; + xml_parse_external_id(ctx, &eid, 1, 1); + xml_parse_dtd_white(ctx, 0); xml_parse_char(ctx, '>'); + if (notn->flags & XML_DTD_NOTN_DECLARED) xml_warn(ctx, "Notation %s already declared", notn->name); else { notn->flags = XML_DTD_NOTN_DECLARED; notn->eid = eid; + slist_add_tail(&dtd->notns, ¬n->n); + } + xml_dec(ctx); +} + +static void +xml_parse_entity_decl(struct xml_context *ctx) +{ + /* Already parsed: 'dtd; + xml_parse_dtd_white(ctx, 1); + + uns flags = (xml_get_char(ctx) == '%') ? XML_DTD_ENT_PARAMETER : 0; + if (flags) + xml_parse_dtd_white(ctx, 1); + else + xml_unget_char(ctx); + + struct xml_dtd_ent *ent = xml_dtd_ents_lookup(flags ? 
dtd->tab_pents : dtd->tab_gents, xml_parse_name(ctx)); + slist *list = flags ? &dtd->pents : &dtd->gents; + xml_parse_white(ctx, 1); + if (ent->flags & XML_DTD_ENT_DECLARED) + { + xml_fatal(ctx, "Entity &%s; already declared, skipping not implemented", ent->name); + // FIXME: should be only warning + } + + uns c, sep = xml_get_char(ctx); + if (sep == '\'' || sep == '"') + { + /* Internal entity: + * EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&'] | PEReference | Reference)* "'" */ + struct fastbuf *out = ctx->value; + while (1) + { + if ((c = xml_get_char(ctx)) == sep) + break; + else if (c == '%') + { + // FIXME + ASSERT(0); + //xml_parse_parameter_ref(ctx); + } + else if (c != '&') + bput_utf8_32(out, c); + else if ((c = xml_get_char(ctx)) == '#') + c = xml_parse_char_ref(ctx); + else + { + /* Bypass references to general entities */ + mp_push(ctx->pool); + bputc(out, '&'); + xml_unget_char(ctx); + bputs(out, xml_parse_name(ctx)); + xml_parse_char(ctx, ';'); + bputc(out, ';'); + mp_pop(ctx->pool); + } + } + bputc(out, 0); + fbgrow_rewind(out); + slist_add_tail(list, &ent->n); + ent->flags = flags | XML_DTD_ENT_DECLARED; + ent->len = out->bstop - out->bptr - 1; + ent->text = mp_memdup(ctx->pool, out->bptr, ent->len + 1); + fbgrow_reset(out); + } + else + { + /* External entity */ + struct xml_ext_id eid; + struct xml_dtd_notn *notn = NULL; + xml_parse_external_id(ctx, &eid, 0, 0); + if (!xml_parse_white(ctx, 0) || !flags) + xml_parse_char(ctx, '>'); + else if (xml_get_char(ctx) != '>') + { + /* General external unparsed entity */ + flags |= XML_DTD_ENT_UNPARSED; + xml_parse_seq(ctx, "NDATA"); + xml_parse_white(ctx, 1); + notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx)); + } + slist_add_tail(list, &ent->n); + ent->flags = flags | XML_DTD_ENT_DECLARED | XML_DTD_ENT_EXTERNAL; + ent->eid = eid; + ent->notn = notn; } + xml_parse_dtd_white(ctx, 0); + xml_parse_char(ctx, '>'); + xml_dec(ctx); } +/* DTD: Internal subset */ + 
static void xml_parse_internal_subset(struct xml_context *ctx) { + // FIXME: comments/pi have no parent + /* '[' intSubset ']' + * intSubset :== (markupdecl | DeclSep) + * Already parsed: ']' */ while (1) { xml_parse_white(ctx, 0); uns c = xml_get_char(ctx); + xml_inc(ctx); if (c == '<') if ((c = xml_get_char(ctx)) == '!') switch (c = xml_get_char(ctx)) @@ -1203,7 +1616,7 @@ xml_parse_internal_subset(struct xml_context *ctx) if ((c = xml_get_char(ctx)) == 'N') { xml_parse_seq(ctx, "TITY"); - //xml_parse_entity_decl(ctx); + xml_parse_entity_decl(ctx); } else if (c == 'L') { @@ -1228,76 +1641,106 @@ xml_parse_internal_subset(struct xml_context *ctx) else goto invalid_markup; else if (c == '%') - xml_parse_parameter_ref(ctx); + xml_parse_dtd_pe(ctx); else if (c == ']') break; else goto invalid_markup; } + xml_dec(ctx); + xml_dec(ctx); return; invalid_markup: xml_fatal(ctx, "Invalid markup in the internal subset"); } -/*----------------------------------------------*/ +/////////////////////////////////////////////////////////////////////////////////////////////////////////// +static void +xml_parse_cdata(struct xml_context *ctx) +{ + struct fastbuf *out = ctx->chars; + xml_parse_seq(ctx, "CDATA["); + while (1) + { + uns c; + if ((c = xml_get_char(ctx)) == ']') + { + if ((c = xml_get_char(ctx)) == ']') + if ((c = xml_get_char(ctx)) == '>') + break; + else + bputc(out, ']'); + bputc(out, ']'); + } + bput_utf8_32(out, c); + } +} -/* FIXME */ +static void +xml_skip_cdata(struct xml_context *ctx) +{ + xml_parse_cdata(ctx); +} -struct xml_attribute_table; +static void +xml_parse_chars(struct xml_context *ctx) +{ + TRACE(ctx, "parse_chars"); + struct fastbuf *out = ctx->chars; + uns c; + while ((c = xml_get_char(ctx)) != '<') + if (c == '&') + { + xml_inc(ctx); + xml_parse_ge_ref(ctx, out); + } + else + bput_utf8_32(out, c); + xml_unget_char(ctx); +} -#define HASH_PREFIX(x) xml_attribute_##x -#define HASH_NODE struct xml_attribute -#define HASH_KEY_COMPLEX(x) x element, x 
name -#define HASH_KEY_DECL struct xml_element *element, char *name -#define HASH_TABLE_DYNAMIC -#define HASH_AUTO_POOL 1024 +/*----------------------------------------------*/ -#define HASH_GIVE_HASHFN +struct xml_attrs_table; static inline uns -xml_attribute_hash(struct xml_attribute_table *t UNUSED, struct xml_element *e, char *n) +xml_attrs_hash(struct xml_attrs_table *t UNUSED, struct xml_elem *e, char *n) { return hash_pointer(e) ^ hash_string(n); } -#define HASH_GIVE_EQ - static inline int -xml_attribute_eq(struct xml_attribute_table *t UNUSED, struct xml_element *e1, char *n1, struct xml_element *e2, char *n2) +xml_attrs_eq(struct xml_attrs_table *t UNUSED, struct xml_elem *e1, char *n1, struct xml_elem *e2, char *n2) { return (e1 == e2) && !strcmp(n1, n2); } -#define HASH_GIVE_INIT_KEY - static inline void -xml_attribute_init_key(struct xml_attribute_table *t UNUSED, struct xml_attribute *a, struct xml_element *e, char *name) +xml_attrs_init_key(struct xml_attrs_table *t UNUSED, struct xml_attr *a, struct xml_elem *e, char *name) { - a->element = e; + a->elem = e; a->name = name; - a->value = NULL; - a->next = e->attrs; - e->attrs = a; + a->val = NULL; + slist_add_tail(&e->attrs, &a->n); } +#define HASH_PREFIX(x) xml_attrs_##x +#define HASH_NODE struct xml_attr +#define HASH_KEY_COMPLEX(x) x elem, x name +#define HASH_KEY_DECL struct xml_elem *elem, char *name +#define HASH_TABLE_DYNAMIC +#define HASH_GIVE_EQ +#define HASH_GIVE_HASHFN +#define HASH_GIVE_INIT_KEY #define HASH_WANT_CLEANUP #define HASH_WANT_REMOVE #define HASH_WANT_LOOKUP #define HASH_WANT_FIND +#define HASH_GIVE_ALLOC +XML_HASH_GIVE_ALLOC #include "lib/hashtable.h" - -/* -#define HASH_PREFIX(x) xml_parsed_entities_##x -#define HASH_NODE struct xml_parsed_entity -#define HASH_KEY_STRING name -#define HASH_TABLE_DYNAMIC -#define HASH_AUTO_POOL 1024 -#define HASH_WANT_CLEANUP -#include "lib/hashtable.h" -*/ - void xml_init(struct xml_context *ctx) { @@ -1306,116 +1749,55 @@ xml_init(struct 
xml_context *ctx) ctx->chars = fbgrow_create(4096); ctx->value = fbgrow_create(4096); xml_dtd_init(ctx); + xml_attrs_init(ctx->tab_attrs = xml_hash_new(ctx->pool, sizeof(struct xml_attrs_table))); } void xml_cleanup(struct xml_context *ctx) { + xml_attrs_cleanup(ctx->tab_attrs); xml_dtd_cleanup(ctx); bclose(ctx->value); bclose(ctx->chars); mp_delete(ctx->pool); } -static void -xml_parse_cdata(struct xml_context *ctx) -{ - struct fastbuf *out = ctx->chars; - xml_parse_seq(ctx, "CDATA["); - while (1) - { - uns c; - if ((c = xml_get_char(ctx)) == ']') - { - if ((c = xml_get_char(ctx)) == ']') - if ((c = xml_get_char(ctx)) == '>') - break; - else - bputc(out, ']'); - bputc(out, ']'); - } - bput_utf8_32(out, c); - } -} - -static void -xml_skip_cdata(struct xml_context *ctx) -{ - xml_parse_cdata(ctx); -} - -static void -xml_parse_ref_entity(struct xml_context *ctx UNUSED, struct fastbuf *out UNUSED, struct xml_dtd_ent *entity UNUSED) -{ -#if 0 - for (struct xml_dtd_ent_node *node = entity->list; node; node = node->next) - if (node->len) - bwrite(out, node->ptr, node->len); - else - xml_parse_ref_entity(ctx, out, node->ptr); // FIXME: do not call the recursion on stack -- could cause segfault -#endif -} - -static void -xml_parse_ref(struct xml_context *ctx, struct fastbuf *out) -{ - if (xml_get_char(ctx) == '#') - { - uns c = xml_parse_char_ref(ctx); - bput_utf8_32(out, c); - } - else - { -#if 0 - xml_unget_char(ctx); - mp_push(ctx->pool); - char *name = xml_parse_name(ctx); - struct xml_parsed_entity *entity = xml_find_parsed_entity(ctx, name); - mp_pop(ctx->pool); - xml_parse_char(ctx, ';'); - xml_parse_ref_entity(ctx, out, entity); -#endif - } -} - -static void -xml_parse_chars(struct xml_context *ctx) -{ - DBG("parse_chars"); - struct fastbuf *out = ctx->chars; - uns c; - while ((c = xml_get_char(ctx)) != '<') - if (c == '&') - xml_parse_ref(ctx, out); - else - bput_utf8_32(out, c); - xml_unget_char(ctx); -} - static void xml_parse_attr(struct xml_context *ctx) { - 
DBG("parse_attr"); - struct xml_element *e = ctx->element; + // FIXME: memory management, dtd, literal + TRACE(ctx, "parse_attr"); + struct xml_elem *e = ctx->elem; char *name = xml_parse_name(ctx); - struct xml_attribute *a = xml_attribute_lookup(ctx->attribute_table, e, name); - if (a->value) - xml_fatal(ctx, "Attribute is not unique"); + struct xml_attr *a = xml_attrs_lookup(ctx->tab_attrs, e, name); xml_parse_eq(ctx); - // FIXME - char *value = xml_parse_system_literal(ctx); - a->value = value; + char *val =xml_parse_system_literal(ctx); + if (a->val) + xml_error(ctx, "Attribute is not unique"); + else + a->val = val; } -static uns +static void xml_parse_stag(struct xml_context *ctx) { - DBG("parse_stag"); - mp_push(ctx->pool); - struct xml_element *e = mp_alloc_zero(ctx->pool, sizeof(*e)); - e->parent = ctx->element; - ctx->element = e; + // FIXME: dtd + TRACE(ctx, "parse_stag"); + xml_push(ctx); + struct xml_elem *e = mp_alloc_zero(ctx->pool, sizeof(*e)); + struct xml_elem *parent = ctx->elem; + clist_init(&e->sons); + e->node.parent = (void *)parent; + ctx->elem = e; e->name = xml_parse_name(ctx); + if (parent) + clist_add_tail(&parent->sons, &e->node.n); + else + { + ctx->root = e; + if (ctx->document_type && strcmp(e->name, ctx->document_type)) + xml_error(ctx, "The root element does not match the document type"); + } while (1) { uns white = xml_parse_white(ctx, 0); @@ -1423,33 +1805,68 @@ xml_parse_stag(struct xml_context *ctx) if (c == '/') { xml_parse_char(ctx, '>'); - return 1; + ctx->flags |= XML_FLAG_EMPTY_ELEM; + break; } else if (c == '>') - return 0; + break; else if (!white) - xml_fatal(ctx, "Expected a white space"); + xml_fatal_expected_white(ctx); xml_unget_char(ctx); xml_parse_attr(ctx); } + if (ctx->h_element_start) + ctx->h_element_start(ctx); } static void xml_parse_etag(struct xml_context *ctx) { - DBG("parse_etag"); - struct xml_element *e = ctx->element; + TRACE(ctx, "parse_etag"); + struct xml_elem *e = ctx->elem; ASSERT(e); char *name 
= xml_parse_name(ctx); if (strcmp(name, e->name)) xml_fatal(ctx, "Invalid ETag, expected '%s'", e->name); xml_parse_white(ctx, 0); xml_parse_char(ctx, '>'); - // FIXME: remove on pooled hashtable? + xml_dec(ctx); +} + +static void +xml_pop_element(struct xml_context *ctx) +{ + TRACE(ctx, "pop_element"); + if (ctx->h_element_end) + ctx->h_element_end(ctx); + struct xml_elem *e = ctx->elem; + if (ctx->flags & XML_DOM_FREE) + { + if (e->node.parent) + clist_remove(&e->node.n); + else + ctx->root = NULL; + SLIST_FOR_EACH(struct xml_attr *, a, e->attrs) + xml_attrs_remove(ctx->tab_attrs, a); + struct xml_node *n; + while (n = clist_head(&e->sons)) + { + if (n->type == XML_NODE_ELEM) + { + SLIST_FOR_EACH(struct xml_attr *, a, ((struct xml_elem *)n)->attrs) + xml_attrs_remove(ctx->tab_attrs, a); + clist_insert_list_after(&((struct xml_elem *)n)->sons, &n->n); + } + clist_remove(&n->n); + } + } + ctx->node = e->node.parent; + xml_pop(ctx); // FIXME: memory management without XML_DOM_FREE + xml_dec(ctx); +#if 0 for (struct xml_attribute *a = e->attrs; a; a = a->next) xml_attribute_remove(ctx->attribute_table, a); - ctx->element = e->parent; - mp_pop(ctx->pool); +#endif } static void @@ -1655,91 +2072,6 @@ xml_parse_attr_list_decl(struct xml_context *ctx) } #endif -static void -xml_parse_entity_decl(struct xml_context *ctx) -{ - struct xml_dtd *dtd = ctx->dtd; - xml_parse_white(ctx, 1); - - uns flags = (xml_get_char(ctx) == '%') ? XML_DTD_ENT_PARAMETER : 0; - if (flags) - xml_parse_white(ctx, 1); - else - xml_unget_char(ctx); - - struct xml_dtd_ent *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_gents, xml_parse_name(ctx)); - slist *list = flags ? 
&dtd->pents : &dtd->gents; - xml_parse_white(ctx, 1); - if (ent->flags & XML_DTD_ENT_DECLARED) - { - xml_fatal(ctx, "Entity &%s; already declared, skipping not implemented", ent->name); - // FIXME: should be only warning - } - - uns sep = xml_get_char(ctx), c; - if (sep == '\'' || sep == '"') - { - /* Internal entity: - * EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&'] | PEReference | Reference)* "'" */ - struct fastbuf *out = ctx->value; - uns sep = c; - while (1) - { - if ((c = xml_get_char(ctx)) == sep) - break; - else if (c == '%') - { - // FIXME - ASSERT(0); - //xml_parse_parameter_ref(ctx); - } - else if (c != '&') - bput_utf8_32(out, c); - else if ((c = xml_get_char(ctx)) == '#') - c = xml_parse_char_ref(ctx); - else - { - /* Bypass references to general entities */ - mp_push(ctx->pool); - bputc(out, '&'); - xml_unget_char(ctx); - bputs(out, xml_parse_name(ctx)); - xml_parse_char(ctx, ';'); - bputc(out, ';'); - mp_pop(ctx->pool); - } - } - bputc(out, 0); - fbgrow_rewind(out); - slist_add_tail(list, &ent->n); - ent->flags = flags | XML_DTD_ENT_DECLARED; - ent->len = out->bstop - out->bptr - 1; - ent->text = mp_memdup(ctx->pool, out->bptr, ent->len + 1); - fbgrow_reset(out); - } - else - { - /* External entity */ - struct xml_ext_id eid; - struct xml_dtd_notn *notn = NULL; - xml_parse_external_id(ctx, &eid, 0); - if (!xml_parse_white(ctx, 0) || !flags) - xml_parse_char(ctx, '>'); - else if (xml_get_char(ctx) != '>') - { - /* General external unparsed entity */ - flags |= XML_DTD_ENT_UNPARSED; - xml_parse_seq(ctx, "NDATA"); - xml_parse_white(ctx, 1); - notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx)); - } - slist_add_tail(list, &ent->n); - ent->flags = flags | XML_DTD_ENT_DECLARED | XML_DTD_ENT_EXTERNAL; - ent->eid = eid; - ent->notn = notn; - } -} - static void xml_parse_doctype_decl(struct xml_context *ctx) { @@ -1748,14 +2080,17 @@ xml_parse_doctype_decl(struct xml_context *ctx) xml_parse_seq(ctx, "DOCTYPE"); 
xml_parse_white(ctx, 1); ctx->document_type = xml_parse_name(ctx); - DBG("XML: DocumentType=%s", ctx->document_type); + TRACE(ctx, "doctyype=%s", ctx->document_type); uns white = xml_parse_white(ctx, 0); uns c = xml_peek_char(ctx); if (c != '>' && c != '[' && white) { - xml_parse_external_id(ctx, &ctx->eid, 0); + xml_parse_external_id(ctx, &ctx->eid, 0, 0); xml_parse_white(ctx, 0); + ctx->flags |= XML_FLAG_HAS_EXTERNAL_SUBSET; } + if (xml_peek_char(ctx) == '[') + ctx->flags |= XML_FLAG_HAS_INTERNAL_SUBSET; if (ctx->h_doctype_decl) ctx->h_doctype_decl(ctx); } @@ -1765,7 +2100,7 @@ xml_next(struct xml_context *ctx) { /* A nasty state machine */ - DBG("XML: xml_next (state=%u)", ctx->state); + TRACE(ctx, "xml_next (state=%u)", ctx->state); jmp_buf throw_buf; ctx->throw_buf = &throw_buf; if (setjmp(throw_buf)) @@ -1774,7 +2109,7 @@ error: if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal) ctx->h_fatal(ctx); ctx->state = XML_STATE_FATAL; - DBG("XML: raised fatal error"); + TRACE(ctx, "raised fatal error"); return -1; } uns c; @@ -1784,7 +2119,7 @@ error: return -1; case XML_STATE_START: - DBG("XML: Entering Prolog"); + TRACE(ctx, "entering prolog"); if (ctx->h_document_start) ctx->h_document_start(ctx); /* XMLDecl */ @@ -1840,8 +2175,8 @@ error: if (xml_peek_char(ctx) == '[') { xml_skip_char(ctx); - // FIXME - while (xml_get_char(ctx) != ']'); + xml_inc(ctx); + xml_parse_internal_subset(ctx); xml_parse_white(ctx, 0); } xml_parse_char(ctx, '>'); @@ -1857,15 +2192,17 @@ error: while (1) { - if (xml_get_char(ctx) != '<') + if (xml_peek_char(ctx) != '<') { /* CharData */ - xml_unget_char(ctx); xml_parse_chars(ctx); continue; } + else + xml_skip_char(ctx); first_tag: ; + xml_inc(ctx); if ((c = xml_get_char(ctx)) == '?') { /* PI */ @@ -1947,15 +2284,12 @@ first_tag: ; fbgrow_reset(ctx->chars); } - if (xml_parse_stag(ctx)) - { - } + xml_parse_stag(ctx); if (ctx->want & XML_WANT_STAG) return ctx->state = XML_STATE_STAG; case XML_STATE_STAG: - // FIXME: EmptyElemTag - ; - + if 
(ctx->flags & XML_FLAG_EMPTY_ELEM) + goto pop_element; } else @@ -1970,27 +2304,27 @@ first_tag: ; fbgrow_reset(ctx->chars); } + xml_parse_etag(ctx); +pop_element: if (ctx->want & XML_WANT_ETAG) return ctx->state = XML_STATE_ETAG; case XML_STATE_ETAG: - - xml_parse_etag(ctx); - - if (!ctx->element) + xml_pop_element(ctx); + if (!ctx->elem) goto epilog; } } epilog: /* Misc* */ - DBG("XML: Entering epilog"); + TRACE(ctx, "entering epilog"); while (1) { /* Epilog whitespace is the only place, where a valid document can reach EOF */ if (setjmp(throw_buf)) if (ctx->err_code == XML_ERR_EOF) { - DBG("XML: Reached EOF"); + TRACE(ctx, "reached EOF"); ctx->state = XML_STATE_EOF; if (ctx->h_document_end) ctx->h_document_end(ctx); @@ -2040,7 +2374,7 @@ epilog: static void error(struct xml_context *ctx) { - msg((ctx->err_code < XML_ERR_ERROR) ? L_WARN_R : L_ERROR_R, "XML: %s", ctx->err_msg); + msg((ctx->err_code < XML_ERR_ERROR) ? L_WARN_R : L_ERROR_R, "XML %u: %s", xml_row(ctx), ctx->err_msg); } static void @@ -2050,6 +2384,7 @@ test(struct fastbuf *in, struct fastbuf *out) xml_init(&ctx); ctx.h_warn = ctx.h_error = ctx.h_fatal = error; ctx.want = XML_WANT_ALL; + ctx.flags |= XML_DOM_FREE; xml_set_source(&ctx, in); int state; while ((state = xml_next(&ctx)) >= 0) @@ -2059,12 +2394,12 @@ test(struct fastbuf *in, struct fastbuf *out) bprintf(out, "CHARS [%.*s]\n", (int)(ctx.chars->bstop - ctx.chars->buffer), ctx.chars->buffer); break; case XML_STATE_STAG: - bprintf(out, "STAG <%s>\n", ctx.element->name); - for (struct xml_attribute *a = ctx.element->attrs; a; a = a->next) - bprintf(out, " ATTR %s=[%s]\n", a->name, a->value); + bprintf(out, "STAG <%s>\n", ctx.elem->name); + SLIST_FOR_EACH(struct xml_attr *, a, ctx.elem->attrs) + bprintf(out, " ATTR %s=[%s]\n", a->name, a->val); break; case XML_STATE_ETAG: - bprintf(out, "ETAG \n", ctx.element->name); + bprintf(out, "ETAG \n", ctx.elem->name); break; case XML_STATE_COMMENT: bprintf(out, "COMMENT [%.*s]\n", (int)(ctx.value->bstop - 
ctx.value->buffer), ctx.value->buffer); @@ -2078,8 +2413,6 @@ test(struct fastbuf *in, struct fastbuf *out) case XML_STATE_EOF: bprintf(out, "EOF\n"); goto end; - default: - bprintf(out, "STATE %u\n", state); break; } end: diff --git a/lib/xml.h b/sherlock/xml/xml.h similarity index 72% rename from lib/xml.h rename to sherlock/xml/xml.h index 02e62462..87cdff91 100644 --- a/lib/xml.h +++ b/sherlock/xml/xml.h @@ -1,5 +1,5 @@ /* - * UCW Library -- A simple XML parser + * Sherlock Library -- A simple XML parser * * (c) 2007 Pavel Charvat * @@ -7,11 +7,12 @@ * of the GNU Lesser General Public License. */ -#ifndef _UCW_XML_H -#define _UCW_XML_H +#ifndef _SHERLOCK_XML_H +#define _SHERLOCK_XML_H #include "lib/clists.h" #include "lib/slists.h" +#include "lib/mempool.h" enum xml_error { XML_ERR_OK = 0, @@ -63,7 +64,22 @@ enum xml_want { enum xml_flags { XML_FLAG_VALIDATING = 0x1, - XML_FLAG_VERSION_1_1 = 0x2, + XML_FLAG_VERSION_1_1 = 0x2, /* XML version 1.1, otherwise 1.0 */ + XML_FLAG_HAS_EXTERNAL_SUBSET = 0x4, /* The document contains a reference to external DTD subset */ + XML_FLAG_HAS_INTERNAL_SUBSET = 0x8, /* The document contains an internal subset */ + + XML_FLAG_SRC_EOF = 0x10, /* EOF reached */ + XML_FLAG_SRC_EXPECTED_DECL = 0x20, /* Just before optional or required XMLDecl/TextDecl */ + XML_FLAG_SRC_NEW_LINE = 0x40, /* The last read character is 0xD */ + XML_FLAG_SRC_SURROUND = 0x80, /* Surround the text with 0x20 (references to parameter entities) */ + XML_FLAG_SRC_DOCUMENT = 0x100, /* The document entity */ + XML_FLAG_SRC_EXTERNAL = 0x200, /* An external entity */ + + XML_DOM_SKIP = 0x1000, /* Do not report DOM nodes */ + XML_DOM_FREE = 0x2000, /* Free the subtree when leaving */ + XML_DOM_IGNORE = XML_DOM_SKIP | XML_DOM_FREE, /* Completely ignore the subtree */ + + XML_FLAG_EMPTY_ELEM = 0x100000, }; struct xml_ext_id { @@ -78,27 +94,6 @@ enum xml_node_type { XML_NODE_PI, }; -#define XML_BUF_SIZE 32 - -struct xml_source { - struct xml_source *next; /* Link list 
of pending fastbufs (xml_context.sources) */ - struct fastbuf *fb; - u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */ - u32 *bptr, *bstop; /* Current state of the buffer */ - uns depth; - uns flags; -}; - -enum xml_source_flags { - XML_SRC_DECL = 0x1, /* Expected document/text declaration */ - XML_SRC_EOF = 0x2, /* Reached the end of the fastbuf */ - XML_SRC_NEW_LINE = 0x4, /* The last read character is 0xD */ - XML_SRC_SURROUND = 0x8, /* Surround the text with 0x20 (references to parameter entities) */ - XML_SRC_DOCUMENT = 0x10, /* The document entity */ - XML_SRC_EXTERNAL = 0x20, /* An external entity */ -}; - -#if 0 struct xml_node { cnode n; /* Node for list of parent's sons */ uns type; /* XML_NODE_x */ @@ -112,7 +107,40 @@ struct xml_elem { struct xml_dtd_elem *dtd; /* Element DTD */ slist attrs; /* Link list of attributes */ }; -#endif + +struct xml_attr { + snode n; + struct xml_elem *elem; + char *name; + char *val; +}; + +struct xml_context; + +struct xml_stack { + struct xml_stack *next; /* Link list of stack records */ + uns saved_flags; /* Saved ctx->flags */ + struct mempool_state saved_pool; /* Saved ctx->pool state */ +}; + +#define XML_BUF_SIZE 32 /* At least 16 -- hardcoded */ + +struct xml_source { + struct xml_source *next; /* Link list of pending fastbufs (xml_context.sources) */ + struct fastbuf *fb; /* Source fastbuf */ + struct fastbuf wrap_fb; /* Libcharset or fbmem wrapper */ + u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */ + u32 *bptr, *bstop; /* Current state of the buffer */ + uns row; /* File position */ + char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */ + char *fb_encoding; /* Encoding of the source fastbuf */ + char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */ + uns refill_cat1; /* Character categories, which should be directly passed to the buffer */ + uns refill_cat2; /* Character 
categories, which should be processed as newlines (possibly in some built-in sequences) */ + void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */ + unsigned short *refill_in_to_x; /* Libcharset input table */ + uns saved_depth; /* Saved ctx->depth */ +}; struct xml_context { /* Error handling */ @@ -128,11 +156,16 @@ struct xml_context { struct fastbuf *chars; /* Character data */ struct fastbuf *value; /* Attribute value / comment / processing instruction data */ char *name; /* Attribute name, processing instruction target */ + void *tab_attrs; + + /* Stack */ + struct xml_stack *stack; /* See xml_push(), xml_pop() */ + uns flags; /* XML_FLAG_x (restored on xml_pop()) */ + uns depth; /* Nesting level */ /* Input */ - struct xml_source *sources; /* Stack of pending sources */ + struct xml_source *src; /* Current source */ u32 *bptr, *bstop; /* Character buffer */ - uns depth; /* Nesting level */ /* SAX-like interface */ void (*h_document_start)(struct xml_context *ctx); /* Called before entering prolog */ @@ -141,14 +174,17 @@ struct xml_context { void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration just before internal subset */ void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction */ void (*h_comment)(struct xml_context *ctx); /* Called after a comment */ + void (*h_element_start)(struct xml_context *ctx); /* Called after STag or EmptyElemTag */ + void (*h_element_end)(struct xml_context *ctx); /* Called before ETag or after EmptyElemTag */ + + /* DOM */ + struct xml_elem *root; /* DOM root */ + union { + struct xml_node *node; /* Current DOM node */ + struct xml_elem *elem; /* Current element */ + }; - /* */ - struct xml_node *node; /* Current XML node */ - uns flags; /* XML_FLAG_x */ - struct xml_element *element; /* Current element */ - void *attribute_table; char *version_str; - char *encoding; uns standalone; char *document_type; struct xml_dtd *dtd; @@ 
-158,8 +194,6 @@ struct xml_context { void (*start_dtd)(struct xml_context *ctx); void (*end_dtd)(struct xml_context *ctx); - void (*start_element)(struct xml_context *ctx); - void (*end_element)(struct xml_context *ctx); void (*start_cdata)(struct xml_context *ctx); void (*end_cdata)(struct xml_context *ctx); void (*start_entity)(struct xml_context *ctx); @@ -170,24 +204,10 @@ struct xml_context { void (*unparsed_entity_decl)(struct xml_context *ctx); }; -struct xml_attribute { - char *name; - char *value; - struct xml_element *element; - struct xml_attribute *next; - struct xml_dtd_attribute *dtd; -}; - -struct xml_element { - char *name; - struct xml_attribute *attrs; - struct xml_element *parent; - struct xml_dtd_element *dtd; -}; - /*** Document Type Definition (DTD) ***/ struct xml_dtd { + struct mempool *pool; /* Memory pool where to allocate DTD */ slist gents; /* Link list of general entities */ slist pents; /* Link list of parapeter entities */ slist notns; /* Link list of notations */ @@ -229,7 +249,7 @@ struct xml_dtd_ent { snode n; /* Node in xml_dtd.[gp]ents */ uns flags; /* XML_DTD_ENT_x */ char *name; /* Entity name */ - char *text; /* Replacement text / expanded replacement text (XML_DTD_ENT_TRVIAL) */ + char *text; /* Replacement text / expanded replacement text (XML_DTD_ENT_TRIVIAL) */ uns len; /* Text length */ struct xml_ext_id eid; /* External ID */ struct xml_dtd_notn *notn; /* Notation (XML_DTD_ENT_UNPARSED only) */ -- 2.39.2