From e42cd882d7970eb0b01bc9b058e0446996212cb4 Mon Sep 17 00:00:00 2001 From: Pavel Charvat Date: Wed, 5 Dec 2007 08:31:46 +0100 Subject: [PATCH] XML: Backuped unfinished XML parser. --- lib/Makefile | 11 +- lib/xml-ucat.pl | 155 ++++ lib/xml.c | 2099 +++++++++++++++++++++++++++++++++++++++++++++++ lib/xml.h | 318 +++++++ 4 files changed, 2581 insertions(+), 2 deletions(-) create mode 100755 lib/xml-ucat.pl create mode 100644 lib/xml.c create mode 100644 lib/xml.h diff --git a/lib/Makefile b/lib/Makefile index 52751e11..669f356b 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -33,7 +33,8 @@ LIBUCW_MODS= \ qache \ string \ bbuf \ - getopt + getopt \ + xml LIBUCW_INCLUDES= \ lib.h config.h threads.h \ @@ -60,7 +61,8 @@ LIBUCW_INCLUDES= \ base64.h base224.h \ qache.h \ kmp.h kmp-search.h binsearch.h \ - partmap.h + partmap.h \ + xml.h ifdef CONFIG_UCW_THREADS # Some modules require threading @@ -86,6 +88,11 @@ $(o)/lib/libucw.so: $(addsuffix .oo,$(LIBUCW_MOD_PATHS)) $(o)/lib/hashfunc.o $(o)/lib/hashfunc.oo: CFLAGS += -funroll-loops $(o)/lib/lizard.o: CFLAGS += $(COPT2) -funroll-loops +$(o)/lib/xml.o: $(o)/lib/xml-ucat.h +$(o)/lib/xml-ucat.h: $(s)/lib/xml-ucat.pl + $(M)GEN $@ + $(Q)$< >$@ + $(o)/lib/db-test: $(o)/lib/db-test.o $(LIBUCW) $(o)/lib/db-tool: $(o)/lib/db-tool.o $(LIBUCW) $(o)/lib/conf-test: $(o)/lib/conf-test.o $(LIBUCW) diff --git a/lib/xml-ucat.pl b/lib/xml-ucat.pl new file mode 100755 index 00000000..cbfb8d34 --- /dev/null +++ b/lib/xml-ucat.pl @@ -0,0 +1,155 @@ +#!/usr/bin/perl +# +# UCW Library -- Character map for the XML parser +# +# (c) 2007 Pavel Charvat +# +# This software may be freely distributed and used according to the terms +# of the GNU Lesser General Public License. 
+# + +my @cat = (); +my @lcat = (); +my %ids = (); +my %cls = (); +for (my $i = 0; $i < 0x10000; $i++) { $cat[$i] = 0; } +for (my $i = 0; $i < 0x11; $i++) { $lcat[$i] = 0; } + +my @white = (0x9, 0xA, 0xD, 0x20); +my @base_char_1_0 = ( + [0x0041,0x005A], [0x0061,0x007A], [0x00C0,0x00D6], [0x00D8,0x00F6], [0x00F8,0x00FF], [0x0100,0x0131], + [0x0134,0x013E], [0x0141,0x0148], [0x014A,0x017E], [0x0180,0x01C3], [0x01CD,0x01F0], [0x01F4,0x01F5], + [0x01FA,0x0217], [0x0250,0x02A8], [0x02BB,0x02C1], 0x0386, [0x0388,0x038A], 0x038C, [0x038E,0x03A1], + [0x03A3,0x03CE], [0x03D0,0x03D6], 0x03DA, 0x03DC, 0x03DE, 0x03E0, [0x03E2,0x03F3], [0x0401,0x040C], + [0x040E,0x044F], [0x0451,0x045C], [0x045E,0x0481], [0x0490,0x04C4], [0x04C7,0x04C8], [0x04CB,0x04CC], + [0x04D0,0x04EB], [0x04EE,0x04F5], [0x04F8,0x04F9], [0x0531,0x0556], 0x0559, [0x0561,0x0586], [0x05D0,0x05EA], + [0x05F0,0x05F2], [0x0621,0x063A], [0x0641,0x064A], [0x0671,0x06B7], [0x06BA,0x06BE], [0x06C0,0x06CE], + [0x06D0,0x06D3], 0x06D5, [0x06E5,0x06E6], [0x0905,0x0939], 0x093D, [0x0958,0x0961], [0x0985,0x098C], + [0x098F,0x0990], [0x0993,0x09A8], [0x09AA,0x09B0], 0x09B2, [0x09B6,0x09B9], [0x09DC,0x09DD], [0x09DF,0x09E1], + [0x09F0,0x09F1], [0x0A05,0x0A0A], [0x0A0F,0x0A10], [0x0A13,0x0A28], [0x0A2A,0x0A30], [0x0A32,0x0A33], + [0x0A35,0x0A36], [0x0A38,0x0A39], [0x0A59,0x0A5C], 0x0A5E, [0x0A72,0x0A74], [0x0A85,0x0A8B], 0x0A8D, + [0x0A8F,0x0A91], [0x0A93,0x0AA8], [0x0AAA,0x0AB0], [0x0AB2,0x0AB3], [0x0AB5,0x0AB9], 0x0ABD, 0x0AE0, + [0x0B05,0x0B0C], [0x0B0F,0x0B10], [0x0B13,0x0B28], [0x0B2A,0x0B30], [0x0B32,0x0B33], [0x0B36,0x0B39], + 0x0B3D, [0x0B5C,0x0B5D], [0x0B5F,0x0B61], [0x0B85,0x0B8A], [0x0B8E,0x0B90], [0x0B92,0x0B95], [0x0B99,0x0B9A], + 0x0B9C, [0x0B9E,0x0B9F], [0x0BA3,0x0BA4], [0x0BA8,0x0BAA], [0x0BAE,0x0BB5], [0x0BB7,0x0BB9], [0x0C05,0x0C0C], + [0x0C0E,0x0C10], [0x0C12,0x0C28], [0x0C2A,0x0C33], [0x0C35,0x0C39], [0x0C60,0x0C61], [0x0C85,0x0C8C], + [0x0C8E,0x0C90], [0x0C92,0x0CA8], [0x0CAA,0x0CB3], [0x0CB5,0x0CB9], 
0x0CDE, [0x0CE0,0x0CE1], [0x0D05,0x0D0C], + [0x0D0E,0x0D10], [0x0D12,0x0D28], [0x0D2A,0x0D39], [0x0D60,0x0D61], [0x0E01,0x0E2E], 0x0E30, [0x0E32,0x0E33], + [0x0E40,0x0E45], [0x0E81,0x0E82], 0x0E84, [0x0E87,0x0E88], 0x0E8A, 0x0E8D, [0x0E94,0x0E97], [0x0E99,0x0E9F], + [0x0EA1,0x0EA3], 0x0EA5, 0x0EA7, [0x0EAA,0x0EAB], [0x0EAD,0x0EAE], 0x0EB0, [0x0EB2,0x0EB3], 0x0EBD, + [0x0EC0,0x0EC4], [0x0F40,0x0F47], [0x0F49,0x0F69], [0x10A0,0x10C5], [0x10D0,0x10F6], 0x1100, [0x1102,0x1103], + [0x1105,0x1107], 0x1109, [0x110B,0x110C], [0x110E,0x1112], 0x113C, 0x113E, 0x1140, 0x114C, 0x114E, 0x1150, + [0x1154,0x1155], 0x1159, [0x115F,0x1161], 0x1163, 0x1165, 0x1167, 0x1169, [0x116D,0x116E], [0x1172,0x1173], + 0x1175, 0x119E, 0x11A8, 0x11AB, [0x11AE,0x11AF], [0x11B7,0x11B8], 0x11BA, [0x11BC,0x11C2], 0x11EB, 0x11F0, + 0x11F9, [0x1E00,0x1E9B], [0x1EA0,0x1EF9], [0x1F00,0x1F15], [0x1F18,0x1F1D], [0x1F20,0x1F45], [0x1F48,0x1F4D], + [0x1F50,0x1F57], 0x1F59, 0x1F5B, 0x1F5D, [0x1F5F,0x1F7D], [0x1F80,0x1FB4], [0x1FB6,0x1FBC], 0x1FBE, + [0x1FC2,0x1FC4], [0x1FC6,0x1FCC], [0x1FD0,0x1FD3], [0x1FD6,0x1FDB], [0x1FE0,0x1FEC], [0x1FF2,0x1FF4], + [0x1FF6,0x1FFC], 0x2126, [0x212A,0x212B], 0x212E, [0x2180,0x2182], [0x3041,0x3094], [0x30A1,0x30FA], + [0x3105,0x312C], [0xAC00,0xD7A3]); +my @ideographic_1_0 = ([0x4E00,0x9FA5], 0x3007, [0x3021,0x3029]); +my @combining_char_1_0 = ( + [0x0300,0x0345], [0x0360,0x0361], [0x0483,0x0486], [0x0591,0x05A1], [0x05A3,0x05B9], [0x05BB,0x05BD], + 0x05BF, [0x05C1,0x05C2], 0x05C4, [0x064B,0x0652], 0x0670, [0x06D6,0x06DC], [0x06DD,0x06DF], [0x06E0,0x06E4], + [0x06E7,0x06E8], [0x06EA,0x06ED], [0x0901,0x0903], 0x093C, [0x093E,0x094C], 0x094D, [0x0951,0x0954], + [0x0962,0x0963], [0x0981,0x0983], 0x09BC, 0x09BE, 0x09BF, [0x09C0,0x09C4], [0x09C7,0x09C8], [0x09CB,0x09CD], + 0x09D7, [0x09E2,0x09E3], 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, [0x0A40,0x0A42], [0x0A47,0x0A48], [0x0A4B,0x0A4D], + [0x0A70,0x0A71], [0x0A81,0x0A83], 0x0ABC, [0x0ABE,0x0AC5], [0x0AC7,0x0AC9], [0x0ACB,0x0ACD], 
[0x0B01,0x0B03], + 0x0B3C, [0x0B3E,0x0B43], [0x0B47,0x0B48], [0x0B4B,0x0B4D], [0x0B56,0x0B57], [0x0B82,0x0B83], [0x0BBE,0x0BC2], + [0x0BC6,0x0BC8], [0x0BCA,0x0BCD], 0x0BD7, [0x0C01,0x0C03], [0x0C3E,0x0C44], [0x0C46,0x0C48], [0x0C4A,0x0C4D], + [0x0C55,0x0C56], [0x0C82,0x0C83], [0x0CBE,0x0CC4], [0x0CC6,0x0CC8], [0x0CCA,0x0CCD], [0x0CD5,0x0CD6], + [0x0D02,0x0D03], [0x0D3E,0x0D43], [0x0D46,0x0D48], [0x0D4A,0x0D4D], 0x0D57, 0x0E31, [0x0E34,0x0E3A], + [0x0E47,0x0E4E], 0x0EB1, [0x0EB4,0x0EB9], [0x0EBB,0x0EBC], [0x0EC8,0x0ECD], [0x0F18,0x0F19], 0x0F35, + 0x0F37, 0x0F39, 0x0F3E, 0x0F3F, [0x0F71,0x0F84], [0x0F86,0x0F8B], [0x0F90,0x0F95], 0x0F97, [0x0F99,0x0FAD], + [0x0FB1,0x0FB7], 0x0FB9, [0x20D0,0x20DC], 0x20E1, [0x302A,0x302F], 0x3099, 0x309A); +my @digit_1_0 = ( + [0x0030,0x0039], [0x0660,0x0669], [0x06F0,0x06F9], [0x0966,0x096F], [0x09E6,0x09EF], [0x0A66,0x0A6F], + [0x0AE6,0x0AEF], [0x0B66,0x0B6F], [0x0BE7,0x0BEF], [0x0C66,0x0C6F], [0x0CE6,0x0CEF], [0x0D66,0x0D6F], + [0x0E50,0x0E59], [0x0ED0,0x0ED9], [0x0F20,0x0F29]); +my @extender_1_0 = ( + 0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005, [0x3031,0x3035], [0x309D,0x309E], [0x30FC,0x30FE]); +my @sname_1_1 = ( + "[:A-Z_a-z]", [0xC0,0xD6], [0xD8,0xF6], [0xF8,0x2FF], [0x370,0x37D], [0x37F,0x1FFF], + [0x200C,0x200D], [0x2070,0x218F], [0x2C00,0x2FEF], [0x3001,0xD7FF], [0xF900,0xFDCF], [0xFDF0,0xFFFD], [0x10000,0xEFFFF]); + +set("WHITE", @white); +set("NEW_LINE_1_0", 0xA, 0xD); +set("NEW_LINE_1_1", 0xA, 0xD, 0x85, 0x2028); +set("DIGIT", "[0-9]"); +set("XDIGIT", "[0-9a-fA-F]"); +set("VALID_1_0", @white, [0x20,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); +set("VALID_1_1", [0x1,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); +set("UNRESTRICTED_1_1", @white, [0x20,0x7E], 0x85, [0xA0,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); +set("PUBID", 0x20, 0xD, 0xA, "[-a-zA-Z0-9'()+,./:=?;!*#@\$_%]"); +set("ENC_SNAME", "[a-zA-Z]"); +set("ENC_NAME", "[-a-zA-Z0-9._]"); +set("SNAME_1_0", "[_:]", @base_char_1_0, 
@ideographic_1_0); +set("NAME_1_0", "[-_:.]", @base_char_1_0, @ideographic_1_0, @combining_char_1_0, @digit_1_0, @extender_1_0); +set("SNAME_1_1", @sname_1_1); +set("NAME_1_1", @sname_1_1, "[-.0-9]", 0xB7, [0x0300,0x036F], [0x203F,0x2040]); + +print "/* Automatically generated by xml-ucat.pl */\n\n"; +find_cls(); +gen_enum(); +gen_tabs(); + +sub set { + my $id = shift; + $ids{$id} = scalar keys(%ids) if !defined($ids{$id}); + my $mask = 1 << $ids{$id}; + foreach my $i (@_) { + if (ref($i) eq "ARRAY") { + my $j = $i->[0]; + for (; $j <= $i->[1] && $j < 0x10000; $j++) { $cat[$j] |= $mask; } + for (; $j <= $i->[1]; $j += 0x10000) { $lcat[$j >> 16] |= $mask; } + } + elsif ($i =~ /^\[/) { for (my $j=0; $j < 128; $j++) { if (chr($j) =~ /$i/) { $cat[$j] |= $mask; } } } + else { $cat[$i] |= $mask; } + } +} + +sub find_cls { + foreach (my $i=0; $i<@cat; $i++) { $cls{$cat[$i]} = scalar keys(%cls) if !defined($cls{$cat[$i]}); } + foreach (my $i=0; $i<@lcat; $i++) { $cls{$lcat[$i]} = scalar keys(%cls) if !defined($cls{$lcat[$i]}); } +} + +sub gen_enum { + print "enum xml_char_type {\n"; + foreach my $id (sort keys %ids) { + my $mask = 0; + foreach my $i (keys %cls) { + $mask |= 1 << $cls{$i} if $cls{$i} && ($i & (1 << $ids{$id})); + } + printf " XML_CHAR_%-20s = 0x%08x,\n", $id, $mask; + } + print "};\n\n"; +} + +sub gen_tabs { + my @tab = (); + my %hash = (); + print "static const uns xml_char_tab1[] = {\n "; + for (my $t=0; $t<256; $t++) { + my $i = $t * 256; + my @x = (); + for (my $j=0; $j<256; $j += 32) { + push @x, join(",", map($cls{$_}, @cat[$i+$j..$i+$j+31])); + } + my $sub = " " . join(",\n ", @x); + if (!defined($hash{$sub})) { + $hash{$sub} = 256 * scalar @tab; + push @tab, $sub; + } + printf("0x%x", $hash{$sub}); + print((~$t & 15) ? "," : ($t < 255) ? 
",\n " : "\n};\n\n"); + } + + print "static const byte xml_char_tab2[] = {\n"; + print join(",\n\n", @tab); + print "\n};\n\n"; + + my @l = (); + for (my $i=0; $i<0x11; $i++) { + push @l, sprintf("%d", $cls{$lcat[$i]}); + } + print "static const byte xml_char_tab3[] = {" . join(",", @l) . "};\n"; +} diff --git a/lib/xml.c b/lib/xml.c new file mode 100644 index 00000000..828b4c15 --- /dev/null +++ b/lib/xml.c @@ -0,0 +1,2099 @@ +/* + * UCW Library -- A simple XML parser + * + * (c) 2007 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +/* TODO: + * - various character encodings + * - iface + * - stack-like memory handling where possible + */ + +#define LOCAL_DEBUG + +#include "lib/lib.h" +#include "lib/mempool.h" +#include "lib/fastbuf.h" +#include "lib/ff-utf8.h" +#include "lib/chartype.h" +#include "lib/unicode.h" +#include "lib/xml.h" +#include "lib/hashfunc.h" +#include "lib/stkstring.h" +#include "charset/unicat.h" + +#include <setjmp.h> + +/*** Error handling ***/ + +static void NONRET +xml_throw(struct xml_context *ctx) +{ + ASSERT(ctx->err_code && ctx->throw_buf); + longjmp(*(jmp_buf *)ctx->throw_buf, ctx->err_code); +} + +static void +xml_warn(struct xml_context *ctx, const char *format, ...) +{ + if (ctx->h_warn) + { + va_list args; + va_start(args, format); + ctx->err_msg = stk_vprintf(format, args); + ctx->err_code = XML_ERR_WARN; + va_end(args); + ctx->h_warn(ctx); + ctx->err_msg = NULL; + ctx->err_code = XML_ERR_OK; + } +} + +static void +xml_error(struct xml_context *ctx, const char *format, ...) +{ + if (ctx->h_error) + { + va_list args; + va_start(args, format); + ctx->err_msg = stk_vprintf(format, args); + ctx->err_code = XML_ERR_ERROR; + va_end(args); + ctx->h_error(ctx); + ctx->err_msg = NULL; + ctx->err_code = XML_ERR_OK; + } +} + +static void NONRET +xml_fatal(struct xml_context *ctx, const char *format, ...) 
+{ + va_list args; + va_start(args, format); + ctx->err_msg = mp_vprintf(ctx->pool, format, args); + ctx->err_code = XML_ERR_FATAL; + ctx->state = XML_STATE_FATAL; + va_end(args); + if (ctx->h_fatal) + ctx->h_fatal(ctx); + xml_throw(ctx); +} + +/*** Character categorization ***/ + +#include "obj/lib/xml-ucat.h" + +static inline uns +xml_char_cat(uns c) +{ + if (c < 0x10000) + return 1U << xml_char_tab2[(c & 0xff) + xml_char_tab1[c >> 8]]; + else if (likely(c < 0x110000)) + return 1U << xml_char_tab3[c >> 16]; + else + return 1; +} + +/*** Reading of document/external entities ***/ + +static void NONRET +xml_eof(struct xml_context *ctx) +{ + ctx->err_msg = "Unexpected EOF"; + ctx->err_code = XML_ERR_EOF; + xml_throw(ctx); +} + +static void NONRET +xml_fatal_nested(struct xml_context *ctx) +{ + xml_fatal(ctx, "Entity is not nested correctly"); +} + +static inline void +xml_inc_depth(struct xml_context *ctx) +{ + ctx->depth++; +} + +static inline void +xml_dec_depth(struct xml_context *ctx) +{ + if (unlikely(!ctx->depth)) + xml_fatal_nested(ctx); + ctx->depth--; +} + +static void +xml_push_source(struct xml_context *ctx, struct fastbuf *fb, uns flags) +{ + DBG("XML: xml_push_source"); + struct xml_source *osrc = ctx->sources; + if (osrc) + { + osrc->bptr = ctx->bptr; + osrc->bstop = ctx->bstop; + osrc->depth = ctx->depth; + } + struct xml_source *src = mp_alloc(ctx->pool, sizeof(*src)); + src->next = osrc; + src->flags = flags; + src->fb = fb; + ctx->depth = 0; + ctx->sources = src; + ctx->bstop = ctx->bptr = src->buf; + if (flags & XML_SRC_SURROUND) + { /* Queue the leading space as unread data: readers consume [bptr, bstop), so it must be appended via bstop. */ + *ctx->bstop++ = 0x20; + *ctx->bstop++ = xml_char_cat(0x20); + } +} + +void +xml_set_source(struct xml_context *ctx, struct fastbuf *fb) +{ + xml_push_source(ctx, fb, XML_SRC_DOCUMENT | XML_SRC_DECL); +} + +static void +xml_pop_source(struct xml_context *ctx) +{ + DBG("XML: xml_pop_source"); + if (unlikely(ctx->depth)) + xml_fatal(ctx, "Invalid entity nesting"); + struct xml_source *src = ctx->sources; + 
bclose(src->fb); + ctx->sources = src = src->next; + if (unlikely(!src)) + xml_eof(ctx); + ctx->bptr = src->bptr; + ctx->bstop = src->bstop; + ctx->depth = src->depth; +} + +static uns +xml_error_restricted(struct xml_context *ctx, uns c) +{ + xml_error(ctx, "Restricted char U+%04X", c); + return UNI_REPLACEMENT; +} + +static void xml_parse_decl(struct xml_context *ctx); + +static void +xml_refill(struct xml_context *ctx) +{ + // FIXME: + // -- various encodings, especially UTF-16 + // -- track col/row numbers + // -- report incorrect encoding + // -- deal with forbidden XML 1.1 newlines in xml/text decl + do + { + struct xml_source *src = ctx->sources; + uns c, t, t1, t2, f = src->flags; + if (f & XML_SRC_EOF) + xml_pop_source(ctx); + else if (f & XML_SRC_DECL) + xml_parse_decl(ctx); + else + { + struct fastbuf *fb = src->fb; + if (ctx->bptr == ctx->bstop) + ctx->bptr = ctx->bstop = src->buf; + u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, *last_0xd = (f & XML_SRC_NEW_LINE) ? 
bstop : bend; + if (ctx->flags & XML_FLAG_VERSION_1_1) + { + t2 = XML_CHAR_NEW_LINE_1_1; + t1 = XML_CHAR_UNRESTRICTED_1_1 & ~t2; + } + else + { + t2 = XML_CHAR_NEW_LINE_1_0; + t1 = XML_CHAR_VALID_1_0 & ~t2; + } + while (bstop < bend) + { + c = bget_utf8_32(fb); + t = xml_char_cat(c); + if (t & t1) + { + /* Typical branch */ + *bstop++ = c; + *bstop++ = t; + } + else if (t & t2) + { + /* New line + * XML 1.0: 0xA | 0xD | 0xD 0xA + * XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ + *bstop++ = 0xa; + *bstop++ = xml_char_cat(0xa); + if (c == 0xd) + last_0xd = bstop; + else if (c != 0x2028 && last_0xd != bstop - 2) + bstop -= 2; + } + else if ((int)c >= 0) + { + /* Restricted character */ + c = xml_error_restricted(ctx, c); + *bstop++ = c; + *bstop++ = xml_char_cat(c); + } + else + { + /* EOF */ + if (f & XML_SRC_SURROUND) + { + *bstop++ = 0x20; + *bstop++ = xml_char_cat(0x20); + } + f |= XML_SRC_EOF; + break; + } + } + if (last_0xd == bstop) + f |= XML_SRC_NEW_LINE; + else + f &= ~XML_SRC_NEW_LINE; + ctx->sources->flags = f; + ctx->bstop = bstop; + DBG("XML: refilled %u characters", (uns)(ctx->bstop - ctx->bptr) / 2); + } + } + while (ctx->bptr == ctx->bstop); +} + +static inline uns +xml_peek_char(struct xml_context *ctx) +{ + if (ctx->bptr == ctx->bstop) + xml_refill(ctx); + return ctx->bptr[0]; +} + +static inline uns +xml_peek_cat(struct xml_context *ctx) +{ + if (ctx->bptr == ctx->bstop) + xml_refill(ctx); + return ctx->bptr[1]; +} + +static inline uns +xml_get_char(struct xml_context *ctx) +{ + uns c = xml_peek_char(ctx); + ctx->bptr += 2; + return c; +} + +static inline uns +xml_get_cat(struct xml_context *ctx) +{ + uns c = xml_peek_cat(ctx); + ctx->bptr += 2; + return c; +} + +static inline uns +xml_last_char(struct xml_context *ctx) +{ + return ctx->bptr[-2]; +} + +static inline uns +xml_last_cat(struct xml_context *ctx) +{ + return ctx->bptr[-1]; +} + +static inline uns +xml_skip_char(struct xml_context *ctx) +{ + uns c = ctx->bptr[0]; + ctx->bptr 
+= 2; + return c; +} + +static inline uns +xml_unget_char(struct xml_context *ctx) +{ + return *(ctx->bptr -= 2); +} + +/*** Basic parsing ***/ + +static void NONRET +xml_fatal_expected(struct xml_context *ctx, uns c) +{ + xml_fatal(ctx, "Expected '%c'", c); +} + +static void NONRET +xml_fatal_expected_white(struct xml_context *ctx) +{ + xml_fatal(ctx, "Expected a white space"); +} + +static void NONRET +xml_fatal_expected_quot(struct xml_context *ctx) +{ + xml_fatal(ctx, "Expected a quotation mark"); +} + +static inline uns +xml_parse_white(struct xml_context *ctx, uns mandatory) +{ + /* mandatory=1 -> S ::= (#x20 | #x9 | #xD | #xA)+ + * mandatory=0 -> S? */ + uns cnt = 0; + while (xml_peek_cat(ctx) & XML_CHAR_WHITE) + { + xml_skip_char(ctx); + cnt++; + } + if (unlikely(mandatory && !cnt)) + xml_fatal_expected_white(ctx); + return cnt; +} + +static inline void +xml_parse_char(struct xml_context *ctx, uns c) +{ + /* Consumes a given Unicode character */ + if (unlikely(c != xml_get_char(ctx))) + xml_fatal_expected(ctx, c); +} + +static inline void +xml_parse_seq(struct xml_context *ctx, const char *seq) +{ + /* Consumes a given sequence of ASCII characters */ + while (*seq) + xml_parse_char(ctx, *seq++); +} + +static void +xml_parse_eq(struct xml_context *ctx) +{ + /* Eq ::= S? '=' S? 
*/ + xml_parse_white(ctx, 0); + xml_parse_char(ctx, '='); + xml_parse_white(ctx, 0); +} + +static inline uns +xml_parse_quote(struct xml_context *ctx) +{ + /* "'" | '"' */ + uns c = xml_get_char(ctx); + if (unlikely(c != '\'' && c != '\"')) + xml_fatal_expected_quot(ctx); + return c; +} + +/* Names and nmtokens */ + +static char * +xml_parse_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err) +{ + char *p = mp_start_noalign(ctx->pool, 1); + if (unlikely(!(xml_peek_cat(ctx) & first_cat))) + xml_fatal(ctx, "%s", err); + do + { + p = mp_spread(ctx->pool, p, 5); + p = utf8_32_put(p, xml_skip_char(ctx)); + } + while (xml_peek_cat(ctx) & next_cat); + *p++ = 0; + return mp_end(ctx->pool, p); +} + +static void +xml_skip_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err) +{ + if (unlikely(!(xml_get_cat(ctx) & first_cat))) + xml_fatal(ctx, "%s", err); + while (xml_peek_cat(ctx) & next_cat) + xml_skip_char(ctx); +} + +static char * +xml_parse_name(struct xml_context *ctx) +{ + /* Name ::= NameStartChar (NameChar)* */ + return xml_parse_string(ctx, + !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1, + !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1, + "Expected a name"); +} + +static void +xml_skip_name(struct xml_context *ctx) +{ + xml_skip_string(ctx, + !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1, + !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1, + "Expected a name"); +} + +static char * +xml_parse_nmtoken(struct xml_context *ctx) +{ + /* Nmtoken ::= (NameChar)+ */ + uns cat = !(ctx->flags & XML_FLAG_VERSION_1_1) ? 
XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1; + return xml_parse_string(ctx, cat, cat, "Expected a nmtoken"); +} + +/* Simple literals */ + +static char * +xml_parse_system_literal(struct xml_context *ctx) +{ + /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */ + char *p = mp_start_noalign(ctx->pool, 1); + uns q = xml_parse_quote(ctx), c; + while ((c = xml_get_char(ctx)) != q) + { + p = mp_spread(ctx->pool, p, 5); + p = utf8_32_put(p, c); + } + *p++ = 0; + return mp_end(ctx->pool, p); +} + +static char * +xml_parse_pubid_literal(struct xml_context *ctx) +{ + /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */ + char *p = mp_start_noalign(ctx->pool, 1); + uns q = xml_parse_quote(ctx), c; + while ((c = xml_get_char(ctx)) != q) + { + if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID))) + xml_fatal(ctx, "Expected a pubid character"); + p = mp_spread(ctx->pool, p, 2); + *p++ = c; + } + *p++ = 0; + return mp_end(ctx->pool, p); +} + +static char * +xml_parse_encoding_name(struct xml_context *ctx) +{ + /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" + * Returns the name without quotes as a zero-terminated string allocated from ctx->pool. + * Every character is consumed exactly once and stored on the next iteration via xml_last_char(), so none is skipped. */ + char *p = mp_start_noalign(ctx->pool, 1); + uns q = xml_parse_quote(ctx); + if (unlikely(!(xml_get_cat(ctx) & XML_CHAR_ENC_SNAME))) + xml_fatal(ctx, "Invalid character in the encoding name"); + while(1) + { + p = mp_spread(ctx->pool, p, 2); + *p++ = xml_last_char(ctx); /* the character validated just before */ + if (xml_get_char(ctx) == q) + break; + if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME))) + xml_fatal(ctx, "Invalid character in the encoding name"); + } + *p++ = 0; + return mp_end(ctx->pool, p); +} + +/* Document/external entity header */ + +static void +xml_detect_encoding(struct xml_context *ctx) +{ + DBG("XML: xml_detect_encoding"); + struct xml_source *src = ctx->sources; + struct fastbuf *fb = src->fb; + char *detected_encoding = NULL; + uns x = 0, l = 0, c, z = 1; + while (l < 4) + { + if (!~(c = bgetc(fb))) + { + src->flags |= XML_SRC_EOF; + break; + } + else if (!c || c >= 
0xfe || c == 0xa7 || c == 0x94) + z = 0; + else if ((c < 0x3c || c > 0x78)) + { + bungetc(fb); + break; + } + x = (x << 8) + c; + l++; + } + if (z) + z = x; + else if (l == 2) + switch (x) + { + case 0xFEFF: + xml_fatal(ctx, "UTF-16BE encoding not supported"); + case 0xFFFE: + xml_fatal(ctx, "UTF-16LE encoding not supported"); + default: + goto cannot_detect; + } + else if (l == 4) + switch (x) + { + case 0x0000FEFF: + xml_fatal(ctx, "UCS-4BE encoding not supported"); + case 0xFFFE0000: + xml_fatal(ctx, "UCS-4LE encoding not supported"); + case 0x0000FFFE: + xml_fatal(ctx, "UCS-4 encoding (order 2143) not supported"); + case 0xFEFF0000: + xml_fatal(ctx, "UCS-4 encoding (order 3412) not supported"); + case 0x0000003c: + xml_fatal(ctx, "UCS-4BE encoding not supported"); + case 0x3c000000: + xml_fatal(ctx, "UCS-4LE encoding not supported"); + case 0x00003c00: + xml_fatal(ctx, "UCS-4 encoding (order 2143) not supported"); + case 0x003c0000: + xml_fatal(ctx, "UCS-4 encoding (order 3412) not supported"); + case 0x003c003F: + xml_fatal(ctx, "UTF-16BE encoding not supported"); + case 0x3C003F00: + xml_fatal(ctx, "UTF-16LE encoding not supported"); + case 0x3C3F786D: + xml_fatal(ctx, "EBCDIC encoding not supported"); + default: + goto cannot_detect; + } + else +cannot_detect: + xml_fatal(ctx, "Cannot detect the encoding"); + ctx->bptr = ctx->bstop = src->buf + 8; + while (z) + { + c = z & 0xff; + z >>= 8; + *--ctx->bptr = xml_char_cat(c); + *--ctx->bptr = c; + } + if (!detected_encoding && ctx->bstop == ctx->bptr && xml_peek_char(ctx) == 0xfeff) + xml_skip_char(ctx); + DBG("XML: Detected encoding: %s", detected_encoding ? 
: "UTF-8"); + if (!(src->flags & XML_SRC_EOF)) + xml_refill(ctx); +} + +static void +xml_parse_decl(struct xml_context *ctx) +{ + DBG("XML: xml_parse_decl"); + ctx->sources->flags &= ~XML_SRC_DECL; + xml_detect_encoding(ctx); + uns document = ctx->sources->flags & XML_SRC_DOCUMENT; + u32 *bptr = ctx->bptr; + uns have_decl = + (12 <= ctx->bstop - ctx->bptr && + bptr[0] == '<' && bptr[2] == '?' && (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L' && + (bptr[11] & XML_CHAR_WHITE)); + if (!have_decl) + { + if (document) + xml_fatal(ctx, "Missing or corrupted XML declaration header"); + return; + } + ctx->bptr += 12; + + /* FIXME: the header must not contain exotic newlines */ + xml_parse_white(ctx, 0); + + if (xml_peek_char(ctx) == 'v') + { + xml_parse_seq(ctx, "version"); + xml_parse_eq(ctx); + char *version = xml_parse_pubid_literal(ctx); + DBG("XML: Version=%s", version); + if (document) + { + ctx->version_str = version; + if (!strcmp(ctx->version_str, "1.0")) + ; + else if (!strcmp(ctx->version_str, "1.1")) + ctx->flags |= XML_FLAG_VERSION_1_1; + else + xml_fatal(ctx, "Unsupported XML version"); + } + else if (strcmp(version, ctx->version_str)) + xml_error(ctx, "Mixed XML versions"); + } + else if (document) + xml_fatal(ctx, "Missing XML version"); + + // FIXME: TextDecl must contain encoding + if (!xml_parse_white(ctx, 0)) + goto end; + if (xml_peek_char(ctx) == 'e') + { + xml_parse_seq(ctx, "encoding"); + xml_parse_eq(ctx); + ctx->encoding = xml_parse_encoding_name(ctx); + DBG("encoding=%s", ctx->encoding); + // FIXME: check encoding + if (!xml_parse_white(ctx, 0)) + goto end; + } + + if (document && xml_peek_char(ctx) == 's') + { + xml_parse_seq(ctx, "standalone"); + xml_parse_eq(ctx); + uns c = xml_parse_quote(ctx); + if (ctx->standalone = (xml_peek_char(ctx) == 'y')) + xml_parse_seq(ctx, "yes"); + else + xml_parse_seq(ctx, "no"); + xml_parse_char(ctx, c); + DBG("standalone=%d", ctx->standalone); + xml_parse_white(ctx, 0); + } +end: 
+ xml_parse_seq(ctx, "?>"); +} + +/*** Document Type Definition (DTD) ***/ + +/* Notations */ + +#define HASH_PREFIX(x) xml_dtd_notns_##x +#define HASH_NODE struct xml_dtd_notn +#define HASH_KEY_STRING name +#define HASH_AUTO_POOL 1024 +#define HASH_ZERO_FILL +#define HASH_TABLE_DYNAMIC +#define HASH_WANT_FIND +#define HASH_WANT_LOOKUP +#define HASH_WANT_CLEANUP +#include "lib/hashtable.h" + +/* General entities */ + +#define HASH_PREFIX(x) xml_dtd_ents_##x +#define HASH_NODE struct xml_dtd_ent +#define HASH_KEY_STRING name +#define HASH_AUTO_POOL 1024 +#define HASH_ZERO_FILL +#define HASH_TABLE_DYNAMIC +#define HASH_WANT_FIND +#define HASH_WANT_LOOKUP +#define HASH_WANT_CLEANUP +#include "lib/hashtable.h" + +static void +xml_dtd_declare_trivial_gent(struct xml_context *ctx, char *name, char *text) +{ + struct xml_dtd *dtd = ctx->dtd; + struct xml_dtd_ent *ent = xml_dtd_ents_lookup(dtd->tab_gents, name); + if (ent->flags & XML_DTD_ENT_DECLARED) + { + xml_warn(ctx, "Entity &%s; already declared", name); + return; + } + slist_add_tail(&dtd->gents, &ent->n); + ent->flags = XML_DTD_ENT_DECLARED | XML_DTD_ENT_TRIVIAL; + ent->text = text; +} + +static void +xml_dtd_declare_default_gents(struct xml_context *ctx) +{ + xml_dtd_declare_trivial_gent(ctx, "lt", "<"); + xml_dtd_declare_trivial_gent(ctx, "gt", ">"); + xml_dtd_declare_trivial_gent(ctx, "amp", "&"); + xml_dtd_declare_trivial_gent(ctx, "apos", "'"); + xml_dtd_declare_trivial_gent(ctx, "quot", "\""); +} + +static struct xml_dtd_ent * +xml_dtd_find_gent(struct xml_context *ctx, char *name) +{ + struct xml_dtd *dtd = ctx->dtd; + if (dtd) + { + struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_gents, name); + return (ent->flags & XML_DTD_ENT_DECLARED) ? 
ent : NULL; + } + else + { +#define ENT(n, t) ent_##n = { .name = #n, .text = t, .len = 1, .flags = XML_DTD_ENT_DECLARED | XML_DTD_ENT_TRIVIAL } + static struct xml_dtd_ent ENT(lt, "<"), ENT(gt, ">"), ENT(amp, "&"), ENT(apos, "'"), ENT(quot, "\""); +#undef ENT + switch (name[0]) + { + case 'l': + if (!strcmp(name, "lt")) + return &ent_lt; + break; + case 'g': + if (!strcmp(name, "gt")) + return &ent_gt; + break; + case 'a': + if (!strcmp(name, "amp")) + return &ent_amp; + if (!strcmp(name, "apos")) + return &ent_apos; + break; + case 'q': + if (!strcmp(name, "quot")) + return &ent_quot; + break; + } + return NULL; + } +} + +/* Parameter entities */ + +static struct xml_dtd_ent * +xml_dtd_find_pent(struct xml_context *ctx, char *name) +{ + /* Returns the declared parameter entity called name, or NULL when there is no DTD or no such declared entity. */ + struct xml_dtd *dtd = ctx->dtd; + struct xml_dtd_ent *ent = dtd ? xml_dtd_ents_find(dtd->tab_pents, name) : NULL; + return (ent && (ent->flags & XML_DTD_ENT_DECLARED)) ? ent : NULL; +} + +/* Elements */ + +#define HASH_PREFIX(x) xml_dtd_elems_##x +#define HASH_NODE struct xml_dtd_elem +#define HASH_KEY_STRING name +#define HASH_TABLE_DYNAMIC +#define HASH_AUTO_POOL 1024 +#define HASH_ZERO_FILL +#define HASH_WANT_LOOKUP +#define HASH_WANT_CLEANUP +#include "lib/hashtable.h" + +/* Element attributes */ + +struct xml_dtd_attrs_table; + +static inline uns +xml_dtd_attrs_hash(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem, char *name) +{ + return hash_pointer(elem) ^ hash_string(name); +} + +static inline int +xml_dtd_attrs_eq(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem1, char *name1, struct xml_dtd_elem *elem2, char *name2) +{ + return (elem1 == elem2) && !strcmp(name1, name2); +} + +static inline void +xml_dtd_attrs_init_key(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_elem *elem, char *name) +{ + attr->elem = elem; + attr->name = name; +} + +#define HASH_PREFIX(x) xml_dtd_attrs_##x +#define HASH_NODE struct xml_dtd_attr +#define HASH_AUTO_POOL 1024 +#define HASH_ZERO_FILL 
+#define HASH_TABLE_DYNAMIC +#define HASH_KEY_COMPLEX(x) x elem, x name +#define HASH_KEY_DECL struct xml_dtd_elem *elem, char *name +#define HASH_GIVE_HASHFN +#define HASH_GIVE_EQ +#define HASH_GIVE_INIT_KEY +#define HASH_WANT_FIND +#define HASH_WANT_NEW +#define HASH_WANT_CLEANUP +#include "lib/hashtable.h" + +/* Enumerated attribute values */ + +struct xml_dtd_evals_table; + +static inline uns +xml_dtd_evals_hash(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr, char *val) +{ + return hash_pointer(attr) ^ hash_string(val); +} + +static inline int +xml_dtd_evals_eq(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr1, char *val1, struct xml_dtd_attr *attr2, char *val2) +{ + return (attr1 == attr2) && !strcmp(val1, val2); +} + +static inline void +xml_dtd_evals_init_key(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_eval *eval, struct xml_dtd_attr *attr, char *val) +{ + eval->attr = attr; + eval->val = val; +} + +#define HASH_PREFIX(x) xml_dtd_evals_##x +#define HASH_NODE struct xml_dtd_eval +#define HASH_AUTO_POOL 1024 +#define HASH_TABLE_DYNAMIC +#define HASH_KEY_COMPLEX(x) x attr, x val +#define HASH_KEY_DECL struct xml_dtd_attr *attr, char *val +#define HASH_GIVE_HASHFN +#define HASH_GIVE_EQ +#define HASH_GIVE_INIT_KEY +#define HASH_WANT_FIND +#define HASH_WANT_NEW +#define HASH_WANT_CLEANUP +#include "lib/hashtable.h" + +/* Enumerated attribute notations */ + +struct xml_dtd_enotns_table; + +static inline uns +xml_dtd_enotns_hash(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn) +{ + return hash_pointer(attr) ^ hash_pointer(notn); +} + +static inline int +xml_dtd_enotns_eq(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr1, struct xml_dtd_notn *notn1, struct xml_dtd_attr *attr2, struct xml_dtd_notn *notn2) +{ + return (attr1 == attr2) && (notn1 == notn2); +} + +static inline void +xml_dtd_enotns_init_key(struct xml_dtd_enotns_table *tab UNUSED, struct 
xml_dtd_enotn *enotn, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn) +{ + enotn->attr = attr; + enotn->notn = notn; +} + +#define HASH_PREFIX(x) xml_dtd_enotns_##x +#define HASH_NODE struct xml_dtd_enotn +#define HASH_AUTO_POOL 1024 +#define HASH_TABLE_DYNAMIC +#define HASH_KEY_COMPLEX(x) x attr, x notn +#define HASH_KEY_DECL struct xml_dtd_attr *attr, struct xml_dtd_notn *notn +#define HASH_GIVE_HASHFN +#define HASH_GIVE_EQ +#define HASH_GIVE_INIT_KEY +#define HASH_WANT_FIND +#define HASH_WANT_NEW +#define HASH_WANT_CLEANUP +#include "lib/hashtable.h" + +/* DTD initialization/cleanup */ + +static void +xml_dtd_init(struct xml_context *ctx) +{ + ctx->dtd = mp_alloc_zero(ctx->pool, sizeof(*ctx->dtd)); + xml_dtd_ents_init(ctx->dtd->tab_gents = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_ents_table))); + xml_dtd_ents_init(ctx->dtd->tab_pents = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_ents_table))); + xml_dtd_notns_init(ctx->dtd->tab_notns = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_notns_table))); + xml_dtd_elems_init(ctx->dtd->tab_elems = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_elems_table))); + xml_dtd_attrs_init(ctx->dtd->tab_attrs = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_attrs_table))); + xml_dtd_evals_init(ctx->dtd->tab_evals = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_evals_table))); + xml_dtd_enotns_init(ctx->dtd->tab_enotns = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_enotns_table))); + xml_dtd_declare_default_gents(ctx); +} + +static void +xml_dtd_cleanup(struct xml_context *ctx) +{ + if (!ctx->dtd) + return; + xml_dtd_ents_cleanup(ctx->dtd->tab_gents); + xml_dtd_ents_cleanup(ctx->dtd->tab_pents); + xml_dtd_notns_cleanup(ctx->dtd->tab_notns); + xml_dtd_elems_cleanup(ctx->dtd->tab_elems); + xml_dtd_attrs_cleanup(ctx->dtd->tab_attrs); + xml_dtd_evals_cleanup(ctx->dtd->tab_evals); + xml_dtd_enotns_cleanup(ctx->dtd->tab_enotns); +} + +static void +xml_dtd_finish(struct xml_context *ctx) +{ + if (!ctx->dtd) + return; + 
// FIXME +} + +/*** Parsing functions ***/ + +/* Comments */ + +static void +xml_push_comment(struct xml_context *ctx) +{ + /* Parse a comment to ctx->value: + * Comment ::= '' + * Already parsed: 'value; + uns c; + xml_parse_char(ctx, '-'); + while (1) + { + if ((c = xml_get_char(ctx)) == '-') + if ((c = xml_get_char(ctx)) == '-') + break; + else + bputc(out, '-'); + bput_utf8_32(out, c); + } + xml_parse_char(ctx, '>'); + fbgrow_rewind(out); + if (ctx->h_comment) + ctx->h_comment(ctx); +} + +static void +xml_pop_comment(struct xml_context *ctx) +{ + fbgrow_rewind(ctx->value); +} + +static void +xml_skip_comment(struct xml_context *ctx) +{ + xml_parse_char(ctx, '-'); + while (xml_get_char(ctx) != '-' || xml_get_char(ctx) != '-'); + xml_parse_char(ctx, '>'); +} + +/* Processing instructions */ + +static void +xml_push_pi(struct xml_context *ctx) +{ + /* Parses a PI to ctx->value and ctx->name: + * PI ::= '' Char*)))? '?>' + * PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) + * Already parsed: 'name = xml_parse_name(ctx); + if (unlikely(!strcasecmp(ctx->name, "xml"))) + xml_fatal(ctx, "Reserved PI target"); + struct fastbuf *out = ctx->value; + if (xml_parse_white(ctx, 0)) + xml_parse_seq(ctx, "?>"); + else + { + while (1) + { + uns c; + if ((c = xml_get_char(ctx)) == '?') + if (xml_get_char(ctx) == '>') + break; + else + { + xml_unget_char(ctx); + bputc(out, '?'); + } + else + bput_utf8_32(out, c); + } + fbgrow_rewind(out); + } + if (ctx->h_pi) + ctx->h_pi(ctx); +} + +static void +xml_pop_pi(struct xml_context *ctx) +{ + fbgrow_reset(ctx->value); +} + +static void +xml_skip_pi(struct xml_context *ctx) +{ + if (ctx->flags & XML_FLAG_VALIDATING) + { + mp_push(ctx->pool); + if (unlikely(!strcasecmp(xml_parse_name(ctx), "xml"))) + xml_fatal(ctx, "Reserved PI target"); + mp_pop(ctx->pool); + if (!xml_parse_white(ctx, 0)) + { + xml_parse_seq(ctx, "?>"); + return; + } + } + while (1) + if (xml_get_char(ctx) == '?') + if (xml_get_char(ctx) == '>') + break; + else 
+ xml_unget_char(ctx); +} + +/* Character references */ + +static uns +xml_parse_char_ref(struct xml_context *ctx) +{ + /* CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' + * Already parsed: '&#' */ + uns v = 0; + if (xml_get_char(ctx) == 'x') + { + if (!(xml_get_cat(ctx) & XML_CHAR_XDIGIT)) + { + xml_error(ctx, "Expected a hexadecimal value of character reference"); + goto recover; + } + do + { + v = (v << 4) + Cxvalue(xml_last_char(ctx)); + } + while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_XDIGIT)); + } + else + { + if (!(xml_last_cat(ctx) & XML_CHAR_DIGIT)) + { + xml_error(ctx, "Expected a numeric value of character reference"); + goto recover; + } + do + { + v = v * 10 + xml_last_char(ctx) - '0'; + } + while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT)); + } + uns cat = xml_char_cat(v); + if (!(cat & XML_CHAR_UNRESTRICTED_1_1) && ((ctx->flags & XML_FLAG_VERSION_1_1) || !(cat & XML_CHAR_VALID_1_0))) + { + xml_error(ctx, "Character reference out of range"); + goto recover; + } + if (xml_last_char(ctx) == ';') + return v; + xml_error(ctx, "Expected ';'"); +recover: + while (xml_last_char(ctx) != ';') + xml_get_char(ctx); + return UNI_REPLACEMENT; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////////////// + +static void +xml_parse_parameter_ref(struct xml_context *ctx) +{ + char *name = xml_parse_name(ctx); + xml_parse_char(ctx, ';'); + struct xml_dtd_ent *ent = xml_dtd_ents_find(ctx->dtd->tab_pents, name); + if (!ent || !(ent->flags & XML_DTD_ENT_DECLARED)) + { + xml_error(ctx, "Reference to unknown parameter entity %%%s", name); + return; + } + if (ent->flags & XML_DTD_ENT_VISITED) + { + xml_error(ctx, "Cycled references to parameter entity %%%s", name); + return; + } + if (ent->flags & XML_DTD_ENT_EXTERNAL) + { + // FIXME: + xml_error(ctx, "Support for external parsed entities not implemented"); + return; + } + ent->flags |= XML_DTD_ENT_VISITED; // FIXME: clear + struct fastbuf *fb = 
mp_alloc(ctx->pool, sizeof(*fb)); + fbbuf_init_read(fb, ent->text, ent->len, 0); + xml_push_source(ctx, fb, 0); +} + +static inline void +xml_check_parameter_ref(struct xml_context *ctx) +{ + if (xml_get_char(ctx) != '%') + { + xml_unget_char(ctx); + return; + } + xml_parse_parameter_ref(ctx); +} + +static void +xml_parse_external_id(struct xml_context *ctx, struct xml_ext_id *eid, uns allow_public) +{ + bzero(eid, sizeof(*eid)); + uns c = xml_get_char(ctx); + if (c == 'S') + { + xml_parse_seq(ctx, "YSTEM"); + xml_parse_white(ctx, 1); + eid->system_id = xml_parse_system_literal(ctx); + } + else if (c == 'P') + { + xml_parse_seq(ctx, "UBLIC"); + xml_parse_white(ctx, 1); + eid->public_id = xml_parse_pubid_literal(ctx); + if (xml_parse_white(ctx, 1)) + if ((c = xml_get_char(ctx)) == '\'' || c == '"' || !allow_public) + { + xml_unget_char(ctx); + eid->system_id = xml_parse_system_literal(ctx); + } + else + xml_unget_char(ctx); + } + else + xml_fatal(ctx, "Expected an external ID"); +} + +static void +xml_parse_notation_decl(struct xml_context *ctx) +{ + /* NotationDecl ::= ''*/ + xml_parse_white(ctx, 1); + struct xml_dtd_notn *notn = xml_dtd_notns_lookup(ctx->dtd->tab_notns, xml_parse_name(ctx)); + xml_parse_white(ctx, 1); + struct xml_ext_id eid; + xml_parse_external_id(ctx, &eid, 1); + xml_parse_white(ctx, 0); + xml_parse_char(ctx, '>'); + if (notn->flags & XML_DTD_NOTN_DECLARED) + xml_warn(ctx, "Notation %s already declared", notn->name); + else + { + notn->flags = XML_DTD_NOTN_DECLARED; + notn->eid = eid; + } +} + +static void +xml_parse_internal_subset(struct xml_context *ctx) +{ + while (1) + { + xml_parse_white(ctx, 0); + uns c = xml_get_char(ctx); + if (c == '<') + if ((c = xml_get_char(ctx)) == '!') + switch (c = xml_get_char(ctx)) + { + case '-': + xml_push_comment(ctx); + xml_pop_comment(ctx); + break; + case 'N': + xml_parse_seq(ctx, "OTATION"); + xml_parse_notation_decl(ctx); + break; + case 'E': + if ((c = xml_get_char(ctx)) == 'N') + { + 
xml_parse_seq(ctx, "TITY"); + //xml_parse_entity_decl(ctx); + } + else if (c == 'L') + { + xml_parse_seq(ctx, "EMENT"); + // FIXME: Element + } + else + goto invalid_markup; + break; + case 'A': + xml_parse_seq(ctx, "TTLIST"); + // FIXME: AttList + break; + default: + goto invalid_markup; + } + else if (c == '?') + { + xml_push_pi(ctx); + xml_pop_pi(ctx); + } + else + goto invalid_markup; + else if (c == '%') + xml_parse_parameter_ref(ctx); + else if (c == ']') + break; + else + goto invalid_markup; + } + return; +invalid_markup: + xml_fatal(ctx, "Invalid markup in the internal subset"); +} + +/*----------------------------------------------*/ + + +/* FIXME */ + +struct xml_attribute_table; + +#define HASH_PREFIX(x) xml_attribute_##x +#define HASH_NODE struct xml_attribute +#define HASH_KEY_COMPLEX(x) x element, x name +#define HASH_KEY_DECL struct xml_element *element, char *name +#define HASH_TABLE_DYNAMIC +#define HASH_AUTO_POOL 1024 + +#define HASH_GIVE_HASHFN + +static inline uns +xml_attribute_hash(struct xml_attribute_table *t UNUSED, struct xml_element *e, char *n) +{ + return hash_pointer(e) ^ hash_string(n); +} + +#define HASH_GIVE_EQ + +static inline int +xml_attribute_eq(struct xml_attribute_table *t UNUSED, struct xml_element *e1, char *n1, struct xml_element *e2, char *n2) +{ + return (e1 == e2) && !strcmp(n1, n2); +} + +#define HASH_GIVE_INIT_KEY + +static inline void +xml_attribute_init_key(struct xml_attribute_table *t UNUSED, struct xml_attribute *a, struct xml_element *e, char *name) +{ + a->element = e; + a->name = name; + a->value = NULL; + a->next = e->attrs; + e->attrs = a; +} + +#define HASH_WANT_CLEANUP +#define HASH_WANT_REMOVE +#define HASH_WANT_LOOKUP +#define HASH_WANT_FIND +#include "lib/hashtable.h" + + +/* +#define HASH_PREFIX(x) xml_parsed_entities_##x +#define HASH_NODE struct xml_parsed_entity +#define HASH_KEY_STRING name +#define HASH_TABLE_DYNAMIC +#define HASH_AUTO_POOL 1024 +#define HASH_WANT_CLEANUP +#include 
"lib/hashtable.h" +*/ + +void +xml_init(struct xml_context *ctx) +{ + bzero(ctx, sizeof(*ctx)); + ctx->pool = mp_new(65536); + ctx->chars = fbgrow_create(4096); + ctx->value = fbgrow_create(4096); + xml_dtd_init(ctx); +} + +void +xml_cleanup(struct xml_context *ctx) +{ + xml_dtd_cleanup(ctx); + bclose(ctx->value); + bclose(ctx->chars); + mp_delete(ctx->pool); +} + +static void +xml_parse_cdata(struct xml_context *ctx) +{ + struct fastbuf *out = ctx->chars; + xml_parse_seq(ctx, "CDATA["); + while (1) + { + uns c; + if ((c = xml_get_char(ctx)) == ']') + { + if ((c = xml_get_char(ctx)) == ']') + if ((c = xml_get_char(ctx)) == '>') + break; + else + bputc(out, ']'); + bputc(out, ']'); + } + bput_utf8_32(out, c); + } +} + +static void +xml_skip_cdata(struct xml_context *ctx) +{ + xml_parse_cdata(ctx); +} + +static void +xml_parse_ref_entity(struct xml_context *ctx UNUSED, struct fastbuf *out UNUSED, struct xml_dtd_ent *entity UNUSED) +{ +#if 0 + for (struct xml_dtd_ent_node *node = entity->list; node; node = node->next) + if (node->len) + bwrite(out, node->ptr, node->len); + else + xml_parse_ref_entity(ctx, out, node->ptr); // FIXME: do not call the recursion on stack -- could cause segfault +#endif +} + +static void +xml_parse_ref(struct xml_context *ctx, struct fastbuf *out) +{ + if (xml_get_char(ctx) == '#') + { + uns c = xml_parse_char_ref(ctx); + bput_utf8_32(out, c); + } + else + { +#if 0 + xml_unget_char(ctx); + mp_push(ctx->pool); + char *name = xml_parse_name(ctx); + struct xml_parsed_entity *entity = xml_find_parsed_entity(ctx, name); + mp_pop(ctx->pool); + xml_parse_char(ctx, ';'); + xml_parse_ref_entity(ctx, out, entity); +#endif + } +} + +static void +xml_parse_chars(struct xml_context *ctx) +{ + DBG("parse_chars"); + struct fastbuf *out = ctx->chars; + uns c; + while ((c = xml_get_char(ctx)) != '<') + if (c == '&') + xml_parse_ref(ctx, out); + else + bput_utf8_32(out, c); + xml_unget_char(ctx); +} + +static void +xml_parse_attr(struct xml_context *ctx) +{ 
+ DBG("parse_attr"); + struct xml_element *e = ctx->element; + char *name = xml_parse_name(ctx); + struct xml_attribute *a = xml_attribute_lookup(ctx->attribute_table, e, name); + if (a->value) + xml_fatal(ctx, "Attribute is not unique"); + xml_parse_eq(ctx); + // FIXME + char *value = xml_parse_system_literal(ctx); + a->value = value; +} + +static uns +xml_parse_stag(struct xml_context *ctx) +{ + DBG("parse_stag"); + mp_push(ctx->pool); + struct xml_element *e = mp_alloc_zero(ctx->pool, sizeof(*e)); + e->parent = ctx->element; + ctx->element = e; + e->name = xml_parse_name(ctx); + while (1) + { + uns white = xml_parse_white(ctx, 0); + uns c = xml_get_char(ctx); + if (c == '/') + { + xml_parse_char(ctx, '>'); + return 1; + } + else if (c == '>') + return 0; + else if (!white) + xml_fatal(ctx, "Expected a white space"); + xml_unget_char(ctx); + xml_parse_attr(ctx); + } +} + +static void +xml_parse_etag(struct xml_context *ctx) +{ + DBG("parse_etag"); + struct xml_element *e = ctx->element; + ASSERT(e); + char *name = xml_parse_name(ctx); + if (strcmp(name, e->name)) + xml_fatal(ctx, "Invalid ETag, expected '%s'", e->name); + xml_parse_white(ctx, 0); + xml_parse_char(ctx, '>'); + // FIXME: remove on pooled hashtable? 
+ for (struct xml_attribute *a = e->attrs; a; a = a->next) + xml_attribute_remove(ctx->attribute_table, a); + ctx->element = e->parent; + mp_pop(ctx->pool); +} + +static void +xml_parse_element_decl(struct xml_context *ctx) +{ + // FIXME + mp_push(ctx->pool); + xml_parse_seq(ctx, "'); + mp_pop(ctx->pool); +} + +#if 0 +static void +xml_parse_attr_list_decl(struct xml_context *ctx) +{ + /* AttlistDecl ::= '' + * AttDef ::= S Name S AttType S DefaultDecl */ + xml_parse_seq(ctx, "ATTLIST"); + xml_parse_white(ctx, 1); + struct xml_dtd_elem *e = xml_dtd_elems_lookup(ctx->dtd->tab_elems, xml_parse_name(ctx)); + e->attlist_declared = 1; + + while (xml_parse_white(ctx, 0) && xml_get_char(ctx) != '>') + { + xml_unget_char(ctx); + char *name = xml_parse_name(ctx); + struct xml_dtd_attr *a = xml_dtd_attrs_find(ctx->dtd->tab_attrs, e, name); + uns ignored = 0; + if (a) + { + xml_warn(ctx, "Duplicate attribute definition"); + ignored++; + } + else + a = xml_dtd_attrs_new(ctx->dtd->tab_attrs, e, name); + xml_parse_white(ctx, 1); + if (xml_get_char(ctx) == '(') + { + if (!ignored) + a->type = XML_ATTR_ENUM; + do + { + xml_parse_white(ctx, 0); + char *value = xml_parse_nmtoken(ctx); + if (!ignored) + if (xml_dtd_evals_find(ctx->dtd->tab_evals, a, value)) + xml_error(ctx, "Duplicate enumeration value"); + else + xml_dtd_evals_new(ctx->dtd->tab_evals, a, value); + xml_parse_white(ctx, 0); + } + while (xml_get_char(ctx) == '|'); + xml_unget_char(ctx); + xml_parse_char(ctx, ')'); + } + else + { + xml_unget_char(ctx); + char *type = xml_parse_name(ctx); + enum xml_dtd_attribute_type t; + if (!strcmp(type, "CDATA")) + t = XML_ATTR_CDATA; + else if (!strcmp(type, "ID")) + t = XML_ATTR_ID; + else if (!strcmp(type, "IDREF")) + t = XML_ATTR_IDREF; + else if (!strcmp(type, "IDREFS")) + t = XML_ATTR_IDREFS; + else if (!strcmp(type, "ENTITY")) + t = XML_ATTR_ENTITY; + else if (!strcmp(type, "ENTITIES")) + t = XML_ATTR_ENTITIES; + else if (!strcmp(type, "NMTOKEN")) + t = XML_ATTR_NMTOKEN; + else 
if (!strcmp(type, "NMTOKENS")) + t = XML_ATTR_NMTOKENS; + else if (!strcmp(type, "NOTATION")) + { + t = XML_ATTR_NOTATION; + xml_parse_white(ctx, 1); + xml_parse_char(ctx, '('); + do + { + xml_parse_white(ctx, 0); + struct xml_dtd_notn *n = xml_dtd_notns_lookup(ctx->dtd->tab_notns, xml_parse_name(ctx)); + if (!ignored) + if (xml_dtd_enotns_find(ctx->dtd->tab_enotns, a, n)) + xml_error(ctx, "Duplicate enumerated notation"); + else + xml_dtd_enotns_new(ctx->dtd->tab_enotns, a, n); + xml_parse_white(ctx, 0); + } + while (xml_get_char(ctx) == '|'); + xml_unget_char(ctx); + xml_parse_char(ctx, ')'); + } + else + xml_fatal(ctx, "Unknown attribute type"); + if (!ignored) + a->type = t; + } + xml_parse_white(ctx, 1); + enum xml_dtd_attribute_default def = XML_ATTR_NONE; + if (xml_get_char(ctx) == '#') + switch (xml_get_char(ctx)) + { + case 'R': + xml_parse_seq(ctx, "EQUIRED"); + def = XML_ATTR_REQUIRED; + break; + case 'I': + xml_parse_seq(ctx, "MPLIED"); + def = XML_ATTR_IMPLIED; + break; + case 'F': + xml_parse_seq(ctx, "IXED"); + def = XML_ATTR_FIXED; + break; + default: + xml_fatal(ctx, "Expected a modifier for default attribute value"); + } + else + xml_unget_char(ctx); + if (def != XML_ATTR_REQUIRED && def != XML_ATTR_IMPLIED) + { + xml_parse_system_literal(ctx); + // FIXME + } + } +} +#endif + +static void +xml_parse_entity_decl(struct xml_context *ctx) +{ + struct xml_dtd *dtd = ctx->dtd; + xml_parse_white(ctx, 1); + + uns flags = (xml_get_char(ctx) == '%') ? XML_DTD_ENT_PARAMETER : 0; + if (flags) + xml_parse_white(ctx, 1); + else + xml_unget_char(ctx); + + struct xml_dtd_ent *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_gents, xml_parse_name(ctx)); + slist *list = flags ? 
&dtd->pents : &dtd->gents;
+  xml_parse_white(ctx, 1);
+  if (ent->flags & XML_DTD_ENT_DECLARED)
+    {
+       xml_fatal(ctx, "Entity &%s; already declared, skipping not implemented", ent->name);
+       // FIXME: should be only warning
+    }
+
+  /* The first character tells a quoted EntityValue from an ExternalID */
+  uns sep = xml_get_char(ctx), c;
+  if (sep == '\'' || sep == '"')
+    {
+      /* Internal entity:
+       * EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&'] | PEReference | Reference)* "'" */
+      struct fastbuf *out = ctx->value;
+      while (1)
+        {
+          /* The literal ends at the same quote character that opened it */
+          if ((c = xml_get_char(ctx)) == sep)
+            break;
+          else if (c == '%')
+            {
+              // FIXME
+              ASSERT(0);
+              //xml_parse_parameter_ref(ctx);
+            }
+          else if (c != '&')
+            bput_utf8_32(out, c);
+          else if ((c = xml_get_char(ctx)) == '#')
+            /* Character references are expanded right away */
+            bput_utf8_32(out, xml_parse_char_ref(ctx));
+          else
+            {
+              /* Bypass references to general entities */
+              mp_push(ctx->pool);
+              bputc(out, '&');
+              xml_unget_char(ctx);
+              bputs(out, xml_parse_name(ctx));
+              xml_parse_char(ctx, ';');
+              bputc(out, ';');
+              mp_pop(ctx->pool);
+            }
+        }
+      /* Zero-terminate the text, then copy it from the buffer to the pool */
+      bputc(out, 0);
+      fbgrow_rewind(out);
+      slist_add_tail(list, &ent->n);
+      ent->flags = flags | XML_DTD_ENT_DECLARED;
+      ent->len = out->bstop - out->bptr - 1;
+      ent->text = mp_memdup(ctx->pool, out->bptr, ent->len + 1);
+      fbgrow_reset(out);
+      /* Consume the end of the declaration, as the external branch does */
+      xml_parse_white(ctx, 0);
+      xml_parse_char(ctx, '>');
+    }
+  else
+    {
+      /* External entity */
+      struct xml_ext_id eid;
+      struct xml_dtd_notn *notn = NULL;
+      /* xml_parse_external_id() reads the leading 'S' / 'P' itself, so give it back */
+      xml_unget_char(ctx);
+      xml_parse_external_id(ctx, &eid, 0);
+      /* Only general entities (flags == 0) may carry an NDATA declaration */
+      if (!xml_parse_white(ctx, 0) || flags)
+        xml_parse_char(ctx, '>');
+      else if (xml_get_char(ctx) != '>')
+        {
+          /* General external unparsed entity */
+          flags |= XML_DTD_ENT_UNPARSED;
+          /* Return the character just tested so that the whole keyword is matched */
+          xml_unget_char(ctx);
+          xml_parse_seq(ctx, "NDATA");
+          xml_parse_white(ctx, 1);
+          notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx));
+          xml_parse_white(ctx, 0);
+          xml_parse_char(ctx, '>');
+        }
+      slist_add_tail(list, &ent->n);
+      ent->flags = flags | XML_DTD_ENT_DECLARED | XML_DTD_ENT_EXTERNAL;
+      ent->eid = eid;
+      ent->notn = notn;
+    }
+}
+
+static void
+xml_parse_doctype_decl(struct xml_context *ctx)
+{
+  if (ctx->document_type)
+    xml_fatal(ctx, "Multiple document types not allowed");
+  xml_parse_seq(ctx,
"DOCTYPE"); + xml_parse_white(ctx, 1); + ctx->document_type = xml_parse_name(ctx); + DBG("XML: DocumentType=%s", ctx->document_type); + uns white = xml_parse_white(ctx, 0); + uns c = xml_peek_char(ctx); + if (c != '>' && c != '[' && white) + { + xml_parse_external_id(ctx, &ctx->eid, 0); + xml_parse_white(ctx, 0); + } + if (ctx->h_doctype_decl) + ctx->h_doctype_decl(ctx); +} + +int +xml_next(struct xml_context *ctx) +{ + /* A nasty state machine */ + + DBG("XML: xml_next (state=%u)", ctx->state); + jmp_buf throw_buf; + ctx->throw_buf = &throw_buf; + if (setjmp(throw_buf)) + { +error: + if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal) + ctx->h_fatal(ctx); + ctx->state = XML_STATE_FATAL; + DBG("XML: raised fatal error"); + return -1; + } + uns c; + switch (ctx->state) + { + case XML_STATE_FATAL: + return -1; + + case XML_STATE_START: + DBG("XML: Entering Prolog"); + if (ctx->h_document_start) + ctx->h_document_start(ctx); + /* XMLDecl */ + xml_refill(ctx); + if (ctx->h_xml_decl) + ctx->h_xml_decl(ctx); + if (ctx->want & XML_WANT_DECL) + return ctx->state = XML_STATE_DECL; + case XML_STATE_DECL: + + /* Misc* (doctypedecl Misc*)? 
*/ + while (1) + { + xml_parse_white(ctx, 0); + xml_parse_char(ctx, '<'); + if ((c = xml_get_char(ctx)) == '?') + /* Processing intruction */ + if (!(ctx->want & XML_WANT_PI)) + xml_skip_pi(ctx); + else + { + xml_push_pi(ctx); + ctx->state = XML_STATE_PROLOG_PI; + return XML_STATE_PI; + case XML_STATE_PROLOG_PI: + xml_pop_pi(ctx); + } + else if (c != '!') + { + /* Found the root tag */ + xml_unget_char(ctx); + goto first_tag; + } + else if (xml_get_char(ctx) == '-') + if (!(ctx->want & XML_WANT_COMMENT)) + xml_skip_comment(ctx); + else + { + xml_push_comment(ctx); + ctx->state = XML_STATE_PROLOG_COMMENT; + return XML_STATE_COMMENT; + case XML_STATE_PROLOG_COMMENT: + xml_pop_comment(ctx); + } + else + { + /* DocTypeDecl */ + xml_unget_char(ctx); + xml_parse_doctype_decl(ctx); + if (ctx->want & XML_WANT_DOCUMENT_TYPE) + return ctx->state = XML_STATE_DOCUMENT_TYPE; + case XML_STATE_DOCUMENT_TYPE: + if (xml_peek_char(ctx) == '[') + { + xml_skip_char(ctx); + // FIXME + while (xml_get_char(ctx) != ']'); + xml_parse_white(ctx, 0); + } + xml_parse_char(ctx, '>'); + } + } + + case XML_STATE_PI: + mp_pop(ctx->pool); + case XML_STATE_COMMENT: + fbgrow_reset(ctx->value); + + case XML_STATE_CHARS: + + while (1) + { + if (xml_get_char(ctx) != '<') + { + /* CharData */ + xml_unget_char(ctx); + xml_parse_chars(ctx); + continue; + } +first_tag: ; + + if ((c = xml_get_char(ctx)) == '?') + { + /* PI */ + if (!(ctx->want & XML_WANT_PI)) + xml_skip_pi(ctx); + else + { + if (btell(ctx->chars)) + { + fbgrow_rewind(ctx->chars); + ctx->state = XML_STATE_CHARS_BEFORE_PI; + return XML_STATE_PI; + case XML_STATE_CHARS_BEFORE_PI: + fbgrow_reset(ctx->chars); + } + xml_push_pi(ctx); + return ctx->state = XML_STATE_PI; + } + } + + else if (c == '!') + if ((c = xml_get_char(ctx)) == '-') + { + /* Comment */ + if (!(ctx->want & XML_WANT_COMMENT)) + xml_skip_comment(ctx); + else + { + if (btell(ctx->chars)) + { + fbgrow_rewind(ctx->chars); + ctx->state = XML_STATE_CHARS_BEFORE_COMMENT; + return 
XML_STATE_CHARS; + case XML_STATE_CHARS_BEFORE_COMMENT: + fbgrow_reset(ctx->chars); + } + xml_push_comment(ctx); + return ctx->state = XML_STATE_COMMENT; + } + } + else if (c == '[') + { + /* CDATA */ + if (!(ctx->want & XML_WANT_CDATA)) + xml_skip_cdata(ctx); + else + { + if (btell(ctx->chars)) + { + fbgrow_rewind(ctx->chars); + ctx->state = XML_STATE_CHARS_BEFORE_CDATA; + return XML_STATE_CHARS; + case XML_STATE_CHARS_BEFORE_CDATA: + fbgrow_reset(ctx->chars); + } + xml_parse_cdata(ctx); + if (btell(ctx->chars)) + { + fbgrow_rewind(ctx->chars); + return ctx->state = XML_STATE_CDATA; + } + case XML_STATE_CDATA: + fbgrow_reset(ctx->chars); + } + } + else + xml_fatal(ctx, "Unexpected character after 'chars)) + { + fbgrow_rewind(ctx->chars); + ctx->state = XML_STATE_CHARS_BEFORE_STAG; + return XML_STATE_CHARS; + case XML_STATE_CHARS_BEFORE_STAG: + fbgrow_reset(ctx->chars); + } + + if (xml_parse_stag(ctx)) + { + } + if (ctx->want & XML_WANT_STAG) + return ctx->state = XML_STATE_STAG; + case XML_STATE_STAG: + // FIXME: EmptyElemTag + ; + + } + + else + { + /* ETag */ + if (btell(ctx->chars)) + { + fbgrow_rewind(ctx->chars); + ctx->state = XML_STATE_CHARS_BEFORE_ETAG; + return XML_STATE_CHARS; + case XML_STATE_CHARS_BEFORE_ETAG: + fbgrow_reset(ctx->chars); + } + + if (ctx->want & XML_WANT_ETAG) + return ctx->state = XML_STATE_ETAG; + case XML_STATE_ETAG: + + xml_parse_etag(ctx); + + if (!ctx->element) + goto epilog; + } + } + +epilog: + /* Misc* */ + DBG("XML: Entering epilog"); + while (1) + { + /* Epilog whitespace is the only place, where a valid document can reach EOF */ + if (setjmp(throw_buf)) + if (ctx->err_code == XML_ERR_EOF) + { + DBG("XML: Reached EOF"); + ctx->state = XML_STATE_EOF; + if (ctx->h_document_end) + ctx->h_document_end(ctx); + case XML_STATE_EOF: + return XML_STATE_EOF; + } + else + goto error; + xml_parse_white(ctx, 0); + if (setjmp(throw_buf)) + goto error; + + /* Misc */ + xml_parse_char(ctx, '<'); + if ((c = xml_get_char(ctx)) == '?') + /* 
Processing instruction */ + if (!(ctx->want & XML_WANT_PI)) + xml_skip_pi(ctx); + else + { + xml_push_pi(ctx); + return ctx->state = XML_STATE_EPILOG_PI, XML_STATE_PI; + case XML_STATE_EPILOG_PI: + xml_pop_pi(ctx); + } + else if (c == '!') + /* Comment */ + if (!(ctx->want & XML_WANT_COMMENT)) + xml_skip_comment(ctx); + else + { + xml_push_comment(ctx); + return ctx->state = XML_STATE_EPILOG_COMMENT, XML_STATE_COMMENT; + case XML_STATE_EPILOG_COMMENT: + xml_pop_comment(ctx); + } + else + xml_fatal(ctx, "Syntax error in the epilog"); + } + + } + return -1; +} + +#ifdef TEST + +static void +error(struct xml_context *ctx) +{ + msg((ctx->err_code < XML_ERR_ERROR) ? L_WARN_R : L_ERROR_R, "XML: %s", ctx->err_msg); +} + +static void +test(struct fastbuf *in, struct fastbuf *out) +{ + struct xml_context ctx; + xml_init(&ctx); + ctx.h_warn = ctx.h_error = ctx.h_fatal = error; + ctx.want = XML_WANT_ALL; + xml_set_source(&ctx, in); + int state; + while ((state = xml_next(&ctx)) >= 0) + switch (state) + { + case XML_STATE_CHARS: + bprintf(out, "CHARS [%.*s]\n", (int)(ctx.chars->bstop - ctx.chars->buffer), ctx.chars->buffer); + break; + case XML_STATE_STAG: + bprintf(out, "STAG <%s>\n", ctx.element->name); + for (struct xml_attribute *a = ctx.element->attrs; a; a = a->next) + bprintf(out, " ATTR %s=[%s]\n", a->name, a->value); + break; + case XML_STATE_ETAG: + bprintf(out, "ETAG \n", ctx.element->name); + break; + case XML_STATE_COMMENT: + bprintf(out, "COMMENT [%.*s]\n", (int)(ctx.value->bstop - ctx.value->buffer), ctx.value->buffer); + break; + case XML_STATE_PI: + bprintf(out, "PI [%s] [%.*s]\n", ctx.name, (int)(ctx.value->bstop - ctx.value->buffer), ctx.value->buffer); + break; + case XML_STATE_CDATA: + bprintf(out, "CDATA [%.*s]\n", (int)(ctx.chars->bstop - ctx.chars->buffer), ctx.chars->buffer); + break; + case XML_STATE_EOF: + bprintf(out, "EOF\n"); + goto end; + default: + bprintf(out, "STATE %u\n", state); + break; + } +end: + xml_cleanup(&ctx); +} + +int +main(void) 
+{ + struct fastbuf *in = bfdopen_shared(0, 1024); + struct fastbuf *out = bfdopen_shared(1, 1024); + test(in, out); + bclose(out); + return 0; +} + +#endif diff --git a/lib/xml.h b/lib/xml.h new file mode 100644 index 00000000..02e62462 --- /dev/null +++ b/lib/xml.h @@ -0,0 +1,318 @@ +/* + * UCW Library -- A simple XML parser + * + * (c) 2007 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#ifndef _UCW_XML_H +#define _UCW_XML_H + +#include "lib/clists.h" +#include "lib/slists.h" + +enum xml_error { + XML_ERR_OK = 0, + XML_ERR_WARN = 1000, /* Warning */ + XML_ERR_ERROR = 2000, /* Recoverable error */ + XML_ERR_FATAL = 3000, /* Unrecoverable error */ + XML_ERR_EOF, +}; + +enum xml_state { + XML_STATE_START = 0, + XML_STATE_DECL, + XML_STATE_DOCUMENT_TYPE, + XML_STATE_CHARS, + XML_STATE_WHITE, + XML_STATE_CDATA, + XML_STATE_STAG, + XML_STATE_ETAG, + XML_STATE_COMMENT, + XML_STATE_PI, + XML_STATE_EOF, + XML_STATE_FATAL, + + /* Internal states */ + XML_STATE_CHARS_BEFORE_STAG, + XML_STATE_CHARS_BEFORE_ETAG, + XML_STATE_CHARS_BEFORE_CDATA, + XML_STATE_CHARS_BEFORE_PI, + XML_STATE_CHARS_BEFORE_COMMENT, + XML_STATE_PROLOG_PI, + XML_STATE_PROLOG_COMMENT, + XML_STATE_EPILOG_PI, + XML_STATE_EPILOG_COMMENT, +}; + +enum xml_want { /* Bit masks: XML_WANT_x == 1 << XML_STATE_x */ + XML_WANT_DECL = 1 << XML_STATE_DECL, + XML_WANT_DOCUMENT_TYPE = 1 << XML_STATE_DOCUMENT_TYPE, + XML_WANT_CHARS = 1 << XML_STATE_CHARS, + XML_WANT_WHITE = 1 << XML_STATE_WHITE, + XML_WANT_CDATA = 1 << XML_STATE_CDATA, + XML_WANT_STAG = 1 << XML_STATE_STAG, + XML_WANT_ETAG = 1 << XML_STATE_ETAG, + XML_WANT_COMMENT = 1 << XML_STATE_COMMENT, + XML_WANT_PI = 1 << XML_STATE_PI, + XML_WANT_EOF = 1 << XML_STATE_EOF, + XML_WANT_ALL = ~0U, /* All bits set; NOTE(review): ~0U does not fit in int, which ISO C before C23 requires of enumerator values -- confirm the compiler accepts this */ +}; + +enum xml_flags { /* Bits of xml_context.flags */ + XML_FLAG_VALIDATING = 0x1, + XML_FLAG_VERSION_1_1 = 0x2, +}; + +struct xml_ext_id { /* External ID: system and public identifiers */ + char *system_id; + char *public_id; +}; + +enum xml_node_type { /* XML_NODE_x values of xml_node.type */ + XML_NODE_ELEM, + XML_NODE_COMMENT, + 
XML_NODE_CDATA, + XML_NODE_PI, +}; + +#define XML_BUF_SIZE 32 /* xml_source.buf holds 2 * XML_BUF_SIZE u32 items */ + +struct xml_source { + struct xml_source *next; /* Linked list of pending fastbufs (xml_context.sources) */ + struct fastbuf *fb; /* Underlying fastbuf */ + u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */ + u32 *bptr, *bstop; /* Current state of the buffer */ + uns depth; + uns flags; /* presumably XML_SRC_x -- TODO confirm */ +}; + +enum xml_source_flags { + XML_SRC_DECL = 0x1, /* Expected document/text declaration */ + XML_SRC_EOF = 0x2, /* Reached the end of the fastbuf */ + XML_SRC_NEW_LINE = 0x4, /* The last read character is 0xD */ + XML_SRC_SURROUND = 0x8, /* Surround the text with 0x20 (references to parameter entities) */ + XML_SRC_DOCUMENT = 0x10, /* The document entity */ + XML_SRC_EXTERNAL = 0x20, /* An external entity */ +}; + +#if 0 /* Compiled out: node/element tree structures */ +struct xml_node { + cnode n; /* Node for list of parent's sons */ + uns type; /* XML_NODE_x */ + struct xml_node *parent; /* Parent node */ +}; + +struct xml_elem { + struct xml_node node; + char *name; /* Element name */ + clist sons; /* List of subnodes */ + struct xml_dtd_elem *dtd; /* Element DTD */ + slist attrs; /* Linked list of attributes */ +}; +#endif + +struct xml_context { + /* Error handling */ + char *err_msg; /* Last error message */ + enum xml_error err_code; /* Last error code */ + void *throw_buf; /* Where to jump on error */ + void (*h_warn)(struct xml_context *ctx); /* Warning callback */ + void (*h_error)(struct xml_context *ctx); /* Recoverable error callback */ + void (*h_fatal)(struct xml_context *ctx); /* Unrecoverable error callback */ + + /* Memory management */ + struct mempool *pool; /* Most data */ + struct fastbuf *chars; /* Character data */ + struct fastbuf *value; /* Attribute value / comment / processing instruction data */ + char *name; /* Attribute name, processing instruction target */ + + /* Input */ + struct xml_source *sources; /* Stack of pending sources */ + u32 *bptr, *bstop; /* Character buffer */ + uns depth; /* Nesting level */ + + /* SAX-like 
interface */ + void (*h_document_start)(struct xml_context *ctx); /* Called before entering prolog */ + void (*h_document_end)(struct xml_context *ctx); /* Called after leaving epilog */ + void (*h_xml_decl)(struct xml_context *ctx); /* Called after the XML declaration */ + void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration just before internal subset */ + void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction */ + void (*h_comment)(struct xml_context *ctx); /* Called after a comment */ + + /* Miscellaneous parsing data (section was left untitled) */ + struct xml_node *node; /* Current XML node */ + uns flags; /* XML_FLAG_x */ + struct xml_element *element; /* Current element */ + void *attribute_table; + char *version_str; + char *encoding; + uns standalone; + char *document_type; + struct xml_dtd *dtd; + struct xml_ext_id eid; + uns state; /* presumably XML_STATE_x -- TODO confirm */ + uns want; /* presumably a mask of XML_WANT_x -- TODO confirm */ + + void (*start_dtd)(struct xml_context *ctx); + void (*end_dtd)(struct xml_context *ctx); + void (*start_element)(struct xml_context *ctx); + void (*end_element)(struct xml_context *ctx); + void (*start_cdata)(struct xml_context *ctx); + void (*end_cdata)(struct xml_context *ctx); + void (*start_entity)(struct xml_context *ctx); + void (*end_entity)(struct xml_context *ctx); + void (*chacacters)(struct xml_context *ctx); /* NOTE(review): name looks like a typo of "characters" -- check users before renaming */ + struct fastbuf *(*resolve_entity)(struct xml_context *ctx); + void (*notation_decl)(struct xml_context *ctx); + void (*unparsed_entity_decl)(struct xml_context *ctx); +}; + +struct xml_attribute { + char *name; + char *value; + struct xml_element *element; + struct xml_attribute *next; + struct xml_dtd_attribute *dtd; /* NOTE(review): this header defines struct xml_dtd_attr, not xml_dtd_attribute -- confirm the intended type */ +}; + +struct xml_element { + char *name; + struct xml_attribute *attrs; + struct xml_element *parent; + struct xml_dtd_element *dtd; /* NOTE(review): this header defines struct xml_dtd_elem, not xml_dtd_element -- confirm the intended type */ +}; + +/*** Document Type Definition (DTD) ***/ + +struct xml_dtd { + slist gents; /* Linked list of general entities */ + slist pents; /* Linked list of parameter entities */ + slist notns; /* Linked list of notations */ + slist elems; /* Linked list 
of elements */ + void *tab_gents; /* Hash table of general entities */ + void *tab_pents; /* Hash table of parameter entities */ + void *tab_notns; /* Hash table of notations */ + void *tab_elems; /* Hash table of elements */ + void *tab_attrs; /* Hash table of element attributes */ + void *tab_evals; /* Hash table of enumerated attribute values */ + void *tab_enotns; /* Hash table of enumerated attribute notations */ +}; + +/* Notations */ + +enum xml_dtd_notn_flags { + XML_DTD_NOTN_DECLARED = 0x1, /* The notation has been declared (internal usage) */ +}; + +struct xml_dtd_notn { + snode n; /* Node in xml_dtd.notns */ + uns flags; /* XML_DTD_NOTN_x */ + char *name; /* Notation name */ + struct xml_ext_id eid; /* External ID */ +}; + +/* Entities */ + +enum xml_dtd_ent_flags { + XML_DTD_ENT_DECLARED = 0x1, /* The entity has been declared (internal usage) */ + XML_DTD_ENT_VISITED = 0x2, /* Cycle detection (internal usage) */ + XML_DTD_ENT_PARAMETER = 0x4, /* Parameter entity, general otherwise */ + XML_DTD_ENT_EXTERNAL = 0x8, /* External entity, internal otherwise */ + XML_DTD_ENT_UNPARSED = 0x10, /* Unparsed entity, parsed otherwise */ + XML_DTD_ENT_TRIVIAL = 0x20, /* Replacement text is a sequence of characters and character references */ +}; + +struct xml_dtd_ent { + snode n; /* Node in xml_dtd.[gp]ents */ + uns flags; /* XML_DTD_ENT_x */ + char *name; /* Entity name */ + char *text; /* Replacement text / expanded replacement text (XML_DTD_ENT_TRIVIAL) */ + uns len; /* Text length */ + struct xml_ext_id eid; /* External ID */ + struct xml_dtd_notn *notn; /* Notation (XML_DTD_ENT_UNPARSED only) */ +}; + +/* Elements */ + +enum xml_dtd_elem_flags { + XML_DTD_ELEM_DECLARED = 0x1, /* The element has been declared (internal usage) */ +}; + +struct xml_dtd_elem { + snode n; /* presumably a node in xml_dtd.elems -- TODO confirm */ + uns flags; /* presumably enum xml_dtd_elem_flags bits -- TODO confirm */ + char *name; /* presumably the element name -- TODO confirm */ + struct xml_dtd_elem_node *node; +}; + +struct xml_dtd_elem_node { + snode n; + struct xml_dtd_elem_node *parent; + slist sons; + uns type; /* presumably enum xml_dtd_elem_node_type -- TODO confirm */ + uns occur; /* presumably enum xml_dtd_elem_node_occur -- TODO confirm */ +}; + +enum 
xml_dtd_elem_node_type { /* presumably values of xml_dtd_elem_node.type -- TODO confirm */ + XML_DTD_ELEM_PCDATA, + XML_DTD_ELEM_SEQ, + XML_DTD_ELEM_OR, +}; + +enum xml_dtd_elem_node_occur { /* presumably values of xml_dtd_elem_node.occur (once, optional, zero-or-more, one-or-more) -- TODO confirm */ + XML_DTD_ELEM_OCCUR_ONCE, + XML_DTD_ELEM_OCCUR_OPT, + XML_DTD_ELEM_OCCUR_MULT, + XML_DTD_ELEM_OCCUR_PLUS, +}; + +/* Attributes */ + + +enum xml_dtd_attribute_default { /* Type of xml_dtd_attr.default_mode */ + XML_ATTR_NONE, + XML_ATTR_REQUIRED, + XML_ATTR_IMPLIED, + XML_ATTR_FIXED, +}; + +enum xml_dtd_attribute_type { /* Type of xml_dtd_attr.type */ + XML_ATTR_CDATA, + XML_ATTR_ID, + XML_ATTR_IDREF, + XML_ATTR_IDREFS, + XML_ATTR_ENTITY, + XML_ATTR_ENTITIES, + XML_ATTR_NMTOKEN, + XML_ATTR_NMTOKENS, + XML_ATTR_ENUM, + XML_ATTR_NOTATION, +}; + +struct xml_dtd_attr { + char *name; + struct xml_dtd_elem *elem; /* presumably the element the attribute is declared for -- TODO confirm */ + enum xml_dtd_attribute_type type; + enum xml_dtd_attribute_default default_mode; + char *default_value; +}; + +struct xml_dtd_eval { /* presumably an entry of xml_dtd.tab_evals -- TODO confirm */ + struct xml_dtd_attr *attr; + char *val; +}; + +struct xml_dtd_enotn { /* presumably an entry of xml_dtd.tab_enotns -- TODO confirm */ + struct xml_dtd_attr *attr; + struct xml_dtd_notn *notn; +}; + +void xml_init(struct xml_context *ctx); +void xml_cleanup(struct xml_context *ctx); +void xml_set_source(struct xml_context *ctx, struct fastbuf *fb); +int xml_next(struct xml_context *ctx); + +#endif -- 2.39.2