From: Pavel Charvat Date: Mon, 10 Dec 2007 09:40:08 +0000 (+0100) Subject: XML: Backuped incomplete XML parser. Changes mostly from friday. X-Git-Tag: holmes-import~496 X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;h=55449f070299728b5f4b5354b30e4c7e235873f4;p=libucw.git XML: Backuped incomplete XML parser. Changes mostly from friday. --- diff --git a/lib/Makefile b/lib/Makefile index 669f356b..52751e11 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -33,8 +33,7 @@ LIBUCW_MODS= \ qache \ string \ bbuf \ - getopt \ - xml + getopt LIBUCW_INCLUDES= \ lib.h config.h threads.h \ @@ -61,8 +60,7 @@ LIBUCW_INCLUDES= \ base64.h base224.h \ qache.h \ kmp.h kmp-search.h binsearch.h \ - partmap.h \ - xml.h + partmap.h ifdef CONFIG_UCW_THREADS # Some modules require threading @@ -88,11 +86,6 @@ $(o)/lib/libucw.so: $(addsuffix .oo,$(LIBUCW_MOD_PATHS)) $(o)/lib/hashfunc.o $(o)/lib/hashfunc.oo: CFLAGS += -funroll-loops $(o)/lib/lizard.o: CFLAGS += $(COPT2) -funroll-loops -$(o)/lib/xml.o: $(o)/lib/xml-ucat.h -$(o)/lib/xml-ucat.h: $(s)/lib/xml-ucat.pl - $(M)GEN $@ - $(Q)$< >$@ - $(o)/lib/db-test: $(o)/lib/db-test.o $(LIBUCW) $(o)/lib/db-tool: $(o)/lib/db-tool.o $(LIBUCW) $(o)/lib/conf-test: $(o)/lib/conf-test.o $(LIBUCW) diff --git a/lib/xml-ucat.pl b/lib/xml-ucat.pl deleted file mode 100755 index cbfb8d34..00000000 --- a/lib/xml-ucat.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/perl -# -# UCW Library -- Character map for the XML parser -# -# (c) 2007 Pavel Charvat -# -# This software may be freely distributed and used according to the terms -# of the GNU Lesser General Public License. 
-# - -my @cat = (); -my @lcat = (); -my %ids = (); -my %cls = (); -for (my $i = 0; $i < 0x10000; $i++) { $cat[$i] = 0; } -for (my $i = 0; $i < 0x11; $i++) { $lcat[$i] = 0; } - -my @white = (0x9, 0xA, 0xD, 0x20); -my @base_char_1_0 = ( - [0x0041,0x005A], [0x0061,0x007A], [0x00C0,0x00D6], [0x00D8,0x00F6], [0x00F8,0x00FF], [0x0100,0x0131], - [0x0134,0x013E], [0x0141,0x0148], [0x014A,0x017E], [0x0180,0x01C3], [0x01CD,0x01F0], [0x01F4,0x01F5], - [0x01FA,0x0217], [0x0250,0x02A8], [0x02BB,0x02C1], 0x0386, [0x0388,0x038A], 0x038C, [0x038E,0x03A1], - [0x03A3,0x03CE], [0x03D0,0x03D6], 0x03DA, 0x03DC, 0x03DE, 0x03E0, [0x03E2,0x03F3], [0x0401,0x040C], - [0x040E,0x044F], [0x0451,0x045C], [0x045E,0x0481], [0x0490,0x04C4], [0x04C7,0x04C8], [0x04CB,0x04CC], - [0x04D0,0x04EB], [0x04EE,0x04F5], [0x04F8,0x04F9], [0x0531,0x0556], 0x0559, [0x0561,0x0586], [0x05D0,0x05EA], - [0x05F0,0x05F2], [0x0621,0x063A], [0x0641,0x064A], [0x0671,0x06B7], [0x06BA,0x06BE], [0x06C0,0x06CE], - [0x06D0,0x06D3], 0x06D5, [0x06E5,0x06E6], [0x0905,0x0939], 0x093D, [0x0958,0x0961], [0x0985,0x098C], - [0x098F,0x0990], [0x0993,0x09A8], [0x09AA,0x09B0], 0x09B2, [0x09B6,0x09B9], [0x09DC,0x09DD], [0x09DF,0x09E1], - [0x09F0,0x09F1], [0x0A05,0x0A0A], [0x0A0F,0x0A10], [0x0A13,0x0A28], [0x0A2A,0x0A30], [0x0A32,0x0A33], - [0x0A35,0x0A36], [0x0A38,0x0A39], [0x0A59,0x0A5C], 0x0A5E, [0x0A72,0x0A74], [0x0A85,0x0A8B], 0x0A8D, - [0x0A8F,0x0A91], [0x0A93,0x0AA8], [0x0AAA,0x0AB0], [0x0AB2,0x0AB3], [0x0AB5,0x0AB9], 0x0ABD, 0x0AE0, - [0x0B05,0x0B0C], [0x0B0F,0x0B10], [0x0B13,0x0B28], [0x0B2A,0x0B30], [0x0B32,0x0B33], [0x0B36,0x0B39], - 0x0B3D, [0x0B5C,0x0B5D], [0x0B5F,0x0B61], [0x0B85,0x0B8A], [0x0B8E,0x0B90], [0x0B92,0x0B95], [0x0B99,0x0B9A], - 0x0B9C, [0x0B9E,0x0B9F], [0x0BA3,0x0BA4], [0x0BA8,0x0BAA], [0x0BAE,0x0BB5], [0x0BB7,0x0BB9], [0x0C05,0x0C0C], - [0x0C0E,0x0C10], [0x0C12,0x0C28], [0x0C2A,0x0C33], [0x0C35,0x0C39], [0x0C60,0x0C61], [0x0C85,0x0C8C], - [0x0C8E,0x0C90], [0x0C92,0x0CA8], [0x0CAA,0x0CB3], [0x0CB5,0x0CB9], 
0x0CDE, [0x0CE0,0x0CE1], [0x0D05,0x0D0C], - [0x0D0E,0x0D10], [0x0D12,0x0D28], [0x0D2A,0x0D39], [0x0D60,0x0D61], [0x0E01,0x0E2E], 0x0E30, [0x0E32,0x0E33], - [0x0E40,0x0E45], [0x0E81,0x0E82], 0x0E84, [0x0E87,0x0E88], 0x0E8A, 0x0E8D, [0x0E94,0x0E97], [0x0E99,0x0E9F], - [0x0EA1,0x0EA3], 0x0EA5, 0x0EA7, [0x0EAA,0x0EAB], [0x0EAD,0x0EAE], 0x0EB0, [0x0EB2,0x0EB3], 0x0EBD, - [0x0EC0,0x0EC4], [0x0F40,0x0F47], [0x0F49,0x0F69], [0x10A0,0x10C5], [0x10D0,0x10F6], 0x1100, [0x1102,0x1103], - [0x1105,0x1107], 0x1109, [0x110B,0x110C], [0x110E,0x1112], 0x113C, 0x113E, 0x1140, 0x114C, 0x114E, 0x1150, - [0x1154,0x1155], 0x1159, [0x115F,0x1161], 0x1163, 0x1165, 0x1167, 0x1169, [0x116D,0x116E], [0x1172,0x1173], - 0x1175, 0x119E, 0x11A8, 0x11AB, [0x11AE,0x11AF], [0x11B7,0x11B8], 0x11BA, [0x11BC,0x11C2], 0x11EB, 0x11F0, - 0x11F9, [0x1E00,0x1E9B], [0x1EA0,0x1EF9], [0x1F00,0x1F15], [0x1F18,0x1F1D], [0x1F20,0x1F45], [0x1F48,0x1F4D], - [0x1F50,0x1F57], 0x1F59, 0x1F5B, 0x1F5D, [0x1F5F,0x1F7D], [0x1F80,0x1FB4], [0x1FB6,0x1FBC], 0x1FBE, - [0x1FC2,0x1FC4], [0x1FC6,0x1FCC], [0x1FD0,0x1FD3], [0x1FD6,0x1FDB], [0x1FE0,0x1FEC], [0x1FF2,0x1FF4], - [0x1FF6,0x1FFC], 0x2126, [0x212A,0x212B], 0x212E, [0x2180,0x2182], [0x3041,0x3094], [0x30A1,0x30FA], - [0x3105,0x312C], [0xAC00,0xD7A3]); -my @ideographic_1_0 = ([0x4E00,0x9FA5], 0x3007, [0x3021,0x3029]); -my @combining_char_1_0 = ( - [0x0300,0x0345], [0x0360,0x0361], [0x0483,0x0486], [0x0591,0x05A1], [0x05A3,0x05B9], [0x05BB,0x05BD], - 0x05BF, [0x05C1,0x05C2], 0x05C4, [0x064B,0x0652], 0x0670, [0x06D6,0x06DC], [0x06DD,0x06DF], [0x06E0,0x06E4], - [0x06E7,0x06E8], [0x06EA,0x06ED], [0x0901,0x0903], 0x093C, [0x093E,0x094C], 0x094D, [0x0951,0x0954], - [0x0962,0x0963], [0x0981,0x0983], 0x09BC, 0x09BE, 0x09BF, [0x09C0,0x09C4], [0x09C7,0x09C8], [0x09CB,0x09CD], - 0x09D7, [0x09E2,0x09E3], 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, [0x0A40,0x0A42], [0x0A47,0x0A48], [0x0A4B,0x0A4D], - [0x0A70,0x0A71], [0x0A81,0x0A83], 0x0ABC, [0x0ABE,0x0AC5], [0x0AC7,0x0AC9], [0x0ACB,0x0ACD], 
[0x0B01,0x0B03], - 0x0B3C, [0x0B3E,0x0B43], [0x0B47,0x0B48], [0x0B4B,0x0B4D], [0x0B56,0x0B57], [0x0B82,0x0B83], [0x0BBE,0x0BC2], - [0x0BC6,0x0BC8], [0x0BCA,0x0BCD], 0x0BD7, [0x0C01,0x0C03], [0x0C3E,0x0C44], [0x0C46,0x0C48], [0x0C4A,0x0C4D], - [0x0C55,0x0C56], [0x0C82,0x0C83], [0x0CBE,0x0CC4], [0x0CC6,0x0CC8], [0x0CCA,0x0CCD], [0x0CD5,0x0CD6], - [0x0D02,0x0D03], [0x0D3E,0x0D43], [0x0D46,0x0D48], [0x0D4A,0x0D4D], 0x0D57, 0x0E31, [0x0E34,0x0E3A], - [0x0E47,0x0E4E], 0x0EB1, [0x0EB4,0x0EB9], [0x0EBB,0x0EBC], [0x0EC8,0x0ECD], [0x0F18,0x0F19], 0x0F35, - 0x0F37, 0x0F39, 0x0F3E, 0x0F3F, [0x0F71,0x0F84], [0x0F86,0x0F8B], [0x0F90,0x0F95], 0x0F97, [0x0F99,0x0FAD], - [0x0FB1,0x0FB7], 0x0FB9, [0x20D0,0x20DC], 0x20E1, [0x302A,0x302F], 0x3099, 0x309A); -my @digit_1_0 = ( - [0x0030,0x0039], [0x0660,0x0669], [0x06F0,0x06F9], [0x0966,0x096F], [0x09E6,0x09EF], [0x0A66,0x0A6F], - [0x0AE6,0x0AEF], [0x0B66,0x0B6F], [0x0BE7,0x0BEF], [0x0C66,0x0C6F], [0x0CE6,0x0CEF], [0x0D66,0x0D6F], - [0x0E50,0x0E59], [0x0ED0,0x0ED9], [0x0F20,0x0F29]); -my @extender_1_0 = ( - 0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005, [0x3031,0x3035], [0x309D,0x309E], [0x30FC,0x30FE]); -my @sname_1_1 = ( - "[:A-Z_a-z]", [0xC0,0xD6], [0xD8,0xF6], [0xF8,0x2FF], [0x370,0x37D], [0x37F,0x1FFF], - [0x200C,0x200D], [0x2070,0x218F], [0x2C00,0x2FEF], [0x3001,0xD7FF], [0xF900,0xFDCF], [0xFDF0,0xFFFD], [0x10000,0xEFFFF]); - -set("WHITE", @white); -set("NEW_LINE_1_0", 0xA, 0xD); -set("NEW_LINE_1_1", 0xA, 0xD, 0x85, 0x2028); -set("DIGIT", "[0-9]"); -set("XDIGIT", "[0-9a-fA-F]"); -set("VALID_1_0", @white, [0x20,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); -set("VALID_1_1", [0x1,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); -set("UNRESTRICTED_1_1", @white, [0x20,0x7E], 0x85, [0xA0,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); -set("PUBID", 0x20, 0xD, 0xA, "[-a-zA-Z0-9'()+,./:=?:!*#@\$_%]"); -set("ENC_SNAME", "[a-zA-Z]"); -set("ENC_NAME", "[-a-zA-Z0-9._]"); -set("SNAME_1_0", "[_:]", @base_char_1_0, 
@ideographic_1_0); -set("NAME_1_0", "[-_:.]", @base_char_1_0, @ideographic_1_0, @combining_char_1_0, @digit_1_0, @extender_1_0); -set("SNAME_1_1", @sname_1_1); -set("NAME_1_1", @sname_1_1, "[-.0-9]", 0xB7, [0x0300,0x036F], [0x203F,0x2040]); - -print "/* Automatically generated by xml-ucat.pl */\n\n"; -find_cls(); -gen_enum(); -gen_tabs(); - -sub set { - my $id = shift; - $ids{$id} = scalar keys(%ids) if !defined($ids{$id}); - my $mask = 1 << $ids{$id}; - foreach my $i (@_) { - if (ref($i) eq "ARRAY") { - my $j = $i->[0]; - for (; $j <= $i->[1] && $j < 0x10000; $j++) { $cat[$j] |= $mask; } - for (; $j <= $i->[1]; $j += 0x10000) { $lcat[$j >> 16] |= $mask; } - } - elsif ($i =~ /^\[/) { for (my $j=0; $j < 128; $j++) { if (chr($j) =~ /$i/) { $cat[$j] |= $mask; } } } - else { $cat[$i] |= $mask; } - } -} - -sub find_cls { - foreach (my $i=0; $i<@cat; $i++) { $cls{$cat[$i]} = scalar keys(%cls) if !defined($cls{$cat[$i]}); } - foreach (my $i=0; $i<@lcat; $i++) { $cls{$lcat[$i]} = scalar keys(%cls) if !defined($cls{$lcat[$i]}); } -} - -sub gen_enum { - print "enum xml_char_type {\n"; - foreach my $id (sort keys %ids) { - my $mask = 0; - foreach my $i (keys %cls) { - $mask |= 1 << $cls{$i} if $cls{$i} && ($i & (1 << $ids{$id})); - } - printf " XML_CHAR_%-20s = 0x%08x,\n", $id, $mask; - } - print "};\n\n"; -} - -sub gen_tabs { - my @tab = (); - my %hash = (); - print "static const uns xml_char_tab1[] = {\n "; - for (my $t=0; $t<256; $t++) { - my $i = $t * 256; - my @x = (); - for (my $j=0; $j<256; $j += 32) { - push @x, join(",", map($cls{$_}, @cat[$i+$j..$i+$j+31])); - } - my $sub = " " . join(",\n ", @x); - if (!defined($hash{$sub})) { - $hash{$sub} = 256 * scalar @tab; - push @tab, $sub; - } - printf("0x%x", $hash{$sub}); - print((~$t & 15) ? "," : ($t < 255) ? 
",\n " : "\n};\n\n"); - } - - print "static const byte xml_char_tab2[] = {\n"; - print join(",\n\n", @tab); - print "\n};\n\n"; - - my @l = (); - for (my $i=0; $i<0x11; $i++) { - push @l, sprintf("%d", $cls{$lcat[$i]}); - } - print "static const byte xml_char_tab3[] = {" . join(",", @l) . "};\n"; -} diff --git a/lib/xml.c b/lib/xml.c deleted file mode 100644 index 828b4c15..00000000 --- a/lib/xml.c +++ /dev/null @@ -1,2099 +0,0 @@ -/* - * UCW Library -- A simple XML parser - * - * (c) 2007 Pavel Charvat - * - * This software may be freely distributed and used according to the terms - * of the GNU Lesser General Public License. - */ - -/* TODO: - * - various character encodings - * - iface - * - stack-like memory handling where possible - */ - -#define LOCAL_DEBUG - -#include "lib/lib.h" -#include "lib/mempool.h" -#include "lib/fastbuf.h" -#include "lib/ff-utf8.h" -#include "lib/chartype.h" -#include "lib/unicode.h" -#include "lib/xml.h" -#include "lib/hashfunc.h" -#include "lib/stkstring.h" -#include "charset/unicat.h" - -#include - -/*** Error handling ***/ - -static void NONRET -xml_throw(struct xml_context *ctx) -{ - ASSERT(ctx->err_code && ctx->throw_buf); - longjmp(*(jmp_buf *)ctx->throw_buf, ctx->err_code); -} - -static void -xml_warn(struct xml_context *ctx, const char *format, ...) -{ - if (ctx->h_warn) - { - va_list args; - va_start(args, format); - ctx->err_msg = stk_vprintf(format, args); - ctx->err_code = XML_ERR_WARN; - va_end(args); - ctx->h_warn(ctx); - ctx->err_msg = NULL; - ctx->err_code = XML_ERR_OK; - } -} - -static void -xml_error(struct xml_context *ctx, const char *format, ...) -{ - if (ctx->h_error) - { - va_list args; - va_start(args, format); - ctx->err_msg = stk_vprintf(format, args); - ctx->err_code = XML_ERR_ERROR; - va_end(args); - ctx->h_error(ctx); - ctx->err_msg = NULL; - ctx->err_code = XML_ERR_OK; - } -} - -static void NONRET -xml_fatal(struct xml_context *ctx, const char *format, ...) 
-{ - va_list args; - va_start(args, format); - ctx->err_msg = mp_vprintf(ctx->pool, format, args); - ctx->err_code = XML_ERR_FATAL; - ctx->state = XML_STATE_FATAL; - va_end(args); - if (ctx->h_fatal) - ctx->h_fatal(ctx); - xml_throw(ctx); -} - -/*** Charecter categorization ***/ - -#include "obj/lib/xml-ucat.h" - -static inline uns -xml_char_cat(uns c) -{ - if (c < 0x10000) - return 1U << xml_char_tab2[(c & 0xff) + xml_char_tab1[c >> 8]]; - else if (likely(c < 0x110000)) - return 1U << xml_char_tab3[c >> 16]; - else - return 1; -} - -/*** Reading of document/external entities ***/ - -static void NONRET -xml_eof(struct xml_context *ctx) -{ - ctx->err_msg = "Unexpected EOF"; - ctx->err_code = XML_ERR_EOF; - xml_throw(ctx); -} - -static void NONRET -xml_fatal_nested(struct xml_context *ctx) -{ - xml_fatal(ctx, "Entity is not tested correctly"); -} - -static inline void -xml_inc_depth(struct xml_context *ctx) -{ - ctx->depth++; -} - -static inline void -xml_dec_depth(struct xml_context *ctx) -{ - if (unlikely(!ctx->depth)) - xml_fatal_nested(ctx); - ctx->depth--; -} - -static void -xml_push_source(struct xml_context *ctx, struct fastbuf *fb, uns flags) -{ - DBG("XML: xml_push_source"); - struct xml_source *osrc = ctx->sources; - if (osrc) - { - osrc->bptr = ctx->bptr; - osrc->bstop = ctx->bstop; - osrc->depth = ctx->depth; - } - struct xml_source *src = mp_alloc(ctx->pool, sizeof(*src)); - src->next = osrc; - src->flags = flags; - src->fb = fb; - ctx->depth = 0; - ctx->sources = src; - ctx->bstop = ctx->bptr = src->buf; - if (flags & XML_SRC_SURROUND) - { - *ctx->bptr++ = 0x20; - *ctx->bptr++ = xml_char_cat(0x20); - } -} - -void -xml_set_source(struct xml_context *ctx, struct fastbuf *fb) -{ - xml_push_source(ctx, fb, XML_SRC_DOCUMENT | XML_SRC_DECL); -} - -static void -xml_pop_source(struct xml_context *ctx) -{ - DBG("XML: xml_pop_source"); - if (unlikely(ctx->depth)) - xml_fatal(ctx, "Invalid entity nesting"); - struct xml_source *src = ctx->sources; - 
bclose(src->fb); - ctx->sources = src = src->next; - if (unlikely(!src)) - xml_eof(ctx); - ctx->bptr = src->bptr; - ctx->bstop = src->bstop; - ctx->depth = src->depth; -} - -static uns -xml_error_restricted(struct xml_context *ctx, uns c) -{ - xml_error(ctx, "Restricted char U+%04X", c); - return UNI_REPLACEMENT; -} - -static void xml_parse_decl(struct xml_context *ctx); - -static void -xml_refill(struct xml_context *ctx) -{ - // FIXME: - // -- various encodings, especially UTF-16 - // -- track col/row numbers - // -- report incorrect encoding - // -- deal with forbidden XML 1.1 newlines in xml/text decl - do - { - struct xml_source *src = ctx->sources; - uns c, t, t1, t2, f = src->flags; - if (f & XML_SRC_EOF) - xml_pop_source(ctx); - else if (f & XML_SRC_DECL) - xml_parse_decl(ctx); - else - { - struct fastbuf *fb = src->fb; - if (ctx->bptr == ctx->bstop) - ctx->bptr = ctx->bstop = src->buf; - u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, *last_0xd = (f & XML_SRC_NEW_LINE) ? 
bstop : bend; - if (ctx->flags & XML_FLAG_VERSION_1_1) - { - t2 = XML_CHAR_NEW_LINE_1_1; - t1 = XML_CHAR_UNRESTRICTED_1_1 & ~t2; - } - else - { - t2 = XML_CHAR_NEW_LINE_1_0; - t1 = XML_CHAR_VALID_1_0 & ~t2; - } - while (bstop < bend) - { - c = bget_utf8_32(fb); - t = xml_char_cat(c); - if (t & t1) - { - /* Typical branch */ - *bstop++ = c; - *bstop++ = t; - } - else if (t & t2) - { - /* New line - * XML 1.0: 0xA | 0xD | 0xD 0xA - * XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ - *bstop++ = 0xa; - *bstop++ = xml_char_cat(0xa); - if (c == 0xd) - last_0xd = bstop; - else if (c != 0x2028 && last_0xd != bstop - 2) - bstop -= 2; - } - else if ((int)c >= 0) - { - /* Restricted character */ - c = xml_error_restricted(ctx, c); - *bstop++ = c; - *bstop++ = xml_char_cat(c); - } - else - { - /* EOF */ - if (f & XML_SRC_SURROUND) - { - *bstop++ = 0x20; - *bstop++ = xml_char_cat(0x20); - } - f |= XML_SRC_EOF; - break; - } - } - if (last_0xd == bstop) - f |= XML_SRC_NEW_LINE; - else - f &= ~XML_SRC_NEW_LINE; - ctx->sources->flags = f; - ctx->bstop = bstop; - DBG("XML: refilled %u characters", (uns)(ctx->bstop - ctx->bptr) / 2); - } - } - while (ctx->bptr == ctx->bstop); -} - -static inline uns -xml_peek_char(struct xml_context *ctx) -{ - if (ctx->bptr == ctx->bstop) - xml_refill(ctx); - return ctx->bptr[0]; -} - -static inline uns -xml_peek_cat(struct xml_context *ctx) -{ - if (ctx->bptr == ctx->bstop) - xml_refill(ctx); - return ctx->bptr[1]; -} - -static inline uns -xml_get_char(struct xml_context *ctx) -{ - uns c = xml_peek_char(ctx); - ctx->bptr += 2; - return c; -} - -static inline uns -xml_get_cat(struct xml_context *ctx) -{ - uns c = xml_peek_cat(ctx); - ctx->bptr += 2; - return c; -} - -static inline uns -xml_last_char(struct xml_context *ctx) -{ - return ctx->bptr[-2]; -} - -static inline uns -xml_last_cat(struct xml_context *ctx) -{ - return ctx->bptr[-1]; -} - -static inline uns -xml_skip_char(struct xml_context *ctx) -{ - uns c = ctx->bptr[0]; - ctx->bptr 
+= 2; - return c; -} - -static inline uns -xml_unget_char(struct xml_context *ctx) -{ - return *(ctx->bptr -= 2); -} - -/*** Basic parsing ***/ - -static void NONRET -xml_fatal_expected(struct xml_context *ctx, uns c) -{ - xml_fatal(ctx, "Expected '%c'", c); -} - -static void NONRET -xml_fatal_expected_white(struct xml_context *ctx) -{ - xml_fatal(ctx, "Expected a white space"); -} - -static void NONRET -xml_fatal_expected_quot(struct xml_context *ctx) -{ - xml_fatal(ctx, "Expected a quotation mark"); -} - -static inline uns -xml_parse_white(struct xml_context *ctx, uns mandatory) -{ - /* mandatory=1 -> S ::= (#x20 | #x9 | #xD | #xA)+ - * mandatory=0 -> S? */ - uns cnt = 0; - while (xml_peek_cat(ctx) & XML_CHAR_WHITE) - { - xml_skip_char(ctx); - cnt++; - } - if (unlikely(mandatory && !cnt)) - xml_fatal_expected_white(ctx); - return cnt; -} - -static inline void -xml_parse_char(struct xml_context *ctx, uns c) -{ - /* Consumes a given Unicode character */ - if (unlikely(c != xml_get_char(ctx))) - xml_fatal_expected(ctx, c); -} - -static inline void -xml_parse_seq(struct xml_context *ctx, const char *seq) -{ - /* Consumes a given sequence of ASCII characters */ - while (*seq) - xml_parse_char(ctx, *seq++); -} - -static void -xml_parse_eq(struct xml_context *ctx) -{ - /* Eq ::= S? '=' S? 
*/ - xml_parse_white(ctx, 0); - xml_parse_char(ctx, '='); - xml_parse_white(ctx, 0); -} - -static inline uns -xml_parse_quote(struct xml_context *ctx) -{ - /* "'" | '"' */ - uns c = xml_get_char(ctx); - if (unlikely(c != '\'' && c != '\"')) - xml_fatal_expected_quot(ctx); - return c; -} - -/* Names and nmtokens */ - -static char * -xml_parse_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err) -{ - char *p = mp_start_noalign(ctx->pool, 1); - if (unlikely(!(xml_peek_cat(ctx) & first_cat))) - xml_fatal(ctx, "%s", err); - do - { - p = mp_spread(ctx->pool, p, 5); - p = utf8_32_put(p, xml_skip_char(ctx)); - } - while (xml_peek_cat(ctx) & next_cat); - *p++ = 0; - return mp_end(ctx->pool, p); -} - -static void -xml_skip_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err) -{ - if (unlikely(!(xml_get_cat(ctx) & first_cat))) - xml_fatal(ctx, "%s", err); - while (xml_peek_cat(ctx) & next_cat) - xml_skip_char(ctx); -} - -static char * -xml_parse_name(struct xml_context *ctx) -{ - /* Name ::= NameStartChar (NameChar)* */ - return xml_parse_string(ctx, - !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1, - !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1, - "Expected a name"); -} - -static void -xml_skip_name(struct xml_context *ctx) -{ - xml_skip_string(ctx, - !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1, - !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1, - "Expected a name"); -} - -static char * -xml_parse_nmtoken(struct xml_context *ctx) -{ - /* Nmtoken ::= (NameChar)+ */ - uns cat = !(ctx->flags & XML_FLAG_VERSION_1_1) ? 
XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1; - return xml_parse_string(ctx, cat, cat, "Expected a nmtoken"); -} - -/* Simple literals */ - -static char * -xml_parse_system_literal(struct xml_context *ctx) -{ - /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */ - char *p = mp_start_noalign(ctx->pool, 1); - uns q = xml_parse_quote(ctx), c; - while ((c = xml_get_char(ctx)) != q) - { - p = mp_spread(ctx->pool, p, 5); - p = utf8_32_put(p, c); - } - *p++ = 0; - return mp_end(ctx->pool, p); -} - -static char * -xml_parse_pubid_literal(struct xml_context *ctx) -{ - /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */ - char *p = mp_start_noalign(ctx->pool, 1); - uns q = xml_parse_quote(ctx), c; - while ((c = xml_get_char(ctx)) != q) - { - if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID))) - xml_fatal(ctx, "Expected a pubid character"); - p = mp_spread(ctx->pool, p, 2); - *p++ = c; - } - *p++ = 0; - return mp_end(ctx->pool, p); -} - -static char * -xml_parse_encoding_name(struct xml_context *ctx) -{ - /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" */ - char *p = mp_start_noalign(ctx->pool, 1); - uns q = xml_parse_quote(ctx); - if (unlikely(!(xml_peek_cat(ctx) & XML_CHAR_ENC_SNAME))) - xml_fatal(ctx, "Invalid character in the encoding name"); - while(1) - { - p = mp_spread(ctx->pool, p, 2); - *p++ = xml_skip_char(ctx); - if (xml_get_char(ctx) == q) - break; - if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME))) - xml_fatal(ctx, "Invalid character in the encoding name"); - } - *p++ = 0; - return mp_end(ctx->pool, p); -} - -/* Document/external entity header */ - -static void -xml_detect_encoding(struct xml_context *ctx) -{ - DBG("XML: xml_detect_encoding"); - struct xml_source *src = ctx->sources; - struct fastbuf *fb = src->fb; - char *detected_encoding = NULL; - uns x = 0, l = 0, c, z = 1; - while (l < 4) - { - if (!~(c = bgetc(fb))) - { - src->flags |= XML_SRC_EOF; - break; - } - else if (!c || c >= 
0xfe || c == 0xa7 || c == 0x94) - z = 0; - else if ((c < 0x3c || c > 0x78)) - { - bungetc(fb); - break; - } - x = (x << 8) + c; - l++; - } - if (z) - z = x; - else if (l == 2) - switch (x) - { - case 0xFEFF: - xml_fatal(ctx, "UTF-16BE encoding not supported"); - case 0xFFFE: - xml_fatal(ctx, "UTF-16LE encoding not supported"); - default: - goto cannot_detect; - } - else if (l == 4) - switch (x) - { - case 0x0000FEFF: - xml_fatal(ctx, "UCS-4BE encoding not supported"); - case 0xFFFE0000: - xml_fatal(ctx, "UCS-4LE encoding not supported"); - case 0x0000FFFE: - xml_fatal(ctx, "UCS-4 encoding (order 2143) not supported"); - case 0xFEFF0000: - xml_fatal(ctx, "UCS-4 encoding (order 3412) not supported"); - case 0x0000003c: - xml_fatal(ctx, "UCS-4BE encoding not supported"); - case 0x3c000000: - xml_fatal(ctx, "UCS-4LE encoding not supported"); - case 0x00003c00: - xml_fatal(ctx, "UCS-4 encoding (order 2143) not supported"); - case 0x003c0000: - xml_fatal(ctx, "UCS-4 encoding (order 3412) not supported"); - case 0x003c003F: - xml_fatal(ctx, "UTF-16BE encoding not supported"); - case 0x3C003F00: - xml_fatal(ctx, "UTF-16LE encoding not supported"); - case 0x3C3F786D: - xml_fatal(ctx, "EBCDIC encoding not supported"); - default: - goto cannot_detect; - } - else -cannot_detect: - xml_fatal(ctx, "Cannot detect the encoding"); - ctx->bptr = ctx->bstop = src->buf + 8; - while (z) - { - c = z & 0xff; - z >>= 8; - *--ctx->bptr = xml_char_cat(c); - *--ctx->bptr = c; - } - if (!detected_encoding && ctx->bstop == ctx->bptr && xml_peek_char(ctx) == 0xfeff) - xml_skip_char(ctx); - DBG("XML: Detected encoding: %s", detected_encoding ? 
: "UTF-8"); - if (!(src->flags & XML_SRC_EOF)) - xml_refill(ctx); -} - -static void -xml_parse_decl(struct xml_context *ctx) -{ - DBG("XML: xml_parse_decl"); - ctx->sources->flags &= ~XML_SRC_DECL; - xml_detect_encoding(ctx); - uns document = ctx->sources->flags & XML_SRC_DOCUMENT; - u32 *bptr = ctx->bptr; - uns have_decl = - (12 <= ctx->bstop - ctx->bptr && - bptr[0] == '<' && bptr[2] == '?' && (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L' && - (bptr[11] & XML_CHAR_WHITE)); - if (!have_decl) - { - if (document) - xml_fatal(ctx, "Missing or corrupted XML declaration header"); - return; - } - ctx->bptr += 12; - - /* FIXME: the header must not contain exotic newlines */ - xml_parse_white(ctx, 0); - - if (xml_peek_char(ctx) == 'v') - { - xml_parse_seq(ctx, "version"); - xml_parse_eq(ctx); - char *version = xml_parse_pubid_literal(ctx); - DBG("XML: Version=%s", version); - if (document) - { - ctx->version_str = version; - if (!strcmp(ctx->version_str, "1.0")) - ; - else if (!strcmp(ctx->version_str, "1.1")) - ctx->flags |= XML_FLAG_VERSION_1_1; - else - xml_fatal(ctx, "Unsupported XML version"); - } - else if (strcmp(version, ctx->version_str)) - xml_error(ctx, "Mixed XML versions"); - } - else if (document) - xml_fatal(ctx, "Missing XML version"); - - // FIXME: TextDecl must contain encoding - if (!xml_parse_white(ctx, 0)) - goto end; - if (xml_peek_char(ctx) == 'e') - { - xml_parse_seq(ctx, "encoding"); - xml_parse_eq(ctx); - ctx->encoding = xml_parse_encoding_name(ctx); - DBG("encoding=%s", ctx->encoding); - // FIXME: check encoding - if (!xml_parse_white(ctx, 0)) - goto end; - } - - if (document && xml_peek_char(ctx) == 's') - { - xml_parse_seq(ctx, "standalone"); - xml_parse_eq(ctx); - uns c = xml_parse_quote(ctx); - if (ctx->standalone = (xml_peek_char(ctx) == 'y')) - xml_parse_seq(ctx, "yes"); - else - xml_parse_seq(ctx, "no"); - xml_parse_char(ctx, c); - DBG("standalone=%d", ctx->standalone); - xml_parse_white(ctx, 0); - } -end: 
- xml_parse_seq(ctx, "?>"); -} - -/*** Document Type Definition (DTD) ***/ - -/* Notations */ - -#define HASH_PREFIX(x) xml_dtd_notns_##x -#define HASH_NODE struct xml_dtd_notn -#define HASH_KEY_STRING name -#define HASH_AUTO_POOL 1024 -#define HASH_ZERO_FILL -#define HASH_TABLE_DYNAMIC -#define HASH_WANT_FIND -#define HASH_WANT_LOOKUP -#define HASH_WANT_CLEANUP -#include "lib/hashtable.h" - -/* General entities */ - -#define HASH_PREFIX(x) xml_dtd_ents_##x -#define HASH_NODE struct xml_dtd_ent -#define HASH_KEY_STRING name -#define HASH_AUTO_POOL 1024 -#define HASH_ZERO_FILL -#define HASH_TABLE_DYNAMIC -#define HASH_WANT_FIND -#define HASH_WANT_LOOKUP -#define HASH_WANT_CLEANUP -#include "lib/hashtable.h" - -static void -xml_dtd_declare_trivial_gent(struct xml_context *ctx, char *name, char *text) -{ - struct xml_dtd *dtd = ctx->dtd; - struct xml_dtd_ent *ent = xml_dtd_ents_lookup(dtd->tab_gents, name); - if (ent->flags & XML_DTD_ENT_DECLARED) - { - xml_warn(ctx, "Entity &%s; already declared", name); - return; - } - slist_add_tail(&dtd->gents, &ent->n); - ent->flags = XML_DTD_ENT_DECLARED | XML_DTD_ENT_TRIVIAL; - ent->text = text; -} - -static void -xml_dtd_declare_default_gents(struct xml_context *ctx) -{ - xml_dtd_declare_trivial_gent(ctx, "lt", "<"); - xml_dtd_declare_trivial_gent(ctx, "gt", ">"); - xml_dtd_declare_trivial_gent(ctx, "amp", "&"); - xml_dtd_declare_trivial_gent(ctx, "apos", "'"); - xml_dtd_declare_trivial_gent(ctx, "quot", "\""); -} - -static struct xml_dtd_ent * -xml_dtd_find_gent(struct xml_context *ctx, char *name) -{ - struct xml_dtd *dtd = ctx->dtd; - if (dtd) - { - struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_gents, name); - return (ent->flags & XML_DTD_ENT_DECLARED) ? 
ent : NULL; - } - else - { -#define ENT(n, t) ent_##n = { .name = #n, .text = t, .len = 1, .flags = XML_DTD_ENT_DECLARED | XML_DTD_ENT_TRIVIAL } - static struct xml_dtd_ent ENT(lt, "<"), ENT(gt, ">"), ENT(amp, "&"), ENT(apos, "'"), ENT(quot, "\""); -#undef ENT - switch (name[0]) - { - case 'l': - if (!strcmp(name, "lt")) - return &ent_lt; - break; - case 'g': - if (!strcmp(name, "gt")) - return &ent_gt; - break; - case 'a': - if (!strcmp(name, "amp")) - return &ent_amp; - if (!strcmp(name, "apos")) - return &ent_apos; - break; - case 'q': - if (!strcmp(name, "quot")) - return &ent_quot; - break; - } - return NULL; - } -} - -/* Parameter entities */ - -static struct xml_dtd_ent * -xml_dtd_find_pent(struct xml_context *ctx, char *name) -{ - struct xml_dtd *dtd = ctx->dtd; - struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_pents, name); - return (ent->flags & XML_DTD_ENT_DECLARED) ? ent : NULL; -} - -/* Elements */ - -#define HASH_PREFIX(x) xml_dtd_elems_##x -#define HASH_NODE struct xml_dtd_elem -#define HASH_KEY_STRING name -#define HASH_TABLE_DYNAMIC -#define HASH_AUTO_POOL 1024 -#define HASH_ZERO_FILL -#define HASH_WANT_LOOKUP -#define HASH_WANT_CLEANUP -#include "lib/hashtable.h" - -/* Element attributes */ - -struct xml_dtd_attrs_table; - -static inline uns -xml_dtd_attrs_hash(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem, char *name) -{ - return hash_pointer(elem) ^ hash_string(name); -} - -static inline int -xml_dtd_attrs_eq(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem1, char *name1, struct xml_dtd_elem *elem2, char *name2) -{ - return (elem1 == elem2) && !strcmp(name1, name2); -} - -static inline void -xml_dtd_attrs_init_key(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_elem *elem, char *name) -{ - attr->elem = elem; - attr->name = name; -} - -#define HASH_PREFIX(x) xml_dtd_attrs_##x -#define HASH_NODE struct xml_dtd_attr -#define HASH_AUTO_POOL 1024 -#define HASH_ZERO_FILL 
-#define HASH_TABLE_DYNAMIC -#define HASH_KEY_COMPLEX(x) x elem, x name -#define HASH_KEY_DECL struct xml_dtd_elem *elem, char *name -#define HASH_GIVE_HASHFN -#define HASH_GIVE_EQ -#define HASH_GIVE_INIT_KEY -#define HASH_WANT_FIND -#define HASH_WANT_NEW -#define HASH_WANT_CLEANUP -#include "lib/hashtable.h" - -/* Enumerated attribute values */ - -struct xml_dtd_evals_table; - -static inline uns -xml_dtd_evals_hash(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr, char *val) -{ - return hash_pointer(attr) ^ hash_string(val); -} - -static inline int -xml_dtd_evals_eq(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr1, char *val1, struct xml_dtd_attr *attr2, char *val2) -{ - return (attr1 == attr2) && !strcmp(val1, val2); -} - -static inline void -xml_dtd_evals_init_key(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_eval *eval, struct xml_dtd_attr *attr, char *val) -{ - eval->attr = attr; - eval->val = val; -} - -#define HASH_PREFIX(x) xml_dtd_evals_##x -#define HASH_NODE struct xml_dtd_eval -#define HASH_AUTO_POOL 1024 -#define HASH_TABLE_DYNAMIC -#define HASH_KEY_COMPLEX(x) x attr, x val -#define HASH_KEY_DECL struct xml_dtd_attr *attr, char *val -#define HASH_GIVE_HASHFN -#define HASH_GIVE_EQ -#define HASH_GIVE_INIT_KEY -#define HASH_WANT_FIND -#define HASH_WANT_NEW -#define HASH_WANT_CLEANUP -#include "lib/hashtable.h" - -/* Enumerated attribute notations */ - -struct xml_dtd_enotns_table; - -static inline uns -xml_dtd_enotns_hash(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn) -{ - return hash_pointer(attr) ^ hash_pointer(notn); -} - -static inline int -xml_dtd_enotns_eq(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr1, struct xml_dtd_notn *notn1, struct xml_dtd_attr *attr2, struct xml_dtd_notn *notn2) -{ - return (attr1 == attr2) && (notn1 == notn2); -} - -static inline void -xml_dtd_enotns_init_key(struct xml_dtd_enotns_table *tab UNUSED, struct 
xml_dtd_enotn *enotn, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn) -{ - enotn->attr = attr; - enotn->notn = notn; -} - -#define HASH_PREFIX(x) xml_dtd_enotns_##x -#define HASH_NODE struct xml_dtd_enotn -#define HASH_AUTO_POOL 1024 -#define HASH_TABLE_DYNAMIC -#define HASH_KEY_COMPLEX(x) x attr, x notn -#define HASH_KEY_DECL struct xml_dtd_attr *attr, struct xml_dtd_notn *notn -#define HASH_GIVE_HASHFN -#define HASH_GIVE_EQ -#define HASH_GIVE_INIT_KEY -#define HASH_WANT_FIND -#define HASH_WANT_NEW -#define HASH_WANT_CLEANUP -#include "lib/hashtable.h" - -/* DTD initialization/cleanup */ - -static void -xml_dtd_init(struct xml_context *ctx) -{ - ctx->dtd = mp_alloc_zero(ctx->pool, sizeof(*ctx->dtd)); - xml_dtd_ents_init(ctx->dtd->tab_gents = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_ents_table))); - xml_dtd_ents_init(ctx->dtd->tab_pents = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_ents_table))); - xml_dtd_notns_init(ctx->dtd->tab_notns = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_notns_table))); - xml_dtd_elems_init(ctx->dtd->tab_elems = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_elems_table))); - xml_dtd_attrs_init(ctx->dtd->tab_attrs = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_attrs_table))); - xml_dtd_evals_init(ctx->dtd->tab_evals = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_evals_table))); - xml_dtd_enotns_init(ctx->dtd->tab_enotns = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_enotns_table))); - xml_dtd_declare_default_gents(ctx); -} - -static void -xml_dtd_cleanup(struct xml_context *ctx) -{ - if (!ctx->dtd) - return; - xml_dtd_ents_cleanup(ctx->dtd->tab_gents); - xml_dtd_ents_cleanup(ctx->dtd->tab_pents); - xml_dtd_notns_cleanup(ctx->dtd->tab_notns); - xml_dtd_elems_cleanup(ctx->dtd->tab_elems); - xml_dtd_attrs_cleanup(ctx->dtd->tab_attrs); - xml_dtd_evals_cleanup(ctx->dtd->tab_evals); - xml_dtd_enotns_cleanup(ctx->dtd->tab_enotns); -} - -static void -xml_dtd_finish(struct xml_context *ctx) -{ - if (!ctx->dtd) - return; - 
// FIXME -} - -/*** Parsing functions ***/ - -/* Comments */ - -static void -xml_push_comment(struct xml_context *ctx) -{ - /* Parse a comment to ctx->value: - * Comment ::= '' - * Already parsed: 'value; - uns c; - xml_parse_char(ctx, '-'); - while (1) - { - if ((c = xml_get_char(ctx)) == '-') - if ((c = xml_get_char(ctx)) == '-') - break; - else - bputc(out, '-'); - bput_utf8_32(out, c); - } - xml_parse_char(ctx, '>'); - fbgrow_rewind(out); - if (ctx->h_comment) - ctx->h_comment(ctx); -} - -static void -xml_pop_comment(struct xml_context *ctx) -{ - fbgrow_rewind(ctx->value); -} - -static void -xml_skip_comment(struct xml_context *ctx) -{ - xml_parse_char(ctx, '-'); - while (xml_get_char(ctx) != '-' || xml_get_char(ctx) != '-'); - xml_parse_char(ctx, '>'); -} - -/* Processing instructions */ - -static void -xml_push_pi(struct xml_context *ctx) -{ - /* Parses a PI to ctx->value and ctx->name: - * PI ::= '' Char*)))? '?>' - * PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) - * Already parsed: 'name = xml_parse_name(ctx); - if (unlikely(!strcasecmp(ctx->name, "xml"))) - xml_fatal(ctx, "Reserved PI target"); - struct fastbuf *out = ctx->value; - if (xml_parse_white(ctx, 0)) - xml_parse_seq(ctx, "?>"); - else - { - while (1) - { - uns c; - if ((c = xml_get_char(ctx)) == '?') - if (xml_get_char(ctx) == '>') - break; - else - { - xml_unget_char(ctx); - bputc(out, '?'); - } - else - bput_utf8_32(out, c); - } - fbgrow_rewind(out); - } - if (ctx->h_pi) - ctx->h_pi(ctx); -} - -static void -xml_pop_pi(struct xml_context *ctx) -{ - fbgrow_reset(ctx->value); -} - -static void -xml_skip_pi(struct xml_context *ctx) -{ - if (ctx->flags & XML_FLAG_VALIDATING) - { - mp_push(ctx->pool); - if (unlikely(!strcasecmp(xml_parse_name(ctx), "xml"))) - xml_fatal(ctx, "Reserved PI target"); - mp_pop(ctx->pool); - if (!xml_parse_white(ctx, 0)) - { - xml_parse_seq(ctx, "?>"); - return; - } - } - while (1) - if (xml_get_char(ctx) == '?') - if (xml_get_char(ctx) == '>') - break; - else 
- xml_unget_char(ctx); -} - -/* Character references */ - -static uns -xml_parse_char_ref(struct xml_context *ctx) -{ - /* CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' - * Already parsed: '&#' */ - uns v = 0; - if (xml_get_char(ctx) == 'x') - { - if (!(xml_get_cat(ctx) & XML_CHAR_XDIGIT)) - { - xml_error(ctx, "Expected a hexadecimal value of character reference"); - goto recover; - } - do - { - v = (v << 4) + Cxvalue(xml_last_char(ctx)); - } - while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_XDIGIT)); - } - else - { - if (!(xml_last_cat(ctx) & XML_CHAR_DIGIT)) - { - xml_error(ctx, "Expected a numeric value of character reference"); - goto recover; - } - do - { - v = v * 10 + xml_last_char(ctx) - '0'; - } - while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT)); - } - uns cat = xml_char_cat(v); - if (!(cat & XML_CHAR_UNRESTRICTED_1_1) && ((ctx->flags & XML_FLAG_VERSION_1_1) || !(cat & XML_CHAR_VALID_1_0))) - { - xml_error(ctx, "Character reference out of range"); - goto recover; - } - if (xml_last_char(ctx) == ';') - return v; - xml_error(ctx, "Expected ';'"); -recover: - while (xml_last_char(ctx) != ';') - xml_get_char(ctx); - return UNI_REPLACEMENT; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////// - -static void -xml_parse_parameter_ref(struct xml_context *ctx) -{ - char *name = xml_parse_name(ctx); - xml_parse_char(ctx, ';'); - struct xml_dtd_ent *ent = xml_dtd_ents_find(ctx->dtd->tab_pents, name); - if (!ent || !(ent->flags & XML_DTD_ENT_DECLARED)) - { - xml_error(ctx, "Reference to unknown parameter entity %%%s", name); - return; - } - if (ent->flags & XML_DTD_ENT_VISITED) - { - xml_error(ctx, "Cycled references to parameter entity %%%s", name); - return; - } - if (ent->flags & XML_DTD_ENT_EXTERNAL) - { - // FIXME: - xml_error(ctx, "Support for external parsed entities not implemented"); - return; - } - ent->flags |= XML_DTD_ENT_VISITED; // FIXME: clear - struct fastbuf *fb = 
mp_alloc(ctx->pool, sizeof(*fb)); - fbbuf_init_read(fb, ent->text, ent->len, 0); - xml_push_source(ctx, fb, 0); -} - -static inline void -xml_check_parameter_ref(struct xml_context *ctx) -{ - if (xml_get_char(ctx) != '%') - { - xml_unget_char(ctx); - return; - } - xml_parse_parameter_ref(ctx); -} - -static void -xml_parse_external_id(struct xml_context *ctx, struct xml_ext_id *eid, uns allow_public) -{ - bzero(eid, sizeof(*eid)); - uns c = xml_get_char(ctx); - if (c == 'S') - { - xml_parse_seq(ctx, "YSTEM"); - xml_parse_white(ctx, 1); - eid->system_id = xml_parse_system_literal(ctx); - } - else if (c == 'P') - { - xml_parse_seq(ctx, "UBLIC"); - xml_parse_white(ctx, 1); - eid->public_id = xml_parse_pubid_literal(ctx); - if (xml_parse_white(ctx, 1)) - if ((c = xml_get_char(ctx)) == '\'' || c == '"' || !allow_public) - { - xml_unget_char(ctx); - eid->system_id = xml_parse_system_literal(ctx); - } - else - xml_unget_char(ctx); - } - else - xml_fatal(ctx, "Expected an external ID"); -} - -static void -xml_parse_notation_decl(struct xml_context *ctx) -{ - /* NotationDecl ::= ''*/ - xml_parse_white(ctx, 1); - struct xml_dtd_notn *notn = xml_dtd_notns_lookup(ctx->dtd->tab_notns, xml_parse_name(ctx)); - xml_parse_white(ctx, 1); - struct xml_ext_id eid; - xml_parse_external_id(ctx, &eid, 1); - xml_parse_white(ctx, 0); - xml_parse_char(ctx, '>'); - if (notn->flags & XML_DTD_NOTN_DECLARED) - xml_warn(ctx, "Notation %s already declared", notn->name); - else - { - notn->flags = XML_DTD_NOTN_DECLARED; - notn->eid = eid; - } -} - -static void -xml_parse_internal_subset(struct xml_context *ctx) -{ - while (1) - { - xml_parse_white(ctx, 0); - uns c = xml_get_char(ctx); - if (c == '<') - if ((c = xml_get_char(ctx)) == '!') - switch (c = xml_get_char(ctx)) - { - case '-': - xml_push_comment(ctx); - xml_pop_comment(ctx); - break; - case 'N': - xml_parse_seq(ctx, "OTATION"); - xml_parse_notation_decl(ctx); - break; - case 'E': - if ((c = xml_get_char(ctx)) == 'N') - { - 
xml_parse_seq(ctx, "TITY"); - //xml_parse_entity_decl(ctx); - } - else if (c == 'L') - { - xml_parse_seq(ctx, "EMENT"); - // FIXME: Element - } - else - goto invalid_markup; - break; - case 'A': - xml_parse_seq(ctx, "TTLIST"); - // FIXME: AttList - break; - default: - goto invalid_markup; - } - else if (c == '?') - { - xml_push_pi(ctx); - xml_pop_pi(ctx); - } - else - goto invalid_markup; - else if (c == '%') - xml_parse_parameter_ref(ctx); - else if (c == ']') - break; - else - goto invalid_markup; - } - return; -invalid_markup: - xml_fatal(ctx, "Invalid markup in the internal subset"); -} - -/*----------------------------------------------*/ - - -/* FIXME */ - -struct xml_attribute_table; - -#define HASH_PREFIX(x) xml_attribute_##x -#define HASH_NODE struct xml_attribute -#define HASH_KEY_COMPLEX(x) x element, x name -#define HASH_KEY_DECL struct xml_element *element, char *name -#define HASH_TABLE_DYNAMIC -#define HASH_AUTO_POOL 1024 - -#define HASH_GIVE_HASHFN - -static inline uns -xml_attribute_hash(struct xml_attribute_table *t UNUSED, struct xml_element *e, char *n) -{ - return hash_pointer(e) ^ hash_string(n); -} - -#define HASH_GIVE_EQ - -static inline int -xml_attribute_eq(struct xml_attribute_table *t UNUSED, struct xml_element *e1, char *n1, struct xml_element *e2, char *n2) -{ - return (e1 == e2) && !strcmp(n1, n2); -} - -#define HASH_GIVE_INIT_KEY - -static inline void -xml_attribute_init_key(struct xml_attribute_table *t UNUSED, struct xml_attribute *a, struct xml_element *e, char *name) -{ - a->element = e; - a->name = name; - a->value = NULL; - a->next = e->attrs; - e->attrs = a; -} - -#define HASH_WANT_CLEANUP -#define HASH_WANT_REMOVE -#define HASH_WANT_LOOKUP -#define HASH_WANT_FIND -#include "lib/hashtable.h" - - -/* -#define HASH_PREFIX(x) xml_parsed_entities_##x -#define HASH_NODE struct xml_parsed_entity -#define HASH_KEY_STRING name -#define HASH_TABLE_DYNAMIC -#define HASH_AUTO_POOL 1024 -#define HASH_WANT_CLEANUP -#include 
"lib/hashtable.h" -*/ - -void -xml_init(struct xml_context *ctx) -{ - bzero(ctx, sizeof(*ctx)); - ctx->pool = mp_new(65536); - ctx->chars = fbgrow_create(4096); - ctx->value = fbgrow_create(4096); - xml_dtd_init(ctx); -} - -void -xml_cleanup(struct xml_context *ctx) -{ - xml_dtd_cleanup(ctx); - bclose(ctx->value); - bclose(ctx->chars); - mp_delete(ctx->pool); -} - -static void -xml_parse_cdata(struct xml_context *ctx) -{ - struct fastbuf *out = ctx->chars; - xml_parse_seq(ctx, "CDATA["); - while (1) - { - uns c; - if ((c = xml_get_char(ctx)) == ']') - { - if ((c = xml_get_char(ctx)) == ']') - if ((c = xml_get_char(ctx)) == '>') - break; - else - bputc(out, ']'); - bputc(out, ']'); - } - bput_utf8_32(out, c); - } -} - -static void -xml_skip_cdata(struct xml_context *ctx) -{ - xml_parse_cdata(ctx); -} - -static void -xml_parse_ref_entity(struct xml_context *ctx UNUSED, struct fastbuf *out UNUSED, struct xml_dtd_ent *entity UNUSED) -{ -#if 0 - for (struct xml_dtd_ent_node *node = entity->list; node; node = node->next) - if (node->len) - bwrite(out, node->ptr, node->len); - else - xml_parse_ref_entity(ctx, out, node->ptr); // FIXME: do not call the recursion on stack -- could cause segfault -#endif -} - -static void -xml_parse_ref(struct xml_context *ctx, struct fastbuf *out) -{ - if (xml_get_char(ctx) == '#') - { - uns c = xml_parse_char_ref(ctx); - bput_utf8_32(out, c); - } - else - { -#if 0 - xml_unget_char(ctx); - mp_push(ctx->pool); - char *name = xml_parse_name(ctx); - struct xml_parsed_entity *entity = xml_find_parsed_entity(ctx, name); - mp_pop(ctx->pool); - xml_parse_char(ctx, ';'); - xml_parse_ref_entity(ctx, out, entity); -#endif - } -} - -static void -xml_parse_chars(struct xml_context *ctx) -{ - DBG("parse_chars"); - struct fastbuf *out = ctx->chars; - uns c; - while ((c = xml_get_char(ctx)) != '<') - if (c == '&') - xml_parse_ref(ctx, out); - else - bput_utf8_32(out, c); - xml_unget_char(ctx); -} - -static void -xml_parse_attr(struct xml_context *ctx) -{ 
- DBG("parse_attr"); - struct xml_element *e = ctx->element; - char *name = xml_parse_name(ctx); - struct xml_attribute *a = xml_attribute_lookup(ctx->attribute_table, e, name); - if (a->value) - xml_fatal(ctx, "Attribute is not unique"); - xml_parse_eq(ctx); - // FIXME - char *value = xml_parse_system_literal(ctx); - a->value = value; -} - -static uns -xml_parse_stag(struct xml_context *ctx) -{ - DBG("parse_stag"); - mp_push(ctx->pool); - struct xml_element *e = mp_alloc_zero(ctx->pool, sizeof(*e)); - e->parent = ctx->element; - ctx->element = e; - e->name = xml_parse_name(ctx); - while (1) - { - uns white = xml_parse_white(ctx, 0); - uns c = xml_get_char(ctx); - if (c == '/') - { - xml_parse_char(ctx, '>'); - return 1; - } - else if (c == '>') - return 0; - else if (!white) - xml_fatal(ctx, "Expected a white space"); - xml_unget_char(ctx); - xml_parse_attr(ctx); - } -} - -static void -xml_parse_etag(struct xml_context *ctx) -{ - DBG("parse_etag"); - struct xml_element *e = ctx->element; - ASSERT(e); - char *name = xml_parse_name(ctx); - if (strcmp(name, e->name)) - xml_fatal(ctx, "Invalid ETag, expected '%s'", e->name); - xml_parse_white(ctx, 0); - xml_parse_char(ctx, '>'); - // FIXME: remove on pooled hashtable? 
- for (struct xml_attribute *a = e->attrs; a; a = a->next) - xml_attribute_remove(ctx->attribute_table, a); - ctx->element = e->parent; - mp_pop(ctx->pool); -} - -static void -xml_parse_element_decl(struct xml_context *ctx) -{ - // FIXME - mp_push(ctx->pool); - xml_parse_seq(ctx, "'); - mp_pop(ctx->pool); -} - -#if 0 -static void -xml_parse_attr_list_decl(struct xml_context *ctx) -{ - /* AttlistDecl ::= '' - * AttDef ::= S Name S AttType S DefaultDecl */ - xml_parse_seq(ctx, "ATTLIST"); - xml_parse_white(ctx, 1); - struct xml_dtd_elem *e = xml_dtd_elems_lookup(ctx->dtd->tab_elems, xml_parse_name(ctx)); - e->attlist_declared = 1; - - while (xml_parse_white(ctx, 0) && xml_get_char(ctx) != '>') - { - xml_unget_char(ctx); - char *name = xml_parse_name(ctx); - struct xml_dtd_attr *a = xml_dtd_attrs_find(ctx->dtd->tab_attrs, e, name); - uns ignored = 0; - if (a) - { - xml_warn(ctx, "Duplicate attribute definition"); - ignored++; - } - else - a = xml_dtd_attrs_new(ctx->dtd->tab_attrs, e, name); - xml_parse_white(ctx, 1); - if (xml_get_char(ctx) == '(') - { - if (!ignored) - a->type = XML_ATTR_ENUM; - do - { - xml_parse_white(ctx, 0); - char *value = xml_parse_nmtoken(ctx); - if (!ignored) - if (xml_dtd_evals_find(ctx->dtd->tab_evals, a, value)) - xml_error(ctx, "Duplicate enumeration value"); - else - xml_dtd_evals_new(ctx->dtd->tab_evals, a, value); - xml_parse_white(ctx, 0); - } - while (xml_get_char(ctx) == '|'); - xml_unget_char(ctx); - xml_parse_char(ctx, ')'); - } - else - { - xml_unget_char(ctx); - char *type = xml_parse_name(ctx); - enum xml_dtd_attribute_type t; - if (!strcmp(type, "CDATA")) - t = XML_ATTR_CDATA; - else if (!strcmp(type, "ID")) - t = XML_ATTR_ID; - else if (!strcmp(type, "IDREF")) - t = XML_ATTR_IDREF; - else if (!strcmp(type, "IDREFS")) - t = XML_ATTR_IDREFS; - else if (!strcmp(type, "ENTITY")) - t = XML_ATTR_ENTITY; - else if (!strcmp(type, "ENTITIES")) - t = XML_ATTR_ENTITIES; - else if (!strcmp(type, "NMTOKEN")) - t = XML_ATTR_NMTOKEN; - else 
if (!strcmp(type, "NMTOKENS")) - t = XML_ATTR_NMTOKENS; - else if (!strcmp(type, "NOTATION")) - { - t = XML_ATTR_NOTATION; - xml_parse_white(ctx, 1); - xml_parse_char(ctx, '('); - do - { - xml_parse_white(ctx, 0); - struct xml_dtd_notn *n = xml_dtd_notns_lookup(ctx->dtd->tab_notns, xml_parse_name(ctx)); - if (!ignored) - if (xml_dtd_enotns_find(ctx->dtd->tab_enotns, a, n)) - xml_error(ctx, "Duplicate enumerated notation"); - else - xml_dtd_enotns_new(ctx->dtd->tab_enotns, a, n); - xml_parse_white(ctx, 0); - } - while (xml_get_char(ctx) == '|'); - xml_unget_char(ctx); - xml_parse_char(ctx, ')'); - } - else - xml_fatal(ctx, "Unknown attribute type"); - if (!ignored) - a->type = t; - } - xml_parse_white(ctx, 1); - enum xml_dtd_attribute_default def = XML_ATTR_NONE; - if (xml_get_char(ctx) == '#') - switch (xml_get_char(ctx)) - { - case 'R': - xml_parse_seq(ctx, "EQUIRED"); - def = XML_ATTR_REQUIRED; - break; - case 'I': - xml_parse_seq(ctx, "MPLIED"); - def = XML_ATTR_IMPLIED; - break; - case 'F': - xml_parse_seq(ctx, "IXED"); - def = XML_ATTR_FIXED; - break; - default: - xml_fatal(ctx, "Expected a modifier for default attribute value"); - } - else - xml_unget_char(ctx); - if (def != XML_ATTR_REQUIRED && def != XML_ATTR_IMPLIED) - { - xml_parse_system_literal(ctx); - // FIXME - } - } -} -#endif - -static void -xml_parse_entity_decl(struct xml_context *ctx) -{ - struct xml_dtd *dtd = ctx->dtd; - xml_parse_white(ctx, 1); - - uns flags = (xml_get_char(ctx) == '%') ? XML_DTD_ENT_PARAMETER : 0; - if (flags) - xml_parse_white(ctx, 1); - else - xml_unget_char(ctx); - - struct xml_dtd_ent *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_gents, xml_parse_name(ctx)); - slist *list = flags ? 
&dtd->pents : &dtd->gents; - xml_parse_white(ctx, 1); - if (ent->flags & XML_DTD_ENT_DECLARED) - { - xml_fatal(ctx, "Entity &%s; already declared, skipping not implemented", ent->name); - // FIXME: should be only warning - } - - uns sep = xml_get_char(ctx), c; - if (sep == '\'' || sep == '"') - { - /* Internal entity: - * EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&'] | PEReference | Reference)* "'" */ - struct fastbuf *out = ctx->value; - uns sep = c; - while (1) - { - if ((c = xml_get_char(ctx)) == sep) - break; - else if (c == '%') - { - // FIXME - ASSERT(0); - //xml_parse_parameter_ref(ctx); - } - else if (c != '&') - bput_utf8_32(out, c); - else if ((c = xml_get_char(ctx)) == '#') - c = xml_parse_char_ref(ctx); - else - { - /* Bypass references to general entities */ - mp_push(ctx->pool); - bputc(out, '&'); - xml_unget_char(ctx); - bputs(out, xml_parse_name(ctx)); - xml_parse_char(ctx, ';'); - bputc(out, ';'); - mp_pop(ctx->pool); - } - } - bputc(out, 0); - fbgrow_rewind(out); - slist_add_tail(list, &ent->n); - ent->flags = flags | XML_DTD_ENT_DECLARED; - ent->len = out->bstop - out->bptr - 1; - ent->text = mp_memdup(ctx->pool, out->bptr, ent->len + 1); - fbgrow_reset(out); - } - else - { - /* External entity */ - struct xml_ext_id eid; - struct xml_dtd_notn *notn = NULL; - xml_parse_external_id(ctx, &eid, 0); - if (!xml_parse_white(ctx, 0) || !flags) - xml_parse_char(ctx, '>'); - else if (xml_get_char(ctx) != '>') - { - /* General external unparsed entity */ - flags |= XML_DTD_ENT_UNPARSED; - xml_parse_seq(ctx, "NDATA"); - xml_parse_white(ctx, 1); - notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx)); - } - slist_add_tail(list, &ent->n); - ent->flags = flags | XML_DTD_ENT_DECLARED | XML_DTD_ENT_EXTERNAL; - ent->eid = eid; - ent->notn = notn; - } -} - -static void -xml_parse_doctype_decl(struct xml_context *ctx) -{ - if (ctx->document_type) - xml_fatal(ctx, "Multiple document types not allowed"); - xml_parse_seq(ctx, 
"DOCTYPE"); - xml_parse_white(ctx, 1); - ctx->document_type = xml_parse_name(ctx); - DBG("XML: DocumentType=%s", ctx->document_type); - uns white = xml_parse_white(ctx, 0); - uns c = xml_peek_char(ctx); - if (c != '>' && c != '[' && white) - { - xml_parse_external_id(ctx, &ctx->eid, 0); - xml_parse_white(ctx, 0); - } - if (ctx->h_doctype_decl) - ctx->h_doctype_decl(ctx); -} - -int -xml_next(struct xml_context *ctx) -{ - /* A nasty state machine */ - - DBG("XML: xml_next (state=%u)", ctx->state); - jmp_buf throw_buf; - ctx->throw_buf = &throw_buf; - if (setjmp(throw_buf)) - { -error: - if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal) - ctx->h_fatal(ctx); - ctx->state = XML_STATE_FATAL; - DBG("XML: raised fatal error"); - return -1; - } - uns c; - switch (ctx->state) - { - case XML_STATE_FATAL: - return -1; - - case XML_STATE_START: - DBG("XML: Entering Prolog"); - if (ctx->h_document_start) - ctx->h_document_start(ctx); - /* XMLDecl */ - xml_refill(ctx); - if (ctx->h_xml_decl) - ctx->h_xml_decl(ctx); - if (ctx->want & XML_WANT_DECL) - return ctx->state = XML_STATE_DECL; - case XML_STATE_DECL: - - /* Misc* (doctypedecl Misc*)? 
*/ - while (1) - { - xml_parse_white(ctx, 0); - xml_parse_char(ctx, '<'); - if ((c = xml_get_char(ctx)) == '?') - /* Processing intruction */ - if (!(ctx->want & XML_WANT_PI)) - xml_skip_pi(ctx); - else - { - xml_push_pi(ctx); - ctx->state = XML_STATE_PROLOG_PI; - return XML_STATE_PI; - case XML_STATE_PROLOG_PI: - xml_pop_pi(ctx); - } - else if (c != '!') - { - /* Found the root tag */ - xml_unget_char(ctx); - goto first_tag; - } - else if (xml_get_char(ctx) == '-') - if (!(ctx->want & XML_WANT_COMMENT)) - xml_skip_comment(ctx); - else - { - xml_push_comment(ctx); - ctx->state = XML_STATE_PROLOG_COMMENT; - return XML_STATE_COMMENT; - case XML_STATE_PROLOG_COMMENT: - xml_pop_comment(ctx); - } - else - { - /* DocTypeDecl */ - xml_unget_char(ctx); - xml_parse_doctype_decl(ctx); - if (ctx->want & XML_WANT_DOCUMENT_TYPE) - return ctx->state = XML_STATE_DOCUMENT_TYPE; - case XML_STATE_DOCUMENT_TYPE: - if (xml_peek_char(ctx) == '[') - { - xml_skip_char(ctx); - // FIXME - while (xml_get_char(ctx) != ']'); - xml_parse_white(ctx, 0); - } - xml_parse_char(ctx, '>'); - } - } - - case XML_STATE_PI: - mp_pop(ctx->pool); - case XML_STATE_COMMENT: - fbgrow_reset(ctx->value); - - case XML_STATE_CHARS: - - while (1) - { - if (xml_get_char(ctx) != '<') - { - /* CharData */ - xml_unget_char(ctx); - xml_parse_chars(ctx); - continue; - } -first_tag: ; - - if ((c = xml_get_char(ctx)) == '?') - { - /* PI */ - if (!(ctx->want & XML_WANT_PI)) - xml_skip_pi(ctx); - else - { - if (btell(ctx->chars)) - { - fbgrow_rewind(ctx->chars); - ctx->state = XML_STATE_CHARS_BEFORE_PI; - return XML_STATE_PI; - case XML_STATE_CHARS_BEFORE_PI: - fbgrow_reset(ctx->chars); - } - xml_push_pi(ctx); - return ctx->state = XML_STATE_PI; - } - } - - else if (c == '!') - if ((c = xml_get_char(ctx)) == '-') - { - /* Comment */ - if (!(ctx->want & XML_WANT_COMMENT)) - xml_skip_comment(ctx); - else - { - if (btell(ctx->chars)) - { - fbgrow_rewind(ctx->chars); - ctx->state = XML_STATE_CHARS_BEFORE_COMMENT; - return 
XML_STATE_CHARS; - case XML_STATE_CHARS_BEFORE_COMMENT: - fbgrow_reset(ctx->chars); - } - xml_push_comment(ctx); - return ctx->state = XML_STATE_COMMENT; - } - } - else if (c == '[') - { - /* CDATA */ - if (!(ctx->want & XML_WANT_CDATA)) - xml_skip_cdata(ctx); - else - { - if (btell(ctx->chars)) - { - fbgrow_rewind(ctx->chars); - ctx->state = XML_STATE_CHARS_BEFORE_CDATA; - return XML_STATE_CHARS; - case XML_STATE_CHARS_BEFORE_CDATA: - fbgrow_reset(ctx->chars); - } - xml_parse_cdata(ctx); - if (btell(ctx->chars)) - { - fbgrow_rewind(ctx->chars); - return ctx->state = XML_STATE_CDATA; - } - case XML_STATE_CDATA: - fbgrow_reset(ctx->chars); - } - } - else - xml_fatal(ctx, "Unexpected character after 'chars)) - { - fbgrow_rewind(ctx->chars); - ctx->state = XML_STATE_CHARS_BEFORE_STAG; - return XML_STATE_CHARS; - case XML_STATE_CHARS_BEFORE_STAG: - fbgrow_reset(ctx->chars); - } - - if (xml_parse_stag(ctx)) - { - } - if (ctx->want & XML_WANT_STAG) - return ctx->state = XML_STATE_STAG; - case XML_STATE_STAG: - // FIXME: EmptyElemTag - ; - - } - - else - { - /* ETag */ - if (btell(ctx->chars)) - { - fbgrow_rewind(ctx->chars); - ctx->state = XML_STATE_CHARS_BEFORE_ETAG; - return XML_STATE_CHARS; - case XML_STATE_CHARS_BEFORE_ETAG: - fbgrow_reset(ctx->chars); - } - - if (ctx->want & XML_WANT_ETAG) - return ctx->state = XML_STATE_ETAG; - case XML_STATE_ETAG: - - xml_parse_etag(ctx); - - if (!ctx->element) - goto epilog; - } - } - -epilog: - /* Misc* */ - DBG("XML: Entering epilog"); - while (1) - { - /* Epilog whitespace is the only place, where a valid document can reach EOF */ - if (setjmp(throw_buf)) - if (ctx->err_code == XML_ERR_EOF) - { - DBG("XML: Reached EOF"); - ctx->state = XML_STATE_EOF; - if (ctx->h_document_end) - ctx->h_document_end(ctx); - case XML_STATE_EOF: - return XML_STATE_EOF; - } - else - goto error; - xml_parse_white(ctx, 0); - if (setjmp(throw_buf)) - goto error; - - /* Misc */ - xml_parse_char(ctx, '<'); - if ((c = xml_get_char(ctx)) == '?') - /* 
Processing instruction */ - if (!(ctx->want & XML_WANT_PI)) - xml_skip_pi(ctx); - else - { - xml_push_pi(ctx); - return ctx->state = XML_STATE_EPILOG_PI, XML_STATE_PI; - case XML_STATE_EPILOG_PI: - xml_pop_pi(ctx); - } - else if (c == '!') - /* Comment */ - if (!(ctx->want & XML_WANT_COMMENT)) - xml_skip_comment(ctx); - else - { - xml_push_comment(ctx); - return ctx->state = XML_STATE_EPILOG_COMMENT, XML_STATE_COMMENT; - case XML_STATE_EPILOG_COMMENT: - xml_pop_comment(ctx); - } - else - xml_fatal(ctx, "Syntax error in the epilog"); - } - - } - return -1; -} - -#ifdef TEST - -static void -error(struct xml_context *ctx) -{ - msg((ctx->err_code < XML_ERR_ERROR) ? L_WARN_R : L_ERROR_R, "XML: %s", ctx->err_msg); -} - -static void -test(struct fastbuf *in, struct fastbuf *out) -{ - struct xml_context ctx; - xml_init(&ctx); - ctx.h_warn = ctx.h_error = ctx.h_fatal = error; - ctx.want = XML_WANT_ALL; - xml_set_source(&ctx, in); - int state; - while ((state = xml_next(&ctx)) >= 0) - switch (state) - { - case XML_STATE_CHARS: - bprintf(out, "CHARS [%.*s]\n", (int)(ctx.chars->bstop - ctx.chars->buffer), ctx.chars->buffer); - break; - case XML_STATE_STAG: - bprintf(out, "STAG <%s>\n", ctx.element->name); - for (struct xml_attribute *a = ctx.element->attrs; a; a = a->next) - bprintf(out, " ATTR %s=[%s]\n", a->name, a->value); - break; - case XML_STATE_ETAG: - bprintf(out, "ETAG \n", ctx.element->name); - break; - case XML_STATE_COMMENT: - bprintf(out, "COMMENT [%.*s]\n", (int)(ctx.value->bstop - ctx.value->buffer), ctx.value->buffer); - break; - case XML_STATE_PI: - bprintf(out, "PI [%s] [%.*s]\n", ctx.name, (int)(ctx.value->bstop - ctx.value->buffer), ctx.value->buffer); - break; - case XML_STATE_CDATA: - bprintf(out, "CDATA [%.*s]\n", (int)(ctx.chars->bstop - ctx.chars->buffer), ctx.chars->buffer); - break; - case XML_STATE_EOF: - bprintf(out, "EOF\n"); - goto end; - default: - bprintf(out, "STATE %u\n", state); - break; - } -end: - xml_cleanup(&ctx); -} - -int -main(void) 
-{ - struct fastbuf *in = bfdopen_shared(0, 1024); - struct fastbuf *out = bfdopen_shared(1, 1024); - test(in, out); - bclose(out); - return 0; -} - -#endif diff --git a/lib/xml.h b/lib/xml.h deleted file mode 100644 index 02e62462..00000000 --- a/lib/xml.h +++ /dev/null @@ -1,318 +0,0 @@ -/* - * UCW Library -- A simple XML parser - * - * (c) 2007 Pavel Charvat - * - * This software may be freely distributed and used according to the terms - * of the GNU Lesser General Public License. - */ - -#ifndef _UCW_XML_H -#define _UCW_XML_H - -#include "lib/clists.h" -#include "lib/slists.h" - -enum xml_error { - XML_ERR_OK = 0, - XML_ERR_WARN = 1000, /* Warning */ - XML_ERR_ERROR = 2000, /* Recoverable error */ - XML_ERR_FATAL = 3000, /* Unrecoverable error */ - XML_ERR_EOF, -}; - -enum xml_state { - XML_STATE_START = 0, - XML_STATE_DECL, - XML_STATE_DOCUMENT_TYPE, - XML_STATE_CHARS, - XML_STATE_WHITE, - XML_STATE_CDATA, - XML_STATE_STAG, - XML_STATE_ETAG, - XML_STATE_COMMENT, - XML_STATE_PI, - XML_STATE_EOF, - XML_STATE_FATAL, - - /* Internal states */ - XML_STATE_CHARS_BEFORE_STAG, - XML_STATE_CHARS_BEFORE_ETAG, - XML_STATE_CHARS_BEFORE_CDATA, - XML_STATE_CHARS_BEFORE_PI, - XML_STATE_CHARS_BEFORE_COMMENT, - XML_STATE_PROLOG_PI, - XML_STATE_PROLOG_COMMENT, - XML_STATE_EPILOG_PI, - XML_STATE_EPILOG_COMMENT, -}; - -enum xml_want { - XML_WANT_DECL = 1 << XML_STATE_DECL, - XML_WANT_DOCUMENT_TYPE = 1 << XML_STATE_DOCUMENT_TYPE, - XML_WANT_CHARS = 1 << XML_STATE_CHARS, - XML_WANT_WHITE = 1 << XML_STATE_WHITE, - XML_WANT_CDATA = 1 << XML_STATE_CDATA, - XML_WANT_STAG = 1 << XML_STATE_STAG, - XML_WANT_ETAG = 1 << XML_STATE_ETAG, - XML_WANT_COMMENT = 1 << XML_STATE_COMMENT, - XML_WANT_PI = 1 << XML_STATE_PI, - XML_WANT_EOF = 1 << XML_STATE_EOF, - XML_WANT_ALL = ~0U, -}; - -enum xml_flags { - XML_FLAG_VALIDATING = 0x1, - XML_FLAG_VERSION_1_1 = 0x2, -}; - -struct xml_ext_id { - char *system_id; - char *public_id; -}; - -enum xml_node_type { - XML_NODE_ELEM, - XML_NODE_COMMENT, - 
XML_NODE_CDATA, - XML_NODE_PI, -}; - -#define XML_BUF_SIZE 32 - -struct xml_source { - struct xml_source *next; /* Link list of pending fastbufs (xml_context.sources) */ - struct fastbuf *fb; - u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */ - u32 *bptr, *bstop; /* Current state of the buffer */ - uns depth; - uns flags; -}; - -enum xml_source_flags { - XML_SRC_DECL = 0x1, /* Expected document/text declaration */ - XML_SRC_EOF = 0x2, /* Reached the end of the fastbuf */ - XML_SRC_NEW_LINE = 0x4, /* The last read character is 0xD */ - XML_SRC_SURROUND = 0x8, /* Surround the text with 0x20 (references to parameter entities) */ - XML_SRC_DOCUMENT = 0x10, /* The document entity */ - XML_SRC_EXTERNAL = 0x20, /* An external entity */ -}; - -#if 0 -struct xml_node { - cnode n; /* Node for list of parent's sons */ - uns type; /* XML_NODE_x */ - struct xml_node *parent; /* Parent node */ -}; - -struct xml_elem { - struct xml_node node; - char *name; /* Element name */ - clist sons; /* List of subnodes */ - struct xml_dtd_elem *dtd; /* Element DTD */ - slist attrs; /* Link list of attributes */ -}; -#endif - -struct xml_context { - /* Error handling */ - char *err_msg; /* Last error message */ - enum xml_error err_code; /* Last error code */ - void *throw_buf; /* Where to jump on error */ - void (*h_warn)(struct xml_context *ctx); /* Warning callback */ - void (*h_error)(struct xml_context *ctx); /* Recoverable error callback */ - void (*h_fatal)(struct xml_context *ctx); /* Unrecoverable error callback */ - - /* Memory management */ - struct mempool *pool; /* Most data */ - struct fastbuf *chars; /* Character data */ - struct fastbuf *value; /* Attribute value / comment / processing instruction data */ - char *name; /* Attribute name, processing instruction target */ - - /* Input */ - struct xml_source *sources; /* Stack of pending sources */ - u32 *bptr, *bstop; /* Character buffer */ - uns depth; /* Nesting level */ - - /* SAX-like 
interface */ - void (*h_document_start)(struct xml_context *ctx); /* Called before entering prolog */ - void (*h_document_end)(struct xml_context *ctx); /* Called after leaving epilog */ - void (*h_xml_decl)(struct xml_context *ctx); /* Called after the XML declaration */ - void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration just before internal subset */ - void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction */ - void (*h_comment)(struct xml_context *ctx); /* Called after a comment */ - - /* */ - struct xml_node *node; /* Current XML node */ - uns flags; /* XML_FLAG_x */ - struct xml_element *element; /* Current element */ - void *attribute_table; - char *version_str; - char *encoding; - uns standalone; - char *document_type; - struct xml_dtd *dtd; - struct xml_ext_id eid; - uns state; - uns want; - - void (*start_dtd)(struct xml_context *ctx); - void (*end_dtd)(struct xml_context *ctx); - void (*start_element)(struct xml_context *ctx); - void (*end_element)(struct xml_context *ctx); - void (*start_cdata)(struct xml_context *ctx); - void (*end_cdata)(struct xml_context *ctx); - void (*start_entity)(struct xml_context *ctx); - void (*end_entity)(struct xml_context *ctx); - void (*chacacters)(struct xml_context *ctx); - struct fastbuf *(*resolve_entity)(struct xml_context *ctx); - void (*notation_decl)(struct xml_context *ctx); - void (*unparsed_entity_decl)(struct xml_context *ctx); -}; - -struct xml_attribute { - char *name; - char *value; - struct xml_element *element; - struct xml_attribute *next; - struct xml_dtd_attribute *dtd; -}; - -struct xml_element { - char *name; - struct xml_attribute *attrs; - struct xml_element *parent; - struct xml_dtd_element *dtd; -}; - -/*** Document Type Definition (DTD) ***/ - -struct xml_dtd { - slist gents; /* Link list of general entities */ - slist pents; /* Link list of parapeter entities */ - slist notns; /* Link list of notations */ - slist elems; /* Link list 
of elements */ - void *tab_gents; /* Hash table of general entities */ - void *tab_pents; /* Hash table of parameter entities */ - void *tab_notns; /* Hash table of notations */ - void *tab_elems; /* Hash table of elements */ - void *tab_attrs; /* Hash table of element attributes */ - void *tab_evals; /* Hash table of enumerated attribute values */ - void *tab_enotns; /* hash table of enumerated attribute notations */ -}; - -/* Notations */ - -enum xml_dtd_notn_flags { - XML_DTD_NOTN_DECLARED = 0x1, /* The notation has been declared (interbal usage) */ -}; - -struct xml_dtd_notn { - snode n; /* Node in xml_dtd.notns */ - uns flags; /* XML_DTD_NOTN_x */ - char *name; /* Notation name */ - struct xml_ext_id eid; /* External id */ -}; - -/* Entities */ - -enum xml_dtd_ent_flags { - XML_DTD_ENT_DECLARED = 0x1, /* The entity has been declared (internal usage) */ - XML_DTD_ENT_VISITED = 0x2, /* Cycle detection (internal usage) */ - XML_DTD_ENT_PARAMETER = 0x4, /* Parameter entity, general otherwise */ - XML_DTD_ENT_EXTERNAL = 0x8, /* External entity, internal otherwise */ - XML_DTD_ENT_UNPARSED = 0x10, /* Unparsed entity, parsed otherwise */ - XML_DTD_ENT_TRIVIAL = 0x20, /* Replacement text is a sequence of characters and character references */ -}; - -struct xml_dtd_ent { - snode n; /* Node in xml_dtd.[gp]ents */ - uns flags; /* XML_DTD_ENT_x */ - char *name; /* Entity name */ - char *text; /* Replacement text / expanded replacement text (XML_DTD_ENT_TRVIAL) */ - uns len; /* Text length */ - struct xml_ext_id eid; /* External ID */ - struct xml_dtd_notn *notn; /* Notation (XML_DTD_ENT_UNPARSED only) */ -}; - -/* Elements */ - -enum xml_dtd_elem_flags { - XML_DTD_ELEM_DECLARED = 0x1, /* The element has been declared (internal usage) */ -}; - -struct xml_dtd_elem { - snode n; - uns flags; - char *name; - struct xml_dtd_elem_node *node; -}; - -struct xml_dtd_elem_node { - snode n; - struct xml_dtd_elem_node *parent; - slist sons; - uns type; - uns occur; -}; - -enum 
xml_dtd_elem_node_type { - XML_DTD_ELEM_PCDATA, - XML_DTD_ELEM_SEQ, - XML_DTD_ELEM_OR, -}; - -enum xml_dtd_elem_node_occur { - XML_DTD_ELEM_OCCUR_ONCE, - XML_DTD_ELEM_OCCUR_OPT, - XML_DTD_ELEM_OCCUR_MULT, - XML_DTD_ELEM_OCCUR_PLUS, -}; - -/* Attributes */ - - -enum xml_dtd_attribute_default { - XML_ATTR_NONE, - XML_ATTR_REQUIRED, - XML_ATTR_IMPLIED, - XML_ATTR_FIXED, -}; - -enum xml_dtd_attribute_type { - XML_ATTR_CDATA, - XML_ATTR_ID, - XML_ATTR_IDREF, - XML_ATTR_IDREFS, - XML_ATTR_ENTITY, - XML_ATTR_ENTITIES, - XML_ATTR_NMTOKEN, - XML_ATTR_NMTOKENS, - XML_ATTR_ENUM, - XML_ATTR_NOTATION, -}; - -struct xml_dtd_attr { - char *name; - struct xml_dtd_elem *elem; - enum xml_dtd_attribute_type type; - enum xml_dtd_attribute_default default_mode; - char *default_value; -}; - -struct xml_dtd_eval { - struct xml_dtd_attr *attr; - char *val; -}; - -struct xml_dtd_enotn { - struct xml_dtd_attr *attr; - struct xml_dtd_notn *notn; -}; - -void xml_init(struct xml_context *ctx); -void xml_cleanup(struct xml_context *ctx); -void xml_set_source(struct xml_context *ctx, struct fastbuf *fb); -int xml_next(struct xml_context *ctx); - -#endif diff --git a/sherlock/xml/Makefile b/sherlock/xml/Makefile new file mode 100644 index 00000000..3c70a9b0 --- /dev/null +++ b/sherlock/xml/Makefile @@ -0,0 +1,19 @@ +# Makefile for the XML parser +# (c) 2007 Pavel Charvat + +DIRS+=sherlock/xml + +LIBSH_MODS+=xml/xml +LIBSH_XML_INCLUDES=xml/xml.h + +$(o)/sherlock/xml/xml-t: $(LIBSH) $(LIBCHARSET) +$(o)/sherlock/xml/xml.o: $(o)/sherlock/xml/xml-ucat.h +$(o)/sherlock/xml/xml-ucat.h: $(s)/sherlock/xml/xml-ucat.pl + $(M)GEN $@ + $(Q)$< >$@ + +API_INCLUDES+=$(o)/sherlock/xml/.include-stamp +$(o)/sherlock/xml/.include-stamp: $(addprefix $(s)/sherlock/xml/,$(LIBSH_XML_INCLUDES)) +$(o)/sherlock/xml/.include-stamp: IDST=sherlock/xml + +include $(s)/sherlock/perl/Makefile diff --git a/sherlock/xml/xml-ucat.pl b/sherlock/xml/xml-ucat.pl new file mode 100755 index 00000000..eeb948e6 --- /dev/null +++ 
b/sherlock/xml/xml-ucat.pl @@ -0,0 +1,157 @@ +#!/usr/bin/perl +# +# UCW Library -- Character map for the XML parser +# +# (c) 2007 Pavel Charvat +# +# This software may be freely distributed and used according to the terms +# of the GNU Lesser General Public License. +# + +my @cat = (); +my @lcat = (); +my %ids = (); +my %cls = (); +for (my $i = 0; $i < 0x10000; $i++) { $cat[$i] = 0; } +for (my $i = 0; $i < 0x11; $i++) { $lcat[$i] = 0; } + +my @white = (0x9, 0xA, 0xD, 0x20); +my @base_char_1_0 = ( + [0x0041,0x005A], [0x0061,0x007A], [0x00C0,0x00D6], [0x00D8,0x00F6], [0x00F8,0x00FF], [0x0100,0x0131], + [0x0134,0x013E], [0x0141,0x0148], [0x014A,0x017E], [0x0180,0x01C3], [0x01CD,0x01F0], [0x01F4,0x01F5], + [0x01FA,0x0217], [0x0250,0x02A8], [0x02BB,0x02C1], 0x0386, [0x0388,0x038A], 0x038C, [0x038E,0x03A1], + [0x03A3,0x03CE], [0x03D0,0x03D6], 0x03DA, 0x03DC, 0x03DE, 0x03E0, [0x03E2,0x03F3], [0x0401,0x040C], + [0x040E,0x044F], [0x0451,0x045C], [0x045E,0x0481], [0x0490,0x04C4], [0x04C7,0x04C8], [0x04CB,0x04CC], + [0x04D0,0x04EB], [0x04EE,0x04F5], [0x04F8,0x04F9], [0x0531,0x0556], 0x0559, [0x0561,0x0586], [0x05D0,0x05EA], + [0x05F0,0x05F2], [0x0621,0x063A], [0x0641,0x064A], [0x0671,0x06B7], [0x06BA,0x06BE], [0x06C0,0x06CE], + [0x06D0,0x06D3], 0x06D5, [0x06E5,0x06E6], [0x0905,0x0939], 0x093D, [0x0958,0x0961], [0x0985,0x098C], + [0x098F,0x0990], [0x0993,0x09A8], [0x09AA,0x09B0], 0x09B2, [0x09B6,0x09B9], [0x09DC,0x09DD], [0x09DF,0x09E1], + [0x09F0,0x09F1], [0x0A05,0x0A0A], [0x0A0F,0x0A10], [0x0A13,0x0A28], [0x0A2A,0x0A30], [0x0A32,0x0A33], + [0x0A35,0x0A36], [0x0A38,0x0A39], [0x0A59,0x0A5C], 0x0A5E, [0x0A72,0x0A74], [0x0A85,0x0A8B], 0x0A8D, + [0x0A8F,0x0A91], [0x0A93,0x0AA8], [0x0AAA,0x0AB0], [0x0AB2,0x0AB3], [0x0AB5,0x0AB9], 0x0ABD, 0x0AE0, + [0x0B05,0x0B0C], [0x0B0F,0x0B10], [0x0B13,0x0B28], [0x0B2A,0x0B30], [0x0B32,0x0B33], [0x0B36,0x0B39], + 0x0B3D, [0x0B5C,0x0B5D], [0x0B5F,0x0B61], [0x0B85,0x0B8A], [0x0B8E,0x0B90], [0x0B92,0x0B95], [0x0B99,0x0B9A], + 0x0B9C, 
[0x0B9E,0x0B9F], [0x0BA3,0x0BA4], [0x0BA8,0x0BAA], [0x0BAE,0x0BB5], [0x0BB7,0x0BB9], [0x0C05,0x0C0C], + [0x0C0E,0x0C10], [0x0C12,0x0C28], [0x0C2A,0x0C33], [0x0C35,0x0C39], [0x0C60,0x0C61], [0x0C85,0x0C8C], + [0x0C8E,0x0C90], [0x0C92,0x0CA8], [0x0CAA,0x0CB3], [0x0CB5,0x0CB9], 0x0CDE, [0x0CE0,0x0CE1], [0x0D05,0x0D0C], + [0x0D0E,0x0D10], [0x0D12,0x0D28], [0x0D2A,0x0D39], [0x0D60,0x0D61], [0x0E01,0x0E2E], 0x0E30, [0x0E32,0x0E33], + [0x0E40,0x0E45], [0x0E81,0x0E82], 0x0E84, [0x0E87,0x0E88], 0x0E8A, 0x0E8D, [0x0E94,0x0E97], [0x0E99,0x0E9F], + [0x0EA1,0x0EA3], 0x0EA5, 0x0EA7, [0x0EAA,0x0EAB], [0x0EAD,0x0EAE], 0x0EB0, [0x0EB2,0x0EB3], 0x0EBD, + [0x0EC0,0x0EC4], [0x0F40,0x0F47], [0x0F49,0x0F69], [0x10A0,0x10C5], [0x10D0,0x10F6], 0x1100, [0x1102,0x1103], + [0x1105,0x1107], 0x1109, [0x110B,0x110C], [0x110E,0x1112], 0x113C, 0x113E, 0x1140, 0x114C, 0x114E, 0x1150, + [0x1154,0x1155], 0x1159, [0x115F,0x1161], 0x1163, 0x1165, 0x1167, 0x1169, [0x116D,0x116E], [0x1172,0x1173], + 0x1175, 0x119E, 0x11A8, 0x11AB, [0x11AE,0x11AF], [0x11B7,0x11B8], 0x11BA, [0x11BC,0x11C2], 0x11EB, 0x11F0, + 0x11F9, [0x1E00,0x1E9B], [0x1EA0,0x1EF9], [0x1F00,0x1F15], [0x1F18,0x1F1D], [0x1F20,0x1F45], [0x1F48,0x1F4D], + [0x1F50,0x1F57], 0x1F59, 0x1F5B, 0x1F5D, [0x1F5F,0x1F7D], [0x1F80,0x1FB4], [0x1FB6,0x1FBC], 0x1FBE, + [0x1FC2,0x1FC4], [0x1FC6,0x1FCC], [0x1FD0,0x1FD3], [0x1FD6,0x1FDB], [0x1FE0,0x1FEC], [0x1FF2,0x1FF4], + [0x1FF6,0x1FFC], 0x2126, [0x212A,0x212B], 0x212E, [0x2180,0x2182], [0x3041,0x3094], [0x30A1,0x30FA], + [0x3105,0x312C], [0xAC00,0xD7A3]); +my @ideographic_1_0 = ([0x4E00,0x9FA5], 0x3007, [0x3021,0x3029]); +my @combining_char_1_0 = ( + [0x0300,0x0345], [0x0360,0x0361], [0x0483,0x0486], [0x0591,0x05A1], [0x05A3,0x05B9], [0x05BB,0x05BD], + 0x05BF, [0x05C1,0x05C2], 0x05C4, [0x064B,0x0652], 0x0670, [0x06D6,0x06DC], [0x06DD,0x06DF], [0x06E0,0x06E4], + [0x06E7,0x06E8], [0x06EA,0x06ED], [0x0901,0x0903], 0x093C, [0x093E,0x094C], 0x094D, [0x0951,0x0954], + [0x0962,0x0963], [0x0981,0x0983], 0x09BC, 
0x09BE, 0x09BF, [0x09C0,0x09C4], [0x09C7,0x09C8], [0x09CB,0x09CD], + 0x09D7, [0x09E2,0x09E3], 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, [0x0A40,0x0A42], [0x0A47,0x0A48], [0x0A4B,0x0A4D], + [0x0A70,0x0A71], [0x0A81,0x0A83], 0x0ABC, [0x0ABE,0x0AC5], [0x0AC7,0x0AC9], [0x0ACB,0x0ACD], [0x0B01,0x0B03], + 0x0B3C, [0x0B3E,0x0B43], [0x0B47,0x0B48], [0x0B4B,0x0B4D], [0x0B56,0x0B57], [0x0B82,0x0B83], [0x0BBE,0x0BC2], + [0x0BC6,0x0BC8], [0x0BCA,0x0BCD], 0x0BD7, [0x0C01,0x0C03], [0x0C3E,0x0C44], [0x0C46,0x0C48], [0x0C4A,0x0C4D], + [0x0C55,0x0C56], [0x0C82,0x0C83], [0x0CBE,0x0CC4], [0x0CC6,0x0CC8], [0x0CCA,0x0CCD], [0x0CD5,0x0CD6], + [0x0D02,0x0D03], [0x0D3E,0x0D43], [0x0D46,0x0D48], [0x0D4A,0x0D4D], 0x0D57, 0x0E31, [0x0E34,0x0E3A], + [0x0E47,0x0E4E], 0x0EB1, [0x0EB4,0x0EB9], [0x0EBB,0x0EBC], [0x0EC8,0x0ECD], [0x0F18,0x0F19], 0x0F35, + 0x0F37, 0x0F39, 0x0F3E, 0x0F3F, [0x0F71,0x0F84], [0x0F86,0x0F8B], [0x0F90,0x0F95], 0x0F97, [0x0F99,0x0FAD], + [0x0FB1,0x0FB7], 0x0FB9, [0x20D0,0x20DC], 0x20E1, [0x302A,0x302F], 0x3099, 0x309A); +my @digit_1_0 = ( + [0x0030,0x0039], [0x0660,0x0669], [0x06F0,0x06F9], [0x0966,0x096F], [0x09E6,0x09EF], [0x0A66,0x0A6F], + [0x0AE6,0x0AEF], [0x0B66,0x0B6F], [0x0BE7,0x0BEF], [0x0C66,0x0C6F], [0x0CE6,0x0CEF], [0x0D66,0x0D6F], + [0x0E50,0x0E59], [0x0ED0,0x0ED9], [0x0F20,0x0F29]); +my @extender_1_0 = ( + 0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005, [0x3031,0x3035], [0x309D,0x309E], [0x30FC,0x30FE]); +my @sname_1_1 = ( + "[:A-Z_a-z]", [0xC0,0xD6], [0xD8,0xF6], [0xF8,0x2FF], [0x370,0x37D], [0x37F,0x1FFF], + [0x200C,0x200D], [0x2070,0x218F], [0x2C00,0x2FEF], [0x3001,0xD7FF], [0xF900,0xFDCF], [0xFDF0,0xFFFD], [0x10000,0xEFFFF]); + +set("WHITE", @white); +set("NEW_LINE_1_0", 0xA, 0xD); +set("NEW_LINE_1_1", 0xA, 0xD, 0x85, 0x2028); +set("DIGIT", "[0-9]"); +set("XDIGIT", "[0-9a-fA-F]"); +set("VALID_1_0", @white, [0x20,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); +set("VALID_1_1", [0x1,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); 
+set("UNRESTRICTED_1_1", @white, [0x20,0x7E], 0x85, [0xA0,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); +set("PUBID", 0x20, 0xD, 0xA, "[-a-zA-Z0-9'()+,./:=?;!*#@\$_%]"); # PubidChar, XML 1.0 production [13] (includes ';') +set("ENC_SNAME", "[a-zA-Z]"); +set("ENC_NAME", "[-a-zA-Z0-9._]"); +set("SNAME_1_0", "[_:]", @base_char_1_0, @ideographic_1_0); +set("NAME_1_0", "[-_:.]", @base_char_1_0, @ideographic_1_0, @combining_char_1_0, @digit_1_0, @extender_1_0); +set("SNAME_1_1", @sname_1_1); +set("NAME_1_1", @sname_1_1, "[-.0-9]", 0xB7, [0x0300,0x036F], [0x203F,0x2040]); +set("DECL", @white, [0x20,0x7E]); +set("GT", "[>]"); + +print "/* Automatically generated by xml-ucat.pl */\n\n"; +find_cls(); +gen_enum(); +gen_tabs(); + +sub set { # set(ID, ITEMS...): tag code points with category ID; ITEMS are [lo,hi] ranges, "[...]" ASCII character classes or single code points + my $id = shift; + $ids{$id} = scalar keys(%ids) if !defined($ids{$id}); + my $mask = 1 << $ids{$id}; + foreach my $i (@_) { + if (ref($i) eq "ARRAY") { + my $j = $i->[0]; + for (; $j <= $i->[1] && $j < 0x10000; $j++) { $cat[$j] |= $mask; } + for (; $j <= $i->[1]; $j += 0x10000) { $lcat[$j >> 16] |= $mask; } + } + elsif ($i =~ /^\[/) { for (my $j=0; $j < 128; $j++) { if (chr($j) =~ /$i/) { $cat[$j] |= $mask; } } } + else { $cat[$i] |= $mask; } + } +} + +sub find_cls { # number every distinct category bitmask found in @cat and @lcat (the first one seen gets 0) + foreach (my $i=0; $i<@cat; $i++) { $cls{$cat[$i]} = scalar keys(%cls) if !defined($cls{$cat[$i]}); } + foreach (my $i=0; $i<@lcat; $i++) { $cls{$lcat[$i]} = scalar keys(%cls) if !defined($cls{$lcat[$i]}); } +} + +sub gen_enum { # print enum xml_char_type: each XML_CHAR_<ID> is the set of non-zero class bits whose bitmask contains ID + print "enum xml_char_type {\n"; + foreach my $id (sort keys %ids) { + my $mask = 0; + foreach my $i (keys %cls) { + $mask |= 1 << $cls{$i} if $cls{$i} && ($i & (1 << $ids{$id})); + } + printf " XML_CHAR_%-20s = 0x%08x,\n", $id, $mask; + } + print "};\n\n"; +} + +sub gen_tabs { + my @tab = (); + my %hash = (); + print "static const uns xml_char_tab1[] = {\n "; + for (my $t=0; $t<256; $t++) { + my $i = $t * 256; + my @x = (); + for (my $j=0; $j<256; $j += 32) { + push @x, join(",", map($cls{$_}, @cat[$i+$j..$i+$j+31])); + } + my $sub = " " . 
join(",\n ", @x); + if (!defined($hash{$sub})) { + $hash{$sub} = 256 * scalar @tab; + push @tab, $sub; + } + printf("0x%x", $hash{$sub}); + print((~$t & 15) ? "," : ($t < 255) ? ",\n " : "\n};\n\n"); + } + + print "static const byte xml_char_tab2[] = {\n"; + print join(",\n\n", @tab); + print "\n};\n\n"; + + my @l = (); + for (my $i=0; $i<0x11; $i++) { + push @l, sprintf("%d", $cls{$lcat[$i]}); + } + print "static const byte xml_char_tab3[] = {" . join(",", @l) . "};\n"; +} diff --git a/sherlock/xml/xml.c b/sherlock/xml/xml.c new file mode 100644 index 00000000..27ff8249 --- /dev/null +++ b/sherlock/xml/xml.c @@ -0,0 +1,2432 @@ +/* + * Sherlock Library -- A simple XML parser + * + * (c) 2007 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +/* TODO: + * - iface + * - stack-like memory handling where possible + */ + +#define LOCAL_DEBUG + +#include "lib/lib.h" +#include "lib/mempool.h" +#include "lib/fastbuf.h" +#include "lib/ff-utf8.h" +#include "lib/ff-binary.h" +#include "lib/chartype.h" +#include "lib/unicode.h" +#include "lib/hashfunc.h" +#include "lib/stkstring.h" +#include "lib/unaligned.h" +#include "charset/charconv.h" +#include "charset/fb-charconv.h" +#include "sherlock/xml/xml.h" + +#include + +/*** Debugging ***/ + +#ifdef LOCAL_DEBUG +#define TRACE(c, f, p...) do { DBG("XML %u: " f, xml_row(c), ##p); } while(0) +#else +#define TRACE(c, f, p...) do {} while(0) +#endif + +static uns xml_row(struct xml_context *ctx); + +/*** Error handling ***/ + +static void NONRET +xml_throw(struct xml_context *ctx) +{ + ASSERT(ctx->err_code && ctx->throw_buf); + longjmp(*(jmp_buf *)ctx->throw_buf, ctx->err_code); +} + +static void +xml_warn(struct xml_context *ctx, const char *format, ...) 
+{ + if (ctx->h_warn) + { + va_list args; + va_start(args, format); + ctx->err_msg = stk_vprintf(format, args); + ctx->err_code = XML_ERR_WARN; + va_end(args); + ctx->h_warn(ctx); + ctx->err_msg = NULL; + ctx->err_code = XML_ERR_OK; + } +} + +static void +xml_error(struct xml_context *ctx, const char *format, ...) +{ + if (ctx->h_error) + { + va_list args; + va_start(args, format); + ctx->err_msg = stk_vprintf(format, args); + ctx->err_code = XML_ERR_ERROR; + va_end(args); + ctx->h_error(ctx); + ctx->err_msg = NULL; + ctx->err_code = XML_ERR_OK; + } +} + +static void NONRET +xml_fatal(struct xml_context *ctx, const char *format, ...) +{ + va_list args; + va_start(args, format); + ctx->err_msg = mp_vprintf(ctx->pool, format, args); + ctx->err_code = XML_ERR_FATAL; + ctx->state = XML_STATE_FATAL; + va_end(args); + if (ctx->h_fatal) + ctx->h_fatal(ctx); + xml_throw(ctx); +} + +/*** Character categorization ***/ + +#include "obj/sherlock/xml/xml-ucat.h" /* generated from xml-ucat.pl by sherlock/xml/Makefile */ + +static inline uns +xml_char_cat(uns c) +{ + if (c < 0x10000) + return 1U << xml_char_tab2[(c & 0xff) + xml_char_tab1[c >> 8]]; + else if (likely(c < 0x110000)) + return 1U << xml_char_tab3[c >> 16]; + else + return 1; +} + +/*** Generic UTF decoding ***/ + +static uns +bget_utf16_le_slow(struct fastbuf *fb, uns repl) +{ + if ((int)bpeekc(fb) < 0) + return ~0U; + uns u = bgetw_le(fb), x, y; + if ((int)u < 0) + return repl; + if ((x = u - 0xd800) >= 0x800) + return u; + if (x >= 0x400 || (int)bpeekc(fb) < 0 || (y = bgetw_le(fb) - 0xdc00) >= 0x400) + return repl; + return 0x10000 + (x << 10) + y; +} + +static uns +bget_utf16_be_slow(struct fastbuf *fb, uns repl) +{ + if ((int)bpeekc(fb) < 0) + return ~0U; + uns u = bgetw_be(fb), x, y; + if ((int)u < 0) + return repl; + if ((x = u - 0xd800) >= 0x800) + return u; + if (x >= 0x400 || (int)bpeekc(fb) < 0 || (y = bgetw_be(fb) - 0xdc00) >= 0x400) + return repl; + return 0x10000 + (x << 10) + y; +} + +static inline uns +bget_utf16_le_repl(struct fastbuf *fb, uns repl) +{ + uns u; + if 
(bavailr(fb) >= 4) + { + fb->bptr = utf16_le_get_repl(fb->bptr, &u, repl); + return u; + } + else + return bget_utf16_le_slow(fb, repl); +} + +static inline uns +bget_utf16_be_repl(struct fastbuf *fb, uns repl) +{ + uns u; + if (bavailr(fb) >= 4) + { + fb->bptr = utf16_be_get_repl(fb->bptr, &u, repl); + return u; + } + else + return bget_utf16_be_slow(fb, repl); +} + +/*** Memory management ***/ + +static void NONRET +xml_fatal_nested(struct xml_context *ctx) +{ + xml_fatal(ctx, "Entity not nested correctly"); +} + +static inline void +xml_inc(struct xml_context *ctx) +{ + /* Called after the first character of a block */ + TRACE(ctx, "inc"); + ctx->depth++; +} + +static inline void +xml_dec(struct xml_context *ctx) +{ + /* Called after the last character of a block */ + TRACE(ctx, "dec"); + if (unlikely(!ctx->depth--)) + xml_fatal_nested(ctx); +} + +static inline void +xml_push(struct xml_context *ctx) +{ + TRACE(ctx, "push"); + struct xml_stack *s = mp_alloc(ctx->pool, sizeof(*s)); + mp_save(ctx->pool, &s->saved_pool); + s->saved_flags = ctx->flags; + s->next = ctx->stack; + ctx->stack = s; + xml_inc(ctx); +} + +static inline void +xml_pop(struct xml_context *ctx) +{ + TRACE(ctx, "pop"); + xml_dec(ctx); + struct xml_stack *s = ctx->stack; + ASSERT(s); + ctx->stack = s->next; + ctx->flags = s->saved_flags; + mp_restore(ctx->pool, &s->saved_pool); +} + +#define XML_HASH_HDR_SIZE ALIGN_TO(sizeof(void *), CPU_STRUCT_ALIGN) +#define XML_HASH_GIVE_ALLOC struct HASH_PREFIX(table); \ + static inline void *HASH_PREFIX(alloc)(struct HASH_PREFIX(table) *t, uns size) \ + { return mp_alloc(*(void **)((void *)t - XML_HASH_HDR_SIZE), size); } \ + static inline void HASH_PREFIX(free)(struct HASH_PREFIX(table) *t UNUSED, void *p UNUSED) {} + +static void * +xml_hash_new(struct mempool *pool, uns size) +{ + void *tab = mp_alloc_zero(pool, size + XML_HASH_HDR_SIZE); + *(void **)tab = pool; + return tab + XML_HASH_HDR_SIZE; +} + +/*** Reading of document/external entities ***/ + 
+static void NONRET +xml_eof(struct xml_context *ctx) +{ + ctx->err_msg = "Unexpected EOF"; + ctx->err_code = XML_ERR_EOF; + xml_throw(ctx); +} + +static inline void +xml_add_char(u32 **bstop, uns c) +{ + *(*bstop)++ = c; + *(*bstop)++ = xml_char_cat(c); +} + +static struct xml_source * +xml_push_source(struct xml_context *ctx, uns flags) +{ + xml_push(ctx); + struct xml_source *src = ctx->src; + if (src) + { + src->bptr = ctx->bptr; + src->bstop = ctx->bstop; + } + src = mp_alloc_zero(ctx->pool, sizeof(*src)); + src->next = ctx->src; + src->saved_depth = ctx->depth; + ctx->src = src; + ctx->flags = (ctx->flags & ~(XML_FLAG_SRC_EOF | XML_FLAG_SRC_EXPECTED_DECL | XML_FLAG_SRC_NEW_LINE | XML_FLAG_SRC_SURROUND | XML_FLAG_SRC_DOCUMENT)) | flags; + ctx->bstop = ctx->bptr = src->buf; + ctx->depth = 0; + if (flags & XML_FLAG_SRC_SURROUND) + xml_add_char(&ctx->bstop, 0x20); + return src; +} + +static void +xml_pop_source(struct xml_context *ctx) +{ + TRACE(ctx, "xml_pop_source"); + if (unlikely(ctx->depth != 0)) + xml_fatal_nested(ctx); + struct xml_source *src = ctx->src; + ASSERT(src); + bclose(src->fb); + ctx->depth = src->saved_depth; + ctx->src = src = src->next; + if (src) + { + ctx->bptr = src->bptr; + ctx->bstop = src->bstop; + } + xml_pop(ctx); + if (unlikely(!src)) + xml_eof(ctx); +} + +static void xml_refill_utf8(struct xml_context *ctx); + +static void +xml_push_entity(struct xml_context *ctx, struct xml_dtd_ent *ent) +{ + TRACE(ctx, "xml_push_entity"); + uns cat1 = ctx->src->refill_cat1; + uns cat2 = ctx->src->refill_cat2; + struct xml_source *src = xml_push_source(ctx, 0); + src->refill_cat1 = cat1; + src->refill_cat2 = cat2; + if (ent->flags & XML_DTD_ENT_EXTERNAL) + xml_fatal(ctx, "External entities not implemented"); // FIXME + else + { + fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, ent->len, 0); + src->refill = xml_refill_utf8; + } +} + +void +xml_set_source(struct xml_context *ctx, struct fastbuf *fb) +{ + TRACE(ctx, "xml_set_source"); + 
ASSERT(!ctx->src); + struct xml_source *src = xml_push_source(ctx, XML_FLAG_SRC_DOCUMENT | XML_FLAG_SRC_EXPECTED_DECL); + src->fb = fb; +} + +static uns +xml_error_restricted(struct xml_context *ctx, uns c) +{ + xml_error(ctx, "Restricted char U+%04X", c); + return UNI_REPLACEMENT; +} + +static void xml_parse_decl(struct xml_context *ctx); + +#define REFILL(ctx, func, params...) \ + struct xml_source *src = ctx->src; \ + struct fastbuf *fb = src->fb; \ + if (ctx->bptr == ctx->bstop) \ + ctx->bptr = ctx->bstop = src->buf; \ + uns f = ctx->flags, c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \ + u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \ + *last_0xd = (f & XML_FLAG_SRC_NEW_LINE) ? bstop : NULL; /* NULL = no pending 0xD; bend would alias bstop once the buffer is full */ \ + do \ + { \ + c = func(fb, ##params); \ + uns t = xml_char_cat(c); \ + if (t & t1) \ + /* Typical branch */ \ + *bstop++ = c, *bstop++ = t; \ + else if (t & t2) \ + { \ + /* New line */ \ + /* XML 1.0: 0xA | 0xD | 0xD 0xA */ \ + /* XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ \ + if (c == 0xd) \ + last_0xd = bstop + 2; \ + else if (c != 0x2028 && last_0xd == bstop) \ + { \ + last_0xd = NULL; \ + continue; \ + } \ + xml_add_char(&bstop, 0xa), row++; \ + } \ + else if (c == '>') \ + { \ + /* Used only in XML/TextDecl to switch the encoding */ \ + *bstop++ = c, *bstop++ = t; \ + break; \ + } \ + else if ((int)c >= 0) \ + /* Restricted character */ \ + xml_add_char(&bstop, xml_error_restricted(ctx, c)); \ + else \ + { \ + /* EOF */ \ + if (f & XML_FLAG_SRC_SURROUND) \ + xml_add_char(&bstop, 0x20); \ + f |= XML_FLAG_SRC_EOF; \ + break; \ + } \ + } \ + while (bstop < bend); \ + ctx->flags = (last_0xd == bstop) ? 
f | XML_FLAG_SRC_NEW_LINE : f & ~XML_FLAG_SRC_NEW_LINE; \ + ctx->bstop = bstop; \ + src->row = row; + +static void +xml_refill_utf8(struct xml_context *ctx) +{ + // FIXME: report corrupted encoding + REFILL(ctx, bget_utf8); +} + +static void +xml_refill_utf16_le(struct xml_context *ctx) +{ + REFILL(ctx, bget_utf16_le_repl, 0); +} + +static void +xml_refill_utf16_be(struct xml_context *ctx) +{ + REFILL(ctx, bget_utf16_be_repl, 0); +} + +#if 0 +static inline uns +xml_refill_libcharset_bget(struct fastbuf *fb, unsigned short int *in_to_x) +{ + // FIXME: slow + int c; + return (unlikely(c = bgetc(fb) < 0)) ? c : (int)conv_x_to_ucs(in_to_x[c]); +} + +static void +xml_refill_libcharset(struct xml_context *ctx) +{ + unsigned short int *in_to_x = ctx->src->refill_in_to_x; + REFILL(ctx, xml_refill_libcharset_bget, in_to_x); +} +#endif + +#undef REFILL + +static void +xml_refill(struct xml_context *ctx) +{ + do + { + if (ctx->flags & XML_FLAG_SRC_EOF) + xml_pop_source(ctx); + else if (ctx->flags & XML_FLAG_SRC_EXPECTED_DECL) + xml_parse_decl(ctx); + else + { + ctx->src->refill(ctx); + TRACE(ctx, "refilled %u characters", (uns)((ctx->bstop - ctx->bptr) / 2)); + } + } + while (ctx->bptr == ctx->bstop); +} + +static inline uns +xml_peek_char(struct xml_context *ctx) +{ + if (ctx->bptr == ctx->bstop) + xml_refill(ctx); + return ctx->bptr[0]; +} + +static inline uns +xml_peek_cat(struct xml_context *ctx) +{ + if (ctx->bptr == ctx->bstop) + xml_refill(ctx); + return ctx->bptr[1]; +} + +static inline uns +xml_get_char(struct xml_context *ctx) +{ + uns c = xml_peek_char(ctx); + ctx->bptr += 2; + return c; +} + +static inline uns +xml_get_cat(struct xml_context *ctx) +{ + uns c = xml_peek_cat(ctx); + ctx->bptr += 2; + return c; +} + +static inline uns +xml_last_char(struct xml_context *ctx) +{ + return ctx->bptr[-2]; +} + +static inline uns +xml_last_cat(struct xml_context *ctx) +{ + return ctx->bptr[-1]; +} + +static inline uns +xml_skip_char(struct xml_context *ctx) +{ + uns c = 
ctx->bptr[0]; + ctx->bptr += 2; + return c; +} + +static inline uns +xml_unget_char(struct xml_context *ctx) +{ + return *(ctx->bptr -= 2); +} + +static uns +xml_row(struct xml_context *ctx) +{ + struct xml_source *src = ctx->src; + if (!src) + return 0; + uns row = src->row; + for (u32 *p = ctx->bstop; p != ctx->bptr; p -= 2) + if (p[-1] & src->refill_cat2) + row--; + return row + 1; +} + +/*** Basic parsing ***/ + +static void NONRET +xml_fatal_expected(struct xml_context *ctx, uns c) +{ + xml_fatal(ctx, "Expected '%c'", c); +} + +static void NONRET +xml_fatal_expected_white(struct xml_context *ctx) +{ + xml_fatal(ctx, "Expected a white space"); +} + +static void NONRET +xml_fatal_expected_quot(struct xml_context *ctx) +{ + xml_fatal(ctx, "Expected a quotation mark"); +} + +static inline uns +xml_parse_white(struct xml_context *ctx, uns mandatory) +{ + /* mandatory=1 -> S ::= (#x20 | #x9 | #xD | #xA)+ + * mandatory=0 -> S? */ + uns cnt = 0; + while (xml_peek_cat(ctx) & XML_CHAR_WHITE) + { + xml_skip_char(ctx); + cnt++; + } + if (unlikely(mandatory && !cnt)) + xml_fatal_expected_white(ctx); + return cnt; +} + +static inline void +xml_parse_char(struct xml_context *ctx, uns c) +{ + /* Consumes a given Unicode character */ + if (unlikely(c != xml_get_char(ctx))) + xml_fatal_expected(ctx, c); +} + +static inline void +xml_parse_seq(struct xml_context *ctx, const char *seq) +{ + /* Consumes a given sequence of ASCII characters */ + while (*seq) + xml_parse_char(ctx, *seq++); +} + +static void +xml_parse_eq(struct xml_context *ctx) +{ + /* Eq ::= S? '=' S? 
*/ + xml_parse_white(ctx, 0); + xml_parse_char(ctx, '='); + xml_parse_white(ctx, 0); +} + +static inline uns +xml_parse_quote(struct xml_context *ctx) +{ + /* "'" | '"' */ + uns c = xml_get_char(ctx); + if (unlikely(c != '\'' && c != '\"')) + xml_fatal_expected_quot(ctx); + return c; +} + +/* Names and nmtokens */ + +static char * +xml_parse_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err) +{ + char *p = mp_start_noalign(ctx->pool, 1); + if (unlikely(!(xml_peek_cat(ctx) & first_cat))) + xml_fatal(ctx, "%s", err); + do + { + p = mp_spread(ctx->pool, p, 5); + p = utf8_32_put(p, xml_skip_char(ctx)); + } + while (xml_peek_cat(ctx) & next_cat); + *p++ = 0; + return mp_end(ctx->pool, p); +} + +static void +xml_skip_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err) +{ + if (unlikely(!(xml_get_cat(ctx) & first_cat))) + xml_fatal(ctx, "%s", err); + while (xml_peek_cat(ctx) & next_cat) + xml_skip_char(ctx); +} + +static char * +xml_parse_name(struct xml_context *ctx) +{ + /* Name ::= NameStartChar (NameChar)* */ + return xml_parse_string(ctx, + !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1, + !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1, + "Expected a name"); +} + +static void +xml_skip_name(struct xml_context *ctx) +{ + xml_skip_string(ctx, + !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1, + !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1, + "Expected a name"); +} + +static char * +xml_parse_nmtoken(struct xml_context *ctx) +{ + /* Nmtoken ::= (NameChar)+ */ + uns cat = !(ctx->flags & XML_FLAG_VERSION_1_1) ? 
XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1; + return xml_parse_string(ctx, cat, cat, "Expected a nmtoken"); +} + +/* Simple literals */ + +static char * +xml_parse_system_literal(struct xml_context *ctx) +{ + /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */ + char *p = mp_start_noalign(ctx->pool, 1); + uns q = xml_parse_quote(ctx), c; + while ((c = xml_get_char(ctx)) != q) + { + p = mp_spread(ctx->pool, p, 5); + p = utf8_32_put(p, c); + } + *p++ = 0; + return mp_end(ctx->pool, p); +} + +static char * +xml_parse_pubid_literal(struct xml_context *ctx) +{ + /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */ + char *p = mp_start_noalign(ctx->pool, 1); + uns q = xml_parse_quote(ctx), c; + while ((c = xml_get_char(ctx)) != q) + { + if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID))) + xml_fatal(ctx, "Expected a pubid character"); + p = mp_spread(ctx->pool, p, 2); + *p++ = c; + } + *p++ = 0; + return mp_end(ctx->pool, p); +} + +static char * +xml_parse_encoding_name(struct xml_context *ctx) +{ + /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" */ + char *p = mp_start_noalign(ctx->pool, 1); + uns q = xml_parse_quote(ctx); + if (unlikely(!(xml_get_cat(ctx) & XML_CHAR_ENC_SNAME))) + xml_fatal(ctx, "Invalid character in the encoding name"); + while (1) + { + p = mp_spread(ctx->pool, p, 2); + *p++ = xml_last_char(ctx); + if (xml_get_char(ctx) == q) + break; + if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME))) + xml_fatal(ctx, "Invalid character in the encoding name"); + } + *p++ = 0; + return mp_end(ctx->pool, p); +} + +/* Document/external entity header */ + +static inline void +xml_init_cats(struct xml_context *ctx, uns mask) +{ + if (!(ctx->flags & XML_FLAG_VERSION_1_1)) + { + ctx->src->refill_cat1 = XML_CHAR_VALID_1_0 & ~XML_CHAR_NEW_LINE_1_0 & ~mask; + ctx->src->refill_cat2 = XML_CHAR_NEW_LINE_1_0; + } + else + { + ctx->src->refill_cat1 = XML_CHAR_UNRESTRICTED_1_1 & ~XML_CHAR_NEW_LINE_1_1 & 
~mask; + ctx->src->refill_cat2 = XML_CHAR_NEW_LINE_1_1; + } +} + +static void +xml_init_charconv(struct xml_context *ctx, int cs) +{ + // FIXME: hack + struct xml_source *src = ctx->src; + TRACE(ctx, "wrapping charset %s", charset_name(cs)); +#if 0 + struct conv_context conv; + conv_set_charset(&conv, cs, CONV_CHARSET_UTF8); + src->refill = xml_refill_libcharset; + src->refill_in_to_x = conv.in_to_x; +#else + src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8); + // FIXME: memory leak +#endif +} + +static void +xml_parse_decl(struct xml_context *ctx) +{ + TRACE(ctx, "xml_parse_decl"); + struct xml_source *src = ctx->src; + ctx->flags &= ~XML_FLAG_SRC_EXPECTED_DECL; + + /* Setup valid Unicode ranges and force the reader to abort refill() after each '>', where we can switch encoding or XML version */ + xml_init_cats(ctx, XML_CHAR_GT); + + /* Initialize the supplied charset (if any) or try to guess it */ + char *expected_encoding = src->expected_encoding ? : src->fb_encoding; + src->refill = xml_refill_utf8; + int bom = bpeekc(src->fb); + if (bom < 0) + ctx->flags |= XML_FLAG_SRC_EOF; + if (!src->fb_encoding) + { + if (bom == 0xfe) + src->refill = xml_refill_utf16_be; + else if (bom == 0xff) + src->refill = xml_refill_utf16_le; + } + else + { + int cs = find_charset_by_name(src->fb_encoding); + if (cs == CONV_CHARSET_UTF8) + {} + else if (cs >= 0) + { + xml_init_charconv(ctx, cs); + bom = 0; + } + else if (strcasecmp(src->fb_encoding, "UTF-16")) + { + src->refill = xml_refill_utf16_be; + if (bom == 0xff) + src->refill = xml_refill_utf16_le; + if (!src->expected_encoding) + expected_encoding = (bom == 0xff) ? 
"UTF-16LE" : "UTF-16BE"; + } + else if (strcasecmp(src->fb_encoding, "UTF-16BE")) + src->refill = xml_refill_utf16_be; + else if (strcasecmp(src->fb_encoding, "UTF-16LE")) + src->refill = xml_refill_utf16_le; + else + { + xml_error(ctx, "Unknown encoding '%s'", src->fb_encoding); + expected_encoding = NULL; + } + } + uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be; + if (bom > 0 && xml_peek_char(ctx) == 0xfeff) + xml_skip_char(ctx); + else if (utf16) + xml_error(ctx, "Missing or corrupted BOM"); + + /* Look ahead for presence of XMLDecl or optional TextDecl */ + if (!(ctx->flags & XML_FLAG_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf)) + xml_refill(ctx); + uns doc = ctx->flags & XML_FLAG_SRC_DOCUMENT; + u32 *bptr = ctx->bptr; + uns have_decl = (12 <= ctx->bstop - ctx->bptr && (bptr[11] & XML_CHAR_WHITE) && + bptr[0] == '<' && bptr[2] == '?' && (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L'); + if (!have_decl) + { + if (doc) + xml_fatal(ctx, "Missing or corrupted XML header"); + else if (expected_encoding && strcasecmp(src->expected_encoding, "UTF-8") && !utf16) + xml_error(ctx, "Missing or corrupted entity header"); + goto exit; + } + ctx->bptr = bptr + 12; + xml_parse_white(ctx, 0); + + /* Parse version string (mandatory in XMLDecl, optional in TextDecl) */ + if (xml_peek_char(ctx) == 'v') + { + xml_parse_seq(ctx, "version"); + xml_parse_eq(ctx); + char *version = xml_parse_pubid_literal(ctx); + TRACE(ctx, "version=%s", version); + uns v = 0; + if (!strcmp(version, "1.1")) + v = XML_FLAG_VERSION_1_1; + else if (strcmp(version, "1.0")) + { + xml_error(ctx, "Unknown XML version string '%s'", version); + version = "1.0"; + } + if (doc) + { + ctx->version_str = version; + ctx->flags |= v; + } + else if (v > (ctx->flags & XML_FLAG_VERSION_1_1)) + xml_error(ctx, "XML 1.1 external entity included from XML 1.0 document"); + if (!xml_parse_white(ctx, !doc)) + goto end; + } + else if (doc) + { + 
xml_error(ctx, "Expected XML version"); + ctx->version_str = "1.0"; + } + + /* Parse encoding string (optional in XMLDecl, mandatory in TextDecl) */ + if (xml_peek_char(ctx) == 'e') + { + xml_parse_seq(ctx, "encoding"); + xml_parse_eq(ctx); + src->decl_encoding = xml_parse_encoding_name(ctx); + TRACE(ctx, "encoding=%s", src->decl_encoding); + if (!xml_parse_white(ctx, 0)) + goto end; + } + else if (!doc) + xml_error(ctx, "Expected XML encoding"); + + /* Parse whether the document is standalone (optional in XMLDecl) */ + if (doc && xml_peek_char(ctx) == 's') + { + xml_parse_seq(ctx, "standalone"); + xml_parse_eq(ctx); + uns c = xml_parse_quote(ctx); + if (ctx->standalone = (xml_peek_char(ctx) == 'y')) + xml_parse_seq(ctx, "yes"); + else + xml_parse_seq(ctx, "no"); + xml_parse_char(ctx, c); + TRACE(ctx, "standalone=%d", ctx->standalone); + xml_parse_white(ctx, 0); + } +end: + xml_parse_seq(ctx, "?>"); + + /* Switch to the final encoding */ + if (src->decl_encoding) + { + int cs = find_charset_by_name(src->decl_encoding); + if (cs < 0 && !expected_encoding) + xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding); + else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8) + xml_init_charconv(ctx, cs); + else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 || + !(!strcasecmp(src->decl_encoding, "UTF-16") || + (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) || + (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE"))))) + xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding); + } + +exit: + /* Update valid Unicode ranges */ + xml_init_cats(ctx, 0); +} + +/*** Document Type Definition (DTD) ***/ + +/* Notations */ + +#define HASH_PREFIX(x) xml_dtd_notns_##x +#define HASH_NODE struct xml_dtd_notn +#define HASH_KEY_STRING name +#define HASH_ZERO_FILL +#define HASH_TABLE_DYNAMIC 
+#define HASH_WANT_FIND +#define HASH_WANT_LOOKUP +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include "lib/hashtable.h" + +/* General entities */ + +#define HASH_PREFIX(x) xml_dtd_ents_##x +#define HASH_NODE struct xml_dtd_ent +#define HASH_KEY_STRING name +#define HASH_ZERO_FILL +#define HASH_TABLE_DYNAMIC +#define HASH_WANT_FIND +#define HASH_WANT_LOOKUP +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include "lib/hashtable.h" + +static struct xml_dtd_ent * +xml_dtd_declare_trivial_gent(struct xml_context *ctx, char *name, char *text) +{ + struct xml_dtd *dtd = ctx->dtd; + struct xml_dtd_ent *ent = xml_dtd_ents_lookup(dtd->tab_gents, name); + if (ent->flags & XML_DTD_ENT_DECLARED) + { + xml_warn(ctx, "Entity &%s; already declared", name); + return NULL; + } + slist_add_tail(&dtd->gents, &ent->n); + ent->flags = XML_DTD_ENT_DECLARED | XML_DTD_ENT_TRIVIAL; + ent->text = text; + ent->len = strlen(text); + return ent; +} + +static void +xml_dtd_declare_default_gents(struct xml_context *ctx) +{ + xml_dtd_declare_trivial_gent(ctx, "lt", "<"); + xml_dtd_declare_trivial_gent(ctx, "gt", ">"); + xml_dtd_declare_trivial_gent(ctx, "amp", "&"); + xml_dtd_declare_trivial_gent(ctx, "apos", "'"); + xml_dtd_declare_trivial_gent(ctx, "quot", "\""); +} + +static struct xml_dtd_ent * +xml_dtd_find_gent(struct xml_context *ctx, char *name) +{ + struct xml_dtd *dtd = ctx->dtd; + if (dtd) + { + struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_gents, name); + return !ent ? NULL : (ent->flags & XML_DTD_ENT_DECLARED) ? 
ent : NULL; + } + else + { +#define ENT(n, t) ent_##n = { .name = #n, .text = t, .len = 1, .flags = XML_DTD_ENT_DECLARED | XML_DTD_ENT_TRIVIAL } + static struct xml_dtd_ent ENT(lt, "<"), ENT(gt, ">"), ENT(amp, "&"), ENT(apos, "'"), ENT(quot, "\""); +#undef ENT + switch (name[0]) + { + case 'l': + if (!strcmp(name, "lt")) + return &ent_lt; + break; + case 'g': + if (!strcmp(name, "gt")) + return &ent_gt; + break; + case 'a': + if (!strcmp(name, "amp")) + return &ent_amp; + if (!strcmp(name, "apos")) + return &ent_apos; + break; + case 'q': + if (!strcmp(name, "quot")) + return &ent_quot; + break; + } + return NULL; + } +} + +/* Parameter entities */ + +static struct xml_dtd_ent * +xml_dtd_find_pent(struct xml_context *ctx, char *name) +{ + struct xml_dtd *dtd = ctx->dtd; + struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_pents, name); + return !ent ? NULL : (ent->flags & XML_DTD_ENT_DECLARED) ? ent : NULL; +} + +/* Elements */ + +#define HASH_PREFIX(x) xml_dtd_elems_##x +#define HASH_NODE struct xml_dtd_elem +#define HASH_KEY_STRING name +#define HASH_TABLE_DYNAMIC +#define HASH_ZERO_FILL +#define HASH_WANT_LOOKUP +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include "lib/hashtable.h" + +/* Element attributes */ + +struct xml_dtd_attrs_table; + +static inline uns +xml_dtd_attrs_hash(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem, char *name) +{ + return hash_pointer(elem) ^ hash_string(name); +} + +static inline int +xml_dtd_attrs_eq(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem1, char *name1, struct xml_dtd_elem *elem2, char *name2) +{ + return (elem1 == elem2) && !strcmp(name1, name2); +} + +static inline void +xml_dtd_attrs_init_key(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_elem *elem, char *name) +{ + attr->elem = elem; + attr->name = name; +} + +#define HASH_PREFIX(x) xml_dtd_attrs_##x +#define HASH_NODE struct xml_dtd_attr +#define HASH_ZERO_FILL 
+#define HASH_TABLE_DYNAMIC +#define HASH_KEY_COMPLEX(x) x elem, x name +#define HASH_KEY_DECL struct xml_dtd_elem *elem, char *name +#define HASH_GIVE_HASHFN +#define HASH_GIVE_EQ +#define HASH_GIVE_INIT_KEY +#define HASH_WANT_FIND +#define HASH_WANT_NEW +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include "lib/hashtable.h" + +/* Enumerated attribute values */ + +struct xml_dtd_evals_table; + +static inline uns +xml_dtd_evals_hash(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr, char *val) +{ + return hash_pointer(attr) ^ hash_string(val); +} + +static inline int +xml_dtd_evals_eq(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr1, char *val1, struct xml_dtd_attr *attr2, char *val2) +{ + return (attr1 == attr2) && !strcmp(val1, val2); +} + +static inline void +xml_dtd_evals_init_key(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_eval *eval, struct xml_dtd_attr *attr, char *val) +{ + eval->attr = attr; + eval->val = val; +} + +#define HASH_PREFIX(x) xml_dtd_evals_##x +#define HASH_NODE struct xml_dtd_eval +#define HASH_TABLE_DYNAMIC +#define HASH_KEY_COMPLEX(x) x attr, x val +#define HASH_KEY_DECL struct xml_dtd_attr *attr, char *val +#define HASH_GIVE_HASHFN +#define HASH_GIVE_EQ +#define HASH_GIVE_INIT_KEY +#define HASH_WANT_FIND +#define HASH_WANT_NEW +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include "lib/hashtable.h" + +/* Enumerated attribute notations */ + +struct xml_dtd_enotns_table; + +static inline uns +xml_dtd_enotns_hash(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn) +{ + return hash_pointer(attr) ^ hash_pointer(notn); +} + +static inline int +xml_dtd_enotns_eq(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr1, struct xml_dtd_notn *notn1, struct xml_dtd_attr *attr2, struct xml_dtd_notn *notn2) +{ + return (attr1 == attr2) && (notn1 == notn2); +} + +static inline void 
+xml_dtd_enotns_init_key(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_enotn *enotn, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn) +{ + enotn->attr = attr; + enotn->notn = notn; +} + +#define HASH_PREFIX(x) xml_dtd_enotns_##x +#define HASH_NODE struct xml_dtd_enotn +#define HASH_TABLE_DYNAMIC +#define HASH_KEY_COMPLEX(x) x attr, x notn +#define HASH_KEY_DECL struct xml_dtd_attr *attr, struct xml_dtd_notn *notn +#define HASH_GIVE_HASHFN +#define HASH_GIVE_EQ +#define HASH_GIVE_INIT_KEY +#define HASH_WANT_FIND +#define HASH_WANT_NEW +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include "lib/hashtable.h" + +/* DTD initialization/cleanup */ + +static void +xml_dtd_init(struct xml_context *ctx) +{ + if (ctx->dtd) + return; + struct mempool *pool = mp_new(4096); + struct xml_dtd *dtd = ctx->dtd = mp_alloc_zero(pool, sizeof(*ctx->dtd)); + dtd->pool = pool; + xml_dtd_ents_init(dtd->tab_gents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table))); + xml_dtd_ents_init(dtd->tab_pents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table))); + xml_dtd_notns_init(dtd->tab_notns = xml_hash_new(pool, sizeof(struct xml_dtd_notns_table))); + xml_dtd_elems_init(dtd->tab_elems = xml_hash_new(pool, sizeof(struct xml_dtd_elems_table))); + xml_dtd_attrs_init(dtd->tab_attrs = xml_hash_new(pool, sizeof(struct xml_dtd_attrs_table))); + xml_dtd_evals_init(dtd->tab_evals = xml_hash_new(pool, sizeof(struct xml_dtd_evals_table))); + xml_dtd_enotns_init(dtd->tab_enotns = xml_hash_new(pool, sizeof(struct xml_dtd_enotns_table))); + xml_dtd_declare_default_gents(ctx); +} + +static void +xml_dtd_cleanup(struct xml_context *ctx) +{ + if (!ctx->dtd) + return; + mp_delete(ctx->dtd->pool); + ctx->dtd = NULL; +} + +static void +xml_dtd_finish(struct xml_context *ctx) +{ + if (!ctx->dtd) + return; + // FIXME +} + +/*** Parsing functions ***/ + +/* Comments */ + +static void +xml_push_comment(struct xml_context *ctx) +{ + /* Parse a comment to ctx->value: + 
* Comment ::= '' + * Already parsed: 'value; + uns c; + xml_parse_char(ctx, '-'); + while (1) + { + if ((c = xml_get_char(ctx)) == '-') + if ((c = xml_get_char(ctx)) == '-') + break; + else + bputc(out, '-'); + bput_utf8_32(out, c); + } + xml_parse_char(ctx, '>'); + xml_dec(ctx); + fbgrow_rewind(out); + if (ctx->h_comment) + ctx->h_comment(ctx); +} + +static void +xml_pop_comment(struct xml_context *ctx) +{ + fbgrow_rewind(ctx->value); +} + +static void +xml_skip_comment(struct xml_context *ctx) +{ + xml_parse_char(ctx, '-'); + while (xml_get_char(ctx) != '-' || xml_get_char(ctx) != '-'); + xml_parse_char(ctx, '>'); + xml_dec(ctx); +} + +/* Processing instructions */ + +static void +xml_push_pi(struct xml_context *ctx) +{ + /* Parses a PI to ctx->value and ctx->name: + * PI ::= '' Char*)))? '?>' + * PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) + * Already parsed: 'name = xml_parse_name(ctx); + if (unlikely(!strcasecmp(ctx->name, "xml"))) + xml_fatal(ctx, "Reserved PI target"); + struct fastbuf *out = ctx->value; + if (xml_parse_white(ctx, 0)) + xml_parse_seq(ctx, "?>"); + else + { + while (1) + { + uns c; + if ((c = xml_get_char(ctx)) == '?') + if (xml_get_char(ctx) == '>') + break; + else + { + xml_unget_char(ctx); + bputc(out, '?'); + } + else + bput_utf8_32(out, c); + } + fbgrow_rewind(out); + } + xml_dec(ctx); + if (ctx->h_pi) + ctx->h_pi(ctx); +} + +static void +xml_pop_pi(struct xml_context *ctx) +{ + fbgrow_reset(ctx->value); +} + +static void +xml_skip_pi(struct xml_context *ctx) +{ + if (ctx->flags & XML_FLAG_VALIDATING) + { + mp_push(ctx->pool); + if (unlikely(!strcasecmp(xml_parse_name(ctx), "xml"))) + xml_fatal(ctx, "Reserved PI target"); + mp_pop(ctx->pool); + if (!xml_parse_white(ctx, 0)) + { + xml_parse_seq(ctx, "?>"); + xml_dec(ctx); + return; + } + } + while (1) + if (xml_get_char(ctx) == '?') + if (xml_get_char(ctx) == '>') + break; + else + xml_unget_char(ctx); + xml_dec(ctx); +} + +/* Character references */ + +static uns 
+xml_parse_char_ref(struct xml_context *ctx) +{ + TRACE(ctx, "parse_char_ref"); + /* CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' + * Already parsed: '&#' */ + uns v = 0; + if (xml_get_char(ctx) == 'x') + { + if (!(xml_get_cat(ctx) & XML_CHAR_XDIGIT)) + { + xml_error(ctx, "Expected a hexadecimal value of character reference"); + goto recover; + } + do + { + v = (v << 4) + Cxvalue(xml_last_char(ctx)); + } + while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_XDIGIT)); + } + else + { + if (!(xml_last_cat(ctx) & XML_CHAR_DIGIT)) + { + xml_error(ctx, "Expected a numeric value of character reference"); + goto recover; + } + do + { + v = v * 10 + xml_last_char(ctx) - '0'; + } + while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT)); + } + uns cat = xml_char_cat(v); + if (!(cat & XML_CHAR_UNRESTRICTED_1_1) && ((ctx->flags & XML_FLAG_VERSION_1_1) || !(cat & XML_CHAR_VALID_1_0))) + { + xml_error(ctx, "Character reference out of range"); + goto recover; + } + if (xml_last_char(ctx) == ';') + { + xml_dec(ctx); + return v; + } + xml_error(ctx, "Expected ';'"); +recover: + while (xml_last_char(ctx) != ';') + xml_get_char(ctx); + xml_dec(ctx); + return UNI_REPLACEMENT; +} + +/* References to general entities */ + +static void +xml_parse_ge_ref(struct xml_context *ctx, struct fastbuf *out) +{ + /* Reference ::= EntityRef | CharRef + * EntityRef ::= '&' Name ';' + * Already parsed: '&' */ + if (xml_peek_char(ctx) == '#') + { + xml_skip_char(ctx); + uns c = xml_parse_char_ref(ctx); + bput_utf8_32(out, c); + } + else + { + struct mempool_state state; + mp_save(ctx->pool, &state); + char *name = xml_parse_name(ctx); + xml_parse_char(ctx, ';'); + struct xml_dtd_ent *ent = xml_dtd_find_gent(ctx, name); + if (!ent) + { + xml_error(ctx, "Unknown entity &%s;", name); + bputc(out, '&'); + bputs(out, name); + bputc(out, ';'); + } + else if (ent->flags & XML_DTD_ENT_TRIVIAL) + { + TRACE(ctx, "Trivial entity &%s;", name); + bwrite(out, ent->text, ent->len); + } + else + { + 
TRACE(ctx, "Pushed entity &%s;", name); + mp_restore(ctx->pool, &state); + xml_dec(ctx); + xml_push_entity(ctx, ent); + return; + } + mp_restore(ctx->pool, &state); + xml_dec(ctx); + } +} + +/* References to parameter entities */ + +static void +xml_parse_pe_ref(struct xml_context *ctx) +{ + /* PEReference ::= '%' Name ';' + * Already parsed: '%' */ + struct mempool_state state; + mp_save(ctx->pool, &state); + char *name = xml_parse_name(ctx); + xml_parse_char(ctx, ';'); + struct xml_dtd_ent *ent = xml_dtd_find_pent(ctx, name); + if (!ent) + xml_error(ctx, "Unknown entity %%%s;", name); + else + { + TRACE(ctx, "Pushed entity %%%s;", name); + mp_restore(ctx->pool, &state); + xml_dec(ctx); + xml_push_entity(ctx, ent); + return; + } + mp_restore(ctx->pool, &state); + xml_dec(ctx); +} + +static void +xml_parse_dtd_pe(struct xml_context *ctx) +{ + do + { + xml_skip_char(ctx); + xml_inc(ctx); + while (xml_peek_cat(ctx) & XML_CHAR_WHITE) + xml_skip_char(ctx); + xml_parse_pe_ref(ctx); + } + while (xml_peek_char(ctx) != '%'); +} + +static inline uns +xml_parse_dtd_white(struct xml_context *ctx, uns mandatory) +{ + /* Whitespace or parameter entity */ + uns cnt = 0; + while (xml_peek_cat(ctx) & XML_CHAR_WHITE) + { + xml_skip_char(ctx); + cnt = 1; + } + if (xml_peek_char(ctx) == '%') + { + xml_parse_dtd_pe(ctx); + return 1; + } + else if (unlikely(mandatory && !cnt)) + xml_fatal_expected_white(ctx); + return cnt; +} + +static inline uns +xml_check_dtd_pe(struct xml_context *ctx) +{ + if (xml_peek_char(ctx) == '%') + { + xml_parse_dtd_pe(ctx); + return 1; + } + return 0; +} + +/* External ID */ + +static void +xml_parse_external_id(struct xml_context *ctx, struct xml_ext_id *eid, uns allow_public, uns dtd) +{ + bzero(eid, sizeof(*eid)); + if (dtd) + xml_check_dtd_pe(ctx); + uns c = xml_peek_char(ctx); + if (c == 'S') + { + xml_parse_seq(ctx, "SYSTEM"); + if (dtd) + xml_parse_dtd_white(ctx, 1); + else + xml_parse_white(ctx, 1); + eid->system_id = xml_parse_system_literal(ctx); 
+ } + else if (c == 'P') + { + xml_parse_seq(ctx, "PUBLIC"); + if (dtd) + xml_parse_dtd_white(ctx, 1); + else + xml_parse_white(ctx, 1); + eid->public_id = xml_parse_pubid_literal(ctx); + if (dtd ? xml_parse_dtd_white(ctx, 0) : xml_parse_white(ctx, 0)) + if ((c = xml_peek_char(ctx)) == '\'' || c == '"' || !allow_public) + eid->system_id = xml_parse_system_literal(ctx); + } + else + xml_fatal(ctx, "Expected an external ID"); +} + +/* DTD: Notation declaration */ + +static void +xml_parse_notation_decl(struct xml_context *ctx) +{ + /* NotationDecl ::= '' + * Already parsed: 'dtd; + xml_parse_dtd_white(ctx, 1); + + struct xml_dtd_notn *notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx)); + xml_parse_dtd_white(ctx, 1); + struct xml_ext_id eid; + xml_parse_external_id(ctx, &eid, 1, 1); + xml_parse_dtd_white(ctx, 0); + xml_parse_char(ctx, '>'); + + if (notn->flags & XML_DTD_NOTN_DECLARED) + xml_warn(ctx, "Notation %s already declared", notn->name); + else + { + notn->flags = XML_DTD_NOTN_DECLARED; + notn->eid = eid; + slist_add_tail(&dtd->notns, ¬n->n); + } + xml_dec(ctx); +} + +static void +xml_parse_entity_decl(struct xml_context *ctx) +{ + /* Already parsed: 'dtd; + xml_parse_dtd_white(ctx, 1); + + uns flags = (xml_get_char(ctx) == '%') ? XML_DTD_ENT_PARAMETER : 0; + if (flags) + xml_parse_dtd_white(ctx, 1); + else + xml_unget_char(ctx); + + struct xml_dtd_ent *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_gents, xml_parse_name(ctx)); + slist *list = flags ? 
&dtd->pents : &dtd->gents; + xml_parse_white(ctx, 1); + if (ent->flags & XML_DTD_ENT_DECLARED) + { + xml_fatal(ctx, "Entity &%s; already declared, skipping not implemented", ent->name); + // FIXME: should be only warning + } + + uns c, sep = xml_get_char(ctx); + if (sep == '\'' || sep == '"') + { + /* Internal entity: + * EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&'] | PEReference | Reference)* "'" */ + struct fastbuf *out = ctx->value; + while (1) + { + if ((c = xml_get_char(ctx)) == sep) + break; + else if (c == '%') + { + // FIXME + ASSERT(0); + //xml_parse_parameter_ref(ctx); + } + else if (c != '&') + bput_utf8_32(out, c); + else if ((c = xml_get_char(ctx)) == '#') + c = xml_parse_char_ref(ctx); + else + { + /* Bypass references to general entities */ + mp_push(ctx->pool); + bputc(out, '&'); + xml_unget_char(ctx); + bputs(out, xml_parse_name(ctx)); + xml_parse_char(ctx, ';'); + bputc(out, ';'); + mp_pop(ctx->pool); + } + } + bputc(out, 0); + fbgrow_rewind(out); + slist_add_tail(list, &ent->n); + ent->flags = flags | XML_DTD_ENT_DECLARED; + ent->len = out->bstop - out->bptr - 1; + ent->text = mp_memdup(ctx->pool, out->bptr, ent->len + 1); + fbgrow_reset(out); + } + else + { + /* External entity */ + struct xml_ext_id eid; + struct xml_dtd_notn *notn = NULL; + xml_parse_external_id(ctx, &eid, 0, 0); + if (!xml_parse_white(ctx, 0) || !flags) + xml_parse_char(ctx, '>'); + else if (xml_get_char(ctx) != '>') + { + /* General external unparsed entity */ + flags |= XML_DTD_ENT_UNPARSED; + xml_parse_seq(ctx, "NDATA"); + xml_parse_white(ctx, 1); + notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx)); + } + slist_add_tail(list, &ent->n); + ent->flags = flags | XML_DTD_ENT_DECLARED | XML_DTD_ENT_EXTERNAL; + ent->eid = eid; + ent->notn = notn; + } + xml_parse_dtd_white(ctx, 0); + xml_parse_char(ctx, '>'); + xml_dec(ctx); +} + +/* DTD: Internal subset */ + +static void +xml_parse_internal_subset(struct xml_context *ctx) +{ + // FIXME: 
comments/pi have no parent + /* '[' intSubset ']' + * intSubset :== (markupdecl | DeclSep) + * Already parsed: ']' */ + while (1) + { + xml_parse_white(ctx, 0); + uns c = xml_get_char(ctx); + xml_inc(ctx); + if (c == '<') + if ((c = xml_get_char(ctx)) == '!') + switch (c = xml_get_char(ctx)) + { + case '-': + xml_push_comment(ctx); + xml_pop_comment(ctx); + break; + case 'N': + xml_parse_seq(ctx, "OTATION"); + xml_parse_notation_decl(ctx); + break; + case 'E': + if ((c = xml_get_char(ctx)) == 'N') + { + xml_parse_seq(ctx, "TITY"); + xml_parse_entity_decl(ctx); + } + else if (c == 'L') + { + xml_parse_seq(ctx, "EMENT"); + // FIXME: Element + } + else + goto invalid_markup; + break; + case 'A': + xml_parse_seq(ctx, "TTLIST"); + // FIXME: AttList + break; + default: + goto invalid_markup; + } + else if (c == '?') + { + xml_push_pi(ctx); + xml_pop_pi(ctx); + } + else + goto invalid_markup; + else if (c == '%') + xml_parse_dtd_pe(ctx); + else if (c == ']') + break; + else + goto invalid_markup; + } + xml_dec(ctx); + xml_dec(ctx); + return; +invalid_markup: + xml_fatal(ctx, "Invalid markup in the internal subset"); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////////////// + +static void +xml_parse_cdata(struct xml_context *ctx) +{ + struct fastbuf *out = ctx->chars; + xml_parse_seq(ctx, "CDATA["); + while (1) + { + uns c; + if ((c = xml_get_char(ctx)) == ']') + { + if ((c = xml_get_char(ctx)) == ']') + if ((c = xml_get_char(ctx)) == '>') + break; + else + bputc(out, ']'); + bputc(out, ']'); + } + bput_utf8_32(out, c); + } +} + +static void +xml_skip_cdata(struct xml_context *ctx) +{ + xml_parse_cdata(ctx); +} + +static void +xml_parse_chars(struct xml_context *ctx) +{ + TRACE(ctx, "parse_chars"); + struct fastbuf *out = ctx->chars; + uns c; + while ((c = xml_get_char(ctx)) != '<') + if (c == '&') + { + xml_inc(ctx); + xml_parse_ge_ref(ctx, out); + } + else + bput_utf8_32(out, c); + xml_unget_char(ctx); +} + 
+/*----------------------------------------------*/ + +struct xml_attrs_table; + +static inline uns +xml_attrs_hash(struct xml_attrs_table *t UNUSED, struct xml_elem *e, char *n) +{ + return hash_pointer(e) ^ hash_string(n); +} + +static inline int +xml_attrs_eq(struct xml_attrs_table *t UNUSED, struct xml_elem *e1, char *n1, struct xml_elem *e2, char *n2) +{ + return (e1 == e2) && !strcmp(n1, n2); +} + +static inline void +xml_attrs_init_key(struct xml_attrs_table *t UNUSED, struct xml_attr *a, struct xml_elem *e, char *name) +{ + a->elem = e; + a->name = name; + a->val = NULL; + slist_add_tail(&e->attrs, &a->n); +} + +#define HASH_PREFIX(x) xml_attrs_##x +#define HASH_NODE struct xml_attr +#define HASH_KEY_COMPLEX(x) x elem, x name +#define HASH_KEY_DECL struct xml_elem *elem, char *name +#define HASH_TABLE_DYNAMIC +#define HASH_GIVE_EQ +#define HASH_GIVE_HASHFN +#define HASH_GIVE_INIT_KEY +#define HASH_WANT_CLEANUP +#define HASH_WANT_REMOVE +#define HASH_WANT_LOOKUP +#define HASH_WANT_FIND +#define HASH_GIVE_ALLOC +XML_HASH_GIVE_ALLOC +#include "lib/hashtable.h" + +void +xml_init(struct xml_context *ctx) +{ + bzero(ctx, sizeof(*ctx)); + ctx->pool = mp_new(65536); + ctx->chars = fbgrow_create(4096); + ctx->value = fbgrow_create(4096); + xml_dtd_init(ctx); + xml_attrs_init(ctx->tab_attrs = xml_hash_new(ctx->pool, sizeof(struct xml_attrs_table))); +} + +void +xml_cleanup(struct xml_context *ctx) +{ + xml_attrs_cleanup(ctx->tab_attrs); + xml_dtd_cleanup(ctx); + bclose(ctx->value); + bclose(ctx->chars); + mp_delete(ctx->pool); +} + +static void +xml_parse_attr(struct xml_context *ctx) +{ + // FIXME: memory management, dtd, literal + TRACE(ctx, "parse_attr"); + struct xml_elem *e = ctx->elem; + char *name = xml_parse_name(ctx); + struct xml_attr *a = xml_attrs_lookup(ctx->tab_attrs, e, name); + xml_parse_eq(ctx); + char *val =xml_parse_system_literal(ctx); + if (a->val) + xml_error(ctx, "Attribute is not unique"); + else + a->val = val; +} + +static void 
+xml_parse_stag(struct xml_context *ctx) +{ + // FIXME: dtd + TRACE(ctx, "parse_stag"); + xml_push(ctx); + struct xml_elem *e = mp_alloc_zero(ctx->pool, sizeof(*e)); + struct xml_elem *parent = ctx->elem; + clist_init(&e->sons); + e->node.parent = (void *)parent; + ctx->elem = e; + e->name = xml_parse_name(ctx); + if (parent) + clist_add_tail(&parent->sons, &e->node.n); + else + { + ctx->root = e; + if (ctx->document_type && strcmp(e->name, ctx->document_type)) + xml_error(ctx, "The root element does not match the document type"); + } + while (1) + { + uns white = xml_parse_white(ctx, 0); + uns c = xml_get_char(ctx); + if (c == '/') + { + xml_parse_char(ctx, '>'); + ctx->flags |= XML_FLAG_EMPTY_ELEM; + break; + } + else if (c == '>') + break; + else if (!white) + xml_fatal_expected_white(ctx); + xml_unget_char(ctx); + xml_parse_attr(ctx); + } + if (ctx->h_element_start) + ctx->h_element_start(ctx); +} + +static void +xml_parse_etag(struct xml_context *ctx) +{ + TRACE(ctx, "parse_etag"); + struct xml_elem *e = ctx->elem; + ASSERT(e); + char *name = xml_parse_name(ctx); + if (strcmp(name, e->name)) + xml_fatal(ctx, "Invalid ETag, expected '%s'", e->name); + xml_parse_white(ctx, 0); + xml_parse_char(ctx, '>'); + xml_dec(ctx); +} + +static void +xml_pop_element(struct xml_context *ctx) +{ + TRACE(ctx, "pop_element"); + if (ctx->h_element_end) + ctx->h_element_end(ctx); + struct xml_elem *e = ctx->elem; + if (ctx->flags & XML_DOM_FREE) + { + if (e->node.parent) + clist_remove(&e->node.n); + else + ctx->root = NULL; + SLIST_FOR_EACH(struct xml_attr *, a, e->attrs) + xml_attrs_remove(ctx->tab_attrs, a); + struct xml_node *n; + while (n = clist_head(&e->sons)) + { + if (n->type == XML_NODE_ELEM) + { + SLIST_FOR_EACH(struct xml_attr *, a, ((struct xml_elem *)n)->attrs) + xml_attrs_remove(ctx->tab_attrs, a); + clist_insert_list_after(&((struct xml_elem *)n)->sons, &n->n); + } + clist_remove(&n->n); + } + } + ctx->node = e->node.parent; + xml_pop(ctx); // FIXME: memory 
management without XML_DOM_FREE + xml_dec(ctx); +#if 0 + for (struct xml_attribute *a = e->attrs; a; a = a->next) + xml_attribute_remove(ctx->attribute_table, a); +#endif +} + +static void +xml_parse_element_decl(struct xml_context *ctx) +{ + // FIXME + mp_push(ctx->pool); + xml_parse_seq(ctx, "'); + mp_pop(ctx->pool); +} + +#if 0 +static void +xml_parse_attr_list_decl(struct xml_context *ctx) +{ + /* AttlistDecl ::= '' + * AttDef ::= S Name S AttType S DefaultDecl */ + xml_parse_seq(ctx, "ATTLIST"); + xml_parse_white(ctx, 1); + struct xml_dtd_elem *e = xml_dtd_elems_lookup(ctx->dtd->tab_elems, xml_parse_name(ctx)); + e->attlist_declared = 1; + + while (xml_parse_white(ctx, 0) && xml_get_char(ctx) != '>') + { + xml_unget_char(ctx); + char *name = xml_parse_name(ctx); + struct xml_dtd_attr *a = xml_dtd_attrs_find(ctx->dtd->tab_attrs, e, name); + uns ignored = 0; + if (a) + { + xml_warn(ctx, "Duplicate attribute definition"); + ignored++; + } + else + a = xml_dtd_attrs_new(ctx->dtd->tab_attrs, e, name); + xml_parse_white(ctx, 1); + if (xml_get_char(ctx) == '(') + { + if (!ignored) + a->type = XML_ATTR_ENUM; + do + { + xml_parse_white(ctx, 0); + char *value = xml_parse_nmtoken(ctx); + if (!ignored) + if (xml_dtd_evals_find(ctx->dtd->tab_evals, a, value)) + xml_error(ctx, "Duplicate enumeration value"); + else + xml_dtd_evals_new(ctx->dtd->tab_evals, a, value); + xml_parse_white(ctx, 0); + } + while (xml_get_char(ctx) == '|'); + xml_unget_char(ctx); + xml_parse_char(ctx, ')'); + } + else + { + xml_unget_char(ctx); + char *type = xml_parse_name(ctx); + enum xml_dtd_attribute_type t; + if (!strcmp(type, "CDATA")) + t = XML_ATTR_CDATA; + else if (!strcmp(type, "ID")) + t = XML_ATTR_ID; + else if (!strcmp(type, "IDREF")) + t = XML_ATTR_IDREF; + else if (!strcmp(type, "IDREFS")) + t = XML_ATTR_IDREFS; + else if (!strcmp(type, "ENTITY")) + t = XML_ATTR_ENTITY; + else if (!strcmp(type, "ENTITIES")) + t = XML_ATTR_ENTITIES; + else if (!strcmp(type, "NMTOKEN")) + t = 
XML_ATTR_NMTOKEN; + else if (!strcmp(type, "NMTOKENS")) + t = XML_ATTR_NMTOKENS; + else if (!strcmp(type, "NOTATION")) + { + t = XML_ATTR_NOTATION; + xml_parse_white(ctx, 1); + xml_parse_char(ctx, '('); + do + { + xml_parse_white(ctx, 0); + struct xml_dtd_notn *n = xml_dtd_notns_lookup(ctx->dtd->tab_notns, xml_parse_name(ctx)); + if (!ignored) + if (xml_dtd_enotns_find(ctx->dtd->tab_enotns, a, n)) + xml_error(ctx, "Duplicate enumerated notation"); + else + xml_dtd_enotns_new(ctx->dtd->tab_enotns, a, n); + xml_parse_white(ctx, 0); + } + while (xml_get_char(ctx) == '|'); + xml_unget_char(ctx); + xml_parse_char(ctx, ')'); + } + else + xml_fatal(ctx, "Unknown attribute type"); + if (!ignored) + a->type = t; + } + xml_parse_white(ctx, 1); + enum xml_dtd_attribute_default def = XML_ATTR_NONE; + if (xml_get_char(ctx) == '#') + switch (xml_get_char(ctx)) + { + case 'R': + xml_parse_seq(ctx, "EQUIRED"); + def = XML_ATTR_REQUIRED; + break; + case 'I': + xml_parse_seq(ctx, "MPLIED"); + def = XML_ATTR_IMPLIED; + break; + case 'F': + xml_parse_seq(ctx, "IXED"); + def = XML_ATTR_FIXED; + break; + default: + xml_fatal(ctx, "Expected a modifier for default attribute value"); + } + else + xml_unget_char(ctx); + if (def != XML_ATTR_REQUIRED && def != XML_ATTR_IMPLIED) + { + xml_parse_system_literal(ctx); + // FIXME + } + } +} +#endif + +static void +xml_parse_doctype_decl(struct xml_context *ctx) +{ + if (ctx->document_type) + xml_fatal(ctx, "Multiple document types not allowed"); + xml_parse_seq(ctx, "DOCTYPE"); + xml_parse_white(ctx, 1); + ctx->document_type = xml_parse_name(ctx); + TRACE(ctx, "doctyype=%s", ctx->document_type); + uns white = xml_parse_white(ctx, 0); + uns c = xml_peek_char(ctx); + if (c != '>' && c != '[' && white) + { + xml_parse_external_id(ctx, &ctx->eid, 0, 0); + xml_parse_white(ctx, 0); + ctx->flags |= XML_FLAG_HAS_EXTERNAL_SUBSET; + } + if (xml_peek_char(ctx) == '[') + ctx->flags |= XML_FLAG_HAS_INTERNAL_SUBSET; + if (ctx->h_doctype_decl) + 
ctx->h_doctype_decl(ctx); +} + +int +xml_next(struct xml_context *ctx) +{ + /* A nasty state machine */ + + TRACE(ctx, "xml_next (state=%u)", ctx->state); + jmp_buf throw_buf; + ctx->throw_buf = &throw_buf; + if (setjmp(throw_buf)) + { +error: + if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal) + ctx->h_fatal(ctx); + ctx->state = XML_STATE_FATAL; + TRACE(ctx, "raised fatal error"); + return -1; + } + uns c; + switch (ctx->state) + { + case XML_STATE_FATAL: + return -1; + + case XML_STATE_START: + TRACE(ctx, "entering prolog"); + if (ctx->h_document_start) + ctx->h_document_start(ctx); + /* XMLDecl */ + xml_refill(ctx); + if (ctx->h_xml_decl) + ctx->h_xml_decl(ctx); + if (ctx->want & XML_WANT_DECL) + return ctx->state = XML_STATE_DECL; + case XML_STATE_DECL: + + /* Misc* (doctypedecl Misc*)? */ + while (1) + { + xml_parse_white(ctx, 0); + xml_parse_char(ctx, '<'); + if ((c = xml_get_char(ctx)) == '?') + /* Processing intruction */ + if (!(ctx->want & XML_WANT_PI)) + xml_skip_pi(ctx); + else + { + xml_push_pi(ctx); + ctx->state = XML_STATE_PROLOG_PI; + return XML_STATE_PI; + case XML_STATE_PROLOG_PI: + xml_pop_pi(ctx); + } + else if (c != '!') + { + /* Found the root tag */ + xml_unget_char(ctx); + goto first_tag; + } + else if (xml_get_char(ctx) == '-') + if (!(ctx->want & XML_WANT_COMMENT)) + xml_skip_comment(ctx); + else + { + xml_push_comment(ctx); + ctx->state = XML_STATE_PROLOG_COMMENT; + return XML_STATE_COMMENT; + case XML_STATE_PROLOG_COMMENT: + xml_pop_comment(ctx); + } + else + { + /* DocTypeDecl */ + xml_unget_char(ctx); + xml_parse_doctype_decl(ctx); + if (ctx->want & XML_WANT_DOCUMENT_TYPE) + return ctx->state = XML_STATE_DOCUMENT_TYPE; + case XML_STATE_DOCUMENT_TYPE: + if (xml_peek_char(ctx) == '[') + { + xml_skip_char(ctx); + xml_inc(ctx); + xml_parse_internal_subset(ctx); + xml_parse_white(ctx, 0); + } + xml_parse_char(ctx, '>'); + } + } + + case XML_STATE_PI: + mp_pop(ctx->pool); + case XML_STATE_COMMENT: + fbgrow_reset(ctx->value); + + case 
XML_STATE_CHARS: + + while (1) + { + if (xml_peek_char(ctx) != '<') + { + /* CharData */ + xml_parse_chars(ctx); + continue; + } + else + xml_skip_char(ctx); +first_tag: ; + + xml_inc(ctx); + if ((c = xml_get_char(ctx)) == '?') + { + /* PI */ + if (!(ctx->want & XML_WANT_PI)) + xml_skip_pi(ctx); + else + { + if (btell(ctx->chars)) + { + fbgrow_rewind(ctx->chars); + ctx->state = XML_STATE_CHARS_BEFORE_PI; + return XML_STATE_PI; + case XML_STATE_CHARS_BEFORE_PI: + fbgrow_reset(ctx->chars); + } + xml_push_pi(ctx); + return ctx->state = XML_STATE_PI; + } + } + + else if (c == '!') + if ((c = xml_get_char(ctx)) == '-') + { + /* Comment */ + if (!(ctx->want & XML_WANT_COMMENT)) + xml_skip_comment(ctx); + else + { + if (btell(ctx->chars)) + { + fbgrow_rewind(ctx->chars); + ctx->state = XML_STATE_CHARS_BEFORE_COMMENT; + return XML_STATE_CHARS; + case XML_STATE_CHARS_BEFORE_COMMENT: + fbgrow_reset(ctx->chars); + } + xml_push_comment(ctx); + return ctx->state = XML_STATE_COMMENT; + } + } + else if (c == '[') + { + /* CDATA */ + if (!(ctx->want & XML_WANT_CDATA)) + xml_skip_cdata(ctx); + else + { + if (btell(ctx->chars)) + { + fbgrow_rewind(ctx->chars); + ctx->state = XML_STATE_CHARS_BEFORE_CDATA; + return XML_STATE_CHARS; + case XML_STATE_CHARS_BEFORE_CDATA: + fbgrow_reset(ctx->chars); + } + xml_parse_cdata(ctx); + if (btell(ctx->chars)) + { + fbgrow_rewind(ctx->chars); + return ctx->state = XML_STATE_CDATA; + } + case XML_STATE_CDATA: + fbgrow_reset(ctx->chars); + } + } + else + xml_fatal(ctx, "Unexpected character after 'chars)) + { + fbgrow_rewind(ctx->chars); + ctx->state = XML_STATE_CHARS_BEFORE_STAG; + return XML_STATE_CHARS; + case XML_STATE_CHARS_BEFORE_STAG: + fbgrow_reset(ctx->chars); + } + + xml_parse_stag(ctx); + if (ctx->want & XML_WANT_STAG) + return ctx->state = XML_STATE_STAG; + case XML_STATE_STAG: + if (ctx->flags & XML_FLAG_EMPTY_ELEM) + goto pop_element; + } + + else + { + /* ETag */ + if (btell(ctx->chars)) + { + fbgrow_rewind(ctx->chars); + ctx->state = 
XML_STATE_CHARS_BEFORE_ETAG; + return XML_STATE_CHARS; + case XML_STATE_CHARS_BEFORE_ETAG: + fbgrow_reset(ctx->chars); + } + + xml_parse_etag(ctx); +pop_element: + if (ctx->want & XML_WANT_ETAG) + return ctx->state = XML_STATE_ETAG; + case XML_STATE_ETAG: + xml_pop_element(ctx); + if (!ctx->elem) + goto epilog; + } + } + +epilog: + /* Misc* */ + TRACE(ctx, "entering epilog"); + while (1) + { + /* Epilog whitespace is the only place, where a valid document can reach EOF */ + if (setjmp(throw_buf)) + if (ctx->err_code == XML_ERR_EOF) + { + TRACE(ctx, "reached EOF"); + ctx->state = XML_STATE_EOF; + if (ctx->h_document_end) + ctx->h_document_end(ctx); + case XML_STATE_EOF: + return XML_STATE_EOF; + } + else + goto error; + xml_parse_white(ctx, 0); + if (setjmp(throw_buf)) + goto error; + + /* Misc */ + xml_parse_char(ctx, '<'); + if ((c = xml_get_char(ctx)) == '?') + /* Processing instruction */ + if (!(ctx->want & XML_WANT_PI)) + xml_skip_pi(ctx); + else + { + xml_push_pi(ctx); + return ctx->state = XML_STATE_EPILOG_PI, XML_STATE_PI; + case XML_STATE_EPILOG_PI: + xml_pop_pi(ctx); + } + else if (c == '!') + /* Comment */ + if (!(ctx->want & XML_WANT_COMMENT)) + xml_skip_comment(ctx); + else + { + xml_push_comment(ctx); + return ctx->state = XML_STATE_EPILOG_COMMENT, XML_STATE_COMMENT; + case XML_STATE_EPILOG_COMMENT: + xml_pop_comment(ctx); + } + else + xml_fatal(ctx, "Syntax error in the epilog"); + } + + } + return -1; +} + +#ifdef TEST + +static void +error(struct xml_context *ctx) +{ + msg((ctx->err_code < XML_ERR_ERROR) ? 
L_WARN_R : L_ERROR_R, "XML %u: %s", xml_row(ctx), ctx->err_msg); +} + +static void +test(struct fastbuf *in, struct fastbuf *out) +{ + struct xml_context ctx; + xml_init(&ctx); + ctx.h_warn = ctx.h_error = ctx.h_fatal = error; + ctx.want = XML_WANT_ALL; + ctx.flags |= XML_DOM_FREE; + xml_set_source(&ctx, in); + int state; + while ((state = xml_next(&ctx)) >= 0) + switch (state) + { + case XML_STATE_CHARS: + bprintf(out, "CHARS [%.*s]\n", (int)(ctx.chars->bstop - ctx.chars->buffer), ctx.chars->buffer); + break; + case XML_STATE_STAG: + bprintf(out, "STAG <%s>\n", ctx.elem->name); + SLIST_FOR_EACH(struct xml_attr *, a, ctx.elem->attrs) + bprintf(out, " ATTR %s=[%s]\n", a->name, a->val); + break; + case XML_STATE_ETAG: + bprintf(out, "ETAG \n", ctx.elem->name); + break; + case XML_STATE_COMMENT: + bprintf(out, "COMMENT [%.*s]\n", (int)(ctx.value->bstop - ctx.value->buffer), ctx.value->buffer); + break; + case XML_STATE_PI: + bprintf(out, "PI [%s] [%.*s]\n", ctx.name, (int)(ctx.value->bstop - ctx.value->buffer), ctx.value->buffer); + break; + case XML_STATE_CDATA: + bprintf(out, "CDATA [%.*s]\n", (int)(ctx.chars->bstop - ctx.chars->buffer), ctx.chars->buffer); + break; + case XML_STATE_EOF: + bprintf(out, "EOF\n"); + goto end; + break; + } +end: + xml_cleanup(&ctx); +} + +int +main(void) +{ + struct fastbuf *in = bfdopen_shared(0, 1024); + struct fastbuf *out = bfdopen_shared(1, 1024); + test(in, out); + bclose(out); + return 0; +} + +#endif diff --git a/sherlock/xml/xml.h b/sherlock/xml/xml.h new file mode 100644 index 00000000..87cdff91 --- /dev/null +++ b/sherlock/xml/xml.h @@ -0,0 +1,338 @@ +/* + * Sherlock Library -- A simple XML parser + * + * (c) 2007 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. 
+ */ + +#ifndef _SHERLOCK_XML_H +#define _SHERLOCK_XML_H + +#include "lib/clists.h" +#include "lib/slists.h" +#include "lib/mempool.h" + +enum xml_error { + XML_ERR_OK = 0, + XML_ERR_WARN = 1000, /* Warning */ + XML_ERR_ERROR = 2000, /* Recoverable error */ + XML_ERR_FATAL = 3000, /* Unrecoverable error */ + XML_ERR_EOF, +}; + +enum xml_state { + XML_STATE_START = 0, + XML_STATE_DECL, + XML_STATE_DOCUMENT_TYPE, + XML_STATE_CHARS, + XML_STATE_WHITE, + XML_STATE_CDATA, + XML_STATE_STAG, + XML_STATE_ETAG, + XML_STATE_COMMENT, + XML_STATE_PI, + XML_STATE_EOF, + XML_STATE_FATAL, + + /* Internal states */ + XML_STATE_CHARS_BEFORE_STAG, + XML_STATE_CHARS_BEFORE_ETAG, + XML_STATE_CHARS_BEFORE_CDATA, + XML_STATE_CHARS_BEFORE_PI, + XML_STATE_CHARS_BEFORE_COMMENT, + XML_STATE_PROLOG_PI, + XML_STATE_PROLOG_COMMENT, + XML_STATE_EPILOG_PI, + XML_STATE_EPILOG_COMMENT, +}; + +enum xml_want { + XML_WANT_DECL = 1 << XML_STATE_DECL, + XML_WANT_DOCUMENT_TYPE = 1 << XML_STATE_DOCUMENT_TYPE, + XML_WANT_CHARS = 1 << XML_STATE_CHARS, + XML_WANT_WHITE = 1 << XML_STATE_WHITE, + XML_WANT_CDATA = 1 << XML_STATE_CDATA, + XML_WANT_STAG = 1 << XML_STATE_STAG, + XML_WANT_ETAG = 1 << XML_STATE_ETAG, + XML_WANT_COMMENT = 1 << XML_STATE_COMMENT, + XML_WANT_PI = 1 << XML_STATE_PI, + XML_WANT_EOF = 1 << XML_STATE_EOF, + XML_WANT_ALL = ~0U, +}; + +enum xml_flags { + XML_FLAG_VALIDATING = 0x1, + XML_FLAG_VERSION_1_1 = 0x2, /* XML version 1.1, otherwise 1.0 */ + XML_FLAG_HAS_EXTERNAL_SUBSET = 0x4, /* The document contains a reference to external DTD subset */ + XML_FLAG_HAS_INTERNAL_SUBSET = 0x8, /* The document contains an internal subset */ + + XML_FLAG_SRC_EOF = 0x10, /* EOF reached */ + XML_FLAG_SRC_EXPECTED_DECL = 0x20, /* Just before optional or required XMLDecl/TextDecl */ + XML_FLAG_SRC_NEW_LINE = 0x40, /* The last read character is 0xD */ + XML_FLAG_SRC_SURROUND = 0x80, /* Surround the text with 0x20 (references to parameter entities) */ + XML_FLAG_SRC_DOCUMENT = 0x100, /* The document entity */ 
+ XML_FLAG_SRC_EXTERNAL = 0x200, /* An external entity */ + + XML_DOM_SKIP = 0x1000, /* Do not report DOM nodes */ + XML_DOM_FREE = 0x2000, /* Free the subtree when leaving */ + XML_DOM_IGNORE = XML_DOM_SKIP | XML_DOM_FREE, /* Completely ignore the subtree */ + + XML_FLAG_EMPTY_ELEM = 0x100000, +}; + +struct xml_ext_id { + char *system_id; + char *public_id; +}; + +enum xml_node_type { + XML_NODE_ELEM, + XML_NODE_COMMENT, + XML_NODE_CDATA, + XML_NODE_PI, +}; + +struct xml_node { + cnode n; /* Node for list of parent's sons */ + uns type; /* XML_NODE_x */ + struct xml_node *parent; /* Parent node */ +}; + +struct xml_elem { + struct xml_node node; + char *name; /* Element name */ + clist sons; /* List of subnodes */ + struct xml_dtd_elem *dtd; /* Element DTD */ + slist attrs; /* Link list of attributes */ +}; + +struct xml_attr { + snode n; + struct xml_elem *elem; + char *name; + char *val; +}; + +struct xml_context; + +struct xml_stack { + struct xml_stack *next; /* Link list of stack records */ + uns saved_flags; /* Saved ctx->flags */ + struct mempool_state saved_pool; /* Saved ctx->pool state */ +}; + +#define XML_BUF_SIZE 32 /* At least 16 -- hardcoded */ + +struct xml_source { + struct xml_source *next; /* Link list of pending fastbufs (xml_context.sources) */ + struct fastbuf *fb; /* Source fastbuf */ + struct fastbuf wrap_fb; /* Libcharset or fbmem wrapper */ + u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */ + u32 *bptr, *bstop; /* Current state of the buffer */ + uns row; /* File position */ + char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */ + char *fb_encoding; /* Encoding of the source fastbuf */ + char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */ + uns refill_cat1; /* Character categories, which should be directly passed to the buffer */ + uns refill_cat2; /* Character categories, which should be processed as newlines (possibly in some 
built-in sequences) */ + void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */ + unsigned short *refill_in_to_x; /* Libcharset input table */ + uns saved_depth; /* Saved ctx->depth */ +}; + +struct xml_context { + /* Error handling */ + char *err_msg; /* Last error message */ + enum xml_error err_code; /* Last error code */ + void *throw_buf; /* Where to jump on error */ + void (*h_warn)(struct xml_context *ctx); /* Warning callback */ + void (*h_error)(struct xml_context *ctx); /* Recoverable error callback */ + void (*h_fatal)(struct xml_context *ctx); /* Unrecoverable error callback */ + + /* Memory management */ + struct mempool *pool; /* Most data */ + struct fastbuf *chars; /* Character data */ + struct fastbuf *value; /* Attribute value / comment / processing instruction data */ + char *name; /* Attribute name, processing instruction target */ + void *tab_attrs; + + /* Stack */ + struct xml_stack *stack; /* See xml_push(), xml_pop() */ + uns flags; /* XML_FLAG_x (restored on xml_pop()) */ + uns depth; /* Nesting level */ + + /* Input */ + struct xml_source *src; /* Current source */ + u32 *bptr, *bstop; /* Character buffer */ + + /* SAX-like interface */ + void (*h_document_start)(struct xml_context *ctx); /* Called before entering prolog */ + void (*h_document_end)(struct xml_context *ctx); /* Called after leaving epilog */ + void (*h_xml_decl)(struct xml_context *ctx); /* Called after the XML declaration */ + void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration just before internal subset */ + void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction */ + void (*h_comment)(struct xml_context *ctx); /* Called after a comment */ + void (*h_element_start)(struct xml_context *ctx); /* Called after STag or EmptyElemTag */ + void (*h_element_end)(struct xml_context *ctx); /* Called before ETag or after EmptyElemTag */ + + /* DOM */ + struct xml_elem *root; /* DOM 
root */ + union { + struct xml_node *node; /* Current DOM node */ + struct xml_elem *elem; /* Current element */ + }; + + char *version_str; + uns standalone; + char *document_type; + struct xml_dtd *dtd; + struct xml_ext_id eid; + uns state; + uns want; + + void (*start_dtd)(struct xml_context *ctx); + void (*end_dtd)(struct xml_context *ctx); + void (*start_cdata)(struct xml_context *ctx); + void (*end_cdata)(struct xml_context *ctx); + void (*start_entity)(struct xml_context *ctx); + void (*end_entity)(struct xml_context *ctx); + void (*chacacters)(struct xml_context *ctx); + struct fastbuf *(*resolve_entity)(struct xml_context *ctx); + void (*notation_decl)(struct xml_context *ctx); + void (*unparsed_entity_decl)(struct xml_context *ctx); +}; + +/*** Document Type Definition (DTD) ***/ + +struct xml_dtd { + struct mempool *pool; /* Memory pool where to allocate DTD */ + slist gents; /* Link list of general entities */ + slist pents; /* Link list of parameter entities */ + slist notns; /* Link list of notations */ + slist elems; /* Link list of elements */ + void *tab_gents; /* Hash table of general entities */ + void *tab_pents; /* Hash table of parameter entities */ + void *tab_notns; /* Hash table of notations */ + void *tab_elems; /* Hash table of elements */ + void *tab_attrs; /* Hash table of element attributes */ + void *tab_evals; /* Hash table of enumerated attribute values */ + void *tab_enotns; /* Hash table of enumerated attribute notations */ +}; + +/* Notations */ + +enum xml_dtd_notn_flags { + XML_DTD_NOTN_DECLARED = 0x1, /* The notation has been declared (internal usage) */ +}; + +struct xml_dtd_notn { + snode n; /* Node in xml_dtd.notns */ + uns flags; /* XML_DTD_NOTN_x */ + char *name; /* Notation name */ + struct xml_ext_id eid; /* External id */ +}; + +/* Entities */ + +enum xml_dtd_ent_flags { + XML_DTD_ENT_DECLARED = 0x1, /* The entity has been declared (internal usage) */ + XML_DTD_ENT_VISITED = 0x2, /* Cycle detection (internal usage) */ 
+ XML_DTD_ENT_PARAMETER = 0x4, /* Parameter entity, general otherwise */ + XML_DTD_ENT_EXTERNAL = 0x8, /* External entity, internal otherwise */ + XML_DTD_ENT_UNPARSED = 0x10, /* Unparsed entity, parsed otherwise */ + XML_DTD_ENT_TRIVIAL = 0x20, /* Replacement text is a sequence of characters and character references */ +}; + +struct xml_dtd_ent { + snode n; /* Node in xml_dtd.[gp]ents */ + uns flags; /* XML_DTD_ENT_x */ + char *name; /* Entity name */ + char *text; /* Replacement text / expanded replacement text (XML_DTD_ENT_TRIVIAL) */ + uns len; /* Text length */ + struct xml_ext_id eid; /* External ID */ + struct xml_dtd_notn *notn; /* Notation (XML_DTD_ENT_UNPARSED only) */ +}; + +/* Elements */ + +enum xml_dtd_elem_flags { + XML_DTD_ELEM_DECLARED = 0x1, /* The element has been declared (internal usage) */ +}; + +struct xml_dtd_elem { + snode n; + uns flags; + char *name; + struct xml_dtd_elem_node *node; +}; + +struct xml_dtd_elem_node { + snode n; + struct xml_dtd_elem_node *parent; + slist sons; + uns type; + uns occur; +}; + +enum xml_dtd_elem_node_type { + XML_DTD_ELEM_PCDATA, + XML_DTD_ELEM_SEQ, + XML_DTD_ELEM_OR, +}; + +enum xml_dtd_elem_node_occur { + XML_DTD_ELEM_OCCUR_ONCE, + XML_DTD_ELEM_OCCUR_OPT, + XML_DTD_ELEM_OCCUR_MULT, + XML_DTD_ELEM_OCCUR_PLUS, +}; + +/* Attributes */ + + +enum xml_dtd_attribute_default { + XML_ATTR_NONE, + XML_ATTR_REQUIRED, + XML_ATTR_IMPLIED, + XML_ATTR_FIXED, +}; + +enum xml_dtd_attribute_type { + XML_ATTR_CDATA, + XML_ATTR_ID, + XML_ATTR_IDREF, + XML_ATTR_IDREFS, + XML_ATTR_ENTITY, + XML_ATTR_ENTITIES, + XML_ATTR_NMTOKEN, + XML_ATTR_NMTOKENS, + XML_ATTR_ENUM, + XML_ATTR_NOTATION, +}; + +struct xml_dtd_attr { + char *name; + struct xml_dtd_elem *elem; + enum xml_dtd_attribute_type type; + enum xml_dtd_attribute_default default_mode; + char *default_value; +}; + +struct xml_dtd_eval { + struct xml_dtd_attr *attr; + char *val; +}; + +struct xml_dtd_enotn { + struct xml_dtd_attr *attr; + struct xml_dtd_notn *notn; +}; + 
+void xml_init(struct xml_context *ctx); +void xml_cleanup(struct xml_context *ctx); +void xml_set_source(struct xml_context *ctx, struct fastbuf *fb); +int xml_next(struct xml_context *ctx); + +#endif