From: Pavel Charvat Date: Tue, 11 Dec 2007 11:17:47 +0000 (+0100) Subject: XML: Updates to the XML parser. X-Git-Tag: holmes-import~482 X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;h=637533a60b2201eaadedcb00fc66ef1e20237432;p=libucw.git XML: Updates to the XML parser. --- diff --git a/sherlock/xml/Makefile b/sherlock/xml/Makefile index e3acc181..f721b500 100644 --- a/sherlock/xml/Makefile +++ b/sherlock/xml/Makefile @@ -3,17 +3,21 @@ DIRS+=sherlock/xml -LIBSH_MODS+=xml/xml -LIBSH_XML_INCLUDES=xml.h +LIBSHXML_MODS=xml +LIBSHXML_INCLUDES=xml.h dtd.h -$(o)/sherlock/xml/xml-t: $(LIBSH) $(LIBCHARSET) -$(o)/sherlock/xml/xml.o: $(o)/sherlock/xml/xml-ucat.h -$(o)/sherlock/xml/xml-ucat.h: $(s)/sherlock/xml/xml-ucat.pl +LIBSHXML_MOD_PATHS=$(addprefix $(o)/sherlock/xml/,$(LIBSHXML_MODS)) + +$(o)/sherlock/xml/libshxml.a: $(addsuffix .o,$(LIBSHXML_MOD_PATHS)) +$(o)/sherlock/xml/libshxml.so: $(addsuffix .oo,$(LIBSHXML_MOD_PATHS)) +$(o)/sherlock/xml/libshxml.pc: $(LIBUCW) $(LIBCHARSET) + +$(o)/sherlock/xml/xml-t: $(LIBSHXML) +$(o)/sherlock/xml/xml.o: $(o)/sherlock/xml/unicat.h +$(o)/sherlock/xml/unicat.h: $(s)/sherlock/xml/unicat.pl $(M)GEN $@ $(Q)$< >$@ API_INCLUDES+=$(o)/sherlock/xml/.include-stamp -$(o)/sherlock/xml/.include-stamp: $(addprefix $(s)/sherlock/xml/,$(LIBSH_XML_INCLUDES)) +$(o)/sherlock/xml/.include-stamp: $(addprefix $(s)/sherlock/xml/,$(LIBSHXML_INCLUDES)) $(o)/sherlock/xml/.include-stamp: IDST=sherlock/xml - -include $(s)/sherlock/perl/Makefile diff --git a/sherlock/xml/dtd.h b/sherlock/xml/dtd.h new file mode 100644 index 00000000..bf95b872 --- /dev/null +++ b/sherlock/xml/dtd.h @@ -0,0 +1,148 @@ +/* + * Sherlock Library -- A simple XML parser + * + * (c) 2007 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. 
+ */ + +#ifndef _SHERLOCK_XML_DTD_H +#define _SHERLOCK_XML_DTD_H + +#include "sherlock/xml/xml.h" + +struct xml_dtd { + struct mempool *pool; /* Memory pool where to allocate DTD */ + slist gents; /* Link list of general entities */ + slist pents; /* Link list of parameter entities */ + slist notns; /* Link list of notations */ + slist elems; /* Link list of elements */ + void *tab_gents; /* Hash table of general entities */ + void *tab_pents; /* Hash table of parameter entities */ + void *tab_notns; /* Hash table of notations */ + void *tab_elems; /* Hash table of elements */ + void *tab_enodes; /* Hash table of element sons */ + void *tab_attrs; /* Hash table of element attributes */ + void *tab_evals; /* Hash table of enumerated attribute values */ + void *tab_enotns; /* Hash table of enumerated attribute notations */ +}; + +/* Notations */ + +enum xml_dtd_notn_flags { + XML_DTD_NOTN_DECLARED = 0x1, /* The notation has been declared (internal usage) */ +}; + +struct xml_dtd_notn { + snode n; /* Node in xml_dtd.notns */ + uns flags; /* XML_DTD_NOTN_x */ + char *name; /* Notation name */ + struct xml_ext_id eid; /* External id */ +}; + +/* Entities */ + +enum xml_dtd_ent_flags { + XML_DTD_ENT_DECLARED = 0x1, /* The entity has been declared (internal usage) */ + XML_DTD_ENT_VISITED = 0x2, /* Cycle detection (internal usage) */ + XML_DTD_ENT_PARAMETER = 0x4, /* Parameter entity, general otherwise */ + XML_DTD_ENT_EXTERNAL = 0x8, /* External entity, internal otherwise */ + XML_DTD_ENT_UNPARSED = 0x10, /* Unparsed entity, parsed otherwise */ + XML_DTD_ENT_TRIVIAL = 0x20, /* Replacement text is a sequence of characters and character references */ +}; + +struct xml_dtd_ent { + snode n; /* Node in xml_dtd.[gp]ents */ + uns flags; /* XML_DTD_ENT_x */ + char *name; /* Entity name */ + char *text; /* Replacement text / expanded replacement text (XML_DTD_ENT_TRIVIAL) */ + uns len; /* Text length */ + struct xml_ext_id eid; /* External ID */ + struct xml_dtd_notn *notn; /* 
Notation (XML_DTD_ENT_UNPARSED only) */ +}; + +/* Elements */ + +enum xml_dtd_elem_flags { + XML_DTD_ELEM_DECLARED = 0x1, /* The element has been declared (internal usage) */ +}; + +enum xml_dtd_elem_type { + XML_DTD_ELEM_EMPTY, /* contentspec 'EMPTY' */ + XML_DTD_ELEM_ANY, /* contentspec 'ANY' */ + XML_DTD_ELEM_MIXED, /* Mixed content, '(' '#PCDATA' ... ')' */ + XML_DTD_ELEM_CHILDREN, /* Element content (choice or sequence of children) */ +}; + +struct xml_dtd_elem { + snode n; /* List node; presumably in xml_dtd.elems -- TODO confirm */ + uns flags; /* XML_DTD_ELEM_x */ + uns type; /* enum xml_dtd_elem_type */ + char *name; /* Element name */ + struct xml_dtd_elem_node *node; /* Root node of the content specification (allocated for mixed/children content) */ +}; + +struct xml_dtd_elem_node { + snode n; /* Node in the parent's list of sons */ + struct xml_dtd_elem_node *parent; /* Enclosing group, NULL for the root node */ + struct xml_dtd_elem *elem; /* Referenced element; not set for group nodes */ + slist sons; /* Link list of son nodes */ + uns type; /* enum xml_dtd_elem_node_type */ + uns occur; /* enum xml_dtd_elem_node_occur */ +}; + +enum xml_dtd_elem_node_type { + XML_DTD_ELEM_PCDATA, /* '#PCDATA' group; the parser also uses it while a group's operator is not yet known */ + XML_DTD_ELEM_SEQ, /* Sequence, items separated by ',' */ + XML_DTD_ELEM_OR, /* Choice, items separated by '|' */ +}; + +enum xml_dtd_elem_node_occur { + XML_DTD_ELEM_OCCUR_ONCE, /* No occurrence suffix */ + XML_DTD_ELEM_OCCUR_OPT, /* Suffix '?' 
*/ + XML_DTD_ELEM_OCCUR_MULT, /* Suffix '*' */ + XML_DTD_ELEM_OCCUR_PLUS, /* Suffix '+' */ +}; + +/* Attributes */ + +enum xml_dtd_attribute_default { + XML_ATTR_NONE, /* No '#' keyword; a default value follows */ + XML_ATTR_REQUIRED, /* '#REQUIRED' */ + XML_ATTR_IMPLIED, /* '#IMPLIED' */ + XML_ATTR_FIXED, /* '#FIXED' followed by the value */ +}; + +enum xml_dtd_attribute_type { + XML_ATTR_CDATA, /* 'CDATA' */ + XML_ATTR_ID, /* 'ID' */ + XML_ATTR_IDREF, /* 'IDREF' */ + XML_ATTR_IDREFS, /* 'IDREFS' */ + XML_ATTR_ENTITY, /* 'ENTITY' */ + XML_ATTR_ENTITIES, /* 'ENTITIES' */ + XML_ATTR_NMTOKEN, /* 'NMTOKEN' */ + XML_ATTR_NMTOKENS, /* 'NMTOKENS' */ + XML_ATTR_ENUM, /* Enumeration of name tokens, see struct xml_dtd_eval */ + XML_ATTR_NOTATION, /* 'NOTATION' with a list of notations, see struct xml_dtd_enotn */ +}; + +struct xml_dtd_attr { + char *name; /* Attribute name */ + struct xml_dtd_elem *elem; /* Owning element; attributes are looked up by (elem, name) */ + enum xml_dtd_attribute_type type; /* Declared attribute type */ + enum xml_dtd_attribute_default default_mode; /* Kind of default declaration */ + char *default_value; /* Parsed default value; not set for '#REQUIRED' and '#IMPLIED' */ +}; + +struct xml_dtd_eval { + struct xml_dtd_attr *attr; /* Enumerated attribute */ + char *val; /* One allowed value; entries are looked up by (attr, val) */ +}; + +struct xml_dtd_enotn { + struct xml_dtd_attr *attr; /* Attribute of type XML_ATTR_NOTATION */ + struct xml_dtd_notn *notn; /* One allowed notation; entries are looked up by (attr, notn) */ +}; + +#endif diff --git a/sherlock/xml/libshxml.pc b/sherlock/xml/libshxml.pc new file mode 100644 index 00000000..c2172b39 --- /dev/null +++ b/sherlock/xml/libshxml.pc @@ -0,0 +1,11 @@ +# pkg-config metadata for libshxml + +libdir=@LIBDIR@ +incdir=. 
+ +Name: libshxml +Description: XML parser for Sherlock project +Version: @SHERLOCK_VERSION@ +Cflags: -I${incdir} +Libs: -L${libdir} -lshxml +Requires: @DEPS@ diff --git a/sherlock/xml/unicat.pl b/sherlock/xml/unicat.pl new file mode 100755 index 00000000..fc39bba7 --- /dev/null +++ b/sherlock/xml/unicat.pl @@ -0,0 +1,155 @@ +#!/usr/bin/perl +# +# UCW Library -- Character map for the XML parser +# +# (c) 2007 Pavel Charvat +# +# This software may be freely distributed and used according to the terms +# of the GNU Lesser General Public License. +# + +my @cat = (); +my @lcat = (); +my %ids = (); +my %cls = (); +for (my $i = 0; $i < 0x10000; $i++) { $cat[$i] = 0; } +for (my $i = 0; $i < 0x11; $i++) { $lcat[$i] = 0; } + +my @white = (0x9, 0xA, 0xD, 0x20); +my @base_char_1_0 = ( + [0x0041,0x005A], [0x0061,0x007A], [0x00C0,0x00D6], [0x00D8,0x00F6], [0x00F8,0x00FF], [0x0100,0x0131], + [0x0134,0x013E], [0x0141,0x0148], [0x014A,0x017E], [0x0180,0x01C3], [0x01CD,0x01F0], [0x01F4,0x01F5], + [0x01FA,0x0217], [0x0250,0x02A8], [0x02BB,0x02C1], 0x0386, [0x0388,0x038A], 0x038C, [0x038E,0x03A1], + [0x03A3,0x03CE], [0x03D0,0x03D6], 0x03DA, 0x03DC, 0x03DE, 0x03E0, [0x03E2,0x03F3], [0x0401,0x040C], + [0x040E,0x044F], [0x0451,0x045C], [0x045E,0x0481], [0x0490,0x04C4], [0x04C7,0x04C8], [0x04CB,0x04CC], + [0x04D0,0x04EB], [0x04EE,0x04F5], [0x04F8,0x04F9], [0x0531,0x0556], 0x0559, [0x0561,0x0586], [0x05D0,0x05EA], + [0x05F0,0x05F2], [0x0621,0x063A], [0x0641,0x064A], [0x0671,0x06B7], [0x06BA,0x06BE], [0x06C0,0x06CE], + [0x06D0,0x06D3], 0x06D5, [0x06E5,0x06E6], [0x0905,0x0939], 0x093D, [0x0958,0x0961], [0x0985,0x098C], + [0x098F,0x0990], [0x0993,0x09A8], [0x09AA,0x09B0], 0x09B2, [0x09B6,0x09B9], [0x09DC,0x09DD], [0x09DF,0x09E1], + [0x09F0,0x09F1], [0x0A05,0x0A0A], [0x0A0F,0x0A10], [0x0A13,0x0A28], [0x0A2A,0x0A30], [0x0A32,0x0A33], + [0x0A35,0x0A36], [0x0A38,0x0A39], [0x0A59,0x0A5C], 0x0A5E, [0x0A72,0x0A74], [0x0A85,0x0A8B], 0x0A8D, + [0x0A8F,0x0A91], [0x0A93,0x0AA8], [0x0AAA,0x0AB0], 
[0x0AB2,0x0AB3], [0x0AB5,0x0AB9], 0x0ABD, 0x0AE0, + [0x0B05,0x0B0C], [0x0B0F,0x0B10], [0x0B13,0x0B28], [0x0B2A,0x0B30], [0x0B32,0x0B33], [0x0B36,0x0B39], + 0x0B3D, [0x0B5C,0x0B5D], [0x0B5F,0x0B61], [0x0B85,0x0B8A], [0x0B8E,0x0B90], [0x0B92,0x0B95], [0x0B99,0x0B9A], + 0x0B9C, [0x0B9E,0x0B9F], [0x0BA3,0x0BA4], [0x0BA8,0x0BAA], [0x0BAE,0x0BB5], [0x0BB7,0x0BB9], [0x0C05,0x0C0C], + [0x0C0E,0x0C10], [0x0C12,0x0C28], [0x0C2A,0x0C33], [0x0C35,0x0C39], [0x0C60,0x0C61], [0x0C85,0x0C8C], + [0x0C8E,0x0C90], [0x0C92,0x0CA8], [0x0CAA,0x0CB3], [0x0CB5,0x0CB9], 0x0CDE, [0x0CE0,0x0CE1], [0x0D05,0x0D0C], + [0x0D0E,0x0D10], [0x0D12,0x0D28], [0x0D2A,0x0D39], [0x0D60,0x0D61], [0x0E01,0x0E2E], 0x0E30, [0x0E32,0x0E33], + [0x0E40,0x0E45], [0x0E81,0x0E82], 0x0E84, [0x0E87,0x0E88], 0x0E8A, 0x0E8D, [0x0E94,0x0E97], [0x0E99,0x0E9F], + [0x0EA1,0x0EA3], 0x0EA5, 0x0EA7, [0x0EAA,0x0EAB], [0x0EAD,0x0EAE], 0x0EB0, [0x0EB2,0x0EB3], 0x0EBD, + [0x0EC0,0x0EC4], [0x0F40,0x0F47], [0x0F49,0x0F69], [0x10A0,0x10C5], [0x10D0,0x10F6], 0x1100, [0x1102,0x1103], + [0x1105,0x1107], 0x1109, [0x110B,0x110C], [0x110E,0x1112], 0x113C, 0x113E, 0x1140, 0x114C, 0x114E, 0x1150, + [0x1154,0x1155], 0x1159, [0x115F,0x1161], 0x1163, 0x1165, 0x1167, 0x1169, [0x116D,0x116E], [0x1172,0x1173], + 0x1175, 0x119E, 0x11A8, 0x11AB, [0x11AE,0x11AF], [0x11B7,0x11B8], 0x11BA, [0x11BC,0x11C2], 0x11EB, 0x11F0, + 0x11F9, [0x1E00,0x1E9B], [0x1EA0,0x1EF9], [0x1F00,0x1F15], [0x1F18,0x1F1D], [0x1F20,0x1F45], [0x1F48,0x1F4D], + [0x1F50,0x1F57], 0x1F59, 0x1F5B, 0x1F5D, [0x1F5F,0x1F7D], [0x1F80,0x1FB4], [0x1FB6,0x1FBC], 0x1FBE, + [0x1FC2,0x1FC4], [0x1FC6,0x1FCC], [0x1FD0,0x1FD3], [0x1FD6,0x1FDB], [0x1FE0,0x1FEC], [0x1FF2,0x1FF4], + [0x1FF6,0x1FFC], 0x2126, [0x212A,0x212B], 0x212E, [0x2180,0x2182], [0x3041,0x3094], [0x30A1,0x30FA], + [0x3105,0x312C], [0xAC00,0xD7A3]); +my @ideographic_1_0 = ([0x4E00,0x9FA5], 0x3007, [0x3021,0x3029]); +my @combining_char_1_0 = ( + [0x0300,0x0345], [0x0360,0x0361], [0x0483,0x0486], [0x0591,0x05A1], [0x05A3,0x05B9], 
[0x05BB,0x05BD], + 0x05BF, [0x05C1,0x05C2], 0x05C4, [0x064B,0x0652], 0x0670, [0x06D6,0x06DC], [0x06DD,0x06DF], [0x06E0,0x06E4], + [0x06E7,0x06E8], [0x06EA,0x06ED], [0x0901,0x0903], 0x093C, [0x093E,0x094C], 0x094D, [0x0951,0x0954], + [0x0962,0x0963], [0x0981,0x0983], 0x09BC, 0x09BE, 0x09BF, [0x09C0,0x09C4], [0x09C7,0x09C8], [0x09CB,0x09CD], + 0x09D7, [0x09E2,0x09E3], 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, [0x0A40,0x0A42], [0x0A47,0x0A48], [0x0A4B,0x0A4D], + [0x0A70,0x0A71], [0x0A81,0x0A83], 0x0ABC, [0x0ABE,0x0AC5], [0x0AC7,0x0AC9], [0x0ACB,0x0ACD], [0x0B01,0x0B03], + 0x0B3C, [0x0B3E,0x0B43], [0x0B47,0x0B48], [0x0B4B,0x0B4D], [0x0B56,0x0B57], [0x0B82,0x0B83], [0x0BBE,0x0BC2], + [0x0BC6,0x0BC8], [0x0BCA,0x0BCD], 0x0BD7, [0x0C01,0x0C03], [0x0C3E,0x0C44], [0x0C46,0x0C48], [0x0C4A,0x0C4D], + [0x0C55,0x0C56], [0x0C82,0x0C83], [0x0CBE,0x0CC4], [0x0CC6,0x0CC8], [0x0CCA,0x0CCD], [0x0CD5,0x0CD6], + [0x0D02,0x0D03], [0x0D3E,0x0D43], [0x0D46,0x0D48], [0x0D4A,0x0D4D], 0x0D57, 0x0E31, [0x0E34,0x0E3A], + [0x0E47,0x0E4E], 0x0EB1, [0x0EB4,0x0EB9], [0x0EBB,0x0EBC], [0x0EC8,0x0ECD], [0x0F18,0x0F19], 0x0F35, + 0x0F37, 0x0F39, 0x0F3E, 0x0F3F, [0x0F71,0x0F84], [0x0F86,0x0F8B], [0x0F90,0x0F95], 0x0F97, [0x0F99,0x0FAD], + [0x0FB1,0x0FB7], 0x0FB9, [0x20D0,0x20DC], 0x20E1, [0x302A,0x302F], 0x3099, 0x309A); +my @digit_1_0 = ( + [0x0030,0x0039], [0x0660,0x0669], [0x06F0,0x06F9], [0x0966,0x096F], [0x09E6,0x09EF], [0x0A66,0x0A6F], + [0x0AE6,0x0AEF], [0x0B66,0x0B6F], [0x0BE7,0x0BEF], [0x0C66,0x0C6F], [0x0CE6,0x0CEF], [0x0D66,0x0D6F], + [0x0E50,0x0E59], [0x0ED0,0x0ED9], [0x0F20,0x0F29]); +my @extender_1_0 = ( + 0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005, [0x3031,0x3035], [0x309D,0x309E], [0x30FC,0x30FE]); +my @sname_1_1 = ( + "[:A-Z_a-z]", [0xC0,0xD6], [0xD8,0xF6], [0xF8,0x2FF], [0x370,0x37D], [0x37F,0x1FFF], + [0x200C,0x200D], [0x2070,0x218F], [0x2C00,0x2FEF], [0x3001,0xD7FF], [0xF900,0xFDCF], [0xFDF0,0xFFFD], [0x10000,0xEFFFF]); + +set("WHITE", @white); +set("NEW_LINE_1_0", 0xA, 
0xD); +set("NEW_LINE_1_1", 0xA, 0xD, 0x85, 0x2028); +set("DIGIT", "[0-9]"); +set("XDIGIT", "[0-9a-fA-F]"); +set("VALID_1_0", @white, [0x20,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); +set("VALID_1_1", [0x1,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); +set("UNRESTRICTED_1_1", @white, [0x20,0x7E], 0x85, [0xA0,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); +set("PUBID", 0x20, 0xD, 0xA, "[-a-zA-Z0-9'()+,./:=?:!*#@\$_%]"); +set("ENC_SNAME", "[a-zA-Z]"); +set("ENC_NAME", "[-a-zA-Z0-9._]"); +set("SNAME_1_0", "[_:]", @base_char_1_0, @ideographic_1_0); +set("NAME_1_0", "[-_:.]", @base_char_1_0, @ideographic_1_0, @combining_char_1_0, @digit_1_0, @extender_1_0); +set("SNAME_1_1", @sname_1_1); +set("NAME_1_1", @sname_1_1, "[-.0-9]", 0xB7, [0x0300,0x036F], [0x203F,0x2040]); +set("GT", "[>]"); + +find_cls(); +gen_enum(); +gen_tabs(); + +sub set { + my $id = shift; + $ids{$id} = scalar keys(%ids) if !defined($ids{$id}); + my $mask = 1 << $ids{$id}; + foreach my $i (@_) { + if (ref($i) eq "ARRAY") { + my $j = $i->[0]; + for (; $j <= $i->[1] && $j < 0x10000; $j++) { $cat[$j] |= $mask; } + for (; $j <= $i->[1]; $j += 0x10000) { $lcat[$j >> 16] |= $mask; } + } + elsif ($i =~ /^\[/) { for (my $j=0; $j < 128; $j++) { if (chr($j) =~ /$i/) { $cat[$j] |= $mask; } } } + else { $cat[$i] |= $mask; } + } +} + +sub find_cls { + foreach (my $i=0; $i<@cat; $i++) { $cls{$cat[$i]} = scalar keys(%cls) if !defined($cls{$cat[$i]}); } + foreach (my $i=0; $i<@lcat; $i++) { $cls{$lcat[$i]} = scalar keys(%cls) if !defined($cls{$lcat[$i]}); } +} + +sub gen_enum { + print "enum xml_char_type {\n"; + foreach my $id (sort keys %ids) { + my $mask = 0; + foreach my $i (keys %cls) { + $mask |= 1 << $cls{$i} if $cls{$i} && ($i & (1 << $ids{$id})); + } + printf " XML_CHAR_%-20s = 0x%08x,\n", $id, $mask; + } + print "};\n\n"; +} + +sub gen_tabs { + my @tab = (); + my %hash = (); + print "static const uns xml_char_tab1[] = {\n "; + for (my $t=0; $t<256; $t++) { + my $i = $t * 256; + my @x = (); + for (my $j=0; 
$j<256; $j += 32) { + push @x, join(",", map($cls{$_}, @cat[$i+$j..$i+$j+31])); + } + my $sub = " " . join(",\n ", @x); + if (!defined($hash{$sub})) { + $hash{$sub} = 256 * scalar @tab; + push @tab, $sub; + } + printf("0x%x", $hash{$sub}); + print((~$t & 15) ? "," : ($t < 255) ? ",\n " : "\n};\n\n"); + } + + print "static const byte xml_char_tab2[] = {\n"; + print join(",\n\n", @tab); + print "\n};\n\n"; + + my @l = (); + for (my $i=0; $i<0x11; $i++) { + push @l, sprintf("%d", $cls{$lcat[$i]}); + } + print "static const byte xml_char_tab3[] = {" . join(",", @l) . "};\n"; +} diff --git a/sherlock/xml/xml-ucat.pl b/sherlock/xml/xml-ucat.pl deleted file mode 100755 index eeb948e6..00000000 --- a/sherlock/xml/xml-ucat.pl +++ /dev/null @@ -1,157 +0,0 @@ -#!/usr/bin/perl -# -# UCW Library -- Character map for the XML parser -# -# (c) 2007 Pavel Charvat -# -# This software may be freely distributed and used according to the terms -# of the GNU Lesser General Public License. -# - -my @cat = (); -my @lcat = (); -my %ids = (); -my %cls = (); -for (my $i = 0; $i < 0x10000; $i++) { $cat[$i] = 0; } -for (my $i = 0; $i < 0x11; $i++) { $lcat[$i] = 0; } - -my @white = (0x9, 0xA, 0xD, 0x20); -my @base_char_1_0 = ( - [0x0041,0x005A], [0x0061,0x007A], [0x00C0,0x00D6], [0x00D8,0x00F6], [0x00F8,0x00FF], [0x0100,0x0131], - [0x0134,0x013E], [0x0141,0x0148], [0x014A,0x017E], [0x0180,0x01C3], [0x01CD,0x01F0], [0x01F4,0x01F5], - [0x01FA,0x0217], [0x0250,0x02A8], [0x02BB,0x02C1], 0x0386, [0x0388,0x038A], 0x038C, [0x038E,0x03A1], - [0x03A3,0x03CE], [0x03D0,0x03D6], 0x03DA, 0x03DC, 0x03DE, 0x03E0, [0x03E2,0x03F3], [0x0401,0x040C], - [0x040E,0x044F], [0x0451,0x045C], [0x045E,0x0481], [0x0490,0x04C4], [0x04C7,0x04C8], [0x04CB,0x04CC], - [0x04D0,0x04EB], [0x04EE,0x04F5], [0x04F8,0x04F9], [0x0531,0x0556], 0x0559, [0x0561,0x0586], [0x05D0,0x05EA], - [0x05F0,0x05F2], [0x0621,0x063A], [0x0641,0x064A], [0x0671,0x06B7], [0x06BA,0x06BE], [0x06C0,0x06CE], - [0x06D0,0x06D3], 0x06D5, [0x06E5,0x06E6], 
[0x0905,0x0939], 0x093D, [0x0958,0x0961], [0x0985,0x098C], - [0x098F,0x0990], [0x0993,0x09A8], [0x09AA,0x09B0], 0x09B2, [0x09B6,0x09B9], [0x09DC,0x09DD], [0x09DF,0x09E1], - [0x09F0,0x09F1], [0x0A05,0x0A0A], [0x0A0F,0x0A10], [0x0A13,0x0A28], [0x0A2A,0x0A30], [0x0A32,0x0A33], - [0x0A35,0x0A36], [0x0A38,0x0A39], [0x0A59,0x0A5C], 0x0A5E, [0x0A72,0x0A74], [0x0A85,0x0A8B], 0x0A8D, - [0x0A8F,0x0A91], [0x0A93,0x0AA8], [0x0AAA,0x0AB0], [0x0AB2,0x0AB3], [0x0AB5,0x0AB9], 0x0ABD, 0x0AE0, - [0x0B05,0x0B0C], [0x0B0F,0x0B10], [0x0B13,0x0B28], [0x0B2A,0x0B30], [0x0B32,0x0B33], [0x0B36,0x0B39], - 0x0B3D, [0x0B5C,0x0B5D], [0x0B5F,0x0B61], [0x0B85,0x0B8A], [0x0B8E,0x0B90], [0x0B92,0x0B95], [0x0B99,0x0B9A], - 0x0B9C, [0x0B9E,0x0B9F], [0x0BA3,0x0BA4], [0x0BA8,0x0BAA], [0x0BAE,0x0BB5], [0x0BB7,0x0BB9], [0x0C05,0x0C0C], - [0x0C0E,0x0C10], [0x0C12,0x0C28], [0x0C2A,0x0C33], [0x0C35,0x0C39], [0x0C60,0x0C61], [0x0C85,0x0C8C], - [0x0C8E,0x0C90], [0x0C92,0x0CA8], [0x0CAA,0x0CB3], [0x0CB5,0x0CB9], 0x0CDE, [0x0CE0,0x0CE1], [0x0D05,0x0D0C], - [0x0D0E,0x0D10], [0x0D12,0x0D28], [0x0D2A,0x0D39], [0x0D60,0x0D61], [0x0E01,0x0E2E], 0x0E30, [0x0E32,0x0E33], - [0x0E40,0x0E45], [0x0E81,0x0E82], 0x0E84, [0x0E87,0x0E88], 0x0E8A, 0x0E8D, [0x0E94,0x0E97], [0x0E99,0x0E9F], - [0x0EA1,0x0EA3], 0x0EA5, 0x0EA7, [0x0EAA,0x0EAB], [0x0EAD,0x0EAE], 0x0EB0, [0x0EB2,0x0EB3], 0x0EBD, - [0x0EC0,0x0EC4], [0x0F40,0x0F47], [0x0F49,0x0F69], [0x10A0,0x10C5], [0x10D0,0x10F6], 0x1100, [0x1102,0x1103], - [0x1105,0x1107], 0x1109, [0x110B,0x110C], [0x110E,0x1112], 0x113C, 0x113E, 0x1140, 0x114C, 0x114E, 0x1150, - [0x1154,0x1155], 0x1159, [0x115F,0x1161], 0x1163, 0x1165, 0x1167, 0x1169, [0x116D,0x116E], [0x1172,0x1173], - 0x1175, 0x119E, 0x11A8, 0x11AB, [0x11AE,0x11AF], [0x11B7,0x11B8], 0x11BA, [0x11BC,0x11C2], 0x11EB, 0x11F0, - 0x11F9, [0x1E00,0x1E9B], [0x1EA0,0x1EF9], [0x1F00,0x1F15], [0x1F18,0x1F1D], [0x1F20,0x1F45], [0x1F48,0x1F4D], - [0x1F50,0x1F57], 0x1F59, 0x1F5B, 0x1F5D, [0x1F5F,0x1F7D], [0x1F80,0x1FB4], [0x1FB6,0x1FBC], 
0x1FBE, - [0x1FC2,0x1FC4], [0x1FC6,0x1FCC], [0x1FD0,0x1FD3], [0x1FD6,0x1FDB], [0x1FE0,0x1FEC], [0x1FF2,0x1FF4], - [0x1FF6,0x1FFC], 0x2126, [0x212A,0x212B], 0x212E, [0x2180,0x2182], [0x3041,0x3094], [0x30A1,0x30FA], - [0x3105,0x312C], [0xAC00,0xD7A3]); -my @ideographic_1_0 = ([0x4E00,0x9FA5], 0x3007, [0x3021,0x3029]); -my @combining_char_1_0 = ( - [0x0300,0x0345], [0x0360,0x0361], [0x0483,0x0486], [0x0591,0x05A1], [0x05A3,0x05B9], [0x05BB,0x05BD], - 0x05BF, [0x05C1,0x05C2], 0x05C4, [0x064B,0x0652], 0x0670, [0x06D6,0x06DC], [0x06DD,0x06DF], [0x06E0,0x06E4], - [0x06E7,0x06E8], [0x06EA,0x06ED], [0x0901,0x0903], 0x093C, [0x093E,0x094C], 0x094D, [0x0951,0x0954], - [0x0962,0x0963], [0x0981,0x0983], 0x09BC, 0x09BE, 0x09BF, [0x09C0,0x09C4], [0x09C7,0x09C8], [0x09CB,0x09CD], - 0x09D7, [0x09E2,0x09E3], 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, [0x0A40,0x0A42], [0x0A47,0x0A48], [0x0A4B,0x0A4D], - [0x0A70,0x0A71], [0x0A81,0x0A83], 0x0ABC, [0x0ABE,0x0AC5], [0x0AC7,0x0AC9], [0x0ACB,0x0ACD], [0x0B01,0x0B03], - 0x0B3C, [0x0B3E,0x0B43], [0x0B47,0x0B48], [0x0B4B,0x0B4D], [0x0B56,0x0B57], [0x0B82,0x0B83], [0x0BBE,0x0BC2], - [0x0BC6,0x0BC8], [0x0BCA,0x0BCD], 0x0BD7, [0x0C01,0x0C03], [0x0C3E,0x0C44], [0x0C46,0x0C48], [0x0C4A,0x0C4D], - [0x0C55,0x0C56], [0x0C82,0x0C83], [0x0CBE,0x0CC4], [0x0CC6,0x0CC8], [0x0CCA,0x0CCD], [0x0CD5,0x0CD6], - [0x0D02,0x0D03], [0x0D3E,0x0D43], [0x0D46,0x0D48], [0x0D4A,0x0D4D], 0x0D57, 0x0E31, [0x0E34,0x0E3A], - [0x0E47,0x0E4E], 0x0EB1, [0x0EB4,0x0EB9], [0x0EBB,0x0EBC], [0x0EC8,0x0ECD], [0x0F18,0x0F19], 0x0F35, - 0x0F37, 0x0F39, 0x0F3E, 0x0F3F, [0x0F71,0x0F84], [0x0F86,0x0F8B], [0x0F90,0x0F95], 0x0F97, [0x0F99,0x0FAD], - [0x0FB1,0x0FB7], 0x0FB9, [0x20D0,0x20DC], 0x20E1, [0x302A,0x302F], 0x3099, 0x309A); -my @digit_1_0 = ( - [0x0030,0x0039], [0x0660,0x0669], [0x06F0,0x06F9], [0x0966,0x096F], [0x09E6,0x09EF], [0x0A66,0x0A6F], - [0x0AE6,0x0AEF], [0x0B66,0x0B6F], [0x0BE7,0x0BEF], [0x0C66,0x0C6F], [0x0CE6,0x0CEF], [0x0D66,0x0D6F], - [0x0E50,0x0E59], [0x0ED0,0x0ED9], 
[0x0F20,0x0F29]); -my @extender_1_0 = ( - 0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005, [0x3031,0x3035], [0x309D,0x309E], [0x30FC,0x30FE]); -my @sname_1_1 = ( - "[:A-Z_a-z]", [0xC0,0xD6], [0xD8,0xF6], [0xF8,0x2FF], [0x370,0x37D], [0x37F,0x1FFF], - [0x200C,0x200D], [0x2070,0x218F], [0x2C00,0x2FEF], [0x3001,0xD7FF], [0xF900,0xFDCF], [0xFDF0,0xFFFD], [0x10000,0xEFFFF]); - -set("WHITE", @white); -set("NEW_LINE_1_0", 0xA, 0xD); -set("NEW_LINE_1_1", 0xA, 0xD, 0x85, 0x2028); -set("DIGIT", "[0-9]"); -set("XDIGIT", "[0-9a-fA-F]"); -set("VALID_1_0", @white, [0x20,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); -set("VALID_1_1", [0x1,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); -set("UNRESTRICTED_1_1", @white, [0x20,0x7E], 0x85, [0xA0,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]); -set("PUBID", 0x20, 0xD, 0xA, "[-a-zA-Z0-9'()+,./:=?:!*#@\$_%]"); -set("ENC_SNAME", "[a-zA-Z]"); -set("ENC_NAME", "[-a-zA-Z0-9._]"); -set("SNAME_1_0", "[_:]", @base_char_1_0, @ideographic_1_0); -set("NAME_1_0", "[-_:.]", @base_char_1_0, @ideographic_1_0, @combining_char_1_0, @digit_1_0, @extender_1_0); -set("SNAME_1_1", @sname_1_1); -set("NAME_1_1", @sname_1_1, "[-.0-9]", 0xB7, [0x0300,0x036F], [0x203F,0x2040]); -set("DECL", @white, [0x20,0x7E]); -set("GT", "[>]"); - -print "/* Automatically generated by xml-ucat.pl */\n\n"; -find_cls(); -gen_enum(); -gen_tabs(); - -sub set { - my $id = shift; - $ids{$id} = scalar keys(%ids) if !defined($ids{$id}); - my $mask = 1 << $ids{$id}; - foreach my $i (@_) { - if (ref($i) eq "ARRAY") { - my $j = $i->[0]; - for (; $j <= $i->[1] && $j < 0x10000; $j++) { $cat[$j] |= $mask; } - for (; $j <= $i->[1]; $j += 0x10000) { $lcat[$j >> 16] |= $mask; } - } - elsif ($i =~ /^\[/) { for (my $j=0; $j < 128; $j++) { if (chr($j) =~ /$i/) { $cat[$j] |= $mask; } } } - else { $cat[$i] |= $mask; } - } -} - -sub find_cls { - foreach (my $i=0; $i<@cat; $i++) { $cls{$cat[$i]} = scalar keys(%cls) if !defined($cls{$cat[$i]}); } - foreach (my $i=0; $i<@lcat; $i++) { 
$cls{$lcat[$i]} = scalar keys(%cls) if !defined($cls{$lcat[$i]}); } -} - -sub gen_enum { - print "enum xml_char_type {\n"; - foreach my $id (sort keys %ids) { - my $mask = 0; - foreach my $i (keys %cls) { - $mask |= 1 << $cls{$i} if $cls{$i} && ($i & (1 << $ids{$id})); - } - printf " XML_CHAR_%-20s = 0x%08x,\n", $id, $mask; - } - print "};\n\n"; -} - -sub gen_tabs { - my @tab = (); - my %hash = (); - print "static const uns xml_char_tab1[] = {\n "; - for (my $t=0; $t<256; $t++) { - my $i = $t * 256; - my @x = (); - for (my $j=0; $j<256; $j += 32) { - push @x, join(",", map($cls{$_}, @cat[$i+$j..$i+$j+31])); - } - my $sub = " " . join(",\n ", @x); - if (!defined($hash{$sub})) { - $hash{$sub} = 256 * scalar @tab; - push @tab, $sub; - } - printf("0x%x", $hash{$sub}); - print((~$t & 15) ? "," : ($t < 255) ? ",\n " : "\n};\n\n"); - } - - print "static const byte xml_char_tab2[] = {\n"; - print join(",\n\n", @tab); - print "\n};\n\n"; - - my @l = (); - for (my $i=0; $i<0x11; $i++) { - push @l, sprintf("%d", $cls{$lcat[$i]}); - } - print "static const byte xml_char_tab3[] = {" . join(",", @l) . "};\n"; -} diff --git a/sherlock/xml/xml.c b/sherlock/xml/xml.c index 2de0e818..1d9f0f45 100644 --- a/sherlock/xml/xml.c +++ b/sherlock/xml/xml.c @@ -27,6 +27,7 @@ #include "charset/charconv.h" #include "charset/fb-charconv.h" #include "sherlock/xml/xml.h" +#include "sherlock/xml/dtd.h" #include @@ -97,7 +98,7 @@ xml_fatal(struct xml_context *ctx, const char *format, ...) 
/*** Charecter categorization ***/ -#include "obj/sherlock/xml/xml-ucat.h" +#include "obj/sherlock/xml/unicat.h" static inline uns xml_char_cat(uns c) @@ -941,6 +942,45 @@ xml_dtd_find_pent(struct xml_context *ctx, char *name) XML_HASH_GIVE_ALLOC #include "lib/hashtable.h" +/* Element sons */ + +struct xml_dtd_enodes_table; + +static inline uns +xml_dtd_enodes_hash(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem) +{ + return hash_pointer(parent) ^ hash_pointer(elem); +} + +static inline int +xml_dtd_enodes_eq(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *parent1, struct xml_dtd_elem *elem1, struct xml_dtd_elem_node *parent2, struct xml_dtd_elem *elem2) +{ + return (parent1 == parent2) && (elem1 == elem2); +} + +static inline void +xml_dtd_enodes_init_key(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *node, struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem) +{ + node->parent = parent; + node->elem = elem; +} + +#define HASH_PREFIX(x) xml_dtd_enodes_##x +#define HASH_NODE struct xml_dtd_elem_node +#define HASH_KEY_COMPLEX(x) x parent, x elem +#define HASH_KEY_DECL struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem +#define HASH_GIVE_HASHFN +#define HASH_GIVE_EQ +#define HASH_GIVE_INIT_KEY +#define HASH_TABLE_DYNAMIC +#define HASH_ZERO_FILL +#define HASH_WANT_FIND +#define HASH_WANT_NEW +#define HASH_GIVE_ALLOC +#define HASH_TABLE_ALLOC +XML_HASH_GIVE_ALLOC +#include "lib/hashtable.h" + /* Element attributes */ struct xml_dtd_attrs_table; @@ -1070,6 +1110,7 @@ xml_dtd_init(struct xml_context *ctx) xml_dtd_ents_init(dtd->tab_pents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table))); xml_dtd_notns_init(dtd->tab_notns = xml_hash_new(pool, sizeof(struct xml_dtd_notns_table))); xml_dtd_elems_init(dtd->tab_elems = xml_hash_new(pool, sizeof(struct xml_dtd_elems_table))); + xml_dtd_enodes_init(dtd->tab_enodes = xml_hash_new(pool, sizeof(struct 
xml_dtd_enodes_table))); xml_dtd_attrs_init(dtd->tab_attrs = xml_hash_new(pool, sizeof(struct xml_dtd_attrs_table))); xml_dtd_evals_init(dtd->tab_evals = xml_hash_new(pool, sizeof(struct xml_dtd_evals_table))); xml_dtd_enotns_init(dtd->tab_enotns = xml_hash_new(pool, sizeof(struct xml_dtd_enotns_table))); @@ -1412,7 +1453,7 @@ xml_parse_external_id(struct xml_context *ctx, struct xml_ext_id *eid, uns allow xml_fatal(ctx, "Expected an external ID"); } -/* DTD: Notation declaration */ +/* DTD: */ static void xml_parse_notation_decl(struct xml_context *ctx) @@ -1441,6 +1482,8 @@ xml_parse_notation_decl(struct xml_context *ctx) xml_dec(ctx); } +/* DTD: */ + static void xml_parse_entity_decl(struct xml_context *ctx) { @@ -1530,6 +1573,314 @@ xml_parse_entity_decl(struct xml_context *ctx) xml_dec(ctx); } +/* DTD: */ + +static void +xml_parse_element_decl(struct xml_context *ctx) +{ + /* Elementdecl ::= '' + * Already parsed: 'dtd; + struct xml_dtd_elem *elem = xml_dtd_elems_lookup(dtd->tab_elems, name); + if (elem->flags & XML_DTD_ELEM_DECLARED) + xml_fatal(ctx, "Element <%s> already declared", name); + + /* contentspec ::= 'EMPTY' | 'ANY' | Mixed | children */ + uns c = xml_peek_char(ctx); + if (c == 'E') + { + xml_parse_seq(ctx, "EMPTY"); + elem->type = XML_DTD_ELEM_EMPTY; + } + else if (c == 'A') + { + xml_parse_seq(ctx, "ANY"); + elem->type = XML_DTD_ELEM_ANY; + } + else if (c == '(') + { + xml_skip_char(ctx); + xml_inc(ctx); + xml_parse_dtd_white(ctx, 0); + struct xml_dtd_elem_node *parent = elem->node = mp_alloc_zero(dtd->pool, sizeof(*parent)); + if (xml_peek_char(ctx) == '#') + { + /* Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | '(' S? '#PCDATA' S? 
')' */ + xml_skip_char(ctx); + xml_parse_seq(ctx, "PCDATA"); + elem->type = XML_DTD_ELEM_MIXED; + parent->type = XML_DTD_ELEM_PCDATA; + while (1) + { + xml_parse_dtd_white(ctx, 0); + if ((c = xml_get_char(ctx)) == ')') + break; + else if (c != '|') + xml_fatal_expected(ctx, ')'); + xml_parse_dtd_white(ctx, 0); + struct xml_dtd_elem *son_elem = xml_dtd_elems_lookup(dtd->tab_elems, xml_parse_name(ctx)); + if (xml_dtd_enodes_find(dtd->tab_enodes, parent, son_elem)) + xml_error(ctx, "Duplicate content '%s'", son_elem->name); + else + { + struct xml_dtd_elem_node *son = xml_dtd_enodes_new(dtd->tab_enodes, parent, son_elem); + slist_add_tail(&parent->sons, &son->n); + } + } + xml_dec(ctx); + if (xml_peek_char(ctx) == '*') + { + xml_skip_char(ctx); + parent->occur = XML_DTD_ELEM_OCCUR_MULT; + } + else if (!slist_head(&parent->sons)) + parent->occur = XML_DTD_ELEM_OCCUR_ONCE; + else + xml_fatal_expected(ctx, '*'); + } + else + { + /* children ::= (choice | seq) ('?' | '*' | '+')? + * cp ::= (Name | choice | seq) ('?' | '*' | '+')? + * choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')' + * seq ::= '(' S? cp ( S? ',' S? cp )* S? 
')' */ + + elem->type = XML_DTD_ELEM_CHILDREN; + parent->type = XML_DTD_ELEM_PCDATA; /* placeholder until the first ',' or '|' (or the closing ')') decides the group type */ + uns c; + goto first; /* the opening '(' has already been consumed, so start by reading a name */ + + while (1) + { + /* After name */ + xml_parse_dtd_white(ctx, 0); + if ((c = xml_get_char(ctx)) == ')') + { + xml_dec(ctx); + if (parent->type == XML_DTD_ELEM_PCDATA) + parent->type = XML_DTD_ELEM_SEQ; /* a group without any operator is stored as a sequence */ + if ((c = xml_get_char(ctx)) == '?') + parent->occur = XML_DTD_ELEM_OCCUR_OPT; + else if (c == '*') + parent->occur = XML_DTD_ELEM_OCCUR_MULT; + else if (c == '+') + parent->occur = XML_DTD_ELEM_OCCUR_PLUS; + else + { + xml_unget_char(ctx); + parent->occur = XML_DTD_ELEM_OCCUR_ONCE; + } + if (!parent->parent) + break; /* the outermost group has been closed */ + parent = parent->parent; /* return to the enclosing group */ + continue; + } + else if (c == '|') + { + if (parent->type == XML_DTD_ELEM_PCDATA) + parent->type = XML_DTD_ELEM_OR; + else if (parent->type != XML_DTD_ELEM_OR) + xml_fatal(ctx, "Mixed operators in the list of element children"); + } + else if (c == ',') + { + if (parent->type == XML_DTD_ELEM_PCDATA) + parent->type = XML_DTD_ELEM_SEQ; + else if (parent->type != XML_DTD_ELEM_SEQ) + xml_fatal(ctx, "Mixed operators in the list of element children"); + } + else if (c == '(') + { + xml_inc(ctx); + struct xml_dtd_elem_node *son = mp_alloc_zero(dtd->pool, sizeof(*son)); + son->parent = parent; + slist_add_tail(&parent->sons, &son->n); + parent = son; /* descend: the following names and operators belong to the nested group (was a no-op self-assignment) */ + son->type = XML_DTD_ELEM_PCDATA; /* node-type placeholder, operator not known yet (was XML_DTD_ELEM_MIXED, an element-type constant numerically equal to XML_DTD_ELEM_OR) */ + } + else + xml_unget_char(ctx); + + /* Before name */ + xml_parse_dtd_white(ctx, 0); +first:; /* NOTE(review): a '(' 
directly after ',' or '|' is not recognized here, xml_parse_name() is called unconditionally */ + struct xml_dtd_elem *son_elem = xml_dtd_elems_lookup(dtd->tab_elems, xml_parse_name(ctx)); + // FIXME: duplicates, occurrence + //struct xml_dtd_elem_node *son = xml_dtd_enodes_new(dtd->tab_enodes, parent, son_elem); + struct xml_dtd_elem_node *son = mp_alloc_zero(dtd->pool, sizeof(*son)); + son->parent = parent; + son->elem = son_elem; + slist_add_tail(&parent->sons, &son->n); + } + } + } + else + xml_fatal(ctx, "Expected element content specification"); + + xml_parse_dtd_white(ctx, 0); + xml_parse_char(ctx, '>'); + xml_dec(ctx); +} + 
+static char * +xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr UNUSED) +{ + uns quote = xml_parse_quote(ctx); + xml_push(ctx); + struct fastbuf *out = ctx->value; + while (1) + { + uns c = xml_get_char(ctx); + if (c == '&') + { + xml_inc(ctx); + xml_parse_ge_ref(ctx, out); + } + else if (c == quote) // FIXME: beware quotes inside parsed + break; + else if (c == '<') + xml_error(ctx, "Attribute value must not contain '<'"); + else + bput_utf8_32(out, c); + } + xml_pop(ctx); + bputc(out, 0); + fbgrow_rewind(out); + char *value = mp_memdup(ctx->pool, out->bptr, out->bstop - out->bptr); + // FIXME: check value constraints / normalize value + fbgrow_reset(out); + return value; +} + +static void +xml_parse_attr_list_decl(struct xml_context *ctx) +{ + /* AttlistDecl ::= '' + * AttDef ::= S Name S AttType S DefaultDecl + * Already parsed: 'dtd->tab_elems, xml_parse_name(ctx)); + + while (xml_parse_dtd_white(ctx, 0) && xml_peek_char(ctx) != '>') + { + char *name = xml_parse_name(ctx); + struct xml_dtd_attr *attr = xml_dtd_attrs_find(ctx->dtd->tab_attrs, elem, name); + uns ignored = 0; + if (attr) + { + xml_warn(ctx, "Duplicate attribute definition"); + ignored++; + } + else + attr = xml_dtd_attrs_new(ctx->dtd->tab_attrs, elem, name); + xml_parse_dtd_white(ctx, 1); + if (xml_peek_char(ctx) == '(') + { + xml_skip_char(ctx); // FIXME: xml_inc/dec ? 
+ if (!ignored) + attr->type = XML_ATTR_ENUM; + do + { + xml_parse_dtd_white(ctx, 0); + char *value = xml_parse_nmtoken(ctx); + if (!ignored) + if (xml_dtd_evals_find(ctx->dtd->tab_evals, attr, value)) + xml_error(ctx, "Duplicate enumeration value"); + else + xml_dtd_evals_new(ctx->dtd->tab_evals, attr, value); + xml_parse_dtd_white(ctx, 0); + } + while (xml_get_char(ctx) == '|'); + xml_unget_char(ctx); + xml_parse_char(ctx, ')'); + } + else + { + char *type = xml_parse_name(ctx); + enum xml_dtd_attribute_type t; + if (!strcmp(type, "CDATA")) + t = XML_ATTR_CDATA; + else if (!strcmp(type, "ID")) + t = XML_ATTR_ID; + else if (!strcmp(type, "IDREF")) + t = XML_ATTR_IDREF; + else if (!strcmp(type, "IDREFS")) + t = XML_ATTR_IDREFS; + else if (!strcmp(type, "ENTITY")) + t = XML_ATTR_ENTITY; + else if (!strcmp(type, "ENTITIES")) + t = XML_ATTR_ENTITIES; + else if (!strcmp(type, "NMTOKEN")) + t = XML_ATTR_NMTOKEN; + else if (!strcmp(type, "NMTOKENS")) + t = XML_ATTR_NMTOKENS; + else if (!strcmp(type, "NOTATION")) + { + if (elem->type == XML_DTD_ELEM_EMPTY) + xml_fatal(ctx, "Empty element must not have notation attribute"); + // FIXME: An element type MUST NOT have more than one NOTATION attribute specified. 
+ t = XML_ATTR_NOTATION; + xml_parse_dtd_white(ctx, 1); + xml_parse_char(ctx, '('); + do + { + xml_parse_dtd_white(ctx, 0); + struct xml_dtd_notn *n = xml_dtd_notns_lookup(ctx->dtd->tab_notns, xml_parse_name(ctx)); + if (!ignored) + if (xml_dtd_enotns_find(ctx->dtd->tab_enotns, attr, n)) + xml_error(ctx, "Duplicate enumerated notation"); + else + xml_dtd_enotns_new(ctx->dtd->tab_enotns, attr, n); + xml_parse_dtd_white(ctx, 0); + } + while (xml_get_char(ctx) == '|'); + xml_unget_char(ctx); + xml_parse_char(ctx, ')'); + } + else + xml_fatal(ctx, "Unknown attribute type"); + if (!ignored) + attr->type = t; + } + xml_parse_dtd_white(ctx, 1); + enum xml_dtd_attribute_default def = XML_ATTR_NONE; + if (xml_get_char(ctx) == '#') + switch (xml_peek_char(ctx)) + { + case 'R': + xml_parse_seq(ctx, "REQUIRED"); + def = XML_ATTR_REQUIRED; + break; + case 'I': + xml_parse_seq(ctx, "IMPLIED"); + def = XML_ATTR_IMPLIED; + break; + case 'F': + xml_parse_seq(ctx, "FIXED"); + def = XML_ATTR_FIXED; + xml_parse_dtd_white(ctx, 1); + break; + default: + xml_fatal(ctx, "Expected a modifier for default attribute value"); + } + else + xml_unget_char(ctx); + if (def != XML_ATTR_REQUIRED && def != XML_ATTR_IMPLIED) + { + char *v = xml_parse_attr_value(ctx, attr); + if (!ignored) + attr->default_value = v; + } + if (!ignored) + attr->default_mode = def; + } + xml_skip_char(ctx); + xml_dec(ctx); +} + /* DTD: Internal subset */ static void @@ -1565,14 +1916,14 @@ xml_parse_internal_subset(struct xml_context *ctx) else if (c == 'L') { xml_parse_seq(ctx, "EMENT"); - // FIXME: Element + xml_parse_element_decl(ctx); } else goto invalid_markup; break; case 'A': xml_parse_seq(ctx, "TTLIST"); - // FIXME: AttList + xml_parse_attr_list_decl(ctx); break; default: goto invalid_markup; @@ -1813,209 +2164,6 @@ xml_pop_element(struct xml_context *ctx) #endif } -static void -xml_parse_element_decl(struct xml_context *ctx) -{ - // FIXME - mp_push(ctx->pool); - xml_parse_seq(ctx, "'); - mp_pop(ctx->pool); -} - 
-#if 0 -static void -xml_parse_attr_list_decl(struct xml_context *ctx) -{ - /* AttlistDecl ::= '' - * AttDef ::= S Name S AttType S DefaultDecl */ - xml_parse_seq(ctx, "ATTLIST"); - xml_parse_white(ctx, 1); - struct xml_dtd_elem *e = xml_dtd_elems_lookup(ctx->dtd->tab_elems, xml_parse_name(ctx)); - e->attlist_declared = 1; - - while (xml_parse_white(ctx, 0) && xml_get_char(ctx) != '>') - { - xml_unget_char(ctx); - char *name = xml_parse_name(ctx); - struct xml_dtd_attr *a = xml_dtd_attrs_find(ctx->dtd->tab_attrs, e, name); - uns ignored = 0; - if (a) - { - xml_warn(ctx, "Duplicate attribute definition"); - ignored++; - } - else - a = xml_dtd_attrs_new(ctx->dtd->tab_attrs, e, name); - xml_parse_white(ctx, 1); - if (xml_get_char(ctx) == '(') - { - if (!ignored) - a->type = XML_ATTR_ENUM; - do - { - xml_parse_white(ctx, 0); - char *value = xml_parse_nmtoken(ctx); - if (!ignored) - if (xml_dtd_evals_find(ctx->dtd->tab_evals, a, value)) - xml_error(ctx, "Duplicate enumeration value"); - else - xml_dtd_evals_new(ctx->dtd->tab_evals, a, value); - xml_parse_white(ctx, 0); - } - while (xml_get_char(ctx) == '|'); - xml_unget_char(ctx); - xml_parse_char(ctx, ')'); - } - else - { - xml_unget_char(ctx); - char *type = xml_parse_name(ctx); - enum xml_dtd_attribute_type t; - if (!strcmp(type, "CDATA")) - t = XML_ATTR_CDATA; - else if (!strcmp(type, "ID")) - t = XML_ATTR_ID; - else if (!strcmp(type, "IDREF")) - t = XML_ATTR_IDREF; - else if (!strcmp(type, "IDREFS")) - t = XML_ATTR_IDREFS; - else if (!strcmp(type, "ENTITY")) - t = XML_ATTR_ENTITY; - else if (!strcmp(type, "ENTITIES")) - t = XML_ATTR_ENTITIES; - else if (!strcmp(type, "NMTOKEN")) - t = XML_ATTR_NMTOKEN; - else if (!strcmp(type, "NMTOKENS")) - t = XML_ATTR_NMTOKENS; - else if (!strcmp(type, "NOTATION")) - { - t = XML_ATTR_NOTATION; - xml_parse_white(ctx, 1); - xml_parse_char(ctx, '('); - do - { - xml_parse_white(ctx, 0); - struct xml_dtd_notn *n = xml_dtd_notns_lookup(ctx->dtd->tab_notns, xml_parse_name(ctx)); - if 
(!ignored) - if (xml_dtd_enotns_find(ctx->dtd->tab_enotns, a, n)) - xml_error(ctx, "Duplicate enumerated notation"); - else - xml_dtd_enotns_new(ctx->dtd->tab_enotns, a, n); - xml_parse_white(ctx, 0); - } - while (xml_get_char(ctx) == '|'); - xml_unget_char(ctx); - xml_parse_char(ctx, ')'); - } - else - xml_fatal(ctx, "Unknown attribute type"); - if (!ignored) - a->type = t; - } - xml_parse_white(ctx, 1); - enum xml_dtd_attribute_default def = XML_ATTR_NONE; - if (xml_get_char(ctx) == '#') - switch (xml_get_char(ctx)) - { - case 'R': - xml_parse_seq(ctx, "EQUIRED"); - def = XML_ATTR_REQUIRED; - break; - case 'I': - xml_parse_seq(ctx, "MPLIED"); - def = XML_ATTR_IMPLIED; - break; - case 'F': - xml_parse_seq(ctx, "IXED"); - def = XML_ATTR_FIXED; - break; - default: - xml_fatal(ctx, "Expected a modifier for default attribute value"); - } - else - xml_unget_char(ctx); - if (def != XML_ATTR_REQUIRED && def != XML_ATTR_IMPLIED) - { - xml_parse_system_literal(ctx); - // FIXME - } - } -} -#endif - static void xml_parse_doctype_decl(struct xml_context *ctx) { diff --git a/sherlock/xml/xml.h b/sherlock/xml/xml.h index 87cdff91..7e83f65a 100644 --- a/sherlock/xml/xml.h +++ b/sherlock/xml/xml.h @@ -7,8 +7,8 @@ * of the GNU Lesser General Public License. 
*/ -#ifndef _SHERLOCK_XML_H -#define _SHERLOCK_XML_H +#ifndef _SHERLOCK_XML_XML_H +#define _SHERLOCK_XML_XML_H #include "lib/clists.h" #include "lib/slists.h" @@ -204,132 +204,6 @@ struct xml_context { void (*unparsed_entity_decl)(struct xml_context *ctx); }; -/*** Document Type Definition (DTD) ***/ - -struct xml_dtd { - struct mempool *pool; /* Memory pool where to allocate DTD */ - slist gents; /* Link list of general entities */ - slist pents; /* Link list of parapeter entities */ - slist notns; /* Link list of notations */ - slist elems; /* Link list of elements */ - void *tab_gents; /* Hash table of general entities */ - void *tab_pents; /* Hash table of parameter entities */ - void *tab_notns; /* Hash table of notations */ - void *tab_elems; /* Hash table of elements */ - void *tab_attrs; /* Hash table of element attributes */ - void *tab_evals; /* Hash table of enumerated attribute values */ - void *tab_enotns; /* hash table of enumerated attribute notations */ -}; - -/* Notations */ - -enum xml_dtd_notn_flags { - XML_DTD_NOTN_DECLARED = 0x1, /* The notation has been declared (interbal usage) */ -}; - -struct xml_dtd_notn { - snode n; /* Node in xml_dtd.notns */ - uns flags; /* XML_DTD_NOTN_x */ - char *name; /* Notation name */ - struct xml_ext_id eid; /* External id */ -}; - -/* Entities */ - -enum xml_dtd_ent_flags { - XML_DTD_ENT_DECLARED = 0x1, /* The entity has been declared (internal usage) */ - XML_DTD_ENT_VISITED = 0x2, /* Cycle detection (internal usage) */ - XML_DTD_ENT_PARAMETER = 0x4, /* Parameter entity, general otherwise */ - XML_DTD_ENT_EXTERNAL = 0x8, /* External entity, internal otherwise */ - XML_DTD_ENT_UNPARSED = 0x10, /* Unparsed entity, parsed otherwise */ - XML_DTD_ENT_TRIVIAL = 0x20, /* Replacement text is a sequence of characters and character references */ -}; - -struct xml_dtd_ent { - snode n; /* Node in xml_dtd.[gp]ents */ - uns flags; /* XML_DTD_ENT_x */ - char *name; /* Entity name */ - char *text; /* Replacement text / 
expanded replacement text (XML_DTD_ENT_TRIVIAL) */ - uns len; /* Text length */ - struct xml_ext_id eid; /* External ID */ - struct xml_dtd_notn *notn; /* Notation (XML_DTD_ENT_UNPARSED only) */ -}; - -/* Elements */ - -enum xml_dtd_elem_flags { - XML_DTD_ELEM_DECLARED = 0x1, /* The element has been declared (internal usage) */ -}; - -struct xml_dtd_elem { - snode n; - uns flags; - char *name; - struct xml_dtd_elem_node *node; -}; - -struct xml_dtd_elem_node { - snode n; - struct xml_dtd_elem_node *parent; - slist sons; - uns type; - uns occur; -}; - -enum xml_dtd_elem_node_type { - XML_DTD_ELEM_PCDATA, - XML_DTD_ELEM_SEQ, - XML_DTD_ELEM_OR, -}; - -enum xml_dtd_elem_node_occur { - XML_DTD_ELEM_OCCUR_ONCE, - XML_DTD_ELEM_OCCUR_OPT, - XML_DTD_ELEM_OCCUR_MULT, - XML_DTD_ELEM_OCCUR_PLUS, -}; - -/* Attributes */ - - -enum xml_dtd_attribute_default { - XML_ATTR_NONE, - XML_ATTR_REQUIRED, - XML_ATTR_IMPLIED, - XML_ATTR_FIXED, -}; - -enum xml_dtd_attribute_type { - XML_ATTR_CDATA, - XML_ATTR_ID, - XML_ATTR_IDREF, - XML_ATTR_IDREFS, - XML_ATTR_ENTITY, - XML_ATTR_ENTITIES, - XML_ATTR_NMTOKEN, - XML_ATTR_NMTOKENS, - XML_ATTR_ENUM, - XML_ATTR_NOTATION, -}; - -struct xml_dtd_attr { - char *name; - struct xml_dtd_elem *elem; - enum xml_dtd_attribute_type type; - enum xml_dtd_attribute_default default_mode; - char *default_value; -}; - -struct xml_dtd_eval { - struct xml_dtd_attr *attr; - char *val; -}; - -struct xml_dtd_enotn { - struct xml_dtd_attr *attr; - struct xml_dtd_notn *notn; -}; - void xml_init(struct xml_context *ctx); void xml_cleanup(struct xml_context *ctx); void xml_set_source(struct xml_context *ctx, struct fastbuf *fb);