]> mj.ucw.cz Git - libucw.git/commitdiff
XML: Updates to the XML parser.
authorPavel Charvat <pavel.charvat@netcentrum.cz>
Tue, 11 Dec 2007 11:17:47 +0000 (12:17 +0100)
committerPavel Charvat <pavel.charvat@netcentrum.cz>
Tue, 11 Dec 2007 11:17:47 +0000 (12:17 +0100)
sherlock/xml/Makefile
sherlock/xml/dtd.h [new file with mode: 0644]
sherlock/xml/libshxml.pc [new file with mode: 0644]
sherlock/xml/unicat.pl [new file with mode: 0755]
sherlock/xml/xml-ucat.pl [deleted file]
sherlock/xml/xml.c
sherlock/xml/xml.h

index e3acc1812f88e02d41c40ebbdd233505a13ccd5f..f721b500d154bf9045304c273b7f20e3577dea74 100644 (file)
@@ -3,17 +3,21 @@
 
 DIRS+=sherlock/xml
 
-LIBSH_MODS+=xml/xml
-LIBSH_XML_INCLUDES=xml.h
+LIBSHXML_MODS=xml
+LIBSHXML_INCLUDES=xml.h dtd.h
 
-$(o)/sherlock/xml/xml-t: $(LIBSH) $(LIBCHARSET)
-$(o)/sherlock/xml/xml.o: $(o)/sherlock/xml/xml-ucat.h
-$(o)/sherlock/xml/xml-ucat.h: $(s)/sherlock/xml/xml-ucat.pl
+LIBSHXML_MOD_PATHS=$(addprefix $(o)/sherlock/xml/,$(LIBSHXML_MODS))
+
+$(o)/sherlock/xml/libshxml.a: $(addsuffix .o,$(LIBSHXML_MOD_PATHS))
+$(o)/sherlock/xml/libshxml.so: $(addsuffix .oo,$(LIBSHXML_MOD_PATHS))
+$(o)/sherlock/xml/libshxml.pc: $(LIBUCW) $(LIBCHARSET)
+
+$(o)/sherlock/xml/xml-t: $(LIBSHXML)
+$(o)/sherlock/xml/xml.o: $(o)/sherlock/xml/unicat.h
+$(o)/sherlock/xml/unicat.h: $(s)/sherlock/xml/unicat.pl
        $(M)GEN $@
        $(Q)$< >$@
 
 API_INCLUDES+=$(o)/sherlock/xml/.include-stamp
-$(o)/sherlock/xml/.include-stamp: $(addprefix $(s)/sherlock/xml/,$(LIBSH_XML_INCLUDES))
+$(o)/sherlock/xml/.include-stamp: $(addprefix $(s)/sherlock/xml/,$(LIBSHXML_INCLUDES))
 $(o)/sherlock/xml/.include-stamp: IDST=sherlock/xml
-
-include $(s)/sherlock/perl/Makefile
diff --git a/sherlock/xml/dtd.h b/sherlock/xml/dtd.h
new file mode 100644 (file)
index 0000000..bf95b87
--- /dev/null
@@ -0,0 +1,148 @@
+/*
+ *     Sherlock Library -- A simple XML parser
+ *
+ *     (c) 2007 Pavel Charvat <pchar@ucw.cz>
+ *
+ *     This software may be freely distributed and used according to the terms
+ *     of the GNU Lesser General Public License.
+ */
+
+#ifndef _SHERLOCK_XML_DTD_H
+#define _SHERLOCK_XML_DTD_H
+
+#include "sherlock/xml/xml.h"
+
+struct xml_dtd {                       /* Parsed DTD: lists of declarations plus hash tables for lookups */
+  struct mempool *pool;                        /* Memory pool where to allocate DTD */
+  slist gents;                         /* Linked list of general entities */
+  slist pents;                         /* Linked list of parameter entities */
+  slist notns;                         /* Linked list of notations */
+  slist elems;                         /* Linked list of elements */
+  void *tab_gents;                     /* Hash table of general entities */
+  void *tab_pents;                     /* Hash table of parameter entities */
+  void *tab_notns;                     /* Hash table of notations */
+  void *tab_elems;                     /* Hash table of elements */
+  void *tab_enodes;                    /* Hash table of element sons */
+  void *tab_attrs;                     /* Hash table of element attributes */
+  void *tab_evals;                     /* Hash table of enumerated attribute values */
+  void *tab_enotns;                    /* Hash table of enumerated attribute notations */
+};
+
+/* Notations */
+
+enum xml_dtd_notn_flags {             /* Bits of xml_dtd_notn.flags */
+  XML_DTD_NOTN_DECLARED = 0x1,         /* The notation has been declared (internal usage) */
+};
+
+struct xml_dtd_notn {                 /* One notation known to the DTD */
+  snode n;                             /* Node in xml_dtd.notns */
+  uns flags;                           /* XML_DTD_NOTN_x */
+  char *name;                          /* Notation name */
+  struct xml_ext_id eid;               /* External ID */
+};
+
+/* Entities */
+
+enum xml_dtd_ent_flags {              /* Bits of xml_dtd_ent.flags */
+  XML_DTD_ENT_DECLARED = 0x1,          /* The entity has been declared (internal usage) */
+  XML_DTD_ENT_VISITED = 0x2,           /* Cycle detection (internal usage) */
+  XML_DTD_ENT_PARAMETER = 0x4,         /* Parameter entity, general otherwise */
+  XML_DTD_ENT_EXTERNAL = 0x8,          /* External entity, internal otherwise */
+  XML_DTD_ENT_UNPARSED = 0x10,         /* Unparsed entity, parsed otherwise */
+  XML_DTD_ENT_TRIVIAL = 0x20,          /* Replacement text is a sequence of characters and character references */
+};
+
+struct xml_dtd_ent {                  /* One entity; general or parameter according to XML_DTD_ENT_PARAMETER */
+  snode n;                             /* Node in xml_dtd.[gp]ents */
+  uns flags;                           /* XML_DTD_ENT_x */
+  char *name;                          /* Entity name */
+  char *text;                          /* Replacement text / expanded replacement text (XML_DTD_ENT_TRIVIAL) */
+  uns len;                             /* Text length */
+  struct xml_ext_id eid;               /* External ID */
+  struct xml_dtd_notn *notn;           /* Notation (XML_DTD_ENT_UNPARSED only) */
+};
+
+/* Elements */
+
+enum xml_dtd_elem_flags {             /* Bits of xml_dtd_elem.flags */
+  XML_DTD_ELEM_DECLARED = 0x1,         /* The element has been declared (internal usage) */
+};
+
+enum xml_dtd_elem_type {              /* Values of xml_dtd_elem.type, one per contentspec alternative */
+  XML_DTD_ELEM_EMPTY,                  /* Content specification 'EMPTY' */
+  XML_DTD_ELEM_ANY,                    /* Content specification 'ANY' */
+  XML_DTD_ELEM_MIXED,                  /* Mixed content: '(' '#PCDATA' ... ')' */
+  XML_DTD_ELEM_CHILDREN,               /* Presumably the 'children' content model -- TODO confirm against the parser */
+};
+
+struct xml_dtd_elem {                 /* One element type known to the DTD */
+  snode n;                             /* Presumably a node in xml_dtd.elems -- TODO confirm */
+  uns flags;                           /* XML_DTD_ELEM_DECLARED */
+  uns type;                            /* enum xml_dtd_elem_type */
+  char *name;                          /* Element name */
+  struct xml_dtd_elem_node *node;      /* Root node of the content model (allocated when the contentspec starts with '(') */
+};
+
+struct xml_dtd_elem_node {            /* Node of an element's content model; hashed in xml_dtd.tab_enodes */
+  snode n;                             /* Presumably a node in the parent's sons list -- TODO confirm */
+  struct xml_dtd_elem_node *parent;    /* Parent node; first part of the hash key */
+  struct xml_dtd_elem *elem;           /* Referenced element; second part of the hash key */
+  slist sons;                          /* Presumably the list of child nodes -- TODO confirm */
+  uns type;                            /* enum xml_dtd_elem_node_type */
+  uns occur;                           /* Presumably enum xml_dtd_elem_node_occur -- TODO confirm */
+};
+
+enum xml_dtd_elem_node_type {         /* Values of xml_dtd_elem_node.type */
+  XML_DTD_ELEM_PCDATA,                 /* Root node of mixed content ('#PCDATA') */
+  XML_DTD_ELEM_SEQ,                    /* Presumably a sequence group (a, b, ...) -- TODO confirm */
+  XML_DTD_ELEM_OR,                     /* Presumably a choice group (a | b | ...) -- TODO confirm */
+};
+
+enum xml_dtd_elem_node_occur {        /* Occurrence indicator of a content model node; meanings below are assumed from the names */
+  XML_DTD_ELEM_OCCUR_ONCE,             /* Presumably no indicator: exactly once */
+  XML_DTD_ELEM_OCCUR_OPT,              /* Presumably '?': optional */
+  XML_DTD_ELEM_OCCUR_MULT,             /* Presumably '*': zero or more */
+  XML_DTD_ELEM_OCCUR_PLUS,             /* Presumably '+': one or more */
+};
+
+/* Attributes */
+
+enum xml_dtd_attribute_default {      /* Type of xml_dtd_attr.default_mode; meanings below are assumed from the names */
+  XML_ATTR_NONE,                       /* Presumably a plain default value without a keyword */
+  XML_ATTR_REQUIRED,                   /* Presumably '#REQUIRED' */
+  XML_ATTR_IMPLIED,                    /* Presumably '#IMPLIED' */
+  XML_ATTR_FIXED,                      /* Presumably '#FIXED' followed by a value */
+};
+
+enum xml_dtd_attribute_type {         /* Type of xml_dtd_attr.type; names mirror the attribute type keywords of XML */
+  XML_ATTR_CDATA,
+  XML_ATTR_ID,
+  XML_ATTR_IDREF,
+  XML_ATTR_IDREFS,
+  XML_ATTR_ENTITY,
+  XML_ATTR_ENTITIES,
+  XML_ATTR_NMTOKEN,
+  XML_ATTR_NMTOKENS,
+  XML_ATTR_ENUM,                       /* Presumably an enumerated list of values (see struct xml_dtd_eval) */
+  XML_ATTR_NOTATION,                   /* Presumably an enumerated list of notations (see struct xml_dtd_enotn) */
+};
+
+struct xml_dtd_attr {                 /* One attribute of an element */
+  char *name;                          /* Attribute name */
+  struct xml_dtd_elem *elem;           /* Element the attribute belongs to */
+  enum xml_dtd_attribute_type type;    /* Attribute type */
+  enum xml_dtd_attribute_default default_mode; /* Kind of default declaration */
+  char *default_value;                 /* Default value; presumably unset when none was given -- TODO confirm */
+};
+
+struct xml_dtd_eval {                 /* One enumerated attribute value (see xml_dtd.tab_evals) */
+  struct xml_dtd_attr *attr;           /* Attribute the value belongs to */
+  char *val;                           /* The value itself */
+};
+
+struct xml_dtd_enotn {                /* One enumerated attribute notation (see xml_dtd.tab_enotns) */
+  struct xml_dtd_attr *attr;           /* Attribute the notation belongs to */
+  struct xml_dtd_notn *notn;           /* The notation itself */
+};
+
+#endif
diff --git a/sherlock/xml/libshxml.pc b/sherlock/xml/libshxml.pc
new file mode 100644 (file)
index 0000000..c2172b3
--- /dev/null
@@ -0,0 +1,11 @@
+# pkg-config metadata for libshxml
+
+libdir=@LIBDIR@
+incdir=.
+
+Name: libshxml
+Description: XML parser for Sherlock project
+Version: @SHERLOCK_VERSION@
+Cflags: -I${incdir}
+Libs: -L${libdir} -lshxml
+Requires: @DEPS@
diff --git a/sherlock/xml/unicat.pl b/sherlock/xml/unicat.pl
new file mode 100755 (executable)
index 0000000..fc39bba
--- /dev/null
@@ -0,0 +1,155 @@
+#!/usr/bin/perl
+#
+#      UCW Library -- Character map for the XML parser
+#
+#      (c) 2007 Pavel Charvat <pchar@ucw.cz>
+#
+#      This software may be freely distributed and used according to the terms
+#      of the GNU Lesser General Public License.
+#
+
+my @cat = ();                         # Category bitmask of every code point below 0x10000
+my @lcat = ();                        # Category bitmask of every 64K plane (index is code point >> 16)
+my %ids = ();                         # Category name -> bit number within the bitmasks
+my %cls = ();                         # Distinct bitmask -> class number, filled by find_cls()
+for (my $i = 0; $i < 0x10000; $i++) { $cat[$i] = 0; }
+for (my $i = 0; $i < 0x11; $i++) { $lcat[$i] = 0; }
+
+my @white = (0x9, 0xA, 0xD, 0x20);    # Tab, line feed, carriage return, space
+my @base_char_1_0 = (
+  [0x0041,0x005A], [0x0061,0x007A], [0x00C0,0x00D6], [0x00D8,0x00F6], [0x00F8,0x00FF], [0x0100,0x0131],
+  [0x0134,0x013E], [0x0141,0x0148], [0x014A,0x017E], [0x0180,0x01C3], [0x01CD,0x01F0], [0x01F4,0x01F5],
+  [0x01FA,0x0217], [0x0250,0x02A8], [0x02BB,0x02C1], 0x0386, [0x0388,0x038A], 0x038C, [0x038E,0x03A1],
+  [0x03A3,0x03CE], [0x03D0,0x03D6], 0x03DA, 0x03DC, 0x03DE, 0x03E0, [0x03E2,0x03F3], [0x0401,0x040C],
+  [0x040E,0x044F], [0x0451,0x045C], [0x045E,0x0481], [0x0490,0x04C4], [0x04C7,0x04C8], [0x04CB,0x04CC],
+  [0x04D0,0x04EB], [0x04EE,0x04F5], [0x04F8,0x04F9], [0x0531,0x0556], 0x0559, [0x0561,0x0586], [0x05D0,0x05EA],
+  [0x05F0,0x05F2], [0x0621,0x063A], [0x0641,0x064A], [0x0671,0x06B7], [0x06BA,0x06BE], [0x06C0,0x06CE],
+  [0x06D0,0x06D3], 0x06D5, [0x06E5,0x06E6], [0x0905,0x0939], 0x093D, [0x0958,0x0961], [0x0985,0x098C],
+  [0x098F,0x0990], [0x0993,0x09A8], [0x09AA,0x09B0], 0x09B2, [0x09B6,0x09B9], [0x09DC,0x09DD], [0x09DF,0x09E1],
+  [0x09F0,0x09F1], [0x0A05,0x0A0A], [0x0A0F,0x0A10], [0x0A13,0x0A28], [0x0A2A,0x0A30], [0x0A32,0x0A33],
+  [0x0A35,0x0A36], [0x0A38,0x0A39], [0x0A59,0x0A5C], 0x0A5E, [0x0A72,0x0A74], [0x0A85,0x0A8B], 0x0A8D,
+  [0x0A8F,0x0A91], [0x0A93,0x0AA8], [0x0AAA,0x0AB0], [0x0AB2,0x0AB3], [0x0AB5,0x0AB9], 0x0ABD, 0x0AE0,
+  [0x0B05,0x0B0C], [0x0B0F,0x0B10], [0x0B13,0x0B28], [0x0B2A,0x0B30], [0x0B32,0x0B33], [0x0B36,0x0B39],
+  0x0B3D, [0x0B5C,0x0B5D], [0x0B5F,0x0B61], [0x0B85,0x0B8A], [0x0B8E,0x0B90], [0x0B92,0x0B95], [0x0B99,0x0B9A],
+  0x0B9C, [0x0B9E,0x0B9F], [0x0BA3,0x0BA4], [0x0BA8,0x0BAA], [0x0BAE,0x0BB5], [0x0BB7,0x0BB9], [0x0C05,0x0C0C],
+  [0x0C0E,0x0C10], [0x0C12,0x0C28], [0x0C2A,0x0C33], [0x0C35,0x0C39], [0x0C60,0x0C61], [0x0C85,0x0C8C],
+  [0x0C8E,0x0C90], [0x0C92,0x0CA8], [0x0CAA,0x0CB3], [0x0CB5,0x0CB9], 0x0CDE, [0x0CE0,0x0CE1], [0x0D05,0x0D0C],
+  [0x0D0E,0x0D10], [0x0D12,0x0D28], [0x0D2A,0x0D39], [0x0D60,0x0D61], [0x0E01,0x0E2E], 0x0E30, [0x0E32,0x0E33],
+  [0x0E40,0x0E45], [0x0E81,0x0E82], 0x0E84, [0x0E87,0x0E88], 0x0E8A, 0x0E8D, [0x0E94,0x0E97], [0x0E99,0x0E9F],
+  [0x0EA1,0x0EA3], 0x0EA5, 0x0EA7, [0x0EAA,0x0EAB], [0x0EAD,0x0EAE], 0x0EB0, [0x0EB2,0x0EB3], 0x0EBD,
+  [0x0EC0,0x0EC4], [0x0F40,0x0F47], [0x0F49,0x0F69], [0x10A0,0x10C5], [0x10D0,0x10F6], 0x1100, [0x1102,0x1103],
+  [0x1105,0x1107], 0x1109, [0x110B,0x110C], [0x110E,0x1112], 0x113C, 0x113E, 0x1140, 0x114C, 0x114E, 0x1150,
+  [0x1154,0x1155], 0x1159, [0x115F,0x1161], 0x1163, 0x1165, 0x1167, 0x1169, [0x116D,0x116E], [0x1172,0x1173],
+  0x1175, 0x119E, 0x11A8, 0x11AB, [0x11AE,0x11AF], [0x11B7,0x11B8], 0x11BA, [0x11BC,0x11C2], 0x11EB, 0x11F0,
+  0x11F9, [0x1E00,0x1E9B], [0x1EA0,0x1EF9], [0x1F00,0x1F15], [0x1F18,0x1F1D], [0x1F20,0x1F45], [0x1F48,0x1F4D],
+  [0x1F50,0x1F57], 0x1F59, 0x1F5B, 0x1F5D, [0x1F5F,0x1F7D], [0x1F80,0x1FB4], [0x1FB6,0x1FBC], 0x1FBE,
+  [0x1FC2,0x1FC4], [0x1FC6,0x1FCC], [0x1FD0,0x1FD3], [0x1FD6,0x1FDB], [0x1FE0,0x1FEC], [0x1FF2,0x1FF4],
+  [0x1FF6,0x1FFC], 0x2126, [0x212A,0x212B], 0x212E, [0x2180,0x2182], [0x3041,0x3094], [0x30A1,0x30FA],
+  [0x3105,0x312C], [0xAC00,0xD7A3]);
+my @ideographic_1_0 = ([0x4E00,0x9FA5], 0x3007, [0x3021,0x3029]);
+my @combining_char_1_0 = (
+  [0x0300,0x0345], [0x0360,0x0361], [0x0483,0x0486], [0x0591,0x05A1], [0x05A3,0x05B9], [0x05BB,0x05BD],
+  0x05BF, [0x05C1,0x05C2], 0x05C4, [0x064B,0x0652], 0x0670, [0x06D6,0x06DC], [0x06DD,0x06DF], [0x06E0,0x06E4],
+  [0x06E7,0x06E8], [0x06EA,0x06ED], [0x0901,0x0903], 0x093C, [0x093E,0x094C], 0x094D, [0x0951,0x0954],
+  [0x0962,0x0963], [0x0981,0x0983], 0x09BC, 0x09BE, 0x09BF, [0x09C0,0x09C4], [0x09C7,0x09C8], [0x09CB,0x09CD],
+  0x09D7, [0x09E2,0x09E3], 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, [0x0A40,0x0A42], [0x0A47,0x0A48], [0x0A4B,0x0A4D],
+  [0x0A70,0x0A71], [0x0A81,0x0A83], 0x0ABC, [0x0ABE,0x0AC5], [0x0AC7,0x0AC9], [0x0ACB,0x0ACD], [0x0B01,0x0B03],
+  0x0B3C, [0x0B3E,0x0B43], [0x0B47,0x0B48], [0x0B4B,0x0B4D], [0x0B56,0x0B57], [0x0B82,0x0B83], [0x0BBE,0x0BC2],
+  [0x0BC6,0x0BC8], [0x0BCA,0x0BCD], 0x0BD7, [0x0C01,0x0C03], [0x0C3E,0x0C44], [0x0C46,0x0C48], [0x0C4A,0x0C4D],
+  [0x0C55,0x0C56], [0x0C82,0x0C83], [0x0CBE,0x0CC4], [0x0CC6,0x0CC8], [0x0CCA,0x0CCD], [0x0CD5,0x0CD6],
+  [0x0D02,0x0D03], [0x0D3E,0x0D43], [0x0D46,0x0D48], [0x0D4A,0x0D4D], 0x0D57, 0x0E31, [0x0E34,0x0E3A],
+  [0x0E47,0x0E4E], 0x0EB1, [0x0EB4,0x0EB9], [0x0EBB,0x0EBC], [0x0EC8,0x0ECD], [0x0F18,0x0F19], 0x0F35,
+  0x0F37, 0x0F39, 0x0F3E, 0x0F3F, [0x0F71,0x0F84], [0x0F86,0x0F8B], [0x0F90,0x0F95], 0x0F97, [0x0F99,0x0FAD],
+  [0x0FB1,0x0FB7], 0x0FB9, [0x20D0,0x20DC], 0x20E1, [0x302A,0x302F], 0x3099, 0x309A);
+my @digit_1_0 = (
+  [0x0030,0x0039], [0x0660,0x0669], [0x06F0,0x06F9], [0x0966,0x096F], [0x09E6,0x09EF], [0x0A66,0x0A6F],
+  [0x0AE6,0x0AEF], [0x0B66,0x0B6F], [0x0BE7,0x0BEF], [0x0C66,0x0C6F], [0x0CE6,0x0CEF], [0x0D66,0x0D6F],
+  [0x0E50,0x0E59], [0x0ED0,0x0ED9], [0x0F20,0x0F29]);
+my @extender_1_0 = (
+  0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005, [0x3031,0x3035], [0x309D,0x309E], [0x30FC,0x30FE]);
+my @sname_1_1 = (
+  "[:A-Z_a-z]", [0xC0,0xD6], [0xD8,0xF6], [0xF8,0x2FF], [0x370,0x37D], [0x37F,0x1FFF],
+  [0x200C,0x200D], [0x2070,0x218F], [0x2C00,0x2FEF], [0x3001,0xD7FF], [0xF900,0xFDCF], [0xFDF0,0xFFFD], [0x10000,0xEFFFF]);
+
+# Character categories. Every distinct name passed to set() gets one bit in the category masks.
+set("WHITE", @white);
+set("NEW_LINE_1_0", 0xA, 0xD);
+set("NEW_LINE_1_1", 0xA, 0xD, 0x85, 0x2028);
+set("DIGIT", "[0-9]");
+set("XDIGIT", "[0-9a-fA-F]");
+set("VALID_1_0", @white, [0x20,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]);
+set("VALID_1_1", [0x1,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]);
+set("UNRESTRICTED_1_1", @white, [0x20,0x7E], 0x85, [0xA0,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]);
+# PubidChar of the XML specification: the punctuation set contains ';' (and ':' only once)
+set("PUBID", 0x20, 0xD, 0xA, "[-a-zA-Z0-9'()+,./:=?;!*#@\$_%]");
+set("ENC_SNAME", "[a-zA-Z]");
+set("ENC_NAME", "[-a-zA-Z0-9._]");
+set("SNAME_1_0", "[_:]", @base_char_1_0, @ideographic_1_0);
+set("NAME_1_0", "[-_:.]", @base_char_1_0, @ideographic_1_0, @combining_char_1_0, @digit_1_0, @extender_1_0);
+set("SNAME_1_1", @sname_1_1);
+set("NAME_1_1", @sname_1_1, "[-.0-9]", 0xB7, [0x0300,0x036F], [0x203F,0x2040]);
+set("GT", "[>]");
+
+# Number the distinct category combinations, then print the enum and the lookup tables
+find_cls();
+gen_enum();
+gen_tabs();
+
+# set(NAME, ITEM...) -- add category NAME to all characters described by the ITEMs.
+# An ITEM is [first,last] (inclusive range of code points), a string starting with "["
+# (a regex character class tried against the characters 0..127), or a single code point.
+# Code points below 0x10000 are recorded one by one in @cat; higher ones only per 64K
+# plane in @lcat, so touching any part of a plane marks the whole plane.
+sub set {
+  my $id = shift;
+  # The first use of a name allocates the next free bit
+  $ids{$id} = scalar keys(%ids) if !defined($ids{$id});
+  my $mask = 1 << $ids{$id};
+  foreach my $i (@_) {
+    if (ref($i) eq "ARRAY") {
+      my ($first, $last) = @$i;
+      next if $first > $last;
+      my $bmp_last = ($last < 0x10000) ? $last : 0xFFFF;
+      for (my $j = $first; $j <= $bmp_last; $j++) { $cat[$j] |= $mask; }
+      if ($last >= 0x10000) {
+        # Iterate over plane numbers instead of stepping the code point by 0x10000,
+        # otherwise the last plane is missed when the range does not start on a plane boundary
+        my $plane = ($first < 0x10000) ? 1 : $first >> 16;
+        for (; $plane <= ($last >> 16); $plane++) { $lcat[$plane] |= $mask; }
+      }
+    }
+    elsif ($i =~ /^\[/) { for (my $j=0; $j < 128; $j++) { if (chr($j) =~ /$i/) { $cat[$j] |= $mask; } } }
+    # A single supplementary code point belongs to the per-plane table, not to @cat
+    elsif ($i >= 0x10000) { $lcat[$i >> 16] |= $mask; }
+    else { $cat[$i] |= $mask; }
+  }
+}
+
+sub find_cls {
+  # Number the distinct category bitmasks in order of first appearance:
+  # all entries of @cat first, then those of @lcat.
+  foreach my $bits (@cat, @lcat) {
+    next if defined($cls{$bits});
+    $cls{$bits} = scalar keys(%cls);
+  }
+}
+
+sub gen_enum {
+  # Print enum xml_char_type: for every category, the set of class numbers
+  # (as a bitmask) whose characters belong to that category. Class 0 never contributes.
+  print "enum xml_char_type {\n";
+  foreach my $id (sort keys %ids) {
+    my $bit = 1 << $ids{$id};
+    my $mask = 0;
+    foreach my $bits (keys %cls) {
+      my $class = $cls{$bits};
+      next unless $class;
+      next unless $bits & $bit;
+      $mask |= 1 << $class;
+    }
+    printf "  XML_CHAR_%-20s = 0x%08x,\n", $id, $mask;
+  }
+  print "};\n\n";
+}
+
+sub gen_tabs {                        # Print xml_char_tab1 (block offsets), xml_char_tab2 (classes per character) and xml_char_tab3 (class per plane)
+  my @tab = ();                       # Distinct 256-character blocks in output order
+  my %hash = ();                      # Block text -> offset of that block in xml_char_tab2
+  print "static const uns xml_char_tab1[] = {\n  ";
+  for (my $t=0; $t<256; $t++) {       # One block per high byte of the code point
+    my $i = $t * 256;
+    my @x = ();
+    for (my $j=0; $j<256; $j += 32) { # Format the block as 8 rows of 32 class numbers
+      push @x, join(",", map($cls{$_}, @cat[$i+$j..$i+$j+31]));
+    }
+    my $sub = "  " . join(",\n  ", @x);
+    if (!defined($hash{$sub})) {      # Blocks with identical text are stored only once
+      $hash{$sub} = 256 * scalar @tab;
+      push @tab, $sub;
+    }
+    printf("0x%x", $hash{$sub});
+    print((~$t & 15) ? "," : ($t < 255) ? ",\n  " : "\n};\n\n");   # New line after every 16 entries, closing brace after the last
+  }
+
+  print "static const byte xml_char_tab2[] = {\n";
+  print join(",\n\n", @tab);
+  print "\n};\n\n";
+
+  my @l = ();
+  for (my $i=0; $i<0x11; $i++) {      # One class number per plane
+    push @l, sprintf("%d", $cls{$lcat[$i]});
+  }
+  print "static const byte xml_char_tab3[] = {" . join(",", @l) . "};\n";
+}
diff --git a/sherlock/xml/xml-ucat.pl b/sherlock/xml/xml-ucat.pl
deleted file mode 100755 (executable)
index eeb948e..0000000
+++ /dev/null
@@ -1,157 +0,0 @@
-#!/usr/bin/perl
-#
-#      UCW Library -- Character map for the XML parser
-#
-#      (c) 2007 Pavel Charvat <pchar@ucw.cz>
-#
-#      This software may be freely distributed and used according to the terms
-#      of the GNU Lesser General Public License.
-#
-
-my @cat = ();
-my @lcat = ();
-my %ids = ();
-my %cls = ();
-for (my $i = 0; $i < 0x10000; $i++) { $cat[$i] = 0; }
-for (my $i = 0; $i < 0x11; $i++) { $lcat[$i] = 0; }
-
-my @white = (0x9, 0xA, 0xD, 0x20);
-my @base_char_1_0 = (
-  [0x0041,0x005A], [0x0061,0x007A], [0x00C0,0x00D6], [0x00D8,0x00F6], [0x00F8,0x00FF], [0x0100,0x0131],
-  [0x0134,0x013E], [0x0141,0x0148], [0x014A,0x017E], [0x0180,0x01C3], [0x01CD,0x01F0], [0x01F4,0x01F5],
-  [0x01FA,0x0217], [0x0250,0x02A8], [0x02BB,0x02C1], 0x0386, [0x0388,0x038A], 0x038C, [0x038E,0x03A1],
-  [0x03A3,0x03CE], [0x03D0,0x03D6], 0x03DA, 0x03DC, 0x03DE, 0x03E0, [0x03E2,0x03F3], [0x0401,0x040C],
-  [0x040E,0x044F], [0x0451,0x045C], [0x045E,0x0481], [0x0490,0x04C4], [0x04C7,0x04C8], [0x04CB,0x04CC],
-  [0x04D0,0x04EB], [0x04EE,0x04F5], [0x04F8,0x04F9], [0x0531,0x0556], 0x0559, [0x0561,0x0586], [0x05D0,0x05EA],
-  [0x05F0,0x05F2], [0x0621,0x063A], [0x0641,0x064A], [0x0671,0x06B7], [0x06BA,0x06BE], [0x06C0,0x06CE],
-  [0x06D0,0x06D3], 0x06D5, [0x06E5,0x06E6], [0x0905,0x0939], 0x093D, [0x0958,0x0961], [0x0985,0x098C],
-  [0x098F,0x0990], [0x0993,0x09A8], [0x09AA,0x09B0], 0x09B2, [0x09B6,0x09B9], [0x09DC,0x09DD], [0x09DF,0x09E1],
-  [0x09F0,0x09F1], [0x0A05,0x0A0A], [0x0A0F,0x0A10], [0x0A13,0x0A28], [0x0A2A,0x0A30], [0x0A32,0x0A33],
-  [0x0A35,0x0A36], [0x0A38,0x0A39], [0x0A59,0x0A5C], 0x0A5E, [0x0A72,0x0A74], [0x0A85,0x0A8B], 0x0A8D,
-  [0x0A8F,0x0A91], [0x0A93,0x0AA8], [0x0AAA,0x0AB0], [0x0AB2,0x0AB3], [0x0AB5,0x0AB9], 0x0ABD, 0x0AE0,
-  [0x0B05,0x0B0C], [0x0B0F,0x0B10], [0x0B13,0x0B28], [0x0B2A,0x0B30], [0x0B32,0x0B33], [0x0B36,0x0B39],
-  0x0B3D, [0x0B5C,0x0B5D], [0x0B5F,0x0B61], [0x0B85,0x0B8A], [0x0B8E,0x0B90], [0x0B92,0x0B95], [0x0B99,0x0B9A],
-  0x0B9C, [0x0B9E,0x0B9F], [0x0BA3,0x0BA4], [0x0BA8,0x0BAA], [0x0BAE,0x0BB5], [0x0BB7,0x0BB9], [0x0C05,0x0C0C],
-  [0x0C0E,0x0C10], [0x0C12,0x0C28], [0x0C2A,0x0C33], [0x0C35,0x0C39], [0x0C60,0x0C61], [0x0C85,0x0C8C],
-  [0x0C8E,0x0C90], [0x0C92,0x0CA8], [0x0CAA,0x0CB3], [0x0CB5,0x0CB9], 0x0CDE, [0x0CE0,0x0CE1], [0x0D05,0x0D0C],
-  [0x0D0E,0x0D10], [0x0D12,0x0D28], [0x0D2A,0x0D39], [0x0D60,0x0D61], [0x0E01,0x0E2E], 0x0E30, [0x0E32,0x0E33],
-  [0x0E40,0x0E45], [0x0E81,0x0E82], 0x0E84, [0x0E87,0x0E88], 0x0E8A, 0x0E8D, [0x0E94,0x0E97], [0x0E99,0x0E9F],
-  [0x0EA1,0x0EA3], 0x0EA5, 0x0EA7, [0x0EAA,0x0EAB], [0x0EAD,0x0EAE], 0x0EB0, [0x0EB2,0x0EB3], 0x0EBD,
-  [0x0EC0,0x0EC4], [0x0F40,0x0F47], [0x0F49,0x0F69], [0x10A0,0x10C5], [0x10D0,0x10F6], 0x1100, [0x1102,0x1103],
-  [0x1105,0x1107], 0x1109, [0x110B,0x110C], [0x110E,0x1112], 0x113C, 0x113E, 0x1140, 0x114C, 0x114E, 0x1150,
-  [0x1154,0x1155], 0x1159, [0x115F,0x1161], 0x1163, 0x1165, 0x1167, 0x1169, [0x116D,0x116E], [0x1172,0x1173],
-  0x1175, 0x119E, 0x11A8, 0x11AB, [0x11AE,0x11AF], [0x11B7,0x11B8], 0x11BA, [0x11BC,0x11C2], 0x11EB, 0x11F0,
-  0x11F9, [0x1E00,0x1E9B], [0x1EA0,0x1EF9], [0x1F00,0x1F15], [0x1F18,0x1F1D], [0x1F20,0x1F45], [0x1F48,0x1F4D],
-  [0x1F50,0x1F57], 0x1F59, 0x1F5B, 0x1F5D, [0x1F5F,0x1F7D], [0x1F80,0x1FB4], [0x1FB6,0x1FBC], 0x1FBE,
-  [0x1FC2,0x1FC4], [0x1FC6,0x1FCC], [0x1FD0,0x1FD3], [0x1FD6,0x1FDB], [0x1FE0,0x1FEC], [0x1FF2,0x1FF4],
-  [0x1FF6,0x1FFC], 0x2126, [0x212A,0x212B], 0x212E, [0x2180,0x2182], [0x3041,0x3094], [0x30A1,0x30FA],
-  [0x3105,0x312C], [0xAC00,0xD7A3]);
-my @ideographic_1_0 = ([0x4E00,0x9FA5], 0x3007, [0x3021,0x3029]);
-my @combining_char_1_0 = (
-  [0x0300,0x0345], [0x0360,0x0361], [0x0483,0x0486], [0x0591,0x05A1], [0x05A3,0x05B9], [0x05BB,0x05BD],
-  0x05BF, [0x05C1,0x05C2], 0x05C4, [0x064B,0x0652], 0x0670, [0x06D6,0x06DC], [0x06DD,0x06DF], [0x06E0,0x06E4],
-  [0x06E7,0x06E8], [0x06EA,0x06ED], [0x0901,0x0903], 0x093C, [0x093E,0x094C], 0x094D, [0x0951,0x0954],
-  [0x0962,0x0963], [0x0981,0x0983], 0x09BC, 0x09BE, 0x09BF, [0x09C0,0x09C4], [0x09C7,0x09C8], [0x09CB,0x09CD],
-  0x09D7, [0x09E2,0x09E3], 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, [0x0A40,0x0A42], [0x0A47,0x0A48], [0x0A4B,0x0A4D],
-  [0x0A70,0x0A71], [0x0A81,0x0A83], 0x0ABC, [0x0ABE,0x0AC5], [0x0AC7,0x0AC9], [0x0ACB,0x0ACD], [0x0B01,0x0B03],
-  0x0B3C, [0x0B3E,0x0B43], [0x0B47,0x0B48], [0x0B4B,0x0B4D], [0x0B56,0x0B57], [0x0B82,0x0B83], [0x0BBE,0x0BC2],
-  [0x0BC6,0x0BC8], [0x0BCA,0x0BCD], 0x0BD7, [0x0C01,0x0C03], [0x0C3E,0x0C44], [0x0C46,0x0C48], [0x0C4A,0x0C4D],
-  [0x0C55,0x0C56], [0x0C82,0x0C83], [0x0CBE,0x0CC4], [0x0CC6,0x0CC8], [0x0CCA,0x0CCD], [0x0CD5,0x0CD6],
-  [0x0D02,0x0D03], [0x0D3E,0x0D43], [0x0D46,0x0D48], [0x0D4A,0x0D4D], 0x0D57, 0x0E31, [0x0E34,0x0E3A],
-  [0x0E47,0x0E4E], 0x0EB1, [0x0EB4,0x0EB9], [0x0EBB,0x0EBC], [0x0EC8,0x0ECD], [0x0F18,0x0F19], 0x0F35,
-  0x0F37, 0x0F39, 0x0F3E, 0x0F3F, [0x0F71,0x0F84], [0x0F86,0x0F8B], [0x0F90,0x0F95], 0x0F97, [0x0F99,0x0FAD],
-  [0x0FB1,0x0FB7], 0x0FB9, [0x20D0,0x20DC], 0x20E1, [0x302A,0x302F], 0x3099, 0x309A);
-my @digit_1_0 = (
-  [0x0030,0x0039], [0x0660,0x0669], [0x06F0,0x06F9], [0x0966,0x096F], [0x09E6,0x09EF], [0x0A66,0x0A6F],
-  [0x0AE6,0x0AEF], [0x0B66,0x0B6F], [0x0BE7,0x0BEF], [0x0C66,0x0C6F], [0x0CE6,0x0CEF], [0x0D66,0x0D6F],
-  [0x0E50,0x0E59], [0x0ED0,0x0ED9], [0x0F20,0x0F29]);
-my @extender_1_0 = (
-  0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005, [0x3031,0x3035], [0x309D,0x309E], [0x30FC,0x30FE]);
-my @sname_1_1 = (
-  "[:A-Z_a-z]", [0xC0,0xD6], [0xD8,0xF6], [0xF8,0x2FF], [0x370,0x37D], [0x37F,0x1FFF],
-  [0x200C,0x200D], [0x2070,0x218F], [0x2C00,0x2FEF], [0x3001,0xD7FF], [0xF900,0xFDCF], [0xFDF0,0xFFFD], [0x10000,0xEFFFF]);
-
-set("WHITE", @white);
-set("NEW_LINE_1_0", 0xA, 0xD);
-set("NEW_LINE_1_1", 0xA, 0xD, 0x85, 0x2028);
-set("DIGIT", "[0-9]");
-set("XDIGIT", "[0-9a-fA-F]");
-set("VALID_1_0", @white, [0x20,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]);
-set("VALID_1_1", [0x1,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]);
-set("UNRESTRICTED_1_1", @white, [0x20,0x7E], 0x85, [0xA0,0xD7FF], [0xE000,0xFFFD], [0x10000,0x10FFFF]);
-set("PUBID", 0x20, 0xD, 0xA, "[-a-zA-Z0-9'()+,./:=?:!*#@\$_%]");
-set("ENC_SNAME", "[a-zA-Z]");
-set("ENC_NAME", "[-a-zA-Z0-9._]");
-set("SNAME_1_0", "[_:]", @base_char_1_0, @ideographic_1_0);
-set("NAME_1_0", "[-_:.]", @base_char_1_0, @ideographic_1_0, @combining_char_1_0, @digit_1_0, @extender_1_0);
-set("SNAME_1_1", @sname_1_1);
-set("NAME_1_1", @sname_1_1, "[-.0-9]", 0xB7, [0x0300,0x036F], [0x203F,0x2040]);
-set("DECL", @white, [0x20,0x7E]);
-set("GT", "[>]");
-
-print "/* Automatically generated by xml-ucat.pl */\n\n";
-find_cls();
-gen_enum();
-gen_tabs();
-
-sub set {
-  my $id = shift;
-  $ids{$id} = scalar keys(%ids) if !defined($ids{$id});
-  my $mask = 1 << $ids{$id};
-  foreach my $i (@_) {
-    if (ref($i) eq "ARRAY") {
-      my $j = $i->[0];
-      for (; $j <= $i->[1] && $j < 0x10000; $j++) { $cat[$j] |= $mask; }
-      for (; $j <= $i->[1]; $j += 0x10000) { $lcat[$j >> 16] |= $mask; }
-    }
-    elsif ($i =~ /^\[/) { for (my $j=0; $j < 128; $j++) { if (chr($j) =~ /$i/) { $cat[$j] |= $mask; } } }
-    else { $cat[$i] |= $mask; }
-  }
-}
-
-sub find_cls {
-  foreach (my $i=0; $i<@cat; $i++) { $cls{$cat[$i]} = scalar keys(%cls) if !defined($cls{$cat[$i]}); }
-  foreach (my $i=0; $i<@lcat; $i++) { $cls{$lcat[$i]} = scalar keys(%cls) if !defined($cls{$lcat[$i]}); }
-}
-
-sub gen_enum {
-  print "enum xml_char_type {\n";
-  foreach my $id (sort keys %ids) {
-    my $mask = 0;
-    foreach my $i (keys %cls) {
-      $mask |= 1 << $cls{$i} if $cls{$i} && ($i & (1 << $ids{$id}));
-    }
-    printf "  XML_CHAR_%-20s = 0x%08x,\n", $id, $mask;
-  }
-  print "};\n\n";
-}
-
-sub gen_tabs {
-  my @tab = ();
-  my %hash = ();
-  print "static const uns xml_char_tab1[] = {\n  ";
-  for (my $t=0; $t<256; $t++) {
-    my $i = $t * 256;
-    my @x = ();
-    for (my $j=0; $j<256; $j += 32) {
-      push @x, join(",", map($cls{$_}, @cat[$i+$j..$i+$j+31]));
-    }
-    my $sub = "  " . join(",\n  ", @x);
-    if (!defined($hash{$sub})) {
-      $hash{$sub} = 256 * scalar @tab;
-      push @tab, $sub;
-    }
-    printf("0x%x", $hash{$sub});
-    print((~$t & 15) ? "," : ($t < 255) ? ",\n  " : "\n};\n\n");
-  }
-
-  print "static const byte xml_char_tab2[] = {\n";
-  print join(",\n\n", @tab);
-  print "\n};\n\n";
-
-  my @l = ();
-  for (my $i=0; $i<0x11; $i++) {
-    push @l, sprintf("%d", $cls{$lcat[$i]});
-  }
-  print "static const byte xml_char_tab3[] = {" . join(",", @l) . "};\n";
-}
index 2de0e818c5c58dd7bb0f93c552bdd3f34303cfdf..1d9f0f45cff5fd9b7cb5c90e306d85cda9f6d777 100644 (file)
@@ -27,6 +27,7 @@
 #include "charset/charconv.h"
 #include "charset/fb-charconv.h"
 #include "sherlock/xml/xml.h"
+#include "sherlock/xml/dtd.h"
 
 #include <setjmp.h>
 
@@ -97,7 +98,7 @@ xml_fatal(struct xml_context *ctx, const char *format, ...)
 
 /*** Charecter categorization ***/
 
-#include "obj/sherlock/xml/xml-ucat.h"
+#include "obj/sherlock/xml/unicat.h"
 
 static inline uns
 xml_char_cat(uns c)
@@ -941,6 +942,45 @@ xml_dtd_find_pent(struct xml_context *ctx, char *name)
 XML_HASH_GIVE_ALLOC
 #include "lib/hashtable.h"
 
+/* Element sons */
+
+struct xml_dtd_enodes_table;
+
+static inline uns
+xml_dtd_enodes_hash(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem)
+{
+  /* Hash of the (parent, elem) key: XOR of the hashes of both pointers */
+  uns parent_hash = hash_pointer(parent);
+  uns elem_hash = hash_pointer(elem);
+  return parent_hash ^ elem_hash;
+}
+
+static inline int
+xml_dtd_enodes_eq(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *parent1, struct xml_dtd_elem *elem1, struct xml_dtd_elem_node *parent2, struct xml_dtd_elem *elem2)
+{
+  /* Two keys are equal only if both the parent node and the element are the same objects */
+  if (parent1 != parent2)
+    return 0;
+  return elem1 == elem2;
+}
+
+static inline void
+xml_dtd_enodes_init_key(struct xml_dtd_enodes_table *tab UNUSED, struct xml_dtd_elem_node *node, struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem)
+{
+  /* Copy both halves of the (parent, elem) key into the node */
+  node->elem = elem;
+  node->parent = parent;
+}
+
+#define HASH_PREFIX(x) xml_dtd_enodes_##x      /* All generated names start with xml_dtd_enodes_ */
+#define HASH_NODE struct xml_dtd_elem_node     /* Type of the stored nodes */
+#define HASH_KEY_COMPLEX(x) x parent, x elem   /* The key consists of two parts: parent and elem */
+#define HASH_KEY_DECL struct xml_dtd_elem_node *parent, struct xml_dtd_elem *elem
+#define HASH_GIVE_HASHFN                       /* xml_dtd_enodes_hash() is defined above */
+#define HASH_GIVE_EQ                           /* xml_dtd_enodes_eq() is defined above */
+#define HASH_GIVE_INIT_KEY                     /* xml_dtd_enodes_init_key() is defined above */
+#define HASH_TABLE_DYNAMIC                     /* NOTE(review): presumably the table is passed as a parameter instead of being a global -- confirm in lib/hashtable.h */
+#define HASH_ZERO_FILL                         /* NOTE(review): presumably new nodes are zeroed -- confirm in lib/hashtable.h */
+#define HASH_WANT_FIND                         /* Request xml_dtd_enodes_find() */
+#define HASH_WANT_NEW                          /* Request xml_dtd_enodes_new() */
+#define HASH_GIVE_ALLOC                        /* NOTE(review): presumably the allocator comes from XML_HASH_GIVE_ALLOC below -- confirm */
+#define HASH_TABLE_ALLOC
+XML_HASH_GIVE_ALLOC
+#include "lib/hashtable.h"
+
 /* Element attributes */
 
 struct xml_dtd_attrs_table;
@@ -1070,6 +1110,7 @@ xml_dtd_init(struct xml_context *ctx)
   xml_dtd_ents_init(dtd->tab_pents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table)));
   xml_dtd_notns_init(dtd->tab_notns = xml_hash_new(pool, sizeof(struct xml_dtd_notns_table)));
   xml_dtd_elems_init(dtd->tab_elems = xml_hash_new(pool, sizeof(struct xml_dtd_elems_table)));
+  xml_dtd_enodes_init(dtd->tab_enodes = xml_hash_new(pool, sizeof(struct xml_dtd_enodes_table)));
   xml_dtd_attrs_init(dtd->tab_attrs = xml_hash_new(pool, sizeof(struct xml_dtd_attrs_table)));
   xml_dtd_evals_init(dtd->tab_evals = xml_hash_new(pool, sizeof(struct xml_dtd_evals_table)));
   xml_dtd_enotns_init(dtd->tab_enotns = xml_hash_new(pool, sizeof(struct xml_dtd_enotns_table)));
@@ -1412,7 +1453,7 @@ xml_parse_external_id(struct xml_context *ctx, struct xml_ext_id *eid, uns allow
     xml_fatal(ctx, "Expected an external ID");
 }
 
-/* DTD: Notation declaration */
+/* DTD: <!NOTATION ...> */
 
 static void
 xml_parse_notation_decl(struct xml_context *ctx)
@@ -1441,6 +1482,8 @@ xml_parse_notation_decl(struct xml_context *ctx)
   xml_dec(ctx);
 }
 
+/* DTD: <!ENTITY ...> */
+
 static void
 xml_parse_entity_decl(struct xml_context *ctx)
 {
@@ -1530,6 +1573,314 @@ xml_parse_entity_decl(struct xml_context *ctx)
   xml_dec(ctx);
 }
 
+/* DTD: <!ELEMENT ...> */
+
+/*
+ * Parse the optional occurrence indicator ('?' | '*' | '+') which may directly
+ * follow a content particle. Nothing is consumed when no indicator is present.
+ */
+static uns
+xml_parse_elem_occur(struct xml_context *ctx)
+{
+  switch (xml_peek_char(ctx))
+    {
+      case '?':
+        xml_skip_char(ctx);
+        return XML_DTD_ELEM_OCCUR_OPT;
+      case '*':
+        xml_skip_char(ctx);
+        return XML_DTD_ELEM_OCCUR_MULT;
+      case '+':
+        xml_skip_char(ctx);
+        return XML_DTD_ELEM_OCCUR_PLUS;
+      default:
+        return XML_DTD_ELEM_OCCUR_ONCE;
+    }
+}
+
+/*
+ * Parse the rest of an <!ELEMENT ...> declaration and record the content
+ * specification in the DTD: elem->type receives the kind of content and, for
+ * Mixed and children content, elem->node receives the root of a tree of
+ * struct xml_dtd_elem_node describing the content model.
+ *
+ * Errors are reported through xml_fatal() / xml_fatal_expected() / xml_error().
+ */
+static void
+xml_parse_element_decl(struct xml_context *ctx)
+{
+  /* Elementdecl ::= '<!ELEMENT' S  Name  S  contentspec  S? '>'
+   * Already parsed: '<!ELEMENT' */
+  xml_parse_dtd_white(ctx, 1);
+  char *name = xml_parse_name(ctx);
+  xml_parse_dtd_white(ctx, 1);
+  struct xml_dtd *dtd = ctx->dtd;
+  struct xml_dtd_elem *elem = xml_dtd_elems_lookup(dtd->tab_elems, name);
+  if (elem->flags & XML_DTD_ELEM_DECLARED)
+    xml_fatal(ctx, "Element <%s> already declared", name);
+  /* Remember the declaration, otherwise the check above could never trigger */
+  elem->flags |= XML_DTD_ELEM_DECLARED;
+
+  /* contentspec ::= 'EMPTY' | 'ANY' | Mixed | children */
+  uns c = xml_peek_char(ctx);
+  if (c == 'E')
+    {
+      xml_parse_seq(ctx, "EMPTY");
+      elem->type = XML_DTD_ELEM_EMPTY;
+    }
+  else if (c == 'A')
+    {
+      xml_parse_seq(ctx, "ANY");
+      elem->type = XML_DTD_ELEM_ANY;
+    }
+  else if (c == '(')
+    {
+      xml_skip_char(ctx);
+      xml_inc(ctx);
+      xml_parse_dtd_white(ctx, 0);
+      /* Root of the content model; its parent pointer stays NULL */
+      struct xml_dtd_elem_node *parent = elem->node = mp_alloc_zero(dtd->pool, sizeof(*parent));
+      if (xml_peek_char(ctx) == '#')
+        {
+          /* Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | '(' S? '#PCDATA' S? ')' */
+          xml_skip_char(ctx);
+          xml_parse_seq(ctx, "PCDATA");
+          elem->type = XML_DTD_ELEM_MIXED;
+          parent->type = XML_DTD_ELEM_PCDATA;
+          while (1)
+            {
+              xml_parse_dtd_white(ctx, 0);
+              if ((c = xml_get_char(ctx)) == ')')
+                break;
+              else if (c != '|')
+                xml_fatal_expected(ctx, ')');
+              xml_parse_dtd_white(ctx, 0);
+              struct xml_dtd_elem *son_elem = xml_dtd_elems_lookup(dtd->tab_elems, xml_parse_name(ctx));
+              /* The same name must not appear twice in mixed content */
+              if (xml_dtd_enodes_find(dtd->tab_enodes, parent, son_elem))
+                xml_error(ctx, "Duplicate content '%s'", son_elem->name);
+              else
+                {
+                  struct xml_dtd_elem_node *son = xml_dtd_enodes_new(dtd->tab_enodes, parent, son_elem);
+                  slist_add_tail(&parent->sons, &son->n);
+                }
+            }
+          xml_dec(ctx);
+          if (xml_peek_char(ctx) == '*')
+            {
+              xml_skip_char(ctx);
+              parent->occur = XML_DTD_ELEM_OCCUR_MULT;
+            }
+          else if (!slist_head(&parent->sons))
+            parent->occur = XML_DTD_ELEM_OCCUR_ONCE;
+          else
+            xml_fatal_expected(ctx, '*');
+        }
+      else
+        {
+          /* children ::= (choice | seq) ('?' | '*' | '+')?
+           * cp ::= (Name | choice | seq) ('?' | '*' | '+')?
+           * choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')'
+           * seq ::= '(' S? cp ( S? ',' S? cp )* S? ')' */
+
+          elem->type = XML_DTD_ELEM_CHILDREN;
+          /* XML_DTD_ELEM_PCDATA serves as "operator not yet known" for an open group */
+          parent->type = XML_DTD_ELEM_PCDATA;
+
+          while (1)
+            {
+              /* Before a content particle: a nested group or a name */
+              xml_parse_dtd_white(ctx, 0);
+              if (xml_peek_char(ctx) == '(')
+                {
+                  xml_skip_char(ctx);
+                  xml_inc(ctx);
+                  struct xml_dtd_elem_node *group = mp_alloc_zero(dtd->pool, sizeof(*group));
+                  group->parent = parent;
+                  group->type = XML_DTD_ELEM_PCDATA;
+                  slist_add_tail(&parent->sons, &group->n);
+                  /* Descend: following particles belong to the nested group */
+                  parent = group;
+                  continue;
+                }
+
+              struct xml_dtd_elem *son_elem = xml_dtd_elems_lookup(dtd->tab_elems, xml_parse_name(ctx));
+              /* A name may legitimately repeat in children content, so leaves are
+               * not registered in tab_enodes (only mixed content forbids duplicates).
+               * FIXME: deterministic content model check */
+              struct xml_dtd_elem_node *son = mp_alloc_zero(dtd->pool, sizeof(*son));
+              son->parent = parent;
+              son->elem = son_elem;
+              son->occur = xml_parse_elem_occur(ctx);
+              slist_add_tail(&parent->sons, &son->n);
+
+              /* After a content particle: close all groups which end here */
+              while (1)
+                {
+                  xml_parse_dtd_white(ctx, 0);
+                  if ((c = xml_get_char(ctx)) != ')')
+                    break;
+                  xml_dec(ctx);
+                  /* A group with a single particle and no operator is a sequence */
+                  if (parent->type == XML_DTD_ELEM_PCDATA)
+                    parent->type = XML_DTD_ELEM_SEQ;
+                  parent->occur = xml_parse_elem_occur(ctx);
+                  parent = parent->parent;
+                  if (!parent)
+                    break;
+                }
+              if (!parent)
+                break; /* The outermost group has been closed */
+
+              /* A separator is required before the next particle */
+              if (c == '|')
+                {
+                  if (parent->type == XML_DTD_ELEM_PCDATA)
+                    parent->type = XML_DTD_ELEM_OR;
+                  else if (parent->type != XML_DTD_ELEM_OR)
+                    xml_fatal(ctx, "Mixed operators in the list of element children");
+                }
+              else if (c == ',')
+                {
+                  if (parent->type == XML_DTD_ELEM_PCDATA)
+                    parent->type = XML_DTD_ELEM_SEQ;
+                  else if (parent->type != XML_DTD_ELEM_SEQ)
+                    xml_fatal(ctx, "Mixed operators in the list of element children");
+                }
+              else
+                xml_fatal(ctx, "Expected ')', '|' or ',' in the list of element children");
+            }
+        }
+    }
+  else
+    xml_fatal(ctx, "Expected element content specification");
+
+  xml_parse_dtd_white(ctx, 0);
+  xml_parse_char(ctx, '>');
+  xml_dec(ctx);
+}
+
+/*
+ * Parse a quoted attribute value and return a copy of the collected bytes
+ * (including the terminating zero) allocated from ctx->pool. Every '&' is
+ * handed to xml_parse_ge_ref() together with the output buffer; a literal
+ * '<' is reported by xml_error() and not stored.
+ */
+static char *
+xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr UNUSED)
+{
+  uns quote = xml_parse_quote(ctx);
+  xml_push(ctx);
+  struct fastbuf *buf = ctx->value;
+  uns c;
+  /* '&' is tested before the closing quote, exactly one character per iteration */
+  while ((c = xml_get_char(ctx)) == '&' || c != quote) // FIXME: beware quotes inside parsed
+    {
+      if (c == '&')
+        {
+          xml_inc(ctx);
+          xml_parse_ge_ref(ctx, buf);
+        }
+      else if (c == '<')
+        xml_error(ctx, "Attribute value must not contain '<'");
+      else
+        bput_utf8_32(buf, c);
+    }
+  xml_pop(ctx);
+  bputc(buf, 0);
+  fbgrow_rewind(buf);
+  char *copy = mp_memdup(ctx->pool, buf->bptr, buf->bstop - buf->bptr);
+  // FIXME: check value constraints / normalize value
+  fbgrow_reset(buf);
+  return copy;
+}
+
+/*
+ * Parse the rest of an <!ATTLIST ...> declaration and record every attribute
+ * definition (type, enumerated values / notations, default mode and default
+ * value) in the DTD hash tables. A repeated definition of the same attribute
+ * is parsed but ignored with a warning; the first definition stays in effect.
+ */
+static void
+xml_parse_attr_list_decl(struct xml_context *ctx)
+{
+  /* AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
+   * AttDef ::= S Name S AttType S DefaultDecl
+   * Already parsed: '<!ATTLIST' */
+  xml_parse_dtd_white(ctx, 1);
+  struct xml_dtd_elem *elem = xml_dtd_elems_lookup(ctx->dtd->tab_elems, xml_parse_name(ctx));
+
+  while (xml_parse_dtd_white(ctx, 0) && xml_peek_char(ctx) != '>')
+    {
+      char *name = xml_parse_name(ctx);
+      struct xml_dtd_attr *attr = xml_dtd_attrs_find(ctx->dtd->tab_attrs, elem, name);
+      uns ignored = 0;
+      if (attr)
+        {
+          xml_warn(ctx, "Duplicate attribute definition");
+          ignored++;
+        }
+      else
+        attr = xml_dtd_attrs_new(ctx->dtd->tab_attrs, elem, name);
+      xml_parse_dtd_white(ctx, 1);
+      if (xml_peek_char(ctx) == '(')
+        {
+          /* Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' */
+          xml_skip_char(ctx); // FIXME: xml_inc/dec ?
+          if (!ignored)
+            attr->type = XML_ATTR_ENUM;
+          do
+            {
+              xml_parse_dtd_white(ctx, 0);
+              char *value = xml_parse_nmtoken(ctx);
+              if (!ignored)
+                {
+                  if (xml_dtd_evals_find(ctx->dtd->tab_evals, attr, value))
+                    xml_error(ctx, "Duplicate enumeration value");
+                  else
+                    xml_dtd_evals_new(ctx->dtd->tab_evals, attr, value);
+                }
+              xml_parse_dtd_white(ctx, 0);
+            }
+          while (xml_get_char(ctx) == '|');
+          xml_unget_char(ctx);
+          xml_parse_char(ctx, ')');
+        }
+      else
+        {
+          char *type = xml_parse_name(ctx);
+          enum xml_dtd_attribute_type t;
+          if (!strcmp(type, "CDATA"))
+            t = XML_ATTR_CDATA;
+          else if (!strcmp(type, "ID"))
+            t = XML_ATTR_ID;
+          else if (!strcmp(type, "IDREF"))
+            t = XML_ATTR_IDREF;
+          else if (!strcmp(type, "IDREFS"))
+            t = XML_ATTR_IDREFS;
+          else if (!strcmp(type, "ENTITY"))
+            t = XML_ATTR_ENTITY;
+          else if (!strcmp(type, "ENTITIES"))
+            t = XML_ATTR_ENTITIES;
+          else if (!strcmp(type, "NMTOKEN"))
+            t = XML_ATTR_NMTOKEN;
+          else if (!strcmp(type, "NMTOKENS"))
+            t = XML_ATTR_NMTOKENS;
+          else if (!strcmp(type, "NOTATION"))
+            {
+              /* NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' */
+              if (elem->type == XML_DTD_ELEM_EMPTY)
+                xml_fatal(ctx, "Empty element must not have notation attribute");
+              // FIXME: An element type MUST NOT have more than one NOTATION attribute specified.
+              t = XML_ATTR_NOTATION;
+              xml_parse_dtd_white(ctx, 1);
+              xml_parse_char(ctx, '(');
+              do
+                {
+                  xml_parse_dtd_white(ctx, 0);
+                  struct xml_dtd_notn *n = xml_dtd_notns_lookup(ctx->dtd->tab_notns, xml_parse_name(ctx));
+                  if (!ignored)
+                    {
+                      if (xml_dtd_enotns_find(ctx->dtd->tab_enotns, attr, n))
+                        xml_error(ctx, "Duplicate enumerated notation");
+                      else
+                        xml_dtd_enotns_new(ctx->dtd->tab_enotns, attr, n);
+                    }
+                  xml_parse_dtd_white(ctx, 0);
+                }
+              while (xml_get_char(ctx) == '|');
+              xml_unget_char(ctx);
+              xml_parse_char(ctx, ')');
+            }
+          else
+            xml_fatal(ctx, "Unknown attribute type");
+          if (!ignored)
+            attr->type = t;
+        }
+      xml_parse_dtd_white(ctx, 1);
+      /* DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue) */
+      enum xml_dtd_attribute_default def = XML_ATTR_NONE;
+      if (xml_get_char(ctx) == '#')
+        switch (xml_peek_char(ctx))
+          {
+            case 'R':
+              xml_parse_seq(ctx, "REQUIRED");
+              def = XML_ATTR_REQUIRED;
+              break;
+            case 'I':
+              xml_parse_seq(ctx, "IMPLIED");
+              def = XML_ATTR_IMPLIED;
+              break;
+            case 'F':
+              xml_parse_seq(ctx, "FIXED");
+              def = XML_ATTR_FIXED;
+              xml_parse_dtd_white(ctx, 1);
+              break;
+            default:
+              xml_fatal(ctx, "Expected a modifier for default attribute value");
+          }
+      else
+        xml_unget_char(ctx);
+      /* Only #REQUIRED and #IMPLIED come without a default value */
+      if (def != XML_ATTR_REQUIRED && def != XML_ATTR_IMPLIED)
+        {
+          char *v = xml_parse_attr_value(ctx, attr);
+          if (!ignored)
+            attr->default_value = v;
+        }
+      if (!ignored)
+        attr->default_mode = def;
+    }
+  /* The loop also ends when no white space follows, so verify the terminator
+   * instead of blindly skipping one character */
+  xml_parse_char(ctx, '>');
+  xml_dec(ctx);
+}
+
 /* DTD: Internal subset */
 
 static void
@@ -1565,14 +1916,14 @@ xml_parse_internal_subset(struct xml_context *ctx)
                else if (c == 'L')
                  {
                    xml_parse_seq(ctx, "EMENT");
-                   // FIXME: Element
+                   xml_parse_element_decl(ctx);
                  }
                else
                  goto invalid_markup;
                break;
              case 'A':
                xml_parse_seq(ctx, "TTLIST");
-               // FIXME: AttList
+               xml_parse_attr_list_decl(ctx);
                break;
              default:
                goto invalid_markup;
@@ -1813,209 +2164,6 @@ xml_pop_element(struct xml_context *ctx)
 #endif
 }
 
-static void
-xml_parse_element_decl(struct xml_context *ctx)
-{
-  // FIXME
-  mp_push(ctx->pool);
-  xml_parse_seq(ctx, "<!ELEMENT");
-  xml_parse_white(ctx, 1);
-  xml_parse_name(ctx);
-  xml_parse_white(ctx, 1);
-
-  uns c = xml_get_char(ctx);
-  if (c == 'E')
-    {
-      xml_parse_seq(ctx, "MPTY");
-      // FIXME
-    }
-  else if (c == 'A')
-    {
-      xml_parse_seq(ctx, "NY");
-      // FIXME
-    }
-  else if (c == '(')
-    {
-      xml_parse_white(ctx, 0);
-      if (xml_get_char(ctx) == '#')
-        {
-         xml_parse_seq(ctx, "PCDATA");
-         while (1)
-           {
-             xml_parse_white(ctx, 0);
-             if ((c = xml_get_char(ctx)) == ')')
-               break;
-             else if (c != '|')
-               xml_fatal_expected(ctx, ')');
-             xml_parse_white(ctx, 0);
-             xml_parse_name(ctx);
-             // FIXME
-           }
-       }
-      else
-        {
-         xml_unget_char(ctx);
-         uns depth = 1;
-         while (1)
-           {
-             xml_parse_white(ctx, 0);
-             if ((c = xml_get_char(ctx)) == '(')
-               {
-                 depth++;
-               }
-             else if (c == ')')
-               {
-                 if ((c = xml_get_char(ctx)) == '?' || c == '*' || c == '+')
-                   {
-                   }
-                 else
-                   xml_unget_char(ctx);
-                 if (!--depth)
-                   break;
-               }
-             else if (c == '|')
-               {
-               }
-             else if (c == ',')
-               {
-               }
-             else
-               {
-                 xml_unget_char(ctx);
-                 xml_parse_name(ctx);
-               }
-           }
-       }
-    }
-  else
-    xml_fatal(ctx, "Expected element content specification");
-
-  xml_parse_white(ctx, 0);
-  xml_parse_char(ctx, '>');
-  mp_pop(ctx->pool);
-}
-
-#if 0
-static void
-xml_parse_attr_list_decl(struct xml_context *ctx)
-{
-  /* AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
-   * AttDef ::= S Name S AttType S DefaultDecl */
-  xml_parse_seq(ctx, "ATTLIST");
-  xml_parse_white(ctx, 1);
-  struct xml_dtd_elem *e = xml_dtd_elems_lookup(ctx->dtd->tab_elems, xml_parse_name(ctx));
-  e->attlist_declared = 1;
-
-  while (xml_parse_white(ctx, 0) && xml_get_char(ctx) != '>')
-    {
-      xml_unget_char(ctx);
-      char *name = xml_parse_name(ctx);
-      struct xml_dtd_attr *a = xml_dtd_attrs_find(ctx->dtd->tab_attrs, e, name);
-      uns ignored = 0;
-      if (a)
-        {
-         xml_warn(ctx, "Duplicate attribute definition");
-         ignored++;
-       }
-      else
-       a = xml_dtd_attrs_new(ctx->dtd->tab_attrs, e, name);
-      xml_parse_white(ctx, 1);
-      if (xml_get_char(ctx) == '(')
-        {
-         if (!ignored)
-           a->type = XML_ATTR_ENUM;
-         do
-           {
-             xml_parse_white(ctx, 0);
-             char *value = xml_parse_nmtoken(ctx);
-             if (!ignored)
-               if (xml_dtd_evals_find(ctx->dtd->tab_evals, a, value))
-                 xml_error(ctx, "Duplicate enumeration value");
-               else
-                 xml_dtd_evals_new(ctx->dtd->tab_evals, a, value);
-             xml_parse_white(ctx, 0);
-           }
-         while (xml_get_char(ctx) == '|');
-         xml_unget_char(ctx);
-         xml_parse_char(ctx, ')');
-       }
-      else
-        {
-         xml_unget_char(ctx);
-         char *type = xml_parse_name(ctx);
-         enum xml_dtd_attribute_type t;
-         if (!strcmp(type, "CDATA"))
-           t = XML_ATTR_CDATA;
-         else if (!strcmp(type, "ID"))
-           t = XML_ATTR_ID;
-         else if (!strcmp(type, "IDREF"))
-           t = XML_ATTR_IDREF;
-         else if (!strcmp(type, "IDREFS"))
-           t = XML_ATTR_IDREFS;
-         else if (!strcmp(type, "ENTITY"))
-           t = XML_ATTR_ENTITY;
-         else if (!strcmp(type, "ENTITIES"))
-           t = XML_ATTR_ENTITIES;
-         else if (!strcmp(type, "NMTOKEN"))
-           t = XML_ATTR_NMTOKEN;
-         else if (!strcmp(type, "NMTOKENS"))
-           t = XML_ATTR_NMTOKENS;
-         else if (!strcmp(type, "NOTATION"))
-           {
-             t = XML_ATTR_NOTATION;
-             xml_parse_white(ctx, 1);
-             xml_parse_char(ctx, '(');
-             do
-               {
-                 xml_parse_white(ctx, 0);
-                 struct xml_dtd_notn *n = xml_dtd_notns_lookup(ctx->dtd->tab_notns, xml_parse_name(ctx));
-                 if (!ignored)
-                   if (xml_dtd_enotns_find(ctx->dtd->tab_enotns, a, n))
-                     xml_error(ctx, "Duplicate enumerated notation");
-                   else
-                     xml_dtd_enotns_new(ctx->dtd->tab_enotns, a, n);
-                 xml_parse_white(ctx, 0);
-               }
-             while (xml_get_char(ctx) == '|');
-             xml_unget_char(ctx);
-             xml_parse_char(ctx, ')');
-           }
-         else
-           xml_fatal(ctx, "Unknown attribute type");
-         if (!ignored)
-           a->type = t;
-       }
-      xml_parse_white(ctx, 1);
-      enum xml_dtd_attribute_default def = XML_ATTR_NONE;
-      if (xml_get_char(ctx) == '#')
-       switch (xml_get_char(ctx))
-          {
-           case 'R':
-             xml_parse_seq(ctx, "EQUIRED");
-             def = XML_ATTR_REQUIRED;
-             break;
-           case 'I':
-             xml_parse_seq(ctx, "MPLIED");
-             def = XML_ATTR_IMPLIED;
-             break;
-           case 'F':
-             xml_parse_seq(ctx, "IXED");
-             def = XML_ATTR_FIXED;
-             break;
-           default:
-             xml_fatal(ctx, "Expected a modifier for default attribute value");
-         }
-      else
-       xml_unget_char(ctx);
-      if (def != XML_ATTR_REQUIRED && def != XML_ATTR_IMPLIED)
-        {
-         xml_parse_system_literal(ctx);
-         // FIXME
-       }
-    }
-}
-#endif
-
 static void
 xml_parse_doctype_decl(struct xml_context *ctx)
 {
index 87cdff914f3bb270ccf3e887764a02d11f51be44..7e83f65ad31ef20164a4baada8d9494ee3020c96 100644 (file)
@@ -7,8 +7,8 @@
  *     of the GNU Lesser General Public License.
  */
 
-#ifndef _SHERLOCK_XML_H
-#define _SHERLOCK_XML_H
+#ifndef _SHERLOCK_XML_XML_H
+#define _SHERLOCK_XML_XML_H
 
 #include "lib/clists.h"
 #include "lib/slists.h"
@@ -204,132 +204,6 @@ struct xml_context {
   void (*unparsed_entity_decl)(struct xml_context *ctx);
 };
 
-/*** Document Type Definition (DTD) ***/
-
-struct xml_dtd {
-  struct mempool *pool;                        /* Memory pool where to allocate DTD */
-  slist gents;                         /* Link list of general entities */
-  slist pents;                         /* Link list of parapeter entities */
-  slist notns;                         /* Link list of notations */
-  slist elems;                         /* Link list of elements */
-  void *tab_gents;                     /* Hash table of general entities */
-  void *tab_pents;                     /* Hash table of parameter entities */
-  void *tab_notns;                     /* Hash table of notations */
-  void *tab_elems;                     /* Hash table of elements */
-  void *tab_attrs;                     /* Hash table of element attributes */
-  void *tab_evals;                     /* Hash table of enumerated attribute values */
-  void *tab_enotns;                    /* hash table of enumerated attribute notations */
-};
-
-/* Notations */
-
-enum xml_dtd_notn_flags {
-  XML_DTD_NOTN_DECLARED = 0x1,         /* The notation has been declared (interbal usage) */
-};
-
-struct xml_dtd_notn {
-  snode n;                             /* Node in xml_dtd.notns */
-  uns flags;                           /* XML_DTD_NOTN_x */
-  char *name;                          /* Notation name */
-  struct xml_ext_id eid;               /* External id */
-};
-
-/* Entities */
-
-enum xml_dtd_ent_flags {
-  XML_DTD_ENT_DECLARED = 0x1,          /* The entity has been declared (internal usage) */
-  XML_DTD_ENT_VISITED = 0x2,           /* Cycle detection (internal usage) */
-  XML_DTD_ENT_PARAMETER = 0x4,         /* Parameter entity, general otherwise */
-  XML_DTD_ENT_EXTERNAL = 0x8,          /* External entity, internal otherwise */
-  XML_DTD_ENT_UNPARSED = 0x10,         /* Unparsed entity, parsed otherwise */
-  XML_DTD_ENT_TRIVIAL = 0x20,          /* Replacement text is a sequence of characters and character references */
-};
-
-struct xml_dtd_ent {
-  snode n;                             /* Node in xml_dtd.[gp]ents */
-  uns flags;                           /* XML_DTD_ENT_x */
-  char *name;                          /* Entity name */
-  char *text;                          /* Replacement text / expanded replacement text (XML_DTD_ENT_TRIVIAL) */
-  uns len;                             /* Text length */
-  struct xml_ext_id eid;               /* External ID */
-  struct xml_dtd_notn *notn;           /* Notation (XML_DTD_ENT_UNPARSED only) */
-};
-
-/* Elements */
-
-enum xml_dtd_elem_flags {
-  XML_DTD_ELEM_DECLARED = 0x1,         /* The element has been declared (internal usage) */
-};
-
-struct xml_dtd_elem {
-  snode n;
-  uns flags;
-  char *name;
-  struct xml_dtd_elem_node *node;
-};
-
-struct xml_dtd_elem_node {
-  snode n;
-  struct xml_dtd_elem_node *parent;
-  slist sons;
-  uns type;
-  uns occur;
-};
-
-enum xml_dtd_elem_node_type {
-  XML_DTD_ELEM_PCDATA,
-  XML_DTD_ELEM_SEQ,
-  XML_DTD_ELEM_OR,
-};
-
-enum xml_dtd_elem_node_occur {
-  XML_DTD_ELEM_OCCUR_ONCE,
-  XML_DTD_ELEM_OCCUR_OPT,
-  XML_DTD_ELEM_OCCUR_MULT,
-  XML_DTD_ELEM_OCCUR_PLUS,
-};
-
-/* Attributes */
-
-
-enum xml_dtd_attribute_default {
-  XML_ATTR_NONE,
-  XML_ATTR_REQUIRED,
-  XML_ATTR_IMPLIED,
-  XML_ATTR_FIXED,
-};
-
-enum xml_dtd_attribute_type {
-  XML_ATTR_CDATA,
-  XML_ATTR_ID,
-  XML_ATTR_IDREF,
-  XML_ATTR_IDREFS,
-  XML_ATTR_ENTITY,
-  XML_ATTR_ENTITIES,
-  XML_ATTR_NMTOKEN,
-  XML_ATTR_NMTOKENS,
-  XML_ATTR_ENUM,
-  XML_ATTR_NOTATION,
-};
-
-struct xml_dtd_attr {
-  char *name;
-  struct xml_dtd_elem *elem;
-  enum xml_dtd_attribute_type type;
-  enum xml_dtd_attribute_default default_mode;
-  char *default_value;
-};
-
-struct xml_dtd_eval {
-  struct xml_dtd_attr *attr;
-  char *val;
-};
-
-struct xml_dtd_enotn {
-  struct xml_dtd_attr *attr;
-  struct xml_dtd_notn *notn;
-};
-
 void xml_init(struct xml_context *ctx);
 void xml_cleanup(struct xml_context *ctx);
 void xml_set_source(struct xml_context *ctx, struct fastbuf *fb);