2 * UCW Library -- A simple XML parser
4 * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
5 * (c) 2015 Martin Mares <mj@ucw.cz>
7 * This software may be freely distributed and used according to the terms
8 * of the GNU Lesser General Public License.
14 #include <ucw-xml/xml.h>
15 #include <ucw-xml/dtd.h>
16 #include <ucw-xml/internals.h>
17 #include <ucw/fastbuf.h>
18 #include <ucw/ff-unicode.h>
19 #include <ucw/unicode.h>
20 #include <ucw/chartype.h>
21 #include <ucw/hashfunc.h>
25 /*** Basic parsing ***/
/* Raises a fatal parse error naming the expected character `c`.
 * Printable ASCII (32..126) is quoted literally; the second message prints
 * the code point as U+xxxx (presumably the else-branch, which is not
 * visible here). */
28 xml_fatal_expected(struct xml_context *ctx, uint c)
30 if (c >= 32 && c < 127)
31 xml_fatal(ctx, "Expected '%c'", c);
33 xml_fatal(ctx, "Expected U+%04x", c);
/* Raises a fatal parse error: white space was expected at this position. */
37 xml_fatal_expected_white(struct xml_context *ctx)
39 xml_fatal(ctx, "Expected a white space");
/* Raises a fatal parse error: a quotation mark was expected at this position. */
43 xml_fatal_expected_quot(struct xml_context *ctx)
45 xml_fatal(ctx, "Expected a quotation mark");
/* Parses the Eq production: a '=' character with xml_parse_white(ctx, 0)
 * called before and after it (the two S? parts of the grammar below). */
49 xml_parse_eq(struct xml_context *ctx)
51 /* Eq ::= S? '=' S? */
52 xml_parse_white(ctx, 0);
53 xml_parse_char(ctx, '=');
54 xml_parse_white(ctx, 0);
57 /*** Names and nmtokens ***/
/* Collects a token from the input into `pool`, encoded by utf8_32_put().
 * The first character must have a category in `first_cat`, otherwise a fatal
 * error with the message `err` is raised; further characters are taken while
 * their category matches `next_cat`.
 * Returns the pointer from mp_end() advanced by one, presumably so that the
 * result points just past the '<' stored in front of the token.
 * NOTE(review): the loop opening and any terminator store are not visible here. */
60 xml_parse_string(struct xml_context *ctx, struct mempool *pool, uint first_cat, uint next_cat, char *err)
62 char *p = mp_start_noalign(pool, 2);
63 *p++ = '<'; /* We always prepend a '<', so we can seek backwards in the string */
64 if (unlikely(!(xml_peek_cat(ctx) & first_cat)))
65 xml_fatal(ctx, "%s", err);
/* Ask the pool for 5 more bytes at p before encoding the next character */
68 p = mp_spread(pool, p, 5);
69 p = utf8_32_put(p, xml_skip_char(ctx));
71 while (xml_peek_cat(ctx) & next_cat);
73 return mp_end(pool, p) + 1;
/* Skips a token without storing it: consumes one character, which must have
 * a category in `first_cat` (fatal error `err` otherwise), then loops while
 * the next character's category matches `next_cat`.
 * NOTE(review): the loop body is not visible here. */
77 xml_skip_string(struct xml_context *ctx, uint first_cat, uint next_cat, char *err)
79 if (unlikely(!(xml_get_cat(ctx) & first_cat)))
80 xml_fatal(ctx, "%s", err);
81 while (xml_peek_cat(ctx) & next_cat)
/* Parses a Name into `pool` via xml_parse_string(): the first character is
 * checked against ctx->cat_sname, the following ones against ctx->cat_name.
 * A fatal error "Expected a name" is raised when the first check fails. */
86 xml_parse_name(struct xml_context *ctx, struct mempool *pool)
88 /* Name ::= NameStartChar (NameChar)* */
89 return xml_parse_string(ctx, pool, ctx->cat_sname, ctx->cat_name, "Expected a name");
/* Skips a Name via xml_skip_string(), using ctx->cat_sname for the first
 * character and ctx->cat_name for the rest; nothing is stored. */
93 xml_skip_name(struct xml_context *ctx)
95 xml_skip_string(ctx, ctx->cat_sname, ctx->cat_name, "Expected a name");
/* Parses an Nmtoken into `pool` via xml_parse_string(): every character,
 * including the first, is checked against ctx->cat_name.
 * A fatal error "Expected a nmtoken" is raised when the first check fails. */
99 xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool)
101 /* Nmtoken ::= (NameChar)+ */
102 return xml_parse_string(ctx, pool, ctx->cat_name, ctx->cat_name, "Expected a nmtoken");
105 /*** Simple literals ***/
/* Parses a quoted system literal into `pool`.
 * The opening quote is obtained from xml_parse_quote(); every character up
 * to the matching quote is appended with utf8_32_put(). Returns the result
 * of mp_end().
 * NOTE(review): a terminator store, if any, is not visible here. */
108 xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool)
110 /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */
111 char *p = mp_start_noalign(pool, 1);
112 uint q = xml_parse_quote(ctx), c;
113 while ((c = xml_get_char(ctx)) != q)
115 p = mp_spread(pool, p, 5);
116 p = utf8_32_put(p, c);
119 return mp_end(pool, p);
/* Parses a quoted public identifier literal into `pool`.
 * Each character before the closing quote must have the XML_CHAR_PUBID
 * category, otherwise a fatal error is raised. Returns the result of mp_end().
 * NOTE(review): the statement that stores the character is not visible
 * here; only 2 bytes are requested per step, so presumably each accepted
 * character is stored as a single byte. */
123 xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool)
125 /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */
126 char *p = mp_start_noalign(pool, 1);
127 uint q = xml_parse_quote(ctx), c;
128 while ((c = xml_get_char(ctx)) != q)
130 if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID)))
131 xml_fatal(ctx, "Expected a pubid character");
132 p = mp_spread(pool, p, 2);
136 return mp_end(pool, p);
/* Parses a comment into a new DOM node of type XML_NODE_COMMENT.
 * The caller has already consumed '<!-'. The text is accumulated in
 * ctx->pool; n->len is the distance from mp_ptr(ctx->pool) to the write
 * pointer and n->text receives the block closed by mp_end().
 * NOTE(review): the loop header, the exit taken after "--" and the body of
 * the final handler check are not visible here. */
142 xml_push_comment(struct xml_context *ctx)
144 TRACE(ctx, "push_comment");
145 /* Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
146 * Already parsed: '<!-' */
147 xml_parse_char(ctx, '-');
148 struct xml_node *n = xml_push_dom(ctx, NULL);
149 n->type = XML_NODE_COMMENT;
150 char *p = mp_start_noalign(ctx->pool, 6);
/* Two '-' characters in a row are tested for here */
153 if (xml_get_char(ctx) == '-')
154 if (xml_get_char(ctx) == '-')
158 p = utf8_32_put(p, xml_last_char(ctx));
159 p = mp_spread(ctx->pool, p, 6);
/* A '>' is required at this point */
161 xml_parse_char(ctx, '>');
163 n->len = p - (char *)mp_ptr(ctx->pool);
164 n->text = mp_end(ctx->pool, p + 1);
165 if ((ctx->flags & XML_REPORT_COMMENTS) && ctx->h_comment)
/* Pops the comment node with xml_pop_dom(); its second argument is non-zero
 * exactly when XML_ALLOC_COMMENTS is not set in ctx->flags. */
170 xml_pop_comment(struct xml_context *ctx)
172 xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_COMMENTS));
174 TRACE(ctx, "pop_comment");
/* Skips a comment without building a node: requires a '-', then reads
 * characters until a '-' immediately followed by another '-' has been
 * consumed, and finally requires '>'. */
178 xml_skip_comment(struct xml_context *ctx)
180 TRACE(ctx, "skip_comment");
181 xml_parse_char(ctx, '-');
182 while (xml_get_char(ctx) != '-' || xml_get_char(ctx) != '-');
183 xml_parse_char(ctx, '>');
187 /*** Processing instructions ***/
/* Parses a processing instruction into a new DOM node of type XML_NODE_PI.
 * The target name is stored in n->name; a target equal to "xml" in any
 * letter case is reported with xml_error(). The content is accumulated in
 * ctx->pool and ends up in n->text with its length in n->len.
 * NOTE(review): the branch structure around the "?>" scan and the body of
 * the final handler check are not visible here. */
190 xml_push_pi(struct xml_context *ctx)
192 TRACE(ctx, "push_pi");
193 /* Parses a PI to ctx->value and ctx->name:
194 * PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
195 * PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
196 * Already parsed: '<?' */
197 struct xml_node *n = xml_push_dom(ctx, NULL);
198 n->type = XML_NODE_PI;
199 n->name = xml_parse_name(ctx, ctx->pool);
200 if (unlikely(!strcasecmp(n->name, "xml")))
201 xml_error(ctx, "Reserved PI target");
202 char *p = mp_start_noalign(ctx->pool, 5);
/* No white space after the target: "?>" has to follow */
203 if (!xml_parse_white(ctx, 0))
204 xml_parse_seq(ctx, "?>");
/* A '?' followed by '>' is tested for here */
208 if (xml_get_char(ctx) == '?')
209 if (xml_peek_char(ctx) == '>')
217 p = utf8_32_put(p, xml_last_char(ctx));
218 p = mp_spread(ctx->pool, p, 5);
221 n->len = p - (char *)mp_ptr(ctx->pool);
222 n->text = mp_end(ctx->pool, p + 1);
223 if ((ctx->flags & XML_REPORT_PIS) && ctx->h_pi)
/* Pops the PI node with xml_pop_dom(); its second argument is non-zero
 * exactly when XML_ALLOC_PIS is not set in ctx->flags. */
228 xml_pop_pi(struct xml_context *ctx)
230 xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_PIS));
232 TRACE(ctx, "pop_pi");
/* Skips a processing instruction without building a node.
 * When XML_VALIDATING is set, the target name is parsed into ctx->stack to
 * check it against the reserved "xml" (case-insensitive); the stack pool
 * state saved beforehand is restored right after the check.
 * NOTE(review): the non-validating path and the loop scanning for "?>" are
 * only partly visible here. */
236 xml_skip_pi(struct xml_context *ctx)
238 TRACE(ctx, "skip_pi");
239 if (ctx->flags & XML_VALIDATING)
241 struct mempool_state state;
242 mp_save(ctx->stack, &state);
243 if (unlikely(!strcasecmp(xml_parse_name(ctx, ctx->stack), "xml")))
244 xml_error(ctx, "Reserved PI target");
245 mp_restore(ctx->stack, &state);
246 if (!xml_parse_white(ctx, 0))
248 xml_parse_seq(ctx, "?>");
/* A '?' followed by '>' is tested for here */
254 if (xml_get_char(ctx) == '?')
255 if (xml_peek_char(ctx) == '>')
261 /*** Character references ***/
/* Parses a numeric character reference; '&#' has already been consumed.
 * A leading 'x' selects hexadecimal digits (XML_CHAR_XDIGIT), otherwise
 * decimal digits (XML_CHAR_DIGIT) are accumulated into v. Both digit loops
 * continue only while v stays below 0x110000. The value's category is then
 * checked against ctx->cat_unrestricted and a ';' must follow.
 * The visible error path skips input up to a ';' and returns
 * UNI_REPLACEMENT.
 * NOTE(review): the declaration of v, the success return and several
 * control-flow lines are not visible here. */
264 xml_parse_char_ref(struct xml_context *ctx)
266 TRACE(ctx, "parse_char_ref");
267 /* CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
268 * Already parsed: '&#' */
270 if (xml_get_char(ctx) == 'x')
272 if (!(xml_get_cat(ctx) & XML_CHAR_XDIGIT))
274 xml_error(ctx, "Expected a hexadecimal value of character reference");
279 v = (v << 4) + Cxvalue(xml_last_char(ctx));
281 while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_XDIGIT));
/* Decimal form: the character read by the 'x' test above is re-examined */
285 if (!(xml_last_cat(ctx) & XML_CHAR_DIGIT))
287 xml_error(ctx, "Expected a numeric value of character reference");
292 v = v * 10 + xml_last_char(ctx) - '0';
294 while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT));
296 uint cat = xml_char_cat(v);
297 if (!(cat & ctx->cat_unrestricted))
299 xml_error(ctx, "Character reference out of range");
302 if (xml_last_char(ctx) == ';')
307 xml_error(ctx, "Expected ';'");
309 while (xml_last_char(ctx) != ';')
312 return UNI_REPLACEMENT;
315 /*** References to general entities ***/
/* Parses a reference after '&' and writes its expansion to ctx->chars.
 * A '#' introduces a character reference, whose value from
 * xml_parse_char_ref() is written as UTF-8. Otherwise an entity name is
 * parsed into ctx->stack, a ';' is required and the entity is looked up
 * with xml_dtd_find_entity(): the visible outcomes are the error
 * "Unknown entity", copying ent->text for entities flagged
 * XML_DTD_ENTITY_TRIVIAL, and xml_push_entity() for the remaining case.
 * The stack pool state saved before parsing the name is restored on the
 * visible paths.
 * NOTE(review): the test for a missing entity and the else-lines are not
 * visible here. */
318 xml_parse_ref(struct xml_context *ctx)
320 /* Reference ::= EntityRef | CharRef
321 * EntityRef ::= '&' Name ';'
322 * Already parsed: '&' */
323 struct fastbuf *out = &ctx->chars;
324 if (xml_peek_char(ctx) == '#')
327 bput_utf8_32(out, xml_parse_char_ref(ctx));
331 TRACE(ctx, "parse_ge_ref");
332 struct mempool_state state;
333 mp_save(ctx->stack, &state);
334 char *name = xml_parse_name(ctx, ctx->stack);
335 xml_parse_char(ctx, ';');
336 struct xml_dtd_entity *ent = xml_dtd_find_entity(ctx, name);
339 xml_error(ctx, "Unknown entity &%s;", name);
344 else if (ent->flags & XML_DTD_ENTITY_TRIVIAL)
346 TRACE(ctx, "Trivial entity &%s;", name);
347 bputs(out, ent->text);
351 TRACE(ctx, "Pushed entity &%s;", name);
/* The name on the stack pool is released before the entity is pushed */
352 mp_restore(ctx->stack, &state);
354 xml_push_entity(ctx, ent);
357 mp_restore(ctx->stack, &state);
362 /*** Character data ***/
/* Provides buffer space for the fastbuf embedded in struct xml_context as
 * `chars` (the context is recovered from `fb` with SKIP_BACK).
 * If a buffer already exists (bufend != buffer), it is grown with
 * mp_expand(): bptr is placed at the old buffer length and bstop at its old
 * offset. Otherwise the pool state is saved in ctx->chars_state and a new
 * buffer is started with mp_start_noalign(); bufend is set one byte short
 * of the space reported by mp_avail().
 * NOTE(review): the statement guarded by the first test (bptr < bufend) is
 * not visible here. */
365 xml_spout_chars(struct fastbuf *fb)
367 if (fb->bptr < fb->bufend)
369 struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb);
370 struct mempool *pool = ctx->pool;
371 if (fb->bufend != fb->buffer)
373 TRACE(ctx, "growing chars");
374 uint len = fb->bufend - fb->buffer;
375 uint reported = fb->bstop - fb->buffer;
376 fb->buffer = mp_expand(pool);
377 fb->bufend = fb->buffer + mp_avail(pool);
378 fb->bptr = fb->buffer + len;
379 fb->bstop = fb->buffer + reported;
383 TRACE(ctx, "starting chars");
384 mp_save(pool, &ctx->chars_state);
385 fb->bptr = fb->buffer = fb->bstop = mp_start_noalign(pool, 2);
386 fb->bufend = fb->buffer + mp_avail(pool) - 1;
/* Finishes the text accumulated in ctx->chars: *out receives the block
 * closed by mp_end() and all fastbuf pointers are reset to fb->buffer.
 * len is the number of bytes between fb->buffer and fb->bptr.
 * NOTE(review): how len is used and what is returned is not visible here. */
391 xml_end_chars(struct xml_context *ctx, char **out)
393 struct fastbuf *fb = &ctx->chars;
394 uint len = fb->bptr - fb->buffer;
397 TRACE(ctx, "ending chars");
399 *out = mp_end(ctx->pool, fb->bptr + 1);
400 fb->bufend = fb->bstop = fb->bptr = fb->buffer;
/* Computes len as the number of bytes between fb->buffer and fb->bptr of
 * ctx->chars and advances fb->bstop to fb->bptr.
 * NOTE(review): how *out and the return value are set is not visible here. */
406 xml_report_chars(struct xml_context *ctx, char **out)
408 struct fastbuf *fb = &ctx->chars;
409 uint len = fb->bptr - fb->buffer;
414 fb->bstop = fb->bptr;
/* Ends the pending character data with xml_end_chars() and dispatches it.
 * With XML_NO_CHARS set, the text is passed to h_ignorable (when
 * XML_REPORT_CHARS is set and the handler exists) and the pool is restored
 * to ctx->chars_state. Otherwise h_block receives whatever
 * xml_report_chars() yields; then the pool is restored when neither
 * XML_ALLOC_CHARS nor XML_REPORT_CHARS is set, and a node of type
 * XML_NODE_CHARS is pushed in the remaining visible path.
 * NOTE(review): declarations of text/rtext, the returns and the h_chars
 * call are not visible here. */
420 xml_flush_chars(struct xml_context *ctx)
423 uint len = xml_end_chars(ctx, &text), rlen;
426 if (ctx->flags & XML_NO_CHARS)
428 if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_ignorable)
429 ctx->h_ignorable(ctx, text, len);
430 mp_restore(ctx->pool, &ctx->chars_state);
433 if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext)))
434 ctx->h_block(ctx, rtext, rlen);
435 if (!(ctx->flags & XML_ALLOC_CHARS) && !(ctx->flags & XML_REPORT_CHARS))
437 mp_restore(ctx->pool, &ctx->chars_state);
440 struct xml_node *n = xml_push_dom(ctx, &ctx->chars_state);
441 n->type = XML_NODE_CHARS;
444 if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_chars)
/* Pops the character-data node with xml_pop_dom(); its second argument is
 * non-zero exactly when XML_ALLOC_CHARS is not set in ctx->flags. */
451 xml_pop_chars(struct xml_context *ctx)
453 xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS));
454 TRACE(ctx, "pop_chars");
/* Reads character data up to the next '<' and appends it to ctx->chars.
 * With XML_NO_CHARS set, white-space characters are still copied; the
 * other visible path reports "This element must not contain character
 * data" and skips the rest up to '<'. Without XML_NO_CHARS, '&' is tested
 * for and characters are written as UTF-8.
 * NOTE(review): what happens on '&' and the else-lines are not visible here. */
458 xml_append_chars(struct xml_context *ctx)
460 TRACE(ctx, "append_chars");
461 struct fastbuf *out = &ctx->chars;
462 if (ctx->flags & XML_NO_CHARS)
463 while (xml_get_char(ctx) != '<')
464 if (xml_last_cat(ctx) & XML_CHAR_WHITE)
465 bput_utf8_32(out, xml_last_char(ctx));
468 xml_error(ctx, "This element must not contain character data");
469 while (xml_get_char(ctx) != '<');
473 while (xml_get_char(ctx) != '<')
474 if (xml_last_char(ctx) == '&')
480 bput_utf8_32(out, xml_last_char(ctx));
484 /*** CDATA sections ***/
/* Skips a CDATA section: requires "CDATA[" and then keeps reading until
 * three characters read in a row are ']', ']' and '>'. */
487 xml_skip_cdata(struct xml_context *ctx)
489 TRACE(ctx, "skip_cdata");
490 xml_parse_seq(ctx, "CDATA[");
491 while (xml_get_char(ctx) != ']' || xml_get_char(ctx) != ']' || xml_get_char(ctx) != '>');
/* Appends the content of a CDATA section to ctx->chars.
 * With XML_NO_CHARS set, the error "This element must not contain CDATA"
 * is reported. After "CDATA[", anything xml_report_chars() yields is first
 * passed to h_block; the section is then read up to "]]>" and what
 * xml_report_chars() yields afterwards is passed to h_cdata. Both handlers
 * run only with XML_REPORT_CHARS set and a non-zero reported length.
 * NOTE(review): the loop header, the handling of a partial "]]" match and
 * the declarations of rlen/rtext are not visible here. */
496 xml_append_cdata(struct xml_context *ctx)
498 /* CDSect :== '<![CDATA[' (Char* - (Char* ']]>' Char*)) ']]>'
499 * Already parsed: '<![' */
500 TRACE(ctx, "append_cdata");
501 if (ctx->flags & XML_NO_CHARS)
503 xml_error(ctx, "This element must not contain CDATA");
507 xml_parse_seq(ctx, "CDATA[");
508 struct fastbuf *out = &ctx->chars;
511 if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext)))
512 ctx->h_block(ctx, rtext, rlen);
515 if (xml_get_char(ctx) == ']')
517 if (xml_get_char(ctx) == ']')
518 if (xml_get_char(ctx) == '>')
524 bput_utf8_32(out, xml_last_char(ctx));
526 if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_cdata && (rlen = xml_report_chars(ctx, &rtext)))
527 ctx->h_cdata(ctx, rtext, rlen);
531 /*** Attribute values ***/
/* Parses a quoted attribute value through ctx->chars and returns the text
 * from xml_end_chars(), or "" when that reports a zero length.
 * The closing quote only counts while ctx->src is still the source the
 * value started in. The error "Attribute value must not contain '<'" is
 * reported on a visible path and white-space characters are tested for
 * separately. `attr` is marked UNUSED.
 * NOTE(review): the loop, the '&' branch, the body of the white-space
 * branch and the declaration of text are not visible here. */
534 xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr UNUSED)
536 TRACE(ctx, "parse_attr_value");
537 /* AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'" */
538 /* FIXME: -- check value constraints / normalize leading/trailing WS and repeated WS */
539 struct mempool_state state;
540 uint quote = xml_parse_quote(ctx);
541 mp_save(ctx->stack, &state);
542 struct fastbuf *out = &ctx->chars;
543 struct xml_source *src = ctx->src;
546 uint c = xml_get_char(ctx);
552 else if (c == quote && src == ctx->src)
555 xml_error(ctx, "Attribute value must not contain '<'");
556 else if (xml_last_cat(ctx) & XML_CHAR_WHITE)
559 bput_utf8_32(out, c);
561 mp_restore(ctx->stack, &state);
563 return xml_end_chars(ctx, &text) ? text : "";
/* Works on `text` in place with two pointers, s and d, both starting at
 * `text`. The visible parts advance s over a run of 0x20 bytes and test
 * whether the byte before d is 0x20 (when d has moved past the start).
 * NOTE(review): most of the body, including the copy loop and the return
 * value, is not visible here. */
567 xml_normalize_white(struct xml_context *ctx UNUSED, char *text)
569 char *s = text, *d = text;
578 while (*++s == 0x20);
581 if (d != text && d[-1] == 0x20)
/* Forward declaration of the attribute hash table type used by the
 * callbacks below. */
589 struct xml_attrs_table;
/* Hash of an attribute key (element, namespace id, name): the hashes of the
 * element pointer, the name string and the namespace id are XOR-ed. */
592 xml_attrs_hash(struct xml_attrs_table *t UNUSED, struct xml_node *e, uint ns, char *n)
594 return hash_pointer(e) ^ hash_string(n) ^ hash_u32(ns);
/* Key equality for the attribute table: same element pointer, same
 * namespace id and names that compare equal with strcmp(). */
598 xml_attrs_eq(struct xml_attrs_table *t UNUSED, struct xml_node *e1, uint ns1, char *n1, struct xml_node *e2, uint ns2, char *n2)
600 return (e1 == e2) && (ns1 == ns2) && !strcmp(n1, n2);
/* Sets up attribute `a` for the key (e, ns, name); the visible part appends
 * it to the tail of the element's attribute list e->attrs.
 * NOTE(review): the field assignments are not visible here. */
604 xml_attrs_init_key(struct xml_attrs_table *t UNUSED, struct xml_attr *a, struct xml_node *e, uint ns, char *name)
611 slist_add_tail(&e->attrs, &a->n);
/* Parameters for the hash table template in <ucw/hashtable.h>: nodes are
 * struct xml_attr, the key is the triple (elem, ns, name) and generated
 * identifiers get the xml_attrs_ prefix. The HASH_GIVE_* and HASH_WANT_*
 * switches presumably select which callbacks this file supplies and which
 * operations the template generates -- confirm against the template's
 * documentation. */
614 #define HASH_PREFIX(x) xml_attrs_##x
615 #define HASH_NODE struct xml_attr
616 #define HASH_KEY_COMPLEX(x) x elem, x ns, x name
617 #define HASH_KEY_DECL struct xml_node *elem, uint ns, char *name
618 #define HASH_TABLE_DYNAMIC
620 #define HASH_GIVE_HASHFN
621 #define HASH_GIVE_INIT_KEY
622 #define HASH_WANT_CLEANUP
623 #define HASH_WANT_REMOVE
624 #define HASH_WANT_LOOKUP
625 #define HASH_WANT_FIND
626 #define HASH_GIVE_ALLOC
628 #include <ucw/hashtable.h>
/* Parses one attribute (Name Eq AttValue) of the current element ctx->node.
 * The name is parsed into ctx->pool and passed to xml_attrs_lookup() on
 * ctx->tab_attrs with namespace 0; the value comes from
 * xml_parse_attr_value(). Visible diagnostics: a duplicate attribute, and
 * an attribute for which xml_dtd_find_attr() finds no declaration in the
 * element's DTD entry. Declared attributes go to xml_validate_attr().
 * NOTE(review): the conditions selecting these paths and the stores into
 * the attribute are not visible here. */
631 xml_parse_attr(struct xml_context *ctx)
633 TRACE(ctx, "parse_attr");
634 /* Attribute ::= Name Eq AttValue */
635 struct xml_node *e = ctx->node;
636 char *n = xml_parse_name(ctx, ctx->pool);
637 // FIXME: This is wrong! This way, we never find attributes in a non-default NS.
638 struct xml_attr *a = xml_attrs_lookup(ctx->tab_attrs, e, 0, n);
640 char *v = xml_parse_attr_value(ctx, NULL);
643 xml_error(ctx, "Attribute %s is not unique in element <%s>", n, e->name);
649 else if (!(a->dtd = xml_dtd_find_attr(ctx, e->dtd, a->name)))
650 xml_error(ctx, "Undefined attribute %s in element <%s>", n, e->name);
652 xml_validate_attr(ctx, a->dtd, a->val);
/* Finds attribute `name` of `node` in ctx->tab_attrs using namespace 0;
 * returns whatever xml_attrs_find() returns. */
656 xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name)
658 return xml_attrs_find(ctx->tab_attrs, node, 0, name);
/* Finds attribute `name` of `node` in ctx->tab_attrs within namespace `ns`;
 * returns whatever xml_attrs_find() returns. */
662 xml_attr_find_ns(struct xml_context *ctx, struct xml_node *node, uint ns, char *name)
664 return xml_attrs_find(ctx->tab_attrs, node, ns, name);
/* Looks up attribute `name` of `node` in namespace 0. The visible fallback
 * consults the DTD via xml_dtd_find_attr() and returns the declared
 * default_value, or NULL when no declaration is found.
 * NOTE(review): the branch returning the value of a found attribute and the
 * guard around the DTD lookup are not visible here. */
668 xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name)
670 struct xml_attr *attr = xml_attrs_find(ctx->tab_attrs, node, 0, name);
675 struct xml_dtd_attr *dtd = xml_dtd_find_attr(ctx, node->dtd, name);
676 return dtd ? dtd->default_value : NULL;
/* Creates the attribute hash table: xml_hash_new() is given ctx->pool and
 * the table size, the result is stored in ctx->tab_attrs and passed to
 * xml_attrs_init(). */
680 xml_attrs_table_init(struct xml_context *ctx)
682 xml_attrs_init(ctx->tab_attrs = xml_hash_new(ctx->pool, sizeof(struct xml_attrs_table)));
/* Releases the attribute hash table by calling xml_attrs_cleanup() on
 * ctx->tab_attrs. */
686 xml_attrs_table_cleanup(struct xml_context *ctx)
688 xml_attrs_cleanup(ctx->tab_attrs);
/* Works from attr->name (the context argument is marked UNUSED).
 * NOTE(review): how the qualified name is derived and returned is not
 * visible here. */
692 xml_attr_qname(struct xml_context *ctx UNUSED, struct xml_attr *attr)
694 char *n = attr->name;
/* Recursively searches the tree of DTD element nodes rooted at `root` for
 * `elem`: one path compares elem with root->elem directly, the other
 * recurses into every son of root.
 * NOTE(review): the condition choosing between the two paths and the
 * return statements after the loop are not visible here. */
703 xml_validate_element(struct xml_dtd_elem_node *root, struct xml_dtd_elem *elem)
706 return elem == root->elem;
708 SLIST_FOR_EACH(struct xml_dtd_elem_node *, son, root->sons)
709 if (xml_validate_element(son, elem))
/* Parses a start tag or empty-element tag after '<' and pushes a new
 * XML_NODE_ELEM node named by the parsed Name, with empty son and attribute
 * lists.
 * Visible checks: the root element name against ctx->doctype, the element
 * being declared in the DTD, placement under the parent's DTD type (EMPTY,
 * ANY, or the content tree via xml_validate_element()) and attributes with
 * default mode XML_ATTR_REQUIRED. With XML_ALLOC_DEFAULT_ATTRS, attributes
 * whose mode is neither REQUIRED nor IMPLIED get their default_value.
 * XML_NO_CHARS is cleared for XML_DTD_ELEM_MIXED elements and set on the
 * other visible branch; XML_EMPTY_ELEM_TAG is set after a required '>'.
 * NOTE(review): the conditions guarding several of these steps (validation
 * mode, presence of a parent, the attribute loop, the '/' test) and the
 * h_stag call are not visible here. */
715 xml_push_element(struct xml_context *ctx)
717 TRACE(ctx, "push_element");
718 /* EmptyElemTag | STag
719 * EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
720 * STag ::= '<' Name (S Attribute)* S? '>'
721 * Already parsed: '<' */
722 struct xml_node *e = xml_push_dom(ctx, NULL);
723 clist_init(&e->sons);
724 e->type = XML_NODE_ELEM;
725 e->name = xml_parse_name(ctx, ctx->pool);
726 slist_init(&e->attrs);
731 if (ctx->doctype && strcmp(e->name, ctx->doctype))
732 xml_error(ctx, "The root element <%s> does not match the document type <%s>", e->name, ctx->doctype);
737 else if (!(e->dtd = xml_dtd_find_elem(ctx, e->name)))
738 xml_error(ctx, "Undefined element <%s>", e->name);
741 struct xml_dtd_elem *dtd = e->dtd, *parent_dtd = e->parent ? e->parent->dtd : NULL;
742 if (dtd->type == XML_DTD_ELEM_MIXED)
743 ctx->flags &= ~XML_NO_CHARS;
745 ctx->flags |= XML_NO_CHARS;
/* NOTE(review): parent_dtd may be NULL per the initializer above; a guard
 * is presumably on a line not visible here -- confirm. */
747 if (parent_dtd->type == XML_DTD_ELEM_EMPTY)
748 xml_error(ctx, "Empty element must not contain children");
749 else if (parent_dtd->type != XML_DTD_ELEM_ANY)
751 // FIXME: validate regular expressions
752 if (!xml_validate_element(parent_dtd->node, dtd))
753 xml_error(ctx, "Unexpected element <%s>", e->name);
759 uint white = xml_parse_white(ctx, 0);
760 uint c = xml_get_char(ctx);
763 xml_parse_char(ctx, '>');
764 ctx->flags |= XML_EMPTY_ELEM_TAG;
770 xml_fatal_expected_white(ctx);
775 xml_ns_push_element(ctx);
777 /* FIXME: DTD logic is not namespace-aware */
779 SLIST_FOR_EACH(struct xml_dtd_attr *, a, e->dtd->attrs)
780 if (a->default_mode == XML_ATTR_REQUIRED)
782 if (!xml_attrs_find(ctx->tab_attrs, e, 0, a->name))
783 xml_error(ctx, "Missing required attribute %s in element <%s>", a->name, e->name);
785 else if (a->default_mode != XML_ATTR_IMPLIED && ctx->flags & XML_ALLOC_DEFAULT_ATTRS)
787 struct xml_attr *attr = xml_attrs_lookup(ctx->tab_attrs, e, 0, a->name);
789 attr->val = a->default_value;
791 if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_stag)
/* Leaves the current element ctx->node.
 * After xml_ns_pop_element(), the element's attributes are removed from
 * ctx->tab_attrs. The loop over e->sons also removes the attributes of son
 * elements and calls clist_insert_list_after() on each one's own sons,
 * presumably so that deeper descendants are visited by the same loop.
 * Finally the node is popped with xml_pop_dom(); `free` is non-zero when
 * XML_ALLOC_TAGS is not set.
 * NOTE(review): the h_etag call, the guard around the clean-up, the
 * declaration of n and the removal of n from the list are not visible here. */
796 xml_pop_element(struct xml_context *ctx)
798 TRACE(ctx, "pop_element");
799 if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_etag)
802 xml_ns_pop_element(ctx);
804 struct xml_node *e = ctx->node;
805 uint free = !(ctx->flags & XML_ALLOC_TAGS);
810 /* Restore hash table of attributes */
811 SLIST_FOR_EACH(struct xml_attr *, a, e->attrs)
812 xml_attrs_remove(ctx->tab_attrs, a);
814 while (n = clist_head(&e->sons))
816 if (n->type == XML_NODE_ELEM)
818 SLIST_FOR_EACH(struct xml_attr *, a, n->attrs)
819 xml_attrs_remove(ctx->tab_attrs, a);
820 clist_insert_list_after(&n->sons, &n->n);
826 xml_pop_dom(ctx, free);
/* Parses an end tag and checks it against the current element.
 * The expected name comes from xml_node_qname() and is compared with the
 * input one character at a time (utf8_32_get() against xml_get_char()).
 * After optional white space a '>' is tested for. On the visible error
 * path "Invalid ETag" is reported and input is skipped up to the next '>'.
 * NOTE(review): the comparison loop header, the declaration of c and the
 * jumps to the error path are not visible here. */
831 xml_parse_etag(struct xml_context *ctx)
833 /* ETag ::= '</' Name S? '>'
834 * Already parsed: '<' */
835 struct xml_node *e = ctx->node;
837 char *n = xml_node_qname(ctx, e);
841 n = utf8_32_get(n, &c);
842 if (xml_get_char(ctx) != c)
845 xml_parse_white(ctx, 0);
846 if (xml_get_char(ctx) != '>')
849 xml_error(ctx, "Invalid ETag, expected </%s>", e->name);
850 while (xml_get_char(ctx) != '>');
/* Requires `node` to be of type XML_NODE_ELEM (asserted) and works from
 * node->name; the context argument is marked UNUSED.
 * NOTE(review): how the qualified name is derived and returned is not
 * visible here. */
856 xml_node_qname(struct xml_context *ctx UNUSED, struct xml_node *node)
858 ASSERT(node->type == XML_NODE_ELEM);
859 char *n = node->name;
865 /*** Document type declaration ***/
/* Parses the head of a document type declaration; '<!' has already been
 * consumed and parsing stops before '[' or '>' (see the grammar note).
 * The document type name is stored in ctx->doctype. If white space is
 * followed by 'S' or 'P', an external ID is read: SYSTEM fills
 * ctx->system_id, PUBLIC fills ctx->public_id and ctx->system_id, and
 * XML_HAS_EXTERNAL_SUBSET is set. A following '[' sets
 * XML_HAS_INTERNAL_SUBSET. The h_doctype_decl handler is called when set.
 * NOTE(review): the condition for the "Multiple document types" fatal
 * error, the declaration of c and the SYSTEM/PUBLIC branch test are not
 * visible here. */
868 xml_parse_doctype_decl(struct xml_context *ctx)
870 TRACE(ctx, "parse_doctype_decl");
871 /* doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
872 * Already parsed: '<!'
873 * Terminated before '[' or '>' */
875 xml_fatal(ctx, "Multiple document types not allowed");
876 xml_parse_seq(ctx, "DOCTYPE");
877 xml_parse_white(ctx, 1);
878 ctx->doctype = xml_parse_name(ctx, ctx->pool);
879 TRACE(ctx, "doctype=%s", ctx->doctype);
881 if (xml_parse_white(ctx, 0) && ((c = xml_peek_char(ctx)) == 'S' || c == 'P'))
885 xml_parse_seq(ctx, "SYSTEM");
886 xml_parse_white(ctx, 1);
887 ctx->system_id = xml_parse_system_literal(ctx, ctx->pool);
891 xml_parse_seq(ctx, "PUBLIC");
892 xml_parse_white(ctx, 1);
893 ctx->public_id = xml_parse_pubid_literal(ctx, ctx->pool);
894 xml_parse_white(ctx, 1);
895 ctx->system_id = xml_parse_system_literal(ctx, ctx->pool);
897 xml_parse_white(ctx, 0);
898 ctx->flags |= XML_HAS_EXTERNAL_SUBSET;
900 if (xml_peek_char(ctx) == '[')
902 ctx->flags |= XML_HAS_INTERNAL_SUBSET;
906 if (ctx->h_doctype_decl)
907 ctx->h_doctype_decl(ctx);
912 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
914 /* DTD: Internal subset */
/* Parses the declarations of a DTD subset; `external` selects between the
 * internal subset (0) and the external one (non-zero).
 * After '<' and '!', the next character dispatches to comments and to the
 * NOTATION, ENTITY, ELEMENT and ATTLIST declaration parsers (the rest of
 * each keyword is matched with xml_parse_seq()). xml_parse_pe_ref() is
 * called on another visible branch, ']' is handled only for the internal
 * subset and '>' only for the external one. A fatal "Invalid markup"
 * error names the subset kind.
 * NOTE(review): the loop, the case labels and the branch bodies for ']'
 * and '>' are not visible here. */
917 xml_parse_subset(struct xml_context *ctx, uint external)
920 // -- comments/pi have no parent
921 // -- conditional sections in external subset
922 // -- check correctness of parameter entities
925 * intSubset :== (markupdecl | DeclSep)
926 * Already parsed: '['
928 * extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)*
932 xml_parse_white(ctx, 0);
933 uint c = xml_get_char(ctx);
936 if ((c = xml_get_char(ctx)) == '!')
937 switch (c = xml_get_char(ctx))
940 xml_push_comment(ctx);
941 xml_pop_comment(ctx);
944 xml_parse_seq(ctx, "OTATION");
945 xml_parse_notation_decl(ctx);
948 if ((c = xml_get_char(ctx)) == 'N')
950 xml_parse_seq(ctx, "TITY");
951 xml_parse_entity_decl(ctx);
955 xml_parse_seq(ctx, "EMENT");
956 xml_parse_element_decl(ctx);
962 xml_parse_seq(ctx, "TTLIST");
963 xml_parse_attr_list_decl(ctx);
976 xml_parse_pe_ref(ctx);
977 else if (c == ']' && !external)
981 else if (c == '>' && external)
991 xml_fatal(ctx, "Invalid markup in the %s subset", external ? "external" : "internal");
994 /*** The State Machine ***/
/* Runs the parser state machine until a state selected in ctx->pull is
 * reached. The PULL and PULL_STATE macros return from the middle of the
 * code and place a case label there, so the next call resumes at that
 * point according to ctx->state.
 * Errors arrive through setjmp(throw_buf); the first handler sets the
 * state to XML_STATE_EOF and returns it. The visible phases are the prolog
 * (XML declaration handler, comments, PIs, doctype and DTD subsets),
 * element content (character data, PIs, comments, CDATA, start and end
 * tags) and the epilog, where hitting XML_ERR_EOF ends the document, calls
 * h_document_end and clears ctx->err_msg.
 * NOTE(review): the switch statement, loop headers, the declaration of
 * throw_buf and c, and many branch lines are not visible here, so the exact
 * nesting of the statements below cannot be confirmed. */
997 xml_next(struct xml_context *ctx)
999 /* A nasty state machine */
/* PULL(x): stop in state x when requested; PULL_STATE(x, s): report state x
 * to the caller but resume at the case label for s. */
1001 #define PULL(x) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##x; case XML_STATE_##x: ; } while (0)
1002 #define PULL_STATE(x, s) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##s, XML_STATE_##x; case XML_STATE_##s: ; } while (0)
1004 TRACE(ctx, "xml_next (state=%u)", ctx->state);
1006 ctx->throw_buf = &throw_buf;
1007 if (setjmp(throw_buf))
1010 if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal)
1012 TRACE(ctx, "raised fatal error");
1013 return ctx->state = XML_STATE_EOF;
1018 case XML_STATE_START:
1019 TRACE(ctx, "entering prolog");
1020 ctx->flags |= XML_SRC_DOCUMENT | XML_SRC_EXPECTED_DECL;
1021 if (ctx->h_document_start)
1022 ctx->h_document_start(ctx);
1025 if (ctx->h_xml_decl)
1026 ctx->h_xml_decl(ctx);
1029 /* Misc* (doctypedecl Misc*)? */
1032 xml_parse_white(ctx, 0);
1033 xml_parse_char(ctx, '<');
1035 if ((c = xml_get_char(ctx)) == '?')
1036 /* Processing instruction */
1037 if (!(ctx->flags & XML_REPORT_PIS))
1042 PULL_STATE(PI, PROLOG_PI);
1047 /* Found the root tag */
1048 xml_unget_char(ctx);
1051 else if (xml_get_char(ctx) == '-')
1052 if (!(ctx->flags & XML_REPORT_COMMENTS))
1053 xml_skip_comment(ctx);
1056 xml_push_comment(ctx);
1057 PULL_STATE(COMMENT, PROLOG_COMMENT);
1058 xml_pop_comment(ctx);
1063 xml_unget_char(ctx);
1064 xml_parse_doctype_decl(ctx);
1066 if (ctx->flags & XML_HAS_DTD)
1067 if (ctx->flags & XML_PARSE_DTD)
1070 if (ctx->h_dtd_start)
1071 ctx->h_dtd_start(ctx);
1072 if (ctx->flags & XML_HAS_INTERNAL_SUBSET)
1074 xml_parse_subset(ctx, 0);
1077 if (ctx->flags & XML_HAS_EXTERNAL_SUBSET)
1079 struct xml_dtd_entity ent = {
1080 .system_id = ctx->system_id,
1081 .public_id = ctx->public_id,
1083 xml_parse_white(ctx, 0);
1084 xml_parse_char(ctx, '>');
1085 xml_unget_char(ctx);
1086 ASSERT(ctx->h_resolve_entity);
1087 ctx->h_resolve_entity(ctx, &ent);
1088 ctx->flags |= XML_SRC_EXPECTED_DECL;
1089 xml_parse_subset(ctx, 1);
1090 xml_unget_char(ctx);;
1093 ctx->h_dtd_end(ctx);
1095 else if (ctx->flags & XML_HAS_INTERNAL_SUBSET)
1096 xml_skip_internal_subset(ctx);
1097 xml_parse_white(ctx, 0);
1098 xml_parse_char(ctx, '>');
1103 case XML_STATE_CHARS:
1107 if (xml_peek_char(ctx) != '<')
1110 xml_append_chars(ctx);
1118 if ((c = xml_get_char(ctx)) == '?')
1121 if (!(ctx->flags & (XML_REPORT_PIS | XML_ALLOC_PIS)))
1125 if (xml_flush_chars(ctx))
1127 PULL_STATE(CHARS, CHARS_BEFORE_PI);
1137 if ((c = xml_get_char(ctx)) == '-')
1140 if (!(ctx->flags & (XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS)))
1141 xml_skip_comment(ctx);
1144 if (xml_flush_chars(ctx))
1146 PULL_STATE(CHARS, CHARS_BEFORE_COMMENT);
1149 xml_push_comment(ctx);
1151 xml_pop_comment(ctx);
1157 xml_append_cdata(ctx);
1160 xml_fatal(ctx, "Unexpected character after '<!'");
1164 /* STag | EmptyElemTag */
1165 xml_unget_char(ctx);
1166 if (xml_flush_chars(ctx))
1168 PULL_STATE(CHARS, CHARS_BEFORE_STAG);
1172 xml_push_element(ctx);
1174 if (ctx->flags & XML_EMPTY_ELEM_TAG)
1181 if (xml_flush_chars(ctx))
1183 PULL_STATE(CHARS, CHARS_BEFORE_ETAG);
1187 xml_parse_etag(ctx);
1190 xml_pop_element(ctx);
1198 TRACE(ctx, "entering epilog");
1201 /* Epilog whitespace is the only place where a valid document can reach EOF */
1202 if (setjmp(throw_buf))
1203 if (ctx->err_code == XML_ERR_EOF)
1205 TRACE(ctx, "reached EOF");
1206 ctx->state = XML_STATE_EOF;
1207 if (ctx->h_document_end)
1208 ctx->h_document_end(ctx);
1211 ctx->err_msg = NULL;
1212 return XML_STATE_EOF;
1216 xml_parse_white(ctx, 0);
1217 if (setjmp(throw_buf))
1221 xml_parse_char(ctx, '<');
1223 if ((c = xml_get_char(ctx)) == '?')
1224 /* Processing instruction */
1225 if (!(ctx->flags & XML_REPORT_PIS))
1230 PULL_STATE(PI, EPILOG_PI);
1235 xml_parse_char(ctx, '-');
1237 if (!(ctx->flags & XML_REPORT_COMMENTS))
1238 xml_skip_comment(ctx);
1241 xml_push_comment(ctx);
1242 PULL_STATE(COMMENT, EPILOG_COMMENT);
1243 xml_pop_comment(ctx);
1247 xml_fatal(ctx, "Syntax error in the epilog");
/* Calls xml_next() once after remembering the current ctx->pull in `saved`.
 * NOTE(review): the assignment of `pull` to ctx->pull, the restore from
 * `saved` and the return of `res` are presumably on lines not visible
 * here -- confirm. */
1255 xml_next_state(struct xml_context *ctx, uint pull)
1257 uint saved = ctx->pull;
1259 uint res = xml_next(ctx);
/* Skips the rest of the element whose start tag was just reported; the
 * parser must be in XML_STATE_STAG (asserted).
 * ctx->pull is set to XML_PULL_ETAG and xml_next() is called repeatedly
 * until it returns 0 or ctx->node is again the node that was current on
 * entry.
 * NOTE(review): the restore of ctx->pull from `saved` and the return value
 * are not visible here. */
1265 xml_skip_element(struct xml_context *ctx)
1267 ASSERT(ctx->state == XML_STATE_STAG);
1268 struct xml_node *node = ctx->node;
1269 uint saved = ctx->pull, res;
1270 ctx->pull = XML_PULL_ETAG;
1271 while ((res = xml_next(ctx)) && ctx->node != node);
/* Parses the whole input: calls xml_next() until it returns zero and then
 * returns ctx->err_code.
 * NOTE(review): any set-up of ctx->pull before the loop is not visible here. */
1277 xml_parse(struct xml_context *ctx)
1279 /* This cycle should run only once unless the user overrides the value of ctx->pull in a SAX handler */
1284 while (xml_next(ctx));
1285 return ctx->err_code;
/* Builds a string in `pool` from the text of the direct sons of element
 * `node` that have type XML_NODE_CHARS: each son's son->len bytes are
 * copied to the write pointer. Returns the result of mp_end().
 * `node` must be of type XML_NODE_ELEM (asserted).
 * NOTE(review): advancing p after memcpy() and storing a terminator are
 * presumably on lines not visible here -- confirm. */
1289 xml_merge_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool)
1291 ASSERT(node->type == XML_NODE_ELEM);
1292 char *p = mp_start_noalign(pool, 1);
1293 XML_NODE_FOR_EACH(son, node)
1294 if (son->type == XML_NODE_CHARS)
1296 p = mp_spread(pool, p, son->len + 1);
1297 memcpy(p, son->text, son->len);
1301 return mp_end(pool, p);
/* Recursive helper: walks the sons of `node`, copying the text of
 * XML_NODE_CHARS sons to the write pointer p in `pool` and descending into
 * XML_NODE_ELEM sons; the recursive call yields the updated pointer.
 * NOTE(review): advancing p after memcpy() and the final return are
 * presumably on lines not visible here -- confirm. */
1305 xml_append_dom_chars(char *p, struct mempool *pool, struct xml_node *node)
1307 XML_NODE_FOR_EACH(son, node)
1308 if (son->type == XML_NODE_CHARS)
1310 p = mp_spread(pool, p, son->len + 1);
1311 memcpy(p, son->text, son->len);
1314 else if (son->type == XML_NODE_ELEM)
1315 p = xml_append_dom_chars(p, pool, son);
/* Builds a string in `pool` from the character data found by
 * xml_append_dom_chars() under element `node`, and returns the result of
 * mp_end(). `node` must be of type XML_NODE_ELEM (asserted).
 * NOTE(review): a terminator store before mp_end() is presumably on a line
 * not visible here -- confirm. */
1320 xml_merge_dom_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool)
1322 ASSERT(node->type == XML_NODE_ELEM);
1323 char *p = mp_start_noalign(pool, 1);
1324 p = xml_append_dom_chars(p, pool, node);
1326 return mp_end(pool, p);