2 * UCW Library -- A simple XML parser
4 * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
5 * (c) 2015 Martin Mares <mj@ucw.cz>
7 * This software may be freely distributed and used according to the terms
8 * of the GNU Lesser General Public License.
14 #include <ucw-xml/xml.h>
15 #include <ucw-xml/dtd.h>
16 #include <ucw-xml/internals.h>
17 #include <ucw/fastbuf.h>
18 #include <ucw/ff-unicode.h>
19 #include <ucw/unicode.h>
20 #include <ucw/chartype.h>
21 #include <ucw/hashfunc.h>
25 /*** Basic parsing ***/
/*
 * Raise a fatal parse error naming the character `c` that was expected.
 * Printable ASCII (32..126) is reported literally via '%c'; the other
 * message prints the code point in the U+XXXX form.
 */
28 xml_fatal_expected(struct xml_context *ctx, uint c)
30 if (c >= 32 && c < 127)
31 xml_fatal(ctx, "Expected '%c'", c);
33 xml_fatal(ctx, "Expected U+%04x", c);
/* Raise a fatal parse error with the message "Expected a white space". */
37 xml_fatal_expected_white(struct xml_context *ctx)
39 xml_fatal(ctx, "Expected a white space");
/* Raise a fatal parse error with the message "Expected a quotation mark". */
43 xml_fatal_expected_quot(struct xml_context *ctx)
45 xml_fatal(ctx, "Expected a quotation mark");
/* Parse the Eq production: a '=' with optional white space on either side. */
49 xml_parse_eq(struct xml_context *ctx)
51 /* Eq ::= S? '=' S? */
52 xml_parse_white(ctx, 0);
53 xml_parse_char(ctx, '=');
54 xml_parse_white(ctx, 0);
57 /*** Memory management ***/
/*
 * Allocate a parser stack frame of `size` bytes from the ctx->stack mempool.
 * The frame records the current ctx->flags and links to the previous head
 * of ctx->stack_list. The mempool state is captured before the frame is
 * allocated; xml_do_pop() later restores the pool from the frame's state.
 */
59 void *xml_do_push(struct xml_context *ctx, uint size)
61 /* Saves ctx->stack and ctx->flags state */
62 struct mempool_state state;
63 mp_save(ctx->stack, &state);
64 struct xml_stack *s = mp_alloc(ctx->stack, size);
66 s->flags = ctx->flags;
67 s->next = ctx->stack_list;
/*
 * Counterpart of xml_do_push(): unlink frame `s` from ctx->stack_list,
 * bring back the flags stored in it and roll the ctx->stack mempool back
 * to the state kept in the frame.
 * NOTE(review): `s` was itself allocated from ctx->stack, so it is
 * presumably no longer valid after the restore -- confirm.
 */
72 void xml_do_pop(struct xml_context *ctx, struct xml_stack *s)
74 /* Restore ctx->stack and ctx->flags state */
75 ctx->stack_list = s->next;
76 ctx->flags = s->flags;
77 mp_restore(ctx->stack, &s->state);
/*
 * Push a DOM stack frame and allocate a new node from ctx->pool.
 * The node's parent becomes the current ctx->node; when a parent exists,
 * the node is appended to the parent's list of sons.
 * NOTE(review): how the `state` argument is used is not visible in these
 * lines (callers pass either NULL or &ctx->chars_state) -- confirm.
 */
80 struct xml_node *xml_push_dom(struct xml_context *ctx, struct mempool_state *state)
82 /* Create a new DOM node */
83 TRACE(ctx, "push_dom");
84 struct xml_dom_stack *s = xml_do_push(ctx, sizeof(*s));
88 mp_save(ctx->pool, &s->state);
89 struct xml_node *n = mp_alloc(ctx->pool, sizeof(*n));
/* The assignment is intentional: link to the parent and test it for NULL in one step. */
91 if (n->parent = ctx->node)
92 clist_add_tail(&n->parent->sons, &n->n);
/*
 * Leave the current DOM node and pop its stack frame.
 * The frame is taken from the head of ctx->stack_list and released with
 * xml_do_pop(). The visible unlinking of the node from its siblings and
 * the rollback of ctx->pool are presumably conditional on `free` --
 * the conditions are not visible here, confirm.
 */
96 void xml_pop_dom(struct xml_context *ctx, uint free)
98 /* Leave DOM subtree */
99 TRACE(ctx, "pop_dom");
101 struct xml_node *p = ctx->node->parent;
102 struct xml_dom_stack *s = (void *)ctx->stack_list;
105 /* See xml_pop_element() for cleanup of attribute hash table */
107 clist_remove(&ctx->node->n);
108 mp_restore(ctx->pool, &s->state);
111 xml_do_pop(ctx, &s->stack);
/*
 * Consume a run of white-space characters (category XML_CHAR_WHITE).
 * With `mandatory` set, a fatal "expected white space" error is raised
 * when the counter `cnt` is still zero after the loop.
 * Callers test the result for non-zero; presumably it is `cnt`, the
 * number of characters skipped -- confirm.
 */
116 uint xml_parse_white(struct xml_context *ctx, uint mandatory)
118 /* mandatory=1 -> S ::= (#x20 | #x9 | #xD | #xA)+
119 * mandatory=0 -> S? */
121 while (xml_peek_cat(ctx) & XML_CHAR_WHITE)
126 if (unlikely(mandatory && !cnt))
127 xml_fatal_expected_white(ctx);
131 /*** Names and nmtokens ***/
/*
 * Read a token from the input into `pool`, encoded as UTF-8.
 * The first character must match the category mask `first_cat`, otherwise
 * a fatal error with the message `err` is raised; further characters are
 * taken for as long as the upcoming one matches `next_cat`.
 * The stored text is preceded by a '<' sentinel; the returned pointer
 * skips it (hence the "+ 1" on the result of mp_end()).
 */
134 xml_parse_string(struct xml_context *ctx, struct mempool *pool, uint first_cat, uint next_cat, char *err)
136 char *p = mp_start_noalign(pool, 2);
137 *p++ = '<'; /* We always prepend a '<', so we can seek backwards in the string */
138 if (unlikely(!(xml_peek_cat(ctx) & first_cat)))
139 xml_fatal(ctx, "%s", err);
/* Make sure there is room in the growing buffer before appending the next character. */
142 p = mp_spread(pool, p, 5);
143 p = utf8_32_put(p, xml_skip_char(ctx));
145 while (xml_peek_cat(ctx) & next_cat);
147 return mp_end(pool, p) + 1;
/*
 * Same token shape as xml_parse_string(), but nothing is stored.
 * The first character is read and checked against `first_cat` (fatal
 * error `err` on mismatch); the loop then runs while the upcoming
 * character matches `next_cat` (its body is presumably the call that
 * consumes the character -- not visible here).
 */
151 xml_skip_string(struct xml_context *ctx, uint first_cat, uint next_cat, char *err)
153 if (unlikely(!(xml_get_cat(ctx) & first_cat)))
154 xml_fatal(ctx, "%s", err);
155 while (xml_peek_cat(ctx) & next_cat)
/*
 * Parse a Name into `pool` and return it. The first character is checked
 * against ctx->cat_sname and the following ones against ctx->cat_name;
 * a mismatch on the first character is the fatal error "Expected a name".
 */
160 xml_parse_name(struct xml_context *ctx, struct mempool *pool)
162 /* Name ::= NameStartChar (NameChar)* */
163 return xml_parse_string(ctx, pool, ctx->cat_sname, ctx->cat_name, "Expected a name");
/* Skip over a Name without storing it; same character classes and error as xml_parse_name(). */
167 xml_skip_name(struct xml_context *ctx)
169 xml_skip_string(ctx, ctx->cat_sname, ctx->cat_name, "Expected a name");
/*
 * Parse an Nmtoken into `pool` and return it. Unlike a Name, the first
 * character is checked against the same class (ctx->cat_name) as the rest.
 */
173 xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool)
175 /* Nmtoken ::= (NameChar)+ */
176 return xml_parse_string(ctx, pool, ctx->cat_name, ctx->cat_name, "Expected a nmtoken");
179 /*** Simple literals ***/
/*
 * Parse a quoted system literal into `pool`.
 * The opening quote returned by xml_parse_quote() also serves as the
 * terminator; every character before the matching quote is appended
 * as UTF-8. Returns the string finished with mp_end().
 */
182 xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool)
184 /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */
185 char *p = mp_start_noalign(pool, 1);
186 uint q = xml_parse_quote(ctx), c;
187 while ((c = xml_get_char(ctx)) != q)
189 p = mp_spread(pool, p, 5);
190 p = utf8_32_put(p, c);
193 return mp_end(pool, p);
/*
 * Parse a quoted public identifier literal into `pool`.
 * Every character before the closing quote must belong to the
 * XML_CHAR_PUBID category, otherwise a fatal error is raised.
 * Returns the string finished with mp_end().
 */
197 xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool)
199 /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */
200 char *p = mp_start_noalign(pool, 1);
201 uint q = xml_parse_quote(ctx), c;
202 while ((c = xml_get_char(ctx)) != q)
204 if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID)))
205 xml_fatal(ctx, "Expected a pubid character");
206 p = mp_spread(pool, p, 2);
210 return mp_end(pool, p);
/*
 * Parse a comment and push it as a DOM node of type XML_NODE_COMMENT.
 * The comment text is collected as UTF-8 in ctx->pool and stored in
 * n->text, its length in n->len. When XML_REPORT_COMMENTS is set and a
 * h_comment handler is installed, the handler branch at the end is taken.
 */
216 xml_push_comment(struct xml_context *ctx)
218 TRACE(ctx, "push_comment");
219 /* Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
220 * Already parsed: '<!-' */
221 xml_parse_char(ctx, '-');
222 struct xml_node *n = xml_push_dom(ctx, NULL);
223 n->type = XML_NODE_COMMENT;
224 char *p = mp_start_noalign(ctx->pool, 6);
227 if (xml_get_char(ctx) == '-')
228 if (xml_get_char(ctx) == '-')
232 p = utf8_32_put(p, xml_last_char(ctx));
233 p = mp_spread(ctx->pool, p, 6);
/* Once "--" has been seen, the only valid continuation is '>'. */
235 xml_parse_char(ctx, '>');
/* The length excludes the extra byte kept by mp_end(p + 1), presumably the terminating zero. */
237 n->len = p - (char *)mp_ptr(ctx->pool);
238 n->text = mp_end(ctx->pool, p + 1);
239 if ((ctx->flags & XML_REPORT_COMMENTS) && ctx->h_comment)
/*
 * Leave the comment node pushed by xml_push_comment(). The `free`
 * argument of xml_pop_dom() is set when XML_ALLOC_COMMENTS is not
 * requested.
 */
244 xml_pop_comment(struct xml_context *ctx)
246 xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_COMMENTS));
248 TRACE(ctx, "pop_comment");
/*
 * Skip a comment without storing it. As in xml_push_comment(), the
 * second '-' of the opening "<!--" is checked here first.
 */
252 xml_skip_comment(struct xml_context *ctx)
254 TRACE(ctx, "skip_comment");
255 xml_parse_char(ctx, '-');
/* Read until two consecutive '-' are seen; the second read happens only after a '-'. */
256 while (xml_get_char(ctx) != '-' || xml_get_char(ctx) != '-');
257 xml_parse_char(ctx, '>');
261 /*** Processing instructions ***/
/*
 * Parse a processing instruction and push it as a DOM node of type
 * XML_NODE_PI. The target goes to n->name (the reserved target "xml",
 * compared case-insensitively, is reported with xml_error()), the
 * instruction body to n->text with its length in n->len.
 */
264 xml_push_pi(struct xml_context *ctx)
266 TRACE(ctx, "push_pi");
267 /* Parses a PI to ctx->value and ctx->name:
268 * PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
269 * PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
270 * Already parsed: '<?' */
271 struct xml_node *n = xml_push_dom(ctx, NULL);
272 n->type = XML_NODE_PI;
273 n->name = xml_parse_name(ctx, ctx->pool);
274 if (unlikely(!strcasecmp(n->name, "xml")))
275 xml_error(ctx, "Reserved PI target");
276 char *p = mp_start_noalign(ctx->pool, 5);
/* No white space after the target: the instruction has no body and must end right here. */
277 if (!xml_parse_white(ctx, 0))
278 xml_parse_seq(ctx, "?>");
/* The '>' is only peeked, so a '?' that does not start "?>" stays part of the body. */
282 if (xml_get_char(ctx) == '?')
283 if (xml_peek_char(ctx) == '>')
291 p = utf8_32_put(p, xml_last_char(ctx));
292 p = mp_spread(ctx->pool, p, 5);
295 n->len = p - (char *)mp_ptr(ctx->pool);
296 n->text = mp_end(ctx->pool, p + 1);
297 if ((ctx->flags & XML_REPORT_PIS) && ctx->h_pi)
/*
 * Leave the PI node pushed by xml_push_pi(). The `free` argument of
 * xml_pop_dom() is set when XML_ALLOC_PIS is not requested.
 */
302 xml_pop_pi(struct xml_context *ctx)
304 xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_PIS));
306 TRACE(ctx, "pop_pi");
/*
 * Skip a processing instruction without creating a node.
 * When validating (XML_VALIDATING), the target name is still parsed --
 * into ctx->stack, which is rolled back right after -- so that the
 * reserved target "xml" can be reported.
 */
310 xml_skip_pi(struct xml_context *ctx)
312 TRACE(ctx, "skip_pi");
313 if (ctx->flags & XML_VALIDATING)
315 struct mempool_state state;
316 mp_save(ctx->stack, &state);
317 if (unlikely(!strcasecmp(xml_parse_name(ctx, ctx->stack), "xml")))
318 xml_error(ctx, "Reserved PI target");
319 mp_restore(ctx->stack, &state);
320 if (!xml_parse_white(ctx, 0))
322 xml_parse_seq(ctx, "?>");
/* The '>' is only peeked, so "??>" is still recognised as the end. */
328 if (xml_get_char(ctx) == '?')
329 if (xml_peek_char(ctx) == '>')
335 /*** Character references ***/
/*
 * Parse a numeric character reference and compute its code point `v`.
 * Both the hexadecimal and the decimal loop stop accumulating once `v`
 * reaches 0x110000, which bounds the value and keeps `v` from
 * overflowing. The result is checked against ctx->cat_unrestricted.
 * On the error paths the input is read up to the next ';' and
 * UNI_REPLACEMENT is returned.
 */
338 xml_parse_char_ref(struct xml_context *ctx)
340 TRACE(ctx, "parse_char_ref");
341 /* CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
342 * Already parsed: '&#' */
344 if (xml_get_char(ctx) == 'x')
346 if (!(xml_get_cat(ctx) & XML_CHAR_XDIGIT))
348 xml_error(ctx, "Expected a hexadecimal value of character reference");
353 v = (v << 4) + Cxvalue(xml_last_char(ctx));
355 while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_XDIGIT));
/* The character read by the 'x' test is the first decimal digit, hence xml_last_cat(). */
359 if (!(xml_last_cat(ctx) & XML_CHAR_DIGIT))
361 xml_error(ctx, "Expected a numeric value of character reference");
366 v = v * 10 + xml_last_char(ctx) - '0';
368 while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT));
370 uint cat = xml_char_cat(v);
371 if (!(cat & ctx->cat_unrestricted))
373 xml_error(ctx, "Character reference out of range");
376 if (xml_last_char(ctx) == ';')
381 xml_error(ctx, "Expected ';'");
383 while (xml_last_char(ctx) != ';')
386 return UNI_REPLACEMENT;
389 /*** References to general entities ***/
/*
 * Parse a reference that follows an already consumed '&'.
 * A character reference ('#' follows) is decoded and written to
 * ctx->chars as UTF-8. Otherwise the entity name is parsed into
 * ctx->stack and looked up in the DTD: an unknown entity is reported,
 * a trivial one has its text copied to ctx->chars, and any other is
 * entered with xml_push_entity(). The temporary name is released by
 * restoring ctx->stack.
 */
392 xml_parse_ref(struct xml_context *ctx)
394 /* Reference ::= EntityRef | CharRef
395 * EntityRef ::= '&' Name ';'
396 * Already parsed: '&' */
397 struct fastbuf *out = &ctx->chars;
398 if (xml_peek_char(ctx) == '#')
401 bput_utf8_32(out, xml_parse_char_ref(ctx));
405 TRACE(ctx, "parse_ge_ref");
406 struct mempool_state state;
407 mp_save(ctx->stack, &state);
408 char *name = xml_parse_name(ctx, ctx->stack);
409 xml_parse_char(ctx, ';');
410 struct xml_dtd_entity *ent = xml_dtd_find_entity(ctx, name);
413 xml_error(ctx, "Unknown entity &%s;", name);
418 else if (ent->flags & XML_DTD_ENTITY_TRIVIAL)
420 TRACE(ctx, "Trivial entity &%s;", name);
421 bputs(out, ent->text);
425 TRACE(ctx, "Pushed entity &%s;", name);
/* The name is no longer needed: release it before entering the entity. */
426 mp_restore(ctx->stack, &state);
428 xml_push_entity(ctx, ent);
431 mp_restore(ctx->stack, &state);
436 /*** Character data ***/
/*
 * Provide more room for the character-data fastbuf embedded in the
 * context (ctx->chars; the context is recovered with SKIP_BACK()).
 * If a buffer already exists (bufend != buffer), it is grown with
 * mp_expand() and the write and "reported" positions are re-based onto
 * the new address. Otherwise the pool state is saved to
 * ctx->chars_state and a fresh growing buffer is started in ctx->pool.
 */
439 xml_spout_chars(struct fastbuf *fb)
441 if (fb->bptr < fb->bufend)
443 struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb);
444 struct mempool *pool = ctx->pool;
445 if (fb->bufend != fb->buffer)
447 TRACE(ctx, "growing chars");
/* Remember offsets, not pointers: the buffer may move when expanded. */
448 uint len = fb->bufend - fb->buffer;
449 uint reported = fb->bstop - fb->buffer;
450 fb->buffer = mp_expand(pool);
451 fb->bufend = fb->buffer + mp_avail(pool);
452 fb->bptr = fb->buffer + len;
453 fb->bstop = fb->buffer + reported;
457 TRACE(ctx, "starting chars");
458 mp_save(pool, &ctx->chars_state);
459 fb->bptr = fb->buffer = fb->bstop = mp_start_noalign(pool, 2);
/* One byte is held back from the usable area, presumably for the terminating zero. */
460 fb->bufend = fb->buffer + mp_avail(pool) - 1;
/*
 * Close the character buffer collected in ctx->chars.
 * `len` is the number of bytes written so far. The finished pool block
 * is stored to *out and all fastbuf pointers are reset to the buffer
 * start, which marks the buffer as not started (xml_spout_chars() tests
 * bufend != buffer). Callers use the result as the text length,
 * presumably `len` -- the return statement is not visible here.
 */
465 xml_end_chars(struct xml_context *ctx, char **out)
467 struct fastbuf *fb = &ctx->chars;
468 uint len = fb->bptr - fb->buffer;
471 TRACE(ctx, "ending chars");
473 *out = mp_end(ctx->pool, fb->bptr + 1);
474 fb->bufend = fb->bstop = fb->bptr = fb->buffer;
/*
 * Hand out the characters accumulated in ctx->chars and advance the
 * "reported" marker fb->bstop to the current write position.
 * NOTE(review): `len` is measured from fb->buffer, while the marker that
 * is advanced is fb->bstop. If the text handed out starts at bstop, the
 * length should presumably be bptr - bstop -- confirm against the lines
 * not shown.
 */
480 xml_report_chars(struct xml_context *ctx, char **out)
482 struct fastbuf *fb = &ctx->chars;
483 uint len = fb->bptr - fb->buffer;
488 fb->bstop = fb->bptr;
/*
 * Finish the pending character data and turn it into a node or a report.
 * Inside an element flagged XML_NO_CHARS the text is passed to
 * h_ignorable (when reporting is on) and the pool is rolled back to
 * ctx->chars_state. Text that is neither allocated (XML_ALLOC_CHARS)
 * nor reported (XML_REPORT_CHARS) is likewise discarded. Otherwise an
 * XML_NODE_CHARS node is pushed.
 * NOTE(review): xml_end_chars() has already reset the fastbuf pointers
 * when xml_report_chars() is called below, so the h_block branch looks
 * like it can never see any data -- confirm.
 */
494 xml_flush_chars(struct xml_context *ctx)
497 uint len = xml_end_chars(ctx, &text), rlen;
500 if (ctx->flags & XML_NO_CHARS)
502 if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_ignorable)
503 ctx->h_ignorable(ctx, text, len);
504 mp_restore(ctx->pool, &ctx->chars_state);
507 if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext)))
508 ctx->h_block(ctx, rtext, rlen);
509 if (!(ctx->flags & XML_ALLOC_CHARS) && !(ctx->flags & XML_REPORT_CHARS))
511 mp_restore(ctx->pool, &ctx->chars_state);
514 struct xml_node *n = xml_push_dom(ctx, &ctx->chars_state);
515 n->type = XML_NODE_CHARS;
518 if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_chars)
/*
 * Leave the character-data node pushed by xml_flush_chars(). The `free`
 * argument of xml_pop_dom() is set when XML_ALLOC_CHARS is not requested.
 */
525 xml_pop_chars(struct xml_context *ctx)
527 xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS));
528 TRACE(ctx, "pop_chars");
/*
 * Read character data up to the next '<' and append it to ctx->chars.
 * Where character data is not allowed (XML_NO_CHARS), only white space
 * is collected; the first other character is reported with xml_error()
 * and the rest up to '<' is discarded. Otherwise '&' is singled out
 * (presumably to parse a reference -- not visible here) and everything
 * else is appended as UTF-8.
 */
532 xml_append_chars(struct xml_context *ctx)
534 TRACE(ctx, "append_chars");
535 struct fastbuf *out = &ctx->chars;
536 if (ctx->flags & XML_NO_CHARS)
537 while (xml_get_char(ctx) != '<')
538 if (xml_last_cat(ctx) & XML_CHAR_WHITE)
539 bput_utf8_32(out, xml_last_char(ctx));
542 xml_error(ctx, "This element must not contain character data");
543 while (xml_get_char(ctx) != '<');
547 while (xml_get_char(ctx) != '<')
548 if (xml_last_char(ctx) == '&')
554 bput_utf8_32(out, xml_last_char(ctx));
558 /*** CDATA sections ***/
561 xml_skip_cdata(struct xml_context *ctx)
563 TRACE(ctx, "skip_cdata");
564 xml_parse_seq(ctx, "CDATA[");
565 while (xml_get_char(ctx) != ']' || xml_get_char(ctx) != ']' || xml_get_char(ctx) != '>');
/*
 * Parse a CDATA section and append its content to ctx->chars.
 * Inside an element flagged XML_NO_CHARS the section is reported as an
 * error. Text collected before the section is handed to h_block and the
 * content of the section to h_cdata, both only when XML_REPORT_CHARS is
 * set and xml_report_chars() returns a non-zero length.
 * NOTE(review): the terminator test reads up to three characters in a
 * row; a sequence such as "]]]>" may therefore not be recognised as the
 * end of the section -- confirm against the lines not shown.
 */
570 xml_append_cdata(struct xml_context *ctx)
572 /* CDSect :== '<![CDATA[' (Char* - (Char* ']]>' Char*)) ']]>'
573 * Already parsed: '<![' */
574 TRACE(ctx, "append_cdata");
575 if (ctx->flags & XML_NO_CHARS)
577 xml_error(ctx, "This element must not contain CDATA");
581 xml_parse_seq(ctx, "CDATA[");
582 struct fastbuf *out = &ctx->chars;
585 if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext)))
586 ctx->h_block(ctx, rtext, rlen);
589 if (xml_get_char(ctx) == ']')
591 if (xml_get_char(ctx) == ']')
592 if (xml_get_char(ctx) == '>')
598 bput_utf8_32(out, xml_last_char(ctx));
600 if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_cdata && (rlen = xml_report_chars(ctx, &rtext)))
601 ctx->h_cdata(ctx, rtext, rlen);
605 /*** Attribute values ***/
/*
 * Parse a quoted attribute value into ctx->chars and return it; an
 * empty value is returned as the constant "". The `attr` argument is
 * currently unused.
 * The closing quote only counts when it comes from the same input
 * source the value started in (src == ctx->src), so a quote character
 * read from another source is not taken as the terminator.
 */
608 xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr UNUSED)
610 TRACE(ctx, "parse_attr_value");
611 /* AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'" */
612 /* FIXME: -- check value constraints / normalize leading/trailing WS and repeated WS */
613 struct mempool_state state;
614 uint quote = xml_parse_quote(ctx);
615 mp_save(ctx->stack, &state);
616 struct fastbuf *out = &ctx->chars;
617 struct xml_source *src = ctx->src;
620 uint c = xml_get_char(ctx);
626 else if (c == quote && src == ctx->src)
629 xml_error(ctx, "Attribute value must not contain '<'");
630 else if (xml_last_cat(ctx) & XML_CHAR_WHITE)
633 bput_utf8_32(out, c);
635 mp_restore(ctx->stack, &state);
637 return xml_end_chars(ctx, &text) ? text : "";
/*
 * Normalize spaces in `text` in place, using a read cursor `s` and a
 * write cursor `d` over the same buffer. The visible steps skip over a
 * run of 0x20 characters and test whether the output written so far
 * ends with a space (presumably to trim it -- the action is not visible
 * here). The context argument is unused.
 */
641 xml_normalize_white(struct xml_context *ctx UNUSED, char *text)
643 char *s = text, *d = text;
652 while (*++s == 0x20);
655 if (d != text && d[-1] == 0x20)
/*
 * Allocate an attribute record from ctx->pool and append it to the
 * attribute list of element `e`. Namespace (a->ns) is left at 0 and the
 * hash is not filled in yet; both are completed later.
 */
664 xml_raw_add_attr(struct xml_context *ctx, struct xml_node *e, char *name, char *value)
666 struct xml_attr *a = mp_alloc(ctx->pool, sizeof(*a));
668 a->ns = 0; /* Namespaces will be resolved later */
673 /* a->hash will be calculated later */
674 slist_add_tail(&e->attrs, &a->n);
/* Hash of an attribute identity: the string hash of `name` XOR-ed with the integer hash of `ns`. */
678 xml_attr_hash(uint ns, char *name)
680 return hash_string(name) ^ hash_u32(ns);
/*
 * Parse one attribute of the current element (ctx->node): its name is
 * allocated in ctx->pool, the value is parsed without DTD information
 * (NULL), and the pair is appended with xml_raw_add_attr().
 */
684 xml_parse_attr(struct xml_context *ctx)
686 TRACE(ctx, "parse_attr");
687 /* Attribute ::= Name Eq AttValue */
688 struct xml_node *e = ctx->node;
689 char *n = xml_parse_name(ctx, ctx->pool);
691 char *v = xml_parse_attr_value(ctx, NULL);
692 xml_raw_add_attr(ctx, e, n, v);
/*
 * Compute the hash of attribute `a` and compare it with the attributes
 * of the same element; another attribute with the same hash, namespace
 * and name is reported as a uniqueness error.
 */
696 xml_process_attr(struct xml_context *ctx, struct xml_attr *a)
698 struct xml_node *e = a->elem;
699 a->hash = xml_attr_hash(a->ns, a->name);
701 XML_ATTR_FOR_EACH(a2, e)
/* The cheap hash and namespace comparisons come first, strcmp() only on a match. */
705 if (a2->hash == a->hash && a2->ns == a->ns && !strcmp(a2->name, a->name))
706 xml_error(ctx, "Attribute %s is not unique in element <%s>", xml_attr_qname(ctx, a), xml_node_qname(ctx, e));
/* Find attribute `name` of `node` in namespace 0; shorthand for xml_attr_find_ns(). */
711 xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name)
713 return xml_attr_find_ns(ctx, node, 0, name);
/*
 * Search the attributes of element `node` for one matching both the
 * namespace `ns` and `name`. The node must be an element (asserted).
 * The context argument is unused.
 */
717 xml_attr_find_ns(struct xml_context *ctx UNUSED, struct xml_node *node, uint ns, char *name)
719 ASSERT(node->type == XML_NODE_ELEM);
720 uint hash = xml_attr_hash(ns, name);
721 XML_ATTR_FOR_EACH(a, node)
/* The cheap hash and namespace comparisons come first, strcmp() only on a match. */
722 if (a->hash == hash && a->ns == ns && !strcmp(a->name, name))
/*
 * Look up the value of attribute `name` in namespace `ns` on `node`.
 * When the attribute is not present, the DTD declaration of the element
 * is consulted and its default value (or NULL) is returned. A non-zero
 * namespace is treated specially before the DTD lookup, as DTD support
 * is not namespace-aware (the action taken is not visible here).
 */
728 xml_attr_value_ns(struct xml_context *ctx, struct xml_node *node, uint ns, char *name)
730 struct xml_attr *attr = xml_attr_find_ns(ctx, node, ns, name);
735 if (ns) /* So far, our DTD support is not namespace-aware */
737 struct xml_dtd_attr *dtd = xml_dtd_find_attr(ctx, node->dtd, name);
738 return dtd ? dtd->default_value : NULL;
/* Value of attribute `name` of `node` in namespace 0; shorthand for xml_attr_value_ns(). */
742 xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name)
744 return xml_attr_value_ns(ctx, node, 0, name);
/*
 * Return the qualified name of an attribute, derived from attr->name.
 * The context argument is unused.
 * NOTE(review): the derivation itself is not visible in these lines --
 * confirm before relying on the exact result.
 */
748 xml_attr_qname(struct xml_context *ctx UNUSED, struct xml_attr *attr)
750 char *n = attr->name;
/*
 * Check whether `elem` occurs in the content-model tree rooted at `root`.
 * One kind of node is compared directly against root->elem; otherwise
 * the sons of the node are searched recursively.
 */
759 xml_validate_element(struct xml_dtd_elem_node *root, struct xml_dtd_elem *elem)
762 return elem == root->elem;
764 SLIST_FOR_EACH(struct xml_dtd_elem_node *, son, root->sons)
765 if (xml_validate_element(son, elem))
/*
 * Parse a start tag (or empty-element tag) and push the element node.
 * Steps visible here: create the node and parse its name, check it
 * against the doctype and the DTD, update XML_NO_CHARS according to the
 * element's content type, parse the attributes, resolve namespaces,
 * hash the attributes and check their uniqueness, validate them against
 * the DTD (required and default attributes included) and finally take
 * the h_stag branch when tags are reported.
 */
771 xml_push_element(struct xml_context *ctx)
773 TRACE(ctx, "push_element");
774 /* EmptyElemTag | STag
775 * EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
776 * STag ::= '<' Name (S Attribute)* S? '>'
777 * Already parsed: '<' */
778 struct xml_node *e = xml_push_dom(ctx, NULL);
779 clist_init(&e->sons);
780 e->type = XML_NODE_ELEM;
781 e->name = xml_parse_name(ctx, ctx->pool);
782 slist_init(&e->attrs);
787 if (ctx->doctype && strcmp(e->name, ctx->doctype))
788 xml_error(ctx, "The root element <%s> does not match the document type <%s>", e->name, ctx->doctype);
793 else if (!(e->dtd = xml_dtd_find_elem(ctx, e->name)))
794 xml_error(ctx, "Undefined element <%s>", e->name);
797 struct xml_dtd_elem *dtd = e->dtd, *parent_dtd = e->parent ? e->parent->dtd : NULL;
/* Only mixed content admits character data. */
798 if (dtd->type == XML_DTD_ELEM_MIXED)
799 ctx->flags &= ~XML_NO_CHARS;
801 ctx->flags |= XML_NO_CHARS;
/* NOTE(review): parent_dtd is NULL for an element without a parent; the guard is presumably on a line not shown -- confirm. */
803 if (parent_dtd->type == XML_DTD_ELEM_EMPTY)
804 xml_error(ctx, "Empty element must not contain children");
805 else if (parent_dtd->type != XML_DTD_ELEM_ANY)
807 // FIXME: validate regular expressions
808 if (!xml_validate_element(parent_dtd->node, dtd))
809 xml_error(ctx, "Unexpected element <%s>", e->name);
813 /* Parse attributes */
816 uint white = xml_parse_white(ctx, 0);
817 uint c = xml_get_char(ctx);
820 xml_parse_char(ctx, '>');
821 ctx->flags |= XML_EMPTY_ELEM_TAG;
827 xml_fatal_expected_white(ctx);
832 /* Resolve namespaces */
833 xml_ns_push_element(ctx);
835 /* Once we have namespaces, hash attribute names */
836 XML_ATTR_FOR_EACH(a, e)
837 xml_process_attr(ctx, a);
839 /* FIXME: DTD logic is not namespace-aware */
842 XML_ATTR_FOR_EACH(a, e)
844 if (!(a->dtd = xml_dtd_find_attr(ctx, e->dtd, a->name)))
845 xml_error(ctx, "Undefined attribute %s in element <%s>", a->name, e->name);
847 xml_validate_attr(ctx, a->dtd, a->val);
/* Second pass over the declared attributes: report missing required ones and add defaults when requested. */
849 SLIST_FOR_EACH(struct xml_dtd_attr *, a, e->dtd->attrs)
851 if (a->default_mode == XML_ATTR_REQUIRED)
853 if (!xml_attr_find(ctx, e, a->name))
854 xml_error(ctx, "Missing required attribute %s in element <%s>", a->name, e->name);
856 else if (a->default_mode != XML_ATTR_IMPLIED && ctx->flags & XML_ALLOC_DEFAULT_ATTRS)
858 if (!xml_attr_find(ctx, e, a->name))
859 xml_raw_add_attr(ctx, e, a->name, a->default_value);
864 if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_stag)
/*
 * Leave the current element: take the h_etag branch when tags are
 * reported, pop the namespace scope and leave the DOM node. The `free`
 * value passed to xml_pop_dom() is set when XML_ALLOC_TAGS is not
 * requested. The cleanup loops walk the element's attributes and its
 * sons; the conditions under which they run are not visible here.
 */
869 xml_pop_element(struct xml_context *ctx)
871 TRACE(ctx, "pop_element");
872 if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_etag)
875 xml_ns_pop_element(ctx);
877 struct xml_node *e = ctx->node;
878 uint free = !(ctx->flags & XML_ALLOC_TAGS);
885 * With the current data structures, freeing of attributes is not necessary,
886 * but it might be if we switch to a global hash table of large elements.
888 SLIST_FOR_EACH(struct xml_attr *, a, e->attrs)
889 xml_attrs_remove(ctx->tab_attrs, a);
891 while (n = clist_head(&e->sons))
893 if (n->type == XML_NODE_ELEM)
895 SLIST_FOR_EACH(struct xml_attr *, a, n->attrs)
896 xml_attrs_remove(ctx->tab_attrs, a);
897 clist_insert_list_after(&n->sons, &n->n);
904 xml_pop_dom(ctx, free);
/*
 * Parse an end tag and check it against the current element.
 * The qualified name of ctx->node is decoded character by character
 * (utf8_32_get) and compared with the input; optional white space and
 * the closing '>' follow. On a mismatch the error is reported and the
 * input is skipped up to the next '>'.
 */
909 xml_parse_etag(struct xml_context *ctx)
911 /* ETag ::= '</' Name S? '>'
912 * Already parsed: '<' */
913 struct xml_node *e = ctx->node;
915 char *n = xml_node_qname(ctx, e);
919 n = utf8_32_get(n, &c);
920 if (xml_get_char(ctx) != c)
923 xml_parse_white(ctx, 0);
924 if (xml_get_char(ctx) != '>')
927 xml_error(ctx, "Invalid ETag, expected </%s>", xml_node_qname(ctx, e));
928 while (xml_get_char(ctx) != '>');
/*
 * Return the qualified name of an element node, derived from
 * node->name. The node must be an element (asserted); the context
 * argument is unused.
 * NOTE(review): the derivation itself is not visible in these lines --
 * confirm before relying on the exact result.
 */
934 xml_node_qname(struct xml_context *ctx UNUSED, struct xml_node *node)
936 ASSERT(node->type == XML_NODE_ELEM);
937 char *n = node->name;
943 /*** Document type declaration ***/
/*
 * Parse the head of a document type declaration.
 * The document type name goes to ctx->doctype. An optional external
 * identifier introduced by 'S' (SYSTEM) or 'P' (PUBLIC) fills
 * ctx->system_id and, for PUBLIC, also ctx->public_id, and sets
 * XML_HAS_EXTERNAL_SUBSET. A following '[' sets XML_HAS_INTERNAL_SUBSET.
 * The h_doctype_decl handler is called when installed. A second
 * declaration is a fatal error.
 */
946 xml_parse_doctype_decl(struct xml_context *ctx)
948 TRACE(ctx, "parse_doctype_decl");
949 /* doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
950 * Already parsed: '<!'
951 * Terminated before '[' or '>' */
953 xml_fatal(ctx, "Multiple document types not allowed");
954 xml_parse_seq(ctx, "DOCTYPE");
955 xml_parse_white(ctx, 1);
956 ctx->doctype = xml_parse_name(ctx, ctx->pool);
957 TRACE(ctx, "doctype=%s", ctx->doctype);
/* An external identifier is only looked for when white space separates it from the name. */
959 if (xml_parse_white(ctx, 0) && ((c = xml_peek_char(ctx)) == 'S' || c == 'P'))
963 xml_parse_seq(ctx, "SYSTEM");
964 xml_parse_white(ctx, 1);
965 ctx->system_id = xml_parse_system_literal(ctx, ctx->pool);
969 xml_parse_seq(ctx, "PUBLIC");
970 xml_parse_white(ctx, 1);
971 ctx->public_id = xml_parse_pubid_literal(ctx, ctx->pool);
972 xml_parse_white(ctx, 1);
973 ctx->system_id = xml_parse_system_literal(ctx, ctx->pool);
975 xml_parse_white(ctx, 0);
976 ctx->flags |= XML_HAS_EXTERNAL_SUBSET;
978 if (xml_peek_char(ctx) == '[')
980 ctx->flags |= XML_HAS_INTERNAL_SUBSET;
984 if (ctx->h_doctype_decl)
985 ctx->h_doctype_decl(ctx);
990 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
992 /* DTD: Internal subset */
/*
 * Parse the declarations of a DTD subset; `external` selects between
 * the internal subset (ended by ']') and the external one (ended by
 * '>'). After "<!" the next character dispatches to comments and to
 * notation, entity, element and attribute-list declarations, each
 * keyword being completed with xml_parse_seq(). Parameter-entity
 * references are handled by xml_parse_pe_ref(). Anything else is the
 * fatal error "Invalid markup in the ... subset".
 */
995 xml_parse_subset(struct xml_context *ctx, uint external)
998 // -- comments/pi have no parent
999 // -- conditional sections in external subset
1000 // -- check correctness of parameter entities
1002 /* '[' intSubset ']'
1003 * intSubset :== (markupdecl | DeclSep)
1004 * Already parsed: '['
1006 * extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)*
1010 xml_parse_white(ctx, 0);
1011 uint c = xml_get_char(ctx);
1014 if ((c = xml_get_char(ctx)) == '!')
1015 switch (c = xml_get_char(ctx))
1018 xml_push_comment(ctx);
1019 xml_pop_comment(ctx);
1022 xml_parse_seq(ctx, "OTATION");
1023 xml_parse_notation_decl(ctx);
1026 if ((c = xml_get_char(ctx)) == 'N')
1028 xml_parse_seq(ctx, "TITY");
1029 xml_parse_entity_decl(ctx);
1033 xml_parse_seq(ctx, "EMENT");
1034 xml_parse_element_decl(ctx);
1037 goto invalid_markup;
1040 xml_parse_seq(ctx, "TTLIST");
1041 xml_parse_attr_list_decl(ctx);
1044 goto invalid_markup;
1052 goto invalid_markup;
1054 xml_parse_pe_ref(ctx);
1055 else if (c == ']' && !external)
1059 else if (c == '>' && external)
1064 goto invalid_markup;
1069 xml_fatal(ctx, "Invalid markup in the %s subset", external ? "external" : "internal");
1072 /*** The State Machine ***/
/*
 * Advance the parser to the next event enabled in ctx->pull and return
 * the corresponding XML_STATE_* value.
 *
 * The function is resumable: the PULL and PULL_STATE macros store the
 * resume point in ctx->state and place a matching `case` label right
 * after the return, so the next call continues where the previous one
 * stopped (presumably inside a switch on ctx->state that is not visible
 * here). Fatal errors arrive through ctx->throw_buf via setjmp(); the
 * handler ends the run with XML_STATE_EOF.
 *
 * The document is processed in three phases: prolog (XML declaration,
 * comments, PIs and the document type with its DTD subsets), content
 * (character data, CDATA, comments, PIs, start and end tags) and epilog.
 */
1075 xml_next(struct xml_context *ctx)
1077 /* A nasty state machine */
1079 #define PULL(x) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##x; case XML_STATE_##x: ; } while (0)
1080 #define PULL_STATE(x, s) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##s, XML_STATE_##x; case XML_STATE_##s: ; } while (0)
1082 TRACE(ctx, "xml_next (state=%u)", ctx->state);
1084 ctx->throw_buf = &throw_buf;
1085 if (setjmp(throw_buf))
1088 if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal)
1090 TRACE(ctx, "raised fatal error");
1091 return ctx->state = XML_STATE_EOF;
1096 case XML_STATE_START:
1097 TRACE(ctx, "entering prolog");
1098 ctx->flags |= XML_SRC_DOCUMENT | XML_SRC_EXPECTED_DECL;
1099 if (ctx->h_document_start)
1100 ctx->h_document_start(ctx);
1103 if (ctx->h_xml_decl)
1104 ctx->h_xml_decl(ctx);
1107 /* Misc* (doctypedecl Misc*)? */
1110 xml_parse_white(ctx, 0);
1111 xml_parse_char(ctx, '<');
1113 if ((c = xml_get_char(ctx)) == '?')
1114 /* Processing instruction */
1115 if (!(ctx->flags & XML_REPORT_PIS))
1120 PULL_STATE(PI, PROLOG_PI);
1125 /* Found the root tag */
1126 xml_unget_char(ctx);
1129 else if (xml_get_char(ctx) == '-')
1130 if (!(ctx->flags & XML_REPORT_COMMENTS))
1131 xml_skip_comment(ctx);
1134 xml_push_comment(ctx);
1135 PULL_STATE(COMMENT, PROLOG_COMMENT);
1136 xml_pop_comment(ctx);
1141 xml_unget_char(ctx);
1142 xml_parse_doctype_decl(ctx);
1144 if (ctx->flags & XML_HAS_DTD)
1145 if (ctx->flags & XML_PARSE_DTD)
1148 if (ctx->h_dtd_start)
1149 ctx->h_dtd_start(ctx);
1150 if (ctx->flags & XML_HAS_INTERNAL_SUBSET)
1152 xml_parse_subset(ctx, 0);
1155 if (ctx->flags & XML_HAS_EXTERNAL_SUBSET)
/* The external subset is described by a temporary entity carrying the identifiers from the doctype declaration. */
1157 struct xml_dtd_entity ent = {
1158 .system_id = ctx->system_id,
1159 .public_id = ctx->public_id,
1161 xml_parse_white(ctx, 0);
1162 xml_parse_char(ctx, '>');
1163 xml_unget_char(ctx);
1164 ASSERT(ctx->h_resolve_entity);
1165 ctx->h_resolve_entity(ctx, &ent);
1166 ctx->flags |= XML_SRC_EXPECTED_DECL;
1167 xml_parse_subset(ctx, 1);
1168 xml_unget_char(ctx);;
1171 ctx->h_dtd_end(ctx);
1173 else if (ctx->flags & XML_HAS_INTERNAL_SUBSET)
1174 xml_skip_internal_subset(ctx);
1175 xml_parse_white(ctx, 0);
1176 xml_parse_char(ctx, '>');
1181 case XML_STATE_CHARS:
1185 if (xml_peek_char(ctx) != '<')
1188 xml_append_chars(ctx);
1196 if ((c = xml_get_char(ctx)) == '?')
1199 if (!(ctx->flags & (XML_REPORT_PIS | XML_ALLOC_PIS)))
/* Pending character data is flushed (and possibly reported) before any other node is started. */
1203 if (xml_flush_chars(ctx))
1205 PULL_STATE(CHARS, CHARS_BEFORE_PI);
1215 if ((c = xml_get_char(ctx)) == '-')
1218 if (!(ctx->flags & (XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS)))
1219 xml_skip_comment(ctx);
1222 if (xml_flush_chars(ctx))
1224 PULL_STATE(CHARS, CHARS_BEFORE_COMMENT);
1227 xml_push_comment(ctx);
1229 xml_pop_comment(ctx);
1235 xml_append_cdata(ctx);
1238 xml_fatal(ctx, "Unexpected character after '<!'");
1242 /* STag | EmptyElemTag */
1243 xml_unget_char(ctx);
1244 if (xml_flush_chars(ctx))
1246 PULL_STATE(CHARS, CHARS_BEFORE_STAG);
1250 xml_push_element(ctx);
1252 if (ctx->flags & XML_EMPTY_ELEM_TAG)
1259 if (xml_flush_chars(ctx))
1261 PULL_STATE(CHARS, CHARS_BEFORE_ETAG);
1265 xml_parse_etag(ctx);
1268 xml_pop_element(ctx);
1276 TRACE(ctx, "entering epilog");
1279 /* Epilog whitespace is the only place, where a valid document can reach EOF */
1280 if (setjmp(throw_buf))
1281 if (ctx->err_code == XML_ERR_EOF)
1283 TRACE(ctx, "reached EOF");
1284 ctx->state = XML_STATE_EOF;
1285 if (ctx->h_document_end)
1286 ctx->h_document_end(ctx);
1289 ctx->err_msg = NULL;
1290 return XML_STATE_EOF;
1294 xml_parse_white(ctx, 0);
1295 if (setjmp(throw_buf))
1299 xml_parse_char(ctx, '<');
1301 if ((c = xml_get_char(ctx)) == '?')
1302 /* Processing instruction */
1303 if (!(ctx->flags & XML_REPORT_PIS))
1308 PULL_STATE(PI, EPILOG_PI);
1313 xml_parse_char(ctx, '-');
1315 if (!(ctx->flags & XML_REPORT_COMMENTS))
1316 xml_skip_comment(ctx);
1319 xml_push_comment(ctx);
1320 PULL_STATE(COMMENT, EPILOG_COMMENT);
1321 xml_pop_comment(ctx);
1325 xml_fatal(ctx, "Syntax error in the epilog");
/*
 * Run xml_next() once with a caller-supplied pull mask. The previous
 * value of ctx->pull is remembered in `saved`, presumably to be put
 * back afterwards (the assignments are not visible here).
 */
1333 xml_next_state(struct xml_context *ctx, uint pull)
1335 uint saved = ctx->pull;
1337 uint res = xml_next(ctx);
/*
 * Skip the rest of the element whose start tag has just been reported
 * (the parser must be in XML_STATE_STAG). Only end-tag events are
 * pulled, and parsing continues until either xml_next() returns zero or
 * an event arrives while ctx->node is the element the skip started at.
 */
1343 xml_skip_element(struct xml_context *ctx)
1345 ASSERT(ctx->state == XML_STATE_STAG);
1346 struct xml_node *node = ctx->node;
1347 uint saved = ctx->pull, res;
1348 ctx->pull = XML_PULL_ETAG;
1349 while ((res = xml_next(ctx)) && ctx->node != node);
/*
 * Parse the whole document by calling xml_next() until it returns zero,
 * then return the resulting error code (ctx->err_code).
 */
1355 xml_parse(struct xml_context *ctx)
1357 /* This cycle should run only once unless the user overrides the value of ctx->pull in a SAX handler */
1362 while (xml_next(ctx));
1363 return ctx->err_code;
/*
 * Concatenate the text of all character-data sons of element `node`
 * into one string allocated from `pool`. Only direct sons are visited.
 * The node must be an element (asserted); the context argument is
 * unused.
 */
1367 xml_merge_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool)
1369 ASSERT(node->type == XML_NODE_ELEM);
1370 char *p = mp_start_noalign(pool, 1);
1371 XML_NODE_FOR_EACH(son, node)
1372 if (son->type == XML_NODE_CHARS)
/* Reserve one byte more than the text itself, presumably for the terminating zero. */
1374 p = mp_spread(pool, p, son->len + 1);
1375 memcpy(p, son->text, son->len);
1379 return mp_end(pool, p);
/*
 * Recursive helper of xml_merge_dom_chars(): append the text of every
 * character-data node in the subtree of `node` to the growing buffer at
 * `p` in `pool`, descending into element sons. The buffer position is
 * passed in and out because mp_spread() may change it.
 */
1383 xml_append_dom_chars(char *p, struct mempool *pool, struct xml_node *node)
1385 XML_NODE_FOR_EACH(son, node)
1386 if (son->type == XML_NODE_CHARS)
1388 p = mp_spread(pool, p, son->len + 1);
1389 memcpy(p, son->text, son->len);
1392 else if (son->type == XML_NODE_ELEM)
1393 p = xml_append_dom_chars(p, pool, son);
/*
 * Concatenate all character data found anywhere in the subtree of
 * element `node` into one string allocated from `pool`. The node must
 * be an element (asserted); the context argument is unused.
 */
1398 xml_merge_dom_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool)
1400 ASSERT(node->type == XML_NODE_ELEM);
1401 char *p = mp_start_noalign(pool, 1);
1402 p = xml_append_dom_chars(p, pool, node);
1404 return mp_end(pool, p);