2 * Sherlock Library -- A simple XML parser
4 * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
6 * This software may be freely distributed and used according to the terms
7 * of the GNU Lesser General Public License.
13 #include <shxml/xml.h>
14 #include <shxml/dtd.h>
15 #include <shxml/internals.h>
16 #include <ucw/fastbuf.h>
17 #include <ucw/ff-unicode.h>
18 #include <ucw/unicode.h>
19 #include <ucw/chartype.h>
20 #include <ucw/hashfunc.h>
24 /*** Basic parsing ***/
/* Raise a fatal "expected character" error for the code point c.  When c
 * lies in 32..127 the message shows the character itself ('%c'); the other
 * visible message prints it in the U+xxxx form. */
27 xml_fatal_expected(struct xml_context *ctx, uns c)
29 if (c >= 32 && c < 128)
30 xml_fatal(ctx, "Expected '%c'", c);
32 xml_fatal(ctx, "Expected U+%04x", c);
/* Raise a fatal error saying that white space was expected here. */
36 xml_fatal_expected_white(struct xml_context *ctx)
38 xml_fatal(ctx, "Expected a white space");
/* Raise a fatal error saying that a quotation mark was expected here. */
42 xml_fatal_expected_quot(struct xml_context *ctx)
44 xml_fatal(ctx, "Expected a quotation mark");
/* Parse the '=' separator: white space, the '=' character, white space.
 * NOTE(review): the 0 passed to xml_parse_white() presumably marks the white
 * space as optional, in line with the S? of the grammar below -- confirm. */
48 xml_parse_eq(struct xml_context *ctx)
50 /* Eq ::= S? '=' S? */
51 xml_parse_white(ctx, 0);
52 xml_parse_char(ctx, '=');
53 xml_parse_white(ctx, 0);
56 /*** Names and nmtokens ***/
/* Read a token from the input into a growing block of pool.
 *
 * first_cat -- category mask the upcoming character must match, otherwise a
 *              fatal error with the text err is raised
 * next_cat  -- category mask deciding whether reading continues
 *
 * Every accepted character is taken with xml_skip_char() and written as
 * UTF-8.  Returns the block finished by mp_end(). */
59 xml_parse_string(struct xml_context *ctx, struct mempool *pool, uns first_cat, uns next_cat, char *err)
61 char *p = mp_start_noalign(pool, 1);
62 if (unlikely(!(xml_peek_cat(ctx) & first_cat)))
63 xml_fatal(ctx, "%s", err);
/* presumably 5 bytes leave room for one UTF-8 sequence plus a terminator -- TODO confirm */
66 p = mp_spread(pool, p, 5);
67 p = utf8_32_put(p, xml_skip_char(ctx));
69 while (xml_peek_cat(ctx) & next_cat);
71 return mp_end(pool, p);
/* Variant of the token reader that takes no memory pool: the first character
 * (fetched through xml_get_cat) must match first_cat, otherwise a fatal error
 * with the text err is raised; the loop then runs for as long as the upcoming
 * character matches next_cat. */
75 xml_skip_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err)
77 if (unlikely(!(xml_get_cat(ctx) & first_cat)))
78 xml_fatal(ctx, "%s", err);
79 while (xml_peek_cat(ctx) & next_cat)
/* Parse a Name into pool and return it.  Delegates to xml_parse_string() with
 * ctx->cat_sname as the category of the first character and ctx->cat_name as
 * the category of the following ones. */
84 xml_parse_name(struct xml_context *ctx, struct mempool *pool)
86 /* Name ::= NameStartChar (NameChar)* */
87 return xml_parse_string(ctx, pool, ctx->cat_sname, ctx->cat_name, "Expected a name");
/* Skip over a Name without storing it; uses the same category masks
 * (ctx->cat_sname, ctx->cat_name) and error text as xml_parse_name(). */
91 xml_skip_name(struct xml_context *ctx)
93 xml_skip_string(ctx, ctx->cat_sname, ctx->cat_name, "Expected a name");
/* Parse an Nmtoken into pool and return it.  Unlike a Name, the first
 * character is checked against the same mask (ctx->cat_name) as the rest. */
97 xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool)
99 /* Nmtoken ::= (NameChar)+ */
100 return xml_parse_string(ctx, pool, ctx->cat_name, ctx->cat_name, "Expected a nmtoken");
103 /*** Simple literals ***/
/* Parse a quoted system literal into pool.  xml_parse_quote() supplies the
 * opening quote character q; every following character up to the next
 * occurrence of q is appended as UTF-8.  The closing quote is consumed by the
 * loop condition but not stored.  Returns the block finished by mp_end(). */
106 xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool)
108 /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */
109 char *p = mp_start_noalign(pool, 1);
110 uns q = xml_parse_quote(ctx), c;
111 while ((c = xml_get_char(ctx)) != q)
113 p = mp_spread(pool, p, 5);
114 p = utf8_32_put(p, c);
117 return mp_end(pool, p);
/* Parse a quoted public identifier literal into pool.  Characters are read
 * up to the matching quote q; each one must carry the XML_CHAR_PUBID
 * category, otherwise a fatal error is raised.  Returns the block finished
 * by mp_end(). */
121 xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool)
123 /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */
124 char *p = mp_start_noalign(pool, 1);
125 uns q = xml_parse_quote(ctx), c;
126 while ((c = xml_get_char(ctx)) != q)
128 if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID)))
129 xml_fatal(ctx, "Expected a pubid character");
/* only 2 bytes are requested per step here, versus 5 in the system literal parser */
130 p = mp_spread(pool, p, 2);
134 return mp_end(pool, p);
/* Parse the rest of a comment into a newly pushed DOM node of type
 * XML_NODE_COMMENT.  The text is built as UTF-8 in ctx->pool; n->len is the
 * distance from the pool block start (mp_ptr) to the write pointer and
 * n->text is the block finished by mp_end() one byte past that pointer.
 * A '-' followed directly by another '-' is special-cased, after which '>'
 * is required.  The h_comment handler is consulted only when
 * XML_REPORT_COMMENTS is set and the handler is non-NULL. */
140 xml_push_comment(struct xml_context *ctx)
142 TRACE(ctx, "push_comment");
143 /* Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
144 * Already parsed: '<!-' */
145 xml_parse_char(ctx, '-');
146 struct xml_node *n = xml_push_dom(ctx, NULL);
147 n->type = XML_NODE_COMMENT;
148 char *p = mp_start_noalign(ctx->pool, 6);
151 if (xml_get_char(ctx) == '-')
152 if (xml_get_char(ctx) == '-')
156 p = utf8_32_put(p, xml_last_char(ctx));
157 p = mp_spread(ctx->pool, p, 6);
159 xml_parse_char(ctx, '>');
161 n->len = p - (char *)mp_ptr(ctx->pool);
162 n->text = mp_end(ctx->pool, p + 1);
163 if ((ctx->flags & XML_REPORT_COMMENTS) && ctx->h_comment)
/* Pop the comment node from the DOM stack.  The second argument of
 * xml_pop_dom() is non-zero exactly when XML_ALLOC_COMMENTS is not set
 * (xml_pop_element() names the analogous value `free`). */
168 xml_pop_comment(struct xml_context *ctx)
170 xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_COMMENTS));
172 TRACE(ctx, "pop_comment");
/* Skip a comment without storing its text: require the second '-' of the
 * opening, read characters until a '-' is immediately followed by another
 * '-', and then require the closing '>'. */
176 xml_skip_comment(struct xml_context *ctx)
178 TRACE(ctx, "skip_comment");
179 xml_parse_char(ctx, '-');
180 while (xml_get_char(ctx) != '-' || xml_get_char(ctx) != '-');
181 xml_parse_char(ctx, '>');
185 /*** Processing instructions ***/
/* Parse a processing instruction into a newly pushed DOM node of type
 * XML_NODE_PI.  The target name goes to n->name (allocated in ctx->pool);
 * the target "xml" in any letter case is reported with xml_error().  The
 * body is collected as UTF-8 in ctx->pool and stored as n->text / n->len.
 * The h_pi handler is consulted only when XML_REPORT_PIS is set and the
 * handler is non-NULL. */
188 xml_push_pi(struct xml_context *ctx)
190 TRACE(ctx, "push_pi");
191 /* Parses a PI into the pushed node (n->name, n->text, n->len):
192 * PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
193 * PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
194 * Already parsed: '<?' */
195 struct xml_node *n = xml_push_dom(ctx, NULL);
196 n->type = XML_NODE_PI;
197 n->name = xml_parse_name(ctx, ctx->pool);
198 if (unlikely(!strcasecmp(n->name, "xml")))
199 xml_error(ctx, "Reserved PI target");
200 char *p = mp_start_noalign(ctx->pool, 5);
/* no white space after the target: the PI has to end right here */
201 if (!xml_parse_white(ctx, 0))
202 xml_parse_seq(ctx, "?>");
206 if (xml_get_char(ctx) == '?')
207 if (xml_peek_char(ctx) == '>')
215 p = utf8_32_put(p, xml_last_char(ctx));
216 p = mp_spread(ctx->pool, p, 5);
219 n->len = p - (char *)mp_ptr(ctx->pool);
220 n->text = mp_end(ctx->pool, p + 1);
221 if ((ctx->flags & XML_REPORT_PIS) && ctx->h_pi)
/* Pop the processing-instruction node from the DOM stack.  The second
 * argument of xml_pop_dom() is non-zero exactly when XML_ALLOC_PIS is not
 * set. */
226 xml_pop_pi(struct xml_context *ctx)
228 xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_PIS));
230 TRACE(ctx, "pop_pi");
/* Skip a processing instruction without building a node.  With
 * XML_VALIDATING set, the target name is parsed into ctx->stack only to
 * check it against the reserved "xml" (any letter case); the stack pool is
 * rolled back with mp_restore() right after the check.  The end of the PI is
 * recognised by a '?' whose following character is '>'. */
234 xml_skip_pi(struct xml_context *ctx)
236 TRACE(ctx, "skip_pi");
237 if (ctx->flags & XML_VALIDATING)
239 struct mempool_state state;
240 mp_save(ctx->stack, &state);
241 if (unlikely(!strcasecmp(xml_parse_name(ctx, ctx->stack), "xml")))
242 xml_error(ctx, "Reserved PI target");
243 mp_restore(ctx->stack, &state);
244 if (!xml_parse_white(ctx, 0))
246 xml_parse_seq(ctx, "?>");
252 if (xml_get_char(ctx) == '?')
253 if (xml_peek_char(ctx) == '>')
259 /*** Character references ***/
/* Parse a numeric character reference.  A leading 'x' selects hexadecimal
 * digits, otherwise decimal digits are expected.  Digits are accumulated in
 * v; both loops stop once v reaches 0x110000.  The category of v is then
 * checked against ctx->cat_unrestricted and the reference has to be
 * terminated by ';'.  Problems are reported through xml_error(); the final
 * visible return yields UNI_REPLACEMENT.
 * NOTE(review): presumably a successful parse returns v earlier -- confirm. */
262 xml_parse_char_ref(struct xml_context *ctx)
264 TRACE(ctx, "parse_char_ref");
265 /* CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
266 * Already parsed: '&#' */
268 if (xml_get_char(ctx) == 'x')
270 if (!(xml_get_cat(ctx) & XML_CHAR_XDIGIT))
272 xml_error(ctx, "Expected a hexadecimal value of character reference");
277 v = (v << 4) + Cxvalue(xml_last_char(ctx));
279 while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_XDIGIT));
/* decimal case: the character already fetched by the 'x' test above is the first digit */
283 if (!(xml_last_cat(ctx) & XML_CHAR_DIGIT))
285 xml_error(ctx, "Expected a numeric value of character reference");
290 v = v * 10 + xml_last_char(ctx) - '0';
292 while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT));
294 uns cat = xml_char_cat(v);
295 if (!(cat & ctx->cat_unrestricted))
297 xml_error(ctx, "Character reference out of range");
300 if (xml_last_char(ctx) == ';')
305 xml_error(ctx, "Expected ';'");
307 while (xml_last_char(ctx) != ';')
310 return UNI_REPLACEMENT;
313 /*** References to general entities ***/
/* Parse a reference after '&' and emit its expansion into ctx->chars.
 * If the next character is '#', the reference is a character reference and
 * the code point returned by xml_parse_char_ref() is written as UTF-8.
 * Otherwise the entity name is parsed into ctx->stack (rolled back with
 * mp_restore() afterwards), ';' is required and the entity is looked up:
 * an unknown entity is reported with xml_error(), an entity flagged
 * XML_DTD_ENTITY_TRIVIAL has its text copied to the output, and any other
 * entity is handed to xml_push_entity(). */
316 xml_parse_ref(struct xml_context *ctx)
318 /* Reference ::= EntityRef | CharRef
319 * EntityRef ::= '&' Name ';'
320 * Already parsed: '&' */
321 struct fastbuf *out = &ctx->chars;
322 if (xml_peek_char(ctx) == '#')
325 bput_utf8_32(out, xml_parse_char_ref(ctx));
329 TRACE(ctx, "parse_ge_ref");
330 struct mempool_state state;
331 mp_save(ctx->stack, &state);
332 char *name = xml_parse_name(ctx, ctx->stack);
333 xml_parse_char(ctx, ';');
334 struct xml_dtd_entity *ent = xml_dtd_find_entity(ctx, name);
337 xml_error(ctx, "Unknown entity &%s;", name);
342 else if (ent->flags & XML_DTD_ENTITY_TRIVIAL)
344 TRACE(ctx, "Trivial entity &%s;", name);
345 bputs(out, ent->text);
349 TRACE(ctx, "Pushed entity &%s;", name);
/* name lives on ctx->stack, so the pool is rolled back only after its last use above */
350 mp_restore(ctx->stack, &state);
352 xml_push_entity(ctx, ent);
355 mp_restore(ctx->stack, &state);
360 /*** Character data ***/
/* Buffer management for the ctx->chars fastbuf, whose storage lives in
 * ctx->pool (ctx is recovered from fb with SKIP_BACK).
 *  - If a buffer already exists (bufend != buffer), it is grown with
 *    mp_expand(); bptr is placed at the old buffer length and bstop at its
 *    previous offset inside the new buffer.
 *  - Otherwise the pool state is saved in ctx->chars_state and a fresh
 *    buffer is started; bufend is set one byte short of the available space.
 * NOTE(review): presumably this is installed as the fastbuf's spout callback
 * and the reserved byte is for a terminating zero -- confirm. */
363 xml_spout_chars(struct fastbuf *fb)
365 if (fb->bptr < fb->bufend)
367 struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb);
368 struct mempool *pool = ctx->pool;
369 if (fb->bufend != fb->buffer)
371 TRACE(ctx, "growing chars");
372 uns len = fb->bufend - fb->buffer;
373 uns reported = fb->bstop - fb->buffer;
374 fb->buffer = mp_expand(pool);
375 fb->bufend = fb->buffer + mp_avail(pool);
376 fb->bptr = fb->buffer + len;
377 fb->bstop = fb->buffer + reported;
381 TRACE(ctx, "starting chars");
382 mp_save(pool, &ctx->chars_state);
383 fb->bptr = fb->buffer = fb->bstop = mp_start_noalign(pool, 2);
384 fb->bufend = fb->buffer + mp_avail(pool) - 1;
/* Close the block of character data accumulated in ctx->chars.  len is the
 * number of bytes written so far; *out receives the pool block finished by
 * mp_end() one byte past the write pointer.  Afterwards all fastbuf pointers
 * are reset to buffer, which makes bufend == buffer (the condition
 * xml_spout_chars() uses to start a fresh buffer).
 * NOTE(review): callers use the return value as a length, so presumably len
 * is returned -- confirm. */
389 xml_end_chars(struct xml_context *ctx, char **out)
391 struct fastbuf *fb = &ctx->chars;
392 uns len = fb->bptr - fb->buffer;
395 TRACE(ctx, "ending chars");
397 *out = mp_end(ctx->pool, fb->bptr + 1);
398 fb->bufend = fb->bstop = fb->bptr = fb->buffer;
/* Bookkeeping for character data handed to handlers: len is the number of
 * bytes written to ctx->chars so far, and bstop is advanced to bptr to mark
 * everything up to the write pointer as reported.
 * NOTE(review): callers treat the return value as the length of a newly
 * reported piece delivered through *out -- confirm. */
404 xml_report_chars(struct xml_context *ctx, char **out)
406 struct fastbuf *fb = &ctx->chars;
407 uns len = fb->bptr - fb->buffer;
412 fb->bstop = fb->bptr;
/* Finish the pending character data (xml_end_chars) and dispatch it.
 *  - With XML_NO_CHARS set, the text goes to h_ignorable (when
 *    XML_REPORT_CHARS is set and the handler exists) and the pool is rolled
 *    back to ctx->chars_state.
 *  - Otherwise an unreported tail is passed to h_block; when neither
 *    XML_ALLOC_CHARS nor XML_REPORT_CHARS is set the pool is rolled back;
 *    a DOM node of type XML_NODE_CHARS is pushed and h_chars is consulted
 *    under XML_REPORT_CHARS. */
418 xml_flush_chars(struct xml_context *ctx)
421 uns len = xml_end_chars(ctx, &text), rlen;
424 if (ctx->flags & XML_NO_CHARS)
426 if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_ignorable)
427 ctx->h_ignorable(ctx, text, len);
428 mp_restore(ctx->pool, &ctx->chars_state);
431 if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext)))
432 ctx->h_block(ctx, rtext, rlen);
433 if (!(ctx->flags & XML_ALLOC_CHARS) && !(ctx->flags & XML_REPORT_CHARS))
435 mp_restore(ctx->pool, &ctx->chars_state);
438 struct xml_node *n = xml_push_dom(ctx, &ctx->chars_state);
439 n->type = XML_NODE_CHARS;
442 if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_chars)
/* Pop the character-data node from the DOM stack.  The second argument of
 * xml_pop_dom() is non-zero exactly when XML_ALLOC_CHARS is not set. */
449 xml_pop_chars(struct xml_context *ctx)
451 xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS));
452 TRACE(ctx, "pop_chars");
/* Append character data up to the next '<' to ctx->chars.
 *  - With XML_NO_CHARS set, white-space characters are still copied; the
 *    error "This element must not contain character data" is reported
 *    otherwise, and the rest is then read up to '<' without being stored.
 *  - Without XML_NO_CHARS, '&' is special-cased and other characters are
 *    written to the output as UTF-8. */
456 xml_append_chars(struct xml_context *ctx)
458 TRACE(ctx, "append_chars");
459 struct fastbuf *out = &ctx->chars;
460 if (ctx->flags & XML_NO_CHARS)
461 while (xml_get_char(ctx) != '<')
462 if (xml_last_cat(ctx) & XML_CHAR_WHITE)
463 bput_utf8_32(out, xml_last_char(ctx));
466 xml_error(ctx, "This element must not contain character data");
467 while (xml_get_char(ctx) != '<');
471 while (xml_get_char(ctx) != '<')
472 if (xml_last_char(ctx) == '&')
478 bput_utf8_32(out, xml_last_char(ctx));
482 /*** CDATA sections ***/
/* Skip a CDATA section without storing it: require the "CDATA[" keyword and
 * then read characters until a ']' is directly followed by ']' and '>'.
 * Because of the short-circuit ||, a failed partial match has already
 * consumed the characters it tested. */
485 xml_skip_cdata(struct xml_context *ctx)
487 TRACE(ctx, "skip_cdata");
488 xml_parse_seq(ctx, "CDATA[");
489 while (xml_get_char(ctx) != ']' || xml_get_char(ctx) != ']' || xml_get_char(ctx) != '>');
/* Append the content of a CDATA section to ctx->chars.  With XML_NO_CHARS
 * set, the error "This element must not contain CDATA" is reported.
 * Otherwise "CDATA[" is required; under XML_REPORT_CHARS any character data
 * not yet reported is first passed to h_block, the section body is copied
 * up to the terminating "]]>", and the newly added part is passed to
 * h_cdata. */
494 xml_append_cdata(struct xml_context *ctx)
496 /* CDSect ::= '<![CDATA[' (Char* - (Char* ']]>' Char*)) ']]>'
497 * Already parsed: '<![' */
498 TRACE(ctx, "append_cdata");
499 if (ctx->flags & XML_NO_CHARS)
501 xml_error(ctx, "This element must not contain CDATA");
505 xml_parse_seq(ctx, "CDATA[");
506 struct fastbuf *out = &ctx->chars;
509 if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext)))
510 ctx->h_block(ctx, rtext, rlen);
513 if (xml_get_char(ctx) == ']')
515 if (xml_get_char(ctx) == ']')
516 if (xml_get_char(ctx) == '>')
522 bput_utf8_32(out, xml_last_char(ctx));
524 if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_cdata && (rlen = xml_report_chars(ctx, &rtext)))
525 ctx->h_cdata(ctx, rtext, rlen);
529 /*** Attribute values ***/
/* Parse a quoted attribute value and return it as a string; an empty value
 * yields the literal "".  The value is accumulated in ctx->chars and closed
 * with xml_end_chars().  The closing quote is accepted only while ctx->src
 * still equals the source that was current when parsing started.  A '<' in
 * the value is reported with xml_error(), and white-space characters are
 * special-cased.  The attr parameter is marked UNUSED. */
532 xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr UNUSED)
534 TRACE(ctx, "parse_attr_value");
535 /* AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'" */
536 /* FIXME: -- check value constraints / normalize leading/trailing WS and repeated WS */
537 struct mempool_state state;
538 uns quote = xml_parse_quote(ctx);
539 mp_save(ctx->stack, &state);
540 struct fastbuf *out = &ctx->chars;
541 struct xml_source *src = ctx->src;
544 uns c = xml_get_char(ctx);
550 else if (c == quote && src == ctx->src)
553 xml_error(ctx, "Attribute value must not contain '<'");
554 else if (xml_last_cat(ctx) & XML_CHAR_WHITE)
557 bput_utf8_32(out, c);
559 mp_restore(ctx->stack, &state);
561 return xml_end_chars(ctx, &text) ? text : "";
/* In-place white-space normalization of text; s and d both start at text
 * (NOTE(review): presumably s is the read cursor and d the write cursor --
 * confirm).  Runs of 0x20 (space) are skipped, and the check on d[-1] looks
 * at the byte just before d, guarded by d != text so that text[-1] is never
 * read.  ctx is marked UNUSED. */
565 xml_normalize_white(struct xml_context *ctx UNUSED, char *text)
567 char *s = text, *d = text;
576 while (*++s == 0x20);
579 if (d != text && d[-1] == 0x20)
/* Forward declaration: the hash callbacks below take a pointer to this table type. */
587 struct xml_attrs_table;
/* Hash of an attribute key: the hash of the owning element's pointer e
 * XOR-ed with the hash of the attribute name n.  The table argument is
 * unused. */
590 xml_attrs_hash(struct xml_attrs_table *t UNUSED, struct xml_node *e, char *n)
592 return hash_pointer(e) ^ hash_string(n);
/* Key equality: two keys match when they refer to the same element node
 * (pointer comparison) and their names compare equal with strcmp(). */
596 xml_attrs_eq(struct xml_attrs_table *t UNUSED, struct xml_node *e1, char *n1, struct xml_node *e2, char *n2)
598 return (e1 == e2) && !strcmp(n1, n2);
/* Initialise a new attribute node a for the key (e, name).  The visible step
 * appends a to the tail of the element's attribute list e->attrs. */
602 xml_attrs_init_key(struct xml_attrs_table *t UNUSED, struct xml_attr *a, struct xml_node *e, char *name)
608 slist_add_tail(&e->attrs, &a->n);
/* Parameters for the generic hash table included from <ucw/hashtable.h>:
 * identifiers get the xml_attrs_ prefix, nodes are struct xml_attr and the
 * key is the pair (elem, name).
 * NOTE(review): presumably the HASH_GIVE_* switches announce the callbacks
 * defined in this file and the HASH_WANT_* switches request the
 * xml_attrs_cleanup/remove/lookup/find functions used elsewhere here --
 * confirm against the hashtable.h documentation. */
611 #define HASH_PREFIX(x) xml_attrs_##x
612 #define HASH_NODE struct xml_attr
613 #define HASH_KEY_COMPLEX(x) x elem, x name
614 #define HASH_KEY_DECL struct xml_node *elem, char *name
615 #define HASH_TABLE_DYNAMIC
617 #define HASH_GIVE_HASHFN
618 #define HASH_GIVE_INIT_KEY
619 #define HASH_WANT_CLEANUP
620 #define HASH_WANT_REMOVE
621 #define HASH_WANT_LOOKUP
622 #define HASH_WANT_FIND
623 #define HASH_GIVE_ALLOC
625 #include <ucw/hashtable.h>
/* Parse one attribute of the current element (ctx->node).  The name is
 * allocated in ctx->pool and looked up in ctx->tab_attrs under the key
 * (element, name); the value comes from xml_parse_attr_value().  A repeated
 * attribute is reported as not unique.  An attribute without a DTD
 * declaration (xml_dtd_find_attr() returns NULL) is reported as undefined;
 * declared attributes are passed to xml_validate_attr(). */
628 xml_parse_attr(struct xml_context *ctx)
630 TRACE(ctx, "parse_attr");
631 /* Attribute ::= Name Eq AttValue */
632 struct xml_node *e = ctx->node;
633 char *n = xml_parse_name(ctx, ctx->pool);
634 struct xml_attr *a = xml_attrs_lookup(ctx->tab_attrs, e, n);
636 char *v = xml_parse_attr_value(ctx, NULL);
639 xml_error(ctx, "Attribute %s is not unique in element <%s>", n, e->name);
645 else if (!(a->dtd = xml_dtd_find_attr(ctx, e->dtd, a->name)))
646 xml_error(ctx, "Undefined attribute %s in element <%s>", n, e->name);
648 xml_validate_attr(ctx, a->dtd, a->val);
/* Return the result of looking up the key (node, name) in ctx->tab_attrs
 * with xml_attrs_find(). */
652 xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name)
654 return xml_attrs_find(ctx->tab_attrs, node, name);
/* Value of attribute name on node.  The attribute is first looked up in
 * ctx->tab_attrs; the visible fallback consults the DTD declaration of the
 * attribute and returns its default_value, or NULL when there is no
 * declaration. */
658 xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name)
660 struct xml_attr *attr = xml_attrs_find(ctx->tab_attrs, node, name);
665 struct xml_dtd_attr *dtd = xml_dtd_find_attr(ctx, node->dtd, name);
666 return dtd ? dtd->default_value : NULL;
/* Create the attribute hash table: the table structure is obtained from
 * xml_hash_new() on ctx->pool, stored in ctx->tab_attrs and initialised
 * with xml_attrs_init(). */
670 xml_attrs_table_init(struct xml_context *ctx)
672 xml_attrs_init(ctx->tab_attrs = xml_hash_new(ctx->pool, sizeof(struct xml_attrs_table)));
/* Release the attribute hash table by calling xml_attrs_cleanup() on
 * ctx->tab_attrs. */
676 xml_attrs_table_cleanup(struct xml_context *ctx)
678 xml_attrs_cleanup(ctx->tab_attrs);
/* Search the tree of xml_dtd_elem_node rooted at root for elem: one path
 * compares elem with root->elem directly, the other recurses into every
 * node of root->sons. */
684 xml_validate_element(struct xml_dtd_elem_node *root, struct xml_dtd_elem *elem)
687 return elem == root->elem;
689 SLIST_FOR_EACH(struct xml_dtd_elem_node *, son, root->sons)
690 if (xml_validate_element(son, elem))
/* Parse a start tag (or empty-element tag) and push a DOM node of type
 * XML_NODE_ELEM for it.  The element name is allocated in ctx->pool.
 * Visible checks:
 *  - the element name is compared with ctx->doctype and a mismatch is
 *    reported as a root element / document type mismatch;
 *  - an element without a DTD declaration is reported as undefined;
 *  - XML_NO_CHARS is cleared for XML_DTD_ELEM_MIXED elements and set on the
 *    other visible path;
 *  - the parent's content model is consulted: a parent of type
 *    XML_DTD_ELEM_EMPTY must not have children, and unless the parent is
 *    XML_DTD_ELEM_ANY the element has to occur in the parent's model tree;
 *  - attributes declared XML_ATTR_REQUIRED must be present, and other
 *    non-IMPLIED attributes receive their default value when
 *    XML_ALLOC_DEFAULT_ATTRS is set.
 * XML_EMPTY_ELEM_TAG is set in ctx->flags where '>' is parsed as the tail
 * of an empty-element tag.  The h_stag handler is consulted when
 * XML_REPORT_TAGS is set and the handler is non-NULL. */
696 xml_push_element(struct xml_context *ctx)
698 TRACE(ctx, "push_element");
699 /* EmptyElemTag | STag
700 * EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
701 * STag ::= '<' Name (S Attribute)* S? '>'
702 * Already parsed: '<' */
703 struct xml_node *e = xml_push_dom(ctx, NULL);
704 clist_init(&e->sons);
705 e->type = XML_NODE_ELEM;
706 e->name = xml_parse_name(ctx, ctx->pool);
707 slist_init(&e->attrs);
711 if (ctx->doctype && strcmp(e->name, ctx->doctype))
712 xml_error(ctx, "The root element <%s> does not match the document type <%s>", e->name, ctx->doctype);
716 else if (!(e->dtd = xml_dtd_find_elem(ctx, e->name)))
717 xml_error(ctx, "Undefined element <%s>", e->name);
720 struct xml_dtd_elem *dtd = e->dtd, *parent_dtd = e->parent ? e->parent->dtd : NULL;
721 if (dtd->type == XML_DTD_ELEM_MIXED)
722 ctx->flags &= ~XML_NO_CHARS;
724 ctx->flags |= XML_NO_CHARS;
726 if (parent_dtd->type == XML_DTD_ELEM_EMPTY)
727 xml_error(ctx, "Empty element must not contain children");
728 else if (parent_dtd->type != XML_DTD_ELEM_ANY)
730 // FIXME: validate regular expressions
731 if (!xml_validate_element(parent_dtd->node, dtd))
732 xml_error(ctx, "Unexpected element <%s>", e->name);
737 uns white = xml_parse_white(ctx, 0);
738 uns c = xml_get_char(ctx);
741 xml_parse_char(ctx, '>');
742 ctx->flags |= XML_EMPTY_ELEM_TAG;
748 xml_fatal_expected_white(ctx);
753 SLIST_FOR_EACH(struct xml_dtd_attr *, a, e->dtd->attrs)
754 if (a->default_mode == XML_ATTR_REQUIRED)
756 if (!xml_attrs_find(ctx->tab_attrs, e, a->name))
757 xml_error(ctx, "Missing required attribute %s in element <%s>", a->name, e->name);
759 else if (a->default_mode != XML_ATTR_IMPLIED && ctx->flags & XML_ALLOC_DEFAULT_ATTRS)
761 struct xml_attr *attr = xml_attrs_lookup(ctx->tab_attrs, e, a->name);
763 attr->val = a->default_value;
765 if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_stag)
/* Leave the current element (ctx->node).  The h_etag handler is consulted
 * when XML_REPORT_TAGS is set and the handler is non-NULL.  `free` is
 * non-zero when XML_ALLOC_TAGS is not set and is passed on to
 * xml_pop_dom().  Before the node goes away, its attributes are removed
 * from ctx->tab_attrs; the sons are then processed one by one from the head
 * of e->sons, and for element sons their attributes are removed as well and
 * their own sons are spliced into the list after them, so the whole subtree
 * gets visited without recursion. */
770 xml_pop_element(struct xml_context *ctx)
772 TRACE(ctx, "pop_element");
773 if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_etag)
775 struct xml_node *e = ctx->node;
776 uns free = !(ctx->flags & XML_ALLOC_TAGS);
781 /* Restore hash table of attributes */
782 SLIST_FOR_EACH(struct xml_attr *, a, e->attrs)
783 xml_attrs_remove(ctx->tab_attrs, a);
785 while (n = clist_head(&e->sons))
787 if (n->type == XML_NODE_ELEM)
789 SLIST_FOR_EACH(struct xml_attr *, a, n->attrs)
790 xml_attrs_remove(ctx->tab_attrs, a);
791 clist_insert_list_after(&n->sons, &n->n);
796 xml_pop_dom(ctx, free);
/* Parse an end tag and check it against the current element (ctx->node).
 * The element name is decoded code point by code point with utf8_32_get()
 * and compared with the characters read from the input; optional white
 * space and '>' must follow.  On a mismatch the error "Invalid ETag" is
 * reported and the input is read up to the next '>'. */
801 xml_parse_etag(struct xml_context *ctx)
803 /* ETag ::= '</' Name S? '>'
804 * Already parsed: '<' */
805 struct xml_node *e = ctx->node;
811 n = utf8_32_get(n, &c);
812 if (xml_get_char(ctx) != c)
815 xml_parse_white(ctx, 0);
816 if (xml_get_char(ctx) != '>')
819 xml_error(ctx, "Invalid ETag, expected </%s>", e->name);
820 while (xml_get_char(ctx) != '>');
825 /*** Document type declaration ***/
/* Parse the head of a document type declaration.  A second declaration is a
 * fatal error.  After "DOCTYPE" and white space the document type name is
 * stored in ctx->doctype (allocated in ctx->pool).  If white space is
 * followed by 'S' or 'P', an external identifier is parsed: "SYSTEM" with a
 * system literal, or "PUBLIC" with a pubid literal and a system literal,
 * filling ctx->system_id / ctx->public_id, and XML_HAS_EXTERNAL_SUBSET is
 * set.  A following '[' sets XML_HAS_INTERNAL_SUBSET.  Finally the
 * h_doctype_decl handler is called if it is set. */
828 xml_parse_doctype_decl(struct xml_context *ctx)
830 TRACE(ctx, "parse_doctype_decl");
831 /* doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
832 * Already parsed: '<!'
833 * Terminated before '[' or '>' */
835 xml_fatal(ctx, "Multiple document types not allowed");
836 xml_parse_seq(ctx, "DOCTYPE");
837 xml_parse_white(ctx, 1);
838 ctx->doctype = xml_parse_name(ctx, ctx->pool);
839 TRACE(ctx, "doctype=%s", ctx->doctype);
841 if (xml_parse_white(ctx, 0) && ((c = xml_peek_char(ctx)) == 'S' || c == 'P'))
845 xml_parse_seq(ctx, "SYSTEM");
846 xml_parse_white(ctx, 1);
847 ctx->system_id = xml_parse_system_literal(ctx, ctx->pool);
851 xml_parse_seq(ctx, "PUBLIC");
852 xml_parse_white(ctx, 1);
853 ctx->public_id = xml_parse_pubid_literal(ctx, ctx->pool);
854 xml_parse_white(ctx, 1);
855 ctx->system_id = xml_parse_system_literal(ctx, ctx->pool);
857 xml_parse_white(ctx, 0);
858 ctx->flags |= XML_HAS_EXTERNAL_SUBSET;
860 if (xml_peek_char(ctx) == '[')
862 ctx->flags |= XML_HAS_INTERNAL_SUBSET;
866 if (ctx->h_doctype_decl)
867 ctx->h_doctype_decl(ctx);
872 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
874 /* DTD: Internal subset */
/* Parse a DTD subset; external selects between the external (non-zero) and
 * the internal (zero) subset.  Markup starting with "<!" is dispatched on
 * the following characters: comments are pushed and popped immediately, and
 * the keywords NOTATION, ENTITY, ELEMENT and ATTLIST are completed with
 * xml_parse_seq() before the matching xml_parse_*_decl() parser is called.
 * Parameter-entity references go to xml_parse_pe_ref().  A ']' is special
 * only in the internal subset and a '>' only in the external one.  Anything
 * else ends in the fatal error "Invalid markup in the ... subset". */
877 xml_parse_subset(struct xml_context *ctx, uns external)
880 // -- comments/pi have no parent
881 // -- conditional sections in external subset
882 // -- check correctness of parameter entities
885 * intSubset :== (markupdecl | DeclSep)
886 * Already parsed: '['
888 * extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)*
892 xml_parse_white(ctx, 0);
893 uns c = xml_get_char(ctx);
896 if ((c = xml_get_char(ctx)) == '!')
897 switch (c = xml_get_char(ctx))
900 xml_push_comment(ctx);
901 xml_pop_comment(ctx);
904 xml_parse_seq(ctx, "OTATION");
905 xml_parse_notation_decl(ctx);
908 if ((c = xml_get_char(ctx)) == 'N')
910 xml_parse_seq(ctx, "TITY");
911 xml_parse_entity_decl(ctx);
915 xml_parse_seq(ctx, "EMENT");
916 xml_parse_element_decl(ctx);
922 xml_parse_seq(ctx, "TTLIST");
923 xml_parse_attr_list_decl(ctx);
936 xml_parse_pe_ref(ctx);
937 else if (c == ']' && !external)
941 else if (c == '>' && external)
951 xml_fatal(ctx, "Invalid markup in the %s subset", external ? "external" : "internal");
954 /*** The State Machine ***/
/* Advance the parser by one step of its state machine.
 *
 * Fatal errors arrive through longjmp to throw_buf, which is installed in
 * ctx->throw_buf; the first setjmp handler returns with ctx->state set to
 * XML_STATE_EOF.  In the epilog the handler is re-armed so that an
 * XML_ERR_EOF condition is treated as the regular end of the document:
 * h_document_end is called, ctx->err_msg is cleared and XML_STATE_EOF is
 * returned.
 *
 * The function is resumable: the PULL macros return to the caller in the
 * middle of the code and put a `case` label right behind the return, so a
 * later call can continue from the state stored in ctx->state.
 * NOTE(review): the case labels imply an enclosing switch on ctx->state --
 * confirm. */
957 xml_next(struct xml_context *ctx)
959 /* A nasty state machine */
/* PULL(x): if XML_PULL_x is requested in ctx->pull, store XML_STATE_x in
 * ctx->state and return it; the label XML_STATE_x follows the return. */
961 #define PULL(x) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##x; case XML_STATE_##x: ; } while (0)
/* PULL_STATE(x, s): like PULL, but ctx->state is set to XML_STATE_s while
 * the value returned to the caller is XML_STATE_x (comma operator). */
962 #define PULL_STATE(x, s) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##s, XML_STATE_##x; case XML_STATE_##s: ; } while (0)
964 TRACE(ctx, "xml_next (state=%u)", ctx->state);
966 ctx->throw_buf = &throw_buf;
967 if (setjmp(throw_buf))
970 if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal)
972 TRACE(ctx, "raised fatal error");
973 return ctx->state = XML_STATE_EOF;
/* Prolog: document start, XML declaration, then Misc and an optional doctype */
978 case XML_STATE_START:
979 TRACE(ctx, "entering prolog");
980 ctx->flags |= XML_SRC_DOCUMENT | XML_SRC_EXPECTED_DECL;
981 if (ctx->h_document_start)
982 ctx->h_document_start(ctx);
986 ctx->h_xml_decl(ctx);
989 /* Misc* (doctypedecl Misc*)? */
992 xml_parse_white(ctx, 0);
993 xml_parse_char(ctx, '<');
995 if ((c = xml_get_char(ctx)) == '?')
996 /* Processing instruction */
997 if (!(ctx->flags & XML_REPORT_PIS))
1002 PULL_STATE(PI, PROLOG_PI);
1007 /* Found the root tag */
1008 xml_unget_char(ctx);
1011 else if (xml_get_char(ctx) == '-')
1012 if (!(ctx->flags & XML_REPORT_COMMENTS))
1013 xml_skip_comment(ctx);
1016 xml_push_comment(ctx);
1017 PULL_STATE(COMMENT, PROLOG_COMMENT);
1018 xml_pop_comment(ctx);
1023 xml_unget_char(ctx);
1024 xml_parse_doctype_decl(ctx);
1026 if (ctx->flags & XML_HAS_DTD)
1027 if (ctx->flags & XML_PARSE_DTD)
1030 if (ctx->h_dtd_start)
1031 ctx->h_dtd_start(ctx);
1032 if (ctx->flags & XML_HAS_INTERNAL_SUBSET)
1034 xml_parse_subset(ctx, 0);
1037 if (ctx->flags & XML_HAS_EXTERNAL_SUBSET)
1039 struct xml_dtd_entity ent = {
1040 .system_id = ctx->system_id,
1041 .public_id = ctx->public_id,
1043 xml_parse_white(ctx, 0);
1044 xml_parse_char(ctx, '>');
1045 xml_unget_char(ctx);
1046 ASSERT(ctx->h_resolve_entity);
1047 ctx->h_resolve_entity(ctx, &ent);
1048 ctx->flags |= XML_SRC_EXPECTED_DECL;
1049 xml_parse_subset(ctx, 1);
1050 xml_unget_char(ctx);;
1053 ctx->h_dtd_end(ctx);
1055 else if (ctx->flags & XML_HAS_INTERNAL_SUBSET)
1056 xml_skip_internal_subset(ctx);
1057 xml_parse_white(ctx, 0);
1058 xml_parse_char(ctx, '>');
/* Document content: character data, PIs, comments, CDATA and tags */
1063 case XML_STATE_CHARS:
1067 if (xml_peek_char(ctx) != '<')
1070 xml_append_chars(ctx);
1078 if ((c = xml_get_char(ctx)) == '?')
1081 if (!(ctx->flags & (XML_REPORT_PIS | XML_ALLOC_PIS)))
1085 if (xml_flush_chars(ctx))
1087 PULL_STATE(CHARS, CHARS_BEFORE_PI);
1097 if ((c = xml_get_char(ctx)) == '-')
1100 if (!(ctx->flags & (XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS)))
1101 xml_skip_comment(ctx);
1104 if (xml_flush_chars(ctx))
1106 PULL_STATE(CHARS, CHARS_BEFORE_COMMENT);
1109 xml_push_comment(ctx);
1111 xml_pop_comment(ctx);
1117 xml_append_cdata(ctx);
1120 xml_fatal(ctx, "Unexpected character after '<!'");
1124 /* STag | EmptyElemTag */
1125 xml_unget_char(ctx);
1126 if (xml_flush_chars(ctx))
1128 PULL_STATE(CHARS, CHARS_BEFORE_STAG);
1132 xml_push_element(ctx);
1134 if (ctx->flags & XML_EMPTY_ELEM_TAG)
1141 if (xml_flush_chars(ctx))
1143 PULL_STATE(CHARS, CHARS_BEFORE_ETAG);
1147 xml_parse_etag(ctx);
1150 xml_pop_element(ctx);
1158 TRACE(ctx, "entering epilog");
1161 /* Epilog whitespace is the only place where a valid document can reach EOF */
1162 if (setjmp(throw_buf))
1163 if (ctx->err_code == XML_ERR_EOF)
1165 TRACE(ctx, "reached EOF");
1166 ctx->state = XML_STATE_EOF;
1167 if (ctx->h_document_end)
1168 ctx->h_document_end(ctx);
1171 ctx->err_msg = NULL;
1172 return XML_STATE_EOF;
1176 xml_parse_white(ctx, 0);
1177 if (setjmp(throw_buf))
1181 xml_parse_char(ctx, '<');
1183 if ((c = xml_get_char(ctx)) == '?')
1184 /* Processing instruction */
1185 if (!(ctx->flags & XML_REPORT_PIS))
1190 PULL_STATE(PI, EPILOG_PI);
1195 xml_parse_char(ctx, '-');
1197 if (!(ctx->flags & XML_REPORT_COMMENTS))
1198 xml_skip_comment(ctx);
1201 xml_push_comment(ctx);
1202 PULL_STATE(COMMENT, EPILOG_COMMENT);
1203 xml_pop_comment(ctx);
1207 xml_fatal(ctx, "Syntax error in the epilog");
/* Run one xml_next() step; the current ctx->pull is remembered in `saved`
 * beforehand.
 * NOTE(review): presumably the pull argument is installed in ctx->pull for
 * the call and `saved` is restored afterwards -- confirm. */
1215 xml_next_state(struct xml_context *ctx, uns pull)
1217 uns saved = ctx->pull;
1219 uns res = xml_next(ctx);
/* Skip the rest of the element whose start tag has just been reported; the
 * parser must be in XML_STATE_STAG (asserted).  ctx->pull is narrowed to
 * XML_PULL_ETAG and xml_next() is called repeatedly until it returns zero
 * or ctx->node is again the node that was current on entry.  The previous
 * pull mask is kept in `saved`. */
1225 xml_skip_element(struct xml_context *ctx)
1227 ASSERT(ctx->state == XML_STATE_STAG);
1228 struct xml_node *node = ctx->node;
1229 uns saved = ctx->pull, res;
1230 ctx->pull = XML_PULL_ETAG;
1231 while ((res = xml_next(ctx)) && ctx->node != node);
/* Parse the whole document: call xml_next() until it returns zero, then
 * return ctx->err_code. */
1237 xml_parse(struct xml_context *ctx)
1239 /* This cycle should run only once unless the user overrides the value of ctx->pull in a SAX handler */
1244 while (xml_next(ctx));
1245 return ctx->err_code;
/* Concatenate the text of the direct XML_NODE_CHARS sons of node into one
 * block allocated from pool and return it.  node must be an element
 * (asserted).  For every text son the buffer is extended by son->len + 1
 * bytes and son->len bytes of son->text are copied in.  ctx is unused. */
1249 xml_merge_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool)
1251 ASSERT(node->type == XML_NODE_ELEM);
1252 char *p = mp_start_noalign(pool, 1);
1253 XML_NODE_FOR_EACH(son, node)
1254 if (son->type == XML_NODE_CHARS)
1256 p = mp_spread(pool, p, son->len + 1);
1257 memcpy(p, son->text, son->len);
1261 return mp_end(pool, p);
/* Recursive helper: walk the sons of node, copying the text of
 * XML_NODE_CHARS sons to the growing pool buffer at p and descending into
 * XML_NODE_ELEM sons, so text is collected from the whole subtree in
 * document order.  p is threaded through the recursion as the current write
 * position. */
1265 xml_append_dom_chars(char *p, struct mempool *pool, struct xml_node *node)
1267 XML_NODE_FOR_EACH(son, node)
1268 if (son->type == XML_NODE_CHARS)
1270 p = mp_spread(pool, p, son->len + 1);
1271 memcpy(p, son->text, son->len);
1274 else if (son->type == XML_NODE_ELEM)
1275 p = xml_append_dom_chars(p, pool, son);
/* Collect the text of the whole subtree below node into one block allocated
 * from pool and return it.  node must be an element (asserted); the work is
 * done by xml_append_dom_chars().  ctx is unused. */
1280 xml_merge_dom_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool)
1282 ASSERT(node->type == XML_NODE_ELEM);
1283 char *p = mp_start_noalign(pool, 1);
1284 p = xml_append_dom_chars(p, pool, node);
1286 return mp_end(pool, p);