2 * UCW Library -- A simple XML parser
4 * (c) 2007 Pavel Charvat <pchar@ucw.cz>
6 * This software may be freely distributed and used according to the terms
7 * of the GNU Lesser General Public License.
11 * - various character encodings
13 * - stack-like memory handling where possible
19 #include "lib/mempool.h"
20 #include "lib/fastbuf.h"
21 #include "lib/ff-utf8.h"
22 #include "lib/chartype.h"
23 #include "lib/unicode.h"
25 #include "lib/hashfunc.h"
26 #include "lib/stkstring.h"
27 #include "charset/unicat.h"
31 /*** Error handling ***/
/* Unwind to the recovery point stored in ctx->throw_buf.
 * The buffer is treated as a jmp_buf and ctx->err_code is passed as the
 * longjmp() value, so both must be set before calling (checked by ASSERT). */
34 xml_throw(struct xml_context *ctx)
36 ASSERT(ctx->err_code && ctx->throw_buf);
37 longjmp(*(jmp_buf *)ctx->throw_buf, ctx->err_code);
41 xml_warn(struct xml_context *ctx, const char *format, ...)
46 va_start(args, format);
47 ctx->err_msg = stk_vprintf(format, args);
48 ctx->err_code = XML_ERR_WARN;
52 ctx->err_code = XML_ERR_OK;
57 xml_error(struct xml_context *ctx, const char *format, ...)
62 va_start(args, format);
63 ctx->err_msg = stk_vprintf(format, args);
64 ctx->err_code = XML_ERR_ERROR;
68 ctx->err_code = XML_ERR_OK;
/* Record a fatal error in the context.
 * The printf-style message is formatted into ctx->pool (xml_warn and
 * xml_error format theirs with stk_vprintf instead), the error code becomes
 * XML_ERR_FATAL and the parser state becomes XML_STATE_FATAL.
 * NOTE(review): the rest of the function is not visible here; presumably it
 * finishes the va_list and throws -- confirm against the full file. */
73 xml_fatal(struct xml_context *ctx, const char *format, ...)
76 va_start(args, format);
77 ctx->err_msg = mp_vprintf(ctx->pool, format, args);
78 ctx->err_code = XML_ERR_FATAL;
79 ctx->state = XML_STATE_FATAL;
86 /*** Character categorization ***/
88 #include "obj/lib/xml-ucat.h"
94 return 1U << xml_char_tab2[(c & 0xff) + xml_char_tab1[c >> 8]];
95 else if (likely(c < 0x110000))
96 return 1U << xml_char_tab3[c >> 16];
101 /*** Reading of document/external entities ***/
104 xml_eof(struct xml_context *ctx)
106 ctx->err_msg = "Unexpected EOF";
107 ctx->err_code = XML_ERR_EOF;
/* Raise a fatal error for markup that is not properly nested inside its
 * entity. Called by xml_dec_depth() when the depth counter is already zero.
 * The message previously read "tested", a typo for "nested". */
112 xml_fatal_nested(struct xml_context *ctx)
114 xml_fatal(ctx, "Entity is not nested correctly");
118 xml_inc_depth(struct xml_context *ctx)
124 xml_dec_depth(struct xml_context *ctx)
126 if (unlikely(!ctx->depth))
127 xml_fatal_nested(ctx);
132 xml_push_source(struct xml_context *ctx, struct fastbuf *fb, uns flags)
134 DBG("XML: xml_push_source");
135 struct xml_source *osrc = ctx->sources;
138 osrc->bptr = ctx->bptr;
139 osrc->bstop = ctx->bstop;
140 osrc->depth = ctx->depth;
142 struct xml_source *src = mp_alloc(ctx->pool, sizeof(*src));
148 ctx->bstop = ctx->bptr = src->buf;
149 if (flags & XML_SRC_SURROUND)
152 *ctx->bptr++ = xml_char_cat(0x20);
157 xml_set_source(struct xml_context *ctx, struct fastbuf *fb)
159 xml_push_source(ctx, fb, XML_SRC_DOCUMENT | XML_SRC_DECL);
/* Leave the current input source and switch to the next one on the
 * ctx->sources list. A non-zero ctx->depth at this point is reported as a
 * fatal "Invalid entity nesting" error. */
163 xml_pop_source(struct xml_context *ctx)
165 DBG("XML: xml_pop_source");
166 if (unlikely(ctx->depth))
167 xml_fatal(ctx, "Invalid entity nesting");
168 struct xml_source *src = ctx->sources;
170 ctx->sources = src = src->next;
/* Restore the read window and depth that xml_push_source() saved into the
 * outer source.
 * NOTE(review): lines between are not visible; presumably the restore is
 * guarded by a check that an outer source exists -- confirm. */
173 ctx->bptr = src->bptr;
174 ctx->bstop = src->bstop;
175 ctx->depth = src->depth;
179 xml_error_restricted(struct xml_context *ctx, uns c)
181 xml_error(ctx, "Restricted char U+%04X", c);
182 return UNI_REPLACEMENT;
185 static void xml_parse_decl(struct xml_context *ctx);
188 xml_refill(struct xml_context *ctx)
191 // -- various encodings, especially UTF-16
192 // -- track col/row numbers
193 // -- report incorrect encoding
194 // -- deal with forbidden XML 1.1 newlines in xml/text decl
197 struct xml_source *src = ctx->sources;
198 uns c, t, t1, t2, f = src->flags;
201 else if (f & XML_SRC_DECL)
205 struct fastbuf *fb = src->fb;
206 if (ctx->bptr == ctx->bstop)
207 ctx->bptr = ctx->bstop = src->buf;
208 u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, *last_0xd = (f & XML_SRC_NEW_LINE) ? bstop : bend;
209 if (ctx->flags & XML_FLAG_VERSION_1_1)
211 t2 = XML_CHAR_NEW_LINE_1_1;
212 t1 = XML_CHAR_UNRESTRICTED_1_1 & ~t2;
216 t2 = XML_CHAR_NEW_LINE_1_0;
217 t1 = XML_CHAR_VALID_1_0 & ~t2;
221 c = bget_utf8_32(fb);
232 * XML 1.0: 0xA | 0xD | 0xD 0xA
233 * XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */
235 *bstop++ = xml_char_cat(0xa);
238 else if (c != 0x2028 && last_0xd != bstop - 2)
241 else if ((int)c >= 0)
243 /* Restricted character */
244 c = xml_error_restricted(ctx, c);
246 *bstop++ = xml_char_cat(c);
251 if (f & XML_SRC_SURROUND)
254 *bstop++ = xml_char_cat(0x20);
260 if (last_0xd == bstop)
261 f |= XML_SRC_NEW_LINE;
263 f &= ~XML_SRC_NEW_LINE;
264 ctx->sources->flags = f;
266 DBG("XML: refilled %u characters", (uns)(ctx->bstop - ctx->bptr) / 2);
269 while (ctx->bptr == ctx->bstop);
273 xml_peek_char(struct xml_context *ctx)
275 if (ctx->bptr == ctx->bstop)
281 xml_peek_cat(struct xml_context *ctx)
283 if (ctx->bptr == ctx->bstop)
289 xml_get_char(struct xml_context *ctx)
291 uns c = xml_peek_char(ctx);
297 xml_get_cat(struct xml_context *ctx)
299 uns c = xml_peek_cat(ctx);
/* Return the buffer slot two positions before the read pointer. Together
 * with xml_last_cat() (slot -1) and xml_unget_char() (steps back by 2) this
 * matches a layout of two u32 slots per character, code first.
 * The caller must have consumed at least one character. */
305 xml_last_char(struct xml_context *ctx)
307 return ctx->bptr[-2];
/* Return the buffer slot immediately before the read pointer; the refill
 * code stores xml_char_cat() results in that position, so this is the
 * category mask of the most recently consumed character. */
311 xml_last_cat(struct xml_context *ctx)
313 return ctx->bptr[-1];
317 xml_skip_char(struct xml_context *ctx)
319 uns c = ctx->bptr[0];
/* Step the read pointer back by two slots (one buffered character) and
 * return the value found at the new position. No bounds check is made, so
 * it is only valid right after a character has been consumed. */
325 xml_unget_char(struct xml_context *ctx)
327 return *(ctx->bptr -= 2);
330 /*** Basic parsing ***/
333 xml_fatal_expected(struct xml_context *ctx, uns c)
335 xml_fatal(ctx, "Expected '%c'", c);
339 xml_fatal_expected_white(struct xml_context *ctx)
341 xml_fatal(ctx, "Expected a white space");
345 xml_fatal_expected_quot(struct xml_context *ctx)
347 xml_fatal(ctx, "Expected a quotation mark");
351 xml_parse_white(struct xml_context *ctx, uns mandatory)
353 /* mandatory=1 -> S ::= (#x20 | #x9 | #xD | #xA)+
354 * mandatory=0 -> S? */
356 while (xml_peek_cat(ctx) & XML_CHAR_WHITE)
361 if (unlikely(mandatory && !cnt))
362 xml_fatal_expected_white(ctx);
/* Read the next character and require it to equal c; any other character
 * is reported through xml_fatal_expected(ctx, c). */
367 xml_parse_char(struct xml_context *ctx, uns c)
369 /* Consumes a given Unicode character */
370 if (unlikely(c != xml_get_char(ctx)))
371 xml_fatal_expected(ctx, c);
375 xml_parse_seq(struct xml_context *ctx, const char *seq)
377 /* Consumes a given sequence of ASCII characters */
379 xml_parse_char(ctx, *seq++);
/* Consume an equals sign with optional white space on both sides.
 * A missing '=' is a fatal error raised by xml_parse_char(). */
383 xml_parse_eq(struct xml_context *ctx)
385 /* Eq ::= S? '=' S? */
386 xml_parse_white(ctx, 0);
387 xml_parse_char(ctx, '=');
388 xml_parse_white(ctx, 0);
392 xml_parse_quote(struct xml_context *ctx)
395 uns c = xml_get_char(ctx);
396 if (unlikely(c != '\'' && c != '\"'))
397 xml_fatal_expected_quot(ctx);
401 /* Names and nmtokens */
404 xml_parse_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err)
406 char *p = mp_start_noalign(ctx->pool, 1);
407 if (unlikely(!(xml_peek_cat(ctx) & first_cat)))
408 xml_fatal(ctx, "%s", err);
411 p = mp_spread(ctx->pool, p, 5);
412 p = utf8_32_put(p, xml_skip_char(ctx));
414 while (xml_peek_cat(ctx) & next_cat);
416 return mp_end(ctx->pool, p);
420 xml_skip_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err)
422 if (unlikely(!(xml_get_cat(ctx) & first_cat)))
423 xml_fatal(ctx, "%s", err);
424 while (xml_peek_cat(ctx) & next_cat)
429 xml_parse_name(struct xml_context *ctx)
431 /* Name ::= NameStartChar (NameChar)* */
432 return xml_parse_string(ctx,
433 !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1,
434 !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1,
439 xml_skip_name(struct xml_context *ctx)
442 !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1,
443 !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1,
448 xml_parse_nmtoken(struct xml_context *ctx)
450 /* Nmtoken ::= (NameChar)+ */
451 uns cat = !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1;
452 return xml_parse_string(ctx, cat, cat, "Expected a nmtoken");
455 /* Simple literals */
458 xml_parse_system_literal(struct xml_context *ctx)
460 /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */
461 char *p = mp_start_noalign(ctx->pool, 1);
462 uns q = xml_parse_quote(ctx), c;
463 while ((c = xml_get_char(ctx)) != q)
465 p = mp_spread(ctx->pool, p, 5);
466 p = utf8_32_put(p, c);
469 return mp_end(ctx->pool, p);
473 xml_parse_pubid_literal(struct xml_context *ctx)
475 /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */
476 char *p = mp_start_noalign(ctx->pool, 1);
477 uns q = xml_parse_quote(ctx), c;
478 while ((c = xml_get_char(ctx)) != q)
480 if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID)))
481 xml_fatal(ctx, "Expected a pubid character");
482 p = mp_spread(ctx->pool, p, 2);
486 return mp_end(ctx->pool, p);
490 xml_parse_encoding_name(struct xml_context *ctx)
492 /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" */
493 char *p = mp_start_noalign(ctx->pool, 1);
494 uns q = xml_parse_quote(ctx);
495 if (unlikely(!(xml_peek_cat(ctx) & XML_CHAR_ENC_SNAME)))
496 xml_fatal(ctx, "Invalid character in the encoding name");
499 p = mp_spread(ctx->pool, p, 2);
500 *p++ = xml_skip_char(ctx);
501 if (xml_get_char(ctx) == q)
503 if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME)))
504 xml_fatal(ctx, "Invalid character in the encoding name");
507 return mp_end(ctx->pool, p);
510 /* Document/external entity header */
513 xml_detect_encoding(struct xml_context *ctx)
515 DBG("XML: xml_detect_encoding");
516 struct xml_source *src = ctx->sources;
517 struct fastbuf *fb = src->fb;
518 char *detected_encoding = NULL;
519 uns x = 0, l = 0, c, z = 1;
522 if (!~(c = bgetc(fb)))
524 src->flags |= XML_SRC_EOF;
527 else if (!c || c >= 0xfe || c == 0xa7 || c == 0x94)
529 else if ((c < 0x3c || c > 0x78))
543 xml_fatal(ctx, "UTF-16BE encoding not supported");
545 xml_fatal(ctx, "UTF-16LE encoding not supported");
553 xml_fatal(ctx, "UCS-4BE encoding not supported");
555 xml_fatal(ctx, "UCS-4LE encoding not supported");
557 xml_fatal(ctx, "UCS-4 encoding (order 2143) not supported");
559 xml_fatal(ctx, "UCS-4 encoding (order 3412) not supported");
561 xml_fatal(ctx, "UCS-4BE encoding not supported");
563 xml_fatal(ctx, "UCS-4LE encoding not supported");
565 xml_fatal(ctx, "UCS-4 encoding (order 2143) not supported");
567 xml_fatal(ctx, "UCS-4 encoding (order 3412) not supported");
569 xml_fatal(ctx, "UTF-16BE encoding not supported");
571 xml_fatal(ctx, "UTF-16LE encoding not supported");
573 xml_fatal(ctx, "EBCDIC encoding not supported");
579 xml_fatal(ctx, "Cannot detect the encoding");
580 ctx->bptr = ctx->bstop = src->buf + 8;
585 *--ctx->bptr = xml_char_cat(c);
588 if (!detected_encoding && ctx->bstop == ctx->bptr && xml_peek_char(ctx) == 0xfeff)
590 DBG("XML: Detected encoding: %s", detected_encoding ? : "UTF-8");
591 if (!(src->flags & XML_SRC_EOF))
596 xml_parse_decl(struct xml_context *ctx)
598 DBG("XML: xml_parse_decl");
599 ctx->sources->flags &= ~XML_SRC_DECL;
600 xml_detect_encoding(ctx);
601 uns document = ctx->sources->flags & XML_SRC_DOCUMENT;
602 u32 *bptr = ctx->bptr;
604 (12 <= ctx->bstop - ctx->bptr &&
605 bptr[0] == '<' && bptr[2] == '?' && (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L' &&
606 (bptr[11] & XML_CHAR_WHITE));
610 xml_fatal(ctx, "Missing or corrupted XML declaration header");
615 /* FIXME: the header must not contain exotic newlines */
616 xml_parse_white(ctx, 0);
618 if (xml_peek_char(ctx) == 'v')
620 xml_parse_seq(ctx, "version");
622 char *version = xml_parse_pubid_literal(ctx);
623 DBG("XML: Version=%s", version);
626 ctx->version_str = version;
627 if (!strcmp(ctx->version_str, "1.0"))
629 else if (!strcmp(ctx->version_str, "1.1"))
630 ctx->flags |= XML_FLAG_VERSION_1_1;
632 xml_fatal(ctx, "Unsupported XML version");
634 else if (strcmp(version, ctx->version_str))
635 xml_error(ctx, "Mixed XML versions");
638 xml_fatal(ctx, "Missing XML version");
640 // FIXME: TextDecl must contain encoding
641 if (!xml_parse_white(ctx, 0))
643 if (xml_peek_char(ctx) == 'e')
645 xml_parse_seq(ctx, "encoding");
647 ctx->encoding = xml_parse_encoding_name(ctx);
648 DBG("encoding=%s", ctx->encoding);
649 // FIXME: check encoding
650 if (!xml_parse_white(ctx, 0))
654 if (document && xml_peek_char(ctx) == 's')
656 xml_parse_seq(ctx, "standalone");
658 uns c = xml_parse_quote(ctx);
659 if (ctx->standalone = (xml_peek_char(ctx) == 'y'))
660 xml_parse_seq(ctx, "yes");
662 xml_parse_seq(ctx, "no");
663 xml_parse_char(ctx, c);
664 DBG("standalone=%d", ctx->standalone);
665 xml_parse_white(ctx, 0);
668 xml_parse_seq(ctx, "?>");
671 /*** Document Type Definition (DTD) ***/
675 #define HASH_PREFIX(x) xml_dtd_notns_##x
676 #define HASH_NODE struct xml_dtd_notn
677 #define HASH_KEY_STRING name
678 #define HASH_AUTO_POOL 1024
679 #define HASH_ZERO_FILL
680 #define HASH_TABLE_DYNAMIC
681 #define HASH_WANT_FIND
682 #define HASH_WANT_LOOKUP
683 #define HASH_WANT_CLEANUP
684 #include "lib/hashtable.h"
686 /* General entities */
688 #define HASH_PREFIX(x) xml_dtd_ents_##x
689 #define HASH_NODE struct xml_dtd_ent
690 #define HASH_KEY_STRING name
691 #define HASH_AUTO_POOL 1024
692 #define HASH_ZERO_FILL
693 #define HASH_TABLE_DYNAMIC
694 #define HASH_WANT_FIND
695 #define HASH_WANT_LOOKUP
696 #define HASH_WANT_CLEANUP
697 #include "lib/hashtable.h"
700 xml_dtd_declare_trivial_gent(struct xml_context *ctx, char *name, char *text)
702 struct xml_dtd *dtd = ctx->dtd;
703 struct xml_dtd_ent *ent = xml_dtd_ents_lookup(dtd->tab_gents, name);
704 if (ent->flags & XML_DTD_ENT_DECLARED)
706 xml_warn(ctx, "Entity &%s; already declared", name);
709 slist_add_tail(&dtd->gents, &ent->n);
710 ent->flags = XML_DTD_ENT_DECLARED | XML_DTD_ENT_TRIVIAL;
715 xml_dtd_declare_default_gents(struct xml_context *ctx)
717 xml_dtd_declare_trivial_gent(ctx, "lt", "<");
718 xml_dtd_declare_trivial_gent(ctx, "gt", ">");
719 xml_dtd_declare_trivial_gent(ctx, "amp", "&");
720 xml_dtd_declare_trivial_gent(ctx, "apos", "'");
721 xml_dtd_declare_trivial_gent(ctx, "quot", "\"");
/* Look up a general entity by name.
 * With a DTD table available the entity is returned only if it exists and
 * is flagged XML_DTD_ENT_DECLARED; otherwise NULL. The remaining visible
 * code compares the name against the five predefined entities (lt, gt, amp,
 * apos, quot), for which static trivial entries are defined.
 * NOTE(review): the branch structure around these lines is not visible here. */
724 static struct xml_dtd_ent *
725 xml_dtd_find_gent(struct xml_context *ctx, char *name)
727 struct xml_dtd *dtd = ctx->dtd;
730 struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_gents, name);
/* xml_dtd_ents_find() yields NULL for names never entered into the table
 * (xml_parse_parameter_ref checks for that too), so test ent before flags. */
731 return (ent && (ent->flags & XML_DTD_ENT_DECLARED)) ? ent : NULL;
735 #define ENT(n, t) ent_##n = { .name = #n, .text = t, .len = 1, .flags = XML_DTD_ENT_DECLARED | XML_DTD_ENT_TRIVIAL }
736 static struct xml_dtd_ent ENT(lt, "<"), ENT(gt, ">"), ENT(amp, "&"), ENT(apos, "'"), ENT(quot, "\"");
741 if (!strcmp(name, "lt"))
745 if (!strcmp(name, "gt"))
749 if (!strcmp(name, "amp"))
751 if (!strcmp(name, "apos"))
755 if (!strcmp(name, "quot"))
763 /* Parameter entities */
/* Look up a parameter entity by name in the DTD's parameter-entity table.
 * Returns the entity only if it exists and is flagged XML_DTD_ENT_DECLARED,
 * otherwise NULL. The lookup result is checked for NULL before its flags
 * are read, matching the check done in xml_parse_parameter_ref(). */
765 static struct xml_dtd_ent *
766 xml_dtd_find_pent(struct xml_context *ctx, char *name)
768 struct xml_dtd *dtd = ctx->dtd;
769 struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_pents, name);
770 return (ent && (ent->flags & XML_DTD_ENT_DECLARED)) ? ent : NULL;
775 #define HASH_PREFIX(x) xml_dtd_elems_##x
776 #define HASH_NODE struct xml_dtd_elem
777 #define HASH_KEY_STRING name
778 #define HASH_TABLE_DYNAMIC
779 #define HASH_AUTO_POOL 1024
780 #define HASH_ZERO_FILL
781 #define HASH_WANT_LOOKUP
782 #define HASH_WANT_CLEANUP
783 #include "lib/hashtable.h"
785 /* Element attributes */
787 struct xml_dtd_attrs_table;
790 xml_dtd_attrs_hash(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem, char *name)
792 return hash_pointer(elem) ^ hash_string(name);
796 xml_dtd_attrs_eq(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem1, char *name1, struct xml_dtd_elem *elem2, char *name2)
798 return (elem1 == elem2) && !strcmp(name1, name2);
802 xml_dtd_attrs_init_key(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_elem *elem, char *name)
808 #define HASH_PREFIX(x) xml_dtd_attrs_##x
809 #define HASH_NODE struct xml_dtd_attr
810 #define HASH_AUTO_POOL 1024
811 #define HASH_ZERO_FILL
812 #define HASH_TABLE_DYNAMIC
813 #define HASH_KEY_COMPLEX(x) x elem, x name
814 #define HASH_KEY_DECL struct xml_dtd_elem *elem, char *name
815 #define HASH_GIVE_HASHFN
817 #define HASH_GIVE_INIT_KEY
818 #define HASH_WANT_FIND
819 #define HASH_WANT_NEW
820 #define HASH_WANT_CLEANUP
821 #include "lib/hashtable.h"
823 /* Enumerated attribute values */
825 struct xml_dtd_evals_table;
828 xml_dtd_evals_hash(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr, char *val)
830 return hash_pointer(attr) ^ hash_string(val);
834 xml_dtd_evals_eq(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr1, char *val1, struct xml_dtd_attr *attr2, char *val2)
836 return (attr1 == attr2) && !strcmp(val1, val2);
840 xml_dtd_evals_init_key(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_eval *eval, struct xml_dtd_attr *attr, char *val)
846 #define HASH_PREFIX(x) xml_dtd_evals_##x
847 #define HASH_NODE struct xml_dtd_eval
848 #define HASH_AUTO_POOL 1024
849 #define HASH_TABLE_DYNAMIC
850 #define HASH_KEY_COMPLEX(x) x attr, x val
851 #define HASH_KEY_DECL struct xml_dtd_attr *attr, char *val
852 #define HASH_GIVE_HASHFN
854 #define HASH_GIVE_INIT_KEY
855 #define HASH_WANT_FIND
856 #define HASH_WANT_NEW
857 #define HASH_WANT_CLEANUP
858 #include "lib/hashtable.h"
860 /* Enumerated attribute notations */
862 struct xml_dtd_enotns_table;
865 xml_dtd_enotns_hash(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn)
867 return hash_pointer(attr) ^ hash_pointer(notn);
871 xml_dtd_enotns_eq(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr1, struct xml_dtd_notn *notn1, struct xml_dtd_attr *attr2, struct xml_dtd_notn *notn2)
873 return (attr1 == attr2) && (notn1 == notn2);
877 xml_dtd_enotns_init_key(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_enotn *enotn, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn)
883 #define HASH_PREFIX(x) xml_dtd_enotns_##x
884 #define HASH_NODE struct xml_dtd_enotn
885 #define HASH_AUTO_POOL 1024
886 #define HASH_TABLE_DYNAMIC
887 #define HASH_KEY_COMPLEX(x) x attr, x notn
888 #define HASH_KEY_DECL struct xml_dtd_attr *attr, struct xml_dtd_notn *notn
889 #define HASH_GIVE_HASHFN
891 #define HASH_GIVE_INIT_KEY
892 #define HASH_WANT_FIND
893 #define HASH_WANT_NEW
894 #define HASH_WANT_CLEANUP
895 #include "lib/hashtable.h"
897 /* DTD initialization/cleanup */
/* Create an empty DTD for the context.
 * The DTD structure and each of its seven hash tables (general entities,
 * parameter entities, notations, elements, attributes, enumerated values,
 * enumerated notations) are allocated zeroed from ctx->pool and initialized;
 * finally the default general entities are declared. */
900 xml_dtd_init(struct xml_context *ctx)
902 ctx->dtd = mp_alloc_zero(ctx->pool, sizeof(*ctx->dtd));
903 xml_dtd_ents_init(ctx->dtd->tab_gents = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_ents_table)));
904 xml_dtd_ents_init(ctx->dtd->tab_pents = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_ents_table)));
905 xml_dtd_notns_init(ctx->dtd->tab_notns = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_notns_table)));
906 xml_dtd_elems_init(ctx->dtd->tab_elems = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_elems_table)));
907 xml_dtd_attrs_init(ctx->dtd->tab_attrs = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_attrs_table)));
908 xml_dtd_evals_init(ctx->dtd->tab_evals = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_evals_table)));
909 xml_dtd_enotns_init(ctx->dtd->tab_enotns = mp_alloc_zero(ctx->pool, sizeof(struct xml_dtd_enotns_table)));
/* Register lt, gt, amp, apos and quot. */
910 xml_dtd_declare_default_gents(ctx);
/* Release the hash tables set up by xml_dtd_init(), one cleanup call per
 * table. The table structures themselves come from ctx->pool and are not
 * freed here.
 * NOTE(review): lines preceding the first cleanup call are not visible;
 * presumably they guard against ctx->dtd being NULL -- confirm. */
914 xml_dtd_cleanup(struct xml_context *ctx)
918 xml_dtd_ents_cleanup(ctx->dtd->tab_gents);
919 xml_dtd_ents_cleanup(ctx->dtd->tab_pents);
920 xml_dtd_notns_cleanup(ctx->dtd->tab_notns);
921 xml_dtd_elems_cleanup(ctx->dtd->tab_elems);
922 xml_dtd_attrs_cleanup(ctx->dtd->tab_attrs);
923 xml_dtd_evals_cleanup(ctx->dtd->tab_evals);
924 xml_dtd_enotns_cleanup(ctx->dtd->tab_enotns);
928 xml_dtd_finish(struct xml_context *ctx)
935 /*** Parsing functions ***/
940 xml_push_comment(struct xml_context *ctx)
942 /* Parse a comment to ctx->value:
943 * Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
944 * Already parsed: '<!-' */
945 struct fastbuf *out = ctx->value;
947 xml_parse_char(ctx, '-');
950 if ((c = xml_get_char(ctx)) == '-')
951 if ((c = xml_get_char(ctx)) == '-')
955 bput_utf8_32(out, c);
957 xml_parse_char(ctx, '>');
964 xml_pop_comment(struct xml_context *ctx)
966 fbgrow_rewind(ctx->value);
/* Skip over a comment without storing its text.
 * Requires one '-' first (like xml_push_comment, which documents '<!-' as
 * already parsed), then discards characters until two consecutive reads
 * both yield '-', and finally requires '>'. */
970 xml_skip_comment(struct xml_context *ctx)
972 xml_parse_char(ctx, '-');
/* The second xml_get_char() only runs when the first returned '-'. */
973 while (xml_get_char(ctx) != '-' || xml_get_char(ctx) != '-');
974 xml_parse_char(ctx, '>');
977 /* Processing instructions */
980 xml_push_pi(struct xml_context *ctx)
982 /* Parses a PI to ctx->value and ctx->name:
983 * PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
984 * PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
985 * Already parsed: '<?' */
987 ctx->name = xml_parse_name(ctx);
988 if (unlikely(!strcasecmp(ctx->name, "xml")))
989 xml_fatal(ctx, "Reserved PI target");
990 struct fastbuf *out = ctx->value;
991 if (xml_parse_white(ctx, 0))
992 xml_parse_seq(ctx, "?>");
998 if ((c = xml_get_char(ctx)) == '?')
999 if (xml_get_char(ctx) == '>')
1003 xml_unget_char(ctx);
1007 bput_utf8_32(out, c);
1016 xml_pop_pi(struct xml_context *ctx)
1018 fbgrow_reset(ctx->value);
1022 xml_skip_pi(struct xml_context *ctx)
1024 if (ctx->flags & XML_FLAG_VALIDATING)
1027 if (unlikely(!strcasecmp(xml_parse_name(ctx), "xml")))
1028 xml_fatal(ctx, "Reserved PI target");
1030 if (!xml_parse_white(ctx, 0))
1032 xml_parse_seq(ctx, "?>");
1037 if (xml_get_char(ctx) == '?')
1038 if (xml_get_char(ctx) == '>')
1041 xml_unget_char(ctx);
1044 /* Character references */
1047 xml_parse_char_ref(struct xml_context *ctx)
1049 /* CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
1050 * Already parsed: '&#' */
1052 if (xml_get_char(ctx) == 'x')
1054 if (!(xml_get_cat(ctx) & XML_CHAR_XDIGIT))
1056 xml_error(ctx, "Expected a hexadecimal value of character reference");
1061 v = (v << 4) + Cxvalue(xml_last_char(ctx));
1063 while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_XDIGIT));
1067 if (!(xml_last_cat(ctx) & XML_CHAR_DIGIT))
1069 xml_error(ctx, "Expected a numeric value of character reference");
1074 v = v * 10 + xml_last_char(ctx) - '0';
1076 while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT));
1078 uns cat = xml_char_cat(v);
1079 if (!(cat & XML_CHAR_UNRESTRICTED_1_1) && ((ctx->flags & XML_FLAG_VERSION_1_1) || !(cat & XML_CHAR_VALID_1_0)))
1081 xml_error(ctx, "Character reference out of range");
1084 if (xml_last_char(ctx) == ';')
1086 xml_error(ctx, "Expected ';'");
1088 while (xml_last_char(ctx) != ';')
1090 return UNI_REPLACEMENT;
1093 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
/* Parse the remainder of a parameter-entity reference (Name followed by
 * ';') and make the entity's replacement text the current input source.
 * Errors are reported via xml_error() for an unknown or undeclared entity,
 * for an entity already marked as visited (reference cycle), and for
 * external entities, which are not supported.
 * NOTE(review): what happens after each xml_error() call (presumably an
 * early return) is not visible here -- confirm. */
1096 xml_parse_parameter_ref(struct xml_context *ctx)
1098 char *name = xml_parse_name(ctx);
1099 xml_parse_char(ctx, ';');
1100 struct xml_dtd_ent *ent = xml_dtd_ents_find(ctx->dtd->tab_pents, name);
1101 if (!ent || !(ent->flags & XML_DTD_ENT_DECLARED))
1103 xml_error(ctx, "Reference to unknown parameter entity %%%s", name);
1106 if (ent->flags & XML_DTD_ENT_VISITED)
1108 xml_error(ctx, "Cycled references to parameter entity %%%s", name);
1111 if (ent->flags & XML_DTD_ENT_EXTERNAL)
1114 xml_error(ctx, "Support for external parsed entities not implemented");
/* Mark the entity as being expanded so a nested reference to it is caught
 * above; nothing visible here clears the flag again (see FIXME). */
1117 ent->flags |= XML_DTD_ENT_VISITED; // FIXME: clear
/* Wrap the entity text in a read-only fastbuf and push it as a new source
 * with no flags. */
1118 struct fastbuf *fb = mp_alloc(ctx->pool, sizeof(*fb));
1119 fbbuf_init_read(fb, ent->text, ent->len, 0);
1120 xml_push_source(ctx, fb, 0);
1124 xml_check_parameter_ref(struct xml_context *ctx)
1126 if (xml_get_char(ctx) != '%')
1128 xml_unget_char(ctx);
1131 xml_parse_parameter_ref(ctx);
1135 xml_parse_external_id(struct xml_context *ctx, struct xml_ext_id *eid, uns allow_public)
1137 bzero(eid, sizeof(*eid));
1138 uns c = xml_get_char(ctx);
1141 xml_parse_seq(ctx, "YSTEM");
1142 xml_parse_white(ctx, 1);
1143 eid->system_id = xml_parse_system_literal(ctx);
1147 xml_parse_seq(ctx, "UBLIC");
1148 xml_parse_white(ctx, 1);
1149 eid->public_id = xml_parse_pubid_literal(ctx);
1150 if (xml_parse_white(ctx, 1))
1151 if ((c = xml_get_char(ctx)) == '\'' || c == '"' || !allow_public)
1153 xml_unget_char(ctx);
1154 eid->system_id = xml_parse_system_literal(ctx);
1157 xml_unget_char(ctx);
1160 xml_fatal(ctx, "Expected an external ID");
/* Parse the body of a notation declaration and record it in the DTD.
 * The notation node is obtained with xml_dtd_notns_lookup() on the parsed
 * name, the external/public ID is parsed with allow_public = 1, and the
 * closing '>' is required. A notation already flagged as declared only
 * produces a warning.
 * NOTE(review): the lines storing eid into the notation are not visible. */
1164 xml_parse_notation_decl(struct xml_context *ctx)
1166 /* NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>' */
1167 xml_parse_white(ctx, 1);
1168 struct xml_dtd_notn *notn = xml_dtd_notns_lookup(ctx->dtd->tab_notns, xml_parse_name(ctx));
1169 xml_parse_white(ctx, 1);
1170 struct xml_ext_id eid;
1171 xml_parse_external_id(ctx, &eid, 1);
1172 xml_parse_white(ctx, 0);
1173 xml_parse_char(ctx, '>');
1174 if (notn->flags & XML_DTD_NOTN_DECLARED)
1175 xml_warn(ctx, "Notation %s already declared", notn->name);
1178 notn->flags = XML_DTD_NOTN_DECLARED;
1184 xml_parse_internal_subset(struct xml_context *ctx)
1188 xml_parse_white(ctx, 0);
1189 uns c = xml_get_char(ctx);
1191 if ((c = xml_get_char(ctx)) == '!')
1192 switch (c = xml_get_char(ctx))
1195 xml_push_comment(ctx);
1196 xml_pop_comment(ctx);
1199 xml_parse_seq(ctx, "OTATION");
1200 xml_parse_notation_decl(ctx);
1203 if ((c = xml_get_char(ctx)) == 'N')
1205 xml_parse_seq(ctx, "TITY");
1206 //xml_parse_entity_decl(ctx);
1210 xml_parse_seq(ctx, "EMENT");
1214 goto invalid_markup;
1217 xml_parse_seq(ctx, "TTLIST");
1221 goto invalid_markup;
1229 goto invalid_markup;
1231 xml_parse_parameter_ref(ctx);
1235 goto invalid_markup;
1239 xml_fatal(ctx, "Invalid markup in the internal subset");
1242 /*----------------------------------------------*/
1247 struct xml_attribute_table;
1249 #define HASH_PREFIX(x) xml_attribute_##x
1250 #define HASH_NODE struct xml_attribute
1251 #define HASH_KEY_COMPLEX(x) x element, x name
1252 #define HASH_KEY_DECL struct xml_element *element, char *name
1253 #define HASH_TABLE_DYNAMIC
1254 #define HASH_AUTO_POOL 1024
1256 #define HASH_GIVE_HASHFN
1259 xml_attribute_hash(struct xml_attribute_table *t UNUSED, struct xml_element *e, char *n)
1261 return hash_pointer(e) ^ hash_string(n);
1264 #define HASH_GIVE_EQ
1267 xml_attribute_eq(struct xml_attribute_table *t UNUSED, struct xml_element *e1, char *n1, struct xml_element *e2, char *n2)
1269 return (e1 == e2) && !strcmp(n1, n2);
1272 #define HASH_GIVE_INIT_KEY
1275 xml_attribute_init_key(struct xml_attribute_table *t UNUSED, struct xml_attribute *a, struct xml_element *e, char *name)
1284 #define HASH_WANT_CLEANUP
1285 #define HASH_WANT_REMOVE
1286 #define HASH_WANT_LOOKUP
1287 #define HASH_WANT_FIND
1288 #include "lib/hashtable.h"
1292 #define HASH_PREFIX(x) xml_parsed_entities_##x
1293 #define HASH_NODE struct xml_parsed_entity
1294 #define HASH_KEY_STRING name
1295 #define HASH_TABLE_DYNAMIC
1296 #define HASH_AUTO_POOL 1024
1297 #define HASH_WANT_CLEANUP
1298 #include "lib/hashtable.h"
/* Initialize a parser context: zero the whole structure, create its memory
 * pool and create the two growing buffers used for character data
 * (ctx->chars) and for values (ctx->value).
 * NOTE(review): 65536 and 4096 are presumably initial chunk/buffer sizes
 * in bytes -- confirm against mp_new() and fbgrow_create(). */
1302 xml_init(struct xml_context *ctx)
1304 bzero(ctx, sizeof(*ctx));
1305 ctx->pool = mp_new(65536);
1306 ctx->chars = fbgrow_create(4096);
1307 ctx->value = fbgrow_create(4096);
/* Tear down a parser context: clean up the DTD hash tables, then delete the
 * memory pool.
 * NOTE(review): lines between the two calls are not visible; presumably
 * they close ctx->chars and ctx->value -- confirm. */
1312 xml_cleanup(struct xml_context *ctx)
1314 xml_dtd_cleanup(ctx);
1317 mp_delete(ctx->pool);
1321 xml_parse_cdata(struct xml_context *ctx)
1323 struct fastbuf *out = ctx->chars;
1324 xml_parse_seq(ctx, "CDATA[");
1328 if ((c = xml_get_char(ctx)) == ']')
1330 if ((c = xml_get_char(ctx)) == ']')
1331 if ((c = xml_get_char(ctx)) == '>')
1337 bput_utf8_32(out, c);
1342 xml_skip_cdata(struct xml_context *ctx)
1344 xml_parse_cdata(ctx);
1348 xml_parse_ref_entity(struct xml_context *ctx UNUSED, struct fastbuf *out UNUSED, struct xml_dtd_ent *entity UNUSED)
1351 for (struct xml_dtd_ent_node *node = entity->list; node; node = node->next)
1353 bwrite(out, node->ptr, node->len);
1355 xml_parse_ref_entity(ctx, out, node->ptr); // FIXME: do not call the recursion on stack -- could cause segfault
1360 xml_parse_ref(struct xml_context *ctx, struct fastbuf *out)
1362 if (xml_get_char(ctx) == '#')
1364 uns c = xml_parse_char_ref(ctx);
1365 bput_utf8_32(out, c);
1370 xml_unget_char(ctx);
1372 char *name = xml_parse_name(ctx);
1373 struct xml_parsed_entity *entity = xml_find_parsed_entity(ctx, name);
1375 xml_parse_char(ctx, ';');
1376 xml_parse_ref_entity(ctx, out, entity);
1382 xml_parse_chars(struct xml_context *ctx)
1385 struct fastbuf *out = ctx->chars;
1387 while ((c = xml_get_char(ctx)) != '<')
1389 xml_parse_ref(ctx, out);
1391 bput_utf8_32(out, c);
1392 xml_unget_char(ctx);
1396 xml_parse_attr(struct xml_context *ctx)
1399 struct xml_element *e = ctx->element;
1400 char *name = xml_parse_name(ctx);
1401 struct xml_attribute *a = xml_attribute_lookup(ctx->attribute_table, e, name);
1403 xml_fatal(ctx, "Attribute is not unique");
1406 char *value = xml_parse_system_literal(ctx);
1411 xml_parse_stag(struct xml_context *ctx)
1415 struct xml_element *e = mp_alloc_zero(ctx->pool, sizeof(*e));
1416 e->parent = ctx->element;
1418 e->name = xml_parse_name(ctx);
1421 uns white = xml_parse_white(ctx, 0);
1422 uns c = xml_get_char(ctx);
1425 xml_parse_char(ctx, '>');
1431 xml_fatal(ctx, "Expected a white space");
1432 xml_unget_char(ctx);
1433 xml_parse_attr(ctx);
/* Parse an element end tag and close the current element.
 * The parsed name must equal the name of ctx->element, otherwise a fatal
 * error is raised. After optional white space and '>', every attribute of
 * the element is removed from ctx->attribute_table and the parent element
 * becomes current again. */
1438 xml_parse_etag(struct xml_context *ctx)
1441 struct xml_element *e = ctx->element;
1443 char *name = xml_parse_name(ctx);
1444 if (strcmp(name, e->name))
1445 xml_fatal(ctx, "Invalid ETag, expected '%s'", e->name);
1446 xml_parse_white(ctx, 0);
1447 xml_parse_char(ctx, '>');
1448 // FIXME: remove on pooled hashtable?
1449 for (struct xml_attribute *a = e->attrs; a; a = a->next)
1450 xml_attribute_remove(ctx->attribute_table, a);
1451 ctx->element = e->parent;
1456 xml_parse_element_decl(struct xml_context *ctx)
1460 xml_parse_seq(ctx, "<!ELEMENT");
1461 xml_parse_white(ctx, 1);
1462 xml_parse_name(ctx);
1463 xml_parse_white(ctx, 1);
1465 uns c = xml_get_char(ctx);
1468 xml_parse_seq(ctx, "MPTY");
1473 xml_parse_seq(ctx, "NY");
1478 xml_parse_white(ctx, 0);
1479 if (xml_get_char(ctx) == '#')
1481 xml_parse_seq(ctx, "PCDATA");
1484 xml_parse_white(ctx, 0);
1485 if ((c = xml_get_char(ctx)) == ')')
1488 xml_fatal_expected(ctx, ')');
1489 xml_parse_white(ctx, 0);
1490 xml_parse_name(ctx);
1496 xml_unget_char(ctx);
1500 xml_parse_white(ctx, 0);
1501 if ((c = xml_get_char(ctx)) == '(')
1507 if ((c = xml_get_char(ctx)) == '?' || c == '*' || c == '+')
1511 xml_unget_char(ctx);
1523 xml_unget_char(ctx);
1524 xml_parse_name(ctx);
1530 xml_fatal(ctx, "Expected element content specification");
1532 xml_parse_white(ctx, 0);
1533 xml_parse_char(ctx, '>');
1539 xml_parse_attr_list_decl(struct xml_context *ctx)
1541 /* AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
1542 * AttDef ::= S Name S AttType S DefaultDecl */
1543 xml_parse_seq(ctx, "ATTLIST");
1544 xml_parse_white(ctx, 1);
1545 struct xml_dtd_elem *e = xml_dtd_elems_lookup(ctx->dtd->tab_elems, xml_parse_name(ctx));
1546 e->attlist_declared = 1;
1548 while (xml_parse_white(ctx, 0) && xml_get_char(ctx) != '>')
1550 xml_unget_char(ctx);
1551 char *name = xml_parse_name(ctx);
1552 struct xml_dtd_attr *a = xml_dtd_attrs_find(ctx->dtd->tab_attrs, e, name);
1556 xml_warn(ctx, "Duplicate attribute definition");
1560 a = xml_dtd_attrs_new(ctx->dtd->tab_attrs, e, name);
1561 xml_parse_white(ctx, 1);
1562 if (xml_get_char(ctx) == '(')
1565 a->type = XML_ATTR_ENUM;
1568 xml_parse_white(ctx, 0);
1569 char *value = xml_parse_nmtoken(ctx);
1571 if (xml_dtd_evals_find(ctx->dtd->tab_evals, a, value))
1572 xml_error(ctx, "Duplicate enumeration value");
1574 xml_dtd_evals_new(ctx->dtd->tab_evals, a, value);
1575 xml_parse_white(ctx, 0);
1577 while (xml_get_char(ctx) == '|');
1578 xml_unget_char(ctx);
1579 xml_parse_char(ctx, ')');
1583 xml_unget_char(ctx);
1584 char *type = xml_parse_name(ctx);
1585 enum xml_dtd_attribute_type t;
1586 if (!strcmp(type, "CDATA"))
1588 else if (!strcmp(type, "ID"))
1590 else if (!strcmp(type, "IDREF"))
1592 else if (!strcmp(type, "IDREFS"))
1593 t = XML_ATTR_IDREFS;
1594 else if (!strcmp(type, "ENTITY"))
1595 t = XML_ATTR_ENTITY;
1596 else if (!strcmp(type, "ENTITIES"))
1597 t = XML_ATTR_ENTITIES;
1598 else if (!strcmp(type, "NMTOKEN"))
1599 t = XML_ATTR_NMTOKEN;
1600 else if (!strcmp(type, "NMTOKENS"))
1601 t = XML_ATTR_NMTOKENS;
1602 else if (!strcmp(type, "NOTATION"))
1604 t = XML_ATTR_NOTATION;
1605 xml_parse_white(ctx, 1);
1606 xml_parse_char(ctx, '(');
1609 xml_parse_white(ctx, 0);
1610 struct xml_dtd_notn *n = xml_dtd_notns_lookup(ctx->dtd->tab_notns, xml_parse_name(ctx));
1612 if (xml_dtd_enotns_find(ctx->dtd->tab_enotns, a, n))
1613 xml_error(ctx, "Duplicate enumerated notation");
1615 xml_dtd_enotns_new(ctx->dtd->tab_enotns, a, n);
1616 xml_parse_white(ctx, 0);
1618 while (xml_get_char(ctx) == '|');
1619 xml_unget_char(ctx);
1620 xml_parse_char(ctx, ')');
1623 xml_fatal(ctx, "Unknown attribute type");
1627 xml_parse_white(ctx, 1);
1628 enum xml_dtd_attribute_default def = XML_ATTR_NONE;
1629 if (xml_get_char(ctx) == '#')
1630 switch (xml_get_char(ctx))
1633 xml_parse_seq(ctx, "EQUIRED");
1634 def = XML_ATTR_REQUIRED;
1637 xml_parse_seq(ctx, "MPLIED");
1638 def = XML_ATTR_IMPLIED;
1641 xml_parse_seq(ctx, "IXED");
1642 def = XML_ATTR_FIXED;
1645 xml_fatal(ctx, "Expected a modifier for default attribute value");
1648 xml_unget_char(ctx);
1649 if (def != XML_ATTR_REQUIRED && def != XML_ATTR_IMPLIED)
1651 xml_parse_system_literal(ctx);
/*
 * Parse one entity declaration and record it in ctx->dtd.
 *
 * NOTE(review): this listing has lines elided (braces, else/do keywords and
 * other short statements are not visible), so the notes below describe only
 * the statements that can actually be seen here.
 *
 * Visible flow:
 *  - A percent sign right after the leading white space selects
 *    XML_DTD_ENT_PARAMETER; flags then chooses between the parameter tables
 *    (tab_pents, pents) and the general tables (tab_gents, gents).
 *  - The entity is looked up by its parsed name. If it already carries
 *    XML_DTD_ENT_DECLARED, xml_fatal() is raised (the FIXME below says this
 *    should only be a warning).
 *  - If the next character is a single or double quote, the declaration is
 *    read as an internal EntityValue into ctx->value. Character references go
 *    through xml_parse_char_ref(); references to general entities are bypassed
 *    (the parsed name is written to the output and a semicolon is required).
 *    The entity is then appended to the list, flagged as declared, and its
 *    text is copied into ctx->pool with mp_memdup().
 *  - Otherwise the declaration is treated as an external entity: the external
 *    id is parsed, an optional NDATA notation name is looked up in
 *    dtd->tab_notns, and the entity is flagged as declared and external.
 *
 * NOTE(review): ent->len and ent->text are computed from out->bptr and
 * out->bstop; whether bptr points at the start of the collected value at that
 * moment depends on elided lines -- confirm.
 *
 * NOTE(review): the external branch tests (!xml_parse_white(ctx, 0) || !flags).
 * flags is non-zero only for parameter entities, so the NDATA branch is reached
 * only for parameter entities, while its own comment talks about a general
 * unparsed entity. The !flags term looks inverted -- confirm against the
 * XML 1.0 EntityDef / PEDef productions.
 */
1659 xml_parse_entity_decl(struct xml_context *ctx)
1661 struct xml_dtd *dtd = ctx->dtd;
1662 xml_parse_white(ctx, 1);
/* Non-zero flags means a parameter entity (declared with a percent sign) */
1664 uns flags = (xml_get_char(ctx) == '%') ? XML_DTD_ENT_PARAMETER : 0;
1666 xml_parse_white(ctx, 1);
1668 xml_unget_char(ctx);
/* Parameter and general entities live in separate tables and lists */
1670 struct xml_dtd_ent *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_gents, xml_parse_name(ctx));
1671 slist *list = flags ? &dtd->pents : &dtd->gents;
1672 xml_parse_white(ctx, 1);
1673 if (ent->flags & XML_DTD_ENT_DECLARED)
1675 xml_fatal(ctx, "Entity &%s; already declared, skipping not implemented", ent->name);
1676 // FIXME: should be only warning
/* The first character decides between a quoted internal value and an external id */
1679 uns sep = xml_get_char(ctx), c;
1680 if (sep == '\'' || sep == '"')
1683 * EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&'] | PEReference | Reference)* "'" */
1684 struct fastbuf *out = ctx->value;
1688 if ((c = xml_get_char(ctx)) == sep)
1694 //xml_parse_parameter_ref(ctx);
1697 bput_utf8_32(out, c);
1698 else if ((c = xml_get_char(ctx)) == '#')
1699 c = xml_parse_char_ref(ctx);
1702 /* Bypass references to general entities */
1705 xml_unget_char(ctx);
1706 bputs(out, xml_parse_name(ctx));
1707 xml_parse_char(ctx, ';');
1714 slist_add_tail(list, &ent->n);
1715 ent->flags = flags | XML_DTD_ENT_DECLARED;
1716 ent->len = out->bstop - out->bptr - 1;
1717 ent->text = mp_memdup(ctx->pool, out->bptr, ent->len + 1);
1722 /* External entity */
1723 struct xml_ext_id eid;
1724 struct xml_dtd_notn *notn = NULL;
1725 xml_parse_external_id(ctx, &eid, 0);
1726 if (!xml_parse_white(ctx, 0) || !flags)
1727 xml_parse_char(ctx, '>');
1728 else if (xml_get_char(ctx) != '>')
1730 /* General external unparsed entity */
1731 flags |= XML_DTD_ENT_UNPARSED;
1732 xml_parse_seq(ctx, "NDATA");
1733 xml_parse_white(ctx, 1);
1734 notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx));
1736 slist_add_tail(list, &ent->n);
1737 ent->flags = flags | XML_DTD_ENT_DECLARED | XML_DTD_ENT_EXTERNAL;
/*
 * Parse the head of a document type declaration: the DOCTYPE keyword, the
 * document type name and an optional external id, then invoke the
 * h_doctype_decl hook if one is set.
 *
 * The parsed name is stored in ctx->document_type and the external id in
 * ctx->eid. A second declaration (ctx->document_type already set) is reported
 * through xml_fatal().
 *
 * NOTE(review): lines are elided from this listing (braces and other short
 * lines), so only the visible statements are described.
 */
1744 xml_parse_doctype_decl(struct xml_context *ctx)
/* Only one document type declaration is accepted per document */
1746 if (ctx->document_type)
1747 xml_fatal(ctx, "Multiple document types not allowed");
1748 xml_parse_seq(ctx, "DOCTYPE");
1749 xml_parse_white(ctx, 1);
1750 ctx->document_type = xml_parse_name(ctx);
1751 DBG("XML: DocumentType=%s", ctx->document_type);
/* Remember the result of the white space parse; it gates the external id below */
1752 uns white = xml_parse_white(ctx, 0);
1753 uns c = xml_peek_char(ctx);
/* An external id is parsed only when white is non-zero and the next character
 * is neither the closing bracket of the declaration nor the start of an
 * internal subset */
1754 if (c != '>' && c != '[' && white)
1756 xml_parse_external_id(ctx, &ctx->eid, 0);
1757 xml_parse_white(ctx, 0);
/* Optional user hook */
1759 if (ctx->h_doctype_decl)
1760 ctx->h_doctype_decl(ctx);
/*
 * Advance the parser and return the next reported state (one of the
 * XML_STATE_* values returned below).
 *
 * The function is written as a resumable state machine: ctx->state records
 * where to continue, and the case labels inside the body are the re-entry
 * points for the following call. Errors arrive through setjmp(throw_buf);
 * ctx->throw_buf is pointed at that buffer on every call.
 *
 * Events are reported only when the matching XML_WANT_* bit is set in
 * ctx->want (DECL, PI, COMMENT, DOCUMENT_TYPE, CDATA, STAG, ETAG); otherwise
 * the visible code skips the construct or falls through to the next state.
 *
 * Pending character data: before reporting a construct that follows text, the
 * code checks btell(ctx->chars), rewinds the buffer, stores a CHARS_BEFORE_*
 * state and returns, so the text is delivered first and the construct is
 * handled on the next call.
 *
 * NOTE(review): this listing has lines elided (braces, else/do/switch/break
 * and other short statements), so the exact nesting cannot be verified here;
 * the comments describe only visible statements.
 */
1764 xml_next(struct xml_context *ctx)
1766 /* A nasty state machine */
1768 DBG("XML: xml_next (state=%u)", ctx->state);
/* Install the jump target used for error delivery during this call */
1770 ctx->throw_buf = &throw_buf;
1771 if (setjmp(throw_buf))
1774 if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal)
1776 ctx->state = XML_STATE_FATAL;
1777 DBG("XML: raised fatal error");
1783 case XML_STATE_FATAL:
1786 case XML_STATE_START:
1787 DBG("XML: Entering Prolog");
1788 if (ctx->h_document_start)
1789 ctx->h_document_start(ctx);
1792 if (ctx->h_xml_decl)
1793 ctx->h_xml_decl(ctx);
1794 if (ctx->want & XML_WANT_DECL)
1795 return ctx->state = XML_STATE_DECL;
1796 case XML_STATE_DECL:
1798 /* Misc* (doctypedecl Misc*)? */
1801 xml_parse_white(ctx, 0);
1802 xml_parse_char(ctx, '<');
1803 if ((c = xml_get_char(ctx)) == '?')
1804 /* Processing instruction */
1805 if (!(ctx->want & XML_WANT_PI))
1810 ctx->state = XML_STATE_PROLOG_PI;
1811 return XML_STATE_PI;
1812 case XML_STATE_PROLOG_PI:
1817 /* Found the root tag */
1818 xml_unget_char(ctx);
1821 else if (xml_get_char(ctx) == '-')
1822 if (!(ctx->want & XML_WANT_COMMENT))
1823 xml_skip_comment(ctx);
1826 xml_push_comment(ctx);
1827 ctx->state = XML_STATE_PROLOG_COMMENT;
1828 return XML_STATE_COMMENT;
1829 case XML_STATE_PROLOG_COMMENT:
1830 xml_pop_comment(ctx);
1835 xml_unget_char(ctx);
1836 xml_parse_doctype_decl(ctx);
1837 if (ctx->want & XML_WANT_DOCUMENT_TYPE)
1838 return ctx->state = XML_STATE_DOCUMENT_TYPE;
1839 case XML_STATE_DOCUMENT_TYPE:
/* An internal subset is not interpreted here: characters are consumed up to
 * the closing square bracket */
1840 if (xml_peek_char(ctx) == '[')
1844 while (xml_get_char(ctx) != ']');
1845 xml_parse_white(ctx, 0);
1847 xml_parse_char(ctx, '>');
1853 case XML_STATE_COMMENT:
1854 fbgrow_reset(ctx->value);
1856 case XML_STATE_CHARS:
1860 if (xml_get_char(ctx) != '<')
1863 xml_unget_char(ctx);
1864 xml_parse_chars(ctx);
1869 if ((c = xml_get_char(ctx)) == '?')
1872 if (!(ctx->want & XML_WANT_PI))
1876 if (btell(ctx->chars))
1878 fbgrow_rewind(ctx->chars);
1879 ctx->state = XML_STATE_CHARS_BEFORE_PI;
/* NOTE(review): every other CHARS_BEFORE_* branch below returns
 * XML_STATE_CHARS at this point so the pending text is reported first; this
 * one returns XML_STATE_PI. Looks like a copy-paste slip -- confirm. */
1880 return XML_STATE_PI;
1881 case XML_STATE_CHARS_BEFORE_PI:
1882 fbgrow_reset(ctx->chars);
1885 return ctx->state = XML_STATE_PI;
1890 if ((c = xml_get_char(ctx)) == '-')
1893 if (!(ctx->want & XML_WANT_COMMENT))
1894 xml_skip_comment(ctx);
1897 if (btell(ctx->chars))
1899 fbgrow_rewind(ctx->chars);
1900 ctx->state = XML_STATE_CHARS_BEFORE_COMMENT;
1901 return XML_STATE_CHARS;
1902 case XML_STATE_CHARS_BEFORE_COMMENT:
1903 fbgrow_reset(ctx->chars);
1905 xml_push_comment(ctx);
1906 return ctx->state = XML_STATE_COMMENT;
1912 if (!(ctx->want & XML_WANT_CDATA))
1913 xml_skip_cdata(ctx);
1916 if (btell(ctx->chars))
1918 fbgrow_rewind(ctx->chars);
1919 ctx->state = XML_STATE_CHARS_BEFORE_CDATA;
1920 return XML_STATE_CHARS;
1921 case XML_STATE_CHARS_BEFORE_CDATA:
1922 fbgrow_reset(ctx->chars);
1924 xml_parse_cdata(ctx);
1925 if (btell(ctx->chars))
1927 fbgrow_rewind(ctx->chars);
1928 return ctx->state = XML_STATE_CDATA;
1930 case XML_STATE_CDATA:
1931 fbgrow_reset(ctx->chars);
1935 xml_fatal(ctx, "Unexpected character after '<!'");
1939 /* STag | EmptyElemTag */
1940 xml_unget_char(ctx);
1941 if (btell(ctx->chars))
1943 fbgrow_rewind(ctx->chars);
1944 ctx->state = XML_STATE_CHARS_BEFORE_STAG;
1945 return XML_STATE_CHARS;
1946 case XML_STATE_CHARS_BEFORE_STAG:
1947 fbgrow_reset(ctx->chars);
1950 if (xml_parse_stag(ctx))
1953 if (ctx->want & XML_WANT_STAG)
1954 return ctx->state = XML_STATE_STAG;
1955 case XML_STATE_STAG:
1956 // FIXME: EmptyElemTag
1964 if (btell(ctx->chars))
1966 fbgrow_rewind(ctx->chars);
1967 ctx->state = XML_STATE_CHARS_BEFORE_ETAG;
1968 return XML_STATE_CHARS;
1969 case XML_STATE_CHARS_BEFORE_ETAG:
1970 fbgrow_reset(ctx->chars);
1973 if (ctx->want & XML_WANT_ETAG)
1974 return ctx->state = XML_STATE_ETAG;
1975 case XML_STATE_ETAG:
1977 xml_parse_etag(ctx);
1986 DBG("XML: Entering epilog");
1989 /* Epilog whitespace is the only place where a valid document can reach EOF */
/* The jump target is re-armed so that XML_ERR_EOF raised while skipping epilog
 * white space ends the document instead of being treated as an error */
1990 if (setjmp(throw_buf))
1991 if (ctx->err_code == XML_ERR_EOF)
1993 DBG("XML: Reached EOF");
1994 ctx->state = XML_STATE_EOF;
1995 if (ctx->h_document_end)
1996 ctx->h_document_end(ctx);
1998 return XML_STATE_EOF;
2002 xml_parse_white(ctx, 0);
/* Re-armed once more after the white space; the handler body is elided from
 * this listing */
2003 if (setjmp(throw_buf))
2007 xml_parse_char(ctx, '<');
2008 if ((c = xml_get_char(ctx)) == '?')
2009 /* Processing instruction */
2010 if (!(ctx->want & XML_WANT_PI))
/* Comma operator: store the resume state EPILOG_PI, but report XML_STATE_PI */
2015 return ctx->state = XML_STATE_EPILOG_PI, XML_STATE_PI;
2016 case XML_STATE_EPILOG_PI:
2021 if (!(ctx->want & XML_WANT_COMMENT))
2022 xml_skip_comment(ctx);
2025 xml_push_comment(ctx);
/* Same pattern: resume at EPILOG_COMMENT, report XML_STATE_COMMENT */
2026 return ctx->state = XML_STATE_EPILOG_COMMENT, XML_STATE_COMMENT;
2027 case XML_STATE_EPILOG_COMMENT:
2028 xml_pop_comment(ctx);
2031 xml_fatal(ctx, "Syntax error in the epilog");
/*
 * Diagnostic callback used by the test driver: logs ctx->err_msg through
 * msg() with the prefix XML. Codes below XML_ERR_ERROR are logged at level
 * L_WARN_R, everything else at L_ERROR_R.
 */
2041 error(struct xml_context *ctx)
2043 msg((ctx->err_code < XML_ERR_ERROR) ? L_WARN_R : L_ERROR_R, "XML: %s", ctx->err_msg);
/*
 * Test driver: parses the document read from in and writes one line per
 * reported parser event to out.
 *
 * All three diagnostic hooks (h_warn, h_error, h_fatal) are pointed at
 * error(), and every event kind is requested via XML_WANT_ALL. The loop runs
 * for as long as xml_next() returns a non-negative state.
 *
 * NOTE(review): lines are elided from this listing (the switch header, some
 * case labels, break statements, and the declaration of state), so only the
 * visible statements are described.
 */
2047 test(struct fastbuf *in, struct fastbuf *out)
2049 struct xml_context ctx;
2051 ctx.h_warn = ctx.h_error = ctx.h_fatal = error;
2052 ctx.want = XML_WANT_ALL;
2053 xml_set_source(&ctx, in);
2055 while ((state = xml_next(&ctx)) >= 0)
2058 case XML_STATE_CHARS:
/* Buffers are printed with an explicit length (bstop - buffer) via %.*s, so no
 * terminating zero byte is needed */
2059 bprintf(out, "CHARS [%.*s]\n", (int)(ctx.chars->bstop - ctx.chars->buffer), ctx.chars->buffer);
2061 case XML_STATE_STAG:
2062 bprintf(out, "STAG <%s>\n", ctx.element->name);
/* Attributes form a singly linked list hanging off the current element */
2063 for (struct xml_attribute *a = ctx.element->attrs; a; a = a->next)
2064 bprintf(out, " ATTR %s=[%s]\n", a->name, a->value);
2066 case XML_STATE_ETAG:
2067 bprintf(out, "ETAG </%s>\n", ctx.element->name);
2069 case XML_STATE_COMMENT:
2070 bprintf(out, "COMMENT [%.*s]\n", (int)(ctx.value->bstop - ctx.value->buffer), ctx.value->buffer);
2073 bprintf(out, "PI [%s] [%.*s]\n", ctx.name, (int)(ctx.value->bstop - ctx.value->buffer), ctx.value->buffer);
2075 case XML_STATE_CDATA:
2076 bprintf(out, "CDATA [%.*s]\n", (int)(ctx.chars->bstop - ctx.chars->buffer), ctx.chars->buffer);
2079 bprintf(out, "EOF\n");
/* NOTE(review): state is compared with >= 0 in the loop condition, so it is
 * presumably a signed int; %u would then be a mismatched conversion
 * specifier -- confirm against the elided declaration. */
2082 bprintf(out, "STATE %u\n", state);
2092 struct fastbuf *in = bfdopen_shared(0, 1024);
2093 struct fastbuf *out = bfdopen_shared(1, 1024);