2 * Sherlock Library -- A simple XML parser
4 * (c) 2007 Pavel Charvat <pchar@ucw.cz>
6 * This software may be freely distributed and used according to the terms
7 * of the GNU Lesser General Public License.
12 * - stack-like memory handling where possible
18 #include "lib/mempool.h"
19 #include "lib/fastbuf.h"
20 #include "lib/ff-unicode.h"
21 #include "lib/ff-binary.h"
22 #include "lib/chartype.h"
23 #include "lib/unicode.h"
24 #include "lib/hashfunc.h"
25 #include "lib/stkstring.h"
26 #include "lib/unaligned.h"
27 #include "charset/charconv.h"
28 #include "charset/fb-charconv.h"
29 #include "sherlock/xml/xml.h"
36 #define TRACE(c, f, p...) do { DBG("XML %u: " f, xml_row(c), ##p); } while(0)
38 #define TRACE(c, f, p...) do {} while(0)
41 static uns xml_row(struct xml_context *ctx);
43 /*** Error handling ***/
46 xml_throw(struct xml_context *ctx)
48 ASSERT(ctx->err_code && ctx->throw_buf);
49 longjmp(*(jmp_buf *)ctx->throw_buf, ctx->err_code);
53 xml_warn(struct xml_context *ctx, const char *format, ...)
58 va_start(args, format);
59 ctx->err_msg = stk_vprintf(format, args);
60 ctx->err_code = XML_ERR_WARN;
64 ctx->err_code = XML_ERR_OK;
69 xml_error(struct xml_context *ctx, const char *format, ...)
74 va_start(args, format);
75 ctx->err_msg = stk_vprintf(format, args);
76 ctx->err_code = XML_ERR_ERROR;
80 ctx->err_code = XML_ERR_OK;
85 xml_fatal(struct xml_context *ctx, const char *format, ...)
88 va_start(args, format);
89 ctx->err_msg = mp_vprintf(ctx->pool, format, args);
90 ctx->err_code = XML_ERR_FATAL;
91 ctx->state = XML_STATE_FATAL;
98 /*** Character categorization ***/
100 #include "obj/sherlock/xml/xml-ucat.h"
106 return 1U << xml_char_tab2[(c & 0xff) + xml_char_tab1[c >> 8]];
107 else if (likely(c < 0x110000))
108 return 1U << xml_char_tab3[c >> 16];
113 /*** Generic UTF decoding ***/
/* Fallback UTF-16LE reader called from bget_utf16_le_repl(): reads one 16-bit
 * little-endian word from `fb` and, for a high surrogate, a second word that is
 * combined with it into a single code point. */
116 bget_utf16_le_slow(struct fastbuf *fb, uns repl)
/* Test for a negative bpeekc() result before a whole word is read. */
118 if ((int)bpeekc(fb) < 0)
120 uns u = bgetw_le(fb), x, y;
/* Unsigned compare: x >= 0x800 means u lies outside the surrogate range 0xd800..0xdfff. */
123 if ((x = u - 0xd800) >= 0x800)
/* Malformed pair: u is itself a low surrogate (x >= 0x400), no further input is
 * available, or the following word is not a low surrogate (0xdc00..0xdfff). */
125 if (x >= 0x400 || (int)bpeekc(fb) < 0 || (y = bgetw_le(fb) - 0xdc00) >= 0x400)
/* x and y are the 10-bit payloads of the high and low surrogate. */
127 return 0x10000 + (x << 10) + y;
/* Fallback UTF-16BE reader called from bget_utf16_be_repl(): reads one 16-bit
 * big-endian word from `fb` and, for a high surrogate, a second word that is
 * combined with it into a single code point. */
131 bget_utf16_be_slow(struct fastbuf *fb, uns repl)
/* Test for a negative bpeekc() result before a whole word is read. */
133 if ((int)bpeekc(fb) < 0)
135 uns u = bgetw_be(fb), x, y;
/* Unsigned compare: x >= 0x800 means u lies outside the surrogate range 0xd800..0xdfff. */
138 if ((x = u - 0xd800) >= 0x800)
/* Malformed pair: u is itself a low surrogate (x >= 0x400), no further input is
 * available, or the following word is not a low surrogate (0xdc00..0xdfff). */
140 if (x >= 0x400 || (int)bpeekc(fb) < 0 || (y = bgetw_be(fb) - 0xdc00) >= 0x400)
/* x and y are the 10-bit payloads of the high and low surrogate. */
142 return 0x10000 + (x << 10) + y;
146 bget_utf16_le_repl(struct fastbuf *fb, uns repl)
149 if (bavailr(fb) >= 4)
151 fb->bptr = utf16_le_get_repl(fb->bptr, &u, repl);
155 return bget_utf16_le_slow(fb, repl);
159 bget_utf16_be_repl(struct fastbuf *fb, uns repl)
162 if (bavailr(fb) >= 4)
164 fb->bptr = utf16_be_get_repl(fb->bptr, &u, repl);
168 return bget_utf16_be_slow(fb, repl);
171 /*** Memory management ***/
174 xml_fatal_nested(struct xml_context *ctx)
176 xml_fatal(ctx, "Entity not nested correctly");
180 xml_inc(struct xml_context *ctx)
182 /* Called after the first character of a block */
188 xml_dec(struct xml_context *ctx)
190 /* Called after the last character of a block */
192 if (unlikely(!ctx->depth--))
193 xml_fatal_nested(ctx);
197 xml_push(struct xml_context *ctx)
200 struct xml_stack *s = mp_alloc(ctx->pool, sizeof(*s));
201 mp_save(ctx->pool, &s->saved_pool);
202 s->saved_flags = ctx->flags;
203 s->next = ctx->stack;
209 xml_pop(struct xml_context *ctx)
213 struct xml_stack *s = ctx->stack;
215 ctx->stack = s->next;
216 ctx->flags = s->saved_flags;
217 mp_restore(ctx->pool, &s->saved_pool);
220 #define XML_HASH_HDR_SIZE ALIGN_TO(sizeof(void *), CPU_STRUCT_ALIGN)
221 #define XML_HASH_GIVE_ALLOC struct HASH_PREFIX(table); \
222 static inline void *HASH_PREFIX(alloc)(struct HASH_PREFIX(table) *t, uns size) \
223 { return mp_alloc(*(void **)((void *)t - XML_HASH_HDR_SIZE), size); } \
224 static inline void HASH_PREFIX(free)(struct HASH_PREFIX(table) *t UNUSED, void *p UNUSED) {}
/* Allocates a zero-filled hash table of `size` bytes from `pool`, preceded by a
 * hidden header of XML_HASH_HDR_SIZE bytes. The header's first word holds the
 * pool pointer; the HASH_PREFIX(alloc) hook in XML_HASH_GIVE_ALLOC reads it back
 * from (table - XML_HASH_HDR_SIZE). Returns the address just past the header. */
227 xml_hash_new(struct mempool *pool, uns size)
229 void *tab = mp_alloc_zero(pool, size + XML_HASH_HDR_SIZE);
/* Stash the owning pool in front of the table. */
230 *(void **)tab = pool;
/* Arithmetic on void * (byte-sized steps) is a GNU C extension. */
231 return tab + XML_HASH_HDR_SIZE;
234 /*** Reading of document/external entities ***/
237 xml_eof(struct xml_context *ctx)
239 ctx->err_msg = "Unexpected EOF";
240 ctx->err_code = XML_ERR_EOF;
245 xml_add_char(u32 **bstop, uns c)
248 *(*bstop)++ = xml_char_cat(c);
/* Creates a new zero-initialized input source chained in front of the current
 * ctx->src, installs `flags` as the per-source flags and makes the new source's
 * buffer the current (initially empty) read window. */
251 static struct xml_source *
252 xml_push_source(struct xml_context *ctx, uns flags)
255 struct xml_source *src = ctx->src;
/* Remember how far the previous source's buffer has been consumed. */
258 src->bptr = ctx->bptr;
259 src->bstop = ctx->bstop;
261 src = mp_alloc_zero(ctx->pool, sizeof(*src));
262 src->next = ctx->src;
/* xml_pop_source() copies this value back into ctx->depth. */
263 src->saved_depth = ctx->depth;
/* Clear every XML_FLAG_SRC_* bit of the previous source, then apply the caller's flags. */
265 ctx->flags = (ctx->flags & ~(XML_FLAG_SRC_EOF | XML_FLAG_SRC_EXPECTED_DECL | XML_FLAG_SRC_NEW_LINE | XML_FLAG_SRC_SURROUND | XML_FLAG_SRC_DOCUMENT)) | flags;
266 ctx->bstop = ctx->bptr = src->buf;
/* A SURROUND source begins with a synthetic space (0x20). */
268 if (flags & XML_FLAG_SRC_SURROUND)
269 xml_add_char(&ctx->bstop, 0x20);
274 xml_pop_source(struct xml_context *ctx)
276 TRACE(ctx, "xml_pop_source");
277 if (unlikely(ctx->depth != 0))
278 xml_fatal_nested(ctx);
279 struct xml_source *src = ctx->src;
282 ctx->depth = src->saved_depth;
283 ctx->src = src = src->next;
286 ctx->bptr = src->bptr;
287 ctx->bstop = src->bstop;
294 static void xml_refill_utf8(struct xml_context *ctx);
297 xml_push_entity(struct xml_context *ctx, struct xml_dtd_ent *ent)
299 TRACE(ctx, "xml_push_entity");
300 uns cat1 = ctx->src->refill_cat1;
301 uns cat2 = ctx->src->refill_cat2;
302 struct xml_source *src = xml_push_source(ctx, 0);
303 src->refill_cat1 = cat1;
304 src->refill_cat2 = cat2;
305 if (ent->flags & XML_DTD_ENT_EXTERNAL)
306 xml_fatal(ctx, "External entities not implemented"); // FIXME
309 fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, ent->len, 0);
310 src->refill = xml_refill_utf8;
315 xml_set_source(struct xml_context *ctx, struct fastbuf *fb)
317 TRACE(ctx, "xml_set_source");
319 struct xml_source *src = xml_push_source(ctx, XML_FLAG_SRC_DOCUMENT | XML_FLAG_SRC_EXPECTED_DECL);
324 xml_error_restricted(struct xml_context *ctx, uns c)
326 xml_error(ctx, "Restricted char U+%04X", c);
327 return UNI_REPLACEMENT;
330 static void xml_parse_decl(struct xml_context *ctx);
332 #define REFILL(ctx, func, params...) \
333 struct xml_source *src = ctx->src; \
334 struct fastbuf *fb = src->fb; \
335 if (ctx->bptr == ctx->bstop) \
336 ctx->bptr = ctx->bstop = src->buf; \
337 uns f = ctx->flags, c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \
338 u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \
339 *last_0xd = (f & XML_FLAG_SRC_NEW_LINE) ? bstop : bend; \
342 c = func(fb, ##params); \
343 uns t = xml_char_cat(c); \
345 /* Typical branch */ \
346 *bstop++ = c, *bstop++ = t; \
350 /* XML 1.0: 0xA | 0xD | 0xD 0xA */ \
351 /* XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ \
353 last_0xd = bstop + 2; \
354 else if (c != 0x2028 && last_0xd == bstop) \
359 xml_add_char(&bstop, 0xa), row++; \
363 /* Used only in XML/TextDecl to switch the encoding */ \
364 *bstop++ = c, *bstop++ = t; \
367 else if ((int)c >= 0) \
368 /* Restricted character */ \
369 xml_add_char(&bstop, xml_error_restricted(ctx, c)); \
373 if (f & XML_FLAG_SRC_SURROUND) \
374 xml_add_char(&bstop, 0x20); \
375 f |= XML_FLAG_SRC_EOF; \
379 while (bstop < bend); \
380 ctx->flags = (last_0xd == bstop) ? f | XML_FLAG_SRC_NEW_LINE : f & ~XML_FLAG_SRC_NEW_LINE; \
381 ctx->bstop = bstop; \
385 xml_refill_utf8(struct xml_context *ctx)
387 // FIXME: report corrupted encoding
388 REFILL(ctx, bget_utf8);
392 xml_refill_utf16_le(struct xml_context *ctx)
394 REFILL(ctx, bget_utf16_le_repl, 0);
398 xml_refill_utf16_be(struct xml_context *ctx)
400 REFILL(ctx, bget_utf16_be_repl, 0);
/* Character fetcher handed to REFILL() by xml_refill_libcharset(): reads one byte
 * from `fb`; a negative bgetc() result is returned unchanged, any other byte is
 * translated through in_to_x[] and conv_x_to_ucs(). */
405 xml_refill_libcharset_bget(struct fastbuf *fb, unsigned short int *in_to_x)
/* The assignment must be parenthesized: `c = bgetc(fb) < 0` stored the 0/1 result
 * of the comparison in c, so EOF was never reported and every byte was looked up
 * as in_to_x[0] or in_to_x[1]. */
409 return (unlikely((c = bgetc(fb)) < 0)) ? c : (int)conv_x_to_ucs(in_to_x[c]);
413 xml_refill_libcharset(struct xml_context *ctx)
415 unsigned short int *in_to_x = ctx->src->refill_in_to_x;
416 REFILL(ctx, xml_refill_libcharset_bget, in_to_x);
423 xml_refill(struct xml_context *ctx)
427 if (ctx->flags & XML_FLAG_SRC_EOF)
429 else if (ctx->flags & XML_FLAG_SRC_EXPECTED_DECL)
433 ctx->src->refill(ctx);
434 TRACE(ctx, "refilled %u characters", (uns)((ctx->bstop - ctx->bptr) / 2));
437 while (ctx->bptr == ctx->bstop);
441 xml_peek_char(struct xml_context *ctx)
443 if (ctx->bptr == ctx->bstop)
449 xml_peek_cat(struct xml_context *ctx)
451 if (ctx->bptr == ctx->bstop)
457 xml_get_char(struct xml_context *ctx)
459 uns c = xml_peek_char(ctx);
465 xml_get_cat(struct xml_context *ctx)
467 uns c = xml_peek_cat(ctx);
473 xml_last_char(struct xml_context *ctx)
475 return ctx->bptr[-2];
479 xml_last_cat(struct xml_context *ctx)
481 return ctx->bptr[-1];
485 xml_skip_char(struct xml_context *ctx)
487 uns c = ctx->bptr[0];
493 xml_unget_char(struct xml_context *ctx)
495 return *(ctx->bptr -= 2);
499 xml_row(struct xml_context *ctx)
501 struct xml_source *src = ctx->src;
505 for (u32 *p = ctx->bstop; p != ctx->bptr; p -= 2)
506 if (p[-1] & src->refill_cat2)
511 /*** Basic parsing ***/
514 xml_fatal_expected(struct xml_context *ctx, uns c)
516 xml_fatal(ctx, "Expected '%c'", c);
520 xml_fatal_expected_white(struct xml_context *ctx)
522 xml_fatal(ctx, "Expected a white space");
526 xml_fatal_expected_quot(struct xml_context *ctx)
528 xml_fatal(ctx, "Expected a quotation mark");
532 xml_parse_white(struct xml_context *ctx, uns mandatory)
534 /* mandatory=1 -> S ::= (#x20 | #x9 | #xD | #xA)+
535 * mandatory=0 -> S? */
537 while (xml_peek_cat(ctx) & XML_CHAR_WHITE)
542 if (unlikely(mandatory && !cnt))
543 xml_fatal_expected_white(ctx);
548 xml_parse_char(struct xml_context *ctx, uns c)
550 /* Consumes a given Unicode character */
551 if (unlikely(c != xml_get_char(ctx)))
552 xml_fatal_expected(ctx, c);
556 xml_parse_seq(struct xml_context *ctx, const char *seq)
558 /* Consumes a given sequence of ASCII characters */
560 xml_parse_char(ctx, *seq++);
564 xml_parse_eq(struct xml_context *ctx)
566 /* Eq ::= S? '=' S? */
567 xml_parse_white(ctx, 0);
568 xml_parse_char(ctx, '=');
569 xml_parse_white(ctx, 0);
573 xml_parse_quote(struct xml_context *ctx)
576 uns c = xml_get_char(ctx);
577 if (unlikely(c != '\'' && c != '\"'))
578 xml_fatal_expected_quot(ctx);
582 /* Names and nmtokens */
585 xml_parse_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err)
587 char *p = mp_start_noalign(ctx->pool, 1);
588 if (unlikely(!(xml_peek_cat(ctx) & first_cat)))
589 xml_fatal(ctx, "%s", err);
592 p = mp_spread(ctx->pool, p, 5);
593 p = utf8_32_put(p, xml_skip_char(ctx));
595 while (xml_peek_cat(ctx) & next_cat);
597 return mp_end(ctx->pool, p);
601 xml_skip_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err)
603 if (unlikely(!(xml_get_cat(ctx) & first_cat)))
604 xml_fatal(ctx, "%s", err);
605 while (xml_peek_cat(ctx) & next_cat)
610 xml_parse_name(struct xml_context *ctx)
612 /* Name ::= NameStartChar (NameChar)* */
613 return xml_parse_string(ctx,
614 !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1,
615 !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1,
620 xml_skip_name(struct xml_context *ctx)
623 !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1,
624 !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1,
629 xml_parse_nmtoken(struct xml_context *ctx)
631 /* Nmtoken ::= (NameChar)+ */
632 uns cat = !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1;
633 return xml_parse_string(ctx, cat, cat, "Expected a nmtoken");
636 /* Simple literals */
639 xml_parse_system_literal(struct xml_context *ctx)
641 /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */
642 char *p = mp_start_noalign(ctx->pool, 1);
643 uns q = xml_parse_quote(ctx), c;
644 while ((c = xml_get_char(ctx)) != q)
646 p = mp_spread(ctx->pool, p, 5);
647 p = utf8_32_put(p, c);
650 return mp_end(ctx->pool, p);
654 xml_parse_pubid_literal(struct xml_context *ctx)
656 /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */
657 char *p = mp_start_noalign(ctx->pool, 1);
658 uns q = xml_parse_quote(ctx), c;
659 while ((c = xml_get_char(ctx)) != q)
661 if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID)))
662 xml_fatal(ctx, "Expected a pubid character");
663 p = mp_spread(ctx->pool, p, 2);
667 return mp_end(ctx->pool, p);
671 xml_parse_encoding_name(struct xml_context *ctx)
673 /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" */
674 char *p = mp_start_noalign(ctx->pool, 1);
675 uns q = xml_parse_quote(ctx);
676 if (unlikely(!(xml_get_cat(ctx) & XML_CHAR_ENC_SNAME)))
677 xml_fatal(ctx, "Invalid character in the encoding name");
680 p = mp_spread(ctx->pool, p, 2);
681 *p++ = xml_last_char(ctx);
682 if (xml_get_char(ctx) == q)
684 if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME)))
685 xml_fatal(ctx, "Invalid character in the encoding name");
688 return mp_end(ctx->pool, p);
691 /* Document/external entity header */
694 xml_init_cats(struct xml_context *ctx, uns mask)
696 if (!(ctx->flags & XML_FLAG_VERSION_1_1))
698 ctx->src->refill_cat1 = XML_CHAR_VALID_1_0 & ~XML_CHAR_NEW_LINE_1_0 & ~mask;
699 ctx->src->refill_cat2 = XML_CHAR_NEW_LINE_1_0;
703 ctx->src->refill_cat1 = XML_CHAR_UNRESTRICTED_1_1 & ~XML_CHAR_NEW_LINE_1_1 & ~mask;
704 ctx->src->refill_cat2 = XML_CHAR_NEW_LINE_1_1;
709 xml_init_charconv(struct xml_context *ctx, int cs)
712 struct xml_source *src = ctx->src;
713 TRACE(ctx, "wrapping charset %s", charset_name(cs));
715 struct conv_context conv;
716 conv_set_charset(&conv, cs, CONV_CHARSET_UTF8);
717 src->refill = xml_refill_libcharset;
718 src->refill_in_to_x = conv.in_to_x;
720 src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8);
721 // FIXME: memory leak
/* Handles the optional XMLDecl/TextDecl at the start of the current source:
 * clears XML_FLAG_SRC_EXPECTED_DECL, chooses a refill routine from
 * src->fb_encoding or from the first byte of the stream, parses the version,
 * encoding and standalone pseudo-attributes when a "<?xml" header is present,
 * and finally switches to the declared encoding and resets the character
 * categories. */
726 xml_parse_decl(struct xml_context *ctx)
728 TRACE(ctx, "xml_parse_decl");
729 struct xml_source *src = ctx->src;
730 ctx->flags &= ~XML_FLAG_SRC_EXPECTED_DECL;
732 /* Setup valid Unicode ranges and force the reader to abort refill() after each '>', where we can switch encoding or XML version */
733 xml_init_cats(ctx, XML_CHAR_GT);
735 /* Initialize the supplied charset (if any) or try to guess it */
736 char *expected_encoding = src->expected_encoding ? : src->fb_encoding;
737 src->refill = xml_refill_utf8;
738 int bom = bpeekc(src->fb);
740 ctx->flags |= XML_FLAG_SRC_EOF;
741 if (!src->fb_encoding)
744 src->refill = xml_refill_utf16_be;
745 else if (bom == 0xff)
746 src->refill = xml_refill_utf16_le;
750 int cs = find_charset_by_name(src->fb_encoding);
751 if (cs == CONV_CHARSET_UTF8)
755 xml_init_charconv(ctx, cs);
/* strcasecmp() returns 0 on a match, so the three name tests below need `!`;
 * without it they fired on a mismatch and the "Unknown encoding" branch could
 * never be reached. */
758 else if (!strcasecmp(src->fb_encoding, "UTF-16"))
760 src->refill = xml_refill_utf16_be;
762 src->refill = xml_refill_utf16_le;
763 if (!src->expected_encoding)
764 expected_encoding = (bom == 0xff) ? "UTF-16LE" : "UTF-16BE";
766 else if (!strcasecmp(src->fb_encoding, "UTF-16BE"))
767 src->refill = xml_refill_utf16_be;
768 else if (!strcasecmp(src->fb_encoding, "UTF-16LE"))
769 src->refill = xml_refill_utf16_le;
772 xml_error(ctx, "Unknown encoding '%s'", src->fb_encoding);
773 expected_encoding = NULL;
776 uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be;
777 if (bom > 0 && xml_peek_char(ctx) == 0xfeff)
780 xml_error(ctx, "Missing or corrupted BOM");
782 /* Look ahead for presence of XMLDecl or optional TextDecl */
783 if (!(ctx->flags & XML_FLAG_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf))
785 uns doc = ctx->flags & XML_FLAG_SRC_DOCUMENT;
786 u32 *bptr = ctx->bptr;
/* The buffer holds (character, category) pairs: even indices are characters,
 * bptr[11] is the category of the sixth character. */
787 uns have_decl = (12 <= ctx->bstop - ctx->bptr && (bptr[11] & XML_CHAR_WHITE) &&
788 bptr[0] == '<' && bptr[2] == '?' && (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L');
792 xml_fatal(ctx, "Missing or corrupted XML header");
/* Compare the local expected_encoding that was just NULL-checked;
 * src->expected_encoding may be NULL when the value came from src->fb_encoding. */
793 else if (expected_encoding && strcasecmp(expected_encoding, "UTF-8") && !utf16)
794 xml_error(ctx, "Missing or corrupted entity header");
797 ctx->bptr = bptr + 12;
798 xml_parse_white(ctx, 0);
800 /* Parse version string (mandatory in XMLDecl, optional in TextDecl) */
801 if (xml_peek_char(ctx) == 'v')
803 xml_parse_seq(ctx, "version");
805 char *version = xml_parse_pubid_literal(ctx);
806 TRACE(ctx, "version=%s", version);
808 if (!strcmp(version, "1.1"))
809 v = XML_FLAG_VERSION_1_1;
810 else if (strcmp(version, "1.0"))
812 xml_error(ctx, "Unknown XML version string '%s'", version);
817 ctx->version_str = version;
820 else if (v > (ctx->flags & XML_FLAG_VERSION_1_1))
821 xml_error(ctx, "XML 1.1 external entity included from XML 1.0 document");
822 if (!xml_parse_white(ctx, !doc))
827 xml_error(ctx, "Expected XML version");
828 ctx->version_str = "1.0";
831 /* Parse encoding string (optional in XMLDecl, mandatory in TextDecl) */
832 if (xml_peek_char(ctx) == 'e')
834 xml_parse_seq(ctx, "encoding");
836 src->decl_encoding = xml_parse_encoding_name(ctx);
837 TRACE(ctx, "encoding=%s", src->decl_encoding);
838 if (!xml_parse_white(ctx, 0))
842 xml_error(ctx, "Expected XML encoding");
844 /* Parse whether the document is standalone (optional in XMLDecl) */
845 if (doc && xml_peek_char(ctx) == 's')
847 xml_parse_seq(ctx, "standalone");
849 uns c = xml_parse_quote(ctx);
/* Intentional assignment: remembers the flag and selects which literal to consume. */
850 if (ctx->standalone = (xml_peek_char(ctx) == 'y'))
851 xml_parse_seq(ctx, "yes");
853 xml_parse_seq(ctx, "no");
854 xml_parse_char(ctx, c);
855 TRACE(ctx, "standalone=%d", ctx->standalone);
856 xml_parse_white(ctx, 0);
859 xml_parse_seq(ctx, "?>");
861 /* Switch to the final encoding */
862 if (src->decl_encoding)
864 int cs = find_charset_by_name(src->decl_encoding);
865 if (cs < 0 && !expected_encoding)
866 xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding);
867 else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8)
868 xml_init_charconv(ctx, cs);
869 else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 ||
870 !(!strcasecmp(src->decl_encoding, "UTF-16") ||
871 (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) ||
872 (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE")))))
873 xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding);
877 /* Update valid Unicode ranges */
878 xml_init_cats(ctx, 0);
881 /*** Document Type Definition (DTD) ***/
885 #define HASH_PREFIX(x) xml_dtd_notns_##x
886 #define HASH_NODE struct xml_dtd_notn
887 #define HASH_KEY_STRING name
888 #define HASH_ZERO_FILL
889 #define HASH_TABLE_DYNAMIC
890 #define HASH_WANT_FIND
891 #define HASH_WANT_LOOKUP
892 #define HASH_GIVE_ALLOC
893 #define HASH_TABLE_ALLOC
895 #include "lib/hashtable.h"
897 /* General entities */
899 #define HASH_PREFIX(x) xml_dtd_ents_##x
900 #define HASH_NODE struct xml_dtd_ent
901 #define HASH_KEY_STRING name
902 #define HASH_ZERO_FILL
903 #define HASH_TABLE_DYNAMIC
904 #define HASH_WANT_FIND
905 #define HASH_WANT_LOOKUP
906 #define HASH_GIVE_ALLOC
907 #define HASH_TABLE_ALLOC
909 #include "lib/hashtable.h"
911 static struct xml_dtd_ent *
912 xml_dtd_declare_trivial_gent(struct xml_context *ctx, char *name, char *text)
914 struct xml_dtd *dtd = ctx->dtd;
915 struct xml_dtd_ent *ent = xml_dtd_ents_lookup(dtd->tab_gents, name);
916 if (ent->flags & XML_DTD_ENT_DECLARED)
918 xml_warn(ctx, "Entity &%s; already declared", name);
921 slist_add_tail(&dtd->gents, &ent->n);
922 ent->flags = XML_DTD_ENT_DECLARED | XML_DTD_ENT_TRIVIAL;
924 ent->len = strlen(text);
929 xml_dtd_declare_default_gents(struct xml_context *ctx)
931 xml_dtd_declare_trivial_gent(ctx, "lt", "<");
932 xml_dtd_declare_trivial_gent(ctx, "gt", ">");
933 xml_dtd_declare_trivial_gent(ctx, "amp", "&");
934 xml_dtd_declare_trivial_gent(ctx, "apos", "'");
935 xml_dtd_declare_trivial_gent(ctx, "quot", "\"");
938 static struct xml_dtd_ent *
939 xml_dtd_find_gent(struct xml_context *ctx, char *name)
941 struct xml_dtd *dtd = ctx->dtd;
944 struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_gents, name);
945 return !ent ? NULL : (ent->flags & XML_DTD_ENT_DECLARED) ? ent : NULL;
949 #define ENT(n, t) ent_##n = { .name = #n, .text = t, .len = 1, .flags = XML_DTD_ENT_DECLARED | XML_DTD_ENT_TRIVIAL }
950 static struct xml_dtd_ent ENT(lt, "<"), ENT(gt, ">"), ENT(amp, "&"), ENT(apos, "'"), ENT(quot, "\"");
955 if (!strcmp(name, "lt"))
959 if (!strcmp(name, "gt"))
963 if (!strcmp(name, "amp"))
965 if (!strcmp(name, "apos"))
969 if (!strcmp(name, "quot"))
977 /* Parameter entities */
979 static struct xml_dtd_ent *
980 xml_dtd_find_pent(struct xml_context *ctx, char *name)
982 struct xml_dtd *dtd = ctx->dtd;
983 struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_pents, name);
984 return !ent ? NULL : (ent->flags & XML_DTD_ENT_DECLARED) ? ent : NULL;
989 #define HASH_PREFIX(x) xml_dtd_elems_##x
990 #define HASH_NODE struct xml_dtd_elem
991 #define HASH_KEY_STRING name
992 #define HASH_TABLE_DYNAMIC
993 #define HASH_ZERO_FILL
994 #define HASH_WANT_LOOKUP
995 #define HASH_GIVE_ALLOC
996 #define HASH_TABLE_ALLOC
998 #include "lib/hashtable.h"
1000 /* Element attributes */
1002 struct xml_dtd_attrs_table;
1005 xml_dtd_attrs_hash(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem, char *name)
1007 return hash_pointer(elem) ^ hash_string(name);
1011 xml_dtd_attrs_eq(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_elem *elem1, char *name1, struct xml_dtd_elem *elem2, char *name2)
1013 return (elem1 == elem2) && !strcmp(name1, name2);
1017 xml_dtd_attrs_init_key(struct xml_dtd_attrs_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_elem *elem, char *name)
1023 #define HASH_PREFIX(x) xml_dtd_attrs_##x
1024 #define HASH_NODE struct xml_dtd_attr
1025 #define HASH_ZERO_FILL
1026 #define HASH_TABLE_DYNAMIC
1027 #define HASH_KEY_COMPLEX(x) x elem, x name
1028 #define HASH_KEY_DECL struct xml_dtd_elem *elem, char *name
1029 #define HASH_GIVE_HASHFN
1030 #define HASH_GIVE_EQ
1031 #define HASH_GIVE_INIT_KEY
1032 #define HASH_WANT_FIND
1033 #define HASH_WANT_NEW
1034 #define HASH_GIVE_ALLOC
1035 #define HASH_TABLE_ALLOC
1037 #include "lib/hashtable.h"
1039 /* Enumerated attribute values */
1041 struct xml_dtd_evals_table;
1044 xml_dtd_evals_hash(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr, char *val)
1046 return hash_pointer(attr) ^ hash_string(val);
1050 xml_dtd_evals_eq(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_attr *attr1, char *val1, struct xml_dtd_attr *attr2, char *val2)
1052 return (attr1 == attr2) && !strcmp(val1, val2);
1056 xml_dtd_evals_init_key(struct xml_dtd_evals_table *tab UNUSED, struct xml_dtd_eval *eval, struct xml_dtd_attr *attr, char *val)
1062 #define HASH_PREFIX(x) xml_dtd_evals_##x
1063 #define HASH_NODE struct xml_dtd_eval
1064 #define HASH_TABLE_DYNAMIC
1065 #define HASH_KEY_COMPLEX(x) x attr, x val
1066 #define HASH_KEY_DECL struct xml_dtd_attr *attr, char *val
1067 #define HASH_GIVE_HASHFN
1068 #define HASH_GIVE_EQ
1069 #define HASH_GIVE_INIT_KEY
1070 #define HASH_WANT_FIND
1071 #define HASH_WANT_NEW
1072 #define HASH_GIVE_ALLOC
1073 #define HASH_TABLE_ALLOC
1075 #include "lib/hashtable.h"
1077 /* Enumerated attribute notations */
1079 struct xml_dtd_enotns_table;
1082 xml_dtd_enotns_hash(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn)
1084 return hash_pointer(attr) ^ hash_pointer(notn);
1088 xml_dtd_enotns_eq(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_attr *attr1, struct xml_dtd_notn *notn1, struct xml_dtd_attr *attr2, struct xml_dtd_notn *notn2)
1090 return (attr1 == attr2) && (notn1 == notn2);
1094 xml_dtd_enotns_init_key(struct xml_dtd_enotns_table *tab UNUSED, struct xml_dtd_enotn *enotn, struct xml_dtd_attr *attr, struct xml_dtd_notn *notn)
1100 #define HASH_PREFIX(x) xml_dtd_enotns_##x
1101 #define HASH_NODE struct xml_dtd_enotn
1102 #define HASH_TABLE_DYNAMIC
1103 #define HASH_KEY_COMPLEX(x) x attr, x notn
1104 #define HASH_KEY_DECL struct xml_dtd_attr *attr, struct xml_dtd_notn *notn
1105 #define HASH_GIVE_HASHFN
1106 #define HASH_GIVE_EQ
1107 #define HASH_GIVE_INIT_KEY
1108 #define HASH_WANT_FIND
1109 #define HASH_WANT_NEW
1110 #define HASH_GIVE_ALLOC
1111 #define HASH_TABLE_ALLOC
1113 #include "lib/hashtable.h"
1115 /* DTD initialization/cleanup */
1118 xml_dtd_init(struct xml_context *ctx)
1122 struct mempool *pool = mp_new(4096);
1123 struct xml_dtd *dtd = ctx->dtd = mp_alloc_zero(pool, sizeof(*ctx->dtd));
1125 xml_dtd_ents_init(dtd->tab_gents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table)));
1126 xml_dtd_ents_init(dtd->tab_pents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table)));
1127 xml_dtd_notns_init(dtd->tab_notns = xml_hash_new(pool, sizeof(struct xml_dtd_notns_table)));
1128 xml_dtd_elems_init(dtd->tab_elems = xml_hash_new(pool, sizeof(struct xml_dtd_elems_table)));
1129 xml_dtd_attrs_init(dtd->tab_attrs = xml_hash_new(pool, sizeof(struct xml_dtd_attrs_table)));
1130 xml_dtd_evals_init(dtd->tab_evals = xml_hash_new(pool, sizeof(struct xml_dtd_evals_table)));
1131 xml_dtd_enotns_init(dtd->tab_enotns = xml_hash_new(pool, sizeof(struct xml_dtd_enotns_table)));
1132 xml_dtd_declare_default_gents(ctx);
1136 xml_dtd_cleanup(struct xml_context *ctx)
1140 mp_delete(ctx->dtd->pool);
1145 xml_dtd_finish(struct xml_context *ctx)
1152 /*** Parsing functions ***/
1157 xml_push_comment(struct xml_context *ctx)
1159 /* Parse a comment to ctx->value:
1160 * Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
1161 * Already parsed: '<!-' */
1162 struct fastbuf *out = ctx->value;
1164 xml_parse_char(ctx, '-');
1167 if ((c = xml_get_char(ctx)) == '-')
1168 if ((c = xml_get_char(ctx)) == '-')
1172 bput_utf8_32(out, c);
1174 xml_parse_char(ctx, '>');
1178 ctx->h_comment(ctx);
/* Releases the comment text that xml_push_comment() collected in ctx->value. */
1182 xml_pop_comment(struct xml_context *ctx)
/* Use fbgrow_reset(), as xml_pop_pi() does: the shared value buffer has to be
 * emptied for the next writer, not merely rewound for reading. */
1184 fbgrow_reset(ctx->value);
1188 xml_skip_comment(struct xml_context *ctx)
1190 xml_parse_char(ctx, '-');
1191 while (xml_get_char(ctx) != '-' || xml_get_char(ctx) != '-');
1192 xml_parse_char(ctx, '>');
1196 /* Processing instructions */
1199 xml_push_pi(struct xml_context *ctx)
1201 /* Parses a PI to ctx->value and ctx->name:
1202 * PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
1203 * PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
1204 * Already parsed: '<?' */
1206 ctx->name = xml_parse_name(ctx);
1207 if (unlikely(!strcasecmp(ctx->name, "xml")))
1208 xml_fatal(ctx, "Reserved PI target");
1209 struct fastbuf *out = ctx->value;
1210 if (xml_parse_white(ctx, 0))
1211 xml_parse_seq(ctx, "?>");
1217 if ((c = xml_get_char(ctx)) == '?')
1218 if (xml_get_char(ctx) == '>')
1222 xml_unget_char(ctx);
1226 bput_utf8_32(out, c);
1236 xml_pop_pi(struct xml_context *ctx)
1238 fbgrow_reset(ctx->value);
1242 xml_skip_pi(struct xml_context *ctx)
1244 if (ctx->flags & XML_FLAG_VALIDATING)
1247 if (unlikely(!strcasecmp(xml_parse_name(ctx), "xml")))
1248 xml_fatal(ctx, "Reserved PI target");
1250 if (!xml_parse_white(ctx, 0))
1252 xml_parse_seq(ctx, "?>");
1258 if (xml_get_char(ctx) == '?')
1259 if (xml_get_char(ctx) == '>')
1262 xml_unget_char(ctx);
1266 /* Character references */
1269 xml_parse_char_ref(struct xml_context *ctx)
1271 TRACE(ctx, "parse_char_ref");
1272 /* CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
1273 * Already parsed: '&#' */
1275 if (xml_get_char(ctx) == 'x')
1277 if (!(xml_get_cat(ctx) & XML_CHAR_XDIGIT))
1279 xml_error(ctx, "Expected a hexadecimal value of character reference");
1284 v = (v << 4) + Cxvalue(xml_last_char(ctx));
1286 while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_XDIGIT));
1290 if (!(xml_last_cat(ctx) & XML_CHAR_DIGIT))
1292 xml_error(ctx, "Expected a numeric value of character reference");
1297 v = v * 10 + xml_last_char(ctx) - '0';
1299 while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT));
1301 uns cat = xml_char_cat(v);
1302 if (!(cat & XML_CHAR_UNRESTRICTED_1_1) && ((ctx->flags & XML_FLAG_VERSION_1_1) || !(cat & XML_CHAR_VALID_1_0)))
1304 xml_error(ctx, "Character reference out of range");
1307 if (xml_last_char(ctx) == ';')
1312 xml_error(ctx, "Expected ';'");
1314 while (xml_last_char(ctx) != ';')
1317 return UNI_REPLACEMENT;
1320 /* References to general entities */
1323 xml_parse_ge_ref(struct xml_context *ctx, struct fastbuf *out)
1325 /* Reference ::= EntityRef | CharRef
1326 * EntityRef ::= '&' Name ';'
1327 * Already parsed: '&' */
1328 if (xml_peek_char(ctx) == '#')
1331 uns c = xml_parse_char_ref(ctx);
1332 bput_utf8_32(out, c);
1336 struct mempool_state state;
1337 mp_save(ctx->pool, &state);
1338 char *name = xml_parse_name(ctx);
1339 xml_parse_char(ctx, ';');
1340 struct xml_dtd_ent *ent = xml_dtd_find_gent(ctx, name);
1343 xml_error(ctx, "Unknown entity &%s;", name);
1348 else if (ent->flags & XML_DTD_ENT_TRIVIAL)
1350 TRACE(ctx, "Trivial entity &%s;", name);
1351 bwrite(out, ent->text, ent->len);
1355 TRACE(ctx, "Pushed entity &%s;", name);
1356 mp_restore(ctx->pool, &state);
1358 xml_push_entity(ctx, ent);
1361 mp_restore(ctx->pool, &state);
1366 /* References to parameter entities */
1369 xml_parse_pe_ref(struct xml_context *ctx)
1371 /* PEReference ::= '%' Name ';'
1372 * Already parsed: '%' */
1373 struct mempool_state state;
1374 mp_save(ctx->pool, &state);
1375 char *name = xml_parse_name(ctx);
1376 xml_parse_char(ctx, ';');
1377 struct xml_dtd_ent *ent = xml_dtd_find_pent(ctx, name);
1379 xml_error(ctx, "Unknown entity %%%s;", name);
1382 TRACE(ctx, "Pushed entity %%%s;", name);
1383 mp_restore(ctx->pool, &state);
1385 xml_push_entity(ctx, ent);
1388 mp_restore(ctx->pool, &state);
/* Processes a run of parameter-entity references inside the DTD. Both callers
 * (xml_parse_dtd_white() and xml_check_dtd_pe()) enter only when the next
 * character is '%'. */
1393 xml_parse_dtd_pe(struct xml_context *ctx)
1399 while (xml_peek_cat(ctx) & XML_CHAR_WHITE)
1401 xml_parse_pe_ref(ctx);
/* Keep going only while another reference follows; the former `!= '%'` test
 * repeated the loop on every character that was NOT a reference start. */
1403 while (xml_peek_char(ctx) == '%');
1407 xml_parse_dtd_white(struct xml_context *ctx, uns mandatory)
1409 /* Whitespace or parameter entity */
1411 while (xml_peek_cat(ctx) & XML_CHAR_WHITE)
1416 if (xml_peek_char(ctx) == '%')
1418 xml_parse_dtd_pe(ctx);
1421 else if (unlikely(mandatory && !cnt))
1422 xml_fatal_expected_white(ctx);
1427 xml_check_dtd_pe(struct xml_context *ctx)
1429 if (xml_peek_char(ctx) == '%')
1431 xml_parse_dtd_pe(ctx);
1440 xml_parse_external_id(struct xml_context *ctx, struct xml_ext_id *eid, uns allow_public, uns dtd)
1442 bzero(eid, sizeof(*eid));
1444 xml_check_dtd_pe(ctx);
1445 uns c = xml_peek_char(ctx);
1448 xml_parse_seq(ctx, "SYSTEM");
1450 xml_parse_dtd_white(ctx, 1);
1452 xml_parse_white(ctx, 1);
1453 eid->system_id = xml_parse_system_literal(ctx);
1457 xml_parse_seq(ctx, "PUBLIC");
1459 xml_parse_dtd_white(ctx, 1);
1461 xml_parse_white(ctx, 1);
1462 eid->public_id = xml_parse_pubid_literal(ctx);
1463 if (dtd ? xml_parse_dtd_white(ctx, 0) : xml_parse_white(ctx, 0))
1464 if ((c = xml_peek_char(ctx)) == '\'' || c == '"' || !allow_public)
1465 eid->system_id = xml_parse_system_literal(ctx);
1468 xml_fatal(ctx, "Expected an external ID");
1471 /* DTD: Notation declaration */
/* Parses one notation declaration of the DTD: reads the notation name and its
 * external/public identifier, warns when the notation was declared before, and
 * flags the entry as declared and appends it to dtd->notns. */
1474 xml_parse_notation_decl(struct xml_context *ctx)
1476 /* NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>'
1477 * Already parsed: '<!NOTATION' */
1478 TRACE(ctx, "parse_notation_decl");
1479 struct xml_dtd *dtd = ctx->dtd;
1480 xml_parse_dtd_white(ctx, 1);
1482 struct xml_dtd_notn *notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx));
1483 xml_parse_dtd_white(ctx, 1);
1484 struct xml_ext_id eid;
/* allow_public = 1: a bare PublicID without a system literal is accepted here. */
1485 xml_parse_external_id(ctx, &eid, 1, 1);
1486 xml_parse_dtd_white(ctx, 0);
1487 xml_parse_char(ctx, '>');
1489 if (notn->flags & XML_DTD_NOTN_DECLARED)
1490 xml_warn(ctx, "Notation %s already declared", notn->name);
1493 notn->flags = XML_DTD_NOTN_DECLARED;
1495 slist_add_tail(&dtd->notns, &notn->n);
/* Parse the rest of an entity declaration.
 *
 * A '%' read after the first whitespace sets XML_DTD_ENT_PARAMETER in flags; flags then
 * selects dtd->tab_pents / dtd->pents over dtd->tab_gents / dtd->gents for the lookup and
 * for the list the entity is appended to. An entity that already carries
 * XML_DTD_ENT_DECLARED is reported through xml_fatal().
 *
 * If the next character is a quote, the value is an internal entity: characters are
 * collected in ctx->value up to the matching quote, character references go through
 * xml_parse_char_ref(), and general entity references are bypassed by writing their name
 * to the buffer. The text is then copied into ctx->pool with mp_memdup(); ent->len is the
 * buffer length minus one and len + 1 bytes are copied (presumably the extra byte is a
 * terminating zero appended beforehand -- confirm).
 *
 * Otherwise an external ID is parsed (allow_public = 0, dtd = 0) and the entity is flagged
 * XML_DTD_ENT_EXTERNAL; the NDATA form additionally sets XML_DTD_ENT_UNPARSED and looks the
 * notation name up in dtd->tab_notns.
 *
 * NOTE(review): the NDATA branch is commented as "General external unparsed entity", yet
 * the condition in front of it only lets it run when flags is non-zero, which per the
 * assignment above means a parameter entity -- confirm the intended condition.
 * NOTE(review): the character fetched by xml_get_char() in that condition does not appear
 * to be pushed back before xml_parse_seq(ctx, "NDATA") -- confirm. */
1501 xml_parse_entity_decl(struct xml_context *ctx)
1503 /* Already parsed: '<!ENTITY' */
1504 TRACE(ctx, "parse_entity_decl");
1505 struct xml_dtd *dtd = ctx->dtd;
1506 xml_parse_dtd_white(ctx, 1);
1508 uns flags = (xml_get_char(ctx) == '%') ? XML_DTD_ENT_PARAMETER : 0;
1510 xml_parse_dtd_white(ctx, 1);
1512 xml_unget_char(ctx);
1514 struct xml_dtd_ent *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_gents, xml_parse_name(ctx));
1515 slist *list = flags ? &dtd->pents : &dtd->gents;
1516 xml_parse_white(ctx, 1);
1517 if (ent->flags & XML_DTD_ENT_DECLARED)
1519 xml_fatal(ctx, "Entity &%s; already declared, skipping not implemented", ent->name);
1520 // FIXME: should be only warning
1523 uns c, sep = xml_get_char(ctx);
1524 if (sep == '\'' || sep == '"')
1527 * EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&'] | PEReference | Reference)* "'" */
1528 struct fastbuf *out = ctx->value;
1531 if ((c = xml_get_char(ctx)) == sep)
1537 //xml_parse_parameter_ref(ctx);
1540 bput_utf8_32(out, c);
1541 else if ((c = xml_get_char(ctx)) == '#')
1542 c = xml_parse_char_ref(ctx);
1545 /* Bypass references to general entities */
1548 xml_unget_char(ctx);
1549 bputs(out, xml_parse_name(ctx));
1550 xml_parse_char(ctx, ';');
1557 slist_add_tail(list, &ent->n);
1558 ent->flags = flags | XML_DTD_ENT_DECLARED;
1559 ent->len = out->bstop - out->bptr - 1;
1560 ent->text = mp_memdup(ctx->pool, out->bptr, ent->len + 1);
1565 /* External entity */
1566 struct xml_ext_id eid;
1567 struct xml_dtd_notn *notn = NULL;
1568 xml_parse_external_id(ctx, &eid, 0, 0);
1569 if (!xml_parse_white(ctx, 0) || !flags)
1570 xml_parse_char(ctx, '>');
1571 else if (xml_get_char(ctx) != '>')
1573 /* General external unparsed entity */
1574 flags |= XML_DTD_ENT_UNPARSED;
1575 xml_parse_seq(ctx, "NDATA");
1576 xml_parse_white(ctx, 1);
1577 notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx));
1579 slist_add_tail(list, &ent->n);
1580 ent->flags = flags | XML_DTD_ENT_DECLARED | XML_DTD_ENT_EXTERNAL;
1584 xml_parse_dtd_white(ctx, 0);
1585 xml_parse_char(ctx, '>');
1589 /* DTD: Internal subset */
/* Parse the internal DTD subset. After a '!' the following characters select the
 * declaration: a comment is pushed and immediately popped, "OTATION" leads into
 * xml_parse_notation_decl() and "TITY" into xml_parse_entity_decl(); "EMENT" and
 * "TTLIST" are matched with xml_parse_seq() as well. One branch hands over to
 * xml_parse_dtd_pe(). Unrecognised input jumps to invalid_markup, which presumably
 * labels the xml_fatal() call at the end of the function. */
1592 xml_parse_internal_subset(struct xml_context *ctx)
1594 // FIXME: comments/pi have no parent
1595 /* '[' intSubset ']'
1596 * intSubset ::= (markupdecl | DeclSep)*
1597 * Already parsed: ']' */
/* NOTE(review): "Already parsed" presumably means the opening '[' rather than ']';
 * xml_next() peeks for '[' before calling this function -- confirm. */
1600 xml_parse_white(ctx, 0);
1601 uns c = xml_get_char(ctx);
1604 if ((c = xml_get_char(ctx)) == '!')
1605 switch (c = xml_get_char(ctx))
1608 xml_push_comment(ctx);
1609 xml_pop_comment(ctx);
1612 xml_parse_seq(ctx, "OTATION");
1613 xml_parse_notation_decl(ctx);
1616 if ((c = xml_get_char(ctx)) == 'N')
1618 xml_parse_seq(ctx, "TITY");
1619 xml_parse_entity_decl(ctx);
1623 xml_parse_seq(ctx, "EMENT");
1627 goto invalid_markup;
1630 xml_parse_seq(ctx, "TTLIST");
1634 goto invalid_markup;
1642 goto invalid_markup;
1644 xml_parse_dtd_pe(ctx);
1648 goto invalid_markup;
1654 xml_fatal(ctx, "Invalid markup in the internal subset");
1657 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
/* Parse a CDATA section into ctx->chars: match "CDATA[", then read characters and
 * write them to the buffer as UTF-8. The nested tests for ']', ']' and '>' look for
 * the terminating sequence. */
1660 xml_parse_cdata(struct xml_context *ctx)
1662 struct fastbuf *out = ctx->chars;
1663 xml_parse_seq(ctx, "CDATA[");
1667 if ((c = xml_get_char(ctx)) == ']')
1669 if ((c = xml_get_char(ctx)) == ']')
1670 if ((c = xml_get_char(ctx)) == '>')
1676 bput_utf8_32(out, c);
/* Skip a CDATA section; delegates to xml_parse_cdata(), which also writes the
 * section's characters to ctx->chars. */
1681 xml_skip_cdata(struct xml_context *ctx)
1683 xml_parse_cdata(ctx);
/* Collect character data into ctx->chars until a '<' is read. One branch resolves a
 * reference through xml_parse_ge_ref(); other characters are written as UTF-8.
 * xml_unget_char() then pushes a character back (presumably the '<' that ended the
 * loop -- confirm). */
1687 xml_parse_chars(struct xml_context *ctx)
1689 TRACE(ctx, "parse_chars");
1690 struct fastbuf *out = ctx->chars;
1692 while ((c = xml_get_char(ctx)) != '<')
1696 xml_parse_ge_ref(ctx, out);
1699 bput_utf8_32(out, c);
1700 xml_unget_char(ctx);
1703 /*----------------------------------------------*/
/* Forward declaration so the hash callbacks below can take a table pointer. */
1705 struct xml_attrs_table;
/* Hash callback for the attribute table: combines the hash of the element pointer
 * with the hash of the attribute name by XOR. The table argument is unused. */
1708 xml_attrs_hash(struct xml_attrs_table *t UNUSED, struct xml_elem *e, char *n)
1710 return hash_pointer(e) ^ hash_string(n);
/* Equality callback for the attribute table: two keys match when they refer to the
 * same element (pointer comparison) and strcmp() reports equal names. */
1714 xml_attrs_eq(struct xml_attrs_table *t UNUSED, struct xml_elem *e1, char *n1, struct xml_elem *e2, char *n2)
1716 return (e1 == e2) && !strcmp(n1, n2);
/* Key-initialisation callback for the attribute table: appends the new attribute
 * node to the tail of the owning element's attrs list. */
1720 xml_attrs_init_key(struct xml_attrs_table *t UNUSED, struct xml_attr *a, struct xml_elem *e, char *name)
1725 slist_add_tail(&e->attrs, &a->n);
/* Parameters for the hash table template in lib/hashtable.h. Generated symbols get
 * the xml_attrs_ prefix, nodes are struct xml_attr and the key is the pair
 * (element, attribute name). The HASH_GIVE_* switches presumably tell the template
 * that this file supplies the corresponding callbacks (xml_attrs_eq, xml_attrs_hash,
 * xml_attrs_init_key above), and the HASH_WANT_* switches presumably request the
 * generated operations; this file calls xml_attrs_init(), xml_attrs_cleanup(),
 * xml_attrs_lookup() and xml_attrs_remove(). */
1728 #define HASH_PREFIX(x) xml_attrs_##x
1729 #define HASH_NODE struct xml_attr
1730 #define HASH_KEY_COMPLEX(x) x elem, x name
1731 #define HASH_KEY_DECL struct xml_elem *elem, char *name
1732 #define HASH_TABLE_DYNAMIC
1733 #define HASH_GIVE_EQ
1734 #define HASH_GIVE_HASHFN
1735 #define HASH_GIVE_INIT_KEY
1736 #define HASH_WANT_CLEANUP
1737 #define HASH_WANT_REMOVE
1738 #define HASH_WANT_LOOKUP
1739 #define HASH_WANT_FIND
1740 #define HASH_GIVE_ALLOC
1742 #include "lib/hashtable.h"
/* Initialise a parser context: zero the whole structure, create the memory pool with
 * mp_new(65536), create the two growing buffers ctx->chars and ctx->value with
 * fbgrow_create(4096), and allocate and initialise the attribute hash table, whose
 * storage comes from xml_hash_new() on the pool. */
1745 xml_init(struct xml_context *ctx)
1747 bzero(ctx, sizeof(*ctx));
1748 ctx->pool = mp_new(65536);
1749 ctx->chars = fbgrow_create(4096);
1750 ctx->value = fbgrow_create(4096);
1752 xml_attrs_init(ctx->tab_attrs = xml_hash_new(ctx->pool, sizeof(struct xml_attrs_table)));
/* Release a parser context: clean up the attribute hash table and the DTD, then
 * delete the memory pool. */
1756 xml_cleanup(struct xml_context *ctx)
1758 xml_attrs_cleanup(ctx->tab_attrs);
1759 xml_dtd_cleanup(ctx);
1762 mp_delete(ctx->pool);
/* Parse one attribute of the current element (ctx->elem). The attribute name is
 * looked up in ctx->tab_attrs under the key (element, name) and the value is read
 * with xml_parse_system_literal(). A duplicate is reported through xml_error() with
 * "Attribute is not unique". */
1766 xml_parse_attr(struct xml_context *ctx)
1768 // FIXME: memory management, dtd, literal
1769 TRACE(ctx, "parse_attr");
1770 struct xml_elem *e = ctx->elem;
1771 char *name = xml_parse_name(ctx);
1772 struct xml_attr *a = xml_attrs_lookup(ctx->tab_attrs, e, name);
1774 char *val =xml_parse_system_literal(ctx);
1776 xml_error(ctx, "Attribute is not unique");
/* Parse a start tag. A zeroed struct xml_elem is allocated from ctx->pool, its parent
 * is set to the current ctx->elem, and its name is parsed. The visible code appends
 * the node to the parent's sons list and reports a name that differs from
 * ctx->document_type through xml_error(). Attributes are then parsed with
 * xml_parse_attr(); one branch requires '>' and sets XML_FLAG_EMPTY_ELEM. Finally
 * the h_element_start hook is called if it is set. */
1782 xml_parse_stag(struct xml_context *ctx)
1785 TRACE(ctx, "parse_stag");
1787 struct xml_elem *e = mp_alloc_zero(ctx->pool, sizeof(*e));
1788 struct xml_elem *parent = ctx->elem;
1789 clist_init(&e->sons);
1790 e->node.parent = (void *)parent;
1792 e->name = xml_parse_name(ctx);
1794 clist_add_tail(&parent->sons, &e->node.n);
1798 if (ctx->document_type && strcmp(e->name, ctx->document_type))
1799 xml_error(ctx, "The root element does not match the document type");
1803 uns white = xml_parse_white(ctx, 0);
1804 uns c = xml_get_char(ctx);
1807 xml_parse_char(ctx, '>');
1808 ctx->flags |= XML_FLAG_EMPTY_ELEM;
1814 xml_fatal_expected_white(ctx);
1815 xml_unget_char(ctx);
1816 xml_parse_attr(ctx);
1818 if (ctx->h_element_start)
1819 ctx->h_element_start(ctx);
/* Parse an end tag: read the name, raise a fatal error if it differs from the name of
 * the current element (ctx->elem), then accept optional whitespace and the closing
 * '>'. */
1823 xml_parse_etag(struct xml_context *ctx)
1825 TRACE(ctx, "parse_etag");
1826 struct xml_elem *e = ctx->elem;
1828 char *name = xml_parse_name(ctx);
1829 if (strcmp(name, e->name))
1830 xml_fatal(ctx, "Invalid ETag, expected '%s'", e->name);
1831 xml_parse_white(ctx, 0);
1832 xml_parse_char(ctx, '>');
/* Leave the current element. The h_element_end hook runs first if it is set. The
 * XML_DOM_FREE test guards the code that unlinks the element from its list, removes
 * its attributes from ctx->tab_attrs and walks its sons list; for each element son
 * the attributes are removed as well and the son's own sons are inserted after it,
 * so the walk also reaches nested descendants. Afterwards ctx->node is set to the
 * parent and xml_pop() is called. */
1837 xml_pop_element(struct xml_context *ctx)
1839 TRACE(ctx, "pop_element");
1840 if (ctx->h_element_end)
1841 ctx->h_element_end(ctx);
1842 struct xml_elem *e = ctx->elem;
1843 if (ctx->flags & XML_DOM_FREE)
1846 clist_remove(&e->node.n);
1849 SLIST_FOR_EACH(struct xml_attr *, a, e->attrs)
1850 xml_attrs_remove(ctx->tab_attrs, a);
/* The assignment in the loop condition fetches the list head; the loop ends once the
 * sons list yields no node. */
1852 while (n = clist_head(&e->sons))
1854 if (n->type == XML_NODE_ELEM)
1856 SLIST_FOR_EACH(struct xml_attr *, a, ((struct xml_elem *)n)->attrs)
1857 xml_attrs_remove(ctx->tab_attrs, a);
1858 clist_insert_list_after(&((struct xml_elem *)n)->sons, &n->n);
1860 clist_remove(&n->n);
1863 ctx->node = e->node.parent;
1864 xml_pop(ctx); // FIXME: memory management without XML_DOM_FREE
/* NOTE(review): the two lines below use struct xml_attribute, xml_attribute_remove()
 * and ctx->attribute_table, which differ from the xml_attr / xml_attrs_* names used
 * above; presumably this is disabled legacy code -- confirm. */
1867 for (struct xml_attribute *a = e->attrs; a; a = a->next)
1868 xml_attribute_remove(ctx->attribute_table, a);
/* Parse an element declaration: "<!ELEMENT", mandatory whitespace, a name, mandatory
 * whitespace, a content specification, optional whitespace and '>'. The content
 * specification is selected by its first character: the keyword remainders "MPTY"
 * and "NY" are matched with xml_parse_seq(), a '#' introduces "PCDATA", and the
 * other branches deal with '(' groups, names and the '?', '*', '+' suffixes.
 * The results of xml_parse_name() are discarded here, so nothing is recorded for
 * the declared element. Input that fits no branch ends in xml_fatal() with
 * "Expected element content specification". */
1873 xml_parse_element_decl(struct xml_context *ctx)
1877 xml_parse_seq(ctx, "<!ELEMENT");
1878 xml_parse_white(ctx, 1);
1879 xml_parse_name(ctx);
1880 xml_parse_white(ctx, 1);
1882 uns c = xml_get_char(ctx);
1885 xml_parse_seq(ctx, "MPTY");
1890 xml_parse_seq(ctx, "NY");
1895 xml_parse_white(ctx, 0);
1896 if (xml_get_char(ctx) == '#')
1898 xml_parse_seq(ctx, "PCDATA");
1901 xml_parse_white(ctx, 0);
1902 if ((c = xml_get_char(ctx)) == ')')
1905 xml_fatal_expected(ctx, ')');
1906 xml_parse_white(ctx, 0);
1907 xml_parse_name(ctx);
1913 xml_unget_char(ctx);
1917 xml_parse_white(ctx, 0);
1918 if ((c = xml_get_char(ctx)) == '(')
1924 if ((c = xml_get_char(ctx)) == '?' || c == '*' || c == '+')
1928 xml_unget_char(ctx);
1940 xml_unget_char(ctx);
1941 xml_parse_name(ctx);
1947 xml_fatal(ctx, "Expected element content specification");
1949 xml_parse_white(ctx, 0);
1950 xml_parse_char(ctx, '>');
/* Parse an attribute-list declaration. The element is looked up in
 * ctx->dtd->tab_elems and marked with attlist_declared = 1. Each attribute
 * definition is then read in a loop that runs while whitespace is found and the next
 * character is not '>':
 *   - the attribute name is searched in ctx->dtd->tab_attrs; the visible code warns
 *     about a duplicate definition and creates a new entry with xml_dtd_attrs_new()
 *   - a '(' starts an enumeration (XML_ATTR_ENUM) of name tokens separated by '|',
 *     stored in tab_evals; a repeated value is reported through xml_error()
 *   - otherwise a type keyword is parsed and compared against the known names;
 *     NOTATION is followed by a parenthesised, '|'-separated list of notation names
 *     stored in tab_enotns; an unknown keyword is fatal
 *   - the default declaration is "#REQUIRED", "#IMPLIED", "#FIXED" or none; unless it
 *     is REQUIRED or IMPLIED a literal is read with xml_parse_system_literal() */
1956 xml_parse_attr_list_decl(struct xml_context *ctx)
1958 /* AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
1959 * AttDef ::= S Name S AttType S DefaultDecl */
1960 xml_parse_seq(ctx, "ATTLIST");
1961 xml_parse_white(ctx, 1);
1962 struct xml_dtd_elem *e = xml_dtd_elems_lookup(ctx->dtd->tab_elems, xml_parse_name(ctx));
1963 e->attlist_declared = 1;
1965 while (xml_parse_white(ctx, 0) && xml_get_char(ctx) != '>')
1967 xml_unget_char(ctx);
1968 char *name = xml_parse_name(ctx);
1969 struct xml_dtd_attr *a = xml_dtd_attrs_find(ctx->dtd->tab_attrs, e, name);
1973 xml_warn(ctx, "Duplicate attribute definition");
1977 a = xml_dtd_attrs_new(ctx->dtd->tab_attrs, e, name);
1978 xml_parse_white(ctx, 1);
1979 if (xml_get_char(ctx) == '(')
1982 a->type = XML_ATTR_ENUM;
1985 xml_parse_white(ctx, 0);
1986 char *value = xml_parse_nmtoken(ctx);
1988 if (xml_dtd_evals_find(ctx->dtd->tab_evals, a, value))
1989 xml_error(ctx, "Duplicate enumeration value");
1991 xml_dtd_evals_new(ctx->dtd->tab_evals, a, value);
1992 xml_parse_white(ctx, 0);
1994 while (xml_get_char(ctx) == '|');
1995 xml_unget_char(ctx);
1996 xml_parse_char(ctx, ')');
2000 xml_unget_char(ctx);
2001 char *type = xml_parse_name(ctx);
2002 enum xml_dtd_attribute_type t;
2003 if (!strcmp(type, "CDATA"))
2005 else if (!strcmp(type, "ID"))
2007 else if (!strcmp(type, "IDREF"))
2009 else if (!strcmp(type, "IDREFS"))
2010 t = XML_ATTR_IDREFS;
2011 else if (!strcmp(type, "ENTITY"))
2012 t = XML_ATTR_ENTITY;
2013 else if (!strcmp(type, "ENTITIES"))
2014 t = XML_ATTR_ENTITIES;
2015 else if (!strcmp(type, "NMTOKEN"))
2016 t = XML_ATTR_NMTOKEN;
2017 else if (!strcmp(type, "NMTOKENS"))
2018 t = XML_ATTR_NMTOKENS;
2019 else if (!strcmp(type, "NOTATION"))
2021 t = XML_ATTR_NOTATION;
2022 xml_parse_white(ctx, 1);
2023 xml_parse_char(ctx, '(');
2026 xml_parse_white(ctx, 0);
2027 struct xml_dtd_notn *n = xml_dtd_notns_lookup(ctx->dtd->tab_notns, xml_parse_name(ctx));
2029 if (xml_dtd_enotns_find(ctx->dtd->tab_enotns, a, n))
2030 xml_error(ctx, "Duplicate enumerated notation");
2032 xml_dtd_enotns_new(ctx->dtd->tab_enotns, a, n);
2033 xml_parse_white(ctx, 0);
2035 while (xml_get_char(ctx) == '|');
2036 xml_unget_char(ctx);
2037 xml_parse_char(ctx, ')');
2040 xml_fatal(ctx, "Unknown attribute type");
2044 xml_parse_white(ctx, 1);
2045 enum xml_dtd_attribute_default def = XML_ATTR_NONE;
2046 if (xml_get_char(ctx) == '#')
2047 switch (xml_get_char(ctx))
2050 xml_parse_seq(ctx, "EQUIRED");
2051 def = XML_ATTR_REQUIRED;
2054 xml_parse_seq(ctx, "MPLIED");
2055 def = XML_ATTR_IMPLIED;
2058 xml_parse_seq(ctx, "IXED");
2059 def = XML_ATTR_FIXED;
2062 xml_fatal(ctx, "Expected a modifier for default attribute value");
2065 xml_unget_char(ctx);
2066 if (def != XML_ATTR_REQUIRED && def != XML_ATTR_IMPLIED)
2068 xml_parse_system_literal(ctx);
/* Parse the beginning of a document type declaration. A second declaration is fatal
 * (ctx->document_type is already set). After "DOCTYPE" and mandatory whitespace the
 * name is stored in ctx->document_type. If whitespace followed the name and the next
 * character is neither '>' nor '[', an external ID is parsed into ctx->eid and
 * XML_FLAG_HAS_EXTERNAL_SUBSET is set. A following '[' sets
 * XML_FLAG_HAS_INTERNAL_SUBSET; the subset itself is not parsed here. The
 * h_doctype_decl hook is called last if it is set.
 * NOTE(review): "doctyype" in the trace format string looks like a typo for
 * "doctype" -- debug output only. */
2076 xml_parse_doctype_decl(struct xml_context *ctx)
2078 if (ctx->document_type)
2079 xml_fatal(ctx, "Multiple document types not allowed");
2080 xml_parse_seq(ctx, "DOCTYPE");
2081 xml_parse_white(ctx, 1);
2082 ctx->document_type = xml_parse_name(ctx);
2083 TRACE(ctx, "doctyype=%s", ctx->document_type);
2084 uns white = xml_parse_white(ctx, 0);
2085 uns c = xml_peek_char(ctx);
2086 if (c != '>' && c != '[' && white)
2088 xml_parse_external_id(ctx, &ctx->eid, 0, 0);
2089 xml_parse_white(ctx, 0);
2090 ctx->flags |= XML_FLAG_HAS_EXTERNAL_SUBSET;
2092 if (xml_peek_char(ctx) == '[')
2093 ctx->flags |= XML_FLAG_HAS_INTERNAL_SUBSET;
2094 if (ctx->h_doctype_decl)
2095 ctx->h_doctype_decl(ctx);
/* Advance the parser and return the next state to the caller.
 *
 * The function is one resumable state machine. Before an event is returned, a
 * continuation state is stored in ctx->state, and the matching case label sits right
 * after the return statement, so the next call carries on from that point.
 * The bits in ctx->want decide which events are reported: for example comments are
 * skipped with xml_skip_comment() unless XML_WANT_COMMENT is set, and the STag and
 * ETag states are only returned when XML_WANT_STAG / XML_WANT_ETAG are set.
 * Character data is gathered in ctx->chars; when markup follows and the buffer is not
 * empty (btell() is non-zero), the buffer is rewound and reported first, then reset
 * on re-entry.
 *
 * Errors raised through xml_throw() arrive at the setjmp() handlers below, because
 * ctx->throw_buf points at the local throw_buf. In the epilog XML_ERR_EOF is the
 * regular end of the document: the state becomes XML_STATE_EOF, h_document_end is
 * called if it is set, and XML_STATE_EOF is returned. */
2099 xml_next(struct xml_context *ctx)
2101 /* A nasty state machine */
2103 TRACE(ctx, "xml_next (state=%u)", ctx->state);
2105 ctx->throw_buf = &throw_buf;
2106 if (setjmp(throw_buf))
2109 if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal)
2111 ctx->state = XML_STATE_FATAL;
2112 TRACE(ctx, "raised fatal error");
2118 case XML_STATE_FATAL:
2121 case XML_STATE_START:
2122 TRACE(ctx, "entering prolog");
2123 if (ctx->h_document_start)
2124 ctx->h_document_start(ctx);
2127 if (ctx->h_xml_decl)
2128 ctx->h_xml_decl(ctx);
2129 if (ctx->want & XML_WANT_DECL)
2130 return ctx->state = XML_STATE_DECL;
2131 case XML_STATE_DECL:
2133 /* Misc* (doctypedecl Misc*)? */
2136 xml_parse_white(ctx, 0);
2137 xml_parse_char(ctx, '<');
2138 if ((c = xml_get_char(ctx)) == '?')
2139 /* Processing instruction */
2140 if (!(ctx->want & XML_WANT_PI))
2145 ctx->state = XML_STATE_PROLOG_PI;
2146 return XML_STATE_PI;
2147 case XML_STATE_PROLOG_PI:
2152 /* Found the root tag */
2153 xml_unget_char(ctx);
2156 else if (xml_get_char(ctx) == '-')
2157 if (!(ctx->want & XML_WANT_COMMENT))
2158 xml_skip_comment(ctx);
2161 xml_push_comment(ctx);
2162 ctx->state = XML_STATE_PROLOG_COMMENT;
2163 return XML_STATE_COMMENT;
2164 case XML_STATE_PROLOG_COMMENT:
2165 xml_pop_comment(ctx);
2170 xml_unget_char(ctx);
2171 xml_parse_doctype_decl(ctx);
2172 if (ctx->want & XML_WANT_DOCUMENT_TYPE)
2173 return ctx->state = XML_STATE_DOCUMENT_TYPE;
2174 case XML_STATE_DOCUMENT_TYPE:
2175 if (xml_peek_char(ctx) == '[')
2179 xml_parse_internal_subset(ctx);
2180 xml_parse_white(ctx, 0);
2182 xml_parse_char(ctx, '>');
2188 case XML_STATE_COMMENT:
2189 fbgrow_reset(ctx->value);
2191 case XML_STATE_CHARS:
2195 if (xml_peek_char(ctx) != '<')
2198 xml_parse_chars(ctx);
2206 if ((c = xml_get_char(ctx)) == '?')
2209 if (!(ctx->want & XML_WANT_PI))
/* NOTE(review): with pending character data this branch returns XML_STATE_PI, while
 * the comment, CDATA, STag and ETag branches return XML_STATE_CHARS in the same
 * situation -- confirm which is intended. */
2213 if (btell(ctx->chars))
2215 fbgrow_rewind(ctx->chars);
2216 ctx->state = XML_STATE_CHARS_BEFORE_PI;
2217 return XML_STATE_PI;
2218 case XML_STATE_CHARS_BEFORE_PI:
2219 fbgrow_reset(ctx->chars);
2222 return ctx->state = XML_STATE_PI;
2227 if ((c = xml_get_char(ctx)) == '-')
2230 if (!(ctx->want & XML_WANT_COMMENT))
2231 xml_skip_comment(ctx);
2234 if (btell(ctx->chars))
2236 fbgrow_rewind(ctx->chars);
2237 ctx->state = XML_STATE_CHARS_BEFORE_COMMENT;
2238 return XML_STATE_CHARS;
2239 case XML_STATE_CHARS_BEFORE_COMMENT:
2240 fbgrow_reset(ctx->chars);
2242 xml_push_comment(ctx);
2243 return ctx->state = XML_STATE_COMMENT;
2249 if (!(ctx->want & XML_WANT_CDATA))
2250 xml_skip_cdata(ctx);
2253 if (btell(ctx->chars))
2255 fbgrow_rewind(ctx->chars);
2256 ctx->state = XML_STATE_CHARS_BEFORE_CDATA;
2257 return XML_STATE_CHARS;
2258 case XML_STATE_CHARS_BEFORE_CDATA:
2259 fbgrow_reset(ctx->chars);
2261 xml_parse_cdata(ctx);
2262 if (btell(ctx->chars))
2264 fbgrow_rewind(ctx->chars);
2265 return ctx->state = XML_STATE_CDATA;
2267 case XML_STATE_CDATA:
2268 fbgrow_reset(ctx->chars);
2272 xml_fatal(ctx, "Unexpected character after '<!'");
2276 /* STag | EmptyElemTag */
2277 xml_unget_char(ctx);
2278 if (btell(ctx->chars))
2280 fbgrow_rewind(ctx->chars);
2281 ctx->state = XML_STATE_CHARS_BEFORE_STAG;
2282 return XML_STATE_CHARS;
2283 case XML_STATE_CHARS_BEFORE_STAG:
2284 fbgrow_reset(ctx->chars);
2287 xml_parse_stag(ctx);
2288 if (ctx->want & XML_WANT_STAG)
2289 return ctx->state = XML_STATE_STAG;
2290 case XML_STATE_STAG:
2291 if (ctx->flags & XML_FLAG_EMPTY_ELEM)
2298 if (btell(ctx->chars))
2300 fbgrow_rewind(ctx->chars);
2301 ctx->state = XML_STATE_CHARS_BEFORE_ETAG;
2302 return XML_STATE_CHARS;
2303 case XML_STATE_CHARS_BEFORE_ETAG:
2304 fbgrow_reset(ctx->chars);
2307 xml_parse_etag(ctx);
2309 if (ctx->want & XML_WANT_ETAG)
2310 return ctx->state = XML_STATE_ETAG;
2311 case XML_STATE_ETAG:
2312 xml_pop_element(ctx);
2320 TRACE(ctx, "entering epilog");
2323 /* Epilog whitespace is the only place, where a valid document can reach EOF */
2324 if (setjmp(throw_buf))
2325 if (ctx->err_code == XML_ERR_EOF)
2327 TRACE(ctx, "reached EOF");
2328 ctx->state = XML_STATE_EOF;
2329 if (ctx->h_document_end)
2330 ctx->h_document_end(ctx);
2332 return XML_STATE_EOF;
2336 xml_parse_white(ctx, 0);
2337 if (setjmp(throw_buf))
2341 xml_parse_char(ctx, '<');
2342 if ((c = xml_get_char(ctx)) == '?')
2343 /* Processing instruction */
2344 if (!(ctx->want & XML_WANT_PI))
/* The comma expression stores the continuation state and returns the event state. */
2349 return ctx->state = XML_STATE_EPILOG_PI, XML_STATE_PI;
2350 case XML_STATE_EPILOG_PI:
2355 if (!(ctx->want & XML_WANT_COMMENT))
2356 xml_skip_comment(ctx);
2359 xml_push_comment(ctx);
2360 return ctx->state = XML_STATE_EPILOG_COMMENT, XML_STATE_COMMENT;
2361 case XML_STATE_EPILOG_COMMENT:
2362 xml_pop_comment(ctx);
2365 xml_fatal(ctx, "Syntax error in the epilog");
/* Handler used by the test driver: logs ctx->err_msg together with the current row,
 * at level L_WARN_R when err_code is below XML_ERR_ERROR and at L_ERROR_R otherwise. */
2375 error(struct xml_context *ctx)
2377 msg((ctx->err_code < XML_ERR_ERROR) ? L_WARN_R : L_ERROR_R, "XML %u: %s", xml_row(ctx), ctx->err_msg);
/* Test driver: parses the document read from `in` and prints one line per reported
 * event to `out`. error() is installed as the warning, error and fatal handler, all
 * events are requested (XML_WANT_ALL) and XML_DOM_FREE is set. The loop runs while
 * xml_next() returns a non-negative state. */
2381 test(struct fastbuf *in, struct fastbuf *out)
2383 struct xml_context ctx;
2385 ctx.h_warn = ctx.h_error = ctx.h_fatal = error;
2386 ctx.want = XML_WANT_ALL;
2387 ctx.flags |= XML_DOM_FREE;
2388 xml_set_source(&ctx, in);
2390 while ((state = xml_next(&ctx)) >= 0)
2393 case XML_STATE_CHARS:
/* Buffer contents are printed with an explicit length (bstop - buffer). */
2394 bprintf(out, "CHARS [%.*s]\n", (int)(ctx.chars->bstop - ctx.chars->buffer), ctx.chars->buffer);
2396 case XML_STATE_STAG:
2397 bprintf(out, "STAG <%s>\n", ctx.elem->name);
2398 SLIST_FOR_EACH(struct xml_attr *, a, ctx.elem->attrs)
2399 bprintf(out, " ATTR %s=[%s]\n", a->name, a->val);
2401 case XML_STATE_ETAG:
2402 bprintf(out, "ETAG </%s>\n", ctx.elem->name);
2404 case XML_STATE_COMMENT:
2405 bprintf(out, "COMMENT [%.*s]\n", (int)(ctx.value->bstop - ctx.value->buffer), ctx.value->buffer);
2408 bprintf(out, "PI [%s] [%.*s]\n", ctx.name, (int)(ctx.value->bstop - ctx.value->buffer), ctx.value->buffer);
2410 case XML_STATE_CDATA:
2411 bprintf(out, "CDATA [%.*s]\n", (int)(ctx.chars->bstop - ctx.chars->buffer), ctx.chars->buffer);
2414 bprintf(out, "EOF\n");
2425 struct fastbuf *in = bfdopen_shared(0, 1024);
2426 struct fastbuf *out = bfdopen_shared(1, 1024);