2 * Sherlock Library -- A simple XML parser
4 * (c) 2007 Pavel Charvat <pchar@ucw.cz>
6 * This software may be freely distributed and used according to the terms
7 * of the GNU Lesser General Public License.
13 #include "lib/mempool.h"
14 #include "lib/fastbuf.h"
15 #include "lib/ff-unicode.h"
16 #include "lib/ff-binary.h"
17 #include "lib/chartype.h"
18 #include "lib/unicode.h"
19 #include "lib/hashfunc.h"
20 #include "lib/stkstring.h"
21 #include "lib/unaligned.h"
22 #include "charset/charconv.h"
23 #include "charset/fb-charconv.h"
24 #include "sherlock/xml/xml.h"
25 #include "sherlock/xml/dtd.h"
26 #include "sherlock/xml/common.h"
30 /*** Error handling ***/
/*
 * Non-local exit from the parser.  Asserts that both an error code and a
 * jump buffer are set in the context and then longjmp()s to ctx->throw_buf,
 * delivering ctx->err_code as the value seen by the matching setjmp().
 */
33 xml_throw(struct xml_context *ctx)
35 ASSERT(ctx->err_code && ctx->throw_buf);
36 longjmp(*(jmp_buf *)ctx->throw_buf, ctx->err_code);
/*
 * Report a warning.  The printf-style message is formatted with
 * stk_vprintf() into ctx->err_msg and ctx->err_code is set to XML_ERR_WARN;
 * before returning the code is reset to XML_ERR_OK.
 * NOTE(review): the statements between setting and resetting the code are
 * not visible here; presumably they hand the warning to a user callback --
 * confirm.
 */
40 xml_warn(struct xml_context *ctx, const char *format, ...)
45 va_start(args, format);
46 ctx->err_msg = stk_vprintf(format, args);
47 ctx->err_code = XML_ERR_WARN;
51 ctx->err_code = XML_ERR_OK;
/*
 * Report a recoverable error.  Same shape as xml_warn(): the message is
 * formatted with stk_vprintf() into ctx->err_msg, ctx->err_code is set to
 * XML_ERR_ERROR, and the code is reset to XML_ERR_OK before returning.
 * NOTE(review): the statements in between are not visible here; presumably
 * they hand the error to a user callback -- confirm.
 */
56 xml_error(struct xml_context *ctx, const char *format, ...)
61 va_start(args, format);
62 ctx->err_msg = stk_vprintf(format, args);
63 ctx->err_code = XML_ERR_ERROR;
67 ctx->err_code = XML_ERR_OK;
/*
 * Report a fatal error.  Unlike xml_warn()/xml_error(), the message is
 * formatted with mp_vprintf() into the ctx->stack mempool rather than with
 * stk_vprintf().  Sets ctx->err_code to XML_ERR_FATAL and moves the parser
 * into XML_STATE_FATAL.
 * NOTE(review): the tail of the function is not visible here; presumably it
 * ends by calling xml_throw() -- confirm.
 */
72 xml_fatal(struct xml_context *ctx, const char *format, ...)
75 va_start(args, format);
76 ctx->err_msg = mp_vprintf(ctx->stack, format, args);
77 ctx->err_code = XML_ERR_FATAL;
78 ctx->state = XML_STATE_FATAL;
85 /*** Character categorization ***/
87 #include "obj/sherlock/xml/unicat.c"
89 /*** Memory management ***/
/* Raise the fatal error "Entity not nested correctly" (used by xml_pop_source()). */
92 xml_fatal_nested(struct xml_context *ctx)
94 xml_fatal(ctx, "Entity not nested correctly");
/*
 * Allocate a zero-filled hash table of `size` bytes from `pool`, preceded by
 * a header of XML_HASH_HDR_SIZE bytes.  The owning pool pointer is stored at
 * the very start of the header; the returned pointer addresses the table
 * itself, i.e. the first byte after the header.
 */
98 xml_hash_new(struct mempool *pool, uns size)
100 void *tab = mp_alloc_zero(pool, size + XML_HASH_HDR_SIZE);
101 *(void **)tab = pool;
/* Arithmetic on void * relies on the GNU C extension (byte-sized steps). */
102 return tab + XML_HASH_HDR_SIZE;
105 /*** Reading of document/external entities ***/
/*
 * Record an unexpected end of input: sets ctx->err_msg to a constant string
 * and ctx->err_code to XML_ERR_EOF.
 * NOTE(review): the remainder of the function is not visible here;
 * presumably it raises the error via xml_throw() -- confirm.
 */
108 xml_eof(struct xml_context *ctx)
110 ctx->err_msg = "Unexpected EOF";
111 ctx->err_code = XML_ERR_EOF;
/*
 * Append character `c` to the read buffer whose write position is *bstop,
 * advancing *bstop.  The visible statement stores the character's category
 * as computed by xml_char_cat().
 * NOTE(review): the REFILL macro stores every character as a (code point,
 * category) pair; the statement storing the code point itself is presumably
 * on a line not visible here -- confirm.
 */
116 xml_add_char(u32 **bstop, uns c)
119 *(*bstop)++ = xml_char_cat(c);
/*
 * Push a new, empty input source on top of the source stack and make it
 * current.  `flags` are the XML_FLAG_SRC_* bits to set for the new source.
 * Callers use the result as the newly created struct xml_source.
 */
123 xml_push_source(struct xml_context *ctx, uns flags)
126 struct xml_source *src = ctx->src;
/* Remember the read position of the source being suspended.
 * NOTE(review): presumably guarded by a check that a previous source
 * exists; the guard is not visible here -- confirm. */
129 src->bptr = ctx->bptr;
130 src->bstop = ctx->bstop;
/* The new source lives on the parser's stack mempool and starts zeroed. */
132 src = mp_alloc_zero(ctx->stack, sizeof(*src));
133 src->next = ctx->src;
134 src->saved_depth = ctx->depth;
/* Drop all per-source flags of the previous source, then apply the new ones. */
136 ctx->flags = (ctx->flags & ~(XML_FLAG_SRC_EOF | XML_FLAG_SRC_EXPECTED_DECL | XML_FLAG_SRC_NEW_LINE | XML_FLAG_SRC_SURROUND | XML_FLAG_SRC_DOCUMENT)) | flags;
/* Start with an empty buffer. */
137 ctx->bstop = ctx->bptr = src->buf;
/* A "surrounded" source gets a leading space (0x20) in front of its content. */
139 if (flags & XML_FLAG_SRC_SURROUND)
140 xml_add_char(&ctx->bstop, 0x20);
/*
 * Leave the current input source and resume the one below it on the stack.
 * It is a fatal error if the nesting depth is not zero when the source ends.
 */
145 xml_pop_source(struct xml_context *ctx)
147 TRACE(ctx, "pop_source");
148 if (unlikely(ctx->depth != 0))
149 xml_fatal_nested(ctx);
150 struct xml_source *src = ctx->src;
/* Restore the depth saved by xml_push_source() and unlink the source. */
153 ctx->depth = src->saved_depth;
154 ctx->src = src = src->next;
/* Continue reading where the resumed source left off.
 * NOTE(review): presumably guarded by a check that a source remains; the
 * guard is not visible here -- confirm. */
157 ctx->bptr = src->bptr;
158 ctx->bstop = src->bstop;
165 static void xml_refill_utf8(struct xml_context *ctx);
/*
 * Start reading the replacement text of entity `ent` as a new input source.
 * The refill character categories of the current source are carried over to
 * the new one.  External entities are rejected with a fatal error; otherwise
 * the entity text (ent->text, ent->len) is wrapped in a read-only fastbuf and
 * decoded with the UTF-8 refill routine.
 */
168 xml_push_entity(struct xml_context *ctx, struct xml_dtd_ent *ent)
170 TRACE(ctx, "xml_push_entity");
/* Read the categories before pushing: ctx->src changes in xml_push_source(). */
171 uns cat1 = ctx->src->refill_cat1;
172 uns cat2 = ctx->src->refill_cat2;
173 struct xml_source *src = xml_push_source(ctx, 0);
174 src->refill_cat1 = cat1;
175 src->refill_cat2 = cat2;
176 if (ent->flags & XML_DTD_ENT_EXTERNAL)
177 xml_fatal(ctx, "External entities not implemented"); // FIXME
180 fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, ent->len, 0);
181 src->refill = xml_refill_utf8;
/*
 * Install `fb` as the document source.  A new source is pushed with the
 * XML_FLAG_SRC_DOCUMENT and XML_FLAG_SRC_EXPECTED_DECL flags, so that
 * xml_parse_decl() treats it as the document entity and looks for an XMLDecl.
 * NOTE(review): the statements that attach `fb` to the new source are not
 * visible here -- confirm.
 */
186 xml_set_source(struct xml_context *ctx, struct fastbuf *fb)
188 TRACE(ctx, "xml_set_source");
190 struct xml_source *src = xml_push_source(ctx, XML_FLAG_SRC_DOCUMENT | XML_FLAG_SRC_EXPECTED_DECL);
/*
 * Report an input character that may not appear in the document and return
 * UNI_REPLACEMENT to be stored in its place.  Depending on `c`, the error is
 * either "Corrupted encoding" or "Restricted char U+XXXX".
 * NOTE(review): the condition choosing between the two messages is not
 * visible here -- confirm.
 */
195 xml_error_restricted(struct xml_context *ctx, uns c)
198 xml_error(ctx, "Corrupted encoding");
200 xml_error(ctx, "Restricted char U+%04X", c);
201 return UNI_REPLACEMENT;
204 void xml_parse_decl(struct xml_context *ctx);
/*
 * REFILL(ctx, func, params...) -- body shared by all xml_refill_*() decoders.
 *
 * `func(fb, params...)` must return the next character of the source's
 * fastbuf.  Characters are appended to src->buf as pairs of u32 words
 * (code point, category from xml_char_cat()) until the buffer is full
 * (bstop reaches bend) or the loop is left early.
 *
 * Visible steps:
 *  - if the buffer has been consumed completely, writing restarts at its
 *    beginning;
 *  - ordinary characters are stored unchanged ("Typical branch");
 *  - line ends are normalized to 0xA and the local row counter is advanced;
 *    last_0xd remembers the position just behind a stored 0xD so that the
 *    second half of a two-character line end is recognized, and that state
 *    is carried across calls in XML_FLAG_SRC_NEW_LINE;
 *  - restricted characters are replaced by the result of
 *    xml_error_restricted();
 *  - at the end of input a trailing space is added for XML_FLAG_SRC_SURROUND
 *    sources and XML_FLAG_SRC_EOF is set.
 *
 * NOTE(review): the conditions selecting the individual branches (tests
 * against t1/t2) and the write-back of `row` are on lines not visible here
 * -- confirm before relying on the details above.
 */
206 #define REFILL(ctx, func, params...) \
207 struct xml_source *src = ctx->src; \
208 struct fastbuf *fb = src->fb; \
209 if (ctx->bptr == ctx->bstop) \
210 ctx->bptr = ctx->bstop = src->buf; \
211 uns f = ctx->flags, c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \
212 u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \
213 *last_0xd = (f & XML_FLAG_SRC_NEW_LINE) ? bstop : bend; \
216 c = func(fb, ##params); \
217 uns t = xml_char_cat(c); \
219 /* Typical branch */ \
220 *bstop++ = c, *bstop++ = t; \
224 /* XML 1.0: 0xA | 0xD | 0xD 0xA */ \
225 /* XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ \
227 last_0xd = bstop + 2; \
228 else if (c != 0x2028 && last_0xd == bstop) \
233 xml_add_char(&bstop, 0xa), row++; \
237 /* Used only in XML/TextDecl to switch the encoding */ \
238 *bstop++ = c, *bstop++ = t; \
242 /* Restricted character */ \
243 xml_add_char(&bstop, xml_error_restricted(ctx, c)); \
247 if (f & XML_FLAG_SRC_SURROUND) \
248 xml_add_char(&bstop, 0x20); \
249 f |= XML_FLAG_SRC_EOF; \
253 while (bstop < bend); \
254 ctx->flags = (last_0xd == bstop) ? f | XML_FLAG_SRC_NEW_LINE : f & ~XML_FLAG_SRC_NEW_LINE; \
255 ctx->bstop = bstop; \
/* Refill the read buffer from a UTF-8 source: REFILL instantiated with
 * bget_utf8_repl(), passing ~1U as its extra argument. */
259 xml_refill_utf8(struct xml_context *ctx)
261 REFILL(ctx, bget_utf8_repl, ~1U);
/* Refill the read buffer from a little-endian UTF-16 source: REFILL
 * instantiated with bget_utf16_le_repl(), passing ~1U as its extra argument. */
265 xml_refill_utf16_le(struct xml_context *ctx)
267 REFILL(ctx, bget_utf16_le_repl, ~1U);
/* Refill the read buffer from a big-endian UTF-16 source: REFILL
 * instantiated with bget_utf16_be_repl(), passing ~1U as its extra argument. */
271 xml_refill_utf16_be(struct xml_context *ctx)
273 REFILL(ctx, bget_utf16_be_repl, ~1U);
/*
 * Character getter used by xml_refill_libcharset(): read one byte from `fb`
 * and translate it to a Unicode code point through the libcharset table
 * `in_to_x`.  A negative result of bgetc() is passed through to the caller
 * unchanged.
 *
 * The assignment must be parenthesized: `<` binds tighter than `=`, so
 * `c = bgetc(fb) < 0` would store the outcome of the comparison (0 or 1) in
 * c -- the end of input would be returned as 1 and every byte would be
 * translated as in_to_x[0].
 */
278 xml_refill_libcharset_bget(struct fastbuf *fb, unsigned short int *in_to_x)
282 return (unlikely((c = bgetc(fb)) < 0)) ? c : (int)conv_x_to_ucs(in_to_x[c]);
/* Refill the read buffer from a source in a libcharset-supported 8-bit
 * encoding: REFILL instantiated with xml_refill_libcharset_bget() and the
 * translation table stored in the current source by xml_init_charconv(). */
286 xml_refill_libcharset(struct xml_context *ctx)
288 unsigned short int *in_to_x = ctx->src->refill_in_to_x;
289 REFILL(ctx, xml_refill_libcharset_bget, in_to_x);
/*
 * Make more input available in the read buffer.  Repeats until the buffer is
 * non-empty (ctx->bptr != ctx->bstop).  Each round handles one of three
 * cases: the current source is exhausted (XML_FLAG_SRC_EOF), the source still
 * expects its XMLDecl/TextDecl (XML_FLAG_SRC_EXPECTED_DECL), or the source's
 * own decoder is called through ctx->src->refill.
 * NOTE(review): the actions of the first two cases are not visible here;
 * presumably they pop the source and call xml_parse_decl() -- confirm.
 */
296 xml_refill(struct xml_context *ctx)
300 if (ctx->flags & XML_FLAG_SRC_EOF)
302 else if (ctx->flags & XML_FLAG_SRC_EXPECTED_DECL)
306 ctx->src->refill(ctx);
/* The buffer holds two u32 words per character, hence the division by 2. */
307 TRACE(ctx, "refilled %u characters", (uns)((ctx->bstop - ctx->bptr) / 2));
310 while (ctx->bptr == ctx->bstop);
/*
 * Row-number helper for the current source.  The visible loop walks the
 * not-yet-consumed part of the read buffer backwards (from ctx->bstop down to
 * ctx->bptr, two u32 words per character) and tests each character's
 * category word against src->refill_cat2, which xml_init_cats() sets to the
 * new-line category.
 * NOTE(review): the statement executed on a match and the return value are
 * not visible here; presumably the row counted by the refill routine is
 * decreased for every buffered but unread new line -- confirm.
 */
314 xml_row(struct xml_context *ctx)
316 struct xml_source *src = ctx->src;
320 for (u32 *p = ctx->bstop; p != ctx->bptr; p -= 2)
321 if (p[-1] & src->refill_cat2)
326 /*** Basic parsing ***/
/* Raise the fatal error "Expected '<c>'", printing `c` with %c. */
329 xml_fatal_expected(struct xml_context *ctx, uns c)
331 xml_fatal(ctx, "Expected '%c'", c);
/* Raise the fatal error "Expected a white space". */
335 xml_fatal_expected_white(struct xml_context *ctx)
337 xml_fatal(ctx, "Expected a white space");
/* Raise the fatal error "Expected a quotation mark". */
341 xml_fatal_expected_quot(struct xml_context *ctx)
343 xml_fatal(ctx, "Expected a quotation mark");
/* Parse an equals sign with optional white space on both sides; the '='
 * itself is matched by xml_parse_char(). */
347 xml_parse_eq(struct xml_context *ctx)
349 /* Eq ::= S? '=' S? */
350 xml_parse_white(ctx, 0);
351 xml_parse_char(ctx, '=');
352 xml_parse_white(ctx, 0);
355 /* Names and nmtokens */
/*
 * Parse a run of characters selected by category masks and return it as a
 * UTF-8 string allocated from `pool`.
 *
 *  first_cat  categories allowed for the first character
 *  next_cat   categories allowed for all following characters
 *  err        message of the fatal error raised when the first character
 *             does not match first_cat
 *
 * NOTE(review): the line between the loop and mp_end() is not visible here;
 * presumably it writes the terminating zero byte -- confirm.
 */
358 xml_parse_string(struct xml_context *ctx, struct mempool *pool, uns first_cat, uns next_cat, char *err)
360 char *p = mp_start_noalign(pool, 1);
361 if (unlikely(!(xml_peek_cat(ctx) & first_cat)))
362 xml_fatal(ctx, "%s", err);
/* Reserve 5 more bytes in the growing buffer before each character is
 * encoded with utf8_32_put(). */
365 p = mp_spread(pool, p, 5);
366 p = utf8_32_put(p, xml_skip_char(ctx));
368 while (xml_peek_cat(ctx) & next_cat);
370 return mp_end(pool, p);
/*
 * Like xml_parse_string(), but the matched characters are only consumed, not
 * stored.  The first character must match first_cat (otherwise a fatal error
 * with message `err` is raised); the loop then continues while the upcoming
 * character matches next_cat.
 * NOTE(review): the loop body is not visible here; presumably it skips one
 * character -- confirm.
 */
374 xml_skip_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err)
376 if (unlikely(!(xml_get_cat(ctx) & first_cat)))
377 xml_fatal(ctx, "%s", err);
378 while (xml_peek_cat(ctx) & next_cat)
/*
 * Parse an XML Name into a string allocated from `pool`.  The category masks
 * for the start character and the following characters depend on whether
 * XML_FLAG_VERSION_1_1 is set in ctx->flags.
 * NOTE(review): the last argument (error message) is on a line not visible
 * here.
 */
383 xml_parse_name(struct xml_context *ctx, struct mempool *pool)
385 /* Name ::= NameStartChar (NameChar)* */
386 return xml_parse_string(ctx, pool,
387 !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1,
388 !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1,
/*
 * Consume an XML Name without storing it, using the same version-dependent
 * category masks as xml_parse_name().
 * NOTE(review): the call these arguments belong to is on a line not visible
 * here; presumably xml_skip_string() -- confirm.
 */
393 xml_skip_name(struct xml_context *ctx)
396 !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1,
397 !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1,
/*
 * Parse an XML Nmtoken into a string allocated from `pool`.  Every character,
 * including the first one, must belong to the NameChar category of the XML
 * version in effect; otherwise "Expected a nmtoken" is raised as a fatal
 * error.
 */
402 xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool)
404 /* Nmtoken ::= (NameChar)+ */
405 uns cat = !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1;
406 return xml_parse_string(ctx, pool, cat, cat, "Expected a nmtoken");
409 /* Simple literals */
/*
 * Parse a quoted system literal and return its content (without the quotes)
 * as a UTF-8 string allocated from `pool`.  Characters are copied until the
 * quote character returned by xml_parse_quote() is seen again.
 * NOTE(review): the line before mp_end() is not visible here; presumably it
 * writes the terminating zero byte -- confirm.
 */
412 xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool)
414 /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */
415 char *p = mp_start_noalign(pool, 1);
416 uns q = xml_parse_quote(ctx), c;
417 while ((c = xml_get_char(ctx)) != q)
/* Reserve 5 more bytes in the growing buffer before encoding `c`. */
419 p = mp_spread(pool, p, 5);
420 p = utf8_32_put(p, c);
423 return mp_end(pool, p);
/*
 * Parse a quoted public identifier literal and return its content (without
 * the quotes) as a string allocated from `pool`.  Every character before the
 * closing quote must have the XML_CHAR_PUBID category, otherwise a fatal
 * error is raised.
 * NOTE(review): the statements storing the character and the terminating
 * zero are on lines not visible here; the 2-byte reservation suggests one
 * byte per character plus the terminator -- confirm.
 */
427 xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool)
429 /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */
430 char *p = mp_start_noalign(pool, 1);
431 uns q = xml_parse_quote(ctx), c;
432 while ((c = xml_get_char(ctx)) != q)
434 if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID)))
435 xml_fatal(ctx, "Expected a pubid character");
436 p = mp_spread(pool, p, 2);
440 return mp_end(pool, p);
/*
 * Parse a quoted encoding name and return it (without the quotes) as a
 * string allocated from ctx->pool.  The first character must have the
 * XML_CHAR_ENC_SNAME category and all following ones XML_CHAR_ENC_NAME;
 * a violation raises a fatal error.  Parsing stops at the closing quote.
 * NOTE(review): the loop construct and the terminating-zero store are on
 * lines not visible here -- confirm.
 */
444 xml_parse_encoding_name(struct xml_context *ctx)
446 /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" */
447 char *p = mp_start_noalign(ctx->pool, 1);
448 uns q = xml_parse_quote(ctx);
449 if (unlikely(!(xml_get_cat(ctx) & XML_CHAR_ENC_SNAME)))
450 xml_fatal(ctx, "Invalid character in the encoding name");
/* Store the character validated last, then fetch and validate the next. */
453 p = mp_spread(ctx->pool, p, 2);
454 *p++ = xml_last_char(ctx);
455 if (xml_get_char(ctx) == q)
457 if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME)))
458 xml_fatal(ctx, "Invalid character in the encoding name");
461 return mp_end(ctx->pool, p);
464 /* Document/external entity header */
/*
 * Set up the character categories used by the refill routines of the
 * current source, according to the XML version in ctx->flags.
 *
 *  refill_cat1  characters copied to the buffer as they are: the valid
 *               (XML 1.0) or unrestricted (XML 1.1) set without new-line
 *               characters and without the categories in `mask`
 *  refill_cat2  new-line characters of the given XML version
 *
 * `mask` lets the caller exclude further categories from refill_cat1;
 * xml_parse_decl() passes XML_CHAR_GT while reading the declaration.
 */
467 xml_init_cats(struct xml_context *ctx, uns mask)
469 if (!(ctx->flags & XML_FLAG_VERSION_1_1))
471 ctx->src->refill_cat1 = XML_CHAR_VALID_1_0 & ~XML_CHAR_NEW_LINE_1_0 & ~mask;
472 ctx->src->refill_cat2 = XML_CHAR_NEW_LINE_1_0;
476 ctx->src->refill_cat1 = XML_CHAR_UNRESTRICTED_1_1 & ~XML_CHAR_NEW_LINE_1_1 & ~mask;
477 ctx->src->refill_cat2 = XML_CHAR_NEW_LINE_1_1;
/*
 * Arrange for the current source to be decoded from libcharset charset `cs`.
 * Two mechanisms are visible: (1) switching the refill routine to
 * xml_refill_libcharset() with the in_to_x table obtained from a
 * conv_context set up for cs -> UTF-8, and (2) wrapping the source's fastbuf
 * with fb_wrap_charconv_in() so that it yields UTF-8.
 * NOTE(review): lines around these statements are not visible here;
 * presumably the two variants are alternatives selected by the preprocessor
 * rather than both being active -- confirm.
 */
482 xml_init_charconv(struct xml_context *ctx, int cs)
485 struct xml_source *src = ctx->src;
486 TRACE(ctx, "wrapping charset %s", charset_name(cs));
488 struct conv_context conv;
489 conv_set_charset(&conv, cs, CONV_CHARSET_UTF8);
490 src->refill = xml_refill_libcharset;
491 src->refill_in_to_x = conv.in_to_x;
493 src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8);
494 // FIXME: memory leak
/*
 * Parse the header of the current source -- the XMLDecl of the document
 * entity or the optional TextDecl of an external entity -- and select the
 * character decoder for the rest of the source.
 *
 * Steps:
 *  1. Choose an initial decoder from the encoding supplied with the fastbuf
 *     (src->fb_encoding) or, if there is none, from the first byte of the
 *     input (0xff selects UTF-16LE; the other UTF-16 test is on a line not
 *     visible here).
 *  2. Deal with the byte order mark.
 *  3. Look ahead for "<?xml" followed by white space and parse the
 *     version, encoding and standalone pseudo-attributes.
 *  4. Switch to the declared encoding or complain when it contradicts the
 *     expected one.
 *  5. Re-initialize the character categories for the final XML version.
 *
 * Problems are reported through xml_error() (recoverable) or xml_fatal().
 */
499 xml_parse_decl(struct xml_context *ctx)
501 TRACE(ctx, "xml_parse_decl");
502 struct xml_source *src = ctx->src;
503 ctx->flags &= ~XML_FLAG_SRC_EXPECTED_DECL;
505 /* Setup valid Unicode ranges and force the reader to abort refill() after each '>', where we can switch encoding or XML version */
506 xml_init_cats(ctx, XML_CHAR_GT);
508 /* Initialize the supplied charset (if any) or try to guess it */
509 char *expected_encoding = src->expected_encoding ? : src->fb_encoding;
510 src->refill = xml_refill_utf8;
511 int bom = bpeekc(src->fb);
513 ctx->flags |= XML_FLAG_SRC_EOF;
514 if (!src->fb_encoding)
517 src->refill = xml_refill_utf16_be;
518 else if (bom == 0xff)
519 src->refill = xml_refill_utf16_le;
523 int cs = find_charset_by_name(src->fb_encoding);
524 if (cs == CONV_CHARSET_UTF8)
528 xml_init_charconv(ctx, cs);
/* strcasecmp() returns 0 when the names are equal, so every test for a
 * particular UTF-16 flavour has to be negated.  Without the negation any
 * unknown name was treated as "UTF-16" and the "Unknown encoding" error
 * below could never be reached. */
531 else if (!strcasecmp(src->fb_encoding, "UTF-16"))
533 src->refill = xml_refill_utf16_be;
535 src->refill = xml_refill_utf16_le;
536 if (!src->expected_encoding)
537 expected_encoding = (bom == 0xff) ? "UTF-16LE" : "UTF-16BE";
539 else if (!strcasecmp(src->fb_encoding, "UTF-16BE"))
540 src->refill = xml_refill_utf16_be;
541 else if (!strcasecmp(src->fb_encoding, "UTF-16LE"))
542 src->refill = xml_refill_utf16_le;
545 xml_error(ctx, "Unknown encoding '%s'", src->fb_encoding);
546 expected_encoding = NULL;
549 uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be;
550 if (bom > 0 && xml_peek_char(ctx) == 0xfeff)
553 xml_error(ctx, "Missing or corrupted BOM");
555 /* Look ahead for presence of XMLDecl or optional TextDecl */
556 if (!(ctx->flags & XML_FLAG_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf))
558 uns doc = ctx->flags & XML_FLAG_SRC_DOCUMENT;
559 u32 *bptr = ctx->bptr;
/* The buffer holds (code point, category) pairs: even indices are the
 * characters "<?xml" (case-insensitively for the letters), bptr[11] is the
 * category of the sixth character, which must be white space. */
560 uns have_decl = (12 <= ctx->bstop - ctx->bptr && (bptr[11] & XML_CHAR_WHITE) &&
561 bptr[0] == '<' && bptr[2] == '?' && (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L');
565 xml_fatal(ctx, "Missing or corrupted XML header");
/* Compare the local expected_encoding, which is what the guard checks:
 * src->expected_encoding is NULL when the expectation comes only from
 * src->fb_encoding, and passing NULL to strcasecmp() is undefined. */
566 else if (expected_encoding && strcasecmp(expected_encoding, "UTF-8") && !utf16)
567 xml_error(ctx, "Missing or corrupted entity header");
570 ctx->bptr = bptr + 12;
571 xml_parse_white(ctx, 0);
573 /* Parse version string (mandatory in XMLDecl, optional in TextDecl) */
574 if (xml_peek_char(ctx) == 'v')
576 xml_parse_seq(ctx, "version");
578 char *version = xml_parse_pubid_literal(ctx, ctx->pool);
579 TRACE(ctx, "version=%s", version);
581 if (!strcmp(version, "1.1"))
582 v = XML_FLAG_VERSION_1_1;
583 else if (strcmp(version, "1.0"))
585 xml_error(ctx, "Unknown XML version string '%s'", version);
590 ctx->version_str = version;
593 else if (v > (ctx->flags & XML_FLAG_VERSION_1_1))
594 xml_error(ctx, "XML 1.1 external entity included from XML 1.0 document");
595 if (!xml_parse_white(ctx, !doc))
600 xml_error(ctx, "Expected XML version");
601 ctx->version_str = "1.0";
604 /* Parse encoding string (optional in XMLDecl, mandatory in TextDecl) */
605 if (xml_peek_char(ctx) == 'e')
607 xml_parse_seq(ctx, "encoding");
609 src->decl_encoding = xml_parse_encoding_name(ctx);
610 TRACE(ctx, "encoding=%s", src->decl_encoding);
611 if (!xml_parse_white(ctx, 0))
615 xml_error(ctx, "Expected XML encoding");
617 /* Parse whether the document is standalone (optional in XMLDecl) */
618 if (doc && xml_peek_char(ctx) == 's')
620 xml_parse_seq(ctx, "standalone");
622 uns c = xml_parse_quote(ctx);
/* Intentional assignment: remember the answer and branch on it. */
623 if (ctx->standalone = (xml_peek_char(ctx) == 'y'))
624 xml_parse_seq(ctx, "yes");
626 xml_parse_seq(ctx, "no");
627 xml_parse_char(ctx, c);
628 TRACE(ctx, "standalone=%d", ctx->standalone);
629 xml_parse_white(ctx, 0);
632 xml_parse_seq(ctx, "?>");
634 /* Switch to the final encoding */
635 if (src->decl_encoding)
637 int cs = find_charset_by_name(src->decl_encoding);
638 if (cs < 0 && !expected_encoding)
639 xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding);
640 else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8)
641 xml_init_charconv(ctx, cs);
642 else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 ||
643 !(!strcasecmp(src->decl_encoding, "UTF-16") ||
644 (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) ||
645 (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE")))))
646 xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding);
650 /* Update valid Unicode ranges */
651 xml_init_cats(ctx, 0);