2 * Sherlock Library -- A simple XML parser
4 * (c) 2007 Pavel Charvat <pchar@ucw.cz>
6 * This software may be freely distributed and used according to the terms
7 * of the GNU Lesser General Public License.
12 #include "sherlock/sherlock.h"
13 #include "sherlock/xml/xml.h"
14 #include "sherlock/xml/dtd.h"
15 #include "sherlock/xml/common.h"
16 #include "lib/unicode.h"
17 #include "lib/ff-unicode.h"
18 #include "charset/charconv.h"
19 #include "charset/fb-charconv.h"
21 /*** Character categorization ***/
23 #include "obj/sherlock/xml/unicat.c"
/*
 * Select the character-category masks kept in ctx (cat_chars,
 * cat_unrestricted, cat_new_line, cat_name, cat_sname) according to the
 * XML_VERSION_1_1 flag: the *_1_0 constants are used when the flag is clear,
 * the *_1_1 constants are the other group.  Note that for XML 1.0 both
 * cat_chars and cat_unrestricted receive XML_CHAR_VALID_1_0.
 * NOTE(review): this listing omits the braces and presumably the `else`
 * separating the two groups -- confirm the branch structure in the full file.
 */
26 xml_init_cats(struct xml_context *ctx)
28 if (!(ctx->flags & XML_VERSION_1_1))
30 ctx->cat_chars = XML_CHAR_VALID_1_0;
31 ctx->cat_unrestricted = XML_CHAR_VALID_1_0;
32 ctx->cat_new_line = XML_CHAR_NEW_LINE_1_0;
33 ctx->cat_name = XML_CHAR_NAME_1_0;
34 ctx->cat_sname = XML_CHAR_SNAME_1_0;
38 ctx->cat_chars = XML_CHAR_VALID_1_1;
39 ctx->cat_unrestricted = XML_CHAR_UNRESTRICTED_1_1;
40 ctx->cat_new_line = XML_CHAR_NEW_LINE_1_1;
41 ctx->cat_name = XML_CHAR_NAME_1_1;
42 ctx->cat_sname = XML_CHAR_SNAME_1_1;
46 /*** Reading of document/external entities ***/
/*
 * Record an unexpected end of input in ctx: stores the message
 * "Unexpected EOF" in err_msg and XML_ERR_EOF in err_code.
 * NOTE(review): how the error is propagated afterwards is not visible in
 * this listing.
 */
49 xml_eof(struct xml_context *ctx)
51 ctx->err_msg = "Unexpected EOF";
52 ctx->err_code = XML_ERR_EOF;
/* Report, through xml_fatal(), that an entity is not nested correctly. */
57 xml_fatal_nested(struct xml_context *ctx)
59 xml_fatal(ctx, "Entity is not nested correctly");
/*
 * Append character c to the buffer position *bstop.  The visible statement
 * stores the category of c (from xml_char_cat()) and advances *bstop by one
 * slot.
 * NOTE(review): REFILL stores every character as a (code, category) pair, so
 * a store of c itself presumably precedes this line; it is not visible here.
 */
63 xml_add_char(u32 **bstop, uns c)
66 *(*bstop)++ = xml_char_cat(c);
/*
 * Push a new input source and make its buffer the current read buffer.
 * `flags` supplies the XML_SRC_* bits of the new source.
 *
 * Visible steps: the read pointers of ctx are saved into the current source,
 * a zeroed struct xml_source is allocated from ctx->stack, ctx->depth is
 * remembered in saved_depth, the per-source XML_SRC_* bits of ctx->flags are
 * replaced by `flags`, and ctx->bptr/bstop are reset to the start of the new
 * buffer.  With XML_SRC_SURROUND a leading space (0x20) is emitted.
 * NOTE(review): the guard around the save, the linking of the new source
 * into ctx->src and the return statement are not visible in this listing.
 */
70 xml_push_source(struct xml_context *ctx, uns flags)
73 struct xml_source *src = ctx->src;
/* Save the read position of the source that is being suspended. */
76 src->bptr = ctx->bptr;
77 src->bstop = ctx->bstop;
79 src = mp_alloc_zero(ctx->stack, sizeof(*src));
81 src->saved_depth = ctx->depth;
/* Drop the XML_SRC_* bits of the previous source and install the new ones. */
83 ctx->flags = (ctx->flags & ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_NEW_LINE | XML_SRC_SURROUND | XML_SRC_DOCUMENT)) | flags;
84 ctx->bstop = ctx->bptr = src->buf;
86 if (flags & XML_SRC_SURROUND)
87 xml_add_char(&ctx->bstop, 0x20);
/*
 * Release what src holds; the visible statement closes src->wrapped_fb with
 * bclose().
 * NOTE(review): any guard or further cleanup is not visible in this listing.
 */
92 xml_close_source(struct xml_source *src)
96 bclose(src->wrapped_fb);
/*
 * Leave the current input source and resume the one below it.
 * A non-zero ctx->depth at this point is reported as the fatal error
 * "Unexpected end of entity".  The source is closed, ctx->depth is restored
 * from saved_depth, ctx->src moves to src->next and the read pointers are
 * reloaded from that source.
 * NOTE(review): the last two assignments dereference the new src; a NULL
 * check is presumably on a line omitted from this listing -- confirm.
 */
100 xml_pop_source(struct xml_context *ctx)
102 TRACE(ctx, "pop_source");
103 if (unlikely(ctx->depth != 0))
105 xml_fatal(ctx, "Unexpected end of entity");
107 struct xml_source *src = ctx->src;
109 xml_close_source(src);
110 ctx->depth = src->saved_depth;
111 ctx->src = src = src->next;
114 ctx->bptr = src->bptr;
115 ctx->bstop = src->bstop;
/*
 * Clean up the input sources of ctx.
 * NOTE(review): only the declaration of the iterator `s` is visible in this
 * listing; presumably the body walks the source list and closes each entry
 * -- confirm against the full file.
 */
123 xml_sources_cleanup(struct xml_context *ctx)
125 struct xml_source *s;
/* Forward declaration: xml_push_entity() installs it as the refill callback. */
133 static void xml_refill_utf8(struct xml_context *ctx);
/*
 * Start reading from the replacement text of entity `ent`.
 * The new source inherits refill_cat1/refill_cat2 from the source that was
 * current on entry.  Entities flagged XML_DTD_ENT_EXTERNAL are rejected with
 * a fatal error; for the remaining case the entity text (ent->text, ent->len)
 * is exposed through the in-source fastbuf wrap_fb via fbbuf_init_read() and
 * read with xml_refill_utf8().
 * NOTE(review): the `else` and braces between the two cases are not visible
 * in this listing.
 */
136 xml_push_entity(struct xml_context *ctx, struct xml_dtd_ent *ent)
138 TRACE(ctx, "xml_push_entity");
/* Read the categories before pushing, while ctx->src is still the parent. */
139 uns cat1 = ctx->src->refill_cat1;
140 uns cat2 = ctx->src->refill_cat2;
141 struct xml_source *src = xml_push_source(ctx, 0);
142 src->refill_cat1 = cat1;
143 src->refill_cat2 = cat2;
144 if (ent->flags & XML_DTD_ENT_EXTERNAL)
145 xml_fatal(ctx, "External entities not implemented"); // FIXME
148 fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, ent->len, 0);
149 src->refill = xml_refill_utf8;
/*
 * Install `fb` as the document input: pushes a source flagged
 * XML_SRC_DOCUMENT | XML_SRC_EXPECTED_DECL and sets the parser state to
 * XML_STATE_START.
 * NOTE(review): the statement attaching fb to the new source is not visible
 * in this listing.
 */
154 xml_set_source(struct xml_context *ctx, struct fastbuf *fb)
156 TRACE(ctx, "xml_set_source");
158 struct xml_source *src = xml_push_source(ctx, XML_SRC_DOCUMENT | XML_SRC_EXPECTED_DECL);
160 ctx->state = XML_STATE_START;
/*
 * Report an unacceptable input character c and return UNI_REPLACEMENT to be
 * stored in its place.  Two messages exist: "Corrupted encoding" and
 * "Restricted char U+%04X" (with the code of c).
 * NOTE(review): the condition choosing between the two messages is not
 * visible in this listing.
 */
164 xml_error_restricted(struct xml_context *ctx, uns c)
167 xml_error(ctx, "Corrupted encoding");
169 xml_error(ctx, "Restricted char U+%04X", c);
170 return UNI_REPLACEMENT;
/* Forward declaration; the definition appears later in this file. */
173 void xml_parse_decl(struct xml_context *ctx);
/*
 * REFILL(ctx, func, params...) -- shared body of the xml_refill_*() functions.
 *
 * Reads characters with func(fb, params...) from the current source and
 * appends them to the source buffer as (character, category) pairs of u32
 * words, looping while bstop < bend.  When the buffer has been fully consumed
 * (bptr == bstop) writing restarts at the beginning of src->buf.
 *
 * The branches visible below cover: ordinary characters, line-end
 * normalization to 0xa with row counting, characters that are stored
 * unchanged while an XML/TextDecl is being read, restricted characters
 * (replaced via xml_error_restricted()), and a trailing space for
 * XML_SRC_SURROUND sources.  last_0xd marks the buffer position right after
 * a 0xD; XML_SRC_NEW_LINE in ctx->flags records whether the buffer ended at
 * that position so the state survives into the next refill.
 *
 * The macro declares locals (src, fb, f, c, t1, t2, row, bend, bstop,
 * last_0xd), so it can be expanded only once per function scope.
 * NOTE(review): the branch conditions, loop header and end-of-input handling
 * are on lines omitted from this listing -- confirm against the full file.
 */
175 #define REFILL(ctx, func, params...) \
176 struct xml_source *src = ctx->src; \
177 struct fastbuf *fb = src->fb; \
178 if (ctx->bptr == ctx->bstop) \
179 ctx->bptr = ctx->bstop = src->buf; \
180 uns f = ctx->flags, c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \
181 u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \
182 *last_0xd = (f & XML_SRC_NEW_LINE) ? bstop : bend; \
185 c = func(fb, ##params); \
186 uns t = xml_char_cat(c); \
188 /* Typical branch */ \
189 *bstop++ = c, *bstop++ = t; \
193 /* XML 1.0: 0xA | 0xD | 0xD 0xA */ \
194 /* XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ \
196 last_0xd = bstop + 2; \
197 else if (c != 0x2028 && last_0xd == bstop) \
202 xml_add_char(&bstop, 0xa), row++; \
206 /* Used only in XML/TextDecl to switch the encoding */ \
207 *bstop++ = c, *bstop++ = t; \
211 /* Restricted character */ \
212 xml_add_char(&bstop, xml_error_restricted(ctx, c)); \
216 if (f & XML_SRC_SURROUND) \
217 xml_add_char(&bstop, 0x20); \
222 while (bstop < bend); \
223 ctx->flags = (last_0xd == bstop) ? f | XML_SRC_NEW_LINE : f & ~XML_SRC_NEW_LINE; \
224 ctx->bstop = bstop; \
/*
 * Refill the current source buffer from UTF-8 input: expands REFILL with
 * bget_utf8_repl() as the character reader and ~1U as its extra argument.
 * NOTE(review): ~1U is presumably the value returned for malformed
 * sequences -- confirm against the bget_utf8_repl() documentation.
 */
228 xml_refill_utf8(struct xml_context *ctx)
230 REFILL(ctx, bget_utf8_repl, ~1U);
/*
 * Refill the current source buffer from little-endian UTF-16 input: expands
 * REFILL with bget_utf16_le_repl() as the character reader and ~1U as its
 * extra argument.
 * NOTE(review): ~1U is presumably the value returned for malformed
 * sequences -- confirm against the bget_utf16_le_repl() documentation.
 */
234 xml_refill_utf16_le(struct xml_context *ctx)
236 REFILL(ctx, bget_utf16_le_repl, ~1U);
/*
 * Refill the current source buffer from big-endian UTF-16 input: expands
 * REFILL with bget_utf16_be_repl() as the character reader and ~1U as its
 * extra argument.
 * NOTE(review): ~1U is presumably the value returned for malformed
 * sequences -- confirm against the bget_utf16_be_repl() documentation.
 */
240 xml_refill_utf16_be(struct xml_context *ctx)
242 REFILL(ctx, bget_utf16_be_repl, ~1U);
/*
 * Character reader used by xml_refill_libcharset(): fetch one byte from fb
 * with bgetc() and translate it to UCS through the table in_to_x.  When
 * bgetc() yields a negative value, that value is returned unchanged.
 *
 * The assignment is parenthesized on purpose: `c = bgetc(fb) < 0` would store
 * the result of the comparison (0 or 1) in c, so the byte would be lost and
 * in_to_x[0] would be looked up for every input byte.
 */
247 xml_refill_libcharset_bget(struct fastbuf *fb, unsigned short int *in_to_x)
251 return (unlikely((c = bgetc(fb)) < 0)) ? c : (int)conv_x_to_ucs(in_to_x[c]);
/*
 * Refill the current source buffer from input in a libcharset encoding:
 * expands REFILL with xml_refill_libcharset_bget() as the character reader,
 * passing the source's conversion table refill_in_to_x.
 */
255 xml_refill_libcharset(struct xml_context *ctx)
257 unsigned short int *in_to_x = ctx->src->refill_in_to_x;
258 REFILL(ctx, xml_refill_libcharset_bget, in_to_x);
/*
 * Make at least one character available in the read buffer; the loop repeats
 * while ctx->bptr == ctx->bstop.  Each round checks XML_SRC_EOF first, then
 * XML_SRC_EXPECTED_DECL, and otherwise calls the refill callback of the
 * current source.
 * NOTE(review): the actions taken for the two flags are on lines omitted
 * from this listing (presumably popping the finished source and parsing the
 * declaration) -- confirm against the full file.
 */
265 xml_refill(struct xml_context *ctx)
269 if (ctx->flags & XML_SRC_EOF)
271 else if (ctx->flags & XML_SRC_EXPECTED_DECL)
275 ctx->src->refill(ctx);
/* The buffer holds two u32 words per character, hence the division by 2. */
276 TRACE(ctx, "refilled %u characters", (uns)((ctx->bstop - ctx->bptr) / 2));
279 while (ctx->bptr == ctx->bstop);
/*
 * Determine the row of the current read position.  The visible loop walks
 * backwards from ctx->bstop to ctx->bptr over the (character, category)
 * pairs and tests each category word (p[-1]) against refill_cat2.
 * NOTE(review): the starting value, the adjustment made on a match and the
 * return statement are omitted from this listing; presumably the function
 * subtracts the not-yet-consumed line ends from src->row -- confirm.
 */
283 xml_row(struct xml_context *ctx)
285 struct xml_source *src = ctx->src;
289 for (u32 *p = ctx->bstop; p != ctx->bptr; p -= 2)
290 if (p[-1] & src->refill_cat2)
295 /* Document/external entity header */
/*
 * Parse a quoted encoding name (EncName, see the grammar comment below) and
 * return it as a string built in ctx->pool.
 * The first character must carry XML_CHAR_ENC_SNAME and every following one
 * XML_CHAR_ENC_NAME, otherwise a fatal error is raised.  Parsing ends when
 * the character matching the opening quote `q` is read.
 * NOTE(review): the loop header and the terminating store before mp_end()
 * are on lines omitted from this listing.
 */
298 xml_parse_encoding_name(struct xml_context *ctx)
300 /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" */
301 char *p = mp_start_noalign(ctx->pool, 1);
302 uns q = xml_parse_quote(ctx);
303 if (unlikely(!(xml_get_cat(ctx) & XML_CHAR_ENC_SNAME)))
304 xml_fatal(ctx, "Invalid character in the encoding name");
307 p = mp_spread(ctx->pool, p, 2);
308 *p++ = xml_last_char(ctx);
309 if (xml_get_char(ctx) == q)
311 if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME)))
312 xml_fatal(ctx, "Invalid character in the encoding name");
315 return mp_end(ctx->pool, p);
/*
 * Arrange for the current source to be decoded from charset `cs`.
 * Two mechanisms are visible:
 *  - conv_set_charset() fills a local conv_context, and the source is
 *    switched to xml_refill_libcharset() with conv.in_to_x as its table;
 *  - the source fastbuf is wrapped by fb_wrap_charconv_in() converting to
 *    UTF-8, with the original fastbuf kept in wrapped_fb.
 * NOTE(review): which mechanism is compiled is decided on lines omitted from
 * this listing (likely preprocessor conditionals) -- confirm.
 * NOTE(review): `conv` is a local object; storing conv.in_to_x is safe only
 * if that member points to a table that outlives this call -- confirm
 * against the struct conv_context definition.
 */
319 xml_init_charconv(struct xml_context *ctx, int cs)
322 struct xml_source *src = ctx->src;
323 TRACE(ctx, "wrapping charset %s", charset_name(cs));
325 struct conv_context conv;
326 conv_set_charset(&conv, cs, CONV_CHARSET_UTF8);
327 src->refill = xml_refill_libcharset;
328 src->refill_in_to_x = conv.in_to_x;
330 src->wrapped_fb = src->fb;
331 src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8);
/*
 * Parse the optional XMLDecl (document entity) or TextDecl (external entity)
 * at the start of the current source and select the input decoder.
 *
 * Visible steps:
 *  - clear XML_SRC_EXPECTED_DECL and note whether the source is the document;
 *  - exclude XML_CHAR_GT and the new-line categories from refill_cat1 while
 *    the declaration is being read;
 *  - choose the refill callback from the caller-supplied encoding
 *    (src->fb_encoding) or from the first byte peeked from the stream;
 *  - look ahead for "<?xml" followed by white space, then parse the version,
 *    encoding and (document only) standalone pseudo-attributes up to "?>";
 *  - switch to the declared encoding or report a mismatch with the expected
 *    one, and finally reset refill_cat1/refill_cat2.
 *
 * NOTE(review): this listing omits a number of lines (braces, some
 * conditions and `else` keywords), so the exact nesting must be confirmed
 * against the complete file.
 */
336 xml_parse_decl(struct xml_context *ctx)
338 TRACE(ctx, "xml_parse_decl");
339 struct xml_source *src = ctx->src;
340 ctx->flags &= ~XML_SRC_EXPECTED_DECL;
341 uns doc = ctx->flags & XML_SRC_DOCUMENT;
343 /* Setup valid Unicode ranges and force the reader to abort refill() after each '>', where we can switch encoding or XML version */
346 src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line & ~XML_CHAR_GT;
347 src->refill_cat2 = ctx->cat_new_line;
349 /* Initialize the supplied charset (if any) or try to guess it */
350 char *expected_encoding = src->expected_encoding ? : src->fb_encoding;
351 src->refill = xml_refill_utf8;
352 int bom = bpeekc(src->fb);
354 ctx->flags |= XML_SRC_EOF;
355 if (!src->fb_encoding)
358 src->refill = xml_refill_utf16_be;
359 else if (bom == 0xff)
360 src->refill = xml_refill_utf16_le;
364 int cs = find_charset_by_name(src->fb_encoding);
365 if (cs == CONV_CHARSET_UTF8)
369 xml_init_charconv(ctx, cs);
/*
 * strcasecmp() returns 0 on a match, so the three name tests below are
 * negated.  Without the negation every unrecognized name was handled as
 * UTF-16 and the "Unknown encoding" report could never be reached.
 */
372 else if (!strcasecmp(src->fb_encoding, "UTF-16"))
374 src->refill = xml_refill_utf16_be;
376 src->refill = xml_refill_utf16_le;
377 if (!src->expected_encoding)
378 expected_encoding = (bom == 0xff) ? "UTF-16LE" : "UTF-16BE";
380 else if (!strcasecmp(src->fb_encoding, "UTF-16BE"))
381 src->refill = xml_refill_utf16_be;
382 else if (!strcasecmp(src->fb_encoding, "UTF-16LE"))
383 src->refill = xml_refill_utf16_le;
386 xml_error(ctx, "Unknown encoding '%s'", src->fb_encoding);
387 expected_encoding = NULL;
390 uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be;
391 if (bom > 0 && xml_peek_char(ctx) == 0xfeff)
394 xml_error(ctx, "Missing or corrupted BOM");
396 /* Look ahead for presence of XMLDecl or optional TextDecl */
397 if (!(ctx->flags & XML_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf))
399 u32 *bptr = ctx->bptr;
/* The buffer holds (character, category) pairs: even indices are characters, bptr[11] is the category of the sixth one. */
400 uns have_decl = (12 <= ctx->bstop - ctx->bptr && (bptr[11] & XML_CHAR_WHITE) &&
401 bptr[0] == '<' && bptr[2] == '?' && (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L');
405 xml_fatal(ctx, "Missing or corrupted XML header");
/*
 * Compare the local expected_encoding that was just checked for NULL;
 * src->expected_encoding may be NULL when the value came from
 * src->fb_encoding, and passing NULL to strcasecmp() is undefined.
 */
406 else if (expected_encoding && strcasecmp(expected_encoding, "UTF-8") && !utf16)
407 xml_error(ctx, "Missing or corrupted entity header");
410 ctx->bptr = bptr + 12;
411 xml_parse_white(ctx, 0);
413 /* Parse version string (mandatory in XMLDecl, optional in TextDecl) */
414 if (xml_peek_char(ctx) == 'v')
416 xml_parse_seq(ctx, "version");
418 char *version = xml_parse_pubid_literal(ctx, ctx->pool);
419 TRACE(ctx, "version=%s", version);
421 if (!strcmp(version, "1.1"))
423 else if (strcmp(version, "1.0"))
425 xml_error(ctx, "Unknown XML version string '%s'", version);
430 ctx->version_str = version;
433 else if (v > (ctx->flags & XML_VERSION_1_1))
434 xml_error(ctx, "XML 1.1 external entity included from XML 1.0 document");
435 if (!xml_parse_white(ctx, !doc))
440 xml_error(ctx, "Expected XML version");
441 ctx->version_str = "1.0";
444 /* Parse encoding string (optional in XMLDecl, mandatory in TextDecl) */
445 if (xml_peek_char(ctx) == 'e')
447 xml_parse_seq(ctx, "encoding");
449 src->decl_encoding = xml_parse_encoding_name(ctx);
450 TRACE(ctx, "encoding=%s", src->decl_encoding);
451 if (!xml_parse_white(ctx, 0))
455 xml_error(ctx, "Expected XML encoding");
457 /* Parse whether the document is standalone (optional in XMLDecl) */
458 if (doc && xml_peek_char(ctx) == 's')
460 xml_parse_seq(ctx, "standalone");
462 uns c = xml_parse_quote(ctx);
/* Intentional assignment: standalone is set from the peeked character and then tested. */
463 if (ctx->standalone = (xml_peek_char(ctx) == 'y'))
464 xml_parse_seq(ctx, "yes");
466 xml_parse_seq(ctx, "no");
467 xml_parse_char(ctx, c);
468 TRACE(ctx, "standalone=%d", ctx->standalone);
469 xml_parse_white(ctx, 0);
472 xml_parse_seq(ctx, "?>");
474 /* Switch to the final encoding */
475 if (src->decl_encoding)
477 int cs = find_charset_by_name(src->decl_encoding);
478 if (cs < 0 && !expected_encoding)
479 xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding);
480 else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8)
481 xml_init_charconv(ctx, cs);
482 else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 ||
483 !(!strcasecmp(src->decl_encoding, "UTF-16") ||
484 (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) ||
485 (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE")))))
486 xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding);
490 /* Update valid Unicode ranges */
493 src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line;
494 src->refill_cat2 = ctx->cat_new_line;