see xml-test.c and xml.h for the details.
DIRS+=sherlock/xml
PROGS+=$(o)/sherlock/xml/xml-test
-LIBSHXML_MODS=common parse dtd
+LIBSHXML_MODS=common source parse dtd
LIBSHXML_INCLUDES=xml.h dtd.h
LIBSHXML_MOD_PATHS=$(addprefix $(o)/sherlock/xml/,$(LIBSHXML_MODS))
* of the GNU Lesser General Public License.
*/
-#define LOCAL_DEBUG
+#undef LOCAL_DEBUG
-#include "lib/lib.h"
-#include "lib/mempool.h"
-#include "lib/fastbuf.h"
-#include "lib/ff-unicode.h"
-#include "lib/ff-binary.h"
-#include "lib/chartype.h"
-#include "lib/unicode.h"
-#include "lib/hashfunc.h"
-#include "lib/stkstring.h"
-#include "lib/unaligned.h"
-#include "charset/charconv.h"
-#include "charset/fb-charconv.h"
+#include "sherlock/sherlock.h"
#include "sherlock/xml/xml.h"
#include "sherlock/xml/dtd.h"
#include "sherlock/xml/common.h"
+#include "lib/stkstring.h"
+#include "lib/ff-unicode.h"
#include <setjmp.h>
va_start(args, format);
ctx->err_msg = mp_vprintf(ctx->stack, format, args);
ctx->err_code = XML_ERR_FATAL;
- ctx->state = XML_STATE_FATAL;
+ ctx->state = XML_STATE_EOF;
va_end(args);
if (ctx->h_fatal)
ctx->h_fatal(ctx);
xml_throw(ctx);
}
-/*** Charecter categorization ***/
-
-#include "obj/sherlock/xml/unicat.c"
-
/*** Memory management ***/
-void NONRET
-xml_fatal_nested(struct xml_context *ctx)
-{
- xml_fatal(ctx, "Entity not nested correctly");
-}
-
void *
xml_hash_new(struct mempool *pool, uns size)
{
return tab + XML_HASH_HDR_SIZE;
}
-/*** Reading of document/external entities ***/
-
-static void NONRET
-xml_eof(struct xml_context *ctx)
-{
- ctx->err_msg = "Unexpected EOF";
- ctx->err_code = XML_ERR_EOF;
- xml_throw(ctx);
-}
-
-static inline void
-xml_add_char(u32 **bstop, uns c)
-{
- *(*bstop)++ = c;
- *(*bstop)++ = xml_char_cat(c);
-}
-
-struct xml_source *
-xml_push_source(struct xml_context *ctx, uns flags)
-{
- xml_push(ctx);
- struct xml_source *src = ctx->src;
- if (src)
- {
- src->bptr = ctx->bptr;
- src->bstop = ctx->bstop;
- }
- src = mp_alloc_zero(ctx->stack, sizeof(*src));
- src->next = ctx->src;
- src->saved_depth = ctx->depth;
- ctx->src = src;
- ctx->flags = (ctx->flags & ~(XML_FLAG_SRC_EOF | XML_FLAG_SRC_EXPECTED_DECL | XML_FLAG_SRC_NEW_LINE | XML_FLAG_SRC_SURROUND | XML_FLAG_SRC_DOCUMENT)) | flags;
- ctx->bstop = ctx->bptr = src->buf;
- ctx->depth = 0;
- if (flags & XML_FLAG_SRC_SURROUND)
- xml_add_char(&ctx->bstop, 0x20);
- return src;
-}
-
-static void
-xml_pop_source(struct xml_context *ctx)
-{
- TRACE(ctx, "pop_source");
- if (unlikely(ctx->depth != 0))
- xml_fatal_nested(ctx);
- struct xml_source *src = ctx->src;
- ASSERT(src);
- bclose(src->fb);
- ctx->depth = src->saved_depth;
- ctx->src = src = src->next;
- if (src)
- {
- ctx->bptr = src->bptr;
- ctx->bstop = src->bstop;
- }
- xml_pop(ctx);
- if (unlikely(!src))
- xml_eof(ctx);
-}
-
-static void xml_refill_utf8(struct xml_context *ctx);
-
-void
-xml_push_entity(struct xml_context *ctx, struct xml_dtd_ent *ent)
-{
- TRACE(ctx, "xml_push_entity");
- uns cat1 = ctx->src->refill_cat1;
- uns cat2 = ctx->src->refill_cat2;
- struct xml_source *src = xml_push_source(ctx, 0);
- src->refill_cat1 = cat1;
- src->refill_cat2 = cat2;
- if (ent->flags & XML_DTD_ENT_EXTERNAL)
- xml_fatal(ctx, "External entities not implemented"); // FIXME
- else
- {
- fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, ent->len, 0);
- src->refill = xml_refill_utf8;
- }
-}
-
-void
-xml_set_source(struct xml_context *ctx, struct fastbuf *fb)
-{
- TRACE(ctx, "xml_set_source");
- ASSERT(!ctx->src);
- struct xml_source *src = xml_push_source(ctx, XML_FLAG_SRC_DOCUMENT | XML_FLAG_SRC_EXPECTED_DECL);
- src->fb = fb;
-}
-
-static uns
-xml_error_restricted(struct xml_context *ctx, uns c)
-{
- if (c == ~1U)
- xml_error(ctx, "Corrupted encoding");
- else
- xml_error(ctx, "Restricted char U+%04X", c);
- return UNI_REPLACEMENT;
-}
-
-void xml_parse_decl(struct xml_context *ctx);
-
-#define REFILL(ctx, func, params...) \
- struct xml_source *src = ctx->src; \
- struct fastbuf *fb = src->fb; \
- if (ctx->bptr == ctx->bstop) \
- ctx->bptr = ctx->bstop = src->buf; \
- uns f = ctx->flags, c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \
- u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \
- *last_0xd = (f & XML_FLAG_SRC_NEW_LINE) ? bstop : bend; \
- do \
- { \
- c = func(fb, ##params); \
- uns t = xml_char_cat(c); \
- if (t & t1) \
- /* Typical branch */ \
- *bstop++ = c, *bstop++ = t; \
- else if (t & t2) \
- { \
- /* New line */ \
- /* XML 1.0: 0xA | 0xD | 0xD 0xA */ \
- /* XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ \
- if (c == 0xd) \
- last_0xd = bstop + 2; \
- else if (c != 0x2028 && last_0xd == bstop) \
- { \
- last_0xd = bend; \
- continue; \
- } \
- xml_add_char(&bstop, 0xa), row++; \
- } \
- else if (c == '>') \
- { \
- /* Used only in XML/TextDecl to switch the encoding */ \
- *bstop++ = c, *bstop++ = t; \
- break; \
- } \
- else if (~c) \
- /* Restricted character */ \
- xml_add_char(&bstop, xml_error_restricted(ctx, c)); \
- else \
- { \
- /* EOF */ \
- if (f & XML_FLAG_SRC_SURROUND) \
- xml_add_char(&bstop, 0x20); \
- f |= XML_FLAG_SRC_EOF; \
- break; \
- } \
- } \
- while (bstop < bend); \
- ctx->flags = (last_0xd == bstop) ? f | XML_FLAG_SRC_NEW_LINE : f & ~XML_FLAG_SRC_NEW_LINE; \
- ctx->bstop = bstop; \
- src->row = row;
-
-static void
-xml_refill_utf8(struct xml_context *ctx)
-{
- REFILL(ctx, bget_utf8_repl, ~1U);
-}
-
-static void
-xml_refill_utf16_le(struct xml_context *ctx)
-{
- REFILL(ctx, bget_utf16_le_repl, ~1U);
-}
-
-static void
-xml_refill_utf16_be(struct xml_context *ctx)
-{
- REFILL(ctx, bget_utf16_be_repl, ~1U);
-}
-
-#if 0
-static inline uns
-xml_refill_libcharset_bget(struct fastbuf *fb, unsigned short int *in_to_x)
-{
- // FIXME: slow
- int c;
- return (unlikely(c = bgetc(fb) < 0)) ? c : (int)conv_x_to_ucs(in_to_x[c]);
-}
-
static void
-xml_refill_libcharset(struct xml_context *ctx)
+xml_chars_spout(struct fastbuf *fb)
{
- unsigned short int *in_to_x = ctx->src->refill_in_to_x;
- REFILL(ctx, xml_refill_libcharset_bget, in_to_x);
-}
-#endif
-
-#undef REFILL
-
-void
-xml_refill(struct xml_context *ctx)
-{
- do
+ if (fb->bptr >= fb->bufend)
{
- if (ctx->flags & XML_FLAG_SRC_EOF)
- xml_pop_source(ctx);
- else if (ctx->flags & XML_FLAG_SRC_EXPECTED_DECL)
- xml_parse_decl(ctx);
+ struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb);
+ struct mempool *pool = ctx->pool;
+ if (fb->bufend != fb->buffer)
+ {
+ uns len = fb->bufend - fb->buffer;
+ TRACE(ctx, "grow_chars");
+ fb->buffer = mp_expand(pool);
+ fb->bufend = fb->buffer + mp_avail(pool);
+ fb->bstop = fb->buffer;
+ fb->bptr = fb->buffer + len;
+ }
else
{
- ctx->src->refill(ctx);
- TRACE(ctx, "refilled %u characters", (uns)((ctx->bstop - ctx->bptr) / 2));
+ TRACE(ctx, "push_chars");
+ struct xml_node *n = xml_push_dom(ctx);
+ n->type = XML_NODE_CHARS;
+ xml_start_chars(ctx);
}
}
- while (ctx->bptr == ctx->bstop);
-}
-
-uns
-xml_row(struct xml_context *ctx)
-{
- struct xml_source *src = ctx->src;
- if (!src)
- return 0;
- uns row = src->row;
- for (u32 *p = ctx->bstop; p != ctx->bptr; p -= 2)
- if (p[-1] & src->refill_cat2)
- row--;
- return row + 1;
}
-/*** Basic parsing ***/
-
-void NONRET
-xml_fatal_expected(struct xml_context *ctx, uns c)
-{
- xml_fatal(ctx, "Expected '%c'", c);
-}
-
-void NONRET
-xml_fatal_expected_white(struct xml_context *ctx)
-{
- xml_fatal(ctx, "Expected a white space");
-}
-
-void NONRET
-xml_fatal_expected_quot(struct xml_context *ctx)
-{
- xml_fatal(ctx, "Expected a quotation mark");
-}
-
-void
-xml_parse_eq(struct xml_context *ctx)
+static void
+xml_init_chars(struct xml_context *ctx)
{
- /* Eq ::= S? '=' S? */
- xml_parse_white(ctx, 0);
- xml_parse_char(ctx, '=');
- xml_parse_white(ctx, 0);
+ struct fastbuf *fb = &ctx->chars;
+ fb->name = "<xml-chars>";
+ fb->spout = xml_chars_spout;
+ fb->can_overwrite_buffer = 1;
+ fb->bptr = fb->bstop = fb->buffer = fb->bufend = NULL;
}
-/* Names and nmtokens */
-
-static char *
-xml_parse_string(struct xml_context *ctx, struct mempool *pool, uns first_cat, uns next_cat, char *err)
-{
- char *p = mp_start_noalign(pool, 1);
- if (unlikely(!(xml_peek_cat(ctx) & first_cat)))
- xml_fatal(ctx, "%s", err);
- do
- {
- p = mp_spread(pool, p, 5);
- p = utf8_32_put(p, xml_skip_char(ctx));
- }
- while (xml_peek_cat(ctx) & next_cat);
- *p++ = 0;
- return mp_end(pool, p);
-}
+/*** Initialization ***/
static void
-xml_skip_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err)
+xml_do_init(struct xml_context *ctx)
{
- if (unlikely(!(xml_get_cat(ctx) & first_cat)))
- xml_fatal(ctx, "%s", err);
- while (xml_peek_cat(ctx) & next_cat)
- xml_skip_char(ctx);
-}
-
-char *
-xml_parse_name(struct xml_context *ctx, struct mempool *pool)
-{
- /* Name ::= NameStartChar (NameChar)* */
- return xml_parse_string(ctx, pool,
- !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1,
- !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1,
- "Expected a name");
+ ctx->flags = XML_REPORT_ALL;
+ xml_init_chars(ctx);
+ xml_attrs_table_init(ctx);
}
void
-xml_skip_name(struct xml_context *ctx)
-{
- xml_skip_string(ctx,
- !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1,
- !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1,
- "Expected a name");
-}
-
-char *
-xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool)
-{
- /* Nmtoken ::= (NameChar)+ */
- uns cat = !(ctx->flags & XML_FLAG_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1;
- return xml_parse_string(ctx, pool, cat, cat, "Expected a nmtoken");
-}
-
-/* Simple literals */
-
-char *
-xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool)
+xml_init(struct xml_context *ctx)
{
- /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */
- char *p = mp_start_noalign(pool, 1);
- uns q = xml_parse_quote(ctx), c;
- while ((c = xml_get_char(ctx)) != q)
- {
- p = mp_spread(pool, p, 5);
- p = utf8_32_put(p, c);
- }
- *p++ = 0;
- return mp_end(pool, p);
-}
-
-char *
-xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool)
-{
- /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */
- char *p = mp_start_noalign(pool, 1);
- uns q = xml_parse_quote(ctx), c;
- while ((c = xml_get_char(ctx)) != q)
- {
- if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID)))
- xml_fatal(ctx, "Expected a pubid character");
- p = mp_spread(pool, p, 2);
- *p++ = c;
- }
- *p++ = 0;
- return mp_end(pool, p);
-}
-
-static char *
-xml_parse_encoding_name(struct xml_context *ctx)
-{
- /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" */
- char *p = mp_start_noalign(ctx->pool, 1);
- uns q = xml_parse_quote(ctx);
- if (unlikely(!(xml_get_cat(ctx) & XML_CHAR_ENC_SNAME)))
- xml_fatal(ctx, "Invalid character in the encoding name");
- while (1)
- {
- p = mp_spread(ctx->pool, p, 2);
- *p++ = xml_last_char(ctx);
- if (xml_get_char(ctx) == q)
- break;
- if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME)))
- xml_fatal(ctx, "Invalid character in the encoding name");
- }
- *p++ = 0;
- return mp_end(ctx->pool, p);
-}
-
-/* Document/external entity header */
-
-static inline void
-xml_init_cats(struct xml_context *ctx, uns mask)
-{
- if (!(ctx->flags & XML_FLAG_VERSION_1_1))
- {
- ctx->src->refill_cat1 = XML_CHAR_VALID_1_0 & ~XML_CHAR_NEW_LINE_1_0 & ~mask;
- ctx->src->refill_cat2 = XML_CHAR_NEW_LINE_1_0;
- }
- else
- {
- ctx->src->refill_cat1 = XML_CHAR_UNRESTRICTED_1_1 & ~XML_CHAR_NEW_LINE_1_1 & ~mask;
- ctx->src->refill_cat2 = XML_CHAR_NEW_LINE_1_1;
- }
+ bzero(ctx, sizeof(*ctx));
+ ctx->pool = mp_new(65536);
+ ctx->stack = mp_new(65536);
+ xml_do_init(ctx);
+ TRACE(ctx, "init");
}
-static void
-xml_init_charconv(struct xml_context *ctx, int cs)
+void
+xml_cleanup(struct xml_context *ctx)
{
- // FIXME: hack
- struct xml_source *src = ctx->src;
- TRACE(ctx, "wrapping charset %s", charset_name(cs));
-#if 0
- struct conv_context conv;
- conv_set_charset(&conv, cs, CONV_CHARSET_UTF8);
- src->refill = xml_refill_libcharset;
- src->refill_in_to_x = conv.in_to_x;
-#else
- src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8);
- // FIXME: memory leak
-#endif
+ TRACE(ctx, "cleanup");
+ xml_attrs_table_cleanup(ctx);
+ xml_dtd_cleanup(ctx);
+ xml_sources_cleanup(ctx);
+ mp_delete(ctx->pool);
+ mp_delete(ctx->stack);
}
void
-xml_parse_decl(struct xml_context *ctx)
-{
- TRACE(ctx, "xml_parse_decl");
- struct xml_source *src = ctx->src;
- ctx->flags &= ~XML_FLAG_SRC_EXPECTED_DECL;
-
- /* Setup valid Unicode ranges and force the reader to abort refill() after each '>', where we can switch encoding or XML version */
- xml_init_cats(ctx, XML_CHAR_GT);
-
- /* Initialize the supplied charset (if any) or try to guess it */
- char *expected_encoding = src->expected_encoding ? : src->fb_encoding;
- src->refill = xml_refill_utf8;
- int bom = bpeekc(src->fb);
- if (bom < 0)
- ctx->flags |= XML_FLAG_SRC_EOF;
- if (!src->fb_encoding)
- {
- if (bom == 0xfe)
- src->refill = xml_refill_utf16_be;
- else if (bom == 0xff)
- src->refill = xml_refill_utf16_le;
- }
- else
- {
- int cs = find_charset_by_name(src->fb_encoding);
- if (cs == CONV_CHARSET_UTF8)
- {}
- else if (cs >= 0)
- {
- xml_init_charconv(ctx, cs);
- bom = 0;
- }
- else if (strcasecmp(src->fb_encoding, "UTF-16"))
- {
- src->refill = xml_refill_utf16_be;
- if (bom == 0xff)
- src->refill = xml_refill_utf16_le;
- if (!src->expected_encoding)
- expected_encoding = (bom == 0xff) ? "UTF-16LE" : "UTF-16BE";
- }
- else if (strcasecmp(src->fb_encoding, "UTF-16BE"))
- src->refill = xml_refill_utf16_be;
- else if (strcasecmp(src->fb_encoding, "UTF-16LE"))
- src->refill = xml_refill_utf16_le;
- else
- {
- xml_error(ctx, "Unknown encoding '%s'", src->fb_encoding);
- expected_encoding = NULL;
- }
- }
- uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be;
- if (bom > 0 && xml_peek_char(ctx) == 0xfeff)
- xml_skip_char(ctx);
- else if (utf16)
- xml_error(ctx, "Missing or corrupted BOM");
-
- /* Look ahead for presence of XMLDecl or optional TextDecl */
- if (!(ctx->flags & XML_FLAG_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf))
- xml_refill(ctx);
- uns doc = ctx->flags & XML_FLAG_SRC_DOCUMENT;
- u32 *bptr = ctx->bptr;
- uns have_decl = (12 <= ctx->bstop - ctx->bptr && (bptr[11] & XML_CHAR_WHITE) &&
- bptr[0] == '<' && bptr[2] == '?' && (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L');
- if (!have_decl)
- {
- if (doc)
- xml_fatal(ctx, "Missing or corrupted XML header");
- else if (expected_encoding && strcasecmp(src->expected_encoding, "UTF-8") && !utf16)
- xml_error(ctx, "Missing or corrupted entity header");
- goto exit;
- }
- ctx->bptr = bptr + 12;
- xml_parse_white(ctx, 0);
-
- /* Parse version string (mandatory in XMLDecl, optional in TextDecl) */
- if (xml_peek_char(ctx) == 'v')
- {
- xml_parse_seq(ctx, "version");
- xml_parse_eq(ctx);
- char *version = xml_parse_pubid_literal(ctx, ctx->pool);
- TRACE(ctx, "version=%s", version);
- uns v = 0;
- if (!strcmp(version, "1.1"))
- v = XML_FLAG_VERSION_1_1;
- else if (strcmp(version, "1.0"))
- {
- xml_error(ctx, "Unknown XML version string '%s'", version);
- version = "1.0";
- }
- if (doc)
- {
- ctx->version_str = version;
- ctx->flags |= v;
- }
- else if (v > (ctx->flags & XML_FLAG_VERSION_1_1))
- xml_error(ctx, "XML 1.1 external entity included from XML 1.0 document");
- if (!xml_parse_white(ctx, !doc))
- goto end;
- }
- else if (doc)
- {
- xml_error(ctx, "Expected XML version");
- ctx->version_str = "1.0";
- }
-
- /* Parse encoding string (optional in XMLDecl, mandatory in TextDecl) */
- if (xml_peek_char(ctx) == 'e')
- {
- xml_parse_seq(ctx, "encoding");
- xml_parse_eq(ctx);
- src->decl_encoding = xml_parse_encoding_name(ctx);
- TRACE(ctx, "encoding=%s", src->decl_encoding);
- if (!xml_parse_white(ctx, 0))
- goto end;
- }
- else if (!doc)
- xml_error(ctx, "Expected XML encoding");
-
- /* Parse whether the document is standalone (optional in XMLDecl) */
- if (doc && xml_peek_char(ctx) == 's')
- {
- xml_parse_seq(ctx, "standalone");
- xml_parse_eq(ctx);
- uns c = xml_parse_quote(ctx);
- if (ctx->standalone = (xml_peek_char(ctx) == 'y'))
- xml_parse_seq(ctx, "yes");
- else
- xml_parse_seq(ctx, "no");
- xml_parse_char(ctx, c);
- TRACE(ctx, "standalone=%d", ctx->standalone);
- xml_parse_white(ctx, 0);
- }
-end:
- xml_parse_seq(ctx, "?>");
-
- /* Switch to the final encoding */
- if (src->decl_encoding)
- {
- int cs = find_charset_by_name(src->decl_encoding);
- if (cs < 0 && !expected_encoding)
- xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding);
- else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8)
- xml_init_charconv(ctx, cs);
- else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 ||
- !(!strcasecmp(src->decl_encoding, "UTF-16") ||
- (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) ||
- (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE")))))
- xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding);
- }
-
-exit:
- /* Update valid Unicode ranges */
- xml_init_cats(ctx, 0);
+xml_reset(struct xml_context *ctx)
+{
+  /*
+   * Return the context to its just-initialized state while reusing its two
+   * memory pools: run the attribute-table, DTD and source cleanups, flush
+   * both pools, zero the whole structure and redo the common initialization.
+   */
+  TRACE(ctx, "reset");
+  struct mempool *pool = ctx->pool, *stack = ctx->stack;
+  xml_attrs_table_cleanup(ctx);
+  xml_dtd_cleanup(ctx);
+  xml_sources_cleanup(ctx);
+  mp_flush(pool);
+  mp_flush(stack);
+  bzero(ctx, sizeof(*ctx));
+  /* bzero() has just erased ctx->pool and ctx->stack and xml_do_init() does
+   * not set them; hand the flushed pools back so they are neither leaked nor
+   * left as NULL in the reset context. */
+  ctx->pool = pool;
+  ctx->stack = stack;
+  xml_do_init(ctx);
}
void NONRET xml_throw(struct xml_context *ctx);
void xml_warn(struct xml_context *ctx, const char *format, ...);
void xml_error(struct xml_context *ctx, const char *format, ...);
-void xml_fatal(struct xml_context *ctx, const char *format, ...);
-
-/*** Charecter categorization ***/
-
-#include "obj/sherlock/xml/unicat.h"
-
-static inline uns
-xml_char_cat(uns c)
-{
- if (c < 0x10000)
- return 1U << xml_char_tab1[(c & 0xff) + xml_char_tab2[c >> 8]];
- else if (likely(c < 0x110000))
- return 1U << xml_char_tab3[c >> 16];
- else
- return 1;
-}
-
-static inline uns
-xml_ascii_cat(uns c)
-{
- return xml_char_tab1[c];
-}
+void NONRET xml_fatal(struct xml_context *ctx, const char *format, ...);
/*** Memory management ***/
-void NONRET xml_fatal_nested(struct xml_context *ctx);
-
-static inline void
-xml_inc(struct xml_context *ctx)
-{
- /* Called after the first character of a block */
- TRACE(ctx, "inc");
- ctx->depth++;
-}
-
-static inline void
-xml_dec(struct xml_context *ctx)
-{
- /* Called after the last character of a block */
- TRACE(ctx, "dec");
- if (unlikely(!ctx->depth--))
- xml_fatal_nested(ctx);
-}
-
struct xml_stack {
struct xml_stack *next;
struct mempool_state state;
}
static inline void
-xml_pop_dom(struct xml_context *ctx)
+xml_pop_dom(struct xml_context *ctx, uns free)
{
/* Leave DOM subtree */
TRACE(ctx, "pop_dom");
ASSERT(ctx->node);
struct xml_node *p = ctx->node->parent;
struct xml_dom_stack *s = (void *)ctx->stack_list;
- if (ctx->flags & XML_DOM_FREE)
+ if (free)
{
/* See xml_pop_element() for cleanup of attribute hash table */
if (p)
/*** Reading of document/external entities ***/
+#define XML_BUF_SIZE 32 /* At least 8 -- hardcoded */
+
+/* State of one input being read (the fields mention XMLDecl/TextDecl, i.e. a document or an entity); instances are chained through `next`. */
+struct xml_source {
+ struct xml_source *next; /* Linked list of pending fastbufs (xml_context.sources) */
+ struct fastbuf *fb; /* Source fastbuf */
+ struct fastbuf *wrapped_fb; /* Original wrapped fastbuf (needed for cleanup) */
+ struct fastbuf wrap_fb; /* Fbmem wrapper */
+ u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories (2 * XML_BUF_SIZE u32 slots) */
+ u32 *bptr, *bstop; /* Current state of the buffer */
+ uns row; /* File position */
+ char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */
+ char *fb_encoding; /* Encoding of the source fastbuf */
+ char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */
+ uns refill_cat1; /* Character categories, which should be directly passed to the buffer */
+ uns refill_cat2; /* Character categories, which should be processed as newlines (possibly in some built-in sequences) */
+ void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */
+ unsigned short *refill_in_to_x; /* Libcharset input table */
+ uns saved_depth; /* Saved ctx->depth */
+};
+
+void NONRET xml_fatal_nested(struct xml_context *ctx);
+
+static inline void
+xml_inc(struct xml_context *ctx)
+{
+  /* Enter a nested block; to be called right after the block's first character. */
+  TRACE(ctx, "inc");
+  ctx->depth += 1;
+}
+
+static inline void
+xml_dec(struct xml_context *ctx)
+{
+  /* Leave a nested block; to be called right after the block's last character. */
+  TRACE(ctx, "dec");
+  /* The counter is decremented in either case; only a depth that was already
+   * zero falls through to the nesting error. */
+  if (likely(ctx->depth--))
+    return;
+  xml_fatal_nested(ctx);
+}
+
+#include "obj/sherlock/xml/unicat.h"
+
+static inline uns
+xml_char_cat(uns c)
+{
+  /* Translate a code point into a one-bit category mask taken from the unicat tables. */
+  if (c >= 0x10000)
+    {
+      /* Above the 16-bit range: xml_char_tab3 is indexed by c >> 16;
+       * values from 0x110000 up always yield mask 1. */
+      if (unlikely(c >= 0x110000))
+        return 1;
+      return 1U << xml_char_tab3[c >> 16];
+    }
+  /* 16-bit range: xml_char_tab2 selects the offset for the high byte,
+   * the low byte indexes within xml_char_tab1. */
+  const uns low = c & 0xff;
+  return 1U << xml_char_tab1[low + xml_char_tab2[c >> 8]];
+}
+
+/* Return the raw xml_char_tab1 entry for c. Unlike xml_char_cat(), the entry is
+ * not shifted into a bit mask and xml_char_tab2 is not consulted.
+ * NOTE(review): presumably intended only for small (ASCII) values of c -- confirm against callers. */
+static inline uns
+xml_ascii_cat(uns c)
+{
+ return xml_char_tab1[c];
+}
+
struct xml_source *xml_push_source(struct xml_context *ctx, uns flags);
void xml_push_entity(struct xml_context *ctx, struct xml_dtd_ent *ent);
return *(ctx->bptr -= 2);
}
-/*** Basic parsing ***/
+void xml_sources_cleanup(struct xml_context *ctx);
+
+/*** Parsing ***/
void NONRET xml_fatal_expected(struct xml_context *ctx, uns c);
void NONRET xml_fatal_expected_white(struct xml_context *ctx);
return c;
}
-/* Names and nmtokens */
-
char *xml_parse_name(struct xml_context *ctx, struct mempool *pool);
void xml_skip_name(struct xml_context *ctx);
char *xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool);
-/* Simple literals */
-
char *xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool);
char *xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool);
-/* Parsing */
-
uns xml_parse_char_ref(struct xml_context *ctx);
void xml_parse_ref(struct xml_context *ctx);
void xml_parse_pe_ref(struct xml_context *ctx);
+
char *xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr);
+
void xml_parse_notation_decl(struct xml_context *ctx);
void xml_parse_entity_decl(struct xml_context *ctx);
void xml_parse_element_decl(struct xml_context *ctx);
void xml_parse_attr_list_decl(struct xml_context *ctx);
+
void xml_push_comment(struct xml_context *ctx);
void xml_pop_comment(struct xml_context *ctx);
void xml_skip_comment(struct xml_context *ctx);
+
void xml_push_pi(struct xml_context *ctx);
void xml_pop_pi(struct xml_context *ctx);
void xml_skip_pi(struct xml_context *ctx);
+void xml_attrs_table_init(struct xml_context *ctx);
+void xml_attrs_table_cleanup(struct xml_context *ctx);
+
#endif
* of the GNU Lesser General Public License.
*/
-#define LOCAL_DEBUG
+#undef LOCAL_DEBUG
#include "sherlock/sherlock.h"
#include "sherlock/xml/xml.h"
#define HASH_KEY_STRING name
#define HASH_ZERO_FILL
#define HASH_TABLE_DYNAMIC
-#define HASH_WANT_FIND
#define HASH_WANT_LOOKUP
#define HASH_GIVE_ALLOC
#define HASH_TABLE_ALLOC
#include "lib/hashtable.h"
static struct xml_dtd_ent *
-xml_dtd_declare_trivial_gent(struct xml_context *ctx, char *name, char *text)
+xml_dtd_declare_trivial_ent(struct xml_context *ctx, char *name, char *text)
{
struct xml_dtd *dtd = ctx->dtd;
- struct xml_dtd_ent *ent = xml_dtd_ents_lookup(dtd->tab_gents, name);
+ struct xml_dtd_ent *ent = xml_dtd_ents_lookup(dtd->tab_ents, name);
if (ent->flags & XML_DTD_ENT_DECLARED)
{
xml_warn(ctx, "Entity &%s; already declared", name);
return NULL;
}
- slist_add_tail(&dtd->gents, &ent->n);
+ slist_add_tail(&dtd->ents, &ent->n);
ent->flags = XML_DTD_ENT_DECLARED | XML_DTD_ENT_TRIVIAL;
ent->text = text;
ent->len = strlen(text);
}
static void
-xml_dtd_declare_default_gents(struct xml_context *ctx)
+xml_dtd_declare_default_ents(struct xml_context *ctx)
{
- xml_dtd_declare_trivial_gent(ctx, "lt", "<");
- xml_dtd_declare_trivial_gent(ctx, "gt", ">");
- xml_dtd_declare_trivial_gent(ctx, "amp", "&");
- xml_dtd_declare_trivial_gent(ctx, "apos", "'");
- xml_dtd_declare_trivial_gent(ctx, "quot", "\"");
+ xml_dtd_declare_trivial_ent(ctx, "lt", "<");
+ xml_dtd_declare_trivial_ent(ctx, "gt", ">");
+ xml_dtd_declare_trivial_ent(ctx, "amp", "&");
+ xml_dtd_declare_trivial_ent(ctx, "apos", "'");
+ xml_dtd_declare_trivial_ent(ctx, "quot", "\"");
}
struct xml_dtd_ent *
-xml_dtd_find_gent(struct xml_context *ctx, char *name)
+xml_dtd_find_ent(struct xml_context *ctx, char *name)
{
struct xml_dtd *dtd = ctx->dtd;
if (dtd)
{
- struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_gents, name);
+ struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_ents, name);
return !ent ? NULL : (ent->flags & XML_DTD_ENT_DECLARED) ? ent : NULL;
}
else
#define HASH_KEY_STRING name
#define HASH_TABLE_DYNAMIC
#define HASH_ZERO_FILL
+#define HASH_WANT_FIND
#define HASH_WANT_LOOKUP
#define HASH_GIVE_ALLOC
#define HASH_TABLE_ALLOC
XML_HASH_GIVE_ALLOC
#include "lib/hashtable.h"
+struct xml_dtd_elem *
+xml_dtd_find_elem(struct xml_context *ctx, char *name)
+{
+  /* Look an element up by name in the DTD's element table; NULL when the context has no DTD. */
+  if (!ctx->dtd)
+    return NULL;
+  return xml_dtd_elems_find(ctx->dtd->tab_elems, name);
+}
+
/* Element sons */
struct xml_dtd_enodes_table;
XML_HASH_GIVE_ALLOC
#include "lib/hashtable.h"
+struct xml_dtd_attr *
+xml_dtd_find_attr(struct xml_context *ctx, struct xml_dtd_elem *elem, char *name)
+{
+  /* Look an attribute up by (element, name) in the DTD's attribute table; NULL when the context has no DTD. */
+  if (!ctx->dtd)
+    return NULL;
+  return xml_dtd_attrs_find(ctx->dtd->tab_attrs, elem, name);
+}
+
/* Enumerated attribute values */
struct xml_dtd_evals_table;
struct mempool *pool = mp_new(4096);
struct xml_dtd *dtd = ctx->dtd = mp_alloc_zero(pool, sizeof(*ctx->dtd));
dtd->pool = pool;
- xml_dtd_ents_init(dtd->tab_gents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table)));
+ xml_dtd_ents_init(dtd->tab_ents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table)));
xml_dtd_ents_init(dtd->tab_pents = xml_hash_new(pool, sizeof(struct xml_dtd_ents_table)));
xml_dtd_notns_init(dtd->tab_notns = xml_hash_new(pool, sizeof(struct xml_dtd_notns_table)));
xml_dtd_elems_init(dtd->tab_elems = xml_hash_new(pool, sizeof(struct xml_dtd_elems_table)));
xml_dtd_attrs_init(dtd->tab_attrs = xml_hash_new(pool, sizeof(struct xml_dtd_attrs_table)));
xml_dtd_evals_init(dtd->tab_evals = xml_hash_new(pool, sizeof(struct xml_dtd_evals_table)));
xml_dtd_enotns_init(dtd->tab_enotns = xml_hash_new(pool, sizeof(struct xml_dtd_enotns_table)));
- xml_dtd_declare_default_gents(ctx);
+ xml_dtd_declare_default_ents(ctx);
}
void
else
xml_unget_char(ctx);
- struct xml_dtd_ent *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_gents, xml_parse_name(ctx, dtd->pool));
- slist *list = flags ? &dtd->pents : &dtd->gents;
+ struct xml_dtd_ent *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_ents, xml_parse_name(ctx, dtd->pool));
+ slist *list = flags ? &dtd->pents : &dtd->ents;
xml_parse_dtd_white(ctx, 1);
if (ent->flags & XML_DTD_ENT_DECLARED)
{
struct xml_dtd {
struct mempool *pool; /* Memory pool where to allocate DTD */
- slist gents; /* Link list of general entities */
+ slist ents; /* Link list of general entities */
slist pents; /* Link list of parapeter entities */
slist notns; /* Link list of notations */
slist elems; /* Link list of elements */
- void *tab_gents; /* Hash table of general entities */
+ void *tab_ents; /* Hash table of general entities */
void *tab_pents; /* Hash table of parameter entities */
void *tab_notns; /* Hash table of notations */
void *tab_elems; /* Hash table of elements */
void *tab_enotns; /* hash table of enumerated attribute notations */
};
+/* Pair of identifier strings.
+ * NOTE(review): presumably the SYSTEM and PUBLIC literals of an XML ExternalID -- no user is visible in this chunk, confirm. */
+struct xml_ext_id {
+ char *system_id;
+ char *public_id;
+};
+
/* Notations */
enum xml_dtd_notn_flags {
struct xml_dtd_notn *notn; /* Notation (XML_DTD_ENT_UNPARSED only) */
};
-struct xml_dtd_ent *xml_dtd_find_gent(struct xml_context *ctx, char *name);
+struct xml_dtd_ent *xml_dtd_find_ent(struct xml_context *ctx, char *name);
/* Elements */
XML_DTD_ELEM_OCCUR_PLUS,
};
+struct xml_dtd_elem *xml_dtd_find_elem(struct xml_context *ctx, char *name);
+
/* Attributes */
enum xml_dtd_attribute_default {
void xml_dtd_cleanup(struct xml_context *ctx);
void xml_dtd_finish(struct xml_context *ctx);
+struct xml_dtd_attr *xml_dtd_find_attr(struct xml_context *ctx, struct xml_dtd_elem *elem, char *name);
+
#endif
* of the GNU Lesser General Public License.
*/
-#define LOCAL_DEBUG
+#undef LOCAL_DEBUG
#include "sherlock/sherlock.h"
#include "sherlock/xml/xml.h"
#include <setjmp.h>
+/*** Basic parsing ***/
+
+void NONRET
+xml_fatal_expected(struct xml_context *ctx, uns c)
+{
+  /*
+   * Raise a fatal error naming the character `c` that was required.
+   * Printable ASCII (0x20..0x7e) is quoted literally; everything else,
+   * including DEL (0x7f), is reported as a U+ code point so that no control
+   * character ends up in the error message.
+   */
+  if (c >= 0x20 && c < 0x7f)
+    xml_fatal(ctx, "Expected '%c'", c);
+  else
+    xml_fatal(ctx, "Expected U+%04x", c);
+}
+
+/* Raise the fatal error "Expected a white space" through xml_fatal(); declared NONRET. */
+void NONRET
+xml_fatal_expected_white(struct xml_context *ctx)
+{
+ xml_fatal(ctx, "Expected a white space");
+}
+
+/* Raise the fatal error "Expected a quotation mark" through xml_fatal(); declared NONRET. */
+void NONRET
+xml_fatal_expected_quot(struct xml_context *ctx)
+{
+ xml_fatal(ctx, "Expected a quotation mark");
+}
+
+/* Parse the Eq production: an '=' sign with white space parsed on either side.
+ * NOTE(review): the 0 passed to xml_parse_white() presumably marks the white space as optional (S?) -- confirm. */
+void
+xml_parse_eq(struct xml_context *ctx)
+{
+ /* Eq ::= S? '=' S? */
+ xml_parse_white(ctx, 0);
+ xml_parse_char(ctx, '=');
+ xml_parse_white(ctx, 0);
+}
+
+/*** Names and nmtokens ***/
+
+/*
+ * Read a run of characters into a NUL-terminated UTF-8 string built in `pool`.
+ * The upcoming character must belong to `first_cat`, otherwise xml_fatal() is
+ * raised with the message `err`; characters are then taken for as long as the
+ * upcoming one belongs to `next_cat`. Returns the result of mp_end().
+ */
+static char *
+xml_parse_string(struct xml_context *ctx, struct mempool *pool, uns first_cat, uns next_cat, char *err)
+{
+ /* The growing buffer is opened before the check below. */
+ char *p = mp_start_noalign(pool, 1);
+ if (unlikely(!(xml_peek_cat(ctx) & first_cat)))
+ xml_fatal(ctx, "%s", err);
+ do
+ {
+ /* NOTE(review): the 5 presumably reserves room for one UTF-8 sequence plus the final NUL -- confirm mp_spread() semantics. */
+ p = mp_spread(pool, p, 5);
+ p = utf8_32_put(p, xml_skip_char(ctx));
+ }
+ while (xml_peek_cat(ctx) & next_cat);
+ *p++ = 0;
+ return mp_end(pool, p);
+}
+
+static void
+xml_skip_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err)
+{
+  /* Same shape as xml_parse_string(), but nothing is stored: the first
+   * character is fetched and checked against first_cat, then characters are
+   * skipped while the upcoming one is in next_cat. */
+  const uns matched = xml_get_cat(ctx) & first_cat;
+  if (unlikely(!matched))
+    xml_fatal(ctx, "%s", err);
+  for (; xml_peek_cat(ctx) & next_cat; )
+    xml_skip_char(ctx);
+}
+
+char *
+xml_parse_name(struct xml_context *ctx, struct mempool *pool)
+{
+  /* Name ::= NameStartChar (NameChar)* */
+  /* The category sets depend on whether the XML_VERSION_1_1 flag is set. */
+  uns start_cat, rest_cat;
+  if (ctx->flags & XML_VERSION_1_1)
+    {
+      start_cat = XML_CHAR_SNAME_1_1;
+      rest_cat = XML_CHAR_NAME_1_1;
+    }
+  else
+    {
+      start_cat = XML_CHAR_SNAME_1_0;
+      rest_cat = XML_CHAR_NAME_1_0;
+    }
+  return xml_parse_string(ctx, pool, start_cat, rest_cat, "Expected a name");
+}
+
+void
+xml_skip_name(struct xml_context *ctx)
+{
+  /* Skip over a Name without storing it; category sets are chosen by the XML_VERSION_1_1 flag. */
+  uns start_cat, rest_cat;
+  if (ctx->flags & XML_VERSION_1_1)
+    {
+      start_cat = XML_CHAR_SNAME_1_1;
+      rest_cat = XML_CHAR_NAME_1_1;
+    }
+  else
+    {
+      start_cat = XML_CHAR_SNAME_1_0;
+      rest_cat = XML_CHAR_NAME_1_0;
+    }
+  xml_skip_string(ctx, start_cat, rest_cat, "Expected a name");
+}
+
+char *
+xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool)
+{
+  /* Nmtoken ::= (NameChar)+ */
+  /* Every position, including the first, uses the same NameChar category set. */
+  uns name_cat;
+  if (ctx->flags & XML_VERSION_1_1)
+    name_cat = XML_CHAR_NAME_1_1;
+  else
+    name_cat = XML_CHAR_NAME_1_0;
+  return xml_parse_string(ctx, pool, name_cat, name_cat, "Expected a nmtoken");
+}
+
+/*** Simple literals ***/
+
+char *
+xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool)
+{
+  /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */
+  char *dst = mp_start_noalign(pool, 1);
+  const uns quote = xml_parse_quote(ctx);
+  for (;;)
+    {
+      /* Copy characters as UTF-8 until the opening quote character comes again. */
+      uns ch = xml_get_char(ctx);
+      if (ch == quote)
+        break;
+      dst = mp_spread(pool, dst, 5);
+      dst = utf8_32_put(dst, ch);
+    }
+  *dst++ = 0;
+  return mp_end(pool, dst);
+}
+
+char *
+xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool)
+{
+  /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */
+  char *dst = mp_start_noalign(pool, 1);
+  const uns quote = xml_parse_quote(ctx);
+  for (;;)
+    {
+      uns ch = xml_get_char(ctx);
+      if (ch == quote)
+        break;
+      /* Anything but a PubidChar inside the quotes is fatal. */
+      if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID)))
+        xml_fatal(ctx, "Expected a pubid character");
+      /* Each accepted character is stored as a single byte. */
+      dst = mp_spread(pool, dst, 2);
+      *dst++ = ch;
+    }
+  *dst++ = 0;
+  return mp_end(pool, dst);
+}
+
/*** Comments ***/
void
*p = 0;
n->len = p - (char *)mp_ptr(ctx->pool);
n->text = mp_end(ctx->pool, p + 1);
- if (ctx->h_comment)
+ if ((ctx->flags & XML_REPORT_COMMENTS) && ctx->h_comment)
ctx->h_comment(ctx);
}
void
xml_pop_comment(struct xml_context *ctx)
{
- xml_pop_dom(ctx);
+ xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_COMMENTS));
xml_dec(ctx);
TRACE(ctx, "pop_comment");
}
*p = 0;
n->len = p - (char *)mp_ptr(ctx->pool);
n->text = mp_end(ctx->pool, p + 1);
- if (ctx->h_pi)
+ if ((ctx->flags & XML_REPORT_PIS) && ctx->h_pi)
ctx->h_pi(ctx);
}
void
xml_pop_pi(struct xml_context *ctx)
{
- xml_pop_dom(ctx);
+ xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_PIS));
xml_dec(ctx);
TRACE(ctx, "pop_pi");
}
xml_skip_pi(struct xml_context *ctx)
{
TRACE(ctx, "skip_pi");
- if (ctx->flags & XML_FLAG_VALIDATING)
+ if (ctx->flags & XML_VALIDATING)
{
struct mempool_state state;
mp_save(ctx->stack, &state);
/*** Character data ***/
-static void
-xml_chars_spout(struct fastbuf *fb)
-{
- if (fb->bptr >= fb->bufend)
- {
- struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb);
- struct mempool *pool = ctx->pool;
- if (fb->bufend != fb->buffer)
- {
- uns len = fb->bufend - fb->buffer;
- TRACE(ctx, "grow_chars");
- fb->buffer = mp_expand(pool);
- fb->bufend = fb->buffer + mp_avail(pool);
- fb->bstop = fb->buffer;
- fb->bptr = fb->buffer + len;
- }
- else
- {
- TRACE(ctx, "push_chars");
- struct xml_node *n = xml_push_dom(ctx);
- n->type = XML_NODE_CDATA;
- xml_start_chars(ctx);
- }
- }
-}
-
-static void
-xml_init_chars(struct xml_context *ctx)
-{
- struct fastbuf *fb = &ctx->chars;
- fb->name = "<xml-chars>";
- fb->spout = xml_chars_spout;
- fb->can_overwrite_buffer = 1;
- fb->bptr = fb->bstop = fb->buffer = fb->bufend = NULL;
-}
-
static inline uns
xml_flush_chars(struct xml_context *ctx)
{
struct xml_node *n = ctx->node;
n->text = xml_end_chars(ctx, &n->len);
n->len = fb->bufend - fb->buffer;
- if (ctx->h_chars)
+ if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_chars)
ctx->h_chars(ctx);
return 1;
}
static inline void
xml_pop_chars(struct xml_context *ctx)
{
- xml_pop_dom(ctx);
+ xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS));
TRACE(ctx, "pop_chars");
}
* Already parsed: '<![' */
xml_parse_seq(ctx, "CDATA[");
struct xml_node *n = xml_push_dom(ctx);
- n->type = XML_NODE_CDATA;
+ n->type = XML_NODE_CHARS;
char *p = mp_start_noalign(ctx->pool, 7);
while (1)
{
*p = 0;
n->len = p - (char *)mp_ptr(ctx->pool);
n->text = mp_end(ctx->pool, p + 1);
- if (ctx->h_cdata)
+ if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_cdata)
ctx->h_cdata(ctx);
}
static void
xml_pop_cdata(struct xml_context *ctx)
{
- xml_pop_dom(ctx);
+ xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS));
xml_dec(ctx);
TRACE(ctx, "pop_cdata");
}
while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT));
}
uns cat = xml_char_cat(v);
- if (!(cat & XML_CHAR_UNRESTRICTED_1_1) && ((ctx->flags & XML_FLAG_VERSION_1_1) || !(cat & XML_CHAR_VALID_1_0)))
+ if (!(cat & XML_CHAR_UNRESTRICTED_1_1) && ((ctx->flags & XML_VERSION_1_1) || !(cat & XML_CHAR_VALID_1_0)))
{
xml_error(ctx, "Character reference out of range");
goto recover;
mp_save(ctx->stack, &state);
char *name = xml_parse_name(ctx, ctx->stack);
xml_parse_char(ctx, ';');
- struct xml_dtd_ent *ent = xml_dtd_find_gent(ctx, name);
+ struct xml_dtd_ent *ent = xml_dtd_find_ent(ctx, name);
if (!ent)
{
xml_error(ctx, "Unknown entity &%s;", name);
a->val = v;
}
+/* Look up the attribute `name` of `node` in the context's attribute table.
+ * Returns whatever xml_attrs_find() yields for that (node, name) key
+ * (presumably NULL when there is no such attribute -- confirm against the
+ * hash table template). */
+struct xml_attr *
+xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name)
+{
+ return xml_attrs_find(ctx->tab_attrs, node, name);
+}
+
+/* Allocate the attribute hash table from ctx->pool, remember it in
+ * ctx->tab_attrs and initialize it. */
+void
+xml_attrs_table_init(struct xml_context *ctx)
+{
+  ctx->tab_attrs = xml_hash_new(ctx->pool, sizeof(struct xml_attrs_table));
+  xml_attrs_init(ctx->tab_attrs);
+}
+
+/* Release the attribute hash table stored in ctx->tab_attrs by handing it
+ * to xml_attrs_cleanup(). */
+void
+xml_attrs_table_cleanup(struct xml_context *ctx)
+{
+ xml_attrs_cleanup(ctx->tab_attrs);
+}
+
/*** Elements ***/
static void
if (!e->parent)
{
ctx->root = e;
- if (ctx->document_type && strcmp(e->name, ctx->document_type))
- xml_error(ctx, "The root element %s does not match the document type %s", e->name, ctx->document_type);
+ if (ctx->doctype && strcmp(e->name, ctx->doctype))
+ xml_error(ctx, "The root element %s does not match the document type %s", e->name, ctx->doctype);
}
while (1)
{
if (c == '/')
{
xml_parse_char(ctx, '>');
- ctx->flags |= XML_FLAG_EMPTY_ELEM;
+ ctx->flags |= XML_EMPTY_ELEM_TAG;
break;
}
else if (c == '>')
xml_unget_char(ctx);
xml_parse_attr(ctx);
}
- if (ctx->h_element_start)
- ctx->h_element_start(ctx);
+ if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_stag)
+ ctx->h_stag(ctx);
}
static void
xml_pop_element(struct xml_context *ctx)
{
TRACE(ctx, "pop_element");
- if (ctx->h_element_end)
- ctx->h_element_end(ctx);
+ if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_etag)
+ ctx->h_etag(ctx);
struct xml_node *e = ctx->node;
- if (ctx->flags & XML_DOM_FREE)
+ uns free = !(ctx->flags & XML_ALLOC_TAGS);
+ if (free)
{
if (!e->parent)
ctx->root = NULL;
clist_remove(&n->n);
}
}
- xml_pop_dom(ctx);
+ xml_pop_dom(ctx, free);
xml_dec(ctx);
}
/* doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
* Already parsed: '<!'
* Terminated before '[' or '>' */
- if (ctx->document_type)
+ if (ctx->doctype)
xml_fatal(ctx, "Multiple document types not allowed");
xml_parse_seq(ctx, "DOCTYPE");
xml_parse_white(ctx, 1);
- ctx->document_type = xml_parse_name(ctx, ctx->pool);
- TRACE(ctx, "doctyype=%s", ctx->document_type);
+ ctx->doctype = xml_parse_name(ctx, ctx->pool);
+ TRACE(ctx, "doctype=%s", ctx->doctype);
uns c;
if (xml_parse_white(ctx, 0) && ((c = xml_peek_char(ctx)) == 'S' || c == 'P'))
{
{
xml_parse_seq(ctx, "SYSTEM");
xml_parse_white(ctx, 1);
- ctx->eid.system_id = xml_parse_system_literal(ctx, ctx->pool);
+ ctx->system_id = xml_parse_system_literal(ctx, ctx->pool);
}
else
{
xml_parse_seq(ctx, "PUBLIC");
xml_parse_white(ctx, 1);
- ctx->eid.public_id = xml_parse_pubid_literal(ctx, ctx->pool);
+ ctx->public_id = xml_parse_pubid_literal(ctx, ctx->pool);
xml_parse_white(ctx, 1);
- ctx->eid.system_id = xml_parse_system_literal(ctx, ctx->pool);
+ ctx->system_id = xml_parse_system_literal(ctx, ctx->pool);
}
xml_parse_white(ctx, 0);
- ctx->flags |= XML_FLAG_HAS_EXTERNAL_SUBSET;
+ ctx->flags |= XML_HAS_EXTERNAL_SUBSET;
}
if (xml_peek_char(ctx) == '[')
- ctx->flags |= XML_FLAG_HAS_INTERNAL_SUBSET;
+ ctx->flags |= XML_HAS_INTERNAL_SUBSET;
if (ctx->h_doctype_decl)
ctx->h_doctype_decl(ctx);
}
xml_fatal(ctx, "Invalid markup in the internal subset");
}
+/*** The State Machine ***/
-/*----------------------------------------------*/
-
-void
-xml_init(struct xml_context *ctx)
-{
- bzero(ctx, sizeof(*ctx));
- ctx->pool = mp_new(65536);
- ctx->stack = mp_new(65536);
- ctx->flags = XML_DOM_FREE;
- xml_init_chars(ctx);
- xml_dtd_init(ctx);
- xml_attrs_init(ctx->tab_attrs = xml_hash_new(ctx->pool, sizeof(struct xml_attrs_table)));
-}
-
-void
-xml_cleanup(struct xml_context *ctx)
-{
- xml_attrs_cleanup(ctx->tab_attrs);
- xml_dtd_cleanup(ctx);
- mp_delete(ctx->pool);
- mp_delete(ctx->stack);
-}
-
-int
+uns
xml_next(struct xml_context *ctx)
{
/* A nasty state machine */
+#define PULL(x) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##x; case XML_STATE_##x: ; } while (0)
+#define PULL_STATE(x, s) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##s, XML_STATE_##x; case XML_STATE_##s: ; } while (0)
+
TRACE(ctx, "xml_next (state=%u)", ctx->state);
jmp_buf throw_buf;
ctx->throw_buf = &throw_buf;
error:
if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal)
ctx->h_fatal(ctx);
- ctx->state = XML_STATE_FATAL;
TRACE(ctx, "raised fatal error");
- return -1;
+ return ctx->state = XML_STATE_EOF;
}
uns c;
switch (ctx->state)
{
- case XML_STATE_FATAL:
- return -1;
-
case XML_STATE_START:
TRACE(ctx, "entering prolog");
if (ctx->h_document_start)
xml_refill(ctx);
if (ctx->h_xml_decl)
ctx->h_xml_decl(ctx);
- if (ctx->want & XML_WANT_DECL)
- return ctx->state = XML_STATE_DECL;
- case XML_STATE_DECL:
+ PULL(XML_DECL);
/* Misc* (doctypedecl Misc*)? */
while (1)
xml_parse_char(ctx, '<');
if ((c = xml_get_char(ctx)) == '?')
/* Processing intruction */
- if (!(ctx->want & XML_WANT_PI))
+ if (!(ctx->flags & XML_REPORT_PIS))
xml_skip_pi(ctx);
else
{
xml_push_pi(ctx);
- ctx->state = XML_STATE_PROLOG_PI;
- return XML_STATE_PI;
- case XML_STATE_PROLOG_PI:
+ PULL_STATE(PI, PROLOG_PI);
xml_pop_pi(ctx);
}
else if (c != '!')
goto first_tag;
}
else if (xml_get_char(ctx) == '-')
- if (!(ctx->want & XML_WANT_COMMENT))
+ if (!(ctx->flags & XML_REPORT_COMMENTS))
xml_skip_comment(ctx);
else
{
xml_push_comment(ctx);
- ctx->state = XML_STATE_PROLOG_COMMENT;
- return XML_STATE_COMMENT;
- case XML_STATE_PROLOG_COMMENT:
+ PULL_STATE(COMMENT, PROLOG_COMMENT);
xml_pop_comment(ctx);
}
else
/* DocTypeDecl */
xml_unget_char(ctx);
xml_parse_doctype_decl(ctx);
- if (ctx->want & XML_WANT_DOCUMENT_TYPE)
- return ctx->state = XML_STATE_DOCUMENT_TYPE;
- case XML_STATE_DOCUMENT_TYPE:
+ PULL(DOCTYPE_DECL);
if (xml_peek_char(ctx) == '[')
{
+ // FIXME: ability to skip the subset
xml_skip_char(ctx);
xml_inc(ctx);
+ xml_dtd_init(ctx);
+ if (ctx->h_dtd_start)
+ ctx->h_dtd_start(ctx);
xml_parse_internal_subset(ctx);
+ // FIXME: external subset
+ if (ctx->h_dtd_end)
+ ctx->h_dtd_end(ctx);
xml_parse_white(ctx, 0);
}
xml_parse_char(ctx, '>');
if ((c = xml_get_char(ctx)) == '?')
{
/* PI */
- if (!(ctx->want & XML_WANT_PI))
+ if (!(ctx->flags & (XML_REPORT_PIS | XML_ALLOC_PIS)))
xml_skip_pi(ctx);
else
{
if (xml_flush_chars(ctx))
{
- if (ctx->want & XML_WANT_CHARS)
- {
- ctx->state = XML_STATE_CHARS_BEFORE_PI;
- return XML_STATE_CHARS;
- }
- case XML_STATE_CHARS_BEFORE_PI:
+ PULL_STATE(CHARS, CHARS_BEFORE_PI);
xml_pop_chars(ctx);
}
xml_push_pi(ctx);
- return ctx->state = XML_STATE_PI;
- case XML_STATE_PI:
+ PULL(PI);
xml_pop_pi(ctx);
}
}
if ((c = xml_get_char(ctx)) == '-')
{
/* Comment */
- if (!(ctx->want & XML_WANT_COMMENT))
+ if (!(ctx->flags & (XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS)))
xml_skip_comment(ctx);
else
{
if (xml_flush_chars(ctx))
{
- if (ctx->want & XML_WANT_CHARS)
- {
- ctx->state = XML_STATE_CHARS_BEFORE_COMMENT;
- return XML_STATE_CHARS;
- }
- case XML_STATE_CHARS_BEFORE_COMMENT:
+ PULL_STATE(CHARS, CHARS_BEFORE_COMMENT);
xml_pop_chars(ctx);
}
xml_push_comment(ctx);
- return ctx->state = XML_STATE_COMMENT;
- case XML_STATE_COMMENT:
+ PULL(COMMENT);
xml_pop_comment(ctx);
}
}
else if (c == '[')
{
/* CDATA */
- if (!(ctx->want & XML_WANT_CDATA))
+ if (!(ctx->flags & XML_UNFOLD_CDATA))
xml_append_cdata(ctx);
else
{
if (xml_flush_chars(ctx))
{
- if (ctx->want & XML_WANT_CHARS)
- {
- ctx->state = XML_STATE_CHARS_BEFORE_CDATA;
- return XML_STATE_CHARS;
- }
- case XML_STATE_CHARS_BEFORE_CDATA:
+ PULL_STATE(CHARS, CHARS_BEFORE_CDATA);
xml_pop_chars(ctx);
}
xml_push_cdata(ctx);
- return ctx->state = XML_STATE_CDATA;
- case XML_STATE_CDATA:
+ PULL(CDATA);
xml_pop_cdata(ctx);
}
}
xml_unget_char(ctx);
if (xml_flush_chars(ctx))
{
- if (ctx->want & XML_WANT_CHARS)
- {
- ctx->state = XML_STATE_CHARS_BEFORE_STAG;
- return XML_STATE_CHARS;
- }
- case XML_STATE_CHARS_BEFORE_STAG:
+ PULL_STATE(CHARS, CHARS_BEFORE_STAG);
xml_pop_chars(ctx);
}
xml_push_element(ctx);
- if (ctx->want & XML_WANT_STAG)
- return ctx->state = XML_STATE_STAG;
- case XML_STATE_STAG:
- if (ctx->flags & XML_FLAG_EMPTY_ELEM)
+ PULL(STAG);
+ if (ctx->flags & XML_EMPTY_ELEM_TAG)
goto pop_element;
}
/* ETag */
if (xml_flush_chars(ctx))
{
- if (ctx->want & XML_WANT_CHARS)
- {
- ctx->state = XML_STATE_CHARS_BEFORE_ETAG;
- return XML_STATE_CHARS;
- }
- case XML_STATE_CHARS_BEFORE_ETAG:
+ PULL_STATE(CHARS, CHARS_BEFORE_ETAG);
xml_pop_chars(ctx);
}
xml_parse_etag(ctx);
pop_element:
- if (ctx->want & XML_WANT_ETAG)
- return ctx->state = XML_STATE_ETAG;
- case XML_STATE_ETAG:
+ PULL(ETAG);
xml_pop_element(ctx);
if (!ctx->node)
goto epilog;
if (ctx->h_document_end)
ctx->h_document_end(ctx);
case XML_STATE_EOF:
+ ctx->err_code = 0;
+ ctx->err_msg = NULL;
return XML_STATE_EOF;
}
else
/* Misc */
xml_parse_char(ctx, '<');
+ xml_inc(ctx);
if ((c = xml_get_char(ctx)) == '?')
/* Processing instruction */
- if (!(ctx->want & XML_WANT_PI))
+ if (!(ctx->flags & XML_REPORT_PIS))
xml_skip_pi(ctx);
else
{
xml_push_pi(ctx);
- return ctx->state = XML_STATE_EPILOG_PI, XML_STATE_PI;
- case XML_STATE_EPILOG_PI:
+ PULL_STATE(PI, EPILOG_PI);
xml_pop_pi(ctx);
}
else if (c == '!')
- /* Comment */
- if (!(ctx->want & XML_WANT_COMMENT))
- xml_skip_comment(ctx);
- else
- {
- xml_push_comment(ctx);
- return ctx->state = XML_STATE_EPILOG_COMMENT, XML_STATE_COMMENT;
- case XML_STATE_EPILOG_COMMENT:
- xml_pop_comment(ctx);
- }
+ {
+ xml_parse_char(ctx, '-');
+ /* Comment */
+ if (!(ctx->flags & XML_REPORT_COMMENTS))
+ xml_skip_comment(ctx);
+ else
+ {
+ xml_push_comment(ctx);
+ PULL_STATE(COMMENT, EPILOG_COMMENT);
+ xml_pop_comment(ctx);
+ }
+ }
else
xml_fatal(ctx, "Syntax error in the epilog");
}
}
- return -1;
+ ASSERT(0);
+}
+
+/* Parse the whole input in a single call: the pull mask is cleared so that
+ * xml_next() is not asked to stop at any intermediate state, xml_next() is
+ * run once and the context's error code is returned to the caller. */
+uns
+xml_parse(struct xml_context *ctx)
+{
+ ctx->pull = 0;
+ xml_next(ctx);
+ return ctx->err_code;
 }
--- /dev/null
+/*
+ * Sherlock Library -- A simple XML parser
+ *
+ * (c) 2007 Pavel Charvat <pchar@ucw.cz>
+ *
+ * This software may be freely distributed and used according to the terms
+ * of the GNU Lesser General Public License.
+ */
+
+#undef LOCAL_DEBUG
+
+#include "sherlock/sherlock.h"
+#include "sherlock/xml/xml.h"
+#include "sherlock/xml/dtd.h"
+#include "sherlock/xml/common.h"
+#include "lib/unicode.h"
+#include "lib/ff-unicode.h"
+#include "charset/charconv.h"
+#include "charset/fb-charconv.h"
+
+/*** Character categorization ***/
+
+#include "obj/sherlock/xml/unicat.c"
+
+/* Load the per-version character category masks into the context.
+ * XML_VERSION_1_1 in ctx->flags selects the XML 1.1 masks; otherwise the
+ * XML 1.0 masks are used. */
+static void
+xml_init_cats(struct xml_context *ctx)
+{
+  if (ctx->flags & XML_VERSION_1_1)
+    {
+      ctx->cat_chars = XML_CHAR_VALID_1_1;
+      ctx->cat_unrestricted = XML_CHAR_UNRESTRICTED_1_1;
+      ctx->cat_new_line = XML_CHAR_NEW_LINE_1_1;
+      ctx->cat_name = XML_CHAR_NAME_1_1;
+      ctx->cat_sname = XML_CHAR_SNAME_1_1;
+    }
+  else
+    {
+      /* XML 1.0 has no separate "unrestricted" class; every valid char qualifies */
+      ctx->cat_chars = XML_CHAR_VALID_1_0;
+      ctx->cat_unrestricted = XML_CHAR_VALID_1_0;
+      ctx->cat_new_line = XML_CHAR_NEW_LINE_1_0;
+      ctx->cat_name = XML_CHAR_NAME_1_0;
+      ctx->cat_sname = XML_CHAR_SNAME_1_0;
+    }
+}
+
+/*** Reading of document/external entities ***/
+
+static void NONRET
+xml_eof(struct xml_context *ctx)
+{
+ ctx->err_msg = "Unexpected EOF";
+ ctx->err_code = XML_ERR_EOF;
+ xml_throw(ctx);
+}
+
+/* Raise the fatal error reported when an entity is not properly nested. */
+void NONRET
+xml_fatal_nested(struct xml_context *ctx)
+{
+ xml_fatal(ctx, "Entity is not nested correctly");
+}
+
+/* Append one character to the input buffer. Each buffer entry is a pair of
+ * u32 values: the character followed by its category from xml_char_cat().
+ * *bstop is advanced past the new pair. */
+static inline void
+xml_add_char(u32 **bstop, uns c)
+{
+  u32 *slot = *bstop;
+  slot[0] = c;
+  slot[1] = xml_char_cat(c);
+  *bstop = slot + 2;
+}
+
+/* Start reading from a new input source stacked on top of the current one.
+ * The buffer pointers of the current source (if any) are saved in it, a
+ * zeroed source is allocated from ctx->stack and made current, the
+ * per-source XML_SRC_* flags in ctx->flags are replaced by `flags` and the
+ * nesting depth restarts from zero. With XML_SRC_SURROUND a leading space
+ * is placed into the new buffer. Returns the new source. */
+struct xml_source *
+xml_push_source(struct xml_context *ctx, uns flags)
+{
+  xml_push(ctx);
+
+  /* Park the read position of the enclosing source */
+  struct xml_source *parent = ctx->src;
+  if (parent)
+    {
+      parent->bptr = ctx->bptr;
+      parent->bstop = ctx->bstop;
+    }
+
+  struct xml_source *src = mp_alloc_zero(ctx->stack, sizeof(*src));
+  src->next = parent;
+  src->saved_depth = ctx->depth;
+  ctx->src = src;
+
+  const uns src_flags = XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_NEW_LINE | XML_SRC_SURROUND | XML_SRC_DOCUMENT;
+  ctx->flags = (ctx->flags & ~src_flags) | flags;
+  ctx->bptr = src->buf;
+  ctx->bstop = src->buf;
+  ctx->depth = 0;
+  if (flags & XML_SRC_SURROUND)
+    xml_add_char(&ctx->bstop, 0x20);
+  return src;
+}
+
+/* Close the fastbuf of a source and, when the source was wrapped by
+ * another fastbuf (wrapped_fb is set), close the wrapped one as well. */
+static void
+xml_close_source(struct xml_source *src)
+{
+ bclose(src->fb);
+ if (src->wrapped_fb)
+ bclose(src->wrapped_fb);
+}
+
+/* Finish the current source and return to the enclosing one. A non-zero
+ * nesting depth at this point is a fatal error. The source is closed, the
+ * saved depth and the enclosing source's buffer pointers are restored, and
+ * when no enclosing source remains, xml_eof() is raised. */
+static void
+xml_pop_source(struct xml_context *ctx)
+{
+  TRACE(ctx, "pop_source");
+  if (unlikely(ctx->depth != 0))
+    xml_fatal(ctx, "Unexpected end of entity");
+
+  struct xml_source *done = ctx->src;
+  ASSERT(done);
+  xml_close_source(done);
+  ctx->depth = done->saved_depth;
+
+  struct xml_source *outer = done->next;
+  ctx->src = outer;
+  if (outer)
+    {
+      ctx->bptr = outer->bptr;
+      ctx->bstop = outer->bstop;
+    }
+  xml_pop(ctx);
+  if (unlikely(!outer))
+    xml_eof(ctx);
+}
+
+/* Close every source still linked from ctx->src, unlinking each one from
+ * the context before it is closed. */
+void
+xml_sources_cleanup(struct xml_context *ctx)
+{
+  for (struct xml_source *s = ctx->src; s; s = ctx->src)
+    {
+      ctx->src = s->next;
+      xml_close_source(s);
+    }
+}
+
+static void xml_refill_utf8(struct xml_context *ctx);
+
+/* Push a new source that reads the replacement text of an entity.
+ * The new source inherits the refill categories of the current one.
+ * Internal entities are read from ent->text (ent->len bytes) through a
+ * memory fastbuf decoded as UTF-8; external entities raise a fatal error
+ * because they are not implemented yet. */
+void
+xml_push_entity(struct xml_context *ctx, struct xml_dtd_ent *ent)
+{
+  TRACE(ctx, "xml_push_entity");
+  /* Fetch the categories first, xml_push_source() replaces ctx->src */
+  struct xml_source *parent = ctx->src;
+  uns cat1 = parent->refill_cat1;
+  uns cat2 = parent->refill_cat2;
+  struct xml_source *src = xml_push_source(ctx, 0);
+  src->refill_cat1 = cat1;
+  src->refill_cat2 = cat2;
+  if (!(ent->flags & XML_DTD_ENT_EXTERNAL))
+    {
+      src->fb = &src->wrap_fb;
+      fbbuf_init_read(src->fb, ent->text, ent->len, 0);
+      src->refill = xml_refill_utf8;
+    }
+  else
+    xml_fatal(ctx, "External entities not implemented"); // FIXME
+}
+
+/* Attach `fb` as the document entity. No source may be active yet
+ * (asserted). The new source is flagged as the document and as still
+ * expecting its XML declaration, and the parser state is reset to
+ * XML_STATE_START.
+ * NOTE(review): the fastbuf ends up being closed by xml_close_source(), so
+ * the parser appears to take ownership of it -- confirm with callers. */
+void
+xml_set_source(struct xml_context *ctx, struct fastbuf *fb)
+{
+ TRACE(ctx, "xml_set_source");
+ ASSERT(!ctx->src);
+ struct xml_source *src = xml_push_source(ctx, XML_SRC_DOCUMENT | XML_SRC_EXPECTED_DECL);
+ src->fb = fb;
+ ctx->state = XML_STATE_START;
+}
+
+/* Report an input character the reader may not accept and return the
+ * replacement to store instead (UNI_REPLACEMENT). The value ~1U is the
+ * marker the decoders are asked to return for undecodable input, so it is
+ * reported as a corrupted encoding rather than as a restricted character. */
+static uns
+xml_error_restricted(struct xml_context *ctx, uns c)
+{
+  if (c != ~1U)
+    xml_error(ctx, "Restricted char U+%04X", c);
+  else
+    xml_error(ctx, "Corrupted encoding");
+  return UNI_REPLACEMENT;
+}
+
+void xml_parse_decl(struct xml_context *ctx);
+
+/*
+ * REFILL(ctx, func, params...) -- body of a refill function.
+ *
+ * Reads characters with func(fb, params...) and appends them to src->buf as
+ * (character, category) pairs until the buffer is full, a '>' that is not
+ * in refill_cat1 has been stored, or the reader returns ~0U (end of input).
+ *
+ *  - Characters in refill_cat1 are stored as they are.
+ *  - Characters in refill_cat2 are line breaks: they are stored as 0xa and
+ *    src->row is incremented, except that a line break other than 0xd and
+ *    0x2028 directly following a 0xd is dropped (it completes the pair).
+ *    XML_SRC_NEW_LINE in ctx->flags carries "the last stored character came
+ *    from a 0xd" from one refill to the next.
+ *  - Any other character except '>' and ~0U is reported through
+ *    xml_error_restricted() and replaced by its return value.
+ *  - At the end of input a trailing space is appended when XML_SRC_SURROUND
+ *    is set, and XML_SRC_EOF is raised in ctx->flags.
+ *
+ * The macro declares locals (src, fb, f, c, ...), so it can be expanded
+ * only once per function body.
+ */
+#define REFILL(ctx, func, params...) \
+ struct xml_source *src = ctx->src; \
+ struct fastbuf *fb = src->fb; \
+ if (ctx->bptr == ctx->bstop) \
+ ctx->bptr = ctx->bstop = src->buf; \
+ uns f = ctx->flags, c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \
+ u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \
+ *last_0xd = (f & XML_SRC_NEW_LINE) ? bstop : bend; \
+ do \
+ { \
+ c = func(fb, ##params); \
+ uns t = xml_char_cat(c); \
+ if (t & t1) \
+ /* Typical branch */ \
+ *bstop++ = c, *bstop++ = t; \
+ else if (t & t2) \
+ { \
+ /* New line */ \
+ /* XML 1.0: 0xA | 0xD | 0xD 0xA */ \
+ /* XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ \
+ if (c == 0xd) \
+ last_0xd = bstop + 2; \
+ else if (c != 0x2028 && last_0xd == bstop) \
+ { \
+ last_0xd = bend; \
+ continue; \
+ } \
+ xml_add_char(&bstop, 0xa), row++; \
+ } \
+ else if (c == '>') \
+ { \
+ /* Used only in XML/TextDecl to switch the encoding */ \
+ *bstop++ = c, *bstop++ = t; \
+ break; \
+ } \
+ else if (~c) \
+ /* Restricted character */ \
+ xml_add_char(&bstop, xml_error_restricted(ctx, c)); \
+ else \
+ { \
+ /* EOF */ \
+ if (f & XML_SRC_SURROUND) \
+ xml_add_char(&bstop, 0x20); \
+ f |= XML_SRC_EOF; \
+ break; \
+ } \
+ } \
+ while (bstop < bend); \
+ ctx->flags = (last_0xd == bstop) ? f | XML_SRC_NEW_LINE : f & ~XML_SRC_NEW_LINE; \
+ ctx->bstop = bstop; \
+ src->row = row;
+
+/* Refill the input buffer from a UTF-8 stream; undecodable input is
+ * returned by the reader as ~1U and reported as a corrupted encoding. */
+static void
+xml_refill_utf8(struct xml_context *ctx)
+{
+ REFILL(ctx, bget_utf8_repl, ~1U);
+}
+
+/* Refill the input buffer from a little-endian UTF-16 stream; undecodable
+ * input is returned by the reader as ~1U. */
+static void
+xml_refill_utf16_le(struct xml_context *ctx)
+{
+ REFILL(ctx, bget_utf16_le_repl, ~1U);
+}
+
+/* Refill the input buffer from a big-endian UTF-16 stream; undecodable
+ * input is returned by the reader as ~1U. */
+static void
+xml_refill_utf16_be(struct xml_context *ctx)
+{
+ REFILL(ctx, bget_utf16_be_repl, ~1U);
+}
+
+#if 0
+/* Disabled experiment: decode single-byte charsets directly through the
+ * libcharset tables instead of wrapping the fastbuf (see xml_init_charconv). */
+
+/* Read one byte and translate it to Unicode through the in_to_x table.
+ * A negative bgetc() result is passed through unchanged; converted to uns
+ * it is what REFILL treats as the end of input (assuming bgetc() signals
+ * EOF with -1). */
+static inline uns
+xml_refill_libcharset_bget(struct fastbuf *fb, unsigned short int *in_to_x)
+{
+  // FIXME: slow
+  /* The read must be complete before the comparison: `c = bgetc(fb) < 0`
+   * would store the result of the comparison (0 or 1) into c. */
+  int c = bgetc(fb);
+  return unlikely(c < 0) ? c : (int)conv_x_to_ucs(in_to_x[c]);
+}
+
+/* Refill the input buffer using the table selected for the current source. */
+static void
+xml_refill_libcharset(struct xml_context *ctx)
+{
+  unsigned short int *in_to_x = ctx->src->refill_in_to_x;
+  REFILL(ctx, xml_refill_libcharset_bget, in_to_x);
+}
+#endif
+
+#undef REFILL
+
+/* Make at least one character available in the input buffer. Depending on
+ * the source flags one step either pops an exhausted source, parses the
+ * pending XML/text declaration, or calls the source's refill function;
+ * steps are repeated until the buffer is non-empty. */
+void
+xml_refill(struct xml_context *ctx)
+{
+  for (;;)
+    {
+      if (ctx->flags & XML_SRC_EOF)
+        xml_pop_source(ctx);
+      else if (ctx->flags & XML_SRC_EXPECTED_DECL)
+        xml_parse_decl(ctx);
+      else
+        {
+          ctx->src->refill(ctx);
+          TRACE(ctx, "refilled %u characters", (uns)((ctx->bstop - ctx->bptr) / 2));
+        }
+      if (ctx->bptr != ctx->bstop)
+        return;
+    }
+}
+
+/* Return the 1-based row of the current read position in the active
+ * source, or 0 when there is no source. src->row counts the line breaks
+ * already placed into the buffer, so those still waiting between the read
+ * pointer and the end of the buffered data are subtracted again. */
+uns
+xml_row(struct xml_context *ctx)
+{
+  struct xml_source *src = ctx->src;
+  if (!src)
+    return 0;
+  uns unread_breaks = 0;
+  u32 *p = ctx->bstop;
+  while (p != ctx->bptr)
+    {
+      /* p[-1] is the category half of the preceding (char, category) pair */
+      if (p[-1] & src->refill_cat2)
+        unread_breaks++;
+      p -= 2;
+    }
+  return src->row - unread_breaks + 1;
+}
+
+/* Document/external entity header */
+
+/* Parse a quoted encoding name and return it as a zero-terminated string
+ * built in ctx->pool. The first character must be in XML_CHAR_ENC_SNAME and
+ * every following one in XML_CHAR_ENC_NAME; anything else is fatal. */
+static char *
+xml_parse_encoding_name(struct xml_context *ctx)
+{
+  /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" */
+  char *out = mp_start_noalign(ctx->pool, 1);
+  uns quote = xml_parse_quote(ctx);
+  if (unlikely(!(xml_get_cat(ctx) & XML_CHAR_ENC_SNAME)))
+    xml_fatal(ctx, "Invalid character in the encoding name");
+  for (;;)
+    {
+      /* Store the character that has just been validated */
+      out = mp_spread(ctx->pool, out, 2);
+      *out++ = xml_last_char(ctx);
+      if (xml_get_char(ctx) == quote)
+        break;
+      if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME)))
+        xml_fatal(ctx, "Invalid character in the encoding name");
+    }
+  *out++ = 0;
+  return mp_end(ctx->pool, out);
+}
+
+/* Make the current source deliver UTF-8 for input in charset `cs`.
+ * In the enabled branch the source's fastbuf is kept in wrapped_fb and
+ * src->fb is replaced by the fastbuf returned from fb_wrap_charconv_in()
+ * converting from `cs` to UTF-8. The disabled branch would instead switch
+ * the refill function to a table-driven reader. */
+static void
+xml_init_charconv(struct xml_context *ctx, int cs)
+{
+ // FIXME: hack
+ struct xml_source *src = ctx->src;
+ TRACE(ctx, "wrapping charset %s", charset_name(cs));
+#if 0
+ struct conv_context conv;
+ conv_set_charset(&conv, cs, CONV_CHARSET_UTF8);
+ src->refill = xml_refill_libcharset;
+ src->refill_in_to_x = conv.in_to_x;
+#else
+ src->wrapped_fb = src->fb;
+ src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8);
+#endif
+}
+
+/*
+ * Parse the header of the current source: the XMLDecl of the document
+ * entity or the optional TextDecl of another entity, and select the input
+ * decoder.
+ *
+ * Steps:
+ *  1. Set up the refill categories so that reading stops after each '>'
+ *     (the encoding or XML version may change right after the header).
+ *  2. Choose the initial decoder from src->fb_encoding, or guess UTF-16
+ *     from the first byte (0xfe / 0xff) when no encoding was supplied.
+ *  3. Skip a byte order mark and look ahead for "<?xml" + white space.
+ *  4. Parse version, encoding and (document only) standalone.
+ *  5. Switch to or verify the declared encoding and install the final
+ *     refill categories.
+ *
+ * A missing header is fatal for the document entity; most other problems
+ * are reported through xml_error() and parsing continues.
+ */
+void
+xml_parse_decl(struct xml_context *ctx)
+{
+  TRACE(ctx, "xml_parse_decl");
+  struct xml_source *src = ctx->src;
+  ctx->flags &= ~XML_SRC_EXPECTED_DECL;
+  uns doc = ctx->flags & XML_SRC_DOCUMENT;
+
+  /* Setup valid Unicode ranges and force the reader to abort refill() after each '>', where we can switch encoding or XML version */
+  if (doc)
+    xml_init_cats(ctx);
+  src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line & ~XML_CHAR_GT;
+  src->refill_cat2 = ctx->cat_new_line;
+
+  /* Initialize the supplied charset (if any) or try to guess it */
+  char *expected_encoding = src->expected_encoding ? : src->fb_encoding;
+  src->refill = xml_refill_utf8;
+  int bom = bpeekc(src->fb);
+  if (bom < 0)
+    ctx->flags |= XML_SRC_EOF;
+  if (!src->fb_encoding)
+    {
+      if (bom == 0xfe)
+        src->refill = xml_refill_utf16_be;
+      else if (bom == 0xff)
+        src->refill = xml_refill_utf16_le;
+    }
+  else
+    {
+      int cs = find_charset_by_name(src->fb_encoding);
+      if (cs == CONV_CHARSET_UTF8)
+        {}
+      else if (cs >= 0)
+        {
+          xml_init_charconv(ctx, cs);
+          bom = 0;
+        }
+      /* strcasecmp() returns 0 on a match, hence the negations below */
+      else if (!strcasecmp(src->fb_encoding, "UTF-16"))
+        {
+          /* Generic UTF-16: the byte order comes from the BOM, big-endian by default */
+          src->refill = xml_refill_utf16_be;
+          if (bom == 0xff)
+            src->refill = xml_refill_utf16_le;
+          if (!src->expected_encoding)
+            expected_encoding = (bom == 0xff) ? "UTF-16LE" : "UTF-16BE";
+        }
+      else if (!strcasecmp(src->fb_encoding, "UTF-16BE"))
+        src->refill = xml_refill_utf16_be;
+      else if (!strcasecmp(src->fb_encoding, "UTF-16LE"))
+        src->refill = xml_refill_utf16_le;
+      else
+        {
+          xml_error(ctx, "Unknown encoding '%s'", src->fb_encoding);
+          expected_encoding = NULL;
+        }
+    }
+  uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be;
+  if (bom > 0 && xml_peek_char(ctx) == 0xfeff)
+    xml_skip_char(ctx);
+  else if (utf16)
+    xml_error(ctx, "Missing or corrupted BOM");
+
+  /* Look ahead for presence of XMLDecl or optional TextDecl */
+  if (!(ctx->flags & XML_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf))
+    xml_refill(ctx);
+  u32 *bptr = ctx->bptr;
+  /* The buffer holds (char, category) pairs, so "<?xml" plus one white space takes 12 slots */
+  uns have_decl = (12 <= ctx->bstop - ctx->bptr && (bptr[11] & XML_CHAR_WHITE) &&
+    bptr[0] == '<' && bptr[2] == '?' && (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L');
+  if (!have_decl)
+    {
+      if (doc)
+        xml_fatal(ctx, "Missing or corrupted XML header");
+      /* Compare the local expectation: src->expected_encoding is NULL when it was derived from fb_encoding */
+      else if (expected_encoding && strcasecmp(expected_encoding, "UTF-8") && !utf16)
+        xml_error(ctx, "Missing or corrupted entity header");
+      goto exit;
+    }
+  ctx->bptr = bptr + 12;
+  xml_parse_white(ctx, 0);
+
+  /* Parse version string (mandatory in XMLDecl, optional in TextDecl) */
+  if (xml_peek_char(ctx) == 'v')
+    {
+      xml_parse_seq(ctx, "version");
+      xml_parse_eq(ctx);
+      char *version = xml_parse_pubid_literal(ctx, ctx->pool);
+      TRACE(ctx, "version=%s", version);
+      uns v = 0;
+      if (!strcmp(version, "1.1"))
+        v = XML_VERSION_1_1;
+      else if (strcmp(version, "1.0"))
+        {
+          xml_error(ctx, "Unknown XML version string '%s'", version);
+          version = "1.0";
+        }
+      if (doc)
+        {
+          ctx->version_str = version;
+          ctx->flags |= v;
+        }
+      else if (v > (ctx->flags & XML_VERSION_1_1))
+        xml_error(ctx, "XML 1.1 external entity included from XML 1.0 document");
+      if (!xml_parse_white(ctx, !doc))
+        goto end;
+    }
+  else if (doc)
+    {
+      xml_error(ctx, "Expected XML version");
+      ctx->version_str = "1.0";
+    }
+
+  /* Parse encoding string (optional in XMLDecl, mandatory in TextDecl) */
+  if (xml_peek_char(ctx) == 'e')
+    {
+      xml_parse_seq(ctx, "encoding");
+      xml_parse_eq(ctx);
+      src->decl_encoding = xml_parse_encoding_name(ctx);
+      TRACE(ctx, "encoding=%s", src->decl_encoding);
+      if (!xml_parse_white(ctx, 0))
+        goto end;
+    }
+  else if (!doc)
+    xml_error(ctx, "Expected XML encoding");
+
+  /* Parse whether the document is standalone (optional in XMLDecl) */
+  if (doc && xml_peek_char(ctx) == 's')
+    {
+      xml_parse_seq(ctx, "standalone");
+      xml_parse_eq(ctx);
+      uns c = xml_parse_quote(ctx);
+      ctx->standalone = (xml_peek_char(ctx) == 'y');
+      if (ctx->standalone)
+        xml_parse_seq(ctx, "yes");
+      else
+        xml_parse_seq(ctx, "no");
+      xml_parse_char(ctx, c);
+      TRACE(ctx, "standalone=%d", ctx->standalone);
+      xml_parse_white(ctx, 0);
+    }
+end:
+  xml_parse_seq(ctx, "?>");
+
+  /* Switch to the final encoding */
+  if (src->decl_encoding)
+    {
+      int cs = find_charset_by_name(src->decl_encoding);
+      if (cs < 0 && !expected_encoding)
+        xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding);
+      else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8)
+        xml_init_charconv(ctx, cs);
+      else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 ||
+        !(!strcasecmp(src->decl_encoding, "UTF-16") ||
+         (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) ||
+         (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE")))))
+        xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding);
+    }
+
+exit:
+  /* Update valid Unicode ranges */
+  if (doc)
+    xml_init_cats(ctx);
+  src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line;
+  src->refill_cat2 = ctx->cat_new_line;
+}
#include <stdio.h>
#include <stdlib.h>
-static char *shortopts = "sp" CF_SHORT_OPTS;
+/* Codes returned by the option parser for the long-only options below.
+ * They start at 0x100, above any single-character short option code. */
+enum {
+ WANT_FIRST = 0x100,
+ WANT_HIDE_ERRORS,
+ WANT_UNFOLD_CDATA,
+ WANT_IGNORE_COMMENTS,
+ WANT_IGNORE_PIS,
+};
+
+static char *shortopts = "spd" CF_SHORT_OPTS;
static struct option longopts[] = {
CF_LONG_OPTS
- { "sax", 0, 0, 's' },
- { "pull", 0, 0, 'p' },
- { "dom", 0, 0, 'd' },
- { NULL, 0, 0, 0 }
+ { "sax", 0, 0, 's' },
+ { "pull", 0, 0, 'p' },
+ { "dom", 0, 0, 'd' },
+ { "hide-errors", 0, 0, WANT_HIDE_ERRORS },
+ { "unfold-cdata", 0, 0, WANT_UNFOLD_CDATA },
+ { "ignore-comments", 0, 0, WANT_IGNORE_COMMENTS },
+ { "ignore-pis", 0, 0, WANT_IGNORE_PIS },
+ { NULL, 0, 0, 0 }
};
static void NONRET
usage(void)
{
fputs("\
-Usage: xml-test [options] < in.xml\n\
+Usage: xml-test [options] < input.xml\n\
\n\
Options:\n"
CF_USAGE
"\
--s, --pull Test PULL interface\n\
--s, --sax Test SAX interface\n\
--d, --dom Test DOM interface\n\
+-s, --pull Test PULL interface\n\
+-s, --sax Test SAX interface\n\
+-d, --dom Test DOM interface\n\
+ --hide-errors Hide warnings and error messages\n\
+ --unfold-cdata Unfold CDATA sections\n\
+ --ignore-comments Ignore processing instructions\n\
+ --ignore-pis Ignore comments\n\
\n", stderr);
exit(1);
}
static uns want_sax;
static uns want_pull;
static uns want_dom;
+static uns want_hide_errors;
+static uns want_unfold_cdata;
+static uns want_ignore_comments;
+static uns want_ignore_pis;
+
static struct fastbuf *out;
static char *
case XML_NODE_ELEM: return "element";
case XML_NODE_COMMENT: return "comment";
case XML_NODE_PI: return "pi";
- case XML_NODE_CDATA: return "chars";
+ case XML_NODE_CHARS: return "chars";
default: return "unknown";
}
}
{
case XML_NODE_ELEM:
bprintf(out, " <%s>", node->name);
- SLIST_FOR_EACH(struct xml_attr *, a, node->attrs)
+ XML_ATTR_FOR_EACH(a, node)
bprintf(out, " %s='%s'", a->name, a->val);
bputc(out, '\n');
break;
case XML_NODE_PI:
bprintf(out, " target=%s text='%s'\n", node->name, node->text);
break;
- case XML_NODE_CDATA:
+ case XML_NODE_CHARS:
bprintf(out, " text='%s'\n", node->text);
break;
default:
bputs(out, node_type(node));
show_node(node);
if (node->type == XML_NODE_ELEM)
- CLIST_FOR_EACH(struct xml_node *, son, node->sons)
+ XML_NODE_FOR_EACH(son, node)
show_tree(son, level + 1);
}
h_doctype_decl(struct xml_context *ctx)
{
bprintf(out, "SAX: doctype_decl type=%s public='%s' system='%s' extsub=%d intsub=%d\n",
- ctx->document_type, ctx->eid.public_id ? : "", ctx->eid.system_id ? : "",
- !!(ctx->flags & XML_FLAG_HAS_EXTERNAL_SUBSET), !!(ctx->flags & XML_FLAG_HAS_INTERNAL_SUBSET));
+ ctx->doctype, ctx->public_id ? : "", ctx->system_id ? : "",
+ !!(ctx->flags & XML_HAS_EXTERNAL_SUBSET), !!(ctx->flags & XML_HAS_INTERNAL_SUBSET));
}
static void
static void
h_pi(struct xml_context *ctx)
{
- bprintf(out, "SAX: pi");
+ bputs(out, "SAX: pi");
show_node(ctx->node);
}
static void
-h_element_start(struct xml_context *ctx)
+h_stag(struct xml_context *ctx)
{
- bprintf(out, "SAX: element_start");
+ bputs(out, "SAX: stag");
show_node(ctx->node);
}
static void
-h_element_end(struct xml_context *ctx)
+h_etag(struct xml_context *ctx)
{
- bprintf(out, "SAX: element_end </%s>\n", ctx->node->name);
+ bprintf(out, "SAX: etag </%s>\n", ctx->node->name);
}
static void
h_chars(struct xml_context *ctx)
{
- bprintf(out, "SAX: chars");
+ bputs(out, "SAX: chars");
+ show_node(ctx->node);
+}
+
+static void
+h_cdata(struct xml_context *ctx)
+{
+ bputs(out, "SAX: cdata");
show_node(ctx->node);
}
+/* SAX handler for the start of DTD processing: writes a marker line to the
+ * test output. The context is not used. */
+static void
+h_dtd_start(struct xml_context *ctx UNUSED)
+{
+ bputs(out, "SAX: dtd_start\n");
+}
+
+/* SAX handler for the end of DTD processing: writes a marker line to the
+ * test output. The context is not used. */
+static void
+h_dtd_end(struct xml_context *ctx UNUSED)
+{
+ bputs(out, "SAX: dtd_end\n");
+}
+
int
main(int argc, char **argv)
{
int opt;
- cf_def_file = NULL; // FIXME
+ cf_def_file = NULL;
log_init(argv[0]);
while ((opt = cf_getopt(argc, argv, shortopts, longopts, NULL)) >= 0)
switch (opt)
case 'd':
want_dom++;
break;
+ case WANT_HIDE_ERRORS:
+ want_hide_errors++;
+ break;
+ case WANT_UNFOLD_CDATA:
+ want_unfold_cdata++;
+ break;
+ case WANT_IGNORE_COMMENTS:
+ want_ignore_comments++;
+ break;
+ case WANT_IGNORE_PIS:
+ want_ignore_pis++;
+ break;
default:
usage();
}
out = bfdopen_shared(1, 4096);
struct xml_context ctx;
xml_init(&ctx);
- ctx.h_warn = ctx.h_error = ctx.h_fatal = h_error;
+ if (!want_hide_errors)
+ ctx.h_warn = ctx.h_error = ctx.h_fatal = h_error;
if (want_sax)
{
ctx.h_document_start = h_document_start;
ctx.h_doctype_decl = h_doctype_decl;
ctx.h_comment = h_comment;
ctx.h_pi = h_pi;
- ctx.h_element_start = h_element_start;
- ctx.h_element_end = h_element_end;
+ ctx.h_stag = h_stag;
+ ctx.h_etag = h_etag;
ctx.h_chars = h_chars;
+ ctx.h_cdata = h_cdata;
+ ctx.h_dtd_start = h_dtd_start;
+ ctx.h_dtd_end = h_dtd_end;
}
- if (want_pull)
- ctx.want = XML_WANT_CHARS | XML_WANT_STAG | XML_WANT_ETAG | XML_WANT_COMMENT | XML_WANT_PI;
if (want_dom)
- ctx.flags &= ~XML_DOM_FREE;
+ ctx.flags |= XML_ALLOC_ALL;
+ if (want_unfold_cdata)
+ ctx.flags |= XML_UNFOLD_CDATA;
+ if (want_ignore_comments)
+ ctx.flags &= ~(XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS);
+ if (want_ignore_pis)
+ ctx.flags &= ~(XML_REPORT_PIS | XML_ALLOC_PIS);
xml_set_source(&ctx, bfdopen_shared(0, 4096));
- int state;
- bprintf(out, "PULL: start\n");
- while ((state = xml_next(&ctx)) >= 0 && state != XML_STATE_EOF)
- switch (state)
- {
- case XML_STATE_CHARS:
- bprintf(out, "PULL: chars");
- show_node(ctx.node);
- break;
- case XML_STATE_STAG:
- bprintf(out, "PULL: element_start");
- show_node(ctx.node);
- break;
- case XML_STATE_ETAG:
- bprintf(out, "PULL: element_end </%s>\n", ctx.node->name);
- break;
- case XML_STATE_COMMENT:
- bprintf(out, "PULL: comment");
- show_node(ctx.node);
- break;
- case XML_STATE_PI:
- bprintf(out, "PULL: pi");
- show_node(ctx.node);
- break;
-#if 0
- case XML_STATE_CDATA:
- bprintf(out, "PULL: cdata [%s]\n", ctx.node->text);
- break;
-#endif
- }
- if (state != XML_STATE_EOF)
- bprintf(out, "PULL: fatal error\n");
+ bputs(out, "PULL: start\n");
+ if (want_pull)
+ {
+ ctx.pull = XML_PULL_CHARS | XML_PULL_CDATA | XML_PULL_STAG | XML_PULL_ETAG | XML_PULL_COMMENT | XML_PULL_PI;
+ uns state;
+ while (state = xml_next(&ctx))
+ switch (state)
+ {
+ case XML_STATE_CHARS:
+ bputs(out, "PULL: chars");
+ show_node(ctx.node);
+ break;
+ case XML_STATE_CDATA:
+ bputs(out, "PULL: cdata");
+ show_node(ctx.node);
+ break;
+ case XML_STATE_STAG:
+ bputs(out, "PULL: stag");
+ show_node(ctx.node);
+ break;
+ case XML_STATE_ETAG:
+ bprintf(out, "PULL: etag </%s>\n", ctx.node->name);
+ break;
+ case XML_STATE_COMMENT:
+ bputs(out, "PULL: comment");
+ show_node(ctx.node);
+ break;
+ case XML_STATE_PI:
+ bputs(out, "PULL: pi");
+ show_node(ctx.node);
+ break;
+ default:
+ bputs(out, "PULL: unknown\n");
+ break;
+ }
+ }
else
- bprintf(out, "PULL: eof\n");
-
- if (want_dom)
- show_tree(ctx.root, 0);
+ xml_parse(&ctx);
+ if (ctx.err_code)
+ bprintf(out, "PULL: fatal error at %u: %s\n", xml_row(&ctx), ctx.err_msg);
+ else
+ {
+ bputs(out, "PULL: eof\n");
+ if (want_dom)
+ show_tree(ctx.root, 0);
+ }
xml_cleanup(&ctx);
bclose(out);
#include "lib/fastbuf.h"
enum xml_error {
+ // FIXME
XML_ERR_OK = 0,
- XML_ERR_WARN = 1000, /* Warning */
- XML_ERR_ERROR = 2000, /* Recoverable error */
- XML_ERR_FATAL = 3000, /* Unrecoverable error */
+ XML_ERR_WARN = 1000, /* Warning */
+ XML_ERR_ERROR = 2000, /* Recoverable error */
+ XML_ERR_FATAL = 3000, /* Unrecoverable error */
XML_ERR_EOF,
};
enum xml_state {
- XML_STATE_START = 0,
- XML_STATE_DECL,
- XML_STATE_DOCUMENT_TYPE,
- XML_STATE_CHARS,
- XML_STATE_WHITE,
- XML_STATE_CDATA,
- XML_STATE_STAG,
- XML_STATE_ETAG,
- XML_STATE_COMMENT,
- XML_STATE_PI,
- XML_STATE_EOF,
- XML_STATE_FATAL,
+ XML_STATE_EOF, /* EOF or a fatal error */
+ XML_STATE_START, /* Initial state */
+ XML_STATE_XML_DECL, /* XML_PULL_XML_DECL */
+ XML_STATE_DOCTYPE_DECL, /* XML_PULL_DOCTYPE_DECL */
+ XML_STATE_CHARS, /* XML_PULL_CHARS */
+ XML_STATE_CDATA, /* XML_PULL_CDATA */
+ XML_STATE_STAG, /* XML_PULL_STAG */
+ XML_STATE_ETAG, /* XML_PULL_ETAG */
+ XML_STATE_COMMENT, /* XML_PULL_COMMENT */
+ XML_STATE_PI, /* XML_PULL_PI */
/* Internal states */
XML_STATE_CHARS_BEFORE_STAG,
XML_STATE_CHARS_BEFORE_ETAG,
XML_STATE_CHARS_BEFORE_CDATA,
- XML_STATE_CHARS_BEFORE_PI,
XML_STATE_CHARS_BEFORE_COMMENT,
- XML_STATE_PROLOG_PI,
+ XML_STATE_CHARS_BEFORE_PI,
XML_STATE_PROLOG_COMMENT,
- XML_STATE_EPILOG_PI,
+ XML_STATE_PROLOG_PI,
XML_STATE_EPILOG_COMMENT,
+ XML_STATE_EPILOG_PI,
};
-enum xml_want {
- XML_WANT_DECL = 1 << XML_STATE_DECL,
- XML_WANT_DOCUMENT_TYPE = 1 << XML_STATE_DOCUMENT_TYPE,
- XML_WANT_CHARS = 1 << XML_STATE_CHARS,
- XML_WANT_WHITE = 1 << XML_STATE_WHITE,
- XML_WANT_CDATA = 1 << XML_STATE_CDATA,
- XML_WANT_STAG = 1 << XML_STATE_STAG,
- XML_WANT_ETAG = 1 << XML_STATE_ETAG,
- XML_WANT_COMMENT = 1 << XML_STATE_COMMENT,
- XML_WANT_PI = 1 << XML_STATE_PI,
- XML_WANT_EOF = 1 << XML_STATE_EOF,
- XML_WANT_ALL = ~0U,
+enum xml_pull {
+ XML_PULL_XML_DECL = 0x00000001, /* Stop after the XML declaration */
+ XML_PULL_DOCTYPE_DECL = 0x00000002, /* Stop in the doctype declaration (before optional internal subset) */
+ XML_PULL_CHARS = 0x00000004,
+ XML_PULL_CDATA = 0x00000008,
+ XML_PULL_STAG = 0x00000010,
+ XML_PULL_ETAG = 0x00000020,
+ XML_PULL_COMMENT = 0x00000040,
+ XML_PULL_PI = 0x00000080,
+ XML_PULL_ALL = 0xffffffff,
};
enum xml_flags {
- XML_FLAG_VALIDATING = 0x1,
- XML_FLAG_VERSION_1_1 = 0x2, /* XML version 1.1, otherwise 1.0 */
- XML_FLAG_HAS_EXTERNAL_SUBSET = 0x4, /* The document contains a reference to external DTD subset */
- XML_FLAG_HAS_INTERNAL_SUBSET = 0x8, /* The document contains an internal subset */
-
- XML_FLAG_SRC_EOF = 0x10, /* EOF reached */
- XML_FLAG_SRC_EXPECTED_DECL = 0x20, /* Just before optional or required XMLDecl/TextDecl */
- XML_FLAG_SRC_NEW_LINE = 0x40, /* The last read character is 0xD */
- XML_FLAG_SRC_SURROUND = 0x80, /* Surround the text with 0x20 (references to parameter entities) */
- XML_FLAG_SRC_DOCUMENT = 0x100, /* The document entity */
- XML_FLAG_SRC_EXTERNAL = 0x200, /* An external entity */
-
- XML_DOM_SKIP = 0x1000, /* Do not report DOM nodes */
- XML_DOM_FREE = 0x2000, /* Free the subtree when leaving */
- XML_DOM_IGNORE = XML_DOM_SKIP | XML_DOM_FREE, /* Completely ignore the subtree */
-
- XML_FLAG_EMPTY_ELEM = 0x100000,
-};
-
-struct xml_ext_id {
- char *system_id;
- char *public_id;
+ /* Enable reporting of various events via SAX and/or PULL interface */
+ XML_REPORT_COMMENTS = 0x00000001, /* Report comments */
+ XML_REPORT_PIS = 0x00000002, /* Report processing instructions */
+ XML_REPORT_CHARS = 0x00000004, /* Report characters */
+ XML_REPORT_TAGS = 0x00000008, /* Report element starts/ends */
+ XML_REPORT_MISC = XML_REPORT_COMMENTS | XML_REPORT_PIS,
+ XML_REPORT_ALL = XML_REPORT_MISC | XML_REPORT_CHARS | XML_REPORT_TAGS,
+
+ /* Enable construction of DOM for these types */
+ XML_ALLOC_COMMENTS = 0x00000010, /* Create comment nodes */
+ XML_ALLOC_PIS = 0x00000020, /* Create processing instruction nodes */
+ XML_ALLOC_CHARS = 0x00000040, /* Create character nodes */
+ XML_ALLOC_TAGS = 0x00000080, /* Create element nodes */
+ XML_ALLOC_MISC = XML_ALLOC_COMMENTS | XML_ALLOC_PIS,
+ XML_ALLOC_ALL = XML_ALLOC_MISC | XML_ALLOC_CHARS | XML_ALLOC_TAGS,
+
+ /* Other parameters */
+ XML_UNFOLD_CDATA = 0x00000100, /* Unfold CDATA sections */
+ XML_VALIDATING = 0x00000200, /* Validate everything (not fully implemented!) */
+
+ /* Internals, do not change! */
+ XML_EMPTY_ELEM_TAG = 0x00010000, /* The current element matches EmptyElemTag */
+ XML_VERSION_1_1 = 0x00020000, /* XML version is 1.1, otherwise 1.0 */
+ XML_HAS_EXTERNAL_SUBSET = 0x00040000, /* The document contains a reference to external DTD subset */
+ XML_HAS_INTERNAL_SUBSET = 0x00080000, /* The document contains an internal subset */
+ XML_SRC_EOF = 0x00100000, /* EOF reached */
+ XML_SRC_EXPECTED_DECL = 0x00200000, /* Just before optional or required XMLDecl/TextDecl */
+ XML_SRC_NEW_LINE = 0x00400000, /* The last read character is 0xD */
+ XML_SRC_SURROUND = 0x00800000, /* Surround the text with 0x20 (references to parameter entities) */
+ XML_SRC_DOCUMENT = 0x01000000, /* The document entity */
+ XML_SRC_EXTERNAL = 0x02000000, /* An external entity */
};
enum xml_node_type {
XML_NODE_ELEM,
XML_NODE_COMMENT,
- XML_NODE_CDATA,
+ XML_NODE_CHARS,
XML_NODE_PI,
};
+#define XML_NODE_FOR_EACH(var, node) CLIST_FOR_EACH(struct xml_node *, var, (node)->sons)
+#define XML_ATTR_FOR_EACH(var, node) SLIST_FOR_EACH(struct xml_attr *, var, (node)->attrs)
+
struct xml_node {
- cnode n; /* Node for list of parent's sons */
- uns type; /* XML_NODE_x */
- struct xml_node *parent; /* Parent node */
- char *name; /* Element name / PI target */
- clist sons; /* Children nodes */
+ cnode n; /* Node for list of parent's sons */
+ uns type; /* XML_NODE_x */
+ struct xml_node *parent; /* Parent node */
+ char *name; /* Element name / PI target */
+ clist sons; /* Children nodes */
union {
struct {
- char *text; /* PI text / Comment / CDATA */
- uns len; /* Text length in bytes */
+ char *text; /* PI text / Comment / CDATA */
+ uns len; /* Text length in bytes */
};
struct {
- struct xml_dtd_elem *dtd; /* Element DTD */
- slist attrs; /* Link list of element attributes */
+ struct xml_dtd_elem *dtd; /* Element DTD */
+ slist attrs; /* Link list of element attributes */
};
};
};
struct xml_attr {
- snode n;
- struct xml_node *elem;
- char *name;
- char *val;
-};
-
-struct xml_context;
-
-#define XML_BUF_SIZE 32 /* At least 16 -- hardcoded */
-
-struct xml_source {
- struct xml_source *next; /* Link list of pending fastbufs (xml_context.sources) */
- struct fastbuf *fb; /* Source fastbuf */
- struct fastbuf wrap_fb; /* Libcharset or fbmem wrapper */
- u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */
- u32 *bptr, *bstop; /* Current state of the buffer */
- uns row; /* File position */
- char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */
- char *fb_encoding; /* Encoding of the source fastbuf */
- char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */
- uns refill_cat1; /* Character categories, which should be directly passed to the buffer */
- uns refill_cat2; /* Character categories, which should be processed as newlines (possibly in some built-in sequences) */
- void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */
- unsigned short *refill_in_to_x; /* Libcharset input table */
- uns saved_depth; /* Saved ctx->depth */
+ snode n; /* Node for elem->attrs */
+ struct xml_node *elem; /* Parent element */
+ char *name; /* Attribute name */
+ char *val; /* Attribute value */
};
struct xml_context {
/* Memory management */
struct mempool *pool; /* DOM pool */
- struct mempool *stack; /* Stack pool (freed as soon as possible) */
+ struct mempool *stack; /* Stack pool (free as soon as possible) */
struct xml_stack *stack_list; /* See xml_push(), xml_pop() */
uns flags; /* XML_FLAG_x (restored on xml_pop()) */
uns depth; /* Nesting level */
struct fastbuf chars; /* Character data / attribute value */
- void *tab_attrs;
+ void *tab_attrs; /* Hash table of element attributes */
/* Input */
struct xml_source *src; /* Current source */
- u32 *bptr, *bstop; /* Character buffer */
+ u32 *bptr, *bstop; /* Buffer with preprocessed characters (validated UCS-4 + category flags) */
+ uns cat_chars; /* Unicode range of supported characters (cdata, attribute values, ...) */
+ uns cat_unrestricted; /* Unrestricted characters (may appear in document/external entities) */
+ uns cat_new_line; /* New line characters */
+ uns cat_name; /* Characters that may appear in names */
+ uns cat_sname; /* Characters that may begin a name */
/* SAX-like interface */
void (*h_document_start)(struct xml_context *ctx); /* Called before entering prolog */
void (*h_document_end)(struct xml_context *ctx); /* Called after leaving epilog */
void (*h_xml_decl)(struct xml_context *ctx); /* Called after the XML declaration */
- void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration just before internal subset */
- void (*h_comment)(struct xml_context *ctx); /* Called after a comment */
- void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction */
- void (*h_element_start)(struct xml_context *ctx); /* Called after STag or EmptyElemTag */
- void (*h_element_end)(struct xml_context *ctx); /* Called before ETag or after EmptyElemTag */
- void (*h_chars)(struct xml_context *ctx); /* Called after some characters */
- void (*h_cdata)(struct xml_context *ctx); /* Called after a CDATA section */
+ void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration (before optional internal subset) */
+ void (*h_comment)(struct xml_context *ctx); /* Called after a comment (only with XML_REPORT_COMMENTS) */
+ void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction (only with XML_REPORT_PIS) */
+ void (*h_stag)(struct xml_context *ctx); /* Called after STag or EmptyElemTag (only with XML_REPORT_TAGS) */
+ void (*h_etag)(struct xml_context *ctx); /* Called before ETag or after EmptyElemTag (only with XML_REPORT_TAGS) */
+ void (*h_chars)(struct xml_context *ctx); /* Called after some characters (only with XML_REPORT_CHARS) */
+ void (*h_cdata)(struct xml_context *ctx); /* Called after a CDATA section (only with XML_REPORT_CHARS and XML_UNFOLD_CDATA) */
+ void (*h_dtd_start)(struct xml_context *ctx); /* Called just after the DTD structure is initialized */
+ void (*h_dtd_end)(struct xml_context *ctx); /* Called after the DTD subsets */
/* DOM */
struct xml_node *root; /* DOM root */
char *version_str;
uns standalone;
- char *document_type;
- struct xml_dtd *dtd;
- struct xml_ext_id eid;
- uns state;
- uns want;
-
- void (*start_dtd)(struct xml_context *ctx);
- void (*end_dtd)(struct xml_context *ctx);
+ char *doctype; /* The document type (or NULL if unknown) */
+ char *system_id; /* DTD external id */
+ char *public_id; /* DTD public id */
+ struct xml_dtd *dtd; /* The DTD structure (or NULL) */
+ uns state; /* Current state for the PULL interface (XML_STATE_x) */
+ uns pull; /* Parameters for the PULL interface (XML_PULL_x) */
+
void (*start_entity)(struct xml_context *ctx);
void (*end_entity)(struct xml_context *ctx);
struct fastbuf *(*resolve_entity)(struct xml_context *ctx);
void (*unparsed_entity_decl)(struct xml_context *ctx);
};
+/* Initialize XML context */
void xml_init(struct xml_context *ctx);
+
+/* Clean up all internal structures */
void xml_cleanup(struct xml_context *ctx);
+
+/* Reuse XML context */
+void xml_reset(struct xml_context *ctx);
+
+/* Setup XML source (fastbuf will be automatically closed) */
void xml_set_source(struct xml_context *ctx, struct fastbuf *fb);
-int xml_next(struct xml_context *ctx);
+
+/* Parse without the PULL interface, return XML_ERR_x code (zero on success) */
+uns xml_parse(struct xml_context *ctx);
+
+/* Parse with the PULL interface, return XML_STATE_x (zero on EOF or fatal error) */
+uns xml_next(struct xml_context *ctx);
+
uns xml_row(struct xml_context *ctx);
+struct xml_attr *xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name);
#endif