-- full support for standalone documents
-- Unicode normalization
-Bugs:
--- definitions of parameter entities do not work because of '%' expansion in "<!ELEMENT %" is expanded as a reference
-
Optimizations:
-- detect definitions of trivial entities
#include "sherlock/sherlock.h"
#include "sherlock/xml/xml.h"
#include "sherlock/xml/dtd.h"
-#include "sherlock/xml/common.h"
+#include "sherlock/xml/internals.h"
#include "lib/stkstring.h"
#include "lib/ff-unicode.h"
+++ /dev/null
-/*
- * Sherlock Library -- A simple XML parser
- *
- * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
- *
- * This software may be freely distributed and used according to the terms
- * of the GNU Lesser General Public License.
- */
-
-#ifndef _SHERLOCK_XML_COMMON_H
-#define _SHERLOCK_XML_COMMON_H
-
-#include "sherlock/xml/xml.h"
-#include "sherlock/xml/dtd.h"
-
-/*** Debugging ***/
-
-#ifdef LOCAL_DEBUG
-#define TRACE(c, f, p...) do { DBG("XML %u: " f, xml_row(c), ##p); } while(0)
-#else
-#define TRACE(c, f, p...) do {} while(0)
-#endif
-
-/*** Error handling ***/
-
-void NONRET xml_throw(struct xml_context *ctx);
-void xml_warn(struct xml_context *ctx, const char *format, ...);
-void xml_error(struct xml_context *ctx, const char *format, ...);
-void NONRET xml_fatal(struct xml_context *ctx, const char *format, ...);
-
-/*** Memory management ***/
-
-struct xml_stack {
- struct xml_stack *next;
- struct mempool_state state;
- uns flags;
-};
-
-static inline void *
-xml_do_push(struct xml_context *ctx, uns size)
-{
- /* Saves ctx->stack and ctx->flags state */
- struct mempool_state state;
- mp_save(ctx->stack, &state);
- struct xml_stack *s = mp_alloc(ctx->stack, size);
- s->state = state;
- s->flags = ctx->flags;
- s->next = ctx->stack_list;
- ctx->stack_list = s;
- return s;
-}
-
-static inline void
-xml_do_pop(struct xml_context *ctx, struct xml_stack *s)
-{
- /* Restore ctx->stack and ctx->flags state */
- ctx->stack_list = s->next;
- ctx->flags = s->flags;
- mp_restore(ctx->stack, &s->state);
-}
-
-static inline void
-xml_push(struct xml_context *ctx)
-{
- TRACE(ctx, "push");
- xml_do_push(ctx, sizeof(struct xml_stack));
-}
-
-static inline void
-xml_pop(struct xml_context *ctx)
-{
- TRACE(ctx, "pop");
- ASSERT(ctx->stack_list);
- xml_do_pop(ctx, ctx->stack_list);
-}
-
-struct xml_dom_stack {
- struct xml_stack stack;
- struct mempool_state state;
-};
-
-static inline struct xml_node *
-xml_push_dom(struct xml_context *ctx, struct mempool_state *state)
-{
- /* Create a new DOM node */
- TRACE(ctx, "push_dom");
- struct xml_dom_stack *s = xml_do_push(ctx, sizeof(*s));
- if (state)
- s->state = *state;
- else
- mp_save(ctx->pool, &s->state);
- struct xml_node *n = mp_alloc(ctx->pool, sizeof(*n));
- n->user = NULL;
- if (n->parent = ctx->node)
- clist_add_tail(&n->parent->sons, &n->n);
- return ctx->node = n;
-}
-
-static inline void
-xml_pop_dom(struct xml_context *ctx, uns free)
-{
- /* Leave DOM subtree */
- TRACE(ctx, "pop_dom");
- ASSERT(ctx->node);
- struct xml_node *p = ctx->node->parent;
- struct xml_dom_stack *s = (void *)ctx->stack_list;
- if (free)
- {
- /* See xml_pop_element() for cleanup of attribute hash table */
- if (p)
- clist_remove(&ctx->node->n);
- mp_restore(ctx->pool, &s->state);
- }
- ctx->node = p;
- xml_do_pop(ctx, &s->stack);
-}
-
-#define XML_HASH_HDR_SIZE ALIGN_TO(sizeof(void *), CPU_STRUCT_ALIGN)
-#define XML_HASH_GIVE_ALLOC struct HASH_PREFIX(table); \
- static inline void *HASH_PREFIX(alloc)(struct HASH_PREFIX(table) *t, uns size) \
- { return mp_alloc(*(void **)((void *)t - XML_HASH_HDR_SIZE), size); } \
- static inline void HASH_PREFIX(free)(struct HASH_PREFIX(table) *t UNUSED, void *p UNUSED) {}
-
-void *xml_hash_new(struct mempool *pool, uns size);
-
-void xml_spout_chars(struct fastbuf *fb);
-
-/*** Reading of document/external entities ***/
-
-#define XML_BUF_SIZE 32 /* At least 8 -- hardcoded */
-
-struct xml_source {
- struct xml_source *next; /* Link list of pending fastbufs (xml_context.sources) */
- struct fastbuf *fb; /* Source fastbuf */
- struct fastbuf *wrapped_fb; /* Original wrapped fastbuf (needed for cleanup) */
- struct fastbuf wrap_fb; /* Fbmem wrapper */
- u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */
- u32 *bptr, *bstop; /* Current state of the buffer */
- uns row; /* File position */
- char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */
- char *fb_encoding; /* Encoding of the source fastbuf */
- char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */
- uns refill_cat1; /* Character categories, which should be directly passed to the buffer */
- uns refill_cat2; /* Character categories, which should be processed as newlines (possibly in some built-in sequences) */
- void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */
- unsigned short *refill_in_to_x; /* Libcharset input table */
- uns saved_depth; /* Saved ctx->depth */
-};
-
-void NONRET xml_fatal_nested(struct xml_context *ctx);
-
-static inline void
-xml_inc(struct xml_context *ctx)
-{
- /* Called after the first character of a block */
- TRACE(ctx, "inc");
- ctx->depth++;
-}
-
-static inline void
-xml_dec(struct xml_context *ctx)
-{
- /* Called after the last character of a block */
- TRACE(ctx, "dec");
- if (unlikely(!ctx->depth--))
- xml_fatal_nested(ctx);
-}
-
-#include "obj/sherlock/xml/unicat.h"
-
-static inline uns
-xml_char_cat(uns c)
-{
- if (c < 0x10000)
- return 1U << xml_char_tab1[(c & 0xff) + xml_char_tab2[c >> 8]];
- else if (likely(c < 0x110000))
- return 1U << xml_char_tab3[c >> 16];
- else
- return 1;
-}
-
-static inline uns
-xml_ascii_cat(uns c)
-{
- return xml_char_tab1[c];
-}
-
-struct xml_source *xml_push_source(struct xml_context *ctx);
-void xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent);
-
-void xml_refill(struct xml_context *ctx);
-
-static inline uns
-xml_peek_char(struct xml_context *ctx)
-{
- if (ctx->bptr == ctx->bstop)
- xml_refill(ctx);
- return ctx->bptr[0];
-}
-
-static inline uns
-xml_peek_cat(struct xml_context *ctx)
-{
- if (ctx->bptr == ctx->bstop)
- xml_refill(ctx);
- return ctx->bptr[1];
-}
-
-static inline uns
-xml_get_char(struct xml_context *ctx)
-{
- uns c = xml_peek_char(ctx);
- ctx->bptr += 2;
- return c;
-}
-
-static inline uns
-xml_get_cat(struct xml_context *ctx)
-{
- uns c = xml_peek_cat(ctx);
- ctx->bptr += 2;
- return c;
-}
-
-static inline uns
-xml_last_char(struct xml_context *ctx)
-{
- return ctx->bptr[-2];
-}
-
-static inline uns
-xml_last_cat(struct xml_context *ctx)
-{
- return ctx->bptr[-1];
-}
-
-static inline uns
-xml_skip_char(struct xml_context *ctx)
-{
- uns c = ctx->bptr[0];
- ctx->bptr += 2;
- return c;
-}
-
-static inline uns
-xml_unget_char(struct xml_context *ctx)
-{
- return *(ctx->bptr -= 2);
-}
-
-void xml_sources_cleanup(struct xml_context *ctx);
-
-/*** Parsing ***/
-
-void NONRET xml_fatal_expected(struct xml_context *ctx, uns c);
-void NONRET xml_fatal_expected_white(struct xml_context *ctx);
-void NONRET xml_fatal_expected_quot(struct xml_context *ctx);
-
-static inline uns
-xml_parse_white(struct xml_context *ctx, uns mandatory)
-{
- /* mandatory=1 -> S ::= (#x20 | #x9 | #xD | #xA)+
- * mandatory=0 -> S? */
- uns cnt = 0;
- while (xml_peek_cat(ctx) & XML_CHAR_WHITE)
- {
- xml_skip_char(ctx);
- cnt++;
- }
- if (unlikely(mandatory && !cnt))
- xml_fatal_expected_white(ctx);
- return cnt;
-}
-
-static inline void
-xml_parse_char(struct xml_context *ctx, uns c)
-{
- /* Consumes a given Unicode character */
- if (unlikely(c != xml_get_char(ctx)))
- xml_fatal_expected(ctx, c);
-}
-
-static inline void
-xml_parse_seq(struct xml_context *ctx, const char *seq)
-{
- /* Consumes a given sequence of ASCII characters */
- while (*seq)
- xml_parse_char(ctx, *seq++);
-}
-
-void xml_parse_eq(struct xml_context *ctx);
-
-static inline uns
-xml_parse_quote(struct xml_context *ctx)
-{
- /* "'" | '"' */
- uns c = xml_get_char(ctx);
- if (unlikely(c != '\'' && c != '\"'))
- xml_fatal_expected_quot(ctx);
- return c;
-}
-
-char *xml_parse_name(struct xml_context *ctx, struct mempool *pool);
-void xml_skip_name(struct xml_context *ctx);
-char *xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool);
-
-char *xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool);
-char *xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool);
-
-uns xml_parse_char_ref(struct xml_context *ctx);
-void xml_parse_pe_ref(struct xml_context *ctx);
-
-char *xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr);
-
-void xml_skip_internal_subset(struct xml_context *ctx);
-void xml_parse_notation_decl(struct xml_context *ctx);
-void xml_parse_entity_decl(struct xml_context *ctx);
-void xml_parse_element_decl(struct xml_context *ctx);
-void xml_parse_attr_list_decl(struct xml_context *ctx);
-
-void xml_push_comment(struct xml_context *ctx);
-void xml_pop_comment(struct xml_context *ctx);
-void xml_skip_comment(struct xml_context *ctx);
-
-void xml_push_pi(struct xml_context *ctx);
-void xml_pop_pi(struct xml_context *ctx);
-void xml_skip_pi(struct xml_context *ctx);
-
-void xml_attrs_table_init(struct xml_context *ctx);
-void xml_attrs_table_cleanup(struct xml_context *ctx);
-
-void xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value);
-
-#endif
#include "sherlock/sherlock.h"
#include "sherlock/xml/xml.h"
#include "sherlock/xml/dtd.h"
-#include "sherlock/xml/common.h"
+#include "sherlock/xml/internals.h"
#include "lib/fastbuf.h"
#include "lib/ff-unicode.h"
#include "lib/unicode.h"
xml_dec(ctx);
}
-static void
-xml_parse_dtd_pe(struct xml_context *ctx)
+static uns
+xml_parse_dtd_pe(struct xml_context *ctx, uns entity_decl)
{
+ /* Already parsed: '%'
+  * Expands a run of parameter entity references; every pass of the loop
+  * handles one '%' that has already been consumed.
+  * If entity_decl is ~0U and the '%' is followed by whitespace, the '%' is
+  * not a reference: the depth increment is undone and ~0U is returned.
+  * Otherwise each reference is parsed with xml_parse_pe_ref(), the whitespace
+  * after it is skipped, and the loop goes on while the next character is
+  * another '%'. The first character that is not '%' is pushed back and 1 is
+  * returned. */
do
{
- xml_skip_char(ctx);
xml_inc(ctx);
+ if (!~entity_decl && (xml_peek_cat(ctx) & XML_CHAR_WHITE))
+ {
+ xml_dec(ctx);
+ return ~0U;
+ }
+ xml_parse_pe_ref(ctx);
while (xml_peek_cat(ctx) & XML_CHAR_WHITE)
xml_skip_char(ctx);
- xml_parse_pe_ref(ctx);
}
- while (xml_peek_char(ctx) != '%');
+ while (xml_get_char(ctx) == '%');
+ xml_unget_char(ctx);
+ return 1;
}
static inline uns
xml_parse_dtd_white(struct xml_context *ctx, uns mandatory)
{
- /* Whitespace or parameter entity */
+ /* Whitespace or parameter entity,
+ * mandatory==~0U has the special meaning of the whitespace before the '%' character in a parameter entity declaration */
uns cnt = 0;
while (xml_peek_cat(ctx) & XML_CHAR_WHITE)
{
}
if (xml_peek_char(ctx) == '%')
{
- xml_parse_dtd_pe(ctx);
- return 1;
+ xml_skip_char(ctx);
+ return xml_parse_dtd_pe(ctx, mandatory);
}
else if (unlikely(mandatory && !cnt))
xml_fatal_expected_white(ctx);
/* Already parsed: '<!ENTITY' */
TRACE(ctx, "parse_entity_decl");
struct xml_dtd *dtd = ctx->dtd;
- xml_parse_dtd_white(ctx, 1);
-
- uns flags = (xml_get_char(ctx) == '%') ? XML_DTD_ENTITY_PARAMETER : 0;
+ uns flags = ~xml_parse_dtd_white(ctx, ~0U) ? 0 : XML_DTD_ENTITY_PARAMETER;
if (flags)
xml_parse_dtd_white(ctx, 1);
- else
- xml_unget_char(ctx);
-
struct xml_dtd_entity *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_ents, xml_parse_name(ctx, dtd->pool));
- slist *list = flags ? &dtd->pents : &dtd->ents;
xml_parse_dtd_white(ctx, 1);
+ slist *list = flags ? &dtd->pents : &dtd->ents;
if (ent->flags & XML_DTD_ENTITY_DECLARED)
{
xml_fatal(ctx, "Entity &%s; already declared, skipping not implemented", ent->name);
// FIXME: should be only warning
}
-
uns c, sep = xml_get_char(ctx);
if (sep == '\'' || sep == '"')
{
--- /dev/null
+/*
+ * Sherlock Library -- A simple XML parser
+ *
+ * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
+ *
+ * This software may be freely distributed and used according to the terms
+ * of the GNU Lesser General Public License.
+ */
+
+#ifndef _SHERLOCK_XML_INTERNALS_H
+#define _SHERLOCK_XML_INTERNALS_H
+
+#include "sherlock/xml/xml.h"
+#include "sherlock/xml/dtd.h"
+
+/*** Debugging ***/
+
+#ifdef LOCAL_DEBUG
+#define TRACE(c, f, p...) do { DBG("XML %u: " f, xml_row(c), ##p); } while(0)
+#else
+#define TRACE(c, f, p...) do {} while(0)
+#endif
+
+/*** Error handling ***/
+
+void NONRET xml_throw(struct xml_context *ctx);
+
+/*** Memory management ***/
+
+struct xml_stack {
+ struct xml_stack *next; /* Record pushed before this one (chain starting at ctx->stack_list) */
+ struct mempool_state state; /* State of ctx->stack saved by xml_do_push() before this record was allocated */
+ uns flags; /* Copy of ctx->flags taken at push time, written back by xml_do_pop() */
+};
+
+static inline void *
+xml_do_push(struct xml_context *ctx, uns size)
+{
+ /* Saves ctx->stack and ctx->flags state.
+  * Allocates a record of `size` bytes from ctx->stack and fills in its
+  * leading struct xml_stack, so `size` has to cover at least that header.
+  * The mempool state is captured before the allocation is made. The record
+  * is linked at the head of ctx->stack_list and returned to the caller. */
+ struct mempool_state state;
+ mp_save(ctx->stack, &state);
+ struct xml_stack *s = mp_alloc(ctx->stack, size);
+ s->state = state;
+ s->flags = ctx->flags;
+ s->next = ctx->stack_list;
+ ctx->stack_list = s;
+ return s;
+}
+
+static inline void
+xml_do_pop(struct xml_context *ctx, struct xml_stack *s)
+{
+ /* Restore ctx->stack and ctx->flags state.
+  * s->next becomes the new head of ctx->stack_list; there is no check that
+  * `s` actually was the head. mp_restore() comes last and is given the state
+  * stored inside the record itself. */
+ ctx->stack_list = s->next;
+ ctx->flags = s->flags;
+ mp_restore(ctx->stack, &s->state);
+}
+
+static inline void
+xml_push(struct xml_context *ctx)
+{
+ TRACE(ctx, "push");
+ xml_do_push(ctx, sizeof(struct xml_stack)); /* Bare record without payload; the result is reachable only through ctx->stack_list */
+}
+
+static inline void
+xml_pop(struct xml_context *ctx)
+{
+ TRACE(ctx, "pop");
+ ASSERT(ctx->stack_list); /* There must be a pushed record to pop */
+ xml_do_pop(ctx, ctx->stack_list); /* Drops the most recently pushed record */
+}
+
+struct xml_dom_stack {
+ struct xml_stack stack; /* Generic record; first member, so an entry of ctx->stack_list can be cast to struct xml_dom_stack (see xml_pop_dom()) */
+ struct mempool_state state; /* State of ctx->pool to restore when the DOM node is freed */
+};
+
+static inline struct xml_node *
+xml_push_dom(struct xml_context *ctx, struct mempool_state *state)
+{
+ /* Create a new DOM node.
+  * Pushes a struct xml_dom_stack record and stores in it the ctx->pool state
+  * to return to when the node is freed: the caller-supplied *state if it is
+  * non-NULL, otherwise the state of ctx->pool at this moment. The node is
+  * allocated from ctx->pool; only `user`, `parent` and (when there is a
+  * parent) the list link `n` are set here. The node becomes ctx->node and is
+  * returned. */
+ TRACE(ctx, "push_dom");
+ struct xml_dom_stack *s = xml_do_push(ctx, sizeof(*s));
+ if (state)
+ s->state = *state;
+ else
+ mp_save(ctx->pool, &s->state);
+ struct xml_node *n = mp_alloc(ctx->pool, sizeof(*n));
+ n->user = NULL;
+ if (n->parent = ctx->node) /* Assignment is intended: the previous ctx->node (possibly NULL) becomes the parent */
+ clist_add_tail(&n->parent->sons, &n->n);
+ return ctx->node = n;
+}
+
+static inline void
+xml_pop_dom(struct xml_context *ctx, uns free)
+{
+ /* Leave DOM subtree.
+  * Treats the head of ctx->stack_list as the struct xml_dom_stack pushed by
+  * xml_push_dom(). With non-zero `free` the current node is unlinked from its
+  * parent's list (if it has a parent) and ctx->pool is restored to the state
+  * saved in the record. In both cases ctx->node moves up to the parent and
+  * the stack record is popped. */
+ TRACE(ctx, "pop_dom");
+ ASSERT(ctx->node);
+ struct xml_node *p = ctx->node->parent;
+ struct xml_dom_stack *s = (void *)ctx->stack_list;
+ if (free)
+ {
+ /* See xml_pop_element() for cleanup of attribute hash table */
+ if (p)
+ clist_remove(&ctx->node->n);
+ mp_restore(ctx->pool, &s->state);
+ }
+ ctx->node = p;
+ xml_do_pop(ctx, &s->stack);
+}
+
+#define XML_HASH_HDR_SIZE ALIGN_TO(sizeof(void *), CPU_STRUCT_ALIGN) /* Room for one pointer, rounded up to CPU_STRUCT_ALIGN; XML_HASH_GIVE_ALLOC reads that pointer this many bytes in front of the table */
+#define XML_HASH_GIVE_ALLOC struct HASH_PREFIX(table); \
+ static inline void *HASH_PREFIX(alloc)(struct HASH_PREFIX(table) *t, uns size) \
+ { return mp_alloc(*(void **)((void *)t - XML_HASH_HDR_SIZE), size); } \
+ static inline void HASH_PREFIX(free)(struct HASH_PREFIX(table) *t UNUSED, void *p UNUSED) {} /* alloc takes memory from the pool whose pointer precedes the table; free is a no-op */
+
+void *xml_hash_new(struct mempool *pool, uns size);
+
+void xml_spout_chars(struct fastbuf *fb);
+
+/*** Reading of document/external entities ***/
+
+void NONRET xml_fatal_nested(struct xml_context *ctx);
+
+static inline void
+xml_inc(struct xml_context *ctx)
+{
+ /* Called after the first character of a block.
+  * Increments ctx->depth; xml_dec() is the counterpart. */
+ TRACE(ctx, "inc");
+ ctx->depth++;
+}
+
+static inline void
+xml_dec(struct xml_context *ctx)
+{
+ /* Called after the last character of a block.
+  * ctx->depth is tested before being decremented: if it was already zero,
+  * xml_fatal_nested() is called (it is declared NONRET). */
+ TRACE(ctx, "dec");
+ if (unlikely(!ctx->depth--))
+ xml_fatal_nested(ctx);
+}
+
+#include "obj/sherlock/xml/unicat.h"
+
+static inline uns
+xml_char_cat(uns c)
+{
+ if (c < 0x10000) /* Below 0x10000: xml_char_tab2 (indexed by the high byte) gives the offset into xml_char_tab1, the low byte selects the entry */
+ return 1U << xml_char_tab1[(c & 0xff) + xml_char_tab2[c >> 8]]; /* Result is a one-bit mask, not the table entry itself */
+ else if (likely(c < 0x110000)) /* 0x10000..0x10ffff: one table entry per 64K block */
+ return 1U << xml_char_tab3[c >> 16];
+ else
+ return 1; /* 0x110000 and above: fixed result with only bit 0 set */
+}
+
+static inline uns
+xml_ascii_cat(uns c)
+{
+ return xml_char_tab1[c]; /* Raw table entry, not shifted into a bit mask as in xml_char_cat(); c is used as the index without any range check */
+}
+
+struct xml_source *xml_push_source(struct xml_context *ctx);
+void xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent);
+
+void xml_refill(struct xml_context *ctx);
+
+static inline uns
+xml_peek_char(struct xml_context *ctx)
+{
+ if (ctx->bptr == ctx->bstop) /* Nothing left between bptr and bstop */
+ xml_refill(ctx);
+ return ctx->bptr[0]; /* First word of the pair at bptr (xml_peek_cat() returns the second); bptr is not advanced */
+}
+
+static inline uns
+xml_peek_cat(struct xml_context *ctx)
+{
+ if (ctx->bptr == ctx->bstop) /* Nothing left between bptr and bstop */
+ xml_refill(ctx);
+ return ctx->bptr[1]; /* Second word of the pair at bptr (xml_peek_char() returns the first); bptr is not advanced */
+}
+
+static inline uns
+xml_get_char(struct xml_context *ctx)
+{
+ uns c = xml_peek_char(ctx); /* Refills the buffer first if needed */
+ ctx->bptr += 2; /* Step over both words of the pair */
+ return c;
+}
+
+static inline uns
+xml_get_cat(struct xml_context *ctx)
+{
+ uns c = xml_peek_cat(ctx); /* Refills the buffer first if needed; despite the name, `c` holds the second word of the pair */
+ ctx->bptr += 2; /* Step over both words of the pair */
+ return c;
+}
+
+static inline uns
+xml_last_char(struct xml_context *ctx)
+{
+ return ctx->bptr[-2]; /* First word of the pair just before bptr; no check that such a pair exists */
+}
+
+static inline uns
+xml_last_cat(struct xml_context *ctx)
+{
+ return ctx->bptr[-1]; /* Second word of the pair just before bptr; no check that such a pair exists */
+}
+
+static inline uns
+xml_skip_char(struct xml_context *ctx)
+{
+ uns c = ctx->bptr[0]; /* No refill check here, unlike xml_get_char(): the pair at bptr must already be in the buffer */
+ ctx->bptr += 2;
+ return c;
+}
+
+static inline uns
+xml_unget_char(struct xml_context *ctx)
+{
+ return *(ctx->bptr -= 2); /* Steps bptr back by one pair and returns the first word of that pair; no check against the start of the buffer */
+}
+
+void xml_sources_cleanup(struct xml_context *ctx);
+
+/*** Parsing ***/
+
+void NONRET xml_fatal_expected(struct xml_context *ctx, uns c);
+void NONRET xml_fatal_expected_white(struct xml_context *ctx);
+void NONRET xml_fatal_expected_quot(struct xml_context *ctx);
+
+static inline uns
+xml_parse_white(struct xml_context *ctx, uns mandatory)
+{
+ /* mandatory=1 -> S ::= (#x20 | #x9 | #xD | #xA)+
+ * mandatory=0 -> S?
+ * Skips characters while their category has XML_CHAR_WHITE set and returns
+ * how many were skipped. If `mandatory` is non-zero and nothing was skipped,
+ * xml_fatal_expected_white() is called (it is declared NONRET). */
+ uns cnt = 0;
+ while (xml_peek_cat(ctx) & XML_CHAR_WHITE)
+ {
+ xml_skip_char(ctx);
+ cnt++;
+ }
+ if (unlikely(mandatory && !cnt))
+ xml_fatal_expected_white(ctx);
+ return cnt;
+}
+
+static inline void
+xml_parse_char(struct xml_context *ctx, uns c)
+{
+ /* Consumes a given Unicode character.
+  * The next character is read with xml_get_char(), so it is consumed even
+  * when it differs from `c`; in that case xml_fatal_expected() is called
+  * (it is declared NONRET). */
+ if (unlikely(c != xml_get_char(ctx)))
+ xml_fatal_expected(ctx, c);
+}
+
+static inline void
+xml_parse_seq(struct xml_context *ctx, const char *seq)
+{
+ /* Consumes a given sequence of ASCII characters.
+  * `seq` is walked up to its terminating NUL and every character is matched
+  * with xml_parse_char(); an empty string consumes nothing. */
+ while (*seq)
+ xml_parse_char(ctx, *seq++);
+}
+
+void xml_parse_eq(struct xml_context *ctx);
+
+static inline uns
+xml_parse_quote(struct xml_context *ctx)
+{
+ /* "'" | '"'
+  * Consumes one character and returns it. If it is neither a single nor a
+  * double quote, xml_fatal_expected_quot() is called (it is declared NONRET). */
+ uns c = xml_get_char(ctx);
+ if (unlikely(c != '\'' && c != '\"'))
+ xml_fatal_expected_quot(ctx);
+ return c;
+}
+
+char *xml_parse_name(struct xml_context *ctx, struct mempool *pool);
+void xml_skip_name(struct xml_context *ctx);
+char *xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool);
+
+char *xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool);
+char *xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool);
+
+uns xml_parse_char_ref(struct xml_context *ctx);
+void xml_parse_pe_ref(struct xml_context *ctx);
+
+char *xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr);
+
+void xml_skip_internal_subset(struct xml_context *ctx);
+void xml_parse_notation_decl(struct xml_context *ctx);
+void xml_parse_entity_decl(struct xml_context *ctx);
+void xml_parse_element_decl(struct xml_context *ctx);
+void xml_parse_attr_list_decl(struct xml_context *ctx);
+
+void xml_push_comment(struct xml_context *ctx);
+void xml_pop_comment(struct xml_context *ctx);
+void xml_skip_comment(struct xml_context *ctx);
+
+void xml_push_pi(struct xml_context *ctx);
+void xml_pop_pi(struct xml_context *ctx);
+void xml_skip_pi(struct xml_context *ctx);
+
+void xml_attrs_table_init(struct xml_context *ctx);
+void xml_attrs_table_cleanup(struct xml_context *ctx);
+
+void xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value);
+
+#endif
#include "sherlock/sherlock.h"
#include "sherlock/xml/xml.h"
#include "sherlock/xml/dtd.h"
-#include "sherlock/xml/common.h"
+#include "sherlock/xml/internals.h"
#include "lib/fastbuf.h"
#include "lib/ff-unicode.h"
#include "lib/unicode.h"
#include "sherlock/sherlock.h"
#include "sherlock/xml/xml.h"
#include "sherlock/xml/dtd.h"
-#include "sherlock/xml/common.h"
+#include "sherlock/xml/internals.h"
#include "lib/unicode.h"
#include "lib/ff-unicode.h"
#include "charset/charconv.h"
src->refill_cat2 = ctx->cat_new_line;
/* Initialize the supplied charset (if any) or try to guess it */
- char *expected_encoding = src->expected_encoding ? : src->fb_encoding;
+ char *expected_encoding = src->expected_encoding;
src->refill = xml_refill_utf8;
int bom = bpeekc(src->fb);
if (bom < 0)
src->refill = xml_refill_utf16_be;
if (bom == 0xff)
src->refill = xml_refill_utf16_le;
- if (!src->expected_encoding)
- expected_encoding = (bom == 0xff) ? "UTF-16LE" : "UTF-16BE";
}
else if (strcasecmp(src->fb_encoding, "UTF-16BE"))
src->refill = xml_refill_utf16_be;
}
}
uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be;
+ if (utf16)
+ src->fb_encoding = (src->refill == xml_refill_utf16_be) ? "UTF-16BE" : "UTF-16LE";
+ if (!expected_encoding)
+ expected_encoding = src->fb_encoding;
if (bom > 0 && xml_peek_char(ctx) == 0xfeff)
xml_skip_char(ctx);
else if (utf16)
xml_error(ctx, "Missing or corrupted BOM");
+ TRACE(ctx, "Initial encoding=%s", src->fb_encoding ? : "?");
/* Look ahead for presence of XMLDecl or optional TextDecl */
if (!(ctx->flags & XML_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf))
if (cs < 0 && !expected_encoding)
xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding);
else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8)
- xml_init_charconv(ctx, cs);
+ {
+ xml_init_charconv(ctx, cs);
+ src->fb_encoding = src->decl_encoding;
+ }
else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 ||
!(!strcasecmp(src->decl_encoding, "UTF-16") ||
(!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) ||
(!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE")))))
xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding);
}
+ if (!src->fb_encoding)
+ src->fb_encoding = "UTF-8";
+ TRACE(ctx, "Final encoding=%s", src->fb_encoding);
exit:
/* Update valid Unicode ranges */
static void
h_xml_decl(struct xml_context *ctx)
{
- bprintf(out, "SAX: xml_decl version=%s standalone=%d\n", ctx->version_str, ctx->standalone);
+ bprintf(out, "SAX: xml_decl version=%s standalone=%d fb_encoding=%s\n", ctx->version_str, ctx->standalone, ctx->src->fb_encoding); /* NOTE(review): assumes ctx->src and its fb_encoding are non-NULL whenever this handler runs -- confirm */
}
static void
<html><a a1="val1" a2="val2">text1&amp;<</a>text2</html>
Out: PULL: start
SAX: document_start
- SAX: xml_decl version=1.0 standalone=0
+ SAX: xml_decl version=1.0 standalone=0 fb_encoding=ISO-8859-1
SAX: stag <html>
SAX: stag <a> a1='val1' a2='val2'
SAX: chars text='text1&<'
SAX: document_end
PULL: eof
-Run: ../obj/sherlock/xml/xml-test -s --dtd
+Run: (printf '\376\377' && bin/cs2cs UTF-8 UTF-16BE) | ../obj/sherlock/xml/xml-test -spd --dtd
In: <?xml version="1.0"?>
<!DOCTYPE root [
<!ELEMENT root ANY>
- <!ENTITY e1 "text">
+ <!ENTITY % pe1 "<!ENTITY e1 'text'>">
+ %pe1;
<!ENTITY e2 '<&e1;>'>
+ <!ELEMENT a ANY>
]>
- <root>&e1;&e2;</root>
+ <root>&e1;<a>&e2;</a></root>
Out: PULL: start
SAX: document_start
- SAX: xml_decl version=1.0 standalone=0
+ SAX: xml_decl version=1.0 standalone=0 fb_encoding=UTF-16BE
SAX: doctype_decl type=root public='' system='' extsub=0 intsub=1
SAX: dtd_start
SAX: dtd_end
SAX: stag <root>
- SAX: chars text='text<text>'
+ PULL: stag <root>
+ SAX: chars text='text'
+ PULL: chars text='text'
+ SAX: stag <a>
+ PULL: stag <a>
+ SAX: chars text='<text>'
+ PULL: chars text='<text>'
+ PULL: etag </a>
+ SAX: etag </a>
+ PULL: etag </root>
SAX: etag </root>
SAX: document_end
PULL: eof
+ DOM: element <root>
+ DOM: chars text='text'
+ DOM: element <a>
+ DOM: chars text='<text>'
#include "lib/fastbuf.h"
struct xml_context;
-struct xml_source;
struct xml_dtd_entity;
enum xml_error {
void *user; /* User-defined (initialized to NULL) */
};
+#define XML_BUF_SIZE 32 /* At least 8 -- hardcoded */
+
+struct xml_source {
+ struct xml_source *next; /* Linked list of pending fastbufs (xml_context.sources) */
+ struct fastbuf *fb; /* Source fastbuf */
+ struct fastbuf *wrapped_fb; /* Original wrapped fastbuf (needed for cleanup) */
+ struct fastbuf wrap_fb; /* Fbmem wrapper */
+ u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories (two words per character, hence 2 * XML_BUF_SIZE) */
+ u32 *bptr, *bstop; /* Current state of the buffer */
+ uns row; /* File position */
+ char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */
+ char *fb_encoding; /* Encoding of the source fastbuf */
+ char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */
+ uns refill_cat1; /* Character categories, which should be directly passed to the buffer */
+ uns refill_cat2; /* Character categories, which should be processed as newlines (possibly in some built-in
+ sequences) */
+ void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */
+ unsigned short *refill_in_to_x; /* Libcharset input table */
+ uns saved_depth; /* Saved ctx->depth */
+};
+
struct xml_context {
/* Error handling */
char *err_msg; /* Last error message */
/* Clean up all internal structures */
void xml_cleanup(struct xml_context *ctx);
-/* Reuse XML context */
+/* Reuse XML context, equivalent to xml_cleanup() and xml_init() */
void xml_reset(struct xml_context *ctx);
/* Add XML source (fastbuf will be automatically closed) */
/* Remove leading/trailing spaces and replaces sequences of spaces to a single space character (non-CDATA attribute normalization) */
uns xml_normalize_white(struct xml_context *ctx, char *value);
+/* Public part of error handling */
+void xml_warn(struct xml_context *ctx, const char *format, ...);
+void xml_error(struct xml_context *ctx, const char *format, ...);
+void NONRET xml_fatal(struct xml_context *ctx, const char *format, ...);
+
#endif