* of the GNU Lesser General Public License.
*/
-#define LOCAL_DEBUG
+#undef LOCAL_DEBUG
#include "sherlock/sherlock.h"
#include "sherlock/xml/xml.h"
#include <setjmp.h>
+/*** Basic parsing ***/
+
+/*
+ * Raise a fatal parse error reporting that the character @c was expected.
+ * Printable ASCII characters (0x20..0x7e) are quoted literally in the
+ * message; every other value is reported as a U+xxxx code point.
+ * The function is declared NONRET.
+ */
+void NONRET
+xml_fatal_expected(struct xml_context *ctx, uns c)
+{
+  /* 0x7f (DEL) is a control character, so the printable range ends at 0x7e */
+  if (c >= 0x20 && c < 0x7f)
+    xml_fatal(ctx, "Expected '%c'", c);
+  else
+    xml_fatal(ctx, "Expected U+%04x", c);
+}
+
+/* Raise a fatal parse error saying that white space was expected (declared NONRET). */
+void NONRET
+xml_fatal_expected_white(struct xml_context *ctx)
+{
+ xml_fatal(ctx, "Expected a white space");
+}
+
+/* Raise a fatal parse error saying that a quotation mark was expected (declared NONRET). */
+void NONRET
+xml_fatal_expected_quot(struct xml_context *ctx)
+{
+ xml_fatal(ctx, "Expected a quotation mark");
+}
+
+/*
+ * Parse the `Eq' production: an '=' sign with white space allowed on
+ * both sides.  The 0 passed to xml_parse_white() presumably marks the
+ * white space as optional (the grammar says S?) -- confirm against
+ * xml_parse_white().
+ */
+void
+xml_parse_eq(struct xml_context *ctx)
+{
+ /* Eq ::= S? '=' S? */
+ xml_parse_white(ctx, 0);
+ xml_parse_char(ctx, '=');
+ xml_parse_white(ctx, 0);
+}
+
+/*** Names and nmtokens ***/
+
+/*
+ * Read a token consisting of one character whose category matches
+ * @first_cat followed by any number of characters matching @next_cat.
+ * The characters are stored through utf8_32_put() into a growing buffer
+ * of @pool and the resulting zero-terminated string is returned.
+ * When the upcoming character does not match @first_cat, a fatal error
+ * carrying the message @err is raised.
+ */
+static char *
+xml_parse_string(struct xml_context *ctx, struct mempool *pool, uns first_cat, uns next_cat, char *err)
+{
+  char *out = mp_start_noalign(pool, 1);
+  if (unlikely((xml_peek_cat(ctx) & first_cat) == 0))
+    xml_fatal(ctx, "%s", err);
+  for (;;)
+    {
+      /* Make sure at least 5 more bytes fit before storing the next character */
+      out = mp_spread(pool, out, 5);
+      out = utf8_32_put(out, xml_skip_char(ctx));
+      if (!(xml_peek_cat(ctx) & next_cat))
+        break;
+    }
+  *out = 0;
+  return mp_end(pool, out + 1);
+}
+
+/*
+ * Consume a token without storing it: the first character read must
+ * match @first_cat (otherwise a fatal error with the message @err is
+ * raised), then all following characters matching @next_cat are skipped.
+ */
+static void
+xml_skip_string(struct xml_context *ctx, uns first_cat, uns next_cat, char *err)
+{
+  uns head_cat = xml_get_cat(ctx);
+  if (unlikely((head_cat & first_cat) == 0))
+    xml_fatal(ctx, "%s", err);
+  for (;;)
+    {
+      if (!(xml_peek_cat(ctx) & next_cat))
+        break;
+      xml_skip_char(ctx);
+    }
+}
+
+/*
+ * Parse an XML Name and return it as a string allocated from @pool.
+ * The character categories used depend on whether XML_VERSION_1_1 is
+ * set in ctx->flags.
+ */
+char *
+xml_parse_name(struct xml_context *ctx, struct mempool *pool)
+{
+  /* Name ::= NameStartChar (NameChar)* */
+  uns start_cat, rest_cat;
+  if (ctx->flags & XML_VERSION_1_1)
+    {
+      start_cat = XML_CHAR_SNAME_1_1;
+      rest_cat = XML_CHAR_NAME_1_1;
+    }
+  else
+    {
+      start_cat = XML_CHAR_SNAME_1_0;
+      rest_cat = XML_CHAR_NAME_1_0;
+    }
+  return xml_parse_string(ctx, pool, start_cat, rest_cat, "Expected a name");
+}
+
+/*
+ * Skip over an XML Name without storing it.  The character categories
+ * used depend on whether XML_VERSION_1_1 is set in ctx->flags.
+ */
+void
+xml_skip_name(struct xml_context *ctx)
+{
+  uns start_cat, rest_cat;
+  if (ctx->flags & XML_VERSION_1_1)
+    {
+      start_cat = XML_CHAR_SNAME_1_1;
+      rest_cat = XML_CHAR_NAME_1_1;
+    }
+  else
+    {
+      start_cat = XML_CHAR_SNAME_1_0;
+      rest_cat = XML_CHAR_NAME_1_0;
+    }
+  xml_skip_string(ctx, start_cat, rest_cat, "Expected a name");
+}
+
+/*
+ * Parse an XML Nmtoken and return it as a string allocated from @pool.
+ * Every character, including the first one, must be a name character
+ * of the XML version selected by XML_VERSION_1_1 in ctx->flags.
+ */
+char *
+xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool)
+{
+  /* Nmtoken ::= (NameChar)+ */
+  uns name_cat;
+  if (ctx->flags & XML_VERSION_1_1)
+    name_cat = XML_CHAR_NAME_1_1;
+  else
+    name_cat = XML_CHAR_NAME_1_0;
+  return xml_parse_string(ctx, pool, name_cat, name_cat, "Expected a nmtoken");
+}
+
+/*** Simple literals ***/
+
+/*
+ * Parse a quoted system literal and return its contents (without the
+ * quotes) as a zero-terminated string allocated from @pool.  The
+ * opening quote is obtained from xml_parse_quote(); everything up to
+ * the next occurrence of the same character is copied via utf8_32_put().
+ */
+char *
+xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool)
+{
+  /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */
+  char *out = mp_start_noalign(pool, 1);
+  uns quote = xml_parse_quote(ctx);
+  for (;;)
+    {
+      uns c = xml_get_char(ctx);
+      if (c == quote)
+        break;
+      out = mp_spread(pool, out, 5);
+      out = utf8_32_put(out, c);
+    }
+  *out = 0;
+  return mp_end(pool, out + 1);
+}
+
+/*
+ * Parse a quoted public identifier literal and return its contents
+ * (without the quotes) as a zero-terminated string allocated from @pool.
+ * Every character before the closing quote must have the XML_CHAR_PUBID
+ * category, otherwise a fatal error is raised.  Characters are stored
+ * as single bytes.
+ */
+char *
+xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool)
+{
+  /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */
+  char *out = mp_start_noalign(pool, 1);
+  uns quote = xml_parse_quote(ctx);
+  for (;;)
+    {
+      uns c = xml_get_char(ctx);
+      if (c == quote)
+        break;
+      if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID)))
+        xml_fatal(ctx, "Expected a pubid character");
+      out = mp_spread(pool, out, 2);
+      *out++ = c;
+    }
+  *out = 0;
+  return mp_end(pool, out + 1);
+}
+
/*** Comments ***/
void
*p = 0;
n->len = p - (char *)mp_ptr(ctx->pool);
n->text = mp_end(ctx->pool, p + 1);
- if (ctx->h_comment)
+ if ((ctx->flags & XML_REPORT_COMMENTS) && ctx->h_comment)
ctx->h_comment(ctx);
}
+/*
+ * Leave the current comment node: pop it via xml_pop_dom(), call
+ * xml_dec() and trace the event.  The second argument of xml_pop_dom()
+ * is nonzero when XML_ALLOC_COMMENTS is not set in ctx->flags --
+ * presumably asking for the node to be freed; confirm in xml_pop_dom().
+ */
void
xml_pop_comment(struct xml_context *ctx)
{
- xml_pop_dom(ctx);
+ xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_COMMENTS));
xml_dec(ctx);
TRACE(ctx, "pop_comment");
}
*p = 0;
n->len = p - (char *)mp_ptr(ctx->pool);
n->text = mp_end(ctx->pool, p + 1);
- if (ctx->h_pi)
+ if ((ctx->flags & XML_REPORT_PIS) && ctx->h_pi)
ctx->h_pi(ctx);
}
+/*
+ * Leave the current processing-instruction node: pop it via
+ * xml_pop_dom(), call xml_dec() and trace the event.  The second
+ * argument of xml_pop_dom() is nonzero when XML_ALLOC_PIS is not set in
+ * ctx->flags -- presumably asking for the node to be freed; confirm in
+ * xml_pop_dom().
+ */
void
xml_pop_pi(struct xml_context *ctx)
{
- xml_pop_dom(ctx);
+ xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_PIS));
xml_dec(ctx);
TRACE(ctx, "pop_pi");
}
xml_skip_pi(struct xml_context *ctx)
{
TRACE(ctx, "skip_pi");
- if (ctx->flags & XML_FLAG_VALIDATING)
+ if (ctx->flags & XML_VALIDATING)
{
struct mempool_state state;
mp_save(ctx->stack, &state);
/*** Character data ***/
-static void
-xml_chars_spout(struct fastbuf *fb)
-{
- if (fb->bptr >= fb->bufend)
- {
- struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb);
- struct mempool *pool = ctx->pool;
- if (fb->bufend != fb->buffer)
- {
- uns len = fb->bufend - fb->buffer;
- TRACE(ctx, "grow_chars");
- fb->buffer = mp_expand(pool);
- fb->bufend = fb->buffer + mp_avail(pool);
- fb->bstop = fb->buffer;
- fb->bptr = fb->buffer + len;
- }
- else
- {
- TRACE(ctx, "push_chars");
- struct xml_node *n = xml_push_dom(ctx);
- n->type = XML_NODE_CDATA;
- xml_start_chars(ctx);
- }
- }
-}
-
-static void
-xml_init_chars(struct xml_context *ctx)
-{
- struct fastbuf *fb = &ctx->chars;
- fb->name = "<xml-chars>";
- fb->spout = xml_chars_spout;
- fb->can_overwrite_buffer = 1;
- fb->bptr = fb->bstop = fb->buffer = fb->bufend = NULL;
-}
-
static inline uns
xml_flush_chars(struct xml_context *ctx)
{
struct xml_node *n = ctx->node;
n->text = xml_end_chars(ctx, &n->len);
n->len = fb->bufend - fb->buffer;
- if (ctx->h_chars)
+ if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_chars)
ctx->h_chars(ctx);
return 1;
}
+/*
+ * Leave the current character-data node: pop it via xml_pop_dom() and
+ * trace the event.  The second argument of xml_pop_dom() is nonzero
+ * when XML_ALLOC_CHARS is not set in ctx->flags -- presumably asking
+ * for the node to be freed; confirm in xml_pop_dom().
+ */
static inline void
xml_pop_chars(struct xml_context *ctx)
{
- xml_pop_dom(ctx);
+ xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS));
TRACE(ctx, "pop_chars");
}
* Already parsed: '<![' */
xml_parse_seq(ctx, "CDATA[");
struct xml_node *n = xml_push_dom(ctx);
- n->type = XML_NODE_CDATA;
+ n->type = XML_NODE_CHARS;
char *p = mp_start_noalign(ctx->pool, 7);
while (1)
{
*p = 0;
n->len = p - (char *)mp_ptr(ctx->pool);
n->text = mp_end(ctx->pool, p + 1);
- if (ctx->h_cdata)
+ if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_cdata)
ctx->h_cdata(ctx);
}
+/*
+ * Leave the current CDATA node: pop it via xml_pop_dom(), call
+ * xml_dec() and trace the event.  CDATA shares the XML_ALLOC_CHARS flag
+ * with ordinary character data; when the flag is not set, a nonzero
+ * second argument is passed to xml_pop_dom() -- presumably asking for
+ * the node to be freed; confirm in xml_pop_dom().
+ */
static void
xml_pop_cdata(struct xml_context *ctx)
{
- xml_pop_dom(ctx);
+ xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS));
xml_dec(ctx);
TRACE(ctx, "pop_cdata");
}
while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT));
}
uns cat = xml_char_cat(v);
- if (!(cat & XML_CHAR_UNRESTRICTED_1_1) && ((ctx->flags & XML_FLAG_VERSION_1_1) || !(cat & XML_CHAR_VALID_1_0)))
+ if (!(cat & XML_CHAR_UNRESTRICTED_1_1) && ((ctx->flags & XML_VERSION_1_1) || !(cat & XML_CHAR_VALID_1_0)))
{
xml_error(ctx, "Character reference out of range");
goto recover;
mp_save(ctx->stack, &state);
char *name = xml_parse_name(ctx, ctx->stack);
xml_parse_char(ctx, ';');
- struct xml_dtd_ent *ent = xml_dtd_find_gent(ctx, name);
+ struct xml_dtd_ent *ent = xml_dtd_find_ent(ctx, name);
if (!ent)
{
xml_error(ctx, "Unknown entity &%s;", name);
a->val = v;
}
+/*
+ * Look up the attribute called @name of @node in the context's
+ * attribute table (ctx->tab_attrs) and return whatever xml_attrs_find()
+ * yields -- presumably NULL when there is no such attribute; confirm
+ * against the hash table implementation.
+ */
+struct xml_attr *
+xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name)
+{
+ return xml_attrs_find(ctx->tab_attrs, node, name);
+}
+
+/*
+ * Create the attribute hash table of the context: a new table is
+ * obtained from xml_hash_new() using ctx->pool, remembered in
+ * ctx->tab_attrs and then initialized by xml_attrs_init().
+ */
+void
+xml_attrs_table_init(struct xml_context *ctx)
+{
+  ctx->tab_attrs = xml_hash_new(ctx->pool, sizeof(struct xml_attrs_table));
+  xml_attrs_init(ctx->tab_attrs);
+}
+
+/* Clean up the attribute hash table stored in ctx->tab_attrs via xml_attrs_cleanup(). */
+void
+xml_attrs_table_cleanup(struct xml_context *ctx)
+{
+ xml_attrs_cleanup(ctx->tab_attrs);
+}
+
/*** Elements ***/
static void
if (!e->parent)
{
ctx->root = e;
- if (ctx->document_type && strcmp(e->name, ctx->document_type))
- xml_error(ctx, "The root element %s does not match the document type %s", e->name, ctx->document_type);
+ if (ctx->doctype && strcmp(e->name, ctx->doctype))
+ xml_error(ctx, "The root element %s does not match the document type %s", e->name, ctx->doctype);
}
while (1)
{
if (c == '/')
{
xml_parse_char(ctx, '>');
- ctx->flags |= XML_FLAG_EMPTY_ELEM;
+ ctx->flags |= XML_EMPTY_ELEM_TAG;
break;
}
else if (c == '>')
xml_unget_char(ctx);
xml_parse_attr(ctx);
}
- if (ctx->h_element_start)
- ctx->h_element_start(ctx);
+ if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_stag)
+ ctx->h_stag(ctx);
}
static void
xml_pop_element(struct xml_context *ctx)
{
TRACE(ctx, "pop_element");
- if (ctx->h_element_end)
- ctx->h_element_end(ctx);
+ if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_etag)
+ ctx->h_etag(ctx);
struct xml_node *e = ctx->node;
- if (ctx->flags & XML_DOM_FREE)
+ uns free = !(ctx->flags & XML_ALLOC_TAGS);
+ if (free)
{
if (!e->parent)
ctx->root = NULL;
clist_remove(&n->n);
}
}
- xml_pop_dom(ctx);
+ xml_pop_dom(ctx, free);
xml_dec(ctx);
}
/* doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
* Already parsed: '<!'
* Terminated before '[' or '>' */
- if (ctx->document_type)
+ if (ctx->doctype)
xml_fatal(ctx, "Multiple document types not allowed");
xml_parse_seq(ctx, "DOCTYPE");
xml_parse_white(ctx, 1);
- ctx->document_type = xml_parse_name(ctx, ctx->pool);
- TRACE(ctx, "doctyype=%s", ctx->document_type);
+ ctx->doctype = xml_parse_name(ctx, ctx->pool);
+ TRACE(ctx, "doctype=%s", ctx->doctype);
uns c;
if (xml_parse_white(ctx, 0) && ((c = xml_peek_char(ctx)) == 'S' || c == 'P'))
{
{
xml_parse_seq(ctx, "SYSTEM");
xml_parse_white(ctx, 1);
- ctx->eid.system_id = xml_parse_system_literal(ctx, ctx->pool);
+ ctx->system_id = xml_parse_system_literal(ctx, ctx->pool);
}
else
{
xml_parse_seq(ctx, "PUBLIC");
xml_parse_white(ctx, 1);
- ctx->eid.public_id = xml_parse_pubid_literal(ctx, ctx->pool);
+ ctx->public_id = xml_parse_pubid_literal(ctx, ctx->pool);
xml_parse_white(ctx, 1);
- ctx->eid.system_id = xml_parse_system_literal(ctx, ctx->pool);
+ ctx->system_id = xml_parse_system_literal(ctx, ctx->pool);
}
xml_parse_white(ctx, 0);
- ctx->flags |= XML_FLAG_HAS_EXTERNAL_SUBSET;
+ ctx->flags |= XML_HAS_EXTERNAL_SUBSET;
}
if (xml_peek_char(ctx) == '[')
- ctx->flags |= XML_FLAG_HAS_INTERNAL_SUBSET;
+ ctx->flags |= XML_HAS_INTERNAL_SUBSET;
if (ctx->h_doctype_decl)
ctx->h_doctype_decl(ctx);
}
xml_fatal(ctx, "Invalid markup in the internal subset");
}
+/*** The State Machine ***/
-/*----------------------------------------------*/
-
-void
-xml_init(struct xml_context *ctx)
-{
- bzero(ctx, sizeof(*ctx));
- ctx->pool = mp_new(65536);
- ctx->stack = mp_new(65536);
- ctx->flags = XML_DOM_FREE;
- xml_init_chars(ctx);
- xml_dtd_init(ctx);
- xml_attrs_init(ctx->tab_attrs = xml_hash_new(ctx->pool, sizeof(struct xml_attrs_table)));
-}
-
-void
-xml_cleanup(struct xml_context *ctx)
-{
- xml_attrs_cleanup(ctx->tab_attrs);
- xml_dtd_cleanup(ctx);
- mp_delete(ctx->pool);
- mp_delete(ctx->stack);
-}
-
-int
+uns
xml_next(struct xml_context *ctx)
{
/* A nasty state machine */
+#define PULL(x) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##x; case XML_STATE_##x: ; } while (0)
+#define PULL_STATE(x, s) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##s, XML_STATE_##x; case XML_STATE_##s: ; } while (0)
+
TRACE(ctx, "xml_next (state=%u)", ctx->state);
jmp_buf throw_buf;
ctx->throw_buf = &throw_buf;
error:
if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal)
ctx->h_fatal(ctx);
- ctx->state = XML_STATE_FATAL;
TRACE(ctx, "raised fatal error");
- return -1;
+ return ctx->state = XML_STATE_EOF;
}
uns c;
switch (ctx->state)
{
- case XML_STATE_FATAL:
- return -1;
-
case XML_STATE_START:
TRACE(ctx, "entering prolog");
if (ctx->h_document_start)
xml_refill(ctx);
if (ctx->h_xml_decl)
ctx->h_xml_decl(ctx);
- if (ctx->want & XML_WANT_DECL)
- return ctx->state = XML_STATE_DECL;
- case XML_STATE_DECL:
+ PULL(XML_DECL);
/* Misc* (doctypedecl Misc*)? */
while (1)
xml_parse_char(ctx, '<');
if ((c = xml_get_char(ctx)) == '?')
/* Processing intruction */
- if (!(ctx->want & XML_WANT_PI))
+ if (!(ctx->flags & XML_REPORT_PIS))
xml_skip_pi(ctx);
else
{
xml_push_pi(ctx);
- ctx->state = XML_STATE_PROLOG_PI;
- return XML_STATE_PI;
- case XML_STATE_PROLOG_PI:
+ PULL_STATE(PI, PROLOG_PI);
xml_pop_pi(ctx);
}
else if (c != '!')
goto first_tag;
}
else if (xml_get_char(ctx) == '-')
- if (!(ctx->want & XML_WANT_COMMENT))
+ if (!(ctx->flags & XML_REPORT_COMMENTS))
xml_skip_comment(ctx);
else
{
xml_push_comment(ctx);
- ctx->state = XML_STATE_PROLOG_COMMENT;
- return XML_STATE_COMMENT;
- case XML_STATE_PROLOG_COMMENT:
+ PULL_STATE(COMMENT, PROLOG_COMMENT);
xml_pop_comment(ctx);
}
else
/* DocTypeDecl */
xml_unget_char(ctx);
xml_parse_doctype_decl(ctx);
- if (ctx->want & XML_WANT_DOCUMENT_TYPE)
- return ctx->state = XML_STATE_DOCUMENT_TYPE;
- case XML_STATE_DOCUMENT_TYPE:
+ PULL(DOCTYPE_DECL);
if (xml_peek_char(ctx) == '[')
{
+ // FIXME: ability to skip the subset
xml_skip_char(ctx);
xml_inc(ctx);
+ xml_dtd_init(ctx);
+ if (ctx->h_dtd_start)
+ ctx->h_dtd_start(ctx);
xml_parse_internal_subset(ctx);
+ // FIXME: external subset
+ if (ctx->h_dtd_end)
+ ctx->h_dtd_end(ctx);
xml_parse_white(ctx, 0);
}
xml_parse_char(ctx, '>');
if ((c = xml_get_char(ctx)) == '?')
{
/* PI */
- if (!(ctx->want & XML_WANT_PI))
+ if (!(ctx->flags & (XML_REPORT_PIS | XML_ALLOC_PIS)))
xml_skip_pi(ctx);
else
{
if (xml_flush_chars(ctx))
{
- if (ctx->want & XML_WANT_CHARS)
- {
- ctx->state = XML_STATE_CHARS_BEFORE_PI;
- return XML_STATE_CHARS;
- }
- case XML_STATE_CHARS_BEFORE_PI:
+ PULL_STATE(CHARS, CHARS_BEFORE_PI);
xml_pop_chars(ctx);
}
xml_push_pi(ctx);
- return ctx->state = XML_STATE_PI;
- case XML_STATE_PI:
+ PULL(PI);
xml_pop_pi(ctx);
}
}
if ((c = xml_get_char(ctx)) == '-')
{
/* Comment */
- if (!(ctx->want & XML_WANT_COMMENT))
+ if (!(ctx->flags & (XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS)))
xml_skip_comment(ctx);
else
{
if (xml_flush_chars(ctx))
{
- if (ctx->want & XML_WANT_CHARS)
- {
- ctx->state = XML_STATE_CHARS_BEFORE_COMMENT;
- return XML_STATE_CHARS;
- }
- case XML_STATE_CHARS_BEFORE_COMMENT:
+ PULL_STATE(CHARS, CHARS_BEFORE_COMMENT);
xml_pop_chars(ctx);
}
xml_push_comment(ctx);
- return ctx->state = XML_STATE_COMMENT;
- case XML_STATE_COMMENT:
+ PULL(COMMENT);
xml_pop_comment(ctx);
}
}
else if (c == '[')
{
/* CDATA */
- if (!(ctx->want & XML_WANT_CDATA))
+ if (!(ctx->flags & XML_UNFOLD_CDATA))
xml_append_cdata(ctx);
else
{
if (xml_flush_chars(ctx))
{
- if (ctx->want & XML_WANT_CHARS)
- {
- ctx->state = XML_STATE_CHARS_BEFORE_CDATA;
- return XML_STATE_CHARS;
- }
- case XML_STATE_CHARS_BEFORE_CDATA:
+ PULL_STATE(CHARS, CHARS_BEFORE_CDATA);
xml_pop_chars(ctx);
}
xml_push_cdata(ctx);
- return ctx->state = XML_STATE_CDATA;
- case XML_STATE_CDATA:
+ PULL(CDATA);
xml_pop_cdata(ctx);
}
}
xml_unget_char(ctx);
if (xml_flush_chars(ctx))
{
- if (ctx->want & XML_WANT_CHARS)
- {
- ctx->state = XML_STATE_CHARS_BEFORE_STAG;
- return XML_STATE_CHARS;
- }
- case XML_STATE_CHARS_BEFORE_STAG:
+ PULL_STATE(CHARS, CHARS_BEFORE_STAG);
xml_pop_chars(ctx);
}
xml_push_element(ctx);
- if (ctx->want & XML_WANT_STAG)
- return ctx->state = XML_STATE_STAG;
- case XML_STATE_STAG:
- if (ctx->flags & XML_FLAG_EMPTY_ELEM)
+ PULL(STAG);
+ if (ctx->flags & XML_EMPTY_ELEM_TAG)
goto pop_element;
}
/* ETag */
if (xml_flush_chars(ctx))
{
- if (ctx->want & XML_WANT_CHARS)
- {
- ctx->state = XML_STATE_CHARS_BEFORE_ETAG;
- return XML_STATE_CHARS;
- }
- case XML_STATE_CHARS_BEFORE_ETAG:
+ PULL_STATE(CHARS, CHARS_BEFORE_ETAG);
xml_pop_chars(ctx);
}
xml_parse_etag(ctx);
pop_element:
- if (ctx->want & XML_WANT_ETAG)
- return ctx->state = XML_STATE_ETAG;
- case XML_STATE_ETAG:
+ PULL(ETAG);
xml_pop_element(ctx);
if (!ctx->node)
goto epilog;
if (ctx->h_document_end)
ctx->h_document_end(ctx);
case XML_STATE_EOF:
+ ctx->err_code = 0;
+ ctx->err_msg = NULL;
return XML_STATE_EOF;
}
else
/* Misc */
xml_parse_char(ctx, '<');
+ xml_inc(ctx);
if ((c = xml_get_char(ctx)) == '?')
/* Processing instruction */
- if (!(ctx->want & XML_WANT_PI))
+ if (!(ctx->flags & XML_REPORT_PIS))
xml_skip_pi(ctx);
else
{
xml_push_pi(ctx);
- return ctx->state = XML_STATE_EPILOG_PI, XML_STATE_PI;
- case XML_STATE_EPILOG_PI:
+ PULL_STATE(PI, EPILOG_PI);
xml_pop_pi(ctx);
}
else if (c == '!')
- /* Comment */
- if (!(ctx->want & XML_WANT_COMMENT))
- xml_skip_comment(ctx);
- else
- {
- xml_push_comment(ctx);
- return ctx->state = XML_STATE_EPILOG_COMMENT, XML_STATE_COMMENT;
- case XML_STATE_EPILOG_COMMENT:
- xml_pop_comment(ctx);
- }
+ {
+ xml_parse_char(ctx, '-');
+ /* Comment */
+ if (!(ctx->flags & XML_REPORT_COMMENTS))
+ xml_skip_comment(ctx);
+ else
+ {
+ xml_push_comment(ctx);
+ PULL_STATE(COMMENT, EPILOG_COMMENT);
+ xml_pop_comment(ctx);
+ }
+ }
else
xml_fatal(ctx, "Syntax error in the epilog");
}
}
- return -1;
+ ASSERT(0);
+}
+
+/*
+ * Run the parser with no pull events requested: ctx->pull is cleared,
+ * so none of the PULL()/PULL_STATE() checks in xml_next() make it
+ * return early, and xml_next() is called once.  Returns ctx->err_code
+ * as left by that call (presumably zero on success -- confirm against
+ * the error code definitions).
+ */
+uns
+xml_parse(struct xml_context *ctx)
+{
+ ctx->pull = 0;
+ xml_next(ctx);
+ return ctx->err_code;
+}