$(Q)$< $(addprefix $(o)/sherlock/xml/unicat,.h .c)
$(Q)touch $@
+TESTS+=$(o)/sherlock/xml/xml-test.test
$(o)/sherlock/xml/xml-test: $(o)/sherlock/xml/xml-test.o $(LIBSHXML)
+$(o)/sherlock/xml/xml-test.test: $(o)/sherlock/xml/xml-test
API_LIBS+=libshxml
API_INCLUDES+=$(o)/sherlock/xml/.include-stamp
--- /dev/null
+Non-normative / not-implemented:
+-- introduce numeric error codes
+-- cycle detection in internal entities (and possibly external?)
+-- conditional sections in DTD
+-- validation of elements (regular expressions, non-cdata)
+-- validation of attributes (unfinished)
+-- notations
+-- URI normalization
+-- support for xml:space
+-- support for xml:lang
+-- full support for standalone documents
+-- Unicode normalization
+
+Bugs:
+-- definitions of parameter entities do not work because of '%' expansion in "<!ELEMENT %" is expanded as a reference
+
+Optimizations:
+-- detect definitions of trivial entities
return tab + XML_HASH_HDR_SIZE;
}
-static void
-xml_chars_spout(struct fastbuf *fb)
-{
- if (fb->bptr >= fb->bufend)
- {
- struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb);
- struct mempool *pool = ctx->pool;
- if (fb->bufend != fb->buffer)
- {
- uns len = fb->bufend - fb->buffer;
- TRACE(ctx, "grow_chars");
- fb->buffer = mp_expand(pool);
- fb->bufend = fb->buffer + mp_avail(pool);
- fb->bstop = fb->buffer;
- fb->bptr = fb->buffer + len;
- }
- else
- {
- TRACE(ctx, "push_chars");
- struct xml_node *n = xml_push_dom(ctx);
- n->type = XML_NODE_CHARS;
- xml_start_chars(ctx);
- }
- }
-}
-
-static void
-xml_init_chars(struct xml_context *ctx)
-{
- struct fastbuf *fb = &ctx->chars;
- fb->name = "<xml-chars>";
- fb->spout = xml_chars_spout;
- fb->can_overwrite_buffer = 1;
- fb->bptr = fb->bstop = fb->buffer = fb->bufend = NULL;
-}
-
/*** Initialization ***/
+static struct xml_context xml_defaults = {
+ .flags = XML_SRC_EOF | XML_REPORT_ALL,
+ .state = XML_STATE_START,
+ .h_resolve_entity = xml_def_resolve_entity,
+ .chars = {
+ .name = "<xml_chars>",
+ .spout = xml_spout_chars,
+ .can_overwrite_buffer = 1,
+ },
+};
+
static void
xml_do_init(struct xml_context *ctx)
{
- ctx->flags = XML_REPORT_ALL;
- xml_init_chars(ctx);
xml_attrs_table_init(ctx);
}
void
xml_init(struct xml_context *ctx)
{
- bzero(ctx, sizeof(*ctx));
+ *ctx = xml_defaults;
ctx->pool = mp_new(65536);
ctx->stack = mp_new(65536);
xml_do_init(ctx);
xml_sources_cleanup(ctx);
mp_flush(pool);
mp_flush(stack);
- bzero(ctx, sizeof(*ctx));
+ *ctx = xml_defaults;
ctx->pool = pool;
ctx->stack = stack;
xml_do_init(ctx);
/*
* Sherlock Library -- A simple XML parser
*
- * (c) 2007 Pavel Charvat <pchar@ucw.cz>
+ * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
*
* This software may be freely distributed and used according to the terms
* of the GNU Lesser General Public License.
};
static inline struct xml_node *
-xml_push_dom(struct xml_context *ctx)
+xml_push_dom(struct xml_context *ctx, struct mempool_state *state)
{
/* Create a new DOM node */
TRACE(ctx, "push_dom");
struct xml_dom_stack *s = xml_do_push(ctx, sizeof(*s));
- mp_save(ctx->pool, &s->state);
+ if (state)
+ s->state = *state;
+ else
+ mp_save(ctx->pool, &s->state);
struct xml_node *n = mp_alloc(ctx->pool, sizeof(*n));
n->user = NULL;
if (n->parent = ctx->node)
void *xml_hash_new(struct mempool *pool, uns size);
-static inline void
-xml_start_chars(struct xml_context *ctx)
-{
- struct fastbuf *fb = &ctx->chars;
- fb->bstop = fb->bptr = fb->buffer = mp_start_noalign(ctx->pool, 1);
- fb->bufend = fb->buffer + mp_avail(ctx->pool);
-}
-
-static inline char *
-xml_end_chars(struct xml_context *ctx, uns *len)
-{
- struct fastbuf *fb = &ctx->chars;
- uns l = fb->bufend - fb->buffer;
- if (fb->bptr == fb->bufend)
- fb->bptr = mp_expand(ctx->pool) + l;
- *fb->bptr = 0;
- char *c = mp_end(ctx->pool, fb->bptr + 1);
- fb->bptr = fb->bstop = fb->buffer = fb->bufend = NULL;
- *len = l;
- return c;
-}
+void xml_spout_chars(struct fastbuf *fb);
/*** Reading of document/external entities ***/
return xml_char_tab1[c];
}
-struct xml_source *xml_push_source(struct xml_context *ctx, uns flags);
-void xml_push_entity(struct xml_context *ctx, struct xml_dtd_ent *ent);
+struct xml_source *xml_push_source(struct xml_context *ctx);
+void xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent);
void xml_refill(struct xml_context *ctx);
char *xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool);
uns xml_parse_char_ref(struct xml_context *ctx);
-void xml_parse_ref(struct xml_context *ctx);
void xml_parse_pe_ref(struct xml_context *ctx);
char *xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr);
void xml_attrs_table_init(struct xml_context *ctx);
void xml_attrs_table_cleanup(struct xml_context *ctx);
+void xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value);
+
#endif
/*
* Sherlock Library -- A simple XML parser
*
- * (c) 2007 Pavel Charvat <pchar@ucw.cz>
+ * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
*
* This software may be freely distributed and used according to the terms
* of the GNU Lesser General Public License.
#include "sherlock/xml/common.h"
#include "lib/fastbuf.h"
#include "lib/ff-unicode.h"
+#include "lib/unicode.h"
/* Notations */
#define HASH_ZERO_FILL
#define HASH_TABLE_DYNAMIC
#define HASH_WANT_LOOKUP
+#define HASH_WANT_FIND
#define HASH_GIVE_ALLOC
#define HASH_TABLE_ALLOC
XML_HASH_GIVE_ALLOC
#include "lib/hashtable.h"
+struct xml_dtd_notn *
+xml_dtd_find_notn(struct xml_context *ctx, char *name)
+{
+ struct xml_dtd *dtd = ctx->dtd;
+ struct xml_dtd_notn *notn = xml_dtd_notns_find(dtd->tab_notns, name);
+ return !notn ? NULL : (notn->flags & XML_DTD_NOTN_DECLARED) ? notn : NULL;
+}
+
/* General entities */
#define HASH_PREFIX(x) xml_dtd_ents_##x
-#define HASH_NODE struct xml_dtd_ent
+#define HASH_NODE struct xml_dtd_entity
#define HASH_KEY_STRING name
#define HASH_ZERO_FILL
#define HASH_TABLE_DYNAMIC
XML_HASH_GIVE_ALLOC
#include "lib/hashtable.h"
-static struct xml_dtd_ent *
-xml_dtd_declare_trivial_ent(struct xml_context *ctx, char *name, uns uni)
+static struct xml_dtd_entity *
+xml_dtd_declare_trivial_entity(struct xml_context *ctx, char *name, char *text)
{
struct xml_dtd *dtd = ctx->dtd;
- struct xml_dtd_ent *ent = xml_dtd_ents_lookup(dtd->tab_ents, name);
- if (ent->flags & XML_DTD_ENT_DECLARED)
+ struct xml_dtd_entity *ent = xml_dtd_ents_lookup(dtd->tab_ents, name);
+ if (ent->flags & XML_DTD_ENTITY_DECLARED)
{
xml_warn(ctx, "Entity &%s; already declared", name);
return NULL;
}
slist_add_tail(&dtd->ents, &ent->n);
- ent->flags = XML_DTD_ENT_DECLARED | XML_DTD_ENT_TRIVIAL_UNI;
- ent->uni = uni;
+ ent->flags = XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_TRIVIAL;
+ ent->text = text;
return ent;
}
static void
-xml_dtd_declare_default_ents(struct xml_context *ctx)
+xml_dtd_declare_default_entities(struct xml_context *ctx)
{
- xml_dtd_declare_trivial_ent(ctx, "lt", 60);
- xml_dtd_declare_trivial_ent(ctx, "gt", 62);
- xml_dtd_declare_trivial_ent(ctx, "amp", 38);
- xml_dtd_declare_trivial_ent(ctx, "apos", 39);
- xml_dtd_declare_trivial_ent(ctx, "quot", 34);
+ xml_dtd_declare_trivial_entity(ctx, "lt", "<");
+ xml_dtd_declare_trivial_entity(ctx, "gt", ">");
+ xml_dtd_declare_trivial_entity(ctx, "amp", "&");
+ xml_dtd_declare_trivial_entity(ctx, "apos", "'");
+ xml_dtd_declare_trivial_entity(ctx, "quot", "\"");
}
-struct xml_dtd_ent *
-xml_dtd_find_ent(struct xml_context *ctx, char *name)
+struct xml_dtd_entity *
+xml_def_find_entity(struct xml_context *ctx UNUSED, char *name)
+{
+#define ENT(n, t) ent_##n = { .name = #n, .text = t, .flags = XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_TRIVIAL }
+ static struct xml_dtd_entity ENT(lt, "<"), ENT(gt, ">"), ENT(amp, "&"), ENT(apos, "'"), ENT(quot, "\"");
+#undef ENT
+ switch (name[0])
+ {
+ case 'l':
+ if (!strcmp(name, "lt"))
+ return &ent_lt;
+ break;
+ case 'g':
+ if (!strcmp(name, "gt"))
+ return &ent_gt;
+ break;
+ case 'a':
+ if (!strcmp(name, "amp"))
+ return &ent_amp;
+ if (!strcmp(name, "apos"))
+ return &ent_apos;
+ break;
+ case 'q':
+ if (!strcmp(name, "quot"))
+ return &ent_quot;
+ break;
+ }
+ return NULL;
+}
+
+struct xml_dtd_entity *
+xml_dtd_find_entity(struct xml_context *ctx, char *name)
{
struct xml_dtd *dtd = ctx->dtd;
- if (ctx->h_resolve_entity)
- return ctx->h_resolve_entity(ctx, name);
+ if (ctx->h_find_entity)
+ return ctx->h_find_entity(ctx, name);
else if (dtd)
{
- struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_ents, name);
- return !ent ? NULL : (ent->flags & XML_DTD_ENT_DECLARED) ? ent : NULL;
+ struct xml_dtd_entity *ent = xml_dtd_ents_find(dtd->tab_ents, name);
+ return !ent ? NULL : (ent->flags & XML_DTD_ENTITY_DECLARED) ? ent : NULL;
}
else
- {
-#define ENT(n, u) ent_##n = { .name = #n, .uni = u, .flags = XML_DTD_ENT_DECLARED | XML_DTD_ENT_TRIVIAL_UNI }
- static struct xml_dtd_ent ENT(lt, 60), ENT(gt, 62), ENT(amp, 38), ENT(apos, 39), ENT(quot, 34);
-#undef ENT
- switch (name[0])
- {
- case 'l':
- if (!strcmp(name, "lt"))
- return &ent_lt;
- break;
- case 'g':
- if (!strcmp(name, "gt"))
- return &ent_gt;
- break;
- case 'a':
- if (!strcmp(name, "amp"))
- return &ent_amp;
- if (!strcmp(name, "apos"))
- return &ent_apos;
- break;
- case 'q':
- if (!strcmp(name, "quot"))
- return &ent_quot;
- break;
- }
- return NULL;
- }
+ return xml_def_find_entity(ctx, name);
}
/* Parameter entities */
-static struct xml_dtd_ent *
-xml_dtd_find_pent(struct xml_context *ctx, char *name)
+static struct xml_dtd_entity *
+xml_dtd_find_pentity(struct xml_context *ctx, char *name)
{
struct xml_dtd *dtd = ctx->dtd;
- struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_pents, name);
- return !ent ? NULL : (ent->flags & XML_DTD_ENT_DECLARED) ? ent : NULL;
+ struct xml_dtd_entity *ent = xml_dtd_ents_find(dtd->tab_pents, name);
+ return !ent ? NULL : (ent->flags & XML_DTD_ENTITY_DECLARED) ? ent : NULL;
}
/* Elements */
xml_dtd_attrs_init(dtd->tab_attrs = xml_hash_new(pool, sizeof(struct xml_dtd_attrs_table)));
xml_dtd_evals_init(dtd->tab_evals = xml_hash_new(pool, sizeof(struct xml_dtd_evals_table)));
xml_dtd_enotns_init(dtd->tab_enotns = xml_hash_new(pool, sizeof(struct xml_dtd_enotns_table)));
- xml_dtd_declare_default_ents(ctx);
+ xml_dtd_declare_default_entities(ctx);
}
void
mp_save(ctx->stack, &state);
char *name = xml_parse_name(ctx, ctx->stack);
xml_parse_char(ctx, ';');
- struct xml_dtd_ent *ent = xml_dtd_find_pent(ctx, name);
+ struct xml_dtd_entity *ent = xml_dtd_find_pentity(ctx, name);
if (!ent)
xml_error(ctx, "Unknown entity %%%s;", name);
else
}
static void
-xml_dtd_parse_external_id(struct xml_context *ctx, struct xml_ext_id *eid, uns allow_public)
+xml_dtd_parse_external_id(struct xml_context *ctx, char **system_id, char **public_id, uns allow_public)
{
struct xml_dtd *dtd = ctx->dtd;
- bzero(eid, sizeof(*eid));
uns c = xml_peek_char(ctx);
if (c == 'S')
{
xml_parse_seq(ctx, "SYSTEM");
xml_parse_dtd_white(ctx, 1);
- eid->system_id = xml_parse_system_literal(ctx, dtd->pool);
+ *public_id = NULL;
+ *system_id = xml_parse_system_literal(ctx, dtd->pool);
}
else if (c == 'P')
{
xml_parse_seq(ctx, "PUBLIC");
xml_parse_dtd_white(ctx, 1);
- eid->public_id = xml_parse_pubid_literal(ctx, dtd->pool);
- if (xml_parse_dtd_white(ctx, 0)) // FIXME
+ *system_id = NULL;
+ *public_id = xml_parse_pubid_literal(ctx, dtd->pool);
+ if (xml_parse_dtd_white(ctx, !allow_public))
if ((c = xml_peek_char(ctx)) == '\'' || c == '"' || !allow_public)
- eid->system_id = xml_parse_system_literal(ctx, dtd->pool);
+ *system_id = xml_parse_system_literal(ctx, dtd->pool);
}
else
xml_fatal(ctx, "Expected an external ID");
struct xml_dtd_notn *notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx, dtd->pool));
xml_parse_dtd_white(ctx, 1);
- struct xml_ext_id eid;
- xml_dtd_parse_external_id(ctx, &eid, 1);
+ char *system_id, *public_id;
+ xml_dtd_parse_external_id(ctx, &system_id, &public_id, 1);
xml_parse_dtd_white(ctx, 0);
xml_parse_char(ctx, '>');
else
{
notn->flags = XML_DTD_NOTN_DECLARED;
- notn->eid = eid;
+ notn->system_id = system_id;
+ notn->public_id = public_id;
slist_add_tail(&dtd->notns, ¬n->n);
}
xml_dec(ctx);
struct xml_dtd *dtd = ctx->dtd;
xml_parse_dtd_white(ctx, 1);
- uns flags = (xml_get_char(ctx) == '%') ? XML_DTD_ENT_PARAMETER : 0;
+ uns flags = (xml_get_char(ctx) == '%') ? XML_DTD_ENTITY_PARAMETER : 0;
if (flags)
xml_parse_dtd_white(ctx, 1);
else
xml_unget_char(ctx);
- struct xml_dtd_ent *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_ents, xml_parse_name(ctx, dtd->pool));
+ struct xml_dtd_entity *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_ents, xml_parse_name(ctx, dtd->pool));
slist *list = flags ? &dtd->pents : &dtd->ents;
xml_parse_dtd_white(ctx, 1);
- if (ent->flags & XML_DTD_ENT_DECLARED)
+ if (ent->flags & XML_DTD_ENTITY_DECLARED)
{
xml_fatal(ctx, "Entity &%s; already declared, skipping not implemented", ent->name);
// FIXME: should be only warning
p = mp_spread(dtd->pool, p, 3 + l);
*p++ = '&';
memcpy(p, n, l);
+ p += l;
*p++ = ';';;
mp_restore(ctx->stack, &state);
continue;
ent->len = p - (char *)mp_ptr(dtd->pool);
ent->text = mp_end(dtd->pool, p + 1);
slist_add_tail(list, &ent->n);
- ent->flags = flags | XML_DTD_ENT_DECLARED;
+ ent->flags = flags | XML_DTD_ENTITY_DECLARED;
}
else
{
/* External entity */
- struct xml_ext_id eid;
struct xml_dtd_notn *notn = NULL;
- xml_dtd_parse_external_id(ctx, &eid, 0);
- if (!xml_parse_dtd_white(ctx, 0) || !flags)
- xml_parse_char(ctx, '>');
- else if (xml_get_char(ctx) != '>')
+ char *system_id, *public_id;
+ xml_unget_char(ctx);
+ xml_dtd_parse_external_id(ctx, &system_id, &public_id, 0);
+ if (xml_parse_dtd_white(ctx, 0) && flags && xml_peek_char(ctx) != '>')
{
/* General external unparsed entity */
- flags |= XML_DTD_ENT_UNPARSED;
+ flags |= XML_DTD_ENTITY_UNPARSED;
xml_parse_seq(ctx, "NDATA");
xml_parse_dtd_white(ctx, 1);
notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx, dtd->pool));
}
slist_add_tail(list, &ent->n);
- ent->flags = flags | XML_DTD_ENT_DECLARED | XML_DTD_ENT_EXTERNAL;
- ent->eid = eid;
+ ent->flags = flags | XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_EXTERNAL;
+ ent->system_id = system_id;
+ ent->public_id = public_id;
ent->notn = notn;
}
xml_parse_dtd_white(ctx, 0);
else
{
char *type = xml_parse_name(ctx, dtd->pool);
- enum xml_dtd_attribute_type t = XML_ATTR_CDATA;
+ enum xml_dtd_attr_type t = XML_ATTR_CDATA;
if (!strcmp(type, "CDATA"))
t = XML_ATTR_CDATA;
else if (!strcmp(type, "ID"))
attr->type = t;
}
xml_parse_dtd_white(ctx, 1);
- enum xml_dtd_attribute_default def = XML_ATTR_NONE;
+ enum xml_dtd_attr_default def = XML_ATTR_NONE;
if (xml_get_char(ctx) == '#')
switch (xml_peek_char(ctx))
{
}
xml_dec(ctx);
}
+
+/*** Validation of attribute values ***/
+
+static uns
+xml_check_tokens(char *value, uns first_cat, uns next_cat, uns seq)
+{
+ char *p = value;
+ uns u;
+ while (1)
+ {
+ p = utf8_32_get(p, &u);
+ if (!(xml_char_cat(u) & first_cat))
+ return 0;
+ while (*p & ~0x20)
+ {
+ p = utf8_32_get(p, &u);
+ if (!(xml_char_cat(u) & next_cat))
+ return 0;
+ }
+ if (!*p)
+ return 1;
+ if (!seq)
+ return 0;
+ p++;
+ }
+}
+
+static uns
+xml_is_name(struct xml_context *ctx, char *value)
+{
+ /* Name ::= NameStartChar (NameChar)* */
+ return xml_check_tokens(value, ctx->cat_sname, ctx->cat_name, 0);
+}
+
+static uns
+xml_is_names(struct xml_context *ctx, char *value)
+{
+ /* Names ::= Name (#x20 Name)* */
+ return xml_check_tokens(value, ctx->cat_sname, ctx->cat_name, 1);
+}
+
+static uns
+xml_is_nmtoken(struct xml_context *ctx, char *value)
+{
+ /* Nmtoken ::= (NameChar)+ */
+ return xml_check_tokens(value, ctx->cat_name, ctx->cat_name, 0);
+}
+
+static uns
+xml_is_nmtokens(struct xml_context *ctx, char *value)
+{
+ /* Nmtokens ::= Nmtoken (#x20 Nmtoken)* */
+ return xml_check_tokens(value, ctx->cat_name, ctx->cat_name, 1);
+}
+
+static void
+xml_err_attr_format(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *type)
+{
+ xml_error(ctx, "Attribute %s in <%s> does not match the production of %s", dtd->name, dtd->elem->name, type);
+}
+
+void
+xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value)
+{
+ if (dtd->type == XML_ATTR_CDATA)
+ return;
+ xml_normalize_white(ctx, value);
+ switch (dtd->type)
+ {
+ case XML_ATTR_ID:
+ if (!xml_is_name(ctx, value))
+ xml_err_attr_format(ctx, dtd, "NAME");
+ //FIXME: add to a hash table
+ break;
+ case XML_ATTR_IDREF:
+ if (!xml_is_name(ctx, value))
+ xml_err_attr_format(ctx, dtd, "NAME");
+ // FIXME: find in hash table (beware forward references)
+ break;
+ case XML_ATTR_IDREFS:
+ if (!xml_is_names(ctx, value))
+ xml_err_attr_format(ctx, dtd, "NAMES");
+ // FIXME: find
+ break;
+ case XML_ATTR_ENTITY:
+ // FIXME
+ break;
+ case XML_ATTR_ENTITIES:
+ // FIXME
+ break;
+ case XML_ATTR_NMTOKEN:
+ if (!xml_is_nmtoken(ctx, value))
+ xml_err_attr_format(ctx, dtd, "NMTOKEN");
+ break;
+ case XML_ATTR_NMTOKENS:
+ if (!xml_is_nmtokens(ctx, value))
+ xml_err_attr_format(ctx, dtd, "NMTOKENS");
+ break;
+ case XML_ATTR_ENUM:
+ if (!xml_dtd_evals_find(ctx->dtd->tab_evals, dtd, value))
+ xml_error(ctx, "Attribute %s in <%s> contains an undefined enumeration value", dtd->name, dtd->elem->name);
+ break;
+ case XML_ATTR_NOTATION:
+ if (!xml_dtd_find_notn(ctx, value))
+ xml_error(ctx, "Attribute %s in <%s> contains an undefined notation", dtd->name, dtd->elem->name);
+ break;
+ }
+}
/*
* Sherlock Library -- A simple XML parser
*
- * (c) 2007 Pavel Charvat <pchar@ucw.cz>
+ * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
*
* This software may be freely distributed and used according to the terms
* of the GNU Lesser General Public License.
void *tab_enotns; /* hash table of enumerated attribute notations */
};
-struct xml_ext_id {
- char *system_id;
- char *public_id;
-};
-
/* Notations */
enum xml_dtd_notn_flags {
snode n; /* Node in xml_dtd.notns */
uns flags; /* XML_DTD_NOTN_x */
char *name; /* Notation name */
- struct xml_ext_id eid; /* External id */
+ char *system_id; /* External ID */
+ char *public_id;
+ void *user; /* User-defined */
};
+struct xml_dtd_notn *xml_dtd_find_notn(struct xml_context *ctx, char *name);
+
/* Entities */
-enum xml_dtd_ent_flags {
- XML_DTD_ENT_DECLARED = 0x1, /* The entity has been declared (internal usage) */
- XML_DTD_ENT_VISITED = 0x2, /* Cycle detection (internal usage) */
- XML_DTD_ENT_PARAMETER = 0x4, /* Parameter entity, general otherwise */
- XML_DTD_ENT_EXTERNAL = 0x8, /* External entity, internal otherwise */
- XML_DTD_ENT_UNPARSED = 0x10, /* Unparsed entity, parsed otherwise */
- XML_DTD_ENT_TRIVIAL_STR = 0x20, /* Replacement text is a sequence of characters and character references */
- XML_DTD_ENT_TRIVIAL_UNI = 0x40, /* Replacement text is a single Unicode character */
+enum xml_dtd_entity_flags {
+ XML_DTD_ENTITY_DECLARED = 0x1, /* The entity has been declared (internal usage) */
+ XML_DTD_ENTITY_VISITED = 0x2, /* Cycle detection (internal usage) */
+ XML_DTD_ENTITY_PARAMETER = 0x4, /* Parameter entity, general otherwise */
+ XML_DTD_ENTITY_EXTERNAL = 0x8, /* External entity, internal otherwise */
+ XML_DTD_ENTITY_UNPARSED = 0x10, /* Unparsed entity, parsed otherwise */
+ XML_DTD_ENTITY_TRIVIAL = 0x20, /* Replacement text is a sequence of characters and character references */
};
-struct xml_dtd_ent {
+struct xml_dtd_entity {
snode n; /* Node in xml_dtd.[gp]ents */
uns flags; /* XML_DTD_ENT_x */
char *name; /* Entity name */
char *text; /* Replacement text / expanded replacement text (XML_DTD_ENT_TRIVIAL) */
uns len; /* Text length */
- uns uni; /* Unicode value */
- struct xml_ext_id eid; /* External ID */
+ char *system_id; /* External ID */
+ char *public_id;
struct xml_dtd_notn *notn; /* Notation (XML_DTD_ENT_UNPARSED only) */
+ void *user; /* User-defined */
};
-struct xml_dtd_ent *xml_dtd_find_ent(struct xml_context *ctx, char *name);
+struct xml_dtd_entity *xml_dtd_find_entity(struct xml_context *ctx, char *name);
/* Elements */
uns type;
char *name;
struct xml_dtd_elem_node *node;
+ void *user; /* User-defined */
};
struct xml_dtd_elem_node {
slist sons;
uns type;
uns occur;
+ void *user; /* User-defined */
};
enum xml_dtd_elem_node_type {
/* Attributes */
-enum xml_dtd_attribute_default {
+enum xml_dtd_attr_default {
XML_ATTR_NONE,
XML_ATTR_REQUIRED,
XML_ATTR_IMPLIED,
XML_ATTR_FIXED,
};
-enum xml_dtd_attribute_type {
+enum xml_dtd_attr_type {
XML_ATTR_CDATA,
XML_ATTR_ID,
XML_ATTR_IDREF,
};
struct xml_dtd_attr {
- char *name;
- struct xml_dtd_elem *elem;
- enum xml_dtd_attribute_type type;
- enum xml_dtd_attribute_default default_mode;
- char *default_value;
+ char *name; /* Attribute name */
+ struct xml_dtd_elem *elem; /* Owner element */
+ uns type; /* See enum xml_dtd_attr_type */
+ uns default_mode; /* See enum xml_dtd_attr_default */
+ char *default_value; /* The default value defined in DTD (or NULL) */
};
struct xml_dtd_eval {
/*
* Sherlock Library -- A simple XML parser
*
- * (c) 2007 Pavel Charvat <pchar@ucw.cz>
+ * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
*
* This software may be freely distributed and used according to the terms
* of the GNU Lesser General Public License.
/* Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
* Already parsed: '<!-' */
xml_parse_char(ctx, '-');
- struct xml_node *n = xml_push_dom(ctx);
+ struct xml_node *n = xml_push_dom(ctx, NULL);
n->type = XML_NODE_COMMENT;
char *p = mp_start_noalign(ctx->pool, 6);
while (1)
* PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
* PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
* Already parsed: '<?' */
- struct xml_node *n = xml_push_dom(ctx);
+ struct xml_node *n = xml_push_dom(ctx, NULL);
n->type = XML_NODE_PI;
n->name = xml_parse_name(ctx, ctx->pool);
if (unlikely(!strcasecmp(n->name, "xml")))
xml_dec(ctx);
}
-/*** Character data ***/
-
-static inline uns
-xml_flush_chars(struct xml_context *ctx)
-{
- struct fastbuf *fb = &ctx->chars;
- if (fb->bufend == fb->buffer)
- return 0;
- TRACE(ctx, "flush_chars");
- struct xml_node *n = ctx->node;
- n->text = xml_end_chars(ctx, &n->len);
- n->len = fb->bufend - fb->buffer;
- if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_chars)
- ctx->h_chars(ctx);
- return 1;
-}
-
-static inline void
-xml_pop_chars(struct xml_context *ctx)
-{
- xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS));
- TRACE(ctx, "pop_chars");
-}
-
-static inline void
-xml_append_chars(struct xml_context *ctx)
-{
- TRACE(ctx, "append_chars");
- struct fastbuf *out = &ctx->chars;
- while (xml_get_char(ctx) != '<')
- if (xml_last_char(ctx) == '&')
- {
- xml_inc(ctx);
- xml_parse_ref(ctx);
- }
- else
- bput_utf8_32(out, xml_last_char(ctx));
- xml_unget_char(ctx);
-}
-
-/*** CDATA sections ***/
-
-static void
-xml_push_cdata(struct xml_context *ctx)
-{
- TRACE(ctx, "push_cdata");
- /* CDSect :== '<![CDATA[' (Char* - (Char* ']]>' Char*)) ']]>'
- * Already parsed: '<![' */
- xml_parse_seq(ctx, "CDATA[");
- struct xml_node *n = xml_push_dom(ctx);
- n->type = XML_NODE_CHARS;
- char *p = mp_start_noalign(ctx->pool, 7);
- while (1)
- {
- if (xml_get_char(ctx) == ']')
- {
- if (xml_get_char(ctx) == ']')
- if (xml_get_char(ctx) == '>')
- break;
- else
- *p++ = ']';
- *p++ = ']';
- }
- p = utf8_32_put(p, xml_last_char(ctx));
- p = mp_spread(ctx->pool, p, 7);
- }
- *p = 0;
- n->len = p - (char *)mp_ptr(ctx->pool);
- n->text = mp_end(ctx->pool, p + 1);
- if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_cdata)
- ctx->h_cdata(ctx);
-}
-
-static void
-xml_pop_cdata(struct xml_context *ctx)
-{
- xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS));
- xml_dec(ctx);
- TRACE(ctx, "pop_cdata");
-}
-
-static void
-xml_append_cdata(struct xml_context *ctx)
-{
- TRACE(ctx, "append_cdata");
- xml_parse_seq(ctx, "CDATA[");
- struct fastbuf *out = &ctx->chars;
- while (1)
- {
- if (xml_get_char(ctx) == ']')
- {
- if (xml_get_char(ctx) == ']')
- if (xml_get_char(ctx) == '>')
- break;
- else
- bputc(out, ']');
- bputc(out, ']');
- }
- bput_utf8_32(out, xml_last_char(ctx));
- }
- xml_dec(ctx);
-}
-
-static void UNUSED
-xml_skip_cdata(struct xml_context *ctx)
-{
- TRACE(ctx, "skip_cdata");
- xml_parse_seq(ctx, "CDATA[");
- while (xml_get_char(ctx) != ']' || xml_get_char(ctx) != ']' || xml_get_char(ctx) != '>');
- xml_dec(ctx);
-}
-
/*** Character references ***/
uns
/*** References to general entities ***/
-void
+static void
xml_parse_ref(struct xml_context *ctx)
{
/* Reference ::= EntityRef | CharRef
mp_save(ctx->stack, &state);
char *name = xml_parse_name(ctx, ctx->stack);
xml_parse_char(ctx, ';');
- struct xml_dtd_ent *ent = xml_dtd_find_ent(ctx, name);
+ struct xml_dtd_entity *ent = xml_dtd_find_entity(ctx, name);
if (!ent)
{
xml_error(ctx, "Unknown entity &%s;", name);
bputs(out, name);
bputc(out, ';');
}
- else if (ent->flags & XML_DTD_ENT_TRIVIAL_UNI)
+ else if (ent->flags & XML_DTD_ENTITY_TRIVIAL)
{
TRACE(ctx, "Trivial entity &%s;", name);
- bput_utf8_32(out, ent->uni);
- }
- else if (ent->flags & XML_DTD_ENT_TRIVIAL_STR)
- {
- TRACE(ctx, "Trivial entity &%s;", name);
- bwrite(out, ent->text, ent->len);
+ bputs(out, ent->text);
}
else
{
}
}
+/*** Character data ***/
+
+void
+xml_spout_chars(struct fastbuf *fb)
+{
+ if (fb->bptr < fb->bufend)
+ return;
+ struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb);
+ struct mempool *pool = ctx->pool;
+ if (fb->bufend != fb->buffer)
+ {
+ TRACE(ctx, "growing chars");
+ uns len = fb->bufend - fb->buffer;
+ uns reported = fb->bstop - fb->buffer;
+ fb->buffer = mp_expand(pool);
+ fb->bufend = fb->buffer + mp_avail(pool);
+ fb->bptr = fb->buffer + len;
+ fb->bstop = fb->buffer + reported;
+ }
+ else
+ {
+ TRACE(ctx, "starting chars");
+ mp_save(pool, &ctx->chars_state);
+ fb->bptr = fb->buffer = fb->bstop = mp_start_noalign(pool, 2);
+ fb->bufend = fb->buffer + mp_avail(pool) - 1;
+ }
+}
+
+static inline uns
+xml_end_chars(struct xml_context *ctx, char **out)
+{
+ struct fastbuf *fb = &ctx->chars;
+ uns len = fb->bptr - fb->buffer;
+ if (len)
+ {
+ TRACE(ctx, "ending chars");
+ *fb->bptr = 0;
+ *out = mp_end(ctx->pool, fb->bptr + 1);
+ fb->bufend = fb->bstop = fb->bptr = fb->buffer;
+ }
+ return len;
+}
+
+static inline uns
+xml_report_chars(struct xml_context *ctx, char **out)
+{
+ struct fastbuf *fb = &ctx->chars;
+ uns len = fb->bptr - fb->buffer;
+ if (len)
+ {
+ *fb->bptr = 0;
+ *out = fb->bstop;
+ fb->bstop = fb->bptr;
+ }
+ return len;
+}
+
+static inline uns
+xml_flush_chars(struct xml_context *ctx)
+{
+ char *text, *rtext;
+ uns len = xml_end_chars(ctx, &text), rlen;
+ if (len)
+ {
+ if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext)))
+ ctx->h_block(ctx, rtext, rlen);
+ if (!(ctx->flags & XML_ALLOC_CHARS) && (!(ctx->flags & XML_REPORT_CHARS) || !ctx->h_chars))
+ {
+ mp_restore(ctx->pool, &ctx->chars_state);
+ return 0;
+ }
+ struct xml_node *n = xml_push_dom(ctx, &ctx->chars_state);
+ n->type = XML_NODE_CHARS;
+ n->text = text;
+ n->len = len;
+ if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_chars)
+ ctx->h_chars(ctx);
+ }
+ return len;
+}
+
+static inline void
+xml_pop_chars(struct xml_context *ctx)
+{
+ xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS));
+ TRACE(ctx, "pop_chars");
+}
+
+static inline void
+xml_append_chars(struct xml_context *ctx)
+{
+ TRACE(ctx, "append_chars");
+ struct fastbuf *out = &ctx->chars;
+ while (xml_get_char(ctx) != '<')
+ if (xml_last_char(ctx) == '&')
+ {
+ xml_inc(ctx);
+ xml_parse_ref(ctx);
+ }
+ else
+ bput_utf8_32(out, xml_last_char(ctx));
+ xml_unget_char(ctx);
+}
+
+/*** CDATA sections ***/
+
+static void
+xml_append_cdata(struct xml_context *ctx)
+{
+ /* CDSect :== '<![CDATA[' (Char* - (Char* ']]>' Char*)) ']]>'
+ * Already parsed: '<![' */
+ TRACE(ctx, "append_cdata");
+ xml_parse_seq(ctx, "CDATA[");
+ struct fastbuf *out = &ctx->chars;
+ uns rlen;
+ char *rtext;
+ if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext)))
+ ctx->h_block(ctx, rtext, rlen);
+ while (1)
+ {
+ if (xml_get_char(ctx) == ']')
+ {
+ if (xml_get_char(ctx) == ']')
+ if (xml_get_char(ctx) == '>')
+ break;
+ else
+ bputc(out, ']');
+ bputc(out, ']');
+ }
+ bput_utf8_32(out, xml_last_char(ctx));
+ }
+ if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_cdata && (rlen = xml_report_chars(ctx, &rtext)))
+ ctx->h_cdata(ctx, rtext, rlen);
+ xml_dec(ctx);
+}
+
+static void UNUSED
+xml_skip_cdata(struct xml_context *ctx)
+{
+ TRACE(ctx, "skip_cdata");
+ xml_parse_seq(ctx, "CDATA[");
+ while (xml_get_char(ctx) != ']' || xml_get_char(ctx) != ']' || xml_get_char(ctx) != '>');
+ xml_dec(ctx);
+}
+
/*** Attribute values ***/
char *
{
TRACE(ctx, "parse_attr_value");
/* AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'" */
- /* FIXME:
- * -- copying from ctx->chars to ctx->pool is not necessary, we could directly write to ctx->pool
- * -- berare quotes inside parased entities
- * -- check value constrains / normalize value */
+ /* FIXME: -- check value constrains / normalize leading/trailing WS and repeated WS */
struct mempool_state state;
uns quote = xml_parse_quote(ctx);
mp_save(ctx->stack, &state);
- xml_start_chars(ctx);
struct fastbuf *out = &ctx->chars;
+ struct xml_source *src = ctx->src;
while (1)
{
uns c = xml_get_char(ctx);
xml_inc(ctx);
xml_parse_ref(ctx);
}
- else if (c == quote) // FIXME: beware quotes inside parsed entities
+ else if (c == quote && src == ctx->src)
break;
else if (c == '<')
xml_error(ctx, "Attribute value must not contain '<'");
bput_utf8_32(out, c);
}
mp_restore(ctx->stack, &state);
- uns len;
- return xml_end_chars(ctx, &len);
+ char *text;
+ return xml_end_chars(ctx, &text) ? text : "";
+}
+
+uns
+xml_normalize_white(struct xml_context *ctx UNUSED, char *text)
+{
+ char *s = text, *d = text;
+ while (*s == 0x20)
+ s++;
+ while (1)
+ {
+ while (*s & ~0x20)
+ *d++ = *s++;
+ if (!*s)
+ break;
+ while (*++s == 0x20);
+ *d++ = 0x20;
+ }
+ if (d != text && d[-1] == 0x20)
+ d--;
+ *d = 0;
+ return d - text;
}
/*** Attributes ***/
{
TRACE(ctx, "parse_attr");
/* Attribute ::= Name Eq AttValue */
- /* FIXME:
- * -- memory management
- * -- DTD */
struct xml_node *e = ctx->node;
char *n = xml_parse_name(ctx, ctx->pool);
struct xml_attr *a = xml_attrs_lookup(ctx->tab_attrs, e, n);
xml_parse_eq(ctx);
char *v = xml_parse_attr_value(ctx, NULL);
if (a->val)
- xml_error(ctx, "Attribute %s is not unique", n);
+ {
+ xml_error(ctx, "Attribute %s is not unique in element <%s>", n, e->name);
+ return;
+ }
+ a->val = v;
+ if (!e->dtd)
+ a->dtd = NULL;
+ else if (!(a->dtd = xml_dtd_find_attr(ctx, e->dtd, a->name)))
+ xml_error(ctx, "Undefined attribute %s in element <%s>", n, e->name);
else
- a->val = v;
+ xml_validate_attr(ctx, a->dtd, a->val);
}
struct xml_attr *
* EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
* STag ::= '<' Name (S Attribute)* S? '>'
* Already parsed: '<' */
- struct xml_node *e = xml_push_dom(ctx);
+ struct xml_node *e = xml_push_dom(ctx, NULL);
clist_init(&e->sons);
e->type = XML_NODE_ELEM;
e->name = xml_parse_name(ctx, ctx->pool);
{
ctx->dom = e;
if (ctx->doctype && strcmp(e->name, ctx->doctype))
- xml_error(ctx, "The root element %s does not match the document type %s", e->name, ctx->doctype);
+ xml_error(ctx, "The root element <%s> does not match the document type <%s>", e->name, ctx->doctype);
+ }
+ if (!ctx->dtd)
+ e->dtd = NULL;
+ else if (!(e->dtd = xml_dtd_find_elem(ctx, e->name)))
+ xml_error(ctx, "Undefined element <%s>", e->name);
+ else
+ {
+ // FIXME: validate regular expressions
}
while (1)
{
{
case XML_STATE_START:
TRACE(ctx, "entering prolog");
+ ctx->flags |= XML_SRC_DOCUMENT | XML_SRC_EXPECTED_DECL;
if (ctx->h_document_start)
ctx->h_document_start(ctx);
/* XMLDecl */
else if (c == '[')
{
/* CDATA */
- if (!(ctx->flags & XML_UNFOLD_CDATA))
- xml_append_cdata(ctx);
- else
- {
- if (xml_flush_chars(ctx))
- {
- PULL_STATE(CHARS, CHARS_BEFORE_CDATA);
- xml_pop_chars(ctx);
- }
- xml_push_cdata(ctx);
- PULL(CDATA);
- xml_pop_cdata(ctx);
- }
+ xml_append_cdata(ctx);
}
else
xml_fatal(ctx, "Unexpected character after '<!'");
/*
* Sherlock Library -- A simple XML parser
*
- * (c) 2007 Pavel Charvat <pchar@ucw.cz>
+ * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
*
* This software may be freely distributed and used according to the terms
* of the GNU Lesser General Public License.
}
struct xml_source *
-xml_push_source(struct xml_context *ctx, uns flags)
+xml_push_source(struct xml_context *ctx)
{
xml_push(ctx);
struct xml_source *src = ctx->src;
src->next = ctx->src;
src->saved_depth = ctx->depth;
ctx->src = src;
- ctx->flags = (ctx->flags & ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_NEW_LINE | XML_SRC_SURROUND | XML_SRC_DOCUMENT)) | flags;
+ ctx->flags &= ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_NEW_LINE | XML_SRC_SURROUND | XML_SRC_DOCUMENT);
ctx->bstop = ctx->bptr = src->buf;
ctx->depth = 0;
- if (flags & XML_SRC_SURROUND)
- xml_add_char(&ctx->bstop, 0x20);
+ return src;
+}
+
+struct xml_source *
+xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb)
+{
+ struct xml_source *src = xml_push_source(ctx);
+ src->fb = fb;
return src;
}
{
TRACE(ctx, "pop_source");
if (unlikely(ctx->depth != 0))
- {
- xml_fatal(ctx, "Unexpected end of entity");
- }
+ xml_fatal(ctx, "Unexpected end of entity");
struct xml_source *src = ctx->src;
- ASSERT(src);
+ if (!src)
+ xml_fatal(ctx, "Undefined source");
xml_close_source(src);
ctx->depth = src->saved_depth;
ctx->src = src = src->next;
static void xml_refill_utf8(struct xml_context *ctx);
void
-xml_push_entity(struct xml_context *ctx, struct xml_dtd_ent *ent)
+xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent UNUSED)
{
- TRACE(ctx, "xml_push_entity");
- uns cat1 = ctx->src->refill_cat1;
- uns cat2 = ctx->src->refill_cat2;
- struct xml_source *src = xml_push_source(ctx, 0);
- src->refill_cat1 = cat1;
- src->refill_cat2 = cat2;
- if (ent->flags & XML_DTD_ENT_EXTERNAL)
- xml_fatal(ctx, "External entities not implemented"); // FIXME
- else
- {
- fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, ent->len, 0);
- src->refill = xml_refill_utf8;
- }
+ xml_error(ctx, "References to external entities are not supported");
}
void
-xml_set_source(struct xml_context *ctx, struct fastbuf *fb)
+xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent)
{
- TRACE(ctx, "xml_set_source");
- ASSERT(!ctx->src);
- struct xml_source *src = xml_push_source(ctx, XML_SRC_DOCUMENT | XML_SRC_EXPECTED_DECL);
- src->fb = fb;
- ctx->state = XML_STATE_START;
+ TRACE(ctx, "xml_push_entity");
+ struct xml_source *src;
+ if (ent->flags & XML_DTD_ENTITY_EXTERNAL)
+ {
+ ASSERT(ctx->h_resolve_entity);
+ ctx->h_resolve_entity(ctx, ent);
+ ctx->flags |= XML_SRC_EXPECTED_DECL;
+ src = ctx->src;
+ }
+ else
+ {
+ src = xml_push_source(ctx);
+ fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, strlen(ent->text), 0);
+ }
+ src->refill = xml_refill_utf8;
+ src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line;
+ src->refill_cat2 = ctx->cat_new_line;
}
static uns
REFILL(ctx, bget_utf16_be_repl, ~1U);
}
-#if 0
-static inline uns
-xml_refill_libcharset_bget(struct fastbuf *fb, unsigned short int *in_to_x)
-{
- // FIXME: slow
- int c;
- return (unlikely(c = bgetc(fb) < 0)) ? c : (int)conv_x_to_ucs(in_to_x[c]);
-}
-
-static void
-xml_refill_libcharset(struct xml_context *ctx)
-{
- unsigned short int *in_to_x = ctx->src->refill_in_to_x;
- REFILL(ctx, xml_refill_libcharset_bget, in_to_x);
-}
-#endif
-
#undef REFILL
void
while (ctx->bptr == ctx->bstop);
}
-uns
-xml_row(struct xml_context *ctx)
+static uns
+xml_source_row(struct xml_context *ctx, struct xml_source *src)
{
- struct xml_source *src = ctx->src;
- if (!src)
- return 0;
uns row = src->row;
for (u32 *p = ctx->bstop; p != ctx->bptr; p -= 2)
if (p[-1] & src->refill_cat2)
return row + 1;
}
+uns
+xml_row(struct xml_context *ctx)
+{
+ return ctx->src ? xml_source_row(ctx, ctx->src) : 0;
+}
+
/* Document/external entity header */
static char *
static void
xml_init_charconv(struct xml_context *ctx, int cs)
{
- // FIXME: hack
+ // XXX: with a direct access to libcharset tables could be faster
struct xml_source *src = ctx->src;
TRACE(ctx, "wrapping charset %s", charset_name(cs));
-#if 0
- struct conv_context conv;
- conv_set_charset(&conv, cs, CONV_CHARSET_UTF8);
- src->refill = xml_refill_libcharset;
- src->refill_in_to_x = conv.in_to_x;
-#else
src->wrapped_fb = src->fb;
src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8);
-#endif
}
void
/*
* Sherlock Library -- A simple XML parser
*
- * (c) 2007 Pavel Charvat <pchar@ucw.cz>
+ * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
*
* This software may be freely distributed and used according to the terms
* of the GNU Lesser General Public License.
#include "sherlock/sherlock.h"
#include "sherlock/xml/xml.h"
+#include "sherlock/xml/dtd.h"
#include "lib/getopt.h"
#include "lib/fastbuf.h"
#include <stdio.h>
#include <stdlib.h>
+#include <fcntl.h>
enum {
WANT_FIRST = 0x100,
WANT_PARSE_DTD,
WANT_HIDE_ERRORS,
- WANT_UNFOLD_CDATA,
WANT_IGNORE_COMMENTS,
WANT_IGNORE_PIS,
+ WANT_REPORT_BLOCKS,
+ WANT_FILE_ENTITIES,
};
static char *shortopts = "spd" CF_SHORT_OPTS;
{ "dom", 0, 0, 'd' },
{ "dtd", 0, 0, WANT_PARSE_DTD },
{ "hide-errors", 0, 0, WANT_HIDE_ERRORS },
- { "unfold-cdata", 0, 0, WANT_UNFOLD_CDATA },
{ "ignore-comments", 0, 0, WANT_IGNORE_COMMENTS },
{ "ignore-pis", 0, 0, WANT_IGNORE_PIS },
+ { "reports-blocks", 0, 0, WANT_REPORT_BLOCKS },
+ { "file-entities", 0, 0, WANT_FILE_ENTITIES },
{ NULL, 0, 0, 0 }
};
-d, --dom Test DOM interface\n\
--dtd Enable parsing of DTD\n\
--hide-errors Hide warnings and error messages\n\
- --unfold-cdata Unfold CDATA sections\n\
--ignore-comments Ignore processing instructions\n\
--ignore-pis Ignore comments\n\
+ --report-blocks Report blocks or characters and CDATA sections\n\
+ --file-entities Resolve file external entities (not fully normative)\n\
\n", stderr);
exit(1);
}
static uns want_dom;
static uns want_parse_dtd;
static uns want_hide_errors;
-static uns want_unfold_cdata;
static uns want_ignore_comments;
static uns want_ignore_pis;
+static uns want_report_blocks;
+static uns want_file_entities;
static struct fastbuf *out;
}
static void
-h_cdata(struct xml_context *ctx)
+h_block(struct xml_context *ctx UNUSED, char *text, uns len UNUSED)
{
- bputs(out, "SAX: cdata");
- show_node(ctx->node);
+ bprintf(out, "SAX: block text='%s'\n", text);
+}
+
+static void
+h_cdata(struct xml_context *ctx UNUSED, char *text, uns len UNUSED)
+{
+ bprintf(out, "SAX: cdata text='%s'\n", text);
}
static void
bputs(out, "SAX: dtd_end\n");
}
+static void
+h_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *e)
+{
+ xml_push_fastbuf(ctx, bopen(e->system_id, O_RDONLY, 4096));
+}
+
int
main(int argc, char **argv)
{
case WANT_HIDE_ERRORS:
want_hide_errors++;
break;
- case WANT_UNFOLD_CDATA:
- want_unfold_cdata++;
- break;
case WANT_IGNORE_COMMENTS:
want_ignore_comments++;
break;
case WANT_IGNORE_PIS:
want_ignore_pis++;
break;
+ case WANT_REPORT_BLOCKS:
+ want_report_blocks++;
+ break;
+ case WANT_FILE_ENTITIES:
+ want_file_entities++;
+ break;
default:
usage();
}
ctx.h_stag = h_stag;
ctx.h_etag = h_etag;
ctx.h_chars = h_chars;
- ctx.h_cdata = h_cdata;
+ if (want_report_blocks)
+ {
+ ctx.h_block = h_block;
+ ctx.h_cdata = h_cdata;
+ }
ctx.h_dtd_start = h_dtd_start;
ctx.h_dtd_end = h_dtd_end;
}
ctx.flags |= XML_ALLOC_ALL;
if (want_parse_dtd)
ctx.flags |= XML_PARSE_DTD;
- if (want_unfold_cdata)
- ctx.flags |= XML_UNFOLD_CDATA;
if (want_ignore_comments)
ctx.flags &= ~(XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS);
if (want_ignore_pis)
ctx.flags &= ~(XML_REPORT_PIS | XML_ALLOC_PIS);
- xml_set_source(&ctx, bfdopen_shared(0, 4096));
+ if (want_file_entities)
+ ctx.h_resolve_entity = h_resolve_entity;
+ xml_push_fastbuf(&ctx, bfdopen_shared(0, 4096));
bputs(out, "PULL: start\n");
if (want_pull)
{
- ctx.pull = XML_PULL_CHARS | XML_PULL_CDATA | XML_PULL_STAG | XML_PULL_ETAG | XML_PULL_COMMENT | XML_PULL_PI;
+ ctx.pull = XML_PULL_CHARS | XML_PULL_STAG | XML_PULL_ETAG | XML_PULL_COMMENT | XML_PULL_PI;
uns state;
while (state = xml_next(&ctx))
switch (state)
bputs(out, "PULL: chars");
show_node(ctx.node);
break;
- case XML_STATE_CDATA:
- bputs(out, "PULL: cdata");
- show_node(ctx.node);
- break;
case XML_STATE_STAG:
bputs(out, "PULL: stag");
show_node(ctx.node);
--- /dev/null
+# Tests for the XML parser
+# (c) 2008 Pavel Charvat <pchar@ucw.cz>
+
+Run: ../obj/sherlock/xml/xml-test
+In: <?xml version="1.0"?>
+ <html></html>
+Out: PULL: start
+ PULL: eof
+
+Run: ../obj/sherlock/xml/xml-test -s
+In: <?xml version="1.0" encoding="ISO-8859-1"?>
+ <html><a a1="val1" a2="val2">text1&amp;<</a>text2</html>
+Out: PULL: start
+ SAX: document_start
+ SAX: xml_decl version=1.0 standalone=0
+ SAX: stag <html>
+ SAX: stag <a> a1='val1' a2='val2'
+ SAX: chars text='text1&<'
+ SAX: etag </a>
+ SAX: chars text='text2'
+ SAX: etag </html>
+ SAX: document_end
+ PULL: eof
+
+Run: ../obj/sherlock/xml/xml-test -s --dtd
+In: <?xml version="1.0"?>
+ <!DOCTYPE root [
+ <!ELEMENT root ANY>
+ <!ENTITY e1 "text">
+ <!ENTITY e2 '<&e1;>'>
+ ]>
+ <root>&e1;&e2;</root>
+Out: PULL: start
+ SAX: document_start
+ SAX: xml_decl version=1.0 standalone=0
+ SAX: doctype_decl type=root public='' system='' extsub=0 intsub=1
+ SAX: dtd_start
+ SAX: dtd_end
+ SAX: stag <root>
+ SAX: chars text='text<text>'
+ SAX: etag </root>
+ SAX: document_end
+ PULL: eof
/*
* Sherlock Library -- A simple XML parser
*
- * (c) 2007 Pavel Charvat <pchar@ucw.cz>
+ * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
*
* This software may be freely distributed and used according to the terms
* of the GNU Lesser General Public License.
#include "lib/fastbuf.h"
struct xml_context;
-struct xml_dtd_ent;
+struct xml_source;
+struct xml_dtd_entity;
enum xml_error {
- // FIXME
XML_ERR_OK = 0,
XML_ERR_WARN = 1000, /* Warning */
XML_ERR_ERROR = 2000, /* Recoverable error */
XML_STATE_XML_DECL, /* XML_PULL_XML_DECL */
XML_STATE_DOCTYPE_DECL, /* XML_PULL_DOCTYPE_DECL */
XML_STATE_CHARS, /* XML_PULL_CHARS */
- XML_STATE_CDATA, /* XML_PULL_CDATA */
XML_STATE_STAG, /* XML_PULL_STAG */
XML_STATE_ETAG, /* XML_PULL_ETAG */
XML_STATE_COMMENT, /* XML_PULL_COMMENT */
XML_PULL_XML_DECL = 0x00000001, /* Stop after the XML declaration */
XML_PULL_DOCTYPE_DECL = 0x00000002, /* Stop in the doctype declaration (before optional internal subset) */
XML_PULL_CHARS = 0x00000004,
- XML_PULL_CDATA = 0x00000008,
- XML_PULL_STAG = 0x00000010,
- XML_PULL_ETAG = 0x00000020,
- XML_PULL_COMMENT = 0x00000040,
- XML_PULL_PI = 0x00000080,
+ XML_PULL_STAG = 0x00000008,
+ XML_PULL_ETAG = 0x00000010,
+ XML_PULL_COMMENT = 0x00000020,
+ XML_PULL_PI = 0x00000040,
XML_PULL_ALL = 0xffffffff,
};
XML_ALLOC_ALL = XML_ALLOC_MISC | XML_ALLOC_CHARS | XML_ALLOC_TAGS,
/* Other parameters */
- XML_UNFOLD_CDATA = 0x00000100, /* Unfold CDATA sections */
- XML_VALIDATING = 0x00000200, /* Validate everything (not fully implemented!) */
- XML_PARSE_DTD = 0x00000400, /* Enable parsing of DTD */
+ XML_VALIDATING = 0x00000100, /* Validate everything (not fully implemented!) */
+ XML_PARSE_DTD = 0x00000200, /* Enable parsing of DTD */
/* Internals, do not change! */
XML_EMPTY_ELEM_TAG = 0x00010000, /* The current element match EmptyElemTag */
struct xml_attr {
snode n; /* Node for elem->attrs */
struct xml_node *elem; /* Parent element */
+ struct xml_dtd_attr *dtd; /* Attribute DTD */
char *name; /* Attribute name */
char *val; /* Attribute value */
void *user; /* User-defined (initialized to NULL) */
/* Memory management */
struct mempool *pool; /* DOM pool */
- struct mempool *stack; /* Stack pool (free as soon as possible) */
+ struct mempool *stack; /* Stack pool (freed as soon as possible) */
struct xml_stack *stack_list; /* See xml_push(), xml_pop() */
uns flags; /* XML_FLAG_x (restored on xml_pop()) */
- uns depth; /* Nesting level */
+ uns depth; /* Nesting level (for checking of valid source nesting -> valid pushes/pops on memory pools) */
struct fastbuf chars; /* Character data / attribute value */
+ struct mempool_state chars_state; /* Mempool state before the current character block has started */
+ char *chars_trivial; /* If not empty, it will be appended to chars */
void *tab_attrs; /* Hash table of element attributes */
/* Input */
void (*h_xml_decl)(struct xml_context *ctx); /* Called after the XML declaration */
void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration (before optional internal subset) */
void (*h_comment)(struct xml_context *ctx); /* Called after a comment (only with XML_REPORT_COMMENTS) */
- void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction (only with XML_REPORT_PIS) */
+ void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction (only with XML_REPORT_PIS) */
void (*h_stag)(struct xml_context *ctx); /* Called after STag or EmptyElemTag (only with XML_REPORT_TAGS) */
void (*h_etag)(struct xml_context *ctx); /* Called before ETag or after EmptyElemTag (only with XML_REPORT_TAGS) */
void (*h_chars)(struct xml_context *ctx); /* Called after some characters (only with XML_REPORT_CHARS) */
- void (*h_cdata)(struct xml_context *ctx); /* Called after a CDATA section (only with XML_REPORT_CHARS and XML_UNFOLD_CDATA) */
+ void (*h_block)(struct xml_context *ctx, char *text, uns len); /* Called for each continuous block of characters not reported by h_cdata() (only with XML_REPORT_CHARS) */
+ void (*h_cdata)(struct xml_context *ctx, char *text, uns len); /* Called for each CDATA section (only with XML_REPORT_CHARS) */
void (*h_dtd_start)(struct xml_context *ctx); /* Called just after the DTD structure is initialized */
void (*h_dtd_end)(struct xml_context *ctx); /* Called after DTD subsets subsets */
- struct xml_dtd_ent *(*h_resolve_entity)(struct xml_context *ctx, char *name);
+ struct xml_dtd_entity *(*h_find_entity)(struct xml_context *ctx, char *name); /* Called when needed to resolve a general entity */
+ void (*h_resolve_entity)(struct xml_context *ctx, struct xml_dtd_entity *ent); /* User should push source fastbuf for a parsed external entity (either general or parameter) */
/* DOM */
struct xml_node *dom; /* DOM root */
/* Reuse XML context */
void xml_reset(struct xml_context *ctx);
-/* Setup XML source (fastbuf will be automatically closed) */
-void xml_set_source(struct xml_context *ctx, struct fastbuf *fb);
+/* Add XML source (fastbuf will be automatically closed) */
+struct xml_source *xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb);
/* Parse without the PUSH interface, return XML_ERR_x code (zero on success) */
uns xml_parse(struct xml_context *ctx);
/* Parse with the PUSH interface, return XML_STATE_x (zero on EOF or fatal error) */
uns xml_next(struct xml_context *ctx);
+/* Returns the current row number in the document entity */
uns xml_row(struct xml_context *ctx);
+
+/* Finds a given attribute value in a XML_NODE_ELEM node */
struct xml_attr *xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name);
+/* The default value of h_find_entity(), knows <, >, &, ' and " */
+struct xml_dtd_entity *xml_def_find_entity(struct xml_context *ctx, char *name);
+
+/* The default value of h_resolve_entity(), throws an error */
+void xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent);
+
+/* Remove leading/trailing spaces and replaces sequences of spaces to a single space character (non-CDATA attribute normalization) */
+uns xml_normalize_white(struct xml_context *ctx, char *value);
+
#endif