From ccf64507b45774b007ab6200036827f1597022d8 Mon Sep 17 00:00:00 2001 From: Pavel Charvat Date: Thu, 17 Jan 2008 11:32:37 +0100 Subject: [PATCH] XML: Several fixes, slightly changed the iface. --- sherlock/xml/Makefile | 2 + sherlock/xml/TODO | 18 ++ sherlock/xml/common.c | 53 ++---- sherlock/xml/common.h | 38 ++--- sherlock/xml/dtd.c | 277 ++++++++++++++++++++++--------- sherlock/xml/dtd.h | 53 +++--- sherlock/xml/parse.c | 352 +++++++++++++++++++++++----------------- sherlock/xml/source.c | 102 +++++------- sherlock/xml/xml-test.c | 60 ++++--- sherlock/xml/xml-test.t | 43 +++++ sherlock/xml/xml.h | 52 +++--- 11 files changed, 631 insertions(+), 419 deletions(-) create mode 100644 sherlock/xml/TODO create mode 100644 sherlock/xml/xml-test.t diff --git a/sherlock/xml/Makefile b/sherlock/xml/Makefile index a265b96d..c305be23 100644 --- a/sherlock/xml/Makefile +++ b/sherlock/xml/Makefile @@ -19,7 +19,9 @@ $(o)/sherlock/xml/unicat.stamp: $(s)/sherlock/xml/unicat.pl $(Q)$< $(addprefix $(o)/sherlock/xml/unicat,.h .c) $(Q)touch $@ +TESTS+=$(o)/sherlock/xml/xml-test.test $(o)/sherlock/xml/xml-test: $(o)/sherlock/xml/xml-test.o $(LIBSHXML) +$(o)/sherlock/xml/xml-test.test: $(o)/sherlock/xml/xml-test API_LIBS+=libshxml API_INCLUDES+=$(o)/sherlock/xml/.include-stamp diff --git a/sherlock/xml/TODO b/sherlock/xml/TODO new file mode 100644 index 00000000..bf377f5c --- /dev/null +++ b/sherlock/xml/TODO @@ -0,0 +1,18 @@ +Non-normative / not-implemented: +-- introduce numeric error codes +-- cycle detection in internal entities (and possibly external?) 
+-- conditional sections in DTD +-- validation of elements (regular expressions, non-cdata) +-- validation of attributes (unfinished) +-- notations +-- URI normalization +-- support for xml:space +-- support for xml:lang +-- full support for standalone documents +-- Unicode normalization + +Bugs: +-- definitions of parameter entities do not work because of '%' expansion in "bptr >= fb->bufend) - { - struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb); - struct mempool *pool = ctx->pool; - if (fb->bufend != fb->buffer) - { - uns len = fb->bufend - fb->buffer; - TRACE(ctx, "grow_chars"); - fb->buffer = mp_expand(pool); - fb->bufend = fb->buffer + mp_avail(pool); - fb->bstop = fb->buffer; - fb->bptr = fb->buffer + len; - } - else - { - TRACE(ctx, "push_chars"); - struct xml_node *n = xml_push_dom(ctx); - n->type = XML_NODE_CHARS; - xml_start_chars(ctx); - } - } -} - -static void -xml_init_chars(struct xml_context *ctx) -{ - struct fastbuf *fb = &ctx->chars; - fb->name = ""; - fb->spout = xml_chars_spout; - fb->can_overwrite_buffer = 1; - fb->bptr = fb->bstop = fb->buffer = fb->bufend = NULL; -} - /*** Initialization ***/ +static struct xml_context xml_defaults = { + .flags = XML_SRC_EOF | XML_REPORT_ALL, + .state = XML_STATE_START, + .h_resolve_entity = xml_def_resolve_entity, + .chars = { + .name = "", + .spout = xml_spout_chars, + .can_overwrite_buffer = 1, + }, +}; + static void xml_do_init(struct xml_context *ctx) { - ctx->flags = XML_REPORT_ALL; - xml_init_chars(ctx); xml_attrs_table_init(ctx); } void xml_init(struct xml_context *ctx) { - bzero(ctx, sizeof(*ctx)); + *ctx = xml_defaults; ctx->pool = mp_new(65536); ctx->stack = mp_new(65536); xml_do_init(ctx); @@ -160,7 +133,7 @@ xml_reset(struct xml_context *ctx) xml_sources_cleanup(ctx); mp_flush(pool); mp_flush(stack); - bzero(ctx, sizeof(*ctx)); + *ctx = xml_defaults; ctx->pool = pool; ctx->stack = stack; xml_do_init(ctx); diff --git a/sherlock/xml/common.h b/sherlock/xml/common.h index 
9ea8f74e..edff07dc 100644 --- a/sherlock/xml/common.h +++ b/sherlock/xml/common.h @@ -1,7 +1,7 @@ /* * Sherlock Library -- A simple XML parser * - * (c) 2007 Pavel Charvat + * (c) 2007--2008 Pavel Charvat * * This software may be freely distributed and used according to the terms * of the GNU Lesser General Public License. @@ -80,12 +80,15 @@ struct xml_dom_stack { }; static inline struct xml_node * -xml_push_dom(struct xml_context *ctx) +xml_push_dom(struct xml_context *ctx, struct mempool_state *state) { /* Create a new DOM node */ TRACE(ctx, "push_dom"); struct xml_dom_stack *s = xml_do_push(ctx, sizeof(*s)); - mp_save(ctx->pool, &s->state); + if (state) + s->state = *state; + else + mp_save(ctx->pool, &s->state); struct xml_node *n = mp_alloc(ctx->pool, sizeof(*n)); n->user = NULL; if (n->parent = ctx->node) @@ -120,27 +123,7 @@ xml_pop_dom(struct xml_context *ctx, uns free) void *xml_hash_new(struct mempool *pool, uns size); -static inline void -xml_start_chars(struct xml_context *ctx) -{ - struct fastbuf *fb = &ctx->chars; - fb->bstop = fb->bptr = fb->buffer = mp_start_noalign(ctx->pool, 1); - fb->bufend = fb->buffer + mp_avail(ctx->pool); -} - -static inline char * -xml_end_chars(struct xml_context *ctx, uns *len) -{ - struct fastbuf *fb = &ctx->chars; - uns l = fb->bufend - fb->buffer; - if (fb->bptr == fb->bufend) - fb->bptr = mp_expand(ctx->pool) + l; - *fb->bptr = 0; - char *c = mp_end(ctx->pool, fb->bptr + 1); - fb->bptr = fb->bstop = fb->buffer = fb->bufend = NULL; - *len = l; - return c; -} +void xml_spout_chars(struct fastbuf *fb); /*** Reading of document/external entities ***/ @@ -202,8 +185,8 @@ xml_ascii_cat(uns c) return xml_char_tab1[c]; } -struct xml_source *xml_push_source(struct xml_context *ctx, uns flags); -void xml_push_entity(struct xml_context *ctx, struct xml_dtd_ent *ent); +struct xml_source *xml_push_source(struct xml_context *ctx); +void xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent); void xml_refill(struct 
xml_context *ctx); @@ -325,7 +308,6 @@ char *xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool); char *xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool); uns xml_parse_char_ref(struct xml_context *ctx); -void xml_parse_ref(struct xml_context *ctx); void xml_parse_pe_ref(struct xml_context *ctx); char *xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr); @@ -347,4 +329,6 @@ void xml_skip_pi(struct xml_context *ctx); void xml_attrs_table_init(struct xml_context *ctx); void xml_attrs_table_cleanup(struct xml_context *ctx); +void xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value); + #endif diff --git a/sherlock/xml/dtd.c b/sherlock/xml/dtd.c index fbe7a325..289a2243 100644 --- a/sherlock/xml/dtd.c +++ b/sherlock/xml/dtd.c @@ -1,7 +1,7 @@ /* * Sherlock Library -- A simple XML parser * - * (c) 2007 Pavel Charvat + * (c) 2007--2008 Pavel Charvat * * This software may be freely distributed and used according to the terms * of the GNU Lesser General Public License. @@ -15,6 +15,7 @@ #include "sherlock/xml/common.h" #include "lib/fastbuf.h" #include "lib/ff-unicode.h" +#include "lib/unicode.h" /* Notations */ @@ -24,15 +25,24 @@ #define HASH_ZERO_FILL #define HASH_TABLE_DYNAMIC #define HASH_WANT_LOOKUP +#define HASH_WANT_FIND #define HASH_GIVE_ALLOC #define HASH_TABLE_ALLOC XML_HASH_GIVE_ALLOC #include "lib/hashtable.h" +struct xml_dtd_notn * +xml_dtd_find_notn(struct xml_context *ctx, char *name) +{ + struct xml_dtd *dtd = ctx->dtd; + struct xml_dtd_notn *notn = xml_dtd_notns_find(dtd->tab_notns, name); + return !notn ? NULL : (notn->flags & XML_DTD_NOTN_DECLARED) ? 
notn : NULL; +} + /* General entities */ #define HASH_PREFIX(x) xml_dtd_ents_##x -#define HASH_NODE struct xml_dtd_ent +#define HASH_NODE struct xml_dtd_entity #define HASH_KEY_STRING name #define HASH_ZERO_FILL #define HASH_TABLE_DYNAMIC @@ -43,81 +53,85 @@ XML_HASH_GIVE_ALLOC XML_HASH_GIVE_ALLOC #include "lib/hashtable.h" -static struct xml_dtd_ent * -xml_dtd_declare_trivial_ent(struct xml_context *ctx, char *name, uns uni) +static struct xml_dtd_entity * +xml_dtd_declare_trivial_entity(struct xml_context *ctx, char *name, char *text) { struct xml_dtd *dtd = ctx->dtd; - struct xml_dtd_ent *ent = xml_dtd_ents_lookup(dtd->tab_ents, name); - if (ent->flags & XML_DTD_ENT_DECLARED) + struct xml_dtd_entity *ent = xml_dtd_ents_lookup(dtd->tab_ents, name); + if (ent->flags & XML_DTD_ENTITY_DECLARED) { xml_warn(ctx, "Entity &%s; already declared", name); return NULL; } slist_add_tail(&dtd->ents, &ent->n); - ent->flags = XML_DTD_ENT_DECLARED | XML_DTD_ENT_TRIVIAL_UNI; - ent->uni = uni; + ent->flags = XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_TRIVIAL; + ent->text = text; return ent; } static void -xml_dtd_declare_default_ents(struct xml_context *ctx) +xml_dtd_declare_default_entities(struct xml_context *ctx) { - xml_dtd_declare_trivial_ent(ctx, "lt", 60); - xml_dtd_declare_trivial_ent(ctx, "gt", 62); - xml_dtd_declare_trivial_ent(ctx, "amp", 38); - xml_dtd_declare_trivial_ent(ctx, "apos", 39); - xml_dtd_declare_trivial_ent(ctx, "quot", 34); + xml_dtd_declare_trivial_entity(ctx, "lt", "<"); + xml_dtd_declare_trivial_entity(ctx, "gt", ">"); + xml_dtd_declare_trivial_entity(ctx, "amp", "&"); + xml_dtd_declare_trivial_entity(ctx, "apos", "'"); + xml_dtd_declare_trivial_entity(ctx, "quot", "\""); } -struct xml_dtd_ent * -xml_dtd_find_ent(struct xml_context *ctx, char *name) +struct xml_dtd_entity * +xml_def_find_entity(struct xml_context *ctx UNUSED, char *name) +{ +#define ENT(n, t) ent_##n = { .name = #n, .text = t, .flags = XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_TRIVIAL } + 
static struct xml_dtd_entity ENT(lt, "<"), ENT(gt, ">"), ENT(amp, "&"), ENT(apos, "'"), ENT(quot, "\""); +#undef ENT + switch (name[0]) + { + case 'l': + if (!strcmp(name, "lt")) + return &ent_lt; + break; + case 'g': + if (!strcmp(name, "gt")) + return &ent_gt; + break; + case 'a': + if (!strcmp(name, "amp")) + return &ent_amp; + if (!strcmp(name, "apos")) + return &ent_apos; + break; + case 'q': + if (!strcmp(name, "quot")) + return &ent_quot; + break; + } + return NULL; +} + +struct xml_dtd_entity * +xml_dtd_find_entity(struct xml_context *ctx, char *name) { struct xml_dtd *dtd = ctx->dtd; - if (ctx->h_resolve_entity) - return ctx->h_resolve_entity(ctx, name); + if (ctx->h_find_entity) + return ctx->h_find_entity(ctx, name); else if (dtd) { - struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_ents, name); - return !ent ? NULL : (ent->flags & XML_DTD_ENT_DECLARED) ? ent : NULL; + struct xml_dtd_entity *ent = xml_dtd_ents_find(dtd->tab_ents, name); + return !ent ? NULL : (ent->flags & XML_DTD_ENTITY_DECLARED) ? 
ent : NULL; } else - { -#define ENT(n, u) ent_##n = { .name = #n, .uni = u, .flags = XML_DTD_ENT_DECLARED | XML_DTD_ENT_TRIVIAL_UNI } - static struct xml_dtd_ent ENT(lt, 60), ENT(gt, 62), ENT(amp, 38), ENT(apos, 39), ENT(quot, 34); -#undef ENT - switch (name[0]) - { - case 'l': - if (!strcmp(name, "lt")) - return &ent_lt; - break; - case 'g': - if (!strcmp(name, "gt")) - return &ent_gt; - break; - case 'a': - if (!strcmp(name, "amp")) - return &ent_amp; - if (!strcmp(name, "apos")) - return &ent_apos; - break; - case 'q': - if (!strcmp(name, "quot")) - return &ent_quot; - break; - } - return NULL; - } + return xml_def_find_entity(ctx, name); } /* Parameter entities */ -static struct xml_dtd_ent * -xml_dtd_find_pent(struct xml_context *ctx, char *name) +static struct xml_dtd_entity * +xml_dtd_find_pentity(struct xml_context *ctx, char *name) { struct xml_dtd *dtd = ctx->dtd; - struct xml_dtd_ent *ent = xml_dtd_ents_find(dtd->tab_pents, name); - return !ent ? NULL : (ent->flags & XML_DTD_ENT_DECLARED) ? ent : NULL; + struct xml_dtd_entity *ent = xml_dtd_ents_find(dtd->tab_pents, name); + return !ent ? NULL : (ent->flags & XML_DTD_ENTITY_DECLARED) ? 
ent : NULL; } /* Elements */ @@ -318,7 +332,7 @@ xml_dtd_init(struct xml_context *ctx) xml_dtd_attrs_init(dtd->tab_attrs = xml_hash_new(pool, sizeof(struct xml_dtd_attrs_table))); xml_dtd_evals_init(dtd->tab_evals = xml_hash_new(pool, sizeof(struct xml_dtd_evals_table))); xml_dtd_enotns_init(dtd->tab_enotns = xml_hash_new(pool, sizeof(struct xml_dtd_enotns_table))); - xml_dtd_declare_default_ents(ctx); + xml_dtd_declare_default_entities(ctx); } void @@ -351,7 +365,7 @@ xml_parse_pe_ref(struct xml_context *ctx) mp_save(ctx->stack, &state); char *name = xml_parse_name(ctx, ctx->stack); xml_parse_char(ctx, ';'); - struct xml_dtd_ent *ent = xml_dtd_find_pent(ctx, name); + struct xml_dtd_entity *ent = xml_dtd_find_pentity(ctx, name); if (!ent) xml_error(ctx, "Unknown entity %%%s;", name); else @@ -401,25 +415,26 @@ xml_parse_dtd_white(struct xml_context *ctx, uns mandatory) } static void -xml_dtd_parse_external_id(struct xml_context *ctx, struct xml_ext_id *eid, uns allow_public) +xml_dtd_parse_external_id(struct xml_context *ctx, char **system_id, char **public_id, uns allow_public) { struct xml_dtd *dtd = ctx->dtd; - bzero(eid, sizeof(*eid)); uns c = xml_peek_char(ctx); if (c == 'S') { xml_parse_seq(ctx, "SYSTEM"); xml_parse_dtd_white(ctx, 1); - eid->system_id = xml_parse_system_literal(ctx, dtd->pool); + *public_id = NULL; + *system_id = xml_parse_system_literal(ctx, dtd->pool); } else if (c == 'P') { xml_parse_seq(ctx, "PUBLIC"); xml_parse_dtd_white(ctx, 1); - eid->public_id = xml_parse_pubid_literal(ctx, dtd->pool); - if (xml_parse_dtd_white(ctx, 0)) // FIXME + *system_id = NULL; + *public_id = xml_parse_pubid_literal(ctx, dtd->pool); + if (xml_parse_dtd_white(ctx, !allow_public)) if ((c = xml_peek_char(ctx)) == '\'' || c == '"' || !allow_public) - eid->system_id = xml_parse_system_literal(ctx, dtd->pool); + *system_id = xml_parse_system_literal(ctx, dtd->pool); } else xml_fatal(ctx, "Expected an external ID"); @@ -438,8 +453,8 @@ xml_parse_notation_decl(struct 
xml_context *ctx) struct xml_dtd_notn *notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx, dtd->pool)); xml_parse_dtd_white(ctx, 1); - struct xml_ext_id eid; - xml_dtd_parse_external_id(ctx, &eid, 1); + char *system_id, *public_id; + xml_dtd_parse_external_id(ctx, &system_id, &public_id, 1); xml_parse_dtd_white(ctx, 0); xml_parse_char(ctx, '>'); @@ -448,7 +463,8 @@ xml_parse_notation_decl(struct xml_context *ctx) else { notn->flags = XML_DTD_NOTN_DECLARED; - notn->eid = eid; + notn->system_id = system_id; + notn->public_id = public_id; slist_add_tail(&dtd->notns, &notn->n); } xml_dec(ctx); @@ -464,16 +480,16 @@ xml_parse_entity_decl(struct xml_context *ctx) struct xml_dtd *dtd = ctx->dtd; xml_parse_dtd_white(ctx, 1); - uns flags = (xml_get_char(ctx) == '%') ? XML_DTD_ENT_PARAMETER : 0; + uns flags = (xml_get_char(ctx) == '%') ? XML_DTD_ENTITY_PARAMETER : 0; if (flags) xml_parse_dtd_white(ctx, 1); else xml_unget_char(ctx); - struct xml_dtd_ent *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_ents, xml_parse_name(ctx, dtd->pool)); + struct xml_dtd_entity *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_ents, xml_parse_name(ctx, dtd->pool)); slist *list = flags ? 
&dtd->pents : &dtd->ents; xml_parse_dtd_white(ctx, 1); - if (ent->flags & XML_DTD_ENT_DECLARED) + if (ent->flags & XML_DTD_ENTITY_DECLARED) { xml_fatal(ctx, "Entity &%s; already declared, skipping not implemented", ent->name); // FIXME: should be only warning @@ -511,6 +527,7 @@ xml_parse_entity_decl(struct xml_context *ctx) p = mp_spread(dtd->pool, p, 3 + l); *p++ = '&'; memcpy(p, n, l); + p += l; *p++ = ';';; mp_restore(ctx->stack, &state); continue; @@ -528,27 +545,27 @@ xml_parse_entity_decl(struct xml_context *ctx) ent->len = p - (char *)mp_ptr(dtd->pool); ent->text = mp_end(dtd->pool, p + 1); slist_add_tail(list, &ent->n); - ent->flags = flags | XML_DTD_ENT_DECLARED; + ent->flags = flags | XML_DTD_ENTITY_DECLARED; } else { /* External entity */ - struct xml_ext_id eid; struct xml_dtd_notn *notn = NULL; - xml_dtd_parse_external_id(ctx, &eid, 0); - if (!xml_parse_dtd_white(ctx, 0) || !flags) - xml_parse_char(ctx, '>'); - else if (xml_get_char(ctx) != '>') + char *system_id, *public_id; + xml_unget_char(ctx); + xml_dtd_parse_external_id(ctx, &system_id, &public_id, 0); + if (xml_parse_dtd_white(ctx, 0) && flags && xml_peek_char(ctx) != '>') { /* General external unparsed entity */ - flags |= XML_DTD_ENT_UNPARSED; + flags |= XML_DTD_ENTITY_UNPARSED; xml_parse_seq(ctx, "NDATA"); xml_parse_dtd_white(ctx, 1); notn = xml_dtd_notns_lookup(dtd->tab_notns, xml_parse_name(ctx, dtd->pool)); } slist_add_tail(list, &ent->n); - ent->flags = flags | XML_DTD_ENT_DECLARED | XML_DTD_ENT_EXTERNAL; - ent->eid = eid; + ent->flags = flags | XML_DTD_ENTITY_DECLARED | XML_DTD_ENTITY_EXTERNAL; + ent->system_id = system_id; + ent->public_id = public_id; ent->notn = notn; } xml_parse_dtd_white(ctx, 0); @@ -754,7 +771,7 @@ xml_parse_attr_list_decl(struct xml_context *ctx) else { char *type = xml_parse_name(ctx, dtd->pool); - enum xml_dtd_attribute_type t = XML_ATTR_CDATA; + enum xml_dtd_attr_type t = XML_ATTR_CDATA; if (!strcmp(type, "CDATA")) t = XML_ATTR_CDATA; else if (!strcmp(type, 
"ID")) @@ -800,7 +817,7 @@ xml_parse_attr_list_decl(struct xml_context *ctx) attr->type = t; } xml_parse_dtd_white(ctx, 1); - enum xml_dtd_attribute_default def = XML_ATTR_NONE; + enum xml_dtd_attr_default def = XML_ATTR_NONE; if (xml_get_char(ctx) == '#') switch (xml_peek_char(ctx)) { @@ -864,3 +881,111 @@ xml_skip_internal_subset(struct xml_context *ctx) } xml_dec(ctx); } + +/*** Validation of attribute values ***/ + +static uns +xml_check_tokens(char *value, uns first_cat, uns next_cat, uns seq) +{ + char *p = value; + uns u; + while (1) + { + p = utf8_32_get(p, &u); + if (!(xml_char_cat(u) & first_cat)) + return 0; + while (*p & ~0x20) + { + p = utf8_32_get(p, &u); + if (!(xml_char_cat(u) & next_cat)) + return 0; + } + if (!*p) + return 1; + if (!seq) + return 0; + p++; + } +} + +static uns +xml_is_name(struct xml_context *ctx, char *value) +{ + /* Name ::= NameStartChar (NameChar)* */ + return xml_check_tokens(value, ctx->cat_sname, ctx->cat_name, 0); +} + +static uns +xml_is_names(struct xml_context *ctx, char *value) +{ + /* Names ::= Name (#x20 Name)* */ + return xml_check_tokens(value, ctx->cat_sname, ctx->cat_name, 1); +} + +static uns +xml_is_nmtoken(struct xml_context *ctx, char *value) +{ + /* Nmtoken ::= (NameChar)+ */ + return xml_check_tokens(value, ctx->cat_name, ctx->cat_name, 0); +} + +static uns +xml_is_nmtokens(struct xml_context *ctx, char *value) +{ + /* Nmtokens ::= Nmtoken (#x20 Nmtoken)* */ + return xml_check_tokens(value, ctx->cat_name, ctx->cat_name, 1); +} + +static void +xml_err_attr_format(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *type) +{ + xml_error(ctx, "Attribute %s in <%s> does not match the production of %s", dtd->name, dtd->elem->name, type); +} + +void +xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value) +{ + if (dtd->type == XML_ATTR_CDATA) + return; + xml_normalize_white(ctx, value); + switch (dtd->type) + { + case XML_ATTR_ID: + if (!xml_is_name(ctx, value)) + 
xml_err_attr_format(ctx, dtd, "NAME"); + //FIXME: add to a hash table + break; + case XML_ATTR_IDREF: + if (!xml_is_name(ctx, value)) + xml_err_attr_format(ctx, dtd, "NAME"); + // FIXME: find in hash table (beware forward references) + break; + case XML_ATTR_IDREFS: + if (!xml_is_names(ctx, value)) + xml_err_attr_format(ctx, dtd, "NAMES"); + // FIXME: find + break; + case XML_ATTR_ENTITY: + // FIXME + break; + case XML_ATTR_ENTITIES: + // FIXME + break; + case XML_ATTR_NMTOKEN: + if (!xml_is_nmtoken(ctx, value)) + xml_err_attr_format(ctx, dtd, "NMTOKEN"); + break; + case XML_ATTR_NMTOKENS: + if (!xml_is_nmtokens(ctx, value)) + xml_err_attr_format(ctx, dtd, "NMTOKENS"); + break; + case XML_ATTR_ENUM: + if (!xml_dtd_evals_find(ctx->dtd->tab_evals, dtd, value)) + xml_error(ctx, "Attribute %s in <%s> contains an undefined enumeration value", dtd->name, dtd->elem->name); + break; + case XML_ATTR_NOTATION: + if (!xml_dtd_find_notn(ctx, value)) + xml_error(ctx, "Attribute %s in <%s> contains an undefined notation", dtd->name, dtd->elem->name); + break; + } +} diff --git a/sherlock/xml/dtd.h b/sherlock/xml/dtd.h index 522274f2..ba57ee35 100644 --- a/sherlock/xml/dtd.h +++ b/sherlock/xml/dtd.h @@ -1,7 +1,7 @@ /* * Sherlock Library -- A simple XML parser * - * (c) 2007 Pavel Charvat + * (c) 2007--2008 Pavel Charvat * * This software may be freely distributed and used according to the terms * of the GNU Lesser General Public License. 
@@ -28,11 +28,6 @@ struct xml_dtd { void *tab_enotns; /* hash table of enumerated attribute notations */ }; -struct xml_ext_id { - char *system_id; - char *public_id; -}; - /* Notations */ enum xml_dtd_notn_flags { @@ -43,33 +38,37 @@ struct xml_dtd_notn { snode n; /* Node in xml_dtd.notns */ uns flags; /* XML_DTD_NOTN_x */ char *name; /* Notation name */ - struct xml_ext_id eid; /* External id */ + char *system_id; /* External ID */ + char *public_id; + void *user; /* User-defined */ }; +struct xml_dtd_notn *xml_dtd_find_notn(struct xml_context *ctx, char *name); + /* Entities */ -enum xml_dtd_ent_flags { - XML_DTD_ENT_DECLARED = 0x1, /* The entity has been declared (internal usage) */ - XML_DTD_ENT_VISITED = 0x2, /* Cycle detection (internal usage) */ - XML_DTD_ENT_PARAMETER = 0x4, /* Parameter entity, general otherwise */ - XML_DTD_ENT_EXTERNAL = 0x8, /* External entity, internal otherwise */ - XML_DTD_ENT_UNPARSED = 0x10, /* Unparsed entity, parsed otherwise */ - XML_DTD_ENT_TRIVIAL_STR = 0x20, /* Replacement text is a sequence of characters and character references */ - XML_DTD_ENT_TRIVIAL_UNI = 0x40, /* Replacement text is a single Unicode character */ +enum xml_dtd_entity_flags { + XML_DTD_ENTITY_DECLARED = 0x1, /* The entity has been declared (internal usage) */ + XML_DTD_ENTITY_VISITED = 0x2, /* Cycle detection (internal usage) */ + XML_DTD_ENTITY_PARAMETER = 0x4, /* Parameter entity, general otherwise */ + XML_DTD_ENTITY_EXTERNAL = 0x8, /* External entity, internal otherwise */ + XML_DTD_ENTITY_UNPARSED = 0x10, /* Unparsed entity, parsed otherwise */ + XML_DTD_ENTITY_TRIVIAL = 0x20, /* Replacement text is a sequence of characters and character references */ }; -struct xml_dtd_ent { +struct xml_dtd_entity { snode n; /* Node in xml_dtd.[gp]ents */ uns flags; /* XML_DTD_ENT_x */ char *name; /* Entity name */ char *text; /* Replacement text / expanded replacement text (XML_DTD_ENT_TRIVIAL) */ uns len; /* Text length */ - uns uni; /* Unicode value */ - struct 
xml_ext_id eid; /* External ID */ + char *system_id; /* External ID */ + char *public_id; struct xml_dtd_notn *notn; /* Notation (XML_DTD_ENT_UNPARSED only) */ + void *user; /* User-defined */ }; -struct xml_dtd_ent *xml_dtd_find_ent(struct xml_context *ctx, char *name); +struct xml_dtd_entity *xml_dtd_find_entity(struct xml_context *ctx, char *name); /* Elements */ @@ -90,6 +89,7 @@ struct xml_dtd_elem { uns type; char *name; struct xml_dtd_elem_node *node; + void *user; /* User-defined */ }; struct xml_dtd_elem_node { @@ -99,6 +99,7 @@ struct xml_dtd_elem_node { slist sons; uns type; uns occur; + void *user; /* User-defined */ }; enum xml_dtd_elem_node_type { @@ -118,14 +119,14 @@ struct xml_dtd_elem *xml_dtd_find_elem(struct xml_context *ctx, char *name); /* Attributes */ -enum xml_dtd_attribute_default { +enum xml_dtd_attr_default { XML_ATTR_NONE, XML_ATTR_REQUIRED, XML_ATTR_IMPLIED, XML_ATTR_FIXED, }; -enum xml_dtd_attribute_type { +enum xml_dtd_attr_type { XML_ATTR_CDATA, XML_ATTR_ID, XML_ATTR_IDREF, @@ -139,11 +140,11 @@ enum xml_dtd_attribute_type { }; struct xml_dtd_attr { - char *name; - struct xml_dtd_elem *elem; - enum xml_dtd_attribute_type type; - enum xml_dtd_attribute_default default_mode; - char *default_value; + char *name; /* Attribute name */ + struct xml_dtd_elem *elem; /* Owner element */ + uns type; /* See enum xml_dtd_attr_type */ + uns default_mode; /* See enum xml_dtd_attr_default */ + char *default_value; /* The default value defined in DTD (or NULL) */ }; struct xml_dtd_eval { diff --git a/sherlock/xml/parse.c b/sherlock/xml/parse.c index cc0e59d2..8f8d8f48 100644 --- a/sherlock/xml/parse.c +++ b/sherlock/xml/parse.c @@ -1,7 +1,7 @@ /* * Sherlock Library -- A simple XML parser * - * (c) 2007 Pavel Charvat + * (c) 2007--2008 Pavel Charvat * * This software may be freely distributed and used according to the terms * of the GNU Lesser General Public License. 
@@ -143,7 +143,7 @@ xml_push_comment(struct xml_context *ctx) /* Comment ::= '' * Already parsed: 'type = XML_NODE_COMMENT; char *p = mp_start_noalign(ctx->pool, 6); while (1) @@ -192,7 +192,7 @@ xml_push_pi(struct xml_context *ctx) * PI ::= '' Char*)))? '?>' * PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) * Already parsed: 'type = XML_NODE_PI; n->name = xml_parse_name(ctx, ctx->pool); if (unlikely(!strcasecmp(n->name, "xml"))) @@ -256,118 +256,6 @@ xml_skip_pi(struct xml_context *ctx) xml_dec(ctx); } -/*** Character data ***/ - -static inline uns -xml_flush_chars(struct xml_context *ctx) -{ - struct fastbuf *fb = &ctx->chars; - if (fb->bufend == fb->buffer) - return 0; - TRACE(ctx, "flush_chars"); - struct xml_node *n = ctx->node; - n->text = xml_end_chars(ctx, &n->len); - n->len = fb->bufend - fb->buffer; - if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_chars) - ctx->h_chars(ctx); - return 1; -} - -static inline void -xml_pop_chars(struct xml_context *ctx) -{ - xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS)); - TRACE(ctx, "pop_chars"); -} - -static inline void -xml_append_chars(struct xml_context *ctx) -{ - TRACE(ctx, "append_chars"); - struct fastbuf *out = &ctx->chars; - while (xml_get_char(ctx) != '<') - if (xml_last_char(ctx) == '&') - { - xml_inc(ctx); - xml_parse_ref(ctx); - } - else - bput_utf8_32(out, xml_last_char(ctx)); - xml_unget_char(ctx); -} - -/*** CDATA sections ***/ - -static void -xml_push_cdata(struct xml_context *ctx) -{ - TRACE(ctx, "push_cdata"); - /* CDSect :== '' Char*)) ']]>' - * Already parsed: 'type = XML_NODE_CHARS; - char *p = mp_start_noalign(ctx->pool, 7); - while (1) - { - if (xml_get_char(ctx) == ']') - { - if (xml_get_char(ctx) == ']') - if (xml_get_char(ctx) == '>') - break; - else - *p++ = ']'; - *p++ = ']'; - } - p = utf8_32_put(p, xml_last_char(ctx)); - p = mp_spread(ctx->pool, p, 7); - } - *p = 0; - n->len = p - (char *)mp_ptr(ctx->pool); - n->text = mp_end(ctx->pool, p + 1); - if ((ctx->flags & 
XML_REPORT_CHARS) && ctx->h_cdata) - ctx->h_cdata(ctx); -} - -static void -xml_pop_cdata(struct xml_context *ctx) -{ - xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS)); - xml_dec(ctx); - TRACE(ctx, "pop_cdata"); -} - -static void -xml_append_cdata(struct xml_context *ctx) -{ - TRACE(ctx, "append_cdata"); - xml_parse_seq(ctx, "CDATA["); - struct fastbuf *out = &ctx->chars; - while (1) - { - if (xml_get_char(ctx) == ']') - { - if (xml_get_char(ctx) == ']') - if (xml_get_char(ctx) == '>') - break; - else - bputc(out, ']'); - bputc(out, ']'); - } - bput_utf8_32(out, xml_last_char(ctx)); - } - xml_dec(ctx); -} - -static void UNUSED -xml_skip_cdata(struct xml_context *ctx) -{ - TRACE(ctx, "skip_cdata"); - xml_parse_seq(ctx, "CDATA["); - while (xml_get_char(ctx) != ']' || xml_get_char(ctx) != ']' || xml_get_char(ctx) != '>'); - xml_dec(ctx); -} - /*** Character references ***/ uns @@ -424,7 +312,7 @@ recover: /*** References to general entities ***/ -void +static void xml_parse_ref(struct xml_context *ctx) { /* Reference ::= EntityRef | CharRef @@ -443,7 +331,7 @@ xml_parse_ref(struct xml_context *ctx) mp_save(ctx->stack, &state); char *name = xml_parse_name(ctx, ctx->stack); xml_parse_char(ctx, ';'); - struct xml_dtd_ent *ent = xml_dtd_find_ent(ctx, name); + struct xml_dtd_entity *ent = xml_dtd_find_entity(ctx, name); if (!ent) { xml_error(ctx, "Unknown entity &%s;", name); @@ -451,15 +339,10 @@ xml_parse_ref(struct xml_context *ctx) bputs(out, name); bputc(out, ';'); } - else if (ent->flags & XML_DTD_ENT_TRIVIAL_UNI) + else if (ent->flags & XML_DTD_ENTITY_TRIVIAL) { TRACE(ctx, "Trivial entity &%s;", name); - bput_utf8_32(out, ent->uni); - } - else if (ent->flags & XML_DTD_ENT_TRIVIAL_STR) - { - TRACE(ctx, "Trivial entity &%s;", name); - bwrite(out, ent->text, ent->len); + bputs(out, ent->text); } else { @@ -474,6 +357,151 @@ xml_parse_ref(struct xml_context *ctx) } } +/*** Character data ***/ + +void +xml_spout_chars(struct fastbuf *fb) +{ + if (fb->bptr < fb->bufend) 
+ return; + struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb); + struct mempool *pool = ctx->pool; + if (fb->bufend != fb->buffer) + { + TRACE(ctx, "growing chars"); + uns len = fb->bufend - fb->buffer; + uns reported = fb->bstop - fb->buffer; + fb->buffer = mp_expand(pool); + fb->bufend = fb->buffer + mp_avail(pool); + fb->bptr = fb->buffer + len; + fb->bstop = fb->buffer + reported; + } + else + { + TRACE(ctx, "starting chars"); + mp_save(pool, &ctx->chars_state); + fb->bptr = fb->buffer = fb->bstop = mp_start_noalign(pool, 2); + fb->bufend = fb->buffer + mp_avail(pool) - 1; + } +} + +static inline uns +xml_end_chars(struct xml_context *ctx, char **out) +{ + struct fastbuf *fb = &ctx->chars; + uns len = fb->bptr - fb->buffer; + if (len) + { + TRACE(ctx, "ending chars"); + *fb->bptr = 0; + *out = mp_end(ctx->pool, fb->bptr + 1); + fb->bufend = fb->bstop = fb->bptr = fb->buffer; + } + return len; +} + +static inline uns +xml_report_chars(struct xml_context *ctx, char **out) +{ + struct fastbuf *fb = &ctx->chars; + uns len = fb->bptr - fb->buffer; + if (len) + { + *fb->bptr = 0; + *out = fb->bstop; + fb->bstop = fb->bptr; + } + return len; +} + +static inline uns +xml_flush_chars(struct xml_context *ctx) +{ + char *text, *rtext; + uns len = xml_end_chars(ctx, &text), rlen; + if (len) + { + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext))) + ctx->h_block(ctx, rtext, rlen); + if (!(ctx->flags & XML_ALLOC_CHARS) && (!(ctx->flags & XML_REPORT_CHARS) || !ctx->h_chars)) + { + mp_restore(ctx->pool, &ctx->chars_state); + return 0; + } + struct xml_node *n = xml_push_dom(ctx, &ctx->chars_state); + n->type = XML_NODE_CHARS; + n->text = text; + n->len = len; + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_chars) + ctx->h_chars(ctx); + } + return len; +} + +static inline void +xml_pop_chars(struct xml_context *ctx) +{ + xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS)); + TRACE(ctx, "pop_chars"); +} + +static inline 
void +xml_append_chars(struct xml_context *ctx) +{ + TRACE(ctx, "append_chars"); + struct fastbuf *out = &ctx->chars; + while (xml_get_char(ctx) != '<') + if (xml_last_char(ctx) == '&') + { + xml_inc(ctx); + xml_parse_ref(ctx); + } + else + bput_utf8_32(out, xml_last_char(ctx)); + xml_unget_char(ctx); +} + +/*** CDATA sections ***/ + +static void +xml_append_cdata(struct xml_context *ctx) +{ + /* CDSect :== '' Char*)) ']]>' + * Already parsed: 'chars; + uns rlen; + char *rtext; + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext))) + ctx->h_block(ctx, rtext, rlen); + while (1) + { + if (xml_get_char(ctx) == ']') + { + if (xml_get_char(ctx) == ']') + if (xml_get_char(ctx) == '>') + break; + else + bputc(out, ']'); + bputc(out, ']'); + } + bput_utf8_32(out, xml_last_char(ctx)); + } + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_cdata && (rlen = xml_report_chars(ctx, &rtext))) + ctx->h_cdata(ctx, rtext, rlen); + xml_dec(ctx); +} + +static void UNUSED +xml_skip_cdata(struct xml_context *ctx) +{ + TRACE(ctx, "skip_cdata"); + xml_parse_seq(ctx, "CDATA["); + while (xml_get_char(ctx) != ']' || xml_get_char(ctx) != ']' || xml_get_char(ctx) != '>'); + xml_dec(ctx); +} + /*** Attribute values ***/ char * @@ -481,15 +509,12 @@ xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr UNUSED) { TRACE(ctx, "parse_attr_value"); /* AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'" */ - /* FIXME: - * -- copying from ctx->chars to ctx->pool is not necessary, we could directly write to ctx->pool - * -- berare quotes inside parased entities - * -- check value constrains / normalize value */ + /* FIXME: -- check value constrains / normalize leading/trailing WS and repeated WS */ struct mempool_state state; uns quote = xml_parse_quote(ctx); mp_save(ctx->stack, &state); - xml_start_chars(ctx); struct fastbuf *out = &ctx->chars; + struct xml_source *src = ctx->src; while (1) { uns c = xml_get_char(ctx); @@ 
-498,7 +523,7 @@ xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr UNUSED) xml_inc(ctx); xml_parse_ref(ctx); } - else if (c == quote) // FIXME: beware quotes inside parsed entities + else if (c == quote && src == ctx->src) break; else if (c == '<') xml_error(ctx, "Attribute value must not contain '<'"); @@ -508,8 +533,29 @@ xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr UNUSED) bput_utf8_32(out, c); } mp_restore(ctx->stack, &state); - uns len; - return xml_end_chars(ctx, &len); + char *text; + return xml_end_chars(ctx, &text) ? text : ""; +} + +uns +xml_normalize_white(struct xml_context *ctx UNUSED, char *text) +{ + char *s = text, *d = text; + while (*s == 0x20) + s++; + while (1) + { + while (*s & ~0x20) + *d++ = *s++; + if (!*s) + break; + while (*++s == 0x20); + *d++ = 0x20; + } + if (d != text && d[-1] == 0x20) + d--; + *d = 0; + return d - text; } /*** Attributes ***/ @@ -559,18 +605,23 @@ xml_parse_attr(struct xml_context *ctx) { TRACE(ctx, "parse_attr"); /* Attribute ::= Name Eq AttValue */ - /* FIXME: - * -- memory management - * -- DTD */ struct xml_node *e = ctx->node; char *n = xml_parse_name(ctx, ctx->pool); struct xml_attr *a = xml_attrs_lookup(ctx->tab_attrs, e, n); xml_parse_eq(ctx); char *v = xml_parse_attr_value(ctx, NULL); if (a->val) - xml_error(ctx, "Attribute %s is not unique", n); + { + xml_error(ctx, "Attribute %s is not unique in element <%s>", n, e->name); + return; + } + a->val = v; + if (!e->dtd) + a->dtd = NULL; + else if (!(a->dtd = xml_dtd_find_attr(ctx, e->dtd, a->name))) + xml_error(ctx, "Undefined attribute %s in element <%s>", n, e->name); else - a->val = v; + xml_validate_attr(ctx, a->dtd, a->val); } struct xml_attr * @@ -601,7 +652,7 @@ xml_push_element(struct xml_context *ctx) * EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' * STag ::= '<' Name (S Attribute)* S? 
'>' * Already parsed: '<' */ - struct xml_node *e = xml_push_dom(ctx); + struct xml_node *e = xml_push_dom(ctx, NULL); clist_init(&e->sons); e->type = XML_NODE_ELEM; e->name = xml_parse_name(ctx, ctx->pool); @@ -610,7 +661,15 @@ xml_push_element(struct xml_context *ctx) { ctx->dom = e; if (ctx->doctype && strcmp(e->name, ctx->doctype)) - xml_error(ctx, "The root element %s does not match the document type %s", e->name, ctx->doctype); + xml_error(ctx, "The root element <%s> does not match the document type <%s>", e->name, ctx->doctype); + } + if (!ctx->dtd) + e->dtd = NULL; + else if (!(e->dtd = xml_dtd_find_elem(ctx, e->name))) + xml_error(ctx, "Undefined element <%s>", e->name); + else + { + // FIXME: validate regular expressions } while (1) { @@ -828,6 +887,7 @@ error: { case XML_STATE_START: TRACE(ctx, "entering prolog"); + ctx->flags |= XML_SRC_DOCUMENT | XML_SRC_EXPECTED_DECL; if (ctx->h_document_start) ctx->h_document_start(ctx); /* XMLDecl */ @@ -951,19 +1011,7 @@ first_tag: else if (c == '[') { /* CDATA */ - if (!(ctx->flags & XML_UNFOLD_CDATA)) - xml_append_cdata(ctx); - else - { - if (xml_flush_chars(ctx)) - { - PULL_STATE(CHARS, CHARS_BEFORE_CDATA); - xml_pop_chars(ctx); - } - xml_push_cdata(ctx); - PULL(CDATA); - xml_pop_cdata(ctx); - } + xml_append_cdata(ctx); } else xml_fatal(ctx, "Unexpected character after ' + * (c) 2007--2008 Pavel Charvat * * This software may be freely distributed and used according to the terms * of the GNU Lesser General Public License. 
@@ -67,7 +67,7 @@ xml_add_char(u32 **bstop, uns c) } struct xml_source * -xml_push_source(struct xml_context *ctx, uns flags) +xml_push_source(struct xml_context *ctx) { xml_push(ctx); struct xml_source *src = ctx->src; @@ -80,11 +80,17 @@ xml_push_source(struct xml_context *ctx, uns flags) src->next = ctx->src; src->saved_depth = ctx->depth; ctx->src = src; - ctx->flags = (ctx->flags & ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_NEW_LINE | XML_SRC_SURROUND | XML_SRC_DOCUMENT)) | flags; + ctx->flags &= ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_NEW_LINE | XML_SRC_SURROUND | XML_SRC_DOCUMENT); ctx->bstop = ctx->bptr = src->buf; ctx->depth = 0; - if (flags & XML_SRC_SURROUND) - xml_add_char(&ctx->bstop, 0x20); + return src; +} + +struct xml_source * +xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb) +{ + struct xml_source *src = xml_push_source(ctx); + src->fb = fb; return src; } @@ -101,11 +107,10 @@ xml_pop_source(struct xml_context *ctx) { TRACE(ctx, "pop_source"); if (unlikely(ctx->depth != 0)) - { - xml_fatal(ctx, "Unexpected end of entity"); - } + xml_fatal(ctx, "Unexpected end of entity"); struct xml_source *src = ctx->src; - ASSERT(src); + if (!src) + xml_fatal(ctx, "Undefined source"); xml_close_source(src); ctx->depth = src->saved_depth; ctx->src = src = src->next; @@ -133,31 +138,31 @@ xml_sources_cleanup(struct xml_context *ctx) static void xml_refill_utf8(struct xml_context *ctx); void -xml_push_entity(struct xml_context *ctx, struct xml_dtd_ent *ent) +xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent UNUSED) { - TRACE(ctx, "xml_push_entity"); - uns cat1 = ctx->src->refill_cat1; - uns cat2 = ctx->src->refill_cat2; - struct xml_source *src = xml_push_source(ctx, 0); - src->refill_cat1 = cat1; - src->refill_cat2 = cat2; - if (ent->flags & XML_DTD_ENT_EXTERNAL) - xml_fatal(ctx, "External entities not implemented"); // FIXME - else - { - fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, ent->len, 0); - 
src->refill = xml_refill_utf8; - } + xml_error(ctx, "References to external entities are not supported"); } void -xml_set_source(struct xml_context *ctx, struct fastbuf *fb) +xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent) { - TRACE(ctx, "xml_set_source"); - ASSERT(!ctx->src); - struct xml_source *src = xml_push_source(ctx, XML_SRC_DOCUMENT | XML_SRC_EXPECTED_DECL); - src->fb = fb; - ctx->state = XML_STATE_START; + TRACE(ctx, "xml_push_entity"); + struct xml_source *src; + if (ent->flags & XML_DTD_ENTITY_EXTERNAL) + { + ASSERT(ctx->h_resolve_entity); + ctx->h_resolve_entity(ctx, ent); + ctx->flags |= XML_SRC_EXPECTED_DECL; + src = ctx->src; + } + else + { + src = xml_push_source(ctx); + fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, strlen(ent->text), 0); + } + src->refill = xml_refill_utf8; + src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line; + src->refill_cat2 = ctx->cat_new_line; } static uns @@ -242,23 +247,6 @@ xml_refill_utf16_be(struct xml_context *ctx) REFILL(ctx, bget_utf16_be_repl, ~1U); } -#if 0 -static inline uns -xml_refill_libcharset_bget(struct fastbuf *fb, unsigned short int *in_to_x) -{ - // FIXME: slow - int c; - return (unlikely(c = bgetc(fb) < 0)) ? c : (int)conv_x_to_ucs(in_to_x[c]); -} - -static void -xml_refill_libcharset(struct xml_context *ctx) -{ - unsigned short int *in_to_x = ctx->src->refill_in_to_x; - REFILL(ctx, xml_refill_libcharset_bget, in_to_x); -} -#endif - #undef REFILL void @@ -279,12 +267,9 @@ xml_refill(struct xml_context *ctx) while (ctx->bptr == ctx->bstop); } -uns -xml_row(struct xml_context *ctx) +static uns +xml_source_row(struct xml_context *ctx, struct xml_source *src) { - struct xml_source *src = ctx->src; - if (!src) - return 0; uns row = src->row; for (u32 *p = ctx->bstop; p != ctx->bptr; p -= 2) if (p[-1] & src->refill_cat2) @@ -292,6 +277,12 @@ xml_row(struct xml_context *ctx) return row + 1; } +uns +xml_row(struct xml_context *ctx) +{ + return ctx->src ? 
xml_source_row(ctx, ctx->src) : 0; +} + /* Document/external entity header */ static char * @@ -318,18 +309,11 @@ xml_parse_encoding_name(struct xml_context *ctx) static void xml_init_charconv(struct xml_context *ctx, int cs) { - // FIXME: hack + // XXX: with a direct access to libcharset tables could be faster struct xml_source *src = ctx->src; TRACE(ctx, "wrapping charset %s", charset_name(cs)); -#if 0 - struct conv_context conv; - conv_set_charset(&conv, cs, CONV_CHARSET_UTF8); - src->refill = xml_refill_libcharset; - src->refill_in_to_x = conv.in_to_x; -#else src->wrapped_fb = src->fb; src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8); -#endif } void diff --git a/sherlock/xml/xml-test.c b/sherlock/xml/xml-test.c index db252c8d..76c5042b 100644 --- a/sherlock/xml/xml-test.c +++ b/sherlock/xml/xml-test.c @@ -1,7 +1,7 @@ /* * Sherlock Library -- A simple XML parser * - * (c) 2007 Pavel Charvat + * (c) 2007--2008 Pavel Charvat * * This software may be freely distributed and used according to the terms * of the GNU Lesser General Public License. 
@@ -9,19 +9,22 @@ #include "sherlock/sherlock.h" #include "sherlock/xml/xml.h" +#include "sherlock/xml/dtd.h" #include "lib/getopt.h" #include "lib/fastbuf.h" #include #include +#include enum { WANT_FIRST = 0x100, WANT_PARSE_DTD, WANT_HIDE_ERRORS, - WANT_UNFOLD_CDATA, WANT_IGNORE_COMMENTS, WANT_IGNORE_PIS, + WANT_REPORT_BLOCKS, + WANT_FILE_ENTITIES, }; static char *shortopts = "spd" CF_SHORT_OPTS; @@ -32,9 +35,10 @@ static struct option longopts[] = { { "dom", 0, 0, 'd' }, { "dtd", 0, 0, WANT_PARSE_DTD }, { "hide-errors", 0, 0, WANT_HIDE_ERRORS }, - { "unfold-cdata", 0, 0, WANT_UNFOLD_CDATA }, { "ignore-comments", 0, 0, WANT_IGNORE_COMMENTS }, { "ignore-pis", 0, 0, WANT_IGNORE_PIS }, + { "reports-blocks", 0, 0, WANT_REPORT_BLOCKS }, + { "file-entities", 0, 0, WANT_FILE_ENTITIES }, { NULL, 0, 0, 0 } }; @@ -52,9 +56,10 @@ CF_USAGE -d, --dom Test DOM interface\n\ --dtd Enable parsing of DTD\n\ --hide-errors Hide warnings and error messages\n\ - --unfold-cdata Unfold CDATA sections\n\ --ignore-comments Ignore processing instructions\n\ --ignore-pis Ignore comments\n\ + --report-blocks Report blocks or characters and CDATA sections\n\ + --file-entities Resolve file external entities (not fully normative)\n\ \n", stderr); exit(1); } @@ -64,9 +69,10 @@ static uns want_pull; static uns want_dom; static uns want_parse_dtd; static uns want_hide_errors; -static uns want_unfold_cdata; static uns want_ignore_comments; static uns want_ignore_pis; +static uns want_report_blocks; +static uns want_file_entities; static struct fastbuf *out; @@ -190,10 +196,15 @@ h_chars(struct xml_context *ctx) } static void -h_cdata(struct xml_context *ctx) +h_block(struct xml_context *ctx UNUSED, char *text, uns len UNUSED) { - bputs(out, "SAX: cdata"); - show_node(ctx->node); + bprintf(out, "SAX: block text='%s'\n", text); +} + +static void +h_cdata(struct xml_context *ctx UNUSED, char *text, uns len UNUSED) +{ + bprintf(out, "SAX: cdata text='%s'\n", text); } static void @@ -208,6 +219,12 @@ 
h_dtd_end(struct xml_context *ctx UNUSED) bputs(out, "SAX: dtd_end\n"); } +static void +h_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *e) +{ + xml_push_fastbuf(ctx, bopen(e->system_id, O_RDONLY, 4096)); +} + int main(int argc, char **argv) { @@ -232,15 +249,18 @@ main(int argc, char **argv) case WANT_HIDE_ERRORS: want_hide_errors++; break; - case WANT_UNFOLD_CDATA: - want_unfold_cdata++; - break; case WANT_IGNORE_COMMENTS: want_ignore_comments++; break; case WANT_IGNORE_PIS: want_ignore_pis++; break; + case WANT_REPORT_BLOCKS: + want_report_blocks++; + break; + case WANT_FILE_ENTITIES: + want_file_entities++; + break; default: usage(); } @@ -263,7 +283,11 @@ main(int argc, char **argv) ctx.h_stag = h_stag; ctx.h_etag = h_etag; ctx.h_chars = h_chars; - ctx.h_cdata = h_cdata; + if (want_report_blocks) + { + ctx.h_block = h_block; + ctx.h_cdata = h_cdata; + } ctx.h_dtd_start = h_dtd_start; ctx.h_dtd_end = h_dtd_end; } @@ -271,17 +295,17 @@ main(int argc, char **argv) ctx.flags |= XML_ALLOC_ALL; if (want_parse_dtd) ctx.flags |= XML_PARSE_DTD; - if (want_unfold_cdata) - ctx.flags |= XML_UNFOLD_CDATA; if (want_ignore_comments) ctx.flags &= ~(XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS); if (want_ignore_pis) ctx.flags &= ~(XML_REPORT_PIS | XML_ALLOC_PIS); - xml_set_source(&ctx, bfdopen_shared(0, 4096)); + if (want_file_entities) + ctx.h_resolve_entity = h_resolve_entity; + xml_push_fastbuf(&ctx, bfdopen_shared(0, 4096)); bputs(out, "PULL: start\n"); if (want_pull) { - ctx.pull = XML_PULL_CHARS | XML_PULL_CDATA | XML_PULL_STAG | XML_PULL_ETAG | XML_PULL_COMMENT | XML_PULL_PI; + ctx.pull = XML_PULL_CHARS | XML_PULL_STAG | XML_PULL_ETAG | XML_PULL_COMMENT | XML_PULL_PI; uns state; while (state = xml_next(&ctx)) switch (state) @@ -290,10 +314,6 @@ main(int argc, char **argv) bputs(out, "PULL: chars"); show_node(ctx.node); break; - case XML_STATE_CDATA: - bputs(out, "PULL: cdata"); - show_node(ctx.node); - break; case XML_STATE_STAG: bputs(out, "PULL: stag"); 
show_node(ctx.node); diff --git a/sherlock/xml/xml-test.t b/sherlock/xml/xml-test.t new file mode 100644 index 00000000..1a28be66 --- /dev/null +++ b/sherlock/xml/xml-test.t @@ -0,0 +1,43 @@ +# Tests for the XML parser +# (c) 2008 Pavel Charvat + +Run: ../obj/sherlock/xml/xml-test +In: + +Out: PULL: start + PULL: eof + +Run: ../obj/sherlock/xml/xml-test -s +In: + text1&amp;<text2 +Out: PULL: start + SAX: document_start + SAX: xml_decl version=1.0 standalone=0 + SAX: stag + SAX: stag a1='val1' a2='val2' + SAX: chars text='text1&<' + SAX: etag + SAX: chars text='text2' + SAX: etag + SAX: document_end + PULL: eof + +Run: ../obj/sherlock/xml/xml-test -s --dtd +In: + + + + ]> + &e1;&e2; +Out: PULL: start + SAX: document_start + SAX: xml_decl version=1.0 standalone=0 + SAX: doctype_decl type=root public='' system='' extsub=0 intsub=1 + SAX: dtd_start + SAX: dtd_end + SAX: stag + SAX: chars text='text' + SAX: etag + SAX: document_end + PULL: eof diff --git a/sherlock/xml/xml.h b/sherlock/xml/xml.h index 0e3cacea..8e416dbc 100644 --- a/sherlock/xml/xml.h +++ b/sherlock/xml/xml.h @@ -1,7 +1,7 @@ /* * Sherlock Library -- A simple XML parser * - * (c) 2007 Pavel Charvat + * (c) 2007--2008 Pavel Charvat * * This software may be freely distributed and used according to the terms * of the GNU Lesser General Public License. 
@@ -16,10 +16,10 @@ #include "lib/fastbuf.h" struct xml_context; -struct xml_dtd_ent; +struct xml_source; +struct xml_dtd_entity; enum xml_error { - // FIXME XML_ERR_OK = 0, XML_ERR_WARN = 1000, /* Warning */ XML_ERR_ERROR = 2000, /* Recoverable error */ @@ -33,7 +33,6 @@ enum xml_state { XML_STATE_XML_DECL, /* XML_PULL_XML_DECL */ XML_STATE_DOCTYPE_DECL, /* XML_PULL_DOCTYPE_DECL */ XML_STATE_CHARS, /* XML_PULL_CHARS */ - XML_STATE_CDATA, /* XML_PULL_CDATA */ XML_STATE_STAG, /* XML_PULL_STAG */ XML_STATE_ETAG, /* XML_PULL_ETAG */ XML_STATE_COMMENT, /* XML_PULL_COMMENT */ @@ -55,11 +54,10 @@ enum xml_pull { XML_PULL_XML_DECL = 0x00000001, /* Stop after the XML declaration */ XML_PULL_DOCTYPE_DECL = 0x00000002, /* Stop in the doctype declaration (before optional internal subset) */ XML_PULL_CHARS = 0x00000004, - XML_PULL_CDATA = 0x00000008, - XML_PULL_STAG = 0x00000010, - XML_PULL_ETAG = 0x00000020, - XML_PULL_COMMENT = 0x00000040, - XML_PULL_PI = 0x00000080, + XML_PULL_STAG = 0x00000008, + XML_PULL_ETAG = 0x00000010, + XML_PULL_COMMENT = 0x00000020, + XML_PULL_PI = 0x00000040, XML_PULL_ALL = 0xffffffff, }; @@ -81,9 +79,8 @@ enum xml_flags { XML_ALLOC_ALL = XML_ALLOC_MISC | XML_ALLOC_CHARS | XML_ALLOC_TAGS, /* Other parameters */ - XML_UNFOLD_CDATA = 0x00000100, /* Unfold CDATA sections */ - XML_VALIDATING = 0x00000200, /* Validate everything (not fully implemented!) */ - XML_PARSE_DTD = 0x00000400, /* Enable parsing of DTD */ + XML_VALIDATING = 0x00000100, /* Validate everything (not fully implemented!) */ + XML_PARSE_DTD = 0x00000200, /* Enable parsing of DTD */ /* Internals, do not change! 
*/ XML_EMPTY_ELEM_TAG = 0x00010000, /* The current element match EmptyElemTag */ @@ -130,6 +127,7 @@ struct xml_node { struct xml_attr { snode n; /* Node for elem->attrs */ struct xml_node *elem; /* Parent element */ + struct xml_dtd_attr *dtd; /* Attribute DTD */ char *name; /* Attribute name */ char *val; /* Attribute value */ void *user; /* User-defined (initialized to NULL) */ @@ -146,11 +144,13 @@ struct xml_context { /* Memory management */ struct mempool *pool; /* DOM pool */ - struct mempool *stack; /* Stack pool (free as soon as possible) */ + struct mempool *stack; /* Stack pool (freed as soon as possible) */ struct xml_stack *stack_list; /* See xml_push(), xml_pop() */ uns flags; /* XML_FLAG_x (restored on xml_pop()) */ - uns depth; /* Nesting level */ + uns depth; /* Nesting level (for checking of valid source nesting -> valid pushes/pops on memory pools) */ struct fastbuf chars; /* Character data / attribute value */ + struct mempool_state chars_state; /* Mempool state before the current character block has started */ + char *chars_trivial; /* If not empty, it will be appended to chars */ void *tab_attrs; /* Hash table of element attributes */ /* Input */ @@ -168,14 +168,16 @@ struct xml_context { void (*h_xml_decl)(struct xml_context *ctx); /* Called after the XML declaration */ void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration (before optional internal subset) */ void (*h_comment)(struct xml_context *ctx); /* Called after a comment (only with XML_REPORT_COMMENTS) */ - void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction (only with XML_REPORT_PIS) */ + void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction (only with XML_REPORT_PIS) */ void (*h_stag)(struct xml_context *ctx); /* Called after STag or EmptyElemTag (only with XML_REPORT_TAGS) */ void (*h_etag)(struct xml_context *ctx); /* Called before ETag or after EmptyElemTag (only with XML_REPORT_TAGS) */ void 
(*h_chars)(struct xml_context *ctx); /* Called after some characters (only with XML_REPORT_CHARS) */ - void (*h_cdata)(struct xml_context *ctx); /* Called after a CDATA section (only with XML_REPORT_CHARS and XML_UNFOLD_CDATA) */ + void (*h_block)(struct xml_context *ctx, char *text, uns len); /* Called for each continuous block of characters not reported by h_cdata() (only with XML_REPORT_CHARS) */ + void (*h_cdata)(struct xml_context *ctx, char *text, uns len); /* Called for each CDATA section (only with XML_REPORT_CHARS) */ void (*h_dtd_start)(struct xml_context *ctx); /* Called just after the DTD structure is initialized */ void (*h_dtd_end)(struct xml_context *ctx); /* Called after DTD subsets subsets */ - struct xml_dtd_ent *(*h_resolve_entity)(struct xml_context *ctx, char *name); + struct xml_dtd_entity *(*h_find_entity)(struct xml_context *ctx, char *name); /* Called when needed to resolve a general entity */ + void (*h_resolve_entity)(struct xml_context *ctx, struct xml_dtd_entity *ent); /* User should push source fastbuf for a parsed external entity (either general or parameter) */ /* DOM */ struct xml_node *dom; /* DOM root */ @@ -205,8 +207,8 @@ void xml_cleanup(struct xml_context *ctx); /* Reuse XML context */ void xml_reset(struct xml_context *ctx); -/* Setup XML source (fastbuf will be automatically closed) */ -void xml_set_source(struct xml_context *ctx, struct fastbuf *fb); +/* Add XML source (fastbuf will be automatically closed) */ +struct xml_source *xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb); /* Parse without the PUSH interface, return XML_ERR_x code (zero on success) */ uns xml_parse(struct xml_context *ctx); @@ -214,7 +216,19 @@ uns xml_parse(struct xml_context *ctx); /* Parse with the PUSH interface, return XML_STATE_x (zero on EOF or fatal error) */ uns xml_next(struct xml_context *ctx); +/* Returns the current row number in the document entity */ uns xml_row(struct xml_context *ctx); + +/* Finds a given attribute value 
in a XML_NODE_ELEM node */ struct xml_attr *xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name); +/* The default value of h_find_entity(), knows &lt;, &gt;, &amp;, &apos; and &quot; */ +struct xml_dtd_entity *xml_def_find_entity(struct xml_context *ctx, char *name); + +/* The default value of h_resolve_entity(), throws an error */ +void xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent); + +/* Removes leading/trailing spaces and replaces sequences of spaces with a single space character (non-CDATA attribute normalization) */ +uns xml_normalize_white(struct xml_context *ctx, char *value); + #endif -- 2.39.2