From: Pavel Charvat Date: Sun, 20 Jan 2008 11:28:39 +0000 (+0100) Subject: Small changes in XML parser: X-Git-Tag: holmes-import~471 X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;h=5da13cd16371faa6df5b880387b5b172a1704aef;p=libucw.git Small changes in XML parser: -- UTF-16 should now work -- support for parameter entities in DTD -- improved test cases -- renamed common.h to more intuitive internals.h --- diff --git a/sherlock/xml/TODO b/sherlock/xml/TODO index bf377f5c..b8dbc29c 100644 --- a/sherlock/xml/TODO +++ b/sherlock/xml/TODO @@ -11,8 +11,5 @@ Non-normative / not-implemented: -- full support for standalone documents -- Unicode normalization -Bugs: --- definitions of parameter entities do not work because of '%' expansion in " - * - * This software may be freely distributed and used according to the terms - * of the GNU Lesser General Public License. - */ - -#ifndef _SHERLOCK_XML_COMMON_H -#define _SHERLOCK_XML_COMMON_H - -#include "sherlock/xml/xml.h" -#include "sherlock/xml/dtd.h" - -/*** Debugging ***/ - -#ifdef LOCAL_DEBUG -#define TRACE(c, f, p...) do { DBG("XML %u: " f, xml_row(c), ##p); } while(0) -#else -#define TRACE(c, f, p...) 
do {} while(0) -#endif - -/*** Error handling ***/ - -void NONRET xml_throw(struct xml_context *ctx); -void xml_warn(struct xml_context *ctx, const char *format, ...); -void xml_error(struct xml_context *ctx, const char *format, ...); -void NONRET xml_fatal(struct xml_context *ctx, const char *format, ...); - -/*** Memory management ***/ - -struct xml_stack { - struct xml_stack *next; - struct mempool_state state; - uns flags; -}; - -static inline void * -xml_do_push(struct xml_context *ctx, uns size) -{ - /* Saves ctx->stack and ctx->flags state */ - struct mempool_state state; - mp_save(ctx->stack, &state); - struct xml_stack *s = mp_alloc(ctx->stack, size); - s->state = state; - s->flags = ctx->flags; - s->next = ctx->stack_list; - ctx->stack_list = s; - return s; -} - -static inline void -xml_do_pop(struct xml_context *ctx, struct xml_stack *s) -{ - /* Restore ctx->stack and ctx->flags state */ - ctx->stack_list = s->next; - ctx->flags = s->flags; - mp_restore(ctx->stack, &s->state); -} - -static inline void -xml_push(struct xml_context *ctx) -{ - TRACE(ctx, "push"); - xml_do_push(ctx, sizeof(struct xml_stack)); -} - -static inline void -xml_pop(struct xml_context *ctx) -{ - TRACE(ctx, "pop"); - ASSERT(ctx->stack_list); - xml_do_pop(ctx, ctx->stack_list); -} - -struct xml_dom_stack { - struct xml_stack stack; - struct mempool_state state; -}; - -static inline struct xml_node * -xml_push_dom(struct xml_context *ctx, struct mempool_state *state) -{ - /* Create a new DOM node */ - TRACE(ctx, "push_dom"); - struct xml_dom_stack *s = xml_do_push(ctx, sizeof(*s)); - if (state) - s->state = *state; - else - mp_save(ctx->pool, &s->state); - struct xml_node *n = mp_alloc(ctx->pool, sizeof(*n)); - n->user = NULL; - if (n->parent = ctx->node) - clist_add_tail(&n->parent->sons, &n->n); - return ctx->node = n; -} - -static inline void -xml_pop_dom(struct xml_context *ctx, uns free) -{ - /* Leave DOM subtree */ - TRACE(ctx, "pop_dom"); - ASSERT(ctx->node); - struct xml_node 
*p = ctx->node->parent; - struct xml_dom_stack *s = (void *)ctx->stack_list; - if (free) - { - /* See xml_pop_element() for cleanup of attribute hash table */ - if (p) - clist_remove(&ctx->node->n); - mp_restore(ctx->pool, &s->state); - } - ctx->node = p; - xml_do_pop(ctx, &s->stack); -} - -#define XML_HASH_HDR_SIZE ALIGN_TO(sizeof(void *), CPU_STRUCT_ALIGN) -#define XML_HASH_GIVE_ALLOC struct HASH_PREFIX(table); \ - static inline void *HASH_PREFIX(alloc)(struct HASH_PREFIX(table) *t, uns size) \ - { return mp_alloc(*(void **)((void *)t - XML_HASH_HDR_SIZE), size); } \ - static inline void HASH_PREFIX(free)(struct HASH_PREFIX(table) *t UNUSED, void *p UNUSED) {} - -void *xml_hash_new(struct mempool *pool, uns size); - -void xml_spout_chars(struct fastbuf *fb); - -/*** Reading of document/external entities ***/ - -#define XML_BUF_SIZE 32 /* At least 8 -- hardcoded */ - -struct xml_source { - struct xml_source *next; /* Link list of pending fastbufs (xml_context.sources) */ - struct fastbuf *fb; /* Source fastbuf */ - struct fastbuf *wrapped_fb; /* Original wrapped fastbuf (needed for cleanup) */ - struct fastbuf wrap_fb; /* Fbmem wrapper */ - u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */ - u32 *bptr, *bstop; /* Current state of the buffer */ - uns row; /* File position */ - char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */ - char *fb_encoding; /* Encoding of the source fastbuf */ - char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */ - uns refill_cat1; /* Character categories, which should be directly passed to the buffer */ - uns refill_cat2; /* Character categories, which should be processed as newlines (possibly in some built-in sequences) */ - void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */ - unsigned short *refill_in_to_x; /* Libcharset input table */ - uns saved_depth; /* Saved ctx->depth */ -}; 
- -void NONRET xml_fatal_nested(struct xml_context *ctx); - -static inline void -xml_inc(struct xml_context *ctx) -{ - /* Called after the first character of a block */ - TRACE(ctx, "inc"); - ctx->depth++; -} - -static inline void -xml_dec(struct xml_context *ctx) -{ - /* Called after the last character of a block */ - TRACE(ctx, "dec"); - if (unlikely(!ctx->depth--)) - xml_fatal_nested(ctx); -} - -#include "obj/sherlock/xml/unicat.h" - -static inline uns -xml_char_cat(uns c) -{ - if (c < 0x10000) - return 1U << xml_char_tab1[(c & 0xff) + xml_char_tab2[c >> 8]]; - else if (likely(c < 0x110000)) - return 1U << xml_char_tab3[c >> 16]; - else - return 1; -} - -static inline uns -xml_ascii_cat(uns c) -{ - return xml_char_tab1[c]; -} - -struct xml_source *xml_push_source(struct xml_context *ctx); -void xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent); - -void xml_refill(struct xml_context *ctx); - -static inline uns -xml_peek_char(struct xml_context *ctx) -{ - if (ctx->bptr == ctx->bstop) - xml_refill(ctx); - return ctx->bptr[0]; -} - -static inline uns -xml_peek_cat(struct xml_context *ctx) -{ - if (ctx->bptr == ctx->bstop) - xml_refill(ctx); - return ctx->bptr[1]; -} - -static inline uns -xml_get_char(struct xml_context *ctx) -{ - uns c = xml_peek_char(ctx); - ctx->bptr += 2; - return c; -} - -static inline uns -xml_get_cat(struct xml_context *ctx) -{ - uns c = xml_peek_cat(ctx); - ctx->bptr += 2; - return c; -} - -static inline uns -xml_last_char(struct xml_context *ctx) -{ - return ctx->bptr[-2]; -} - -static inline uns -xml_last_cat(struct xml_context *ctx) -{ - return ctx->bptr[-1]; -} - -static inline uns -xml_skip_char(struct xml_context *ctx) -{ - uns c = ctx->bptr[0]; - ctx->bptr += 2; - return c; -} - -static inline uns -xml_unget_char(struct xml_context *ctx) -{ - return *(ctx->bptr -= 2); -} - -void xml_sources_cleanup(struct xml_context *ctx); - -/*** Parsing ***/ - -void NONRET xml_fatal_expected(struct xml_context *ctx, uns c); -void 
NONRET xml_fatal_expected_white(struct xml_context *ctx); -void NONRET xml_fatal_expected_quot(struct xml_context *ctx); - -static inline uns -xml_parse_white(struct xml_context *ctx, uns mandatory) -{ - /* mandatory=1 -> S ::= (#x20 | #x9 | #xD | #xA)+ - * mandatory=0 -> S? */ - uns cnt = 0; - while (xml_peek_cat(ctx) & XML_CHAR_WHITE) - { - xml_skip_char(ctx); - cnt++; - } - if (unlikely(mandatory && !cnt)) - xml_fatal_expected_white(ctx); - return cnt; -} - -static inline void -xml_parse_char(struct xml_context *ctx, uns c) -{ - /* Consumes a given Unicode character */ - if (unlikely(c != xml_get_char(ctx))) - xml_fatal_expected(ctx, c); -} - -static inline void -xml_parse_seq(struct xml_context *ctx, const char *seq) -{ - /* Consumes a given sequence of ASCII characters */ - while (*seq) - xml_parse_char(ctx, *seq++); -} - -void xml_parse_eq(struct xml_context *ctx); - -static inline uns -xml_parse_quote(struct xml_context *ctx) -{ - /* "'" | '"' */ - uns c = xml_get_char(ctx); - if (unlikely(c != '\'' && c != '\"')) - xml_fatal_expected_quot(ctx); - return c; -} - -char *xml_parse_name(struct xml_context *ctx, struct mempool *pool); -void xml_skip_name(struct xml_context *ctx); -char *xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool); - -char *xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool); -char *xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool); - -uns xml_parse_char_ref(struct xml_context *ctx); -void xml_parse_pe_ref(struct xml_context *ctx); - -char *xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr); - -void xml_skip_internal_subset(struct xml_context *ctx); -void xml_parse_notation_decl(struct xml_context *ctx); -void xml_parse_entity_decl(struct xml_context *ctx); -void xml_parse_element_decl(struct xml_context *ctx); -void xml_parse_attr_list_decl(struct xml_context *ctx); - -void xml_push_comment(struct xml_context *ctx); -void xml_pop_comment(struct xml_context *ctx); 
-void xml_skip_comment(struct xml_context *ctx); - -void xml_push_pi(struct xml_context *ctx); -void xml_pop_pi(struct xml_context *ctx); -void xml_skip_pi(struct xml_context *ctx); - -void xml_attrs_table_init(struct xml_context *ctx); -void xml_attrs_table_cleanup(struct xml_context *ctx); - -void xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value); - -#endif diff --git a/sherlock/xml/dtd.c b/sherlock/xml/dtd.c index 289a2243..0abca72a 100644 --- a/sherlock/xml/dtd.c +++ b/sherlock/xml/dtd.c @@ -12,7 +12,7 @@ #include "sherlock/sherlock.h" #include "sherlock/xml/xml.h" #include "sherlock/xml/dtd.h" -#include "sherlock/xml/common.h" +#include "sherlock/xml/internals.h" #include "lib/fastbuf.h" #include "lib/ff-unicode.h" #include "lib/unicode.h" @@ -380,24 +380,32 @@ xml_parse_pe_ref(struct xml_context *ctx) xml_dec(ctx); } -static void -xml_parse_dtd_pe(struct xml_context *ctx) +static uns +xml_parse_dtd_pe(struct xml_context *ctx, uns entity_decl) { + /* Already parsed: '%' */ do { - xml_skip_char(ctx); xml_inc(ctx); + if (!~entity_decl && (xml_peek_cat(ctx) & XML_CHAR_WHITE)) + { + xml_dec(ctx); + return ~0U; + } + xml_parse_pe_ref(ctx); while (xml_peek_cat(ctx) & XML_CHAR_WHITE) xml_skip_char(ctx); - xml_parse_pe_ref(ctx); } - while (xml_peek_char(ctx) != '%'); + while (xml_get_char(ctx) == '%'); + xml_unget_char(ctx); + return 1; } static inline uns xml_parse_dtd_white(struct xml_context *ctx, uns mandatory) { - /* Whitespace or parameter entity */ + /* Whitespace or parameter entity, + * mandatory==~0U has a special meaning of the whitespace before the '%' character in a parameter entity declaration */ uns cnt = 0; while (xml_peek_cat(ctx) & XML_CHAR_WHITE) { @@ -406,8 +414,8 @@ xml_parse_dtd_white(struct xml_context *ctx, uns mandatory) } if (xml_peek_char(ctx) == '%') { - xml_parse_dtd_pe(ctx); - return 1; + xml_skip_char(ctx); + return xml_parse_dtd_pe(ctx, mandatory); } else if (unlikely(mandatory && !cnt)) 
xml_fatal_expected_white(ctx); @@ -478,23 +486,17 @@ xml_parse_entity_decl(struct xml_context *ctx) /* Already parsed: 'dtd; - xml_parse_dtd_white(ctx, 1); - - uns flags = (xml_get_char(ctx) == '%') ? XML_DTD_ENTITY_PARAMETER : 0; + uns flags = ~xml_parse_dtd_white(ctx, ~0U) ? 0 : XML_DTD_ENTITY_PARAMETER; if (flags) xml_parse_dtd_white(ctx, 1); - else - xml_unget_char(ctx); - struct xml_dtd_entity *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_ents, xml_parse_name(ctx, dtd->pool)); - slist *list = flags ? &dtd->pents : &dtd->ents; xml_parse_dtd_white(ctx, 1); + slist *list = flags ? &dtd->pents : &dtd->ents; if (ent->flags & XML_DTD_ENTITY_DECLARED) { xml_fatal(ctx, "Entity &%s; already declared, skipping not implemented", ent->name); // FIXME: should be only warning } - uns c, sep = xml_get_char(ctx); if (sep == '\'' || sep == '"') { diff --git a/sherlock/xml/internals.h b/sherlock/xml/internals.h new file mode 100644 index 00000000..bbf28c05 --- /dev/null +++ b/sherlock/xml/internals.h @@ -0,0 +1,311 @@ +/* + * Sherlock Library -- A simple XML parser + * + * (c) 2007--2008 Pavel Charvat + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#ifndef _SHERLOCK_XML_INTERNALS_H +#define _SHERLOCK_XML_INTERNALS_H + +#include "sherlock/xml/xml.h" +#include "sherlock/xml/dtd.h" + +/*** Debugging ***/ + +#ifdef LOCAL_DEBUG +#define TRACE(c, f, p...) do { DBG("XML %u: " f, xml_row(c), ##p); } while(0) +#else +#define TRACE(c, f, p...) 
do {} while(0) +#endif + +/*** Error handling ***/ + +void NONRET xml_throw(struct xml_context *ctx); + +/*** Memory management ***/ + +struct xml_stack { + struct xml_stack *next; + struct mempool_state state; + uns flags; +}; + +static inline void * +xml_do_push(struct xml_context *ctx, uns size) +{ + /* Saves ctx->stack and ctx->flags state */ + struct mempool_state state; + mp_save(ctx->stack, &state); + struct xml_stack *s = mp_alloc(ctx->stack, size); + s->state = state; + s->flags = ctx->flags; + s->next = ctx->stack_list; + ctx->stack_list = s; + return s; +} + +static inline void +xml_do_pop(struct xml_context *ctx, struct xml_stack *s) +{ + /* Restore ctx->stack and ctx->flags state */ + ctx->stack_list = s->next; + ctx->flags = s->flags; + mp_restore(ctx->stack, &s->state); +} + +static inline void +xml_push(struct xml_context *ctx) +{ + TRACE(ctx, "push"); + xml_do_push(ctx, sizeof(struct xml_stack)); +} + +static inline void +xml_pop(struct xml_context *ctx) +{ + TRACE(ctx, "pop"); + ASSERT(ctx->stack_list); + xml_do_pop(ctx, ctx->stack_list); +} + +struct xml_dom_stack { + struct xml_stack stack; + struct mempool_state state; +}; + +static inline struct xml_node * +xml_push_dom(struct xml_context *ctx, struct mempool_state *state) +{ + /* Create a new DOM node */ + TRACE(ctx, "push_dom"); + struct xml_dom_stack *s = xml_do_push(ctx, sizeof(*s)); + if (state) + s->state = *state; + else + mp_save(ctx->pool, &s->state); + struct xml_node *n = mp_alloc(ctx->pool, sizeof(*n)); + n->user = NULL; + if (n->parent = ctx->node) + clist_add_tail(&n->parent->sons, &n->n); + return ctx->node = n; +} + +static inline void +xml_pop_dom(struct xml_context *ctx, uns free) +{ + /* Leave DOM subtree */ + TRACE(ctx, "pop_dom"); + ASSERT(ctx->node); + struct xml_node *p = ctx->node->parent; + struct xml_dom_stack *s = (void *)ctx->stack_list; + if (free) + { + /* See xml_pop_element() for cleanup of attribute hash table */ + if (p) + clist_remove(&ctx->node->n); + 
mp_restore(ctx->pool, &s->state); + } + ctx->node = p; + xml_do_pop(ctx, &s->stack); +} + +#define XML_HASH_HDR_SIZE ALIGN_TO(sizeof(void *), CPU_STRUCT_ALIGN) +#define XML_HASH_GIVE_ALLOC struct HASH_PREFIX(table); \ + static inline void *HASH_PREFIX(alloc)(struct HASH_PREFIX(table) *t, uns size) \ + { return mp_alloc(*(void **)((void *)t - XML_HASH_HDR_SIZE), size); } \ + static inline void HASH_PREFIX(free)(struct HASH_PREFIX(table) *t UNUSED, void *p UNUSED) {} + +void *xml_hash_new(struct mempool *pool, uns size); + +void xml_spout_chars(struct fastbuf *fb); + +/*** Reading of document/external entities ***/ + +void NONRET xml_fatal_nested(struct xml_context *ctx); + +static inline void +xml_inc(struct xml_context *ctx) +{ + /* Called after the first character of a block */ + TRACE(ctx, "inc"); + ctx->depth++; +} + +static inline void +xml_dec(struct xml_context *ctx) +{ + /* Called after the last character of a block */ + TRACE(ctx, "dec"); + if (unlikely(!ctx->depth--)) + xml_fatal_nested(ctx); +} + +#include "obj/sherlock/xml/unicat.h" + +static inline uns +xml_char_cat(uns c) +{ + if (c < 0x10000) + return 1U << xml_char_tab1[(c & 0xff) + xml_char_tab2[c >> 8]]; + else if (likely(c < 0x110000)) + return 1U << xml_char_tab3[c >> 16]; + else + return 1; +} + +static inline uns +xml_ascii_cat(uns c) +{ + return xml_char_tab1[c]; +} + +struct xml_source *xml_push_source(struct xml_context *ctx); +void xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent); + +void xml_refill(struct xml_context *ctx); + +static inline uns +xml_peek_char(struct xml_context *ctx) +{ + if (ctx->bptr == ctx->bstop) + xml_refill(ctx); + return ctx->bptr[0]; +} + +static inline uns +xml_peek_cat(struct xml_context *ctx) +{ + if (ctx->bptr == ctx->bstop) + xml_refill(ctx); + return ctx->bptr[1]; +} + +static inline uns +xml_get_char(struct xml_context *ctx) +{ + uns c = xml_peek_char(ctx); + ctx->bptr += 2; + return c; +} + +static inline uns +xml_get_cat(struct 
xml_context *ctx) +{ + uns c = xml_peek_cat(ctx); + ctx->bptr += 2; + return c; +} + +static inline uns +xml_last_char(struct xml_context *ctx) +{ + return ctx->bptr[-2]; +} + +static inline uns +xml_last_cat(struct xml_context *ctx) +{ + return ctx->bptr[-1]; +} + +static inline uns +xml_skip_char(struct xml_context *ctx) +{ + uns c = ctx->bptr[0]; + ctx->bptr += 2; + return c; +} + +static inline uns +xml_unget_char(struct xml_context *ctx) +{ + return *(ctx->bptr -= 2); +} + +void xml_sources_cleanup(struct xml_context *ctx); + +/*** Parsing ***/ + +void NONRET xml_fatal_expected(struct xml_context *ctx, uns c); +void NONRET xml_fatal_expected_white(struct xml_context *ctx); +void NONRET xml_fatal_expected_quot(struct xml_context *ctx); + +static inline uns +xml_parse_white(struct xml_context *ctx, uns mandatory) +{ + /* mandatory=1 -> S ::= (#x20 | #x9 | #xD | #xA)+ + * mandatory=0 -> S? */ + uns cnt = 0; + while (xml_peek_cat(ctx) & XML_CHAR_WHITE) + { + xml_skip_char(ctx); + cnt++; + } + if (unlikely(mandatory && !cnt)) + xml_fatal_expected_white(ctx); + return cnt; +} + +static inline void +xml_parse_char(struct xml_context *ctx, uns c) +{ + /* Consumes a given Unicode character */ + if (unlikely(c != xml_get_char(ctx))) + xml_fatal_expected(ctx, c); +} + +static inline void +xml_parse_seq(struct xml_context *ctx, const char *seq) +{ + /* Consumes a given sequence of ASCII characters */ + while (*seq) + xml_parse_char(ctx, *seq++); +} + +void xml_parse_eq(struct xml_context *ctx); + +static inline uns +xml_parse_quote(struct xml_context *ctx) +{ + /* "'" | '"' */ + uns c = xml_get_char(ctx); + if (unlikely(c != '\'' && c != '\"')) + xml_fatal_expected_quot(ctx); + return c; +} + +char *xml_parse_name(struct xml_context *ctx, struct mempool *pool); +void xml_skip_name(struct xml_context *ctx); +char *xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool); + +char *xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool); +char 
*xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool); + +uns xml_parse_char_ref(struct xml_context *ctx); +void xml_parse_pe_ref(struct xml_context *ctx); + +char *xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr); + +void xml_skip_internal_subset(struct xml_context *ctx); +void xml_parse_notation_decl(struct xml_context *ctx); +void xml_parse_entity_decl(struct xml_context *ctx); +void xml_parse_element_decl(struct xml_context *ctx); +void xml_parse_attr_list_decl(struct xml_context *ctx); + +void xml_push_comment(struct xml_context *ctx); +void xml_pop_comment(struct xml_context *ctx); +void xml_skip_comment(struct xml_context *ctx); + +void xml_push_pi(struct xml_context *ctx); +void xml_pop_pi(struct xml_context *ctx); +void xml_skip_pi(struct xml_context *ctx); + +void xml_attrs_table_init(struct xml_context *ctx); +void xml_attrs_table_cleanup(struct xml_context *ctx); + +void xml_validate_attr(struct xml_context *ctx, struct xml_dtd_attr *dtd, char *value); + +#endif diff --git a/sherlock/xml/parse.c b/sherlock/xml/parse.c index 8f8d8f48..572b6b96 100644 --- a/sherlock/xml/parse.c +++ b/sherlock/xml/parse.c @@ -12,7 +12,7 @@ #include "sherlock/sherlock.h" #include "sherlock/xml/xml.h" #include "sherlock/xml/dtd.h" -#include "sherlock/xml/common.h" +#include "sherlock/xml/internals.h" #include "lib/fastbuf.h" #include "lib/ff-unicode.h" #include "lib/unicode.h" diff --git a/sherlock/xml/source.c b/sherlock/xml/source.c index aebe5cce..af70644d 100644 --- a/sherlock/xml/source.c +++ b/sherlock/xml/source.c @@ -12,7 +12,7 @@ #include "sherlock/sherlock.h" #include "sherlock/xml/xml.h" #include "sherlock/xml/dtd.h" -#include "sherlock/xml/common.h" +#include "sherlock/xml/internals.h" #include "lib/unicode.h" #include "lib/ff-unicode.h" #include "charset/charconv.h" @@ -331,7 +331,7 @@ xml_parse_decl(struct xml_context *ctx) src->refill_cat2 = ctx->cat_new_line; /* Initialize the supplied charset (if any) or try to guess 
it */ - char *expected_encoding = src->expected_encoding ? : src->fb_encoding; + char *expected_encoding = src->expected_encoding; src->refill = xml_refill_utf8; int bom = bpeekc(src->fb); if (bom < 0) @@ -358,8 +358,6 @@ xml_parse_decl(struct xml_context *ctx) src->refill = xml_refill_utf16_be; if (bom == 0xff) src->refill = xml_refill_utf16_le; - if (!src->expected_encoding) - expected_encoding = (bom == 0xff) ? "UTF-16LE" : "UTF-16BE"; } else if (strcasecmp(src->fb_encoding, "UTF-16BE")) src->refill = xml_refill_utf16_be; @@ -372,10 +370,15 @@ xml_parse_decl(struct xml_context *ctx) } } uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be; + if (utf16) + src->fb_encoding = (src->refill == xml_refill_utf16_be) ? "UTF-16BE" : "UTF-16LE"; + if (!expected_encoding) + expected_encoding = src->fb_encoding; if (bom > 0 && xml_peek_char(ctx) == 0xfeff) xml_skip_char(ctx); else if (utf16) xml_error(ctx, "Missing or corrupted BOM"); + TRACE(ctx, "Initial encoding=%s", src->fb_encoding ? 
: "?"); /* Look ahead for presence of XMLDecl or optional TextDecl */ if (!(ctx->flags & XML_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf)) @@ -462,13 +465,19 @@ end: if (cs < 0 && !expected_encoding) xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding); else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8) - xml_init_charconv(ctx, cs); + { + xml_init_charconv(ctx, cs); + src->fb_encoding = src->decl_encoding; + } else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 || !(!strcasecmp(src->decl_encoding, "UTF-16") || (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) || (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE"))))) xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding); } + if (!src->fb_encoding) + src->fb_encoding = "UTF-8"; + TRACE(ctx, "Final encoding=%s", src->fb_encoding); exit: /* Update valid Unicode ranges */ diff --git a/sherlock/xml/xml-test.c b/sherlock/xml/xml-test.c index 76c5042b..186bdd2f 100644 --- a/sherlock/xml/xml-test.c +++ b/sherlock/xml/xml-test.c @@ -150,7 +150,7 @@ h_document_end(struct xml_context *ctx UNUSED) static void h_xml_decl(struct xml_context *ctx) { - bprintf(out, "SAX: xml_decl version=%s standalone=%d\n", ctx->version_str, ctx->standalone); + bprintf(out, "SAX: xml_decl version=%s standalone=%d fb_encoding=%s\n", ctx->version_str, ctx->standalone, ctx->src->fb_encoding); } static void diff --git a/sherlock/xml/xml-test.t b/sherlock/xml/xml-test.t index 1a28be66..9ca7d7dd 100644 --- a/sherlock/xml/xml-test.t +++ b/sherlock/xml/xml-test.t @@ -12,7 +12,7 @@ In: text1&amp;<text2 Out: PULL: start SAX: document_start - SAX: xml_decl version=1.0 standalone=0 + SAX: xml_decl version=1.0 standalone=0 fb_encoding=ISO-8859-1 SAX: stag SAX: stag a1='val1' a2='val2' SAX: chars text='text1&<' @@ -22,22 +22,37 @@ Out: PULL: start 
SAX: document_end PULL: eof -Run: ../obj/sherlock/xml/xml-test -s --dtd +Run: (printf '\376\377' && bin/cs2cs UTF-8 UTF-16BE) | ../obj/sherlock/xml/xml-test -spd --dtd In: - + "> + %pe1; + ]> - &e1;&e2; + &e1;&e2; Out: PULL: start SAX: document_start - SAX: xml_decl version=1.0 standalone=0 + SAX: xml_decl version=1.0 standalone=0 fb_encoding=UTF-16BE SAX: doctype_decl type=root public='' system='' extsub=0 intsub=1 SAX: dtd_start SAX: dtd_end SAX: stag - SAX: chars text='text' + PULL: stag + SAX: chars text='text' + PULL: chars text='text' + SAX: stag + PULL: stag + SAX: chars text='' + PULL: chars text='' + PULL: etag + SAX: etag + PULL: etag SAX: etag SAX: document_end PULL: eof + DOM: element + DOM: chars text='text' + DOM: element + DOM: chars text='' diff --git a/sherlock/xml/xml.h b/sherlock/xml/xml.h index 8e416dbc..bd4f9ffe 100644 --- a/sherlock/xml/xml.h +++ b/sherlock/xml/xml.h @@ -16,7 +16,6 @@ #include "lib/fastbuf.h" struct xml_context; -struct xml_source; struct xml_dtd_entity; enum xml_error { @@ -133,6 +132,27 @@ struct xml_attr { void *user; /* User-defined (initialized to NULL) */ }; +#define XML_BUF_SIZE 32 /* At least 8 -- hardcoded */ + +struct xml_source { + struct xml_source *next; /* Link list of pending fastbufs (xml_context.sources) */ + struct fastbuf *fb; /* Source fastbuf */ + struct fastbuf *wrapped_fb; /* Original wrapped fastbuf (needed for cleanup) */ + struct fastbuf wrap_fb; /* Fbmem wrapper */ + u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */ + u32 *bptr, *bstop; /* Current state of the buffer */ + uns row; /* File position */ + char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */ + char *fb_encoding; /* Encoding of the source fastbuf */ + char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */ + uns refill_cat1; /* Character categories, which should be directly passed to the buffer */ + uns refill_cat2; /* Character 
categories, which should be processed as newlines (possibly in some built-in + sequences) */ + void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */ + unsigned short *refill_in_to_x; /* Libcharset input table */ + uns saved_depth; /* Saved ctx->depth */ +}; + struct xml_context { /* Error handling */ char *err_msg; /* Last error message */ @@ -204,7 +224,7 @@ void xml_init(struct xml_context *ctx); /* Clean up all internal structures */ void xml_cleanup(struct xml_context *ctx); -/* Reuse XML context */ +/* Reuse XML context, equivalent to xml_cleanup() and xml_init() */ void xml_reset(struct xml_context *ctx); /* Add XML source (fastbuf will be automatically closed) */ @@ -231,4 +251,9 @@ void xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent) /* Remove leading/trailing spaces and replaces sequences of spaces to a single space character (non-CDATA attribute normalization) */ uns xml_normalize_white(struct xml_context *ctx, char *value); +/* Public part of error handling */ +void xml_warn(struct xml_context *ctx, const char *format, ...); +void xml_error(struct xml_context *ctx, const char *format, ...); +void NONRET xml_fatal(struct xml_context *ctx, const char *format, ...); + #endif