From 5da13cd16371faa6df5b880387b5b172a1704aef Mon Sep 17 00:00:00 2001 From: Pavel Charvat Date: Sun, 20 Jan 2008 12:28:39 +0100 Subject: [PATCH] Small changes in XML parser: -- UTF-16 should now work -- support for parameter entities in DTD -- improved test cases -- renamed common.h to more intuitive internals.h --- sherlock/xml/TODO | 3 --- sherlock/xml/common.c | 2 +- sherlock/xml/dtd.c | 36 ++++++++++++++------------ sherlock/xml/{common.h => internals.h} | 27 ++----------------- sherlock/xml/parse.c | 2 +- sherlock/xml/source.c | 19 ++++++++++---- sherlock/xml/xml-test.c | 2 +- sherlock/xml/xml-test.t | 27 ++++++++++++++----- sherlock/xml/xml.h | 29 +++++++++++++++++++-- 9 files changed, 86 insertions(+), 61 deletions(-) rename sherlock/xml/{common.h => internals.h} (83%) diff --git a/sherlock/xml/TODO b/sherlock/xml/TODO index bf377f5c..b8dbc29c 100644 --- a/sherlock/xml/TODO +++ b/sherlock/xml/TODO @@ -11,8 +11,5 @@ Non-normative / not-implemented: -- full support for standalone documents -- Unicode normalization -Bugs: --- definitions of parameter entities do not work because of '%' expansion in "dtd; - xml_parse_dtd_white(ctx, 1); - - uns flags = (xml_get_char(ctx) == '%') ? XML_DTD_ENTITY_PARAMETER : 0; + uns flags = ~xml_parse_dtd_white(ctx, ~0U) ? 0 : XML_DTD_ENTITY_PARAMETER; if (flags) xml_parse_dtd_white(ctx, 1); - else - xml_unget_char(ctx); - struct xml_dtd_entity *ent = xml_dtd_ents_lookup(flags ? dtd->tab_pents : dtd->tab_ents, xml_parse_name(ctx, dtd->pool)); - slist *list = flags ? &dtd->pents : &dtd->ents; xml_parse_dtd_white(ctx, 1); + slist *list = flags ? &dtd->pents : &dtd->ents; if (ent->flags & XML_DTD_ENTITY_DECLARED) { xml_fatal(ctx, "Entity &%s; already declared, skipping not implemented", ent->name); // FIXME: should be only warning } - uns c, sep = xml_get_char(ctx); if (sep == '\'' || sep == '"') { diff --git a/sherlock/xml/common.h b/sherlock/xml/internals.h similarity index 83% rename from sherlock/xml/common.h rename to sherlock/xml/internals.h index edff07dc..bbf28c05 100644 --- a/sherlock/xml/common.h +++ b/sherlock/xml/internals.h @@ -7,8 +7,8 @@ * of the GNU Lesser General Public License. */ -#ifndef _SHERLOCK_XML_COMMON_H -#define _SHERLOCK_XML_COMMON_H +#ifndef _SHERLOCK_XML_INTERNALS_H +#define _SHERLOCK_XML_INTERNALS_H #include "sherlock/xml/xml.h" #include "sherlock/xml/dtd.h" @@ -24,9 +24,6 @@ /*** Error handling ***/ void NONRET xml_throw(struct xml_context *ctx); -void xml_warn(struct xml_context *ctx, const char *format, ...); -void xml_error(struct xml_context *ctx, const char *format, ...); -void NONRET xml_fatal(struct xml_context *ctx, const char *format, ...); /*** Memory management ***/ @@ -127,26 +124,6 @@ void xml_spout_chars(struct fastbuf *fb); /*** Reading of document/external entities ***/ -#define XML_BUF_SIZE 32 /* At least 8 -- hardcoded */ - -struct xml_source { - struct xml_source *next; /* Link list of pending fastbufs (xml_context.sources) */ - struct fastbuf *fb; /* Source fastbuf */ - struct fastbuf *wrapped_fb; /* Original wrapped fastbuf (needed for cleanup) */ - struct fastbuf wrap_fb; /* Fbmem wrapper */ - u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */ - u32 *bptr, *bstop; /* Current state of the buffer */ - uns row; /* File position */ - char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */ - char *fb_encoding; /* Encoding of the source fastbuf */ - char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */ - uns refill_cat1; /* Character categories, which should be directly passed to the buffer */ - uns refill_cat2; /* Character categories, which should be processed as newlines (possibly in some built-in sequences) */ - void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */ - unsigned short *refill_in_to_x; /* Libcharset input table */ - uns saved_depth; /* Saved ctx->depth */ -}; - void NONRET xml_fatal_nested(struct xml_context *ctx); static inline void diff --git a/sherlock/xml/parse.c b/sherlock/xml/parse.c index 8f8d8f48..572b6b96 100644 --- a/sherlock/xml/parse.c +++ b/sherlock/xml/parse.c @@ -12,7 +12,7 @@ #include "sherlock/sherlock.h" #include "sherlock/xml/xml.h" #include "sherlock/xml/dtd.h" -#include "sherlock/xml/common.h" +#include "sherlock/xml/internals.h" #include "lib/fastbuf.h" #include "lib/ff-unicode.h" #include "lib/unicode.h" diff --git a/sherlock/xml/source.c b/sherlock/xml/source.c index aebe5cce..af70644d 100644 --- a/sherlock/xml/source.c +++ b/sherlock/xml/source.c @@ -12,7 +12,7 @@ #include "sherlock/sherlock.h" #include "sherlock/xml/xml.h" #include "sherlock/xml/dtd.h" -#include "sherlock/xml/common.h" +#include "sherlock/xml/internals.h" #include "lib/unicode.h" #include "lib/ff-unicode.h" #include "charset/charconv.h" @@ -331,7 +331,7 @@ xml_parse_decl(struct xml_context *ctx) src->refill_cat2 = ctx->cat_new_line; /* Initialize the supplied charset (if any) or try to guess it */ - char *expected_encoding = src->expected_encoding ? : src->fb_encoding; + char *expected_encoding = src->expected_encoding; src->refill = xml_refill_utf8; int bom = bpeekc(src->fb); if (bom < 0) @@ -358,8 +358,6 @@ xml_parse_decl(struct xml_context *ctx) src->refill = xml_refill_utf16_be; if (bom == 0xff) src->refill = xml_refill_utf16_le; - if (!src->expected_encoding) - expected_encoding = (bom == 0xff) ? "UTF-16LE" : "UTF-16BE"; } else if (strcasecmp(src->fb_encoding, "UTF-16BE")) src->refill = xml_refill_utf16_be; @@ -372,10 +370,15 @@ xml_parse_decl(struct xml_context *ctx) } } uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be; + if (utf16) + src->fb_encoding = (src->refill == xml_refill_utf16_be) ? "UTF-16BE" : "UTF-16LE"; + if (!expected_encoding) + expected_encoding = src->fb_encoding; if (bom > 0 && xml_peek_char(ctx) == 0xfeff) xml_skip_char(ctx); else if (utf16) xml_error(ctx, "Missing or corrupted BOM"); + TRACE(ctx, "Initial encoding=%s", src->fb_encoding ? : "?"); /* Look ahead for presence of XMLDecl or optional TextDecl */ if (!(ctx->flags & XML_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf)) @@ -462,13 +465,19 @@ end: if (cs < 0 && !expected_encoding) xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding); else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8) - xml_init_charconv(ctx, cs); + { + xml_init_charconv(ctx, cs); + src->fb_encoding = src->decl_encoding; + } else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 || !(!strcasecmp(src->decl_encoding, "UTF-16") || (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) || (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE"))))) xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding); } + if (!src->fb_encoding) + src->fb_encoding = "UTF-8"; + TRACE(ctx, "Final encoding=%s", src->fb_encoding); exit: /* Update valid Unicode ranges */ diff --git a/sherlock/xml/xml-test.c b/sherlock/xml/xml-test.c index 76c5042b..186bdd2f 100644 --- a/sherlock/xml/xml-test.c +++ b/sherlock/xml/xml-test.c @@ -150,7 +150,7 @@ h_document_end(struct xml_context *ctx UNUSED) static void h_xml_decl(struct xml_context *ctx) { - bprintf(out, "SAX: xml_decl version=%s standalone=%d\n", ctx->version_str, ctx->standalone); + bprintf(out, "SAX: xml_decl version=%s standalone=%d fb_encoding=%s\n", ctx->version_str, ctx->standalone, ctx->src->fb_encoding); } static void diff --git a/sherlock/xml/xml-test.t b/sherlock/xml/xml-test.t index 1a28be66..9ca7d7dd 100644 --- a/sherlock/xml/xml-test.t +++ b/sherlock/xml/xml-test.t @@ -12,7 +12,7 @@ In: text1&amp;<text2 Out: PULL: start SAX: document_start - SAX: xml_decl version=1.0 standalone=0 + SAX: xml_decl version=1.0 standalone=0 fb_encoding=ISO-8859-1 SAX: stag SAX: stag a1='val1' a2='val2' SAX: chars text='text1&<' @@ -22,22 +22,37 @@ Out: PULL: start SAX: document_end PULL: eof -Run: ../obj/sherlock/xml/xml-test -s --dtd +Run: (printf '\376\377' && bin/cs2cs UTF-8 UTF-16BE) | ../obj/sherlock/xml/xml-test -spd --dtd In: - + "> + %pe1; + ]> - &e1;&e2; + &e1;&e2; Out: PULL: start SAX: document_start - SAX: xml_decl version=1.0 standalone=0 + SAX: xml_decl version=1.0 standalone=0 fb_encoding=UTF-16BE SAX: doctype_decl type=root public='' system='' extsub=0 intsub=1 SAX: dtd_start SAX: dtd_end SAX: stag - SAX: chars text='text' + PULL: stag + SAX: chars text='text' + PULL: chars text='text' + SAX: stag + PULL: stag + SAX: chars text='' + PULL: chars text='' + PULL: etag + SAX: etag + PULL: etag SAX: etag SAX: document_end PULL: eof + DOM: element + DOM: chars text='text' + DOM: element + DOM: chars text='' diff --git a/sherlock/xml/xml.h b/sherlock/xml/xml.h index 8e416dbc..bd4f9ffe 100644 --- a/sherlock/xml/xml.h +++ b/sherlock/xml/xml.h @@ -16,7 +16,6 @@ #include "lib/fastbuf.h" struct xml_context; -struct xml_source; struct xml_dtd_entity; enum xml_error { @@ -133,6 +132,27 @@ struct xml_attr { void *user; /* User-defined (initialized to NULL) */ }; +#define XML_BUF_SIZE 32 /* At least 8 -- hardcoded */ + +struct xml_source { + struct xml_source *next; /* Link list of pending fastbufs (xml_context.sources) */ + struct fastbuf *fb; /* Source fastbuf */ + struct fastbuf *wrapped_fb; /* Original wrapped fastbuf (needed for cleanup) */ + struct fastbuf wrap_fb; /* Fbmem wrapper */ + u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */ + u32 *bptr, *bstop; /* Current state of the buffer */ + uns row; /* File position */ + char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */ + char *fb_encoding; /* Encoding of the source fastbuf */ + char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */ + uns refill_cat1; /* Character categories, which should be directly passed to the buffer */ + uns refill_cat2; /* Character categories, which should be processed as newlines (possibly in some built-in + sequences) */ + void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */ + unsigned short *refill_in_to_x; /* Libcharset input table */ + uns saved_depth; /* Saved ctx->depth */ +}; + struct xml_context { /* Error handling */ char *err_msg; /* Last error message */ @@ -204,7 +224,7 @@ void xml_init(struct xml_context *ctx); /* Clean up all internal structures */ void xml_cleanup(struct xml_context *ctx); -/* Reuse XML context */ +/* Reuse XML context, equivalent to xml_cleanup() and xml_init() */ void xml_reset(struct xml_context *ctx); /* Add XML source (fastbuf will be automatically closed) */ @@ -231,4 +251,9 @@ void xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent) /* Remove leading/trailing spaces and replaces sequences of spaces to a single space character (non-CDATA attribute normalization) */ uns xml_normalize_white(struct xml_context *ctx, char *value); +/* Public part of error handling */ +void xml_warn(struct xml_context *ctx, const char *format, ...); +void xml_error(struct xml_context *ctx, const char *format, ...); +void NONRET xml_fatal(struct xml_context *ctx, const char *format, ...); + #endif -- 2.39.2