X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=xml%2Fsource.c;h=3b06f510b41c7b9a896969a3963d9662dfac05ee;hb=HEAD;hp=d369c319a5b9dd8212d34513e7eb3def27b18e7a;hpb=cf171e985a6f6b88428a27de76b91bf56e0c033b;p=libucw.git diff --git a/xml/source.c b/xml/source.c deleted file mode 100644 index d369c319..00000000 --- a/xml/source.c +++ /dev/null @@ -1,486 +0,0 @@ -/* - * UCW Library -- A simple XML parser - * - * (c) 2007--2008 Pavel Charvat - * - * This software may be freely distributed and used according to the terms - * of the GNU Lesser General Public License. - */ - -#undef LOCAL_DEBUG - -#include -#include -#include -#include -#include -#include -#include -#include - -/*** Character categorization ***/ - -#include "obj/xml/unicat.c" - -static void -xml_init_cats(struct xml_context *ctx) -{ - if (!(ctx->flags & XML_VERSION_1_1)) - { - ctx->cat_chars = XML_CHAR_VALID_1_0; - ctx->cat_unrestricted = XML_CHAR_VALID_1_0; - ctx->cat_new_line = XML_CHAR_NEW_LINE_1_0; - ctx->cat_name = XML_CHAR_NAME_1_0; - ctx->cat_sname = XML_CHAR_SNAME_1_0; - } - else - { - ctx->cat_chars = XML_CHAR_VALID_1_1; - ctx->cat_unrestricted = XML_CHAR_UNRESTRICTED_1_1; - ctx->cat_new_line = XML_CHAR_NEW_LINE_1_1; - ctx->cat_name = XML_CHAR_NAME_1_1; - ctx->cat_sname = XML_CHAR_SNAME_1_1; - } -} - -/*** Reading of document/external entities ***/ - -static void NONRET -xml_eof(struct xml_context *ctx) -{ - ctx->err_msg = "Unexpected EOF"; - ctx->err_code = XML_ERR_EOF; - xml_throw(ctx); -} - -void NONRET -xml_fatal_nested(struct xml_context *ctx) -{ - xml_fatal(ctx, "Entity is not nested correctly"); -} - -static inline void -xml_add_char(u32 **bstop, uint c) -{ - *(*bstop)++ = c; - *(*bstop)++ = xml_char_cat(c); -} - -struct xml_source * -xml_push_source(struct xml_context *ctx) -{ - xml_push(ctx); - struct xml_source *src = ctx->src; - if (src) - { - src->bptr = ctx->bptr; - src->bstop = ctx->bstop; - } - src = mp_alloc_zero(ctx->stack, sizeof(*src)); - src->next = ctx->src; - src->saved_depth = ctx->depth; - ctx->src = src; - ctx->flags &= ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_DOCUMENT); - ctx->bstop = ctx->bptr = src->buf; - ctx->depth = 0; - return src; -} - -struct xml_source * -xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb) -{ - struct xml_source *src = xml_push_source(ctx); - src->fb = fb; - return src; -} - -static void -xml_close_source(struct xml_source *src) -{ - bclose(src->fb); - if (src->wrapped_fb) - bclose(src->wrapped_fb); -} - -static void -xml_pop_source(struct xml_context *ctx) -{ - TRACE(ctx, "pop_source"); - if (unlikely(ctx->depth != 0)) - xml_fatal(ctx, "Unexpected end of entity"); - struct xml_source *src = ctx->src; - if (!src) - xml_fatal(ctx, "Undefined source"); - xml_close_source(src); - ctx->depth = src->saved_depth; - ctx->src = src = src->next; - if (src) - { - ctx->bptr = src->bptr; - ctx->bstop = src->bstop; - } - xml_pop(ctx); - if (unlikely(!src)) - xml_eof(ctx); -} - -void -xml_sources_cleanup(struct xml_context *ctx) -{ - struct xml_source *s; - while (s = ctx->src) - { - ctx->src = s->next; - xml_close_source(s); - } -} - -static void xml_refill_utf8(struct xml_context *ctx); - -void -xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent UNUSED) -{ - xml_error(ctx, "References to external entities are not supported"); -} - -void -xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent) -{ - TRACE(ctx, "xml_push_entity"); - struct xml_source *src; - if (ent->flags & XML_DTD_ENTITY_EXTERNAL) - { - ASSERT(ctx->h_resolve_entity); - ctx->h_resolve_entity(ctx, ent); - ctx->flags |= XML_SRC_EXPECTED_DECL; - src = ctx->src; - } - else - { - src = xml_push_source(ctx); - fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, strlen(ent->text), 0); - } - src->refill = xml_refill_utf8; - src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line; - src->refill_cat2 = ctx->cat_new_line; -} - -static uint -xml_error_restricted(struct xml_context *ctx, uint c) -{ - if (c == ~1U) - xml_error(ctx, "Corrupted encoding"); - else - xml_error(ctx, "Restricted char U+%04X", c); - return UNI_REPLACEMENT; -} - -static void xml_parse_decl(struct xml_context *ctx); - -#define REFILL(ctx, func, params...) \ - struct xml_source *src = ctx->src; \ - struct fastbuf *fb = src->fb; \ - if (ctx->bptr == ctx->bstop) \ - ctx->bptr = ctx->bstop = src->buf; \ - uint c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \ - u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \ - *last_0xd = src->pending_0xd ? bstop : NULL; \ - do \ - { \ - c = func(fb, ##params); \ - uint t = xml_char_cat(c); \ - if (t & t1) \ - /* Typical branch */ \ - *bstop++ = c, *bstop++ = t; \ - else if (t & t2) \ - { \ - /* New line */ \ - /* XML 1.0: 0xA | 0xD | 0xD 0xA */ \ - /* XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ \ - if (c == 0xd) \ - last_0xd = bstop + 2; \ - else if (c != 0x2028 && last_0xd == bstop) \ - { \ - last_0xd = NULL; \ - continue; \ - } \ - xml_add_char(&bstop, 0xa), row++; \ - } \ - else if (c == '>') \ - { \ - /* Used only in XML/TextDecl to switch the encoding */ \ - *bstop++ = c, *bstop++ = t; \ - break; \ - } \ - else if (~c) \ - /* Restricted character */ \ - xml_add_char(&bstop, xml_error_restricted(ctx, c)); \ - else \ - { \ - /* EOF */ \ - ctx->flags |= XML_SRC_EOF; \ - break; \ - } \ - } \ - while (bstop < bend); \ - src->pending_0xd = (last_0xd == bstop); \ - ctx->bstop = bstop; \ - src->row = row; - -static void -xml_refill_utf8(struct xml_context *ctx) -{ - REFILL(ctx, bget_utf8_repl, ~1U); -} - -static void -xml_refill_utf16_le(struct xml_context *ctx) -{ - REFILL(ctx, bget_utf16_le_repl, ~1U); -} - -static void -xml_refill_utf16_be(struct xml_context *ctx) -{ - REFILL(ctx, bget_utf16_be_repl, ~1U); -} - -#undef REFILL - -void -xml_refill(struct xml_context *ctx) -{ - do - { - if (ctx->flags & XML_SRC_EOF) - xml_pop_source(ctx); - else if (ctx->flags & XML_SRC_EXPECTED_DECL) - xml_parse_decl(ctx); - else - { - ctx->src->refill(ctx); - TRACE(ctx, "refilled %u characters", (uint)((ctx->bstop - ctx->bptr) / 2)); - } - } - while (ctx->bptr == ctx->bstop); -} - -static uint -xml_source_row(struct xml_context *ctx, struct xml_source *src) -{ - uint row = src->row; - for (u32 *p = ctx->bstop; p != ctx->bptr; p -= 2) - if (p[-1] & src->refill_cat2) - row--; - return row + 1; -} - -uint -xml_row(struct xml_context *ctx) -{ - return ctx->src ? xml_source_row(ctx, ctx->src) : 0; -} - -/* Document/external entity header */ - -static char * -xml_parse_encoding_name(struct xml_context *ctx) -{ - /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" */ - char *p = mp_start_noalign(ctx->pool, 1); - uint q = xml_parse_quote(ctx); - if (unlikely(!(xml_get_cat(ctx) & XML_CHAR_ENC_SNAME))) - xml_fatal(ctx, "Invalid character in the encoding name"); - while (1) - { - p = mp_spread(ctx->pool, p, 2); - *p++ = xml_last_char(ctx); - if (xml_get_char(ctx) == q) - break; - if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME))) - xml_fatal(ctx, "Invalid character in the encoding name"); - } - *p++ = 0; - return mp_end(ctx->pool, p); -} - -static void -xml_init_charconv(struct xml_context *ctx, int cs) -{ - // XXX: with a direct access to libucw-charset tables could be faster - struct xml_source *src = ctx->src; - TRACE(ctx, "wrapping charset %s", charset_name(cs)); - src->wrapped_fb = src->fb; - src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8); -} - -static void -xml_parse_decl(struct xml_context *ctx) -{ - TRACE(ctx, "xml_parse_decl"); - struct xml_source *src = ctx->src; - ctx->flags &= ~XML_SRC_EXPECTED_DECL; - uint doc = ctx->flags & XML_SRC_DOCUMENT; - - /* Setup valid Unicode ranges and force the reader to abort refill() after each '>', where we can switch encoding or XML version */ - if (doc) - xml_init_cats(ctx); - src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line & ~XML_CHAR_GT; - src->refill_cat2 = ctx->cat_new_line; - - /* Initialize the supplied charset (if any) or try to guess it */ - char *expected_encoding = src->expected_encoding; - src->refill = xml_refill_utf8; - int bom = bpeekc(src->fb); - if (bom < 0) - ctx->flags |= XML_SRC_EOF; - if (!src->fb_encoding) - { - if (bom == 0xfe) - src->refill = xml_refill_utf16_be; - else if (bom == 0xff) - src->refill = xml_refill_utf16_le; - } - else - { - int cs = find_charset_by_name(src->fb_encoding); - if (cs == CONV_CHARSET_UTF8) - {} - else if (cs >= 0) - { - xml_init_charconv(ctx, cs); - bom = 0; - } - else if (strcasecmp(src->fb_encoding, "UTF-16")) - { - src->refill = xml_refill_utf16_be; - if (bom == 0xff) - src->refill = xml_refill_utf16_le; - } - else if (strcasecmp(src->fb_encoding, "UTF-16BE")) - src->refill = xml_refill_utf16_be; - else if (strcasecmp(src->fb_encoding, "UTF-16LE")) - src->refill = xml_refill_utf16_le; - else - { - xml_error(ctx, "Unknown encoding '%s'", src->fb_encoding); - expected_encoding = NULL; - } - } - uint utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be; - if (utf16) - src->fb_encoding = (src->refill == xml_refill_utf16_be) ? "UTF-16BE" : "UTF-16LE"; - if (!expected_encoding) - expected_encoding = src->fb_encoding; - if (bom > 0 && xml_peek_char(ctx) == 0xfeff) - xml_skip_char(ctx); - else if (utf16) - xml_error(ctx, "Missing or corrupted BOM"); - TRACE(ctx, "Initial encoding=%s", src->fb_encoding ? : "?"); - - /* Look ahead for presence of XMLDecl or optional TextDecl */ - if (!(ctx->flags & XML_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf)) - xml_refill(ctx); - u32 *bptr = ctx->bptr; - uint have_decl = (12 <= ctx->bstop - ctx->bptr && (bptr[11] & XML_CHAR_WHITE) && - bptr[0] == '<' && bptr[2] == '?' && (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L'); - if (!have_decl) - { - if (doc) - xml_fatal(ctx, "Missing or corrupted XML header"); - else if (expected_encoding && strcasecmp(src->expected_encoding, "UTF-8") && !utf16) - xml_error(ctx, "Missing or corrupted entity header"); - goto exit; - } - ctx->bptr = bptr + 12; - xml_parse_white(ctx, 0); - - /* Parse version string (mandatory in XMLDecl, optional in TextDecl) */ - if (xml_peek_char(ctx) == 'v') - { - xml_parse_seq(ctx, "version"); - xml_parse_eq(ctx); - char *version = xml_parse_pubid_literal(ctx, ctx->pool); - TRACE(ctx, "version=%s", version); - uint v = 0; - if (!strcmp(version, "1.1")) - v = XML_VERSION_1_1; - else if (strcmp(version, "1.0")) - { - xml_error(ctx, "Unknown XML version string '%s'", version); - version = "1.0"; - } - if (doc) - { - ctx->version_str = version; - ctx->flags |= v; - } - else if (v > (ctx->flags & XML_VERSION_1_1)) - xml_error(ctx, "XML 1.1 external entity included from XML 1.0 document"); - if (!xml_parse_white(ctx, !doc)) - goto end; - } - else if (doc) - { - xml_error(ctx, "Expected XML version"); - ctx->version_str = "1.0"; - } - - /* Parse encoding string (optional in XMLDecl, mandatory in TextDecl) */ - if (xml_peek_char(ctx) == 'e') - { - xml_parse_seq(ctx, "encoding"); - xml_parse_eq(ctx); - src->decl_encoding = xml_parse_encoding_name(ctx); - TRACE(ctx, "encoding=%s", src->decl_encoding); - if (!xml_parse_white(ctx, 0)) - goto end; - } - else if (!doc) - xml_error(ctx, "Expected XML encoding"); - - /* Parse whether the document is standalone (optional in XMLDecl) */ - if (doc && xml_peek_char(ctx) == 's') - { - xml_parse_seq(ctx, "standalone"); - xml_parse_eq(ctx); - uint c = xml_parse_quote(ctx); - if (ctx->standalone = (xml_peek_char(ctx) == 'y')) - xml_parse_seq(ctx, "yes"); - else - xml_parse_seq(ctx, "no"); - xml_parse_char(ctx, c); - TRACE(ctx, "standalone=%d", ctx->standalone); - xml_parse_white(ctx, 0); - } -end: - xml_parse_seq(ctx, "?>"); - - /* Switch to the final encoding */ - if (src->decl_encoding) - { - int cs = find_charset_by_name(src->decl_encoding); - if (cs < 0 && !expected_encoding) - xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding); - else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8) - { - xml_init_charconv(ctx, cs); - src->fb_encoding = src->decl_encoding; - } - else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 || - !(!strcasecmp(src->decl_encoding, "UTF-16") || - (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) || - (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE"))))) - xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding); - } - if (!src->fb_encoding) - src->fb_encoding = "UTF-8"; - TRACE(ctx, "Final encoding=%s", src->fb_encoding); - -exit: - /* Update valid Unicode ranges */ - if (doc) - xml_init_cats(ctx); - src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line; - src->refill_cat2 = ctx->cat_new_line; -}