X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;ds=inline;f=sherlock%2Fxml%2Fsource.c;h=29226f0fa062c086667980cdd2cb7fb25ea00456;hb=003ca21e49148941c07f69c87e2553f660913b65;hp=e77cca3e0842c3069a9180e34fa96120f71a9765;hpb=6b475e1471242a1db6ee254f98501cc11c8bf1c4;p=libucw.git diff --git a/sherlock/xml/source.c b/sherlock/xml/source.c index e77cca3e..29226f0f 100644 --- a/sherlock/xml/source.c +++ b/sherlock/xml/source.c @@ -1,7 +1,7 @@ /* * Sherlock Library -- A simple XML parser * - * (c) 2007 Pavel Charvat + * (c) 2007--2008 Pavel Charvat * * This software may be freely distributed and used according to the terms * of the GNU Lesser General Public License. @@ -12,9 +12,9 @@ #include "sherlock/sherlock.h" #include "sherlock/xml/xml.h" #include "sherlock/xml/dtd.h" -#include "sherlock/xml/common.h" -#include "lib/unicode.h" -#include "lib/ff-unicode.h" +#include "sherlock/xml/internals.h" +#include "ucw/unicode.h" +#include "ucw/ff-unicode.h" #include "charset/charconv.h" #include "charset/fb-charconv.h" @@ -67,7 +67,7 @@ xml_add_char(u32 **bstop, uns c) } struct xml_source * -xml_push_source(struct xml_context *ctx, uns flags) +xml_push_source(struct xml_context *ctx) { xml_push(ctx); struct xml_source *src = ctx->src; @@ -80,11 +80,17 @@ xml_push_source(struct xml_context *ctx, uns flags) src->next = ctx->src; src->saved_depth = ctx->depth; ctx->src = src; - ctx->flags = (ctx->flags & ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_NEW_LINE | XML_SRC_SURROUND | XML_SRC_DOCUMENT)) | flags; + ctx->flags &= ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_DOCUMENT); ctx->bstop = ctx->bptr = src->buf; ctx->depth = 0; - if (flags & XML_SRC_SURROUND) - xml_add_char(&ctx->bstop, 0x20); + return src; +} + +struct xml_source * +xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb) +{ + struct xml_source *src = xml_push_source(ctx); + src->fb = fb; return src; } @@ -101,11 +107,10 @@ xml_pop_source(struct xml_context *ctx) { TRACE(ctx, "pop_source"); if (unlikely(ctx->depth != 0)) - { - xml_fatal(ctx, "Unexpected end of entity"); - } + xml_fatal(ctx, "Unexpected end of entity"); struct xml_source *src = ctx->src; - ASSERT(src); + if (!src) + xml_fatal(ctx, "Undefined source"); xml_close_source(src); ctx->depth = src->saved_depth; ctx->src = src = src->next; @@ -133,31 +138,31 @@ xml_sources_cleanup(struct xml_context *ctx) static void xml_refill_utf8(struct xml_context *ctx); void -xml_push_entity(struct xml_context *ctx, struct xml_dtd_ent *ent) +xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent UNUSED) { - TRACE(ctx, "xml_push_entity"); - uns cat1 = ctx->src->refill_cat1; - uns cat2 = ctx->src->refill_cat2; - struct xml_source *src = xml_push_source(ctx, 0); - src->refill_cat1 = cat1; - src->refill_cat2 = cat2; - if (ent->flags & XML_DTD_ENT_EXTERNAL) - xml_fatal(ctx, "External entities not implemented"); // FIXME - else - { - fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, ent->len, 0); - src->refill = xml_refill_utf8; - } + xml_error(ctx, "References to external entities are not supported"); } void -xml_set_source(struct xml_context *ctx, struct fastbuf *fb) +xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent) { - TRACE(ctx, "xml_set_source"); - ASSERT(!ctx->src); - struct xml_source *src = xml_push_source(ctx, XML_SRC_DOCUMENT | XML_SRC_EXPECTED_DECL); - src->fb = fb; - ctx->state = XML_STATE_START; + TRACE(ctx, "xml_push_entity"); + struct xml_source *src; + if (ent->flags & XML_DTD_ENTITY_EXTERNAL) + { + ASSERT(ctx->h_resolve_entity); + ctx->h_resolve_entity(ctx, ent); + ctx->flags |= XML_SRC_EXPECTED_DECL; + src = ctx->src; + } + else + { + src = xml_push_source(ctx); + fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, strlen(ent->text), 0); + } + src->refill = xml_refill_utf8; + src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line; + src->refill_cat2 = ctx->cat_new_line; } static uns @@ -177,9 +182,9 @@ void xml_parse_decl(struct xml_context *ctx); struct fastbuf *fb = src->fb; \ if (ctx->bptr == ctx->bstop) \ ctx->bptr = ctx->bstop = src->buf; \ - uns f = ctx->flags, c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \ + uns c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \ u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \ - *last_0xd = (f & XML_SRC_NEW_LINE) ? bstop : bend; \ + *last_0xd = src->pending_0xd ? bstop : NULL; \ do \ { \ c = func(fb, ##params); \ @@ -196,7 +201,7 @@ void xml_parse_decl(struct xml_context *ctx); last_0xd = bstop + 2; \ else if (c != 0x2028 && last_0xd == bstop) \ { \ - last_0xd = bend; \ + last_0xd = NULL; \ continue; \ } \ xml_add_char(&bstop, 0xa), row++; \ @@ -213,14 +218,12 @@ void xml_parse_decl(struct xml_context *ctx); else \ { \ /* EOF */ \ - if (f & XML_SRC_SURROUND) \ - xml_add_char(&bstop, 0x20); \ - f |= XML_SRC_EOF; \ + ctx->flags |= XML_SRC_EOF; \ break; \ } \ } \ while (bstop < bend); \ - ctx->flags = (last_0xd == bstop) ? f | XML_SRC_NEW_LINE : f & ~XML_SRC_NEW_LINE; \ + src->pending_0xd = (last_0xd == bstop); \ ctx->bstop = bstop; \ src->row = row; @@ -242,23 +245,6 @@ xml_refill_utf16_be(struct xml_context *ctx) REFILL(ctx, bget_utf16_be_repl, ~1U); } -#if 0 -static inline uns -xml_refill_libcharset_bget(struct fastbuf *fb, unsigned short int *in_to_x) -{ - // FIXME: slow - int c; - return (unlikely(c = bgetc(fb) < 0)) ? c : (int)conv_x_to_ucs(in_to_x[c]); -} - -static void -xml_refill_libcharset(struct xml_context *ctx) -{ - unsigned short int *in_to_x = ctx->src->refill_in_to_x; - REFILL(ctx, xml_refill_libcharset_bget, in_to_x); -} -#endif - #undef REFILL void @@ -279,12 +265,9 @@ xml_refill(struct xml_context *ctx) while (ctx->bptr == ctx->bstop); } -uns -xml_row(struct xml_context *ctx) +static uns +xml_source_row(struct xml_context *ctx, struct xml_source *src) { - struct xml_source *src = ctx->src; - if (!src) - return 0; uns row = src->row; for (u32 *p = ctx->bstop; p != ctx->bptr; p -= 2) if (p[-1] & src->refill_cat2) @@ -292,6 +275,12 @@ xml_row(struct xml_context *ctx) return row + 1; } +uns +xml_row(struct xml_context *ctx) +{ + return ctx->src ? xml_source_row(ctx, ctx->src) : 0; +} + /* Document/external entity header */ static char * @@ -318,18 +307,11 @@ xml_parse_encoding_name(struct xml_context *ctx) static void xml_init_charconv(struct xml_context *ctx, int cs) { - // FIXME: hack + // XXX: with a direct access to libcharset tables could be faster struct xml_source *src = ctx->src; TRACE(ctx, "wrapping charset %s", charset_name(cs)); -#if 0 - struct conv_context conv; - conv_set_charset(&conv, cs, CONV_CHARSET_UTF8); - src->refill = xml_refill_libcharset; - src->refill_in_to_x = conv.in_to_x; -#else src->wrapped_fb = src->fb; src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8); -#endif } void @@ -347,7 +329,7 @@ xml_parse_decl(struct xml_context *ctx) src->refill_cat2 = ctx->cat_new_line; /* Initialize the supplied charset (if any) or try to guess it */ - char *expected_encoding = src->expected_encoding ? : src->fb_encoding; + char *expected_encoding = src->expected_encoding; src->refill = xml_refill_utf8; int bom = bpeekc(src->fb); if (bom < 0) @@ -374,8 +356,6 @@ xml_parse_decl(struct xml_context *ctx) src->refill = xml_refill_utf16_be; if (bom == 0xff) src->refill = xml_refill_utf16_le; - if (!src->expected_encoding) - expected_encoding = (bom == 0xff) ? "UTF-16LE" : "UTF-16BE"; } else if (strcasecmp(src->fb_encoding, "UTF-16BE")) src->refill = xml_refill_utf16_be; @@ -388,10 +368,15 @@ xml_parse_decl(struct xml_context *ctx) } } uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be; + if (utf16) + src->fb_encoding = (src->refill == xml_refill_utf16_be) ? "UTF-16BE" : "UTF-16LE"; + if (!expected_encoding) + expected_encoding = src->fb_encoding; if (bom > 0 && xml_peek_char(ctx) == 0xfeff) xml_skip_char(ctx); else if (utf16) xml_error(ctx, "Missing or corrupted BOM"); + TRACE(ctx, "Initial encoding=%s", src->fb_encoding ? : "?"); /* Look ahead for presence of XMLDecl or optional TextDecl */ if (!(ctx->flags & XML_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf)) @@ -478,13 +463,19 @@ end: if (cs < 0 && !expected_encoding) xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding); else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8) - xml_init_charconv(ctx, cs); + { + xml_init_charconv(ctx, cs); + src->fb_encoding = src->decl_encoding; + } else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 || !(!strcasecmp(src->decl_encoding, "UTF-16") || (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) || (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE"))))) xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding); } + if (!src->fb_encoding) + src->fb_encoding = "UTF-8"; + TRACE(ctx, "Final encoding=%s", src->fb_encoding); exit: /* Update valid Unicode ranges */