X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;ds=inline;f=sherlock%2Fxml%2Fsource.c;h=29226f0fa062c086667980cdd2cb7fb25ea00456;hb=40e9285259a3311feaffa3fa654f91b731684036;hp=aebe5ccea23101ef25ebb6dc2c06cf72d4af354b;hpb=ccf64507b45774b007ab6200036827f1597022d8;p=libucw.git diff --git a/sherlock/xml/source.c b/sherlock/xml/source.c index aebe5cce..29226f0f 100644 --- a/sherlock/xml/source.c +++ b/sherlock/xml/source.c @@ -12,9 +12,9 @@ #include "sherlock/sherlock.h" #include "sherlock/xml/xml.h" #include "sherlock/xml/dtd.h" -#include "sherlock/xml/common.h" -#include "lib/unicode.h" -#include "lib/ff-unicode.h" +#include "sherlock/xml/internals.h" +#include "ucw/unicode.h" +#include "ucw/ff-unicode.h" #include "charset/charconv.h" #include "charset/fb-charconv.h" @@ -80,7 +80,7 @@ xml_push_source(struct xml_context *ctx) src->next = ctx->src; src->saved_depth = ctx->depth; ctx->src = src; - ctx->flags &= ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_NEW_LINE | XML_SRC_SURROUND | XML_SRC_DOCUMENT); + ctx->flags &= ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_DOCUMENT); ctx->bstop = ctx->bptr = src->buf; ctx->depth = 0; return src; @@ -182,9 +182,9 @@ void xml_parse_decl(struct xml_context *ctx); struct fastbuf *fb = src->fb; \ if (ctx->bptr == ctx->bstop) \ ctx->bptr = ctx->bstop = src->buf; \ - uns f = ctx->flags, c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \ + uns c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \ u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \ - *last_0xd = (f & XML_SRC_NEW_LINE) ? bstop : bend; \ + *last_0xd = src->pending_0xd ? bstop : NULL; \ do \ { \ c = func(fb, ##params); \ @@ -201,7 +201,7 @@ void xml_parse_decl(struct xml_context *ctx); last_0xd = bstop + 2; \ else if (c != 0x2028 && last_0xd == bstop) \ { \ - last_0xd = bend; \ + last_0xd = NULL; \ continue; \ } \ xml_add_char(&bstop, 0xa), row++; \ @@ -218,14 +218,12 @@ void xml_parse_decl(struct xml_context *ctx); else \ { \ /* EOF */ \ - if (f & XML_SRC_SURROUND) \ - xml_add_char(&bstop, 0x20); \ - f |= XML_SRC_EOF; \ + ctx->flags |= XML_SRC_EOF; \ break; \ } \ } \ while (bstop < bend); \ - ctx->flags = (last_0xd == bstop) ? f | XML_SRC_NEW_LINE : f & ~XML_SRC_NEW_LINE; \ + src->pending_0xd = (last_0xd == bstop); \ ctx->bstop = bstop; \ src->row = row; @@ -331,7 +329,7 @@ xml_parse_decl(struct xml_context *ctx) src->refill_cat2 = ctx->cat_new_line; /* Initialize the supplied charset (if any) or try to guess it */ - char *expected_encoding = src->expected_encoding ? : src->fb_encoding; + char *expected_encoding = src->expected_encoding; src->refill = xml_refill_utf8; int bom = bpeekc(src->fb); if (bom < 0) @@ -358,8 +356,6 @@ xml_parse_decl(struct xml_context *ctx) src->refill = xml_refill_utf16_be; if (bom == 0xff) src->refill = xml_refill_utf16_le; - if (!src->expected_encoding) - expected_encoding = (bom == 0xff) ? "UTF-16LE" : "UTF-16BE"; } else if (strcasecmp(src->fb_encoding, "UTF-16BE")) src->refill = xml_refill_utf16_be; @@ -372,10 +368,15 @@ xml_parse_decl(struct xml_context *ctx) } } uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be; + if (utf16) + src->fb_encoding = (src->refill == xml_refill_utf16_be) ? "UTF-16BE" : "UTF-16LE"; + if (!expected_encoding) + expected_encoding = src->fb_encoding; if (bom > 0 && xml_peek_char(ctx) == 0xfeff) xml_skip_char(ctx); else if (utf16) xml_error(ctx, "Missing or corrupted BOM"); + TRACE(ctx, "Initial encoding=%s", src->fb_encoding ? : "?"); /* Look ahead for presence of XMLDecl or optional TextDecl */ if (!(ctx->flags & XML_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf)) @@ -462,13 +463,19 @@ end: if (cs < 0 && !expected_encoding) xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding); else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8) - xml_init_charconv(ctx, cs); + { + xml_init_charconv(ctx, cs); + src->fb_encoding = src->decl_encoding; + } else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 || !(!strcasecmp(src->decl_encoding, "UTF-16") || (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) || (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE"))))) xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding); } + if (!src->fb_encoding) + src->fb_encoding = "UTF-8"; + TRACE(ctx, "Final encoding=%s", src->fb_encoding); exit: /* Update valid Unicode ranges */