mj.ucw.cz Git - libucw.git/blobdiff - sherlock/xml/source.c
XML: Small bugfix in xml_merge_chars.
[libucw.git] / sherlock / xml / source.c
index aebe5ccea23101ef25ebb6dc2c06cf72d4af354b..3b06f510b41c7b9a896969a3963d9662dfac05ee 100644 (file)
@@ -12,7 +12,7 @@
 #include "sherlock/sherlock.h"
 #include "sherlock/xml/xml.h"
 #include "sherlock/xml/dtd.h"
 #include "sherlock/sherlock.h"
 #include "sherlock/xml/xml.h"
 #include "sherlock/xml/dtd.h"
-#include "sherlock/xml/common.h"
+#include "sherlock/xml/internals.h"
 #include "lib/unicode.h"
 #include "lib/ff-unicode.h"
 #include "charset/charconv.h"
 #include "lib/unicode.h"
 #include "lib/ff-unicode.h"
 #include "charset/charconv.h"
@@ -80,7 +80,7 @@ xml_push_source(struct xml_context *ctx)
   src->next = ctx->src;
   src->saved_depth = ctx->depth;
   ctx->src = src;
   src->next = ctx->src;
   src->saved_depth = ctx->depth;
   ctx->src = src;
-  ctx->flags &= ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_NEW_LINE | XML_SRC_SURROUND | XML_SRC_DOCUMENT);
+  ctx->flags &= ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_DOCUMENT);
   ctx->bstop = ctx->bptr = src->buf;
   ctx->depth = 0;
   return src;
   ctx->bstop = ctx->bptr = src->buf;
   ctx->depth = 0;
   return src;
@@ -182,9 +182,9 @@ void xml_parse_decl(struct xml_context *ctx);
   struct fastbuf *fb = src->fb;                                                                \
   if (ctx->bptr == ctx->bstop)                                                         \
     ctx->bptr = ctx->bstop = src->buf;                                                 \
   struct fastbuf *fb = src->fb;                                                                \
   if (ctx->bptr == ctx->bstop)                                                         \
     ctx->bptr = ctx->bstop = src->buf;                                                 \
-  uns f = ctx->flags, c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \
+  uns c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row;         \
   u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop,                    \
   u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop,                    \
-      *last_0xd = (f & XML_SRC_NEW_LINE) ? bstop : bend;                               \
+      *last_0xd = src->pending_0xd ? bstop : NULL;                                     \
   do                                                                                   \
     {                                                                                  \
       c = func(fb, ##params);                                                          \
   do                                                                                   \
     {                                                                                  \
       c = func(fb, ##params);                                                          \
@@ -201,7 +201,7 @@ void xml_parse_decl(struct xml_context *ctx);
            last_0xd = bstop + 2;                                                       \
          else if (c != 0x2028 && last_0xd == bstop)                                    \
            {                                                                           \
            last_0xd = bstop + 2;                                                       \
          else if (c != 0x2028 && last_0xd == bstop)                                    \
            {                                                                           \
-             last_0xd = bend;                                                          \
+             last_0xd = NULL;                                                          \
              continue;                                                                 \
            }                                                                           \
          xml_add_char(&bstop, 0xa), row++;                                             \
              continue;                                                                 \
            }                                                                           \
          xml_add_char(&bstop, 0xa), row++;                                             \
@@ -218,14 +218,12 @@ void xml_parse_decl(struct xml_context *ctx);
       else                                                                             \
         {                                                                              \
          /* EOF */                                                                     \
       else                                                                             \
         {                                                                              \
          /* EOF */                                                                     \
-         if (f & XML_SRC_SURROUND)                                                     \
-           xml_add_char(&bstop, 0x20);                                                 \
-          f |= XML_SRC_EOF;                                                            \
+          ctx->flags |= XML_SRC_EOF;                                                   \
           break;                                                                       \
        }                                                                               \
     }                                                                                  \
   while (bstop < bend);                                                                        \
           break;                                                                       \
        }                                                                               \
     }                                                                                  \
   while (bstop < bend);                                                                        \
-  ctx->flags = (last_0xd == bstop) ? f | XML_SRC_NEW_LINE : f & ~XML_SRC_NEW_LINE;     \
+  src->pending_0xd = (last_0xd == bstop);                                              \
   ctx->bstop = bstop;                                                                  \
   src->row = row;
 
   ctx->bstop = bstop;                                                                  \
   src->row = row;
 
@@ -331,7 +329,7 @@ xml_parse_decl(struct xml_context *ctx)
   src->refill_cat2 = ctx->cat_new_line;
 
   /* Initialize the supplied charset (if any) or try to guess it */
   src->refill_cat2 = ctx->cat_new_line;
 
   /* Initialize the supplied charset (if any) or try to guess it */
-  char *expected_encoding = src->expected_encoding ? : src->fb_encoding;
+  char *expected_encoding = src->expected_encoding;
   src->refill = xml_refill_utf8;
   int bom = bpeekc(src->fb);
   if (bom < 0)
   src->refill = xml_refill_utf8;
   int bom = bpeekc(src->fb);
   if (bom < 0)
@@ -358,8 +356,6 @@ xml_parse_decl(struct xml_context *ctx)
          src->refill = xml_refill_utf16_be;
          if (bom == 0xff)
            src->refill = xml_refill_utf16_le;
          src->refill = xml_refill_utf16_be;
          if (bom == 0xff)
            src->refill = xml_refill_utf16_le;
-         if (!src->expected_encoding)
-           expected_encoding = (bom == 0xff) ? "UTF-16LE" : "UTF-16BE";
        }
       else if (strcasecmp(src->fb_encoding, "UTF-16BE"))
        src->refill = xml_refill_utf16_be;
        }
       else if (strcasecmp(src->fb_encoding, "UTF-16BE"))
        src->refill = xml_refill_utf16_be;
@@ -372,10 +368,15 @@ xml_parse_decl(struct xml_context *ctx)
        }
     }
   uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be;
        }
     }
   uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be;
+  if (utf16)
+    src->fb_encoding = (src->refill == xml_refill_utf16_be) ? "UTF-16BE" : "UTF-16LE";
+  if (!expected_encoding)
+    expected_encoding = src->fb_encoding;
   if (bom > 0 && xml_peek_char(ctx) == 0xfeff)
     xml_skip_char(ctx);
   else if (utf16)
     xml_error(ctx, "Missing or corrupted BOM");
   if (bom > 0 && xml_peek_char(ctx) == 0xfeff)
     xml_skip_char(ctx);
   else if (utf16)
     xml_error(ctx, "Missing or corrupted BOM");
+  TRACE(ctx, "Initial encoding=%s", src->fb_encoding ? : "?");
 
   /* Look ahead for presence of XMLDecl or optional TextDecl */
   if (!(ctx->flags & XML_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf))
 
   /* Look ahead for presence of XMLDecl or optional TextDecl */
   if (!(ctx->flags & XML_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf))
@@ -462,13 +463,19 @@ end:
       if (cs < 0 && !expected_encoding)
        xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding);
       else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8)
       if (cs < 0 && !expected_encoding)
        xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding);
       else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8)
-       xml_init_charconv(ctx, cs);
+        {
+         xml_init_charconv(ctx, cs);
+         src->fb_encoding = src->decl_encoding;
+       }
       else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 ||
        !(!strcasecmp(src->decl_encoding, "UTF-16") ||
         (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) ||
         (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE")))))
        xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding);
     }
       else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 ||
        !(!strcasecmp(src->decl_encoding, "UTF-16") ||
         (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) ||
         (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE")))))
        xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding);
     }
+  if (!src->fb_encoding)
+    src->fb_encoding = "UTF-8";
+  TRACE(ctx, "Final encoding=%s", src->fb_encoding);
 
 exit:
   /* Update valid Unicode ranges */
 
 exit:
   /* Update valid Unicode ranges */