/*
* Sherlock Library -- A simple XML parser
*
- * (c) 2007 Pavel Charvat <pchar@ucw.cz>
+ * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
*
* This software may be freely distributed and used according to the terms
* of the GNU Lesser General Public License.
#include "sherlock/sherlock.h"
#include "sherlock/xml/xml.h"
#include "sherlock/xml/dtd.h"
-#include "sherlock/xml/common.h"
-#include "lib/unicode.h"
-#include "lib/ff-unicode.h"
+#include "sherlock/xml/internals.h"
+#include "ucw/unicode.h"
+#include "ucw/ff-unicode.h"
#include "charset/charconv.h"
#include "charset/fb-charconv.h"
}
struct xml_source *
-xml_push_source(struct xml_context *ctx, uns flags)
+xml_push_source(struct xml_context *ctx)
{
xml_push(ctx);
struct xml_source *src = ctx->src;
src->next = ctx->src;
src->saved_depth = ctx->depth;
ctx->src = src;
- ctx->flags = (ctx->flags & ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_NEW_LINE | XML_SRC_SURROUND | XML_SRC_DOCUMENT)) | flags;
+ ctx->flags &= ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_DOCUMENT);
ctx->bstop = ctx->bptr = src->buf;
ctx->depth = 0;
- if (flags & XML_SRC_SURROUND)
- xml_add_char(&ctx->bstop, 0x20);
+ return src;
+}
+
+struct xml_source *
+xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb)
+{
+ struct xml_source *src = xml_push_source(ctx);
+ src->fb = fb;
return src;
}
{
TRACE(ctx, "pop_source");
if (unlikely(ctx->depth != 0))
- {
- xml_fatal(ctx, "Unexpected end of entity");
- }
+ xml_fatal(ctx, "Unexpected end of entity");
struct xml_source *src = ctx->src;
- ASSERT(src);
+ if (!src)
+ xml_fatal(ctx, "Undefined source");
xml_close_source(src);
ctx->depth = src->saved_depth;
ctx->src = src = src->next;
static void xml_refill_utf8(struct xml_context *ctx);
void
-xml_push_entity(struct xml_context *ctx, struct xml_dtd_ent *ent)
+xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent UNUSED)
{
- TRACE(ctx, "xml_push_entity");
- uns cat1 = ctx->src->refill_cat1;
- uns cat2 = ctx->src->refill_cat2;
- struct xml_source *src = xml_push_source(ctx, 0);
- src->refill_cat1 = cat1;
- src->refill_cat2 = cat2;
- if (ent->flags & XML_DTD_ENT_EXTERNAL)
- xml_fatal(ctx, "External entities not implemented"); // FIXME
- else
- {
- fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, ent->len, 0);
- src->refill = xml_refill_utf8;
- }
+ xml_error(ctx, "References to external entities are not supported");
}
void
-xml_set_source(struct xml_context *ctx, struct fastbuf *fb)
+xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent)
{
- TRACE(ctx, "xml_set_source");
- ASSERT(!ctx->src);
- struct xml_source *src = xml_push_source(ctx, XML_SRC_DOCUMENT | XML_SRC_EXPECTED_DECL);
- src->fb = fb;
- ctx->state = XML_STATE_START;
+ TRACE(ctx, "xml_push_entity");
+ struct xml_source *src;
+ if (ent->flags & XML_DTD_ENTITY_EXTERNAL)
+ {
+ ASSERT(ctx->h_resolve_entity);
+ ctx->h_resolve_entity(ctx, ent);
+ ctx->flags |= XML_SRC_EXPECTED_DECL;
+ src = ctx->src;
+ }
+ else
+ {
+ src = xml_push_source(ctx);
+ fbbuf_init_read(src->fb = &src->wrap_fb, ent->text, strlen(ent->text), 0);
+ }
+ src->refill = xml_refill_utf8;
+ src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line;
+ src->refill_cat2 = ctx->cat_new_line;
}
static uns
struct fastbuf *fb = src->fb; \
if (ctx->bptr == ctx->bstop) \
ctx->bptr = ctx->bstop = src->buf; \
- uns f = ctx->flags, c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \
+ uns c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \
u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \
- *last_0xd = (f & XML_SRC_NEW_LINE) ? bstop : bend; \
+ *last_0xd = src->pending_0xd ? bstop : NULL; \
do \
{ \
c = func(fb, ##params); \
last_0xd = bstop + 2; \
else if (c != 0x2028 && last_0xd == bstop) \
{ \
- last_0xd = bend; \
+ last_0xd = NULL; \
continue; \
} \
xml_add_char(&bstop, 0xa), row++; \
else \
{ \
/* EOF */ \
- if (f & XML_SRC_SURROUND) \
- xml_add_char(&bstop, 0x20); \
- f |= XML_SRC_EOF; \
+ ctx->flags |= XML_SRC_EOF; \
break; \
} \
} \
while (bstop < bend); \
- ctx->flags = (last_0xd == bstop) ? f | XML_SRC_NEW_LINE : f & ~XML_SRC_NEW_LINE; \
+ src->pending_0xd = (last_0xd == bstop); \
ctx->bstop = bstop; \
src->row = row;
REFILL(ctx, bget_utf16_be_repl, ~1U);
}
-#if 0
-static inline uns
-xml_refill_libcharset_bget(struct fastbuf *fb, unsigned short int *in_to_x)
-{
- // FIXME: slow
- int c;
- return (unlikely(c = bgetc(fb) < 0)) ? c : (int)conv_x_to_ucs(in_to_x[c]);
-}
-
-static void
-xml_refill_libcharset(struct xml_context *ctx)
-{
- unsigned short int *in_to_x = ctx->src->refill_in_to_x;
- REFILL(ctx, xml_refill_libcharset_bget, in_to_x);
-}
-#endif
-
#undef REFILL
void
while (ctx->bptr == ctx->bstop);
}
-uns
-xml_row(struct xml_context *ctx)
+static uns
+xml_source_row(struct xml_context *ctx, struct xml_source *src)
{
- struct xml_source *src = ctx->src;
- if (!src)
- return 0;
uns row = src->row;
for (u32 *p = ctx->bstop; p != ctx->bptr; p -= 2)
if (p[-1] & src->refill_cat2)
return row + 1;
}
+uns
+xml_row(struct xml_context *ctx)
+{
+ return ctx->src ? xml_source_row(ctx, ctx->src) : 0;
+}
+
/* Document/external entity header */
static char *
static void
xml_init_charconv(struct xml_context *ctx, int cs)
{
- // FIXME: hack
+ // XXX: with a direct access to libcharset tables could be faster
struct xml_source *src = ctx->src;
TRACE(ctx, "wrapping charset %s", charset_name(cs));
-#if 0
- struct conv_context conv;
- conv_set_charset(&conv, cs, CONV_CHARSET_UTF8);
- src->refill = xml_refill_libcharset;
- src->refill_in_to_x = conv.in_to_x;
-#else
src->wrapped_fb = src->fb;
src->fb = fb_wrap_charconv_in(src->fb, cs, CONV_CHARSET_UTF8);
-#endif
}
void
src->refill_cat2 = ctx->cat_new_line;
/* Initialize the supplied charset (if any) or try to guess it */
- char *expected_encoding = src->expected_encoding ? : src->fb_encoding;
+ char *expected_encoding = src->expected_encoding;
src->refill = xml_refill_utf8;
int bom = bpeekc(src->fb);
if (bom < 0)
src->refill = xml_refill_utf16_be;
if (bom == 0xff)
src->refill = xml_refill_utf16_le;
- if (!src->expected_encoding)
- expected_encoding = (bom == 0xff) ? "UTF-16LE" : "UTF-16BE";
}
else if (strcasecmp(src->fb_encoding, "UTF-16BE"))
src->refill = xml_refill_utf16_be;
}
}
uns utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be;
+ if (utf16)
+ src->fb_encoding = (src->refill == xml_refill_utf16_be) ? "UTF-16BE" : "UTF-16LE";
+ if (!expected_encoding)
+ expected_encoding = src->fb_encoding;
if (bom > 0 && xml_peek_char(ctx) == 0xfeff)
xml_skip_char(ctx);
else if (utf16)
xml_error(ctx, "Missing or corrupted BOM");
+ TRACE(ctx, "Initial encoding=%s", src->fb_encoding ? : "?");
/* Look ahead for presence of XMLDecl or optional TextDecl */
if (!(ctx->flags & XML_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf))
if (cs < 0 && !expected_encoding)
xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding);
else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8)
- xml_init_charconv(ctx, cs);
+ {
+ xml_init_charconv(ctx, cs);
+ src->fb_encoding = src->decl_encoding;
+ }
else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 ||
!(!strcasecmp(src->decl_encoding, "UTF-16") ||
(!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) ||
(!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE")))))
xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding);
}
+ if (!src->fb_encoding)
+ src->fb_encoding = "UTF-8";
+ TRACE(ctx, "Final encoding=%s", src->fb_encoding);
exit:
/* Update valid Unicode ranges */