From 2c7501836b5e6a120d2846e168d1c44e3a43435f Mon Sep 17 00:00:00 2001 From: Pavel Charvat Date: Thu, 13 Dec 2007 08:30:58 +0100 Subject: [PATCH] XML: The parser can completely skip the internal subset (nonvalidating, of course). --- sherlock/xml/common.h | 1 + sherlock/xml/dtd.c | 30 ++++++++++++++++++++++++++++++ sherlock/xml/parse.c | 37 ++++++++++++++++++++++++------------- sherlock/xml/xml-test.c | 11 ++++++++++- sherlock/xml/xml.h | 1 + 5 files changed, 66 insertions(+), 14 deletions(-) diff --git a/sherlock/xml/common.h b/sherlock/xml/common.h index cecd6119..dd540e86 100644 --- a/sherlock/xml/common.h +++ b/sherlock/xml/common.h @@ -329,6 +329,7 @@ void xml_parse_pe_ref(struct xml_context *ctx); char *xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr); +void xml_skip_internal_subset(struct xml_context *ctx); void xml_parse_notation_decl(struct xml_context *ctx); void xml_parse_entity_decl(struct xml_context *ctx); void xml_parse_element_decl(struct xml_context *ctx); diff --git a/sherlock/xml/dtd.c b/sherlock/xml/dtd.c index aa99f3c1..a4ae9a02 100644 --- a/sherlock/xml/dtd.c +++ b/sherlock/xml/dtd.c @@ -833,3 +833,33 @@ xml_parse_attr_list_decl(struct xml_context *ctx) xml_skip_char(ctx); xml_dec(ctx); } + +void +xml_skip_internal_subset(struct xml_context *ctx) +{ + TRACE(ctx, "skip_internal_subset"); + /* AlreadyParsed: '['. Reads characters up to and including the terminating ']' without interpreting any declaration; PIs and comments are handed to xml_skip_pi() / xml_skip_comment(), and quoted literals inside other '<!' declarations are stepped over so that a '>' within them does not end the declaration. */ + uns c; + while ((c = xml_get_char(ctx)) != ']') + { + if (c != '<') + continue; + if ((c = xml_get_char(ctx)) == '?') + { + xml_inc(ctx); + xml_skip_pi(ctx); + } + else if (c != '!') + xml_dec(ctx); + else if (xml_get_char(ctx) == '-') + { + xml_inc(ctx); + xml_skip_comment(ctx); + } + else + while ((c = xml_get_char(ctx)) != '>') + if (c == '\'' || c == '"') + while (xml_get_char(ctx) != c); + } + xml_dec(ctx); +} diff --git a/sherlock/xml/parse.c b/sherlock/xml/parse.c index 0926679f..25ab84cb 100644 --- a/sherlock/xml/parse.c +++ b/sherlock/xml/parse.c @@ -835,6 +835,7 @@ error: { xml_parse_white(ctx, 0); 
xml_parse_char(ctx, '<'); + xml_inc(ctx); if ((c = xml_get_char(ctx)) == '?') /* Processing intruction */ if (!(ctx->flags & XML_REPORT_PIS)) @@ -868,19 +869,25 @@ error: PULL(DOCTYPE_DECL); if (xml_peek_char(ctx) == '[') { - // FIXME: ability to skip the subset xml_skip_char(ctx); xml_inc(ctx); - xml_dtd_init(ctx); - if (ctx->h_dtd_start) - ctx->h_dtd_start(ctx); - xml_parse_internal_subset(ctx); - // FIXME: external subset - if (ctx->h_dtd_end) - ctx->h_dtd_end(ctx); - xml_parse_white(ctx, 0); + if (ctx->flags & XML_PARSE_DTD) + { + xml_dtd_init(ctx); + if (ctx->h_dtd_start) + ctx->h_dtd_start(ctx); + // FIXME: pull iface? + xml_parse_internal_subset(ctx); + // FIXME: external subset + if (ctx->h_dtd_end) + ctx->h_dtd_end(ctx); + } + else + xml_skip_internal_subset(ctx); } + xml_parse_white(ctx, 0); xml_parse_char(ctx, '>'); + xml_dec(ctx); } } @@ -896,9 +903,9 @@ error: } else xml_skip_char(ctx); -first_tag: ; - xml_inc(ctx); +first_tag: + if ((c = xml_get_char(ctx)) == '?') { /* PI */ @@ -1050,7 +1057,11 @@ epilog: uns xml_parse(struct xml_context *ctx) { - ctx->pull = 0; - xml_next(ctx); + /* This cycle should run only once unless the user overrides the value of ctx->pull in a SAX handler */ + do + { + ctx->pull = 0; + } + while (xml_next(ctx)); return ctx->err_code; } diff --git a/sherlock/xml/xml-test.c b/sherlock/xml/xml-test.c index a7ecda83..ab492e19 100644 --- a/sherlock/xml/xml-test.c +++ b/sherlock/xml/xml-test.c @@ -17,6 +17,7 @@ enum { WANT_FIRST = 0x100, + WANT_PARSE_DTD, WANT_HIDE_ERRORS, WANT_UNFOLD_CDATA, WANT_IGNORE_COMMENTS, @@ -29,6 +30,7 @@ static struct option longopts[] = { { "sax", 0, 0, 's' }, { "pull", 0, 0, 'p' }, { "dom", 0, 0, 'd' }, + { "dtd", 0, 0, WANT_PARSE_DTD }, { "hide-errors", 0, 0, WANT_HIDE_ERRORS }, { "unfold-cdata", 0, 0, WANT_UNFOLD_CDATA }, { "ignore-comments", 0, 0, WANT_IGNORE_COMMENTS }, @@ -45,9 +47,10 @@ Usage: xml-test [options] < input.xml\n\ Options:\n" CF_USAGE "\ --s, --pull Test PULL interface\n\ +-p, --pull 
Test PULL interface\n\ -s, --sax Test SAX interface\n\ -d, --dom Test DOM interface\n\ + --dtd Enable parsing of DTD\n\ --hide-errors Hide warnings and error messages\n\ --unfold-cdata Unfold CDATA sections\n\ --ignore-comments Ignore processing instructions\n\ @@ -59,6 +62,7 @@ CF_USAGE static uns want_sax; static uns want_pull; static uns want_dom; +static uns want_parse_dtd; static uns want_hide_errors; static uns want_unfold_cdata; static uns want_ignore_comments; @@ -222,6 +226,9 @@ main(int argc, char **argv) case 'd': want_dom++; break; + case WANT_PARSE_DTD: + want_parse_dtd++; + break; case WANT_HIDE_ERRORS: want_hide_errors++; break; @@ -262,6 +269,8 @@ main(int argc, char **argv) } if (want_dom) ctx.flags |= XML_ALLOC_ALL; + if (want_parse_dtd) + ctx.flags |= XML_PARSE_DTD; if (want_unfold_cdata) ctx.flags |= XML_UNFOLD_CDATA; if (want_ignore_comments) diff --git a/sherlock/xml/xml.h b/sherlock/xml/xml.h index c63d229b..a608bfb8 100644 --- a/sherlock/xml/xml.h +++ b/sherlock/xml/xml.h @@ -80,6 +80,7 @@ enum xml_flags { /* Other parameters */ XML_UNFOLD_CDATA = 0x00000100, /* Unfold CDATA sections */ XML_VALIDATING = 0x00000200, /* Validate everything (not fully implemented!) */ + XML_PARSE_DTD = 0x00000400, /* Enable parsing of DTD */ /* Internals, do not change! */ XML_EMPTY_ELEM_TAG = 0x00010000, /* The current element match EmptyElemTag */ -- 2.39.5