From: Pavel Charvat Date: Thu, 24 Apr 2008 07:07:20 +0000 (+0200) Subject: XML: Implemented detection and validation of ignorable whitespace X-Git-Tag: holmes-import~449^2~8 X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;ds=sidebyside;h=150d10693b285ce5ff296571100b23b884f30560;p=libucw.git XML: Implemented detection and validation of ignorable whitespace --- diff --git a/sherlock/xml/parse.c b/sherlock/xml/parse.c index 0d62112a..67ef1076 100644 --- a/sherlock/xml/parse.c +++ b/sherlock/xml/parse.c @@ -421,9 +421,16 @@ xml_flush_chars(struct xml_context *ctx) uns len = xml_end_chars(ctx, &text), rlen; if (len) { + if (ctx->flags & XML_NO_CHARS) + { + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_ignorable) + ctx->h_ignorable(ctx, text, len); + mp_restore(ctx->pool, &ctx->chars_state); + return 0; + } if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext))) ctx->h_block(ctx, rtext, rlen); - if (!(ctx->flags & XML_ALLOC_CHARS) && !(ctx->flags & XML_REPORT_CHARS) && !ctx->h_chars) + if (!(ctx->flags & XML_ALLOC_CHARS) && !(ctx->flags & XML_REPORT_CHARS)) { mp_restore(ctx->pool, &ctx->chars_state); return 0; @@ -450,25 +457,51 @@ xml_append_chars(struct xml_context *ctx) { TRACE(ctx, "append_chars"); struct fastbuf *out = &ctx->chars; - while (xml_get_char(ctx) != '<') - if (xml_last_char(ctx) == '&') - { - xml_inc(ctx); - xml_parse_ref(ctx); - } - else - bput_utf8_32(out, xml_last_char(ctx)); + if (ctx->flags & XML_NO_CHARS) + while (xml_get_char(ctx) != '<') + if (xml_last_cat(ctx) & XML_CHAR_WHITE) + bput_utf8_32(out, xml_last_char(ctx)); + else + { + xml_error(ctx, "This element must not contain character data"); + while (xml_get_char(ctx) != '<'); + break; + } + else + while (xml_get_char(ctx) != '<') + if (xml_last_char(ctx) == '&') + { + xml_inc(ctx); + xml_parse_ref(ctx); + } + else + bput_utf8_32(out, xml_last_char(ctx)); xml_unget_char(ctx); } /*** CDATA sections ***/ +static void +xml_skip_cdata(struct 
xml_context *ctx) +{ + TRACE(ctx, "skip_cdata"); + xml_parse_seq(ctx, "CDATA["); + while (xml_get_char(ctx) != ']' || xml_get_char(ctx) != ']' || xml_get_char(ctx) != '>'); + xml_dec(ctx); +} + static void xml_append_cdata(struct xml_context *ctx) { /* CDSect :== '<![CDATA[' (Char* - (Char* ']]>' Char*)) ']]>' * Already parsed: '<![' */ TRACE(ctx, "append_cdata"); + if (ctx->flags & XML_NO_CHARS) + { + xml_error(ctx, "This element must not contain CDATA"); + xml_skip_cdata(ctx); + return; + } xml_parse_seq(ctx, "CDATA["); struct fastbuf *out = &ctx->chars; uns rlen; @@ -493,15 +526,6 @@ xml_append_cdata(struct xml_context *ctx) xml_dec(ctx); } -static void UNUSED -xml_skip_cdata(struct xml_context *ctx) -{ - TRACE(ctx, "skip_cdata"); - xml_parse_seq(ctx, "CDATA["); - while (xml_get_char(ctx) != ']' || xml_get_char(ctx) != ']' || xml_get_char(ctx) != '>'); - xml_dec(ctx); -} - /*** Attribute values ***/ char * @@ -669,6 +693,11 @@ xml_push_element(struct xml_context *ctx) xml_error(ctx, "Undefined element <%s>", e->name); else { + if (e->dtd->type == XML_DTD_ELEM_MIXED) + ctx->flags &= ~XML_NO_CHARS; + else + ctx->flags |= XML_NO_CHARS; + // FIXME: validate regular expressions } while (1) diff --git a/sherlock/xml/xml-test.c b/sherlock/xml/xml-test.c index 972a339b..0c685d8f 100644 --- a/sherlock/xml/xml-test.c +++ b/sherlock/xml/xml-test.c @@ -23,6 +23,7 @@ enum { WANT_IGNORE_COMMENTS, WANT_IGNORE_PIS, WANT_REPORT_BLOCKS, + WANT_REPORT_IGNORABLE, WANT_FILE_ENTITIES, }; @@ -36,7 +37,8 @@ static struct option longopts[] = { { "hide-errors", 0, 0, WANT_HIDE_ERRORS }, { "ignore-comments", 0, 0, WANT_IGNORE_COMMENTS }, { "ignore-pis", 0, 0, WANT_IGNORE_PIS }, - { "reports-blocks", 0, 0, WANT_REPORT_BLOCKS }, + { "report-blocks", 0, 0, WANT_REPORT_BLOCKS }, + { "report-ignorable", 0, 0, WANT_REPORT_IGNORABLE }, { "file-entities", 0, 0, WANT_FILE_ENTITIES }, { NULL, 0, 0, 0 } }; @@ -58,6 +60,7 @@ CF_USAGE --ignore-comments Ignore comments\n\ --ignore-pis Ignore processing instructions\n\ --report-blocks Report blocks or characters and CDATA
sections\n\ + --report-ignorable Report ignorable whitespace\n\ --file-entities Resolve file external entities (not fully normative)\n\ \n", stderr); exit(1); @@ -71,6 +74,7 @@ static uns want_hide_errors; static uns want_ignore_comments; static uns want_ignore_pis; static uns want_report_blocks; +static uns want_report_ignorable; static uns want_file_entities; static struct fastbuf *out; @@ -206,6 +210,12 @@ h_cdata(struct xml_context *ctx UNUSED, char *text, uns len UNUSED) bprintf(out, "SAX: cdata text='%s'\n", text); } +static void +h_ignorable(struct xml_context *ctx UNUSED, char *text, uns len UNUSED) +{ + bprintf(out, "SAX: ignorable text='%s'\n", text); +} + static void h_dtd_start(struct xml_context *ctx UNUSED) { @@ -257,6 +267,9 @@ main(int argc, char **argv) case WANT_REPORT_BLOCKS: want_report_blocks++; break; + case WANT_REPORT_IGNORABLE: + want_report_ignorable++; + break; case WANT_FILE_ENTITIES: want_file_entities++; break; @@ -287,6 +300,8 @@ main(int argc, char **argv) ctx.h_block = h_block; ctx.h_cdata = h_cdata; } + if (want_report_ignorable) + ctx.h_ignorable = h_ignorable; ctx.h_dtd_start = h_dtd_start; ctx.h_dtd_end = h_dtd_end; } diff --git a/sherlock/xml/xml.h b/sherlock/xml/xml.h index ac9ebefb..01b115da 100644 --- a/sherlock/xml/xml.h +++ b/sherlock/xml/xml.h @@ -80,6 +80,7 @@ enum xml_flags { /* Other parameters */ XML_VALIDATING = 0x00000100, /* Validate everything (not fully implemented!) */ XML_PARSE_DTD = 0x00000200, /* Enable parsing of DTD */ + XML_NO_CHARS = 0x00000400, /* The current element must not contain character data (filled automaticaly if using DTD) */ /* Internals, do not change! 
*/ XML_EMPTY_ELEM_TAG = 0x00010000, /* The current element match EmptyElemTag */ @@ -193,6 +194,7 @@ struct xml_context { void (*h_chars)(struct xml_context *ctx); /* Called after some characters (only with XML_REPORT_CHARS) */ void (*h_block)(struct xml_context *ctx, char *text, uns len); /* Called for each continuous block of characters not reported by h_cdata() (only with XML_REPORT_CHARS) */ void (*h_cdata)(struct xml_context *ctx, char *text, uns len); /* Called for each CDATA section (only with XML_REPORT_CHARS) */ + void (*h_ignorable)(struct xml_context *ctx, char *text, uns len); /* Called for ignorable whitespace (content in tags without #PCDATA) */ void (*h_dtd_start)(struct xml_context *ctx); /* Called just after the DTD structure is initialized */ void (*h_dtd_end)(struct xml_context *ctx); /* Called after DTD subsets subsets */ struct xml_dtd_entity *(*h_find_entity)(struct xml_context *ctx, char *name); /* Called when needed to resolve a general entity */