X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=sherlock%2Fxml%2Fparse.c;h=5581ed597cc70dce2297d770144aa33bd065e24d;hb=68721201144bd814bec0b361229de9009147c927;hp=ea8feab190fcd85900659d5e0d980f88ff6149c5;hpb=4041eceea24b578a0c5edaf0b82361283e6bbafb;p=libucw.git diff --git a/sherlock/xml/parse.c b/sherlock/xml/parse.c index ea8feab1..5581ed59 100644 --- a/sherlock/xml/parse.c +++ b/sherlock/xml/parse.c @@ -1,7 +1,7 @@ /* * Sherlock Library -- A simple XML parser * - * (c) 2007 Pavel Charvat + * (c) 2007--2008 Pavel Charvat * * This software may be freely distributed and used according to the terms * of the GNU Lesser General Public License. @@ -12,7 +12,7 @@ #include "sherlock/sherlock.h" #include "sherlock/xml/xml.h" #include "sherlock/xml/dtd.h" -#include "sherlock/xml/common.h" +#include "sherlock/xml/internals.h" #include "lib/fastbuf.h" #include "lib/ff-unicode.h" #include "lib/unicode.h" @@ -84,27 +84,20 @@ char * xml_parse_name(struct xml_context *ctx, struct mempool *pool) { /* Name ::= NameStartChar (NameChar)* */ - return xml_parse_string(ctx, pool, - !(ctx->flags & XML_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1, - !(ctx->flags & XML_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1, - "Expected a name"); + return xml_parse_string(ctx, pool, ctx->cat_sname, ctx->cat_name, "Expected a name"); } void xml_skip_name(struct xml_context *ctx) { - xml_skip_string(ctx, - !(ctx->flags & XML_VERSION_1_1) ? XML_CHAR_SNAME_1_0 : XML_CHAR_SNAME_1_1, - !(ctx->flags & XML_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1, - "Expected a name"); + xml_skip_string(ctx, ctx->cat_sname, ctx->cat_name, "Expected a name"); } char * xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool) { /* Nmtoken ::= (NameChar)+ */ - uns cat = !(ctx->flags & XML_VERSION_1_1) ? XML_CHAR_NAME_1_0 : XML_CHAR_NAME_1_1; - return xml_parse_string(ctx, pool, cat, cat, "Expected a nmtoken"); + return xml_parse_string(ctx, pool, ctx->cat_name, ctx->cat_name, "Expected a nmtoken"); } /*** Simple literals ***/ @@ -150,7 +143,7 @@ xml_push_comment(struct xml_context *ctx) /* Comment ::= '' * Already parsed: 'type = XML_NODE_COMMENT; char *p = mp_start_noalign(ctx->pool, 6); while (1) @@ -199,7 +192,7 @@ xml_push_pi(struct xml_context *ctx) * PI ::= '' Char*)))? '?>' * PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) * Already parsed: 'type = XML_NODE_PI; n->name = xml_parse_name(ctx, ctx->pool); if (unlikely(!strcasecmp(n->name, "xml"))) @@ -263,118 +256,6 @@ xml_skip_pi(struct xml_context *ctx) xml_dec(ctx); } -/*** Character data ***/ - -static inline uns -xml_flush_chars(struct xml_context *ctx) -{ - struct fastbuf *fb = &ctx->chars; - if (fb->bufend == fb->buffer) - return 0; - TRACE(ctx, "flush_chars"); - struct xml_node *n = ctx->node; - n->text = xml_end_chars(ctx, &n->len); - n->len = fb->bufend - fb->buffer; - if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_chars) - ctx->h_chars(ctx); - return 1; -} - -static inline void -xml_pop_chars(struct xml_context *ctx) -{ - xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS)); - TRACE(ctx, "pop_chars"); -} - -static inline void -xml_append_chars(struct xml_context *ctx) -{ - TRACE(ctx, "append_chars"); - struct fastbuf *out = &ctx->chars; - while (xml_get_char(ctx) != '<') - if (xml_last_char(ctx) == '&') - { - xml_inc(ctx); - xml_parse_ref(ctx); - } - else - bput_utf8_32(out, xml_last_char(ctx)); - xml_unget_char(ctx); -} - -/*** CDATA sections ***/ - -static void -xml_push_cdata(struct xml_context *ctx) -{ - TRACE(ctx, "push_cdata"); - /* CDSect :== '' Char*)) ']]>' - * Already parsed: 'type = XML_NODE_CHARS; - char *p = mp_start_noalign(ctx->pool, 7); - while (1) - { - if (xml_get_char(ctx) == ']') - { - if (xml_get_char(ctx) == ']') - if (xml_get_char(ctx) == '>') - break; - else - *p++ = ']'; - *p++ = ']'; - } - p = utf8_32_put(p, xml_last_char(ctx)); - p = mp_spread(ctx->pool, p, 7); - } - *p = 0; - n->len = p - (char *)mp_ptr(ctx->pool); - n->text = mp_end(ctx->pool, p + 1); - if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_cdata) - ctx->h_cdata(ctx); -} - -static void -xml_pop_cdata(struct xml_context *ctx) -{ - xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS)); - xml_dec(ctx); - TRACE(ctx, "pop_cdata"); -} - -static void -xml_append_cdata(struct xml_context *ctx) -{ - TRACE(ctx, "append_cdata"); - xml_parse_seq(ctx, "CDATA["); - struct fastbuf *out = &ctx->chars; - while (1) - { - if (xml_get_char(ctx) == ']') - { - if (xml_get_char(ctx) == ']') - if (xml_get_char(ctx) == '>') - break; - else - bputc(out, ']'); - bputc(out, ']'); - } - bput_utf8_32(out, xml_last_char(ctx)); - } - xml_dec(ctx); -} - -static void UNUSED -xml_skip_cdata(struct xml_context *ctx) -{ - TRACE(ctx, "skip_cdata"); - xml_parse_seq(ctx, "CDATA["); - while (xml_get_char(ctx) != ']' || xml_get_char(ctx) != ']' || xml_get_char(ctx) != '>'); - xml_dec(ctx); -} - /*** Character references ***/ uns @@ -411,7 +292,7 @@ xml_parse_char_ref(struct xml_context *ctx) while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT)); } uns cat = xml_char_cat(v); - if (!(cat & XML_CHAR_UNRESTRICTED_1_1) && ((ctx->flags & XML_VERSION_1_1) || !(cat & XML_CHAR_VALID_1_0))) + if (!(cat & ctx->cat_unrestricted)) { xml_error(ctx, "Character reference out of range"); goto recover; @@ -431,7 +312,7 @@ recover: /*** References to general entities ***/ -void +static void xml_parse_ref(struct xml_context *ctx) { /* Reference ::= EntityRef | CharRef @@ -450,7 +331,7 @@ xml_parse_ref(struct xml_context *ctx) mp_save(ctx->stack, &state); char *name = xml_parse_name(ctx, ctx->stack); xml_parse_char(ctx, ';'); - struct xml_dtd_ent *ent = xml_dtd_find_ent(ctx, name); + struct xml_dtd_entity *ent = xml_dtd_find_entity(ctx, name); if (!ent) { xml_error(ctx, "Unknown entity &%s;", name); @@ -458,10 +339,10 @@ xml_parse_ref(struct xml_context *ctx) bputs(out, name); bputc(out, ';'); } - else if (ent->flags & XML_DTD_ENT_TRIVIAL) + else if (ent->flags & XML_DTD_ENTITY_TRIVIAL) { TRACE(ctx, "Trivial entity &%s;", name); - bwrite(out, ent->text, ent->len); + bputs(out, ent->text); } else { @@ -476,6 +357,175 @@ xml_parse_ref(struct xml_context *ctx) } } +/*** Character data ***/ + +void +xml_spout_chars(struct fastbuf *fb) +{ + if (fb->bptr < fb->bufend) + return; + struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb); + struct mempool *pool = ctx->pool; + if (fb->bufend != fb->buffer) + { + TRACE(ctx, "growing chars"); + uns len = fb->bufend - fb->buffer; + uns reported = fb->bstop - fb->buffer; + fb->buffer = mp_expand(pool); + fb->bufend = fb->buffer + mp_avail(pool); + fb->bptr = fb->buffer + len; + fb->bstop = fb->buffer + reported; + } + else + { + TRACE(ctx, "starting chars"); + mp_save(pool, &ctx->chars_state); + fb->bptr = fb->buffer = fb->bstop = mp_start_noalign(pool, 2); + fb->bufend = fb->buffer + mp_avail(pool) - 1; + } +} + +static inline uns +xml_end_chars(struct xml_context *ctx, char **out) +{ + struct fastbuf *fb = &ctx->chars; + uns len = fb->bptr - fb->buffer; + if (len) + { + TRACE(ctx, "ending chars"); + *fb->bptr = 0; + *out = mp_end(ctx->pool, fb->bptr + 1); + fb->bufend = fb->bstop = fb->bptr = fb->buffer; + } + return len; +} + +static inline uns +xml_report_chars(struct xml_context *ctx, char **out) +{ + struct fastbuf *fb = &ctx->chars; + uns len = fb->bptr - fb->buffer; + if (len) + { + *fb->bptr = 0; + *out = fb->bstop; + fb->bstop = fb->bptr; + } + return len; +} + +static inline uns +xml_flush_chars(struct xml_context *ctx) +{ + char *text, *rtext; + uns len = xml_end_chars(ctx, &text), rlen; + if (len) + { + if (ctx->flags & XML_NO_CHARS) + { + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_ignorable) + ctx->h_ignorable(ctx, text, len); + mp_restore(ctx->pool, &ctx->chars_state); + return 0; + } + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext))) + ctx->h_block(ctx, rtext, rlen); + if (!(ctx->flags & XML_ALLOC_CHARS) && !(ctx->flags & XML_REPORT_CHARS)) + { + mp_restore(ctx->pool, &ctx->chars_state); + return 0; + } + struct xml_node *n = xml_push_dom(ctx, &ctx->chars_state); + n->type = XML_NODE_CHARS; + n->text = text; + n->len = len; + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_chars) + ctx->h_chars(ctx); + } + return len; +} + +static inline void +xml_pop_chars(struct xml_context *ctx) +{ + xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS)); + TRACE(ctx, "pop_chars"); +} + +static inline void +xml_append_chars(struct xml_context *ctx) +{ + TRACE(ctx, "append_chars"); + struct fastbuf *out = &ctx->chars; + if (ctx->flags & XML_NO_CHARS) + while (xml_get_char(ctx) != '<') + if (xml_last_cat(ctx) & XML_CHAR_WHITE) + bput_utf8_32(out, xml_last_char(ctx)); + else + { + xml_error(ctx, "This element must not contain character data"); + while (xml_get_char(ctx) != '<'); + break; + } + else + while (xml_get_char(ctx) != '<') + if (xml_last_char(ctx) == '&') + { + xml_inc(ctx); + xml_parse_ref(ctx); + } + else + bput_utf8_32(out, xml_last_char(ctx)); + xml_unget_char(ctx); +} + +/*** CDATA sections ***/ + +static void +xml_skip_cdata(struct xml_context *ctx) +{ + TRACE(ctx, "skip_cdata"); + xml_parse_seq(ctx, "CDATA["); + while (xml_get_char(ctx) != ']' || xml_get_char(ctx) != ']' || xml_get_char(ctx) != '>'); + xml_dec(ctx); +} + +static void +xml_append_cdata(struct xml_context *ctx) +{ + /* CDSect :== '' Char*)) ']]>' + * Already parsed: 'flags & XML_NO_CHARS) + { + xml_error(ctx, "This element must not contain CDATA"); + xml_skip_cdata(ctx); + return; + } + xml_parse_seq(ctx, "CDATA["); + struct fastbuf *out = &ctx->chars; + uns rlen; + char *rtext; + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext))) + ctx->h_block(ctx, rtext, rlen); + while (1) + { + if (xml_get_char(ctx) == ']') + { + if (xml_get_char(ctx) == ']') + if (xml_get_char(ctx) == '>') + break; + else + bputc(out, ']'); + bputc(out, ']'); + } + bput_utf8_32(out, xml_last_char(ctx)); + } + if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_cdata && (rlen = xml_report_chars(ctx, &rtext))) + ctx->h_cdata(ctx, rtext, rlen); + xml_dec(ctx); +} + /*** Attribute values ***/ char * @@ -483,15 +533,12 @@ xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr UNUSED) { TRACE(ctx, "parse_attr_value"); /* AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'" */ - /* FIXME: - * -- copying from ctx->chars to ctx->pool is not necessary, we could directly write to ctx->pool - * -- berare quotes inside parased entities - * -- check value constrains / normalize value */ + /* FIXME: -- check value constrains / normalize leading/trailing WS and repeated WS */ struct mempool_state state; uns quote = xml_parse_quote(ctx); mp_save(ctx->stack, &state); - xml_start_chars(ctx); struct fastbuf *out = &ctx->chars; + struct xml_source *src = ctx->src; while (1) { uns c = xml_get_char(ctx); @@ -500,7 +547,7 @@ xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr UNUSED) xml_inc(ctx); xml_parse_ref(ctx); } - else if (c == quote) // FIXME: beware quotes inside parsed entities + else if (c == quote && src == ctx->src) break; else if (c == '<') xml_error(ctx, "Attribute value must not contain '<'"); @@ -510,8 +557,29 @@ xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr UNUSED) bput_utf8_32(out, c); } mp_restore(ctx->stack, &state); - uns len; - return xml_end_chars(ctx, &len); + char *text; + return xml_end_chars(ctx, &text) ? text : ""; +} + +uns +xml_normalize_white(struct xml_context *ctx UNUSED, char *text) +{ + char *s = text, *d = text; + while (*s == 0x20) + s++; + while (1) + { + while (*s & ~0x20) + *d++ = *s++; + if (!*s) + break; + while (*++s == 0x20); + *d++ = 0x20; + } + if (d != text && d[-1] == 0x20) + d--; + *d = 0; + return d - text; } /*** Attributes ***/ @@ -536,6 +604,7 @@ xml_attrs_init_key(struct xml_attrs_table *t UNUSED, struct xml_attr *a, struct a->elem = e; a->name = name; a->val = NULL; + a->user = NULL; slist_add_tail(&e->attrs, &a->n); } @@ -560,18 +629,23 @@ xml_parse_attr(struct xml_context *ctx) { TRACE(ctx, "parse_attr"); /* Attribute ::= Name Eq AttValue */ - /* FIXME: - * -- memory management - * -- DTD */ struct xml_node *e = ctx->node; char *n = xml_parse_name(ctx, ctx->pool); struct xml_attr *a = xml_attrs_lookup(ctx->tab_attrs, e, n); xml_parse_eq(ctx); char *v = xml_parse_attr_value(ctx, NULL); if (a->val) - xml_error(ctx, "Attribute %s is not unique", n); + { + xml_error(ctx, "Attribute %s is not unique in element <%s>", n, e->name); + return; + } + a->val = v; + if (!e->dtd) + a->dtd = NULL; + else if (!(a->dtd = xml_dtd_find_attr(ctx, e->dtd, a->name))) + xml_error(ctx, "Undefined attribute %s in element <%s>", n, e->name); else - a->val = v; + xml_validate_attr(ctx, a->dtd, a->val); } struct xml_attr * @@ -580,6 +654,18 @@ xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name) return xml_attrs_find(ctx->tab_attrs, node, name); } +char * +xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name) +{ + struct xml_attr *attr = xml_attrs_find(ctx->tab_attrs, node, name); + if (attr) + return attr->val; + if (!node->dtd) + return NULL; + struct xml_dtd_attr *dtd = xml_dtd_find_attr(ctx, node->dtd, name); + return dtd ? dtd->default_value : NULL; +} + void xml_attrs_table_init(struct xml_context *ctx) { @@ -594,6 +680,18 @@ xml_attrs_table_cleanup(struct xml_context *ctx) /*** Elements ***/ +static uns +xml_validate_element(struct xml_dtd_elem_node *root, struct xml_dtd_elem *elem) +{ + if (root->elem) + return elem == root->elem; + else + SLIST_FOR_EACH(struct xml_dtd_elem_node *, son, root->sons) + if (xml_validate_element(son, elem)) + return 1; + return 0; +} + static void xml_push_element(struct xml_context *ctx) { @@ -602,16 +700,37 @@ xml_push_element(struct xml_context *ctx) * EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' * STag ::= '<' Name (S Attribute)* S? '>' * Already parsed: '<' */ - struct xml_node *e = xml_push_dom(ctx); + struct xml_node *e = xml_push_dom(ctx, NULL); clist_init(&e->sons); e->type = XML_NODE_ELEM; e->name = xml_parse_name(ctx, ctx->pool); slist_init(&e->attrs); if (!e->parent) { - ctx->root = e; + ctx->dom = e; if (ctx->doctype && strcmp(e->name, ctx->doctype)) - xml_error(ctx, "The root element %s does not match the document type %s", e->name, ctx->doctype); + xml_error(ctx, "The root element <%s> does not match the document type <%s>", e->name, ctx->doctype); + } + if (!ctx->dtd) + e->dtd = NULL; + else if (!(e->dtd = xml_dtd_find_elem(ctx, e->name))) + xml_error(ctx, "Undefined element <%s>", e->name); + else + { + struct xml_dtd_elem *dtd = e->dtd, *parent_dtd = e->parent ? e->parent->dtd : NULL; + if (dtd->type == XML_DTD_ELEM_MIXED) + ctx->flags &= ~XML_NO_CHARS; + else + ctx->flags |= XML_NO_CHARS; + if (parent_dtd) + if (parent_dtd->type == XML_DTD_ELEM_EMPTY) + xml_error(ctx, "Empty element must not contain children"); + else if (parent_dtd->type != XML_DTD_ELEM_ANY) + { + // FIXME: validate regular expressions + if (!xml_validate_element(parent_dtd->node, dtd)) + xml_error(ctx, "Unexpected element <%s>", e->name); + } } while (1) { @@ -630,6 +749,19 @@ xml_push_element(struct xml_context *ctx) xml_unget_char(ctx); xml_parse_attr(ctx); } + if (e->dtd) + SLIST_FOR_EACH(struct xml_dtd_attr *, a, e->dtd->attrs) + if (a->default_mode == XML_ATTR_REQUIRED) + { + if (!xml_attrs_find(ctx->tab_attrs, e, a->name)) + xml_error(ctx, "Missing required attribute %s in element <%s>", a->name, e->name); + } + else if (a->default_mode != XML_ATTR_IMPLIED && ctx->flags & XML_ALLOC_DEFAULT_ATTRS) + { + struct xml_attr *attr = xml_attrs_lookup(ctx->tab_attrs, e, a->name); + if (!attr->val) + attr->val = a->default_value; + } if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_stag) ctx->h_stag(ctx); } @@ -645,7 +777,7 @@ xml_pop_element(struct xml_context *ctx) if (free) { if (!e->parent) - ctx->root = NULL; + ctx->dom = NULL; /* Restore hash table of attributes */ SLIST_FOR_EACH(struct xml_attr *, a, e->attrs) xml_attrs_remove(ctx->tab_attrs, a); @@ -726,7 +858,11 @@ xml_parse_doctype_decl(struct xml_context *ctx) ctx->flags |= XML_HAS_EXTERNAL_SUBSET; } if (xml_peek_char(ctx) == '[') - ctx->flags |= XML_HAS_INTERNAL_SUBSET; + { + ctx->flags |= XML_HAS_INTERNAL_SUBSET; + xml_skip_char(ctx); + xml_inc(ctx); + } if (ctx->h_doctype_decl) ctx->h_doctype_decl(ctx); } @@ -738,12 +874,19 @@ xml_parse_doctype_decl(struct xml_context *ctx) /* DTD: Internal subset */ static void -xml_parse_internal_subset(struct xml_context *ctx) +xml_parse_subset(struct xml_context *ctx, uns external) { - // FIXME: comments/pi have no parent + // FIXME: + // -- comments/pi have no parent + // -- conditional sections in external subset + // -- check corectness of parameter entities + /* '[' intSubset ']' * intSubset :== (markupdecl | DeclSep) - * Already parsed: ']' */ + * Already parsed: '[' + * + * extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)* + */ while (1) { xml_parse_white(ctx, 0); @@ -791,16 +934,21 @@ xml_parse_internal_subset(struct xml_context *ctx) goto invalid_markup; else if (c == '%') xml_parse_pe_ref(ctx); - else if (c == ']') - break; + else if (c == ']' && !external) + { + break; + } + else if (c == '>' && external) + { + break; + } else goto invalid_markup; } xml_dec(ctx); - xml_dec(ctx); return; -invalid_markup: - xml_fatal(ctx, "Invalid markup in the internal subset"); +invalid_markup: ; + xml_fatal(ctx, "Invalid markup in the %s subset", external ? "external" : "internal"); } /*** The State Machine ***/ @@ -829,6 +977,7 @@ error: { case XML_STATE_START: TRACE(ctx, "entering prolog"); + ctx->flags |= XML_SRC_DOCUMENT | XML_SRC_EXPECTED_DECL; if (ctx->h_document_start) ctx->h_document_start(ctx); /* XMLDecl */ @@ -842,6 +991,7 @@ error: { xml_parse_white(ctx, 0); xml_parse_char(ctx, '<'); + xml_inc(ctx); if ((c = xml_get_char(ctx)) == '?') /* Processing intruction */ if (!(ctx->flags & XML_REPORT_PIS)) @@ -873,21 +1023,40 @@ error: xml_unget_char(ctx); xml_parse_doctype_decl(ctx); PULL(DOCTYPE_DECL); - if (xml_peek_char(ctx) == '[') - { - // FIXME: ability to skip the subset - xml_skip_char(ctx); - xml_inc(ctx); - xml_dtd_init(ctx); - if (ctx->h_dtd_start) - ctx->h_dtd_start(ctx); - xml_parse_internal_subset(ctx); - // FIXME: external subset - if (ctx->h_dtd_end) - ctx->h_dtd_end(ctx); - xml_parse_white(ctx, 0); - } + if (ctx->flags & XML_HAS_DTD) + if (ctx->flags & XML_PARSE_DTD) + { + xml_dtd_init(ctx); + if (ctx->h_dtd_start) + ctx->h_dtd_start(ctx); + if (ctx->flags & XML_HAS_INTERNAL_SUBSET) + { + xml_parse_subset(ctx, 0); + xml_dec(ctx); + } + if (ctx->flags & XML_HAS_EXTERNAL_SUBSET) + { + struct xml_dtd_entity ent = { + .system_id = ctx->system_id, + .public_id = ctx->public_id, + }; + xml_parse_white(ctx, 0); + xml_parse_char(ctx, '>'); + xml_unget_char(ctx); + ASSERT(ctx->h_resolve_entity); + ctx->h_resolve_entity(ctx, &ent); + ctx->flags |= XML_SRC_EXPECTED_DECL; + xml_parse_subset(ctx, 1); + xml_unget_char(ctx);; + } + if (ctx->h_dtd_end) + ctx->h_dtd_end(ctx); + } + else if (ctx->flags & XML_HAS_INTERNAL_SUBSET) + xml_skip_internal_subset(ctx); + xml_parse_white(ctx, 0); xml_parse_char(ctx, '>'); + xml_dec(ctx); } } @@ -903,9 +1072,9 @@ error: } else xml_skip_char(ctx); -first_tag: ; - xml_inc(ctx); +first_tag: + if ((c = xml_get_char(ctx)) == '?') { /* PI */ @@ -945,19 +1114,7 @@ first_tag: ; else if (c == '[') { /* CDATA */ - if (!(ctx->flags & XML_UNFOLD_CDATA)) - xml_append_cdata(ctx); - else - { - if (xml_flush_chars(ctx)) - { - PULL_STATE(CHARS, CHARS_BEFORE_CDATA); - xml_pop_chars(ctx); - } - xml_push_cdata(ctx); - PULL(CDATA); - xml_pop_cdata(ctx); - } + xml_append_cdata(ctx); } else xml_fatal(ctx, "Unexpected character after 'pull; + ctx->pull = pull; + uns res = xml_next(ctx); + ctx->pull = saved; + return res; +} + +uns +xml_skip_element(struct xml_context *ctx) +{ + ASSERT(ctx->state == XML_STATE_STAG); + struct xml_node *node = ctx->node; + uns saved = ctx->pull, res; + ctx->pull = XML_PULL_ETAG; + while ((res = xml_next(ctx)) && ctx->node != node); + ctx->pull = saved; + return res; +} + uns xml_parse(struct xml_context *ctx) { - ctx->pull = 0; - xml_next(ctx); + /* This cycle should run only once unless the user overrides the value of ctx->pull in a SAX handler */ + do + { + ctx->pull = 0; + } + while (xml_next(ctx)); return ctx->err_code; } + +char * +xml_merge_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool) +{ + ASSERT(node->type == XML_NODE_ELEM); + char *p = mp_start_noalign(pool, 1); + XML_NODE_FOR_EACH(son, node) + if (son->type == XML_NODE_CHARS) + { + p = mp_spread(pool, p, son->len + 1); + memcpy(p, son->text, son->len); + p += son->len; + } + *p++ = 0; + return mp_end(pool, p); +} + +static char * +xml_append_dom_chars(char *p, struct mempool *pool, struct xml_node *node) +{ + XML_NODE_FOR_EACH(son, node) + if (son->type == XML_NODE_CHARS) + { + p = mp_spread(pool, p, son->len + 1); + memcpy(p, son->text, son->len); + p += son->len; + } + else if (son->type == XML_NODE_ELEM) + p = xml_append_dom_chars(p, pool, son); + return p; +} + +char * +xml_merge_dom_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool) +{ + ASSERT(node->type == XML_NODE_ELEM); + char *p = mp_start_noalign(pool, 1); + p = xml_append_dom_chars(p, pool, node); + *p++ = 0; + return mp_end(pool, p); +}