2 * Sherlock Library -- A simple XML parser
4 * (c) 2007 Pavel Charvat <pchar@ucw.cz>
6 * This software may be freely distributed and used according to the terms
7 * of the GNU Lesser General Public License.
12 #include "sherlock/sherlock.h"
13 #include "sherlock/xml/xml.h"
14 #include "sherlock/xml/dtd.h"
15 #include "sherlock/xml/common.h"
16 #include "lib/fastbuf.h"
17 #include "lib/ff-unicode.h"
18 #include "lib/unicode.h"
19 #include "lib/chartype.h"
20 #include "lib/hashfunc.h"
/* Parse a comment into a new DOM node.
 * The node comes from xml_push_dom() and is typed XML_NODE_COMMENT; its text
 * is written as UTF-8 (utf8_32_put) into a growing allocation in ctx->pool and
 * finalized with mp_end().
 * NOTE(review): the embedded line numbers skip (35 -> 38, 39 -> 43, ...), so
 * the loop header, the branch taken after "--" and any terminator write are
 * not visible in this view -- confirm against the complete file. */
27 xml_push_comment(struct xml_context *ctx)
29 TRACE(ctx, "push_comment");
30 /* Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
31 * Already parsed: '<!-' */
32 xml_parse_char(ctx, '-');
33 struct xml_node *n = xml_push_dom(ctx);
34 n->type = XML_NODE_COMMENT;
/* Start a growing, unaligned pool allocation. The same headroom (6) is
 * re-requested via mp_spread() after each stored character; presumably it is
 * the worst-case size of one encoded character -- TODO confirm. */
35 char *p = mp_start_noalign(ctx->pool, 6);
38 if (xml_get_char(ctx) == '-')
39 if (xml_get_char(ctx) == '-')
43 p = utf8_32_put(p, xml_last_char(ctx));
44 p = mp_spread(ctx->pool, p, 6);
46 xml_parse_char(ctx, '>');
/* Length is the distance from mp_ptr(ctx->pool) to the write pointer. */
48 n->len = p - (char *)mp_ptr(ctx->pool);
/* p + 1 keeps one extra byte past the text, presumably for a terminating
 * zero stored on a line omitted here -- TODO confirm. */
49 n->text = mp_end(ctx->pool, p + 1);
/* Leave a comment node previously entered by xml_push_comment().
 * NOTE(review): only the trace call is visible; the embedded line numbers
 * (55, 59) show that other lines of this function are omitted from this view,
 * so the actual pop logic must be checked in the complete file. */
55 xml_pop_comment(struct xml_context *ctx)
59 TRACE(ctx, "pop_comment");
/* Consume a comment without creating a DOM node.
 * On entry '<!-' has presumably been consumed already (the first call below
 * expects the second '-'), mirroring xml_push_comment(). */
63 xml_skip_comment(struct xml_context *ctx)
65 TRACE(ctx, "skip_comment");
66 xml_parse_char(ctx, '-');
/* Empty-bodied loop: it ends once two successive xml_get_char() calls both
 * return '-'. Because || short-circuits, the second call is made only when
 * the first one returned '-'. */
67 while (xml_get_char(ctx) != '-' || xml_get_char(ctx) != '-');
/* The "--" found above must be followed directly by '>'. */
68 xml_parse_char(ctx, '>');
72 /*** Processing instructions ***/
/* Parse a processing instruction into a new DOM node of type XML_NODE_PI.
 * The target name is stored in n->name (allocated from ctx->pool); the
 * instruction body is collected as UTF-8 in ctx->pool and stored in n->text
 * with its length in n->len.
 * NOTE(review): the embedded line numbers skip (89 -> 93, 94 -> 102, ...), so
 * the loop header and the branches that terminate on "?>" are not visible in
 * this view -- confirm against the complete file. */
75 xml_push_pi(struct xml_context *ctx)
77 TRACE(ctx, "push_pi");
78 /* Parses a PI to ctx->value and ctx->name:
79 * PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
80 * PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
81 * Already parsed: '<?' */
82 struct xml_node *n = xml_push_dom(ctx);
83 n->type = XML_NODE_PI;
84 n->name = xml_parse_name(ctx, ctx->pool);
/* Case-insensitive comparison: every spelling of "xml" is rejected as a
 * target, matching the PITarget production quoted above. */
85 if (unlikely(!strcasecmp(n->name, "xml")))
86 xml_error(ctx, "Reserved PI target");
87 char *p = mp_start_noalign(ctx->pool, 5);
/* Without whitespace after the target the grammar only allows an immediate
 * "?>" (presumably xml_parse_white() returns non-zero when it consumed
 * whitespace -- TODO confirm). */
88 if (!xml_parse_white(ctx, 0))
89 xml_parse_seq(ctx, "?>");
93 if (xml_get_char(ctx) == '?')
94 if (xml_peek_char(ctx) == '>')
102 p = utf8_32_put(p, xml_last_char(ctx));
103 p = mp_spread(ctx->pool, p, 5);
/* Length is the distance from mp_ptr(ctx->pool) to the write pointer;
 * mp_end() is given p + 1, i.e. one byte past the collected text. */
106 n->len = p - (char *)mp_ptr(ctx->pool);
107 n->text = mp_end(ctx->pool, p + 1);
/* Leave a processing-instruction node previously entered by xml_push_pi().
 * NOTE(review): only the trace call is visible; the embedded line numbers
 * (113, 117) show that other lines of this function are omitted from this
 * view, so the actual pop logic must be checked in the complete file. */
113 xml_pop_pi(struct xml_context *ctx)
117 TRACE(ctx, "pop_pi");
/* Consume a processing instruction without creating a DOM node.
 * When XML_FLAG_VALIDATING is set the target name is still parsed so that the
 * reserved target "xml" can be reported.
 * NOTE(review): the embedded line numbers skip (133 -> 139, 140 -> 146), so
 * the non-validating path and the loop around the final "?>" search are not
 * fully visible in this view. */
121 xml_skip_pi(struct xml_context *ctx)
123 TRACE(ctx, "skip_pi");
124 if (ctx->flags & XML_FLAG_VALIDATING)
/* The name is parsed into ctx->stack between mp_save() and mp_restore(), so
 * it is only needed for the comparison below. */
126 struct mempool_state state;
127 mp_save(ctx->stack, &state);
128 if (unlikely(!strcasecmp(xml_parse_name(ctx, ctx->stack), "xml")))
129 xml_error(ctx, "Reserved PI target");
130 mp_restore(ctx->stack, &state);
/* No whitespace after the target: the instruction must end right here. */
131 if (!xml_parse_white(ctx, 0))
133 xml_parse_seq(ctx, "?>");
/* A '?' only terminates the instruction when the next character is '>'. */
139 if (xml_get_char(ctx) == '?')
140 if (xml_peek_char(ctx) == '>')
146 /*** Character data ***/
/* Spout callback of the ctx->chars fastbuf (installed by xml_init_chars()).
 * It acts only when the write pointer has reached the end of the buffer:
 * a non-empty buffer is enlarged inside ctx->pool, otherwise (presumably the
 * else branch, whose keyword is not visible here) a new character-data node
 * is pushed and a fresh buffer is started.
 * NOTE(review): the embedded line numbers skip, so braces/else lines are
 * omitted from this view. */
149 xml_chars_spout(struct fastbuf *fb)
151 if (fb->bptr >= fb->bufend)
/* Recover the xml_context whose member `chars` is this fastbuf (SKIP_BACK is
 * presumably a container-of style macro -- TODO confirm). */
153 struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb);
154 struct mempool *pool = ctx->pool;
155 if (fb->bufend != fb->buffer)
/* Remember how many bytes the old buffer held so that writing can continue
 * at the same offset in the expanded one. */
157 uns len = fb->bufend - fb->buffer;
158 TRACE(ctx, "grow_chars");
159 fb->buffer = mp_expand(pool);
160 fb->bufend = fb->buffer + mp_avail(pool);
161 fb->bstop = fb->buffer;
162 fb->bptr = fb->buffer + len;
/* Empty buffer: open a new node for the upcoming characters. Note that it is
 * typed XML_NODE_CDATA, the same type xml_push_cdata() uses. */
166 TRACE(ctx, "push_chars");
167 struct xml_node *n = xml_push_dom(ctx);
168 n->type = XML_NODE_CDATA;
169 xml_start_chars(ctx);
/* Initialize the ctx->chars fastbuf used to accumulate character data.
 * It sets the name, the spout callback and can_overwrite_buffer, and resets
 * all buffer pointers to NULL, which is the "empty" state tested elsewhere via
 * bufend == buffer. */
175 xml_init_chars(struct xml_context *ctx)
177 struct fastbuf *fb = &ctx->chars;
178 fb->name = "<xml-chars>";
179 fb->spout = xml_chars_spout;
180 fb->can_overwrite_buffer = 1;
181 fb->bptr = fb->bstop = fb->buffer = fb->bufend = NULL;
/* Finish the character data collected in ctx->chars and attach it to the
 * current node (ctx->node).
 * Callers use the result as a boolean ("if (xml_flush_chars(ctx))").
 * NOTE(review): the return statements are on lines omitted from this view
 * (the embedded numbers skip 188 -> 190 and end at 193). */
185 xml_flush_chars(struct xml_context *ctx)
187 struct fastbuf *fb = &ctx->chars;
/* bufend == buffer means no buffer was started, i.e. nothing to flush. */
188 if (fb->bufend == fb->buffer)
190 TRACE(ctx, "flush_chars");
191 struct xml_node *n = ctx->node;
192 n->text = xml_end_chars(ctx, &n->len);
/* NOTE(review): n->len was just passed by address to xml_end_chars() and is
 * overwritten here with bufend - buffer; confirm which of the two values is
 * the intended length. */
193 n->len = fb->bufend - fb->buffer;
/* Leave a character-data node.
 * NOTE(review): only the trace call is visible; the embedded line numbers
 * (200, 203) show that other lines of this function are omitted from this
 * view, so the actual pop logic must be checked in the complete file. */
200 xml_pop_chars(struct xml_context *ctx)
203 TRACE(ctx, "pop_chars");
/* Append character data to ctx->chars until a '<' is read.
 * An '&' is handled separately, on lines not visible here (the embedded
 * numbers skip 212 -> 218; presumably a reference is parsed -- TODO confirm).
 * Other characters are written to the fastbuf as UTF-8. */
207 xml_append_chars(struct xml_context *ctx)
209 TRACE(ctx, "append_chars");
210 struct fastbuf *out = &ctx->chars;
211 while (xml_get_char(ctx) != '<')
212 if (xml_last_char(ctx) == '&')
218 bput_utf8_32(out, xml_last_char(ctx));
222 /*** CDATA sections ***/
/* Parse a CDATA section into a new DOM node of type XML_NODE_CDATA.
 * The content is collected as UTF-8 in ctx->pool and stored in n->text with
 * its length in n->len.
 * NOTE(review): the embedded line numbers skip (233 -> 236, 239 -> 245), so
 * the loop header and the handling of ']' characters that turn out not to
 * start "]]>" are not visible in this view -- confirm against the complete
 * file. */
225 xml_push_cdata(struct xml_context *ctx)
227 TRACE(ctx, "push_cdata");
228 /* CDSect ::= '<![CDATA[' (Char* - (Char* ']]>' Char*)) ']]>'
229 * Already parsed: '<![' */
230 xml_parse_seq(ctx, "CDATA[");
231 struct xml_node *n = xml_push_dom(ctx);
232 n->type = XML_NODE_CDATA;
/* Headroom of 7 bytes is requested here and again after each stored
 * character (larger than the 5/6 used for PIs and comments; presumably to
 * leave room for re-emitting ']' characters -- TODO confirm). */
233 char *p = mp_start_noalign(ctx->pool, 7);
/* Three nested tests look for the terminator "]]>". */
236 if (xml_get_char(ctx) == ']')
238 if (xml_get_char(ctx) == ']')
239 if (xml_get_char(ctx) == '>')
245 p = utf8_32_put(p, xml_last_char(ctx));
246 p = mp_spread(ctx->pool, p, 7);
/* Length is the distance from mp_ptr(ctx->pool) to the write pointer;
 * mp_end() is given p + 1, i.e. one byte past the collected text. */
249 n->len = p - (char *)mp_ptr(ctx->pool);
250 n->text = mp_end(ctx->pool, p + 1);
/* Leave a CDATA node previously entered by xml_push_cdata().
 * NOTE(review): only the trace call is visible; the embedded line numbers
 * (256, 260) show that other lines of this function are omitted from this
 * view, so the actual pop logic must be checked in the complete file. */
256 xml_pop_cdata(struct xml_context *ctx)
260 TRACE(ctx, "pop_cdata");
/* Parse a CDATA section and append its content to the pending character data
 * in ctx->chars instead of creating a separate node.
 * NOTE(review): the embedded line numbers skip (268 -> 271, 274 -> 280), so
 * the loop header and the handling of ']' characters that do not start "]]>"
 * are not visible in this view. */
264 xml_append_cdata(struct xml_context *ctx)
266 TRACE(ctx, "append_cdata");
267 xml_parse_seq(ctx, "CDATA[");
268 struct fastbuf *out = &ctx->chars;
/* Three nested tests look for the terminator "]]>". */
271 if (xml_get_char(ctx) == ']')
273 if (xml_get_char(ctx) == ']')
274 if (xml_get_char(ctx) == '>')
/* Ordinary content is written to the fastbuf as UTF-8. */
280 bput_utf8_32(out, xml_last_char(ctx));
/* Consume a CDATA section without storing its content.
 * NOTE(review): lines following the loop may be omitted from this view (the
 * embedded numbers jump from 290 to 294). */
286 xml_skip_cdata(struct xml_context *ctx)
288 TRACE(ctx, "skip_cdata");
289 xml_parse_seq(ctx, "CDATA[");
/* Empty-bodied loop that ends once three successive reads yield ']', ']', '>'.
 * NOTE(review): if xml_get_char() consumes one character per call, input such
 * as "]]]>" is mis-scanned -- the third ']' is consumed by the '>' comparison,
 * the scan restarts at '>', and the real terminator is missed. Confirm and
 * consider tracking the run of ']' characters instead. */
290 while (xml_get_char(ctx) != ']' || xml_get_char(ctx) != ']' || xml_get_char(ctx) != '>');
294 /*** Character references ***/
/* Parse a numeric character reference and yield its code point.
 * The only return visible in this view is the failure path, which returns
 * UNI_REPLACEMENT after reporting an error and skipping to ';'.
 * NOTE(review): the embedded line numbers skip throughout (the declaration of
 * v, the do/else keywords and the successful return are on omitted lines) --
 * confirm the exact control flow against the complete file. */
297 xml_parse_char_ref(struct xml_context *ctx)
299 TRACE(ctx, "parse_char_ref");
300 /* CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
301 * Already parsed: '&#' */
/* Hexadecimal form: the character after '&#' is 'x'. */
303 if (xml_get_char(ctx) == 'x')
305 if (!(xml_get_cat(ctx) & XML_CHAR_XDIGIT))
307 xml_error(ctx, "Expected a hexadecimal value of character reference");
312 v = (v << 4) + Cxvalue(xml_last_char(ctx));
/* Accumulation stops once v reaches 0x110000, so the value cannot keep
 * growing on an over-long digit string. */
314 while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_XDIGIT));
/* Decimal form: this tests the category of the character already read by the
 * 'x' check above (xml_last_cat rather than xml_get_cat). */
318 if (!(xml_last_cat(ctx) & XML_CHAR_DIGIT))
320 xml_error(ctx, "Expected a numeric value of character reference");
325 v = v * 10 + xml_last_char(ctx) - '0';
327 while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT));
/* The value is rejected when its category lacks XML_CHAR_UNRESTRICTED_1_1 and
 * either the document is flagged as version 1.1 or the category also lacks
 * XML_CHAR_VALID_1_0. */
329 uns cat = xml_char_cat(v);
330 if (!(cat & XML_CHAR_UNRESTRICTED_1_1) && ((ctx->flags & XML_FLAG_VERSION_1_1) || !(cat & XML_CHAR_VALID_1_0)))
332 xml_error(ctx, "Character reference out of range");
335 if (xml_last_char(ctx) == ';')
340 xml_error(ctx, "Expected ';'");
/* Error recovery: advance to the closing ';' (loop body is on an omitted
 * line) and substitute the replacement character. */
342 while (xml_last_char(ctx) != ';')
345 return UNI_REPLACEMENT;
348 /*** References to general entities ***/
/* Parse a reference that follows an already consumed '&' and emit its
 * expansion into ctx->chars.
 * Character references are written directly as UTF-8. Named entities are
 * looked up with xml_dtd_find_gent(): an unknown entity is reported, a trivial
 * one has its text copied, and any other is entered with xml_push_entity().
 * NOTE(review): the embedded line numbers skip, so else keywords, braces and
 * the test for a failed lookup are omitted from this view. */
351 xml_parse_ref(struct xml_context *ctx)
353 /* Reference ::= EntityRef | CharRef
354 * EntityRef ::= '&' Name ';'
355 * Already parsed: '&' */
356 struct fastbuf *out = &ctx->chars;
357 if (xml_peek_char(ctx) == '#')
360 bput_utf8_32(out, xml_parse_char_ref(ctx));
364 TRACE(ctx, "parse_ge_ref");
/* The entity name is parsed into ctx->stack; the saved state lets it be
 * released again on every path below. */
365 struct mempool_state state;
366 mp_save(ctx->stack, &state);
367 char *name = xml_parse_name(ctx, ctx->stack);
368 xml_parse_char(ctx, ';');
369 struct xml_dtd_ent *ent = xml_dtd_find_gent(ctx, name);
372 xml_error(ctx, "Unknown entity &%s;", name);
377 else if (ent->flags & XML_DTD_ENT_TRIVIAL)
379 TRACE(ctx, "Trivial entity &%s;", name);
380 bwrite(out, ent->text, ent->len);
384 TRACE(ctx, "Pushed entity &%s;", name);
/* The stack is restored before the entity is pushed, so `name` is not used
 * after this point on this path. */
385 mp_restore(ctx->stack, &state);
387 xml_push_entity(ctx, ent);
390 mp_restore(ctx->stack, &state);
395 /*** Attribute values ***/
/* Parse a quoted attribute value and return the text produced by
 * xml_end_chars(). The `attr` parameter is marked UNUSED.
 * The value is accumulated through the ctx->chars fastbuf: the closing quote
 * ends the value, '<' is reported as an error, whitespace gets special
 * treatment, and other characters are written as UTF-8.
 * NOTE(review): the embedded line numbers skip (410 -> 413 -> 419, 423 -> 426,
 * 428 -> 430), so the loop header, the '&' and '<' tests, the whitespace
 * action and the declaration of `len` are omitted from this view. */
398 xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr UNUSED)
400 TRACE(ctx, "parse_attr_value");
401 /* AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'" */
403 * -- copying from ctx->chars to ctx->pool is not necessary, we could directly write to ctx->pool
404 * -- berare quotes inside parased entities
405 * -- check value constrains / normalize value */
406 struct mempool_state state;
/* Remember which quote character opened the value; the same one closes it. */
407 uns quote = xml_parse_quote(ctx);
408 mp_save(ctx->stack, &state);
409 xml_start_chars(ctx);
410 struct fastbuf *out = &ctx->chars;
413 uns c = xml_get_char(ctx);
419 else if (c == quote) // FIXME: beware quotes inside parsed entities
422 xml_error(ctx, "Attribute value must not contain '<'");
423 else if (xml_last_cat(ctx) & XML_CHAR_WHITE)
426 bput_utf8_32(out, c);
428 mp_restore(ctx->stack, &state);
430 return xml_end_chars(ctx, &len);
/* Forward declaration; the table type is only used through pointers in the
 * hash callbacks below. */
435 struct xml_attrs_table;
/* Hash function for the attribute table (the table argument is unused).
 * The key is the pair (owning element, attribute name): the hash combines the
 * element pointer's hash with the name's string hash using XOR. */
438 xml_attrs_hash(struct xml_attrs_table *t UNUSED, struct xml_node *e, char *n)
440 return hash_pointer(e) ^ hash_string(n);
/* Key equality for the attribute table (the table argument is unused).
 * Two keys match when they refer to the same element node (pointer identity)
 * and their names compare equal with strcmp(). */
444 xml_attrs_eq(struct xml_attrs_table *t UNUSED, struct xml_node *e1, char *n1, struct xml_node *e2, char *n2)
446 return (e1 == e2) && !strcmp(n1, n2);
/* Key initializer for a newly created attribute entry (the table argument is
 * unused). The visible statement appends the attribute to the tail of the
 * owning element's attribute list.
 * NOTE(review): the embedded line numbers skip (450 -> 455), so the
 * assignments that store `e` and `name` into `a` are presumably on lines
 * omitted from this view -- TODO confirm. */
450 xml_attrs_init_key(struct xml_attrs_table *t UNUSED, struct xml_attr *a, struct xml_node *e, char *name)
455 slist_add_tail(&e->attrs, &a->n);
/* Configuration of the generic hash table template in lib/hashtable.h.
 * It instantiates a table of struct xml_attr keyed by (element, name), with
 * all generated identifiers prefixed xml_attrs_. This file calls
 * xml_attrs_init, xml_attrs_cleanup, xml_attrs_lookup and xml_attrs_remove.
 * NOTE(review): the embedded line numbers skip (462 -> 464, 470 -> 472), so
 * further configuration macros may be omitted from this view. */
458 #define HASH_PREFIX(x) xml_attrs_##x
459 #define HASH_NODE struct xml_attr
/* Two-part key: the owning element and the attribute name. */
460 #define HASH_KEY_COMPLEX(x) x elem, x name
461 #define HASH_KEY_DECL struct xml_node *elem, char *name
462 #define HASH_TABLE_DYNAMIC
/* GIVE_* presumably declares that this file supplies the callback itself
 * (xml_attrs_hash, xml_attrs_init_key, an allocator); WANT_* presumably
 * requests generation of the named operation -- see lib/hashtable.h. */
464 #define HASH_GIVE_HASHFN
465 #define HASH_GIVE_INIT_KEY
466 #define HASH_WANT_CLEANUP
467 #define HASH_WANT_REMOVE
468 #define HASH_WANT_LOOKUP
469 #define HASH_WANT_FIND
470 #define HASH_GIVE_ALLOC
472 #include "lib/hashtable.h"
/* Parse one attribute (name and value) of the current element ctx->node.
 * The name is allocated from ctx->pool and entered into the attribute hash
 * table via xml_attrs_lookup(); a duplicate is reported with "Attribute %s is
 * not unique".
 * NOTE(review): the embedded line numbers skip (484 -> 486 -> 488), so the Eq
 * parsing, the duplicate test and the storing of the value are on lines
 * omitted from this view. */
475 xml_parse_attr(struct xml_context *ctx)
477 TRACE(ctx, "parse_attr");
478 /* Attribute ::= Name Eq AttValue */
480 * -- memory management
482 struct xml_node *e = ctx->node;
483 char *n = xml_parse_name(ctx, ctx->pool);
484 struct xml_attr *a = xml_attrs_lookup(ctx->tab_attrs, e, n);
/* No attribute declaration is passed to the value parser (NULL). */
486 char *v = xml_parse_attr_value(ctx, NULL);
488 xml_error(ctx, "Attribute %s is not unique", n);
/* Parse a start tag or empty-element tag into a new XML_NODE_ELEM node.
 * It initializes the node's child and attribute lists, parses the name into
 * ctx->pool, sets XML_FLAG_EMPTY_ELEM for the empty-element form, and finally
 * calls the h_element_start hook when one is set.
 * NOTE(review): the embedded line numbers skip (507 -> 511, 512 -> 516,
 * 517 -> 520, 521 -> 527), so the attribute loop, the tests on `c` and the
 * condition guarding the root-element check are omitted from this view. */
496 xml_push_element(struct xml_context *ctx)
498 TRACE(ctx, "push_element");
499 /* EmptyElemTag | STag
500 * EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
501 * STag ::= '<' Name (S Attribute)* S? '>'
502 * Already parsed: '<' */
503 struct xml_node *e = xml_push_dom(ctx);
504 clist_init(&e->sons);
505 e->type = XML_NODE_ELEM;
506 e->name = xml_parse_name(ctx, ctx->pool);
507 slist_init(&e->attrs);
/* Per its message this check is meant for the root element only; it applies
 * only when a document type has been declared. */
511 if (ctx->document_type && strcmp(e->name, ctx->document_type))
512 xml_error(ctx, "The root element %s does not match the document type %s", e->name, ctx->document_type);
/* `white` records whether whitespace preceded the next token; it is
 * presumably what decides the "expected white" failure below. */
516 uns white = xml_parse_white(ctx, 0);
517 uns c = xml_get_char(ctx);
520 xml_parse_char(ctx, '>');
521 ctx->flags |= XML_FLAG_EMPTY_ELEM;
527 xml_fatal_expected_white(ctx);
531 if (ctx->h_element_start)
532 ctx->h_element_start(ctx);
/* Leave the current element: call the h_element_end hook when set and, if
 * XML_DOM_FREE is set in ctx->flags, remove the attributes of the element and
 * of its descendant elements from the attribute hash table.
 * NOTE(review): the embedded line numbers skip (542 -> 546, 548 -> 550,
 * 556 -> 566), so the declaration of `n`, the removal of visited children and
 * the final DOM pop are on lines omitted from this view. */
536 xml_pop_element(struct xml_context *ctx)
538 TRACE(ctx, "pop_element");
539 if (ctx->h_element_end)
540 ctx->h_element_end(ctx);
541 struct xml_node *e = ctx->node;
542 if (ctx->flags & XML_DOM_FREE)
546 /* Restore hash table of attributes */
547 SLIST_FOR_EACH(struct xml_attr *, a, e->attrs)
548 xml_attrs_remove(ctx->tab_attrs, a);
/* Assignment in the condition is intentional: iterate while the child list
 * still has a head. */
550 while (n = clist_head(&e->sons))
552 if (n->type == XML_NODE_ELEM)
554 SLIST_FOR_EACH(struct xml_attr *, a, n->attrs)
555 xml_attrs_remove(ctx->tab_attrs, a);
/* Splice the child's own children into the list right after it, so that
 * deeper descendants are reached by this same loop without recursion. */
556 clist_insert_list_after(&n->sons, &n->n);
/* Parse an end tag and check that its name matches the current element.
 * The element name is decoded code point by code point (utf8_32_get) and
 * compared with the input; optional whitespace and '>' must follow. On a
 * mismatch an error is reported and input is skipped up to the next '>'.
 * NOTE(review): the embedded line numbers skip (570 -> 576, 577 -> 580,
 * 581 -> 584), so the declarations of `n` and `c`, the loop header and the
 * jumps between the success and error paths are omitted from this view. */
566 xml_parse_etag(struct xml_context *ctx)
568 /* ETag ::= '</' Name S? '>'
569 * Already parsed: '<' */
570 struct xml_node *e = ctx->node;
576 n = utf8_32_get(n, &c);
577 if (xml_get_char(ctx) != c)
580 xml_parse_white(ctx, 0);
581 if (xml_get_char(ctx) != '>')
584 xml_error(ctx, "Invalid ETag, expected </%s>", e->name);
/* Error recovery: discard input up to and including the next '>'. */
585 while (xml_get_char(ctx) != '>');
590 /*** Document type declaration ***/
/* Parse the head of a document type declaration.
 * It stores the document type name in ctx->document_type and, when an
 * external ID follows, its literals in ctx->eid, setting
 * XML_FLAG_HAS_EXTERNAL_SUBSET. XML_FLAG_HAS_INTERNAL_SUBSET is set when the
 * next character is '['. The h_doctype_decl hook is called last when set.
 * A second declaration is a fatal error.
 * NOTE(review): the embedded line numbers skip (604 -> 606 -> 610,
 * 612 -> 616), so the declaration of `c` and the test selecting between the
 * SYSTEM and PUBLIC branches are omitted from this view. */
593 xml_parse_doctype_decl(struct xml_context *ctx)
595 TRACE(ctx, "parse_doctype_decl");
596 /* doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
597 * Already parsed: '<!'
598 * Terminated before '[' or '>' */
599 if (ctx->document_type)
600 xml_fatal(ctx, "Multiple document types not allowed");
601 xml_parse_seq(ctx, "DOCTYPE");
602 xml_parse_white(ctx, 1);
603 ctx->document_type = xml_parse_name(ctx, ctx->pool);
604 TRACE(ctx, "doctyype=%s", ctx->document_type);
/* An external ID is recognized by whitespace followed by 'S' (SYSTEM) or 'P'
 * (PUBLIC); the character is only peeked, the keyword is parsed below. */
606 if (xml_parse_white(ctx, 0) && ((c = xml_peek_char(ctx)) == 'S' || c == 'P'))
610 xml_parse_seq(ctx, "SYSTEM");
611 xml_parse_white(ctx, 1);
612 ctx->eid.system_id = xml_parse_system_literal(ctx, ctx->pool);
/* PUBLIC carries both a public identifier and a system literal. */
616 xml_parse_seq(ctx, "PUBLIC");
617 xml_parse_white(ctx, 1);
618 ctx->eid.public_id = xml_parse_pubid_literal(ctx, ctx->pool);
619 xml_parse_white(ctx, 1);
620 ctx->eid.system_id = xml_parse_system_literal(ctx, ctx->pool);
622 xml_parse_white(ctx, 0);
623 ctx->flags |= XML_FLAG_HAS_EXTERNAL_SUBSET;
625 if (xml_peek_char(ctx) == '[')
626 ctx->flags |= XML_FLAG_HAS_INTERNAL_SUBSET;
627 if (ctx->h_doctype_decl)
628 ctx->h_doctype_decl(ctx);
633 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
635 /* DTD: Internal subset */
/* Parse the internal DTD subset: a sequence of markup declarations and
 * parameter-entity references.
 * After "<!" the next character selects the declaration. The literal tails
 * parsed below ("OTATION", "TITY", "EMENT", "TTLIST") are the remainders of
 * NOTATION, ENTITY, ELEMENT and ATTLIST after their leading letter(s) have
 * been consumed by xml_get_char(). Anything else ends in a fatal error.
 * NOTE(review): the embedded line numbers skip heavily, so the loop header,
 * the case labels, the PI branch and the exit on the closing bracket are
 * omitted from this view. The note "Already parsed: ']'" below presumably
 * means '[' -- TODO confirm. */
638 xml_parse_internal_subset(struct xml_context *ctx)
640 // FIXME: comments/pi have no parent
642 * intSubset :== (markupdecl | DeclSep)
643 * Already parsed: ']' */
646 xml_parse_white(ctx, 0);
647 uns c = xml_get_char(ctx);
650 if ((c = xml_get_char(ctx)) == '!')
651 switch (c = xml_get_char(ctx))
654 xml_push_comment(ctx);
655 xml_pop_comment(ctx);
658 xml_parse_seq(ctx, "OTATION");
659 xml_parse_notation_decl(ctx);
/* After 'E', an 'N' selects ENTITY; the ELEMENT tail is parsed otherwise. */
662 if ((c = xml_get_char(ctx)) == 'N')
664 xml_parse_seq(ctx, "TITY");
665 xml_parse_entity_decl(ctx);
669 xml_parse_seq(ctx, "EMENT");
670 xml_parse_element_decl(ctx);
676 xml_parse_seq(ctx, "TTLIST");
677 xml_parse_attr_list_decl(ctx);
690 xml_parse_pe_ref(ctx);
700 xml_fatal(ctx, "Invalid markup in the internal subset");
704 /*----------------------------------------------*/
/* Initialize a parser context.
 * It zeroes the whole structure, creates two memory pools (ctx->pool and
 * ctx->stack, each created with mp_new(65536)), sets the flags to
 * XML_DOM_FREE, and allocates and initializes the attribute hash table.
 * NOTE(review): the embedded line numbers skip (712 -> 715), so further
 * initialization steps may be omitted from this view. */
707 xml_init(struct xml_context *ctx)
709 bzero(ctx, sizeof(*ctx));
710 ctx->pool = mp_new(65536);
711 ctx->stack = mp_new(65536);
712 ctx->flags = XML_DOM_FREE;
/* The table object is obtained from xml_hash_new() with ctx->pool as its
 * argument and then initialized by the generated xml_attrs_init(). */
715 xml_attrs_init(ctx->tab_attrs = xml_hash_new(ctx->pool, sizeof(struct xml_attrs_table)));
/* Release everything owned by a parser context set up by xml_init().
 * The attribute table and the DTD are cleaned up first, while ctx->pool still
 * exists (the table was created through xml_hash_new(ctx->pool, ...)); both
 * memory pools are deleted afterwards. */
719 xml_cleanup(struct xml_context *ctx)
721 xml_attrs_cleanup(ctx->tab_attrs);
722 xml_dtd_cleanup(ctx);
723 mp_delete(ctx->pool);
724 mp_delete(ctx->stack);
728 xml_next(struct xml_context *ctx)
730 /* A nasty state machine */
732 TRACE(ctx, "xml_next (state=%u)", ctx->state);
734 ctx->throw_buf = &throw_buf;
735 if (setjmp(throw_buf))
738 if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal)
740 ctx->state = XML_STATE_FATAL;
741 TRACE(ctx, "raised fatal error");
747 case XML_STATE_FATAL:
750 case XML_STATE_START:
751 TRACE(ctx, "entering prolog");
752 if (ctx->h_document_start)
753 ctx->h_document_start(ctx);
757 ctx->h_xml_decl(ctx);
758 if (ctx->want & XML_WANT_DECL)
759 return ctx->state = XML_STATE_DECL;
762 /* Misc* (doctypedecl Misc*)? */
765 xml_parse_white(ctx, 0);
766 xml_parse_char(ctx, '<');
767 if ((c = xml_get_char(ctx)) == '?')
768 /* Processing instruction */
769 if (!(ctx->want & XML_WANT_PI))
774 ctx->state = XML_STATE_PROLOG_PI;
776 case XML_STATE_PROLOG_PI:
781 /* Found the root tag */
785 else if (xml_get_char(ctx) == '-')
786 if (!(ctx->want & XML_WANT_COMMENT))
787 xml_skip_comment(ctx);
790 xml_push_comment(ctx);
791 ctx->state = XML_STATE_PROLOG_COMMENT;
792 return XML_STATE_COMMENT;
793 case XML_STATE_PROLOG_COMMENT:
794 xml_pop_comment(ctx);
800 xml_parse_doctype_decl(ctx);
801 if (ctx->want & XML_WANT_DOCUMENT_TYPE)
802 return ctx->state = XML_STATE_DOCUMENT_TYPE;
803 case XML_STATE_DOCUMENT_TYPE:
804 if (xml_peek_char(ctx) == '[')
808 xml_parse_internal_subset(ctx);
809 xml_parse_white(ctx, 0);
811 xml_parse_char(ctx, '>');
815 case XML_STATE_CHARS:
819 if (xml_peek_char(ctx) != '<')
822 xml_append_chars(ctx);
830 if ((c = xml_get_char(ctx)) == '?')
833 if (!(ctx->want & XML_WANT_PI))
837 if (xml_flush_chars(ctx))
839 if (ctx->want & XML_WANT_CHARS)
841 ctx->state = XML_STATE_CHARS_BEFORE_PI;
842 return XML_STATE_CHARS;
844 case XML_STATE_CHARS_BEFORE_PI:
848 return ctx->state = XML_STATE_PI;
855 if ((c = xml_get_char(ctx)) == '-')
858 if (!(ctx->want & XML_WANT_COMMENT))
859 xml_skip_comment(ctx);
862 if (xml_flush_chars(ctx))
864 if (ctx->want & XML_WANT_CHARS)
866 ctx->state = XML_STATE_CHARS_BEFORE_COMMENT;
867 return XML_STATE_CHARS;
869 case XML_STATE_CHARS_BEFORE_COMMENT:
872 xml_push_comment(ctx);
873 return ctx->state = XML_STATE_COMMENT;
874 case XML_STATE_COMMENT:
875 xml_pop_comment(ctx);
881 if (!(ctx->want & XML_WANT_CDATA))
882 xml_append_cdata(ctx);
885 if (xml_flush_chars(ctx))
887 if (ctx->want & XML_WANT_CHARS)
889 ctx->state = XML_STATE_CHARS_BEFORE_CDATA;
890 return XML_STATE_CHARS;
892 case XML_STATE_CHARS_BEFORE_CDATA:
896 return ctx->state = XML_STATE_CDATA;
897 case XML_STATE_CDATA:
902 xml_fatal(ctx, "Unexpected character after '<!'");
906 /* STag | EmptyElemTag */
908 if (xml_flush_chars(ctx))
910 if (ctx->want & XML_WANT_CHARS)
912 ctx->state = XML_STATE_CHARS_BEFORE_STAG;
913 return XML_STATE_CHARS;
915 case XML_STATE_CHARS_BEFORE_STAG:
919 xml_push_element(ctx);
920 if (ctx->want & XML_WANT_STAG)
921 return ctx->state = XML_STATE_STAG;
923 if (ctx->flags & XML_FLAG_EMPTY_ELEM)
930 if (xml_flush_chars(ctx))
932 if (ctx->want & XML_WANT_CHARS)
934 ctx->state = XML_STATE_CHARS_BEFORE_ETAG;
935 return XML_STATE_CHARS;
937 case XML_STATE_CHARS_BEFORE_ETAG:
943 if (ctx->want & XML_WANT_ETAG)
944 return ctx->state = XML_STATE_ETAG;
946 xml_pop_element(ctx);
954 TRACE(ctx, "entering epilog");
957 /* Epilog whitespace is the only place where a valid document can reach EOF */
958 if (setjmp(throw_buf))
959 if (ctx->err_code == XML_ERR_EOF)
961 TRACE(ctx, "reached EOF");
962 ctx->state = XML_STATE_EOF;
963 if (ctx->h_document_end)
964 ctx->h_document_end(ctx);
966 return XML_STATE_EOF;
970 xml_parse_white(ctx, 0);
971 if (setjmp(throw_buf))
975 xml_parse_char(ctx, '<');
976 if ((c = xml_get_char(ctx)) == '?')
977 /* Processing instruction */
978 if (!(ctx->want & XML_WANT_PI))
983 return ctx->state = XML_STATE_EPILOG_PI, XML_STATE_PI;
984 case XML_STATE_EPILOG_PI:
989 if (!(ctx->want & XML_WANT_COMMENT))
990 xml_skip_comment(ctx);
993 xml_push_comment(ctx);
994 return ctx->state = XML_STATE_EPILOG_COMMENT, XML_STATE_COMMENT;
995 case XML_STATE_EPILOG_COMMENT:
996 xml_pop_comment(ctx);
999 xml_fatal(ctx, "Syntax error in the epilog");