X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;ds=sidebyside;f=sherlock%2Fxml%2Fxml.h;h=ac9ebefb70f8a88ab6f1056326dd876eb0e44002;hb=83400e61386e912475562889bfb2c39dc5ecf6d0;hp=7e83f65ad31ef20164a4baada8d9494ee3020c96;hpb=637533a60b2201eaadedcb00fc66ef1e20237432;p=libucw.git diff --git a/sherlock/xml/xml.h b/sherlock/xml/xml.h index 7e83f65a..ac9ebefb 100644 --- a/sherlock/xml/xml.h +++ b/sherlock/xml/xml.h @@ -1,7 +1,7 @@ /* * Sherlock Library -- A simple XML parser * - * (c) 2007 Pavel Charvat + * (c) 2007--2008 Pavel Charvat * * This software may be freely distributed and used according to the terms * of the GNU Lesser General Public License. @@ -13,133 +13,143 @@ #include "lib/clists.h" #include "lib/slists.h" #include "lib/mempool.h" +#include "lib/fastbuf.h" + +struct xml_context; +struct xml_dtd_entity; enum xml_error { XML_ERR_OK = 0, - XML_ERR_WARN = 1000, /* Warning */ - XML_ERR_ERROR = 2000, /* Recoverable error */ - XML_ERR_FATAL = 3000, /* Unrecoverable error */ + XML_ERR_WARN = 1000, /* Warning */ + XML_ERR_ERROR = 2000, /* Recoverable error */ + XML_ERR_FATAL = 3000, /* Unrecoverable error */ XML_ERR_EOF, }; enum xml_state { - XML_STATE_START = 0, - XML_STATE_DECL, - XML_STATE_DOCUMENT_TYPE, - XML_STATE_CHARS, - XML_STATE_WHITE, - XML_STATE_CDATA, - XML_STATE_STAG, - XML_STATE_ETAG, - XML_STATE_COMMENT, - XML_STATE_PI, - XML_STATE_EOF, - XML_STATE_FATAL, + XML_STATE_EOF, /* EOF or a fatal error */ + XML_STATE_START, /* Initial state */ + XML_STATE_XML_DECL, /* XML_PULL_XML_DECL */ + XML_STATE_DOCTYPE_DECL, /* XML_PULL_DOCTYPE_DECL */ + XML_STATE_CHARS, /* XML_PULL_CHARS */ + XML_STATE_STAG, /* XML_PULL_STAG */ + XML_STATE_ETAG, /* XML_PULL_ETAG */ + XML_STATE_COMMENT, /* XML_PULL_COMMENT */ + XML_STATE_PI, /* XML_PULL_PI */ /* Internal states */ XML_STATE_CHARS_BEFORE_STAG, XML_STATE_CHARS_BEFORE_ETAG, XML_STATE_CHARS_BEFORE_CDATA, - XML_STATE_CHARS_BEFORE_PI, XML_STATE_CHARS_BEFORE_COMMENT, - XML_STATE_PROLOG_PI, + XML_STATE_CHARS_BEFORE_PI, 
XML_STATE_PROLOG_COMMENT, - XML_STATE_EPILOG_PI, + XML_STATE_PROLOG_PI, XML_STATE_EPILOG_COMMENT, + XML_STATE_EPILOG_PI, }; -enum xml_want { - XML_WANT_DECL = 1 << XML_STATE_DECL, - XML_WANT_DOCUMENT_TYPE = 1 << XML_STATE_DOCUMENT_TYPE, - XML_WANT_CHARS = 1 << XML_STATE_CHARS, - XML_WANT_WHITE = 1 << XML_STATE_WHITE, - XML_WANT_CDATA = 1 << XML_STATE_CDATA, - XML_WANT_STAG = 1 << XML_STATE_STAG, - XML_WANT_ETAG = 1 << XML_STATE_ETAG, - XML_WANT_COMMENT = 1 << XML_STATE_COMMENT, - XML_WANT_PI = 1 << XML_STATE_PI, - XML_WANT_EOF = 1 << XML_STATE_EOF, - XML_WANT_ALL = ~0U, +enum xml_pull { + XML_PULL_XML_DECL = 0x00000001, /* Stop after the XML declaration */ + XML_PULL_DOCTYPE_DECL = 0x00000002, /* Stop in the doctype declaration (before optional internal subset) */ + XML_PULL_CHARS = 0x00000004, + XML_PULL_STAG = 0x00000008, + XML_PULL_ETAG = 0x00000010, + XML_PULL_COMMENT = 0x00000020, + XML_PULL_PI = 0x00000040, + XML_PULL_ALL = 0xffffffff, }; enum xml_flags { - XML_FLAG_VALIDATING = 0x1, - XML_FLAG_VERSION_1_1 = 0x2, /* XML version 1.1, otherwise 1.0 */ - XML_FLAG_HAS_EXTERNAL_SUBSET = 0x4, /* The document contains a reference to external DTD subset */ - XML_FLAG_HAS_INTERNAL_SUBSET = 0x8, /* The document contains an internal subset */ - - XML_FLAG_SRC_EOF = 0x10, /* EOF reached */ - XML_FLAG_SRC_EXPECTED_DECL = 0x20, /* Just before optional or required XMLDecl/TextDecl */ - XML_FLAG_SRC_NEW_LINE = 0x40, /* The last read character is 0xD */ - XML_FLAG_SRC_SURROUND = 0x80, /* Surround the text with 0x20 (references to parameter entities) */ - XML_FLAG_SRC_DOCUMENT = 0x100, /* The document entity */ - XML_FLAG_SRC_EXTERNAL = 0x200, /* An external entity */ - - XML_DOM_SKIP = 0x1000, /* Do not report DOM nodes */ - XML_DOM_FREE = 0x2000, /* Free the subtree when leaving */ - XML_DOM_IGNORE = XML_DOM_SKIP | XML_DOM_FREE, /* Completely ignore the subtree */ - - XML_FLAG_EMPTY_ELEM = 0x100000, -}; - -struct xml_ext_id { - char *system_id; - char *public_id; + /* Enable 
reporting of various events via SAX and/or PULL interface */ + XML_REPORT_COMMENTS = 0x00000001, /* Report comments */ + XML_REPORT_PIS = 0x00000002, /* Report processing instructions */ + XML_REPORT_CHARS = 0x00000004, /* Report characters */ + XML_REPORT_TAGS = 0x00000008, /* Report element starts/ends */ + XML_REPORT_MISC = XML_REPORT_COMMENTS | XML_REPORT_PIS, + XML_REPORT_ALL = XML_REPORT_MISC | XML_REPORT_CHARS | XML_REPORT_TAGS, + + /* Enable construction of DOM for these types */ + XML_ALLOC_COMMENTS = 0x00000010, /* Create comment nodes */ + XML_ALLOC_PIS = 0x00000020, /* Create processing instruction nodes */ + XML_ALLOC_CHARS = 0x00000040, /* Create character nodes */ + XML_ALLOC_TAGS = 0x00000080, /* Create element nodes */ + XML_ALLOC_MISC = XML_ALLOC_COMMENTS | XML_ALLOC_PIS, + XML_ALLOC_ALL = XML_ALLOC_MISC | XML_ALLOC_CHARS | XML_ALLOC_TAGS, + + /* Other parameters */ + XML_VALIDATING = 0x00000100, /* Validate everything (not fully implemented!) */ + XML_PARSE_DTD = 0x00000200, /* Enable parsing of DTD */ + + /* Internals, do not change! 
*/ + XML_EMPTY_ELEM_TAG = 0x00010000, /* The current element matches EmptyElemTag */ + XML_VERSION_1_1 = 0x00020000, /* XML version is 1.1, otherwise 1.0 */ + XML_HAS_EXTERNAL_SUBSET = 0x00040000, /* The document contains a reference to external DTD subset */ + XML_HAS_INTERNAL_SUBSET = 0x00080000, /* The document contains an internal subset */ + XML_SRC_EOF = 0x00100000, /* EOF reached */ + XML_SRC_EXPECTED_DECL = 0x00200000, /* Just before optional or required XMLDecl/TextDecl */ + XML_SRC_DOCUMENT = 0x00400000, /* The document entity */ + XML_SRC_EXTERNAL = 0x00800000, /* An external entity */ }; enum xml_node_type { XML_NODE_ELEM, XML_NODE_COMMENT, - XML_NODE_CDATA, + XML_NODE_CHARS, XML_NODE_PI, }; -struct xml_node { - cnode n; /* Node for list of parent's sons */ - uns type; /* XML_NODE_x */ - struct xml_node *parent; /* Parent node */ -}; +#define XML_NODE_FOR_EACH(var, node) CLIST_FOR_EACH(struct xml_node *, var, (node)->sons) +#define XML_ATTR_FOR_EACH(var, node) SLIST_FOR_EACH(struct xml_attr *, var, (node)->attrs) -struct xml_elem { - struct xml_node node; - char *name; /* Element name */ - clist sons; /* List of subnodes */ - struct xml_dtd_elem *dtd; /* Element DTD */ - slist attrs; /* Link list of attributes */ +struct xml_node { + cnode n; /* Node for list of parent's sons */ + uns type; /* XML_NODE_x */ + struct xml_node *parent; /* Parent node */ + char *name; /* Element name / PI target */ + clist sons; /* Children nodes */ + union { + struct { + char *text; /* PI text / Comment / CDATA */ + uns len; /* Text length in bytes */ + }; + struct { + struct xml_dtd_elem *dtd; /* Element DTD */ + slist attrs; /* Link list of element attributes */ + }; + }; + void *user; /* User-defined (initialized to NULL) */ }; struct xml_attr { - snode n; - struct xml_elem *elem; - char *name; - char *val; -}; - -struct xml_context; - -struct xml_stack { - struct xml_stack *next; /* Link list of stack records */ - uns saved_flags; /* Saved ctx->flags */ - struct 
mempool_state saved_pool; /* Saved ctx->pool state */ + snode n; /* Node for elem->attrs */ + struct xml_node *elem; /* Parent element */ + struct xml_dtd_attr *dtd; /* Attribute DTD */ + char *name; /* Attribute name */ + char *val; /* Attribute value */ + void *user; /* User-defined (initialized to NULL) */ }; -#define XML_BUF_SIZE 32 /* At least 16 -- hardcoded */ +#define XML_BUF_SIZE 32 /* At least 8 -- hardcoded */ struct xml_source { - struct xml_source *next; /* Link list of pending fastbufs (xml_context.sources) */ - struct fastbuf *fb; /* Source fastbuf */ - struct fastbuf wrap_fb; /* Libcharset or fbmem wrapper */ - u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */ - u32 *bptr, *bstop; /* Current state of the buffer */ - uns row; /* File position */ - char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */ - char *fb_encoding; /* Encoding of the source fastbuf */ - char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */ - uns refill_cat1; /* Character categories, which should be directly passed to the buffer */ - uns refill_cat2; /* Character categories, which should be processed as newlines (possibly in some built-in sequences) */ - void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */ - unsigned short *refill_in_to_x; /* Libcharset input table */ - uns saved_depth; /* Saved ctx->depth */ + struct xml_source *next; /* Link list of pending fastbufs (xml_context.sources) */ + struct fastbuf *fb; /* Source fastbuf */ + struct fastbuf *wrapped_fb; /* Original wrapped fastbuf (needed for cleanup) */ + struct fastbuf wrap_fb; /* Fbmem wrapper */ + u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */ + u32 *bptr, *bstop; /* Current state of the buffer */ + uns row; /* File position */ + char *expected_encoding; /* Initial encoding before any transformation has been made (expected in 
XMLDecl/TextDecl) */ + char *fb_encoding; /* Encoding of the source fastbuf */ + char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */ + uns refill_cat1; /* Character categories, which should be directly passed to the buffer */ + uns refill_cat2; /* Character categories, which should be processed as newlines (possibly in some built-in + sequences) */ + void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */ + unsigned short *refill_in_to_x; /* Libcharset input table */ + uns saved_depth; /* Saved ctx->depth */ + uns pending_0xd; /* The last read character is 0xD */ }; struct xml_context { @@ -152,61 +162,92 @@ struct xml_context { void (*h_fatal)(struct xml_context *ctx); /* Unrecoverable error callback */ /* Memory management */ - struct mempool *pool; /* Most data */ - struct fastbuf *chars; /* Character data */ - struct fastbuf *value; /* Attribute value / comment / processing instruction data */ - char *name; /* Attribute name, processing instruction target */ - void *tab_attrs; - - /* Stack */ - struct xml_stack *stack; /* See xml_push(), xml_pop() */ + struct mempool *pool; /* DOM pool */ + struct mempool *stack; /* Stack pool (freed as soon as possible) */ + struct xml_stack *stack_list; /* See xml_push(), xml_pop() */ uns flags; /* XML_FLAG_x (restored on xml_pop()) */ - uns depth; /* Nesting level */ + uns depth; /* Nesting level (for checking of valid source nesting -> valid pushes/pops on memory pools) */ + struct fastbuf chars; /* Character data / attribute value */ + struct mempool_state chars_state; /* Mempool state before the current character block has started */ + char *chars_trivial; /* If not empty, it will be appended to chars */ + void *tab_attrs; /* Hash table of element attributes */ /* Input */ struct xml_source *src; /* Current source */ - u32 *bptr, *bstop; /* Character buffer */ + u32 *bptr, *bstop; /* Buffer with preprocessed characters (validated UCS-4 + category flags) */ + uns 
cat_chars; /* Unicode range of supported characters (cdata, attribute values, ...) */ + uns cat_unrestricted; /* Unrestricted characters (may appear in document/external entities) */ + uns cat_new_line; /* New line characters */ + uns cat_name; /* Characters that may appear in names */ + uns cat_sname; /* Characters that may begin a name */ /* SAX-like interface */ void (*h_document_start)(struct xml_context *ctx); /* Called before entering prolog */ void (*h_document_end)(struct xml_context *ctx); /* Called after leaving epilog */ void (*h_xml_decl)(struct xml_context *ctx); /* Called after the XML declaration */ - void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration just before internal subset */ - void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction */ - void (*h_comment)(struct xml_context *ctx); /* Called after a comment */ - void (*h_element_start)(struct xml_context *ctx); /* Called after STag or EmptyElemTag */ - void (*h_element_end)(struct xml_context *ctx); /* Called before ETag or after EmptyElemTag */ + void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration (before optional internal subset) */ + void (*h_comment)(struct xml_context *ctx); /* Called after a comment (only with XML_REPORT_COMMENTS) */ + void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction (only with XML_REPORT_PIS) */ + void (*h_stag)(struct xml_context *ctx); /* Called after STag or EmptyElemTag (only with XML_REPORT_TAGS) */ + void (*h_etag)(struct xml_context *ctx); /* Called before ETag or after EmptyElemTag (only with XML_REPORT_TAGS) */ + void (*h_chars)(struct xml_context *ctx); /* Called after some characters (only with XML_REPORT_CHARS) */ + void (*h_block)(struct xml_context *ctx, char *text, uns len); /* Called for each continuous block of characters not reported by h_cdata() (only with XML_REPORT_CHARS) */ + void (*h_cdata)(struct xml_context *ctx, char *text, 
uns len); /* Called for each CDATA section (only with XML_REPORT_CHARS) */ + void (*h_dtd_start)(struct xml_context *ctx); /* Called just after the DTD structure is initialized */ + void (*h_dtd_end)(struct xml_context *ctx); /* Called after the DTD subsets */ + struct xml_dtd_entity *(*h_find_entity)(struct xml_context *ctx, char *name); /* Called when needed to resolve a general entity */ + void (*h_resolve_entity)(struct xml_context *ctx, struct xml_dtd_entity *ent); /* User should push source fastbuf for a parsed external entity (either general or parameter) */ /* DOM */ - struct xml_elem *root; /* DOM root */ - union { - struct xml_node *node; /* Current DOM node */ - struct xml_elem *elem; /* Current element */ - }; + struct xml_node *dom; /* DOM root */ + struct xml_node *node; /* Current DOM node */ char *version_str; uns standalone; - char *document_type; - struct xml_dtd *dtd; - struct xml_ext_id eid; - uns state; - uns want; - - void (*start_dtd)(struct xml_context *ctx); - void (*end_dtd)(struct xml_context *ctx); - void (*start_cdata)(struct xml_context *ctx); - void (*end_cdata)(struct xml_context *ctx); - void (*start_entity)(struct xml_context *ctx); - void (*end_entity)(struct xml_context *ctx); - void (*chacacters)(struct xml_context *ctx); - struct fastbuf *(*resolve_entity)(struct xml_context *ctx); - void (*notation_decl)(struct xml_context *ctx); - void (*unparsed_entity_decl)(struct xml_context *ctx); + char *doctype; /* The document type (or NULL if unknown) */ + char *system_id; /* DTD external id */ + char *public_id; /* DTD public id */ + struct xml_dtd *dtd; /* The DTD structure (or NULL) */ + uns state; /* Current state for the PULL interface (XML_STATE_x) */ + uns pull; /* Parameters for the PULL interface (XML_PULL_x) */ }; +/* Initialize XML context */ void xml_init(struct xml_context *ctx); + +/* Clean up all internal structures */ void xml_cleanup(struct xml_context *ctx); -void xml_set_source(struct xml_context *ctx, struct 
fastbuf *fb); -int xml_next(struct xml_context *ctx); + +/* Reuse XML context, equivalent to xml_cleanup() and xml_init() */ +void xml_reset(struct xml_context *ctx); + +/* Add XML source (fastbuf will be automatically closed) */ +struct xml_source *xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb); + +/* Parse without the PULL interface, return XML_ERR_x code (zero on success) */ +uns xml_parse(struct xml_context *ctx); + +/* Parse with the PULL interface, return XML_STATE_x (zero on EOF or fatal error) */ +uns xml_next(struct xml_context *ctx); + +/* Returns the current row number in the document entity */ +uns xml_row(struct xml_context *ctx); + +/* Finds a given attribute value in an XML_NODE_ELEM node */ +struct xml_attr *xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name); + +/* The default value of h_find_entity(), knows &lt;, &gt;, &amp;, &apos; and &quot; */ +struct xml_dtd_entity *xml_def_find_entity(struct xml_context *ctx, char *name); + +/* The default value of h_resolve_entity(), throws an error */ +void xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent); + +/* Remove leading/trailing spaces and replace sequences of spaces with a single space character (non-CDATA attribute normalization) */ +uns xml_normalize_white(struct xml_context *ctx, char *value); + +/* Public part of error handling */ +void xml_warn(struct xml_context *ctx, const char *format, ...); +void xml_error(struct xml_context *ctx, const char *format, ...); +void NONRET xml_fatal(struct xml_context *ctx, const char *format, ...); #endif