X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;ds=sidebyside;f=sherlock%2Fxml%2Fxml.h;h=ac9ebefb70f8a88ab6f1056326dd876eb0e44002;hb=83400e61386e912475562889bfb2c39dc5ecf6d0;hp=a608bfb8ef6015e85b6af3f75b1bc7e3cad75c15;hpb=2c7501836b5e6a120d2846e168d1c44e3a43435f;p=libucw.git diff --git a/sherlock/xml/xml.h b/sherlock/xml/xml.h index a608bfb8..ac9ebefb 100644 --- a/sherlock/xml/xml.h +++ b/sherlock/xml/xml.h @@ -1,7 +1,7 @@ /* * Sherlock Library -- A simple XML parser * - * (c) 2007 Pavel Charvat + * (c) 2007--2008 Pavel Charvat * * This software may be freely distributed and used according to the terms * of the GNU Lesser General Public License. @@ -15,8 +15,10 @@ #include "lib/mempool.h" #include "lib/fastbuf.h" +struct xml_context; +struct xml_dtd_entity; + enum xml_error { - // FIXME XML_ERR_OK = 0, XML_ERR_WARN = 1000, /* Warning */ XML_ERR_ERROR = 2000, /* Recoverable error */ @@ -30,7 +32,6 @@ enum xml_state { XML_STATE_XML_DECL, /* XML_PULL_XML_DECL */ XML_STATE_DOCTYPE_DECL, /* XML_PULL_DOCTYPE_DECL */ XML_STATE_CHARS, /* XML_PULL_CHARS */ - XML_STATE_CDATA, /* XML_PULL_CDATA */ XML_STATE_STAG, /* XML_PULL_STAG */ XML_STATE_ETAG, /* XML_PULL_ETAG */ XML_STATE_COMMENT, /* XML_PULL_COMMENT */ @@ -52,11 +53,10 @@ enum xml_pull { XML_PULL_XML_DECL = 0x00000001, /* Stop after the XML declaration */ XML_PULL_DOCTYPE_DECL = 0x00000002, /* Stop in the doctype declaration (before optional internal subset) */ XML_PULL_CHARS = 0x00000004, - XML_PULL_CDATA = 0x00000008, - XML_PULL_STAG = 0x00000010, - XML_PULL_ETAG = 0x00000020, - XML_PULL_COMMENT = 0x00000040, - XML_PULL_PI = 0x00000080, + XML_PULL_STAG = 0x00000008, + XML_PULL_ETAG = 0x00000010, + XML_PULL_COMMENT = 0x00000020, + XML_PULL_PI = 0x00000040, XML_PULL_ALL = 0xffffffff, }; @@ -78,9 +78,8 @@ enum xml_flags { XML_ALLOC_ALL = XML_ALLOC_MISC | XML_ALLOC_CHARS | XML_ALLOC_TAGS, /* Other parameters */ - XML_UNFOLD_CDATA = 0x00000100, /* Unfold CDATA sections */ - XML_VALIDATING = 0x00000200, /* Validate 
everything (not fully implemented!) */ - XML_PARSE_DTD = 0x00000400, /* Enable parsing of DTD */ + XML_VALIDATING = 0x00000100, /* Validate everything (not fully implemented!) */ + XML_PARSE_DTD = 0x00000200, /* Enable parsing of DTD */ /* Internals, do not change! */ XML_EMPTY_ELEM_TAG = 0x00010000, /* The current element match EmptyElemTag */ @@ -89,10 +88,8 @@ enum xml_flags { XML_HAS_INTERNAL_SUBSET = 0x00080000, /* The document contains an internal subset */ XML_SRC_EOF = 0x00100000, /* EOF reached */ XML_SRC_EXPECTED_DECL = 0x00200000, /* Just before optional or required XMLDecl/TextDecl */ - XML_SRC_NEW_LINE = 0x00400000, /* The last read character is 0xD */ - XML_SRC_SURROUND = 0x00800000, /* Surround the text with 0x20 (references to parameter entities) */ - XML_SRC_DOCUMENT = 0x01000000, /* The document entity */ - XML_SRC_EXTERNAL = 0x02000000, /* An external entity */ + XML_SRC_DOCUMENT = 0x00400000, /* The document entity */ + XML_SRC_EXTERNAL = 0x00800000, /* An external entity */ }; enum xml_node_type { @@ -121,13 +118,38 @@ struct xml_node { slist attrs; /* Link list of element attributes */ }; }; + void *user; /* User-defined (initialized to NULL) */ }; struct xml_attr { snode n; /* Node for elem->attrs */ struct xml_node *elem; /* Parent element */ + struct xml_dtd_attr *dtd; /* Attribute DTD */ char *name; /* Attribute name */ char *val; /* Attribute value */ + void *user; /* User-defined (initialized to NULL) */ +}; + +#define XML_BUF_SIZE 32 /* At least 8 -- hardcoded */ + +struct xml_source { + struct xml_source *next; /* Link list of pending fastbufs (xml_context.sources) */ + struct fastbuf *fb; /* Source fastbuf */ + struct fastbuf *wrapped_fb; /* Original wrapped fastbuf (needed for cleanup) */ + struct fastbuf wrap_fb; /* Fbmem wrapper */ + u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */ + u32 *bptr, *bstop; /* Current state of the buffer */ + uns row; /* File position */ + char *expected_encoding; /* 
Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */ + char *fb_encoding; /* Encoding of the source fastbuf */ + char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */ + uns refill_cat1; /* Character categories, which should be directly passed to the buffer */ + uns refill_cat2; /* Character categories, which should be processed as newlines (possibly in some built-in + sequences) */ + void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */ + unsigned short *refill_in_to_x; /* Libcharset input table */ + uns saved_depth; /* Saved ctx->depth */ + uns pending_0xd; /* The last read character is 0xD */ }; struct xml_context { @@ -141,11 +163,13 @@ struct xml_context { /* Memory management */ struct mempool *pool; /* DOM pool */ - struct mempool *stack; /* Stack pool (free as soon as possible) */ + struct mempool *stack; /* Stack pool (freed as soon as possible) */ struct xml_stack *stack_list; /* See xml_push(), xml_pop() */ uns flags; /* XML_FLAG_x (restored on xml_pop()) */ - uns depth; /* Nesting level */ + uns depth; /* Nesting level (for checking of valid source nesting -> valid pushes/pops on memory pools) */ struct fastbuf chars; /* Character data / attribute value */ + struct mempool_state chars_state; /* Mempool state before the current character block has started */ + char *chars_trivial; /* If not empty, it will be appended to chars */ void *tab_attrs; /* Hash table of element attributes */ /* Input */ @@ -167,12 +191,15 @@ struct xml_context { void (*h_stag)(struct xml_context *ctx); /* Called after STag or EmptyElemTag (only with XML_REPORT_TAGS) */ void (*h_etag)(struct xml_context *ctx); /* Called before ETag or after EmptyElemTag (only with XML_REPORT_TAGS) */ void (*h_chars)(struct xml_context *ctx); /* Called after some characters (only with XML_REPORT_CHARS) */ - void (*h_cdata)(struct xml_context *ctx); /* Called after a CDATA section (only with 
XML_REPORT_CHARS and XML_UNFOLD_CDATA) */ + void (*h_block)(struct xml_context *ctx, char *text, uns len); /* Called for each continuous block of characters not reported by h_cdata() (only with XML_REPORT_CHARS) */ + void (*h_cdata)(struct xml_context *ctx, char *text, uns len); /* Called for each CDATA section (only with XML_REPORT_CHARS) */ void (*h_dtd_start)(struct xml_context *ctx); /* Called just after the DTD structure is initialized */ void (*h_dtd_end)(struct xml_context *ctx); /* Called after DTD subsets subsets */ + struct xml_dtd_entity *(*h_find_entity)(struct xml_context *ctx, char *name); /* Called when needed to resolve a general entity */ + void (*h_resolve_entity)(struct xml_context *ctx, struct xml_dtd_entity *ent); /* User should push source fastbuf for a parsed external entity (either general or parameter) */ /* DOM */ - struct xml_node *root; /* DOM root */ + struct xml_node *dom; /* DOM root */ struct xml_node *node; /* Current DOM node */ char *version_str; @@ -183,12 +210,6 @@ struct xml_context { struct xml_dtd *dtd; /* The DTD structure (or NULL) */ uns state; /* Current state for the PULL interface (XML_STATE_x) */ uns pull; /* Parameters for the PULL interface (XML_PULL_x) */ - - void (*start_entity)(struct xml_context *ctx); - void (*end_entity)(struct xml_context *ctx); - struct fastbuf *(*resolve_entity)(struct xml_context *ctx); - void (*notation_decl)(struct xml_context *ctx); - void (*unparsed_entity_decl)(struct xml_context *ctx); }; /* Initialize XML context */ @@ -197,11 +218,11 @@ void xml_init(struct xml_context *ctx); /* Clean up all internal structures */ void xml_cleanup(struct xml_context *ctx); -/* Reuse XML context */ +/* Reuse XML context, equivalent to xml_cleanup() and xml_init() */ void xml_reset(struct xml_context *ctx); -/* Setup XML source (fastbuf will be automatically closed) */ -void xml_set_source(struct xml_context *ctx, struct fastbuf *fb); +/* Add XML source (fastbuf will be automatically closed) */ 
+struct xml_source *xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb); /* Parse without the PUSH interface, return XML_ERR_x code (zero on success) */ uns xml_parse(struct xml_context *ctx); @@ -209,7 +230,24 @@ uns xml_parse(struct xml_context *ctx); /* Parse with the PUSH interface, return XML_STATE_x (zero on EOF or fatal error) */ uns xml_next(struct xml_context *ctx); +/* Returns the current row number in the document entity */ uns xml_row(struct xml_context *ctx); + +/* Finds a given attribute value in an XML_NODE_ELEM node */ struct xml_attr *xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name); +/* The default value of h_find_entity(), knows &lt;, &gt;, &amp;, &apos; and &quot; */ +struct xml_dtd_entity *xml_def_find_entity(struct xml_context *ctx, char *name); + +/* The default value of h_resolve_entity(), throws an error */ +void xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent); + +/* Remove leading/trailing spaces and replace sequences of spaces with a single space character (non-CDATA attribute normalization) */ +uns xml_normalize_white(struct xml_context *ctx, char *value); + +/* Public part of error handling */ +void xml_warn(struct xml_context *ctx, const char *format, ...); +void xml_error(struct xml_context *ctx, const char *format, ...); +void NONRET xml_fatal(struct xml_context *ctx, const char *format, ...); + #endif