From 6c4c397f94ec5f5df6bcc178fb5fa4e84d3505fc Mon Sep 17 00:00:00 2001 From: Martin Mares Date: Sat, 14 Feb 2015 00:20:45 +0100 Subject: [PATCH] XML: Brief documentation of LibUCW-XML --- ucw-xml/Makefile | 2 + ucw-xml/doc/Makefile | 20 ++++++ ucw-xml/doc/index.txt | 34 +++++++++ ucw-xml/doc/xml.txt | 12 ++++ ucw-xml/xml.h | 159 ++++++++++++++++++++++++++++-------------- ucw/doc/index.txt | 6 ++ 6 files changed, 182 insertions(+), 51 deletions(-) create mode 100644 ucw-xml/doc/Makefile create mode 100644 ucw-xml/doc/index.txt create mode 100644 ucw-xml/doc/xml.txt diff --git a/ucw-xml/Makefile b/ucw-xml/Makefile index 5b391889..ffd1db37 100644 --- a/ucw-xml/Makefile +++ b/ucw-xml/Makefile @@ -50,3 +50,5 @@ install-libucw-xml-api: ln -sf libucw-xml$(LV).so.0.0 $(DESTDIR)$(INSTALL_LIB_DIR)/libucw-xml$(LV).so install -m 644 run/lib/libucw-xml$(LV).a $(DESTDIR)$(INSTALL_LIB_DIR) .PHONY: install-libucw-xml-api + +include $(s)/ucw-xml/doc/Makefile diff --git a/ucw-xml/doc/Makefile b/ucw-xml/doc/Makefile new file mode 100644 index 00000000..2ae21aa9 --- /dev/null +++ b/ucw-xml/doc/Makefile @@ -0,0 +1,20 @@ +# Makefile for the UCW-XML documentation + +DIRS+=ucw-xml/doc + +XML_DOCS=xml index +XML_DOCS_HTML=$(addprefix $(o)/ucw-xml/doc/,$(addsuffix .html,$(XML_DOCS))) + +DOCS+=$(XML_DOCS_HTML) +DOC_MODULES+=ucw-xml +$(XML_DOCS_HTML): DOC_MODULE=ucw-xml + +ifdef CONFIG_DOC +INSTALL_TARGETS+=install-libucw-xml-docs +endif + +.PHONY: install-libucw-xml-docs + +install-libucw-xml-docs: $(XML_DOCS_HTML) + install -d -m 755 $(DESTDIR)$(INSTALL_DOC_DIR)/ucw-xml/ + install -m 644 $^ $(DESTDIR)$(INSTALL_DOC_DIR)/ucw-xml/ diff --git a/ucw-xml/doc/index.txt b/ucw-xml/doc/index.txt new file mode 100644 index 00000000..a3c8efb2 --- /dev/null +++ b/ucw-xml/doc/index.txt @@ -0,0 +1,34 @@ +The UCW-XML library +=================== + +This library provides a light-weight XML parser built atop <<../ucw/index:,LibUCW>>. 
+It is primarily intended for efficient parsing of huge data sets, where other +parsers are too slow and cumbersome. + +Its features include: + +* High speed and low memory consumption, mainly thanks to efficient LibUCW + primitives like fastbufs and mempools. +* Multiple interfaces: +** SAX-like: callback functions called on various parser events +** Pull: for each call of the parser, it returns the next node +** DOM-like: returns a data structure describing the whole tree of nodes +** Any combination of the above: For example, when given a database with millions + of records, you can pull on the top level and ask for DOM of each record + separately. +* Support of namespaces. +* Complies with W3C recommendations on XML 1.0, XML 1.1, and Namespaces in XML 1.0 + as a non-validating parser, but does not aim to support all frills of other + XML-related standards. +* Partial support for DTD-driven parsing: basic checks of document structure, + filling in default values, expanding user-defined entities. + +Modules +------- +- <<xml:,XML Parser>> + +Authors +------- + +- Pavel Charvát (main author) +- Martin Mareš (minor hacking and support for namespaces) diff --git a/ucw-xml/doc/xml.txt b/ucw-xml/doc/xml.txt new file mode 100644 index 00000000..07852d3c --- /dev/null +++ b/ucw-xml/doc/xml.txt @@ -0,0 +1,12 @@ +XML Parser +========== + +ucw-xml/xml.h +------------- + +To parse a document, create a parser context (<<struct_xml_context,struct xml_context>>), +initialize it with <<xml_init()>>, fill in requested parsing mode, pointers to hooks, and +other parameters. Then call <<xml_parse()>> or <<xml_next()>> as you need. At the end, dispose +of the context by <<xml_cleanup()>> or recycle it by <<xml_reset()>>. + +!!ucw-xml/xml.h diff --git a/ucw-xml/xml.h b/ucw-xml/xml.h index 38ede891..17126073 100644 --- a/ucw-xml/xml.h +++ b/ucw-xml/xml.h @@ -45,9 +45,14 @@ #define xml_warn ucw_xml_warn #endif +/*** + * === Constants + ***/ + struct xml_context; struct xml_dtd_entity; +/** Error code reported by the parser. So far, only the basic error classes are recognized. 
**/ enum xml_error { XML_ERR_OK = 0, XML_ERR_WARN = 1000, /* Warning */ @@ -56,6 +61,7 @@ enum xml_error { XML_ERR_EOF, }; +/** Parser state. A pull parser returns one of these to indicate the type of the current node. **/ enum xml_state { XML_STATE_EOF, /* EOF or a fatal error */ XML_STATE_START, /* Initial state */ @@ -79,6 +85,7 @@ enum xml_state { XML_STATE_EPILOG_PI, }; +/** Pull requests: a bit mask of node types you want to return. The other nodes are silently skipped. **/ enum xml_pull { XML_PULL_XML_DECL = 0x00000001, /* Stop after the XML declaration */ XML_PULL_DOCTYPE_DECL = 0x00000002, /* Stop in the doctype declaration (before optional internal subset) */ @@ -90,6 +97,7 @@ enum xml_pull { XML_PULL_ALL = 0xffffffff, }; +/** Parser mode flags. **/ enum xml_flags { /* Enable reporting of various events via SAX and/or PULL interface */ XML_REPORT_COMMENTS = 0x00000001, /* Report comments */ @@ -126,16 +134,29 @@ enum xml_flags { XML_SRC_EXTERNAL = 0x00800000, /* An external entity */ }; +/*** + * === Internal representation of DOM + * + * All DOM nodes are allocated within temporary memory pools and they are not + * guaranteed to survive when the parser leaves the element. Upon <<xml_cleanup()>>, + * all remaining nodes are always freed. + ***/ + +/** Node types **/ enum xml_node_type { - XML_NODE_ELEM, - XML_NODE_COMMENT, - XML_NODE_CHARS, - XML_NODE_PI, + XML_NODE_ELEM, /* Element */ + XML_NODE_COMMENT, /* Comment */ + XML_NODE_CHARS, /* Character data */ + XML_NODE_PI, /* Processing instruction */ }; +/** Iterate over all children of a node. **/ #define XML_NODE_FOR_EACH(var, node) CLIST_FOR_EACH(struct xml_node *, var, (node)->sons) + +/** Iterate over all attributes of a node. **/ #define XML_ATTR_FOR_EACH(var, node) SLIST_FOR_EACH(struct xml_attr *, var, (node)->attrs) +/** A single DOM node. 
**/ struct xml_node { cnode n; /* Node for list of parent's sons */ uint type; /* XML_NODE_x */ @@ -164,6 +185,7 @@ struct xml_node { void *user; /* User-defined (initialized to NULL) */ }; +/** A single attribute. **/ struct xml_attr { snode n; /* Node for elem->attrs */ uint hash; /* Internal hash of ns + name */ @@ -197,6 +219,42 @@ struct xml_source { uint pending_0xd; /* The last read character is 0xD */ }; +/** Finds a qualified name (including namespace prefix) of a given element node. **/ +char *xml_node_qname(struct xml_context *ctx, struct xml_node *node); + +/** Finds a qualified name (including namespace prefix) of a given attribute. **/ +char *xml_attr_qname(struct xml_context *ctx, struct xml_attr *node); + +/** Finds a given attribute value in an `XML_NODE_ELEM` node **/ +struct xml_attr *xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name); + +/** The same, but namespace-aware **/ +struct xml_attr *xml_attr_find_ns(struct xml_context *ctx, struct xml_node *node, uint ns, char *name); + +/** Similar to xml_attr_find, but it also deals with default values **/ +char *xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name); + +/** The same, but namespace-aware **/ +char *xml_attr_value_ns(struct xml_context *ctx, struct xml_node *node, uint ns, char *name); + +/** Removes leading/trailing spaces and replaces sequences of spaces with a single space character (non-CDATA attribute normalization) **/ +uint xml_normalize_white(struct xml_context *ctx, char *value); + +/** Merge character contents of a given element to a single string (not recursive) **/ +char *xml_merge_chars(struct xml_context *ctx, struct xml_node *node, struct mempool *pool); + +/** Merge character contents of a given subtree to a single string **/ +char *xml_merge_dom_chars(struct xml_context *ctx, struct xml_node *node, struct mempool *pool); + +/*** + * === Parser context + ***/ + +/** + * The state of the parser is kept in this structure. 
There are some + * user-accessible parts (like pointers to various hooks), but the + * majority of fields is private. + **/ struct xml_context { /* Error handling */ char *err_msg; /* Last error message */ @@ -206,7 +264,7 @@ struct xml_context { void (*h_error)(struct xml_context *ctx); /* Recoverable error callback */ void (*h_fatal)(struct xml_context *ctx); /* Unrecoverable error callback */ - /* Memory management */ + /* Memory management (private) */ struct mempool *pool; /* DOM pool */ struct mempool *stack; /* Stack pool (freed as soon as possible) */ struct xml_stack *stack_list; /* See xml_push(), xml_pop() */ @@ -216,7 +274,7 @@ struct xml_context { struct mempool_state chars_state; /* Mempool state before the current character block has started */ char *chars_trivial; /* If not empty, it will be appended to chars */ - /* Input */ + /* Input (private) */ struct xml_source *src; /* Current source */ u32 *bptr, *bstop; /* Buffer with preprocessed characters (validated UCS-4 + category flags) */ uint cat_chars; /* Unicode range of supported characters (cdata, attribute values, ...) 
*/ @@ -247,7 +305,7 @@ struct xml_context { struct xml_node *dom; /* DOM root */ struct xml_node *node; /* Current DOM node */ - /* Namespaces */ + /* Namespaces (private) */ struct mempool *ns_pool; /* Memory pool for NS definitions */ const char **ns_by_id; /* A growing array translating NS IDs to their names */ void *ns_by_name; /* Hash table translating NS names to their IDs */ @@ -255,6 +313,7 @@ struct xml_context { struct xml_ns_prefix *ns_prefix_stack; /* A stack of prefix definitions, allocated from xml->stack */ uint ns_default; /* Current default namespace */ + /* Other stuff */ char *version_str; uint standalone; char *doctype; /* The document type (or NULL if unknown) */ @@ -265,83 +324,81 @@ struct xml_context { uint pull; /* Parameters for the PULL interface (XML_PULL_x) */ }; -/* Initialize XML context */ +/** Initialize XML context **/ void xml_init(struct xml_context *ctx); -/* Clean up all internal structures */ +/** Clean up all internal structures **/ void xml_cleanup(struct xml_context *ctx); -/* Reuse XML context, equivalent to xml_cleanup() and xml_init() */ +/** Reuse XML context, equivalent to xml_cleanup() and xml_init(), but faster **/ void xml_reset(struct xml_context *ctx); -/* Add XML source (fastbuf will be automatically closed) */ +/** Add XML source (fastbuf will be automatically closed) **/ struct xml_source *xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb); -/* Parse without the PULL interface, return XML_ERR_x code (zero on success) */ +/** Parse the whole document without the PULL interface, return `XML_ERR_x` code (zero on success) **/ uint xml_parse(struct xml_context *ctx); -/* Parse with the PULL interface, return XML_STATE_x (zero on EOF or fatal error) */ +/** Parse with the PULL interface, return `XML_STATE_x` (zero on EOF or fatal error) **/ uint xml_next(struct xml_context *ctx); -/* Equivalent to xml_next, but with temporarily changed ctx->pull value */ +/** Equivalent to xml_next, but with temporarily 
changed ctx->pull value **/ uint xml_next_state(struct xml_context *ctx, uint pull); -/* May be called on XML_STATE_STAG to skip it's content; can return XML_STATE_ETAG or XML_STATE_EOF on fatal error */ +/** May be called on XML_STATE_STAG to skip its content; can return `XML_STATE_ETAG` or `XML_STATE_EOF` on fatal error **/ uint xml_skip_element(struct xml_context *ctx); -/* Returns the current row number in the document entity */ +/** Returns the current row (line) number in the document entity **/ uint xml_row(struct xml_context *ctx); -/* Finds a qualified name (including namespace prefix) of a given element node. */ -char *xml_node_qname(struct xml_context *ctx, struct xml_node *node); - -/* Finds a qualified name (including namespace prefix) of a given attribute. */ -char *xml_attr_qname(struct xml_context *ctx, struct xml_attr *node); - -/* Finds a given attribute value in a XML_NODE_ELEM node */ -struct xml_attr *xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name); - -/* The same, but namespace-aware */ -struct xml_attr *xml_attr_find_ns(struct xml_context *ctx, struct xml_node *node, uint ns, char *name); - -/* Similar to xml_attr_find, but it deals also with default values */ -char *xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name); - -/* The same, but namespace-aware */ -char *xml_attr_value_ns(struct xml_context *ctx, struct xml_node *node, uint ns, char *name); - /* The default value of h_find_entity(), knows &lt;, &gt;, &amp;, &apos; and &quot; */ struct xml_dtd_entity *xml_def_find_entity(struct xml_context *ctx, char *name); /* The default value of h_resolve_entity(), throws an error */ void xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent); -/* Remove leading/trailing spaces and replaces sequences of spaces to a single space character (non-CDATA attribute normalization) */ -uint xml_normalize_white(struct xml_context *ctx, char *value); - -/* Merge character contents of a given element to a 
single string (not recursive) */ -char *xml_merge_chars(struct xml_context *ctx, struct xml_node *node, struct mempool *pool); - -/* Merge character contents of a given subtree to a single string */ -char *xml_merge_dom_chars(struct xml_context *ctx, struct xml_node *node, struct mempool *pool); - -/* Public part of error handling */ +/** Throw a warning at the current node **/ void xml_warn(struct xml_context *ctx, const char *format, ...); + +/** Throw an error at the current node **/ void xml_error(struct xml_context *ctx, const char *format, ...); + +/** Throw a fatal error, aborting parsing. This can be called only from SAX hooks (and from parser internals). **/ void NONRET xml_fatal(struct xml_context *ctx, const char *format, ...); -/* Request processing of namespaces */ +/*** + * === Namespaces + * + * When namespace-aware parsing is requested by calling xml_ns_enable(), + * all namespaces are collected and assigned integer identifiers. Names of + * elements and attributes then always contain a namespace ID and a local + * name within the namespace. An ID of zero corresponds to an unspecified + * namespace. + * + * Once an ID is assigned, it is never changed, even if the namespace + * goes out of scope temporarily. + ***/ + +/** Request processing of namespaces (must be called before the first node is parsed). **/ void xml_ns_enable(struct xml_context *ctx); -/* Looks up namespace by its ID, dies on an invalid ID */ +/** + * Looks up namespace by its ID, dies on an invalid ID. Returns a pointer + * which remains valid until the context is cleaned up or reset. + **/ const char *xml_ns_by_id(struct xml_context *ctx, uint ns); -/* Looks up namespace by its name and returns its ID. Creates a new ID if necessary. */ +/** + * Looks up namespace by its name and returns its ID. Assigns a new ID if necessary. + * When this function returns, @name is not referenced any more. 
+ **/ uint xml_ns_by_name(struct xml_context *ctx, const char *name); -/* Well-known namespaces */ -#define XML_NS_NONE 0 /* This element has no namespace */ -#define XML_NS_XMLNS 1 /* xmlns: */ -#define XML_NS_XML 2 /* xml: */ +/** Well-known namespaces. **/ +enum xml_ns_id { + XML_NS_NONE = 0, /* This element has no namespace */ + XML_NS_XMLNS = 1, /* xmlns: */ + XML_NS_XML = 2, /* xml: */ +}; #endif diff --git a/ucw/doc/index.txt b/ucw/doc/index.txt index 5d20d5e6..b6db3202 100644 --- a/ucw/doc/index.txt +++ b/ucw/doc/index.txt @@ -88,6 +88,12 @@ Yet undocumented modules * `sighandler.h` * `process.h` +Companion libraries +------------------- +- <<../ucw-xml/index:,LibUCW-XML>> -- an XML parser +- LibUCW-charset -- character set conversion (undocumented) +- LibUCW-images -- loading and conversion of images (undocumented) + License ------- The UCW library is copyrighted by its authors. -- 2.39.2