2 * UCW Library -- A simple XML parser
4 * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
5 * (c) 2015 Martin Mares <mj@ucw.cz>
7 * This software may be freely distributed and used according to the terms
8 * of the GNU Lesser General Public License.
11 #ifndef _UCW_XML_XML_H
12 #define _UCW_XML_XML_H
14 #include <ucw/clists.h>
15 #include <ucw/slists.h>
16 #include <ucw/mempool.h>
17 #include <ucw/fastbuf.h>
/* With CONFIG_UCW_CLEAN_ABI defined, the public xml_* symbols listed here are renamed to ucw_xml_* by the preprocessor. */
19 #ifdef CONFIG_UCW_CLEAN_ABI
20 #define xml_attr_find ucw_xml_attr_find
21 #define xml_attr_find_ns ucw_xml_attr_find_ns
22 #define xml_attr_qname ucw_xml_attr_qname
23 #define xml_attr_value ucw_xml_attr_value
24 #define xml_attr_value_ns ucw_xml_attr_value_ns
25 #define xml_cleanup ucw_xml_cleanup
26 #define xml_def_find_entity ucw_xml_def_find_entity
27 #define xml_def_resolve_entity ucw_xml_def_resolve_entity
28 #define xml_error ucw_xml_error
29 #define xml_fatal ucw_xml_fatal
30 #define xml_init ucw_xml_init
31 #define xml_merge_chars ucw_xml_merge_chars
32 #define xml_merge_dom_chars ucw_xml_merge_dom_chars
33 #define xml_next ucw_xml_next
34 #define xml_next_state ucw_xml_next_state
35 #define xml_node_qname ucw_xml_node_qname
36 #define xml_normalize_white ucw_xml_normalize_white
37 #define xml_ns_by_id ucw_xml_ns_by_id
38 #define xml_ns_by_name ucw_xml_ns_by_name
39 #define xml_ns_enable ucw_xml_ns_enable
40 #define xml_parse ucw_xml_parse
41 #define xml_push_fastbuf ucw_xml_push_fastbuf
42 #define xml_reset ucw_xml_reset
43 #define xml_row ucw_xml_row
44 #define xml_skip_element ucw_xml_skip_element
45 #define xml_warn ucw_xml_warn
53 struct xml_dtd_entity;
55 /** Error code reported by the parser. So far, only the basic error classes are recognized. **/
58 XML_ERR_WARN = 1000, /* Warning */
59 XML_ERR_ERROR = 2000, /* Recoverable error */
60 XML_ERR_FATAL = 3000, /* Unrecoverable error */
64 /** Parser state. A pull parser returns one of these to indicate the type of the current node. **/
66 XML_STATE_EOF, /* EOF or a fatal error */
67 XML_STATE_START, /* Initial state */
68 XML_STATE_XML_DECL, /* XML_PULL_XML_DECL */
69 XML_STATE_DOCTYPE_DECL, /* XML_PULL_DOCTYPE_DECL */
70 XML_STATE_CHARS, /* XML_PULL_CHARS */
71 XML_STATE_STAG, /* XML_PULL_STAG */
72 XML_STATE_ETAG, /* XML_PULL_ETAG */
73 XML_STATE_COMMENT, /* XML_PULL_COMMENT */
74 XML_STATE_PI, /* XML_PULL_PI */
77 XML_STATE_CHARS_BEFORE_STAG,
78 XML_STATE_CHARS_BEFORE_ETAG,
79 XML_STATE_CHARS_BEFORE_CDATA,
80 XML_STATE_CHARS_BEFORE_COMMENT,
81 XML_STATE_CHARS_BEFORE_PI,
82 XML_STATE_PROLOG_COMMENT,
84 XML_STATE_EPILOG_COMMENT,
88 /** Pull requests: a bit mask of node types you want to return. The other nodes are silently skipped. **/
90 XML_PULL_XML_DECL = 0x00000001, /* Stop after the XML declaration */
91 XML_PULL_DOCTYPE_DECL = 0x00000002, /* Stop in the doctype declaration (before optional internal subset) */
92 XML_PULL_CHARS = 0x00000004,
93 XML_PULL_STAG = 0x00000008,
94 XML_PULL_ETAG = 0x00000010,
95 XML_PULL_COMMENT = 0x00000020,
96 XML_PULL_PI = 0x00000040,
97 XML_PULL_ALL = 0xffffffff,
100 /** Parser mode flags. **/
102 /* Enable reporting of various events via SAX and/or PULL interface */
103 XML_REPORT_COMMENTS = 0x00000001, /* Report comments */
104 XML_REPORT_PIS = 0x00000002, /* Report processing instructions */
105 XML_REPORT_CHARS = 0x00000004, /* Report characters */
106 XML_REPORT_TAGS = 0x00000008, /* Report element starts/ends */
107 XML_REPORT_MISC = XML_REPORT_COMMENTS | XML_REPORT_PIS,
108 XML_REPORT_ALL = XML_REPORT_MISC | XML_REPORT_CHARS | XML_REPORT_TAGS,
110 /* Enable construction of DOM for these types */
111 XML_ALLOC_COMMENTS = 0x00000010, /* Create comment nodes */
112 XML_ALLOC_PIS = 0x00000020, /* Create processing instruction nodes */
113 XML_ALLOC_CHARS = 0x00000040, /* Create character nodes */
114 XML_ALLOC_TAGS = 0x00000080, /* Create element nodes */
115 XML_ALLOC_MISC = XML_ALLOC_COMMENTS | XML_ALLOC_PIS,
116 XML_ALLOC_ALL = XML_ALLOC_MISC | XML_ALLOC_CHARS | XML_ALLOC_TAGS,
118 /* Other parameters */
119 XML_VALIDATING = 0x00000100, /* Validate everything (not fully implemented!) */
120 XML_PARSE_DTD = 0x00000200, /* Enable parsing of DTD */
121 XML_NO_CHARS = 0x00000400, /* The current element must not contain character data (filled automatically if using DTD) */
122 XML_ALLOC_DEFAULT_ATTRS = 0x00000800, /* Allocate default attribute values so they can be found by XML_ATTR_FOR_EACH */
123 XML_NAMESPACES = 0x00001000, /* Parse namespaces, use xml_ns_enable() to set this */
125 /* Internals, do not change! */
126 XML_EMPTY_ELEM_TAG = 0x00010000, /* The current element matches EmptyElemTag */
127 XML_VERSION_1_1 = 0x00020000, /* XML version is 1.1, otherwise 1.0 */
128 XML_HAS_EXTERNAL_SUBSET = 0x00040000, /* The document contains a reference to external DTD subset */
129 XML_HAS_INTERNAL_SUBSET = 0x00080000, /* The document contains an internal subset */
130 XML_HAS_DTD = XML_HAS_EXTERNAL_SUBSET | XML_HAS_INTERNAL_SUBSET,
131 XML_SRC_EOF = 0x00100000, /* EOF reached */
132 XML_SRC_EXPECTED_DECL = 0x00200000, /* Just before optional or required XMLDecl/TextDecl */
133 XML_SRC_DOCUMENT = 0x00400000, /* The document entity */
134 XML_SRC_EXTERNAL = 0x00800000, /* An external entity */
138 * === Internal representation of DOM
140 * All DOM nodes are allocated within temporary memory pools and they are not
141 * guaranteed to survive when the parser leaves the element. Upon <<xml_cleanup()>>,
142 * all remaining nodes are always freed.
147 XML_NODE_ELEM, /* Element */
148 XML_NODE_COMMENT, /* Comment */
149 XML_NODE_CHARS, /* Character data */
150 XML_NODE_PI, /* Processing instruction */
153 /** Iterate over all children of a node. **/
154 #define XML_NODE_FOR_EACH(var, node) CLIST_FOR_EACH(struct xml_node *, var, (node)->sons)
156 /** Iterate over all attributes of a node. **/
157 #define XML_ATTR_FOR_EACH(var, node) SLIST_FOR_EACH(struct xml_attr *, var, (node)->attrs)
159 /** A single DOM node. **/
161 cnode n; /* Node for list of parent's sons */
162 uint type; /* XML_NODE_x */
163 struct xml_node *parent; /* Parent node */
165 * If namespaces are enabled, node->name points to the local part of the name
166 * and node->ns is the resolved namespace ID.
168 * However, the namespace prefix is kept in memory just before the local part,
169 * so you can use xml_node_qname() to find out the full qualified name.
170 * The same applies to attributes, but the function is xml_attr_qname().
172 char *name; /* Element name / PI target */
173 clist sons; /* Children nodes */
176 char *text; /* PI text / Comment / CDATA */
177 uint len; /* Text length in bytes */
180 uint ns; /* Namespace ID */
181 struct xml_dtd_elem *dtd; /* Element DTD */
182 slist attrs; /* Link list of element attributes */
185 void *user; /* User-defined (initialized to NULL) */
188 /** A single attribute. **/
190 snode n; /* Node for elem->attrs */
191 uint hash; /* Internal hash of ns + name */
192 struct xml_node *elem; /* Parent element */
193 struct xml_dtd_attr *dtd; /* Attribute DTD */
194 uint ns; /* Namespace ID */
195 char *name; /* Attribute name without NS prefix */
196 char *val; /* Attribute value */
197 void *user; /* User-defined (initialized to NULL) */
200 #define XML_BUF_SIZE 32 /* At least 8 -- hardcoded */
203 struct xml_source *next; /* Link list of pending fastbufs (xml_context.sources) */
204 struct fastbuf *fb; /* Source fastbuf */
205 struct fastbuf *wrapped_fb; /* Original wrapped fastbuf (needed for cleanup) */
206 struct fastbuf wrap_fb; /* Fbmem wrapper */
207 u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */
208 u32 *bptr, *bstop; /* Current state of the buffer */
209 uint row; /* File position */
210 char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */
211 char *fb_encoding; /* Encoding of the source fastbuf */
212 char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */
213 uint refill_cat1; /* Character categories, which should be directly passed to the buffer */
214 uint refill_cat2; /* Character categories, which should be processed as newlines (possibly in some built-in sequences -- NOTE(review): the tail of this comment was truncated, confirm wording) */
216 void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */
217 unsigned short *refill_in_to_x; /* Libucw-charset input table */
218 uint saved_depth; /* Saved ctx->depth */
219 uint pending_0xd; /* The last read character is 0xD */
222 /** Finds a qualified name (including namespace prefix) of a given element node. **/
223 char *xml_node_qname(struct xml_context *ctx, struct xml_node *node);
225 /** Finds a qualified name (including namespace prefix) of a given attribute. **/
226 char *xml_attr_qname(struct xml_context *ctx, struct xml_attr *node);
228 /** Finds a given attribute in an `XML_NODE_ELEM` node **/
229 struct xml_attr *xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name);
231 /** The same, but namespace-aware **/
232 struct xml_attr *xml_attr_find_ns(struct xml_context *ctx, struct xml_node *node, uint ns, char *name);
234 /** Similar to xml_attr_find, but it also deals with default values **/
235 char *xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name);
237 /** The same, but namespace-aware **/
238 char *xml_attr_value_ns(struct xml_context *ctx, struct xml_node *node, uint ns, char *name);
240 /** Removes leading/trailing spaces and replaces sequences of spaces with a single space character (non-CDATA attribute normalization) **/
241 uint xml_normalize_white(struct xml_context *ctx, char *value);
243 /** Merge character contents of a given element to a single string (not recursive) **/
244 char *xml_merge_chars(struct xml_context *ctx, struct xml_node *node, struct mempool *pool);
246 /** Merge character contents of a given subtree to a single string **/
247 char *xml_merge_dom_chars(struct xml_context *ctx, struct xml_node *node, struct mempool *pool);
254 * The state of the parser is kept in this structure. There are some
255 * user-accessible parts (like pointers to various hooks), but the
256 * majority of fields is private.
260 char *err_msg; /* Last error message */
261 enum xml_error err_code; /* Last error code */
262 void *throw_buf; /* Where to jump on error */
263 void (*h_warn)(struct xml_context *ctx); /* Warning callback */
264 void (*h_error)(struct xml_context *ctx); /* Recoverable error callback */
265 void (*h_fatal)(struct xml_context *ctx); /* Unrecoverable error callback */
267 /* Memory management (private) */
268 struct mempool *pool; /* DOM pool */
269 struct mempool *stack; /* Stack pool (freed as soon as possible) */
270 struct xml_stack *stack_list; /* See xml_push(), xml_pop() */
271 uint flags; /* XML_FLAG_x (restored on xml_pop()) */
272 uint depth; /* Nesting level (for checking of valid source nesting -> valid pushes/pops on memory pools) */
273 struct fastbuf chars; /* Character data / attribute value */
274 struct mempool_state chars_state; /* Mempool state before the current character block has started */
275 char *chars_trivial; /* If not empty, it will be appended to chars */
277 /* Input (private) */
278 struct xml_source *src; /* Current source */
279 u32 *bptr, *bstop; /* Buffer with preprocessed characters (validated UCS-4 + category flags) */
280 uint cat_chars; /* Unicode range of supported characters (cdata, attribute values, ...) */
281 uint cat_unrestricted; /* Unrestricted characters (may appear in document/external entities) */
282 uint cat_new_line; /* New line characters */
283 uint cat_name; /* Characters that may appear in names */
284 uint cat_sname; /* Characters that may begin a name */
286 /* SAX-like interface */
287 void (*h_document_start)(struct xml_context *ctx); /* Called before entering prolog */
288 void (*h_document_end)(struct xml_context *ctx); /* Called after leaving epilog */
289 void (*h_xml_decl)(struct xml_context *ctx); /* Called after the XML declaration */
290 void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration (before optional internal subset) */
291 void (*h_comment)(struct xml_context *ctx); /* Called after a comment (only with XML_REPORT_COMMENTS) */
292 void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction (only with XML_REPORT_PIS) */
293 void (*h_stag)(struct xml_context *ctx); /* Called after STag or EmptyElemTag (only with XML_REPORT_TAGS) */
294 void (*h_etag)(struct xml_context *ctx); /* Called before ETag or after EmptyElemTag (only with XML_REPORT_TAGS) */
295 void (*h_chars)(struct xml_context *ctx); /* Called after some characters (only with XML_REPORT_CHARS) */
296 void (*h_block)(struct xml_context *ctx, char *text, uint len); /* Called for each continuous block of characters not reported by h_cdata() (only with XML_REPORT_CHARS) */
297 void (*h_cdata)(struct xml_context *ctx, char *text, uint len); /* Called for each CDATA section (only with XML_REPORT_CHARS) */
298 void (*h_ignorable)(struct xml_context *ctx, char *text, uint len); /* Called for ignorable whitespace (content in tags without #PCDATA) */
299 void (*h_dtd_start)(struct xml_context *ctx); /* Called just after the DTD structure is initialized */
300 void (*h_dtd_end)(struct xml_context *ctx); /* Called after DTD subsets */
301 struct xml_dtd_entity *(*h_find_entity)(struct xml_context *ctx, char *name); /* Called when needed to resolve a general entity */
302 void (*h_resolve_entity)(struct xml_context *ctx, struct xml_dtd_entity *ent); /* User should push source fastbuf for a parsed external entity (either general or parameter) */
305 struct xml_node *dom; /* DOM root */
306 struct xml_node *node; /* Current DOM node */
308 /* Namespaces (private) */
309 struct mempool *ns_pool; /* Memory pool for NS definitions */
310 const char **ns_by_id; /* A growing array translating NS IDs to their names */
311 void *ns_by_name; /* Hash table translating NS names to their IDs */
312 void *ns_by_prefix; /* Hash table translating current prefixes to NS IDs, allocated from xml->stack */
313 struct xml_ns_prefix *ns_prefix_stack; /* A stack of prefix definitions, allocated from xml->stack */
314 uint ns_default; /* Current default namespace */
319 char *doctype; /* The document type (or NULL if unknown) */
320 char *system_id; /* DTD external id */
321 char *public_id; /* DTD public id */
322 struct xml_dtd *dtd; /* The DTD structure (or NULL) */
323 uint state; /* Current state for the PULL interface (XML_STATE_x) */
324 uint pull; /* Parameters for the PULL interface (XML_PULL_x) */
327 /** Initialize XML context **/
328 void xml_init(struct xml_context *ctx);
330 /** Clean up all internal structures **/
331 void xml_cleanup(struct xml_context *ctx);
333 /** Reuse XML context, equivalent to xml_cleanup() and xml_init(), but faster **/
334 void xml_reset(struct xml_context *ctx);
336 /** Add XML source (fastbuf will be automatically closed) **/
337 struct xml_source *xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb);
339 /** Parse the whole document without the PULL interface, return `XML_ERR_x` code (zero on success) **/
340 uint xml_parse(struct xml_context *ctx);
342 /** Parse with the PULL interface, return `XML_STATE_x` (zero on EOF or fatal error) **/
343 uint xml_next(struct xml_context *ctx);
345 /** Equivalent to xml_next, but with temporarily changed ctx->pull value **/
346 uint xml_next_state(struct xml_context *ctx, uint pull);
348 /** May be called in the XML_STATE_STAG state to skip the element's content; returns `XML_STATE_ETAG`, or `XML_STATE_EOF` on a fatal error **/
349 uint xml_skip_element(struct xml_context *ctx);
351 /** Returns the current row (line) number in the document entity **/
352 uint xml_row(struct xml_context *ctx);
354 /* The default value of h_find_entity(), knows the predefined entities &lt;, &gt;, &amp;, &apos; and &quot; */
355 struct xml_dtd_entity *xml_def_find_entity(struct xml_context *ctx, char *name);
357 /* The default value of h_resolve_entity(), throws an error */
358 void xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent);
360 /** Throw a warning at the current node **/
361 void xml_warn(struct xml_context *ctx, const char *format, ...);
363 /** Throw an error at the current node **/
364 void xml_error(struct xml_context *ctx, const char *format, ...);
366 /** Throw a fatal error, aborting parsing. This can be called only from SAX hooks (and from parser internals). **/
367 void NONRET xml_fatal(struct xml_context *ctx, const char *format, ...);
372 * When namespace-aware parsing is requested by calling xml_ns_enable(),
373 * all namespaces are collected and assigned integer identifiers. Names of
374 * elements and attributes then always contain a namespace ID and a local
375 * name within the namespace. An ID of zero corresponds to an unspecified namespace.
378 * Once an ID is assigned, it is never changed, even if the namespace
379 * goes out of scope temporarily.
382 /** Request processing of namespaces (must be called before the first node is parsed). **/
383 void xml_ns_enable(struct xml_context *ctx);
386 * Looks up namespace by its ID, dies on an invalid ID. Returns a pointer
387 * which remains valid until the context is cleaned up or reset.
389 const char *xml_ns_by_id(struct xml_context *ctx, uint ns);
392 * Looks up namespace by its name and returns its ID. Assigns a new ID if necessary.
393 * When this function returns, @name is not referenced any more.
395 uint xml_ns_by_name(struct xml_context *ctx, const char *name);
397 /** Well-known namespaces. **/
399 XML_NS_NONE = 0, /* This element has no namespace */
400 XML_NS_XMLNS = 1, /* xmlns: */
401 XML_NS_XML = 2, /* xml: */