2 * Sherlock Library -- A simple XML parser
4 * (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
6 * This software may be freely distributed and used according to the terms
7 * of the GNU Lesser General Public License.
10 #ifndef _SHERLOCK_XML_XML_H
11 #define _SHERLOCK_XML_XML_H
13 #include <ucw/clists.h>
14 #include <ucw/slists.h>
15 #include <ucw/mempool.h>
16 #include <ucw/fastbuf.h>
19 struct xml_dtd_entity;
/* Error severity codes (XML_ERR_x); xml_parse() is documented to return one of these, or zero on success.
 * NOTE(review): presumably the members of enum xml_error -- the enum header is not visible here, confirm. */
23 XML_ERR_WARN = 1000, /* Warning (lowest severity value) */
24 XML_ERR_ERROR = 2000, /* Recoverable error */
25 XML_ERR_FATAL = 3000, /* Unrecoverable error (highest severity value) */
/* Parser states of the PULL interface (XML_STATE_x), as returned by xml_next() and kept in xml_context.state.
 * Where a comment names an XML_PULL_x flag, that is the pull flag associated with the state. */
30 XML_STATE_EOF, /* EOF or a fatal error */
31 XML_STATE_START, /* Initial state */
32 XML_STATE_XML_DECL, /* XML_PULL_XML_DECL */
33 XML_STATE_DOCTYPE_DECL, /* XML_PULL_DOCTYPE_DECL */
34 XML_STATE_CHARS, /* XML_PULL_CHARS */
35 XML_STATE_STAG, /* XML_PULL_STAG */
36 XML_STATE_ETAG, /* XML_PULL_ETAG */
37 XML_STATE_COMMENT, /* XML_PULL_COMMENT */
38 XML_STATE_PI, /* XML_PULL_PI */
/* NOTE(review): the states below have no associated XML_PULL_x flag in this view; presumably internal -- confirm */
41 XML_STATE_CHARS_BEFORE_STAG,
42 XML_STATE_CHARS_BEFORE_ETAG,
43 XML_STATE_CHARS_BEFORE_CDATA,
44 XML_STATE_CHARS_BEFORE_COMMENT,
45 XML_STATE_CHARS_BEFORE_PI,
46 XML_STATE_PROLOG_COMMENT,
48 XML_STATE_EPILOG_COMMENT,
/* Bit flags for the PULL interface (XML_PULL_x), stored in xml_context.pull or passed to xml_next_state() */
53 XML_PULL_XML_DECL = 0x00000001, /* Stop after the XML declaration */
54 XML_PULL_DOCTYPE_DECL = 0x00000002, /* Stop in the doctype declaration (before optional internal subset) */
55 XML_PULL_CHARS = 0x00000004, /* Associated with XML_STATE_CHARS */
56 XML_PULL_STAG = 0x00000008, /* Associated with XML_STATE_STAG */
57 XML_PULL_ETAG = 0x00000010, /* Associated with XML_STATE_ETAG */
58 XML_PULL_COMMENT = 0x00000020, /* Associated with XML_STATE_COMMENT */
59 XML_PULL_PI = 0x00000040, /* Associated with XML_STATE_PI */
60 XML_PULL_ALL = 0xffffffff, /* All bits set */
64 /* Enable reporting of various events via the SAX and/or PULL interface */
65 XML_REPORT_COMMENTS = 0x00000001, /* Report comments */
66 XML_REPORT_PIS = 0x00000002, /* Report processing instructions */
67 XML_REPORT_CHARS = 0x00000004, /* Report characters */
68 XML_REPORT_TAGS = 0x00000008, /* Report element starts/ends */
69 XML_REPORT_MISC = XML_REPORT_COMMENTS | XML_REPORT_PIS, /* Comments and processing instructions */
70 XML_REPORT_ALL = XML_REPORT_MISC | XML_REPORT_CHARS | XML_REPORT_TAGS, /* Every XML_REPORT_x flag */
72 /* Enable construction of DOM nodes for these types */
73 XML_ALLOC_COMMENTS = 0x00000010, /* Create comment nodes */
74 XML_ALLOC_PIS = 0x00000020, /* Create processing instruction nodes */
75 XML_ALLOC_CHARS = 0x00000040, /* Create character nodes */
76 XML_ALLOC_TAGS = 0x00000080, /* Create element nodes */
77 XML_ALLOC_MISC = XML_ALLOC_COMMENTS | XML_ALLOC_PIS, /* Comment and processing instruction nodes */
78 XML_ALLOC_ALL = XML_ALLOC_MISC | XML_ALLOC_CHARS | XML_ALLOC_TAGS, /* Every XML_ALLOC_x flag */
80 /* Other parameters */
81 XML_VALIDATING = 0x00000100, /* Validate everything (not fully implemented!) */
82 XML_PARSE_DTD = 0x00000200, /* Enable parsing of DTD */
83 XML_NO_CHARS = 0x00000400, /* The current element must not contain character data (filled automatically if using DTD) */
84 XML_ALLOC_DEFAULT_ATTRS = 0x00000800, /* Allocate default attribute values so they can be found by XML_ATTR_FOR_EACH */
86 /* Internals, do not change! */
87 XML_EMPTY_ELEM_TAG = 0x00010000, /* The current element matches EmptyElemTag */
88 XML_VERSION_1_1 = 0x00020000, /* XML version is 1.1, otherwise 1.0 */
89 XML_HAS_EXTERNAL_SUBSET = 0x00040000, /* The document contains a reference to external DTD subset */
90 XML_HAS_INTERNAL_SUBSET = 0x00080000, /* The document contains an internal subset */
91 XML_HAS_DTD = XML_HAS_EXTERNAL_SUBSET | XML_HAS_INTERNAL_SUBSET, /* Mask covering both DTD subset flags */
92 XML_SRC_EOF = 0x00100000, /* EOF reached */
93 XML_SRC_EXPECTED_DECL = 0x00200000, /* Just before optional or required XMLDecl/TextDecl */
94 XML_SRC_DOCUMENT = 0x00400000, /* The document entity */
95 XML_SRC_EXTERNAL = 0x00800000, /* An external entity */
/* Iterate `var` (a struct xml_node *) over the children of `node`, i.e. the list node->sons */
105 #define XML_NODE_FOR_EACH(var, node) CLIST_FOR_EACH(struct xml_node *, var, (node)->sons)
/* Iterate `var` (a struct xml_attr *) over the attributes of `node`, i.e. the list node->attrs */
106 #define XML_ATTR_FOR_EACH(var, node) SLIST_FOR_EACH(struct xml_attr *, var, (node)->attrs)
/* Fields of a DOM node. NOTE(review): the struct header and any nested union/struct lines are not visible in this view. */
109 cnode n; /* Link node in the parent's list of sons */
110 uns type; /* Node type (XML_NODE_x) */
111 struct xml_node *parent; /* Parent node */
112 char *name; /* Element name / PI target */
113 clist sons; /* Child nodes (iterate with XML_NODE_FOR_EACH) */
116 char *text; /* PI text / Comment / CDATA */
117 uns len; /* Text length in bytes */
120 struct xml_dtd_elem *dtd; /* Element DTD */
121 slist attrs; /* Linked list of element attributes (iterate with XML_ATTR_FOR_EACH) */
124 void *user; /* User-defined (initialized to NULL) */
/* Fields of one element attribute; it is linked through `n` into the attrs list of its parent element `elem` */
128 snode n; /* Link node in elem->attrs */
129 struct xml_node *elem; /* Parent element */
130 struct xml_dtd_attr *dtd; /* Attribute DTD */
131 char *name; /* Attribute name */
132 char *val; /* Attribute value */
133 void *user; /* User-defined (initialized to NULL) */
136 #define XML_BUF_SIZE 32 /* At least 8 -- hardcoded; the read buffer `buf` below holds 2 * XML_BUF_SIZE entries */
/* Fields describing one input source */
139 struct xml_source *next; /* Linked list of pending fastbufs (xml_context.sources) */
140 struct fastbuf *fb; /* Source fastbuf */
141 struct fastbuf *wrapped_fb; /* Original wrapped fastbuf (needed for cleanup) */
142 struct fastbuf wrap_fb; /* Fbmem wrapper */
143 u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */
144 u32 *bptr, *bstop; /* Current state of the buffer */
145 uns row; /* File position */
146 char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */
147 char *fb_encoding; /* Encoding of the source fastbuf */
148 char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */
149 uns refill_cat1; /* Character categories which should be passed directly to the buffer */
150 uns refill_cat2; /* Character categories, which should be processed as newlines (possibly in some built-in sequences) */
152 void (*refill)(struct xml_context *ctx); /* Callback to decode source characters into the buffer */
153 unsigned short *refill_in_to_x; /* Libucw-charset input table */
154 uns saved_depth; /* Saved ctx->depth */
155 uns pending_0xd; /* The last read character is 0xD (carriage return) */
/* Error handling */
160 char *err_msg; /* Last error message */
161 enum xml_error err_code; /* Last error code */
162 void *throw_buf; /* Where to jump on error */
163 void (*h_warn)(struct xml_context *ctx); /* Warning callback */
164 void (*h_error)(struct xml_context *ctx); /* Recoverable error callback */
165 void (*h_fatal)(struct xml_context *ctx); /* Unrecoverable error callback */
167 /* Memory management */
168 struct mempool *pool; /* DOM pool */
169 struct mempool *stack; /* Stack pool (freed as soon as possible) */
170 struct xml_stack *stack_list; /* See xml_push(), xml_pop() */
171 uns flags; /* XML_FLAG_x (restored on xml_pop()) */
172 uns depth; /* Nesting level (used to check valid source nesting -> valid pushes/pops on memory pools) */
173 struct fastbuf chars; /* Character data / attribute value */
174 struct mempool_state chars_state; /* Mempool state before the current character block started */
175 char *chars_trivial; /* If not empty, it will be appended to chars */
176 void *tab_attrs; /* Hash table of element attributes */
/* Input */
179 struct xml_source *src; /* Current source */
180 u32 *bptr, *bstop; /* Buffer with preprocessed characters (validated UCS-4 + category flags) */
181 uns cat_chars; /* Unicode range of supported characters (cdata, attribute values, ...) */
182 uns cat_unrestricted; /* Unrestricted characters (may appear in document/external entities) */
183 uns cat_new_line; /* New line characters */
184 uns cat_name; /* Characters that may appear in names */
185 uns cat_sname; /* Characters that may begin a name */
187 /* SAX-like interface: user-settable callbacks, each receiving the parser context */
188 void (*h_document_start)(struct xml_context *ctx); /* Called before entering prolog */
189 void (*h_document_end)(struct xml_context *ctx); /* Called after leaving epilog */
190 void (*h_xml_decl)(struct xml_context *ctx); /* Called after the XML declaration */
191 void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration (before optional internal subset) */
192 void (*h_comment)(struct xml_context *ctx); /* Called after a comment (only with XML_REPORT_COMMENTS) */
193 void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction (only with XML_REPORT_PIS) */
194 void (*h_stag)(struct xml_context *ctx); /* Called after STag or EmptyElemTag (only with XML_REPORT_TAGS) */
195 void (*h_etag)(struct xml_context *ctx); /* Called before ETag or after EmptyElemTag (only with XML_REPORT_TAGS) */
196 void (*h_chars)(struct xml_context *ctx); /* Called after some characters (only with XML_REPORT_CHARS) */
197 void (*h_block)(struct xml_context *ctx, char *text, uns len); /* Called for each continuous block of characters not reported by h_cdata() (only with XML_REPORT_CHARS) */
198 void (*h_cdata)(struct xml_context *ctx, char *text, uns len); /* Called for each CDATA section (only with XML_REPORT_CHARS) */
199 void (*h_ignorable)(struct xml_context *ctx, char *text, uns len); /* Called for ignorable whitespace (content in tags without #PCDATA) */
200 void (*h_dtd_start)(struct xml_context *ctx); /* Called just after the DTD structure is initialized */
201 void (*h_dtd_end)(struct xml_context *ctx); /* Called after the DTD subsets */
202 struct xml_dtd_entity *(*h_find_entity)(struct xml_context *ctx, char *name); /* Called when needed to resolve a general entity */
203 void (*h_resolve_entity)(struct xml_context *ctx, struct xml_dtd_entity *ent); /* User should push source fastbuf for a parsed external entity (either general or parameter) */
/* DOM */
206 struct xml_node *dom; /* DOM root */
207 struct xml_node *node; /* Current DOM node */
/* Document type and PULL interface state */
211 char *doctype; /* The document type (or NULL if unknown) */
212 char *system_id; /* DTD external id */
213 char *public_id; /* DTD public id */
214 struct xml_dtd *dtd; /* The DTD structure (or NULL) */
215 uns state; /* Current state for the PULL interface (XML_STATE_x) */
216 uns pull; /* Parameters for the PULL interface (XML_PULL_x) */
219 /* Initialize an XML context */
220 void xml_init(struct xml_context *ctx);
222 /* Clean up all internal structures */
223 void xml_cleanup(struct xml_context *ctx);
225 /* Reuse an XML context; equivalent to xml_cleanup() and xml_init() */
226 void xml_reset(struct xml_context *ctx);
228 /* Add an XML source (the fastbuf will be closed automatically) */
229 struct xml_source *xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb);
231 /* Parse without the PULL interface; returns an XML_ERR_x code (zero on success) */
232 uns xml_parse(struct xml_context *ctx);
234 /* Parse with the PULL interface; returns XML_STATE_x (zero on EOF or fatal error) */
235 uns xml_next(struct xml_context *ctx);
237 /* Equivalent to xml_next(), but with a temporarily changed ctx->pull value */
238 uns xml_next_state(struct xml_context *ctx, uns pull);
240 /* May be called on XML_STATE_STAG to skip its content; can return XML_STATE_ETAG or XML_STATE_EOF on fatal error */
241 uns xml_skip_element(struct xml_context *ctx);
243 /* Returns the current row number in the document entity */
244 uns xml_row(struct xml_context *ctx);
246 /* Find the attribute with the given name in an XML_NODE_ELEM node */
247 struct xml_attr *xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name);
249 /* Similar to xml_attr_find(), but also deals with default values (returns a char * instead of the attribute record) */
250 char *xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name);
252 /* The default value of h_find_entity(); knows &lt;, &gt;, &amp;, &apos; and &quot; */
253 struct xml_dtd_entity *xml_def_find_entity(struct xml_context *ctx, char *name);
255 /* The default value of h_resolve_entity(); throws an error */
256 void xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent);
258 /* Remove leading/trailing spaces and replace sequences of spaces with a single space character (non-CDATA attribute normalization) */
259 uns xml_normalize_white(struct xml_context *ctx, char *value);
261 /* Merge the character contents of a given element into a single string (not recursive) */
262 char *xml_merge_chars(struct xml_context *ctx, struct xml_node *node, struct mempool *pool);
264 /* Merge the character contents of a given subtree into a single string */
265 char *xml_merge_dom_chars(struct xml_context *ctx, struct xml_node *node, struct mempool *pool);
267 /* Public part of error handling: report a warning, an error or a fatal error with a formatted message.
 * xml_fatal() is declared NONRET. NOTE(review): `format` is presumably a printf-style format string -- confirm. */
268 void xml_warn(struct xml_context *ctx, const char *format, ...);
269 void xml_error(struct xml_context *ctx, const char *format, ...);
270 void NONRET xml_fatal(struct xml_context *ctx, const char *format, ...);