2 * Sherlock Library -- A simple XML parser
4 * (c) 2007 Pavel Charvat <pchar@ucw.cz>
6 * This software may be freely distributed and used according to the terms
7 * of the GNU Lesser General Public License.
10 #ifndef _SHERLOCK_XML_H
11 #define _SHERLOCK_XML_H
13 #include "lib/clists.h"
14 #include "lib/slists.h"
15 #include "lib/mempool.h"
/* Error severity codes; the numeric value grows with the severity */
19 XML_ERR_WARN = 1000, /* Warning */
20 XML_ERR_ERROR = 2000, /* Recoverable error */
21 XML_ERR_FATAL = 3000, /* Unrecoverable error */
/* Parser state identifiers (XML_STATE_x); some of them double as bit indices for the XML_WANT_x masks */
28 XML_STATE_DOCUMENT_TYPE,
/* NOTE(review): presumably character data found just before the named construct -- confirm */
40 XML_STATE_CHARS_BEFORE_STAG,
41 XML_STATE_CHARS_BEFORE_ETAG,
42 XML_STATE_CHARS_BEFORE_CDATA,
43 XML_STATE_CHARS_BEFORE_PI,
44 XML_STATE_CHARS_BEFORE_COMMENT,
/* NOTE(review): presumably comments located in the prolog / epilog of the document -- confirm */
46 XML_STATE_PROLOG_COMMENT,
48 XML_STATE_EPILOG_COMMENT,
/*
 * Bit masks, one bit per parser state: XML_WANT_x == 1 << XML_STATE_x.
 * NOTE(review): every XML_STATE_x used below must stay smaller than the bit
 * width of int minus one, otherwise the shift is undefined -- confirm.
 */
52 XML_WANT_DECL = 1 << XML_STATE_DECL,
53 XML_WANT_DOCUMENT_TYPE = 1 << XML_STATE_DOCUMENT_TYPE,
54 XML_WANT_CHARS = 1 << XML_STATE_CHARS,
55 XML_WANT_WHITE = 1 << XML_STATE_WHITE,
56 XML_WANT_CDATA = 1 << XML_STATE_CDATA,
57 XML_WANT_STAG = 1 << XML_STATE_STAG,
58 XML_WANT_ETAG = 1 << XML_STATE_ETAG,
59 XML_WANT_COMMENT = 1 << XML_STATE_COMMENT,
60 XML_WANT_PI = 1 << XML_STATE_PI,
61 XML_WANT_EOF = 1 << XML_STATE_EOF,
/* Bit flags; all values below are distinct single bits except XML_DOM_IGNORE, which combines two of them */
/* Document-level flags */
66 XML_FLAG_VALIDATING = 0x1,
67 XML_FLAG_VERSION_1_1 = 0x2, /* XML version 1.1, otherwise 1.0 */
68 XML_FLAG_HAS_EXTERNAL_SUBSET = 0x4, /* The document contains a reference to external DTD subset */
69 XML_FLAG_HAS_INTERNAL_SUBSET = 0x8, /* The document contains an internal subset */
/* Source-related flags (XML_FLAG_SRC_x) */
71 XML_FLAG_SRC_EOF = 0x10, /* EOF reached */
72 XML_FLAG_SRC_EXPECTED_DECL = 0x20, /* Just before optional or required XMLDecl/TextDecl */
73 XML_FLAG_SRC_NEW_LINE = 0x40, /* The last read character is 0xD */
74 XML_FLAG_SRC_SURROUND = 0x80, /* Surround the text with 0x20 (references to parameter entities) */
75 XML_FLAG_SRC_DOCUMENT = 0x100, /* The document entity */
76 XML_FLAG_SRC_EXTERNAL = 0x200, /* An external entity */
/* DOM-related flags (XML_DOM_x) */
78 XML_DOM_SKIP = 0x1000, /* Do not report DOM nodes */
79 XML_DOM_FREE = 0x2000, /* Free the subtree when leaving */
80 XML_DOM_IGNORE = XML_DOM_SKIP | XML_DOM_FREE, /* Completely ignore the subtree */
/* NOTE(review): undocumented; presumably marks an EmptyElemTag (cf. h_element_start / h_element_end) -- confirm */
82 XML_FLAG_EMPTY_ELEM = 0x100000,
/* Generic DOM node header: sibling-list linkage, node type and link to the parent */
98 cnode n; /* Node for list of parent's sons */
99 uns type; /* XML_NODE_x */
100 struct xml_node *parent; /* Parent node */
/* Element data: an embedded generic node followed by element-specific fields */
104 struct xml_node node; /* Embedded generic node (list linkage, type, parent) */
105 char *name; /* Element name */
106 clist sons; /* List of subnodes */
107 struct xml_dtd_elem *dtd; /* Element DTD */
108 slist attrs; /* Link list of attributes */
113 struct xml_elem *elem; /* Related element (presumably the owner of this record -- TODO confirm) */
/* One saved-state record; records are chained through `next` (cf. xml_push() / xml_pop() mentioned at xml_context.stack) */
121 struct xml_stack *next; /* Link list of stack records */
122 uns saved_flags; /* Saved ctx->flags */
123 struct mempool_state saved_pool; /* Saved ctx->pool state */
/* Base size of the per-source read buffer; xml_source.buf is declared with 2 * XML_BUF_SIZE entries */
126 #define XML_BUF_SIZE 32 /* At least 16 -- hardcoded */
/* Per-source state: the underlying fastbuf, a buffer of decoded characters and encoding bookkeeping */
129 struct xml_source *next; /* Link list of pending fastbufs (xml_context.sources) */
130 struct fastbuf *fb; /* Source fastbuf */
131 struct fastbuf wrap_fb; /* Libcharset or fbmem wrapper */
132 u32 buf[2 * XML_BUF_SIZE]; /* Read buffer with Unicode values and categories */
133 u32 *bptr, *bstop; /* Current state of the buffer */
134 uns row; /* File position */
/* Encoding bookkeeping */
135 char *expected_encoding; /* Initial encoding before any transformation has been made (expected in XMLDecl/TextDecl) */
136 char *fb_encoding; /* Encoding of the source fastbuf */
137 char *decl_encoding; /* Encoding read from the XMLDecl/TextDecl */
/* Refill machinery */
138 uns refill_cat1; /* Character categories, which should be directly passed to the buffer */
139 uns refill_cat2; /* Character categories, which should be processed as newlines (possibly in some built-in sequences) */
140 void (*refill)(struct xml_context *ctx); /* Callback to decode source characters to the buffer */
141 unsigned short *refill_in_to_x; /* Libcharset input table */
142 uns saved_depth; /* Saved ctx->depth */
/* Error handling */
147 char *err_msg; /* Last error message */
148 enum xml_error err_code; /* Last error code */
149 void *throw_buf; /* Where to jump on error */
150 void (*h_warn)(struct xml_context *ctx); /* Warning callback */
151 void (*h_error)(struct xml_context *ctx); /* Recoverable error callback */
152 void (*h_fatal)(struct xml_context *ctx); /* Unrecoverable error callback */
154 /* Memory management */
155 struct mempool *pool; /* Most data */
156 struct fastbuf *chars; /* Character data */
157 struct fastbuf *value; /* Attribute value / comment / processing instruction data */
158 char *name; /* Attribute name, processing instruction target */
/* Saved-state stack, flags and nesting */
162 struct xml_stack *stack; /* See xml_push(), xml_pop() */
163 uns flags; /* XML_FLAG_x (restored on xml_pop()) */
164 uns depth; /* Nesting level */
/* Input */
167 struct xml_source *src; /* Current source */
168 u32 *bptr, *bstop; /* Character buffer */
170 /* SAX-like interface */
171 void (*h_document_start)(struct xml_context *ctx); /* Called before entering prolog */
172 void (*h_document_end)(struct xml_context *ctx); /* Called after leaving epilog */
173 void (*h_xml_decl)(struct xml_context *ctx); /* Called after the XML declaration */
174 void (*h_doctype_decl)(struct xml_context *ctx); /* Called in the doctype declaration just before internal subset */
175 void (*h_pi)(struct xml_context *ctx); /* Called after a processing instruction */
176 void (*h_comment)(struct xml_context *ctx); /* Called after a comment */
177 void (*h_element_start)(struct xml_context *ctx); /* Called after STag or EmptyElemTag */
178 void (*h_element_end)(struct xml_context *ctx); /* Called before ETag or after EmptyElemTag */
/* DOM */
181 struct xml_elem *root; /* DOM root */
183 struct xml_node *node; /* Current DOM node */
184 struct xml_elem *elem; /* Current element */
191 struct xml_ext_id eid; /* External identifier */
/*
 * Additional event callbacks; each receives the parser context.
 * NOTE(review): their semantics are not documented here -- confirm against the implementation.
 */
195 void (*start_dtd)(struct xml_context *ctx);
196 void (*end_dtd)(struct xml_context *ctx);
197 void (*start_cdata)(struct xml_context *ctx);
198 void (*end_cdata)(struct xml_context *ctx);
199 void (*start_entity)(struct xml_context *ctx);
200 void (*end_entity)(struct xml_context *ctx);
/* NOTE(review): the member name below is misspelled ("characters"); renaming it would break existing users of this struct */
201 void (*chacacters)(struct xml_context *ctx);
202 struct fastbuf *(*resolve_entity)(struct xml_context *ctx); /* The only callback here that returns a value (a fastbuf) */
203 void (*notation_decl)(struct xml_context *ctx);
204 void (*unparsed_entity_decl)(struct xml_context *ctx);
207 /*** Document Type Definition (DTD) ***/
/* DTD storage: the declared objects are kept in linked lists and additionally indexed by hash tables */
210 struct mempool *pool; /* Memory pool where to allocate DTD */
211 slist gents; /* Link list of general entities */
212 slist pents; /* Link list of parameter entities */
213 slist notns; /* Link list of notations */
214 slist elems; /* Link list of elements */
215 void *tab_gents; /* Hash table of general entities */
216 void *tab_pents; /* Hash table of parameter entities */
217 void *tab_notns; /* Hash table of notations */
218 void *tab_elems; /* Hash table of elements */
219 void *tab_attrs; /* Hash table of element attributes */
220 void *tab_evals; /* Hash table of enumerated attribute values */
221 void *tab_enotns; /* Hash table of enumerated attribute notations */
/* Notation flags (XML_DTD_NOTN_x), stored in xml_dtd_notn.flags */
226 enum xml_dtd_notn_flags {
227 XML_DTD_NOTN_DECLARED = 0x1, /* The notation has been declared (internal usage) */
/* A notation known to the DTD; linked into xml_dtd.notns */
230 struct xml_dtd_notn {
231 snode n; /* Node in xml_dtd.notns */
232 uns flags; /* XML_DTD_NOTN_x */
233 char *name; /* Notation name */
234 struct xml_ext_id eid; /* External id */
/* Entity flags (XML_DTD_ENT_x); each value is a distinct bit, so they can be combined */
239 enum xml_dtd_ent_flags {
240 XML_DTD_ENT_DECLARED = 0x1, /* The entity has been declared (internal usage) */
241 XML_DTD_ENT_VISITED = 0x2, /* Cycle detection (internal usage) */
242 XML_DTD_ENT_PARAMETER = 0x4, /* Parameter entity, general otherwise */
243 XML_DTD_ENT_EXTERNAL = 0x8, /* External entity, internal otherwise */
244 XML_DTD_ENT_UNPARSED = 0x10, /* Unparsed entity, parsed otherwise */
245 XML_DTD_ENT_TRIVIAL = 0x20, /* Replacement text is a sequence of characters and character references */
/* An entity known to the DTD; linked into the general or the parameter entity list of xml_dtd */
249 snode n; /* Node in xml_dtd.[gp]ents */
250 uns flags; /* XML_DTD_ENT_x */
251 char *name; /* Entity name */
252 char *text; /* Replacement text / expanded replacement text (XML_DTD_ENT_TRIVIAL) */
253 uns len; /* Text length */
254 struct xml_ext_id eid; /* External ID */
255 struct xml_dtd_notn *notn; /* Notation (XML_DTD_ENT_UNPARSED only) */
/* Element declaration flags (XML_DTD_ELEM_x) */
260 enum xml_dtd_elem_flags {
261 XML_DTD_ELEM_DECLARED = 0x1, /* The element has been declared (internal usage) */
/* An element known to the DTD (cf. xml_dtd.elems) */
264 struct xml_dtd_elem {
268 struct xml_dtd_elem_node *node; /* Associated xml_dtd_elem_node (presumably the root of the content model -- TODO confirm) */
/* Node of a tree of xml_dtd_elem_node records; xml_dtd_elem.node points to one of them */
271 struct xml_dtd_elem_node {
273 struct xml_dtd_elem_node *parent; /* Parent node of the same type */
/* Kinds of xml_dtd_elem_node (judging by the name -- TODO confirm) */
279 enum xml_dtd_elem_node_type {
/*
 * Occurrence indicator of an xml_dtd_elem_node.
 * NOTE(review): the mapping to the DTD operators below is inferred from the names -- confirm.
 */
285 enum xml_dtd_elem_node_occur {
286 XML_DTD_ELEM_OCCUR_ONCE, /* presumably no operator: exactly once */
287 XML_DTD_ELEM_OCCUR_OPT, /* presumably '?': zero or one */
288 XML_DTD_ELEM_OCCUR_MULT, /* presumably '*': zero or more */
289 XML_DTD_ELEM_OCCUR_PLUS, /* presumably '+': one or more */
/* Default mode of a declared attribute; type of xml_dtd_attr.default_mode */
295 enum xml_dtd_attribute_default {
/* Type of a declared attribute; type of xml_dtd_attr.type */
302 enum xml_dtd_attribute_type {
/* An attribute known to the DTD (cf. xml_dtd.tab_attrs) */
315 struct xml_dtd_attr {
317 struct xml_dtd_elem *elem; /* Related element declaration (presumably the owner -- TODO confirm) */
318 enum xml_dtd_attribute_type type; /* Attribute type */
319 enum xml_dtd_attribute_default default_mode; /* Default mode */
/* Enumerated attribute value (cf. xml_dtd.tab_evals) */
323 struct xml_dtd_eval {
324 struct xml_dtd_attr *attr; /* Related attribute declaration (presumably the owner -- TODO confirm) */
/* Enumerated attribute notation (cf. xml_dtd.tab_enotns): pairs an attribute with a notation */
328 struct xml_dtd_enotn {
329 struct xml_dtd_attr *attr; /* Related attribute declaration */
330 struct xml_dtd_notn *notn; /* Related notation */
/*
 * Public entry points; each operates on a caller-supplied parser context.
 * NOTE(review): the exact contracts are not stated in this header -- the
 * remarks below are inferred from names and signatures; confirm against
 * the implementation.
 */
333 void xml_init(struct xml_context *ctx); /* presumably prepares ctx for use */
334 void xml_cleanup(struct xml_context *ctx); /* presumably releases what xml_init() set up */
335 void xml_set_source(struct xml_context *ctx, struct fastbuf *fb); /* presumably makes fb the input of the parser */
336 int xml_next(struct xml_context *ctx); /* meaning of the return value is not documented here -- TODO confirm */