2 * UCW JSON Library -- Parser
4 * (c) 2015 Martin Mares <mj@ucw.cz>
6 * This software may be freely distributed and used according to the terms
7 * of the GNU Lesser General Public License.
11 #include <ucw/fastbuf.h>
12 #include <ucw/ff-unicode.h>
13 #include <ucw/trans.h>
14 #include <ucw/unicode.h>
15 #include <ucw-json/json.h>
/*
 * Prepare the parser context `js` for reading a new input `in`.
 *
 * Visible effects: any cached look-ahead token (js->next_token) is dropped,
 * and the shared node js->trivial_token is created by json_new_node() with
 * type JSON_INVALID if the context does not have one yet; an existing node
 * is kept.
 *
 * NOTE(review): the statements between the signature and the next_token
 * reset are not visible in this excerpt; presumably they store `in` and
 * reset the position / end-of-file state -- confirm against the full file.
 */
20 void json_set_input(struct json_context *js, struct fastbuf *in)
26 js->next_token = NULL;
// The node used for structural tokens is allocated lazily and only once.
28 if (!js->trivial_token)
29 js->trivial_token = json_new_node(js, JSON_INVALID);
/*
 * Report a parse error at the current input position.
 *
 * Calls trans_throw() with the identifier "ucw.js.parse", `js` as its second
 * argument, and a message of the form "<msg> at line <line>:<column>" built
 * from js->in_line and js->in_column. The function is declared NONRET, so
 * callers are written on the assumption that it does not return.
 */
32 static void NONRET json_parse_error(struct json_context *js, const char *msg)
34 trans_throw("ucw.js.parse", js, "%s at line %u:%u", msg, js->in_line, js->in_column);
/*
 * Read the next character of the input.
 *
 * One character is fetched from js->in_fb with bget_utf8_32_repl(), passing
 * -2 as the replacement value. "Malformed UTF-8 character" is raised through
 * json_parse_error() on the error path.
 *
 * NOTE(review): the condition guarding the error, and whatever follows it
 * (end-of-file handling, line/column tracking, the return statement), are
 * not visible in this excerpt. Presumably -2 is what a malformed sequence
 * decodes to, which keeps it distinct from an end-of-file result -- confirm.
 */
37 static int json_get_char(struct json_context *js)
39 int c = bget_utf8_32_repl(js->in_fb, -2);
43 json_parse_error(js, "Malformed UTF-8 character");
45 // FIXME: Reject alternative sequences
/*
 * Give the character `c` back to the input so that it is seen again later.
 *
 * NOTE(review): the body is not visible in this excerpt. json_read_token()
 * starts from js->next_char, so presumably `c` is stored there -- confirm
 * against the full file.
 */
52 static void json_unget_char(struct json_context *js, int c)
/*
 * Return the context's shared token node, retagged with `type`.
 *
 * No node is allocated here: the single node js->trivial_token is overwritten
 * and returned on every call, so a pointer obtained from an earlier call
 * refers to the same node and will show the new type.
 */
57 static struct json_node *json_triv_token(struct json_context *js, enum json_node_type type)
59 js->trivial_token->type = type;
60 return js->trivial_token;
/*
 * Parse a JSON number whose first character `c` has already been read.
 *
 * The text of the number is collected character by character in a growing
 * buffer on js->pool, in the order sign, integer part, fraction, exponent.
 * The first character that does not belong to the number is handed back with
 * json_unget_char(), the buffer is finished with mp_end_string(), converted
 * by strtod() and wrapped by json_new_number().
 *
 * Errors raised through json_parse_error():
 *   "Malformed number: just minus", "Malformed number: leading zero",
 *   "Malformed number: no digits after decimal point",
 *   "Malformed number: empty exponent", "Number out of range".
 *
 * NOTE(review): several branch conditions (for the sign, the leading zero,
 * the decimal point and the range check) are not visible in this excerpt;
 * the comments below describe the visible statements only.
 */
63 static struct json_node *json_parse_number(struct json_context *js, int c)
66 char *p = mp_start_noalign(js->pool, 0);
// Sign: the character is stored and must be followed by a digit.
71 p = mp_append_char(js->pool, p, c);
72 c = json_get_char(js);
73 if (!(c >= '0' && c <= '9'))
74 json_parse_error(js, "Malformed number: just minus");
80 // Leading zeroes are forbidden by RFC 7159
81 p = mp_append_char(js->pool, p, c);
82 c = json_get_char(js);
83 if (c >= '0' && c <= '9')
84 json_parse_error(js, "Malformed number: leading zero");
// Run of decimal digits.
88 while (c >= '0' && c <= '9')
90 p = mp_append_char(js->pool, p, c);
91 c = json_get_char(js);
// Fraction: the stored character must be followed by at least one digit.
98 p = mp_append_char(js->pool, p, c);
99 c = json_get_char(js);
100 if (!(c >= '0' && c <= '9'))
101 json_parse_error(js, "Malformed number: no digits after decimal point");
102 while (c >= '0' && c <= '9')
104 p = mp_append_char(js->pool, p, c);
105 c = json_get_char(js);
// Exponent: 'e' or 'E', an optional sign, then at least one digit.
110 if (c == 'e' || c == 'E')
112 p = mp_append_char(js->pool, p, c);
113 c = json_get_char(js);
114 if (c == '+' || c == '-')
116 p = mp_append_char(js->pool, p, c);
117 c = json_get_char(js);
119 if (!(c >= '0' && c <= '9'))
120 json_parse_error(js, "Malformed number: empty exponent");
121 while (c >= '0' && c <= '9')
123 p = mp_append_char(js->pool, p, c);
124 c = json_get_char(js);
// The character that ended the number belongs to the next token.
128 json_unget_char(js, c);
130 p = mp_end_string(js->pool, p);
// The end pointer is not requested from strtod(); the syntax was validated above.
// NOTE(review): the test that selects "Number out of range" is not visible
// here; presumably it inspects errno after strtod() -- confirm.
132 double val = strtod(p, NULL);
134 json_parse_error(js, "Number out of range");
137 return json_new_number(js, val);
/*
 * Parse a literal name (null, false or true) that starts with `c`.
 *
 * Characters are consumed while they are lowercase ASCII letters 'a'..'z'.
 * If the counter `i` reaches sizeof(name) - 1, the literal is rejected with
 * "Invalid literal name". Otherwise the first non-letter is handed back with
 * json_unget_char() and the collected name is compared:
 *   "null"  -> json_new_null(js)
 *   "false" -> json_new_bool(js, 0)
 *   "true"  -> json_new_bool(js, 1)
 * and any other name raises "Invalid literal name".
 *
 * NOTE(review): the declarations of `name`, `i` and `n`, the statement that
 * stores a character into `name`, and the return are not visible in this
 * excerpt.
 */
140 static struct json_node *json_parse_name(struct json_context *js, int c)
145 while (c >= 'a' && c <= 'z')
// Letters past the buffer capacity are still consumed from the input.
147 if (i < sizeof(name) - 1)
149 c = json_get_char(js);
151 if (i >= sizeof(name) - 1)
152 json_parse_error(js, "Invalid literal name");
154 json_unget_char(js, c);
157 if (!strcmp(name, "null"))
158 n = json_new_null(js);
159 else if (!strcmp(name, "false"))
160 n = json_new_bool(js, 0);
161 else if (!strcmp(name, "true"))
162 n = json_new_bool(js, 1);
164 json_parse_error(js, "Invalid literal name");
/*
 * Read the four hexadecimal digits of a \uXXXX escape.
 *
 * Exactly four characters are fetched with json_get_char(). Each must be one
 * of '0'..'9', 'a'..'f' or 'A'..'F'; anything else raises
 * "Invalid Unicode escape sequence".
 *
 * NOTE(review): the statements that accumulate the digits and the return
 * are not visible in this excerpt; presumably the result is the 16-bit value
 * of the four digits -- confirm.
 */
169 static uint json_parse_hex4(struct json_context *js)
172 for (int i=0; i<4; i++)
175 int c = json_get_char(js);
176 if (c >= '0' && c <= '9')
178 else if (c >= 'a' && c <= 'f')
180 else if (c >= 'A' && c <= 'F')
183 json_parse_error(js, "Invalid Unicode escape sequence");
/*
 * Parse a JSON string and return it as a string node.
 *
 * Characters are read with json_get_char(), validated, and appended as UTF-8
 * to a growing buffer on js->pool. The finished buffer is closed with
 * mp_end_string() and passed to json_new_string_ref(). The incoming value of
 * `c` is overwritten by the first read before it is used.
 *
 * Errors raised through json_parse_error() cover unterminated strings, raw
 * control characters, surrogate / private-use / non-Unicode code points
 * (both raw and escaped), an escaped zero code point, unpaired escaped
 * surrogates and unknown backslash sequences.
 *
 * NOTE(review): the loop header, the test for a backslash and most of the
 * escape dispatch are not visible in this excerpt; the comments below
 * describe the visible statements only.
 */
188 static struct json_node *json_parse_string(struct json_context *js, int c)
190 char *p = mp_start_noalign(js->pool, 0);
192 c = json_get_char(js);
// Below 0x20: a negative value, CR or LF means the string was never closed;
// every other value in this range is a forbidden control character.
195 if (unlikely(c < 0x20))
197 if (c < 0 || c == 0x0d || c == 0x0a)
198 json_parse_error(js, "Unterminated string");
200 json_parse_error(js, "Invalid control character in string");
// 0xd800..0xf8ff: rejected as either a surrogate or a private-use character.
202 if (unlikely(c >= 0xd800 && c < 0xf900))
205 json_parse_error(js, "Invalid surrogate character in string");
207 json_parse_error(js, "Invalid private-use character in string");
// 0xf0000 and above: rejected as either non-Unicode or private-use.
209 if (unlikely(c >= 0xf0000))
212 json_parse_error(js, "Invalid non-Unicode character in string");
214 json_parse_error(js, "Invalid private-use character in string");
// Read the character that selects the escape sequence.
218 c = json_get_char(js);
// \uXXXX escape: x holds the value returned by json_parse_hex4().
242 uint x = json_parse_hex4(js);
244 json_parse_error(js, "Zero bytes in strings are not supported");
245 if (x >= 0xd800 && x < 0xf900)
249 // High surrogate: low surrogate must follow
// Both json_get_char() calls consume input even when the match fails.
251 if (json_get_char(js) == '\\' && json_get_char(js) == 'u')
252 y = json_parse_hex4(js);
253 if (!(y >= 0xdc00 && y < 0xe000))
254 json_parse_error(js, "Escaped high surrogate codepoint must be followed by a low surrogate codepoint");
// `+` binds tighter than `|`, so this is (0x10000 + (hi << 10)) | lo. The sum
// has its low 10 bits clear, so the OR gives the same value as adding lo.
255 c = 0x10000 + ((x & 0x03ff) << 10) | (y & 0x03ff);
257 json_parse_error(js, "Invalid escaped private-use character");
262 json_parse_error(js, "Invalid escaped surrogate codepoint");
265 json_parse_error(js, "Invalid escaped private-use character");
272 json_parse_error(js, "Invalid backslash sequence in string");
// Store the accepted code point as UTF-8 and move on to the next character.
275 p = mp_append_utf8_32(js->pool, p, c);
276 c = json_get_char(js);
279 p = mp_end_string(js->pool, p);
280 return json_new_string_ref(js, p);
/*
 * Read the next token from the input.
 *
 * If js->in_eof is already set, a JSON_EOF token is returned at once.
 * Otherwise scanning starts from js->next_char, or from a freshly read
 * character; spaces, tabs, line feeds and carriage returns are skipped, and
 * the first significant character selects the token:
 *   - a digit or '-' goes to json_parse_number()
 *   - a lowercase letter goes to json_parse_name()
 *   - a string goes to json_parse_string()
 *   - structural characters return the shared node from json_triv_token()
 *     tagged JSON_BEGIN_ARRAY, JSON_END_ARRAY, JSON_BEGIN_OBJECT,
 *     JSON_END_OBJECT, JSON_NAME_SEP or JSON_VALUE_SEP
 * Anything else is reported through json_parse_error().
 *
 * NOTE(review): the `case` labels, the condition under which a new character
 * is fetched, the body of the whitespace loop before its read, and the test
 * before the second JSON_EOF return are not visible in this excerpt.
 */
283 static struct json_node *json_read_token(struct json_context *js)
285 if (unlikely(js->in_eof))
286 return json_triv_token(js, JSON_EOF);
// Begin with the character kept in the context, if any.
288 int c = js->next_char;
292 c = json_get_char(js);
// Skip whitespace: space, horizontal tab, line feed, carriage return.
294 while (c == 0x20 || c == 0x09 || c == 0x0a || c == 0x0d)
301 c = json_get_char(js);
304 return json_triv_token(js, JSON_EOF);
// `&&` binds tighter than `||`: (digit) or (minus sign).
306 if (c >= '0' && c <= '9' || c == '-')
307 return json_parse_number(js, c);
309 if (c >= 'a' && c <= 'z')
310 return json_parse_name(js, c);
313 return json_parse_string(js, c);
// Structural tokens all reuse the single shared node.
318 return json_triv_token(js, JSON_BEGIN_ARRAY);
320 return json_triv_token(js, JSON_END_ARRAY);
322 return json_triv_token(js, JSON_BEGIN_OBJECT);
324 return json_triv_token(js, JSON_END_OBJECT);
326 return json_triv_token(js, JSON_NAME_SEP);
328 return json_triv_token(js, JSON_VALUE_SEP);
// Dedicated diagnostics for common mistakes, then the generic fallback.
330 json_parse_error(js, "Numbers must start with a digit");
332 json_parse_error(js, "Misplaced byte-order mark, complain in Redmond");
334 json_parse_error(js, "Invalid character");
/*
 * Return the upcoming token without consuming it.
 *
 * The token is kept in js->next_token, where json_next_token() picks it up.
 *
 * NOTE(review): a line between the signature and the assignment is not
 * visible in this excerpt; presumably the read happens only when no token
 * is cached yet -- confirm.
 */
338 struct json_node *json_peek_token(struct json_context *js)
341 js->next_token = json_read_token(js);
342 return js->next_token;
/*
 * Consume and return the next token.
 *
 * The look-ahead slot js->next_token is inspected and cleared; a new token
 * is obtained from json_read_token() otherwise.
 *
 * NOTE(review): the guard around the clearing and the early return are not
 * visible in this excerpt; presumably a cached token, when present, is
 * returned instead of reading a new one -- confirm.
 */
345 struct json_node *json_next_token(struct json_context *js)
347 struct json_node *t = js->next_token;
350 js->next_token = NULL;
353 return json_read_token(js);
/*
 * Read one complete JSON value, recursing into arrays and objects.
 *
 * The first token decides the shape of the value:
 *   - JSON_BEGIN_ARRAY builds a node with json_new_array() and appends each
 *     element read by a recursive call, with elements separated by
 *     JSON_VALUE_SEP and the list closed by JSON_END_ARRAY.
 *   - JSON_BEGIN_OBJECT builds a node with json_new_object(). Each member is
 *     a key that must be a JSON_STRING, a JSON_NAME_SEP, then a value.
 *     A key already present in the object is rejected.
 *   - A structural token in value position (end of array, end of object,
 *     colon, comma) is reported as misplaced.
 * All errors are raised through json_parse_error().
 *
 * NOTE(review): the `switch` header, the handling of scalar tokens and of
 * end of input, the loop scaffolding, the tests preceding the "Unterminated"
 * errors and the return statements are not visible in this excerpt.
 * json_parse() reports "Empty input" after calling this function, which
 * suggests a null return at end of input -- confirm.
 */
356 struct json_node *json_next_value(struct json_context *js)
358 struct json_node *t = json_next_token(js);
373 case JSON_BEGIN_ARRAY:
375 struct json_node *a = json_new_array(js);
// Peek so that an empty array is recognised before an element is parsed.
376 if (json_peek_token(js)->type == JSON_END_ARRAY)
380 struct json_node *v = json_next_value(js);
382 json_parse_error(js, "Unterminated array");
383 json_array_append(a, v);
// After an element, only the end of the array or a comma is acceptable.
385 t = json_next_token(js);
386 if (t->type == JSON_END_ARRAY)
388 if (t->type != JSON_VALUE_SEP)
389 json_parse_error(js, "Comma expected");
395 case JSON_BEGIN_OBJECT:
397 struct json_node *o = json_new_object(js);
// Peek so that an empty object is recognised before a member is parsed.
398 if (json_peek_token(js)->type == JSON_END_OBJECT)
// The key is read as a full value first; its type is checked afterwards.
402 struct json_node *k = json_next_value(js);
404 json_parse_error(js, "Unterminated object");
405 if (k->type != JSON_STRING)
406 json_parse_error(js, "Object key must be a string");
408 t = json_next_token(js);
409 if (t->type != JSON_NAME_SEP)
410 json_parse_error(js, "Colon expected");
412 struct json_node *v = json_next_value(js);
414 json_parse_error(js, "Unterminated object");
// Duplicate keys are an error rather than an overwrite.
415 if (json_object_get(o, k->string)) // FIXME: Optimize
416 json_parse_error(js, "Key already set");
417 json_object_set(o, k->string, v);
// After a member, only the end of the object or a comma is acceptable.
419 t = json_next_token(js);
420 if (t->type == JSON_END_OBJECT)
422 if (t->type != JSON_VALUE_SEP)
423 json_parse_error(js, "Comma expected");
428 // Misplaced characters
430 json_parse_error(js, "Misplaced end of array");
431 case JSON_END_OBJECT:
432 json_parse_error(js, "Misplaced end of object");
434 json_parse_error(js, "Misplaced colon");
436 json_parse_error(js, "Misplaced comma");
/*
 * Parse a whole JSON document from the stream `fb`.
 *
 * The stream is installed with json_set_input(), one value is read with
 * json_next_value(), and the token that follows must be JSON_EOF; anything
 * else raises "Only one top-level value allowed". "Empty input" is raised
 * when no value could be read.
 *
 * NOTE(review): the condition guarding "Empty input" and the return
 * statement are not visible in this excerpt; presumably the value `n` is
 * returned -- confirm.
 */
442 struct json_node *json_parse(struct json_context *js, struct fastbuf *fb)
444 json_set_input(js, fb);
446 struct json_node *n = json_next_value(js);
448 json_parse_error(js, "Empty input");
// Trailing content after the top-level value is rejected.
450 struct json_node *t = json_next_token(js);
451 if (t->type != JSON_EOF)
452 json_parse_error(js, "Only one top-level value allowed");