2 * UCW JSON Library -- Parser
4 * (c) 2015 Martin Mares <mj@ucw.cz>
6 * This software may be freely distributed and used according to the terms
7 * of the GNU Lesser General Public License.
11 #include <ucw/fastbuf.h>
12 #include <ucw/ff-unicode.h>
13 #include <ucw/trans.h>
14 #include <ucw/unicode.h>
15 #include <ucw-json/json.h>
20 void json_set_input(struct json_context *js, struct fastbuf *in)
26 js->next_token = NULL;
30 static void NONRET json_parse_error(struct json_context *js, const char *msg)
32 trans_throw("ucw.json.parse", js, "%s at line %u:%u", msg, js->in_line, js->in_column);
35 static int json_get_char(struct json_context *js)
37 int c = bget_utf8_32_repl(js->in_fb, -2);
41 json_parse_error(js, "Malformed UTF-8 character");
49 static void json_unget_char(struct json_context *js, int c)
54 static struct json_node *json_triv_token(struct json_context *js, enum json_node_type type)
56 js->trivial_token->type = type;
57 return js->trivial_token;
60 static struct json_node *json_parse_number(struct json_context *js, int c)
63 char *p = mp_start_noalign(js->pool, 0);
68 p = mp_append_char(js->pool, p, c);
69 c = json_get_char(js);
70 if (!(c >= '0' && c <= '9'))
71 json_parse_error(js, "Malformed number: just minus");
77 // Leading zeroes are forbidden by RFC 7159
78 p = mp_append_char(js->pool, p, c);
79 c = json_get_char(js);
80 if (c >= '0' && c <= '9')
81 json_parse_error(js, "Malformed number: leading zero");
85 while (c >= '0' && c <= '9')
87 p = mp_append_char(js->pool, p, c);
88 c = json_get_char(js);
95 p = mp_append_char(js->pool, p, c);
96 c = json_get_char(js);
97 if (!(c >= '0' && c <= '9'))
98 json_parse_error(js, "Malformed number: no digits after decimal point");
99 while (c >= '0' && c <= '9')
101 p = mp_append_char(js->pool, p, c);
102 c = json_get_char(js);
107 if (c == 'e' || c == 'E')
109 p = mp_append_char(js->pool, p, c);
110 c = json_get_char(js);
111 if (c == '+' || c == '-')
113 p = mp_append_char(js->pool, p, c);
114 c = json_get_char(js);
116 if (!(c >= '0' && c <= '9'))
117 json_parse_error(js, "Malformed number: empty exponent");
118 while (c >= '0' && c <= '9')
120 p = mp_append_char(js->pool, p, c);
121 c = json_get_char(js);
125 json_unget_char(js, c);
127 p = mp_end_string(js->pool, p);
129 double val = strtod(p, NULL);
131 json_parse_error(js, "Number out of range");
134 return json_new_number(js, val);
137 static struct json_node *json_parse_name(struct json_context *js, int c)
142 while (c >= 'a' && c <= 'z')
144 if (i < sizeof(name) - 1)
146 c = json_get_char(js);
148 if (i >= sizeof(name) - 1)
149 json_parse_error(js, "Invalid literal name");
151 json_unget_char(js, c);
154 if (!strcmp(name, "null"))
155 n = json_new_null(js);
156 else if (!strcmp(name, "false"))
157 n = json_new_bool(js, 0);
158 else if (!strcmp(name, "true"))
159 n = json_new_bool(js, 1);
161 json_parse_error(js, "Invalid literal name");
166 static uint json_parse_hex4(struct json_context *js)
169 for (int i=0; i<4; i++)
172 int c = json_get_char(js);
173 if (c >= '0' && c <= '9')
175 else if (c >= 'a' && c <= 'f')
177 else if (c >= 'A' && c <= 'F')
180 json_parse_error(js, "Invalid Unicode escape sequence");
185 static struct json_node *json_parse_string(struct json_context *js, int c)
187 char *p = mp_start_noalign(js->pool, 0);
189 c = json_get_char(js);
192 if (unlikely(c < 0x20))
194 if (c < 0 || c == 0x0d || c == 0x0a)
195 json_parse_error(js, "Unterminated string");
197 json_parse_error(js, "Invalid control character in string");
199 if (unlikely(c >= 0xd800 && c < 0xf900))
202 json_parse_error(js, "Invalid surrogate character in string");
204 json_parse_error(js, "Invalid private-use character in string");
206 if (unlikely(c >= 0xf0000))
209 json_parse_error(js, "Invalid non-Unicode character in string");
211 json_parse_error(js, "Invalid private-use character in string");
215 c = json_get_char(js);
239 uint x = json_parse_hex4(js);
241 json_parse_error(js, "Zero bytes in strings are not supported");
242 if (x >= 0xd800 && x < 0xf900)
246 // High surrogate: low surrogate must follow
248 if (json_get_char(js) == '\\' && json_get_char(js) == 'u')
249 y = json_parse_hex4(js);
250 if (!(y >= 0xdc00 && y < 0xe000))
251 json_parse_error(js, "Escaped high surrogate codepoint must be followed by a low surrogate codepoint");
252 c = 0x10000 + ((x & 0x03ff) << 10) | (y & 0x03ff);
254 json_parse_error(js, "Invalid escaped private-use character");
259 json_parse_error(js, "Invalid escaped surrogate codepoint");
262 json_parse_error(js, "Invalid escaped private-use character");
269 json_parse_error(js, "Invalid backslash sequence in string");
272 p = mp_append_utf8_32(js->pool, p, c);
273 c = json_get_char(js);
276 p = mp_end_string(js->pool, p);
277 return json_new_string_ref(js, p);
280 static struct json_node *json_read_token(struct json_context *js)
282 if (unlikely(js->in_eof))
283 return json_triv_token(js, JSON_EOF);
285 int c = js->next_char;
289 c = json_get_char(js);
291 while (c == 0x20 || c == 0x09 || c == 0x0a || c == 0x0d)
298 c = json_get_char(js);
301 return json_triv_token(js, JSON_EOF);
303 if (c >= '0' && c <= '9' || c == '-')
304 return json_parse_number(js, c);
306 if (c >= 'a' && c <= 'z')
307 return json_parse_name(js, c);
310 return json_parse_string(js, c);
315 return json_triv_token(js, JSON_BEGIN_ARRAY);
317 return json_triv_token(js, JSON_END_ARRAY);
319 return json_triv_token(js, JSON_BEGIN_OBJECT);
321 return json_triv_token(js, JSON_END_OBJECT);
323 return json_triv_token(js, JSON_NAME_SEP);
325 return json_triv_token(js, JSON_VALUE_SEP);
327 json_parse_error(js, "Numbers must start with a digit");
329 json_parse_error(js, "Misplaced byte-order mark, complain in Redmond");
331 json_parse_error(js, "Invalid character");
335 struct json_node *json_peek_token(struct json_context *js)
338 js->next_token = json_read_token(js);
339 return js->next_token;
342 struct json_node *json_next_token(struct json_context *js)
344 struct json_node *t = js->next_token;
347 js->next_token = NULL;
350 return json_read_token(js);
353 struct json_node *json_next_value(struct json_context *js)
355 struct json_node *t = json_next_token(js);
370 case JSON_BEGIN_ARRAY:
372 struct json_node *a = json_new_array(js);
373 if (json_peek_token(js)->type == JSON_END_ARRAY)
377 struct json_node *v = json_next_value(js);
379 json_parse_error(js, "Unterminated array");
380 json_array_append(a, v);
382 t = json_next_token(js);
383 if (t->type == JSON_END_ARRAY)
385 if (t->type != JSON_VALUE_SEP)
386 json_parse_error(js, "Comma or right bracket expected");
392 case JSON_BEGIN_OBJECT:
394 struct json_node *o = json_new_object(js);
395 if (json_peek_token(js)->type == JSON_END_OBJECT)
399 struct json_node *k = json_next_value(js);
401 json_parse_error(js, "Unterminated object");
402 if (k->type != JSON_STRING)
403 json_parse_error(js, "Object key must be a string");
405 t = json_next_token(js);
406 if (t->type != JSON_NAME_SEP)
407 json_parse_error(js, "Colon expected");
409 struct json_node *v = json_next_value(js);
411 json_parse_error(js, "Unterminated object");
412 if (json_object_get(o, k->string)) // FIXME: Optimize
413 json_parse_error(js, "Key already set");
414 json_object_set(o, k->string, v);
416 t = json_next_token(js);
417 if (t->type == JSON_END_OBJECT)
419 if (t->type != JSON_VALUE_SEP)
420 json_parse_error(js, "Comma expected");
425 // Misplaced characters
427 json_parse_error(js, "Misplaced end of array");
428 case JSON_END_OBJECT:
429 json_parse_error(js, "Misplaced end of object");
431 json_parse_error(js, "Misplaced colon");
433 json_parse_error(js, "Misplaced comma");
439 struct json_node *json_parse(struct json_context *js, struct fastbuf *fb)
441 json_set_input(js, fb);
443 struct json_node *n = json_next_value(js);
445 json_parse_error(js, "Empty input");
447 struct json_node *t = json_next_token(js);
448 if (t->type != JSON_EOF)
449 json_parse_error(js, "Only one top-level value allowed");