2 * UCW JSON Library -- Parser
4 * (c) 2015 Martin Mares <mj@ucw.cz>
6 * This software may be freely distributed and used according to the terms
7 * of the GNU Lesser General Public License.
11 #include <ucw/fastbuf.h>
12 #include <ucw/ff-unicode.h>
13 #include <ucw/trans.h>
14 #include <ucw/unicode.h>
15 #include <ucw-json/json.h>
// Prepare the parser context `js` for reading from the fastbuf `in`.
// Of the statements visible in this listing: any pending token is dropped
// (js->next_token = NULL) and the node shared by all trivial tokens is
// created once, the first time the context is given an input.
// NOTE(review): the statements that store `in` and reset the reading state
// are not visible in this excerpt -- confirm against the full file.
20 void json_set_input(struct json_context *js, struct fastbuf *in)
25 js->next_token = NULL;
// Allocate the reusable trivial-token node only if the context has none yet;
// json_triv_token() later overwrites its type on every use.
27 if (!js->trivial_token)
28 js->trivial_token = json_new_node(js, JSON_INVALID);
31 // FIXME: Report column as well as line?
// Report a parse error for context `js`: calls trans_throw() with the string
// "ucw.js.parse" (presumably the exception identifier -- confirm against the
// trans API), the context itself, and a message formatted as
// "<msg> at line <js->in_line>".
// Declared NONRET; presumably it never returns to the caller -- confirm
// against the macro's definition.
32 static void NONRET json_parse_error(struct json_context *js, const char *msg)
34 trans_throw("ucw.js.parse", js, "%s at line %u", msg, js->in_line);
// Fetch the next character of the input, read from js->in_fb with
// bget_utf8_32_repl().  The second argument (-2) is presumably the value
// handed back for a malformed sequence -- confirm against the fastbuf docs;
// the visible error branch reports "Malformed UTF-8 character".
// NOTE(review): the conditions guarding the error call and the return
// statements are not visible in this excerpt.
37 static int json_get_char(struct json_context *js)
39 int c = bget_utf8_32_repl(js->in_fb, -2);
43 json_parse_error(js, "Malformed UTF-8 character");
45 // FIXME: Reject alternative sequences
// Hand one character back to the reader.  In this file it is called by
// json_parse_number() and json_parse_name() with the first character that
// does not belong to the token just scanned.
// NOTE(review): the body is not visible in this excerpt; json_peek_token()
// starts from js->next_char, so presumably `c` is stored there -- confirm.
51 static void json_unget_char(struct json_context *js, int c)
// Return the context's single shared trivial-token node, retagged as `type`.
// The same node (js->trivial_token) is returned on every call, so the type
// seen through a previously returned pointer changes with the next call.
56 static struct json_node *json_triv_token(struct json_context *js, enum json_node_type type)
58 js->trivial_token->type = type;
59 return js->trivial_token;
62 static struct json_node *json_parse_number(struct json_context *js, int c)
65 char *p = mp_start_noalign(js->pool, 0);
70 p = mp_append_char(js->pool, p, c);
71 c = json_get_char(js);
72 if (!(c >= '0' && c <= '9'))
73 json_parse_error(js, "Malformed number: just minus");
79 // Leading zeroes are forbidden by RFC 7159
80 p = mp_append_char(js->pool, p, c);
81 c = json_get_char(js);
82 if (c >= '0' && c <= '9')
83 json_parse_error(js, "Malformed number: leading zero");
87 while (c >= '0' && c <= '9')
89 p = mp_append_char(js->pool, p, c);
90 c = json_get_char(js);
97 p = mp_append_char(js->pool, p, c);
98 if (!(c >= '0' && c <= '9'))
99 json_parse_error(js, "Malformed number: no digits after decimal point");
100 while (c >= '0' && c <= '9')
102 p = mp_append_char(js->pool, p, c);
103 c = json_get_char(js);
108 if (c == 'e' || c == 'E')
110 p = mp_append_char(js->pool, p, c);
111 c = json_get_char(js);
112 if (c == '+' || c == '-')
114 p = mp_append_char(js->pool, p, c);
115 c = json_get_char(js);
117 if (!(c >= '0' && c <= '9'))
118 json_parse_error(js, "Malformed number: empty exponent");
119 while (c >= '0' && c <= '9')
121 p = mp_append_char(js->pool, p, c);
122 c = json_get_char(js);
126 json_unget_char(js, c);
128 p = mp_end_string(js->pool, p);
130 double val = strtod(p, NULL);
132 json_parse_error(js, "Number out of range");
135 return json_new_number(js, val);
// Scan a literal name starting with the already-read character `c` and return
// the matching node: "null" -> json_new_null(), "false" -> json_new_bool(0),
// "true" -> json_new_bool(1).  Any other run of lowercase letters is reported
// as "Invalid literal name" through json_parse_error().
// NOTE(review): the declaration of `n` and the final return are not visible
// in this excerpt.
138 static struct json_node *json_parse_name(struct json_context *js, int c)
141 char *p = mp_start_noalign(js->pool, 0);
// Collect the maximal run of 'a'..'z'; the character that ends the run is
// handed back to the reader.
143 while (c >= 'a' && c <= 'z')
145 p = mp_append_char(js->pool, p, c);
146 c = json_get_char(js);
148 json_unget_char(js, c);
150 p = mp_end_string(js->pool, p);
// The comparison is exact and case-sensitive (strcmp on the collected text).
152 if (!strcmp(p, "null"))
153 n = json_new_null(js);
154 else if (!strcmp(p, "false"))
155 n = json_new_bool(js, 0);
156 else if (!strcmp(p, "true"))
157 n = json_new_bool(js, 1);
159 json_parse_error(js, "Invalid literal name");
// Read exactly four characters from the input and check that each one is a
// hexadecimal digit (0-9, a-f or A-F); anything else is reported as
// "Invalid Unicode escape sequence".
// NOTE(review): the lines that accumulate and return the value are not
// visible in this excerpt; presumably the result is the 16-bit number the
// four digits encode -- callers compare it against code-point ranges.
165 static uint json_parse_hex4(struct json_context *js)
168 for (int i=0; i<4; i++)
171 int c = json_get_char(js);
172 if (c >= '0' && c <= '9')
174 else if (c >= 'a' && c <= 'f')
176 else if (c >= 'A' && c <= 'F')
179 json_parse_error(js, "Invalid Unicode escape sequence");
// Scan a JSON string and return it as a node created by
// json_new_string_ref().  The incoming `c` is overwritten by the first
// json_get_char() call, so scanning starts with the character after it.
//
// Each character is validated, decoded if it is a backslash escape, and
// appended to a string built in js->pool as UTF-8 (mp_append_utf8_32()).
// Rejected input, always via json_parse_error(): end of input or a CR/LF
// inside the string, other control characters below 0x20, surrogate and
// private-use code points (raw or escaped), values above 0x10ffff, an escaped
// zero, unpaired escaped surrogates, and unknown backslash sequences.
// NOTE(review): the loop header, the switch over escape characters and
// several guarding conditions are not visible in this excerpt; the comments
// below describe only the visible statements.
184 static struct json_node *json_parse_string(struct json_context *js, int c)
186 char *p = mp_start_noalign(js->pool, 0);
188 c = json_get_char(js);
// Raw characters below 0x20: negative values and CR/LF mean the string was
// cut short, everything else is a forbidden control character.
191 if (unlikely(c < 0x20))
193 if (c < 0 || c == 0x0d || c == 0x0a)
194 json_parse_error(js, "Unterminated string");
196 json_parse_error(js, "Invalid control character in string");
// Raw surrogates and the private-use range that follows them.
198 if (unlikely(c >= 0xd800 && c < 0xf900))
201 json_parse_error(js, "Invalid surrogate character in string");
203 json_parse_error(js, "Invalid private-use character in string");
// NOTE(review): strict '>' lets U+F0000 itself through, although it looks
// like the first code point of the private-use planes -- confirm intent.
205 if (unlikely(c > 0xf0000))
208 json_parse_error(js, "Invalid non-Unicode character in string");
210 json_parse_error(js, "Invalid private-use character in string");
// Backslash escape: fetch the character that selects the escape.
214 c = json_get_char(js);
// Unicode escape: four hex digits, never zero.
238 uint x = json_parse_hex4(js);
240 json_parse_error(js, "Zero bytes in strings are not supported");
241 if (x >= 0xd800 && x < 0xf900)
245 // High surrogate: low surrogate must follow
247 if (json_get_char(js) == '\\' && json_get_char(js) == 'u')
248 y = json_parse_hex4(js);
249 if (!(y >= 0xdc00 && y < 0xe000))
250 json_parse_error(js, "Escaped high surrogate codepoint must be followed by a low surrogate codepoint");
// Fix: the pair must be combined by adding 0x10000, not OR-ing it in.
// (x & 0x03ff) << 10 already has bit 16 set for high surrogates from 0xD840
// up, so the OR silently dropped the offset: D840 DC00 decoded to U+10000
// instead of U+20000.
251 c = 0x10000 + (((x & 0x03ff) << 10) | (y & 0x03ff));
253 json_parse_error(js, "Invalid escaped private-use character");
258 json_parse_error(js, "Invalid escaped surrogate codepoint");
261 json_parse_error(js, "Invalid escaped private-use character");
266 json_parse_error(js, "Invalid backslash sequence in string");
// Store the (possibly decoded) character and move on to the next one.
269 p = mp_append_utf8_32(js->pool, p, c);
270 c = json_get_char(js);
273 p = mp_end_string(js->pool, p);
274 return json_new_string_ref(js, p);
// Read one token from the input and return it as a node.
// Once js->in_eof is set, every call returns a JSON_EOF token.  Otherwise the
// scan starts from js->next_char (or a freshly read character), skips
// space/TAB/LF/CR, and dispatches on the first significant character:
// digits and '-' go to json_parse_number(), lowercase letters to
// json_parse_name(), strings to json_parse_string(); the structural
// characters yield trivial tokens and anything else is reported as
// "Invalid character".
// Trivial tokens all share one node (see json_triv_token()), so such a
// result is only valid until the next trivial token is produced.
// NOTE(review): none of the visible return paths stores the token in
// js->next_token, while json_next_token() hands out js->next_token --
// confirm in the full file how a peeked token is kept for the next call.
277 struct json_node *json_peek_token(struct json_context *js)
279 if (unlikely(js->in_eof))
280 return json_triv_token(js, JSON_EOF);
282 int c = js->next_char;
286 c = json_get_char(js);
// Skip JSON whitespace: space, TAB, LF, CR.
288 while (c == 0x20 || c == 0x09 || c == 0x0a || c == 0x0d)
292 c = json_get_char(js);
295 return json_triv_token(js, JSON_EOF);
// && binds tighter than ||: this reads (digit) || (c == '-').
297 if (c >= '0' && c <= '9' || c == '-')
298 return json_parse_number(js, c);
300 if (c >= 'a' && c <= 'z')
301 return json_parse_name(js, c);
304 return json_parse_string(js, c);
309 return json_triv_token(js, JSON_BEGIN_ARRAY);
311 return json_triv_token(js, JSON_END_ARRAY);
313 return json_triv_token(js, JSON_BEGIN_OBJECT);
315 return json_triv_token(js, JSON_END_OBJECT);
317 return json_triv_token(js, JSON_NAME_SEP);
319 return json_triv_token(js, JSON_VALUE_SEP);
321 json_parse_error(js, "Invalid character");
// Take the pending token out of the context: the visible statements read
// js->next_token into `t` and then clear the slot, so the same token is not
// handed out twice.
// NOTE(review): the lines that fill js->next_token when it is empty, and the
// return statement, are not visible in this excerpt.
325 struct json_node *json_next_token(struct json_context *js)
329 struct json_node *t = js->next_token;
330 js->next_token = NULL;
// Read one complete JSON value, descending recursively into arrays and
// objects, and return its node.
// Arrays: elements are read with json_next_value() and appended with
// json_array_append(); after each element the next token must be either the
// end of the array or a comma.
// Objects: each member is a key that must be a JSON_STRING, a colon, and a
// value; a key already present in the object is rejected ("Key already set");
// after each member the next token must be the end of the object or a comma.
// Structural tokens appearing where a value is expected are reported as
// "Misplaced ..." errors.  All errors go through json_parse_error().
// NOTE(review): the switch header, the scalar cases, the loop constructs and
// the conditions guarding the "Unterminated ..." errors are not visible in
// this excerpt.
334 struct json_node *json_next_value(struct json_context *js)
336 struct json_node *t = json_next_token(js);
351 case JSON_BEGIN_ARRAY:
353 struct json_node *a = json_new_array(js);
// Look ahead without consuming, to detect an empty array.
354 if (json_peek_token(js)->type == JSON_END_ARRAY)
358 struct json_node *v = json_next_value(js);
360 json_parse_error(js, "Unterminated array");
361 json_array_append(a, v);
// After an element: either the array ends or a comma must follow.
363 t = json_next_token(js);
364 if (t->type == JSON_END_ARRAY)
366 if (t->type != JSON_VALUE_SEP)
367 json_parse_error(js, "Comma expected");
373 case JSON_BEGIN_OBJECT:
375 struct json_node *o = json_new_object(js);
// Look ahead without consuming, to detect an empty object.
376 if (json_peek_token(js)->type == JSON_END_OBJECT)
// The key is read as a full value and then required to be a string.
380 struct json_node *k = json_next_value(js);
382 json_parse_error(js, "Unterminated object");
383 if (k->type != JSON_STRING)
384 json_parse_error(js, "Object key must be a string");
386 t = json_next_token(js);
387 if (t->type != JSON_NAME_SEP)
388 json_parse_error(js, "Colon expected");
390 struct json_node *v = json_next_value(js);
392 json_parse_error(js, "Unterminated object");
// Duplicate keys are an error: look the key up before inserting it.
393 if (json_object_get(o, k->string)) // FIXME: Optimize
394 json_parse_error(js, "Key already set");
395 json_object_set(o, k->string, v);
// After a member: either the object ends or a comma must follow.
397 t = json_next_token(js);
398 if (t->type == JSON_END_OBJECT)
400 if (t->type != JSON_VALUE_SEP)
401 json_parse_error(js, "Comma expected");
406 // Misplaced characters
408 json_parse_error(js, "Misplaced end of array");
409 case JSON_END_OBJECT:
410 json_parse_error(js, "Misplaced end of object");
412 json_parse_error(js, "Misplaced colon");
414 json_parse_error(js, "Misplaced comma");
// Parse a whole document from `fb`: attach the input with json_set_input(),
// read a single value with json_next_value(), and require that the token
// after it is JSON_EOF -- otherwise "Only one top-level value allowed" is
// reported.  Input without any value is reported as "Empty input".
// All errors go through json_parse_error().
// NOTE(review): the condition guarding "Empty input" and the return statement
// are not visible in this excerpt; presumably the parsed value `n` is returned.
420 struct json_node *json_parse(struct json_context *js, struct fastbuf *fb)
422 json_set_input(js, fb);
424 struct json_node *n = json_next_value(js);
426 json_parse_error(js, "Empty input");
// Anything other than end of input after the first value is an error.
428 struct json_node *t = json_next_token(js);
429 if (t->type != JSON_EOF)
430 json_parse_error(js, "Only one top-level value allowed");