Directory: | ./ |
---|---|
File: | src/string.c |
Date: | 2021-09-04 00:13:15 |
Exec | Total | Coverage | |
---|---|---|---|
Lines: | 166 | 168 | 98.8% |
Branches: | 81 | 87 | 93.1% |
Line | Branch | Exec | Source |
---|---|---|---|
1 | /***************************************************************************/ /** | ||
2 | |||
3 | @file string.c | ||
4 | |||
5 | @author Stephen Brennan | ||
6 | |||
7 | @date Created Tuesday, 8 December 2015 | ||
8 | |||
9 | @brief Parsing strings. | ||
10 | |||
11 | @copyright Copyright (c) 2015, Stephen Brennan. Released under the | ||
12 | Revised BSD License. See LICENSE.txt for details. | ||
13 | |||
14 | This file contains the string parser. It is designed to be independent of | ||
15 | what you're parsing the string for. That is, it can be used in all these | ||
16 | situations: | ||
17 | |||
18 | - Recognizing string tokens when doing the initial tokenizing. | ||
19 | - Comparing string tokens against other strings. | ||
20 | - Loading string tokens into actual strings. | ||
21 | |||
22 | *******************************************************************************/ | ||
23 | |||
24 | #include <assert.h> | ||
25 | #include <stdbool.h> | ||
26 | |||
27 | #include "json_private.h" | ||
28 | #include "nosj.h" | ||
29 | |||
30 | /******************************************************************************* | ||
31 | |||
32 | Private Declarations for the Parser | ||
33 | |||
34 | *******************************************************************************/ | ||
35 | |||
36 | // forward declaration of struct parser_arg | ||
37 | struct parser_arg; | ||
38 | |||
39 | /** | ||
40 | @brief A function that is called for every parsed character. | ||
41 | @param a The parser arguments. Mostly for reference. | ||
42 | @param out The next parsed character in the string. | ||
43 | @param data Any data the setter might need. | ||
44 | */ | ||
45 | typedef void (*output_setter)(struct parser_arg *a, char out, void *data); | ||
46 | |||
47 | /** | ||
48 | @brief States of the parser. | ||
49 | */ | ||
50 | enum parser_st { START, INSTRING, ESCAPE, END, UESC0, UESC1, UESC2, UESC3 }; | ||
51 | |||
52 | /** | ||
53 | @brief All the variables the parser needs to do its job. | ||
54 | */ | ||
55 | struct parser_arg { | ||
56 | /** | ||
57 | @brief The state of the parser. | ||
58 | */ | ||
59 | enum parser_st state; | ||
60 | /** | ||
61 | @brief Input text. | ||
62 | */ | ||
63 | const char *text; | ||
64 | /** | ||
65 | @brief Current index of the text we're parsing. | ||
66 | */ | ||
67 | size_t textidx; | ||
68 | /** | ||
69 | @brief Function to call for every character we parse. | ||
70 | */ | ||
71 | output_setter setter; | ||
72 | /** | ||
73 | @brief Argument to go to the output setting function. | ||
74 | */ | ||
75 | void *setter_arg; | ||
76 | /** | ||
77 | @brief Index in which to put the next output character. | ||
78 | */ | ||
79 | size_t outidx; | ||
80 | /** | ||
81 | @brief Previously parsed unicode escape character. | ||
82 | |||
83 | This is used due to the fact that JSON only does 2-byte Unicode | ||
84 | escapes. In order to escape characters beyond the BMP (besides just | ||
85 | putting them in literally), you have to do the UTF-16 surrogate pair. | ||
86 | What a pain. | ||
87 | */ | ||
88 | wchar_t prev; | ||
89 | /** | ||
90 | @brief Unicode escape character we are currently parsing. | ||
91 | */ | ||
92 | wchar_t curr; | ||
93 | /** | ||
94 | @brief Any error we want to report. | ||
95 | */ | ||
96 | enum json_error error; | ||
97 | }; | ||
98 | |||
99 | /******************************************************************************* | ||
100 | |||
101 | Helper Functions | ||
102 | |||
103 | *******************************************************************************/ | ||
104 | |||
105 | /** | ||
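106 | @brief Map the character following a backslash to the character it denotes (e.g. 'n' to a newline). Returns '\0' if c is not a simple escape ('u' is handled by the caller). | ||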
107 | */ | ||
108 | 69 | static char json_escape(char c) | |
109 | { | ||
110 |
7/7✓ Branch 0 taken 14 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 3 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 3 times.
✓ Branch 6 taken 40 times.
|
69 | switch (c) { |
111 | 14 | case '\"': | |
112 | case '\\': | ||
113 | case '/': | ||
114 | 14 | return c; | |
115 | 3 | case 'b': | |
116 | 3 | return '\b'; | |
117 | 3 | case 'f': | |
118 | 3 | return '\f'; | |
119 | 3 | case 'n': | |
120 | 3 | return '\n'; | |
121 | 3 | case 'r': | |
122 | 3 | return '\r'; | |
123 | 3 | case 't': | |
124 | 3 | return '\t'; | |
125 | 40 | default: | |
126 | 40 | return '\0'; | |
127 | } | ||
128 | } | ||
129 | |||
130 | /** | ||
131 | @brief Return the numeric value (0-15) of the hexadecimal digit c, or 0xFF if c is not a valid hexadecimal digit for JSON. | ||
132 | |||
133 | Although there is an iswxdigit function in the C standard library, it may | ||
134 | accept hexadecimal digits other than just 0-9, a-f, A-F (depending on locale). | ||
135 | The JSON spec explicitly states that these are the only hex characters it | ||
136 | accepts, so I've written my own to explicitly cover only those. | ||
137 | */ | ||
138 | 214 | static unsigned char json_xdigit(char c) | |
139 | { | ||
140 |
4/4✓ Branch 0 taken 213 times.
✓ Branch 1 taken 1 times.
✓ Branch 2 taken 130 times.
✓ Branch 3 taken 83 times.
|
214 | if ('0' <= c && c <= '9') { |
141 | 130 | return (unsigned char)(c - '0'); | |
142 |
3/4✓ Branch 0 taken 20 times.
✓ Branch 1 taken 64 times.
✓ Branch 2 taken 20 times.
✗ Branch 3 not taken.
|
84 | } else if ('a' <= c && c <= 'f') { |
143 | 20 | return (unsigned char)(10 + c - 'a'); | |
144 |
4/4✓ Branch 0 taken 63 times.
✓ Branch 1 taken 1 times.
✓ Branch 2 taken 62 times.
✓ Branch 3 taken 1 times.
|
64 | } else if ('A' <= c && c <= 'F') { |
145 | 62 | return (unsigned char)(10 + c - 'A'); | |
146 | } else { | ||
147 | 2 | return 0xFF; | |
148 | } | ||
149 | } | ||
150 | |||
151 | /** | ||
152 | @brief Register the output character. | ||
153 | @param a Parser data. | ||
154 | @param out The output character. | ||
155 | @param from_uesc Whether this came from a unicode escape | ||
156 | |||
157 | The nosj approach to JSON is: all data is UTF-8. Unfortunately, JSON can | ||
158 | contain Unicode escape sequences, which we have to manually translate into | ||
159 | valid UTF-8 here. However, if we translated *all* bytes into UTF-8 naively, | ||
160 | then we'd end up botching valid UTF-8 multi-byte sequences which already | ||
161 | exist. So, when from_uesc is true, we treat the output as a potential | ||
162 | multibyte sequence to translate to UTF-8. When from_uesc is false, we treat | ||
163 | it as a byte. | ||
164 | */ | ||
165 | 5842 | static void set_output(struct parser_arg *a, wchar_t out, bool from_uesc) | |
166 | { | ||
167 | // a still-pending (unpaired) surrogate, or an out-of-range code point, is an error | ||
168 | char bytes[4]; | ||
169 | 5842 | int nbytes = 0; | |
170 | int i; | ||
171 |
3/4✓ Branch 0 taken 5841 times.
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 5841 times.
|
5842 | if (a->prev != 0 || out > 0x1FFFFF) { |
172 | 1 | a->state = END; | |
173 | 1 | a->error = JSONERR_INVALID_SURROGATE; | |
174 | 1 | return; | |
175 | } | ||
176 |
2/2✓ Branch 0 taken 5824 times.
✓ Branch 1 taken 17 times.
|
5841 | if (!from_uesc) { |
177 | 5824 | bytes[0] = out & 0xFF; | |
178 | 5824 | nbytes = 1; | |
179 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 13 times.
|
17 | } else if (out > 0xFFFF) { |
180 | 4 | bytes[0] = ((out >> 18) & 0x7) | 0xF0; | |
181 | 4 | nbytes = 4; | |
182 |
2/2✓ Branch 0 taken 7 times.
✓ Branch 1 taken 6 times.
|
13 | } else if (out > 0x7FF) { |
183 | 7 | bytes[0] = ((out >> 12) & 0xF) | 0xE0; | |
184 | 7 | nbytes = 3; | |
185 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 4 times.
|
6 | } else if (out > 0x7F) { |
186 | 2 | bytes[0] = ((out >> 6) & 0x1F) | 0xC0; | |
187 | 2 | nbytes = 2; | |
188 | } else { | ||
189 | 4 | bytes[0] = out & 0x7F; | |
190 | 4 | nbytes = 1; | |
191 | } | ||
192 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 5841 times.
|
5869 | for (i = nbytes - 1; i > 0; i--) { |
193 | 28 | bytes[i] = (out & 0x3F) | 0x80; | |
194 | 28 | out >>= 6; | |
195 | } | ||
196 |
2/2✓ Branch 0 taken 5869 times.
✓ Branch 1 taken 5841 times.
|
11710 | for (i = 0; i < nbytes; i++) { |
197 |
2/2✓ Branch 0 taken 2095 times.
✓ Branch 1 taken 3774 times.
|
5869 | if (a->setter) |
198 | 2095 | a->setter(a, bytes[i], a->setter_arg); | |
199 | 5869 | a->outidx++; | |
200 | } | ||
201 | } | ||
202 | |||
203 | 1080 | static void set_state(struct parser_arg *a, enum parser_st state) | |
204 | { | ||
205 |
2/2✓ Branch 0 taken 1079 times.
✓ Branch 1 taken 1 times.
|
1080 | if (a->state != END) { |
206 | 1079 | a->state = state; | |
207 | } | ||
208 | 1080 | } | |
209 | |||
210 | /******************************************************************************* | ||
211 | |||
212 | Parser Functions | ||
213 | |||
214 | *******************************************************************************/ | ||
215 | |||
216 | /** | ||
217 | @brief Called by the parser when it is in the START state. | ||
218 | @param a Parser data. | ||
219 | @param wc Character. | ||
220 | */ | ||
221 | 469 | static void json_string_start(struct parser_arg *a, char wc) | |
222 | { | ||
223 |
2/2✓ Branch 0 taken 462 times.
✓ Branch 1 taken 7 times.
|
469 | if (wc == '"') { |
224 | 462 | set_state(a, INSTRING); | |
225 | } else { | ||
226 | 7 | set_state(a, END); | |
227 | 7 | a->error = JSONERR_UNEXPECTED_TOKEN; | |
228 | 7 | a->textidx--; | |
229 | } | ||
230 | 469 | } | |
231 | |||
232 | /** | ||
233 | @brief Called by the parser when it is in the INSTRING state. | ||
234 | @param a Parser data. | ||
235 | @param wc Character. | ||
236 | */ | ||
237 | 6310 | static void json_string_instring(struct parser_arg *a, char wc) | |
238 | { | ||
239 |
2/2✓ Branch 0 taken 69 times.
✓ Branch 1 taken 6241 times.
|
6310 | if (wc == '\\') { |
240 | 69 | set_state(a, ESCAPE); | |
241 |
2/2✓ Branch 0 taken 444 times.
✓ Branch 1 taken 5797 times.
|
6241 | } else if (wc == '"') { |
242 | 444 | set_state(a, END); | |
243 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 5796 times.
|
5797 | } else if (wc == '\0') { |
244 | 1 | set_state(a, END); | |
245 | 1 | a->error = JSONERR_PREMATURE_EOF; | |
246 | 1 | a->textidx--; | |
247 | } else { | ||
248 | 5796 | set_output(a, wc, false); | |
249 | } | ||
250 | 6310 | } | |
251 | |||
252 | /** | ||
253 | @brief Called by the parser when it is in the ESCAPE state. | ||
254 | @param a Parser data. | ||
255 | @param wc Character. | ||
256 | */ | ||
257 | 69 | static void json_string_escape(struct parser_arg *a, char wc) | |
258 | { | ||
259 | 69 | char esc = json_escape(wc); | |
260 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 68 times.
|
69 | if (wc == '\0') { |
261 | 1 | set_state(a, END); | |
262 | 1 | a->error = JSONERR_PREMATURE_EOF; | |
263 | 1 | a->textidx--; | |
264 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 40 times.
|
68 | } else if (wc == 'u') { |
265 | 28 | set_state(a, UESC0); | |
266 |
2/2✓ Branch 0 taken 29 times.
✓ Branch 1 taken 11 times.
|
40 | } else if (esc != '\0') { |
267 | 29 | set_state(a, INSTRING); | |
268 | 29 | set_output(a, esc, false); | |
269 | } else { | ||
270 | 11 | set_state(a, END); | |
271 | 11 | a->error = JSONERR_UNEXPECTED_TOKEN; | |
272 | 11 | a->textidx--; | |
273 | } | ||
274 | 69 | } | |
275 | |||
276 | /** | ||
277 | @brief Called by the parser when it is in one of the UESC states. | ||
278 | @param a Parser data. | ||
279 | @param wc Character. | ||
280 | */ | ||
281 | 109 | static void json_string_uesc(struct parser_arg *a, char wc) | |
282 | { | ||
283 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 108 times.
|
109 | if (wc == '\0') { |
284 | 1 | set_state(a, END); | |
285 | 1 | a->error = JSONERR_PREMATURE_EOF; | |
286 | 1 | a->textidx--; | |
287 |
2/2✓ Branch 1 taken 2 times.
✓ Branch 2 taken 106 times.
|
108 | } else if (json_xdigit(wc) == 0xFF) { |
288 | 2 | set_state(a, END); | |
289 | 2 | a->error = JSONERR_UNEXPECTED_TOKEN; | |
290 | 2 | a->textidx--; | |
291 | } else { | ||
292 | 106 | a->curr = a->curr << 4; | |
293 | 106 | a->curr |= json_xdigit(wc); | |
294 |
2/2✓ Branch 0 taken 81 times.
✓ Branch 1 taken 25 times.
|
106 | if (a->state < UESC3) { |
295 | // continue reading all the input | ||
296 | 81 | a->state += 1; | |
297 | } else { | ||
298 | // time to "publish" our unicode escape | ||
299 |
2/2✓ Branch 0 taken 20 times.
✓ Branch 1 taken 5 times.
|
25 | if (a->prev == 0) { |
300 | // if there was no "prev", that means this might | ||
301 | // be the start of a surrogate pair. Check for | ||
302 | // that! | ||
303 |
3/4✓ Branch 0 taken 7 times.
✓ Branch 1 taken 13 times.
✓ Branch 2 taken 7 times.
✗ Branch 3 not taken.
|
20 | if (0xD800 <= a->curr && a->curr <= 0xDFFF) { |
304 | // yup, it's a surrogate! hold it until its partner arrives | ||
305 | 7 | a->prev = a->curr; | |
306 | } else { | ||
307 | // nope, keep going | ||
308 | 13 | set_output(a, a->curr, true); | |
309 | } | ||
310 | } else { | ||
311 | // there was a previous starting surrogate | ||
312 |
3/4✓ Branch 0 taken 4 times.
✓ Branch 1 taken 1 times.
✓ Branch 2 taken 4 times.
✗ Branch 3 not taken.
|
5 | if (0xD800 <= a->curr && a->curr <= 0xDFFF) { |
313 | // and this is also a surrogate | ||
314 | 4 | a->curr &= 0x03FF; // clear upper bits; | |
315 | // keep lower 10 | ||
316 | 4 | a->curr |= (a->prev & 0x03FF) << 10; | |
317 | 4 | a->curr += | |
318 | 0x10000; // surrogate pairs encode | ||
319 | // (code point - 0x10000) | ||
320 | 4 | a->prev = 0; | |
321 | 4 | set_output(a, a->curr, true); | |
322 | } else { | ||
323 | // not a legal surrogate to match | ||
324 | // previous surrogate. | ||
325 | 1 | a->state = END; | |
326 | 1 | a->error = JSONERR_INVALID_SURROGATE; | |
327 | } | ||
328 | } | ||
329 | 25 | set_state(a, INSTRING); | |
330 | 25 | a->curr = 0; | |
331 | } | ||
332 | } | ||
333 | 109 | } | |
334 | |||
335 | /** | ||
336 | @brief Parses JSON strings, in a very generic manner. | ||
337 | @param text Input text. | ||
338 | @param idx Starting index of the string. | ||
339 | @param setter Function to call with each character. | ||
340 | @param setarg Argument to give to the setter function. | ||
341 | */ | ||
342 | 469 | static struct parser_arg json_string(const char *text, size_t idx, | |
343 | output_setter setter, void *setarg) | ||
344 | { | ||
345 | char wc; | ||
346 | 469 | struct parser_arg a = { .state = START, | |
347 | .text = text, | ||
348 | .textidx = idx, | ||
349 | .outidx = 0, | ||
350 | .setter = setter, | ||
351 | .setter_arg = setarg, | ||
352 | .prev = 0, | ||
353 | .curr = 0, | ||
354 | .error = JSONERR_NO_ERROR }; | ||
355 | |||
356 |
2/2✓ Branch 0 taken 6957 times.
✓ Branch 1 taken 469 times.
|
7426 | while (a.state != END) { |
357 | 6957 | wc = a.text[a.textidx]; | |
358 |
4/6✓ Branch 0 taken 469 times.
✓ Branch 1 taken 6310 times.
✓ Branch 2 taken 69 times.
✓ Branch 3 taken 109 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
|
6957 | switch (a.state) { |
359 | 469 | case START: | |
360 | 469 | json_string_start(&a, wc); | |
361 | 469 | break; | |
362 | 6310 | case INSTRING: | |
363 | 6310 | json_string_instring(&a, wc); | |
364 | 6310 | break; | |
365 | 69 | case ESCAPE: | |
366 | 69 | json_string_escape(&a, wc); | |
367 | 69 | break; | |
368 | 109 | case UESC0: | |
369 | case UESC1: | ||
370 | case UESC2: | ||
371 | case UESC3: | ||
372 | 109 | json_string_uesc(&a, wc); | |
373 | 109 | break; | |
374 | ✗ | case END: | |
375 | // never happens | ||
376 | ✗ | assert(false); | |
377 | break; | ||
378 | } | ||
379 | 6957 | a.textidx++; | |
380 | } | ||
381 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 466 times.
|
469 | if (a.prev != 0) { |
382 | 3 | a.error = JSONERR_INVALID_SURROGATE; | |
383 | } | ||
384 | 469 | return a; | |
385 | } | ||
386 | |||
387 | /******************************************************************************* | ||
388 | |||
389 | Application-Specific Parsers | ||
390 | |||
391 | *******************************************************************************/ | ||
392 | |||
393 | /** | ||
394 | @brief Parse a string literal. | ||
395 | @param text The text we're parsing. | ||
396 | @param arr The token buffer. | ||
397 | @param maxtoken The length of the token buffer. | ||
398 | @param p The parser state. | ||
399 | @returns Parser state after parsing the string. | ||
400 | */ | ||
401 | 276 | struct json_parser json_parse_string(char *text, struct json_token *arr, | |
402 | size_t maxtoken, struct json_parser p) | ||
403 | { | ||
404 | struct json_token tok; | ||
405 | struct parser_arg a; | ||
406 | |||
407 | 276 | tok.type = JSON_STRING; | |
408 | 276 | tok.start = p.textidx; | |
409 | |||
410 | 276 | a = json_string(text, p.textidx, NULL, NULL); | |
411 | |||
412 | 276 | tok.end = a.textidx - 1; | |
413 | 276 | tok.child = 0; | |
414 | 276 | tok.next = 0; | |
415 | 276 | tok.length = a.outidx; | |
416 | 276 | json_settoken(arr, tok, p, maxtoken); | |
417 | |||
418 | 276 | p.error = a.error; | |
419 | 276 | p.tokenidx++; | |
420 | 276 | p.textidx = a.textidx; | |
421 | 276 | return p; | |
422 | } | ||
423 | |||
424 | /** | ||
425 | @brief Argument passed to setter when we are doing json_string_match(). | ||
426 | */ | ||
427 | struct string_compare_arg { | ||
428 | /** | ||
429 | @brief String we're comparing to. | ||
430 | */ | ||
431 | const char *other; | ||
432 | /** | ||
433 | @brief Whether or not the string has evaluated to equal so far. | ||
434 | */ | ||
435 | bool equal; | ||
436 | }; | ||
437 | |||
438 | /** | ||
439 | @brief This is the "setter" function for json_string_match(). | ||
440 | @param a Parser arguments. | ||
441 | @param wc Character to set. | ||
442 | @param arg The struct string_compare_arg. | ||
443 | |||
444 | This function just compares each output character to the corresponding | ||
445 | character in the other string. It stores the result in the arg, which will | ||
446 | be examined after the fact. | ||
447 | */ | ||
448 | 2049 | static void json_string_comparator(struct parser_arg *a, char wc, void *arg) | |
449 | { | ||
450 | 2049 | struct string_compare_arg *ca = arg; | |
451 | // we are depending on short-circuit evaluation here :) | ||
452 |
4/4✓ Branch 0 taken 359 times.
✓ Branch 1 taken 1690 times.
✓ Branch 2 taken 204 times.
✓ Branch 3 taken 155 times.
|
2049 | ca->equal = ca->equal && (wc == ca->other[a->outidx]); |
453 | 2049 | } | |
454 | |||
455 | 187 | bool json_string_match(const char *json, const struct json_token *tokens, | |
456 | size_t index, const char *other) | ||
457 | { | ||
458 | 187 | struct string_compare_arg ca = { | |
459 | .other = other, | ||
460 | .equal = true, | ||
461 | }; | ||
462 | 187 | struct parser_arg pa = json_string(json, tokens[index].start, | |
463 | &json_string_comparator, &ca); | ||
464 | |||
465 | // They are equal if every previous character matches, and the next | ||
466 | // character in the other string is the null character, signifying the | ||
467 | // end. | ||
468 |
4/4✓ Branch 0 taken 32 times.
✓ Branch 1 taken 155 times.
✓ Branch 2 taken 31 times.
✓ Branch 3 taken 1 times.
|
187 | return ca.equal && (other[pa.outidx] == '\0'); |
469 | } | ||
470 | |||
471 | /** | ||
472 | @brief This is the "setter" function for json_string_load(). | ||
473 | @param a Parser arguments. | ||
474 | @param wc Character to set. | ||
475 | @param arg The destination buffer (a char *). | ||
476 | |||
477 | This function just stores each output character in the destination buffer, | ||
478 | at the parser's current output index. The caller adds the terminating null | ||
479 | character after the fact. | ||
480 | */ | ||
481 | 46 | static void json_string_loader(struct parser_arg *a, char wc, void *arg) | |
482 | { | ||
483 | 46 | char *str = arg; | |
484 | // outidx has not been advanced yet, so it indexes the byte being stored | ||
485 | 46 | str[a->outidx] = wc; | |
486 | 46 | } | |
487 | |||
488 | 6 | void json_string_load(const char *json, const struct json_token *tokens, | |
489 | size_t index, char *buffer) | ||
490 | { | ||
491 | 6 | struct parser_arg pa = json_string(json, tokens[index].start, | |
492 | &json_string_loader, buffer); | ||
493 | |||
494 | 6 | buffer[pa.outidx] = '\0'; | |
495 | 6 | } | |
496 |