GCC Code Coverage Report


Directory: ./
File: src/string.c
Date: 2021-09-04 00:13:15
Exec Total Coverage
Lines: 166 168 98.8%
Branches: 81 87 93.1%

Line Branch Exec Source
1 /***************************************************************************/ /**
2
3 @file string.c
4
5 @author Stephen Brennan
6
7 @date Created Tuesday, 8 December 2015
8
9 @brief Parsing strings.
10
11 @copyright Copyright (c) 2015, Stephen Brennan. Released under the
12 Revised BSD License. See LICENSE.txt for details.
13
14 This file contains the string parser. It is designed to be independent of
15 what you're parsing the string for. That is, it can be used in all these
16 situations:
17
18 - Recognizing string tokens when doing the initial tokenizing.
19 - Comparing string tokens against other strings.
20 - Loading string tokens into actual strings.
21
22 *******************************************************************************/
23
24 #include <assert.h>
25 #include <stdbool.h>
26
27 #include "json_private.h"
28 #include "nosj.h"
29
30 /*******************************************************************************
31
32 Private Declarations for the Parser
33
34 *******************************************************************************/
35
36 // forward declaration of struct parser_arg
37 struct parser_arg;
38
39 /**
40 @brief A function that is called for every parsed character.
41 @param a The parser arguments. Mostly for reference.
42 @param out The next parsed character in the string.
43 @param data Any data the setter might need.
44 */
45 typedef void (*output_setter)(struct parser_arg *a, char out, void *data);
46
47 /**
48 @brief States of the parser.
49 */
50 enum parser_st { START, INSTRING, ESCAPE, END, UESC0, UESC1, UESC2, UESC3 };
51
52 /**
53 @brief All the variables the parser needs to do its job.
54 */
55 struct parser_arg {
56 /**
57 @brief The state of the parser.
58 */
59 enum parser_st state;
60 /**
61 @brief Input text.
62 */
63 const char *text;
64 /**
65 @brief Current index of the text we're parsing.
66 */
67 size_t textidx;
68 /**
69 @brief Function to call for every character we parse.
70 */
71 output_setter setter;
72 /**
73 @brief Argument to go to the output setting function.
74 */
75 void *setter_arg;
76 /**
77 @brief Index in which to put the next output character.
78 */
79 size_t outidx;
80 /**
81 @brief Previously parsed unicode escape character.
82
83 This is used due to the fact that JSON only does 2-byte Unicode
84 escapes. In order to escape characters beyond the BMP (besides just
85 putting them in literally), you have to do the UTF-16 surrogate pair.
86 What a pain.
87 */
88 wchar_t prev;
89 /**
90 @brief Unicode escape character we are currently parsing.
91 */
92 wchar_t curr;
93 /**
94 @brief Any error we want to report.
95 */
96 enum json_error error;
97 };
98
99 /*******************************************************************************
100
101 Helper Functions
102
103 *******************************************************************************/
104
105 /**
106 @brief Return the character that escape code c stands for, or '\0' if c is not a valid character to come after a backslash.
107 */
108 69 static char json_escape(char c)
109 {
110
7/7
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 3 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 3 times.
✓ Branch 6 taken 40 times.
69 switch (c) {
111 14 case '\"':
112 case '\\':
113 case '/':
114 14 return c;
115 3 case 'b':
116 3 return '\b';
117 3 case 'f':
118 3 return '\f';
119 3 case 'n':
120 3 return '\n';
121 3 case 'r':
122 3 return '\r';
123 3 case 't':
124 3 return '\t';
125 40 default:
126 40 return '\0';
127 }
128 }
129
130 /**
131 @brief Return the value (0-15) of c if it is a valid hexadecimal digit for JSON, or 0xFF if it is not.
132
133 Although there is an iswxdigit function in the C standard library, it allows
134 for hexadecimal digits other than just 0-9, a-f, A-F (depending on locale).
135 The JSON spec explicitly states that these are the only hex characters it
136 accepts, so I've written my own to explicitly cover only those.
137 */
138 214 static unsigned char json_xdigit(char c)
139 {
140
4/4
✓ Branch 0 taken 213 times.
✓ Branch 1 taken 1 times.
✓ Branch 2 taken 130 times.
✓ Branch 3 taken 83 times.
214 if ('0' <= c && c <= '9') {
141 130 return (unsigned char)(c - '0');
142
3/4
✓ Branch 0 taken 20 times.
✓ Branch 1 taken 64 times.
✓ Branch 2 taken 20 times.
✗ Branch 3 not taken.
84 } else if ('a' <= c && c <= 'f') {
143 20 return (unsigned char)(10 + c - 'a');
144
4/4
✓ Branch 0 taken 63 times.
✓ Branch 1 taken 1 times.
✓ Branch 2 taken 62 times.
✓ Branch 3 taken 1 times.
64 } else if ('A' <= c && c <= 'F') {
145 62 return (unsigned char)(10 + c - 'A');
146 } else {
147 2 return 0xFF;
148 }
149 }
150
151 /**
152 @brief Register the output character.
153 @param a Parser data.
154 @param out The output character.
155 @param from_uesc Whether this came from a unicode escape
156
157 The nosj approach to JSON is: all data is UTF-8. Unfortunately, JSON can
158 contain Unicode escape sequences, which we have to manually translate into
159 valid UTF-8 here. However, if we translated *all* bytes into UTF-8 naively,
160 then we'd end up botching valid UTF-8 multi-byte sequences which already
161 exist. So, when from_uesc is true, we treat the output as a potential
162 multibyte sequence to translate to UTF-8. When from_uesc is false, we treat
163 it as a byte.
164 */
165 5842 static void set_output(struct parser_arg *a, wchar_t out, bool from_uesc)
166 {
167 // don't forget to flush the "buffered" potential surrogate pair
168 char bytes[4];
169 5842 int nbytes = 0;
170 int i;
171
3/4
✓ Branch 0 taken 5841 times.
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 5841 times.
5842 if (a->prev != 0 || out > 0x1FFFFF) {
172 1 a->state = END;
173 1 a->error = JSONERR_INVALID_SURROGATE;
174 1 return;
175 }
176
2/2
✓ Branch 0 taken 5824 times.
✓ Branch 1 taken 17 times.
5841 if (!from_uesc) {
177 5824 bytes[0] = out & 0xFF;
178 5824 nbytes = 1;
179
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 13 times.
17 } else if (out > 0xFFFF) {
180 4 bytes[0] = ((out >> 18) & 0x7) | 0xF0;
181 4 nbytes = 4;
182
2/2
✓ Branch 0 taken 7 times.
✓ Branch 1 taken 6 times.
13 } else if (out > 0x7FF) {
183 7 bytes[0] = ((out >> 12) & 0xF) | 0xE0;
184 7 nbytes = 3;
185
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 4 times.
6 } else if (out > 0x7F) {
186 2 bytes[0] = ((out >> 6) & 0x1F) | 0xC0;
187 2 nbytes = 2;
188 } else {
189 4 bytes[0] = out & 0x7F;
190 4 nbytes = 1;
191 }
192
2/2
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 5841 times.
5869 for (i = nbytes - 1; i > 0; i--) {
193 28 bytes[i] = (out & 0x3F) | 0x80;
194 28 out >>= 6;
195 }
196
2/2
✓ Branch 0 taken 5869 times.
✓ Branch 1 taken 5841 times.
11710 for (i = 0; i < nbytes; i++) {
197
2/2
✓ Branch 0 taken 2095 times.
✓ Branch 1 taken 3774 times.
5869 if (a->setter)
198 2095 a->setter(a, bytes[i], a->setter_arg);
199 5869 a->outidx++;
200 }
201 }
202
203 1080 static void set_state(struct parser_arg *a, enum parser_st state)
204 {
205
2/2
✓ Branch 0 taken 1079 times.
✓ Branch 1 taken 1 times.
1080 if (a->state != END) {
206 1079 a->state = state;
207 }
208 1080 }
209
210 /*******************************************************************************
211
212 Parser Functions
213
214 *******************************************************************************/
215
216 /**
217 @brief Called by the parser when it is in the START state.
218 @param a Parser data.
219 @param wc Character.
220 */
221 469 static void json_string_start(struct parser_arg *a, char wc)
222 {
223
2/2
✓ Branch 0 taken 462 times.
✓ Branch 1 taken 7 times.
469 if (wc == '"') {
224 462 set_state(a, INSTRING);
225 } else {
226 7 set_state(a, END);
227 7 a->error = JSONERR_UNEXPECTED_TOKEN;
228 7 a->textidx--;
229 }
230 469 }
231
232 /**
233 @brief Called by the parser when it is in the INSTRING state.
234 @param a Parser data.
235 @param wc Character.
236 */
237 6310 static void json_string_instring(struct parser_arg *a, char wc)
238 {
239
2/2
✓ Branch 0 taken 69 times.
✓ Branch 1 taken 6241 times.
6310 if (wc == '\\') {
240 69 set_state(a, ESCAPE);
241
2/2
✓ Branch 0 taken 444 times.
✓ Branch 1 taken 5797 times.
6241 } else if (wc == '"') {
242 444 set_state(a, END);
243
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 5796 times.
5797 } else if (wc == '\0') {
244 1 set_state(a, END);
245 1 a->error = JSONERR_PREMATURE_EOF;
246 1 a->textidx--;
247 } else {
248 5796 set_output(a, wc, false);
249 }
250 6310 }
251
252 /**
253 @brief Called by the parser when it is in the ESCAPE state.
254 @param a Parser data.
255 @param wc Character.
256 */
257 69 static void json_string_escape(struct parser_arg *a, char wc)
258 {
259 69 char esc = json_escape(wc);
260
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 68 times.
69 if (wc == '\0') {
261 1 set_state(a, END);
262 1 a->error = JSONERR_PREMATURE_EOF;
263 1 a->textidx--;
264
2/2
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 40 times.
68 } else if (wc == 'u') {
265 28 set_state(a, UESC0);
266
2/2
✓ Branch 0 taken 29 times.
✓ Branch 1 taken 11 times.
40 } else if (esc != '\0') {
267 29 set_state(a, INSTRING);
268 29 set_output(a, esc, false);
269 } else {
270 11 set_state(a, END);
271 11 a->error = JSONERR_UNEXPECTED_TOKEN;
272 11 a->textidx--;
273 }
274 69 }
275
276 /**
277 @brief Called by the parser when it is in one of the UESC states.
278 @param a Parser data.
279 @param wc Character.
280 */
281 109 static void json_string_uesc(struct parser_arg *a, char wc)
282 {
283
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 108 times.
109 if (wc == '\0') {
284 1 set_state(a, END);
285 1 a->error = JSONERR_PREMATURE_EOF;
286 1 a->textidx--;
287
2/2
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 106 times.
108 } else if (json_xdigit(wc) == 0xFF) {
288 2 set_state(a, END);
289 2 a->error = JSONERR_UNEXPECTED_TOKEN;
290 2 a->textidx--;
291 } else {
292 106 a->curr = a->curr << 4;
293 106 a->curr |= json_xdigit(wc);
294
2/2
✓ Branch 0 taken 81 times.
✓ Branch 1 taken 25 times.
106 if (a->state < UESC3) {
295 // continue reading all the input
296 81 a->state += 1;
297 } else {
298 // time to "publish" our unicode escape
299
2/2
✓ Branch 0 taken 20 times.
✓ Branch 1 taken 5 times.
25 if (a->prev == 0) {
300 // if there was no "prev", that means this might
301 // be the start of a surrogate pair. Check for
302 // that!
303
3/4
✓ Branch 0 taken 7 times.
✓ Branch 1 taken 13 times.
✓ Branch 2 taken 7 times.
✗ Branch 3 not taken.
20 if (0xD800 <= a->curr && a->curr <= 0xDFFF) {
304 // yup, it's a surrogate pair!
305 7 a->prev = a->curr;
306 } else {
307 // nope, keep going
308 13 set_output(a, a->curr, true);
309 }
310 } else {
311 // there was a previous starting surrogate
312
3/4
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 1 times.
✓ Branch 2 taken 4 times.
✗ Branch 3 not taken.
5 if (0xD800 <= a->curr && a->curr <= 0xDFFF) {
313 // and this is also a surrogate
314 4 a->curr &= 0x03FF; // clear upper bits;
315 // keep lower 10
316 4 a->curr |= (a->prev & 0x03FF) << 10;
317 4 a->curr +=
318 0x10000; // apparently this
319 // needs to happen (?)
320 4 a->prev = 0;
321 4 set_output(a, a->curr, true);
322 } else {
323 // not a legal surrogate to match
324 // previous surrogate.
325 1 a->state = END;
326 1 a->error = JSONERR_INVALID_SURROGATE;
327 }
328 }
329 25 set_state(a, INSTRING);
330 25 a->curr = 0;
331 }
332 }
333 109 }
334
335 /**
336 @brief Parses JSON strings, in a very generic manner.
337 @param text Input text.
338 @param idx Starting index of the string.
339 @param setter Function to call with each character.
340 @param setarg Argument to give to the setter function.
341 */
342 469 static struct parser_arg json_string(const char *text, size_t idx,
343 output_setter setter, void *setarg)
344 {
345 char wc;
346 469 struct parser_arg a = { .state = START,
347 .text = text,
348 .textidx = idx,
349 .outidx = 0,
350 .setter = setter,
351 .setter_arg = setarg,
352 .prev = 0,
353 .curr = 0,
354 .error = JSONERR_NO_ERROR };
355
356
2/2
✓ Branch 0 taken 6957 times.
✓ Branch 1 taken 469 times.
7426 while (a.state != END) {
357 6957 wc = a.text[a.textidx];
358
4/6
✓ Branch 0 taken 469 times.
✓ Branch 1 taken 6310 times.
✓ Branch 2 taken 69 times.
✓ Branch 3 taken 109 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
6957 switch (a.state) {
359 469 case START:
360 469 json_string_start(&a, wc);
361 469 break;
362 6310 case INSTRING:
363 6310 json_string_instring(&a, wc);
364 6310 break;
365 69 case ESCAPE:
366 69 json_string_escape(&a, wc);
367 69 break;
368 109 case UESC0:
369 case UESC1:
370 case UESC2:
371 case UESC3:
372 109 json_string_uesc(&a, wc);
373 109 break;
374 case END:
375 // never happens
376 assert(false);
377 break;
378 }
379 6957 a.textidx++;
380 }
381
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 466 times.
469 if (a.prev != 0) {
382 3 a.error = JSONERR_INVALID_SURROGATE;
383 }
384 469 return a;
385 }
386
387 /*******************************************************************************
388
389 Application-Specific Parsers
390
391 *******************************************************************************/
392
393 /**
394 @brief Parse a string literal.
395 @param text The text we're parsing.
396 @param arr The token buffer.
397 @param maxtoken The length of the token buffer.
398 @param p The parser state.
399 @returns Parser state after parsing the string.
400 */
401 276 struct json_parser json_parse_string(char *text, struct json_token *arr,
402 size_t maxtoken, struct json_parser p)
403 {
404 struct json_token tok;
405 struct parser_arg a;
406
407 276 tok.type = JSON_STRING;
408 276 tok.start = p.textidx;
409
410 276 a = json_string(text, p.textidx, NULL, NULL);
411
412 276 tok.end = a.textidx - 1;
413 276 tok.child = 0;
414 276 tok.next = 0;
415 276 tok.length = a.outidx;
416 276 json_settoken(arr, tok, p, maxtoken);
417
418 276 p.error = a.error;
419 276 p.tokenidx++;
420 276 p.textidx = a.textidx;
421 276 return p;
422 }
423
424 /**
425 @brief Argument passed to setter when we are doing json_string_match().
426 */
427 struct string_compare_arg {
428 /**
429 @brief String we're comparing to.
430 */
431 const char *other;
432 /**
433 @brief Whether or not the string has evaluated to equal so far.
434 */
435 bool equal;
436 };
437
438 /**
439 @brief This is the "setter" function for json_string_match().
440 @param a Parser arguments.
441 @param wc Character to set.
442 @param arg The struct string_compare_arg.
443
444 This function just compares each output character to the corresponding
445 character in the other string. It stores the result in the arg, which will
446 be examined after the fact.
447 */
448 2049 static void json_string_comparator(struct parser_arg *a, char wc, void *arg)
449 {
450 2049 struct string_compare_arg *ca = arg;
451 // we are depending on short-circuit evaluation here :)
452
4/4
✓ Branch 0 taken 359 times.
✓ Branch 1 taken 1690 times.
✓ Branch 2 taken 204 times.
✓ Branch 3 taken 155 times.
2049 ca->equal = ca->equal && (wc == ca->other[a->outidx]);
453 2049 }
454
455 187 bool json_string_match(const char *json, const struct json_token *tokens,
456 size_t index, const char *other)
457 {
458 187 struct string_compare_arg ca = {
459 .other = other,
460 .equal = true,
461 };
462 187 struct parser_arg pa = json_string(json, tokens[index].start,
463 &json_string_comparator, &ca);
464
465 // They are equal if every previous character matches, and the next
466 // character in the other string is the null character, signifying the
467 // end.
468
4/4
✓ Branch 0 taken 32 times.
✓ Branch 1 taken 155 times.
✓ Branch 2 taken 31 times.
✓ Branch 3 taken 1 times.
187 return ca.equal && (other[pa.outidx] == '\0');
469 }
470
471 /**
472 @brief This is the "setter" function for json_string_load().
473 @param a Parser arguments.
474 @param wc Character to set.
475 @param arg The destination buffer (a char *).
476
477 This function just stores each output character into the destination
478 buffer at the current output index. The terminating null character is
479 written by json_string_load() after parsing finishes.
480 */
481 46 static void json_string_loader(struct parser_arg *a, char wc, void *arg)
482 {
483 46 char *str = arg;
484 // store the character at the current output index
485 46 str[a->outidx] = wc;
486 46 }
487
488 6 void json_string_load(const char *json, const struct json_token *tokens,
489 size_t index, char *buffer)
490 {
491 6 struct parser_arg pa = json_string(json, tokens[index].start,
492 &json_string_loader, buffer);
493
494 6 buffer[pa.outidx] = '\0';
495 6 }
496