Line data Source code
1 : /***************************************************************************//**
2 :
3 : @file codegen.c
4 :
5 : @author Stephen Brennan
6 :
7 : @date Created Saturday, 30 January 2016
8 :
9 : @brief Generating code from parse trees.
10 :
11 : @copyright Copyright (c) 2016, Stephen Brennan. Released under the Revised
12 : BSD License. See LICENSE.txt for details.
13 :
14 : Notes on how code generation works:
15 :
16 : The difficult part of code generation is that you need to generate code with
17 : "jump" and "split" instructions that point to other instructions. But since
18 : the jump and split targets are just (Instr*) pointers, and these frequently
19 : get moved around or freed during code generation, a naive implementation will
20 : end up with a bunch of dangling pointers.
21 :
22 : My solution to this issue is to use "Fragments." Fragments are stored in a
23 : singly linked list, and they contain an instruction along with an ID. As code
24 : is generated, jump and split targets are stored using this ID rather than a
25 : raw pointer. Once you have a final Fragment list that contains all of your
26 : code, it will be turned into an array, and all the IDs will be resolved
27 : efficiently to locations in the final array using a table.
28 :
29 : *******************************************************************************/
30 :
31 : #include <stdio.h>
32 : #include <assert.h>
33 : #include <stdint.h>
34 : #include <stdbool.h>
35 : #include <string.h>
36 :
37 : #include "libstephen/re.h"
38 : #include "libstephen/re_internals.h"
39 :
40 : typedef struct Fragment Fragment;
41 : struct Fragment {
42 : Instr in;
43 : intptr_t id;
44 : Fragment *next;
45 : };
46 :
47 : typedef struct State State;
48 : struct State {
49 : intptr_t id; // "global" id counter
50 : size_t capture; // capture parentheses counter
51 : };
52 :
53 41 : static Fragment *last(Fragment *f)
54 : {
55 189 : while (f->next) {
56 107 : f = f->next;
57 : }
58 41 : return f;
59 : }
60 :
61 : /**
62 : @brief "Join" a fragment to the given ID.
63 :
64 : This replaces each "match" instruction with a "jump" instruction to the given
65 : ID. Since it's assumed that the instruction with that ID will be placed
66 : after this one, if the last instruction of this fragment is a "match", it
67 : will just be deleted.
68 : */
69 41 : static void join(Fragment *a, Fragment *b)
70 : {
71 41 : Fragment *prev = NULL;
72 41 : Fragment *l = last(a);
73 : Instr *lastid;
74 :
75 : // lastid will be set in case the last instruction is a Match, which will be
76 : // deleted later.
77 41 : if (l->in.code == Match) {
78 41 : lastid = (Instr*) l->id;
79 : } else {
80 0 : lastid = (Instr*) -1;
81 : }
82 :
83 189 : while (a->next != NULL) {
84 : // Matches should be replaced with jumps to the next fragment.
85 107 : if (a->in.code == Match) {
86 0 : a->in.code = Jump;
87 0 : a->in.x = (Instr*)b->id;
88 : }
89 : // Jumps to the final match instruction should instead be targeted to the
90 : // next fragment.
91 107 : if ((a->in.code == Jump || a->in.code == Split) && a->in.x == lastid) {
92 0 : a->in.x = (Instr*) b->id;
93 : }
94 107 : if (a->in.code == Split && a->in.y == lastid) {
95 9 : a->in.y = (Instr*) b->id;
96 : }
97 107 : prev = a;
98 107 : a = a->next;
99 : }
100 :
101 : // If the last Instruction is a Match, delete it.
102 41 : if (prev != NULL && a->in.code == Match) {
103 41 : free(a);
104 41 : prev->next = b;
105 : }
106 41 : }
107 :
108 37 : static size_t fraglen(Fragment *f)
109 : {
110 37 : size_t len = 0;
111 212 : while (f) {
112 138 : len++;
113 138 : f = f->next;
114 : }
115 37 : return len;
116 : }
117 :
118 179 : static Fragment *newfrag(enum code code, State *s)
119 : {
120 179 : Fragment *new = calloc(1, sizeof(Fragment));
121 179 : new->in.code = code;
122 179 : new->id = s->id++;
123 179 : return new;
124 : }
125 :
126 37 : static void freefraglist(Fragment *f)
127 : {
128 : Fragment *next;
129 212 : while (f) {
130 138 : next = f->next;
131 138 : free(f);
132 138 : f = next;
133 : }
134 37 : }
135 :
136 : static Fragment *regex(PTree *t, State *s);
137 : static Fragment *term(PTree *t, State *s);
138 : static Fragment *expr(PTree *t, State *s);
139 : static Fragment *class(PTree *t, State *s, bool is_negative);
140 : static Fragment *sub(PTree *t, State *s);
141 :
142 6 : static Fragment *special(char type, State *s)
143 : {
144 : Fragment *f;
145 :
146 6 : char whitespace[] = " \t\t\n\n\r\r\f\f\v\v";
147 6 : char word[] = "azAZ09__";
148 6 : char number[] = "09";
149 :
150 6 : switch (type) {
151 : case 's':
152 : case 'S':
153 2 : f = (type == 's') ? newfrag(Range, s) : newfrag(NRange, s);
154 2 : f->in.s = nelem(whitespace) / 2;
155 2 : f->in.x = calloc(nelem(whitespace), sizeof(char));
156 2 : memcpy(f->in.x, whitespace, nelem(whitespace));
157 2 : break;
158 : case 'w':
159 : case 'W':
160 2 : f = (type == 'w') ? newfrag(Range, s) : newfrag(NRange, s);
161 2 : f->in.s = nelem(word) / 2;
162 2 : f->in.x = calloc(nelem(word), sizeof(char));
163 2 : memcpy(f->in.x, word, nelem(word));
164 2 : break;
165 : case 'd':
166 : case 'D':
167 2 : f = (type == 'd') ? newfrag(Range, s) : newfrag(NRange, s);
168 2 : f->in.s = nelem(number) / 2;
169 2 : f->in.x = calloc(nelem(number), sizeof(char));
170 2 : memcpy(f->in.x, number, nelem(number));
171 2 : break;
172 : default:
173 0 : fprintf(stderr, "not implemented: special character class '%c'\n", type);
174 0 : exit(EXIT_FAILURE);
175 : break;
176 : }
177 :
178 6 : f->next = newfrag(Match, s);
179 6 : return f;
180 : }
181 :
182 55 : static Fragment *term(PTree *t, State *s)
183 : {
184 55 : Fragment *f = NULL;
185 :
186 55 : assert(t->nt == TERMnt);
187 :
188 55 : if (t->production == 1) {
189 42 : if (t->children[0]->tok.sym == CharSym || t->children[0]->tok.sym == Caret
190 9 : || t->children[0]->tok.sym == Minus) {
191 : // Character
192 33 : f = newfrag(Char, s);
193 33 : f->in.c = t->children[0]->tok.c;
194 33 : f->next = newfrag(Match, s);
195 9 : } else if (t->children[0]->tok.sym == Dot) {
196 : // Dot
197 3 : f = newfrag(Any, s);
198 3 : f->next = newfrag(Match, s);
199 6 : } else if (t->children[0]->tok.sym == Special) {
200 : // Special
201 6 : f = special(t->children[0]->tok.c, s);
202 : }
203 13 : } else if (t->production == 2) {
204 : // Parenthesized expression
205 7 : f = newfrag(Save, s);
206 7 : f->in.s = s->capture++;
207 7 : size_t nextsave = s->capture++;
208 7 : f->next = regex(t->children[1], s);
209 7 : Fragment *n = newfrag(Save, s);
210 7 : n->in.s = nextsave;
211 7 : n->next = newfrag(Match, s);
212 7 : join(f, n);
213 : } else {
214 : // Character class
215 6 : f = class(t->children[1], s, (t->production == 4));
216 : }
217 55 : return f;
218 : }
219 :
220 55 : static Fragment *expr(PTree *t, State *s)
221 : {
222 55 : Fragment *f = NULL, *a = NULL, *b = NULL, *c = NULL;
223 :
224 55 : assert(t->nt == EXPRnt);
225 :
226 55 : f = term(t->children[0], s);
227 55 : if (t->nchildren == 1) {
228 33 : return f;
229 : } else {
230 22 : if (t->children[1]->tok.sym == Star) {
231 : /*
232 : L1:
233 : split L2 L3 ;; this is "a" [ non-greedy: split L3 L2 ]
234 : L2:
235 : BLOCK from f
236 : jump L1 ;; this is "b"
237 : L3:
238 : match ;; this is "c"
239 : */
240 15 : a = newfrag(Split, s);
241 15 : b = newfrag(Jump, s);
242 15 : c = newfrag(Match, s);
243 15 : if (t->nchildren == 3) {
244 : // Non-greedy
245 1 : a->in.x = (Instr*) c->id;
246 1 : a->in.y = (Instr*) f->id;
247 : } else {
248 : // Greedy
249 14 : a->in.x = (Instr*) f->id;
250 14 : a->in.y = (Instr*) c->id;
251 : }
252 15 : b->in.x = (Instr*) a->id;
253 15 : a->next = f;
254 15 : b->next = c;
255 15 : join(a, b);
256 15 : return a;
257 7 : } else if (t->children[1]->tok.sym == Plus) {
258 : /*
259 : L1:
260 : BLOCK from f
261 : split L1 L2 ;; this is "a" [ non-greedy: split L2 L1 ]
262 : L2:
263 : match ;; this is "b"
264 : */
265 5 : a = newfrag(Split, s);
266 5 : b = newfrag(Match, s);
267 5 : if (t->nchildren == 3) {
268 : // Non-greedy
269 1 : a->in.x = (Instr*) b->id;
270 1 : a->in.y = (Instr*) f->id;
271 : } else {
272 : // Greedy
273 4 : a->in.x = (Instr*) f->id;
274 4 : a->in.y = (Instr*) b->id;
275 : }
276 5 : join(f, a);
277 5 : a->next = b;
278 5 : return f;
279 2 : } else if (t->children[1]->tok.sym == Question) {
280 : /*
281 : split L1 L2 ;; this is "a" [ non-greedy: split L2 L1 ]
282 : L1:
283 : BLOCK from f
284 : L2:
285 : match ;; this is "b"
286 : */
287 2 : a = newfrag(Split, s);
288 2 : b = newfrag(Match, s);
289 2 : if (t->nchildren == 3) {
290 : // Non-greedy
291 1 : a->in.x = (Instr*) b->id;
292 1 : a->in.y = (Instr*) f->id;
293 : } else {
294 : // Greedy
295 1 : a->in.x = (Instr*) f->id;
296 1 : a->in.y = (Instr*) b->id;
297 : }
298 2 : a->next = f;
299 2 : join(f, b);
300 2 : return a;
301 : } else {
302 0 : assert(false);
303 : }
304 : }
305 : }
306 :
307 55 : static Fragment *sub(PTree *tree, State *state)
308 : {
309 55 : assert(tree->nt == SUBnt);
310 55 : Fragment *e = expr(tree->children[0], state);
311 55 : if (tree->nchildren == 2) {
312 : /*
313 : BLOCK from e
314 : BLOCK from s
315 : */
316 10 : Fragment *s = sub(tree->children[1], state);
317 10 : join(e, s);
318 : }
319 55 : return e;
320 : }
321 :
322 45 : static Fragment *regex(PTree *tree, State *state)
323 : {
324 45 : assert(tree->nt == REGEXnt);
325 45 : Fragment *s = sub(tree->children[0], state);
326 45 : if (tree->nchildren == 3) {
327 : /*
328 : split L1 L2 ;; this is "pre"
329 : L1:
330 : BLOCK from s
331 : jump L3 ;; this is "j"
332 : L2:
333 : BLOCK from r
334 : L3:
335 : match ;; this is "m"
336 : */
337 :
338 1 : Fragment *r = regex(tree->children[2], state);
339 :
340 1 : Fragment *pre = newfrag(Split, state);
341 1 : pre->in.x = (Instr*) s->id;
342 1 : pre->in.y = (Instr*) r->id;
343 1 : pre->next = s;
344 :
345 1 : Fragment *m = newfrag(Match, state);
346 1 : Fragment *j = newfrag(Jump, state);
347 1 : j->in.x = (Instr*) m->id;
348 1 : j->next = r;
349 1 : join(j, m);
350 1 : join(pre, j);
351 :
352 1 : return pre;
353 : }
354 44 : return s;
355 : }
356 :
357 6 : static Fragment *class(PTree *tree, State *state, bool is_negative)
358 : {
359 6 : size_t nranges = 0;
360 : PTree *curr;
361 : Fragment *f;
362 :
363 26 : for (curr = tree; curr->nt == CLASSnt; curr = curr->children[curr->nchildren-1]) {
364 20 : nranges++;
365 : }
366 :
367 6 : if (is_negative) {
368 3 : f = newfrag(NRange, state);
369 : } else {
370 3 : f = newfrag(Range, state);
371 : }
372 :
373 6 : f->in.s = nranges;
374 6 : f->in.x = calloc(nranges*2, sizeof(char));
375 6 : char *block = (char*)f->in.x;
376 :
377 6 : curr = tree;
378 6 : nranges = 0;
379 32 : while (curr->nt == CLASSnt) {
380 20 : if (curr->production == 1 || curr->production == 2) {
381 : // Range
382 7 : block[2*nranges] = curr->children[0]->tok.c;
383 7 : block[2*nranges+1] = curr->children[1]->tok.c;
384 : } else {
385 : // Single
386 13 : block[2*nranges] = curr->children[0]->tok.c;
387 13 : block[2*nranges+1] = curr->children[0]->tok.c;
388 : }
389 20 : curr = curr->children[curr->nchildren-1];
390 20 : nranges++;
391 : }
392 :
393 6 : f->next = newfrag(Match, state);
394 6 : return f;
395 : }
396 :
397 37 : Regex codegen(PTree *tree)
398 : {
399 : // Generate code.
400 37 : State s = {0, 0};
401 37 : Fragment *f = regex(tree, &s);
402 : size_t n;
403 :
404 : // Get the length of the code
405 37 : n = fraglen(f);
406 :
407 : // Allocate buffers for the code, and for a lookup table of targets for jumps.
408 37 : Instr *code = calloc(n, sizeof(Instr));
409 37 : size_t *targets = calloc(s.id, sizeof(size_t));
410 :
411 : // Fill up the lookup table.
412 37 : size_t i = 0;
413 : Fragment *curr;
414 175 : for (curr = f; curr; curr = curr->next, i++) {
415 138 : targets[curr->id] = i;
416 : }
417 :
418 : // Now, copy in the Instructions, replacing the jump targets from the table.
419 175 : for (curr = f, i = 0; curr; curr = curr->next, i++) {
420 138 : code[i] = curr->in;
421 138 : if (code[i].code == Jump || code[i].code == Split) {
422 39 : code[i].x = code + targets[(intptr_t)code[i].x];
423 : }
424 138 : if (code[i].code == Split) {
425 23 : code[i].y = code + targets[(intptr_t)code[i].y];
426 : }
427 : }
428 :
429 37 : free(targets);
430 37 : freefraglist(f);
431 37 : return (Regex){.n=n, .i=code};
432 : }
|