Line data Source code
1 : /***************************************************************************//**
2 :
3 : @file parse.c
4 :
5 : @author Stephen Brennan
6 :
7 : @date Created Saturday, 23 January 2016
8 :
9 : @brief Recursive descent regex parser! (modeled after Wikipedia)
10 :
11 : @copyright Copyright (c) 2016, Stephen Brennan. Released under the Revised
12 : BSD License. See LICENSE.txt for details.
13 :
14 : *******************************************************************************/
15 :
16 : #include <stdio.h>
17 : #include <stdbool.h>
18 : #include <stdlib.h>
19 : #include <wchar.h>
20 :
21 : #include "libstephen/re.h"
22 : #include "libstephen/re_internals.h"
23 :
24 : char *names[] = {
25 : "CharSym", "Special", "Eof", "LParen", "RParen", "LBracket", "RBracket",
26 : "Plus", "Minus", "Star", "Question", "Caret", "Pipe", "Dot"
27 : };
28 :
29 : char *ntnames[] = {
30 : "TERM", "EXPR", "REGEX", "CLASS", "SUB"
31 : };
32 :
33 : /*
34 : Convenience functions for parse trees.
35 : */
36 :
37 245 : static PTree *terminal_tree(Token tok)
38 : {
39 245 : PTree *tree = calloc(1, sizeof(PTree));
40 245 : tree->nchildren = 0;
41 245 : tree->production = 0; // marks this as terminal
42 245 : tree->tok = tok;
43 245 : return tree;
44 : }
45 :
46 460 : static PTree *nonterminal_tree(NTSym nt, size_t nchildren)
47 : {
48 460 : PTree *tree = calloc(1, sizeof(PTree));
49 460 : tree->nchildren = nchildren;
50 460 : tree->production = 1; // update this on return.
51 460 : tree->nt = nt;
52 460 : return tree;
53 : }
54 :
55 621 : void free_tree(PTree *tree)
56 : {
57 1145 : for (size_t i = 0; i < tree->nchildren; i++) {
58 524 : free_tree(tree->children[i]);
59 : }
60 621 : free(tree);
61 621 : }
62 :
63 : /*
64 : Simple convenience functions for a parser.
65 : */
66 :
67 1057 : static bool accept(TSym s, Lexer *l)
68 : {
69 1057 : if (l->tok.sym == s) {
70 251 : nextsym(l);
71 251 : return true;
72 : }
73 806 : return false;
74 : }
75 :
76 58 : static void expect(TSym s, Lexer *l)
77 : {
78 58 : if (l->tok.sym == s) {
79 58 : nextsym(l);
80 58 : return;
81 : }
82 0 : fprintf(stderr, "error: expected %s, got %s\n", names[s], names[l->tok.sym]);
83 0 : exit(1);
84 : }
85 :
86 103 : PTree *TERM(Lexer *l)
87 : {
88 126 : if (accept(CharSym, l) || accept(Dot, l) || accept(Special, l) ||
89 44 : accept(Caret, l) || accept(Minus, l)) {
90 84 : PTree *result = nonterminal_tree(TERMnt, 1);
91 84 : result->children[0] = terminal_tree(l->prev);
92 84 : result->production = 1;
93 84 : return result;
94 19 : } else if (accept(LParen, l)) {
95 9 : PTree *result = nonterminal_tree(TERMnt, 3);
96 9 : result->children[0] = terminal_tree(l->prev);
97 9 : result->children[1] = REGEX(l);
98 9 : expect(RParen, l);
99 9 : result->children[2] = terminal_tree(l->prev);
100 9 : result->production = 2;
101 9 : return result;
102 10 : } else if (accept(LBracket, l)) {
103 : PTree *result;
104 10 : if (accept(Caret, l)) {
105 5 : result = nonterminal_tree(TERMnt, 3);
106 5 : result->children[0] = terminal_tree((Token){LBracket, '['});
107 5 : result->children[1] = CLASS(l);
108 5 : expect(RBracket, l);
109 5 : result->children[2] = terminal_tree(l->prev);
110 5 : result->production = 4;
111 : } else {
112 5 : result = nonterminal_tree(TERMnt, 3);
113 5 : result->children[0] = terminal_tree((Token){LBracket, '['});
114 5 : result->children[1] = CLASS(l);
115 5 : expect(RBracket, l);
116 5 : result->children[2] = terminal_tree(l->prev);
117 5 : result->production = 3;
118 : }
119 10 : return result;
120 : } else {
121 0 : fprintf(stderr, "error: TERM: syntax error\n");
122 0 : exit(1);
123 : }
124 : }
125 :
126 87 : PTree *EXPR(Lexer *l)
127 : {
128 87 : PTree *result = nonterminal_tree(EXPRnt, 1);
129 87 : result->children[0] = TERM(l);
130 87 : if (accept(Plus, l) || accept(Star, l) || accept(Question, l)) {
131 40 : result->nchildren++;
132 40 : result->children[1] = terminal_tree(l->prev);
133 40 : if (accept(Question, l)) {
134 9 : result->nchildren++;
135 9 : result->children[2] = terminal_tree((Token){Question, '?'});
136 : }
137 : }
138 87 : return result;
139 : }
140 :
141 61 : PTree *SUB(Lexer *l)
142 : {
143 61 : PTree *result = nonterminal_tree(SUBnt, 1);
144 61 : PTree *orig = result, *prev = result;
145 :
146 195 : while (l->tok.sym != Eof && l->tok.sym != RParen && l->tok.sym != Pipe) { // seems like a bit of a hack
147 73 : result->children[0] = EXPR(l);
148 73 : result->children[1] = nonterminal_tree(SUBnt, 0);
149 73 : result->nchildren = 2;
150 73 : prev = result;
151 73 : result = result->children[1];
152 : }
153 :
154 : // This prevents SUB nonterminals with no children in the final parse tree.
155 61 : if (prev != result) {
156 61 : prev->nchildren = 1;
157 61 : free(result);
158 : }
159 61 : return orig;
160 : }
161 :
162 57 : PTree *REGEX(Lexer *l)
163 : {
164 57 : PTree *result = nonterminal_tree(REGEXnt, 1);
165 57 : result->children[0] = SUB(l);
166 :
167 57 : if (accept(Pipe, l)) {
168 5 : result->nchildren = 3;
169 5 : result->children[1] = terminal_tree(l->prev);
170 5 : result->children[2] = REGEX(l);
171 : }
172 57 : return result;
173 : }
174 :
175 99 : bool CCHAR(Lexer *l)
176 : {
177 99 : TSym acceptable[] = {CharSym, Dot, LParen, RParen, Plus, Star, Question, Pipe};
178 451 : for (size_t i = 0; i < nelem(acceptable); i++) {
179 414 : if (accept(acceptable[i], l)) {
180 62 : l->prev.sym = CharSym;
181 62 : return true;
182 : }
183 : }
184 37 : return false;
185 : }
186 :
187 30 : PTree *CLASS(Lexer *l)
188 : {
189 30 : PTree *result = nonterminal_tree(CLASSnt, 0), *curr, *prev;
190 : Token t1, t2, t3;
191 30 : curr = result;
192 :
193 : while (true) {
194 79 : if (CCHAR(l)) {
195 49 : prev = curr;
196 49 : t1 = l->prev;
197 49 : if (accept(Minus, l)) {
198 20 : t2 = l->prev;
199 20 : if (CCHAR(l)) {
200 13 : t3 = l->prev;
201 : // We have ourselves a range! Parse it.
202 13 : curr->children[0] = terminal_tree(t1);
203 13 : curr->children[1] = terminal_tree(t3);
204 13 : curr->children[2] = nonterminal_tree(CLASSnt, 0);
205 13 : curr->nchildren = 3;
206 13 : curr->production = 1;
207 13 : curr = curr->children[2];
208 : } else {
209 : // character followed by minus, but not range.
210 7 : unget(t2, l);
211 7 : curr->children[0] = terminal_tree(t1);
212 7 : curr->children[1] = nonterminal_tree(CLASSnt, 0);
213 7 : curr->nchildren = 2;
214 7 : curr->production = 3;
215 7 : curr = curr->children[1];
216 : }
217 : } else {
218 : // just a character
219 29 : curr->children[0] = terminal_tree(t1);
220 29 : curr->children[1] = nonterminal_tree(CLASSnt, 0);
221 29 : curr->nchildren = 2;
222 29 : curr->production = 3;
223 29 : curr = curr->children[1];
224 : }
225 30 : } else if (accept(Minus, l)) {
226 : // just a minus
227 7 : prev = curr;
228 7 : curr->children[0] = terminal_tree(l->prev);
229 7 : curr->nchildren = 1;
230 7 : curr->production = 5;
231 7 : break;
232 : } else {
233 23 : free(curr);
234 23 : prev->nchildren--;
235 23 : prev->production++;
236 23 : break;
237 : }
238 49 : }
239 30 : return result;
240 : }
241 :
242 39 : static PTree *reparse_internal(struct Input input)
243 : {
244 : Lexer l;
245 :
246 : // Initialize the lexer.
247 39 : l.input = input;
248 39 : l.index = 0;
249 39 : l.nbuf = 0;
250 39 : l.tok = (Token){.sym=0, .c=0};
251 :
252 : // Create a parse tree!
253 : //printf(";; TOKENS:\n");
254 39 : nextsym(&l);
255 39 : PTree *tree = REGEX(&l);
256 39 : expect(Eof, &l);
257 :
258 39 : return tree;
259 : }
260 :
261 29 : PTree *reparse(const char *input)
262 : {
263 29 : struct Input in = {.str=input, .wstr=NULL};
264 29 : return reparse_internal(in);
265 : }
266 :
267 10 : PTree *reparsew(const wchar_t *winput)
268 : {
269 10 : struct Input in = {.str=NULL, .wstr=winput};
270 10 : return reparse_internal(in);
271 : }
272 :
273 28 : Regex recomp(const char *regex)
274 : {
275 28 : PTree *tree = reparse(regex);
276 28 : Regex code = codegen(tree);
277 28 : free_tree(tree);
278 28 : return code;
279 : }
280 :
281 9 : Regex recompw(const wchar_t *regex)
282 : {
283 9 : PTree *tree = reparsew(regex);
284 9 : Regex code = codegen(tree);
285 9 : free_tree(tree);
286 9 : return code;
287 : }
|