Line data Source code
1 : /***************************************************************************//**
2 :
3 : @file pike.c
4 :
5 : @author Stephen Brennan, based on code by Russ Cox:
6 : https://swtch.com/~rsc/regexp/regexp2.html
7 :
8 : @date Created Wednesday, 20 January 2016
9 :
10 : @brief Regex implementation using virtual machines.
11 :
12 : @copyright Copyright (c) 2016, Stephen Brennan. Released under the Revised
13 : BSD License. See LICENSE.txt for details.
14 :
15 : Portions of this code are based on code by Russ Cox, Copyright
16 : 2007-2009.
17 :
18 : *******************************************************************************/
19 :
20 : #include <assert.h>
21 : #include <stdlib.h>
22 : #include <stdio.h>
23 : #include <stdbool.h>
24 : #include <string.h>
25 : #include <stdint.h>
26 : #include <unistd.h>
27 : #include <wchar.h>
28 :
29 : #include "libstephen/re.h"
30 : #include "libstephen/re_internals.h"
31 :
32 : // Declarations:
33 :
34 : typedef struct thread thread;
35 : struct thread {
36 : Instr *pc;
37 : size_t *saved;
38 : };
39 :
40 : typedef struct thread_list thread_list;
41 : struct thread_list {
42 : thread *t;
43 : size_t n;
44 : };
45 :
46 : // Printing, for diagnostics
47 :
48 0 : void printthreads(thread_list *tl, Instr *prog, size_t nsave) {
49 0 : for (size_t i = 0; i < tl->n; i++) {
50 0 : printf("T%zu@pc=%lu{", i, (intptr_t) (tl->t[i].pc - prog));
51 0 : for (size_t j = 0; j < nsave; j++) {
52 0 : printf("%lu,", tl->t[i].saved[j]);
53 : }
54 0 : printf("} ");
55 : }
56 0 : printf("\n");
57 0 : }
58 :
59 : // Helper evaluation functions for instructions
60 :
61 32 : bool range(Instr in, wchar_t test) {
62 32 : if (test == L'\0') {
63 0 : return false;
64 : }
65 32 : bool result = false;
66 32 : char *block = (char *) in.x;
67 :
68 : // use in.s for number of ranges, in.x as char* for ranges.
69 80 : for (size_t i = 0; i < in.s; i++) {
70 68 : if (block[i*2] <= test && test <= block [i*2 + 1]) {
71 20 : result = true;
72 20 : break; // short circuit yo!
73 : }
74 : }
75 :
76 : // negate result for negative ranges
77 32 : if (in.code == Range) {
78 16 : return result;
79 : } else {
80 16 : return !result;
81 : }
82 : }
83 :
84 : // Pike VM functions:
85 :
86 188 : thread_list newthread_list(size_t n)
87 : {
88 : thread_list tl;
89 188 : tl.t = calloc(n, sizeof(thread));
90 188 : tl.n = 0;
91 188 : return tl;
92 : }
93 :
94 682 : void addthread(thread_list *threads, Instr *pc, size_t *saved, size_t nsave,
95 : size_t sp)
96 : {
97 : //printf("addthread(): pc=%d, saved={%u, %u}, sp=%u, lastidx=%u\n", pc - extprog,
98 : // saved[0], saved[1], sp, pc->lastidx);
99 682 : if (pc->lastidx == sp) {
100 : // we've executed this instruction on this string index already
101 20 : free(saved);
102 702 : return;
103 : }
104 662 : pc->lastidx = sp;
105 :
106 : size_t *newsaved;
107 662 : switch (pc->code) {
108 : case Jump:
109 88 : addthread(threads, pc->x, saved, nsave, sp);
110 88 : break;
111 : case Split:
112 148 : newsaved = calloc(nsave, sizeof(size_t));
113 148 : memcpy(newsaved, saved, nsave * sizeof(size_t));
114 148 : addthread(threads, pc->x, saved, nsave, sp);
115 148 : addthread(threads, pc->y, newsaved, nsave, sp);
116 148 : break;
117 : case Save:
118 64 : saved[pc->s] = sp;
119 64 : addthread(threads, pc + 1, saved, nsave, sp);
120 64 : break;
121 : default:
122 362 : threads->t[threads->n].pc = pc;
123 362 : threads->t[threads->n].saved = saved;
124 362 : threads->n++;
125 362 : break;
126 : }
127 : }
128 :
129 : /**
130 : @brief "Stash" a list of captures into the "out" pointer.
131 : @param new The new list of captures encountered by the Match instruction.
132 : @param destination The out pointer where the caller wants the captures.
133 : */
134 116 : void stash(size_t *new, size_t **destination)
135 : {
136 116 : if (!destination) {
137 : /* If the user wants to discard the captures, they'll pass NULL. This means
138 : we need to get rid of the capture list, or we'll leak the memory. */
139 102 : free(new);
140 218 : return;
141 : }
142 14 : if (*destination) {
143 : /* If we have already stored a capture list, we should free that. */
144 4 : free(*destination);
145 : }
146 : /* Finally, stash the pointer away. */
147 14 : *destination = new;
148 : }
149 :
150 94 : static ssize_t reexec_internal(Regex r, const struct Input input, size_t **saved)
151 : {
152 : // Can have at most n threads, where n is the length of the program. This is
153 : // because (as it is now) the thread state is simply a program counter.
154 94 : thread_list curr = newthread_list(r.n);
155 94 : thread_list next = newthread_list(r.n);
156 : thread_list temp;
157 94 : size_t nsave = 0;
158 94 : ssize_t match = -1;
159 :
160 : // Set the out pointer to NULL so that stash() knows whether we've already
161 : // stashed away a capture list.
162 94 : if (saved) {
163 10 : *saved = NULL;
164 : }
165 :
166 : // Need to initialize lastidx to something that will never be used.
167 458 : for (size_t i = 0; i < r.n; i++) {
168 364 : r.i[i].lastidx = (size_t)-1;
169 364 : if (r.i[i].code == Save) {
170 36 : nsave++;
171 : }
172 : }
173 :
174 : // Start with a single thread and add more as we need. Note that addthread()
175 : // will execute instructions that don't consume input (i.e. epsilon closure).
176 94 : addthread(&curr, r.i, calloc(nsave, sizeof(size_t)), nsave, 0);
177 :
178 : size_t sp;
179 308 : for (sp = 0; curr.n > 0; sp++) {
180 :
181 : //printf("consider input %c\nthreads: ", input[sp]);
182 : //printthreads(&curr, r.i, nsave);
183 :
184 : // Execute each thread (this will only ever reach instructions that consume
185 : // input, since addthread() stops with those).
186 460 : for (size_t t = 0; t < curr.n; t++) {
187 362 : Instr *pc = curr.t[t].pc;
188 :
189 362 : switch (pc->code) {
190 : case Char:
191 200 : if (InputIdx(input, sp) != pc->c) {
192 88 : free(curr.t[t].saved);
193 88 : break; // fail, don't continue executing this thread
194 : }
195 : // add thread containing the next instruction to the next thread list.
196 112 : addthread(&next, pc+1, curr.t[t].saved, nsave, sp+1);
197 112 : break;
198 : case Any:
199 14 : if (InputIdx(input, sp) == '\0') {
200 2 : free(curr.t[t].saved);
201 2 : break; // dot can't match end of string!
202 : }
203 : // add thread containing the next instruction to the next thread list.
204 12 : addthread(&next, pc+1, curr.t[t].saved, nsave, sp+1);
205 12 : break;
206 : case Range:
207 : case NRange:
208 32 : if (!range(*pc, InputIdx(input, sp))) {
209 16 : free(curr.t[t].saved);
210 16 : break;
211 : }
212 16 : addthread(&next, pc+1, curr.t[t].saved, nsave, sp+1);
213 16 : break;
214 : case Match:
215 116 : stash(curr.t[t].saved, saved);
216 116 : match = sp;
217 116 : goto cont;
218 : default:
219 0 : assert(false);
220 : break;
221 : }
222 : }
223 :
224 : cont:
225 : // Swap the curr and next lists.
226 214 : temp = curr;
227 214 : curr = next;
228 214 : next = temp;
229 :
230 : // Reset our new next list.
231 214 : next.n = 0;
232 : }
233 :
234 94 : free(curr.t);
235 94 : free(next.t);
236 94 : return match;
237 : }
238 :
239 54 : ssize_t reexec(Regex r, const char *input, size_t **saved)
240 : {
241 54 : struct Input in = {.str=input, .wstr=NULL};
242 54 : return reexec_internal(r, in, saved);
243 : }
244 :
245 40 : ssize_t reexecw(Regex r, const wchar_t *input, size_t **saved)
246 : {
247 40 : struct Input in = {.str=NULL, .wstr=input};
248 40 : return reexec_internal(r, in, saved);
249 : }
250 :
251 6 : size_t renumsaves(Regex r)
252 : {
253 6 : size_t ns = 0;
254 50 : for (size_t i = 0; i < r.n; i++) {
255 44 : if (r.i[i].code == Save && r.i[i].s > ns) {
256 6 : ns = r.i[i].s;
257 : }
258 : }
259 6 : return ns + 1;
260 : }
|