LCOV - code coverage report
Current view: top level - src/regex - codegen.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 190 197 96.4 %
Date: 2016-12-21 02:12:01 Functions: 12 12 100.0 %

          Line data    Source code
       1             : /***************************************************************************//**
       2             : 
       3             :   @file         codegen.c
       4             : 
       5             :   @author       Stephen Brennan
       6             : 
       7             :   @date         Created Saturday, 30 January 2016
       8             : 
       9             :   @brief        Generating code from parse trees.
      10             : 
      11             :   @copyright    Copyright (c) 2016, Stephen Brennan.  Released under the Revised
      12             :                 BSD License.  See LICENSE.txt for details.
      13             : 
      14             :   Notes on how code generation works:
      15             : 
      16             :   The difficult part of code generation is that you need to generate code with
      17             :   "jump" and "split" instructions that point to other instructions.  But since
      18             :   the jump and split targets are just (Instr*) pointers, and these frequently
      19             :   get moved around or freed during code generation, a naive implementation will
      20             :   end up with a bunch of dangling pointers.
      21             : 
      22             :   My solution to this issue is to use "Fragments."  Fragments are stored in a
      23             :   singly linked list, and they contain an instruction along with an ID.  As code
      24             :   is generated, jump and split targets are stored using this ID rather than a
      25             :   raw pointer.  Once you have a final Fragment list that contains all of your
      26             :   code, it will be turned into an array, and all the IDs will be resolved
      27             :   efficiently to locations in the final array using a table.
      28             : 
      29             : *******************************************************************************/
      30             : 
      31             : #include <stdio.h>
      32             : #include <assert.h>
      33             : #include <stdint.h>
      34             : #include <stdbool.h>
      35             : #include <string.h>
      36             : 
      37             : #include "libstephen/re.h"
      38             : #include "libstephen/re_internals.h"
      39             : 
      40             : typedef struct Fragment Fragment;
      41             : struct Fragment {
      42             :   Instr in;
      43             :   intptr_t id;
      44             :   Fragment *next;
      45             : };
      46             : 
      47             : typedef struct State State;
      48             : struct State {
      49             :   intptr_t id; // "global" id counter
      50             :   size_t capture; // capture parentheses counter
      51             : };
      52             : 
      53          41 : static Fragment *last(Fragment *f)
      54             : {
      55         189 :   while (f->next) {
      56         107 :     f = f->next;
      57             :   }
      58          41 :   return f;
      59             : }
      60             : 
      61             : /**
      62             :    @brief "Join" a fragment to the given ID.
      63             : 
      64             :    This replaces each "match" instruction with a "jump" instruction to the given
      65             :    ID.  Since it's assumed that the instruction with that ID will be placed
      66             :    after this one, if the last instruction of this fragment is a "match", it
      67             :    will just be deleted.
      68             :  */
      69          41 : static void join(Fragment *a, Fragment *b)
      70             : {
      71          41 :   Fragment *prev = NULL;
      72          41 :   Fragment *l = last(a);
      73             :   Instr *lastid;
      74             : 
      75             :   // lastid will be set in case the last instruction is a Match, which will be
      76             :   // deleted later.
      77          41 :   if (l->in.code == Match) {
      78          41 :     lastid = (Instr*) l->id;
      79             :   } else {
      80           0 :     lastid = (Instr*) -1;
      81             :   }
      82             : 
      83         189 :   while (a->next != NULL) {
      84             :     // Matches should be replaced with jumps to the next fragment.
      85         107 :     if (a->in.code == Match) {
      86           0 :       a->in.code = Jump;
      87           0 :       a->in.x = (Instr*)b->id;
      88             :     }
      89             :     // Jumps to the final match instruction should instead be targeted to the
      90             :     // next fragment.
      91         107 :     if ((a->in.code == Jump || a->in.code == Split) && a->in.x == lastid) {
      92           0 :       a->in.x = (Instr*) b->id;
      93             :     }
      94         107 :     if (a->in.code == Split && a->in.y == lastid) {
      95           9 :       a->in.y = (Instr*) b->id;
      96             :     }
      97         107 :     prev = a;
      98         107 :     a = a->next;
      99             :   }
     100             : 
     101             :   // If the last Instruction is a Match, delete it.
     102          41 :   if (prev != NULL && a->in.code == Match) {
     103          41 :     free(a);
     104          41 :     prev->next = b;
     105             :   }
     106          41 : }
     107             : 
     108          37 : static size_t fraglen(Fragment *f)
     109             : {
     110          37 :   size_t len = 0;
     111         212 :   while (f) {
     112         138 :     len++;
     113         138 :     f = f->next;
     114             :   }
     115          37 :   return len;
     116             : }
     117             : 
     118         179 : static Fragment *newfrag(enum code code, State *s)
     119             : {
     120         179 :   Fragment *new = calloc(1, sizeof(Fragment));
     121         179 :   new->in.code = code;
     122         179 :   new->id = s->id++;
     123         179 :   return new;
     124             : }
     125             : 
     126          37 : static void freefraglist(Fragment *f)
     127             : {
     128             :   Fragment *next;
     129         212 :   while (f) {
     130         138 :     next = f->next;
     131         138 :     free(f);
     132         138 :     f = next;
     133             :   }
     134          37 : }
     135             : 
     136             : static Fragment *regex(PTree *t, State *s);
     137             : static Fragment *term(PTree *t, State *s);
     138             : static Fragment *expr(PTree *t, State *s);
     139             : static Fragment *class(PTree *t, State *s, bool is_negative);
     140             : static Fragment *sub(PTree *t, State *s);
     141             : 
     142           6 : static Fragment *special(char type, State *s)
     143             : {
     144             :   Fragment *f;
     145             : 
     146           6 :   char whitespace[] = "  \t\t\n\n\r\r\f\f\v\v";
     147           6 :   char word[] = "azAZ09__";
     148           6 :   char number[] = "09";
     149             : 
     150           6 :   switch (type) {
     151             :   case 's':
     152             :   case 'S':
     153           2 :     f = (type == 's') ? newfrag(Range, s) : newfrag(NRange, s);
     154           2 :     f->in.s = nelem(whitespace) / 2;
     155           2 :     f->in.x = calloc(nelem(whitespace), sizeof(char));
     156           2 :     memcpy(f->in.x, whitespace, nelem(whitespace));
     157           2 :     break;
     158             :   case 'w':
     159             :   case 'W':
     160           2 :     f = (type == 'w') ? newfrag(Range, s) : newfrag(NRange, s);
     161           2 :     f->in.s = nelem(word) / 2;
     162           2 :     f->in.x = calloc(nelem(word), sizeof(char));
     163           2 :     memcpy(f->in.x, word, nelem(word));
     164           2 :     break;
     165             :   case 'd':
     166             :   case 'D':
     167           2 :     f = (type == 'd') ? newfrag(Range, s) : newfrag(NRange, s);
     168           2 :     f->in.s = nelem(number) / 2;
     169           2 :     f->in.x = calloc(nelem(number), sizeof(char));
     170           2 :     memcpy(f->in.x, number, nelem(number));
     171           2 :     break;
     172             :   default:
     173           0 :     fprintf(stderr, "not implemented: special character class '%c'\n", type);
     174           0 :     exit(EXIT_FAILURE);
     175             :     break;
     176             :   }
     177             : 
     178           6 :   f->next = newfrag(Match, s);
     179           6 :   return f;
     180             : }
     181             : 
     182          55 : static Fragment *term(PTree *t, State *s)
     183             : {
     184          55 :   Fragment *f = NULL;
     185             : 
     186          55 :   assert(t->nt == TERMnt);
     187             : 
     188          55 :   if (t->production == 1) {
     189          42 :     if (t->children[0]->tok.sym == CharSym || t->children[0]->tok.sym == Caret
     190           9 :         || t->children[0]->tok.sym == Minus) {
     191             :       // Character
     192          33 :       f = newfrag(Char, s);
     193          33 :       f->in.c = t->children[0]->tok.c;
     194          33 :       f->next = newfrag(Match, s);
     195           9 :     } else if (t->children[0]->tok.sym == Dot) {
     196             :       // Dot
     197           3 :       f = newfrag(Any, s);
     198           3 :       f->next = newfrag(Match, s);
     199           6 :     } else if (t->children[0]->tok.sym == Special) {
     200             :       // Special
     201           6 :       f = special(t->children[0]->tok.c, s);
     202             :     }
     203          13 :   } else if (t->production == 2) {
     204             :     // Parenthesized expression
     205           7 :     f = newfrag(Save, s);
     206           7 :     f->in.s = s->capture++;
     207           7 :     size_t nextsave = s->capture++;
     208           7 :     f->next = regex(t->children[1], s);
     209           7 :     Fragment *n = newfrag(Save, s);
     210           7 :     n->in.s = nextsave;
     211           7 :     n->next = newfrag(Match, s);
     212           7 :     join(f, n);
     213             :   } else {
     214             :     // Character class
     215           6 :     f = class(t->children[1], s, (t->production == 4));
     216             :   }
     217          55 :   return f;
     218             : }
     219             : 
     220          55 : static Fragment *expr(PTree *t, State *s)
     221             : {
     222          55 :   Fragment *f = NULL, *a = NULL, *b = NULL, *c = NULL;
     223             : 
     224          55 :   assert(t->nt == EXPRnt);
     225             : 
     226          55 :   f = term(t->children[0], s);
     227          55 :   if (t->nchildren == 1) {
     228          33 :     return f;
     229             :   } else {
     230          22 :     if (t->children[1]->tok.sym == Star) {
     231             :       /*
     232             :         L1:
     233             :             split L2 L3   ;; this is "a"  [ non-greedy: split L3 L2 ]
     234             :         L2:
     235             :             BLOCK from f
     236             :             jump L1       ;; this is "b"
     237             :         L3:
     238             :             match         ;; this is "c"
     239             :        */
     240          15 :       a = newfrag(Split, s);
     241          15 :       b = newfrag(Jump, s);
     242          15 :       c = newfrag(Match, s);
     243          15 :       if (t->nchildren == 3) {
     244             :         // Non-greedy
     245           1 :         a->in.x = (Instr*) c->id;
     246           1 :         a->in.y = (Instr*) f->id;
     247             :       } else {
     248             :         // Greedy
     249          14 :         a->in.x = (Instr*) f->id;
     250          14 :         a->in.y = (Instr*) c->id;
     251             :       }
     252          15 :       b->in.x = (Instr*) a->id;
     253          15 :       a->next = f;
     254          15 :       b->next = c;
     255          15 :       join(a, b);
     256          15 :       return a;
     257           7 :     } else if (t->children[1]->tok.sym == Plus) {
     258             :       /*
     259             :         L1:
     260             :             BLOCK from f
     261             :             split L1 L2   ;; this is "a"  [ non-greedy: split L2 L1 ]
     262             :         L2:
     263             :             match         ;; this is "b"
     264             :        */
     265           5 :       a = newfrag(Split, s);
     266           5 :       b = newfrag(Match, s);
     267           5 :       if (t->nchildren == 3) {
     268             :         // Non-greedy
     269           1 :         a->in.x = (Instr*) b->id;
     270           1 :         a->in.y = (Instr*) f->id;
     271             :       } else {
     272             :         // Greedy
     273           4 :         a->in.x = (Instr*) f->id;
     274           4 :         a->in.y = (Instr*) b->id;
     275             :       }
     276           5 :       join(f, a);
     277           5 :       a->next = b;
     278           5 :       return f;
     279           2 :     } else if (t->children[1]->tok.sym == Question) {
     280             :       /*
     281             :             split L1 L2   ;; this is "a"  [ non-greedy: split L2 L1 ]
     282             :         L1:
     283             :             BLOCK from f
     284             :         L2:
     285             :             match         ;; this is "b"
     286             :        */
     287           2 :       a = newfrag(Split, s);
     288           2 :       b = newfrag(Match, s);
     289           2 :       if (t->nchildren == 3) {
     290             :         // Non-greedy
     291           1 :         a->in.x = (Instr*) b->id;
     292           1 :         a->in.y = (Instr*) f->id;
     293             :       } else {
     294             :         // Greedy
     295           1 :         a->in.x = (Instr*) f->id;
     296           1 :         a->in.y = (Instr*) b->id;
     297             :       }
     298           2 :       a->next = f;
     299           2 :       join(f, b);
     300           2 :       return a;
     301             :     } else {
     302           0 :       assert(false);
     303             :     }
     304             :   }
     305             : }
     306             : 
     307          55 : static Fragment *sub(PTree *tree, State *state)
     308             : {
     309          55 :   assert(tree->nt == SUBnt);
     310          55 :   Fragment *e = expr(tree->children[0], state);
     311          55 :   if (tree->nchildren == 2) {
     312             :     /*
     313             :       BLOCK from e
     314             :       BLOCK from s
     315             :      */
     316          10 :     Fragment *s = sub(tree->children[1], state);
     317          10 :     join(e, s);
     318             :   }
     319          55 :   return e;
     320             : }
     321             : 
     322          45 : static Fragment *regex(PTree *tree, State *state)
     323             : {
     324          45 :   assert(tree->nt == REGEXnt);
     325          45 :   Fragment *s = sub(tree->children[0], state);
     326          45 :   if (tree->nchildren == 3) {
     327             :     /*
     328             :           split L1 L2     ;; this is "pre"
     329             :       L1:
     330             :           BLOCK from s
     331             :           jump L3         ;; this is "j"
     332             :       L2:
     333             :           BLOCK from r
     334             :       L3:
     335             :           match           ;; this is "m"
     336             :      */
     337             : 
     338           1 :     Fragment *r = regex(tree->children[2], state);
     339             : 
     340           1 :     Fragment *pre = newfrag(Split, state);
     341           1 :     pre->in.x = (Instr*) s->id;
     342           1 :     pre->in.y = (Instr*) r->id;
     343           1 :     pre->next = s;
     344             : 
     345           1 :     Fragment *m = newfrag(Match, state);
     346           1 :     Fragment *j = newfrag(Jump, state);
     347           1 :     j->in.x = (Instr*) m->id;
     348           1 :     j->next = r;
     349           1 :     join(j, m);
     350           1 :     join(pre, j);
     351             : 
     352           1 :     return pre;
     353             :   }
     354          44 :   return s;
     355             : }
     356             : 
     357           6 : static Fragment *class(PTree *tree, State *state, bool is_negative)
     358             : {
     359           6 :   size_t nranges = 0;
     360             :   PTree *curr;
     361             :   Fragment *f;
     362             : 
     363          26 :   for (curr = tree; curr->nt == CLASSnt; curr = curr->children[curr->nchildren-1]) {
     364          20 :     nranges++;
     365             :   }
     366             : 
     367           6 :   if (is_negative) {
     368           3 :     f = newfrag(NRange, state);
     369             :   } else {
     370           3 :     f = newfrag(Range, state);
     371             :   }
     372             : 
     373           6 :   f->in.s = nranges;
     374           6 :   f->in.x = calloc(nranges*2, sizeof(char));
     375           6 :   char *block = (char*)f->in.x;
     376             : 
     377           6 :   curr = tree;
     378           6 :   nranges = 0;
     379          32 :   while (curr->nt == CLASSnt) {
     380          20 :     if (curr->production == 1 || curr->production == 2) {
     381             :       // Range
     382           7 :       block[2*nranges] = curr->children[0]->tok.c;
     383           7 :       block[2*nranges+1] = curr->children[1]->tok.c;
     384             :     } else {
     385             :       // Single
     386          13 :       block[2*nranges] = curr->children[0]->tok.c;
     387          13 :       block[2*nranges+1] = curr->children[0]->tok.c;
     388             :     }
     389          20 :     curr = curr->children[curr->nchildren-1];
     390          20 :     nranges++;
     391             :   }
     392             : 
     393           6 :   f->next = newfrag(Match, state);
     394           6 :   return f;
     395             : }
     396             : 
     397          37 : Regex codegen(PTree *tree)
     398             : {
     399             :   // Generate code.
     400          37 :   State s = {0, 0};
     401          37 :   Fragment *f = regex(tree, &s);
     402             :   size_t n;
     403             : 
     404             :   // Get the length of the code
     405          37 :   n = fraglen(f);
     406             : 
     407             :   // Allocate buffers for the code, and for a lookup table of targets for jumps.
     408          37 :   Instr *code = calloc(n, sizeof(Instr));
     409          37 :   size_t *targets = calloc(s.id, sizeof(size_t));
     410             : 
     411             :   // Fill up the lookup table.
     412          37 :   size_t i = 0;
     413             :   Fragment *curr;
     414         175 :   for (curr = f; curr; curr = curr->next, i++) {
     415         138 :     targets[curr->id] = i;
     416             :   }
     417             : 
     418             :   // Now, copy in the Instructions, replacing the jump targets from the table.
     419         175 :   for (curr = f, i = 0; curr; curr = curr->next, i++) {
     420         138 :     code[i] = curr->in;
     421         138 :     if (code[i].code == Jump || code[i].code == Split) {
     422          39 :       code[i].x = code + targets[(intptr_t)code[i].x];
     423             :     }
     424         138 :     if (code[i].code == Split) {
     425          23 :       code[i].y = code + targets[(intptr_t)code[i].y];
     426             :     }
     427             :   }
     428             : 
     429          37 :   free(targets);
     430          37 :   freefraglist(f);
     431          37 :   return (Regex){.n=n, .i=code};
     432             : }

Generated by: LCOV version 1.11