LCOV - code coverage report
Current view: top level - src/regex - parse.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 158 162 97.5 %
Date: 2016-12-21 02:12:01 Functions: 16 16 100.0 %

          Line data    Source code
       1             : /***************************************************************************//**
       2             : 
       3             :   @file         parse.c
       4             : 
       5             :   @author       Stephen Brennan
       6             : 
       7             :   @date         Created Saturday, 23 January 2016
       8             : 
       9             :   @brief        Recursive descent regex parser!  (modeled after Wikipedia)
      10             : 
      11             :   @copyright    Copyright (c) 2016, Stephen Brennan.  Released under the Revised
      12             :                 BSD License.  See LICENSE.txt for details.
      13             : 
      14             : *******************************************************************************/
      15             : 
      16             : #include <stdio.h>
      17             : #include <stdbool.h>
      18             : #include <stdlib.h>
      19             : #include <wchar.h>
      20             : 
      21             : #include "libstephen/re.h"
      22             : #include "libstephen/re_internals.h"
      23             : 
      24             : char *names[] = {
      25             :   "CharSym", "Special", "Eof", "LParen", "RParen", "LBracket", "RBracket",
      26             :   "Plus", "Minus", "Star", "Question", "Caret", "Pipe", "Dot"
      27             : };
      28             : 
      29             : char *ntnames[] = {
      30             :   "TERM", "EXPR", "REGEX", "CLASS", "SUB"
      31             : };
      32             : 
      33             : /*
      34             :   Convenience functions for parse trees.
      35             :  */
      36             : 
      37         245 : static PTree *terminal_tree(Token tok)
      38             : {
      39         245 :   PTree *tree = calloc(1, sizeof(PTree));
      40         245 :   tree->nchildren = 0;
      41         245 :   tree->production = 0; // marks this as terminal
      42         245 :   tree->tok = tok;
      43         245 :   return tree;
      44             : }
      45             : 
      46         460 : static PTree *nonterminal_tree(NTSym nt, size_t nchildren)
      47             : {
      48         460 :   PTree *tree = calloc(1, sizeof(PTree));
      49         460 :   tree->nchildren = nchildren;
      50         460 :   tree->production = 1; // update this on return.
      51         460 :   tree->nt = nt;
      52         460 :   return tree;
      53             : }
      54             : 
      55         621 : void free_tree(PTree *tree)
      56             : {
      57        1145 :   for (size_t i = 0; i < tree->nchildren; i++) {
      58         524 :     free_tree(tree->children[i]);
      59             :   }
      60         621 :   free(tree);
      61         621 : }
      62             : 
      63             : /*
      64             :   Simple convenience functions for a parser.
      65             :  */
      66             : 
      67        1057 : static bool accept(TSym s, Lexer *l)
      68             : {
      69        1057 :   if (l->tok.sym == s) {
      70         251 :     nextsym(l);
      71         251 :     return true;
      72             :   }
      73         806 :   return false;
      74             : }
      75             : 
      76          58 : static void expect(TSym s, Lexer *l)
      77             : {
      78          58 :   if (l->tok.sym == s) {
      79          58 :     nextsym(l);
      80          58 :     return;
      81             :   }
      82           0 :   fprintf(stderr, "error: expected %s, got %s\n", names[s], names[l->tok.sym]);
      83           0 :   exit(1);
      84             : }
      85             : 
      86         103 : PTree *TERM(Lexer *l)
      87             : {
      88         126 :   if (accept(CharSym, l) || accept(Dot, l) || accept(Special, l) ||
      89          44 :       accept(Caret, l) || accept(Minus, l)) {
      90          84 :     PTree *result = nonterminal_tree(TERMnt, 1);
      91          84 :     result->children[0] = terminal_tree(l->prev);
      92          84 :     result->production = 1;
      93          84 :     return result;
      94          19 :   } else if (accept(LParen, l)) {
      95           9 :     PTree *result = nonterminal_tree(TERMnt, 3);
      96           9 :     result->children[0] = terminal_tree(l->prev);
      97           9 :     result->children[1] = REGEX(l);
      98           9 :     expect(RParen, l);
      99           9 :     result->children[2] = terminal_tree(l->prev);
     100           9 :     result->production = 2;
     101           9 :     return result;
     102          10 :   } else if (accept(LBracket, l)) {
     103             :     PTree *result;
     104          10 :     if (accept(Caret, l)) {
     105           5 :       result = nonterminal_tree(TERMnt, 3);
     106           5 :       result->children[0] = terminal_tree((Token){LBracket, '['});
     107           5 :       result->children[1] = CLASS(l);
     108           5 :       expect(RBracket, l);
     109           5 :       result->children[2] = terminal_tree(l->prev);
     110           5 :       result->production = 4;
     111             :     } else {
     112           5 :       result = nonterminal_tree(TERMnt, 3);
     113           5 :       result->children[0] = terminal_tree((Token){LBracket, '['});
     114           5 :       result->children[1] = CLASS(l);
     115           5 :       expect(RBracket, l);
     116           5 :       result->children[2] = terminal_tree(l->prev);
     117           5 :       result->production = 3;
     118             :     }
     119          10 :     return result;
     120             :   } else {
     121           0 :     fprintf(stderr, "error: TERM: syntax error\n");
     122           0 :     exit(1);
     123             :   }
     124             : }
     125             : 
     126          87 : PTree *EXPR(Lexer *l)
     127             : {
     128          87 :   PTree *result = nonterminal_tree(EXPRnt, 1);
     129          87 :   result->children[0] = TERM(l);
     130          87 :   if (accept(Plus, l) || accept(Star, l) || accept(Question, l)) {
     131          40 :     result->nchildren++;
     132          40 :     result->children[1] = terminal_tree(l->prev);
     133          40 :     if (accept(Question, l)) {
     134           9 :       result->nchildren++;
     135           9 :       result->children[2] = terminal_tree((Token){Question, '?'});
     136             :     }
     137             :   }
     138          87 :   return result;
     139             : }
     140             : 
     141          61 : PTree *SUB(Lexer *l)
     142             : {
     143          61 :   PTree *result = nonterminal_tree(SUBnt, 1);
     144          61 :   PTree *orig = result, *prev = result;
     145             : 
     146         195 :   while (l->tok.sym != Eof && l->tok.sym != RParen && l->tok.sym != Pipe) { // seems like a bit of a hack
     147          73 :     result->children[0] = EXPR(l);
     148          73 :     result->children[1] = nonterminal_tree(SUBnt, 0);
     149          73 :     result->nchildren = 2;
     150          73 :     prev = result;
     151          73 :     result = result->children[1];
     152             :   }
     153             : 
     154             :   // This prevents SUB nonterminals with no children in the final parse tree.
     155          61 :   if (prev != result) {
     156          61 :     prev->nchildren = 1;
     157          61 :     free(result);
     158             :   }
     159          61 :   return orig;
     160             : }
     161             : 
     162          57 : PTree *REGEX(Lexer *l)
     163             : {
     164          57 :   PTree *result = nonterminal_tree(REGEXnt, 1);
     165          57 :   result->children[0] = SUB(l);
     166             : 
     167          57 :   if (accept(Pipe, l)) {
     168           5 :     result->nchildren = 3;
     169           5 :     result->children[1] = terminal_tree(l->prev);
     170           5 :     result->children[2] = REGEX(l);
     171             :   }
     172          57 :   return result;
     173             : }
     174             : 
     175          99 : bool CCHAR(Lexer *l)
     176             : {
     177          99 :   TSym acceptable[] = {CharSym, Dot, LParen, RParen, Plus, Star, Question, Pipe};
     178         451 :   for (size_t i = 0; i < nelem(acceptable); i++) {
     179         414 :     if (accept(acceptable[i], l)) {
     180          62 :       l->prev.sym = CharSym;
     181          62 :       return true;
     182             :     }
     183             :   }
     184          37 :   return false;
     185             : }
     186             : 
     187          30 : PTree *CLASS(Lexer *l)
     188             : {
     189          30 :   PTree *result = nonterminal_tree(CLASSnt, 0), *curr, *prev;
     190             :   Token t1, t2, t3;
     191          30 :   curr = result;
     192             : 
     193             :   while (true) {
     194          79 :     if (CCHAR(l)) {
     195          49 :       prev = curr;
     196          49 :       t1 = l->prev;
     197          49 :       if (accept(Minus, l)) {
     198          20 :         t2 = l->prev;
     199          20 :         if (CCHAR(l)) {
     200          13 :           t3 = l->prev;
     201             :           // We have ourselves a range!  Parse it.
     202          13 :           curr->children[0] = terminal_tree(t1);
     203          13 :           curr->children[1] = terminal_tree(t3);
     204          13 :           curr->children[2] = nonterminal_tree(CLASSnt, 0);
     205          13 :           curr->nchildren = 3;
     206          13 :           curr->production = 1;
     207          13 :           curr = curr->children[2];
     208             :         } else {
     209             :           // character followed by minus, but not range.
     210           7 :           unget(t2, l);
     211           7 :           curr->children[0] = terminal_tree(t1);
     212           7 :           curr->children[1] = nonterminal_tree(CLASSnt, 0);
     213           7 :           curr->nchildren = 2;
     214           7 :           curr->production = 3;
     215           7 :           curr = curr->children[1];
     216             :         }
     217             :       } else {
     218             :         // just a character
     219          29 :         curr->children[0] = terminal_tree(t1);
     220          29 :         curr->children[1] = nonterminal_tree(CLASSnt, 0);
     221          29 :         curr->nchildren = 2;
     222          29 :         curr->production = 3;
     223          29 :         curr = curr->children[1];
     224             :       }
     225          30 :     } else if (accept(Minus, l)) {
     226             :       // just a minus
     227           7 :       prev = curr;
     228           7 :       curr->children[0] = terminal_tree(l->prev);
     229           7 :       curr->nchildren = 1;
     230           7 :       curr->production = 5;
     231           7 :       break;
     232             :     } else {
     233          23 :       free(curr);
     234          23 :       prev->nchildren--;
     235          23 :       prev->production++;
     236          23 :       break;
     237             :     }
     238          49 :   }
     239          30 :   return result;
     240             : }
     241             : 
     242          39 : static PTree *reparse_internal(struct Input input)
     243             : {
     244             :   Lexer l;
     245             : 
     246             :   // Initialize the lexer.
     247          39 :   l.input = input;
     248          39 :   l.index = 0;
     249          39 :   l.nbuf = 0;
     250          39 :   l.tok = (Token){.sym=0, .c=0};
     251             : 
     252             :   // Create a parse tree!
     253             :   //printf(";; TOKENS:\n");
     254          39 :   nextsym(&l);
     255          39 :   PTree *tree = REGEX(&l);
     256          39 :   expect(Eof, &l);
     257             : 
     258          39 :   return tree;
     259             : }
     260             : 
     261          29 : PTree *reparse(const char *input)
     262             : {
     263          29 :   struct Input in = {.str=input, .wstr=NULL};
     264          29 :   return reparse_internal(in);
     265             : }
     266             : 
     267          10 : PTree *reparsew(const wchar_t *winput)
     268             : {
     269          10 :   struct Input in = {.str=NULL, .wstr=winput};
     270          10 :   return reparse_internal(in);
     271             : }
     272             : 
     273          28 : Regex recomp(const char *regex)
     274             : {
     275          28 :   PTree *tree = reparse(regex);
     276          28 :   Regex code = codegen(tree);
     277          28 :   free_tree(tree);
     278          28 :   return code;
     279             : }
     280             : 
     281           9 : Regex recompw(const wchar_t *regex)
     282             : {
     283           9 :   PTree *tree = reparsew(regex);
     284           9 :   Regex code = codegen(tree);
     285           9 :   free_tree(tree);
     286           9 :   return code;
     287             : }

Generated by: LCOV version 1.11