LCOV - code coverage report
Current view: top level - src/regex - pike.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 95 105 90.5 %
Date: 2016-12-21 02:12:01 Functions: 8 9 88.9 %

          Line data    Source code
       1             : /***************************************************************************//**
       2             : 
       3             :   @file         pike.c
       4             : 
       5             :   @author       Stephen Brennan, based on code by Russ Cox:
       6             :                 https://swtch.com/~rsc/regexp/regexp2.html
       7             : 
       8             :   @date         Created Wednesday, 20 January 2016
       9             : 
      10             :   @brief        Regex implementation using virtual machines.
      11             : 
      12             :   @copyright    Copyright (c) 2016, Stephen Brennan.  Released under the Revised
      13             :                 BSD License.  See LICENSE.txt for details.
      14             : 
      15             :                 Portions of this code are based on code by Russ Cox, Copyright
      16             :                 2007-2009.
      17             : 
      18             : *******************************************************************************/
      19             : 
      20             : #include <assert.h>
      21             : #include <stdlib.h>
      22             : #include <stdio.h>
      23             : #include <stdbool.h>
      24             : #include <string.h>
      25             : #include <stdint.h>
      26             : #include <unistd.h>
      27             : #include <wchar.h>
      28             : 
      29             : #include "libstephen/re.h"
      30             : #include "libstephen/re_internals.h"
      31             : 
      32             : // Declarations:
      33             : 
      34             : typedef struct thread thread;
      35             : struct thread {
      36             :   Instr *pc;
      37             :   size_t *saved;
      38             : };
      39             : 
      40             : typedef struct thread_list thread_list;
      41             : struct thread_list {
      42             :   thread *t;
      43             :   size_t n;
      44             : };
      45             : 
      46             : // Printing, for diagnostics
      47             : 
      48           0 : void printthreads(thread_list *tl, Instr *prog, size_t nsave) {
      49           0 :   for (size_t i = 0; i < tl->n; i++) {
      50           0 :     printf("T%zu@pc=%lu{", i, (intptr_t) (tl->t[i].pc - prog));
      51           0 :     for (size_t j = 0; j < nsave; j++) {
      52           0 :       printf("%lu,", tl->t[i].saved[j]);
      53             :     }
      54           0 :     printf("} ");
      55             :   }
      56           0 :   printf("\n");
      57           0 : }
      58             : 
      59             : // Helper evaluation functions for instructions
      60             : 
      61          32 : bool range(Instr in, wchar_t test) {
      62          32 :   if (test == L'\0') {
      63           0 :     return false;
      64             :   }
      65          32 :   bool result = false;
      66          32 :   char *block = (char *) in.x;
      67             : 
      68             :   // use in.s for number of ranges, in.x as char* for ranges.
      69          80 :   for (size_t i = 0; i < in.s; i++) {
      70          68 :     if (block[i*2] <= test && test <= block [i*2 + 1]) {
      71          20 :       result = true;
      72          20 :       break; // short circuit yo!
      73             :     }
      74             :   }
      75             : 
      76             :   // negate result for negative ranges
      77          32 :   if (in.code == Range) {
      78          16 :     return result;
      79             :   } else {
      80          16 :     return !result;
      81             :   }
      82             : }
      83             : 
      84             : // Pike VM functions:
      85             : 
      86         188 : thread_list newthread_list(size_t n)
      87             : {
      88             :   thread_list tl;
      89         188 :   tl.t = calloc(n, sizeof(thread));
      90         188 :   tl.n = 0;
      91         188 :   return tl;
      92             : }
      93             : 
      94         682 : void addthread(thread_list *threads, Instr *pc, size_t *saved, size_t nsave,
      95             :                size_t sp)
      96             : {
      97             :   //printf("addthread(): pc=%d, saved={%u, %u}, sp=%u, lastidx=%u\n", pc - extprog,
      98             :   //       saved[0], saved[1], sp, pc->lastidx);
      99         682 :   if (pc->lastidx == sp) {
     100             :     // we've executed this instruction on this string index already
     101          20 :     free(saved);
     102         702 :     return;
     103             :   }
     104         662 :   pc->lastidx = sp;
     105             : 
     106             :   size_t *newsaved;
     107         662 :   switch (pc->code) {
     108             :   case Jump:
     109          88 :     addthread(threads, pc->x, saved, nsave, sp);
     110          88 :     break;
     111             :   case Split:
     112         148 :     newsaved = calloc(nsave, sizeof(size_t));
     113         148 :     memcpy(newsaved, saved, nsave * sizeof(size_t));
     114         148 :     addthread(threads, pc->x, saved, nsave, sp);
     115         148 :     addthread(threads, pc->y, newsaved, nsave, sp);
     116         148 :     break;
     117             :   case Save:
     118          64 :     saved[pc->s] = sp;
     119          64 :     addthread(threads, pc + 1, saved, nsave, sp);
     120          64 :     break;
     121             :   default:
     122         362 :     threads->t[threads->n].pc = pc;
     123         362 :     threads->t[threads->n].saved = saved;
     124         362 :     threads->n++;
     125         362 :     break;
     126             :   }
     127             : }
     128             : 
     129             : /**
     130             :    @brief "Stash" a list of captures into the "out" pointer.
     131             :    @param new The new list of captures encountered by the Match instruction.
     132             :    @param destination The out pointer where the caller wants the captures.
     133             :  */
     134         116 : void stash(size_t *new, size_t **destination)
     135             : {
     136         116 :   if (!destination) {
     137             :     /* If the user wants to discard the captures, they'll pass NULL.  This means
     138             :        we need to get rid of the capture list, or we'll leak the memory. */
     139         102 :     free(new);
     140         218 :     return;
     141             :   }
     142          14 :   if (*destination) {
     143             :     /* If we have already stored a capture list, we should free that. */
     144           4 :     free(*destination);
     145             :   }
     146             :   /* Finally, stash the pointer away. */
     147          14 :   *destination = new;
     148             : }
     149             : 
     150          94 : static ssize_t reexec_internal(Regex r, const struct Input input, size_t **saved)
     151             : {
     152             :   // Can have at most n threads, where n is the length of the program.  This is
     153             :   // because (as it is now) the thread state is simply a program counter.
     154          94 :   thread_list curr = newthread_list(r.n);
     155          94 :   thread_list next = newthread_list(r.n);
     156             :   thread_list temp;
     157          94 :   size_t nsave = 0;
     158          94 :   ssize_t match = -1;
     159             : 
     160             :   // Set the out pointer to NULL so that stash() knows whether we've already
     161             :   // stashed away a capture list.
     162          94 :   if (saved) {
     163          10 :     *saved = NULL;
     164             :   }
     165             : 
     166             :   // Need to initialize lastidx to something that will never be used.
     167         458 :   for (size_t i = 0; i < r.n; i++) {
     168         364 :     r.i[i].lastidx = (size_t)-1;
     169         364 :     if (r.i[i].code == Save) {
     170          36 :       nsave++;
     171             :     }
     172             :   }
     173             : 
     174             :   // Start with a single thread and add more as we need.  Note that addthread()
     175             :   // will execute instructions that don't consume input (i.e. epsilon closure).
     176          94 :   addthread(&curr, r.i, calloc(nsave, sizeof(size_t)), nsave, 0);
     177             : 
     178             :   size_t sp;
     179         308 :   for (sp = 0; curr.n > 0; sp++) {
     180             : 
     181             :     //printf("consider input %c\nthreads: ", input[sp]);
     182             :     //printthreads(&curr, r.i, nsave);
     183             : 
     184             :     // Execute each thread (this will only ever reach instructions that consume
     185             :     // input, since addthread() stops with those).
     186         460 :     for (size_t t = 0; t < curr.n; t++) {
     187         362 :       Instr *pc = curr.t[t].pc;
     188             : 
     189         362 :       switch (pc->code) {
     190             :       case Char:
     191         200 :         if (InputIdx(input, sp) != pc->c) {
     192          88 :           free(curr.t[t].saved);
     193          88 :           break; // fail, don't continue executing this thread
     194             :         }
     195             :         // add thread containing the next instruction to the next thread list.
     196         112 :         addthread(&next, pc+1, curr.t[t].saved, nsave, sp+1);
     197         112 :         break;
     198             :       case Any:
     199          14 :         if (InputIdx(input, sp) == '\0') {
     200           2 :           free(curr.t[t].saved);
     201           2 :           break; // dot can't match end of string!
     202             :         }
     203             :         // add thread containing the next instruction to the next thread list.
     204          12 :         addthread(&next, pc+1, curr.t[t].saved, nsave, sp+1);
     205          12 :         break;
     206             :       case Range:
     207             :       case NRange:
     208          32 :         if (!range(*pc, InputIdx(input, sp))) {
     209          16 :           free(curr.t[t].saved);
     210          16 :           break;
     211             :         }
     212          16 :         addthread(&next, pc+1, curr.t[t].saved, nsave, sp+1);
     213          16 :         break;
     214             :       case Match:
     215         116 :         stash(curr.t[t].saved, saved);
     216         116 :         match = sp;
     217         116 :         goto cont;
     218             :       default:
     219           0 :         assert(false);
     220             :         break;
     221             :       }
     222             :     }
     223             : 
     224             :   cont:
     225             :     // Swap the curr and next lists.
     226         214 :     temp = curr;
     227         214 :     curr = next;
     228         214 :     next = temp;
     229             : 
     230             :     // Reset our new next list.
     231         214 :     next.n = 0;
     232             :   }
     233             : 
     234          94 :   free(curr.t);
     235          94 :   free(next.t);
     236          94 :   return match;
     237             : }
     238             : 
     239          54 : ssize_t reexec(Regex r, const char *input, size_t **saved)
     240             : {
     241          54 :   struct Input in = {.str=input, .wstr=NULL};
     242          54 :   return reexec_internal(r, in, saved);
     243             : }
     244             : 
     245          40 : ssize_t reexecw(Regex r, const wchar_t *input, size_t **saved)
     246             : {
     247          40 :   struct Input in = {.str=NULL, .wstr=input};
     248          40 :   return reexec_internal(r, in, saved);
     249             : }
     250             : 
     251           6 : size_t renumsaves(Regex r)
     252             : {
     253           6 :   size_t ns = 0;
     254          50 :   for (size_t i = 0; i < r.n; i++) {
     255          44 :     if (r.i[i].code == Save && r.i[i].s > ns) {
     256           6 :       ns = r.i[i].s;
     257             :     }
     258             :   }
     259           6 :   return ns + 1;
     260             : }

Generated by: LCOV version 1.11