mirror of
https://github.com/dlang/phobos.git
synced 2025-04-30 15:10:46 +03:00

The docs and API still stay in one file. With time and refactoring, more internals may be exposed, such as the parser, each engine explicitly, and the sample generator (generator.d). Also includes changes prompted by reviews/pulls: Convert spaces-->tabs in makefiles. Move things (again) to std/regex/internal. Use new package(std.regex) feature. Remove C-style arrays (some other pull against regex.d).
947 lines
32 KiB
D
947 lines
32 KiB
D
//Written in the D programming language
|
|
/*
|
|
Implementation of Thompson NFA std.regex engine.
|
|
Key point is evaluation of all possible threads (state) at each step
|
|
in a breadth-first manner, thereby getting some nice properties:
|
|
- looking at each character only once
|
|
- merging of equivalent threads, that gives matching process linear time complexity
|
|
*/
|
|
module std.regex.internal.thompson;
|
|
|
|
package(std.regex):
|
|
|
|
import std.regex.internal.ir;
|
|
import std.range;
|
|
|
|
//State of one VM thread, i.e. one candidate path through the regex program.
//Threads are chained through `next`, both inside a ThreadList and on the
//matcher's free list.
struct Thread(DataIndex)
{
    Thread* next;       //intrusive linked list
    uint pc;            //index in the IR program of the instruction to run next
    uint counter;       //loop counter
    uint uopCounter;    //counts micro operations inside one macro instruction (e.g. BackRef)
    //Capture groups. Declared with a single slot; ThompsonMatcher.getThreadSize
    //sizes the real allocation from re.ngroup and the matcher reaches the
    //slots through matches.ptr.
    Group!DataIndex[1] matches;
}
|
|
|
|
//Singly-linked list of threads that keeps track of both its first node (tip)
//and its last node (toe).
struct ThreadList(DataIndex)
{
    Thread!DataIndex* tip = null, toe = null;

    //add new thread to the start of list
    void insertFront(Thread!DataIndex* t)
    {
        t.next = tip;       //null when the list was empty
        if(tip is null)
            toe = t;        //the only element is also the last one
        tip = t;
    }

    //add new thread to the end of list
    void insertBack(Thread!DataIndex* t)
    {
        if(toe is null)
            tip = t;
        else
            toe.next = t;
        toe = t;
        t.next = null;
    }

    //move head element out of list; returns null when the list is empty
    Thread!DataIndex* fetch()
    {
        auto head = tip;
        if(tip == toe)      //zero or one element: the list ends up empty
            tip = toe = null;
        else
            tip = tip.next;
        return head;
    }

    //non-destructive iteration of ThreadList
    struct ThreadRange
    {
        const(Thread!DataIndex)* ct;    //current node, null once the walk is over
        this(ThreadList tlist){ ct = tlist.tip; }
        @property bool empty(){ return ct is null; }
        @property const(Thread!DataIndex)* front(){ return ct; }
        //a range primitive is a plain function: explicit void, not a property
        void popFront()
        {
            assert(ct);
            ct = ct.next;
        }
    }

    //true if the list holds no threads
    @property bool empty()
    {
        return tip is null;
    }

    //range over the threads currently in the list, first to last
    ThreadRange opSlice()
    {
        return ThreadRange(this);
    }
}
|
|
|
|
/+
|
|
Thompson matcher does all matching in lockstep,
|
|
never looking at the same char twice
|
|
+/
|
|
@trusted struct ThompsonMatcher(Char, Stream = Input!Char)
|
|
if(is(Char : dchar))
|
|
{
|
|
alias DataIndex = Stream.DataIndex;
Thread!DataIndex* freelist;         //head of the list of unused preallocated threads
ThreadList!DataIndex clist, nlist;  //threads run at this step / threads queued for the next step
DataIndex[] merge;                  //per-hotspot genCounter of the last thread that passed there
Group!DataIndex[] backrefed;        //groups read by Backref when the instruction is not a local reference
Regex!Char re;                      //regex program
Stream s;                           //input being matched
dchar front;                        //code point most recently fetched from s
DataIndex index;                    //index that s reported together with front
DataIndex genCounter;               //merge trace counter, goes up on every dchar
size_t[size_t] subCounters;         //a table of gen counter per sub-engine: PC -> counter
size_t threadSize;                  //bytes taken by one thread, see getThreadSize
bool matched;                       //set by finish() when a thread completes a match
bool exhausted;                     //once set, match() returns false right away
//true when Stream has a member named "search"; search() is only compiled in then
enum kicked = __traits(hasMember, Stream, "search");
|
|
|
|
//Bytes needed for one thread of program re: the Thread struct already has
//room for one group, so add (ngroup-1) more, or take that one away when the
//program has no groups at all.
static size_t getThreadSize(const ref Regex!Char re)
{
    enum baseSize = (Thread!DataIndex).sizeof;
    enum groupSize = (Group!DataIndex).sizeof;
    if(re.ngroup == 0)
        return baseSize - groupSize;
    return baseSize + (re.ngroup-1)*groupSize;
}
|
|
|
|
//Bytes of external memory a matcher for re asks for: storage for
//re.threadCount threads followed by the merge (hotspot) table.
static size_t initialMemory(const ref Regex!Char re)
{
    immutable threadBytes = getThreadSize(re)*re.threadCount;
    immutable mergeBytes = re.hotspotTableSize*size_t.sizeof;
    return threadBytes + mergeBytes;
}
|
|
|
|
//true if the current position is the very beginning of input (index 0)
@property bool atStart()
{
    return index == 0;
}
|
|
|
|
//true if the current position is the stream's last index and the stream
//itself reports that it is at its end
@property bool atEnd()
{
    if(index != s.lastIndex)
        return false;
    return s.atEnd;
}
|
|
|
|
//Fetch the next code point from the stream into front/index.
//Returns false, leaving index at s.lastIndex, when the stream yields nothing.
bool next()
{
    if(s.nextChar(front, index))
        return true;
    index = s.lastIndex;
    return false;
}
|
|
|
|
static if(kicked)
{
    //Like next(), but lets the stream jump ahead using re.kickstart.
    //Returns false, leaving index at s.lastIndex, when the search fails.
    bool search()
    {
        if(s.search(re.kickstart, front, index))
            return true;
        index = s.lastIndex;
        return false;
    }
}
|
|
|
|
//Carve the caller-provided chunk into the thread free list and, if the
//program has hotspots, a zeroed merge table.
void initExternalMemory(void[] memory)
{
    threadSize = getThreadSize(re);
    //takes memory by ref and leaves it pointing past the thread storage
    prepareFreeList(re.threadCount, memory);
    if(!re.hotspotTableSize)
        return;
    merge = arrayInChunk!(DataIndex)(re.hotspotTableSize, memory);
    merge[] = 0;
}
|
|
|
|
//Build a matcher that runs program over stream, keeping its threads and
//merge table in the given memory chunk.
this()(Regex!Char program, Stream stream, void[] memory)
{
    s = stream;
    re = program;
    genCounter = 0;
    initExternalMemory(memory); //needs re to be set
}
|
|
|
|
//Build a sub-matcher that runs only the given piece of bytecode over stream,
//borrowing the thread storage and merge table of an existing matcher.
this(S)(ref ThompsonMatcher!(Char,S) matcher, Bytecode[] piece, Stream stream)
{
    //input: the new stream, starting from the parent's front and index
    s = stream;
    front = matcher.front;
    index = matcher.index;
    //program: the parent's regex with its IR replaced by the piece
    re = matcher.re;
    re.ir = piece;
    //storage taken over from the parent
    threadSize = matcher.threadSize;
    merge = matcher.merge;
    freelist = matcher.freelist;
}
|
|
|
|
//Sub-matcher over the same stream type for the given piece of bytecode,
//with its generation counter starting at counter.
auto fwdMatcher()(Bytecode[] piece, size_t counter)
{
    auto sub = ThompsonMatcher!(Char, Stream)(this, piece, s);
    sub.genCounter = counter;
    return sub;
}
|
|
|
|
//Sub-matcher over s.loopBack(index) for the given piece of bytecode, with its
//generation counter starting at counter and its first code point already fetched.
auto bwdMatcher()(Bytecode[] piece, size_t counter)
{
    auto reversed = s.loopBack(index);
    auto sub = ThompsonMatcher!(Char, typeof(reversed))(this, piece, reversed);
    sub.genCounter = counter;
    sub.next();
    return sub;
}
|
|
|
|
//Copy of this matcher whose free list and merge table live in the given
//memory chunk and whose generation counter is reset to 0.
auto dupTo(void[] memory)
{
    auto copy = this;   //bitblit
    copy.genCounter = 0;
    copy.initExternalMemory(memory);
    return copy;
}
|
|
|
|
//Outcome of matchOneShot
enum MatchResult
{
    NoMatch,        //no thread reached a finishing instruction
    PartialMatch,   //the stream gave no more characters while not reporting its end
    Match           //some thread finished
}
|
|
|
|
//Look for the next match and store its groups in matches.
//Returns true if one was found; once the matcher is exhausted always false.
bool match(Group!DataIndex[] matches)
{
    debug(std_regex_matcher)
        writeln("------------------------------------------");
    if(exhausted)
        return false;
    if(re.flags & RegexInfo.oneShot)
    {
        //a one-shot program gets exactly one attempt
        next();
        exhausted = true;
        immutable outcome = matchOneShot(matches);
        return outcome == MatchResult.Match;
    }
    static if(kicked)
    {
        //use the stream's search only if the program has a kickstart engine
        if(!re.kickstart.empty)
            return matchImpl!(true)(matches);
    }
    return matchImpl!(false)(matches);
}
|
|
|
|
//Match the input and fill matches.
//withSearch selects search() over next() whenever no thread is in flight.
//Returns true if a match was found.
bool matchImpl(bool withSearch)(Group!DataIndex[] matches)
{
    if(matched || !clist.empty)
    {
        //char in question is fetched in prev call to match
        matched = false;
    }
    else
    {
        static if(withSearch)
            search();
        else
            next();
    }

    if(!atEnd)//there is a char to work with
    {
        while(true)
        {
            genCounter++;
            debug(std_regex_matcher)
            {
                writefln("Threaded matching threads at %s", s[index..s.lastIndex]);
                foreach(t; clist[])
                {
                    assert(t);
                    writef("pc=%s ",t.pc);
                    write(t.matches);
                    writeln();
                }
            }
            //run every thread of the current step against front
            for(auto cur = clist.fetch(); cur !is null; cur = clist.fetch())
            {
                eval!true(cur, matches);
            }
            if(!matched)//if we already have match no need to push the engine
                eval!true(createStart(index), matches);//new thread starting at this position
            else if(nlist.empty)
            {
                debug(std_regex_matcher) writeln("Stopped matching before consuming full input");
                break;//not a partial match for sure
            }
            //the threads that consumed front become the current step
            clist = nlist;
            nlist = (ThreadList!DataIndex).init;
            if(clist.empty)
            {
                //no thread in flight
                static if(withSearch)
                    immutable advanced = search();
                else
                    immutable advanced = next();
                if(!advanced)
                    break;
            }
            else if(!next())
            {
                if (!atEnd) return false;
                exhausted = true;
                break;
            }
        }
    }

    genCounter++; //increment also on each end
    debug(std_regex_matcher) writefln("Threaded matching threads at end");
    //try out all zero-width possibilities
    for(auto cur = clist.fetch(); cur !is null; cur = clist.fetch())
    {
        eval!false(cur, matches);
    }
    if(!matched)
        eval!false(createStart(index), matches);//new thread starting at end of input
    if(!matched)
        return false;

    //in case NFA found match along the way
    //and last possible longer alternative ultimately failed
    s.reset(matches[0].end);//reset to last successful match
    next();//and reload front character
    //--- here the exact state of stream was restored ---
    exhausted = atEnd || !(re.flags & RegexOption.global);
    //+ empty match advances the input
    if(!exhausted && matches[0].begin == matches[0].end)
        next();
    return true;
}
|
|
|
|
/+
    handle successful threads: copy the groups of thread t into matches
    and raise the matched flag
+/
void finish(const(Thread!DataIndex)* t, Group!DataIndex[] matches)
{
    immutable ngroup = re.ngroup;
    matches.ptr[0..ngroup] = t.matches.ptr[0..ngroup];
    debug(std_regex_matcher)
    {
        writef("FOUND pc=%s prog_len=%s",
            t.pc, re.ir.length);
        if(!matches.empty)
            writefln(": %s..%s", matches[0].begin, matches[0].end);
        foreach(v; matches)
            writefln("%d .. %d", v.begin, v.end);
    }
    matched = true;
}
|
|
|
|
/+
    match thread against codepoint, cutting through all 0-width instructions
    and taking care of control flow, then add it to nlist

    t is run until it finishes, consumes the current code point (it is then
    appended to nlist) or fails (it is then recycled). Threads forked on the
    way are kept in a local worklist and run after t, in priority order.
    With withInput == false there is no code point to consume: instructions
    that have no case below simply drop the thread.
    matches receives the groups of a thread that reaches a finishing instruction.
+/
void eval(bool withInput)(Thread!DataIndex* t, Group!DataIndex[] matches)
{
    ThreadList!DataIndex worklist;
    debug(std_regex_matcher) writeln("---- Evaluating thread");
    for(;;)
    {
        debug(std_regex_matcher)
        {
            writef("\tpc=%s [", t.pc);
            foreach(x; worklist[])
                writef(" %s ", x.pc);
            writeln("]");
        }
        switch(re.ir[t.pc].code)
        {
        case IR.End:
            finish(t, matches);
            matches[0].end = index; //fix endpoint of the whole match
            recycle(t);
            //cut off low priority threads
            recycle(clist);
            recycle(worklist);
            debug(std_regex_matcher) writeln("Finished thread ", matches);
            return;
        case IR.Wordboundary:
            dchar back;
            DataIndex bi;
            //at start & end of input
            if(atStart && wordTrie[front])
            {
                t.pc += IRL!(IR.Wordboundary);
                break;
            }
            else if(atEnd && s.loopBack(index).nextChar(back, bi)
                    && wordTrie[back])
            {
                t.pc += IRL!(IR.Wordboundary);
                break;
            }
            else if(s.loopBack(index).nextChar(back, bi))
            {
                //boundary if exactly one of the neighbours is a word character
                bool af = wordTrie[front];
                bool ab = wordTrie[back];
                if(af ^ ab)
                {
                    t.pc += IRL!(IR.Wordboundary);
                    break;
                }
            }
            //no boundary here: drop the thread, go on with the next queued one
            recycle(t);
            t = worklist.fetch();
            if(!t)
                return;
            break;
        case IR.Notwordboundary:
            dchar back;
            DataIndex bi;
            //at start & end of input
            if(atStart && wordTrie[front])
            {
                recycle(t);
                t = worklist.fetch();
                if(!t)
                    return;
                break;
            }
            else if(atEnd && s.loopBack(index).nextChar(back, bi)
                    && wordTrie[back])
            {
                recycle(t);
                t = worklist.fetch();
                if(!t)
                    return;
                break;
            }
            else if(s.loopBack(index).nextChar(back, bi))
            {
                bool af = wordTrie[front];
                bool ab = wordTrie[back] != 0;
                if(af ^ ab)
                {
                    recycle(t);
                    t = worklist.fetch();
                    if(!t)
                        return;
                    break;
                }
            }
            //advance by the length of the instruction actually being executed
            t.pc += IRL!(IR.Notwordboundary);
            break;
        case IR.Bol:
            dchar back;
            DataIndex bi;
            if(atStart
                ||( (re.flags & RegexOption.multiline)
                && s.loopBack(index).nextChar(back,bi)
                && startOfLine(back, front == '\n')))
            {
                t.pc += IRL!(IR.Bol);
            }
            else
            {
                recycle(t);
                t = worklist.fetch();
                if(!t)
                    return;
            }
            break;
        case IR.Eol:
            debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index..s.lastIndex]);
            dchar back;
            DataIndex bi;
            //no matching inside \r\n
            if(atEnd || ((re.flags & RegexOption.multiline)
                && endOfLine(front, s.loopBack(index).nextChar(back, bi)
                    && back == '\r')))
            {
                t.pc += IRL!(IR.Eol);
            }
            else
            {
                recycle(t);
                t = worklist.fetch();
                if(!t)
                    return;
            }
            break;
        case IR.InfiniteStart, IR.InfiniteQStart:
            //jump to the matching end instruction, which decides what to run
            t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteStart);
            goto case IR.InfiniteEnd; //both Q and non-Q
        case IR.RepeatStart, IR.RepeatQStart:
            t.pc += re.ir[t.pc].data + IRL!(IR.RepeatStart);
            goto case IR.RepeatEnd; //both Q and non-Q
        case IR.RepeatEnd:
        case IR.RepeatQEnd:
            //len, step, min, max
            uint len = re.ir[t.pc].data;
            uint step =  re.ir[t.pc+2].raw;
            uint min = re.ir[t.pc+3].raw;
            if(t.counter < min)
            {
                //mandatory iterations: go round the loop again, no forking
                t.counter += step;
                t.pc -= len;
                break;
            }
            if(merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
            {
                debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
                                t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
                merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
            }
            else
            {
                //some thread already passed this hotspot in this generation
                debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
                                t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
                recycle(t);
                t = worklist.fetch();
                if(!t)
                    return;
                break;
            }
            uint max = re.ir[t.pc+4].raw;
            if(t.counter < max)
            {
                if(re.ir[t.pc].code == IR.RepeatEnd)
                {
                    //queue out-of-loop thread
                    worklist.insertFront(fork(t, t.pc + IRL!(IR.RepeatEnd),  t.counter % step));
                    t.counter += step;
                    t.pc -= len;
                }
                else
                {
                    //queue into-loop thread
                    worklist.insertFront(fork(t, t.pc - len,  t.counter + step));
                    t.counter %= step;
                    t.pc += IRL!(IR.RepeatEnd);
                }
            }
            else
            {
                //upper bound reached: the only way is out of the loop
                t.counter %= step;
                t.pc += IRL!(IR.RepeatEnd);
            }
            break;
        case IR.InfiniteEnd:
        case IR.InfiniteQEnd:
            if(merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
            {
                debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
                                t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
                merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
            }
            else
            {
                debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
                                t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
                recycle(t);
                t = worklist.fetch();
                if(!t)
                    return;
                break;
            }
            uint len = re.ir[t.pc].data;
            uint pc1, pc2; //branches to take in priority order
            if(re.ir[t.pc].code == IR.InfiniteEnd)
            {
                pc1 = t.pc - len;
                pc2 = t.pc + IRL!(IR.InfiniteEnd);
            }
            else
            {
                pc1 = t.pc + IRL!(IR.InfiniteEnd);
                pc2 = t.pc - len;
            }
            static if(withInput)
            {
                //fork only when the quick test does not rule out the first branch
                int test = quickTestFwd(pc1, front, re);
                if(test >= 0)
                {
                    worklist.insertFront(fork(t, pc2, t.counter));
                    t.pc = pc1;
                }
                else
                    t.pc = pc2;
            }
            else
            {
                worklist.insertFront(fork(t, pc2, t.counter));
                t.pc = pc1;
            }
            break;
        case IR.OrEnd:
            if(merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
            {
                debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
                                t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] );
                merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
                t.pc += IRL!(IR.OrEnd);
            }
            else
            {
                debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
                                t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] );
                recycle(t);
                t = worklist.fetch();
                if(!t)
                    return;
            }
            break;
        case IR.OrStart:
            t.pc += IRL!(IR.OrStart);
            goto case;
        case IR.Option:
            uint next = t.pc + re.ir[t.pc].data + IRL!(IR.Option);
            //queue next Option
            if(re.ir[next].code == IR.Option)
            {
                worklist.insertFront(fork(t, next, t.counter));
            }
            t.pc += IRL!(IR.Option);
            break;
        case IR.GotoEndOr:
            t.pc = t.pc + re.ir[t.pc].data + IRL!(IR.GotoEndOr);
            goto case IR.OrEnd;
        case IR.GroupStart:
            uint n = re.ir[t.pc].data;
            t.matches.ptr[n].begin = index;
            t.pc += IRL!(IR.GroupStart);
            break;
        case IR.GroupEnd:
            uint n = re.ir[t.pc].data;
            t.matches.ptr[n].end = index;
            t.pc += IRL!(IR.GroupEnd);
            break;
        case IR.Backref:
            uint n = re.ir[t.pc].data;
            Group!DataIndex* source = re.ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr;
            assert(source);
            if(source[n].begin == source[n].end)//zero-width Backref!
            {
                t.pc += IRL!(IR.Backref);
            }
            else static if(withInput)
            {
                //compare one code point of the referenced group per step;
                //uopCounter remembers how far into the group the thread got
                size_t idx = source[n].begin + t.uopCounter;
                size_t end = source[n].end;
                if(s[idx..end].front == front)
                {
                    t.uopCounter += std.utf.stride(s[idx..end], 0);
                    if(t.uopCounter + source[n].begin == source[n].end)
                    {//last codepoint
                        t.pc += IRL!(IR.Backref);
                        t.uopCounter = 0;
                    }
                    nlist.insertBack(t);
                }
                else
                    recycle(t);
                t = worklist.fetch();
                if(!t)
                    return;
                break;
            }
            else
            {
                recycle(t);
                t = worklist.fetch();
                if(!t)
                    return;
                break;
            }
            break;
        case IR.LookbehindStart:
        case IR.NeglookbehindStart:
            uint len = re.ir[t.pc].data;
            uint ms = re.ir[t.pc + 1].raw, me = re.ir[t.pc + 2].raw;
            uint end = t.pc + len + IRL!(IR.LookbehindEnd) + IRL!(IR.LookbehindStart);
            bool positive = re.ir[t.pc].code == IR.LookbehindStart;
            //lookbehind runs against the direction of the current stream
            static if(Stream.isLoopback)
                auto matcher = fwdMatcher(re.ir[t.pc .. end], subCounters.get(t.pc, 0));
            else
                auto matcher = bwdMatcher(re.ir[t.pc .. end], subCounters.get(t.pc, 0));
            matcher.re.ngroup = me - ms;
            matcher.backrefed = backrefed.empty ? t.matches : backrefed;
            //backMatch
            auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookbehindStart));
            //take back the free list the sub-matcher worked with and keep its counter
            freelist = matcher.freelist;
            subCounters[t.pc] = matcher.genCounter;
            if((mRes == MatchResult.Match) ^ positive)
            {
                recycle(t);
                t = worklist.fetch();
                if(!t)
                    return;
                break;
            }
            else
                t.pc = end;
            break;
        case IR.LookaheadStart:
        case IR.NeglookaheadStart:
            uint len = re.ir[t.pc].data;
            uint ms = re.ir[t.pc+1].raw, me = re.ir[t.pc+2].raw;
            uint end = t.pc+len+IRL!(IR.LookaheadEnd)+IRL!(IR.LookaheadStart);
            bool positive = re.ir[t.pc].code == IR.LookaheadStart;
            //lookahead runs along the direction of the current stream
            static if(Stream.isLoopback)
                auto matcher = bwdMatcher(re.ir[t.pc .. end], subCounters.get(t.pc, 0));
            else
                auto matcher = fwdMatcher(re.ir[t.pc .. end], subCounters.get(t.pc, 0));
            matcher.re.ngroup = me - ms;
            matcher.backrefed = backrefed.empty ? t.matches : backrefed;
            auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookaheadStart));
            freelist = matcher.freelist;
            subCounters[t.pc] = matcher.genCounter;
            //put the stream back to the current position and reload front
            s.reset(index);
            next();
            if((mRes == MatchResult.Match) ^ positive)
            {
                recycle(t);
                t = worklist.fetch();
                if(!t)
                    return;
                break;
            }
            else
                t.pc = end;
            break;
        case IR.LookaheadEnd:
        case IR.NeglookaheadEnd:
        case IR.LookbehindEnd:
        case IR.NeglookbehindEnd:
            finish(t, matches.ptr[0 .. re.ngroup]);
            recycle(t);
            //cut off low priority threads
            recycle(clist);
            recycle(worklist);
            return;
        case IR.Nop:
            t.pc += IRL!(IR.Nop);
            break;

            static if(withInput)
            {
        case IR.OrChar:
                uint len = re.ir[t.pc].sequence;
                uint end = t.pc + len;
                static assert(IRL!(IR.OrChar) == 1);
                for(; t.pc < end; t.pc++)
                    if(re.ir[t.pc].data == front)
                        break;
                if(t.pc != end)
                {
                    t.pc = end;
                    nlist.insertBack(t);
                }
                else
                    recycle(t);
                t = worklist.fetch();
                if(!t)
                    return;
                break;
        case IR.Char:
                if(front == re.ir[t.pc].data)
                {
                    t.pc += IRL!(IR.Char);
                    nlist.insertBack(t);
                }
                else
                    recycle(t);
                t = worklist.fetch();
                if(!t)
                    return;
                break;
        case IR.Any:
                t.pc += IRL!(IR.Any);
                if(!(re.flags & RegexOption.singleline)
                        && (front == '\r' || front == '\n'))
                    recycle(t);
                else
                    nlist.insertBack(t);
                t = worklist.fetch();
                if(!t)
                    return;
                break;
        case IR.CodepointSet:
                if(re.charsets[re.ir[t.pc].data].scanFor(front))
                {
                    t.pc += IRL!(IR.CodepointSet);
                    nlist.insertBack(t);
                }
                else
                {
                    recycle(t);
                }
                t = worklist.fetch();
                if(!t)
                    return;
                break;
        case IR.Trie:
                if(re.tries[re.ir[t.pc].data][front])
                {
                    t.pc += IRL!(IR.Trie);
                    nlist.insertBack(t);
                }
                else
                {
                    recycle(t);
                }
                t = worklist.fetch();
                if(!t)
                    return;
                break;
        default:
                assert(0, "Unrecognized instruction " ~ re.ir[t.pc].mnemonic);
            }
            else
            {
        default:
                //no input to consume: drop the thread
                recycle(t);
                t = worklist.fetch();
                if(!t)
                    return;
            }
        }
    }

}
|
|
//Special startPc value for matchOneShot: with it, matchOneShot skips creating
//its initial start thread and tolerates clist/nlist that are not empty.
enum uint RestartPc = uint.max;
|
|
//Match the input, evaluating IR without searching.
//A single start thread is created at the current position with pc = startPc.
//With startPc == RestartPc no start thread is created at all: the call only
//continues with the threads already sitting in clist.
//Returns Match/NoMatch, or PartialMatch when the stream gives no more
//characters without being at its end.
MatchResult matchOneShot(Group!DataIndex[] matches, uint startPc = 0)
{
    debug(std_regex_matcher)
    {
        writefln("---------------single shot match ----------------- ");
    }
    alias evalFn = eval;
    assert(clist == (ThreadList!DataIndex).init || startPc == RestartPc); // incorrect after a partial match
    assert(nlist == (ThreadList!DataIndex).init || startPc == RestartPc);
    if(!atEnd)//there is a char to work with
    {
        debug(std_regex_matcher)
        {
            writefln("-- Threaded matching threads at  %s",  s[index..s.lastIndex]);
        }
        if(startPc!=RestartPc)
        {
            auto startT = createStart(index, startPc);
            genCounter++;
            evalFn!true(startT, matches);
        }
        for(;;)
        {
            debug(std_regex_matcher) writeln("\n-- Started iteration of main cycle");
            genCounter++;
            debug(std_regex_matcher)
            {
                foreach(t; clist[])
                {
                    assert(t);
                }
            }
            for(Thread!DataIndex* t = clist.fetch(); t; t = clist.fetch())
            {
                evalFn!true(t, matches);
            }
            if(nlist.empty)
            {
                debug(std_regex_matcher) writeln("Stopped matching before consuming full input");
                break;//not a partial match for sure
            }
            clist = nlist;
            nlist = (ThreadList!DataIndex).init;
            if(!next())
            {
                if (!atEnd) return MatchResult.PartialMatch;
                break;
            }
            debug(std_regex_matcher) writeln("-- Ended iteration of main cycle\n");
        }
    }
    genCounter++; //increment also on each end
    debug(std_regex_matcher) writefln("-- Matching threads at end");
    //try out all zero-width possibilities
    for(Thread!DataIndex* t = clist.fetch(); t; t = clist.fetch())
    {
        evalFn!false(t, matches);
    }
    //RestartPc is not a real program counter: a thread started with it
    //would make eval index re.ir far out of bounds
    if(!matched && startPc != RestartPc)
        evalFn!false(createStart(index, startPc), matches);

    return (matched?MatchResult.Match:MatchResult.NoMatch);
}
|
|
|
|
//Take a thread off the free list. Its fields still hold whatever the
//previous user left there; the caller has to initialize them.
Thread!DataIndex* allocate()
{
    assert(freelist, "not enough preallocated memory");
    auto head = freelist;
    freelist = head.next;
    return head;
}
|
|
|
|
//Link memory into a free list of Threads.
//Takes the first threadSize*size bytes of memory, chains them into `size`
//threads headed by freelist, and advances memory past the bytes taken.
//When no bytes are requested the free list is simply left empty.
void prepareFreeList(size_t size, ref void[] memory)
{
    immutable total = threadSize*size;
    void[] mem = memory[0 .. total];
    memory = memory[total .. $];
    if(total == 0)
    {
        //nothing to link; touching mem[0] would reach outside the empty slice
        freelist = null;
        return;
    }
    freelist = cast(Thread!DataIndex*)&mem[0];
    size_t i;
    //each thread points to the one that starts threadSize bytes after it
    for(i = threadSize; i < total; i += threadSize)
        (cast(Thread!DataIndex*)&mem[i-threadSize]).next = cast(Thread!DataIndex*)&mem[i];
    //the last thread terminates the list
    (cast(Thread!DataIndex*)&mem[i-threadSize]).next = null;
}
|
|
|
|
//dispose a thread: push it onto the free list
void recycle(Thread!DataIndex* t)
{
    t.next = freelist;
    freelist = t;
}

//dispose list of threads: push each of them onto the free list and
//leave the list empty
void recycle(ref ThreadList!DataIndex list)
{
    for(auto cur = list.tip; cur !is null; )
    {
        auto following = cur.next;  //read before recycle() overwrites cur.next
        recycle(cur);
        cur = following;
    }
    list = list.init;
}
|
|
|
|
//Creates a new thread that carries the groups of master but runs from the
//given pc with the given loop counter; its uopCounter starts at 0.
Thread!DataIndex* fork(Thread!DataIndex* master, uint pc, uint counter)
{
    auto child = allocate();
    child.pc = pc;
    child.counter = counter;
    child.uopCounter = 0;
    child.matches.ptr[0..re.ngroup] = master.matches.ptr[0..re.ngroup];
    return child;
}
|
|
|
|
//Creates a start thread: all groups reset, group 0 beginning at index,
//both counters at zero and execution starting from pc.
Thread!DataIndex* createStart(DataIndex index, uint pc = 0)
{
    auto fresh = allocate();
    fresh.pc = pc;
    fresh.counter = 0;
    fresh.uopCounter = 0;
    fresh.matches.ptr[0..re.ngroup] = (Group!DataIndex).init;
    fresh.matches[0].begin = index; //after the reset above
    return fresh;
}
|
|
}
|