phobos/std/regex/internal/backtracking.d
Dmitry Olshansky 2b78074fc2 Split up the intertwined mess of std.regex.
The docs and API still stay in one file.
With time and refactoring more internals may be
exposed such as parser, each engine explicitly
and the sample generator (generator.d).

Also includes changes prompted by reviews/pulls:

Convert spaces-->tabs in makefiles.
Move things (again) to std/regex/internal.
Use new package(std.regex) feature.
Remove C-style arrays (some other pull against regex.d).
2014-09-13 13:45:46 +04:00

1406 lines
50 KiB
D

/*
Implementation of backtracking std.regex engine.
Contains both compile-time and run-time versions.
*/
module std.regex.internal.backtracking;
package(std.regex):
import std.regex.internal.ir;
import std.range, std.typecons, std.traits, core.stdc.stdlib;
/+
BacktrackingMatcher implements backtracking scheme of matching
regular expressions.
+/
template BacktrackingMatcher(bool CTregex)
{
@trusted struct BacktrackingMatcher(Char, Stream = Input!Char)
if(is(Char : dchar))
{
// Index type the stream uses to address positions in the input.
alias DataIndex = Stream.DataIndex;
// One saved backtracking point, as written to the state stack by pushState.
struct State
{//top bit in pc is set if saved along with matches
DataIndex index;
uint pc, counter, infiniteNesting;
}
// The stack is an array of size_t words, so State must fill a whole number of them.
static assert(State.sizeof % size_t.sizeof == 0);
// Size of State measured in stack words.
enum stateSize = State.sizeof / size_t.sizeof;
enum initialStack = 1<<11; // items in a block of segmented stack
alias const(Char)[] String;
alias RegEx = Regex!Char;
// Signature of a compile-time generated matching routine.
alias MatchFn = bool function (ref BacktrackingMatcher!(Char, Stream));
RegEx re; //regex program
static if(CTregex)
MatchFn nativeFn; //native code for that program
//Stream state
Stream s;
DataIndex index;
dchar front;
bool exhausted;
//backtracking machine state
uint pc, counter;
DataIndex lastState = 0; //top of state stack
// One slot per nesting level of infinite loops; holds the input index seen at that level.
DataIndex[] trackers;
static if(!CTregex)
uint infiniteNesting;
// Current stack segment; the word just before memory.ptr links to the previous segment.
size_t[] memory;
//local slice of matches, global for backref
Group!DataIndex[] matches, backrefed;
// `kicked` is true when the stream type offers a `search` member (used by search()).
static if(__traits(hasMember,Stream, "search"))
{
enum kicked = true;
}
else
enum kicked = false;
// Number of bytes a matcher for `re` needs up front:
// the trackers array followed by one stack segment.
static size_t initialMemory(const ref RegEx re)
{
    immutable trackerBytes = (re.ngroup + 1) * DataIndex.sizeof;
    immutable stackBytes = stackSize(re) * size_t.sizeof;
    return trackerBytes + stackBytes;
}
// Length, in size_t words, of one stack segment for `re`.
// The trailing +1 is the hidden link word placed in front of the usable area.
static size_t stackSize(const ref RegEx re)
{
    immutable groupWords = re.ngroup * (Group!DataIndex).sizeof / size_t.sizeof;
    immutable wordsPerItem = stateSize + groupWords;
    return initialStack * wordsPerItem + 1;
}
// True while the matcher sits at input index 0.
@property bool atStart()
{
    return 0 == index;
}
// True when the index equals the stream's lastIndex and the stream reports atEnd.
@property bool atEnd()
{
    immutable atLastIndex = (index == s.lastIndex);
    return atLastIndex && s.atEnd;
}
// Ask the stream for the next character, updating `front` and `index`.
// If the stream reports failure, park `index` at the stream's lastIndex.
void next()
{
    if(s.nextChar(front, index))
        return;
    index = s.lastIndex;
}
// Advance to the next candidate position.
// Streams with a `search` member are asked to jump using re.kickstart;
// all other streams simply step one character via next().
void search()
{
    static if(!kicked)
    {
        next();
    }
    else
    {
        if(s.search(re.kickstart, front, index))
            return;
        // the search reported failure: park at the stream's lastIndex
        index = s.lastIndex;
    }
}
//
// Switch to a freshly allocated stack segment.
// Word 0 of the new segment stores the old `memory.ptr` (the hidden back-link);
// `memory` then refers to the words after it.
// NOTE(review): the caller's `lastState` for the old segment is not recorded here.
void newStack()
{
    size_t[] segment = mallocArray!(size_t)(stackSize(re));
    segment[0] = cast(size_t)(memory.ptr);
    memory = segment[1 .. $];
}
// Carve the matcher's working storage out of a caller-supplied block:
// first the trackers array (re.ngroup+1 entries), then the bottom stack segment.
// Presumably arrayInChunk consumes the front of memBlock by reference - confirm in its definition.
void initExternalMemory(void[] memBlock)
{
    trackers = arrayInChunk!(DataIndex)(re.ngroup + 1, memBlock);
    auto words = cast(size_t[])memBlock;
    // hidden pointer: zero marks the bottom segment, which has no predecessor
    words[0] = 0;
    memory = words[1 .. $];
}
// Bind the matcher to a program and an input stream, and set up its
// working memory inside memBlock. Clears `exhausted` and `backrefed`.
void initialize(ref RegEx program, Stream stream, void[] memBlock)
{
    exhausted = false;
    backrefed = null;
    re = program;
    s = stream;
    // must run after `re` is set: the layout depends on re.ngroup
    initExternalMemory(memBlock);
}
// Return a copy of this matcher whose trackers and stack live in `memory`
// instead of the original's storage.
auto dupTo(void[] memory)
{
    auto clone = this;
    clone.initExternalMemory(memory);
    return clone;
}
// Construct a matcher already positioned at (ch, idx); no character is
// fetched from the stream.
this(ref RegEx program, Stream stream, void[] memBlock, dchar ch, DataIndex idx)
{
    this.initialize(program, stream, memBlock);
    this.index = idx;
    this.front = ch;
}
// Construct a matcher and load the first character from the stream.
this(ref RegEx program, Stream stream, void[] memBlock)
{
    this.initialize(program, stream, memBlock);
    this.next();
}
// Build a matcher of the same Char/Stream instantiation that runs matcher.re
// over this matcher's stream, starting at the current front/index and using
// memBlock as its working memory.
auto fwdMatcher(ref BacktrackingMatcher matcher, void[] memBlock)
{
    alias Family = .BacktrackingMatcher!(CTregex);
    alias SameDirection = Family!(Char, Stream);
    return SameDirection(matcher.re, s, memBlock, front, index);
}
// Build a matcher over s.loopBack(index) - the stream type returned by
// loopBack - that runs matcher.re using memBlock as its working memory.
auto bwdMatcher(ref BacktrackingMatcher matcher, void[] memBlock)
{
    alias Family = .BacktrackingMatcher!(CTregex);
    alias Reversed = Family!(Char, typeof(s.loopBack(index)));
    return Reversed(matcher.re, s.loopBack(index), memBlock);
}
//
// Try to match at the current position and, on success, record the overall
// match in matches[0] and update the iteration state.
// Returns: true if matchImpl() succeeded at this position, false otherwise.
bool matchFinalize()
{
    // use the stream's own index type, as match() and Group!DataIndex do
    DataIndex start = index;
    if(!matchImpl())
        return false;
    //stream is updated here
    matches[0].begin = start;
    matches[0].end = index;
    // a non-global regex, or one that consumed all input, yields no further matches
    if(!(re.flags & RegexOption.global) || atEnd)
        exhausted = true;
    if(start == index)//empty match advances input
        next();
    return true;
}
// Find the next match and fill `matches` with indices into the input.
// Returns false once every match has been reported (`exhausted`).
bool match(Group!DataIndex[] matches)
{
    debug(std_regex_matcher)
    {
        writeln("------------------------------------------");
    }
    if(exhausted) //all matches collected
        return false;
    this.matches = matches;
    if(re.flags & RegexInfo.oneShot)
    {
        // single attempt at the current position, no scanning
        immutable origin = index;
        exhausted = true;
        if(!matchImpl())
            return false;
        matches[0].begin = origin;
        matches[0].end = index;
        return true;
    }
    // Decide once how to advance after a failed attempt: use the stream's
    // search with the kickstart engine when both are available, otherwise
    // skip a char at a time.
    static if(kicked)
        immutable bool useKickstart = !re.kickstart.empty;
    else
        enum bool useKickstart = false;
    while(true)
    {
        if(matchFinalize())
            return true;
        if(atEnd)
            break;
        if(useKickstart)
            search();
        else
            next();
        if(atEnd)
        {
            // one last attempt at the very end of input
            exhausted = true;
            return matchFinalize();
        }
    }
    exhausted = true;
    return false;
}
/+
match subexpression against input,
results are stored in matches

When a compile-time generated routine is available (CTregex and nativeFn is
callable with this matcher) the work is delegated to it. Otherwise the
bytecode in re.ir is interpreted: each iteration dispatches on re.ir[pc].code;
a failing instruction jumps to L_backtrack, which restores the most recently
saved state via popState().

Returns: true when an End or lookaround-end instruction is reached; false when
no saved state is left, in which case the stream is reset to where it was on entry.
+/
bool matchImpl()
{
static if(CTregex && is(typeof(nativeFn(this))))
{
debug(std_regex_ctr) writeln("using C-T matcher");
return nativeFn(this);
}
else
{
// start every attempt from a clean machine state
pc = 0;
counter = 0;
lastState = 0;
infiniteNesting = -1;//intentional
// stream position on entry, restored if the attempt fails
auto start = s._index;
debug(std_regex_matcher)
writeln("Try match starting at ", s[index..s.lastIndex]);
for(;;)
{
debug(std_regex_matcher)
writefln("PC: %s\tCNT: %s\t%s \tfront: %s src: %s",
pc, counter, disassemble(re.ir, pc, re.dict),
front, s._index);
switch(re.ir[pc].code)
{
// a run of `sequence` alternative characters; any one of them may match
case IR.OrChar://assumes IRL!(OrChar) == 1
if(atEnd)
goto L_backtrack;
uint len = re.ir[pc].sequence;
uint end = pc + len;
// the first two alternatives are tested inline, the rest in the loop
if(re.ir[pc].data != front && re.ir[pc+1].data != front)
{
for(pc = pc+2; pc < end; pc++)
if(re.ir[pc].data == front)
break;
if(pc == end)
goto L_backtrack;
}
pc = end;
next();
break;
case IR.Char:
if(atEnd || front != re.ir[pc].data)
goto L_backtrack;
pc += IRL!(IR.Char);
next();
break;
// any character; '\r' and '\n' are rejected unless singleline is set
case IR.Any:
if(atEnd || (!(re.flags & RegexOption.singleline)
&& (front == '\r' || front == '\n')))
goto L_backtrack;
pc += IRL!(IR.Any);
next();
break;
case IR.CodepointSet:
if(atEnd || !re.charsets[re.ir[pc].data].scanFor(front))
goto L_backtrack;
next();
pc += IRL!(IR.CodepointSet);
break;
case IR.Trie:
if(atEnd || !re.tries[re.ir[pc].data][front])
goto L_backtrack;
next();
pc += IRL!(IR.Trie);
break;
// succeeds when exactly one of the previous/current characters is a word character
case IR.Wordboundary:
dchar back;
DataIndex bi;
//at start & end of input
if(atStart && wordTrie[front])
{
pc += IRL!(IR.Wordboundary);
break;
}
else if(atEnd && s.loopBack(index).nextChar(back, bi)
&& wordTrie[back])
{
pc += IRL!(IR.Wordboundary);
break;
}
else if(s.loopBack(index).nextChar(back, bi))
{
bool af = wordTrie[front];
bool ab = wordTrie[back];
if(af ^ ab)
{
pc += IRL!(IR.Wordboundary);
break;
}
}
goto L_backtrack;
// mirror image of Wordboundary: same tests, opposite outcome
case IR.Notwordboundary:
dchar back;
DataIndex bi;
//at start & end of input
if(atStart && wordTrie[front])
goto L_backtrack;
else if(atEnd && s.loopBack(index).nextChar(back, bi)
&& wordTrie[back])
goto L_backtrack;
else if(s.loopBack(index).nextChar(back, bi))
{
bool af = wordTrie[front];
bool ab = wordTrie[back];
if(af ^ ab)
goto L_backtrack;
}
pc += IRL!(IR.Wordboundary);
break;
// beginning of input, or (multiline only) just after an end-of-line character
case IR.Bol:
dchar back;
DataIndex bi;
if(atStart)
pc += IRL!(IR.Bol);
else if((re.flags & RegexOption.multiline)
&& s.loopBack(index).nextChar(back,bi)
&& endOfLine(back, front == '\n'))
{
pc += IRL!(IR.Bol);
}
else
goto L_backtrack;
break;
// end of input, or (multiline only) just before an end-of-line character
case IR.Eol:
dchar back;
DataIndex bi;
debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index..s.lastIndex]);
//no matching inside \r\n
if(atEnd || ((re.flags & RegexOption.multiline)
&& endOfLine(front, s.loopBack(index).nextChar(back,bi)
&& back == '\r')))
{
pc += IRL!(IR.Eol);
}
else
goto L_backtrack;
break;
// entry of an unbounded loop: record the entry index for the new nesting
// level, then jump to the matching End instruction to decide the order of
// "enter the body" vs "skip the loop"
case IR.InfiniteStart, IR.InfiniteQStart:
trackers[infiniteNesting+1] = index;
pc += re.ir[pc].data + IRL!(IR.InfiniteStart);
//now pc is at end IR.Infinite(Q)End
uint len = re.ir[pc].data;
int test;
if(re.ir[pc].code == IR.InfiniteEnd)
{
// InfiniteEnd: save the exit as the alternative, enter the body now
test = quickTestFwd(pc+IRL!(IR.InfiniteEnd), front, re);
if(test >= 0)
pushState(pc+IRL!(IR.InfiniteEnd), counter);
infiniteNesting++;
pc -= len;
}
else
{
// InfiniteQEnd: save the body as the alternative, leave the loop now
test = quickTestFwd(pc - len, front, re);
if(test >= 0)
{
// the saved state must carry the nesting level of the loop body
infiniteNesting++;
pushState(pc - len, counter);
infiniteNesting--;
}
pc += IRL!(IR.InfiniteEnd);
}
break;
// counted loops jump straight to their End instruction, which does the bookkeeping
case IR.RepeatStart, IR.RepeatQStart:
pc += re.ir[pc].data + IRL!(IR.RepeatStart);
break;
case IR.RepeatEnd:
case IR.RepeatQEnd:
//len, step, min, max
uint len = re.ir[pc].data;
uint step = re.ir[pc+2].raw;
uint min = re.ir[pc+3].raw;
uint max = re.ir[pc+4].raw;
if(counter < min)
{
// below the minimum: another iteration is mandatory
counter += step;
pc -= len;
}
else if(counter < max)
{
// between min and max: both continuing and leaving are possible
if(re.ir[pc].code == IR.RepeatEnd)
{
pushState(pc + IRL!(IR.RepeatEnd), counter%step);
counter += step;
pc -= len;
}
else
{
pushState(pc - len, counter + step);
counter = counter%step;
pc += IRL!(IR.RepeatEnd);
}
}
else
{
// maximum reached: leave the loop
counter = counter%step;
pc += IRL!(IR.RepeatEnd);
}
break;
case IR.InfiniteEnd:
case IR.InfiniteQEnd:
uint len = re.ir[pc].data;
debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting);
assert(infiniteNesting < trackers.length);
// an iteration that consumed no input ends the loop
if(trackers[infiniteNesting] == index)
{//source not consumed
pc += IRL!(IR.InfiniteEnd);
infiniteNesting--;
break;
}
else
trackers[infiniteNesting] = index;
int test;
if(re.ir[pc].code == IR.InfiniteEnd)
{
test = quickTestFwd(pc+IRL!(IR.InfiniteEnd), front, re);
if(test >= 0)
{
// the saved exit state belongs to the enclosing nesting level
infiniteNesting--;
pushState(pc + IRL!(IR.InfiniteEnd), counter);
infiniteNesting++;
}
pc -= len;
}
else
{
test = quickTestFwd(pc-len, front, re);
if(test >= 0)
pushState(pc-len, counter);
pc += IRL!(IR.InfiniteEnd);
infiniteNesting--;
}
break;
case IR.OrEnd:
pc += IRL!(IR.OrEnd);
break;
case IR.OrStart:
pc += IRL!(IR.OrStart);
goto case;
case IR.Option:
uint len = re.ir[pc].data;
if(re.ir[pc+len].code == IR.GotoEndOr)//not a last one
{
pushState(pc + len + IRL!(IR.Option), counter); //remember 2nd branch
}
pc += IRL!(IR.Option);
break;
case IR.GotoEndOr:
pc = pc + re.ir[pc].data + IRL!(IR.GotoEndOr);
break;
case IR.GroupStart:
uint n = re.ir[pc].data;
matches[n].begin = index;
debug(std_regex_matcher) writefln("IR group #%u starts at %u", n, index);
pc += IRL!(IR.GroupStart);
break;
case IR.GroupEnd:
uint n = re.ir[pc].data;
matches[n].end = index;
debug(std_regex_matcher) writefln("IR group #%u ends at %u", n, index);
pc += IRL!(IR.GroupEnd);
break;
// lookahead runs the enclosed bytecode in a temporary matcher with its own
// malloc'ed memory, then restores the input position whatever the outcome
case IR.LookaheadStart:
case IR.NeglookaheadStart:
uint len = re.ir[pc].data;
auto save = index;
uint ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw;
auto mem = malloc(initialMemory(re))[0..initialMemory(re)];
scope(exit) free(mem.ptr);
static if(Stream.isLoopback)
{
auto matcher = bwdMatcher(this, mem);
}
else
{
auto matcher = fwdMatcher(this, mem);
}
matcher.matches = matches[ms .. me];
matcher.backrefed = backrefed.empty ? matches : backrefed;
matcher.re.ir = re.ir[pc+IRL!(IR.LookaheadStart) .. pc+IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd)];
// XOR flips the result for the negative variant
bool match = matcher.matchImpl() ^ (re.ir[pc].code == IR.NeglookaheadStart);
s.reset(save);
next();
if(!match)
goto L_backtrack;
else
{
pc += IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd);
}
break;
// lookbehind runs the enclosed bytecode in a temporary matcher over the
// opposite direction of the stream
case IR.LookbehindStart:
case IR.NeglookbehindStart:
uint len = re.ir[pc].data;
uint ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw;
auto mem = malloc(initialMemory(re))[0..initialMemory(re)];
scope(exit) free(mem.ptr);
static if(Stream.isLoopback)
{
alias Matcher = BacktrackingMatcher!(Char, Stream);
auto matcher = Matcher(re, s, mem, front, index);
}
else
{
alias Matcher = BacktrackingMatcher!(Char, typeof(s.loopBack(index)));
auto matcher = Matcher(re, s.loopBack(index), mem);
}
matcher.matches = matches[ms .. me];
matcher.re.ir = re.ir[pc + IRL!(IR.LookbehindStart) .. pc + IRL!(IR.LookbehindStart) + len + IRL!(IR.LookbehindEnd)];
matcher.backrefed = backrefed.empty ? matches : backrefed;
bool match = matcher.matchImpl() ^ (re.ir[pc].code == IR.NeglookbehindStart);
if(!match)
goto L_backtrack;
else
{
pc += IRL!(IR.LookbehindStart)+len+IRL!(IR.LookbehindEnd);
}
break;
// the input must repeat the text captured by group n
case IR.Backref:
uint n = re.ir[pc].data;
auto referenced = re.ir[pc].localRef
? s[matches[n].begin .. matches[n].end]
: s[backrefed[n].begin .. backrefed[n].end];
while(!atEnd && !referenced.empty && front == referenced.front)
{
next();
referenced.popFront();
}
if(referenced.empty)
pc++;
else
goto L_backtrack;
break;
case IR.Nop:
pc += IRL!(IR.Nop);
break;
// reaching any end marker means the (sub)program matched
case IR.LookaheadEnd:
case IR.NeglookaheadEnd:
case IR.LookbehindEnd:
case IR.NeglookbehindEnd:
case IR.End:
return true;
default:
debug printBytecode(re.ir[0..$]);
assert(0);
// Only reached through `goto L_backtrack`: restore the latest saved state,
// or give up and put the stream back where this attempt started.
L_backtrack:
if(!popState())
{
s.reset(start);
return false;
}
}
}
}
assert(0);
}
// Number of free size_t words left in the current stack segment.
@property size_t stackAvail()
{
    immutable used = lastState;
    return memory.length - used;
}
// Drop the current stack segment and make the previous one current.
// Returns: false if the current segment is the bottom one (its hidden link
// is null; that segment is freed in RegexMatch), true otherwise.
bool prevStack()
{
    import core.stdc.stdlib;
    // The hidden link lives in the word just before `memory`. For segments
    // created by newStack() that word is also the start of the allocation,
    // because newStack() sets memory = chunk[1..$].
    size_t* base = memory.ptr - 1;
    size_t* prev = cast(size_t*)*base;//take out hidden pointer
    if(!prev)
        return false;
    // free() must receive the allocation base, not the interior pointer
    // memory.ptr (which is one word past it).
    free(base);//last segment is freed in RegexMatch
    immutable size = initialStack*(stateSize + 2*re.ngroup);
    memory = prev[0..size];
    // NOTE(review): this assumes the previous segment was filled to `size`;
    // newStack() does not record the old lastState, so verify against callers.
    lastState = size;
    return true;
}
// Push a single non-array value; it occupies its size rounded to the nearest
// whole number of stack words.
void stackPush(T)(T val)
    if(!isDynamicArray!T)
{
    enum words = (T.sizeof+size_t.sizeof/2)/size_t.sizeof;
    auto slot = cast(T*)&memory[lastState];
    *slot = val;
    lastState += words;
    debug(std_regex_matcher) writeln("push element SP= ", lastState);
}
// Push every element of `val`; T must be a whole number of stack words in size.
void stackPush(T)(T[] val)
{
    static assert(T.sizeof % size_t.sizeof == 0);
    enum wordsPerItem = T.sizeof/size_t.sizeof;
    auto dest = (cast(T*)&memory[lastState])[0 .. val.length];
    dest[] = val[];
    lastState += val.length*wordsPerItem;
    debug(std_regex_matcher) writeln("push array SP= ", lastState);
}
// Pop a single non-array value into `val`; the inverse of the scalar stackPush.
void stackPop(T)(ref T val)
    if(!isDynamicArray!T)
{
    enum words = (T.sizeof+size_t.sizeof/2)/size_t.sizeof;
    lastState -= words;
    auto slot = cast(T*)&memory[lastState];
    val = *slot;
    debug(std_regex_matcher) writeln("pop element SP= ", lastState);
}
// By-value overload so that slice expressions (rvalues) can be passed.
// Inside the function `val` is an lvalue, so the call below is intended to
// select the `ref T[]` overload rather than recurse into this one.
void stackPop(T)(T[] val)
{
stackPop(val); // call ref version
}
// Pop val.length elements from the stack into the storage `val` refers to.
void stackPop(T)(ref T[] val)
{
    immutable words = val.length*(T.sizeof/size_t.sizeof);
    lastState -= words;
    auto saved = (cast(T*)&memory[lastState])[0 .. val.length];
    val[] = saved[];
    debug(std_regex_matcher) writeln("pop array SP= ", lastState);
}
static if(!CTregex)
{
//helper function, saves engine state
// Layout written to the stack, in order: State (index, pc, counter,
// infiniteNesting), then all of `matches` as raw words, then `trackers`.
// A new stack segment is started when the current one cannot hold all of it.
void pushState(uint pc, uint counter)
{
    // Words about to be written: each match group occupies two words
    // (the writes below use 2*matches.length), so the capacity check must
    // count them that way too.
    immutable needed = stateSize + 2 * matches.length + trackers.length;
    if(needed > stackAvail)
    {
        newStack();
        lastState = 0;
    }
    *cast(State*)&memory[lastState] =
        State(index, pc, counter, infiniteNesting);
    lastState += stateSize;
    memory[lastState .. lastState + 2 * matches.length] = (cast(size_t[])matches)[];
    lastState += 2*matches.length;
    if(trackers.length)
    {
        memory[lastState .. lastState + trackers.length] = trackers[];
        lastState += trackers.length;
    }
    debug(std_regex_matcher)
        writefln("Saved(pc=%s) front: %s src: %s",
            pc, front, s[index..s.lastIndex]);
}
//helper function, restores engine state
// Undoes one pushState(): trackers, matches and State are read back in the
// reverse of the order they were written, then the stream is repositioned.
// If the current segment is empty the result of prevStack() is returned and
// nothing is popped in this call.
bool popState()
{
    if(lastState == 0)
        return prevStack();
    immutable nTrackers = trackers.length;
    if(nTrackers != 0)
    {
        lastState -= nTrackers;
        trackers[] = memory[lastState .. lastState + nTrackers];
    }
    immutable groupWords = 2 * matches.length;
    lastState -= groupWords;
    (cast(size_t[])matches)[] = memory[lastState .. lastState + groupWords];
    lastState -= stateSize;
    auto saved = cast(State*)&memory[lastState];
    infiniteNesting = saved.infiniteNesting;
    counter = saved.counter;
    pc = saved.pc;
    index = saved.index;
    debug(std_regex_matcher)
    {
        writefln("Restored matches", front, s[index .. s.lastIndex]);
        foreach(i, m; matches)
            writefln("Sub(%d) : %s..%s", i, m.begin, m.end);
    }
    // re-read `front` at the restored position
    s.reset(index);
    next();
    debug(std_regex_matcher)
        writefln("Backtracked (pc=%s) front: %s src: %s",
            pc, front, s[index..s.lastIndex]);
    return true;
}
}
}
}
// Minimal string formatter: each "$$" in `format` is replaced, left to right,
// with the next argument converted to string. A single '$' is left untouched.
// Hitting a "$$" after the arguments have run out is a programming error (assert(0)).
@trusted string ctSub( U...)(string format, U args)
{
    import std.conv;
    bool prevWasDollar = false;
    foreach(pos, c; format)
    {
        if(c != '$')
        {
            prevWasDollar = false;
            continue;
        }
        if(!prevWasDollar)
        {
            // first '$' of a potential pair
            prevWasDollar = true;
            continue;
        }
        // second '$' in a row: substitute, then let the recursion handle the rest
        static if(args.length > 0)
        {
            auto head = format[0 .. pos - 1];
            auto tail = format[pos + 1 .. $];
            return head ~ to!string(args[0]) ~ ctSub(tail, args[1 .. $]);
        }
        else
            assert(0);
    }
    return format;
}
// Shorthand for staticIota!(B, E); presumably a compile-time integer sequence
// from B to E - confirm against staticIota's definition (not in this file).
alias Sequence(int B, int E) = staticIota!(B, E);
struct CtContext
{
import std.conv;
//dirty flags
// counter: set once a counted loop has been seen, so `counter` must be saved/restored
// infNesting: set once an infinite loop has been seen, so trackers must be saved/restored
bool counter, infNesting;
// to make a unique advancement counter per nesting level of loops
int curInfLoop, nInfLoops;
//to mark the portion of matches to save
int match, total_matches;
// index of the first match slot that save/restore code touches
int reserved;
//state of codegenerator
struct CtState
{
string code; // D source generated so far
int addr; // next free `case` label in the generated switch
}
// Start a code-generation context for a whole regex with re.ngroup groups.
this(Char)(Regex!Char re)
{
    total_matches = re.ngroup;
    reserved = 1; //first match is skipped
    match = 1;
}
// Create a fresh context for a lookaround whose groups cover the range [s, e).
CtContext lookaround(uint s, uint e)
{
    CtContext inner;
    inner.match = 1;
    inner.total_matches = e - s;
    return inner;
}
//restore state having current context
// Emits the stackPop sequence that mirrors saveCode(): counter (or a reset
// to 0), then trackers if any infinite loop was seen, then the match slice.
string restoreCode()
{
    //stack is checked in L_backtrack
    string text = counter ? "\nstackPop(counter);" : "\ncounter = 0;";
    if(infNesting)
        text ~= ctSub("\nstackPop(trackers[0..$$]);\n", curInfLoop + 1);
    if(match >= total_matches)
        return text ~ ctSub("\nstackPop(matches[$$..$]);", reserved);
    // only part of the groups was saved: restore those and clear the rest
    text ~= ctSub("\nstackPop(matches[$$..$$]);", reserved, match);
    text ~= ctSub("\nmatches[$$..$] = typeof(matches[0]).init;", match);
    return text;
}
//save state having current context
// Emits code that makes room on the stack if needed and then pushes, in
// order: the match slice, trackers (if any infinite loop was seen), the
// counter expression (if any counted loop was seen), index and `pc`.
string saveCode(uint pc, string count_expr="counter")
{
    enum guard =
        "\nif(stackAvail < $$*(Group!(DataIndex)).sizeof/size_t.sizeof + trackers.length + $$)"
        ~ "\n{\nnewStack();\nlastState = 0;\n}";
    string text = ctSub(guard, match - reserved, cast(int)counter + 2);
    text ~= (match < total_matches)
        ? ctSub("\nstackPush(matches[$$..$$]);", reserved, match)
        : ctSub("\nstackPush(matches[$$..$]);", reserved);
    if(infNesting)
        text ~= ctSub("\nstackPush(trackers[0..$$]);\n", curInfLoop + 1);
    if(counter)
        text ~= ctSub("\nstackPush($$);", count_expr);
    text ~= ctSub("\nstackPush(index); stackPush($$); \n", pc);
    return text;
}
//
// Generate code for a straight-line run of bytecode starting at label `addr`.
// Each ctGenGroup call consumes the front of `ir` and reports the next free label.
CtState ctGenBlock(Bytecode[] ir, int addr)
{
    CtState acc;
    acc.addr = addr;
    while(ir.length != 0)
    {
        auto piece = ctGenGroup(ir, acc.addr);
        acc.code ~= piece.code;
        acc.addr = piece.addr;
    }
    return acc;
}
//
// Generate code for the construct at the front of `ir` (a loop, an
// alternation, a lookaround or a single atom), starting at label `addr`.
// `ir` is advanced past what was consumed; the returned state holds the
// generated source and the next free label.
CtState ctGenGroup(ref Bytecode[] ir, int addr)
{
import std.algorithm : max;
auto bailOut = "goto L_backtrack;";
auto nextInstr = ctSub("goto case $$;", addr+1);
CtState r;
assert(!ir.empty);
switch(ir[0].code)
{
case IR.InfiniteStart, IR.InfiniteQStart, IR.RepeatStart, IR.RepeatQStart:
bool infLoop =
ir[0].code == IR.InfiniteStart || ir[0].code == IR.InfiniteQStart;
// the dirty flags are sticky: once set, save/restore code includes trackers/counter
infNesting = infNesting || infLoop;
if(infLoop)
{
curInfLoop++;
nInfLoops = max(nInfLoops, curInfLoop+1);
}
counter = counter ||
ir[0].code == IR.RepeatStart || ir[0].code == IR.RepeatQStart;
uint len = ir[0].data;
// loop body, generated one label after the loop's start instruction
auto nir = ir[ir[0].length .. ir[0].length+len];
r = ctGenBlock(nir, addr+1);
if(infLoop)
curInfLoop--;
//start/end codegen
//r.addr is at last test+ jump of loop, addr+1 is body of loop
nir = ir[ir[0].length + len .. $];
r.code = ctGenFixupCode(ir[0..ir[0].length], addr, r.addr) ~ r.code;
r.code ~= ctGenFixupCode(nir, r.addr, addr+1);
r.addr += 2; //account end instruction + restore state
ir = nir;
break;
case IR.OrStart:
uint len = ir[0].data;
auto nir = ir[ir[0].length .. ir[0].length+len];
r = ctGenAlternation(nir, addr);
ir = ir[ir[0].length + len .. $];
assert(ir[0].code == IR.OrEnd);
ir = ir[ir[0].length..$];
break;
// a lookaround becomes a single "atom": a nested static function holding the
// code generated for its body, plus glue that runs it on a temporary matcher
case IR.LookaheadStart:
case IR.NeglookaheadStart:
case IR.LookbehindStart:
case IR.NeglookbehindStart:
uint len = ir[0].data;
bool behind = ir[0].code == IR.LookbehindStart || ir[0].code == IR.NeglookbehindStart;
bool negative = ir[0].code == IR.NeglookaheadStart || ir[0].code == IR.NeglookbehindStart;
string fwdType = "typeof(fwdMatcher(matcher, []))";
string bwdType = "typeof(bwdMatcher(matcher, []))";
string fwdCreate = "fwdMatcher(matcher, mem)";
string bwdCreate = "bwdMatcher(matcher, mem)";
uint start = IRL!(IR.LookbehindStart);
uint end = IRL!(IR.LookbehindStart)+len+IRL!(IR.LookaheadEnd);
CtContext context = lookaround(ir[1].raw, ir[2].raw); //split off new context
auto slice = ir[start .. end];
r.code ~= ctSub(`
case $$: //fake lookaround "atom"
static if(typeof(matcher.s).isLoopback)
alias Lookaround = $$;
else
alias Lookaround = $$;
static bool matcher_$$(ref Lookaround matcher) @trusted
{
//(neg)lookaround piece start
$$
//(neg)lookaround piece ends
}
auto save = index;
auto mem = malloc(initialMemory(re))[0..initialMemory(re)];
scope(exit) free(mem.ptr);
static if(typeof(matcher.s).isLoopback)
auto lookaround = $$;
else
auto lookaround = $$;
lookaround.matches = matches[$$..$$];
lookaround.backrefed = backrefed.empty ? matches : backrefed;
lookaround.nativeFn = &matcher_$$; //hookup closure's binary code
bool match = $$;
s.reset(save);
next();
if(match)
$$
else
$$`, addr,
behind ? fwdType : bwdType, behind ? bwdType : fwdType,
addr, context.ctGenRegEx(slice),
behind ? fwdCreate : bwdCreate, behind ? bwdCreate : fwdCreate,
ir[1].raw, ir[2].raw, //start - end of matches slice
addr,
negative ? "!lookaround.matchImpl()" : "lookaround.matchImpl()",
nextInstr, bailOut);
ir = ir[end .. $];
r.addr = addr + 1;
break;
// end markers emit no code and take no label; they are only skipped
case IR.LookaheadEnd: case IR.NeglookaheadEnd:
case IR.LookbehindEnd: case IR.NeglookbehindEnd:
ir = ir[IRL!(IR.LookaheadEnd) .. $];
r.addr = addr;
break;
default:
assert(ir[0].isAtom, text(ir[0].mnemonic));
r = ctGenAtom(ir, addr);
}
return r;
}
//generate source for bytecode contained in OrStart ... OrEnd
// Every branch except the last gets an Option fixup in front (two labels:
// the Option itself and its restore thunk) and one label reserved after it
// for the jump to the end of the alternation.
CtState ctGenAlternation(Bytecode[] ir, int addr)
{
    enum optL = IRL!(IR.Option);
    CtState[] branches;
    while(true)
    {
        assert(ir[0].code == IR.Option);
        auto len = ir[0].data;
        immutable bool isLast =
            !(optL+len < ir.length && ir[optL+len].code == IR.Option);
        if(isLast)
        {
            branches ~= ctGenBlock(ir[optL..$], addr);
            addr = branches[$-1].addr;
            break;
        }
        auto branchIr = ir[optL .. optL+len-IRL!(IR.GotoEndOr)];
        CtState branch = ctGenBlock(branchIr, addr+2);//space for Option + restore state
        //branch.addr+1 to account GotoEndOr at end of branch
        branch.code = ctGenFixupCode(ir[0 .. ir[0].length], addr, branch.addr+1) ~ branch.code;
        addr = branch.addr+1;//leave space for GotoEndOr
        branches ~= branch;
        ir = ir[optL + len .. $];
    }
    // stitch the branches together; each non-last branch ends with a jump
    // to the first label after the whole alternation
    CtState r = branches[0];
    foreach(i; 1 .. branches.length)
    {
        r.code ~= ctSub("\ncase $$:\ngoto case $$; ", branches[i-1].addr, addr);
        r.code ~= branches[i].code;
    }
    r.addr = addr;
    return r;
}
// generate fixup code for instruction in ir,
// fixup means it has an alternative way for control flow
// By-value overload so that slice expressions (rvalues) can be passed; the
// caller's slice is therefore not advanced. Inside the function `ir` is an
// lvalue, so the call below is intended to select the `ref` overload.
string ctGenFixupCode(Bytecode[] ir, int addr, int fixup)
{
return ctGenFixupCode(ir, addr, fixup); // call ref Bytecode[] version
}
// Generate the code for a control-flow instruction at the front of `ir`.
// `addr` is the label of the instruction itself, `fixup` the label of its
// alternative jump target. For instructions that save state, label addr+1 is
// used as the "restore" thunk reached through backtracking.
// `ir` is advanced past the instruction. Any other opcode is a programming error.
string ctGenFixupCode(ref Bytecode[] ir, int addr, int fixup)
{
string r;
string testCode;
// every instruction starts with its own case label
r = ctSub(`
case $$: debug(std_regex_matcher) writeln("#$$");`,
addr, addr);
switch(ir[0].code)
{
// loop entry: invalidate this level's tracker, then jump to `fixup`
case IR.InfiniteStart, IR.InfiniteQStart:
r ~= ctSub( `
trackers[$$] = DataIndex.max;
goto case $$;`, curInfLoop, fixup);
ir = ir[ir[0].length..$];
break;
// InfiniteEnd: save the loop exit (thunk at addr+1), then jump to `fixup`
case IR.InfiniteEnd:
testCode = ctQuickTest(ir[IRL!(IR.InfiniteEnd) .. $],addr + 1);
r ~= ctSub( `
if(trackers[$$] == index)
{//source not consumed
goto case $$;
}
trackers[$$] = index;
$$
{
$$
}
goto case $$;
case $$: //restore state and go out of loop
$$
goto case;`, curInfLoop, addr+2,
curInfLoop, testCode, saveCode(addr+1), 
fixup, addr+1, restoreCode());
ir = ir[ir[0].length..$];
break;
// InfiniteQEnd: save a re-entry into the loop (thunk at addr+1), continue at addr+2
case IR.InfiniteQEnd:
testCode = ctQuickTest(ir[IRL!(IR.InfiniteEnd) .. $],addr + 1);
auto altCode = testCode.length ? ctSub("else goto case $$;", fixup) : "";
r ~= ctSub( `
if(trackers[$$] == index)
{//source not consumed
goto case $$;
}
trackers[$$] = index;
$$
{
$$
goto case $$;
}
$$
case $$://restore state and go inside loop
$$
goto case $$;`, curInfLoop, addr+2,
curInfLoop, testCode, saveCode(addr+1),
addr+2, altCode, addr+1, restoreCode(), fixup);
ir = ir[ir[0].length..$];
break;
case IR.RepeatStart, IR.RepeatQStart:
r ~= ctSub( `
goto case $$;`, fixup);
ir = ir[ir[0].length..$];
break;
case IR.RepeatEnd, IR.RepeatQEnd:
//len, step, min, max
uint len = ir[0].data;
uint step = ir[2].raw;
uint min = ir[3].raw;
uint max = ir[4].raw;
// below the minimum: another iteration is mandatory
r ~= ctSub(`
if(counter < $$)
{
debug(std_regex_matcher) writeln("RepeatEnd min case pc=", $$);
counter += $$;
goto case $$;
}`, min, addr, step, fixup);
if(ir[0].code == IR.RepeatEnd)
{
// RepeatEnd: save state, then run the body again
string counter_expr = ctSub("counter % $$", step);
r ~= ctSub(`
else if(counter < $$)
{
$$
counter += $$;
goto case $$;
}`, max, saveCode(addr+1, counter_expr), step, fixup);
}
else
{
// RepeatQEnd: save state, then leave the loop
string counter_expr = ctSub("counter % $$", step);
r ~= ctSub(`
else if(counter < $$)
{
$$
counter = counter % $$;
goto case $$;
}`, max, saveCode(addr+1,counter_expr), step, addr+2);
}
r ~= ctSub(`
else
{
counter = counter % $$;
goto case $$;
}
case $$: //restore state
$$
goto case $$;`, step, addr+2, addr+1, restoreCode(),
ir[0].code == IR.RepeatEnd ? addr+2 : fixup );
ir = ir[ir[0].length..$];
break;
// alternation branch: save state so the next branch can be tried on failure
case IR.Option:
r ~= ctSub( `
{
$$
}
goto case $$;
case $$://restore thunk to go to the next group
$$
goto case $$;`, saveCode(addr+1), addr+2,
addr+1, restoreCode(), fixup);
ir = ir[ir[0].length..$];
break;
default:
assert(0, text(ir[0].mnemonic));
}
return r;
}
// Generate a one-atom lookahead test for the code that follows a loop.
// Group markers are skipped; the first other atom becomes the body of a
// local function `test_<id>` followed by an `if` header that calls it.
// Returns "" when no test can be generated (a Backref, a non-atom or the
// end of `ir` is reached first).
string ctQuickTest(Bytecode[] ir, int id)
{
    for(uint pos = 0; pos < ir.length && ir[pos].isAtom; pos++)
    {
        immutable op = ir[pos].code;
        if(op == IR.GroupStart || op == IR.GroupEnd)
            continue;
        if(op == IR.Backref)
            break;
        // addr == -1 asks ctAtomCode for the "quick test" flavour of the atom
        auto code = ctAtomCode(ir[pos..$], -1);
        return ctSub("\nint test_$$()\n{\n$$ //$$\n}\nif(test_$$() >= 0)",
            id, code.ptr ? code : "return 0;",
            ir[pos].mnemonic, id);
    }
    return "";
}
//process & generate source for simple bytecodes at front of ir using address addr
// `ir` is advanced past the atom; an OrChar run is consumed as a whole.
CtState ctGenAtom(ref Bytecode[] ir, int addr)
{
    CtState result;
    result.addr = addr + 1;
    result.code = ctAtomCode(ir, addr);
    immutable consumed = ir[0].code == IR.OrChar ? ir[0].sequence : ir[0].length;
    ir.popFrontN(consumed);
    return result;
}
//D code for atom at ir using address addr, addr < 0 means quickTest
// With addr >= 0 the code starts with `case addr:`, fails via
// `goto L_backtrack;` and succeeds via `goto case addr+1;`.
// With addr < 0 it has no label, fails via `return -1;`, succeeds via
// `return 0;` and the character-consuming atoms do not call next().
// Nop and End generate nothing beyond the label.
string ctAtomCode(Bytecode[] ir, int addr)
{
string code;
string bailOut, nextInstr;
if(addr < 0)
{
bailOut = "return -1;";
nextInstr = "return 0;";
}
else
{
bailOut = "goto L_backtrack;";
nextInstr = ctSub("goto case $$;", addr+1);
code ~= ctSub( `
case $$: debug(std_regex_matcher) writeln("#$$");
`, addr, addr);
}
switch(ir[0].code)
{
// one `if(front == c)` per alternative character, then the failure path
case IR.OrChar://assumes IRL!(OrChar) == 1
code ~= ctSub(`
if(atEnd)
$$`, bailOut);
uint len = ir[0].sequence;
for(uint i = 0; i < len; i++)
{
code ~= ctSub( `
if(front == $$)
{
$$
$$
}`, ir[i].data, addr >= 0 ? "next();" :"", nextInstr);
}
code ~= ctSub( `
$$`, bailOut);
break;
case IR.Char:
code ~= ctSub( `
if(atEnd || front != $$)
$$
$$
$$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr);
break;
case IR.Any:
code ~= ctSub( `
if(atEnd || (!(re.flags & RegexOption.singleline)
&& (front == '\r' || front == '\n')))
$$
$$
$$`, bailOut, addr >= 0 ? "next();" :"",nextInstr);
break;
case IR.CodepointSet:
code ~= ctSub( `
if(atEnd || !re.charsets[$$].scanFor(front))
$$
$$
$$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr);
break;
case IR.Trie:
code ~= ctSub( `
if(atEnd || !re.tries[$$][front])
$$
$$
$$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr);
break;
// the zero-width assertions below mirror the same cases in the run-time matchImpl
case IR.Wordboundary:
code ~= ctSub( `
dchar back;
DataIndex bi;
if(atStart && wordTrie[front])
{
$$
}
else if(atEnd && s.loopBack(index).nextChar(back, bi)
&& wordTrie[back])
{
$$
}
else if(s.loopBack(index).nextChar(back, bi))
{
bool af = wordTrie[front];
bool ab = wordTrie[back];
if(af ^ ab)
{
$$
}
}
$$`, nextInstr, nextInstr, nextInstr, bailOut);
break;
case IR.Notwordboundary:
code ~= ctSub( `
dchar back;
DataIndex bi;
//at start & end of input
if(atStart && wordTrie[front])
$$
else if(atEnd && s.loopBack(index).nextChar(back, bi)
&& wordTrie[back])
$$
else if(s.loopBack(index).nextChar(back, bi))
{
bool af = wordTrie[front];
bool ab = wordTrie[back];
if(af ^ ab)
$$
}
$$`, bailOut, bailOut, bailOut, nextInstr);
break;
case IR.Bol:
code ~= ctSub(`
dchar back;
DataIndex bi;
if(atStart || ((re.flags & RegexOption.multiline)
&& s.loopBack(index).nextChar(back,bi)
&& endOfLine(back, front == '\n')))
{
debug(std_regex_matcher) writeln("BOL matched");
$$
}
else
$$`, nextInstr, bailOut);
break;
case IR.Eol:
code ~= ctSub(`
dchar back;
DataIndex bi;
debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index..s.lastIndex]);
//no matching inside \r\n
if(atEnd || ((re.flags & RegexOption.multiline)
&& endOfLine(front, s.loopBack(index).nextChar(back,bi)
&& back == '\r')))
{
debug(std_regex_matcher) writeln("EOL matched");
$$
}
else
$$`, nextInstr, bailOut);
break;
case IR.GroupStart:
code ~= ctSub(`
matches[$$].begin = index;
$$`, ir[0].data, nextInstr);
// remember how many groups have been opened so far; saveCode/restoreCode
// use `match` to bound the slice of matches they push and pop
match = ir[0].data+1;
break;
case IR.GroupEnd:
code ~= ctSub(`
matches[$$].end = index;
$$`, ir[0].data, nextInstr);
break;
case IR.Backref:
// local references read from `matches`, others from `backrefed`
string mStr = "auto referenced = ";
mStr ~= ir[0].localRef
? ctSub("s[matches[$$].begin .. matches[$$].end];",
ir[0].data, ir[0].data)
: ctSub("s[backrefed[$$].begin .. backrefed[$$].end];",
ir[0].data, ir[0].data);
code ~= ctSub( `
$$
while(!atEnd && !referenced.empty && front == referenced.front)
{
next();
referenced.popFront();
}
if(referenced.empty)
$$
else
$$`, mStr, nextInstr, bailOut);
break;
case IR.Nop:
case IR.End:
break;
default:
assert(0, text(ir[0].mnemonic, " is not supported yet"));
}
return code;
}
//generate D code for the whole regex
// The result is the body of a matching routine operating on `matcher`:
// a prologue that resets the machine state, the L_backtrack handler, a
// `switch(pc)` containing the code generated for `ir`, a terminal case that
// breaks out of the switch, and an epilogue that returns true.
public string ctGenRegEx(Bytecode[] ir)
{
    // generate the switch body first; it also yields the terminal label
    auto bdy = ctGenBlock(ir, 0);
    enum prologue = `
import core.stdc.stdlib;
with(matcher)
{
pc = 0;
counter = 0;
lastState = 0;
auto start = s._index;`;
    enum dispatch = `
goto StartLoop;
debug(std_regex_matcher) writeln("Try CT matching starting at ",s[index..s.lastIndex]);
L_backtrack:
if(lastState || prevStack())
{
stackPop(pc);
stackPop(index);
s.reset(index);
next();
}
else
{
s.reset(start);
return false;
}
StartLoop:
switch(pc)
{
`;
    enum epilogue = `
default:
assert(0);
}
return true;
}
`;
    immutable finalCase = ctSub(`
case $$: break;`,bdy.addr);
    return prologue ~ dispatch ~ bdy.code ~ finalCase ~ epilogue;
}
}
// Generate the D source of a matching routine for the compiled regex `re`.
string ctGenRegExCode(Char)(Regex!Char re)
{
    return CtContext(re).ctGenRegEx(re.ir);
}