phobos/std/regex/internal/kickstart.d
Dmitry Olshansky 2b78074fc2 Split up the intertwined mess of std.regex.
The docs and API still stay in one file.
With time and refactoring more internals may be
exposed such as parser, each engine explicitly
and the sample generator (generator.d).

Also includes changes prompted by reviews/pulls:

Convert spaces-->tabs in makefiles.
Move things (again) to std/regex/internal.
Use new package(std.regex) feature.
Remove C-style arrays (some other pull against regex.d).
2014-09-13 13:45:46 +04:00

546 lines
19 KiB
D

/*
Kickstart is a coarse-grained "filter" engine that finds likely matches
to be verified by full-blown matcher.
*/
module std.regex.internal.kickstart;
package(std.regex):
import std.regex.internal.ir;
import std.algorithm, std.range, std.utf;
/*
    Utility for ShiftOr: the minimum number of bytes that have to be tested
    per code unit of type Char. A dchar needs only 3 because code points
    never exceed 0x10FFFF. Any other type is rejected at compile time.
*/
uint effectiveSize(Char)()
{
    static if(is(Char == dchar))
        enum uint bytesToTest = 3;
    else static if(is(Char == wchar))
        enum uint bytesToTest = 2;
    else static if(is(Char == char))
        enum uint bytesToTest = 1;
    else
        static assert(0);
    return bytesToTest;
}
/*
    Kickstart engine using the ShiftOr algorithm,
    a bit-parallel technique for inexact string searching.

    The constructor walks the regex bytecode and records, for up to 32
    leading byte positions of a match, which byte values may appear at each
    position. search() then scans the bytes of a haystack and reports the
    places where every recorded position is compatible. The result is only
    a likely match that still has to be verified by the full matcher.
*/
struct ShiftOr(Char)
{
private:
    // table[b] has bit k cleared when byte value b is acceptable
    // at byte position k of a candidate match
    uint[] table;
    // low byte of the first code unit of a leading literal char,
    // uint.max if the pattern does not start with one
    uint fChar;
    // number of leading byte positions covered by the table (at most 32)
    uint n_length;
    // bytes examined per code unit of Char
    enum charSize = effectiveSize!Char();
    //maximum number of chars in CodepointSet to process
    enum uint charsetThreshold = 32_000;

    // One path through the bytecode that is being followed
    // while the table is filled in.
    static struct ShiftThread
    {
        uint[] tab;  // table that receives the bits
        uint mask;   // bit of the current byte position, 0 once shifted out
        uint idx;    // current byte position
        uint pc, counter, hops; // bytecode index, repeat counter, infinite-loop ends seen

        this(uint newPc, uint newCounter, uint[] table)
        {
            pc = newPc;
            counter = newCounter;
            mask = 1;
            idx = 0;
            hops = 0;
            tab = table;
        }

        // sets the bits of mask in the entry for byte value idx
        void setMask(uint idx, uint mask)
        {
            tab[idx] |= mask;
        }

        // clears the bits of mask in the entry for byte value idx
        void setInvMask(uint idx, uint mask)
        {
            tab[idx] &= ~mask;
        }

        // Applies setBits to every examined byte of ch, one bit position
        // per byte, starting at the current position (low byte first).
        void set(alias setBits = setInvMask)(dchar ch)
        {
            static if(charSize == 3)
            {
                //a code point is taken apart directly, 3 bytes are enough
                uint val = ch, tmask = mask;
                setBits(val&0xFF, tmask);
                tmask <<= 1;
                val >>= 8;
                setBits(val&0xFF, tmask);
                tmask <<= 1;
                val >>= 8;
                assert(val <= 0x10);
                setBits(val, tmask);
            }
            else
            {
                Char[dchar.sizeof/Char.sizeof] buf;
                uint tmask = mask;
                size_t total = encode(buf, ch);
                for(size_t i = 0; i < total; i++, tmask<<=1)
                {
                    static if(charSize == 1)
                        setBits(buf[i], tmask);
                    else static if(charSize == 2)
                    {
                        //two bytes per code unit, low byte first
                        setBits(buf[i]&0xFF, tmask);
                        tmask <<= 1;
                        setBits(buf[i]>>8, tmask);
                    }
                }
            }
        }

        // marks ch as acceptable at the current position
        void add(dchar ch){ return set!setInvMask(ch); }

        // moves the current position forward by s bytes
        void advance(uint s)
        {
            mask <<= s;
            idx += s;
        }

        // true once the position bit has been shifted out of the 32-bit mask
        @property bool full(){ return !mask; }
    }

    // copy of t that continues at newPc with counter newCounter
    static ShiftThread fork(ShiftThread t, uint newPc, uint newCounter)
    {
        ShiftThread nt = t;
        nt.pc = newPc;
        nt.counter = newCounter;
        return nt;
    }

    // removes and returns the last thread of worklist
    @trusted static ShiftThread fetch(ref ShiftThread[] worklist)
    {
        auto t = worklist[$-1];
        worklist.length -= 1;
        if(!__ctfe)
            cast(void)worklist.assumeSafeAppend();
        return t;
    }

    // number of byte positions code point ch occupies for this Char type
    static uint charLen(uint ch)
    {
        assert(ch <= 0x10FFFF);
        return codeLength!Char(cast(dchar)ch)*charSize;
    }

public:
    /*
        Builds the filter for re.
        memory must hold exactly 256 entries; it is overwritten and kept
        as the lookup table of this engine.
    */
    @trusted this(ref Regex!Char re, uint[] memory)
    {
        import std.conv;
        debug(std_regex_search) import std.stdio;
        assert(memory.length == 256);
        fChar = uint.max;
        //locate first fixed char if any
    L_FindChar:
        for(size_t i = 0;;)
        {
            switch(re.ir[i].code)
            {
                case IR.Char:
                    fChar = re.ir[i].data;
                    static if(charSize != 3)
                    {
                        Char[dchar.sizeof/Char.sizeof] buf;
                        encode(buf, fChar);
                        fChar = buf[0];
                    }
                    fChar = fChar & 0xFF;
                    break L_FindChar;
                case IR.GroupStart, IR.GroupEnd:
                    i += IRL!(IR.GroupStart);
                    break;
                case IR.Bol, IR.Wordboundary, IR.Notwordboundary:
                    i += IRL!(IR.Bol);
                    break;
                default:
                    break L_FindChar;
            }
        }
        table = memory;
        table[] = uint.max;
        ShiftThread[] trs;
        ShiftThread t = ShiftThread(0, 0, table);
        n_length = 32;
        //outer loop: one iteration per thread taken from the worklist
        for(;;)
        {
        L_Eval_Thread:
            for(;;)
            {
                switch(re.ir[t.pc].code)
                {
                    case IR.Char:
                        uint s = charLen(re.ir[t.pc].data);
                        if(t.idx+s > n_length)
                            goto L_StopThread;
                        t.add(re.ir[t.pc].data);
                        t.advance(s);
                        t.pc += IRL!(IR.Char);
                        break;
                    case IR.OrChar://assumes IRL!(OrChar) == 1
                        uint len = re.ir[t.pc].sequence;
                        uint end = t.pc + len;
                        //distinct encoded lengths among the alternatives
                        uint[Bytecode.maxSequence] s;
                        uint numS;
                        for(uint i = 0; i < len; i++)
                        {
                            auto x = charLen(re.ir[t.pc+i].data);
                            if(countUntil(s[0..numS], x) < 0)
                                s[numS++] = x;
                        }
                        for(uint i = t.pc; i < end; i++)
                        {
                            t.add(re.ir[i].data);
                        }
                        //one continuation per distinct length that still fits
                        for(uint i = 0; i < numS; i++)
                        {
                            auto tx = fork(t, t.pc + len, t.counter);
                            if(tx.idx + s[i] <= n_length)
                            {
                                tx.advance(s[i]);
                                trs ~= tx;
                            }
                        }
                        if(!trs.empty)
                            t = fetch(trs);
                        else
                            goto L_StopThread;
                        break;
                    case IR.CodepointSet:
                    case IR.Trie:
                        auto set = re.charsets[re.ir[t.pc].data];
                        //encoded lengths that members of the set may have
                        uint[4] s;
                        uint numS;
                        static if(charSize == 3)
                        {
                            s[0] = charSize;
                            numS = 1;
                        }
                        else
                        {
                            //pairs of [first, last] code point per encoded length
                            static if(charSize == 1)
                                static immutable codeBounds = [0x0, 0x7F, 0x80, 0x7FF, 0x800, 0xFFFF, 0x10000, 0x10FFFF];
                            else //== 2
                                static immutable codeBounds = [0x0, 0xFFFF, 0x10000, 0x10FFFF];
                            uint[] arr = new uint[set.byInterval.length * 2];
                            size_t ofs = 0;
                            foreach(ival; set.byInterval)
                            {
                                arr[ofs++] = ival.a;
                                arr[ofs++] = ival.b;
                            }
                            auto srange = assumeSorted!"a <= b"(arr);
                            for(uint i = 0; i < codeBounds.length/2; i++)
                            {
                                auto start = srange.lowerBound(codeBounds[2*i]).length;
                                auto end = srange.lowerBound(codeBounds[2*i+1]).length;
                                if(end > start || (end == start && (end & 1)))
                                    s[numS++] = (i+1)*charSize;
                            }
                        }
                        if(numS == 0 || t.idx + s[numS-1] > n_length)
                            goto L_StopThread;
                        auto chars = set.length;
                        if(chars > charsetThreshold)
                            goto L_StopThread;
                        foreach(ch; set.byCodepoint)
                        {
                            //avoid surrogate pairs
                            if(0xD800 <= ch && ch <= 0xDFFF)
                                continue;
                            t.add(ch);
                        }
                        for(uint i = 0; i < numS; i++)
                        {
                            auto tx = fork(t, t.pc + IRL!(IR.CodepointSet), t.counter);
                            tx.advance(s[i]);
                            trs ~= tx;
                        }
                        if(!trs.empty)
                            t = fetch(trs);
                        else
                            goto L_StopThread;
                        break;
                    case IR.Any:
                        goto L_StopThread;
                    case IR.GotoEndOr:
                        t.pc += IRL!(IR.GotoEndOr)+re.ir[t.pc].data;
                        assert(re.ir[t.pc].code == IR.OrEnd);
                        goto case;
                    case IR.OrEnd:
                        t.pc += IRL!(IR.OrEnd);
                        break;
                    case IR.OrStart:
                        t.pc += IRL!(IR.OrStart);
                        goto case;
                    case IR.Option:
                        uint next = t.pc + re.ir[t.pc].data + IRL!(IR.Option);
                        //queue next Option
                        if(re.ir[next].code == IR.Option)
                        {
                            trs ~= fork(t, next, t.counter);
                        }
                        t.pc += IRL!(IR.Option);
                        break;
                    case IR.RepeatStart:case IR.RepeatQStart:
                        t.pc += IRL!(IR.RepeatStart)+re.ir[t.pc].data;
                        goto case IR.RepeatEnd;
                    case IR.RepeatEnd:
                    case IR.RepeatQEnd:
                        uint len = re.ir[t.pc].data;
                        uint step = re.ir[t.pc+2].raw;
                        uint minCount = re.ir[t.pc+3].raw;
                        if(t.counter < minCount)
                        {
                            //below the minimum: the body has to run again
                            t.counter += step;
                            t.pc -= len;
                            break;
                        }
                        uint maxCount = re.ir[t.pc+4].raw;
                        //below the maximum: one more round is an option too
                        if(t.counter < maxCount)
                            trs ~= fork(t, t.pc - len, t.counter + step);
                        t.counter = t.counter%step;
                        t.pc += IRL!(IR.RepeatEnd);
                        break;
                    case IR.InfiniteStart, IR.InfiniteQStart:
                        t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteStart);
                        goto case IR.InfiniteEnd; //both Q and non-Q
                    case IR.InfiniteEnd:
                    case IR.InfiniteQEnd:
                        uint len = re.ir[t.pc].data;
                        uint pc1, pc2; //branches to take in priority order
                        //give up on a thread that keeps going through loop ends
                        if(++t.hops == 32)
                            goto L_StopThread;
                        pc1 = t.pc + IRL!(IR.InfiniteEnd);
                        pc2 = t.pc - len;
                        trs ~= fork(t, pc2, t.counter);
                        t.pc = pc1;
                        break;
                    case IR.GroupStart, IR.GroupEnd:
                        t.pc += IRL!(IR.GroupStart);
                        break;
                    case IR.Bol, IR.Wordboundary, IR.Notwordboundary:
                        t.pc += IRL!(IR.Bol);
                        break;
                    case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
                        //step over the whole lookaround
                        t.pc += IRL!(IR.LookaheadStart) + IRL!(IR.LookaheadEnd) + re.ir[t.pc].data;
                        break;
                    default:
                    L_StopThread:
                        assert(re.ir[t.pc].code >= 0x80, text(re.ir[t.pc].code));
                        debug(std_regex_search) writeln("ShiftOr stumbled on ",re.ir[t.pc].mnemonic);
                        //the filter can't cover more than the shortest thread
                        n_length = min(t.idx, n_length);
                        break L_Eval_Thread;
                }
            }
            if(trs.empty)
                break;
            t = fetch(trs);
        }
        debug(std_regex_search)
        {
            writeln("Min length: ", n_length);
        }
    }

    // true if no byte position could be covered, i.e. the filter is useless
    @property bool empty() const { return n_length == 0; }

    // number of whole code units covered by the filter
    @property uint length() const{ return n_length/charSize; }

    // lookup compatible bit pattern in haystack, return starting index
    // has a useful trait: if supplied with valid UTF indexes,
    // returns only valid UTF indexes
    // (that given the haystack in question is valid UTF string)
    // idx is the code unit index to start scanning from;
    // haystack.length is returned if nothing compatible is found
    @trusted size_t search(const(Char)[] haystack, size_t idx) const
    {//@BUG: apparently assumes little endian machines
        import std.conv, core.stdc.string;
        debug(std_regex_search) import std.stdio;
        assert(!empty);
        assert(idx <= haystack.length);
        auto p = cast(const(ubyte)*)(haystack.ptr+idx);
        uint state = uint.max;
        //a cleared bit here means all n_length positions were compatible
        uint limit = 1u<<(n_length - 1u);
        debug(std_regex_search) writefln("Limit: %32b",limit);
        if(fChar != uint.max)
        {
            const(ubyte)* end = cast(ubyte*)(haystack.ptr + haystack.length);
            const originalAlign = cast(size_t)p & (Char.sizeof-1);
            while(p != end)
            {
                if(!~state)
                {//speed up seeking first matching place
                    for(;;)
                    {
                        assert(p <= end, text(p," vs ", end));
                        p = cast(ubyte*)memchr(p, fChar, end - p);
                        if(!p)
                            return haystack.length;
                        //only a hit at the start of a code unit counts
                        if((cast(size_t)p & (Char.sizeof-1)) == originalAlign)
                            break;
                        if(++p == end)
                            return haystack.length;
                    }
                    state = ~1u;
                    assert((cast(size_t)p & (Char.sizeof-1)) == originalAlign);
                    static if(charSize == 3)
                    {
                        state = (state<<1) | table[p[1]];
                        state = (state<<1) | table[p[2]];
                        p += 4;
                    }
                    else
                        p++;
                    //first char is tested, see if that's all
                    if(!(state & limit))
                        return (p-cast(ubyte*)haystack.ptr)/Char.sizeof
                            -length;
                }
                else
                {//have some bits/states for possible matches,
                 //use the usual shift-or cycle
                    static if(charSize == 3)
                    {
                        state = (state<<1) | table[p[0]];
                        state = (state<<1) | table[p[1]];
                        state = (state<<1) | table[p[2]];
                        p += 4;
                    }
                    else
                    {
                        state = (state<<1) | table[p[0]];
                        p++;
                    }
                    if(!(state & limit))
                        return (p-cast(ubyte*)haystack.ptr)/Char.sizeof
                            -length;
                }
                debug(std_regex_search) writefln("State: %32b", state);
            }
        }
        else
        {
            //normal path, partially unrolled for char/wchar
            static if(charSize == 3)
            {
                const(ubyte)* end = cast(ubyte*)(haystack.ptr + haystack.length);
                while(p != end)
                {
                    state = (state<<1) | table[p[0]];
                    state = (state<<1) | table[p[1]];
                    state = (state<<1) | table[p[2]];
                    p += 4;
                    if(!(state & limit))//division rounds down for dchar
                        return (p-cast(ubyte*)haystack.ptr)/Char.sizeof
                            -length;
                }
            }
            else
            {
                auto len = cast(ubyte*)(haystack.ptr + haystack.length) - p;
                size_t i = 0;
                //odd byte count: take one byte so the loop can go in pairs
                if(len & 1)
                {
                    state = (state<<1) | table[p[i++]];
                    if(!(state & limit))
                        return idx+i/Char.sizeof-length;
                }
                while(i < len)
                {
                    state = (state<<1) | table[p[i++]];
                    if(!(state & limit))
                        return idx+i/Char.sizeof
                            -length;
                    state = (state<<1) | table[p[i++]];
                    if(!(state & limit))
                        return idx+i/Char.sizeof
                            -length;
                    debug(std_regex_search) writefln("State: %32b", state);
                }
            }
        }
        return haystack.length;
    }

    // prints table in binary, four entries per line (debug builds only);
    // reads the entries in groups of four
    @system debug static void dump(uint[] table)
    {//@@@BUG@@@ writef(ln) is @system
        import std.stdio;
        for(size_t i = 0; i < table.length; i += 4)
        {
            writefln("%32b %32b %32b %32b",table[i], table[i+1], table[i+2], table[i+3]);
        }
    }
}
// Checks the prefix length computed by a kickstart engine and the
// positions it reports, for every character width.
unittest
{
    import std.conv, std.regex;

    // patterns whose filter covers a fixed, known number of code units
    @trusted void test_fixed(alias Kick)()
    {
        foreach(v; TypeTuple!(char, wchar, dchar))
        {
            alias Char = v;
            alias String = immutable(v)[];
            auto r = regex(to!String(`abc$`));
            auto kick = Kick!Char(r, new uint[256]);
            assert(kick.length == 3, text(Kick.stringof," ",v.stringof, " == ", kick.length));
            auto r2 = regex(to!String(`(abc){2}a+`));
            kick = Kick!Char(r2, new uint[256]);
            assert(kick.length == 7, text(Kick.stringof,v.stringof," == ", kick.length));
            auto r3 = regex(to!String(`\b(a{2}b{3}){2,4}`));
            kick = Kick!Char(r3, new uint[256]);
            assert(kick.length == 10, text(Kick.stringof,v.stringof," == ", kick.length));
            auto r4 = regex(to!String(`\ba{2}c\bxyz`));
            kick = Kick!Char(r4, new uint[256]);
            assert(kick.length == 6, text(Kick.stringof,v.stringof, " == ", kick.length));
            auto r5 = regex(to!String(`\ba{2}c\b`));
            kick = Kick!Char(r5, new uint[256]);
            size_t x = kick.search("aabaacaa", 0);
            //report the search result, it is the value under test here
            assert(x == 3, text(Kick.stringof," ",v.stringof," search == ", x));
            x = kick.search("aabaacaa", x+1);
            assert(x == 8, text(Kick.stringof," ",v.stringof," search == ", x));
        }
    }

    // patterns with sets, alternations and loops
    @trusted void test_flex(alias Kick)()
    {
        foreach(v; TypeTuple!(char, wchar, dchar))
        {
            alias Char = v;
            alias String = immutable(v)[];
            auto r = regex(to!String(`abc[a-z]`));
            auto kick = Kick!Char(r, new uint[256]);
            auto x = kick.search(to!String("abbabca"), 0);
            assert(x == 3, text("real x is ", x, " ",v.stringof));
            auto r2 = regex(to!String(`(ax|bd|cdy)`));
            String s2 = to!String("abdcdyabax");
            kick = Kick!Char(r2, new uint[256]);
            x = kick.search(s2, 0);
            assert(x == 1, text("real x is ", x));
            x = kick.search(s2, x+1);
            assert(x == 3, text("real x is ", x));
            x = kick.search(s2, x+1);
            assert(x == 8, text("real x is ", x));
            auto rdot = regex(to!String(`...`));
            kick = Kick!Char(rdot, new uint[256]);
            assert(kick.length == 0, text("real length is ", kick.length, " ", v.stringof));
            auto rN = regex(to!String(`a(b+|c+)x`));
            kick = Kick!Char(rN, new uint[256]);
            assert(kick.length == 3, text("real length is ", kick.length, " ", v.stringof));
            x = kick.search("ababx",0);
            assert(x == 2, text("real x is ", x, " ", v.stringof));
            x = kick.search("abaacba",0);
            assert(x == 3, text("real x is ", x, " ", v.stringof));//expected inexact
        }
    }

    test_fixed!(ShiftOr)();
    test_flex!(ShiftOr)();
}
// Kickstart is another name for the ShiftOr engine template defined above.
alias Kickstart = ShiftOr;