phobos/std/regex/internal/generator.d
Dmitry Olshansky 2b78074fc2 Split up the intertwined mess of std.regex.
The docs and API still stay in one file.
With time and refactoring more internals may be
exposed such as parser, each engine explicitly
and the sample generator (generator.d).

Also includes changes prompted by reviews/pulls:

Convert spaces-->tabs in makefiles.
Move things (again) to std/regex/internal.
Use new package(std.regex) feature.
Remove C-style arrays (some other pull against regex.d).
2014-09-13 13:45:46 +04:00

185 lines
6.2 KiB
D

/*
Generators - components that generate strings for a given regex pattern.
Currently undocumented and subject to change.
*/
module std.regex.internal.generator;
/*
Useful utility for self-testing, an infinite range of string samples
that _have_ to match given compiled regex.
Caveats: supports only a simple subset of bytecode.
*/
@trusted private struct SampleGenerator(Char)
{
    import std.regex.internal.ir;
    import std.array, std.format, std.utf, std.random;

    // Compiled pattern whose bytecode (re.ir) is walked by compose().
    Regex!Char re;
    // Buffer holding the current sample. Its element type follows Char so
    // that the assignment in the constructor and the Char[] returned by
    // front() type-check for every character width, not only for char.
    Appender!(Char[]) app;
    // limit: soft cap on sample length, consulted only when deciding whether
    //        to take another loop iteration.
    // seed:  the seed this generator was constructed with.
    uint limit, seed;
    // Source of all random decisions; seeded from `seed` in the constructor.
    Xorshift gen;

    //generator for pattern r, with soft maximum of threshold elements
    //and a given random seed
    this(ref Regex!Char r, uint threshold, uint randomSeed)
    {
        re = r;
        limit = threshold;
        seed = randomSeed;
        // Actually apply the seed: previously it was stored but `gen` was
        // left default-initialised, so every seed gave the same samples.
        gen = Xorshift(randomSeed);
        app = appender!(Char[])();
        compose();
    }

    // Returns a pseudo-random value in [0, x) and advances the engine.
    // x must be non-zero (the modulo below would otherwise divide by zero).
    uint rand(uint x)
    {
        assert(x != 0, "SampleGenerator.rand: empty range");
        uint r = gen.front % x;
        gen.popFront();
        return r;
    }

    // Appends one sample to `app` by walking the bytecode from pc == 0.
    // Generation stops (returns) at the first instruction that has no
    // dedicated case below; see the final case group of the switch.
    void compose()
    {
        // pc:         index of the current instruction in re.ir
        // counter:    single shared counter used by the Repeat* cases
        // dataLenOld: sample length seen at the previous InfiniteEnd visit
        uint pc = 0, counter = 0, dataLenOld = uint.max;
        for (;;)
        {
            switch (re.ir[pc].code)
            {
            case IR.Char:
                formattedWrite(app, "%s", cast(dchar) re.ir[pc].data);
                pc += IRL!(IR.Char);
                break;
            case IR.OrChar:
                // Pick one instruction of the OrChar run at random and emit
                // its character, then skip the whole run.
                uint len = re.ir[pc].sequence;
                formattedWrite(app, "%s", cast(dchar) re.ir[pc + rand(len)].data);
                pc += len;
                break;
            case IR.CodepointSet:
            case IR.Trie:
                // Choose a random interval of the set, then a random offset
                // inside [a, b) of that interval.
                auto set = re.charsets[re.ir[pc].data];
                auto x = rand(cast(uint) set.byInterval.length);
                auto y = rand(set.byInterval[x].b - set.byInterval[x].a);
                formattedWrite(app, "%s", cast(dchar)(set.byInterval[x].a + y));
                pc += IRL!(IR.CodepointSet);
                break;
            case IR.Any:
                // Retry until the candidate is a valid dchar other than
                // '\r' or '\n'.
                // NOTE(review): 0x11_000 == 0x11000, so only code points
                // below U+11000 are ever sampled; if the full Unicode range
                // was intended this should be 0x11_0000 - confirm.
                uint x;
                do
                {
                    x = rand(0x11_000);
                }
                while (x == '\r' || x == '\n' || !isValidDchar(x));
                formattedWrite(app, "%s", cast(dchar) x);
                pc += IRL!(IR.Any);
                break;
            case IR.GotoEndOr:
                // End of the chosen alternative: jump to the matching OrEnd
                // and continue right behind it.
                pc += IRL!(IR.GotoEndOr) + re.ir[pc].data;
                assert(re.ir[pc].code == IR.OrEnd);
                goto case;
            case IR.OrEnd:
                pc += IRL!(IR.OrEnd);
                break;
            case IR.OrStart:
                pc += IRL!(IR.OrStart);
                goto case;
            case IR.Option:
                // Count the chained Option instructions starting here ...
                uint next = pc + re.ir[pc].data + IRL!(IR.Option);
                uint nOpt = 0;
                //queue next Option
                while (re.ir[next].code == IR.Option)
                {
                    nOpt++;
                    next += re.ir[next].data + IRL!(IR.Option);
                }
                nOpt++;
                // ... pick one of them at random and hop over the ones
                // in front of it.
                nOpt = rand(nOpt);
                for (; nOpt; nOpt--)
                {
                    pc += re.ir[pc].data + IRL!(IR.Option);
                }
                assert(re.ir[pc].code == IR.Option);
                pc += IRL!(IR.Option);
                break;
            case IR.RepeatStart:
            case IR.RepeatQStart:
                // Jump straight to the matching end instruction, which
                // decides whether the body gets executed.
                pc += IRL!(IR.RepeatStart) + re.ir[pc].data;
                goto case IR.RepeatEnd;
            case IR.RepeatEnd:
            case IR.RepeatQEnd:
                // The words following the end instruction are read as
                // step, min and max of the counted loop.
                uint len = re.ir[pc].data;
                uint step = re.ir[pc + 2].raw;
                uint min = re.ir[pc + 3].raw;
                if (counter < min)
                {
                    // Mandatory iteration: jump back to the loop body.
                    counter += step;
                    pc -= len;
                    break;
                }
                uint max = re.ir[pc + 4].raw;
                if (counter < max)
                {
                    // Optional iteration: taken with probability 2/3 while
                    // the sample is still shorter than the soft limit.
                    if (app.data.length < limit && rand(3) > 0)
                    {
                        pc -= len;
                        counter += step;
                    }
                    else
                    {
                        counter = counter % step;
                        pc += IRL!(IR.RepeatEnd);
                    }
                }
                else
                {
                    counter = counter % step;
                    pc += IRL!(IR.RepeatEnd);
                }
                break;
            case IR.InfiniteStart, IR.InfiniteQStart:
                pc += re.ir[pc].data + IRL!(IR.InfiniteStart);
                goto case IR.InfiniteEnd; //both Q and non-Q
            case IR.InfiniteEnd:
            case IR.InfiniteQEnd:
                uint len = re.ir[pc].data;
                // Nothing was appended since the previous visit: leave the
                // loop instead of spinning on an empty body.
                if (app.data.length == dataLenOld)
                {
                    pc += IRL!(IR.InfiniteEnd);
                    break;
                }
                dataLenOld = cast(uint) app.data.length;
                // Iterate again with probability 2/3 while below the limit.
                if (app.data.length < limit && rand(3) > 0)
                    pc = pc - len;
                else
                    pc = pc + IRL!(IR.InfiniteEnd);
                break;
            case IR.GroupStart, IR.GroupEnd:
                // Group markers produce no output.
                pc += IRL!(IR.GroupStart);
                break;
            // Assertions, lookarounds and every other instruction without a
            // case above end generation of the current sample.
            case IR.Bol, IR.Wordboundary, IR.Notwordboundary:
            case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
            default:
                return;
            }
        }
    }

    // Current sample. The returned slice refers to the internal buffer,
    // which popFront() shrinks and refills - copy it to keep it.
    @property Char[] front()
    {
        return app.data;
    }

    // Compile-time constant so that std.range recognises the range as
    // infinite (isInfinite); `gen.empty` still evaluates to false.
    enum bool empty = false;

    // Discards the current sample and composes the next one.
    void popFront()
    {
        app.shrinkTo(0);
        compose();
    }
}
// Self-test: every generated sample must match the pattern it was built from.
unittest
{
    import std.range, std.regex;
    auto pattern = regex(`P[a-z]{3,}q`);
    auto samples = SampleGenerator!char(pattern, 20, 3141592);
    static assert(isInputRange!(typeof(samples)));
    //@@@BUG@@@ somehow samples.take(1_000) doesn't work
    foreach (sample; take(samples, 1_000))
    {
        assert(sample.match(pattern));
    }
}