mirror of
https://github.com/dlang/phobos.git
synced 2025-04-28 22:21:09 +03:00

The docs and API still stay in one file. With time and refactoring, more internals may be exposed, such as the parser, each engine explicitly, and the sample generator (generator.d). Also includes changes prompted by reviews/pulls: Convert spaces-->tabs in makefiles. Move things (again) to std/regex/internal. Use new package(std.regex) feature. Remove C-style arrays (some other pull against regex.d).
185 lines
6.2 KiB
D
185 lines
6.2 KiB
D
/*
    Generators - components that generate strings for a given regex pattern.

    Undocumented for the moment, and subject to change.
*/
|
|
module std.regex.internal.generator;
|
|
|
|
/*
    Useful utility for self-testing: an infinite range of string samples
    that _have_ to match the given compiled regex.

    Caveats: supports only a simple subset of bytecode. Generation of a
    sample stops silently at the first instruction that is not handled
    (anchors, word boundaries, lookaround, end of program, ...).
*/
@trusted private struct SampleGenerator(Char)
{
    import std.regex.internal.ir;
    import std.array, std.format, std.utf, std.random;

    Regex!Char re;          // compiled pattern the samples are generated for
    Appender!(Char[]) app;  // buffer holding the current sample (code units of Char)
    uint limit, seed;       // soft length limit for repetitions; seed given to the constructor
    Xorshift gen;           // source of every random choice made by compose()

    //generator for pattern r, with soft maximum of threshold elements
    //and a given random seed
    this(ref Regex!Char r, uint threshold, uint randomSeed)
    {
        re = r;
        limit = threshold;
        seed = randomSeed;
        // Actually seed the engine: without this the seed argument has no
        // effect and every generator yields the same sequence.
        gen.seed(randomSeed);
        app = appender!(Char[])();
        compose();
    }

    // Returns a pseudo-random number in [0, x) and advances the engine.
    // x must be non-zero.
    uint rand(uint x)
    {
        assert(x != 0, "rand: upper bound must be non-zero");
        uint r = gen.front % x;
        gen.popFront();
        return r;
    }

    // Walks the bytecode of `re` from the start, appending one randomly
    // chosen matching sample to `app`. Returns on the first instruction
    // that has no explicit case below.
    void compose()
    {
        // pc         - index of the current instruction in re.ir
        // counter    - shared counter for counted repetitions
        // dataLenOld - sample length seen at the previous InfiniteEnd,
        //              used to leave loops whose body appended nothing
        uint pc = 0, counter = 0, dataLenOld = uint.max;
        for(;;)
        {
            switch(re.ir[pc].code)
            {
            case IR.Char:
                formattedWrite(app,"%s", cast(dchar)re.ir[pc].data);
                pc += IRL!(IR.Char);
                break;
            case IR.OrChar:
                // pick one of the `len` alternative characters at random
                uint len = re.ir[pc].sequence;
                formattedWrite(app, "%s", cast(dchar)re.ir[pc + rand(len)].data);
                pc += len;
                break;
            case IR.CodepointSet:
            case IR.Trie:
                // pick a random interval of the set, then a random offset
                // inside that interval
                // NOTE(review): nothing here filters out surrogate code
                // points if the chosen interval spans them - confirm.
                auto set = re.charsets[re.ir[pc].data];
                auto x = rand(cast(uint)set.byInterval.length);
                auto y = rand(set.byInterval[x].b - set.byInterval[x].a);
                formattedWrite(app, "%s", cast(dchar)(set.byInterval[x].a+y));
                pc += IRL!(IR.CodepointSet);
                break;
            case IR.Any:
                // retry until we get a valid code point that is not a
                // line terminator
                // NOTE(review): the bound is 0x11_000 (69632), not
                // 0x110_000; kept as is - confirm whether the narrower
                // range is intended.
                uint x;
                do
                {
                    x = rand(0x11_000);
                }while(x == '\r' || x == '\n' || !isValidDchar(x));
                formattedWrite(app, "%s", cast(dchar)x);
                pc += IRL!(IR.Any);
                break;
            case IR.GotoEndOr:
                // end of the chosen alternative: jump to the closing OrEnd
                pc += IRL!(IR.GotoEndOr)+re.ir[pc].data;
                assert(re.ir[pc].code == IR.OrEnd);
                goto case;
            case IR.OrEnd:
                pc += IRL!(IR.OrEnd);
                break;
            case IR.OrStart:
                pc += IRL!(IR.OrStart);
                goto case;
            case IR.Option:
                uint next = pc + re.ir[pc].data + IRL!(IR.Option);
                uint nOpt = 0;
                //queue next Option
                while(re.ir[next].code == IR.Option)
                {
                    nOpt++;
                    next += re.ir[next].data + IRL!(IR.Option);
                }
                nOpt++;
                // nOpt is now the number of alternatives; choose one and
                // skip over the ones before it
                nOpt = rand(nOpt);
                for(;nOpt; nOpt--)
                {
                    pc += re.ir[pc].data + IRL!(IR.Option);
                }
                assert(re.ir[pc].code == IR.Option);
                pc += IRL!(IR.Option);
                break;
            case IR.RepeatStart:case IR.RepeatQStart:
                // jump to the matching end instruction, which decides
                // whether the body is entered
                pc += IRL!(IR.RepeatStart)+re.ir[pc].data;
                goto case IR.RepeatEnd;
            case IR.RepeatEnd:
            case IR.RepeatQEnd:
                uint len = re.ir[pc].data;
                // step, min and max are read from the words following
                // the end instruction
                uint step = re.ir[pc+2].raw;
                uint min = re.ir[pc+3].raw;
                if(counter < min)
                {
                    // mandatory iteration
                    counter += step;
                    pc -= len;
                    break;
                }
                uint max = re.ir[pc+4].raw;
                if(counter < max)
                {
                    // optional iteration: taken 2 times out of 3 while the
                    // sample is still shorter than the soft limit
                    if(app.data.length < limit && rand(3) > 0)
                    {
                        pc -= len;
                        counter += step;
                    }
                    else
                    {
                        counter = counter%step;
                        pc += IRL!(IR.RepeatEnd);
                    }
                }
                else
                {
                    counter = counter%step;
                    pc += IRL!(IR.RepeatEnd);
                }
                break;
            case IR.InfiniteStart, IR.InfiniteQStart:
                pc += re.ir[pc].data + IRL!(IR.InfiniteStart);
                goto case IR.InfiniteEnd; //both Q and non-Q
            case IR.InfiniteEnd:
            case IR.InfiniteQEnd:
                uint len = re.ir[pc].data;
                // sample length unchanged since the last InfiniteEnd:
                // leave the loop instead of spinning
                if(app.data.length == dataLenOld)
                {
                    pc += IRL!(IR.InfiniteEnd);
                    break;
                }
                dataLenOld = cast(uint)app.data.length;
                // iterate again 2 times out of 3 while below the soft limit
                if(app.data.length < limit && rand(3) > 0)
                    pc = pc - len;
                else
                    pc = pc + IRL!(IR.InfiniteEnd);
                break;
            case IR.GroupStart, IR.GroupEnd:
                // groups contribute no characters
                pc += IRL!(IR.GroupStart);
                break;
            case IR.Bol, IR.Wordboundary, IR.Notwordboundary:
            case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
            default:
                // unsupported instruction or end of program: stop here
                return;
            }
        }
    }

    // Current sample. The slice refers to the internal buffer, which is
    // reset by popFront - copy it if it has to outlive the next popFront.
    @property Char[] front()
    {
        return app.data;
    }

    // Always false: the generator never runs out of samples.
    @property empty(){ return false; }

    // Discards the current sample and composes the next one.
    void popFront()
    {
        app.shrinkTo(0);
        compose();
    }
}
|
|
|
|
unittest
{
    import std.range, std.regex;

    // Self-check: each of the first sampleCount generated strings has to
    // match the pattern it was generated from.
    enum sampleCount = 1_000;
    auto pattern = regex(`P[a-z]{3,}q`);
    auto samples = SampleGenerator!char(pattern, 20, 3141592);
    static assert(isInputRange!(typeof(samples)));
    //@@@BUG@@@ somehow samples.take(sampleCount) doesn't work
    foreach(sample; take(samples, sampleCount))
    {
        assert(sample.match(pattern));
    }
}
|