Merge pull request #5722 from DmitryOlshansky/regex-matcher-interfaces

std.regex: major internal redesign, also fixes issue 13532
merged-on-behalf-of: Andrei Alexandrescu <andralex@users.noreply.github.com>
This commit is contained in:
The Dlang Bot 2017-10-16 20:16:33 +02:00 committed by GitHub
commit ad489989ec
8 changed files with 1140 additions and 976 deletions

File diff suppressed because it is too large Load diff

View file

@ -200,7 +200,7 @@ bool isAtomIR(IR i)
// Returns the IR code obtained by flipping the two lowest bits of `i`.
// Only valid for codes accepted by isStartIR or isEndIR (asserted below);
// presumably the result is the matching end/start code of the pair -
// the IR encoding itself is defined outside this view, TODO confirm.
IR pairedIR(IR i)
{
assert(isStartIR(i) || isEndIR(i));
return cast(IR)(i ^ 0b11);
return cast(IR) (i ^ 0b11);
}
//encoded IR instruction
@ -423,6 +423,134 @@ struct Group(DataIndex)
writeln("\t", disassemble(slice, pc, dict));
}
// Encapsulates memory management, explicit ref counting
// and the exact type of engine created
// there is a single instance per engine combination type x Char
// In future may also maintain a (TLS?) cache of memory
//
// All members are @safe and const: a factory is used through a const
// reference and hands out / tracks Matcher!Char objects.
interface MatcherFactory(Char)
{
@safe:
// Produce a new matcher for the given pattern, set up to scan `input`.
Matcher!Char create(const Regex!Char, in Char[] input) const;
// Produce a new matcher based on `m`, set up to scan `input`.
Matcher!Char dup(Matcher!Char m, in Char[] input) const;
// Add a reference to `m`; returns a size_t
// (the updated count in GenericFactory).
size_t incRef(Matcher!Char m) const;
// Drop a reference to `m`; returns a size_t
// (the updated count in GenericFactory, which also releases `m` at zero).
size_t decRef(Matcher!Char m) const;
}
// Only memory management, no compile-time vs run-time specialities.
//
// Every engine lives in a single malloc'ed chunk laid out as
//   [ engine object : classSize bytes ][ engine arena : initialMemory bytes ]
// The object part is registered with the GC (it may hold GC references),
// the chunk is released by decRef once the reference count drops to zero.
abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char
{
    import core.stdc.stdlib : malloc, free;
    import core.memory : GC;

    // Size reserved for the engine object. Rounded up to a multiple of
    // size_t.sizeof so that the arena that follows the object in the same
    // chunk starts at a pointer-aligned offset.
    enum classSize = (__traits(classInstanceSize, EngineType!Char) + size_t.sizeof - 1)
        & ~(size_t.sizeof - 1);

    // Emplaces an engine into the first classSize bytes of `memory`;
    // implemented by the concrete factories.
    Matcher!Char construct(const Regex!Char re, in Char[] input, void[] memory) const;

    // Allocates a chunk and constructs a fresh engine for `re` over `input`.
    // The returned engine starts with a reference count of 1.
    // Throws if malloc fails or if construction throws; in the latter case
    // the chunk is unregistered from the GC and freed.
    override Matcher!Char create(const Regex!Char re, in Char[] input) const @trusted
    {
        immutable size = EngineType!Char.initialMemory(re) + classSize;
        auto memory = enforce(malloc(size), "malloc failed")[0 .. size];
        scope(failure) free(memory.ptr);
        GC.addRange(memory.ptr, classSize);
        // scope guards run in reverse order: on failure the range is removed
        // first and only then the memory is freed, so the GC never keeps
        // a range that points into released memory
        scope(failure) GC.removeRange(memory.ptr);
        auto engine = construct(re, input, memory);
        assert(engine.refCount == 1);
        assert(cast(void*) engine == memory.ptr);
        return engine;
    }

    // Allocates a chunk, constructs a new engine for the pattern of `engine`
    // and lets `engine` copy its state into it (dupTo).
    // The returned copy starts with a reference count of 1.
    // Same failure handling as create.
    override Matcher!Char dup(Matcher!Char engine, in Char[] input) const @trusted
    {
        immutable size = EngineType!Char.initialMemory(engine.pattern) + classSize;
        auto memory = enforce(malloc(size), "malloc failed")[0 .. size];
        scope(failure) free(memory.ptr);
        // register before constructing, same order as in create
        GC.addRange(memory.ptr, classSize);
        scope(failure) GC.removeRange(memory.ptr);
        auto copy = construct(engine.pattern, input, memory);
        engine.dupTo(copy, memory[classSize .. size]);
        assert(copy.refCount == 1);
        return copy;
    }

    // Adds one reference to `m`, returns the new count.
    override size_t incRef(Matcher!Char m) const
    {
        return ++m.refCount;
    }

    // Drops one reference to `m`, returns the new count.
    // When the count reaches zero the chunk holding `m` is unregistered
    // from the GC and freed; `m` must not be used afterwards.
    override size_t decRef(Matcher!Char m) const @trusted
    {
        assert(m.refCount != 0);
        auto cnt = --m.refCount;
        if (cnt == 0)
        {
            // the engine object sits at the very start of its chunk
            void* ptr = cast(void*) m;
            GC.removeRange(ptr);
            free(ptr);
        }
        return cnt;
    }
}
// Factory for engines that are driven by the regex program at run time
class RuntimeFactory(alias EngineType, Char) : GenericFactory!(EngineType, Char)
{
    // Places the engine object at the head of `memory` and hands the
    // remainder of the chunk to the engine constructor.
    override EngineType!Char construct(const Regex!Char re, in Char[] input, void[] memory) const
    {
        import std.conv : emplace;
        void[] objectChunk = memory[0 .. classSize];
        void[] arena = memory[classSize .. $];
        return emplace!(EngineType!Char)(objectChunk, re, Input!Char(input), arena);
    }
}
// Factory for the compile-time generated engine: same as the run-time
// factory, except that the address of `func` is passed to the engine as well
class CtfeFactory(alias EngineType, Char, alias func) : GenericFactory!(EngineType, Char)
{
    // Places the engine object at the head of `memory` and hands the
    // remainder of the chunk to the engine constructor.
    override EngineType!Char construct(const Regex!Char re, in Char[] input, void[] memory) const
    {
        import std.conv : emplace;
        void[] objectChunk = memory[0 .. classSize];
        void[] arena = memory[classSize .. $];
        return emplace!(EngineType!Char)(objectChunk, re, &func, Input!Char(input), arena);
    }
}
// A workaround for R-T enum re = regex(...)
// Picks a factory for a pattern that carries none: backtracking engine if
// any bit of `backrefed` is set, Thompson engine otherwise. Each factory is
// created on first use and kept in a function-local static.
template defaultFactory(Char)
{
    @property MatcherFactory!Char defaultFactory(const Regex!Char re)
    {
        import std.regex.internal.backtracking : BacktrackingMatcher;
        import std.regex.internal.thompson : ThompsonMatcher;
        import std.algorithm.searching : canFind;
        static MatcherFactory!Char cachedBacktracking;
        static MatcherFactory!Char cachedThompson;

        immutable usesBackrefs = re.backrefed.canFind!"a != 0";
        if (!usesBackrefs)
        {
            if (cachedThompson is null)
                cachedThompson = new RuntimeFactory!(ThompsonMatcher, Char);
            return cachedThompson;
        }
        if (cachedBacktracking is null)
            cachedBacktracking = new RuntimeFactory!(BacktrackingMatcher, Char);
        return cachedBacktracking;
    }
}
// Defining it as an interface has the undesired side-effect:
// casting any class to an interface silently adjusts pointer to point to a nested vtbl
//
// Common base class of the matching engines. Everything after the
// `abstract:` label has to be provided by the concrete engine.
abstract class Matcher(Char)
{
abstract:
// Get a (next) match
// `matches` is the group storage passed in by the caller; the meaning of
// the int result is defined by the engines and is not visible here.
int match(Group!size_t[] matches);
// This only maintains internal ref-count,
// deallocation happens inside MatcherFactory
// Returned by ref so that the factory can increment/decrement it in place.
@property ref size_t refCount() @safe;
// Copy internal state to another engine, using memory arena 'memory'
void dupTo(Matcher!Char m, void[] memory);
// The pattern loaded
@property ref const(Regex!Char) pattern() @safe;
}
/++
$(D Regex) object holds regular expression pattern in compiled form.
Instances of this object are constructed via calls to $(D regex).
@ -443,11 +571,11 @@ struct Regex(Char)
static struct NamedGroupRange
{
private:
NamedGroup[] groups;
const(NamedGroup)[] groups;
size_t start;
size_t end;
public:
this(NamedGroup[] g, size_t s, size_t e)
this(const(NamedGroup)[] g, size_t s, size_t e)
{
assert(s <= e);
assert(e <= g.length);
@ -485,7 +613,7 @@ struct Regex(Char)
package(std.regex):
import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency
NamedGroup[] dict; // maps name -> user group number
const(NamedGroup)[] dict; // maps name -> user group number
uint ngroup; // number of internal groups
uint maxCounterDepth; // max depth of nested {n,m} repetitions
uint hotspotTableSize; // number of entries in merge table
@ -495,6 +623,35 @@ package(std.regex):
public const(BitTable)[] filters; // bloom filters for conditional loops
uint[] backrefed; // bit array of backreferenced submatches
Kickstart!Char kickstart;
MatcherFactory!Char factory; // produces optimal matcher for this pattern
// Returns a copy of this regex that uses `factory` to produce its matchers;
// the const qualifier is cast away only on the local copy.
const(Regex) withFactory(MatcherFactory!Char factory) pure const @trusted
{
    auto copy = cast() this;
    copy.factory = factory;
    return copy;
}
// Returns a copy of this regex with `flags` replaced by `newFlags`;
// the const qualifier is cast away only on the local copy.
const(Regex) withFlags(uint newFlags) pure const @trusted
{
    auto copy = cast() this;
    copy.flags = newFlags;
    return copy;
}
// Returns a copy of this regex whose program (`ir`) is a duplicate of `code`;
// the const qualifier is cast away only on the local copy.
const(Regex) withCode(const(Bytecode)[] code) pure const @trusted
{
    auto copy = cast() this;
    copy.ir = code.dup; // TODO: sidestep const instead?
    return copy;
}
// Returns a copy of this regex with `ngroup` replaced by `nGroup`;
// the const qualifier is cast away only on the local copy.
const(Regex) withNGroup(uint nGroup) pure const @trusted
{
    auto copy = cast() this;
    copy.ngroup = nGroup;
    return copy;
}
//bit access helper
uint isBackref(uint n)
@ -537,26 +694,6 @@ package(std.regex):
}
//@@@BUG@@@ (unreduced) - public makes it inaccessible in std.regex.package (!)
// Pairs a Regex!Char with a pointer to a natively compiled match function.
// `alias _regex this` lets a StaticRegex be used wherever a Regex is expected.
/*public*/ struct StaticRegex(Char)
{
package(std.regex):
import std.regex.internal.backtracking : BacktrackingMatcher;
alias Matcher = BacktrackingMatcher!(true);
// signature of the generated matching routine
alias MatchFn = bool function(ref Matcher!Char) @trusted;
MatchFn nativeFn;
public:
Regex!Char _regex;
alias _regex this;
// Stores the pattern `re` together with its match function `fn`.
this(Regex!Char re, MatchFn fn)
{
_regex = re;
nativeFn = fn;
}
}
// The stuff below this point is temporarily part of IR module
// but may need better place in the future (all internals)
package(std.regex):
@ -593,7 +730,7 @@ if (is(Char :dchar))
@property bool atEnd(){
return _index == _origin.length;
}
bool search(Kickstart)(ref Kickstart kick, ref dchar res, ref size_t pos)
bool search(Kickstart)(ref const Kickstart kick, ref dchar res, ref size_t pos)
{
size_t idx = kick.search(_origin, _index);
_index = idx;
@ -676,7 +813,7 @@ template BackLooper(E)
}
//
@trusted uint lookupNamedGroup(String)(NamedGroup[] dict, String name)
@trusted uint lookupNamedGroup(String)(const(NamedGroup)[] dict, String name)
{//equal is @system?
import std.algorithm.comparison : equal;
import std.algorithm.iteration : map;

View file

@ -393,7 +393,7 @@ public:
// has a useful trait: if supplied with valid UTF indexes,
// returns only valid UTF indexes
// (that given the haystack in question is valid UTF string)
@trusted size_t search(const(Char)[] haystack, size_t idx)
@trusted size_t search(const(Char)[] haystack, size_t idx) const
{//@BUG: apparently assumes little endian machines
import core.stdc.string : memchr;
import std.conv : text;

View file

@ -12,7 +12,11 @@ static import std.ascii;
// package relevant info from parser into a regex object
auto makeRegex(S, CG)(Parser!(S, CG) p)
{
Regex!(BasicElementOf!S) re;
import std.regex.internal.backtracking : BacktrackingMatcher;
import std.regex.internal.thompson : ThompsonMatcher;
import std.algorithm.searching : canFind;
alias Char = BasicElementOf!S;
Regex!Char re;
auto g = p.g;
with(re)
{
@ -25,6 +29,12 @@ auto makeRegex(S, CG)(Parser!(S, CG) p)
matchers = g.matchers;
backrefed = g.backrefed;
re.postprocess();
// check if we have backreferences, if so - use backtracking
if (__ctfe) factory = null; // allows us to use the awful enum re = regex(...);
else if (re.backrefed.canFind!"a != 0")
factory = new RuntimeFactory!(BacktrackingMatcher, Char);
else
factory = new RuntimeFactory!(ThompsonMatcher, Char);
debug(std_regex_parser)
{
__ctfe || print();

View file

@ -518,11 +518,11 @@ alias Sequence(int B, int E) = staticIota!(B, E);
{
import std.algorithm.comparison : equal;
auto rtr = regex("a|b|c");
enum ctr = regex("a|b|c");
static ctr = regex("a|b|c");
assert(equal(rtr.ir,ctr.ir));
//CTFE parser BUG is triggered by group
//in the middle of alternation (at least not first and not last)
enum testCT = regex(`abc|(edf)|xyz`);
static testCT = regex(`abc|(edf)|xyz`);
auto testRT = regex(`abc|(edf)|xyz`);
assert(equal(testCT.ir,testRT.ir));
}
@ -996,6 +996,36 @@ alias Sequence(int B, int E) = staticIota!(B, E);
assertThrown(regex(`^((x)(?=\1))`));
}
// bugzilla 13532
version(none) // TODO: revisit once we have proper benchmark framework
@safe unittest
{
    // Compares the matching speed of a ctRegex stored in an enum with the
    // same ctRegex stored in a static immutable: the enum variant must not
    // be dramatically slower than the static one.
    import std.datetime.stopwatch : StopWatch, AutoStart;
    import std.math : abs;
    import std.conv : to;
    enum re1 = ctRegex!`[0-9][0-9]`;
    immutable static re2 = ctRegex!`[0-9][0-9]`;
    immutable iterations = 1_000_000;
    size_t result1 = 0, result2 = 0;
    auto sw = StopWatch(AutoStart.yes);
    // first loop: the enum regex
    foreach (_; 0 .. iterations)
    {
        result1 += matchFirst("12345678", re1).length;
    }
    const enumTime = sw.peek();
    sw.reset();
    // second loop: the static regex
    foreach (_; 0 .. iterations)
    {
        result2 += matchFirst("12345678", re2).length;
    }
    const staticTime = sw.peek();
    assert(result1 == result2);
    auto ratio = 1.0 * enumTime.total!"usecs" / staticTime.total!"usecs";
    // enum is faster or it is less than 75% slower than static
    assert(ratio < 1.0 || abs(ratio - 1.0) < 0.75,
        "enum regex to static regex ratio "~to!string(ratio));
}
// bugzilla 14504
@safe unittest
{

View file

@ -89,7 +89,7 @@ struct ThreadList(DataIndex)
template ThompsonOps(E, S, bool withInput:true)
{
@trusted:
static bool op(IR code:IR.End)(E* e, S* state)
static bool op(IR code:IR.End)(E e, S* state)
{
with(e) with(state)
{
@ -105,7 +105,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Wordboundary)(E* e, S* state)
static bool op(IR code:IR.Wordboundary)(E e, S* state)
{
with(e) with(state)
{
@ -137,7 +137,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Notwordboundary)(E* e, S* state)
static bool op(IR code:IR.Notwordboundary)(E e, S* state)
{
with(e) with(state)
{
@ -167,7 +167,7 @@ template ThompsonOps(E, S, bool withInput:true)
return true;
}
static bool op(IR code:IR.Bof)(E* e, S* state)
static bool op(IR code:IR.Bof)(E e, S* state)
{
with(e) with(state)
{
@ -183,7 +183,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Bol)(E* e, S* state)
static bool op(IR code:IR.Bol)(E e, S* state)
{
with(e) with(state)
{
@ -203,7 +203,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Eof)(E* e, S* state)
static bool op(IR code:IR.Eof)(E e, S* state)
{
with(e) with(state)
{
@ -219,7 +219,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Eol)(E* e, S* state)
static bool op(IR code:IR.Eol)(E e, S* state)
{
with(e) with(state)
{
@ -240,42 +240,42 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.InfiniteStart)(E* e, S* state)
static bool op(IR code:IR.InfiniteStart)(E e, S* state)
{
with(e) with(state)
t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteStart);
return op!(IR.InfiniteEnd)(e,state);
}
static bool op(IR code:IR.InfiniteBloomStart)(E* e, S* state)
static bool op(IR code:IR.InfiniteBloomStart)(E e, S* state)
{
with(e) with(state)
t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteBloomStart);
return op!(IR.InfiniteBloomEnd)(e,state);
}
static bool op(IR code:IR.InfiniteQStart)(E* e, S* state)
static bool op(IR code:IR.InfiniteQStart)(E e, S* state)
{
with(e) with(state)
t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteQStart);
return op!(IR.InfiniteQEnd)(e,state);
}
static bool op(IR code:IR.RepeatStart)(E* e, S* state)
static bool op(IR code:IR.RepeatStart)(E e, S* state)
{
with(e) with(state)
t.pc += re.ir[t.pc].data + IRL!(IR.RepeatStart);
return op!(IR.RepeatEnd)(e,state);
}
static bool op(IR code:IR.RepeatQStart)(E* e, S* state)
static bool op(IR code:IR.RepeatQStart)(E e, S* state)
{
with(e) with(state)
t.pc += re.ir[t.pc].data + IRL!(IR.RepeatQStart);
return op!(IR.RepeatQEnd)(e,state);
}
static bool op(IR code)(E* e, S* state)
static bool op(IR code)(E e, S* state)
if (code == IR.RepeatEnd || code == IR.RepeatQEnd)
{
with(e) with(state)
@ -330,7 +330,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code)(E* e, S* state)
static bool op(IR code)(E e, S* state)
if (code == IR.InfiniteEnd || code == IR.InfiniteQEnd)
{
with(e) with(state)
@ -365,7 +365,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code)(E* e, S* state)
static bool op(IR code)(E e, S* state)
if (code == IR.InfiniteBloomEnd)
{
with(e) with(state)
@ -394,7 +394,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.OrEnd)(E* e, S* state)
static bool op(IR code:IR.OrEnd)(E e, S* state)
{
with(e) with(state)
{
@ -415,7 +415,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.OrStart)(E* e, S* state)
static bool op(IR code:IR.OrStart)(E e, S* state)
{
with(e) with(state)
{
@ -424,7 +424,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Option)(E* e, S* state)
static bool op(IR code:IR.Option)(E e, S* state)
{
with(e) with(state)
{
@ -439,7 +439,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.GotoEndOr)(E* e, S* state)
static bool op(IR code:IR.GotoEndOr)(E e, S* state)
{
with(e) with(state)
{
@ -448,7 +448,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.GroupStart)(E* e, S* state)
static bool op(IR code:IR.GroupStart)(E e, S* state)
{
with(e) with(state)
{
@ -458,7 +458,7 @@ template ThompsonOps(E, S, bool withInput:true)
return true;
}
}
static bool op(IR code:IR.GroupEnd)(E* e, S* state)
static bool op(IR code:IR.GroupEnd)(E e, S* state)
{
with(e) with(state)
{
@ -469,7 +469,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Backref)(E* e, S* state)
static bool op(IR code:IR.Backref)(E e, S* state)
{
with(e) with(state)
{
@ -506,7 +506,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
static bool op(IR code)(E* e, S* state)
static bool op(IR code)(E e, S* state)
if (code == IR.LookbehindStart || code == IR.NeglookbehindStart)
{
with(e) with(state)
@ -516,10 +516,9 @@ template ThompsonOps(E, S, bool withInput:true)
uint end = t.pc + len + IRL!(IR.LookbehindEnd) + IRL!(IR.LookbehindStart);
bool positive = re.ir[t.pc].code == IR.LookbehindStart;
static if (Stream.isLoopback)
auto matcher = fwdMatcher(t.pc, end, subCounters.get(t.pc, 0));
auto matcher = fwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0));
else
auto matcher = bwdMatcher(t.pc, end, subCounters.get(t.pc, 0));
matcher.re.ngroup = me - ms;
auto matcher = bwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0));
matcher.backrefed = backrefed.empty ? t.matches : backrefed;
//backMatch
auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookbehindStart));
@ -534,7 +533,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code)(E* e, S* state)
static bool op(IR code)(E e, S* state)
if (code == IR.LookaheadStart || code == IR.NeglookaheadStart)
{
with(e) with(state)
@ -545,10 +544,9 @@ template ThompsonOps(E, S, bool withInput:true)
uint end = t.pc+len+IRL!(IR.LookaheadEnd)+IRL!(IR.LookaheadStart);
bool positive = re.ir[t.pc].code == IR.LookaheadStart;
static if (Stream.isLoopback)
auto matcher = bwdMatcher(t.pc, end, subCounters.get(t.pc, 0));
auto matcher = bwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0));
else
auto matcher = fwdMatcher(t.pc, end, subCounters.get(t.pc, 0));
matcher.re.ngroup = me - ms;
auto matcher = fwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0));
matcher.backrefed = backrefed.empty ? t.matches : backrefed;
auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookaheadStart));
freelist = matcher.freelist;
@ -564,7 +562,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code)(E* e, S* state)
static bool op(IR code)(E e, S* state)
if (code == IR.LookaheadEnd || code == IR.NeglookaheadEnd ||
code == IR.LookbehindEnd || code == IR.NeglookbehindEnd)
{
@ -579,13 +577,13 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Nop)(E* e, S* state)
static bool op(IR code:IR.Nop)(E e, S* state)
{
with(state) t.pc += IRL!(IR.Nop);
return true;
}
static bool op(IR code:IR.OrChar)(E* e, S* state)
static bool op(IR code:IR.OrChar)(E e, S* state)
{
with(e) with(state)
{
@ -607,7 +605,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Char)(E* e, S* state)
static bool op(IR code:IR.Char)(E e, S* state)
{
with(e) with(state)
{
@ -623,7 +621,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Any)(E* e, S* state)
static bool op(IR code:IR.Any)(E e, S* state)
{
with(e) with(state)
{
@ -634,7 +632,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.CodepointSet)(E* e, S* state)
static bool op(IR code:IR.CodepointSet)(E e, S* state)
{
with(e) with(state)
{
@ -652,7 +650,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Trie)(E* e, S* state)
static bool op(IR code:IR.Trie)(E e, S* state)
{
with(e) with(state)
{
@ -676,7 +674,7 @@ template ThompsonOps(E,S, bool withInput:false)
{
@trusted:
// can't match these without input
static bool op(IR code)(E* e, S* state)
static bool op(IR code)(E e, S* state)
if (code == IR.Char || code == IR.OrChar || code == IR.CodepointSet
|| code == IR.Trie || code == IR.Char || code == IR.Any)
{
@ -684,7 +682,7 @@ template ThompsonOps(E,S, bool withInput:false)
}
// special case of zero-width backref
static bool op(IR code:IR.Backref)(E* e, S* state)
static bool op(IR code:IR.Backref)(E e, S* state)
{
with(e) with(state)
{
@ -702,7 +700,7 @@ template ThompsonOps(E,S, bool withInput:false)
}
// forward all control flow to normal versions
static bool op(IR code)(E* e, S* state)
static bool op(IR code)(E e, S* state)
if (code != IR.Char && code != IR.OrChar && code != IR.CodepointSet
&& code != IR.Trie && code != IR.Char && code != IR.Any && code != IR.Backref)
{
@ -714,19 +712,19 @@ template ThompsonOps(E,S, bool withInput:false)
Thompson matcher does all matching in lockstep,
never looking at the same char twice
+/
@trusted struct ThompsonMatcher(Char, StreamType = Input!Char)
@trusted class ThompsonMatcher(Char, StreamType = Input!Char): Matcher!Char
if (is(Char : dchar))
{
alias DataIndex = Stream.DataIndex;
alias Stream = StreamType;
alias OpFunc = bool function(ThompsonMatcher*, State*);
alias OpFunc = bool function(ThompsonMatcher, State*);
alias BackMatcher = ThompsonMatcher!(Char, BackLooper!(Stream));
alias OpBackFunc = bool function(BackMatcher*, BackMatcher.State*);
alias OpBackFunc = bool function(BackMatcher, BackMatcher.State*);
Thread!DataIndex* freelist;
ThreadList!DataIndex clist, nlist;
DataIndex[] merge;
Group!DataIndex[] backrefed;
Regex!Char re; //regex program
const Regex!Char re; //regex program
Stream s;
dchar front;
DataIndex index;
@ -737,16 +735,18 @@ if (is(Char : dchar))
OpBackFunc[] opCacheBackTrue; // ditto
OpBackFunc[] opCacheBackFalse; // ditto
size_t threadSize;
size_t _refCount;
int matched;
bool exhausted;
final:
static struct State
{
Thread!DataIndex* t;
ThreadList!DataIndex worklist;
Group!DataIndex[] matches;
bool popState(E)(E* e)
bool popState(E)(E e)
{
with(e)
{
@ -784,6 +784,10 @@ if (is(Char : dchar))
//true if it's end of input
@property bool atEnd(){ return index == s.lastIndex && s.atEnd; }
override @property ref size_t refCount() @safe { return _refCount; }
override @property ref const(Regex!Char) pattern() @safe { return re; }
bool next()
{
if (!s.nextChar(front, index))
@ -843,19 +847,28 @@ if (is(Char : dchar))
}
}
this()(Regex!Char program, Stream stream, void[] memory)
this()(const Regex!Char program, Stream stream, void[] memory)
{
// We are emplace'd to malloced memory w/o blitting T.init over it,
// make sure we initialize all fields explicitly
_refCount = 1;
subCounters = null;
backrefed = null;
exhausted = false;
matched = 0;
re = program;
s = stream;
initExternalMemory(memory);
genCounter = 0;
}
this(ref ThompsonMatcher matcher, size_t lo, size_t hi, Stream stream)
this(ThompsonMatcher matcher, size_t lo, size_t hi, uint nGroup, Stream stream)
{
_refCount = 1;
subCounters = matcher.subCounters;
s = stream;
re = matcher.re;
re.ir = re.ir[lo .. hi];
auto code = matcher.re.ir[lo .. hi];
re = matcher.re.withCode(code).withNGroup(nGroup);
threadSize = matcher.threadSize;
merge = matcher.merge;
freelist = matcher.freelist;
@ -867,11 +880,13 @@ if (is(Char : dchar))
index = matcher.index;
}
this(ref BackMatcher matcher, size_t lo, size_t hi, Stream stream)
this(BackMatcher matcher, size_t lo, size_t hi, uint nGroup, Stream stream)
{
_refCount = 1;
subCounters = matcher.subCounters;
s = stream;
re = matcher.re;
re.ir = re.ir[lo .. hi];
auto code = matcher.re.ir[lo .. hi];
re = matcher.re.withCode(code).withNGroup(nGroup);
threadSize = matcher.threadSize;
merge = matcher.merge;
freelist = matcher.freelist;
@ -883,31 +898,35 @@ if (is(Char : dchar))
index = matcher.index;
}
auto fwdMatcher()(size_t lo, size_t hi, size_t counter)
auto fwdMatcher()(size_t lo, size_t hi, uint nGroup, size_t counter)
{
auto m = ThompsonMatcher!(Char, Stream)(this, lo, hi, s);
auto m = new ThompsonMatcher!(Char, Stream)(this, lo, hi, nGroup, s);
m.genCounter = counter;
return m;
}
auto bwdMatcher()(size_t lo, size_t hi, size_t counter)
auto bwdMatcher()(size_t lo, size_t hi, uint nGroup, size_t counter)
{
alias BackLooper = typeof(s.loopBack(index));
auto m = ThompsonMatcher!(Char, BackLooper)(this, lo, hi, s.loopBack(index));
auto m = new ThompsonMatcher!(Char, BackLooper)(this, lo, hi, nGroup, s.loopBack(index));
m.genCounter = counter;
m.next();
return m;
}
auto dupTo(void[] memory)
override void dupTo(Matcher!Char engine, void[] memory)
{
typeof(this) tmp = this;//bitblit
tmp.initExternalMemory(memory);
tmp.genCounter = 0;
return tmp;
auto thompson = cast(ThompsonMatcher) engine;
thompson.s = s;
thompson.subCounters = null;
thompson.front = front;
thompson.index = index;
thompson.matched = matched;
thompson.exhausted = exhausted;
thompson.initExternalMemory(memory);
}
int match(Group!DataIndex[] matches)
override int match(Group!DataIndex[] matches)
{
debug(std_regex_matcher)
writeln("------------------------------------------");
@ -1052,9 +1071,9 @@ if (is(Char : dchar))
{
debug(std_regex_matcher) writeln("---- Evaluating thread");
static if (withInput)
while (opCacheTrue.ptr[state.t.pc](&this, state)){}
while (opCacheTrue.ptr[state.t.pc](this, state)){}
else
while (opCacheFalse.ptr[state.t.pc](&this, state)){}
while (opCacheFalse.ptr[state.t.pc](this, state)){}
}
enum uint RestartPc = uint.max;
//match the input, evaluating IR without searching

View file

@ -298,7 +298,6 @@ module std.regex;
import std.range.primitives, std.traits;
import std.regex.internal.ir;
import std.regex.internal.thompson; //TODO: get rid of this dependency
import std.typecons; // : Flag, Yes, No;
/++
@ -339,10 +338,9 @@ public alias Regex(Char) = std.regex.internal.ir.Regex!(Char);
A $(D StaticRegex) is $(D Regex) object that contains D code specially
generated at compile-time to speed up matching.
Implicitly convertible to normal $(D Regex),
however doing so will result in losing this additional capability.
No longer used, kept as alias to Regex for backwards compatibility.
+/
public alias StaticRegex(Char) = std.regex.internal.ir.StaticRegex!(Char);
public alias StaticRegex = Regex;
/++
Compile regular expression pattern for the later execution.
@ -428,16 +426,25 @@ if (isSomeString!(S))
template ctRegexImpl(alias pattern, string flags=[])
{
import std.regex.internal.backtracking, std.regex.internal.parser;
enum r = regex(pattern, flags);
static immutable r = cast(immutable) regex(pattern, flags);
alias Char = BasicElementOf!(typeof(pattern));
enum source = ctGenRegExCode(r);
alias Matcher = BacktrackingMatcher!(true);
@trusted bool func(ref Matcher!Char matcher)
@trusted bool func(BacktrackingMatcher!Char matcher)
{
debug(std_regex_ctr) pragma(msg, source);
cast(void) matcher;
mixin(source);
}
enum nr = StaticRegex!Char(r, &func);
static immutable staticRe =
cast(immutable) r.withFactory(new CtfeFactory!(BacktrackingMatcher, Char, func));
// Field-less struct that stands in for the compiled regex:
// through `alias getRe this` it converts to whatever getRe returns.
struct Wrapper
{
// allow code that expects mutable Regex to still work
// we stay "logically const"
// getRe strips the qualifiers of staticRe with cast(), hence @trusted
@trusted @property auto getRe() const { return cast() staticRe; }
alias getRe this;
}
enum wrapper = Wrapper();
}
/++
@ -450,10 +457,10 @@ template ctRegexImpl(alias pattern, string flags=[])
pattern = Regular expression
flags = The _attributes (g, i, m, s and x accepted)
+/
public enum ctRegex(alias pattern, alias flags=[]) = ctRegexImpl!(pattern, flags).nr;
public enum ctRegex(alias pattern, alias flags=[]) = ctRegexImpl!(pattern, flags).wrapper;
enum isRegexFor(RegEx, R) = is(RegEx == Regex!(BasicElementOf!R))
|| is(RegEx == StaticRegex!(BasicElementOf!R));
enum isRegexFor(RegEx, R) = is(Unqual!RegEx == Regex!(BasicElementOf!R)) || is(RegEx : const(Regex!(BasicElementOf!R)))
|| is(Unqual!RegEx == StaticRegex!(BasicElementOf!R));
/++
@ -462,10 +469,10 @@ enum isRegexFor(RegEx, R) = is(RegEx == Regex!(BasicElementOf!R))
First element of range is the whole match.
+/
@trusted public struct Captures(R, DIndex = size_t)
@trusted public struct Captures(R)
if (isSomeString!R)
{//@trusted because of union inside
alias DataIndex = DIndex;
alias DataIndex = size_t;
alias String = R;
private:
import std.conv : text;
@ -480,9 +487,9 @@ private:
}
uint _f, _b;
uint _refcount; // ref count or SMALL MASK + num groups
NamedGroup[] _names;
const(NamedGroup)[] _names;
this()(R input, uint n, NamedGroup[] named)
this(R input, uint n, const(NamedGroup)[] named)
{
_input = input;
_names = named;
@ -491,11 +498,11 @@ private:
_f = 0;
}
this(alias Engine)(ref RegexMatch!(R,Engine) rmatch)
this(ref RegexMatch!R rmatch)
{
_input = rmatch._input;
_names = rmatch._engine.re.dict;
immutable n = rmatch._engine.re.ngroup;
_names = rmatch._engine.pattern.dict;
immutable n = rmatch._engine.pattern.ngroup;
newMatches(n);
_b = n;
_f = 0;
@ -693,58 +700,38 @@ public:
Effectively it's a forward range of Captures!R, produced
by lazily searching for matches in a given input.
$(D alias Engine) specifies an engine type to use during matching,
and is automatically deduced in a call to $(D match)/$(D bmatch).
+/
@trusted public struct RegexMatch(R, alias Engine = ThompsonMatcher)
@trusted public struct RegexMatch(R)
if (isSomeString!R)
{
private:
import core.stdc.stdlib : malloc, free;
alias Char = BasicElementOf!R;
alias EngineType = Engine!Char;
EngineType _engine;
Matcher!Char _engine;
const MatcherFactory!Char _factory;
R _input;
Captures!(R,EngineType.DataIndex) _captures;
void[] _memory;//is ref-counted
Captures!R _captures;
this(RegEx)(R input, RegEx prog)
{
import std.exception : enforce;
_input = input;
immutable size = EngineType.initialMemory(prog)+size_t.sizeof;
_memory = (enforce(malloc(size), "malloc failed")[0 .. size]);
scope(failure) free(_memory.ptr);
*cast(size_t*)_memory.ptr = 1;
_engine = EngineType(prog, Input!Char(input), _memory[size_t.sizeof..$]);
static if (is(RegEx == StaticRegex!(BasicElementOf!R)))
_engine.nativeFn = prog.nativeFn;
_captures = Captures!(R,EngineType.DataIndex)(this);
if (prog.factory is null) _factory = defaultFactory!Char(prog);
else _factory = prog.factory;
_engine = _factory.create(prog, input);
assert(_engine.refCount == 1);
_captures = Captures!R(this);
_captures._nMatch = _engine.match(_captures.matches);
debug(std_regex_allocation) writefln("RefCount (ctor): %x %d", _memory.ptr, counter);
}
@property ref size_t counter(){ return *cast(size_t*)_memory.ptr; }
public:
this(this)
{
if (_memory.ptr)
{
++counter;
debug(std_regex_allocation) writefln("RefCount (postblit): %x %d",
_memory.ptr, *cast(size_t*)_memory.ptr);
}
if (_engine) _factory.incRef(_engine);
}
~this()
{
if (_memory.ptr && --*cast(size_t*)_memory.ptr == 0)
{
debug(std_regex_allocation) writefln("RefCount (dtor): %x %d",
_memory.ptr, *cast(size_t*)_memory.ptr);
free(cast(void*)_memory.ptr);
}
if (_engine) _factory.decRef(_engine);
}
///Shorthands for front.pre, front.post, front.hit.
@ -786,19 +773,18 @@ public:
void popFront()
{
import std.exception : enforce;
if (counter != 1)
{//do cow magic first
counter--;//we abandon this reference
immutable size = EngineType.initialMemory(_engine.re)+size_t.sizeof;
_memory = (enforce(malloc(size), "malloc failed")[0 .. size]);
_engine = _engine.dupTo(_memory[size_t.sizeof .. size]);
counter = 1;//points to new chunk
// CoW - if refCount is not 1, we are aliased by somebody else
if (_engine.refCount != 1)
{
// we create a new engine & abandon this reference
auto old = _engine;
_engine = _factory.dup(old, _input);
_factory.decRef(old);
}
if (!_captures.unique)
{
// has external references - allocate new space
_captures.newMatches(_engine.re.ngroup);
_captures.newMatches(_engine.pattern.ngroup);
}
_captures._nMatch = _engine.match(_captures.matches);
}
@ -814,39 +800,30 @@ public:
/// Same as .front, provided for compatibility with original std.regex.
@property auto captures() inout { return _captures; }
}
private @trusted auto matchOnce(alias Engine, RegEx, R)(R input, RegEx re)
private @trusted auto matchOnce(RegEx, R)(R input, const RegEx prog)
{
import core.stdc.stdlib : malloc, free;
import std.exception : enforce;
alias Char = BasicElementOf!R;
alias EngineType = Engine!Char;
size_t size = EngineType.initialMemory(re);
void[] memory = enforce(malloc(size), "malloc failed")[0 .. size];
scope(exit) free(memory.ptr);
auto captures = Captures!(R, EngineType.DataIndex)(input, re.ngroup, re.dict);
auto engine = EngineType(re, Input!Char(input), memory);
static if (is(RegEx == StaticRegex!(BasicElementOf!R)))
engine.nativeFn = re.nativeFn;
auto factory = prog.factory is null ? defaultFactory!Char(prog) : prog.factory;
auto engine = factory.create(prog, input);
scope(exit) factory.decRef(engine); // destroys the engine
auto captures = Captures!R(input, prog.ngroup, prog.dict);
captures._nMatch = engine.match(captures.matches);
return captures;
}
private auto matchMany(alias Engine, RegEx, R)(R input, RegEx re)
private auto matchMany(RegEx, R)(R input, RegEx re) @safe
{
re.flags |= RegexOption.global;
return RegexMatch!(R, Engine)(input, re);
return RegexMatch!R(input, re.withFlags(re.flags | RegexOption.global));
}
@system unittest
{
//sanity checks for new API
auto re = regex("abc");
assert(!"abc".matchOnce!(ThompsonMatcher)(re).empty);
assert("abc".matchOnce!(ThompsonMatcher)(re)[0] == "abc");
assert(!"abc".matchOnce(re).empty);
assert("abc".matchOnce(re)[0] == "abc");
}
@ -938,25 +915,16 @@ if (isSomeString!R && isRegexFor!(RegEx, R))
+/
// Regex-object overload: wraps `input` and the already-compiled `re` in a
// RegexMatch whose input type is typeof(input) with qualifiers removed (Unqual).
//
// NOTE(review): diff hunk without +/- markers; the two `if` constraints and
// the two `return` lines are alternative versions of the same lines, not
// sequential code. Presumably the Regex!(...) constraint and the
// ThompsonMatcher return are the older text; confirm against the real file.
public auto match(R, RegEx)(R input, RegEx re)
// Constraint accepting exactly Regex!(BasicElementOf!R).
if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R)))
// Constraint delegating the regex-type check to isRegexFor.
if (isSomeString!R && isRegexFor!(RegEx,R))
{
import std.regex.internal.thompson : ThompsonMatcher;
return RegexMatch!(Unqual!(typeof(input)),ThompsonMatcher)(input, re);
return RegexMatch!(Unqual!(typeof(input)))(input, re);
}
///ditto
// String-pattern overload: compiles `re` with regex(re) and wraps the result
// together with `input` in a RegexMatch.
//
// NOTE(review): diff hunk without +/- markers. The lines of this overload are
// interleaved with a second overload constrained on StaticRegex that returns
// a BacktrackingMatcher!true based RegexMatch. Presumably that overload and
// the ThompsonMatcher lines are text the change removed, leaving only the
// final `return ... regex(re)` as the live body; confirm against the real file.
public auto match(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
import std.regex.internal.thompson : ThompsonMatcher;
return RegexMatch!(Unqual!(typeof(input)),ThompsonMatcher)(input, regex(re));
}
// Overload selected when RegEx is exactly StaticRegex!(BasicElementOf!R).
public auto match(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
{
import std.regex.internal.backtracking : BacktrackingMatcher;
return RegexMatch!(Unqual!(typeof(input)),BacktrackingMatcher!true)(input, re);
return RegexMatch!(Unqual!(typeof(input)))(input, regex(re));
}
/++
@ -978,33 +946,23 @@ if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
if there was a match, otherwise an empty $(LREF Captures) object.
+/
// Regex-object overload: forwards `input` and the compiled `re` to matchOnce
// and returns what it returns.
//
// NOTE(review): diff hunk without +/- markers; the two `if` constraints and
// the two `return` lines are alternative versions of the same lines.
// Presumably the Regex!(...) constraint and the matchOnce!ThompsonMatcher call
// are the older text; confirm against the real file.
public auto matchFirst(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R)))
if (isSomeString!R && isRegexFor!(RegEx, R))
{
import std.regex.internal.thompson : ThompsonMatcher;
// Form that names the engine explicitly.
return matchOnce!ThompsonMatcher(input, re);
// Form that leaves engine selection to matchOnce.
return matchOnce(input, re);
}
///ditto
// String-pattern overload: compiles `re` with regex(re), then forwards to
// matchOnce.
//
// NOTE(review): diff hunk without +/- markers; the two `return` lines are
// alternative versions of one line. Presumably the ThompsonMatcher import and
// call are the older text; confirm against the real file.
public auto matchFirst(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
import std.regex.internal.thompson : ThompsonMatcher;
return matchOnce!ThompsonMatcher(input, regex(re));
return matchOnce(input, regex(re));
}
///ditto
// Variadic overload: takes several pattern strings (`String[] re...`), passes
// the whole array to regex(re) and forwards the result to matchOnce.
//
// NOTE(review): diff hunk without +/- markers. This overload is interleaved
// with a second overload constrained on StaticRegex that calls
// matchOnce!(BacktrackingMatcher!true). Presumably that overload and the
// ThompsonMatcher lines are text the change removed, leaving only the final
// `return matchOnce(input, regex(re))` as the live body; confirm against the
// real file.
public auto matchFirst(R, String)(R input, String[] re...)
if (isSomeString!R && isSomeString!String)
{
import std.regex.internal.thompson : ThompsonMatcher;
return matchOnce!ThompsonMatcher(input, regex(re));
}
// Overload selected when RegEx is exactly StaticRegex!(BasicElementOf!R).
public auto matchFirst(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
{
import std.regex.internal.backtracking : BacktrackingMatcher;
return matchOnce!(BacktrackingMatcher!true)(input, re);
return matchOnce(input, regex(re));
}
/++
@ -1029,33 +987,23 @@ if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
after the first match was found or an empty one if not present.
+/
// Regex-object overload: forwards `input` and the compiled `re` to matchMany
// and returns what it returns.
//
// NOTE(review): diff hunk without +/- markers; the two `if` constraints and
// the two `return` lines are alternative versions of the same lines.
// Presumably the Regex!(...) constraint and the matchMany!ThompsonMatcher call
// are the older text; confirm against the real file.
public auto matchAll(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R)))
if (isSomeString!R && isRegexFor!(RegEx, R))
{
import std.regex.internal.thompson : ThompsonMatcher;
// Form that names the engine explicitly.
return matchMany!ThompsonMatcher(input, re);
// Form that leaves engine selection to matchMany.
return matchMany(input, re);
}
///ditto
// String-pattern overload: compiles `re` with regex(re), then forwards to
// matchMany.
//
// NOTE(review): diff hunk without +/- markers; the two `return` lines are
// alternative versions of one line. Presumably the ThompsonMatcher import and
// call are the older text; confirm against the real file.
public auto matchAll(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
import std.regex.internal.thompson : ThompsonMatcher;
return matchMany!ThompsonMatcher(input, regex(re));
return matchMany(input, regex(re));
}
///ditto
// Variadic overload: takes several pattern strings (`String[] re...`), passes
// the whole array to regex(re) and forwards the result to matchMany.
//
// NOTE(review): diff hunk without +/- markers. This overload is interleaved
// with a second overload constrained on StaticRegex that calls
// matchMany!(BacktrackingMatcher!true). Presumably that overload and the
// ThompsonMatcher lines are text the change removed, leaving only the final
// `return matchMany(input, regex(re))` as the live body; confirm against the
// real file.
public auto matchAll(R, String)(R input, String[] re...)
if (isSomeString!R && isSomeString!String)
{
import std.regex.internal.thompson : ThompsonMatcher;
return matchMany!ThompsonMatcher(input, regex(re));
}
// Overload selected when RegEx is exactly StaticRegex!(BasicElementOf!R).
public auto matchAll(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
{
import std.regex.internal.backtracking : BacktrackingMatcher;
return matchMany!(BacktrackingMatcher!true)(input, re);
return matchMany(input, regex(re));
}
// another set of tests just to cover the new API
@ -1119,25 +1067,16 @@ if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
+/
// Regex-object overload: wraps `input` and the already-compiled `re` in a
// RegexMatch whose input type is typeof(input) with qualifiers removed (Unqual).
//
// NOTE(review): diff hunk without +/- markers; the two `if` constraints and
// the two `return` lines are alternative versions of the same lines.
// Presumably the Regex!(...) constraint and the BacktrackingMatcher!false
// return are the older text; confirm against the real file.
public auto bmatch(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R)))
if (isSomeString!R && isRegexFor!(RegEx, R))
{
import std.regex.internal.backtracking : BacktrackingMatcher;
// Form that pins the engine to BacktrackingMatcher!false.
return RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!false)(input, re);
// Form with no engine argument; it is spelled the same as the
// engine-less return shown in `match`.
return RegexMatch!(Unqual!(typeof(input)))(input, re);
}
///ditto
// String-pattern overload: compiles `re` with regex(re) and wraps the result
// together with `input` in a RegexMatch.
//
// NOTE(review): diff hunk without +/- markers. This overload is interleaved
// with a second overload constrained on StaticRegex that returns a
// BacktrackingMatcher!true based RegexMatch. Presumably that overload and the
// BacktrackingMatcher!false lines are text the change removed, leaving only
// the final `return ... regex(re)` as the live body; confirm against the real
// file.
public auto bmatch(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
import std.regex.internal.backtracking : BacktrackingMatcher;
return RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!false)(input, regex(re));
}
// Overload selected when RegEx is exactly StaticRegex!(BasicElementOf!R).
public auto bmatch(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
{
import std.regex.internal.backtracking : BacktrackingMatcher;
return RegexMatch!(Unqual!(typeof(input)),BacktrackingMatcher!true)(input, re);
return RegexMatch!(Unqual!(typeof(input)))(input, regex(re));
}
// produces replacement string from format using captures for substitution
@ -1530,7 +1469,7 @@ private:
@trusted this(Range input, RegEx separator)
{//@@@BUG@@@ generated opAssign of RegexMatch is not @trusted
_input = input;
separator.flags |= RegexOption.global;
const re = separator.withFlags(separator.flags | RegexOption.global);
if (_input.empty)
{
//there is nothing to match at all, make _offset > 0
@ -1538,7 +1477,7 @@ private:
}
else
{
_match = Rx(_input, separator);
_match = Rx(_input, re);
static if (keepSeparators)
if (_match.pre.empty)