Merge pull request #5722 from DmitryOlshansky/regex-matcher-interfaces

std.regex: major internal redesign, also fixes issue 13532
merged-on-behalf-of: Andrei Alexandrescu <andralex@users.noreply.github.com>
This commit is contained in:
The Dlang Bot 2017-10-16 20:16:33 +02:00 committed by GitHub
commit ad489989ec
8 changed files with 1140 additions and 976 deletions

File diff suppressed because it is too large Load diff

View file

@ -200,7 +200,7 @@ bool isAtomIR(IR i)
IR pairedIR(IR i)
{
assert(isStartIR(i) || isEndIR(i));
return cast(IR)(i ^ 0b11);
return cast(IR) (i ^ 0b11);
}
//encoded IR instruction
@ -423,6 +423,134 @@ struct Group(DataIndex)
writeln("\t", disassemble(slice, pc, dict));
}
// Encapsulates memory management, explicit ref counting
// and the exact type of engine created.
// There is a single instance per engine combination type x Char.
// In future may also maintain a (TLS?) cache of memory
interface MatcherFactory(Char)
{
@safe:
// Creates a new matcher for the given pattern over `input`
// (GenericFactory, the implementation in this module, hands it out with refCount == 1)
Matcher!Char create(const Regex!Char, in Char[] input) const;
// Creates a separate matcher derived from `m` that runs over `input`;
// GenericFactory constructs it for m.pattern and then copies state via m.dupTo
Matcher!Char dup(Matcher!Char m, in Char[] input) const;
// Increments the reference count of `m` and returns the new count
size_t incRef(Matcher!Char m) const;
// Decrements the reference count of `m` and returns the new count;
// GenericFactory releases the matcher's memory once the count reaches zero
size_t decRef(Matcher!Char m) const;
}
// Only memory management, no compile-time vs run-time specialities.
//
// Every engine lives in one malloc'ed chunk: the first classSize bytes hold the
// class instance, the remainder is handed to the engine as its working memory.
// The instance part is registered with the GC via GC.addRange and unregistered
// again right before the chunk is freed.
abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char
{
    import core.stdc.stdlib : malloc, free;
    import core.memory : GC;

    // Size in bytes of an EngineType!Char instance (the head of every chunk)
    enum classSize = __traits(classInstanceSize, EngineType!Char);

    // Builds the engine in-place inside `memory`; supplied by the concrete factory.
    // The instance must start at memory.ptr (create asserts this).
    Matcher!Char construct(const Regex!Char re, in Char[] input, void[] memory) const;

    // Allocates a chunk and constructs a fresh engine for `re` over `input`.
    // Returns: the new engine with refCount == 1.
    // Throws: if malloc fails (via enforce) or if construction throws; in both
    // cases nothing stays allocated or registered with the GC.
    override Matcher!Char create(const Regex!Char re, in Char[] input) const @trusted
    {
        immutable size = EngineType!Char.initialMemory(re) + classSize;
        auto memory = enforce(malloc(size), "malloc failed")[0 .. size];
        scope(failure) free(memory.ptr);
        GC.addRange(memory.ptr, classSize);
        // Scope guards run in reverse order: on failure the range is
        // unregistered first and only then is the chunk freed, so the GC
        // never keeps a range that points into freed memory.
        scope(failure) GC.removeRange(memory.ptr);
        auto engine = construct(re, input, memory);
        assert(engine.refCount == 1);
        assert(cast(void*) engine == memory.ptr);
        return engine;
    }

    // Allocates a chunk, constructs an engine for the same pattern as `engine`
    // and lets `engine` copy its state into it through dupTo.
    // Returns: the copy with refCount == 1.
    // Throws: same guarantees as create - on failure the chunk is unregistered
    // from the GC and freed.
    override Matcher!Char dup(Matcher!Char engine, in Char[] input) const @trusted
    {
        immutable size = EngineType!Char.initialMemory(engine.pattern) + classSize;
        auto memory = enforce(malloc(size), "malloc failed")[0 .. size];
        scope(failure) free(memory.ptr);
        // Register before constructing, same order as in create
        GC.addRange(memory.ptr, classSize);
        scope(failure) GC.removeRange(memory.ptr);
        auto copy = construct(engine.pattern, input, memory);
        engine.dupTo(copy, memory[classSize .. size]);
        assert(copy.refCount == 1);
        return copy;
    }

    // Adds one reference to `m`; returns the updated count
    override size_t incRef(Matcher!Char m) const
    {
        return ++m.refCount;
    }

    // Drops one reference to `m`; returns the updated count.
    // When it reaches zero the GC range is removed and the chunk is freed,
    // `m` must not be used afterwards.
    override size_t decRef(Matcher!Char m) const @trusted
    {
        assert(m.refCount != 0);
        auto cnt = --m.refCount;
        if (cnt == 0)
        {
            // The instance sits at the very start of the malloc'ed chunk
            void* ptr = cast(void*) m;
            GC.removeRange(ptr);
            free(ptr);
        }
        return cnt;
    }
}
// Factory for engines that execute the pattern at run time
class RuntimeFactory(alias EngineType, Char) : GenericFactory!(EngineType, Char)
{
    // Emplaces an EngineType!Char at the head of `memory`; whatever follows
    // the instance is passed to the engine constructor as its memory
    override EngineType!Char construct(const Regex!Char re, in Char[] input, void[] memory) const
    {
        import std.conv : emplace;
        void[] instanceChunk = memory[0 .. classSize];
        void[] arena = memory[classSize .. $];
        return emplace!(EngineType!Char)(instanceChunk, re, Input!Char(input), arena);
    }
}
// Factory for the compile-time generated engine; `func` is the generated
// matching function whose address is handed to the engine constructor
class CtfeFactory(alias EngineType, Char, alias func) : GenericFactory!(EngineType, Char)
{
    // Emplaces an EngineType!Char at the head of `memory`; whatever follows
    // the instance is passed to the engine constructor as its memory
    override EngineType!Char construct(const Regex!Char re, in Char[] input, void[] memory) const
    {
        import std.conv : emplace;
        void[] instanceChunk = memory[0 .. classSize];
        void[] arena = memory[classSize .. $];
        return emplace!(EngineType!Char)(instanceChunk, re, &func, Input!Char(input), arena);
    }
}
// A workaround for R-T enum re = regex(...): such patterns carry no factory,
// so one is picked here on demand and cached in function-level statics
template defaultFactory(Char)
{
    @property MatcherFactory!Char defaultFactory(const Regex!Char re)
    {
        import std.regex.internal.backtracking : BacktrackingMatcher;
        import std.regex.internal.thompson : ThompsonMatcher;
        import std.algorithm.searching : canFind;
        static MatcherFactory!Char cachedBacktracking;
        static MatcherFactory!Char cachedThompson;

        // any non-zero word in the bit array means some group is backreferenced
        immutable usesBackrefs = re.backrefed.canFind!"a != 0";
        if (!usesBackrefs)
        {
            if (cachedThompson is null)
                cachedThompson = new RuntimeFactory!(ThompsonMatcher, Char);
            return cachedThompson;
        }
        // patterns with backreferences go to the backtracking engine
        if (cachedBacktracking is null)
            cachedBacktracking = new RuntimeFactory!(BacktrackingMatcher, Char);
        return cachedBacktracking;
    }
}
// Common base class of the matching engines.
// Defining it as an interface has the undesired side-effect:
// casting any class to an interface silently adjusts pointer to point to a nested vtbl
// (GenericFactory relies on the unadjusted pointer: it compares cast(void*) of the
// engine with the start of the malloc'ed chunk and frees that pointer in decRef)
abstract class Matcher(Char)
{
abstract:
// Get a (next) match, `matches` is the storage for the captured groups.
// NOTE(review): callers store the result as the number/id of the match,
// zero presumably meaning "no match" - confirm against the engines
int match(Group!size_t[] matches);
// This only maintains internal ref-count,
// deallocation happens inside MatcherFactory.
// Returned by ref so that the factory can increment/decrement it in place
@property ref size_t refCount() @safe;
// Copy internal state to another engine, using memory arena 'memory'
void dupTo(Matcher!Char m, void[] memory);
// The pattern loaded; the factory also uses it to size and construct copies in dup
@property ref const(Regex!Char) pattern() @safe;
}
/++
$(D Regex) object holds regular expression pattern in compiled form.
Instances of this object are constructed via calls to $(D regex).
@ -443,11 +571,11 @@ struct Regex(Char)
static struct NamedGroupRange
{
private:
NamedGroup[] groups;
const(NamedGroup)[] groups;
size_t start;
size_t end;
public:
this(NamedGroup[] g, size_t s, size_t e)
this(const(NamedGroup)[] g, size_t s, size_t e)
{
assert(s <= e);
assert(e <= g.length);
@ -485,7 +613,7 @@ struct Regex(Char)
package(std.regex):
import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency
NamedGroup[] dict; // maps name -> user group number
const(NamedGroup)[] dict; // maps name -> user group number
uint ngroup; // number of internal groups
uint maxCounterDepth; // max depth of nested {n,m} repetitions
uint hotspotTableSize; // number of entries in merge table
@ -495,6 +623,35 @@ package(std.regex):
public const(BitTable)[] filters; // bloom filters for conditional loops
uint[] backrefed; // bit array of backreferenced submatches
Kickstart!Char kickstart;
MatcherFactory!Char factory; // produces optimal matcher for this pattern
// Returns a copy of this pattern whose matchers are produced by `factory`.
// @trusted: const is cast away only to build the modified copy
const(Regex) withFactory(MatcherFactory!Char factory) pure const @trusted
{
    auto updated = cast() this;
    updated.factory = factory;
    return updated;
}
// Returns a copy of this pattern with its flags replaced by `newFlags`.
// @trusted: const is cast away only to build the modified copy
const(Regex) withFlags(uint newFlags) pure const @trusted
{
    auto updated = cast() this;
    updated.flags = newFlags;
    return updated;
}
// Returns a copy of this pattern that runs a duplicate of `code` as its bytecode.
// @trusted: const is cast away only to build the modified copy
const(Regex) withCode(const(Bytecode)[] code) pure const @trusted
{
    auto updated = cast() this;
    updated.ir = code.dup; // TODO: sidestep const instead?
    return updated;
}
// Returns a copy of this pattern with the number of internal groups set to `nGroup`.
// @trusted: const is cast away only to build the modified copy
const(Regex) withNGroup(uint nGroup) pure const @trusted
{
    auto updated = cast() this;
    updated.ngroup = nGroup;
    return updated;
}
//bit access helper
uint isBackref(uint n)
@ -537,26 +694,6 @@ package(std.regex):
}
//@@@BUG@@@ (unreduced) - public makes it inaccessible in std.regex.package (!)
/*public*/ struct StaticRegex(Char)
{
package(std.regex):
import std.regex.internal.backtracking : BacktrackingMatcher;
alias Matcher = BacktrackingMatcher!(true);
alias MatchFn = bool function(ref Matcher!Char) @trusted;
MatchFn nativeFn;
public:
Regex!Char _regex;
alias _regex this;
this(Regex!Char re, MatchFn fn)
{
_regex = re;
nativeFn = fn;
}
}
// The stuff below this point is temporarily part of IR module
// but may need a better place in the future (all internals)
package(std.regex):
@ -593,7 +730,7 @@ if (is(Char :dchar))
@property bool atEnd(){
return _index == _origin.length;
}
bool search(Kickstart)(ref Kickstart kick, ref dchar res, ref size_t pos)
bool search(Kickstart)(ref const Kickstart kick, ref dchar res, ref size_t pos)
{
size_t idx = kick.search(_origin, _index);
_index = idx;
@ -676,7 +813,7 @@ template BackLooper(E)
}
//
@trusted uint lookupNamedGroup(String)(NamedGroup[] dict, String name)
@trusted uint lookupNamedGroup(String)(const(NamedGroup)[] dict, String name)
{//equal is @system?
import std.algorithm.comparison : equal;
import std.algorithm.iteration : map;

View file

@ -393,7 +393,7 @@ public:
// has a useful trait: if supplied with valid UTF indexes,
// returns only valid UTF indexes
// (that given the haystack in question is valid UTF string)
@trusted size_t search(const(Char)[] haystack, size_t idx)
@trusted size_t search(const(Char)[] haystack, size_t idx) const
{//@BUG: apparently assumes little endian machines
import core.stdc.string : memchr;
import std.conv : text;

View file

@ -12,7 +12,11 @@ static import std.ascii;
// package relevant info from parser into a regex object
auto makeRegex(S, CG)(Parser!(S, CG) p)
{
Regex!(BasicElementOf!S) re;
import std.regex.internal.backtracking : BacktrackingMatcher;
import std.regex.internal.thompson : ThompsonMatcher;
import std.algorithm.searching : canFind;
alias Char = BasicElementOf!S;
Regex!Char re;
auto g = p.g;
with(re)
{
@ -25,6 +29,12 @@ auto makeRegex(S, CG)(Parser!(S, CG) p)
matchers = g.matchers;
backrefed = g.backrefed;
re.postprocess();
// check if we have backreferences, if so - use backtracking
if (__ctfe) factory = null; // allows us to use the awful enum re = regex(...);
else if (re.backrefed.canFind!"a != 0")
factory = new RuntimeFactory!(BacktrackingMatcher, Char);
else
factory = new RuntimeFactory!(ThompsonMatcher, Char);
debug(std_regex_parser)
{
__ctfe || print();

View file

@ -518,11 +518,11 @@ alias Sequence(int B, int E) = staticIota!(B, E);
{
import std.algorithm.comparison : equal;
auto rtr = regex("a|b|c");
enum ctr = regex("a|b|c");
static ctr = regex("a|b|c");
assert(equal(rtr.ir,ctr.ir));
//CTFE parser BUG is triggered by group
//in the middle of alternation (at least not first and not last)
enum testCT = regex(`abc|(edf)|xyz`);
static testCT = regex(`abc|(edf)|xyz`);
auto testRT = regex(`abc|(edf)|xyz`);
assert(equal(testCT.ir,testRT.ir));
}
@ -996,6 +996,36 @@ alias Sequence(int B, int E) = staticIota!(B, E);
assertThrown(regex(`^((x)(?=\1))`));
}
// bugzilla 13532
// Compares matching speed of a ctRegex stored in an enum with one stored in
// a static immutable; the enum version must not be drastically slower.
version(none) // TODO: revisit once we have proper benchmark framework
@safe unittest
{
    import std.datetime.stopwatch : StopWatch, AutoStart;
    import std.math : abs;
    import std.conv : to;
    enum re1 = ctRegex!`[0-9][0-9]`;
    immutable static re2 = ctRegex!`[0-9][0-9]`;
    immutable iterations = 1_000_000;
    size_t result1 = 0, result2 = 0;
    auto sw = StopWatch(AutoStart.yes);
    foreach (_; 0 .. iterations)
    {
        result1 += matchFirst("12345678", re1).length;
    }
    // re1 is the enum regex, so the first loop is the enum timing
    const enumTime = sw.peek();
    sw.reset();
    foreach (_; 0 .. iterations)
    {
        result2 += matchFirst("12345678", re2).length;
    }
    // re2 is the static regex
    const staticTime = sw.peek();
    assert(result1 == result2);
    auto ratio = 1.0 * enumTime.total!"usecs" / staticTime.total!"usecs";
    // enum is faster, or it is less than 75% slower than static
    assert(ratio < 1.0 || abs(ratio - 1.0) < 0.75,
        "enum regex to static regex ratio "~to!string(ratio));
}
// bugzilla 14504
@safe unittest
{

View file

@ -89,7 +89,7 @@ struct ThreadList(DataIndex)
template ThompsonOps(E, S, bool withInput:true)
{
@trusted:
static bool op(IR code:IR.End)(E* e, S* state)
static bool op(IR code:IR.End)(E e, S* state)
{
with(e) with(state)
{
@ -105,7 +105,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Wordboundary)(E* e, S* state)
static bool op(IR code:IR.Wordboundary)(E e, S* state)
{
with(e) with(state)
{
@ -137,7 +137,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Notwordboundary)(E* e, S* state)
static bool op(IR code:IR.Notwordboundary)(E e, S* state)
{
with(e) with(state)
{
@ -167,7 +167,7 @@ template ThompsonOps(E, S, bool withInput:true)
return true;
}
static bool op(IR code:IR.Bof)(E* e, S* state)
static bool op(IR code:IR.Bof)(E e, S* state)
{
with(e) with(state)
{
@ -183,7 +183,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Bol)(E* e, S* state)
static bool op(IR code:IR.Bol)(E e, S* state)
{
with(e) with(state)
{
@ -203,7 +203,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Eof)(E* e, S* state)
static bool op(IR code:IR.Eof)(E e, S* state)
{
with(e) with(state)
{
@ -219,7 +219,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Eol)(E* e, S* state)
static bool op(IR code:IR.Eol)(E e, S* state)
{
with(e) with(state)
{
@ -240,42 +240,42 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.InfiniteStart)(E* e, S* state)
static bool op(IR code:IR.InfiniteStart)(E e, S* state)
{
with(e) with(state)
t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteStart);
return op!(IR.InfiniteEnd)(e,state);
}
static bool op(IR code:IR.InfiniteBloomStart)(E* e, S* state)
static bool op(IR code:IR.InfiniteBloomStart)(E e, S* state)
{
with(e) with(state)
t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteBloomStart);
return op!(IR.InfiniteBloomEnd)(e,state);
}
static bool op(IR code:IR.InfiniteQStart)(E* e, S* state)
static bool op(IR code:IR.InfiniteQStart)(E e, S* state)
{
with(e) with(state)
t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteQStart);
return op!(IR.InfiniteQEnd)(e,state);
}
static bool op(IR code:IR.RepeatStart)(E* e, S* state)
static bool op(IR code:IR.RepeatStart)(E e, S* state)
{
with(e) with(state)
t.pc += re.ir[t.pc].data + IRL!(IR.RepeatStart);
return op!(IR.RepeatEnd)(e,state);
}
static bool op(IR code:IR.RepeatQStart)(E* e, S* state)
static bool op(IR code:IR.RepeatQStart)(E e, S* state)
{
with(e) with(state)
t.pc += re.ir[t.pc].data + IRL!(IR.RepeatQStart);
return op!(IR.RepeatQEnd)(e,state);
}
static bool op(IR code)(E* e, S* state)
static bool op(IR code)(E e, S* state)
if (code == IR.RepeatEnd || code == IR.RepeatQEnd)
{
with(e) with(state)
@ -330,7 +330,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code)(E* e, S* state)
static bool op(IR code)(E e, S* state)
if (code == IR.InfiniteEnd || code == IR.InfiniteQEnd)
{
with(e) with(state)
@ -365,7 +365,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code)(E* e, S* state)
static bool op(IR code)(E e, S* state)
if (code == IR.InfiniteBloomEnd)
{
with(e) with(state)
@ -394,7 +394,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.OrEnd)(E* e, S* state)
static bool op(IR code:IR.OrEnd)(E e, S* state)
{
with(e) with(state)
{
@ -415,7 +415,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.OrStart)(E* e, S* state)
static bool op(IR code:IR.OrStart)(E e, S* state)
{
with(e) with(state)
{
@ -424,7 +424,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Option)(E* e, S* state)
static bool op(IR code:IR.Option)(E e, S* state)
{
with(e) with(state)
{
@ -439,7 +439,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.GotoEndOr)(E* e, S* state)
static bool op(IR code:IR.GotoEndOr)(E e, S* state)
{
with(e) with(state)
{
@ -448,7 +448,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.GroupStart)(E* e, S* state)
static bool op(IR code:IR.GroupStart)(E e, S* state)
{
with(e) with(state)
{
@ -458,7 +458,7 @@ template ThompsonOps(E, S, bool withInput:true)
return true;
}
}
static bool op(IR code:IR.GroupEnd)(E* e, S* state)
static bool op(IR code:IR.GroupEnd)(E e, S* state)
{
with(e) with(state)
{
@ -469,7 +469,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Backref)(E* e, S* state)
static bool op(IR code:IR.Backref)(E e, S* state)
{
with(e) with(state)
{
@ -506,7 +506,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
static bool op(IR code)(E* e, S* state)
static bool op(IR code)(E e, S* state)
if (code == IR.LookbehindStart || code == IR.NeglookbehindStart)
{
with(e) with(state)
@ -516,10 +516,9 @@ template ThompsonOps(E, S, bool withInput:true)
uint end = t.pc + len + IRL!(IR.LookbehindEnd) + IRL!(IR.LookbehindStart);
bool positive = re.ir[t.pc].code == IR.LookbehindStart;
static if (Stream.isLoopback)
auto matcher = fwdMatcher(t.pc, end, subCounters.get(t.pc, 0));
auto matcher = fwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0));
else
auto matcher = bwdMatcher(t.pc, end, subCounters.get(t.pc, 0));
matcher.re.ngroup = me - ms;
auto matcher = bwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0));
matcher.backrefed = backrefed.empty ? t.matches : backrefed;
//backMatch
auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookbehindStart));
@ -534,7 +533,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code)(E* e, S* state)
static bool op(IR code)(E e, S* state)
if (code == IR.LookaheadStart || code == IR.NeglookaheadStart)
{
with(e) with(state)
@ -545,10 +544,9 @@ template ThompsonOps(E, S, bool withInput:true)
uint end = t.pc+len+IRL!(IR.LookaheadEnd)+IRL!(IR.LookaheadStart);
bool positive = re.ir[t.pc].code == IR.LookaheadStart;
static if (Stream.isLoopback)
auto matcher = bwdMatcher(t.pc, end, subCounters.get(t.pc, 0));
auto matcher = bwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0));
else
auto matcher = fwdMatcher(t.pc, end, subCounters.get(t.pc, 0));
matcher.re.ngroup = me - ms;
auto matcher = fwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0));
matcher.backrefed = backrefed.empty ? t.matches : backrefed;
auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookaheadStart));
freelist = matcher.freelist;
@ -564,7 +562,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code)(E* e, S* state)
static bool op(IR code)(E e, S* state)
if (code == IR.LookaheadEnd || code == IR.NeglookaheadEnd ||
code == IR.LookbehindEnd || code == IR.NeglookbehindEnd)
{
@ -579,13 +577,13 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Nop)(E* e, S* state)
static bool op(IR code:IR.Nop)(E e, S* state)
{
with(state) t.pc += IRL!(IR.Nop);
return true;
}
static bool op(IR code:IR.OrChar)(E* e, S* state)
static bool op(IR code:IR.OrChar)(E e, S* state)
{
with(e) with(state)
{
@ -607,7 +605,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Char)(E* e, S* state)
static bool op(IR code:IR.Char)(E e, S* state)
{
with(e) with(state)
{
@ -623,7 +621,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Any)(E* e, S* state)
static bool op(IR code:IR.Any)(E e, S* state)
{
with(e) with(state)
{
@ -634,7 +632,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.CodepointSet)(E* e, S* state)
static bool op(IR code:IR.CodepointSet)(E e, S* state)
{
with(e) with(state)
{
@ -652,7 +650,7 @@ template ThompsonOps(E, S, bool withInput:true)
}
}
static bool op(IR code:IR.Trie)(E* e, S* state)
static bool op(IR code:IR.Trie)(E e, S* state)
{
with(e) with(state)
{
@ -676,7 +674,7 @@ template ThompsonOps(E,S, bool withInput:false)
{
@trusted:
// can't match these without input
static bool op(IR code)(E* e, S* state)
static bool op(IR code)(E e, S* state)
if (code == IR.Char || code == IR.OrChar || code == IR.CodepointSet
|| code == IR.Trie || code == IR.Char || code == IR.Any)
{
@ -684,7 +682,7 @@ template ThompsonOps(E,S, bool withInput:false)
}
// special case of zero-width backref
static bool op(IR code:IR.Backref)(E* e, S* state)
static bool op(IR code:IR.Backref)(E e, S* state)
{
with(e) with(state)
{
@ -702,7 +700,7 @@ template ThompsonOps(E,S, bool withInput:false)
}
// forward all control flow to normal versions
static bool op(IR code)(E* e, S* state)
static bool op(IR code)(E e, S* state)
if (code != IR.Char && code != IR.OrChar && code != IR.CodepointSet
&& code != IR.Trie && code != IR.Char && code != IR.Any && code != IR.Backref)
{
@ -714,19 +712,19 @@ template ThompsonOps(E,S, bool withInput:false)
Thompson matcher does all matching in lockstep,
never looking at the same char twice
+/
@trusted struct ThompsonMatcher(Char, StreamType = Input!Char)
@trusted class ThompsonMatcher(Char, StreamType = Input!Char): Matcher!Char
if (is(Char : dchar))
{
alias DataIndex = Stream.DataIndex;
alias Stream = StreamType;
alias OpFunc = bool function(ThompsonMatcher*, State*);
alias OpFunc = bool function(ThompsonMatcher, State*);
alias BackMatcher = ThompsonMatcher!(Char, BackLooper!(Stream));
alias OpBackFunc = bool function(BackMatcher*, BackMatcher.State*);
alias OpBackFunc = bool function(BackMatcher, BackMatcher.State*);
Thread!DataIndex* freelist;
ThreadList!DataIndex clist, nlist;
DataIndex[] merge;
Group!DataIndex[] backrefed;
Regex!Char re; //regex program
const Regex!Char re; //regex program
Stream s;
dchar front;
DataIndex index;
@ -737,16 +735,18 @@ if (is(Char : dchar))
OpBackFunc[] opCacheBackTrue; // ditto
OpBackFunc[] opCacheBackFalse; // ditto
size_t threadSize;
size_t _refCount;
int matched;
bool exhausted;
final:
static struct State
{
Thread!DataIndex* t;
ThreadList!DataIndex worklist;
Group!DataIndex[] matches;
bool popState(E)(E* e)
bool popState(E)(E e)
{
with(e)
{
@ -784,6 +784,10 @@ if (is(Char : dchar))
//true if it's end of input
@property bool atEnd(){ return index == s.lastIndex && s.atEnd; }
override @property ref size_t refCount() @safe { return _refCount; }
override @property ref const(Regex!Char) pattern() @safe { return re; }
bool next()
{
if (!s.nextChar(front, index))
@ -843,19 +847,28 @@ if (is(Char : dchar))
}
}
this()(Regex!Char program, Stream stream, void[] memory)
this()(const Regex!Char program, Stream stream, void[] memory)
{
// We are emplace'd to malloced memory w/o blitting T.init over it,
// make sure we initialize all fields explicitly
_refCount = 1;
subCounters = null;
backrefed = null;
exhausted = false;
matched = 0;
re = program;
s = stream;
initExternalMemory(memory);
genCounter = 0;
}
this(ref ThompsonMatcher matcher, size_t lo, size_t hi, Stream stream)
this(ThompsonMatcher matcher, size_t lo, size_t hi, uint nGroup, Stream stream)
{
_refCount = 1;
subCounters = matcher.subCounters;
s = stream;
re = matcher.re;
re.ir = re.ir[lo .. hi];
auto code = matcher.re.ir[lo .. hi];
re = matcher.re.withCode(code).withNGroup(nGroup);
threadSize = matcher.threadSize;
merge = matcher.merge;
freelist = matcher.freelist;
@ -867,11 +880,13 @@ if (is(Char : dchar))
index = matcher.index;
}
this(ref BackMatcher matcher, size_t lo, size_t hi, Stream stream)
this(BackMatcher matcher, size_t lo, size_t hi, uint nGroup, Stream stream)
{
_refCount = 1;
subCounters = matcher.subCounters;
s = stream;
re = matcher.re;
re.ir = re.ir[lo .. hi];
auto code = matcher.re.ir[lo .. hi];
re = matcher.re.withCode(code).withNGroup(nGroup);
threadSize = matcher.threadSize;
merge = matcher.merge;
freelist = matcher.freelist;
@ -883,31 +898,35 @@ if (is(Char : dchar))
index = matcher.index;
}
auto fwdMatcher()(size_t lo, size_t hi, size_t counter)
auto fwdMatcher()(size_t lo, size_t hi, uint nGroup, size_t counter)
{
auto m = ThompsonMatcher!(Char, Stream)(this, lo, hi, s);
auto m = new ThompsonMatcher!(Char, Stream)(this, lo, hi, nGroup, s);
m.genCounter = counter;
return m;
}
auto bwdMatcher()(size_t lo, size_t hi, size_t counter)
auto bwdMatcher()(size_t lo, size_t hi, uint nGroup, size_t counter)
{
alias BackLooper = typeof(s.loopBack(index));
auto m = ThompsonMatcher!(Char, BackLooper)(this, lo, hi, s.loopBack(index));
auto m = new ThompsonMatcher!(Char, BackLooper)(this, lo, hi, nGroup, s.loopBack(index));
m.genCounter = counter;
m.next();
return m;
}
auto dupTo(void[] memory)
override void dupTo(Matcher!Char engine, void[] memory)
{
typeof(this) tmp = this;//bitblit
tmp.initExternalMemory(memory);
tmp.genCounter = 0;
return tmp;
auto thompson = cast(ThompsonMatcher) engine;
thompson.s = s;
thompson.subCounters = null;
thompson.front = front;
thompson.index = index;
thompson.matched = matched;
thompson.exhausted = exhausted;
thompson.initExternalMemory(memory);
}
int match(Group!DataIndex[] matches)
override int match(Group!DataIndex[] matches)
{
debug(std_regex_matcher)
writeln("------------------------------------------");
@ -1052,9 +1071,9 @@ if (is(Char : dchar))
{
debug(std_regex_matcher) writeln("---- Evaluating thread");
static if (withInput)
while (opCacheTrue.ptr[state.t.pc](&this, state)){}
while (opCacheTrue.ptr[state.t.pc](this, state)){}
else
while (opCacheFalse.ptr[state.t.pc](&this, state)){}
while (opCacheFalse.ptr[state.t.pc](this, state)){}
}
enum uint RestartPc = uint.max;
//match the input, evaluating IR without searching

View file

@ -298,7 +298,6 @@ module std.regex;
import std.range.primitives, std.traits;
import std.regex.internal.ir;
import std.regex.internal.thompson; //TODO: get rid of this dependency
import std.typecons; // : Flag, Yes, No;
/++
@ -339,10 +338,9 @@ public alias Regex(Char) = std.regex.internal.ir.Regex!(Char);
A $(D StaticRegex) is $(D Regex) object that contains D code specially
generated at compile-time to speed up matching.
Implicitly convertible to normal $(D Regex),
however doing so will result in losing this additional capability.
No longer used, kept as alias to Regex for backwards compatibility.
+/
public alias StaticRegex(Char) = std.regex.internal.ir.StaticRegex!(Char);
public alias StaticRegex = Regex;
/++
Compile regular expression pattern for the later execution.
@ -428,16 +426,25 @@ if (isSomeString!(S))
template ctRegexImpl(alias pattern, string flags=[])
{
import std.regex.internal.backtracking, std.regex.internal.parser;
enum r = regex(pattern, flags);
static immutable r = cast(immutable) regex(pattern, flags);
alias Char = BasicElementOf!(typeof(pattern));
enum source = ctGenRegExCode(r);
alias Matcher = BacktrackingMatcher!(true);
@trusted bool func(ref Matcher!Char matcher)
@trusted bool func(BacktrackingMatcher!Char matcher)
{
debug(std_regex_ctr) pragma(msg, source);
cast(void) matcher;
mixin(source);
}
enum nr = StaticRegex!Char(r, &func);
static immutable staticRe =
cast(immutable) r.withFactory(new CtfeFactory!(BacktrackingMatcher, Char, func));
struct Wrapper
{
// allow code that expects mutable Regex to still work
// we stay "logically const"
@trusted @property auto getRe() const { return cast() staticRe; }
alias getRe this;
}
enum wrapper = Wrapper();
}
/++
@ -450,10 +457,10 @@ template ctRegexImpl(alias pattern, string flags=[])
pattern = Regular expression
flags = The _attributes (g, i, m, s and x accepted)
+/
public enum ctRegex(alias pattern, alias flags=[]) = ctRegexImpl!(pattern, flags).nr;
public enum ctRegex(alias pattern, alias flags=[]) = ctRegexImpl!(pattern, flags).wrapper;
enum isRegexFor(RegEx, R) = is(RegEx == Regex!(BasicElementOf!R))
|| is(RegEx == StaticRegex!(BasicElementOf!R));
enum isRegexFor(RegEx, R) = is(Unqual!RegEx == Regex!(BasicElementOf!R)) || is(RegEx : const(Regex!(BasicElementOf!R)))
|| is(Unqual!RegEx == StaticRegex!(BasicElementOf!R));
/++
@ -462,10 +469,10 @@ enum isRegexFor(RegEx, R) = is(RegEx == Regex!(BasicElementOf!R))
First element of range is the whole match.
+/
@trusted public struct Captures(R, DIndex = size_t)
@trusted public struct Captures(R)
if (isSomeString!R)
{//@trusted because of union inside
alias DataIndex = DIndex;
alias DataIndex = size_t;
alias String = R;
private:
import std.conv : text;
@ -480,9 +487,9 @@ private:
}
uint _f, _b;
uint _refcount; // ref count or SMALL MASK + num groups
NamedGroup[] _names;
const(NamedGroup)[] _names;
this()(R input, uint n, NamedGroup[] named)
this(R input, uint n, const(NamedGroup)[] named)
{
_input = input;
_names = named;
@ -491,11 +498,11 @@ private:
_f = 0;
}
this(alias Engine)(ref RegexMatch!(R,Engine) rmatch)
this(ref RegexMatch!R rmatch)
{
_input = rmatch._input;
_names = rmatch._engine.re.dict;
immutable n = rmatch._engine.re.ngroup;
_names = rmatch._engine.pattern.dict;
immutable n = rmatch._engine.pattern.ngroup;
newMatches(n);
_b = n;
_f = 0;
@ -693,58 +700,38 @@ public:
Effectively it's a forward range of Captures!R, produced
by lazily searching for matches in a given input.
$(D alias Engine) specifies an engine type to use during matching,
and is automatically deduced in a call to $(D match)/$(D bmatch).
+/
@trusted public struct RegexMatch(R, alias Engine = ThompsonMatcher)
@trusted public struct RegexMatch(R)
if (isSomeString!R)
{
private:
import core.stdc.stdlib : malloc, free;
alias Char = BasicElementOf!R;
alias EngineType = Engine!Char;
EngineType _engine;
Matcher!Char _engine;
const MatcherFactory!Char _factory;
R _input;
Captures!(R,EngineType.DataIndex) _captures;
void[] _memory;//is ref-counted
Captures!R _captures;
this(RegEx)(R input, RegEx prog)
{
import std.exception : enforce;
_input = input;
immutable size = EngineType.initialMemory(prog)+size_t.sizeof;
_memory = (enforce(malloc(size), "malloc failed")[0 .. size]);
scope(failure) free(_memory.ptr);
*cast(size_t*)_memory.ptr = 1;
_engine = EngineType(prog, Input!Char(input), _memory[size_t.sizeof..$]);
static if (is(RegEx == StaticRegex!(BasicElementOf!R)))
_engine.nativeFn = prog.nativeFn;
_captures = Captures!(R,EngineType.DataIndex)(this);
if (prog.factory is null) _factory = defaultFactory!Char(prog);
else _factory = prog.factory;
_engine = _factory.create(prog, input);
assert(_engine.refCount == 1);
_captures = Captures!R(this);
_captures._nMatch = _engine.match(_captures.matches);
debug(std_regex_allocation) writefln("RefCount (ctor): %x %d", _memory.ptr, counter);
}
@property ref size_t counter(){ return *cast(size_t*)_memory.ptr; }
public:
this(this)
{
if (_memory.ptr)
{
++counter;
debug(std_regex_allocation) writefln("RefCount (postblit): %x %d",
_memory.ptr, *cast(size_t*)_memory.ptr);
}
if (_engine) _factory.incRef(_engine);
}
~this()
{
if (_memory.ptr && --*cast(size_t*)_memory.ptr == 0)
{
debug(std_regex_allocation) writefln("RefCount (dtor): %x %d",
_memory.ptr, *cast(size_t*)_memory.ptr);
free(cast(void*)_memory.ptr);
}
if (_engine) _factory.decRef(_engine);
}
///Shorthands for front.pre, front.post, front.hit.
@ -786,19 +773,18 @@ public:
void popFront()
{
import std.exception : enforce;
if (counter != 1)
{//do cow magic first
counter--;//we abandon this reference
immutable size = EngineType.initialMemory(_engine.re)+size_t.sizeof;
_memory = (enforce(malloc(size), "malloc failed")[0 .. size]);
_engine = _engine.dupTo(_memory[size_t.sizeof .. size]);
counter = 1;//points to new chunk
// CoW - if refCount is not 1, we are aliased by somebody else
if (_engine.refCount != 1)
{
// we create a new engine & abandon this reference
auto old = _engine;
_engine = _factory.dup(old, _input);
_factory.decRef(old);
}
if (!_captures.unique)
{
// has external references - allocate new space
_captures.newMatches(_engine.re.ngroup);
_captures.newMatches(_engine.pattern.ngroup);
}
_captures._nMatch = _engine.match(_captures.matches);
}
@ -814,39 +800,30 @@ public:
/// Same as .front, provided for compatibility with original std.regex.
@property auto captures() inout { return _captures; }
}
private @trusted auto matchOnce(alias Engine, RegEx, R)(R input, RegEx re)
private @trusted auto matchOnce(RegEx, R)(R input, const RegEx prog)
{
import core.stdc.stdlib : malloc, free;
import std.exception : enforce;
alias Char = BasicElementOf!R;
alias EngineType = Engine!Char;
size_t size = EngineType.initialMemory(re);
void[] memory = enforce(malloc(size), "malloc failed")[0 .. size];
scope(exit) free(memory.ptr);
auto captures = Captures!(R, EngineType.DataIndex)(input, re.ngroup, re.dict);
auto engine = EngineType(re, Input!Char(input), memory);
static if (is(RegEx == StaticRegex!(BasicElementOf!R)))
engine.nativeFn = re.nativeFn;
auto factory = prog.factory is null ? defaultFactory!Char(prog) : prog.factory;
auto engine = factory.create(prog, input);
scope(exit) factory.decRef(engine); // destroys the engine
auto captures = Captures!R(input, prog.ngroup, prog.dict);
captures._nMatch = engine.match(captures.matches);
return captures;
}
private auto matchMany(alias Engine, RegEx, R)(R input, RegEx re)
private auto matchMany(RegEx, R)(R input, RegEx re) @safe
{
re.flags |= RegexOption.global;
return RegexMatch!(R, Engine)(input, re);
return RegexMatch!R(input, re.withFlags(re.flags | RegexOption.global));
}
@system unittest
{
//sanity checks for new API
auto re = regex("abc");
assert(!"abc".matchOnce!(ThompsonMatcher)(re).empty);
assert("abc".matchOnce!(ThompsonMatcher)(re)[0] == "abc");
assert(!"abc".matchOnce(re).empty);
assert("abc".matchOnce(re)[0] == "abc");
}
@ -938,25 +915,16 @@ if (isSomeString!R && isRegexFor!(RegEx, R))
+/
public auto match(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R)))
if (isSomeString!R && isRegexFor!(RegEx,R))
{
import std.regex.internal.thompson : ThompsonMatcher;
return RegexMatch!(Unqual!(typeof(input)),ThompsonMatcher)(input, re);
return RegexMatch!(Unqual!(typeof(input)))(input, re);
}
///ditto
public auto match(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
import std.regex.internal.thompson : ThompsonMatcher;
return RegexMatch!(Unqual!(typeof(input)),ThompsonMatcher)(input, regex(re));
}
public auto match(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
{
import std.regex.internal.backtracking : BacktrackingMatcher;
return RegexMatch!(Unqual!(typeof(input)),BacktrackingMatcher!true)(input, re);
return RegexMatch!(Unqual!(typeof(input)))(input, regex(re));
}
/++
@ -978,33 +946,23 @@ if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
if there was a match, otherwise an empty $(LREF Captures) object.
+/
public auto matchFirst(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R)))
if (isSomeString!R && isRegexFor!(RegEx, R))
{
import std.regex.internal.thompson : ThompsonMatcher;
return matchOnce!ThompsonMatcher(input, re);
return matchOnce(input, re);
}
///ditto
public auto matchFirst(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
import std.regex.internal.thompson : ThompsonMatcher;
return matchOnce!ThompsonMatcher(input, regex(re));
return matchOnce(input, regex(re));
}
///ditto
public auto matchFirst(R, String)(R input, String[] re...)
if (isSomeString!R && isSomeString!String)
{
import std.regex.internal.thompson : ThompsonMatcher;
return matchOnce!ThompsonMatcher(input, regex(re));
}
public auto matchFirst(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
{
import std.regex.internal.backtracking : BacktrackingMatcher;
return matchOnce!(BacktrackingMatcher!true)(input, re);
return matchOnce(input, regex(re));
}
/++
@ -1029,33 +987,23 @@ if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
after the first match was found or an empty one if not present.
+/
public auto matchAll(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R)))
if (isSomeString!R && isRegexFor!(RegEx, R))
{
import std.regex.internal.thompson : ThompsonMatcher;
return matchMany!ThompsonMatcher(input, re);
return matchMany(input, re);
}
///ditto
public auto matchAll(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
import std.regex.internal.thompson : ThompsonMatcher;
return matchMany!ThompsonMatcher(input, regex(re));
return matchMany(input, regex(re));
}
///ditto
public auto matchAll(R, String)(R input, String[] re...)
if (isSomeString!R && isSomeString!String)
{
import std.regex.internal.thompson : ThompsonMatcher;
return matchMany!ThompsonMatcher(input, regex(re));
}
public auto matchAll(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
{
import std.regex.internal.backtracking : BacktrackingMatcher;
return matchMany!(BacktrackingMatcher!true)(input, re);
return matchMany(input, regex(re));
}
// another set of tests just to cover the new API
@ -1119,25 +1067,16 @@ if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
+/
public auto bmatch(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R)))
if (isSomeString!R && isRegexFor!(RegEx, R))
{
import std.regex.internal.backtracking : BacktrackingMatcher;
return RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!false)(input, re);
return RegexMatch!(Unqual!(typeof(input)))(input, re);
}
///ditto
public auto bmatch(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
import std.regex.internal.backtracking : BacktrackingMatcher;
return RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!false)(input, regex(re));
}
public auto bmatch(R, RegEx)(R input, RegEx re)
if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R)))
{
import std.regex.internal.backtracking : BacktrackingMatcher;
return RegexMatch!(Unqual!(typeof(input)),BacktrackingMatcher!true)(input, re);
return RegexMatch!(Unqual!(typeof(input)))(input, regex(re));
}
// produces replacement string from format using captures for substitution
@ -1530,7 +1469,7 @@ private:
@trusted this(Range input, RegEx separator)
{//@@@BUG@@@ generated opAssign of RegexMatch is not @trusted
_input = input;
separator.flags |= RegexOption.global;
const re = separator.withFlags(separator.flags | RegexOption.global);
if (_input.empty)
{
//there is nothing to match at all, make _offset > 0
@ -1538,7 +1477,7 @@ private:
}
else
{
_match = Rx(_input, separator);
_match = Rx(_input, re);
static if (keepSeparators)
if (_match.pre.empty)

104
std/uni.d
View file

@ -2184,6 +2184,12 @@ pure:
return Intervals!(typeof(data))(data);
}
// Package-level accessor: wraps a slice of `data` in an `Intervals` view and
// copies that view into an array with std.array.array. Callable on const sets.
package @property const(CodepointInterval)[] intervals() const
{
    import std.array : array;
    // Name the temporary view first, then materialize it.
    auto view = Intervals!(typeof(data[]))(data[]);
    return view.array;
}
/**
Tests the presence of code point $(D val) in this set.
*/
@ -2619,52 +2625,9 @@ public:
assert((set & set.inverted).empty);
}
/**
Generates string with D source code of unary function with name of
$(D funcName) taking a single $(D dchar) argument. If $(D funcName) is empty
the code is adjusted to be a lambda function.
The function generated tests if the $(CODEPOINT) passed
belongs to this set or not. The result is to be used with string mixin.
The intended usage area is aggressive optimization via meta programming
in parser generators and the like.
Note: Use with care for relatively small or regular sets. It
could end up being slower than just using multi-staged tables.
Example:
---
import std.stdio;
// construct set directly from [a, b$RPAREN intervals
auto set = CodepointSet(10, 12, 45, 65, 100, 200);
writeln(set);
writeln(set.toSourceCode("func"));
---
The above outputs something along the lines of:
---
bool func(dchar ch) @safe pure nothrow @nogc
{
if (ch < 45)
{
if (ch == 10 || ch == 11) return true;
return false;
}
else if (ch < 65) return true;
else
{
if (ch < 100) return false;
if (ch < 200) return true;
return false;
}
}
---
*/
string toSourceCode(string funcName="")
package static string toSourceCode(const(CodepointInterval)[] range, string funcName)
{
import std.algorithm.searching : countUntil;
import std.array : array;
import std.format : format;
enum maxBinary = 3;
static string linearScope(R)(R ivals, string indent)
@ -2746,7 +2709,6 @@ public:
string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n",
funcName.empty ? "function" : funcName);
auto range = byInterval.array();
// special case first bisection to be on ASCII vs beyond
auto tillAscii = countUntil!"a[0] > 0x80"(range);
if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0)
@ -2756,6 +2718,55 @@ public:
return code;
}
/**
Generates string with D source code of unary function with name of
$(D funcName) taking a single $(D dchar) argument. If $(D funcName) is empty
the code is adjusted to be a lambda function.
The function generated tests if the $(CODEPOINT) passed
belongs to this set or not. The result is to be used with string mixin.
The intended usage area is aggressive optimization via meta programming
in parser generators and the like.
Note: Use with care for relatively small or regular sets. It
could end up being slower than just using multi-staged tables.
Example:
---
import std.stdio;
// construct set directly from [a, b$RPAREN intervals
auto set = CodepointSet(10, 12, 45, 65, 100, 200);
writeln(set);
writeln(set.toSourceCode("func"));
---
The above outputs something along the lines of:
---
bool func(dchar ch) @safe pure nothrow @nogc
{
if (ch < 45)
{
if (ch == 10 || ch == 11) return true;
return false;
}
else if (ch < 65) return true;
else
{
if (ch < 100) return false;
if (ch < 200) return true;
return false;
}
}
---
*/
string toSourceCode(string funcName="")
{
    import std.array : array;
    // Copy the interval range into an array and hand it to the static
    // overload that takes the intervals explicitly and builds the code string.
    return toSourceCode(byInterval.array(), funcName);
}
/**
True if this set doesn't contain any $(CODEPOINTS).
*/
@ -2802,6 +2813,7 @@ private:
//may break sorted property - but we need std.sort to access it
//hence package protection attribute
static if (hasAssignableElements!Range)
package @property void front(CodepointInterval val)
{
slice[start] = val.a;
@ -2816,6 +2828,7 @@ private:
}
//ditto about package
static if (hasAssignableElements!Range)
package @property void back(CodepointInterval val)
{
slice[end-2] = val.a;
@ -2840,6 +2853,7 @@ private:
}
//ditto about package
static if (hasAssignableElements!Range)
package void opIndexAssign(CodepointInterval val, size_t idx)
{
slice[start+idx*2] = val.a;