From 6e684343d76db2fe51c843ac7bc046a23a142933 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Fri, 25 Aug 2017 09:37:42 +0300 Subject: [PATCH 01/15] WIP - refactoring towards sane std.regex code --- std/regex/internal/backtracking.d | 34 ++++---- std/regex/internal/ir.d | 87 ++++++++++++++++++++ std/regex/internal/thompson.d | 2 +- std/regex/package.d | 130 ++++++++++-------------------- 4 files changed, 148 insertions(+), 105 deletions(-) diff --git a/std/regex/internal/backtracking.d b/std/regex/internal/backtracking.d index ffc977992..647295c02 100644 --- a/std/regex/internal/backtracking.d +++ b/std/regex/internal/backtracking.d @@ -15,7 +15,7 @@ import std.regex.internal.ir; +/ template BacktrackingMatcher(bool CTregex) { - @trusted struct BacktrackingMatcher(Char, Stream = Input!Char) + @trusted class BacktrackingMatcher(Char, Stream = Input!Char) : Matcher!Char if (is(Char : dchar)) { alias DataIndex = Stream.DataIndex; @@ -29,20 +29,18 @@ template BacktrackingMatcher(bool CTregex) enum initialStack = 1 << 11; // items in a block of segmented stack alias String = const(Char)[]; alias RegEx = Regex!Char; - alias MatchFn = bool function (ref BacktrackingMatcher!(Char, Stream)); - RegEx re; //regex program - static if (CTregex) - MatchFn nativeFn; //native code for that program - //Stream state + alias MatchFn = bool function (BacktrackingMatcher!(Char, Stream)); + RegEx re; // regex program + MatchFn nativeFn; // native code for that program + // Stream state Stream s; DataIndex index; dchar front; bool exhausted; - //backtracking machine state + // Backtracking machine state uint pc, counter; - DataIndex lastState = 0; //top of state stack - static if (!CTregex) - uint infiniteNesting; + DataIndex lastState = 0; // Top of state stack + uint infiniteNesting; size_t[] memory; Trace[] merge; static struct Trace @@ -69,6 +67,10 @@ template BacktrackingMatcher(bool CTregex) } //local slice of matches, global for backref Group!DataIndex[] matches, backrefed; 
+ size_t _refCount; + + override @property ref size_t refCount() { return _refCount; } + override @property ref RegEx pattern(){ return re; } static if (__traits(hasMember,Stream, "search")) { @@ -182,20 +184,20 @@ template BacktrackingMatcher(bool CTregex) next(); } - auto fwdMatcher(ref BacktrackingMatcher matcher, void[] memBlock) + auto fwdMatcher(BacktrackingMatcher matcher, void[] memBlock) { alias BackMatcherTempl = .BacktrackingMatcher!(CTregex); alias BackMatcher = BackMatcherTempl!(Char, Stream); - auto fwdMatcher = BackMatcher(matcher.re, s, memBlock, front, index); + auto fwdMatcher = new BackMatcher(matcher.re, s, memBlock, front, index); return fwdMatcher; } - auto bwdMatcher(ref BacktrackingMatcher matcher, void[] memBlock) + auto bwdMatcher(BacktrackingMatcher matcher, void[] memBlock) { alias BackMatcherTempl = .BacktrackingMatcher!(CTregex); alias BackMatcher = BackMatcherTempl!(Char, typeof(s.loopBack(index))); auto fwdMatcher = - BackMatcher(matcher.re, s.loopBack(index), memBlock); + new BackMatcher(matcher.re, s.loopBack(index), memBlock); return fwdMatcher; } @@ -716,7 +718,7 @@ template BacktrackingMatcher(bool CTregex) debug(std_regex_matcher) writeln("pop array SP= ", lastState); } - static if (!CTregex) + static if (true) { //helper function, saves engine state void pushState(uint pc, uint counter) @@ -946,7 +948,7 @@ struct CtContext alias Lookaround = $$; else alias Lookaround = $$; - static bool matcher_$$(ref Lookaround matcher) @trusted + static bool matcher_$$(Lookaround matcher) @trusted { //(neg)lookaround piece start $$ diff --git a/std/regex/internal/ir.d b/std/regex/internal/ir.d index 2bd98843c..f41ac8414 100644 --- a/std/regex/internal/ir.d +++ b/std/regex/internal/ir.d @@ -423,6 +423,92 @@ struct Group(DataIndex) writeln("\t", disassemble(slice, pc, dict)); } +// Encapsulates memory management, explicit ref counting +// and the exact type of engine created +// there is a single instance per engine combination type x Char +// 
In future may also maintain a (TLS?) cache of memory +interface MatcherFactory(Char) +{ + Matcher!Char create(ref Regex!Char, in Char[] input); + Matcher!Char dup(Matcher!Char m, in Char[] input); + void incRef(Matcher!Char m); + void decRef(Matcher!Char m); +} + +// Only memory management, no compile-time vs run-time specialities +abstract class GenericFactory(EngineType, Char) : MatcherFactory!Char +{ + Matcher!Char construct(ref Regex!Char re, in Char[] input, void[] memory); + + Matcher!Char create(ref Regex!Char re, in Char[] input) + { + immutable classSize = __traits(classInstanceSize, EngineType!Char); + immutable size = EngineType!Char.initialMemory(prog) + classSize; + auto memory = enforce(malloc(size), "malloc failed")[0 .. size]; + scope(failure) free(memory.ptr); + auto engine = construct(prog, input, _memory); + assert(engine.refCount == 1); + assert(cast(void*)engine == memory.ptr); + return engine; + } + + Matcher!Char dup(Matcher!Char engine, in Char[] input) + { + immutable size = EngineType.initialMemory(engine.re)+size_t.sizeof; + auto memory = enforce(malloc(size), "malloc failed")[0 .. size]; + scope(failure) free(memory.ptr); + engine.dupTo(_memory[size_t.sizeof .. size]); + engine = construct(engine.pattern, input, memory); + assert(engine.refCount == 1); + return engine; + } + + void incRef() + { + ++m.refCount; + } + + void decRef(Matcher!Char m) + { + assert(m.refCount != 0); + if (--m.refCount == 0) + { + free(cast(void*)m); + } + } +} + +// A factory for run-time engines +class RuntimeFactory(EngineType, Char) : GenericFactory!(EngineType, Char) +{ + Matcher!Char construct(ref Regex!Char re, in Char[] input, void[] memory) + { + return emplace!(EngineType!Char)(_memory[0 .. classSize], + prog, Input!Char(input), _memory[classSize .. 
$]); + } +} + +// A factory for compile-time engine +class CtfeFactory(EngineType, Char, alias func) : GenericFactory!(EngineType, Char) +{ + Matcher!Char construct(ref Regex!Char re, in Char[] input, void[] memory) + { + return emplace!(EngineType!Char)(_memory[0 .. classSize], + StaticRegex!Char(prog, func), Input!Char(input), _memory[classSize .. $]); + } +} + +interface Matcher(Char) +{ + // Get a (next) match + int match(Group!size_t[] matches); + // This only maintains internal ref-count, + // deallocation happens inside MatcherFactory + @property ref size_t refCount(); + // The pattern loaded + @property ref Regex!Char pattern(); +} + /++ $(D Regex) object holds regular expression pattern in compiled form. Instances of this object are constructed via calls to $(D regex). @@ -495,6 +581,7 @@ package(std.regex): public const(BitTable)[] filters; // bloom filters for conditional loops uint[] backrefed; // bit array of backreferenced submatches Kickstart!Char kickstart; + MatcherFactory!Char factory; // produces optimal matcher for this pattern //bit access helper uint isBackref(uint n) diff --git a/std/regex/internal/thompson.d b/std/regex/internal/thompson.d index 4d7deaa1f..ceb561ea3 100644 --- a/std/regex/internal/thompson.d +++ b/std/regex/internal/thompson.d @@ -714,7 +714,7 @@ template ThompsonOps(E,S, bool withInput:false) Thomspon matcher does all matching in lockstep, never looking at the same char twice +/ -@trusted struct ThompsonMatcher(Char, StreamType = Input!Char) +@trusted class ThompsonMatcher(Char, StreamType = Input!Char): Matcher!Char if (is(Char : dchar)) { alias DataIndex = Stream.DataIndex; diff --git a/std/regex/package.d b/std/regex/package.d index bfc7d7ff3..23841357a 100644 --- a/std/regex/package.d +++ b/std/regex/package.d @@ -462,10 +462,10 @@ enum isRegexFor(RegEx, R) = is(RegEx == Regex!(BasicElementOf!R)) First element of range is the whole match. 
+/ -@trusted public struct Captures(R, DIndex = size_t) +@trusted public struct Captures(R) if (isSomeString!R) {//@trusted because of union inside - alias DataIndex = DIndex; + alias DataIndex = size_t; alias String = R; private: import std.conv : text; @@ -482,7 +482,7 @@ private: uint _refcount; // ref count or SMALL MASK + num groups NamedGroup[] _names; - this()(R input, uint n, NamedGroup[] named) + this(R input, uint n, NamedGroup[] named) { _input = input; _names = named; @@ -491,11 +491,11 @@ private: _f = 0; } - this(alias Engine)(ref RegexMatch!(R,Engine) rmatch) + this(ref RegexMatch!R rmatch) { _input = rmatch._input; - _names = rmatch._engine.re.dict; - immutable n = rmatch._engine.re.ngroup; + _names = rmatch._engine.pattern.dict; + immutable n = rmatch._engine.pattern.ngroup; newMatches(n); _b = n; _f = 0; @@ -693,58 +693,37 @@ public: Effectively it's a forward range of Captures!R, produced by lazily searching for matches in a given input. - - $(D alias Engine) specifies an engine type to use during matching, - and is automatically deduced in a call to $(D match)/$(D bmatch). +/ -@trusted public struct RegexMatch(R, alias Engine = ThompsonMatcher) +@trusted public struct RegexMatch(R) if (isSomeString!R) { private: import core.stdc.stdlib : malloc, free; alias Char = BasicElementOf!R; - alias EngineType = Engine!Char; - EngineType _engine; + Matcher!Char _engine; + MatcherFactory!Char _factory; R _input; - Captures!(R,EngineType.DataIndex) _captures; - void[] _memory;//is ref-counted + Captures!R _captures; this(RegEx)(R input, RegEx prog) { import std.exception : enforce; _input = input; - immutable size = EngineType.initialMemory(prog)+size_t.sizeof; - _memory = (enforce(malloc(size), "malloc failed")[0 .. 
size]); - scope(failure) free(_memory.ptr); - *cast(size_t*)_memory.ptr = 1; - _engine = EngineType(prog, Input!Char(input), _memory[size_t.sizeof..$]); - static if (is(RegEx == StaticRegex!(BasicElementOf!R))) - _engine.nativeFn = prog.nativeFn; - _captures = Captures!(R,EngineType.DataIndex)(this); + _factory = prog.factory; + _engine = prog.factory.create(prog, input); + _captures = Captures!R(this); _captures._nMatch = _engine.match(_captures.matches); - debug(std_regex_allocation) writefln("RefCount (ctor): %x %d", _memory.ptr, counter); } - @property ref size_t counter(){ return *cast(size_t*)_memory.ptr; } public: this(this) { - if (_memory.ptr) - { - ++counter; - debug(std_regex_allocation) writefln("RefCount (postblit): %x %d", - _memory.ptr, *cast(size_t*)_memory.ptr); - } + _factory.incRef(_engine); } ~this() { - if (_memory.ptr && --*cast(size_t*)_memory.ptr == 0) - { - debug(std_regex_allocation) writefln("RefCount (dtor): %x %d", - _memory.ptr, *cast(size_t*)_memory.ptr); - free(cast(void*)_memory.ptr); - } + _factory.decRef(_engine); } ///Shorthands for front.pre, front.post, front.hit. @@ -786,19 +765,17 @@ public: void popFront() { import std.exception : enforce; - if (counter != 1) - {//do cow magic first - counter--;//we abandon this reference - immutable size = EngineType.initialMemory(_engine.re)+size_t.sizeof; - _memory = (enforce(malloc(size), "malloc failed")[0 .. size]); - _engine = _engine.dupTo(_memory[size_t.sizeof .. 
size]); - counter = 1;//points to new chunk + // CoW - if refCount is not 1, we are aliased by somebody else + if (_engine.refCount != 1) + { + // we abandon this reference & create a new engine + _factory.decRef(_engine); + _engine = _factory.dup(_engine, _input); } - if (!_captures.unique) { // has external references - allocate new space - _captures.newMatches(_engine.re.ngroup); + _captures.newMatches(_engine.pattern.ngroup); } _captures._nMatch = _engine.match(_captures.matches); } @@ -814,31 +791,22 @@ public: /// Same as .front, provided for compatibility with original std.regex. @property auto captures() inout { return _captures; } - } -private @trusted auto matchOnce(alias Engine, RegEx, R)(R input, RegEx re) +private @trusted auto matchOnce(RegEx, R)(R input, RegEx re) { - import core.stdc.stdlib : malloc, free; - import std.exception : enforce; alias Char = BasicElementOf!R; - alias EngineType = Engine!Char; - - size_t size = EngineType.initialMemory(re); - void[] memory = enforce(malloc(size), "malloc failed")[0 .. 
size]; - scope(exit) free(memory.ptr); - auto captures = Captures!(R, EngineType.DataIndex)(input, re.ngroup, re.dict); - auto engine = EngineType(re, Input!Char(input), memory); - static if (is(RegEx == StaticRegex!(BasicElementOf!R))) - engine.nativeFn = re.nativeFn; + auto engine = re.factory.create(re, input); + scope(exit) re.factory.decRef(engine); // destroys the engine + auto captures = Captures!R(input, re.ngroup, re.dict); captures._nMatch = engine.match(captures.matches); return captures; } -private auto matchMany(alias Engine, RegEx, R)(R input, RegEx re) +private auto matchMany(RegEx, R)(R input, RegEx re) { re.flags |= RegexOption.global; - return RegexMatch!(R, Engine)(input, re); + return RegexMatch!R(input, re); } @system unittest @@ -940,23 +908,20 @@ if (isSomeString!R && isRegexFor!(RegEx, R)) public auto match(R, RegEx)(R input, RegEx re) if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R))) { - import std.regex.internal.thompson : ThompsonMatcher; - return RegexMatch!(Unqual!(typeof(input)),ThompsonMatcher)(input, re); + return RegexMatch!(Unqual!(typeof(input)))(input, re); } ///ditto public auto match(R, String)(R input, String re) if (isSomeString!R && isSomeString!String) { - import std.regex.internal.thompson : ThompsonMatcher; - return RegexMatch!(Unqual!(typeof(input)),ThompsonMatcher)(input, regex(re)); + return RegexMatch!(Unqual!(typeof(input)))(input, regex(re)); } public auto match(R, RegEx)(R input, RegEx re) if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) { - import std.regex.internal.backtracking : BacktrackingMatcher; - return RegexMatch!(Unqual!(typeof(input)),BacktrackingMatcher!true)(input, re); + return RegexMatch!(Unqual!(typeof(input)))(input, re); } /++ @@ -980,31 +945,27 @@ if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) public auto matchFirst(R, RegEx)(R input, RegEx re) if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R))) { - import std.regex.internal.thompson : 
ThompsonMatcher; - return matchOnce!ThompsonMatcher(input, re); + return matchOnce(input, re); } ///ditto public auto matchFirst(R, String)(R input, String re) if (isSomeString!R && isSomeString!String) { - import std.regex.internal.thompson : ThompsonMatcher; - return matchOnce!ThompsonMatcher(input, regex(re)); + return matchOnce(input, regex(re)); } ///ditto public auto matchFirst(R, String)(R input, String[] re...) if (isSomeString!R && isSomeString!String) { - import std.regex.internal.thompson : ThompsonMatcher; - return matchOnce!ThompsonMatcher(input, regex(re)); + return matchOnce(input, regex(re)); } public auto matchFirst(R, RegEx)(R input, RegEx re) if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) { - import std.regex.internal.backtracking : BacktrackingMatcher; - return matchOnce!(BacktrackingMatcher!true)(input, re); + return matchOnce(input, re); } /++ @@ -1031,31 +992,27 @@ if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) public auto matchAll(R, RegEx)(R input, RegEx re) if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R))) { - import std.regex.internal.thompson : ThompsonMatcher; - return matchMany!ThompsonMatcher(input, re); + return matchMany(input, re); } ///ditto public auto matchAll(R, String)(R input, String re) if (isSomeString!R && isSomeString!String) { - import std.regex.internal.thompson : ThompsonMatcher; - return matchMany!ThompsonMatcher(input, regex(re)); + return matchMany(input, regex(re)); } ///ditto public auto matchAll(R, String)(R input, String[] re...) 
if (isSomeString!R && isSomeString!String) { - import std.regex.internal.thompson : ThompsonMatcher; - return matchMany!ThompsonMatcher(input, regex(re)); + return matchMany(input, regex(re)); } public auto matchAll(R, RegEx)(R input, RegEx re) if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) { - import std.regex.internal.backtracking : BacktrackingMatcher; - return matchMany!(BacktrackingMatcher!true)(input, re); + return matchMany(input, re); } // another set of tests just to cover the new API @@ -1121,23 +1078,20 @@ if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) public auto bmatch(R, RegEx)(R input, RegEx re) if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R))) { - import std.regex.internal.backtracking : BacktrackingMatcher; - return RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!false)(input, re); + return RegexMatch!(Unqual!(typeof(input)))(input, re); } ///ditto public auto bmatch(R, String)(R input, String re) if (isSomeString!R && isSomeString!String) { - import std.regex.internal.backtracking : BacktrackingMatcher; - return RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!false)(input, regex(re)); + return RegexMatch!(Unqual!(typeof(input)))(input, regex(re)); } public auto bmatch(R, RegEx)(R input, RegEx re) if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) { - import std.regex.internal.backtracking : BacktrackingMatcher; - return RegexMatch!(Unqual!(typeof(input)),BacktrackingMatcher!true)(input, re); + return RegexMatch!(Unqual!(typeof(input)))(input, re); } // produces replacement string from format using captures for substitution From 33d7ecc84c79cc16021e7ee0e451f5cedcf162a4 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Wed, 30 Aug 2017 15:23:54 +0300 Subject: [PATCH 02/15] Finalized matcher factory abstraction, with compile-time regex support --- std/regex/internal/backtracking.d | 66 ++++++++++-------- std/regex/internal/ir.d | 108 ++++++++++++++++-------------- 
std/regex/internal/kickstart.d | 2 +- std/regex/package.d | 77 ++++++++------------- 4 files changed, 125 insertions(+), 128 deletions(-) diff --git a/std/regex/internal/backtracking.d b/std/regex/internal/backtracking.d index 647295c02..84e025da2 100644 --- a/std/regex/internal/backtracking.d +++ b/std/regex/internal/backtracking.d @@ -29,8 +29,8 @@ template BacktrackingMatcher(bool CTregex) enum initialStack = 1 << 11; // items in a block of segmented stack alias String = const(Char)[]; alias RegEx = Regex!Char; - alias MatchFn = bool function (BacktrackingMatcher!(Char, Stream)); - RegEx re; // regex program + alias MatchFn = bool function (ref BacktrackingMatcher); + const RegEx re; // regex program MatchFn nativeFn; // native code for that program // Stream state Stream s; @@ -70,7 +70,7 @@ template BacktrackingMatcher(bool CTregex) size_t _refCount; override @property ref size_t refCount() { return _refCount; } - override @property ref RegEx pattern(){ return re; } + override @property ref const(RegEx) pattern(){ return re; } static if (__traits(hasMember,Stream, "search")) { @@ -155,49 +155,57 @@ template BacktrackingMatcher(bool CTregex) memory = memory[2..$]; } - void initialize(ref RegEx program, Stream stream, void[] memBlock) + void initialize(ref const RegEx program, Stream stream, void[] memBlock) { - re = program; s = stream; exhausted = false; initExternalMemory(memBlock); backrefed = null; } - auto dupTo(void[] memory) + void dupTo(void[] memory) { - typeof(this) tmp = this; + auto tmp = this; tmp.initExternalMemory(memory); - return tmp; } - this(ref RegEx program, Stream stream, void[] memBlock, dchar ch, DataIndex idx) + this(ref const RegEx program, Stream stream, void[] memBlock, dchar ch, DataIndex idx) { + re = program; initialize(program, stream, memBlock); front = ch; index = idx; } - this(ref RegEx program, Stream stream, void[] memBlock) + this(ref const RegEx program, MatchFn func, Stream stream, void[] memBlock) { + re = program; + 
initialize(program, stream, memBlock); + nativeFn = func; + next(); + } + + this(ref const RegEx program, Stream stream, void[] memBlock) + { + re = program; initialize(program, stream, memBlock); next(); } - auto fwdMatcher(BacktrackingMatcher matcher, void[] memBlock) + auto fwdMatcher(ref const RegEx re, void[] memBlock) { alias BackMatcherTempl = .BacktrackingMatcher!(CTregex); alias BackMatcher = BackMatcherTempl!(Char, Stream); - auto fwdMatcher = new BackMatcher(matcher.re, s, memBlock, front, index); + auto fwdMatcher = new BackMatcher(re, s, memBlock, front, index); return fwdMatcher; } - auto bwdMatcher(BacktrackingMatcher matcher, void[] memBlock) + auto bwdMatcher(ref const RegEx re, void[] memBlock) { alias BackMatcherTempl = .BacktrackingMatcher!(CTregex); alias BackMatcher = BackMatcherTempl!(Char, typeof(s.loopBack(index))); auto fwdMatcher = - new BackMatcher(matcher.re, s.loopBack(index), memBlock); + new BackMatcher(re, s.loopBack(index), memBlock); return fwdMatcher; } @@ -582,19 +590,19 @@ template BacktrackingMatcher(bool CTregex) immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw; auto mem = malloc(initialMemory(re))[0 .. initialMemory(re)]; scope(exit) free(mem.ptr); + auto slicedRe = re.withCode(re.ir[ + pc+IRL!(IR.LookaheadStart) .. pc+IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd) + ]); static if (Stream.isLoopback) { - auto matcher = bwdMatcher(this, mem); + auto matcher = bwdMatcher(slicedRe, mem); } else { - auto matcher = fwdMatcher(this, mem); + auto matcher = fwdMatcher(slicedRe, mem); } matcher.matches = matches[ms .. me]; matcher.backrefed = backrefed.empty ? matches : backrefed; - matcher.re.ir = re.ir[ - pc+IRL!(IR.LookaheadStart) .. 
pc+IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd) - ]; immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookaheadStart); s.reset(save); next(); @@ -611,20 +619,20 @@ template BacktrackingMatcher(bool CTregex) immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw; auto mem = malloc(initialMemory(re))[0 .. initialMemory(re)]; scope(exit) free(mem.ptr); + auto slicedRe = re.withCode(re.ir[ + pc + IRL!(IR.LookbehindStart) .. pc + IRL!(IR.LookbehindStart) + len + IRL!(IR.LookbehindEnd) + ]); static if (Stream.isLoopback) { alias Matcher = BacktrackingMatcher!(Char, Stream); - auto matcher = Matcher(re, s, mem, front, index); + auto matcher = new Matcher(slicedRe, s, mem, front, index); } else { alias Matcher = BacktrackingMatcher!(Char, typeof(s.loopBack(index))); - auto matcher = Matcher(re, s.loopBack(index), mem); + auto matcher = new Matcher(slicedRe, s.loopBack(index), mem); } matcher.matches = matches[ms .. me]; - matcher.re.ir = re.ir[ - pc + IRL!(IR.LookbehindStart) .. pc + IRL!(IR.LookbehindStart) + len + IRL!(IR.LookbehindEnd) - ]; matcher.backrefed = backrefed.empty ? 
matches : backrefed; immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookbehindStart); if (!match) @@ -934,10 +942,10 @@ struct CtContext immutable len = ir[0].data; immutable behind = ir[0].code == IR.LookbehindStart || ir[0].code == IR.NeglookbehindStart; immutable negative = ir[0].code == IR.NeglookaheadStart || ir[0].code == IR.NeglookbehindStart; - string fwdType = "typeof(fwdMatcher(matcher, []))"; - string bwdType = "typeof(bwdMatcher(matcher, []))"; - string fwdCreate = "fwdMatcher(matcher, mem)"; - string bwdCreate = "bwdMatcher(matcher, mem)"; + string fwdType = "typeof(fwdMatcher(re, []))"; + string bwdType = "typeof(bwdMatcher(re, []))"; + string fwdCreate = "fwdMatcher(re, mem)"; + string bwdCreate = "bwdMatcher(re, mem)"; immutable start = IRL!(IR.LookbehindStart); immutable end = IRL!(IR.LookbehindStart)+len+IRL!(IR.LookaheadEnd); CtContext context = lookaround(ir[1].raw, ir[2].raw); //split off new context @@ -948,7 +956,7 @@ struct CtContext alias Lookaround = $$; else alias Lookaround = $$; - static bool matcher_$$(Lookaround matcher) @trusted + static bool matcher_$$(ref Lookaround matcher) @trusted { //(neg)lookaround piece start $$ diff --git a/std/regex/internal/ir.d b/std/regex/internal/ir.d index f41ac8414..7533b0717 100644 --- a/std/regex/internal/ir.d +++ b/std/regex/internal/ir.d @@ -429,46 +429,49 @@ struct Group(DataIndex) // In future may also maintain a (TLS?) 
cache of memory interface MatcherFactory(Char) { - Matcher!Char create(ref Regex!Char, in Char[] input); - Matcher!Char dup(Matcher!Char m, in Char[] input); - void incRef(Matcher!Char m); - void decRef(Matcher!Char m); +@safe: + Matcher!Char create(const Regex!Char, in Char[] input) const; + Matcher!Char dup(Matcher!Char m, in Char[] input) const; + void incRef(Matcher!Char m) const; + void decRef(Matcher!Char m) const; } // Only memory management, no compile-time vs run-time specialities -abstract class GenericFactory(EngineType, Char) : MatcherFactory!Char +abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char { - Matcher!Char construct(ref Regex!Char re, in Char[] input, void[] memory); + import core.stdc.stdlib; + enum classSize = __traits(classInstanceSize, EngineType!Char); - Matcher!Char create(ref Regex!Char re, in Char[] input) + Matcher!Char construct(const Regex!Char re, in Char[] input, void[] memory) const; + + override Matcher!Char create(const Regex!Char re, in Char[] input) const @trusted { - immutable classSize = __traits(classInstanceSize, EngineType!Char); - immutable size = EngineType!Char.initialMemory(prog) + classSize; + immutable size = EngineType!Char.initialMemory(re) + classSize; auto memory = enforce(malloc(size), "malloc failed")[0 .. size]; scope(failure) free(memory.ptr); - auto engine = construct(prog, input, _memory); + auto engine = construct(re, input, memory); assert(engine.refCount == 1); assert(cast(void*)engine == memory.ptr); return engine; } - Matcher!Char dup(Matcher!Char engine, in Char[] input) + override Matcher!Char dup(Matcher!Char engine, in Char[] input) const @trusted { - immutable size = EngineType.initialMemory(engine.re)+size_t.sizeof; + immutable size = EngineType!Char.initialMemory(engine.pattern) + classSize; auto memory = enforce(malloc(size), "malloc failed")[0 .. size]; scope(failure) free(memory.ptr); - engine.dupTo(_memory[size_t.sizeof .. size]); + engine.dupTo(memory[classSize .. 
size]); engine = construct(engine.pattern, input, memory); assert(engine.refCount == 1); return engine; } - void incRef() + override void incRef(Matcher!Char m) const { ++m.refCount; } - void decRef(Matcher!Char m) + override void decRef(Matcher!Char m) const @trusted { assert(m.refCount != 0); if (--m.refCount == 0) @@ -479,22 +482,24 @@ abstract class GenericFactory(EngineType, Char) : MatcherFactory!Char } // A factory for run-time engines -class RuntimeFactory(EngineType, Char) : GenericFactory!(EngineType, Char) +class RuntimeFactory(alias EngineType, Char) : GenericFactory!(EngineType, Char) { - Matcher!Char construct(ref Regex!Char re, in Char[] input, void[] memory) + override Matcher!Char construct(const Regex!Char re, in Char[] input, void[] memory) const { - return emplace!(EngineType!Char)(_memory[0 .. classSize], - prog, Input!Char(input), _memory[classSize .. $]); + import std.conv : emplace; + return emplace!(EngineType!Char)(memory[0 .. classSize], + prog, Input!Char(input), memory[classSize .. $]); } } // A factory for compile-time engine -class CtfeFactory(EngineType, Char, alias func) : GenericFactory!(EngineType, Char) +class CtfeFactory(alias EngineType, Char, alias func) : GenericFactory!(EngineType, Char) { - Matcher!Char construct(ref Regex!Char re, in Char[] input, void[] memory) + override Matcher!Char construct(const Regex!Char re, in Char[] input, void[] memory) const { - return emplace!(EngineType!Char)(_memory[0 .. classSize], - StaticRegex!Char(prog, func), Input!Char(input), _memory[classSize .. $]); + import std.conv : emplace; + return emplace!(EngineType!Char)(memory[0 .. classSize], + re, &func, Input!Char(input), memory[classSize .. 
$]); } } @@ -504,9 +509,11 @@ interface Matcher(Char) int match(Group!size_t[] matches); // This only maintains internal ref-count, // deallocation happens inside MatcherFactory - @property ref size_t refCount(); + @property ref size_t refCount() @safe; + // Copy internal state to memory location + void dupTo(void[] memory); // The pattern loaded - @property ref Regex!Char pattern(); + @property ref const(Regex!Char) pattern() @safe; } /++ @@ -529,11 +536,11 @@ struct Regex(Char) static struct NamedGroupRange { private: - NamedGroup[] groups; + const(NamedGroup)[] groups; size_t start; size_t end; public: - this(NamedGroup[] g, size_t s, size_t e) + this(const(NamedGroup)[] g, size_t s, size_t e) { assert(s <= e); assert(e <= g.length); @@ -571,7 +578,7 @@ struct Regex(Char) package(std.regex): import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency - NamedGroup[] dict; // maps name -> user group number + const(NamedGroup)[] dict; // maps name -> user group number uint ngroup; // number of internal groups uint maxCounterDepth; // max depth of nested {n,m} repetitions uint hotspotTableSize; // number of entries in merge table @@ -583,6 +590,27 @@ package(std.regex): Kickstart!Char kickstart; MatcherFactory!Char factory; // produces optimal matcher for this pattern + const(Regex) withFactory(MatcherFactory!Char factory) pure const @trusted + { + auto r = cast()this; + r.factory = factory; + return r; + } + + const(Regex) withFlags(uint newFlags) pure const @trusted + { + auto r = cast()this; + r.flags = newFlags; + return r; + } + + const(Regex) withCode(const(Bytecode)[] code) pure const @trusted + { + auto r = cast()this; + r.ir = code.dup; // TODO: sidestep const instead? + return r; + } + //bit access helper uint isBackref(uint n) { @@ -624,26 +652,6 @@ package(std.regex): } -//@@@BUG@@@ (unreduced) - public makes it inaccessible in std.regex.package (!) 
-/*public*/ struct StaticRegex(Char) -{ -package(std.regex): - import std.regex.internal.backtracking : BacktrackingMatcher; - alias Matcher = BacktrackingMatcher!(true); - alias MatchFn = bool function(ref Matcher!Char) @trusted; - MatchFn nativeFn; -public: - Regex!Char _regex; - alias _regex this; - this(Regex!Char re, MatchFn fn) - { - _regex = re; - nativeFn = fn; - - } - -} - // The stuff below this point is temporarrily part of IR module // but may need better place in the future (all internals) package(std.regex): @@ -680,7 +688,7 @@ if (is(Char :dchar)) @property bool atEnd(){ return _index == _origin.length; } - bool search(Kickstart)(ref Kickstart kick, ref dchar res, ref size_t pos) + bool search(Kickstart)(ref const Kickstart kick, ref dchar res, ref size_t pos) { size_t idx = kick.search(_origin, _index); _index = idx; @@ -763,7 +771,7 @@ template BackLooper(E) } // -@trusted uint lookupNamedGroup(String)(NamedGroup[] dict, String name) +@trusted uint lookupNamedGroup(String)(const(NamedGroup)[] dict, String name) {//equal is @system? 
import std.algorithm.comparison : equal; import std.algorithm.iteration : map; diff --git a/std/regex/internal/kickstart.d b/std/regex/internal/kickstart.d index f303b43b6..8afa8d927 100644 --- a/std/regex/internal/kickstart.d +++ b/std/regex/internal/kickstart.d @@ -393,7 +393,7 @@ public: // has a useful trait: if supplied with valid UTF indexes, // returns only valid UTF indexes // (that given the haystack in question is valid UTF string) - @trusted size_t search(const(Char)[] haystack, size_t idx) + @trusted size_t search(const(Char)[] haystack, size_t idx) const {//@BUG: apparently assumes little endian machines import core.stdc.string : memchr; import std.conv : text; diff --git a/std/regex/package.d b/std/regex/package.d index 23841357a..7e572462c 100644 --- a/std/regex/package.d +++ b/std/regex/package.d @@ -298,7 +298,6 @@ module std.regex; import std.range.primitives, std.traits; import std.regex.internal.ir; -import std.regex.internal.thompson; //TODO: get rid of this dependency import std.typecons; // : Flag, Yes, No; /++ @@ -339,10 +338,9 @@ public alias Regex(Char) = std.regex.internal.ir.Regex!(Char); A $(D StaticRegex) is $(D Regex) object that contains D code specially generated at compile-time to speed up matching. - Implicitly convertible to normal $(D Regex), - however doing so will result in losing this additional capability. + No longer used, kept as alias to Regex for backwards compatibility. +/ -public alias StaticRegex(Char) = std.regex.internal.ir.StaticRegex!(Char); +public alias StaticRegex = Regex; /++ Compile regular expression pattern for the later execution. 
@@ -431,13 +429,19 @@ template ctRegexImpl(alias pattern, string flags=[]) enum r = regex(pattern, flags); alias Char = BasicElementOf!(typeof(pattern)); enum source = ctGenRegExCode(r); - alias Matcher = BacktrackingMatcher!(true); - @trusted bool func(ref Matcher!Char matcher) + alias CtMatcher = BacktrackingMatcher!(true); + @trusted bool func(ref CtMatcher!Char matcher) { debug(std_regex_ctr) pragma(msg, source); mixin(source); } - enum nr = StaticRegex!Char(r, &func); + static immutable staticRe = cast(immutable)r.withFactory(new CtfeFactory!(CtMatcher, Char, func)); + struct Wrapper + { + @property auto getRe(){ return staticRe; } + alias getRe this; + } + enum wrapper = Wrapper(); } /++ @@ -450,10 +454,10 @@ template ctRegexImpl(alias pattern, string flags=[]) pattern = Regular expression flags = The _attributes (g, i, m and x accepted) +/ -public enum ctRegex(alias pattern, alias flags=[]) = ctRegexImpl!(pattern, flags).nr; +public enum ctRegex(alias pattern, alias flags=[]) = ctRegexImpl!(pattern, flags).wrapper; -enum isRegexFor(RegEx, R) = is(RegEx == Regex!(BasicElementOf!R)) - || is(RegEx == StaticRegex!(BasicElementOf!R)); +enum isRegexFor(RegEx, R) = is(Unqual!RegEx == Regex!(BasicElementOf!R)) || is(RegEx : const(Regex!(BasicElementOf!R))) + || is(Unqual!RegEx == StaticRegex!(BasicElementOf!R)); /++ @@ -480,9 +484,9 @@ private: } uint _f, _b; uint _refcount; // ref count or SMALL MASK + num groups - NamedGroup[] _names; + const(NamedGroup)[] _names; - this(R input, uint n, NamedGroup[] named) + this(R input, uint n, const(NamedGroup)[] named) { _input = input; _names = named; @@ -701,7 +705,7 @@ private: import core.stdc.stdlib : malloc, free; alias Char = BasicElementOf!R; Matcher!Char _engine; - MatcherFactory!Char _factory; + const MatcherFactory!Char _factory; R _input; Captures!R _captures; @@ -709,6 +713,7 @@ private: { import std.exception : enforce; _input = input; + assert(prog.factory !is null, "malformed regex - missing matcher factory"); 
_factory = prog.factory; _engine = prog.factory.create(prog, input); _captures = Captures!R(this); @@ -796,6 +801,7 @@ public: private @trusted auto matchOnce(RegEx, R)(R input, RegEx re) { alias Char = BasicElementOf!R; + assert(re.factory !is null, "malformed regex - missing matcher factory"); auto engine = re.factory.create(re, input); scope(exit) re.factory.decRef(engine); // destroys the engine auto captures = Captures!R(input, re.ngroup, re.dict); @@ -803,18 +809,17 @@ private @trusted auto matchOnce(RegEx, R)(R input, RegEx re) return captures; } -private auto matchMany(RegEx, R)(R input, RegEx re) +private auto matchMany(RegEx, R)(R input, RegEx re) @safe { - re.flags |= RegexOption.global; - return RegexMatch!R(input, re); + return RegexMatch!R(input, re.withFlags(re.flags | RegexOption.global)); } @system unittest { //sanity checks for new API auto re = regex("abc"); - assert(!"abc".matchOnce!(ThompsonMatcher)(re).empty); - assert("abc".matchOnce!(ThompsonMatcher)(re)[0] == "abc"); + assert(!"abc".matchOnce(re).empty); + assert("abc".matchOnce(re)[0] == "abc"); } @@ -906,7 +911,7 @@ if (isSomeString!R && isRegexFor!(RegEx, R)) +/ public auto match(R, RegEx)(R input, RegEx re) -if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R))) +if (isSomeString!R && isRegexFor!(RegEx,R)) { return RegexMatch!(Unqual!(typeof(input)))(input, re); } @@ -918,12 +923,6 @@ if (isSomeString!R && isSomeString!String) return RegexMatch!(Unqual!(typeof(input)))(input, regex(re)); } -public auto match(R, RegEx)(R input, RegEx re) -if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) -{ - return RegexMatch!(Unqual!(typeof(input)))(input, re); -} - /++ Find the first (leftmost) slice of the $(D input) that matches the pattern $(D re). This function picks the most suitable @@ -943,7 +942,7 @@ if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) if there was a match, otherwise an empty $(LREF Captures) object. 
+/ public auto matchFirst(R, RegEx)(R input, RegEx re) -if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R))) +if (isSomeString!R && isRegexFor!(RegEx, R)) { return matchOnce(input, re); } @@ -962,12 +961,6 @@ if (isSomeString!R && isSomeString!String) return matchOnce(input, regex(re)); } -public auto matchFirst(R, RegEx)(R input, RegEx re) -if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) -{ - return matchOnce(input, re); -} - /++ Initiate a search for all non-overlapping matches to the pattern $(D re) in the given $(D input). The result is a lazy range of matches generated @@ -990,7 +983,7 @@ if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) after the first match was found or an empty one if not present. +/ public auto matchAll(R, RegEx)(R input, RegEx re) -if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R))) +if (isSomeString!R && isRegexFor!(RegEx, R)) { return matchMany(input, re); } @@ -1009,12 +1002,6 @@ if (isSomeString!R && isSomeString!String) return matchMany(input, regex(re)); } -public auto matchAll(R, RegEx)(R input, RegEx re) -if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) -{ - return matchMany(input, re); -} - // another set of tests just to cover the new API @system unittest { @@ -1076,7 +1063,7 @@ if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) +/ public auto bmatch(R, RegEx)(R input, RegEx re) -if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R))) +if (isSomeString!R && isRegexFor!(RegEx, R)) { return RegexMatch!(Unqual!(typeof(input)))(input, re); } @@ -1088,12 +1075,6 @@ if (isSomeString!R && isSomeString!String) return RegexMatch!(Unqual!(typeof(input)))(input, regex(re)); } -public auto bmatch(R, RegEx)(R input, RegEx re) -if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) -{ - return RegexMatch!(Unqual!(typeof(input)))(input, re); -} - // produces replacement string from format using captures for substitution package void 
replaceFmt(R, Capt, OutR) (R format, Capt captures, OutR sink, bool ignoreBadSubs = false) @@ -1484,7 +1465,7 @@ private: @trusted this(Range input, RegEx separator) {//@@@BUG@@@ generated opAssign of RegexMatch is not @trusted _input = input; - separator.flags |= RegexOption.global; + auto re = separator.withFlags(separator.flags | RegexOption.global); if (_input.empty) { //there is nothing to match at all, make _offset > 0 @@ -1492,7 +1473,7 @@ private: } else { - _match = Rx(_input, separator); + _match = Rx(_input, re); static if (keepSeparators) if (_match.pre.empty) From 639dd3dd6f3f23a1c76c7050a2116d100dbb805d Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Wed, 30 Aug 2017 18:59:33 +0300 Subject: [PATCH 03/15] The new std.regex design is concluded, going through the unittests --- std/regex/internal/backtracking.d | 39 +++++---- std/regex/internal/ir.d | 18 ++++- std/regex/internal/parser.d | 8 +- std/regex/internal/tests.d | 4 +- std/regex/internal/thompson.d | 127 +++++++++++++++--------------- std/regex/package.d | 2 +- std/uni.d | 9 ++- 7 files changed, 118 insertions(+), 89 deletions(-) diff --git a/std/regex/internal/backtracking.d b/std/regex/internal/backtracking.d index 84e025da2..491273040 100644 --- a/std/regex/internal/backtracking.d +++ b/std/regex/internal/backtracking.d @@ -68,6 +68,7 @@ template BacktrackingMatcher(bool CTregex) //local slice of matches, global for backref Group!DataIndex[] matches, backrefed; size_t _refCount; + final: override @property ref size_t refCount() { return _refCount; } override @property ref const(RegEx) pattern(){ return re; } @@ -163,14 +164,18 @@ template BacktrackingMatcher(bool CTregex) backrefed = null; } - void dupTo(void[] memory) + override void dupTo(void[] memBlock) { - auto tmp = this; - tmp.initExternalMemory(memory); + auto m = arrayInChunk!(Trace)(re.hotspotTableSize, memBlock); + m[] = Trace.init; + auto mem = cast(size_t[]) memBlock; + mem[0] = 0; // hidden pointer + mem[1] = 0; // used size 
} this(ref const RegEx program, Stream stream, void[] memBlock, dchar ch, DataIndex idx) { + _refCount = 1; re = program; initialize(program, stream, memBlock); front = ch; @@ -179,6 +184,7 @@ template BacktrackingMatcher(bool CTregex) this(ref const RegEx program, MatchFn func, Stream stream, void[] memBlock) { + _refCount = 1; re = program; initialize(program, stream, memBlock); nativeFn = func; @@ -187,6 +193,7 @@ template BacktrackingMatcher(bool CTregex) this(ref const RegEx program, Stream stream, void[] memBlock) { + _refCount = 1; re = program; initialize(program, stream, memBlock); next(); @@ -229,7 +236,7 @@ template BacktrackingMatcher(bool CTregex) } //lookup next match, fill matches with indices into input - int match(Group!DataIndex[] matches) + override int match(Group!DataIndex[] matches) { debug(std_regex_matcher) { @@ -815,7 +822,7 @@ struct CtContext //to mark the portion of matches to save int match, total_matches; int reserved; - CodepointSet[] charsets; + const CodepointSet[] charsets; //state of codegenerator @@ -825,7 +832,7 @@ struct CtContext int addr; } - this(Char)(Regex!Char re) + this(Char)(const Regex!Char re) { match = 1; reserved = 1; //first match is skipped @@ -886,7 +893,7 @@ struct CtContext } // - CtState ctGenBlock(Bytecode[] ir, int addr) + CtState ctGenBlock(const(Bytecode)[] ir, int addr) { CtState result; result.addr = addr; @@ -900,7 +907,7 @@ struct CtContext } // - CtState ctGenGroup(ref Bytecode[] ir, int addr) + CtState ctGenGroup(ref const(Bytecode)[] ir, int addr) { import std.algorithm.comparison : max; auto bailOut = "goto L_backtrack;"; @@ -1002,7 +1009,7 @@ struct CtContext } //generate source for bytecode contained in OrStart ... 
OrEnd - CtState ctGenAlternation(Bytecode[] ir, int addr) + CtState ctGenAlternation(const(Bytecode)[] ir, int addr) { CtState[] pieces; CtState r; @@ -1042,11 +1049,11 @@ struct CtContext // generate fixup code for instruction in ir, // fixup means it has an alternative way for control flow - string ctGenFixupCode(Bytecode[] ir, int addr, int fixup) + string ctGenFixupCode(const(Bytecode)[] ir, int addr, int fixup) { return ctGenFixupCode(ir, addr, fixup); // call ref Bytecode[] version } - string ctGenFixupCode(ref Bytecode[] ir, int addr, int fixup) + string ctGenFixupCode(ref const(Bytecode)[] ir, int addr, int fixup) { string r; string testCode; @@ -1200,7 +1207,7 @@ struct CtContext } - string ctQuickTest(Bytecode[] ir, int id) + string ctQuickTest(const(Bytecode)[] ir, int id) { uint pc = 0; while (pc < ir.length && ir[pc].isAtom) @@ -1227,7 +1234,7 @@ struct CtContext } //process & generate source for simple bytecodes at front of ir using address addr - CtState ctGenAtom(ref Bytecode[] ir, int addr) + CtState ctGenAtom(ref const(Bytecode)[] ir, int addr) { CtState result; result.code = ctAtomCode(ir, addr); @@ -1237,7 +1244,7 @@ struct CtContext } //D code for atom at ir using address addr, addr < 0 means quickTest - string ctAtomCode(Bytecode[] ir, int addr) + string ctAtomCode(const(Bytecode)[] ir, int addr) { string code; string bailOut, nextInstr; @@ -1449,7 +1456,7 @@ struct CtContext } //generate D code for the whole regex - public string ctGenRegEx(Bytecode[] ir) + public string ctGenRegEx(const(Bytecode)[] ir) { auto bdy = ctGenBlock(ir, 0); auto r = ` @@ -1498,7 +1505,7 @@ struct CtContext } -string ctGenRegExCode(Char)(Regex!Char re) +string ctGenRegExCode(Char)(const Regex!Char re) { auto context = CtContext(re); return context.ctGenRegEx(re.ir); diff --git a/std/regex/internal/ir.d b/std/regex/internal/ir.d index 7533b0717..6e47352bd 100644 --- a/std/regex/internal/ir.d +++ b/std/regex/internal/ir.d @@ -484,18 +484,18 @@ abstract class 
GenericFactory(alias EngineType, Char) : MatcherFactory!Char // A factory for run-time engines class RuntimeFactory(alias EngineType, Char) : GenericFactory!(EngineType, Char) { - override Matcher!Char construct(const Regex!Char re, in Char[] input, void[] memory) const + override EngineType!Char construct(const Regex!Char re, in Char[] input, void[] memory) const { import std.conv : emplace; return emplace!(EngineType!Char)(memory[0 .. classSize], - prog, Input!Char(input), memory[classSize .. $]); + re, Input!Char(input), memory[classSize .. $]); } } // A factory for compile-time engine class CtfeFactory(alias EngineType, Char, alias func) : GenericFactory!(EngineType, Char) { - override Matcher!Char construct(const Regex!Char re, in Char[] input, void[] memory) const + override EngineType!Char construct(const Regex!Char re, in Char[] input, void[] memory) const { import std.conv : emplace; return emplace!(EngineType!Char)(memory[0 .. classSize], @@ -503,8 +503,11 @@ class CtfeFactory(alias EngineType, Char, alias func) : GenericFactory!(EngineTy } } -interface Matcher(Char) +// Defining it as an interface has the undesired side-effect: +// casting any class to an interface silently adjusts pointer to point to a nested vtbl +abstract class Matcher(Char) { +abstract: // Get a (next) match int match(Group!size_t[] matches); // This only maintains internal ref-count, @@ -611,6 +614,13 @@ package(std.regex): return r; } + const(Regex) withNGroup(uint nGroup) pure const @trusted + { + auto r = cast()this; + r.ngroup = nGroup; + return r; + } + //bit access helper uint isBackref(uint n) { diff --git a/std/regex/internal/parser.d b/std/regex/internal/parser.d index 7b7e96884..4b485cb2b 100644 --- a/std/regex/internal/parser.d +++ b/std/regex/internal/parser.d @@ -12,7 +12,10 @@ static import std.ascii; // package relevant info from parser into a regex object auto makeRegex(S, CG)(Parser!(S, CG) p) { - Regex!(BasicElementOf!S) re; + import std.regex.internal.thompson, 
std.regex.internal.backtracking; + import std.algorithm.searching : canFind; + alias Char = BasicElementOf!S; + Regex!Char re; auto g = p.g; with(re) { @@ -24,6 +27,9 @@ auto makeRegex(S, CG)(Parser!(S, CG) p) charsets = g.charsets; matchers = g.matchers; backrefed = g.backrefed; + // check if we have backreferences, if so - use backtracking + if (backrefed.canFind!"a != 0") factory = new RuntimeFactory!(BacktrackingMatcher!false, Char); + else factory = new RuntimeFactory!(ThompsonMatcher, Char); re.postprocess(); debug(std_regex_parser) { diff --git a/std/regex/internal/tests.d b/std/regex/internal/tests.d index 025ef7e44..0d4a65e74 100644 --- a/std/regex/internal/tests.d +++ b/std/regex/internal/tests.d @@ -518,11 +518,11 @@ alias Sequence(int B, int E) = staticIota!(B, E); { import std.algorithm.comparison : equal; auto rtr = regex("a|b|c"); - enum ctr = regex("a|b|c"); + static ctr = regex("a|b|c"); assert(equal(rtr.ir,ctr.ir)); //CTFE parser BUG is triggered by group //in the middle of alternation (at least not first and not last) - enum testCT = regex(`abc|(edf)|xyz`); + static testCT = regex(`abc|(edf)|xyz`); auto testRT = regex(`abc|(edf)|xyz`); assert(equal(testCT.ir,testRT.ir)); } diff --git a/std/regex/internal/thompson.d b/std/regex/internal/thompson.d index ceb561ea3..47e071b4d 100644 --- a/std/regex/internal/thompson.d +++ b/std/regex/internal/thompson.d @@ -89,7 +89,7 @@ struct ThreadList(DataIndex) template ThompsonOps(E, S, bool withInput:true) { @trusted: - static bool op(IR code:IR.End)(E* e, S* state) + static bool op(IR code:IR.End)(E e, S* state) { with(e) with(state) { @@ -105,7 +105,7 @@ template ThompsonOps(E, S, bool withInput:true) } } - static bool op(IR code:IR.Wordboundary)(E* e, S* state) + static bool op(IR code:IR.Wordboundary)(E e, S* state) { with(e) with(state) { @@ -137,7 +137,7 @@ template ThompsonOps(E, S, bool withInput:true) } } - static bool op(IR code:IR.Notwordboundary)(E* e, S* state) + static bool op(IR 
code:IR.Notwordboundary)(E e, S* state) { with(e) with(state) { @@ -167,7 +167,7 @@ template ThompsonOps(E, S, bool withInput:true) return true; } - static bool op(IR code:IR.Bof)(E* e, S* state) + static bool op(IR code:IR.Bof)(E e, S* state) { with(e) with(state) { @@ -183,7 +183,7 @@ template ThompsonOps(E, S, bool withInput:true) } } - static bool op(IR code:IR.Bol)(E* e, S* state) + static bool op(IR code:IR.Bol)(E e, S* state) { with(e) with(state) { @@ -203,7 +203,7 @@ template ThompsonOps(E, S, bool withInput:true) } } - static bool op(IR code:IR.Eof)(E* e, S* state) + static bool op(IR code:IR.Eof)(E e, S* state) { with(e) with(state) { @@ -219,7 +219,7 @@ template ThompsonOps(E, S, bool withInput:true) } } - static bool op(IR code:IR.Eol)(E* e, S* state) + static bool op(IR code:IR.Eol)(E e, S* state) { with(e) with(state) { @@ -240,42 +240,42 @@ template ThompsonOps(E, S, bool withInput:true) } } - static bool op(IR code:IR.InfiniteStart)(E* e, S* state) + static bool op(IR code:IR.InfiniteStart)(E e, S* state) { with(e) with(state) t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteStart); return op!(IR.InfiniteEnd)(e,state); } - static bool op(IR code:IR.InfiniteBloomStart)(E* e, S* state) + static bool op(IR code:IR.InfiniteBloomStart)(E e, S* state) { with(e) with(state) t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteBloomStart); return op!(IR.InfiniteBloomEnd)(e,state); } - static bool op(IR code:IR.InfiniteQStart)(E* e, S* state) + static bool op(IR code:IR.InfiniteQStart)(E e, S* state) { with(e) with(state) t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteQStart); return op!(IR.InfiniteQEnd)(e,state); } - static bool op(IR code:IR.RepeatStart)(E* e, S* state) + static bool op(IR code:IR.RepeatStart)(E e, S* state) { with(e) with(state) t.pc += re.ir[t.pc].data + IRL!(IR.RepeatStart); return op!(IR.RepeatEnd)(e,state); } - static bool op(IR code:IR.RepeatQStart)(E* e, S* state) + static bool op(IR code:IR.RepeatQStart)(E e, S* state) { with(e) with(state) t.pc += 
re.ir[t.pc].data + IRL!(IR.RepeatQStart); return op!(IR.RepeatQEnd)(e,state); } - static bool op(IR code)(E* e, S* state) + static bool op(IR code)(E e, S* state) if (code == IR.RepeatEnd || code == IR.RepeatQEnd) { with(e) with(state) @@ -330,7 +330,7 @@ template ThompsonOps(E, S, bool withInput:true) } } - static bool op(IR code)(E* e, S* state) + static bool op(IR code)(E e, S* state) if (code == IR.InfiniteEnd || code == IR.InfiniteQEnd) { with(e) with(state) @@ -365,7 +365,7 @@ template ThompsonOps(E, S, bool withInput:true) } } - static bool op(IR code)(E* e, S* state) + static bool op(IR code)(E e, S* state) if (code == IR.InfiniteBloomEnd) { with(e) with(state) @@ -394,7 +394,7 @@ template ThompsonOps(E, S, bool withInput:true) } } - static bool op(IR code:IR.OrEnd)(E* e, S* state) + static bool op(IR code:IR.OrEnd)(E e, S* state) { with(e) with(state) { @@ -415,7 +415,7 @@ template ThompsonOps(E, S, bool withInput:true) } } - static bool op(IR code:IR.OrStart)(E* e, S* state) + static bool op(IR code:IR.OrStart)(E e, S* state) { with(e) with(state) { @@ -424,7 +424,7 @@ template ThompsonOps(E, S, bool withInput:true) } } - static bool op(IR code:IR.Option)(E* e, S* state) + static bool op(IR code:IR.Option)(E e, S* state) { with(e) with(state) { @@ -439,7 +439,7 @@ template ThompsonOps(E, S, bool withInput:true) } } - static bool op(IR code:IR.GotoEndOr)(E* e, S* state) + static bool op(IR code:IR.GotoEndOr)(E e, S* state) { with(e) with(state) { @@ -448,7 +448,7 @@ template ThompsonOps(E, S, bool withInput:true) } } - static bool op(IR code:IR.GroupStart)(E* e, S* state) + static bool op(IR code:IR.GroupStart)(E e, S* state) { with(e) with(state) { @@ -458,7 +458,7 @@ template ThompsonOps(E, S, bool withInput:true) return true; } } - static bool op(IR code:IR.GroupEnd)(E* e, S* state) + static bool op(IR code:IR.GroupEnd)(E e, S* state) { with(e) with(state) { @@ -469,7 +469,7 @@ template ThompsonOps(E, S, bool withInput:true) } } - static bool op(IR 
code:IR.Backref)(E* e, S* state) + static bool op(IR code:IR.Backref)(E e, S* state) { with(e) with(state) { @@ -506,7 +506,7 @@ template ThompsonOps(E, S, bool withInput:true) } - static bool op(IR code)(E* e, S* state) + static bool op(IR code)(E e, S* state) if (code == IR.LookbehindStart || code == IR.NeglookbehindStart) { with(e) with(state) @@ -516,10 +516,9 @@ template ThompsonOps(E, S, bool withInput:true) uint end = t.pc + len + IRL!(IR.LookbehindEnd) + IRL!(IR.LookbehindStart); bool positive = re.ir[t.pc].code == IR.LookbehindStart; static if (Stream.isLoopback) - auto matcher = fwdMatcher(t.pc, end, subCounters.get(t.pc, 0)); + auto matcher = fwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0)); else - auto matcher = bwdMatcher(t.pc, end, subCounters.get(t.pc, 0)); - matcher.re.ngroup = me - ms; + auto matcher = bwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0)); matcher.backrefed = backrefed.empty ? t.matches : backrefed; //backMatch auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookbehindStart)); @@ -534,7 +533,7 @@ template ThompsonOps(E, S, bool withInput:true) } } - static bool op(IR code)(E* e, S* state) + static bool op(IR code)(E e, S* state) if (code == IR.LookaheadStart || code == IR.NeglookaheadStart) { with(e) with(state) @@ -545,10 +544,9 @@ template ThompsonOps(E, S, bool withInput:true) uint end = t.pc+len+IRL!(IR.LookaheadEnd)+IRL!(IR.LookaheadStart); bool positive = re.ir[t.pc].code == IR.LookaheadStart; static if (Stream.isLoopback) - auto matcher = bwdMatcher(t.pc, end, subCounters.get(t.pc, 0)); + auto matcher = bwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0)); else - auto matcher = fwdMatcher(t.pc, end, subCounters.get(t.pc, 0)); - matcher.re.ngroup = me - ms; + auto matcher = fwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0)); matcher.backrefed = backrefed.empty ? t.matches : backrefed; auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. 
me], IRL!(IR.LookaheadStart)); freelist = matcher.freelist; @@ -564,7 +562,7 @@ template ThompsonOps(E, S, bool withInput:true) } } - static bool op(IR code)(E* e, S* state) + static bool op(IR code)(E e, S* state) if (code == IR.LookaheadEnd || code == IR.NeglookaheadEnd || code == IR.LookbehindEnd || code == IR.NeglookbehindEnd) { @@ -579,13 +577,13 @@ template ThompsonOps(E, S, bool withInput:true) } } - static bool op(IR code:IR.Nop)(E* e, S* state) + static bool op(IR code:IR.Nop)(E e, S* state) { with(state) t.pc += IRL!(IR.Nop); return true; } - static bool op(IR code:IR.OrChar)(E* e, S* state) + static bool op(IR code:IR.OrChar)(E e, S* state) { with(e) with(state) { @@ -607,7 +605,7 @@ template ThompsonOps(E, S, bool withInput:true) } } - static bool op(IR code:IR.Char)(E* e, S* state) + static bool op(IR code:IR.Char)(E e, S* state) { with(e) with(state) { @@ -623,7 +621,7 @@ template ThompsonOps(E, S, bool withInput:true) } } - static bool op(IR code:IR.Any)(E* e, S* state) + static bool op(IR code:IR.Any)(E e, S* state) { with(e) with(state) { @@ -634,7 +632,7 @@ template ThompsonOps(E, S, bool withInput:true) } } - static bool op(IR code:IR.CodepointSet)(E* e, S* state) + static bool op(IR code:IR.CodepointSet)(E e, S* state) { with(e) with(state) { @@ -652,7 +650,7 @@ template ThompsonOps(E, S, bool withInput:true) } } - static bool op(IR code:IR.Trie)(E* e, S* state) + static bool op(IR code:IR.Trie)(E e, S* state) { with(e) with(state) { @@ -676,7 +674,7 @@ template ThompsonOps(E,S, bool withInput:false) { @trusted: // can't match these without input - static bool op(IR code)(E* e, S* state) + static bool op(IR code)(E e, S* state) if (code == IR.Char || code == IR.OrChar || code == IR.CodepointSet || code == IR.Trie || code == IR.Char || code == IR.Any) { @@ -684,7 +682,7 @@ template ThompsonOps(E,S, bool withInput:false) } // special case of zero-width backref - static bool op(IR code:IR.Backref)(E* e, S* state) + static bool op(IR 
code:IR.Backref)(E e, S* state) { with(e) with(state) { @@ -702,7 +700,7 @@ template ThompsonOps(E,S, bool withInput:false) } // forward all control flow to normal versions - static bool op(IR code)(E* e, S* state) + static bool op(IR code)(E e, S* state) if (code != IR.Char && code != IR.OrChar && code != IR.CodepointSet && code != IR.Trie && code != IR.Char && code != IR.Any && code != IR.Backref) { @@ -719,14 +717,14 @@ if (is(Char : dchar)) { alias DataIndex = Stream.DataIndex; alias Stream = StreamType; - alias OpFunc = bool function(ThompsonMatcher*, State*); + alias OpFunc = bool function(ThompsonMatcher, State*); alias BackMatcher = ThompsonMatcher!(Char, BackLooper!(Stream)); - alias OpBackFunc = bool function(BackMatcher*, BackMatcher.State*); + alias OpBackFunc = bool function(BackMatcher, BackMatcher.State*); Thread!DataIndex* freelist; ThreadList!DataIndex clist, nlist; DataIndex[] merge; Group!DataIndex[] backrefed; - Regex!Char re; //regex program + const Regex!Char re; //regex program Stream s; dchar front; DataIndex index; @@ -737,16 +735,18 @@ if (is(Char : dchar)) OpBackFunc[] opCacheBackTrue; // ditto OpBackFunc[] opCacheBackFalse; // ditto size_t threadSize; + size_t _refCount; int matched; bool exhausted; +final: static struct State { Thread!DataIndex* t; ThreadList!DataIndex worklist; Group!DataIndex[] matches; - bool popState(E)(E* e) + bool popState(E)(E e) { with(e) { @@ -784,6 +784,10 @@ if (is(Char : dchar)) //true if it's end of input @property bool atEnd(){ return index == s.lastIndex && s.atEnd; } + override @property ref size_t refCount() @safe { return _refCount; } + + override @property ref const(Regex!Char) pattern() @safe { return re; } + bool next() { if (!s.nextChar(front, index)) @@ -843,19 +847,20 @@ if (is(Char : dchar)) } } - this()(Regex!Char program, Stream stream, void[] memory) + this()(const Regex!Char program, Stream stream, void[] memory) { + _refCount = 1; re = program; s = stream; initExternalMemory(memory); 
genCounter = 0; } - this(ref ThompsonMatcher matcher, size_t lo, size_t hi, Stream stream) + this(ThompsonMatcher matcher, size_t lo, size_t hi, uint nGroup, Stream stream) { + _refCount = 1; s = stream; - re = matcher.re; - re.ir = re.ir[lo .. hi]; + re = matcher.re.withCode(re.ir[lo .. hi]).withNGroup(nGroup); threadSize = matcher.threadSize; merge = matcher.merge; freelist = matcher.freelist; @@ -867,11 +872,11 @@ if (is(Char : dchar)) index = matcher.index; } - this(ref BackMatcher matcher, size_t lo, size_t hi, Stream stream) + this(BackMatcher matcher, size_t lo, size_t hi, uint nGroup, Stream stream) { + _refCount = 1; s = stream; - re = matcher.re; - re.ir = re.ir[lo .. hi]; + re = matcher.re.withCode(re.ir[lo .. hi]).withNGroup(nGroup); threadSize = matcher.threadSize; merge = matcher.merge; freelist = matcher.freelist; @@ -883,31 +888,29 @@ if (is(Char : dchar)) index = matcher.index; } - auto fwdMatcher()(size_t lo, size_t hi, size_t counter) + auto fwdMatcher()(size_t lo, size_t hi, uint nGroup, size_t counter) { - auto m = ThompsonMatcher!(Char, Stream)(this, lo, hi, s); + auto m = new ThompsonMatcher!(Char, Stream)(this, lo, hi, nGroup, s); m.genCounter = counter; return m; } - auto bwdMatcher()(size_t lo, size_t hi, size_t counter) + auto bwdMatcher()(size_t lo, size_t hi, uint nGroup, size_t counter) { alias BackLooper = typeof(s.loopBack(index)); - auto m = ThompsonMatcher!(Char, BackLooper)(this, lo, hi, s.loopBack(index)); + auto m = new ThompsonMatcher!(Char, BackLooper)(this, lo, hi, nGroup, s.loopBack(index)); m.genCounter = counter; m.next(); return m; } - auto dupTo(void[] memory) + override void dupTo(void[] memory) { - typeof(this) tmp = this;//bitblit + auto tmp = this; // bit-blit tmp.initExternalMemory(memory); - tmp.genCounter = 0; - return tmp; } - int match(Group!DataIndex[] matches) + override int match(Group!DataIndex[] matches) { debug(std_regex_matcher) writeln("------------------------------------------"); @@ -1052,9 +1055,9 @@ 
if (is(Char : dchar)) { debug(std_regex_matcher) writeln("---- Evaluating thread"); static if (withInput) - while (opCacheTrue.ptr[state.t.pc](&this, state)){} + while (opCacheTrue.ptr[state.t.pc](this, state)){} else - while (opCacheFalse.ptr[state.t.pc](&this, state)){} + while (opCacheFalse.ptr[state.t.pc](this, state)){} } enum uint RestartPc = uint.max; //match the input, evaluating IR without searching diff --git a/std/regex/package.d b/std/regex/package.d index 7e572462c..7993a8a1d 100644 --- a/std/regex/package.d +++ b/std/regex/package.d @@ -426,7 +426,7 @@ if (isSomeString!(S)) template ctRegexImpl(alias pattern, string flags=[]) { import std.regex.internal.backtracking, std.regex.internal.parser; - enum r = regex(pattern, flags); + static immutable r = cast(immutable)regex(pattern, flags); alias Char = BasicElementOf!(typeof(pattern)); enum source = ctGenRegExCode(r); alias CtMatcher = BacktrackingMatcher!(true); diff --git a/std/uni.d b/std/uni.d index d2247b506..a0b39ff0d 100644 --- a/std/uni.d +++ b/std/uni.d @@ -2181,9 +2181,9 @@ pure: assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')])); ----------- */ - @property auto byInterval() + @property auto byInterval() const { - return Intervals!(typeof(data))(data); + return Intervals!(const(uint)[])(data[]); } /** @@ -2663,7 +2663,7 @@ public: } --- */ - string toSourceCode(string funcName="") + string toSourceCode(string funcName="") const { import std.algorithm.searching : countUntil; import std.array : array; @@ -2804,6 +2804,7 @@ private: //may break sorted property - but we need std.sort to access it //hence package protection attribute + static if(hasAssignableElements!Range) package @property void front(CodepointInterval val) { slice[start] = val.a; @@ -2818,6 +2819,7 @@ private: } //ditto about package + static if(hasAssignableElements!Range) package @property void back(CodepointInterval val) { slice[end-2] = val.a; @@ -2842,6 +2844,7 @@ private: } //ditto about package + static 
if(hasAssignableElements!Range) package void opIndexAssign(CodepointInterval val, size_t idx) { slice[start+idx*2] = val.a; From c7bdfbe51bf9db1eef55692fbce80168b427bc76 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Mon, 4 Sep 2017 18:41:33 +0300 Subject: [PATCH 04/15] std.regex finish the loose ends in the great refactoring --- std/regex/internal/backtracking.d | 24 ++++--- std/regex/internal/ir.d | 60 ++++++++++++----- std/regex/internal/parser.d | 13 ++-- std/regex/internal/thompson.d | 17 +++-- std/regex/package.d | 31 ++++----- std/uni.d | 105 +++++++++++++++++------------- 6 files changed, 151 insertions(+), 99 deletions(-) diff --git a/std/regex/internal/backtracking.d b/std/regex/internal/backtracking.d index 491273040..e44b9ec99 100644 --- a/std/regex/internal/backtracking.d +++ b/std/regex/internal/backtracking.d @@ -164,13 +164,14 @@ template BacktrackingMatcher(bool CTregex) backrefed = null; } - override void dupTo(void[] memBlock) + override void dupTo(Matcher!Char m, void[] memBlock) { - auto m = arrayInChunk!(Trace)(re.hotspotTableSize, memBlock); - m[] = Trace.init; - auto mem = cast(size_t[]) memBlock; - mem[0] = 0; // hidden pointer - mem[1] = 0; // used size + auto backtracking = cast(BacktrackingMatcher)m; + backtracking.s = s; + backtracking.front = front; + backtracking.index = index; + backtracking.exhausted = exhausted; + backtracking.initExternalMemory(memBlock); } this(ref const RegEx program, Stream stream, void[] memBlock, dchar ch, DataIndex idx) @@ -822,7 +823,7 @@ struct CtContext //to mark the portion of matches to save int match, total_matches; int reserved; - const CodepointSet[] charsets; + const(CodepointInterval)[][] charsets; //state of codegenerator @@ -837,7 +838,10 @@ struct CtContext match = 1; reserved = 1; //first match is skipped total_matches = re.ngroup; - charsets = re.charsets; + foreach (ref set; re.charsets) + { + charsets ~= set.intervals; + } } CtContext lookaround(uint s, uint e) @@ -1299,7 +1303,7 @@ 
struct CtContext if (charsets.length) { string name = `func_`~to!string(addr+1); - string funcCode = charsets[ir[0].data].toSourceCode(name); + string funcCode = CodepointSet.toSourceCode(charsets[ir[0].data], name); code ~= ctSub( ` static $$ if (atEnd || !$$(front)) @@ -1315,7 +1319,7 @@ struct CtContext $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr); break; case IR.Trie: - if (charsets.length && charsets[ir[0].data].byInterval.length <= 8) + if (charsets.length && charsets[ir[0].data].length <= 8) goto case IR.CodepointSet; code ~= ctSub( ` if (atEnd || !re.matchers[$$][front]) diff --git a/std/regex/internal/ir.d b/std/regex/internal/ir.d index 6e47352bd..fd509ca35 100644 --- a/std/regex/internal/ir.d +++ b/std/regex/internal/ir.d @@ -432,8 +432,8 @@ interface MatcherFactory(Char) @safe: Matcher!Char create(const Regex!Char, in Char[] input) const; Matcher!Char dup(Matcher!Char m, in Char[] input) const; - void incRef(Matcher!Char m) const; - void decRef(Matcher!Char m) const; + size_t incRef(Matcher!Char m) const; + size_t decRef(Matcher!Char m) const; } // Only memory management, no compile-time vs run-time specialities @@ -460,24 +460,23 @@ abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char immutable size = EngineType!Char.initialMemory(engine.pattern) + classSize; auto memory = enforce(malloc(size), "malloc failed")[0 .. size]; scope(failure) free(memory.ptr); - engine.dupTo(memory[classSize .. size]); - engine = construct(engine.pattern, input, memory); - assert(engine.refCount == 1); - return engine; + auto copy = construct(engine.pattern, input, memory); + engine.dupTo(copy, memory[classSize .. 
size]); + assert(copy.refCount == 1); + return copy; } - override void incRef(Matcher!Char m) const + override size_t incRef(Matcher!Char m) const { - ++m.refCount; + return ++m.refCount; } - override void decRef(Matcher!Char m) const @trusted + override size_t decRef(Matcher!Char m) const @trusted { assert(m.refCount != 0); - if (--m.refCount == 0) - { - free(cast(void*)m); - } + auto cnt = --m.refCount; + if (cnt == 0) free(cast(void*)m); + return cnt; } } @@ -487,7 +486,7 @@ class RuntimeFactory(alias EngineType, Char) : GenericFactory!(EngineType, Char) override EngineType!Char construct(const Regex!Char re, in Char[] input, void[] memory) const { import std.conv : emplace; - return emplace!(EngineType!Char)(memory[0 .. classSize], + return emplace!(EngineType!Char)(memory[0 .. classSize], re, Input!Char(input), memory[classSize .. $]); } } @@ -498,14 +497,39 @@ class CtfeFactory(alias EngineType, Char, alias func) : GenericFactory!(EngineTy override EngineType!Char construct(const Regex!Char re, in Char[] input, void[] memory) const { import std.conv : emplace; - return emplace!(EngineType!Char)(memory[0 .. classSize], + return emplace!(EngineType!Char)(memory[0 .. classSize], re, &func, Input!Char(input), memory[classSize .. $]); } } +// A workaround for R-T enum re = regex(...) 
+template defaultFactory(Char) +{ + @property MatcherFactory!Char defaultFactory(const Regex!Char re) + { + import std.regex.internal.backtracking : BacktrackingMatcher; + import std.regex.internal.thompson : ThompsonMatcher; + import std.algorithm.searching : canFind; + static MatcherFactory!Char backtrackingFactory; + static MatcherFactory!Char thompsonFactory; + if (re.backrefed.canFind!"a != 0") + { + if (backtrackingFactory is null) + backtrackingFactory = new RuntimeFactory!(BacktrackingMatcher!false, Char); + return backtrackingFactory; + } + else + { + if (thompsonFactory is null) + thompsonFactory = new RuntimeFactory!(ThompsonMatcher, Char); + return thompsonFactory; + } + } +} + // Defining it as an interface has the undesired side-effect: // casting any class to an interface silently adjusts pointer to point to a nested vtbl -abstract class Matcher(Char) +abstract class Matcher(Char) { abstract: // Get a (next) match @@ -513,8 +537,8 @@ abstract: // This only maintains internal ref-count, // deallocation happens inside MatcherFactory @property ref size_t refCount() @safe; - // Copy internal state to memory location - void dupTo(void[] memory); + // Copy internal state to another engine, using memory arena 'memory' + void dupTo(Matcher!Char m, void[] memory); // The pattern loaded @property ref const(Regex!Char) pattern() @safe; } diff --git a/std/regex/internal/parser.d b/std/regex/internal/parser.d index 4b485cb2b..a5c88cc81 100644 --- a/std/regex/internal/parser.d +++ b/std/regex/internal/parser.d @@ -12,7 +12,8 @@ static import std.ascii; // package relevant info from parser into a regex object auto makeRegex(S, CG)(Parser!(S, CG) p) { - import std.regex.internal.thompson, std.regex.internal.backtracking; + import std.regex.internal.backtracking : BacktrackingMatcher; + import std.regex.internal.thompson : ThompsonMatcher; import std.algorithm.searching : canFind; alias Char = BasicElementOf!S; Regex!Char re; @@ -27,10 +28,14 @@ auto makeRegex(S, 
CG)(Parser!(S, CG) p) charsets = g.charsets; matchers = g.matchers; backrefed = g.backrefed; - // check if we have backreferences, if so - use backtracking - if (backrefed.canFind!"a != 0") factory = new RuntimeFactory!(BacktrackingMatcher!false, Char); - else factory = new RuntimeFactory!(ThompsonMatcher, Char); re.postprocess(); + // check if we have backreferences, if so - use backtracking + if(__ctfe) factory = null; // allows us to use the awful enum re = regex(...); + else + if (re.backrefed.canFind!"a != 0") + factory = new RuntimeFactory!(BacktrackingMatcher!false, Char); + else + factory = new RuntimeFactory!(ThompsonMatcher, Char); debug(std_regex_parser) { __ctfe || print(); diff --git a/std/regex/internal/thompson.d b/std/regex/internal/thompson.d index 47e071b4d..b26eb5f9e 100644 --- a/std/regex/internal/thompson.d +++ b/std/regex/internal/thompson.d @@ -860,7 +860,8 @@ final: { _refCount = 1; s = stream; - re = matcher.re.withCode(re.ir[lo .. hi]).withNGroup(nGroup); + auto code = matcher.re.ir[lo .. hi]; + re = matcher.re.withCode(code).withNGroup(nGroup); threadSize = matcher.threadSize; merge = matcher.merge; freelist = matcher.freelist; @@ -876,7 +877,8 @@ final: { _refCount = 1; s = stream; - re = matcher.re.withCode(re.ir[lo .. hi]).withNGroup(nGroup); + auto code = matcher.re.ir[lo .. 
hi]; + re = matcher.re.withCode(code).withNGroup(nGroup); threadSize = matcher.threadSize; merge = matcher.merge; freelist = matcher.freelist; @@ -904,10 +906,15 @@ final: return m; } - override void dupTo(void[] memory) + override void dupTo(Matcher!Char engine, void[] memory) { - auto tmp = this; // bit-blit - tmp.initExternalMemory(memory); + auto thompson = cast(ThompsonMatcher)engine; + thompson.s = s; + thompson.front = front; + thompson.index = index; + thompson.matched = matched; + thompson.exhausted = exhausted; + thompson.initExternalMemory(memory); } override int match(Group!DataIndex[] matches) diff --git a/std/regex/package.d b/std/regex/package.d index 7993a8a1d..0d28ef333 100644 --- a/std/regex/package.d +++ b/std/regex/package.d @@ -438,7 +438,7 @@ template ctRegexImpl(alias pattern, string flags=[]) static immutable staticRe = cast(immutable)r.withFactory(new CtfeFactory!(CtMatcher, Char, func)); struct Wrapper { - @property auto getRe(){ return staticRe; } + @property auto getRe() const { return staticRe; } alias getRe this; } enum wrapper = Wrapper(); @@ -702,7 +702,6 @@ public: if (isSomeString!R) { private: - import core.stdc.stdlib : malloc, free; alias Char = BasicElementOf!R; Matcher!Char _engine; const MatcherFactory!Char _factory; @@ -713,9 +712,10 @@ private: { import std.exception : enforce; _input = input; - assert(prog.factory !is null, "malformed regex - missing matcher factory"); - _factory = prog.factory; - _engine = prog.factory.create(prog, input); + if (prog.factory is null) _factory = defaultFactory!Char(prog); + else _factory = prog.factory; + _engine = _factory.create(prog, input); + assert(_engine.refCount == 1); _captures = Captures!R(this); _captures._nMatch = _engine.match(_captures.matches); } @@ -723,12 +723,12 @@ private: public: this(this) { - _factory.incRef(_engine); + if (_engine) _factory.incRef(_engine); } ~this() { - _factory.decRef(_engine); + if (_engine) _factory.decRef(_engine); } ///Shorthands for front.pre, 
front.post, front.hit. @@ -773,9 +773,10 @@ public: // CoW - if refCount is not 1, we are aliased by somebody else if (_engine.refCount != 1) { - // we abandon this reference & create a new engine - _factory.decRef(_engine); - _engine = _factory.dup(_engine, _input); + // we create a new engine & abandon this reference + auto old = _engine; + _engine = _factory.dup(old, _input); + _factory.decRef(old); } if (!_captures.unique) { @@ -798,13 +799,13 @@ public: @property auto captures() inout { return _captures; } } -private @trusted auto matchOnce(RegEx, R)(R input, RegEx re) +private @trusted auto matchOnce(RegEx, R)(R input, const RegEx prog) { alias Char = BasicElementOf!R; - assert(re.factory !is null, "malformed regex - missing matcher factory"); - auto engine = re.factory.create(re, input); - scope(exit) re.factory.decRef(engine); // destroys the engine - auto captures = Captures!R(input, re.ngroup, re.dict); + auto factory = prog.factory is null ? defaultFactory!Char(prog) : prog.factory; + auto engine = prog.factory.create(prog, input); + scope(exit) prog.factory.decRef(engine); // destroys the engine + auto captures = Captures!R(input, prog.ngroup, prog.dict); captures._nMatch = engine.match(captures.matches); return captures; } diff --git a/std/uni.d b/std/uni.d index a0b39ff0d..be5a1b066 100644 --- a/std/uni.d +++ b/std/uni.d @@ -2181,9 +2181,15 @@ pure: assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')])); ----------- */ - @property auto byInterval() const + @property auto byInterval() { - return Intervals!(const(uint)[])(data[]); + return Intervals!(typeof(data))(data); + } + + package @property const(CodepointInterval)[] intervals() const + { + import std.array : array; + return Intervals!(typeof(data[]))(data[]).array; } /** @@ -2621,52 +2627,9 @@ public: assert((set & set.inverted).empty); } - /** - Generates string with D source code of unary function with name of - $(D funcName) taking a single $(D dchar) argument. 
If $(D funcName) is empty - the code is adjusted to be a lambda function. - - The function generated tests if the $(CODEPOINT) passed - belongs to this set or not. The result is to be used with string mixin. - The intended usage area is aggressive optimization via meta programming - in parser generators and the like. - - Note: Use with care for relatively small or regular sets. It - could end up being slower then just using multi-staged tables. - - Example: - --- - import std.stdio; - - // construct set directly from [a, b$RPAREN intervals - auto set = CodepointSet(10, 12, 45, 65, 100, 200); - writeln(set); - writeln(set.toSourceCode("func")); - --- - - The above outputs something along the lines of: - --- - bool func(dchar ch) @safe pure nothrow @nogc - { - if (ch < 45) - { - if (ch == 10 || ch == 11) return true; - return false; - } - else if (ch < 65) return true; - else - { - if (ch < 100) return false; - if (ch < 200) return true; - return false; - } - } - --- - */ - string toSourceCode(string funcName="") const + package static string toSourceCode(const(CodepointInterval)[] range, string funcName) { import std.algorithm.searching : countUntil; - import std.array : array; import std.format : format; enum maxBinary = 3; static string linearScope(R)(R ivals, string indent) @@ -2748,7 +2711,6 @@ public: string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n", funcName.empty ? "function" : funcName); - auto range = byInterval.array(); // special case first bisection to be on ASCII vs beyond auto tillAscii = countUntil!"a[0] > 0x80"(range); if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0) @@ -2758,6 +2720,55 @@ public: return code; } + /** + Generates string with D source code of unary function with name of + $(D funcName) taking a single $(D dchar) argument. If $(D funcName) is empty + the code is adjusted to be a lambda function. + + The function generated tests if the $(CODEPOINT) passed + belongs to this set or not. 
The result is to be used with string mixin. + The intended usage area is aggressive optimization via meta programming + in parser generators and the like. + + Note: Use with care for relatively small or regular sets. It + could end up being slower then just using multi-staged tables. + + Example: + --- + import std.stdio; + + // construct set directly from [a, b$RPAREN intervals + auto set = CodepointSet(10, 12, 45, 65, 100, 200); + writeln(set); + writeln(set.toSourceCode("func")); + --- + + The above outputs something along the lines of: + --- + bool func(dchar ch) @safe pure nothrow @nogc + { + if (ch < 45) + { + if (ch == 10 || ch == 11) return true; + return false; + } + else if (ch < 65) return true; + else + { + if (ch < 100) return false; + if (ch < 200) return true; + return false; + } + } + --- + */ + string toSourceCode(string funcName="") + { + import std.array : array; + auto range = byInterval.array(); + return toSourceCode(range, funcName); + } + /** True if this set doesn't contain any $(CODEPOINTS). 
*/ From a877469f07819fa26cd12248f11fd59cbea6563a Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Tue, 5 Sep 2017 10:31:38 +0300 Subject: [PATCH 05/15] Fix issue 13532 - std.regex performance (enums; regex vs ctRegex) --- std/regex/internal/tests.d | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/std/regex/internal/tests.d b/std/regex/internal/tests.d index 0d4a65e74..378a0aff3 100644 --- a/std/regex/internal/tests.d +++ b/std/regex/internal/tests.d @@ -996,6 +996,33 @@ alias Sequence(int B, int E) = staticIota!(B, E); assertThrown(regex(`^((x)(?=\1))`)); } +// bugzilla 13532 +@safe unittest +{ + import std.datetime.stopwatch : benchmark; + import std.math : abs; + void enumRegex() + { + enum re = ctRegex!`[0-9][0-9]`; + foreach (_; 0..100) + { + auto r = re; + assert(r.charsets.length == 1); + } + } + void staticRegex() + { + immutable static re = ctRegex!`[0-9][0-9]`; + foreach (_; 0..100) + { + assert(re.charsets.length == 1); + } + } + auto bench = benchmark!(enumRegex, staticRegex)(100); + auto ratio = 1.0 * bench[0].total!"usecs" / bench[1].total!"usecs"; + assert(abs(ratio - 1.0) < 0.10, "enum regex vs static regex diff is > 10%"); +} + // bugzilla 14504 @safe unittest { From 8bfa66c50a9c41a533401c46f11308185406ba77 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Tue, 5 Sep 2017 12:30:53 +0300 Subject: [PATCH 06/15] Fix CI complaints --- std/regex/internal/backtracking.d | 2 +- std/regex/internal/ir.d | 16 ++++++++-------- std/regex/internal/parser.d | 2 +- std/regex/internal/tests.d | 13 ++++++++----- std/regex/internal/thompson.d | 4 ++-- std/regex/package.d | 10 +++++----- std/uni.d | 6 +++--- 7 files changed, 28 insertions(+), 25 deletions(-) diff --git a/std/regex/internal/backtracking.d b/std/regex/internal/backtracking.d index e44b9ec99..5b5b854f2 100644 --- a/std/regex/internal/backtracking.d +++ b/std/regex/internal/backtracking.d @@ -166,7 +166,7 @@ template BacktrackingMatcher(bool CTregex) override void 
dupTo(Matcher!Char m, void[] memBlock) { - auto backtracking = cast(BacktrackingMatcher)m; + auto backtracking = cast(BacktrackingMatcher) m; backtracking.s = s; backtracking.front = front; backtracking.index = index; diff --git a/std/regex/internal/ir.d b/std/regex/internal/ir.d index fd509ca35..01690def8 100644 --- a/std/regex/internal/ir.d +++ b/std/regex/internal/ir.d @@ -200,7 +200,7 @@ bool isAtomIR(IR i) IR pairedIR(IR i) { assert(isStartIR(i) || isEndIR(i)); - return cast(IR)(i ^ 0b11); + return cast(IR) (i ^ 0b11); } //encoded IR instruction @@ -439,7 +439,7 @@ interface MatcherFactory(Char) // Only memory management, no compile-time vs run-time specialities abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char { - import core.stdc.stdlib; + import core.stdc.stdlib : malloc, free; enum classSize = __traits(classInstanceSize, EngineType!Char); Matcher!Char construct(const Regex!Char re, in Char[] input, void[] memory) const; @@ -451,7 +451,7 @@ abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char scope(failure) free(memory.ptr); auto engine = construct(re, input, memory); assert(engine.refCount == 1); - assert(cast(void*)engine == memory.ptr); + assert(cast(void*) engine == memory.ptr); return engine; } @@ -475,7 +475,7 @@ abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char { assert(m.refCount != 0); auto cnt = --m.refCount; - if (cnt == 0) free(cast(void*)m); + if (cnt == 0) free(cast(void*) m); return cnt; } } @@ -619,28 +619,28 @@ package(std.regex): const(Regex) withFactory(MatcherFactory!Char factory) pure const @trusted { - auto r = cast()this; + auto r = cast() this; r.factory = factory; return r; } const(Regex) withFlags(uint newFlags) pure const @trusted { - auto r = cast()this; + auto r = cast() this; r.flags = newFlags; return r; } const(Regex) withCode(const(Bytecode)[] code) pure const @trusted { - auto r = cast()this; + auto r = cast() this; r.ir = code.dup; // TODO: sidestep 
const instead? return r; } const(Regex) withNGroup(uint nGroup) pure const @trusted { - auto r = cast()this; + auto r = cast() this; r.ngroup = nGroup; return r; } diff --git a/std/regex/internal/parser.d b/std/regex/internal/parser.d index a5c88cc81..91a5c76be 100644 --- a/std/regex/internal/parser.d +++ b/std/regex/internal/parser.d @@ -30,7 +30,7 @@ auto makeRegex(S, CG)(Parser!(S, CG) p) backrefed = g.backrefed; re.postprocess(); // check if we have backreferences, if so - use backtracking - if(__ctfe) factory = null; // allows us to use the awful enum re = regex(...); + if (__ctfe) factory = null; // allows us to use the awful enum re = regex(...); else if (re.backrefed.canFind!"a != 0") factory = new RuntimeFactory!(BacktrackingMatcher!false, Char); diff --git a/std/regex/internal/tests.d b/std/regex/internal/tests.d index 378a0aff3..acb05de03 100644 --- a/std/regex/internal/tests.d +++ b/std/regex/internal/tests.d @@ -1001,26 +1001,29 @@ alias Sequence(int B, int E) = staticIota!(B, E); { import std.datetime.stopwatch : benchmark; import std.math : abs; + import std.conv : to; void enumRegex() { enum re = ctRegex!`[0-9][0-9]`; - foreach (_; 0..100) + foreach (_; 0 .. 100) { - auto r = re; - assert(r.charsets.length == 1); + asm @trusted { nop; } + assert(re.charsets.length == 1); } } void staticRegex() { immutable static re = ctRegex!`[0-9][0-9]`; - foreach (_; 0..100) + foreach (_; 0 .. 
100) { + asm @trusted { nop; } assert(re.charsets.length == 1); } } auto bench = benchmark!(enumRegex, staticRegex)(100); auto ratio = 1.0 * bench[0].total!"usecs" / bench[1].total!"usecs"; - assert(abs(ratio - 1.0) < 0.10, "enum regex vs static regex diff is > 10%"); + assert(abs(ratio - 1.0) < 0.33, + "enum regex to static regex ratio "~to!string(ratio)); } // bugzilla 14504 diff --git a/std/regex/internal/thompson.d b/std/regex/internal/thompson.d index b26eb5f9e..50b558f7c 100644 --- a/std/regex/internal/thompson.d +++ b/std/regex/internal/thompson.d @@ -785,7 +785,7 @@ final: @property bool atEnd(){ return index == s.lastIndex && s.atEnd; } override @property ref size_t refCount() @safe { return _refCount; } - + override @property ref const(Regex!Char) pattern() @safe { return re; } bool next() @@ -908,7 +908,7 @@ final: override void dupTo(Matcher!Char engine, void[] memory) { - auto thompson = cast(ThompsonMatcher)engine; + auto thompson = cast(ThompsonMatcher) engine; thompson.s = s; thompson.front = front; thompson.index = index; diff --git a/std/regex/package.d b/std/regex/package.d index 0d28ef333..40bcd7361 100644 --- a/std/regex/package.d +++ b/std/regex/package.d @@ -426,7 +426,7 @@ if (isSomeString!(S)) template ctRegexImpl(alias pattern, string flags=[]) { import std.regex.internal.backtracking, std.regex.internal.parser; - static immutable r = cast(immutable)regex(pattern, flags); + static immutable r = cast(immutable) regex(pattern, flags); alias Char = BasicElementOf!(typeof(pattern)); enum source = ctGenRegExCode(r); alias CtMatcher = BacktrackingMatcher!(true); @@ -435,10 +435,10 @@ template ctRegexImpl(alias pattern, string flags=[]) debug(std_regex_ctr) pragma(msg, source); mixin(source); } - static immutable staticRe = cast(immutable)r.withFactory(new CtfeFactory!(CtMatcher, Char, func)); + static immutable staticRe = cast(immutable) r.withFactory(new CtfeFactory!(CtMatcher, Char, func)); struct Wrapper { - @property auto getRe() const { 
return staticRe; } + @property ref getRe() const { return staticRe; } alias getRe this; } enum wrapper = Wrapper(); @@ -803,8 +803,8 @@ private @trusted auto matchOnce(RegEx, R)(R input, const RegEx prog) { alias Char = BasicElementOf!R; auto factory = prog.factory is null ? defaultFactory!Char(prog) : prog.factory; - auto engine = prog.factory.create(prog, input); - scope(exit) prog.factory.decRef(engine); // destroys the engine + auto engine = factory.create(prog, input); + scope(exit) factory.decRef(engine); // destroys the engine auto captures = Captures!R(input, prog.ngroup, prog.dict); captures._nMatch = engine.match(captures.matches); return captures; diff --git a/std/uni.d b/std/uni.d index be5a1b066..2beb09a92 100644 --- a/std/uni.d +++ b/std/uni.d @@ -2815,7 +2815,7 @@ private: //may break sorted property - but we need std.sort to access it //hence package protection attribute - static if(hasAssignableElements!Range) + static if (hasAssignableElements!Range) package @property void front(CodepointInterval val) { slice[start] = val.a; @@ -2830,7 +2830,7 @@ private: } //ditto about package - static if(hasAssignableElements!Range) + static if (hasAssignableElements!Range) package @property void back(CodepointInterval val) { slice[end-2] = val.a; @@ -2855,7 +2855,7 @@ private: } //ditto about package - static if(hasAssignableElements!Range) + static if (hasAssignableElements!Range) package void opIndexAssign(CodepointInterval val, size_t idx) { slice[start+idx*2] = val.a; From eaa62a83db6e40e501337d839987d6b03eed2123 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Wed, 6 Sep 2017 11:02:00 +0300 Subject: [PATCH 07/15] Fix latent bug due to emplacing over uninitialized memory --- std/regex/internal/backtracking.d | 1 + std/regex/internal/thompson.d | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/std/regex/internal/backtracking.d b/std/regex/internal/backtracking.d index 5b5b854f2..cee9e462a 100644 --- a/std/regex/internal/backtracking.d +++ 
b/std/regex/internal/backtracking.d @@ -321,6 +321,7 @@ template BacktrackingMatcher(bool CTregex) pc = 0; counter = 0; lastState = 0; + infiniteNesting = 0; matches[] = Group!DataIndex.init; auto start = s._index; debug(std_regex_matcher) diff --git a/std/regex/internal/thompson.d b/std/regex/internal/thompson.d index 50b558f7c..1332ed6e2 100644 --- a/std/regex/internal/thompson.d +++ b/std/regex/internal/thompson.d @@ -849,7 +849,13 @@ final: this()(const Regex!Char program, Stream stream, void[] memory) { + // We are emplace'd to malloced memory w/o blitting T.init over it\ + // make sure we initialize all fields explicitly _refCount = 1; + subCounters = null; + backrefed = null; + exhausted = false; + matched = 0; re = program; s = stream; initExternalMemory(memory); From dc9b60c1f314023c7c24376ccae8d35f38ecd684 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Wed, 6 Sep 2017 12:29:37 +0300 Subject: [PATCH 08/15] Trying to narrow down flakiness of timing --- std/regex/internal/tests.d | 39 +++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/std/regex/internal/tests.d b/std/regex/internal/tests.d index acb05de03..119ab1ae1 100644 --- a/std/regex/internal/tests.d +++ b/std/regex/internal/tests.d @@ -999,29 +999,38 @@ alias Sequence(int B, int E) = staticIota!(B, E); // bugzilla 13532 @safe unittest { - import std.datetime.stopwatch : benchmark; + import std.datetime.stopwatch : StopWatch, AutoStart; import std.math : abs; import std.conv : to; - void enumRegex() + bool enumRegex() { enum re = ctRegex!`[0-9][0-9]`; - foreach (_; 0 .. 100) - { - asm @trusted { nop; } - assert(re.charsets.length == 1); - } + asm @trusted { nop; } + return re.charsets.length == 1; } - void staticRegex() + bool staticRegex() { immutable static re = ctRegex!`[0-9][0-9]`; - foreach (_; 0 .. 
100) - { - asm @trusted { nop; } - assert(re.charsets.length == 1); - } + asm @trusted { nop; } + return re.charsets.length == 1; } - auto bench = benchmark!(enumRegex, staticRegex)(100); - auto ratio = 1.0 * bench[0].total!"usecs" / bench[1].total!"usecs"; + immutable iterations = 1000_000; + bool result1 = true, result2 = true; + auto sw = StopWatch(AutoStart.yes); + foreach (_; 0 .. iterations) + { + result1 &= staticRegex(); + } + const staticTime = sw.peek(); + sw.reset(); + foreach (_; 0 .. iterations) + { + result2 &= enumRegex(); + } + const enumTime = sw.peek(); + assert(result1); + assert(result2); + auto ratio = 1.0 * enumTime.total!"usecs" / staticTime.total!"usecs"; assert(abs(ratio - 1.0) < 0.33, "enum regex to static regex ratio "~to!string(ratio)); } From ee1f69a570d89b8cd1a025dcb387fb8f75516262 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Fri, 22 Sep 2017 08:19:44 -0700 Subject: [PATCH 09/15] Initialize subCounters --- std/regex/internal/thompson.d | 3 +++ 1 file changed, 3 insertions(+) diff --git a/std/regex/internal/thompson.d b/std/regex/internal/thompson.d index 1332ed6e2..fc36b51ac 100644 --- a/std/regex/internal/thompson.d +++ b/std/regex/internal/thompson.d @@ -865,6 +865,7 @@ final: this(ThompsonMatcher matcher, size_t lo, size_t hi, uint nGroup, Stream stream) { _refCount = 1; + subCounters = null; s = stream; auto code = matcher.re.ir[lo .. hi]; re = matcher.re.withCode(code).withNGroup(nGroup); @@ -882,6 +883,7 @@ final: this(BackMatcher matcher, size_t lo, size_t hi, uint nGroup, Stream stream) { _refCount = 1; + subCounters = null; s = stream; auto code = matcher.re.ir[lo .. 
hi]; re = matcher.re.withCode(code).withNGroup(nGroup); @@ -916,6 +918,7 @@ final: { auto thompson = cast(ThompsonMatcher) engine; thompson.s = s; + thompson.subCounters = null; thompson.front = front; thompson.index = index; thompson.matched = matched; From 09491f96e6154ceeddc8b2b62fa08ea7b7257793 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Fri, 22 Sep 2017 09:00:59 -0700 Subject: [PATCH 10/15] Another try to fix ratio of static/enum --- std/regex/internal/tests.d | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/std/regex/internal/tests.d b/std/regex/internal/tests.d index 119ab1ae1..7ee3897bf 100644 --- a/std/regex/internal/tests.d +++ b/std/regex/internal/tests.d @@ -1002,34 +1002,23 @@ alias Sequence(int B, int E) = staticIota!(B, E); import std.datetime.stopwatch : StopWatch, AutoStart; import std.math : abs; import std.conv : to; - bool enumRegex() - { - enum re = ctRegex!`[0-9][0-9]`; - asm @trusted { nop; } - return re.charsets.length == 1; - } - bool staticRegex() - { - immutable static re = ctRegex!`[0-9][0-9]`; - asm @trusted { nop; } - return re.charsets.length == 1; - } + enum re1 = ctRegex!`[0-9][0-9]`; + immutable static re2 = ctRegex!`[0-9][0-9]`; immutable iterations = 1000_000; - bool result1 = true, result2 = true; + size_t result1 = 0, result2 = 0; auto sw = StopWatch(AutoStart.yes); foreach (_; 0 .. iterations) { - result1 &= staticRegex(); + result1 += matchFirst("12345678", re1).length; } const staticTime = sw.peek(); sw.reset(); foreach (_; 0 .. 
iterations) { - result2 &= enumRegex(); + result2 += matchFirst("12345678", re2).length; } const enumTime = sw.peek(); - assert(result1); - assert(result2); + assert(result1 == result2); auto ratio = 1.0 * enumTime.total!"usecs" / staticTime.total!"usecs"; assert(abs(ratio - 1.0) < 0.33, "enum regex to static regex ratio "~to!string(ratio)); From c49ea4f8c7dd686c504242762368a0a0df0d9db9 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Mon, 25 Sep 2017 10:31:33 +0300 Subject: [PATCH 11/15] Another attempt to even the odds of enum/static speed --- std/regex/internal/tests.d | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/std/regex/internal/tests.d b/std/regex/internal/tests.d index 7ee3897bf..618f50982 100644 --- a/std/regex/internal/tests.d +++ b/std/regex/internal/tests.d @@ -1020,7 +1020,8 @@ alias Sequence(int B, int E) = staticIota!(B, E); const enumTime = sw.peek(); assert(result1 == result2); auto ratio = 1.0 * enumTime.total!"usecs" / staticTime.total!"usecs"; - assert(abs(ratio - 1.0) < 0.33, + // enum is faster or the diff is less < 30% + assert(ratio < 1.0 || abs(ratio - 1.0) < 0.3, "enum regex to static regex ratio "~to!string(ratio)); } From 7bf26afced82fee1a4d7903e75ccd57af69e1544 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Mon, 25 Sep 2017 11:12:01 +0300 Subject: [PATCH 12/15] fix broken Jenkins CI --- std/regex/package.d | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/std/regex/package.d b/std/regex/package.d index 40bcd7361..a5764a08c 100644 --- a/std/regex/package.d +++ b/std/regex/package.d @@ -438,7 +438,9 @@ template ctRegexImpl(alias pattern, string flags=[]) static immutable staticRe = cast(immutable) r.withFactory(new CtfeFactory!(CtMatcher, Char, func)); struct Wrapper { - @property ref getRe() const { return staticRe; } + // allow code that expects mutable Regex to still work + // we stay "logically const" + @trusted @property auto getRe() const { return cast() staticRe; } alias getRe 
this; } enum wrapper = Wrapper(); From 41c229647d994917f31dd1e06abedde5a829ae87 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Thu, 5 Oct 2017 08:08:15 -0700 Subject: [PATCH 13/15] Need to propagate subCounters Disable "benchmark" in unittest, it's too volatile with different compiler flags Also use GC.addRange/GC.removeRange --- std/regex/internal/ir.d | 10 +++++++++- std/regex/internal/parser.d | 3 +-- std/regex/internal/tests.d | 5 +++-- std/regex/internal/thompson.d | 4 ++-- std/regex/package.d | 2 +- 5 files changed, 16 insertions(+), 8 deletions(-) diff --git a/std/regex/internal/ir.d b/std/regex/internal/ir.d index 01690def8..3831b85e9 100644 --- a/std/regex/internal/ir.d +++ b/std/regex/internal/ir.d @@ -440,6 +440,7 @@ interface MatcherFactory(Char) abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char { import core.stdc.stdlib : malloc, free; + import core.memory : GC; enum classSize = __traits(classInstanceSize, EngineType!Char); Matcher!Char construct(const Regex!Char re, in Char[] input, void[] memory) const; @@ -449,6 +450,7 @@ abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char immutable size = EngineType!Char.initialMemory(re) + classSize; auto memory = enforce(malloc(size), "malloc failed")[0 .. size]; scope(failure) free(memory.ptr); + GC.addRange(memory.ptr, classSize); auto engine = construct(re, input, memory); assert(engine.refCount == 1); assert(cast(void*) engine == memory.ptr); @@ -461,6 +463,7 @@ abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char auto memory = enforce(malloc(size), "malloc failed")[0 .. size]; scope(failure) free(memory.ptr); auto copy = construct(engine.pattern, input, memory); + GC.addRange(memory.ptr, classSize); engine.dupTo(copy, memory[classSize .. 
size]); assert(copy.refCount == 1); return copy; @@ -475,7 +478,12 @@ abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char { assert(m.refCount != 0); auto cnt = --m.refCount; - if (cnt == 0) free(cast(void*) m); + if (cnt == 0) + { + void* ptr = cast(void*) m; + GC.removeRange(ptr); + free(ptr); + } return cnt; } } diff --git a/std/regex/internal/parser.d b/std/regex/internal/parser.d index 91a5c76be..f2a2ac2e1 100644 --- a/std/regex/internal/parser.d +++ b/std/regex/internal/parser.d @@ -31,8 +31,7 @@ auto makeRegex(S, CG)(Parser!(S, CG) p) re.postprocess(); // check if we have backreferences, if so - use backtracking if (__ctfe) factory = null; // allows us to use the awful enum re = regex(...); - else - if (re.backrefed.canFind!"a != 0") + else if (re.backrefed.canFind!"a != 0") factory = new RuntimeFactory!(BacktrackingMatcher!false, Char); else factory = new RuntimeFactory!(ThompsonMatcher, Char); diff --git a/std/regex/internal/tests.d b/std/regex/internal/tests.d index 618f50982..37f099d15 100644 --- a/std/regex/internal/tests.d +++ b/std/regex/internal/tests.d @@ -997,6 +997,7 @@ alias Sequence(int B, int E) = staticIota!(B, E); } // bugzilla 13532 +version(none) // TODO: revist once we have proper benchmark framework @safe unittest { import std.datetime.stopwatch : StopWatch, AutoStart; @@ -1004,7 +1005,7 @@ alias Sequence(int B, int E) = staticIota!(B, E); import std.conv : to; enum re1 = ctRegex!`[0-9][0-9]`; immutable static re2 = ctRegex!`[0-9][0-9]`; - immutable iterations = 1000_000; + immutable iterations = 1_000_000; size_t result1 = 0, result2 = 0; auto sw = StopWatch(AutoStart.yes); foreach (_; 0 .. 
iterations) @@ -1021,7 +1022,7 @@ alias Sequence(int B, int E) = staticIota!(B, E); assert(result1 == result2); auto ratio = 1.0 * enumTime.total!"usecs" / staticTime.total!"usecs"; // enum is faster or the diff is less < 30% - assert(ratio < 1.0 || abs(ratio - 1.0) < 0.3, + assert(ratio < 1.0 || abs(ratio - 1.0) < 0.75, "enum regex to static regex ratio "~to!string(ratio)); } diff --git a/std/regex/internal/thompson.d b/std/regex/internal/thompson.d index fc36b51ac..5879639e4 100644 --- a/std/regex/internal/thompson.d +++ b/std/regex/internal/thompson.d @@ -865,7 +865,7 @@ final: this(ThompsonMatcher matcher, size_t lo, size_t hi, uint nGroup, Stream stream) { _refCount = 1; - subCounters = null; + subCounters = matcher.subCounters; s = stream; auto code = matcher.re.ir[lo .. hi]; re = matcher.re.withCode(code).withNGroup(nGroup); @@ -883,7 +883,7 @@ final: this(BackMatcher matcher, size_t lo, size_t hi, uint nGroup, Stream stream) { _refCount = 1; - subCounters = null; + subCounters = matcher.subCounters; s = stream; auto code = matcher.re.ir[lo .. 
hi]; re = matcher.re.withCode(code).withNGroup(nGroup); diff --git a/std/regex/package.d b/std/regex/package.d index a5764a08c..7b4769a95 100644 --- a/std/regex/package.d +++ b/std/regex/package.d @@ -1468,7 +1468,7 @@ private: @trusted this(Range input, RegEx separator) {//@@@BUG@@@ generated opAssign of RegexMatch is not @trusted _input = input; - auto re = separator.withFlags(separator.flags | RegexOption.global); + const re = separator.withFlags(separator.flags | RegexOption.global); if (_input.empty) { //there is nothing to match at all, make _offset > 0 From 13ea5f9aeae1092fc2a03203f6cc3972b972cf70 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Wed, 11 Oct 2017 17:48:04 +0300 Subject: [PATCH 14/15] Drop another level of templatizations in backtracking --- std/regex/internal/backtracking.d | 1385 ++++++++++++++--------------- std/regex/internal/ir.d | 2 +- std/regex/internal/parser.d | 2 +- std/regex/package.d | 6 +- 4 files changed, 694 insertions(+), 701 deletions(-) diff --git a/std/regex/internal/backtracking.d b/std/regex/internal/backtracking.d index cee9e462a..7829dc9fc 100644 --- a/std/regex/internal/backtracking.d +++ b/std/regex/internal/backtracking.d @@ -13,775 +13,768 @@ import std.regex.internal.ir; BacktrackingMatcher implements backtracking scheme of matching regular expressions. 
+/ -template BacktrackingMatcher(bool CTregex) +@trusted class BacktrackingMatcher(Char, Stream = Input!Char) : Matcher!Char + if (is(Char : dchar)) { - @trusted class BacktrackingMatcher(Char, Stream = Input!Char) : Matcher!Char - if (is(Char : dchar)) - { - alias DataIndex = Stream.DataIndex; - struct State - {//top bit in pc is set if saved along with matches - DataIndex index; - uint pc, counter, infiniteNesting; - } - static assert(State.sizeof % size_t.sizeof == 0); - enum stateSize = State.sizeof / size_t.sizeof; - enum initialStack = 1 << 11; // items in a block of segmented stack - alias String = const(Char)[]; - alias RegEx = Regex!Char; - alias MatchFn = bool function (ref BacktrackingMatcher); - const RegEx re; // regex program - MatchFn nativeFn; // native code for that program - // Stream state - Stream s; + alias DataIndex = Stream.DataIndex; + struct State + {//top bit in pc is set if saved along with matches DataIndex index; - dchar front; - bool exhausted; - // Backtracking machine state - uint pc, counter; - DataIndex lastState = 0; // Top of state stack - uint infiniteNesting; - size_t[] memory; - Trace[] merge; - static struct Trace - { - ulong mask; - size_t offset; + uint pc, counter, infiniteNesting; + } + static assert(State.sizeof % size_t.sizeof == 0); + enum stateSize = State.sizeof / size_t.sizeof; + enum initialStack = 1 << 11; // items in a block of segmented stack + alias String = const(Char)[]; + alias RegEx = Regex!Char; + alias MatchFn = bool function (BacktrackingMatcher); + const RegEx re; // regex program + MatchFn nativeFn; // native code for that program + // Stream state + Stream s; + DataIndex index; + dchar front; + bool exhausted; + // Backtracking machine state + uint pc, counter; + DataIndex lastState = 0; // Top of state stack + uint infiniteNesting; + size_t[] memory; + Trace[] merge; + static struct Trace + { + ulong mask; + size_t offset; - bool mark(size_t idx) + bool mark(size_t idx) + { + immutable d = idx - 
offset; + if (d < 64) // including overflow { - immutable d = idx - offset; - if (d < 64) // including overflow - { - immutable p = mask & (1UL << d); - mask |= 1UL << d; - return p != 0; - } - else - { - offset = idx; - mask = 1; - return false; - } + immutable p = mask & (1UL << d); + mask |= 1UL << d; + return p != 0; + } + else + { + offset = idx; + mask = 1; + return false; } } - //local slice of matches, global for backref - Group!DataIndex[] matches, backrefed; - size_t _refCount; - final: + } + //local slice of matches, global for backref + Group!DataIndex[] matches, backrefed; + size_t _refCount; +final: - override @property ref size_t refCount() { return _refCount; } - override @property ref const(RegEx) pattern(){ return re; } + override @property ref size_t refCount() { return _refCount; } + override @property ref const(RegEx) pattern(){ return re; } - static if (__traits(hasMember,Stream, "search")) + static if (__traits(hasMember,Stream, "search")) + { + enum kicked = true; + } + else + enum kicked = false; + + static size_t initialMemory(const ref RegEx re) + { + return stackSize(re)*size_t.sizeof + re.hotspotTableSize*Trace.sizeof; + } + + static size_t stackSize(const ref RegEx re) + { + size_t itemSize = stateSize + + re.ngroup * (Group!DataIndex).sizeof / size_t.sizeof; + return initialStack * itemSize + 2; + } + + @property bool atStart(){ return index == 0; } + + @property bool atEnd(){ return index == s.lastIndex && s.atEnd; } + + void next() + { + if (!s.nextChar(front, index)) + index = s.lastIndex; + } + + void search() + { + static if (kicked) { - enum kicked = true; + if (!s.search(re.kickstart, front, index)) + { + index = s.lastIndex; + } } else - enum kicked = false; + next(); + } - static size_t initialMemory(const ref RegEx re) + // + void newStack() + { + auto chunk = mallocArray!(size_t)(stackSize(re)); + chunk[0] = cast(size_t)(memory.ptr); + chunk[1] = lastState; + memory = chunk[2..$]; + lastState = 0; + } + + bool prevStack() + 
{ + // pointer to previous block + size_t* prev = cast(size_t*) memory.ptr[-2]; + if (!prev) { - return stackSize(re)*size_t.sizeof + re.hotspotTableSize*Trace.sizeof; + // The last segment is freed in RegexMatch + return false; } - - static size_t stackSize(const ref RegEx re) + else { - size_t itemSize = stateSize - + re.ngroup * (Group!DataIndex).sizeof / size_t.sizeof; - return initialStack * itemSize + 2; + import core.stdc.stdlib : free; + // memory used in previous block + size_t size = memory.ptr[-1]; + free(memory.ptr-2); + memory = prev[0 .. size]; + lastState = size; + return true; } + } - @property bool atStart(){ return index == 0; } + void initExternalMemory(void[] memBlock) + { + merge = arrayInChunk!(Trace)(re.hotspotTableSize, memBlock); + merge[] = Trace.init; + memory = cast(size_t[]) memBlock; + memory[0] = 0; // hidden pointer + memory[1] = 0; // used size + memory = memory[2..$]; + } - @property bool atEnd(){ return index == s.lastIndex && s.atEnd; } + void initialize(ref const RegEx program, Stream stream, void[] memBlock) + { + s = stream; + exhausted = false; + initExternalMemory(memBlock); + backrefed = null; + } - void next() - { - if (!s.nextChar(front, index)) - index = s.lastIndex; - } + override void dupTo(Matcher!Char m, void[] memBlock) + { + auto backtracking = cast(BacktrackingMatcher) m; + backtracking.s = s; + backtracking.front = front; + backtracking.index = index; + backtracking.exhausted = exhausted; + backtracking.initExternalMemory(memBlock); + } - void search() - { - static if (kicked) - { - if (!s.search(re.kickstart, front, index)) - { - index = s.lastIndex; - } - } - else + this(ref const RegEx program, Stream stream, void[] memBlock, dchar ch, DataIndex idx) + { + _refCount = 1; + re = program; + nativeFn = null; + initialize(program, stream, memBlock); + front = ch; + index = idx; + } + + this(ref const RegEx program, MatchFn func, Stream stream, void[] memBlock) + { + _refCount = 1; + re = program; + 
initialize(program, stream, memBlock); + nativeFn = func; + next(); + } + + this(ref const RegEx program, Stream stream, void[] memBlock) + { + _refCount = 1; + re = program; + nativeFn = null; + initialize(program, stream, memBlock); + next(); + } + + auto fwdMatcher(ref const RegEx re, void[] memBlock) + { + alias BackMatcher = BacktrackingMatcher!(Char, Stream); + auto fwdMatcher = new BackMatcher(re, s, memBlock, front, index); + return fwdMatcher; + } + + auto bwdMatcher(ref const RegEx re, void[] memBlock) + { + alias BackMatcher = BacktrackingMatcher!(Char, typeof(s.loopBack(index))); + auto fwdMatcher = + new BackMatcher(re, s.loopBack(index), memBlock); + return fwdMatcher; + } + + // + int matchFinalize() + { + immutable start = index; + immutable val = matchImpl(); + if (val) + {//stream is updated here + matches[0].begin = start; + matches[0].end = index; + if (!(re.flags & RegexOption.global) || atEnd) + exhausted = true; + if (start == index)//empty match advances input next(); + return val; } + else + return 0; + } - // - void newStack() + //lookup next match, fill matches with indices into input + override int match(Group!DataIndex[] matches) + { + debug(std_regex_matcher) { - auto chunk = mallocArray!(size_t)(stackSize(re)); - chunk[0] = cast(size_t)(memory.ptr); - chunk[1] = lastState; - memory = chunk[2..$]; - lastState = 0; + writeln("------------------------------------------"); } - - bool prevStack() + if (exhausted) //all matches collected + return false; + this.matches = matches; + if (re.flags & RegexInfo.oneShot) { - // pointer to previous block - size_t* prev = cast(size_t*) memory.ptr[-2]; - if (!prev) + exhausted = true; + const DataIndex start = index; + immutable m = matchImpl(); + if (m) { - // The last segment is freed in RegexMatch - return false; - } - else - { - import core.stdc.stdlib : free; - // memory used in previous block - size_t size = memory.ptr[-1]; - free(memory.ptr-2); - memory = prev[0 .. 
size]; - lastState = size; - return true; - } - } - - void initExternalMemory(void[] memBlock) - { - merge = arrayInChunk!(Trace)(re.hotspotTableSize, memBlock); - merge[] = Trace.init; - memory = cast(size_t[]) memBlock; - memory[0] = 0; // hidden pointer - memory[1] = 0; // used size - memory = memory[2..$]; - } - - void initialize(ref const RegEx program, Stream stream, void[] memBlock) - { - s = stream; - exhausted = false; - initExternalMemory(memBlock); - backrefed = null; - } - - override void dupTo(Matcher!Char m, void[] memBlock) - { - auto backtracking = cast(BacktrackingMatcher) m; - backtracking.s = s; - backtracking.front = front; - backtracking.index = index; - backtracking.exhausted = exhausted; - backtracking.initExternalMemory(memBlock); - } - - this(ref const RegEx program, Stream stream, void[] memBlock, dchar ch, DataIndex idx) - { - _refCount = 1; - re = program; - initialize(program, stream, memBlock); - front = ch; - index = idx; - } - - this(ref const RegEx program, MatchFn func, Stream stream, void[] memBlock) - { - _refCount = 1; - re = program; - initialize(program, stream, memBlock); - nativeFn = func; - next(); - } - - this(ref const RegEx program, Stream stream, void[] memBlock) - { - _refCount = 1; - re = program; - initialize(program, stream, memBlock); - next(); - } - - auto fwdMatcher(ref const RegEx re, void[] memBlock) - { - alias BackMatcherTempl = .BacktrackingMatcher!(CTregex); - alias BackMatcher = BackMatcherTempl!(Char, Stream); - auto fwdMatcher = new BackMatcher(re, s, memBlock, front, index); - return fwdMatcher; - } - - auto bwdMatcher(ref const RegEx re, void[] memBlock) - { - alias BackMatcherTempl = .BacktrackingMatcher!(CTregex); - alias BackMatcher = BackMatcherTempl!(Char, typeof(s.loopBack(index))); - auto fwdMatcher = - new BackMatcher(re, s.loopBack(index), memBlock); - return fwdMatcher; - } - - // - int matchFinalize() - { - immutable start = index; - immutable val = matchImpl(); - if (val) - {//stream is 
updated here matches[0].begin = start; matches[0].end = index; - if (!(re.flags & RegexOption.global) || atEnd) - exhausted = true; - if (start == index)//empty match advances input - next(); - return val; } - else - return 0; + return m; } - - //lookup next match, fill matches with indices into input - override int match(Group!DataIndex[] matches) + static if (kicked) { - debug(std_regex_matcher) + if (!re.kickstart.empty) { - writeln("------------------------------------------"); - } - if (exhausted) //all matches collected - return false; - this.matches = matches; - if (re.flags & RegexInfo.oneShot) - { - exhausted = true; - const DataIndex start = index; - immutable m = matchImpl(); - if (m) - { - matches[0].begin = start; - matches[0].end = index; - } - return m; - } - static if (kicked) - { - if (!re.kickstart.empty) - { - for (;;) - { - immutable val = matchFinalize(); - if (val) - return val; - else - { - if (atEnd) - break; - search(); - if (atEnd) - { - exhausted = true; - return matchFinalize(); - } - } - } - exhausted = true; - return 0; //early return - } - } - //no search available - skip a char at a time - for (;;) - { - immutable val = matchFinalize(); - if (val) - return val; - else - { - if (atEnd) - break; - next(); - if (atEnd) - { - exhausted = true; - return matchFinalize(); - } - } - } - exhausted = true; - return 0; - } - - /+ - match subexpression against input, - results are stored in matches - +/ - int matchImpl() - { - static if (CTregex && is(typeof(nativeFn(this)))) - { - debug(std_regex_ctr) writeln("using C-T matcher"); - return nativeFn(this); - } - else - { - pc = 0; - counter = 0; - lastState = 0; - infiniteNesting = 0; - matches[] = Group!DataIndex.init; - auto start = s._index; - debug(std_regex_matcher) - writeln("Try match starting at ", s[index .. 
s.lastIndex]); for (;;) { - debug(std_regex_matcher) - writefln("PC: %s\tCNT: %s\t%s \tfront: %s src: %s", - pc, counter, disassemble(re.ir, pc, re.dict), - front, s._index); - switch (re.ir[pc].code) + immutable val = matchFinalize(); + if (val) + return val; + else { - case IR.OrChar://assumes IRL!(OrChar) == 1 if (atEnd) - goto L_backtrack; - uint len = re.ir[pc].sequence; - uint end = pc + len; - if (re.ir[pc].data != front && re.ir[pc+1].data != front) + break; + search(); + if (atEnd) { - for (pc = pc+2; pc < end; pc++) - if (re.ir[pc].data == front) - break; - if (pc == end) - goto L_backtrack; + exhausted = true; + return matchFinalize(); } - pc = end; - next(); - break; - case IR.Char: - if (atEnd || front != re.ir[pc].data) - goto L_backtrack; - pc += IRL!(IR.Char); - next(); + } + } + exhausted = true; + return 0; //early return + } + } + //no search available - skip a char at a time + for (;;) + { + immutable val = matchFinalize(); + if (val) + return val; + else + { + if (atEnd) break; - case IR.Any: - if (atEnd) - goto L_backtrack; - pc += IRL!(IR.Any); - next(); - break; - case IR.CodepointSet: - if (atEnd || !re.charsets[re.ir[pc].data].scanFor(front)) - goto L_backtrack; - next(); - pc += IRL!(IR.CodepointSet); - break; - case IR.Trie: - if (atEnd || !re.matchers[re.ir[pc].data][front]) - goto L_backtrack; - next(); - pc += IRL!(IR.Trie); - break; - case IR.Wordboundary: - dchar back; - DataIndex bi; - //at start & end of input - if (atStart && wordMatcher[front]) - { - pc += IRL!(IR.Wordboundary); - break; - } - else if (atEnd && s.loopBack(index).nextChar(back, bi) - && wordMatcher[back]) - { - pc += IRL!(IR.Wordboundary); - break; - } - else if (s.loopBack(index).nextChar(back, bi)) - { - immutable af = wordMatcher[front]; - immutable ab = wordMatcher[back]; - if (af ^ ab) - { - pc += IRL!(IR.Wordboundary); - break; - } - } + next(); + if (atEnd) + { + exhausted = true; + return matchFinalize(); + } + } + } + exhausted = true; + return 0; + } + 
+ /+ + match subexpression against input, + results are stored in matches + +/ + int matchImpl() + { + if (nativeFn) + { + debug(std_regex_ctr) writeln("using C-T matcher"); + return nativeFn(this); + } + else + { + pc = 0; + counter = 0; + lastState = 0; + infiniteNesting = 0; + matches[] = Group!DataIndex.init; + auto start = s._index; + debug(std_regex_matcher) + writeln("Try match starting at ", s[index .. s.lastIndex]); + for (;;) + { + debug(std_regex_matcher) + writefln("PC: %s\tCNT: %s\t%s \tfront: %s src: %s", + pc, counter, disassemble(re.ir, pc, re.dict), + front, s._index); + switch (re.ir[pc].code) + { + case IR.OrChar://assumes IRL!(OrChar) == 1 + if (atEnd) goto L_backtrack; - case IR.Notwordboundary: - dchar back; - DataIndex bi; - //at start & end of input - if (atStart && wordMatcher[front]) + uint len = re.ir[pc].sequence; + uint end = pc + len; + if (re.ir[pc].data != front && re.ir[pc+1].data != front) + { + for (pc = pc+2; pc < end; pc++) + if (re.ir[pc].data == front) + break; + if (pc == end) goto L_backtrack; - else if (atEnd && s.loopBack(index).nextChar(back, bi) - && wordMatcher[back]) - goto L_backtrack; - else if (s.loopBack(index).nextChar(back, bi)) - { - immutable af = wordMatcher[front]; - immutable ab = wordMatcher[back]; - if (af ^ ab) - goto L_backtrack; - } + } + pc = end; + next(); + break; + case IR.Char: + if (atEnd || front != re.ir[pc].data) + goto L_backtrack; + pc += IRL!(IR.Char); + next(); + break; + case IR.Any: + if (atEnd) + goto L_backtrack; + pc += IRL!(IR.Any); + next(); + break; + case IR.CodepointSet: + if (atEnd || !re.charsets[re.ir[pc].data].scanFor(front)) + goto L_backtrack; + next(); + pc += IRL!(IR.CodepointSet); + break; + case IR.Trie: + if (atEnd || !re.matchers[re.ir[pc].data][front]) + goto L_backtrack; + next(); + pc += IRL!(IR.Trie); + break; + case IR.Wordboundary: + dchar back; + DataIndex bi; + //at start & end of input + if (atStart && wordMatcher[front]) + { pc += IRL!(IR.Wordboundary); 
break; - case IR.Bof: - if (atStart) - pc += IRL!(IR.Bol); - else - goto L_backtrack; + } + else if (atEnd && s.loopBack(index).nextChar(back, bi) + && wordMatcher[back]) + { + pc += IRL!(IR.Wordboundary); break; - case IR.Bol: - dchar back; - DataIndex bi; - if (atStart) - pc += IRL!(IR.Bol); - else if (s.loopBack(index).nextChar(back,bi) - && endOfLine(back, front == '\n')) + } + else if (s.loopBack(index).nextChar(back, bi)) + { + immutable af = wordMatcher[front]; + immutable ab = wordMatcher[back]; + if (af ^ ab) { - pc += IRL!(IR.Bol); + pc += IRL!(IR.Wordboundary); + break; } - else + } + goto L_backtrack; + case IR.Notwordboundary: + dchar back; + DataIndex bi; + //at start & end of input + if (atStart && wordMatcher[front]) + goto L_backtrack; + else if (atEnd && s.loopBack(index).nextChar(back, bi) + && wordMatcher[back]) + goto L_backtrack; + else if (s.loopBack(index).nextChar(back, bi)) + { + immutable af = wordMatcher[front]; + immutable ab = wordMatcher[back]; + if (af ^ ab) goto L_backtrack; - break; - case IR.Eof: - if (atEnd) - pc += IRL!(IR.Eol); - else - goto L_backtrack; - break; - case IR.Eol: - dchar back; - DataIndex bi; - debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index .. 
s.lastIndex]); - //no matching inside \r\n - if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back,bi) - && back == '\r'))) - { - pc += IRL!(IR.Eol); - } - else - goto L_backtrack; - break; - case IR.InfiniteStart, IR.InfiniteQStart: - pc += re.ir[pc].data + IRL!(IR.InfiniteStart); - //now pc is at end IR.Infinite(Q)End - uint len = re.ir[pc].data; - if (re.ir[pc].code == IR.InfiniteEnd) - { - pushState(pc+IRL!(IR.InfiniteEnd), counter); - pc -= len; - } - else - { - pushState(pc - len, counter); - pc += IRL!(IR.InfiniteEnd); - } - break; - case IR.InfiniteBloomStart: - pc += re.ir[pc].data + IRL!(IR.InfiniteBloomStart); - //now pc is at end IR.InfiniteBloomEnd - immutable len = re.ir[pc].data; - immutable filterIdx = re.ir[pc+2].raw; - if (re.filters[filterIdx][front]) - pushState(pc+IRL!(IR.InfiniteBloomEnd), counter); + } + pc += IRL!(IR.Wordboundary); + break; + case IR.Bof: + if (atStart) + pc += IRL!(IR.Bol); + else + goto L_backtrack; + break; + case IR.Bol: + dchar back; + DataIndex bi; + if (atStart) + pc += IRL!(IR.Bol); + else if (s.loopBack(index).nextChar(back,bi) + && endOfLine(back, front == '\n')) + { + pc += IRL!(IR.Bol); + } + else + goto L_backtrack; + break; + case IR.Eof: + if (atEnd) + pc += IRL!(IR.Eol); + else + goto L_backtrack; + break; + case IR.Eol: + dchar back; + DataIndex bi; + debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index .. 
s.lastIndex]); + //no matching inside \r\n + if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back,bi) + && back == '\r'))) + { + pc += IRL!(IR.Eol); + } + else + goto L_backtrack; + break; + case IR.InfiniteStart, IR.InfiniteQStart: + pc += re.ir[pc].data + IRL!(IR.InfiniteStart); + //now pc is at end IR.Infinite(Q)End + uint len = re.ir[pc].data; + if (re.ir[pc].code == IR.InfiniteEnd) + { + pushState(pc+IRL!(IR.InfiniteEnd), counter); pc -= len; - break; - case IR.RepeatStart, IR.RepeatQStart: - pc += re.ir[pc].data + IRL!(IR.RepeatStart); - break; - case IR.RepeatEnd: - case IR.RepeatQEnd: - if (merge[re.ir[pc + 1].raw+counter].mark(index)) - { - // merged! - goto L_backtrack; - } - //len, step, min, max - immutable len = re.ir[pc].data; - immutable step = re.ir[pc+2].raw; - immutable min = re.ir[pc+3].raw; - immutable max = re.ir[pc+4].raw; - if (counter < min) + } + else + { + pushState(pc - len, counter); + pc += IRL!(IR.InfiniteEnd); + } + break; + case IR.InfiniteBloomStart: + pc += re.ir[pc].data + IRL!(IR.InfiniteBloomStart); + //now pc is at end IR.InfiniteBloomEnd + immutable len = re.ir[pc].data; + immutable filterIdx = re.ir[pc+2].raw; + if (re.filters[filterIdx][front]) + pushState(pc+IRL!(IR.InfiniteBloomEnd), counter); + pc -= len; + break; + case IR.RepeatStart, IR.RepeatQStart: + pc += re.ir[pc].data + IRL!(IR.RepeatStart); + break; + case IR.RepeatEnd: + case IR.RepeatQEnd: + if (merge[re.ir[pc + 1].raw+counter].mark(index)) + { + // merged! 
+ goto L_backtrack; + } + //len, step, min, max + immutable len = re.ir[pc].data; + immutable step = re.ir[pc+2].raw; + immutable min = re.ir[pc+3].raw; + immutable max = re.ir[pc+4].raw; + if (counter < min) + { + counter += step; + pc -= len; + } + else if (counter < max) + { + if (re.ir[pc].code == IR.RepeatEnd) { + pushState(pc + IRL!(IR.RepeatEnd), counter%step); counter += step; pc -= len; } - else if (counter < max) - { - if (re.ir[pc].code == IR.RepeatEnd) - { - pushState(pc + IRL!(IR.RepeatEnd), counter%step); - counter += step; - pc -= len; - } - else - { - pushState(pc - len, counter + step); - counter = counter%step; - pc += IRL!(IR.RepeatEnd); - } - } else { + pushState(pc - len, counter + step); counter = counter%step; pc += IRL!(IR.RepeatEnd); } - break; - case IR.InfiniteEnd: - case IR.InfiniteQEnd: - debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting); - if (merge[re.ir[pc + 1].raw+counter].mark(index)) - { - // merged! - goto L_backtrack; - } - immutable len = re.ir[pc].data; - if (re.ir[pc].code == IR.InfiniteEnd) - { - pushState(pc + IRL!(IR.InfiniteEnd), counter); - pc -= len; - } - else - { - pushState(pc-len, counter); - pc += IRL!(IR.InfiniteEnd); - } - break; - case IR.InfiniteBloomEnd: - debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting); - if (merge[re.ir[pc + 1].raw+counter].mark(index)) - { - // merged! - goto L_backtrack; - } - immutable len = re.ir[pc].data; - immutable filterIdx = re.ir[pc+2].raw; - if (re.filters[filterIdx][front]) - { - infiniteNesting--; - pushState(pc + IRL!(IR.InfiniteBloomEnd), counter); - infiniteNesting++; - } + } + else + { + counter = counter%step; + pc += IRL!(IR.RepeatEnd); + } + break; + case IR.InfiniteEnd: + case IR.InfiniteQEnd: + debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting); + if (merge[re.ir[pc + 1].raw+counter].mark(index)) + { + // merged! 
+ goto L_backtrack; + } + immutable len = re.ir[pc].data; + if (re.ir[pc].code == IR.InfiniteEnd) + { + pushState(pc + IRL!(IR.InfiniteEnd), counter); pc -= len; - break; - case IR.OrEnd: - if (merge[re.ir[pc + 1].raw+counter].mark(index)) - { - // merged! - goto L_backtrack; - } - pc += IRL!(IR.OrEnd); - break; - case IR.OrStart: - pc += IRL!(IR.OrStart); - goto case; - case IR.Option: - immutable len = re.ir[pc].data; - if (re.ir[pc+len].code == IR.GotoEndOr)//not a last one - { - pushState(pc + len + IRL!(IR.Option), counter); //remember 2nd branch - } - pc += IRL!(IR.Option); - break; - case IR.GotoEndOr: - pc = pc + re.ir[pc].data + IRL!(IR.GotoEndOr); - break; - case IR.GroupStart: - immutable n = re.ir[pc].data; - matches[n].begin = index; - debug(std_regex_matcher) writefln("IR group #%u starts at %u", n, index); - pc += IRL!(IR.GroupStart); - break; - case IR.GroupEnd: - immutable n = re.ir[pc].data; - matches[n].end = index; - debug(std_regex_matcher) writefln("IR group #%u ends at %u", n, index); - pc += IRL!(IR.GroupEnd); - break; - case IR.LookaheadStart: - case IR.NeglookaheadStart: - immutable len = re.ir[pc].data; - auto save = index; - immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw; - auto mem = malloc(initialMemory(re))[0 .. initialMemory(re)]; - scope(exit) free(mem.ptr); - auto slicedRe = re.withCode(re.ir[ - pc+IRL!(IR.LookaheadStart) .. pc+IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd) - ]); - static if (Stream.isLoopback) - { - auto matcher = bwdMatcher(slicedRe, mem); - } - else - { - auto matcher = fwdMatcher(slicedRe, mem); - } - matcher.matches = matches[ms .. me]; - matcher.backrefed = backrefed.empty ? 
matches : backrefed; - immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookaheadStart); - s.reset(save); + } + else + { + pushState(pc-len, counter); + pc += IRL!(IR.InfiniteEnd); + } + break; + case IR.InfiniteBloomEnd: + debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting); + if (merge[re.ir[pc + 1].raw+counter].mark(index)) + { + // merged! + goto L_backtrack; + } + immutable len = re.ir[pc].data; + immutable filterIdx = re.ir[pc+2].raw; + if (re.filters[filterIdx][front]) + { + infiniteNesting--; + pushState(pc + IRL!(IR.InfiniteBloomEnd), counter); + infiniteNesting++; + } + pc -= len; + break; + case IR.OrEnd: + if (merge[re.ir[pc + 1].raw+counter].mark(index)) + { + // merged! + goto L_backtrack; + } + pc += IRL!(IR.OrEnd); + break; + case IR.OrStart: + pc += IRL!(IR.OrStart); + goto case; + case IR.Option: + immutable len = re.ir[pc].data; + if (re.ir[pc+len].code == IR.GotoEndOr)//not a last one + { + pushState(pc + len + IRL!(IR.Option), counter); //remember 2nd branch + } + pc += IRL!(IR.Option); + break; + case IR.GotoEndOr: + pc = pc + re.ir[pc].data + IRL!(IR.GotoEndOr); + break; + case IR.GroupStart: + immutable n = re.ir[pc].data; + matches[n].begin = index; + debug(std_regex_matcher) writefln("IR group #%u starts at %u", n, index); + pc += IRL!(IR.GroupStart); + break; + case IR.GroupEnd: + immutable n = re.ir[pc].data; + matches[n].end = index; + debug(std_regex_matcher) writefln("IR group #%u ends at %u", n, index); + pc += IRL!(IR.GroupEnd); + break; + case IR.LookaheadStart: + case IR.NeglookaheadStart: + immutable len = re.ir[pc].data; + auto save = index; + immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw; + auto mem = malloc(initialMemory(re))[0 .. initialMemory(re)]; + scope(exit) free(mem.ptr); + auto slicedRe = re.withCode(re.ir[ + pc+IRL!(IR.LookaheadStart) .. 
pc+IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd) + ]); + static if (Stream.isLoopback) + { + auto matcher = bwdMatcher(slicedRe, mem); + } + else + { + auto matcher = fwdMatcher(slicedRe, mem); + } + matcher.matches = matches[ms .. me]; + matcher.backrefed = backrefed.empty ? matches : backrefed; + immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookaheadStart); + s.reset(save); + next(); + if (!match) + goto L_backtrack; + else + { + pc += IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd); + } + break; + case IR.LookbehindStart: + case IR.NeglookbehindStart: + immutable len = re.ir[pc].data; + immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw; + auto mem = malloc(initialMemory(re))[0 .. initialMemory(re)]; + scope(exit) free(mem.ptr); + auto slicedRe = re.withCode(re.ir[ + pc + IRL!(IR.LookbehindStart) .. pc + IRL!(IR.LookbehindStart) + len + IRL!(IR.LookbehindEnd) + ]); + static if (Stream.isLoopback) + { + alias Matcher = BacktrackingMatcher!(Char, Stream); + auto matcher = new Matcher(slicedRe, s, mem, front, index); + } + else + { + alias Matcher = BacktrackingMatcher!(Char, typeof(s.loopBack(index))); + auto matcher = new Matcher(slicedRe, s.loopBack(index), mem); + } + matcher.matches = matches[ms .. me]; + matcher.backrefed = backrefed.empty ? matches : backrefed; + immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookbehindStart); + if (!match) + goto L_backtrack; + else + { + pc += IRL!(IR.LookbehindStart)+len+IRL!(IR.LookbehindEnd); + } + break; + case IR.Backref: + immutable n = re.ir[pc].data; + auto referenced = re.ir[pc].localRef + ? s[matches[n].begin .. matches[n].end] + : s[backrefed[n].begin .. 
backrefed[n].end]; + while (!atEnd && !referenced.empty && front == referenced.front) + { next(); - if (!match) - goto L_backtrack; - else - { - pc += IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd); - } - break; - case IR.LookbehindStart: - case IR.NeglookbehindStart: - immutable len = re.ir[pc].data; - immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw; - auto mem = malloc(initialMemory(re))[0 .. initialMemory(re)]; - scope(exit) free(mem.ptr); - auto slicedRe = re.withCode(re.ir[ - pc + IRL!(IR.LookbehindStart) .. pc + IRL!(IR.LookbehindStart) + len + IRL!(IR.LookbehindEnd) - ]); - static if (Stream.isLoopback) - { - alias Matcher = BacktrackingMatcher!(Char, Stream); - auto matcher = new Matcher(slicedRe, s, mem, front, index); - } - else - { - alias Matcher = BacktrackingMatcher!(Char, typeof(s.loopBack(index))); - auto matcher = new Matcher(slicedRe, s.loopBack(index), mem); - } - matcher.matches = matches[ms .. me]; - matcher.backrefed = backrefed.empty ? matches : backrefed; - immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookbehindStart); - if (!match) - goto L_backtrack; - else - { - pc += IRL!(IR.LookbehindStart)+len+IRL!(IR.LookbehindEnd); - } - break; - case IR.Backref: - immutable n = re.ir[pc].data; - auto referenced = re.ir[pc].localRef - ? s[matches[n].begin .. matches[n].end] - : s[backrefed[n].begin .. 
backrefed[n].end]; - while (!atEnd && !referenced.empty && front == referenced.front) - { - next(); - referenced.popFront(); - } - if (referenced.empty) - pc++; - else - goto L_backtrack; - break; - case IR.Nop: - pc += IRL!(IR.Nop); - break; - case IR.LookaheadEnd: - case IR.NeglookaheadEnd: - case IR.LookbehindEnd: - case IR.NeglookbehindEnd: - case IR.End: - // cleanup stale stack blocks if any - while (prevStack()) {} - return re.ir[pc].data; - default: - debug printBytecode(re.ir[0..$]); - assert(0); - L_backtrack: - if (!popState()) - { - s.reset(start); - return 0; - } + referenced.popFront(); + } + if (referenced.empty) + pc++; + else + goto L_backtrack; + break; + case IR.Nop: + pc += IRL!(IR.Nop); + break; + case IR.LookaheadEnd: + case IR.NeglookaheadEnd: + case IR.LookbehindEnd: + case IR.NeglookbehindEnd: + case IR.End: + // cleanup stale stack blocks if any + while (prevStack()) {} + return re.ir[pc].data; + default: + debug printBytecode(re.ir[0..$]); + assert(0); + L_backtrack: + if (!popState()) + { + s.reset(start); + return 0; } } } - assert(0); } + assert(0); + } - @property size_t stackAvail() - { - return memory.length - lastState; - } + @property size_t stackAvail() + { + return memory.length - lastState; + } - void stackPush(T)(T val) - if (!isDynamicArray!T) - { - *cast(T*)&memory[lastState] = val; - enum delta = (T.sizeof+size_t.sizeof/2)/size_t.sizeof; - lastState += delta; - debug(std_regex_matcher) writeln("push element SP= ", lastState); - } + void stackPush(T)(T val) + if (!isDynamicArray!T) + { + *cast(T*)&memory[lastState] = val; + enum delta = (T.sizeof+size_t.sizeof/2)/size_t.sizeof; + lastState += delta; + debug(std_regex_matcher) writeln("push element SP= ", lastState); + } - void stackPush(T)(T[] val) - { - static assert(T.sizeof % size_t.sizeof == 0); - (cast(T*)&memory[lastState])[0 .. 
val.length] - = val[0..$]; - lastState += val.length*(T.sizeof/size_t.sizeof); - debug(std_regex_matcher) writeln("push array SP= ", lastState); - } + void stackPush(T)(T[] val) + { + static assert(T.sizeof % size_t.sizeof == 0); + (cast(T*)&memory[lastState])[0 .. val.length] + = val[0..$]; + lastState += val.length*(T.sizeof/size_t.sizeof); + debug(std_regex_matcher) writeln("push array SP= ", lastState); + } - void stackPop(T)(ref T val) - if (!isDynamicArray!T) - { - enum delta = (T.sizeof+size_t.sizeof/2)/size_t.sizeof; - lastState -= delta; - val = *cast(T*)&memory[lastState]; - debug(std_regex_matcher) writeln("pop element SP= ", lastState); - } + void stackPop(T)(ref T val) + if (!isDynamicArray!T) + { + enum delta = (T.sizeof+size_t.sizeof/2)/size_t.sizeof; + lastState -= delta; + val = *cast(T*)&memory[lastState]; + debug(std_regex_matcher) writeln("pop element SP= ", lastState); + } - void stackPop(T)(T[] val) + void stackPop(T)(T[] val) + { + stackPop(val); // call ref version + } + void stackPop(T)(ref T[] val) + { + lastState -= val.length*(T.sizeof/size_t.sizeof); + val[0..$] = (cast(T*)&memory[lastState])[0 .. val.length]; + debug(std_regex_matcher) writeln("pop array SP= ", lastState); + } + //helper function, saves engine state + void pushState(uint pc, uint counter) + { + if (stateSize + 2 * matches.length > stackAvail) { - stackPop(val); // call ref version - } - void stackPop(T)(ref T[] val) - { - lastState -= val.length*(T.sizeof/size_t.sizeof); - val[0..$] = (cast(T*)&memory[lastState])[0 .. val.length]; - debug(std_regex_matcher) writeln("pop array SP= ", lastState); + newStack(); } + *cast(State*)&memory[lastState] = + State(index, pc, counter, infiniteNesting); + lastState += stateSize; + memory[lastState .. lastState + 2 * matches.length] = (cast(size_t[]) matches)[]; + lastState += 2*matches.length; + debug(std_regex_matcher) + writefln("Saved(pc=%s) front: %s src: %s", + pc, front, s[index .. 
s.lastIndex]); + } - static if (true) + //helper function, restores engine state + bool popState() + { + if (!lastState && !prevStack()) + return false; + lastState -= 2*matches.length; + auto pm = cast(size_t[]) matches; + pm[] = memory[lastState .. lastState + 2 * matches.length]; + lastState -= stateSize; + State* state = cast(State*)&memory[lastState]; + index = state.index; + pc = state.pc; + counter = state.counter; + infiniteNesting = state.infiniteNesting; + debug(std_regex_matcher) { - //helper function, saves engine state - void pushState(uint pc, uint counter) - { - if (stateSize + 2 * matches.length > stackAvail) - { - newStack(); - } - *cast(State*)&memory[lastState] = - State(index, pc, counter, infiniteNesting); - lastState += stateSize; - memory[lastState .. lastState + 2 * matches.length] = (cast(size_t[]) matches)[]; - lastState += 2*matches.length; - debug(std_regex_matcher) - writefln("Saved(pc=%s) front: %s src: %s", - pc, front, s[index .. s.lastIndex]); - } - - //helper function, restores engine state - bool popState() - { - if (!lastState && !prevStack()) - return false; - lastState -= 2*matches.length; - auto pm = cast(size_t[]) matches; - pm[] = memory[lastState .. lastState + 2 * matches.length]; - lastState -= stateSize; - State* state = cast(State*)&memory[lastState]; - index = state.index; - pc = state.pc; - counter = state.counter; - infiniteNesting = state.infiniteNesting; - debug(std_regex_matcher) - { - writefln("Restored matches", front, s[index .. s.lastIndex]); - foreach (i, m; matches) - writefln("Sub(%d) : %s..%s", i, m.begin, m.end); - } - s.reset(index); - next(); - debug(std_regex_matcher) - writefln("Backtracked (pc=%s) front: %s src: %s", - pc, front, s[index .. s.lastIndex]); - return true; - } + writefln("Restored matches", front, s[index .. 
s.lastIndex]); + foreach (i, m; matches) + writefln("Sub(%d) : %s..%s", i, m.begin, m.end); } + s.reset(index); + next(); + debug(std_regex_matcher) + writefln("Backtracked (pc=%s) front: %s src: %s", + pc, front, s[index .. s.lastIndex]); + return true; } } @@ -968,7 +961,7 @@ struct CtContext alias Lookaround = $$; else alias Lookaround = $$; - static bool matcher_$$(ref Lookaround matcher) @trusted + static bool matcher_$$(Lookaround matcher) @trusted { //(neg)lookaround piece start $$ diff --git a/std/regex/internal/ir.d b/std/regex/internal/ir.d index 3831b85e9..fb0a6a075 100644 --- a/std/regex/internal/ir.d +++ b/std/regex/internal/ir.d @@ -523,7 +523,7 @@ template defaultFactory(Char) if (re.backrefed.canFind!"a != 0") { if (backtrackingFactory is null) - backtrackingFactory = new RuntimeFactory!(BacktrackingMatcher!false, Char); + backtrackingFactory = new RuntimeFactory!(BacktrackingMatcher, Char); return backtrackingFactory; } else diff --git a/std/regex/internal/parser.d b/std/regex/internal/parser.d index f2a2ac2e1..a4caae38b 100644 --- a/std/regex/internal/parser.d +++ b/std/regex/internal/parser.d @@ -32,7 +32,7 @@ auto makeRegex(S, CG)(Parser!(S, CG) p) // check if we have backreferences, if so - use backtracking if (__ctfe) factory = null; // allows us to use the awful enum re = regex(...); else if (re.backrefed.canFind!"a != 0") - factory = new RuntimeFactory!(BacktrackingMatcher!false, Char); + factory = new RuntimeFactory!(BacktrackingMatcher, Char); else factory = new RuntimeFactory!(ThompsonMatcher, Char); debug(std_regex_parser) diff --git a/std/regex/package.d b/std/regex/package.d index 7b4769a95..e03ea4580 100644 --- a/std/regex/package.d +++ b/std/regex/package.d @@ -429,13 +429,13 @@ template ctRegexImpl(alias pattern, string flags=[]) static immutable r = cast(immutable) regex(pattern, flags); alias Char = BasicElementOf!(typeof(pattern)); enum source = ctGenRegExCode(r); - alias CtMatcher = BacktrackingMatcher!(true); - @trusted bool 
func(ref CtMatcher!Char matcher) + @trusted bool func(BacktrackingMatcher!Char matcher) { debug(std_regex_ctr) pragma(msg, source); mixin(source); } - static immutable staticRe = cast(immutable) r.withFactory(new CtfeFactory!(CtMatcher, Char, func)); + static immutable staticRe = + cast(immutable) r.withFactory(new CtfeFactory!(BacktrackingMatcher, Char, func)); struct Wrapper { // allow code that expects mutable Regex to still work From f4c963b5e94b96d7edbab634c9998b35a49a5c35 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Thu, 12 Oct 2017 09:49:27 +0300 Subject: [PATCH 15/15] Silence the circleCI style check? --- std/regex/package.d | 1 + 1 file changed, 1 insertion(+) diff --git a/std/regex/package.d b/std/regex/package.d index e03ea4580..20284c453 100644 --- a/std/regex/package.d +++ b/std/regex/package.d @@ -432,6 +432,7 @@ template ctRegexImpl(alias pattern, string flags=[]) @trusted bool func(BacktrackingMatcher!Char matcher) { debug(std_regex_ctr) pragma(msg, source); + cast(void) matcher; mixin(source); } static immutable staticRe =