From df07aa7dea259a91acb3e84ea89b48eb3550b17f Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Sun, 27 Mar 2016 20:11:55 +0300 Subject: [PATCH] Special case ASCII to use bit tables instead of 2-level tries --- std/regex/internal/backtracking.d | 36 ++++++++-------- std/regex/internal/ir.d | 70 ++++++++++++++++++++----------- std/regex/internal/parser.d | 46 ++++---------------- std/regex/internal/thompson.d | 18 ++++---- 4 files changed, 80 insertions(+), 90 deletions(-) diff --git a/std/regex/internal/backtracking.d b/std/regex/internal/backtracking.d index 8f16cb5f1..bd5c82f66 100644 --- a/std/regex/internal/backtracking.d +++ b/std/regex/internal/backtracking.d @@ -301,7 +301,7 @@ template BacktrackingMatcher(bool CTregex) pc += IRL!(IR.CodepointSet); break; case IR.Trie: - if(atEnd || !re.tries[re.ir[pc].data][front]) + if(atEnd || !re.matchers[re.ir[pc].data][front]) goto L_backtrack; next(); pc += IRL!(IR.Trie); @@ -310,21 +310,21 @@ template BacktrackingMatcher(bool CTregex) dchar back; DataIndex bi; //at start & end of input - if(atStart && wordTrie[front]) + if(atStart && wordMatcher[front]) { pc += IRL!(IR.Wordboundary); break; } else if(atEnd && s.loopBack(index).nextChar(back, bi) - && wordTrie[back]) + && wordMatcher[back]) { pc += IRL!(IR.Wordboundary); break; } else if(s.loopBack(index).nextChar(back, bi)) { - bool af = wordTrie[front]; - bool ab = wordTrie[back]; + bool af = wordMatcher[front]; + bool ab = wordMatcher[back]; if(af ^ ab) { pc += IRL!(IR.Wordboundary); @@ -336,15 +336,15 @@ template BacktrackingMatcher(bool CTregex) dchar back; DataIndex bi; //at start & end of input - if(atStart && wordTrie[front]) + if(atStart && wordMatcher[front]) goto L_backtrack; else if(atEnd && s.loopBack(index).nextChar(back, bi) - && wordTrie[back]) + && wordMatcher[back]) goto L_backtrack; else if(s.loopBack(index).nextChar(back, bi)) { - bool af = wordTrie[front]; - bool ab = wordTrie[back]; + bool af = wordMatcher[front]; + bool ab = 
wordMatcher[back]; if(af ^ ab) goto L_backtrack; } @@ -1276,7 +1276,7 @@ struct CtContext break; case IR.Trie: code ~= ctSub( ` - if(atEnd || !re.tries[$$][front]) + if(atEnd || !re.matchers[$$][front]) $$ $$ $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr); @@ -1285,19 +1285,19 @@ struct CtContext code ~= ctSub( ` dchar back; DataIndex bi; - if(atStart && wordTrie[front]) + if(atStart && wordMatcher[front]) { $$ } else if(atEnd && s.loopBack(index).nextChar(back, bi) - && wordTrie[back]) + && wordMatcher[back]) { $$ } else if(s.loopBack(index).nextChar(back, bi)) { - bool af = wordTrie[front]; - bool ab = wordTrie[back]; + bool af = wordMatcher[front]; + bool ab = wordMatcher[back]; if(af ^ ab) { $$ @@ -1310,15 +1310,15 @@ struct CtContext dchar back; DataIndex bi; //at start & end of input - if(atStart && wordTrie[front]) + if(atStart && wordMatcher[front]) $$ else if(atEnd && s.loopBack(index).nextChar(back, bi) - && wordTrie[back]) + && wordMatcher[back]) $$ else if(s.loopBack(index).nextChar(back, bi)) { - bool af = wordTrie[front]; - bool ab = wordTrie[back]; + bool af = wordMatcher[front]; + bool ab = wordMatcher[back]; if(af ^ ab) $$ } diff --git a/std/regex/internal/ir.d b/std/regex/internal/ir.d index 0724d4e90..3c7bca8d2 100644 --- a/std/regex/internal/ir.d +++ b/std/regex/internal/ir.d @@ -19,29 +19,29 @@ alias BasicElementOf(Range) = Unqual!(ElementEncodingType!Range); enum maxCharsetUsed = 6; // another variable to tweak behavior of caching generated Tries for character classes -enum maxCachedTries = 8; +enum maxCachedMatchers = 8; alias Trie = CodepointSetTrie!(13, 8); alias makeTrie = codepointSetTrie!(13, 8); -Trie[CodepointSet] trieCache; +CharMatcher[CodepointSet] matcherCache; //accessor with caching -@trusted Trie getTrie(CodepointSet set) +@trusted CharMatcher getMatcher(CodepointSet set) {// @@@BUG@@@ 6357 almost all properties of AA are not @safe - if(__ctfe || maxCachedTries == 0) - return makeTrie(set); + if(__ctfe || 
maxCachedMatchers == 0) + return CharMatcher(set); else { - auto p = set in trieCache; + auto p = set in matcherCache; if(p) return *p; - if(trieCache.length == maxCachedTries) + if(matcherCache.length == maxCachedMatchers) { - // flush entries in trieCache - trieCache = null; + // flush entries in matcherCache + matcherCache = null; } - return (trieCache[set] = makeTrie(set)); + return (matcherCache[set] = CharMatcher(set)); } } @@ -67,9 +67,9 @@ Trie[CodepointSet] trieCache; | unicode.Me | unicode.Nd | unicode.Pc")(); } -@property Trie wordTrie() +@property CharMatcher wordMatcher() { - return memoizeExpr!("makeTrie(wordCharacter)")(); + return memoizeExpr!("CharMatcher(wordCharacter)")(); } // some special Unicode white space characters @@ -503,15 +503,15 @@ struct Regex(Char) package(std.regex): import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency - NamedGroup[] dict; //maps name -> user group number - uint ngroup; //number of internal groups - uint maxCounterDepth; //max depth of nested {n,m} repetitions - uint hotspotTableSize; //number of entries in merge table - uint threadCount; - uint flags; //global regex flags - public const(Trie)[] tries; // - public const(BloomFilter)[] filters; // bloom filters for conditional loops - uint[] backrefed; //bit array of backreferenced submatches + NamedGroup[] dict; // maps name -> user group number + uint ngroup; // number of internal groups + uint maxCounterDepth; // max depth of nested {n,m} repetitions + uint hotspotTableSize; // number of entries in merge table + uint threadCount; // upper bound on number of Thompson VM threads + uint flags; // global regex flags + public const(CharMatcher)[] matchers; // tables that represent character sets + public const(BitTable)[] filters; // bloom filters for conditional loops + uint[] backrefed; // bit array of backreferenced submatches Kickstart!Char kickstart; //bit access helper @@ -728,8 +728,8 @@ public class RegexException : Exception 
mixin basicExceptionCtors; } - -struct BloomFilter { +// simple 128-entry bit-table used with a hash function +struct BitTable { uint[4] filter; this(CodepointSet set){ @@ -744,12 +744,32 @@ struct BloomFilter { filter[i >> 5] |= 1<<(i & 31); } // non-zero -> might be present, 0 -> absent - uint opIndex()(dchar ch) const{ + bool opIndex()(dchar ch) const{ immutable i = index(ch); - return filter[i >> 5] & (1<<(i & 31)); + return (filter[i >> 5]>>(i & 31)) & 1; } static uint index()(dchar ch){ return ((ch >> 7) ^ ch) & 0x7F; } } + +struct CharMatcher { + BitTable ascii; // fast path for ASCII + Trie trie; // slow path for Unicode + + this(CodepointSet set) + { + auto asciiSet = set & unicode.ASCII; + ascii = BitTable(asciiSet); + trie = makeTrie(set); + } + + bool opIndex()(dchar ch) const + { + if (ch < 0x80) + return ascii[ch]; + else + return trie[ch]; + } +} diff --git a/std/regex/internal/parser.d b/std/regex/internal/parser.d index c3e513a17..32889131e 100644 --- a/std/regex/internal/parser.d +++ b/std/regex/internal/parser.d @@ -21,7 +21,7 @@ auto makeRegex(S)(Parser!S p) maxCounterDepth = p.counterDepth; flags = p.re_flags; charsets = p.charsets; - tries = p.tries; + matchers = p.matchers; backrefed = p.backrefed; re.lightPostprocess(); debug(std_regex_parser) @@ -198,36 +198,6 @@ dchar parseUniHex(Char)(ref Char[] str, size_t maxDigit) .canFind("invalid codepoint")); } -//heuristic value determines maximum CodepointSet length suitable for linear search -enum maxCharsetUsed = 6; - -enum maxCachedTries = 8; - -alias Trie = CodepointSetTrie!(13, 8); -alias makeTrie = codepointSetTrie!(13, 8); - -Trie[CodepointSet] trieCache; - -//accessor with caching -@trusted Trie getTrie(CodepointSet set) -{// @@@BUG@@@ 6357 almost all properties of AA are not @safe - if(__ctfe || maxCachedTries == 0) - return makeTrie(set); - else - { - auto p = set in trieCache; - if(p) - return *p; - if(trieCache.length == maxCachedTries) - { - // flush entries in trieCache - trieCache = 
null; - } - return (trieCache[set] = makeTrie(set)); - } -} - - auto caseEnclose(CodepointSet set) { auto cased = set & unicode.LC; @@ -305,7 +275,7 @@ struct Parser(R) uint lookaroundNest = 0; uint counterDepth = 0; //current depth of nested counted repetitions CodepointSet[] charsets; // - const(Trie)[] tries; // + const(CharMatcher)[] matchers; // uint[] backrefed; //bitarray for groups @trusted this(S)(R pattern, S flags) @@ -1248,18 +1218,18 @@ struct Parser(R) } if(ivals.length*2 > maxCharsetUsed) { - auto t = getTrie(set); - put(Bytecode(IR.Trie, cast(uint)tries.length)); - tries ~= t; + auto t = getMatcher(set); + put(Bytecode(IR.Trie, cast(uint)matchers.length)); + matchers ~= t; debug(std_regex_allocation) writeln("Trie generated"); } else { put(Bytecode(IR.CodepointSet, cast(uint)charsets.length)); - tries ~= Trie.init; + matchers ~= CharMatcher.init; } charsets ~= set; - assert(charsets.length == tries.length); + assert(charsets.length == matchers.length); } } @@ -1556,7 +1526,7 @@ void optimize(Char)(ref Regex!Char zis) Bytecode(InfiniteBloomStart, ir[i].data); ir.insertInPlace(i+IRL!(InfiniteEnd), Bytecode.fromRaw(cast(uint)zis.filters.length)); - zis.filters ~= BloomFilter(set); + zis.filters ~= BitTable(set); } } } diff --git a/std/regex/internal/thompson.d b/std/regex/internal/thompson.d index c72a80db3..83493d9ae 100644 --- a/std/regex/internal/thompson.d +++ b/std/regex/internal/thompson.d @@ -112,21 +112,21 @@ template ThompsonOps(E, S, bool withInput:true) dchar back; DataIndex bi; //at start & end of input - if(atStart && wordTrie[front]) + if(atStart && wordMatcher[front]) { t.pc += IRL!(IR.Wordboundary); return true; } else if(atEnd && s.loopBack(index).nextChar(back, bi) - && wordTrie[back]) + && wordMatcher[back]) { t.pc += IRL!(IR.Wordboundary); return true; } else if(s.loopBack(index).nextChar(back, bi)) { - bool af = wordTrie[front]; - bool ab = wordTrie[back]; + bool af = wordMatcher[front]; + bool ab = wordMatcher[back]; if(af ^ ab) { 
t.pc += IRL!(IR.Wordboundary); @@ -144,19 +144,19 @@ template ThompsonOps(E, S, bool withInput:true) dchar back; DataIndex bi; //at start & end of input - if(atStart && wordTrie[front]) + if(atStart && wordMatcher[front]) { return popState(e); } else if(atEnd && s.loopBack(index).nextChar(back, bi) - && wordTrie[back]) + && wordMatcher[back]) { return popState(e); } else if(s.loopBack(index).nextChar(back, bi)) { - bool af = wordTrie[front]; - bool ab = wordTrie[back] != 0; + bool af = wordMatcher[front]; + bool ab = wordMatcher[back] != 0; if(af ^ ab) { return popState(e); @@ -630,7 +630,7 @@ template ThompsonOps(E, S, bool withInput:true) { with(e) with(state) { - if(re.tries[re.ir[t.pc].data][front]) + if(re.matchers[re.ir[t.pc].data][front]) { t.pc += IRL!(IR.Trie); nlist.insertBack(t);