Special case ASCII to use bit tables instead of 2-level tries

This commit is contained in:
Dmitry Olshansky 2016-03-27 20:11:55 +03:00
parent 0e55583fed
commit df07aa7dea
4 changed files with 80 additions and 90 deletions

View file

@ -301,7 +301,7 @@ template BacktrackingMatcher(bool CTregex)
pc += IRL!(IR.CodepointSet);
break;
case IR.Trie:
if(atEnd || !re.tries[re.ir[pc].data][front])
if(atEnd || !re.matchers[re.ir[pc].data][front])
goto L_backtrack;
next();
pc += IRL!(IR.Trie);
@ -310,21 +310,21 @@ template BacktrackingMatcher(bool CTregex)
dchar back;
DataIndex bi;
//at start & end of input
if(atStart && wordTrie[front])
if(atStart && wordMatcher[front])
{
pc += IRL!(IR.Wordboundary);
break;
}
else if(atEnd && s.loopBack(index).nextChar(back, bi)
&& wordTrie[back])
&& wordMatcher[back])
{
pc += IRL!(IR.Wordboundary);
break;
}
else if(s.loopBack(index).nextChar(back, bi))
{
bool af = wordTrie[front];
bool ab = wordTrie[back];
bool af = wordMatcher[front];
bool ab = wordMatcher[back];
if(af ^ ab)
{
pc += IRL!(IR.Wordboundary);
@ -336,15 +336,15 @@ template BacktrackingMatcher(bool CTregex)
dchar back;
DataIndex bi;
//at start & end of input
if(atStart && wordTrie[front])
if(atStart && wordMatcher[front])
goto L_backtrack;
else if(atEnd && s.loopBack(index).nextChar(back, bi)
&& wordTrie[back])
&& wordMatcher[back])
goto L_backtrack;
else if(s.loopBack(index).nextChar(back, bi))
{
bool af = wordTrie[front];
bool ab = wordTrie[back];
bool af = wordMatcher[front];
bool ab = wordMatcher[back];
if(af ^ ab)
goto L_backtrack;
}
@ -1276,7 +1276,7 @@ struct CtContext
break;
case IR.Trie:
code ~= ctSub( `
if(atEnd || !re.tries[$$][front])
if(atEnd || !re.matchers[$$][front])
$$
$$
$$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr);
@ -1285,19 +1285,19 @@ struct CtContext
code ~= ctSub( `
dchar back;
DataIndex bi;
if(atStart && wordTrie[front])
if(atStart && wordMatcher[front])
{
$$
}
else if(atEnd && s.loopBack(index).nextChar(back, bi)
&& wordTrie[back])
&& wordMatcher[back])
{
$$
}
else if(s.loopBack(index).nextChar(back, bi))
{
bool af = wordTrie[front];
bool ab = wordTrie[back];
bool af = wordMatcher[front];
bool ab = wordMatcher[back];
if(af ^ ab)
{
$$
@ -1310,15 +1310,15 @@ struct CtContext
dchar back;
DataIndex bi;
//at start & end of input
if(atStart && wordTrie[front])
if(atStart && wordMatcher[front])
$$
else if(atEnd && s.loopBack(index).nextChar(back, bi)
&& wordTrie[back])
&& wordMatcher[back])
$$
else if(s.loopBack(index).nextChar(back, bi))
{
bool af = wordTrie[front];
bool ab = wordTrie[back];
bool af = wordMatcher[front];
bool ab = wordMatcher[back];
if(af ^ ab)
$$
}

View file

@ -19,29 +19,29 @@ alias BasicElementOf(Range) = Unqual!(ElementEncodingType!Range);
enum maxCharsetUsed = 6;
// another variable to tweak behavior of caching generated Tries for character classes
enum maxCachedTries = 8;
enum maxCachedMatchers = 8;
alias Trie = CodepointSetTrie!(13, 8);
alias makeTrie = codepointSetTrie!(13, 8);
Trie[CodepointSet] trieCache;
CharMatcher[CodepointSet] matcherCache;
//accessor with caching
@trusted Trie getTrie(CodepointSet set)
@trusted CharMatcher getMatcher(CodepointSet set)
{// @@@BUG@@@ 6357 almost all properties of AA are not @safe
if(__ctfe || maxCachedTries == 0)
return makeTrie(set);
if(__ctfe || maxCachedMatchers == 0)
return CharMatcher(set);
else
{
auto p = set in trieCache;
auto p = set in matcherCache;
if(p)
return *p;
if(trieCache.length == maxCachedTries)
if(matcherCache.length == maxCachedMatchers)
{
// flush entries in trieCache
trieCache = null;
// flush entries in matcherCache
matcherCache = null;
}
return (trieCache[set] = makeTrie(set));
return (matcherCache[set] = CharMatcher(set));
}
}
@ -67,9 +67,9 @@ Trie[CodepointSet] trieCache;
| unicode.Me | unicode.Nd | unicode.Pc")();
}
@property Trie wordTrie()
@property CharMatcher wordMatcher()
{
return memoizeExpr!("makeTrie(wordCharacter)")();
return memoizeExpr!("CharMatcher(wordCharacter)")();
}
// some special Unicode white space characters
@ -503,15 +503,15 @@ struct Regex(Char)
package(std.regex):
import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency
NamedGroup[] dict; //maps name -> user group number
uint ngroup; //number of internal groups
uint maxCounterDepth; //max depth of nested {n,m} repetitions
uint hotspotTableSize; //number of entries in merge table
uint threadCount;
uint flags; //global regex flags
public const(Trie)[] tries; //
public const(BloomFilter)[] filters; // bloom filters for conditional loops
uint[] backrefed; //bit array of backreferenced submatches
NamedGroup[] dict; // maps name -> user group number
uint ngroup; // number of internal groups
uint maxCounterDepth; // max depth of nested {n,m} repetitions
uint hotspotTableSize; // number of entries in merge table
uint threadCount; // upper bound on number of Thompson VM threads
uint flags; // global regex flags
public const(CharMatcher)[] matchers; // tables that represent character sets
public const(BitTable)[] filters; // bloom filters for conditional loops
uint[] backrefed; // bit array of backreferenced submatches
Kickstart!Char kickstart;
//bit access helper
@ -728,8 +728,8 @@ public class RegexException : Exception
mixin basicExceptionCtors;
}
struct BloomFilter {
// simple 128-entry bit-table used with a hash function
struct BitTable {
uint[4] filter;
this(CodepointSet set){
@ -744,12 +744,32 @@ struct BloomFilter {
filter[i >> 5] |= 1<<(i & 31);
}
// true (non-zero) -> might be present, false (0) -> absent
uint opIndex()(dchar ch) const{
bool opIndex()(dchar ch) const{
immutable i = index(ch);
return filter[i >> 5] & (1<<(i & 31));
return (filter[i >> 5]>>(i & 31)) & 1;
}
static uint index()(dchar ch){
return ((ch >> 7) ^ ch) & 0x7F;
}
}
/// Membership test for a set of code points: code points below 0x80 are
/// answered from a bit table, everything else from a trie built over the set.
struct CharMatcher {
    BitTable ascii; // bit table consulted for code points below 0x80
    Trie trie; // trie consulted for all remaining code points

    /// Builds both lookup structures from `set`.
    this(CodepointSet set)
    {
        // only the ASCII members of `set` are loaded into the bit table
        ascii = BitTable(set & unicode.ASCII);
        trie = makeTrie(set);
    }

    /// Looks `ch` up in `ascii` when it is below 0x80, in `trie` otherwise.
    bool opIndex()(dchar ch) const
    {
        return ch < 0x80 ? ascii[ch] : trie[ch];
    }
}

View file

@ -21,7 +21,7 @@ auto makeRegex(S)(Parser!S p)
maxCounterDepth = p.counterDepth;
flags = p.re_flags;
charsets = p.charsets;
tries = p.tries;
matchers = p.matchers;
backrefed = p.backrefed;
re.lightPostprocess();
debug(std_regex_parser)
@ -198,36 +198,6 @@ dchar parseUniHex(Char)(ref Char[] str, size_t maxDigit)
.canFind("invalid codepoint"));
}
//heuristic value determines maximum CodepointSet length suitable for linear search
enum maxCharsetUsed = 6;
enum maxCachedTries = 8;
alias Trie = CodepointSetTrie!(13, 8);
alias makeTrie = codepointSetTrie!(13, 8);
Trie[CodepointSet] trieCache;
//accessor with caching
@trusted Trie getTrie(CodepointSet set)
{// @@@BUG@@@ 6357 almost all properties of AA are not @safe
if(__ctfe || maxCachedTries == 0)
return makeTrie(set);
else
{
auto p = set in trieCache;
if(p)
return *p;
if(trieCache.length == maxCachedTries)
{
// flush entries in trieCache
trieCache = null;
}
return (trieCache[set] = makeTrie(set));
}
}
auto caseEnclose(CodepointSet set)
{
auto cased = set & unicode.LC;
@ -305,7 +275,7 @@ struct Parser(R)
uint lookaroundNest = 0;
uint counterDepth = 0; //current depth of nested counted repetitions
CodepointSet[] charsets; //
const(Trie)[] tries; //
const(CharMatcher)[] matchers; //
uint[] backrefed; //bitarray for groups
@trusted this(S)(R pattern, S flags)
@ -1248,18 +1218,18 @@ struct Parser(R)
}
if(ivals.length*2 > maxCharsetUsed)
{
auto t = getTrie(set);
put(Bytecode(IR.Trie, cast(uint)tries.length));
tries ~= t;
auto t = getMatcher(set);
put(Bytecode(IR.Trie, cast(uint)matchers.length));
matchers ~= t;
debug(std_regex_allocation) writeln("Trie generated");
}
else
{
put(Bytecode(IR.CodepointSet, cast(uint)charsets.length));
tries ~= Trie.init;
matchers ~= CharMatcher.init;
}
charsets ~= set;
assert(charsets.length == tries.length);
assert(charsets.length == matchers.length);
}
}
@ -1556,7 +1526,7 @@ void optimize(Char)(ref Regex!Char zis)
Bytecode(InfiniteBloomStart, ir[i].data);
ir.insertInPlace(i+IRL!(InfiniteEnd),
Bytecode.fromRaw(cast(uint)zis.filters.length));
zis.filters ~= BloomFilter(set);
zis.filters ~= BitTable(set);
}
}
}

View file

@ -112,21 +112,21 @@ template ThompsonOps(E, S, bool withInput:true)
dchar back;
DataIndex bi;
//at start & end of input
if(atStart && wordTrie[front])
if(atStart && wordMatcher[front])
{
t.pc += IRL!(IR.Wordboundary);
return true;
}
else if(atEnd && s.loopBack(index).nextChar(back, bi)
&& wordTrie[back])
&& wordMatcher[back])
{
t.pc += IRL!(IR.Wordboundary);
return true;
}
else if(s.loopBack(index).nextChar(back, bi))
{
bool af = wordTrie[front];
bool ab = wordTrie[back];
bool af = wordMatcher[front];
bool ab = wordMatcher[back];
if(af ^ ab)
{
t.pc += IRL!(IR.Wordboundary);
@ -144,19 +144,19 @@ template ThompsonOps(E, S, bool withInput:true)
dchar back;
DataIndex bi;
//at start & end of input
if(atStart && wordTrie[front])
if(atStart && wordMatcher[front])
{
return popState(e);
}
else if(atEnd && s.loopBack(index).nextChar(back, bi)
&& wordTrie[back])
&& wordMatcher[back])
{
return popState(e);
}
else if(s.loopBack(index).nextChar(back, bi))
{
bool af = wordTrie[front];
bool ab = wordTrie[back] != 0;
bool af = wordMatcher[front];
bool ab = wordMatcher[back] != 0;
if(af ^ ab)
{
return popState(e);
@ -630,7 +630,7 @@ template ThompsonOps(E, S, bool withInput:true)
{
with(e) with(state)
{
if(re.tries[re.ir[t.pc].data][front])
if(re.matchers[re.ir[t.pc].data][front])
{
t.pc += IRL!(IR.Trie);
nlist.insertBack(t);