mirror of
https://github.com/dlang/phobos.git
synced 2025-04-28 22:21:09 +03:00
Special case ASCII to use bit tables instead of 2-level tries
This commit is contained in:
parent
0e55583fed
commit
df07aa7dea
4 changed files with 80 additions and 90 deletions
|
@ -301,7 +301,7 @@ template BacktrackingMatcher(bool CTregex)
|
|||
pc += IRL!(IR.CodepointSet);
|
||||
break;
|
||||
case IR.Trie:
|
||||
if(atEnd || !re.tries[re.ir[pc].data][front])
|
||||
if(atEnd || !re.matchers[re.ir[pc].data][front])
|
||||
goto L_backtrack;
|
||||
next();
|
||||
pc += IRL!(IR.Trie);
|
||||
|
@ -310,21 +310,21 @@ template BacktrackingMatcher(bool CTregex)
|
|||
dchar back;
|
||||
DataIndex bi;
|
||||
//at start & end of input
|
||||
if(atStart && wordTrie[front])
|
||||
if(atStart && wordMatcher[front])
|
||||
{
|
||||
pc += IRL!(IR.Wordboundary);
|
||||
break;
|
||||
}
|
||||
else if(atEnd && s.loopBack(index).nextChar(back, bi)
|
||||
&& wordTrie[back])
|
||||
&& wordMatcher[back])
|
||||
{
|
||||
pc += IRL!(IR.Wordboundary);
|
||||
break;
|
||||
}
|
||||
else if(s.loopBack(index).nextChar(back, bi))
|
||||
{
|
||||
bool af = wordTrie[front];
|
||||
bool ab = wordTrie[back];
|
||||
bool af = wordMatcher[front];
|
||||
bool ab = wordMatcher[back];
|
||||
if(af ^ ab)
|
||||
{
|
||||
pc += IRL!(IR.Wordboundary);
|
||||
|
@ -336,15 +336,15 @@ template BacktrackingMatcher(bool CTregex)
|
|||
dchar back;
|
||||
DataIndex bi;
|
||||
//at start & end of input
|
||||
if(atStart && wordTrie[front])
|
||||
if(atStart && wordMatcher[front])
|
||||
goto L_backtrack;
|
||||
else if(atEnd && s.loopBack(index).nextChar(back, bi)
|
||||
&& wordTrie[back])
|
||||
&& wordMatcher[back])
|
||||
goto L_backtrack;
|
||||
else if(s.loopBack(index).nextChar(back, bi))
|
||||
{
|
||||
bool af = wordTrie[front];
|
||||
bool ab = wordTrie[back];
|
||||
bool af = wordMatcher[front];
|
||||
bool ab = wordMatcher[back];
|
||||
if(af ^ ab)
|
||||
goto L_backtrack;
|
||||
}
|
||||
|
@ -1276,7 +1276,7 @@ struct CtContext
|
|||
break;
|
||||
case IR.Trie:
|
||||
code ~= ctSub( `
|
||||
if(atEnd || !re.tries[$$][front])
|
||||
if(atEnd || !re.matchers[$$][front])
|
||||
$$
|
||||
$$
|
||||
$$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr);
|
||||
|
@ -1285,19 +1285,19 @@ struct CtContext
|
|||
code ~= ctSub( `
|
||||
dchar back;
|
||||
DataIndex bi;
|
||||
if(atStart && wordTrie[front])
|
||||
if(atStart && wordMatcher[front])
|
||||
{
|
||||
$$
|
||||
}
|
||||
else if(atEnd && s.loopBack(index).nextChar(back, bi)
|
||||
&& wordTrie[back])
|
||||
&& wordMatcher[back])
|
||||
{
|
||||
$$
|
||||
}
|
||||
else if(s.loopBack(index).nextChar(back, bi))
|
||||
{
|
||||
bool af = wordTrie[front];
|
||||
bool ab = wordTrie[back];
|
||||
bool af = wordMatcher[front];
|
||||
bool ab = wordMatcher[back];
|
||||
if(af ^ ab)
|
||||
{
|
||||
$$
|
||||
|
@ -1310,15 +1310,15 @@ struct CtContext
|
|||
dchar back;
|
||||
DataIndex bi;
|
||||
//at start & end of input
|
||||
if(atStart && wordTrie[front])
|
||||
if(atStart && wordMatcher[front])
|
||||
$$
|
||||
else if(atEnd && s.loopBack(index).nextChar(back, bi)
|
||||
&& wordTrie[back])
|
||||
&& wordMatcher[back])
|
||||
$$
|
||||
else if(s.loopBack(index).nextChar(back, bi))
|
||||
{
|
||||
bool af = wordTrie[front];
|
||||
bool ab = wordTrie[back];
|
||||
bool af = wordMatcher[front];
|
||||
bool ab = wordMatcher[back];
|
||||
if(af ^ ab)
|
||||
$$
|
||||
}
|
||||
|
|
|
@ -19,29 +19,29 @@ alias BasicElementOf(Range) = Unqual!(ElementEncodingType!Range);
|
|||
enum maxCharsetUsed = 6;
|
||||
|
||||
// another variable to tweak behavior of caching generated Tries for character classes
|
||||
enum maxCachedTries = 8;
|
||||
enum maxCachedMatchers = 8;
|
||||
|
||||
alias Trie = CodepointSetTrie!(13, 8);
|
||||
alias makeTrie = codepointSetTrie!(13, 8);
|
||||
|
||||
Trie[CodepointSet] trieCache;
|
||||
CharMatcher[CodepointSet] matcherCache;
|
||||
|
||||
//accessor with caching
|
||||
@trusted Trie getTrie(CodepointSet set)
|
||||
@trusted CharMatcher getMatcher(CodepointSet set)
|
||||
{// @@@BUG@@@ 6357 almost all properties of AA are not @safe
|
||||
if(__ctfe || maxCachedTries == 0)
|
||||
return makeTrie(set);
|
||||
if(__ctfe || maxCachedMatchers == 0)
|
||||
return CharMatcher(set);
|
||||
else
|
||||
{
|
||||
auto p = set in trieCache;
|
||||
auto p = set in matcherCache;
|
||||
if(p)
|
||||
return *p;
|
||||
if(trieCache.length == maxCachedTries)
|
||||
if(matcherCache.length == maxCachedMatchers)
|
||||
{
|
||||
// flush entries in trieCache
|
||||
trieCache = null;
|
||||
// flush entries in matcherCache
|
||||
matcherCache = null;
|
||||
}
|
||||
return (trieCache[set] = makeTrie(set));
|
||||
return (matcherCache[set] = CharMatcher(set));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -67,9 +67,9 @@ Trie[CodepointSet] trieCache;
|
|||
| unicode.Me | unicode.Nd | unicode.Pc")();
|
||||
}
|
||||
|
||||
@property Trie wordTrie()
|
||||
@property CharMatcher wordMatcher()
|
||||
{
|
||||
return memoizeExpr!("makeTrie(wordCharacter)")();
|
||||
return memoizeExpr!("CharMatcher(wordCharacter)")();
|
||||
}
|
||||
|
||||
// some special Unicode white space characters
|
||||
|
@ -503,15 +503,15 @@ struct Regex(Char)
|
|||
|
||||
package(std.regex):
|
||||
import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency
|
||||
NamedGroup[] dict; //maps name -> user group number
|
||||
uint ngroup; //number of internal groups
|
||||
uint maxCounterDepth; //max depth of nested {n,m} repetitions
|
||||
uint hotspotTableSize; //number of entries in merge table
|
||||
uint threadCount;
|
||||
uint flags; //global regex flags
|
||||
public const(Trie)[] tries; //
|
||||
public const(BloomFilter)[] filters; // bloom filters for conditional loops
|
||||
uint[] backrefed; //bit array of backreferenced submatches
|
||||
NamedGroup[] dict; // maps name -> user group number
|
||||
uint ngroup; // number of internal groups
|
||||
uint maxCounterDepth; // max depth of nested {n,m} repetitions
|
||||
uint hotspotTableSize; // number of entries in merge table
|
||||
uint threadCount; // upper bound on number of Thompson VM threads
|
||||
uint flags; // global regex flags
|
||||
public const(CharMatcher)[] matchers; // tables that represent character sets
|
||||
public const(BitTable)[] filters; // bloom filters for conditional loops
|
||||
uint[] backrefed; // bit array of backreferenced submatches
|
||||
Kickstart!Char kickstart;
|
||||
|
||||
//bit access helper
|
||||
|
@ -728,8 +728,8 @@ public class RegexException : Exception
|
|||
mixin basicExceptionCtors;
|
||||
}
|
||||
|
||||
|
||||
struct BloomFilter {
|
||||
// simple 128-entry bit-table used with a hash function
|
||||
struct BitTable {
|
||||
uint[4] filter;
|
||||
|
||||
this(CodepointSet set){
|
||||
|
@ -744,12 +744,32 @@ struct BloomFilter {
|
|||
filter[i >> 5] |= 1<<(i & 31);
|
||||
}
|
||||
// non-zero -> might be present, 0 -> absent
|
||||
uint opIndex()(dchar ch) const{
|
||||
bool opIndex()(dchar ch) const{
|
||||
immutable i = index(ch);
|
||||
return filter[i >> 5] & (1<<(i & 31));
|
||||
return (filter[i >> 5]>>(i & 31)) & 1;
|
||||
}
|
||||
|
||||
static uint index()(dchar ch){
|
||||
return ((ch >> 7) ^ ch) & 0x7F;
|
||||
}
|
||||
}
|
||||
|
||||
// Membership test for a code point set, split into two lookup structures:
// a BitTable consulted for code points below 0x80 and a Trie consulted for
// everything else.
struct CharMatcher {
|
||||
BitTable ascii; // fast path for ASCII
|
||||
Trie trie; // slow path for Unicode
|
||||
|
||||
// Builds both structures from `set`: the bit table from the intersection
// of `set` with unicode.ASCII, the trie from the whole of `set`.
this(CodepointSet set)
|
||||
{
|
||||
auto asciiSet = set & unicode.ASCII;
|
||||
ascii = BitTable(asciiSet);
|
||||
trie = makeTrie(set);
|
||||
}
|
||||
|
||||
// Returns the lookup result for `ch`: `ascii[ch]` when ch < 0x80,
// `trie[ch]` otherwise.
// For ch < 0x80, BitTable.index(ch) = ((ch >> 7) ^ ch) & 0x7F reduces to
// ch itself, so each ASCII code point maps to its own bit.
// NOTE(review): presumably this makes the ASCII path exact rather than
// probabilistic - confirm against the full BitTable constructor.
bool opIndex()(dchar ch) const
|
||||
{
|
||||
if (ch < 0x80)
|
||||
return ascii[ch];
|
||||
else
|
||||
return trie[ch];
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,7 +21,7 @@ auto makeRegex(S)(Parser!S p)
|
|||
maxCounterDepth = p.counterDepth;
|
||||
flags = p.re_flags;
|
||||
charsets = p.charsets;
|
||||
tries = p.tries;
|
||||
matchers = p.matchers;
|
||||
backrefed = p.backrefed;
|
||||
re.lightPostprocess();
|
||||
debug(std_regex_parser)
|
||||
|
@ -198,36 +198,6 @@ dchar parseUniHex(Char)(ref Char[] str, size_t maxDigit)
|
|||
.canFind("invalid codepoint"));
|
||||
}
|
||||
|
||||
//heuristic value determines maximum CodepointSet length suitable for linear search
|
||||
enum maxCharsetUsed = 6;
|
||||
|
||||
enum maxCachedTries = 8;
|
||||
|
||||
alias Trie = CodepointSetTrie!(13, 8);
|
||||
alias makeTrie = codepointSetTrie!(13, 8);
|
||||
|
||||
Trie[CodepointSet] trieCache;
|
||||
|
||||
//accessor with caching
|
||||
@trusted Trie getTrie(CodepointSet set)
|
||||
{// @@@BUG@@@ 6357 almost all properties of AA are not @safe
|
||||
if(__ctfe || maxCachedTries == 0)
|
||||
return makeTrie(set);
|
||||
else
|
||||
{
|
||||
auto p = set in trieCache;
|
||||
if(p)
|
||||
return *p;
|
||||
if(trieCache.length == maxCachedTries)
|
||||
{
|
||||
// flush entries in trieCache
|
||||
trieCache = null;
|
||||
}
|
||||
return (trieCache[set] = makeTrie(set));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
auto caseEnclose(CodepointSet set)
|
||||
{
|
||||
auto cased = set & unicode.LC;
|
||||
|
@ -305,7 +275,7 @@ struct Parser(R)
|
|||
uint lookaroundNest = 0;
|
||||
uint counterDepth = 0; //current depth of nested counted repetitions
|
||||
CodepointSet[] charsets; //
|
||||
const(Trie)[] tries; //
|
||||
const(CharMatcher)[] matchers; //
|
||||
uint[] backrefed; //bitarray for groups
|
||||
|
||||
@trusted this(S)(R pattern, S flags)
|
||||
|
@ -1248,18 +1218,18 @@ struct Parser(R)
|
|||
}
|
||||
if(ivals.length*2 > maxCharsetUsed)
|
||||
{
|
||||
auto t = getTrie(set);
|
||||
put(Bytecode(IR.Trie, cast(uint)tries.length));
|
||||
tries ~= t;
|
||||
auto t = getMatcher(set);
|
||||
put(Bytecode(IR.Trie, cast(uint)matchers.length));
|
||||
matchers ~= t;
|
||||
debug(std_regex_allocation) writeln("Trie generated");
|
||||
}
|
||||
else
|
||||
{
|
||||
put(Bytecode(IR.CodepointSet, cast(uint)charsets.length));
|
||||
tries ~= Trie.init;
|
||||
matchers ~= CharMatcher.init;
|
||||
}
|
||||
charsets ~= set;
|
||||
assert(charsets.length == tries.length);
|
||||
assert(charsets.length == matchers.length);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1556,7 +1526,7 @@ void optimize(Char)(ref Regex!Char zis)
|
|||
Bytecode(InfiniteBloomStart, ir[i].data);
|
||||
ir.insertInPlace(i+IRL!(InfiniteEnd),
|
||||
Bytecode.fromRaw(cast(uint)zis.filters.length));
|
||||
zis.filters ~= BloomFilter(set);
|
||||
zis.filters ~= BitTable(set);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -112,21 +112,21 @@ template ThompsonOps(E, S, bool withInput:true)
|
|||
dchar back;
|
||||
DataIndex bi;
|
||||
//at start & end of input
|
||||
if(atStart && wordTrie[front])
|
||||
if(atStart && wordMatcher[front])
|
||||
{
|
||||
t.pc += IRL!(IR.Wordboundary);
|
||||
return true;
|
||||
}
|
||||
else if(atEnd && s.loopBack(index).nextChar(back, bi)
|
||||
&& wordTrie[back])
|
||||
&& wordMatcher[back])
|
||||
{
|
||||
t.pc += IRL!(IR.Wordboundary);
|
||||
return true;
|
||||
}
|
||||
else if(s.loopBack(index).nextChar(back, bi))
|
||||
{
|
||||
bool af = wordTrie[front];
|
||||
bool ab = wordTrie[back];
|
||||
bool af = wordMatcher[front];
|
||||
bool ab = wordMatcher[back];
|
||||
if(af ^ ab)
|
||||
{
|
||||
t.pc += IRL!(IR.Wordboundary);
|
||||
|
@ -144,19 +144,19 @@ template ThompsonOps(E, S, bool withInput:true)
|
|||
dchar back;
|
||||
DataIndex bi;
|
||||
//at start & end of input
|
||||
if(atStart && wordTrie[front])
|
||||
if(atStart && wordMatcher[front])
|
||||
{
|
||||
return popState(e);
|
||||
}
|
||||
else if(atEnd && s.loopBack(index).nextChar(back, bi)
|
||||
&& wordTrie[back])
|
||||
&& wordMatcher[back])
|
||||
{
|
||||
return popState(e);
|
||||
}
|
||||
else if(s.loopBack(index).nextChar(back, bi))
|
||||
{
|
||||
bool af = wordTrie[front];
|
||||
bool ab = wordTrie[back] != 0;
|
||||
bool af = wordMatcher[front];
|
||||
bool ab = wordMatcher[back] != 0;
|
||||
if(af ^ ab)
|
||||
{
|
||||
return popState(e);
|
||||
|
@ -630,7 +630,7 @@ template ThompsonOps(E, S, bool withInput:true)
|
|||
{
|
||||
with(e) with(state)
|
||||
{
|
||||
if(re.tries[re.ir[t.pc].data][front])
|
||||
if(re.matchers[re.ir[t.pc].data][front])
|
||||
{
|
||||
t.pc += IRL!(IR.Trie);
|
||||
nlist.insertBack(t);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue