Special case ASCII to use bit tables instead of 2-level tries

2025-04-28 22:21:09 +03:00 · 2016-03-27 20:11:55 +03:00 · 2016-03-27 20:11:55 +03:00 · df07aa7dea
commit df07aa7dea
parent 0e55583fed
4 changed files with 80 additions and 90 deletions
--- a/std/regex/internal/backtracking.d
+++ b/std/regex/internal/backtracking.d
@ -301,7 +301,7 @@ template BacktrackingMatcher(bool CTregex)
                        pc += IRL!(IR.CodepointSet);
                        break;
                    case IR.Trie:
-                        if(atEnd || !re.tries[re.ir[pc].data][front])
+                        if(atEnd || !re.matchers[re.ir[pc].data][front])
                            goto L_backtrack;
                        next();
                        pc += IRL!(IR.Trie);
@ -310,21 +310,21 @@ template BacktrackingMatcher(bool CTregex)
                        dchar back;
                        DataIndex bi;
                        //at start & end of input
-                        if(atStart && wordTrie[front])
+                        if(atStart && wordMatcher[front])
                        {
                            pc += IRL!(IR.Wordboundary);
                            break;
                        }
                        else if(atEnd && s.loopBack(index).nextChar(back, bi)
-                                && wordTrie[back])
+                                && wordMatcher[back])
                        {
                            pc += IRL!(IR.Wordboundary);
                            break;
                        }
                        else if(s.loopBack(index).nextChar(back, bi))
                        {
-                            bool af = wordTrie[front];
-                            bool ab = wordTrie[back];
+                            bool af = wordMatcher[front];
+                            bool ab = wordMatcher[back];
                            if(af ^ ab)
                            {
                                pc += IRL!(IR.Wordboundary);
@ -336,15 +336,15 @@ template BacktrackingMatcher(bool CTregex)
                        dchar back;
                        DataIndex bi;
                        //at start & end of input
-                        if(atStart && wordTrie[front])
+                        if(atStart && wordMatcher[front])
                            goto L_backtrack;
                        else if(atEnd && s.loopBack(index).nextChar(back, bi)
-                                && wordTrie[back])
+                                && wordMatcher[back])
                            goto L_backtrack;
                        else if(s.loopBack(index).nextChar(back, bi))
                        {
-                            bool af = wordTrie[front];
-                            bool ab = wordTrie[back];
+                            bool af = wordMatcher[front];
+                            bool ab = wordMatcher[back];
                            if(af ^ ab)
                                goto L_backtrack;
                        }
@ -1276,7 +1276,7 @@ struct CtContext
            break;
        case IR.Trie:
            code ~= ctSub( `
-                    if(atEnd || !re.tries[$$][front])
+                    if(atEnd || !re.matchers[$$][front])
                        $$
                    $$
                $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr);
@ -1285,19 +1285,19 @@ struct CtContext
            code ~= ctSub( `
                    dchar back;
                    DataIndex bi;
-                    if(atStart && wordTrie[front])
+                    if(atStart && wordMatcher[front])
                    {
                        $$
                    }
                    else if(atEnd && s.loopBack(index).nextChar(back, bi)
-                            && wordTrie[back])
+                            && wordMatcher[back])
                    {
                        $$
                    }
                    else if(s.loopBack(index).nextChar(back, bi))
                    {
-                        bool af = wordTrie[front];
-                        bool ab = wordTrie[back];
+                        bool af = wordMatcher[front];
+                        bool ab = wordMatcher[back];
                        if(af ^ ab)
                        {
                            $$
@ -1310,15 +1310,15 @@ struct CtContext
                    dchar back;
                    DataIndex bi;
                    //at start & end of input
-                    if(atStart && wordTrie[front])
+                    if(atStart && wordMatcher[front])
                        $$
                    else if(atEnd && s.loopBack(index).nextChar(back, bi)
-                            && wordTrie[back])
+                            && wordMatcher[back])
                        $$
                    else if(s.loopBack(index).nextChar(back, bi))
                    {
-                        bool af = wordTrie[front];
-                        bool ab = wordTrie[back];
+                        bool af = wordMatcher[front];
+                        bool ab = wordMatcher[back];
                        if(af ^ ab)
                            $$
                    }
--- a/std/regex/internal/ir.d
+++ b/std/regex/internal/ir.d
@ -19,29 +19,29 @@ alias BasicElementOf(Range) = Unqual!(ElementEncodingType!Range);
 enum maxCharsetUsed = 6;

 // another variable to tweak behavior of caching generated Tries for character classes
-enum maxCachedTries = 8;
+enum maxCachedMatchers = 8;

 alias Trie = CodepointSetTrie!(13, 8);
 alias makeTrie = codepointSetTrie!(13, 8);

-Trie[CodepointSet] trieCache;
+CharMatcher[CodepointSet] matcherCache;

 //accessor with caching
-@trusted Trie getTrie(CodepointSet set)
+@trusted CharMatcher getMatcher(CodepointSet set)
 {// @@@BUG@@@ 6357 almost all properties of AA are not @safe
-    if(__ctfe || maxCachedTries == 0)
-        return makeTrie(set);
+    if(__ctfe || maxCachedMatchers == 0)
+        return CharMatcher(set);
    else
    {
-        auto p = set in trieCache;
+        auto p = set in matcherCache;
        if(p)
            return *p;
-        if(trieCache.length == maxCachedTries)
+        if(matcherCache.length == maxCachedMatchers)
        {
-            // flush entries in trieCache
-            trieCache = null;
+            // flush entries in matcherCache
+            matcherCache = null;
        }
-        return (trieCache[set] = makeTrie(set));
+        return (matcherCache[set] = CharMatcher(set));
    }
 }

@ -67,9 +67,9 @@ Trie[CodepointSet] trieCache;
        | unicode.Me | unicode.Nd | unicode.Pc")();
 }

-@property Trie wordTrie()
+@property CharMatcher wordMatcher()
 {
-    return memoizeExpr!("makeTrie(wordCharacter)")();
+    return memoizeExpr!("CharMatcher(wordCharacter)")();
 }

 // some special Unicode white space characters
@ -503,15 +503,15 @@ struct Regex(Char)

 package(std.regex):
    import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency
-    NamedGroup[] dict;  //maps name -> user group number
-    uint ngroup;        //number of internal groups
-    uint maxCounterDepth; //max depth of nested {n,m} repetitions
-    uint hotspotTableSize; //number of entries in merge table
-    uint threadCount;
-    uint flags;         //global regex flags
-    public const(Trie)[]  tries; //
-    public const(BloomFilter)[] filters; // bloom filters for conditional loops
-    uint[] backrefed; //bit array of backreferenced submatches
+    NamedGroup[] dict;                     // maps name -> user group number
+    uint ngroup;                           // number of internal groups
+    uint maxCounterDepth;                  // max depth of nested {n,m} repetitions
+    uint hotspotTableSize;                 // number of entries in merge table
+    uint threadCount;                      // upper bound on number of Thompson VM threads 
+    uint flags;                            // global regex flags
+    public const(CharMatcher)[]  matchers; // tables that represent character sets
+    public const(BitTable)[] filters;   // bloom filters for conditional loops
+    uint[] backrefed;                      // bit array of backreferenced submatches
    Kickstart!Char kickstart;

    //bit access helper
@ -728,8 +728,8 @@ public class RegexException : Exception
    mixin basicExceptionCtors;
 }

-
-struct BloomFilter {
+// simple 128-entry bit-table used with a hash function
+struct BitTable {
    uint[4] filter;

    this(CodepointSet set){
@ -744,12 +744,32 @@ struct BloomFilter {
        filter[i >> 5]  |=  1<<(i & 31);
    }
    // non-zero -> might be present, 0 -> absent
-    uint opIndex()(dchar ch) const{
+    bool opIndex()(dchar ch) const{
        immutable i = index(ch);
-        return filter[i >> 5] & (1<<(i & 31));
+        return (filter[i >> 5]>>(i & 31)) & 1;
    }

    static uint index()(dchar ch){
        return ((ch >> 7) ^ ch) & 0x7F;
    }
 }
+
+struct CharMatcher {
+    BitTable ascii; // fast path for ASCII
+    Trie trie;      // slow path for Unicode
+
+    this(CodepointSet set)
+    {
+        auto asciiSet = set & unicode.ASCII;
+        ascii = BitTable(asciiSet);
+        trie = makeTrie(set);
+    }
+
+    bool opIndex()(dchar ch) const
+    {
+        if (ch < 0x80)
+            return ascii[ch];
+        else
+            return trie[ch];
+    }
+}
--- a/std/regex/internal/parser.d
+++ b/std/regex/internal/parser.d
@ -21,7 +21,7 @@ auto makeRegex(S)(Parser!S p)
        maxCounterDepth = p.counterDepth;
        flags = p.re_flags;
        charsets = p.charsets;
-        tries = p.tries;
+        matchers = p.matchers;
        backrefed = p.backrefed;
        re.lightPostprocess();
        debug(std_regex_parser)
@ -198,36 +198,6 @@ dchar parseUniHex(Char)(ref Char[] str, size_t maxDigit)
      .canFind("invalid codepoint"));
 }

-//heuristic value determines maximum CodepointSet length suitable for linear search
-enum maxCharsetUsed = 6;
-
-enum maxCachedTries = 8;
-
-alias Trie = CodepointSetTrie!(13, 8);
-alias makeTrie = codepointSetTrie!(13, 8);
-
-Trie[CodepointSet] trieCache;
-
-//accessor with caching
-@trusted Trie getTrie(CodepointSet set)
-{// @@@BUG@@@ 6357 almost all properties of AA are not @safe
-    if(__ctfe || maxCachedTries == 0)
-        return makeTrie(set);
-    else
-    {
-        auto p = set in trieCache;
-        if(p)
-            return *p;
-        if(trieCache.length == maxCachedTries)
-        {
-            // flush entries in trieCache
-            trieCache = null;
-        }
-        return (trieCache[set] = makeTrie(set));
-    }
-}
-
-
 auto caseEnclose(CodepointSet set)
 {
    auto cased = set & unicode.LC;
@ -305,7 +275,7 @@ struct Parser(R)
    uint lookaroundNest = 0;
    uint counterDepth = 0; //current depth of nested counted repetitions
    CodepointSet[] charsets;  //
-    const(Trie)[] tries; //
+    const(CharMatcher)[] matchers; //
    uint[] backrefed; //bitarray for groups

    @trusted this(S)(R pattern, S flags)
@ -1248,18 +1218,18 @@ struct Parser(R)
            }
            if(ivals.length*2 > maxCharsetUsed)
            {
-                auto t  = getTrie(set);
-                put(Bytecode(IR.Trie, cast(uint)tries.length));
-                tries ~= t;
+                auto t  = getMatcher(set);
+                put(Bytecode(IR.Trie, cast(uint)matchers.length));
+                matchers ~= t;
                debug(std_regex_allocation) writeln("Trie generated");
            }
            else
            {
                put(Bytecode(IR.CodepointSet, cast(uint)charsets.length));
-                tries ~= Trie.init;
+                matchers ~= CharMatcher.init;
            }
            charsets ~= set;
-            assert(charsets.length == tries.length);
+            assert(charsets.length == matchers.length);
        }
    }

@ -1556,7 +1526,7 @@ void optimize(Char)(ref Regex!Char zis)
                    Bytecode(InfiniteBloomStart, ir[i].data);
                ir.insertInPlace(i+IRL!(InfiniteEnd),
                    Bytecode.fromRaw(cast(uint)zis.filters.length));
-                zis.filters ~= BloomFilter(set);
+                zis.filters ~= BitTable(set);
            }
        }
    }
--- a/std/regex/internal/thompson.d
+++ b/std/regex/internal/thompson.d
@ -112,21 +112,21 @@ template ThompsonOps(E, S, bool withInput:true)
            dchar back;
            DataIndex bi;
            //at start & end of input
-            if(atStart && wordTrie[front])
+            if(atStart && wordMatcher[front])
            {
                t.pc += IRL!(IR.Wordboundary);
                return true;
            }
            else if(atEnd && s.loopBack(index).nextChar(back, bi)
-                    && wordTrie[back])
+                    && wordMatcher[back])
            {
                t.pc += IRL!(IR.Wordboundary);
                return true;
            }
            else if(s.loopBack(index).nextChar(back, bi))
            {
-                bool af = wordTrie[front];
-                bool ab = wordTrie[back];
+                bool af = wordMatcher[front];
+                bool ab = wordMatcher[back];
                if(af ^ ab)
                {
                    t.pc += IRL!(IR.Wordboundary);
@ -144,19 +144,19 @@ template ThompsonOps(E, S, bool withInput:true)
            dchar back;
            DataIndex bi;
            //at start & end of input
-            if(atStart && wordTrie[front])
+            if(atStart && wordMatcher[front])
            {
                return popState(e);
            }
            else if(atEnd && s.loopBack(index).nextChar(back, bi)
-                    && wordTrie[back])
+                    && wordMatcher[back])
            {
                return popState(e);
            }
            else if(s.loopBack(index).nextChar(back, bi))
            {
-                bool af = wordTrie[front];
-                bool ab = wordTrie[back]  != 0;
+                bool af = wordMatcher[front];
+                bool ab = wordMatcher[back]  != 0;
                if(af ^ ab)
                {
                    return popState(e);
@ -630,7 +630,7 @@ template ThompsonOps(E, S, bool withInput:true)
    {
        with(e) with(state)
        {
-            if(re.tries[re.ir[t.pc].data][front])
+            if(re.matchers[re.ir[t.pc].data][front])
            {
                t.pc += IRL!(IR.Trie);
                nlist.insertBack(t);