From df07aa7dea259a91acb3e84ea89b48eb3550b17f Mon Sep 17 00:00:00 2001
From: Dmitry Olshansky <dmitry.olsh@gmail.com>
Date: Sun, 27 Mar 2016 20:11:55 +0300
Subject: [PATCH] Special case ASCII to use bit tables instead of 2-level tries

---
 std/regex/internal/backtracking.d | 36 ++++++++--------
 std/regex/internal/ir.d           | 70 ++++++++++++++++++++-----------
 std/regex/internal/parser.d       | 46 ++++----------------
 std/regex/internal/thompson.d     | 18 ++++----
 4 files changed, 80 insertions(+), 90 deletions(-)

diff --git a/std/regex/internal/backtracking.d b/std/regex/internal/backtracking.d
index 8f16cb5f1..bd5c82f66 100644
--- a/std/regex/internal/backtracking.d
+++ b/std/regex/internal/backtracking.d
@@ -301,7 +301,7 @@ template BacktrackingMatcher(bool CTregex)
                         pc += IRL!(IR.CodepointSet);
                         break;
                     case IR.Trie:
-                        if(atEnd || !re.tries[re.ir[pc].data][front])
+                        if(atEnd || !re.matchers[re.ir[pc].data][front])
                             goto L_backtrack;
                         next();
                         pc += IRL!(IR.Trie);
@@ -310,21 +310,21 @@ template BacktrackingMatcher(bool CTregex)
                         dchar back;
                         DataIndex bi;
                         //at start & end of input
-                        if(atStart && wordTrie[front])
+                        if(atStart && wordMatcher[front])
                         {
                             pc += IRL!(IR.Wordboundary);
                             break;
                         }
                         else if(atEnd && s.loopBack(index).nextChar(back, bi)
-                                && wordTrie[back])
+                                && wordMatcher[back])
                         {
                             pc += IRL!(IR.Wordboundary);
                             break;
                         }
                         else if(s.loopBack(index).nextChar(back, bi))
                         {
-                            bool af = wordTrie[front];
-                            bool ab = wordTrie[back];
+                            bool af = wordMatcher[front];
+                            bool ab = wordMatcher[back];
                             if(af ^ ab)
                             {
                                 pc += IRL!(IR.Wordboundary);
@@ -336,15 +336,15 @@ template BacktrackingMatcher(bool CTregex)
                         dchar back;
                         DataIndex bi;
                         //at start & end of input
-                        if(atStart && wordTrie[front])
+                        if(atStart && wordMatcher[front])
                             goto L_backtrack;
                         else if(atEnd && s.loopBack(index).nextChar(back, bi)
-                                && wordTrie[back])
+                                && wordMatcher[back])
                             goto L_backtrack;
                         else if(s.loopBack(index).nextChar(back, bi))
                         {
-                            bool af = wordTrie[front];
-                            bool ab = wordTrie[back];
+                            bool af = wordMatcher[front];
+                            bool ab = wordMatcher[back];
                             if(af ^ ab)
                                 goto L_backtrack;
                         }
@@ -1276,7 +1276,7 @@ struct CtContext
             break;
         case IR.Trie:
             code ~= ctSub( `
-                    if(atEnd || !re.tries[$$][front])
+                    if(atEnd || !re.matchers[$$][front])
                         $$
                     $$
                 $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr);
@@ -1285,19 +1285,19 @@ struct CtContext
             code ~= ctSub( `
                     dchar back;
                     DataIndex bi;
-                    if(atStart && wordTrie[front])
+                    if(atStart && wordMatcher[front])
                     {
                         $$
                     }
                     else if(atEnd && s.loopBack(index).nextChar(back, bi)
-                            && wordTrie[back])
+                            && wordMatcher[back])
                     {
                         $$
                     }
                     else if(s.loopBack(index).nextChar(back, bi))
                     {
-                        bool af = wordTrie[front];
-                        bool ab = wordTrie[back];
+                        bool af = wordMatcher[front];
+                        bool ab = wordMatcher[back];
                         if(af ^ ab)
                         {
                             $$
@@ -1310,15 +1310,15 @@ struct CtContext
                     dchar back;
                     DataIndex bi;
                     //at start & end of input
-                    if(atStart && wordTrie[front])
+                    if(atStart && wordMatcher[front])
                         $$
                     else if(atEnd && s.loopBack(index).nextChar(back, bi)
-                            && wordTrie[back])
+                            && wordMatcher[back])
                         $$
                     else if(s.loopBack(index).nextChar(back, bi))
                     {
-                        bool af = wordTrie[front];
-                        bool ab = wordTrie[back];
+                        bool af = wordMatcher[front];
+                        bool ab = wordMatcher[back];
                         if(af ^ ab)
                             $$
                     }
diff --git a/std/regex/internal/ir.d b/std/regex/internal/ir.d
index 0724d4e90..3c7bca8d2 100644
--- a/std/regex/internal/ir.d
+++ b/std/regex/internal/ir.d
@@ -19,29 +19,29 @@ alias BasicElementOf(Range) = Unqual!(ElementEncodingType!Range);
 enum maxCharsetUsed = 6;
 
 // another variable to tweak behavior of caching generated Tries for character classes
-enum maxCachedTries = 8;
+enum maxCachedMatchers = 8;
 
 alias Trie = CodepointSetTrie!(13, 8);
 alias makeTrie = codepointSetTrie!(13, 8);
 
-Trie[CodepointSet] trieCache;
+CharMatcher[CodepointSet] matcherCache;
 
 //accessor with caching
-@trusted Trie getTrie(CodepointSet set)
+@trusted CharMatcher getMatcher(CodepointSet set)
 {// @@@BUG@@@ 6357 almost all properties of AA are not @safe
-    if(__ctfe || maxCachedTries == 0)
-        return makeTrie(set);
+    if(__ctfe || maxCachedMatchers == 0)
+        return CharMatcher(set);
     else
     {
-        auto p = set in trieCache;
+        auto p = set in matcherCache;
         if(p)
             return *p;
-        if(trieCache.length == maxCachedTries)
+        if(matcherCache.length == maxCachedMatchers)
         {
-            // flush entries in trieCache
-            trieCache = null;
+            // flush entries in matcherCache
+            matcherCache = null;
         }
-        return (trieCache[set] = makeTrie(set));
+        return (matcherCache[set] = CharMatcher(set));
     }
 }
 
@@ -67,9 +67,9 @@ Trie[CodepointSet] trieCache;
         | unicode.Me | unicode.Nd | unicode.Pc")();
 }
 
-@property Trie wordTrie()
+@property CharMatcher wordMatcher()
 {
-    return memoizeExpr!("makeTrie(wordCharacter)")();
+    return memoizeExpr!("CharMatcher(wordCharacter)")();
 }
 
 // some special Unicode white space characters
@@ -503,15 +503,15 @@ struct Regex(Char)
 
 package(std.regex):
     import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency
-    NamedGroup[] dict;  //maps name -> user group number
-    uint ngroup;        //number of internal groups
-    uint maxCounterDepth; //max depth of nested {n,m} repetitions
-    uint hotspotTableSize; //number of entries in merge table
-    uint threadCount;
-    uint flags;         //global regex flags
-    public const(Trie)[]  tries; //
-    public const(BloomFilter)[] filters; // bloom filters for conditional loops
-    uint[] backrefed; //bit array of backreferenced submatches
+    NamedGroup[] dict;                     // maps name -> user group number
+    uint ngroup;                           // number of internal groups
+    uint maxCounterDepth;                  // max depth of nested {n,m} repetitions
+    uint hotspotTableSize;                 // number of entries in merge table
+    uint threadCount;                      // upper bound on number of Thompson VM threads 
+    uint flags;                            // global regex flags
+    public const(CharMatcher)[]  matchers; // tables that represent character sets
+    public const(BitTable)[] filters;   // bloom filters for conditional loops
+    uint[] backrefed;                      // bit array of backreferenced submatches
     Kickstart!Char kickstart;
 
     //bit access helper
@@ -728,8 +728,8 @@ public class RegexException : Exception
     mixin basicExceptionCtors;
 }
 
-
-struct BloomFilter {
+// simple 128-entry bit-table used with a hash function
+struct BitTable {
     uint[4] filter;
 
     this(CodepointSet set){
@@ -744,12 +744,32 @@ struct BloomFilter {
         filter[i >> 5]  |=  1<<(i & 31);
     }
     // non-zero -> might be present, 0 -> absent
-    uint opIndex()(dchar ch) const{
+    bool opIndex()(dchar ch) const{
         immutable i = index(ch);
-        return filter[i >> 5] & (1<<(i & 31));
+        return (filter[i >> 5]>>(i & 31)) & 1;
     }
 
     static uint index()(dchar ch){
         return ((ch >> 7) ^ ch) & 0x7F;
     }
 }
+
+struct CharMatcher {
+    BitTable ascii; // fast path for ASCII
+    Trie trie;      // slow path for Unicode
+
+    this(CodepointSet set)
+    {
+        auto asciiSet = set & unicode.ASCII;
+        ascii = BitTable(asciiSet);
+        trie = makeTrie(set);
+    }
+
+    bool opIndex()(dchar ch) const
+    {
+        if (ch < 0x80)
+            return ascii[ch];
+        else
+            return trie[ch];
+    }
+}
diff --git a/std/regex/internal/parser.d b/std/regex/internal/parser.d
index c3e513a17..32889131e 100644
--- a/std/regex/internal/parser.d
+++ b/std/regex/internal/parser.d
@@ -21,7 +21,7 @@ auto makeRegex(S)(Parser!S p)
         maxCounterDepth = p.counterDepth;
         flags = p.re_flags;
         charsets = p.charsets;
-        tries = p.tries;
+        matchers = p.matchers;
         backrefed = p.backrefed;
         re.lightPostprocess();
         debug(std_regex_parser)
@@ -198,36 +198,6 @@ dchar parseUniHex(Char)(ref Char[] str, size_t maxDigit)
       .canFind("invalid codepoint"));
 }
 
-//heuristic value determines maximum CodepointSet length suitable for linear search
-enum maxCharsetUsed = 6;
-
-enum maxCachedTries = 8;
-
-alias Trie = CodepointSetTrie!(13, 8);
-alias makeTrie = codepointSetTrie!(13, 8);
-
-Trie[CodepointSet] trieCache;
-
-//accessor with caching
-@trusted Trie getTrie(CodepointSet set)
-{// @@@BUG@@@ 6357 almost all properties of AA are not @safe
-    if(__ctfe || maxCachedTries == 0)
-        return makeTrie(set);
-    else
-    {
-        auto p = set in trieCache;
-        if(p)
-            return *p;
-        if(trieCache.length == maxCachedTries)
-        {
-            // flush entries in trieCache
-            trieCache = null;
-        }
-        return (trieCache[set] = makeTrie(set));
-    }
-}
-
-
 auto caseEnclose(CodepointSet set)
 {
     auto cased = set & unicode.LC;
@@ -305,7 +275,7 @@ struct Parser(R)
     uint lookaroundNest = 0;
     uint counterDepth = 0; //current depth of nested counted repetitions
     CodepointSet[] charsets;  //
-    const(Trie)[] tries; //
+    const(CharMatcher)[] matchers; //
     uint[] backrefed; //bitarray for groups
 
     @trusted this(S)(R pattern, S flags)
@@ -1248,18 +1218,18 @@ struct Parser(R)
             }
             if(ivals.length*2 > maxCharsetUsed)
             {
-                auto t  = getTrie(set);
-                put(Bytecode(IR.Trie, cast(uint)tries.length));
-                tries ~= t;
+                auto t  = getMatcher(set);
+                put(Bytecode(IR.Trie, cast(uint)matchers.length));
+                matchers ~= t;
                 debug(std_regex_allocation) writeln("Trie generated");
             }
             else
             {
                 put(Bytecode(IR.CodepointSet, cast(uint)charsets.length));
-                tries ~= Trie.init;
+                matchers ~= CharMatcher.init;
             }
             charsets ~= set;
-            assert(charsets.length == tries.length);
+            assert(charsets.length == matchers.length);
         }
     }
 
@@ -1556,7 +1526,7 @@ void optimize(Char)(ref Regex!Char zis)
                     Bytecode(InfiniteBloomStart, ir[i].data);
                 ir.insertInPlace(i+IRL!(InfiniteEnd),
                     Bytecode.fromRaw(cast(uint)zis.filters.length));
-                zis.filters ~= BloomFilter(set);
+                zis.filters ~= BitTable(set);
             }
         }
     }
diff --git a/std/regex/internal/thompson.d b/std/regex/internal/thompson.d
index c72a80db3..83493d9ae 100644
--- a/std/regex/internal/thompson.d
+++ b/std/regex/internal/thompson.d
@@ -112,21 +112,21 @@ template ThompsonOps(E, S, bool withInput:true)
             dchar back;
             DataIndex bi;
             //at start & end of input
-            if(atStart && wordTrie[front])
+            if(atStart && wordMatcher[front])
             {
                 t.pc += IRL!(IR.Wordboundary);
                 return true;
             }
             else if(atEnd && s.loopBack(index).nextChar(back, bi)
-                    && wordTrie[back])
+                    && wordMatcher[back])
             {
                 t.pc += IRL!(IR.Wordboundary);
                 return true;
             }
             else if(s.loopBack(index).nextChar(back, bi))
             {
-                bool af = wordTrie[front];
-                bool ab = wordTrie[back];
+                bool af = wordMatcher[front];
+                bool ab = wordMatcher[back];
                 if(af ^ ab)
                 {
                     t.pc += IRL!(IR.Wordboundary);
@@ -144,19 +144,19 @@ template ThompsonOps(E, S, bool withInput:true)
             dchar back;
             DataIndex bi;
             //at start & end of input
-            if(atStart && wordTrie[front])
+            if(atStart && wordMatcher[front])
             {
                 return popState(e);
             }
             else if(atEnd && s.loopBack(index).nextChar(back, bi)
-                    && wordTrie[back])
+                    && wordMatcher[back])
             {
                 return popState(e);
             }
             else if(s.loopBack(index).nextChar(back, bi))
             {
-                bool af = wordTrie[front];
-                bool ab = wordTrie[back]  != 0;
+                bool af = wordMatcher[front];
+                bool ab = wordMatcher[back]  != 0;
                 if(af ^ ab)
                 {
                     return popState(e);
@@ -630,7 +630,7 @@ template ThompsonOps(E, S, bool withInput:true)
     {
         with(e) with(state)
         {
-            if(re.tries[re.ir[t.pc].data][front])
+            if(re.matchers[re.ir[t.pc].data][front])
             {
                 t.pc += IRL!(IR.Trie);
                 nlist.insertBack(t);