String interning is now shared and nearly lock free

Hackerpilot 2014-02-26 00:22:01 -08:00
parent d979e7ca22
commit 299969b252
4 changed files with 124 additions and 169 deletions
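In API terms the change is small: the index-based cache()/get()/cacheGet() methods become a single intern() that both stores and returns the string, and StringCache itself becomes shared, so one instance can serve every lexer thread. A sketch of the difference, using only signatures that appear in the diffs below; the two halves cannot coexist in one build, since the commit deletes the old methods:

    // Before this commit: two steps through an index, one thread at a time.
    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
    ubyte[] bytes = ['a', 'b', 'c'];
    size_t key = cache.cache(bytes); // store, receive an index back
    string s = cache.get(key);       // look the string up again
    assert(s == "abc");

    // After this commit: one call, usable from many threads at once.
    shared(StringCache)* cache2 = new shared StringCache(StringCache.defaultBucketCount);
    string t = cache2.intern(bytes); // store if absent, return the stored string
    assert(t == "abc");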

View File

@@ -20,7 +20,7 @@ void printCtags(File output, string[] fileNames)
 {
     string[] tags;
     LexerConfig config;
-    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
+    shared(StringCache)* cache = new shared StringCache(StringCache.defaultBucketCount);
     foreach (fileName; fileNames)
     {
         File f = File(fileName);

main.d
View File

@@ -91,7 +91,7 @@ int main(string[] args)
         return 1;
     }
-    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
+    shared(StringCache)* cache = new shared StringCache(StringCache.defaultBucketCount);
     if (tokenDump || highlight)
     {

View File

@@ -402,7 +402,7 @@ public struct DLexer
     mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens,
         keywords, pseudoTokenHandlers);

-    this(ubyte[] range, const LexerConfig config, StringCache* cache)
+    this(ubyte[] range, const LexerConfig config, shared(StringCache)* cache)
     {
         this.range = LexerRange(range);
         this.config = config;
@@ -550,7 +550,7 @@ public struct DLexer
             }
         } while (!range.empty);
         string text = config.whitespaceBehavior == WhitespaceBehavior.skip
-            ? null : cache.cacheGet(range.slice(mark));
+            ? null : cache.intern(range.slice(mark));
         return Token(tok!"whitespace", text, line, column, index);
     }
@@ -631,7 +631,7 @@ public struct DLexer
                 break hexLoop;
             }
         }
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column,
+        return Token(type, cache.intern(range.slice(mark)), line, column,
             index);
     }
@@ -662,7 +662,7 @@ public struct DLexer
                 break binaryLoop;
             }
         }
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column,
+        return Token(type, cache.intern(range.slice(mark)), line, column,
             index);
     }
@@ -743,7 +743,7 @@ public struct DLexer
                 break decimalLoop;
             }
         }
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column,
+        return Token(type, cache.intern(range.slice(mark)), line, column,
             index);
     }
@@ -852,7 +852,7 @@ public struct DLexer
         mixin (tokenStart);
         while (!range.empty && !isNewline)
             range.popFront();
-        return Token(tok!"scriptLine", cache.cacheGet(range.slice(mark)),
+        return Token(tok!"scriptLine", cache.intern(range.slice(mark)),
             line, column, index);
     }
@@ -861,7 +861,7 @@ public struct DLexer
         mixin (tokenStart);
         while (!range.empty && !isNewline)
             range.popFront();
-        return Token(tok!"specialTokenSequence", cache.cacheGet(range.slice(mark)),
+        return Token(tok!"specialTokenSequence", cache.intern(range.slice(mark)),
             line, column, index);
     }
@@ -885,7 +885,7 @@ public struct DLexer
             else
                 popFrontWhitespaceAware();
         }
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column,
+        return Token(type, cache.intern(range.slice(mark)), line, column,
             index);
     }
@@ -901,7 +901,7 @@ public struct DLexer
                 break;
             range.popFront();
         }
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column,
+        return Token(type, cache.intern(range.slice(mark)), line, column,
             index);
     }
@@ -935,7 +935,7 @@ public struct DLexer
             else
                 popFrontWhitespaceAware();
         }
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column,
+        return Token(type, cache.intern(range.slice(mark)), line, column,
             index);
     }
@@ -964,7 +964,7 @@ public struct DLexer
         }
         IdType type = tok!"stringLiteral";
         lexStringSuffix(type);
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column,
+        return Token(type, cache.intern(range.slice(mark)), line, column,
             index);
     }
@@ -1018,7 +1018,7 @@ public struct DLexer
             }
         }
         lexStringSuffix(type);
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column,
+        return Token(type, cache.intern(range.slice(mark)), line, column,
             index);
     }
@@ -1105,7 +1105,7 @@ public struct DLexer
         }
         IdType type = tok!"stringLiteral";
         lexStringSuffix(type);
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column, index);
+        return Token(type, cache.intern(range.slice(mark)), line, column, index);
     }

     Token lexHeredocString(size_t mark, size_t line, size_t column, size_t index)
@@ -1142,7 +1142,7 @@ public struct DLexer
             error(`" expected`);
         IdType type = tok!"stringLiteral";
         lexStringSuffix(type);
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column, index);
+        return Token(type, cache.intern(range.slice(mark)), line, column, index);
     }

     Token lexTokenString() pure
@@ -1186,7 +1186,7 @@ public struct DLexer
         }
         IdType type = tok!"stringLiteral";
         lexStringSuffix(type);
-        return Token(type, cache.cacheGet(cast(const(ubyte)[]) app.data), line,
+        return Token(type, cache.intern(cast(const(ubyte)[]) app.data), line,
             column, index);
     }
@@ -1223,7 +1223,7 @@ public struct DLexer
         IdType type = tok!"stringLiteral";
         lexStringSuffix(type);
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column,
+        return Token(type, cache.intern(range.slice(mark)), line, column,
             index);
     }
@@ -1332,7 +1332,7 @@ public struct DLexer
         else if (range.front == '\'')
         {
             range.popFront();
-            return Token(tok!"characterLiteral", cache.cacheGet(range.slice(mark)),
+            return Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
                 line, column, index);
         }
         else if (range.front & 0x80)
@@ -1350,7 +1350,7 @@ public struct DLexer
         if (range.front == '\'')
         {
             range.popFront();
-            return Token(tok!"characterLiteral", cache.cacheGet(range.slice(mark)),
+            return Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
                 line, column, index);
         }
         else
@@ -1375,7 +1375,7 @@ public struct DLexer
             hash = StringCache.hashStep(range.front, hash);
             range.popFront();
         }
-        return Token(tok!"identifier", cache.cacheGet(range.slice(mark), hash), line,
+        return Token(tok!"identifier", cache.intern(range.slice(mark), hash), line,
             column, index);
     }
@@ -1414,7 +1414,7 @@ public struct DLexer
         range.popFront();
         range.popFront();
         range.incrementLine();
-        return Token(tok!"whitespace", cache.cacheGet(range.slice(mark)), line,
+        return Token(tok!"whitespace", cache.intern(range.slice(mark)), line,
             column, index);
     }
@@ -1474,24 +1474,24 @@ public struct DLexer
     }

     Message[] messages;
-    StringCache* cache;
+    shared(StringCache)* cache;
     LexerConfig config;
 }

 public auto byToken(ubyte[] range)
 {
     LexerConfig config;
-    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
+    shared(StringCache)* cache = new shared StringCache(StringCache.defaultBucketCount);
     return DLexer(range, config, cache);
 }

-public auto byToken(ubyte[] range, StringCache* cache)
+public auto byToken(ubyte[] range, shared(StringCache)* cache)
 {
     LexerConfig config;
     return DLexer(range, config, cache);
 }

-public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache)
+public auto byToken(ubyte[] range, const LexerConfig config, shared(StringCache)* cache)
 {
     return DLexer(range, config, cache);
 }
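The lexer-side changes above are mechanical: every StringCache* becomes shared(StringCache)*, and every cacheGet() call becomes intern(), which hands back the interned string directly. The payoff is that several lexers can feed one de-duplicating cache at the same time. A minimal sketch of that usage; byToken and its overloads come from this diff, while the lexFiles helper and the use of std.parallelism are illustrative assumptions:

    import std.file : read;
    import std.parallelism : parallel;

    void lexFiles(string[] fileNames)
    {
        // One cache shared by every worker; interning is the only cross-thread
        // mutation, and the cache handles it internally.
        shared(StringCache)* cache = new shared StringCache(StringCache.defaultBucketCount);
        foreach (fileName; parallel(fileNames))
        {
            LexerConfig config;
            foreach (token; byToken(cast(ubyte[]) read(fileName), config, cache))
            {
                // Equal token texts refer to the same cached string no matter
                // which thread interned them first.
            }
        }
    }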

View File

@@ -633,7 +633,6 @@ mixin template Lexer(Token, alias defaultTokenFunction,
  */
 struct LexerRange
 {
-
     /**
      * Params:
      * bytes = the _lexer input
@@ -767,17 +766,11 @@ struct LexerRange
     }

 /**
- * The string cache implements a map/set for strings. Placing a string in the
- * cache returns an identifier that can be used to instantly access the stored
- * string. It is then possible to simply compare these indexes instead of
- * performing full string comparisons when comparing the string content of
- * dynamic tokens. The string cache also handles its own memory, so that mutable
- * ubyte[] to lexers can still have immutable string fields in their tokens.
- * Because the string cache also performs de-duplication it is possible to
- * drastically reduce the memory usage of a lexer.
+ * FREAKIN' MAAAGIC
  */
-struct StringCache
+shared struct StringCache
 {
+    import core.sync.mutex;
 public:

     @disable this();
@@ -787,34 +780,13 @@ public:
      */
     this(size_t bucketCount)
     {
-        buckets = new Item*[bucketCount];
+        buckets = cast(shared) new Node*[bucketCount];
+        allocating = false;
     }
-
-    /**
-     * Equivalent to calling cache() and get().
-     * ---
-     * StringCache cache;
-     * ubyte[] str = ['a', 'b', 'c'];
-     * string s = cache.get(cache.cache(str));
-     * assert(s == "abc");
-     * ---
-     */
-    string cacheGet(const(ubyte[]) bytes) pure nothrow @safe
-    {
-        return get(cache(bytes));
-    }
-
-    /**
-     * Equivalent to calling cache() and get().
-     */
-    string cacheGet(const(ubyte[]) bytes, uint hash) pure nothrow @safe
-    {
-        return get(cache(bytes, hash));
-    }

     /**
      * Caches a string.
-     * Params: bytes = the string to _cache
+     * Params: str = the string to intern
      * Returns: A key that can be used to retrieve the cached string
      * Examples:
      * ---
@@ -825,10 +797,10 @@ public:
      * assert (first == second);
      * ---
      */
-    size_t cache(const(ubyte)[] bytes) pure nothrow @safe
+    string intern(const(ubyte)[] str) pure nothrow @safe
     {
-        immutable uint hash = hashBytes(bytes);
-        return cache(bytes, hash);
+        immutable uint hash = hashBytes(str);
+        return intern(str, hash);
     }

     /**
@@ -836,51 +808,14 @@ public:
      * calculating one itself. Use this alongside $(LREF hashStep)() can reduce the
      * amount of work necessary when lexing dynamic tokens.
      */
-    size_t cache(const(ubyte)[] bytes, uint hash) pure nothrow @safe
+    string intern(const(ubyte)[] str, uint hash) pure nothrow @safe
     in
     {
-        assert (bytes.length > 0);
-    }
-    out (retVal)
-    {
-        assert (retVal < items.length);
+        assert (str.length > 0);
     }
     body
     {
-        debug memoryRequested += bytes.length;
-        const(Item)* found = find(bytes, hash);
-        if (found is null)
-            return intern(bytes, hash);
-        return found.index;
-    }
-
-    /**
-     * Gets a cached string based on its key.
-     * Params: index = the key
-     * Returns: the cached string
-     */
-    string get(size_t index) const pure nothrow @safe
-    in
-    {
-        assert (items.length > index);
-        assert (items[index] !is null);
-    }
-    out (retVal)
-    {
-        assert (retVal !is null);
-    }
-    body
-    {
-        return items[index].str;
-    }
-
-    debug void printStats()
-    {
-        import std.stdio;
-        writeln("Load Factor: ", cast(float) items.length / cast(float) buckets.length);
-        writeln("Memory used by blocks: ", blocks.length * blockSize);
-        writeln("Memory requsted: ", memoryRequested);
-        writeln("rehashes: ", rehashCount);
+        return _intern(str, hash);
     }

     /**
@@ -902,72 +837,52 @@ public:
 private:

-    private void rehash() pure nothrow @safe
-    {
-        immutable size_t newBucketCount = items.length * 2;
-        buckets = new Item*[newBucketCount];
-        debug rehashCount++;
-        foreach (item; items)
-        {
-            immutable size_t newIndex = item.hash % newBucketCount;
-            item.next = buckets[newIndex];
-            buckets[newIndex] = item;
-        }
-    }
-
-    size_t intern(const(ubyte)[] bytes, uint hash) pure nothrow @trusted
-    {
-        ubyte[] mem = allocate(bytes.length);
-        mem[] = bytes[];
-        Item* item = cast(Item*) allocate(Item.sizeof).ptr;
-        item.index = items.length;
-        item.str = cast(string) mem;
-        item.hash = hash;
-        item.next = buckets[hash % buckets.length];
-        immutable bool checkLoadFactor = item.next !is null;
-        buckets[hash % buckets.length] = item;
-        items ~= item;
-        if (checkLoadFactor && (cast(float) items.length / cast(float) buckets.length) > 0.75)
-            rehash();
-        return item.index;
-    }
-
-    const(Item)* find(const(ubyte)[] bytes, uint hash) pure nothrow const @safe
-    {
-        import std.algorithm;
-        immutable size_t index = hash % buckets.length;
-        for (const(Item)* item = buckets[index]; item !is null; item = item.next)
-        {
-            if (item.hash == hash && bytes.equal(item.str))
-                return item;
-        }
-        return null;
-    }
-
-    ubyte[] allocate(size_t byteCount) pure nothrow @trusted
-    {
-        import core.memory;
-        if (byteCount > (blockSize / 4))
-        {
-            ubyte* mem = cast(ubyte*) GC.malloc(byteCount, GC.BlkAttr.NO_SCAN);
-            return mem[0 .. byteCount];
-        }
-        foreach (ref block; blocks)
-        {
-            immutable size_t oldUsed = block.used;
-            immutable size_t end = oldUsed + byteCount;
-            if (end > block.bytes.length)
-                continue;
-            block.used = end;
-            return block.bytes[oldUsed .. end];
-        }
-        blocks ~= Block(
-            (cast(ubyte*) GC.malloc(blockSize, GC.BlkAttr.NO_SCAN))[0 .. blockSize],
-            byteCount);
-        return blocks[$ - 1].bytes[0 .. byteCount];
-    }
-
-    static uint hashBytes(const(ubyte)[] data) pure nothrow @safe
+    string _intern(const(ubyte)[] bytes, uint hash) pure nothrow @trusted
+    {
+        import core.atomic;
+        import core.memory;
+        shared ubyte[] mem;
+        shared(Node*)* oldBucketRoot = &buckets[hash % buckets.length];
+        while (true)
+        {
+            bool found;
+            shared(Node)* s = find(bytes, hash, found);
+            shared(Node)* n = s is null ? null : s.next;
+            if (found)
+                return cast(string) s.str;
+            if (mem.length == 0)
+            {
+                mem = allocate(bytes.length);
+                mem[] = bytes[];
+            }
+            shared(Node)* node = new shared Node(mem, hash, null);
+            if (s is null && cas(oldBucketRoot, *oldBucketRoot, node))
+                break;
+            node.next = s.next;
+            if (cas(&s.next, n, node))
+                break;
+        }
+        return cast(string) mem;
+    }
+
+    shared(Node)* find(const(ubyte)[] bytes, uint hash, ref bool found) pure nothrow @trusted
+    {
+        import std.algorithm;
+        immutable size_t index = hash % buckets.length;
+        shared(Node)* node = buckets[index];
+        while (node !is null)
+        {
+            if (node.hash >= hash && bytes.equal(cast(ubyte[]) node.str))
+            {
+                found = true;
+                return node;
+            }
+            node = node.next;
+        }
+        return node;
+    }
+
+    static uint hashBytes(const(ubyte)[] data) pure nothrow @trusted
     {
         uint hash = 0;
         foreach (b; data)
@@ -978,23 +893,65 @@ private:
         return hash;
     }

-    static struct Item
-    {
-        size_t index;
-        string str;
-        uint hash;
-        Item* next;
-    }
+    shared(ubyte[]) allocate(immutable size_t numBytes) pure nothrow @trusted
+    in
+    {
+        assert (numBytes != 0);
+    }
+    body
+    {
+        import core.atomic;
+        import core.memory;
+        if (numBytes > (blockSize / 4))
+            return cast(shared) (cast(ubyte*) GC.malloc(numBytes, GC.BlkAttr.NO_SCAN))[0 .. numBytes];
+        shared(Block)* r = rootBlock;
+        while (true)
+        {
+            while (r !is null)
+            {
+                while (true)
+                {
+                    immutable size_t available = r.bytes.length;
+                    immutable size_t oldUsed = atomicLoad(r.used);
+                    immutable size_t newUsed = oldUsed + numBytes;
+                    if (newUsed > available)
+                        break;
+                    if (cas(&r.used, oldUsed, newUsed))
+                        return r.bytes[oldUsed .. newUsed];
+                }
+                r = r.next;
+            }
+            if (cas(&allocating, false, true))
+            {
+                shared(Block)* b = new shared Block(
+                    cast(shared) (cast(ubyte*) GC.malloc(blockSize, GC.BlkAttr.NO_SCAN))[0 .. blockSize],
+                    numBytes,
+                    r);
+                atomicStore(rootBlock, b);
+                atomicStore(allocating, false);
+                r = rootBlock;
+                return b.bytes[0 .. numBytes];
+            }
+        }
+    }

-    static struct Block
+    static shared struct Node
+    {
+        ubyte[] str;
+        uint hash;
+        shared(Node)* next;
+    }
+
+    static shared struct Block
     {
         ubyte[] bytes;
         size_t used;
+        shared(Block)* next;
     }

     static enum blockSize = 1024 * 16;

-    public static immutable uint[] sbox = [
+    static immutable uint[] sbox = [
         0xF53E1837, 0x5F14C86B, 0x9EE3964C, 0xFA796D53,
         0x32223FC3, 0x4D82BC98, 0xA0C7FA62, 0x63E2C982,
         0x24994A5B, 0x1ECE7BEE, 0x292B38EF, 0xD5CD4E56,
@@ -1061,9 +1018,7 @@ private:
         0x3C034CBA, 0xACDA62FC, 0x11923B8B, 0x45EF170A,
     ];

-    Item*[] items;
-    Item*[] buckets;
-    Block[] blocks;
-    debug size_t memoryRequested;
-    debug uint rehashCount;
+    shared bool allocating;
+    shared(Node)*[] buckets;
+    shared(Block)* rootBlock;
 }
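Since the new code replaces the old documentation comment with a joke, a note on the scheme: each bucket is a singly linked list that is only ever grown, so readers traverse it without locking, and writers publish a node by compare-and-swap on the bucket head or on a node's next pointer, retrying if they lose the race. allocate() likewise bumps a block's used counter with cas(), and only the creation of a fresh block is serialized through the allocating flag, which is the "nearly" in "nearly lock free". A stripped-down sketch of the insert-if-absent pattern, with illustrative types rather than the commit's own:

    import core.atomic : cas;

    shared struct Bag
    {
        static shared struct Node
        {
            string str;
            shared(Node)* next;
        }

        // A single bucket; the real cache indexes an array of these by hash.
        shared(Node)* head;

        // Walk the chain looking for the string; if absent, try to CAS a new
        // node in as the head. A failed CAS means another thread changed the
        // head first, so loop and re-walk.
        string intern(string s) @trusted
        {
            while (true)
            {
                for (shared(Node)* n = head; n !is null; n = n.next)
                    if (cast(string) n.str == s)
                        return cast(string) n.str;
                shared(Node)* oldHead = head;
                if (cas(&head, oldHead, new shared Node(s, oldHead)))
                    return s;
            }
        }
    }

Two threads can race past the walk and both build a node for the same string; the loser's CAS fails, it re-walks, finds the winner's node, and its own node is simply never linked. That trades a little garbage under contention for never blocking a reader.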