String interning is now shared and nearly lock-free

This commit is contained in:
Hackerpilot 2014-02-26 00:22:01 -08:00
parent d979e7ca22
commit 299969b252
4 changed files with 124 additions and 169 deletions

View File

@ -20,7 +20,7 @@ void printCtags(File output, string[] fileNames)
{
string[] tags;
LexerConfig config;
StringCache* cache = new StringCache(StringCache.defaultBucketCount);
shared(StringCache)* cache = new shared StringCache(StringCache.defaultBucketCount);
foreach (fileName; fileNames)
{
File f = File(fileName);

2
main.d
View File

@ -91,7 +91,7 @@ int main(string[] args)
return 1;
}
StringCache* cache = new StringCache(StringCache.defaultBucketCount);
shared(StringCache)* cache = new shared StringCache(StringCache.defaultBucketCount);
if (tokenDump || highlight)
{

View File

@ -402,7 +402,7 @@ public struct DLexer
mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens,
keywords, pseudoTokenHandlers);
this(ubyte[] range, const LexerConfig config, StringCache* cache)
this(ubyte[] range, const LexerConfig config, shared(StringCache)* cache)
{
this.range = LexerRange(range);
this.config = config;
@ -550,7 +550,7 @@ public struct DLexer
}
} while (!range.empty);
string text = config.whitespaceBehavior == WhitespaceBehavior.skip
? null : cache.cacheGet(range.slice(mark));
? null : cache.intern(range.slice(mark));
return Token(tok!"whitespace", text, line, column, index);
}
@ -631,7 +631,7 @@ public struct DLexer
break hexLoop;
}
}
return Token(type, cache.cacheGet(range.slice(mark)), line, column,
return Token(type, cache.intern(range.slice(mark)), line, column,
index);
}
@ -662,7 +662,7 @@ public struct DLexer
break binaryLoop;
}
}
return Token(type, cache.cacheGet(range.slice(mark)), line, column,
return Token(type, cache.intern(range.slice(mark)), line, column,
index);
}
@ -743,7 +743,7 @@ public struct DLexer
break decimalLoop;
}
}
return Token(type, cache.cacheGet(range.slice(mark)), line, column,
return Token(type, cache.intern(range.slice(mark)), line, column,
index);
}
@ -852,7 +852,7 @@ public struct DLexer
mixin (tokenStart);
while (!range.empty && !isNewline)
range.popFront();
return Token(tok!"scriptLine", cache.cacheGet(range.slice(mark)),
return Token(tok!"scriptLine", cache.intern(range.slice(mark)),
line, column, index);
}
@ -861,7 +861,7 @@ public struct DLexer
mixin (tokenStart);
while (!range.empty && !isNewline)
range.popFront();
return Token(tok!"specialTokenSequence", cache.cacheGet(range.slice(mark)),
return Token(tok!"specialTokenSequence", cache.intern(range.slice(mark)),
line, column, index);
}
@ -885,7 +885,7 @@ public struct DLexer
else
popFrontWhitespaceAware();
}
return Token(type, cache.cacheGet(range.slice(mark)), line, column,
return Token(type, cache.intern(range.slice(mark)), line, column,
index);
}
@ -901,7 +901,7 @@ public struct DLexer
break;
range.popFront();
}
return Token(type, cache.cacheGet(range.slice(mark)), line, column,
return Token(type, cache.intern(range.slice(mark)), line, column,
index);
}
@ -935,7 +935,7 @@ public struct DLexer
else
popFrontWhitespaceAware();
}
return Token(type, cache.cacheGet(range.slice(mark)), line, column,
return Token(type, cache.intern(range.slice(mark)), line, column,
index);
}
@ -964,7 +964,7 @@ public struct DLexer
}
IdType type = tok!"stringLiteral";
lexStringSuffix(type);
return Token(type, cache.cacheGet(range.slice(mark)), line, column,
return Token(type, cache.intern(range.slice(mark)), line, column,
index);
}
@ -1018,7 +1018,7 @@ public struct DLexer
}
}
lexStringSuffix(type);
return Token(type, cache.cacheGet(range.slice(mark)), line, column,
return Token(type, cache.intern(range.slice(mark)), line, column,
index);
}
@ -1105,7 +1105,7 @@ public struct DLexer
}
IdType type = tok!"stringLiteral";
lexStringSuffix(type);
return Token(type, cache.cacheGet(range.slice(mark)), line, column, index);
return Token(type, cache.intern(range.slice(mark)), line, column, index);
}
Token lexHeredocString(size_t mark, size_t line, size_t column, size_t index)
@ -1142,7 +1142,7 @@ public struct DLexer
error(`" expected`);
IdType type = tok!"stringLiteral";
lexStringSuffix(type);
return Token(type, cache.cacheGet(range.slice(mark)), line, column, index);
return Token(type, cache.intern(range.slice(mark)), line, column, index);
}
Token lexTokenString() pure
@ -1186,7 +1186,7 @@ public struct DLexer
}
IdType type = tok!"stringLiteral";
lexStringSuffix(type);
return Token(type, cache.cacheGet(cast(const(ubyte)[]) app.data), line,
return Token(type, cache.intern(cast(const(ubyte)[]) app.data), line,
column, index);
}
@ -1223,7 +1223,7 @@ public struct DLexer
IdType type = tok!"stringLiteral";
lexStringSuffix(type);
return Token(type, cache.cacheGet(range.slice(mark)), line, column,
return Token(type, cache.intern(range.slice(mark)), line, column,
index);
}
@ -1332,7 +1332,7 @@ public struct DLexer
else if (range.front == '\'')
{
range.popFront();
return Token(tok!"characterLiteral", cache.cacheGet(range.slice(mark)),
return Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
line, column, index);
}
else if (range.front & 0x80)
@ -1350,7 +1350,7 @@ public struct DLexer
if (range.front == '\'')
{
range.popFront();
return Token(tok!"characterLiteral", cache.cacheGet(range.slice(mark)),
return Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
line, column, index);
}
else
@ -1375,7 +1375,7 @@ public struct DLexer
hash = StringCache.hashStep(range.front, hash);
range.popFront();
}
return Token(tok!"identifier", cache.cacheGet(range.slice(mark), hash), line,
return Token(tok!"identifier", cache.intern(range.slice(mark), hash), line,
column, index);
}
@ -1414,7 +1414,7 @@ public struct DLexer
range.popFront();
range.popFront();
range.incrementLine();
return Token(tok!"whitespace", cache.cacheGet(range.slice(mark)), line,
return Token(tok!"whitespace", cache.intern(range.slice(mark)), line,
column, index);
}
@ -1474,24 +1474,24 @@ public struct DLexer
}
Message[] messages;
StringCache* cache;
shared(StringCache)* cache;
LexerConfig config;
}
public auto byToken(ubyte[] range)
{
LexerConfig config;
StringCache* cache = new StringCache(StringCache.defaultBucketCount);
shared(StringCache)* cache = new shared StringCache(StringCache.defaultBucketCount);
return DLexer(range, config, cache);
}
public auto byToken(ubyte[] range, StringCache* cache)
public auto byToken(ubyte[] range, shared(StringCache)* cache)
{
LexerConfig config;
return DLexer(range, config, cache);
}
public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache)
public auto byToken(ubyte[] range, const LexerConfig config, shared(StringCache)* cache)
{
return DLexer(range, config, cache);
}

View File

@ -633,7 +633,6 @@ mixin template Lexer(Token, alias defaultTokenFunction,
*/
struct LexerRange
{
/**
* Params:
* bytes = the _lexer input
@ -767,17 +766,11 @@ struct LexerRange
}
/**
* The string cache implements a map/set for strings. Placing a string in the
* cache returns an identifier that can be used to instantly access the stored
* string. It is then possible to simply compare these indexes instead of
* performing full string comparisons when comparing the string content of
* dynamic tokens. The string cache also handles its own memory, so that mutable
* ubyte[] to lexers can still have immutable string fields in their tokens.
* Because the string cache also performs de-duplication it is possible to
* drastically reduce the memory usage of a lexer.
* FREAKIN' MAAAGIC
*/
struct StringCache
shared struct StringCache
{
import core.sync.mutex;
public:
@disable this();
@ -787,34 +780,13 @@ public:
*/
this(size_t bucketCount)
{
buckets = new Item*[bucketCount];
}
/**
* Equivalent to calling cache() and get().
* ---
* StringCache cache;
* ubyte[] str = ['a', 'b', 'c'];
* string s = cache.get(cache.cache(str));
* assert(s == "abc");
* ---
*/
string cacheGet(const(ubyte[]) bytes) pure nothrow @safe
{
return get(cache(bytes));
}
/**
* Equivalent to calling cache() and get().
*/
string cacheGet(const(ubyte[]) bytes, uint hash) pure nothrow @safe
{
return get(cache(bytes, hash));
buckets = cast(shared) new Node*[bucketCount];
allocating = false;
}
/**
* Caches a string.
* Params: bytes = the string to _cache
* Params: str = the string to intern
* Returns: A key that can be used to retrieve the cached string
* Examples:
* ---
@ -825,10 +797,10 @@ public:
* assert (first == second);
* ---
*/
size_t cache(const(ubyte)[] bytes) pure nothrow @safe
string intern(const(ubyte)[] str) pure nothrow @safe
{
immutable uint hash = hashBytes(bytes);
return cache(bytes, hash);
immutable uint hash = hashBytes(str);
return intern(str, hash);
}
/**
@ -836,51 +808,14 @@ public:
* calculating one itself. Use this alongside $(LREF hashStep)() can reduce the
* amount of work necessary when lexing dynamic tokens.
*/
size_t cache(const(ubyte)[] bytes, uint hash) pure nothrow @safe
string intern(const(ubyte)[] str, uint hash) pure nothrow @safe
in
{
assert (bytes.length > 0);
}
out (retVal)
{
assert (retVal < items.length);
assert (str.length > 0);
}
body
{
debug memoryRequested += bytes.length;
const(Item)* found = find(bytes, hash);
if (found is null)
return intern(bytes, hash);
return found.index;
}
/**
* Gets a cached string based on its key.
* Params: index = the key
* Returns: the cached string
*/
string get(size_t index) const pure nothrow @safe
in
{
assert (items.length > index);
assert (items[index] !is null);
}
out (retVal)
{
assert (retVal !is null);
}
body
{
return items[index].str;
}
debug void printStats()
{
import std.stdio;
writeln("Load Factor: ", cast(float) items.length / cast(float) buckets.length);
writeln("Memory used by blocks: ", blocks.length * blockSize);
writeln("Memory requsted: ", memoryRequested);
writeln("rehashes: ", rehashCount);
return _intern(str, hash);
}
/**
@ -902,72 +837,52 @@ public:
private:
private void rehash() pure nothrow @safe
string _intern(const(ubyte)[] bytes, uint hash) pure nothrow @trusted
{
immutable size_t newBucketCount = items.length * 2;
buckets = new Item*[newBucketCount];
debug rehashCount++;
foreach (item; items)
import core.atomic;
import core.memory;
shared ubyte[] mem;
shared(Node*)* oldBucketRoot = &buckets[hash % buckets.length];
while (true)
{
immutable size_t newIndex = item.hash % newBucketCount;
item.next = buckets[newIndex];
buckets[newIndex] = item;
bool found;
shared(Node)* s = find(bytes, hash, found);
shared(Node)* n = s is null ? null : s.next;
if (found)
return cast(string) s.str;
if (mem.length == 0)
{
mem = allocate(bytes.length);
mem[] = bytes[];
}
shared(Node)* node = new shared Node(mem, hash, null);
if (s is null && cas(oldBucketRoot, *oldBucketRoot, node))
break;
node.next = s.next;
if (cas(&s.next, n, node))
break;
}
return cast(string) mem;
}
size_t intern(const(ubyte)[] bytes, uint hash) pure nothrow @trusted
{
ubyte[] mem = allocate(bytes.length);
mem[] = bytes[];
Item* item = cast(Item*) allocate(Item.sizeof).ptr;
item.index = items.length;
item.str = cast(string) mem;
item.hash = hash;
item.next = buckets[hash % buckets.length];
immutable bool checkLoadFactor = item.next !is null;
buckets[hash % buckets.length] = item;
items ~= item;
if (checkLoadFactor && (cast(float) items.length / cast(float) buckets.length) > 0.75)
rehash();
return item.index;
}
const(Item)* find(const(ubyte)[] bytes, uint hash) pure nothrow const @safe
shared(Node)* find(const(ubyte)[] bytes, uint hash, ref bool found) pure nothrow @trusted
{
import std.algorithm;
immutable size_t index = hash % buckets.length;
for (const(Item)* item = buckets[index]; item !is null; item = item.next)
shared(Node)* node = buckets[index];
while (node !is null)
{
if (item.hash == hash && bytes.equal(item.str))
return item;
if (node.hash >= hash && bytes.equal(cast(ubyte[]) node.str))
{
found = true;
return node;
}
node = node.next;
}
return null;
return node;
}
ubyte[] allocate(size_t byteCount) pure nothrow @trusted
{
import core.memory;
if (byteCount > (blockSize / 4))
{
ubyte* mem = cast(ubyte*) GC.malloc(byteCount, GC.BlkAttr.NO_SCAN);
return mem[0 .. byteCount];
}
foreach (ref block; blocks)
{
immutable size_t oldUsed = block.used;
immutable size_t end = oldUsed + byteCount;
if (end > block.bytes.length)
continue;
block.used = end;
return block.bytes[oldUsed .. end];
}
blocks ~= Block(
(cast(ubyte*) GC.malloc(blockSize, GC.BlkAttr.NO_SCAN))[0 .. blockSize],
byteCount);
return blocks[$ - 1].bytes[0 .. byteCount];
}
static uint hashBytes(const(ubyte)[] data) pure nothrow @safe
static uint hashBytes(const(ubyte)[] data) pure nothrow @trusted
{
uint hash = 0;
foreach (b; data)
@ -978,23 +893,65 @@ private:
return hash;
}
static struct Item
shared(ubyte[]) allocate(immutable size_t numBytes) pure nothrow @trusted
in
{
size_t index;
string str;
uint hash;
Item* next;
assert (numBytes != 0);
}
body
{
import core.atomic;
import core.memory;
if (numBytes > (blockSize / 4))
return cast(shared) (cast(ubyte*) GC.malloc(numBytes, GC.BlkAttr.NO_SCAN))[0 .. numBytes];
shared(Block)* r = rootBlock;
while (true)
{
while (r !is null)
{
while (true)
{
immutable size_t available = r.bytes.length;
immutable size_t oldUsed = atomicLoad(r.used);
immutable size_t newUsed = oldUsed + numBytes;
if (newUsed > available)
break;
if (cas(&r.used, oldUsed, newUsed))
return r.bytes[oldUsed .. newUsed];
}
r = r.next;
}
if (cas(&allocating, false, true))
{
shared(Block)* b = new shared Block(
cast(shared) (cast(ubyte*) GC.malloc(blockSize, GC.BlkAttr.NO_SCAN))[0 .. blockSize],
numBytes,
r);
atomicStore(rootBlock, b);
atomicStore(allocating, false);
r = rootBlock;
return b.bytes[0 .. numBytes];
}
}
}
static struct Block
static shared struct Node
{
ubyte[] str;
uint hash;
shared(Node)* next;
}
static shared struct Block
{
ubyte[] bytes;
size_t used;
shared(Block)* next;
}
static enum blockSize = 1024 * 16;
public static immutable uint[] sbox = [
static immutable uint[] sbox = [
0xF53E1837, 0x5F14C86B, 0x9EE3964C, 0xFA796D53,
0x32223FC3, 0x4D82BC98, 0xA0C7FA62, 0x63E2C982,
0x24994A5B, 0x1ECE7BEE, 0x292B38EF, 0xD5CD4E56,
@ -1061,9 +1018,7 @@ private:
0x3C034CBA, 0xACDA62FC, 0x11923B8B, 0x45EF170A,
];
Item*[] items;
Item*[] buckets;
Block[] blocks;
debug size_t memoryRequested;
debug uint rehashCount;
shared bool allocating;
shared(Node)*[] buckets;
shared(Block)* rootBlock;
}