String interning is now shared and nearly lock-free

This commit is contained in:
Hackerpilot 2014-02-26 00:22:01 -08:00
parent d979e7ca22
commit 299969b252
4 changed files with 124 additions and 169 deletions

View File

@ -20,7 +20,7 @@ void printCtags(File output, string[] fileNames)
{
string[] tags;
LexerConfig config;
StringCache* cache = new StringCache(StringCache.defaultBucketCount);
shared(StringCache)* cache = new shared StringCache(StringCache.defaultBucketCount);
foreach (fileName; fileNames)
{
File f = File(fileName);

2
main.d
View File

@ -91,7 +91,7 @@ int main(string[] args)
return 1;
}
StringCache* cache = new StringCache(StringCache.defaultBucketCount);
shared(StringCache)* cache = new shared StringCache(StringCache.defaultBucketCount);
if (tokenDump || highlight)
{

View File

@ -402,7 +402,7 @@ public struct DLexer
mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens,
keywords, pseudoTokenHandlers);
this(ubyte[] range, const LexerConfig config, StringCache* cache)
this(ubyte[] range, const LexerConfig config, shared(StringCache)* cache)
{
this.range = LexerRange(range);
this.config = config;
@ -550,7 +550,7 @@ public struct DLexer
}
} while (!range.empty);
string text = config.whitespaceBehavior == WhitespaceBehavior.skip
? null : cache.cacheGet(range.slice(mark));
? null : cache.intern(range.slice(mark));
return Token(tok!"whitespace", text, line, column, index);
}
@ -631,7 +631,7 @@ public struct DLexer
break hexLoop;
}
}
return Token(type, cache.cacheGet(range.slice(mark)), line, column,
return Token(type, cache.intern(range.slice(mark)), line, column,
index);
}
@ -662,7 +662,7 @@ public struct DLexer
break binaryLoop;
}
}
return Token(type, cache.cacheGet(range.slice(mark)), line, column,
return Token(type, cache.intern(range.slice(mark)), line, column,
index);
}
@ -743,7 +743,7 @@ public struct DLexer
break decimalLoop;
}
}
return Token(type, cache.cacheGet(range.slice(mark)), line, column,
return Token(type, cache.intern(range.slice(mark)), line, column,
index);
}
@ -852,7 +852,7 @@ public struct DLexer
mixin (tokenStart);
while (!range.empty && !isNewline)
range.popFront();
return Token(tok!"scriptLine", cache.cacheGet(range.slice(mark)),
return Token(tok!"scriptLine", cache.intern(range.slice(mark)),
line, column, index);
}
@ -861,7 +861,7 @@ public struct DLexer
mixin (tokenStart);
while (!range.empty && !isNewline)
range.popFront();
return Token(tok!"specialTokenSequence", cache.cacheGet(range.slice(mark)),
return Token(tok!"specialTokenSequence", cache.intern(range.slice(mark)),
line, column, index);
}
@ -885,7 +885,7 @@ public struct DLexer
else
popFrontWhitespaceAware();
}
return Token(type, cache.cacheGet(range.slice(mark)), line, column,
return Token(type, cache.intern(range.slice(mark)), line, column,
index);
}
@ -901,7 +901,7 @@ public struct DLexer
break;
range.popFront();
}
return Token(type, cache.cacheGet(range.slice(mark)), line, column,
return Token(type, cache.intern(range.slice(mark)), line, column,
index);
}
@ -935,7 +935,7 @@ public struct DLexer
else
popFrontWhitespaceAware();
}
return Token(type, cache.cacheGet(range.slice(mark)), line, column,
return Token(type, cache.intern(range.slice(mark)), line, column,
index);
}
@ -964,7 +964,7 @@ public struct DLexer
}
IdType type = tok!"stringLiteral";
lexStringSuffix(type);
return Token(type, cache.cacheGet(range.slice(mark)), line, column,
return Token(type, cache.intern(range.slice(mark)), line, column,
index);
}
@ -1018,7 +1018,7 @@ public struct DLexer
}
}
lexStringSuffix(type);
return Token(type, cache.cacheGet(range.slice(mark)), line, column,
return Token(type, cache.intern(range.slice(mark)), line, column,
index);
}
@ -1105,7 +1105,7 @@ public struct DLexer
}
IdType type = tok!"stringLiteral";
lexStringSuffix(type);
return Token(type, cache.cacheGet(range.slice(mark)), line, column, index);
return Token(type, cache.intern(range.slice(mark)), line, column, index);
}
Token lexHeredocString(size_t mark, size_t line, size_t column, size_t index)
@ -1142,7 +1142,7 @@ public struct DLexer
error(`" expected`);
IdType type = tok!"stringLiteral";
lexStringSuffix(type);
return Token(type, cache.cacheGet(range.slice(mark)), line, column, index);
return Token(type, cache.intern(range.slice(mark)), line, column, index);
}
Token lexTokenString() pure
@ -1186,7 +1186,7 @@ public struct DLexer
}
IdType type = tok!"stringLiteral";
lexStringSuffix(type);
return Token(type, cache.cacheGet(cast(const(ubyte)[]) app.data), line,
return Token(type, cache.intern(cast(const(ubyte)[]) app.data), line,
column, index);
}
@ -1223,7 +1223,7 @@ public struct DLexer
IdType type = tok!"stringLiteral";
lexStringSuffix(type);
return Token(type, cache.cacheGet(range.slice(mark)), line, column,
return Token(type, cache.intern(range.slice(mark)), line, column,
index);
}
@ -1332,7 +1332,7 @@ public struct DLexer
else if (range.front == '\'')
{
range.popFront();
return Token(tok!"characterLiteral", cache.cacheGet(range.slice(mark)),
return Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
line, column, index);
}
else if (range.front & 0x80)
@ -1350,7 +1350,7 @@ public struct DLexer
if (range.front == '\'')
{
range.popFront();
return Token(tok!"characterLiteral", cache.cacheGet(range.slice(mark)),
return Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
line, column, index);
}
else
@ -1375,7 +1375,7 @@ public struct DLexer
hash = StringCache.hashStep(range.front, hash);
range.popFront();
}
return Token(tok!"identifier", cache.cacheGet(range.slice(mark), hash), line,
return Token(tok!"identifier", cache.intern(range.slice(mark), hash), line,
column, index);
}
@ -1414,7 +1414,7 @@ public struct DLexer
range.popFront();
range.popFront();
range.incrementLine();
return Token(tok!"whitespace", cache.cacheGet(range.slice(mark)), line,
return Token(tok!"whitespace", cache.intern(range.slice(mark)), line,
column, index);
}
@ -1474,24 +1474,24 @@ public struct DLexer
}
Message[] messages;
StringCache* cache;
shared(StringCache)* cache;
LexerConfig config;
}
public auto byToken(ubyte[] range)
{
LexerConfig config;
StringCache* cache = new StringCache(StringCache.defaultBucketCount);
shared(StringCache)* cache = new shared StringCache(StringCache.defaultBucketCount);
return DLexer(range, config, cache);
}
public auto byToken(ubyte[] range, StringCache* cache)
public auto byToken(ubyte[] range, shared(StringCache)* cache)
{
LexerConfig config;
return DLexer(range, config, cache);
}
public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache)
public auto byToken(ubyte[] range, const LexerConfig config, shared(StringCache)* cache)
{
return DLexer(range, config, cache);
}

View File

@ -633,7 +633,6 @@ mixin template Lexer(Token, alias defaultTokenFunction,
*/
struct LexerRange
{
/**
* Params:
* bytes = the _lexer input
@ -767,17 +766,11 @@ struct LexerRange
}
/**
* The string cache implements a map/set for strings. Placing a string in the
* cache returns an identifier that can be used to instantly access the stored
* string. It is then possible to simply compare these indexes instead of
* performing full string comparisons when comparing the string content of
* dynamic tokens. The string cache also handles its own memory, so that mutable
* ubyte[] to lexers can still have immutable string fields in their tokens.
* Because the string cache also performs de-duplication it is possible to
* drastically reduce the memory usage of a lexer.
* FREAKIN' MAAAGIC
*/
struct StringCache
shared struct StringCache
{
import core.sync.mutex;
public:
@disable this();
@ -787,34 +780,13 @@ public:
*/
this(size_t bucketCount)
{
buckets = new Item*[bucketCount];
}
/**
* Equivalent to calling cache() and get().
* ---
* StringCache cache;
* ubyte[] str = ['a', 'b', 'c'];
* string s = cache.get(cache.cache(str));
* assert(s == "abc");
* ---
*/
string cacheGet(const(ubyte[]) bytes) pure nothrow @safe
{
return get(cache(bytes));
}
/**
* Equivalent to calling cache() and get().
*/
string cacheGet(const(ubyte[]) bytes, uint hash) pure nothrow @safe
{
return get(cache(bytes, hash));
buckets = cast(shared) new Node*[bucketCount];
allocating = false;
}
/**
* Caches a string.
* Params: bytes = the string to _cache
* Params: str = the string to intern
* Returns: A key that can be used to retrieve the cached string
* Examples:
* ---
@ -825,10 +797,10 @@ public:
* assert (first == second);
* ---
*/
size_t cache(const(ubyte)[] bytes) pure nothrow @safe
string intern(const(ubyte)[] str) pure nothrow @safe
{
immutable uint hash = hashBytes(bytes);
return cache(bytes, hash);
immutable uint hash = hashBytes(str);
return intern(str, hash);
}
/**
@ -836,51 +808,14 @@ public:
* calculating one itself. Use this alongside $(LREF hashStep)() can reduce the
* amount of work necessary when lexing dynamic tokens.
*/
size_t cache(const(ubyte)[] bytes, uint hash) pure nothrow @safe
string intern(const(ubyte)[] str, uint hash) pure nothrow @safe
in
{
assert (bytes.length > 0);
}
out (retVal)
{
assert (retVal < items.length);
assert (str.length > 0);
}
body
{
debug memoryRequested += bytes.length;
const(Item)* found = find(bytes, hash);
if (found is null)
return intern(bytes, hash);
return found.index;
}
/**
* Gets a cached string based on its key.
* Params: index = the key
* Returns: the cached string
*/
string get(size_t index) const pure nothrow @safe
in
{
assert (items.length > index);
assert (items[index] !is null);
}
out (retVal)
{
assert (retVal !is null);
}
body
{
return items[index].str;
}
debug void printStats()
{
import std.stdio;
writeln("Load Factor: ", cast(float) items.length / cast(float) buckets.length);
writeln("Memory used by blocks: ", blocks.length * blockSize);
writeln("Memory requsted: ", memoryRequested);
writeln("rehashes: ", rehashCount);
return _intern(str, hash);
}
/**
@ -902,72 +837,52 @@ public:
private:
private void rehash() pure nothrow @safe
string _intern(const(ubyte)[] bytes, uint hash) pure nothrow @trusted
{
immutable size_t newBucketCount = items.length * 2;
buckets = new Item*[newBucketCount];
debug rehashCount++;
foreach (item; items)
import core.atomic;
import core.memory;
shared ubyte[] mem;
shared(Node*)* oldBucketRoot = &buckets[hash % buckets.length];
while (true)
{
immutable size_t newIndex = item.hash % newBucketCount;
item.next = buckets[newIndex];
buckets[newIndex] = item;
bool found;
shared(Node)* s = find(bytes, hash, found);
shared(Node)* n = s is null ? null : s.next;
if (found)
return cast(string) s.str;
if (mem.length == 0)
{
mem = allocate(bytes.length);
mem[] = bytes[];
}
shared(Node)* node = new shared Node(mem, hash, null);
if (s is null && cas(oldBucketRoot, *oldBucketRoot, node))
break;
node.next = s.next;
if (cas(&s.next, n, node))
break;
}
return cast(string) mem;
}
size_t intern(const(ubyte)[] bytes, uint hash) pure nothrow @trusted
{
ubyte[] mem = allocate(bytes.length);
mem[] = bytes[];
Item* item = cast(Item*) allocate(Item.sizeof).ptr;
item.index = items.length;
item.str = cast(string) mem;
item.hash = hash;
item.next = buckets[hash % buckets.length];
immutable bool checkLoadFactor = item.next !is null;
buckets[hash % buckets.length] = item;
items ~= item;
if (checkLoadFactor && (cast(float) items.length / cast(float) buckets.length) > 0.75)
rehash();
return item.index;
}
const(Item)* find(const(ubyte)[] bytes, uint hash) pure nothrow const @safe
shared(Node)* find(const(ubyte)[] bytes, uint hash, ref bool found) pure nothrow @trusted
{
import std.algorithm;
immutable size_t index = hash % buckets.length;
for (const(Item)* item = buckets[index]; item !is null; item = item.next)
shared(Node)* node = buckets[index];
while (node !is null)
{
if (item.hash == hash && bytes.equal(item.str))
return item;
if (node.hash >= hash && bytes.equal(cast(ubyte[]) node.str))
{
found = true;
return node;
}
node = node.next;
}
return null;
return node;
}
ubyte[] allocate(size_t byteCount) pure nothrow @trusted
{
import core.memory;
if (byteCount > (blockSize / 4))
{
ubyte* mem = cast(ubyte*) GC.malloc(byteCount, GC.BlkAttr.NO_SCAN);
return mem[0 .. byteCount];
}
foreach (ref block; blocks)
{
immutable size_t oldUsed = block.used;
immutable size_t end = oldUsed + byteCount;
if (end > block.bytes.length)
continue;
block.used = end;
return block.bytes[oldUsed .. end];
}
blocks ~= Block(
(cast(ubyte*) GC.malloc(blockSize, GC.BlkAttr.NO_SCAN))[0 .. blockSize],
byteCount);
return blocks[$ - 1].bytes[0 .. byteCount];
}
static uint hashBytes(const(ubyte)[] data) pure nothrow @safe
static uint hashBytes(const(ubyte)[] data) pure nothrow @trusted
{
uint hash = 0;
foreach (b; data)
@ -978,23 +893,65 @@ private:
return hash;
}
static struct Item
shared(ubyte[]) allocate(immutable size_t numBytes) pure nothrow @trusted
in
{
size_t index;
string str;
uint hash;
Item* next;
assert (numBytes != 0);
}
body
{
import core.atomic;
import core.memory;
if (numBytes > (blockSize / 4))
return cast(shared) (cast(ubyte*) GC.malloc(numBytes, GC.BlkAttr.NO_SCAN))[0 .. numBytes];
shared(Block)* r = rootBlock;
while (true)
{
while (r !is null)
{
while (true)
{
immutable size_t available = r.bytes.length;
immutable size_t oldUsed = atomicLoad(r.used);
immutable size_t newUsed = oldUsed + numBytes;
if (newUsed > available)
break;
if (cas(&r.used, oldUsed, newUsed))
return r.bytes[oldUsed .. newUsed];
}
r = r.next;
}
if (cas(&allocating, false, true))
{
shared(Block)* b = new shared Block(
cast(shared) (cast(ubyte*) GC.malloc(blockSize, GC.BlkAttr.NO_SCAN))[0 .. blockSize],
numBytes,
r);
atomicStore(rootBlock, b);
atomicStore(allocating, false);
r = rootBlock;
return b.bytes[0 .. numBytes];
}
}
}
static struct Block
static shared struct Node
{
ubyte[] str;
uint hash;
shared(Node)* next;
}
static shared struct Block
{
ubyte[] bytes;
size_t used;
shared(Block)* next;
}
static enum blockSize = 1024 * 16;
public static immutable uint[] sbox = [
static immutable uint[] sbox = [
0xF53E1837, 0x5F14C86B, 0x9EE3964C, 0xFA796D53,
0x32223FC3, 0x4D82BC98, 0xA0C7FA62, 0x63E2C982,
0x24994A5B, 0x1ECE7BEE, 0x292B38EF, 0xD5CD4E56,
@ -1061,9 +1018,7 @@ private:
0x3C034CBA, 0xACDA62FC, 0x11923B8B, 0x45EF170A,
];
Item*[] items;
Item*[] buckets;
Block[] blocks;
debug size_t memoryRequested;
debug uint rehashCount;
shared bool allocating;
shared(Node)*[] buckets;
shared(Block)* rootBlock;
}