String interning is now shared and nearly lock free

Hackerpilot 2014-02-26 00:22:01 -08:00
parent d979e7ca22
commit 299969b252
4 changed files with 124 additions and 169 deletions
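In API terms the change is small: the index-based cache()/get()/cacheGet() methods become a single intern() that both stores and returns the string, and StringCache itself becomes shared, so one instance can serve every lexer thread. A sketch of the difference, using only signatures that appear in the diffs below; the two halves cannot coexist in one build, since the commit deletes the old methods:

    // Before this commit: two steps through an index, one thread at a time.
    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
    ubyte[] bytes = ['a', 'b', 'c'];
    size_t key = cache.cache(bytes); // store, receive an index back
    string s = cache.get(key);       // look the string up again
    assert(s == "abc");

    // After this commit: one call, usable from many threads at once.
    shared(StringCache)* cache2 = new shared StringCache(StringCache.defaultBucketCount);
    string t = cache2.intern(bytes); // store if absent, return the stored string
    assert(t == "abc");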

View File

@@ -20,7 +20,7 @@ void printCtags(File output, string[] fileNames)
 {
     string[] tags;
     LexerConfig config;
-    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
+    shared(StringCache)* cache = new shared StringCache(StringCache.defaultBucketCount);
     foreach (fileName; fileNames)
     {
         File f = File(fileName);

main.d
View File

@@ -91,7 +91,7 @@ int main(string[] args)
         return 1;
     }
-    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
+    shared(StringCache)* cache = new shared StringCache(StringCache.defaultBucketCount);
     if (tokenDump || highlight)
     {

View File

@@ -402,7 +402,7 @@ public struct DLexer
     mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens,
         keywords, pseudoTokenHandlers);

-    this(ubyte[] range, const LexerConfig config, StringCache* cache)
+    this(ubyte[] range, const LexerConfig config, shared(StringCache)* cache)
     {
         this.range = LexerRange(range);
         this.config = config;
@@ -550,7 +550,7 @@ public struct DLexer
             }
         } while (!range.empty);
         string text = config.whitespaceBehavior == WhitespaceBehavior.skip
-            ? null : cache.cacheGet(range.slice(mark));
+            ? null : cache.intern(range.slice(mark));
         return Token(tok!"whitespace", text, line, column, index);
     }
@@ -631,7 +631,7 @@ public struct DLexer
                 break hexLoop;
             }
         }
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column,
+        return Token(type, cache.intern(range.slice(mark)), line, column,
             index);
     }
@@ -662,7 +662,7 @@ public struct DLexer
                 break binaryLoop;
             }
         }
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column,
+        return Token(type, cache.intern(range.slice(mark)), line, column,
             index);
     }
@@ -743,7 +743,7 @@ public struct DLexer
                 break decimalLoop;
             }
         }
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column,
+        return Token(type, cache.intern(range.slice(mark)), line, column,
             index);
     }
@@ -852,7 +852,7 @@ public struct DLexer
         mixin (tokenStart);
         while (!range.empty && !isNewline)
             range.popFront();
-        return Token(tok!"scriptLine", cache.cacheGet(range.slice(mark)),
+        return Token(tok!"scriptLine", cache.intern(range.slice(mark)),
             line, column, index);
     }
@@ -861,7 +861,7 @@ public struct DLexer
         mixin (tokenStart);
         while (!range.empty && !isNewline)
             range.popFront();
-        return Token(tok!"specialTokenSequence", cache.cacheGet(range.slice(mark)),
+        return Token(tok!"specialTokenSequence", cache.intern(range.slice(mark)),
             line, column, index);
     }
@@ -885,7 +885,7 @@ public struct DLexer
             else
                 popFrontWhitespaceAware();
         }
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column,
+        return Token(type, cache.intern(range.slice(mark)), line, column,
             index);
     }
@@ -901,7 +901,7 @@ public struct DLexer
                 break;
             range.popFront();
         }
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column,
+        return Token(type, cache.intern(range.slice(mark)), line, column,
             index);
     }
@@ -935,7 +935,7 @@ public struct DLexer
             else
                 popFrontWhitespaceAware();
         }
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column,
+        return Token(type, cache.intern(range.slice(mark)), line, column,
             index);
     }
@@ -964,7 +964,7 @@ public struct DLexer
         }
         IdType type = tok!"stringLiteral";
         lexStringSuffix(type);
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column,
+        return Token(type, cache.intern(range.slice(mark)), line, column,
             index);
     }
@@ -1018,7 +1018,7 @@ public struct DLexer
             }
         }
         lexStringSuffix(type);
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column,
+        return Token(type, cache.intern(range.slice(mark)), line, column,
             index);
     }
@@ -1105,7 +1105,7 @@ public struct DLexer
         }
         IdType type = tok!"stringLiteral";
         lexStringSuffix(type);
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column, index);
+        return Token(type, cache.intern(range.slice(mark)), line, column, index);
     }

     Token lexHeredocString(size_t mark, size_t line, size_t column, size_t index)
@@ -1142,7 +1142,7 @@ public struct DLexer
             error(`" expected`);
         IdType type = tok!"stringLiteral";
         lexStringSuffix(type);
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column, index);
+        return Token(type, cache.intern(range.slice(mark)), line, column, index);
     }

     Token lexTokenString() pure
@@ -1186,7 +1186,7 @@ public struct DLexer
         }
         IdType type = tok!"stringLiteral";
         lexStringSuffix(type);
-        return Token(type, cache.cacheGet(cast(const(ubyte)[]) app.data), line,
+        return Token(type, cache.intern(cast(const(ubyte)[]) app.data), line,
             column, index);
     }
@@ -1223,7 +1223,7 @@ public struct DLexer
         IdType type = tok!"stringLiteral";
         lexStringSuffix(type);
-        return Token(type, cache.cacheGet(range.slice(mark)), line, column,
+        return Token(type, cache.intern(range.slice(mark)), line, column,
             index);
     }
@@ -1332,7 +1332,7 @@ public struct DLexer
         else if (range.front == '\'')
         {
             range.popFront();
-            return Token(tok!"characterLiteral", cache.cacheGet(range.slice(mark)),
+            return Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
                 line, column, index);
         }
         else if (range.front & 0x80)
@@ -1350,7 +1350,7 @@ public struct DLexer
         if (range.front == '\'')
         {
             range.popFront();
-            return Token(tok!"characterLiteral", cache.cacheGet(range.slice(mark)),
+            return Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
                 line, column, index);
         }
         else
@@ -1375,7 +1375,7 @@ public struct DLexer
             hash = StringCache.hashStep(range.front, hash);
             range.popFront();
         }
-        return Token(tok!"identifier", cache.cacheGet(range.slice(mark), hash), line,
+        return Token(tok!"identifier", cache.intern(range.slice(mark), hash), line,
             column, index);
     }
@@ -1414,7 +1414,7 @@ public struct DLexer
         range.popFront();
         range.popFront();
         range.incrementLine();
-        return Token(tok!"whitespace", cache.cacheGet(range.slice(mark)), line,
+        return Token(tok!"whitespace", cache.intern(range.slice(mark)), line,
             column, index);
     }
@@ -1474,24 +1474,24 @@ public struct DLexer
     }

     Message[] messages;
-    StringCache* cache;
+    shared(StringCache)* cache;
     LexerConfig config;
 }

 public auto byToken(ubyte[] range)
 {
     LexerConfig config;
-    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
+    shared(StringCache)* cache = new shared StringCache(StringCache.defaultBucketCount);
     return DLexer(range, config, cache);
 }

-public auto byToken(ubyte[] range, StringCache* cache)
+public auto byToken(ubyte[] range, shared(StringCache)* cache)
 {
     LexerConfig config;
     return DLexer(range, config, cache);
 }

-public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache)
+public auto byToken(ubyte[] range, const LexerConfig config, shared(StringCache)* cache)
 {
     return DLexer(range, config, cache);
 }
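The lexer-side changes above are mechanical: every StringCache* becomes shared(StringCache)*, and every cacheGet() call becomes intern(), which hands back the interned string directly. The payoff is that several lexers can feed one de-duplicating cache at the same time. A minimal sketch of that usage; byToken and its overloads come from this diff, while the lexFiles helper and the use of std.parallelism are illustrative assumptions:

    import std.file : read;
    import std.parallelism : parallel;

    void lexFiles(string[] fileNames)
    {
        // One cache shared by every worker; interning is the only cross-thread
        // mutation, and the cache handles it internally.
        shared(StringCache)* cache = new shared StringCache(StringCache.defaultBucketCount);
        foreach (fileName; parallel(fileNames))
        {
            LexerConfig config;
            foreach (token; byToken(cast(ubyte[]) read(fileName), config, cache))
            {
                // Equal token texts refer to the same cached string no matter
                // which thread interned them first.
            }
        }
    }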

View File

@@ -633,7 +633,6 @@ mixin template Lexer(Token, alias defaultTokenFunction,
  */
 struct LexerRange
 {
-
     /**
      * Params:
      * bytes = the _lexer input
@@ -767,17 +766,11 @@ struct LexerRange
     }

 /**
- * The string cache implements a map/set for strings. Placing a string in the
- * cache returns an identifier that can be used to instantly access the stored
- * string. It is then possible to simply compare these indexes instead of
- * performing full string comparisons when comparing the string content of
- * dynamic tokens. The string cache also handles its own memory, so that mutable
- * ubyte[] to lexers can still have immutable string fields in their tokens.
- * Because the string cache also performs de-duplication it is possible to
- * drastically reduce the memory usage of a lexer.
+ * FREAKIN' MAAAGIC
  */
-struct StringCache
+shared struct StringCache
 {
+    import core.sync.mutex;
 public:

     @disable this();
@@ -787,34 +780,13 @@ public:
      */
     this(size_t bucketCount)
     {
-        buckets = new Item*[bucketCount];
+        buckets = cast(shared) new Node*[bucketCount];
+        allocating = false;
     }
-
-    /**
-     * Equivalent to calling cache() and get().
-     * ---
-     * StringCache cache;
-     * ubyte[] str = ['a', 'b', 'c'];
-     * string s = cache.get(cache.cache(str));
-     * assert(s == "abc");
-     * ---
-     */
-    string cacheGet(const(ubyte[]) bytes) pure nothrow @safe
-    {
-        return get(cache(bytes));
-    }
-
-    /**
-     * Equivalent to calling cache() and get().
-     */
-    string cacheGet(const(ubyte[]) bytes, uint hash) pure nothrow @safe
-    {
-        return get(cache(bytes, hash));
-    }

     /**
      * Caches a string.
-     * Params: bytes = the string to _cache
+     * Params: str = the string to intern
      * Returns: A key that can be used to retrieve the cached string
      * Examples:
      * ---
@@ -825,10 +797,10 @@ public:
      * assert (first == second);
      * ---
      */
-    size_t cache(const(ubyte)[] bytes) pure nothrow @safe
+    string intern(const(ubyte)[] str) pure nothrow @safe
     {
-        immutable uint hash = hashBytes(bytes);
-        return cache(bytes, hash);
+        immutable uint hash = hashBytes(str);
+        return intern(str, hash);
     }

     /**
@@ -836,51 +808,14 @@ public:
      * calculating one itself. Use this alongside $(LREF hashStep)() can reduce the
      * amount of work necessary when lexing dynamic tokens.
      */
-    size_t cache(const(ubyte)[] bytes, uint hash) pure nothrow @safe
+    string intern(const(ubyte)[] str, uint hash) pure nothrow @safe
     in
     {
-        assert (bytes.length > 0);
-    }
-    out (retVal)
-    {
-        assert (retVal < items.length);
+        assert (str.length > 0);
     }
     body
     {
-        debug memoryRequested += bytes.length;
-        const(Item)* found = find(bytes, hash);
-        if (found is null)
-            return intern(bytes, hash);
-        return found.index;
-    }
-
-    /**
-     * Gets a cached string based on its key.
-     * Params: index = the key
-     * Returns: the cached string
-     */
-    string get(size_t index) const pure nothrow @safe
-    in
-    {
-        assert (items.length > index);
-        assert (items[index] !is null);
-    }
-    out (retVal)
-    {
-        assert (retVal !is null);
-    }
-    body
-    {
-        return items[index].str;
-    }
-
-    debug void printStats()
-    {
-        import std.stdio;
-        writeln("Load Factor: ", cast(float) items.length / cast(float) buckets.length);
-        writeln("Memory used by blocks: ", blocks.length * blockSize);
-        writeln("Memory requsted: ", memoryRequested);
-        writeln("rehashes: ", rehashCount);
+        return _intern(str, hash);
     }

     /**
@@ -902,72 +837,52 @@ public:
 private:

-    private void rehash() pure nothrow @safe
-    {
-        immutable size_t newBucketCount = items.length * 2;
-        buckets = new Item*[newBucketCount];
-        debug rehashCount++;
-        foreach (item; items)
-        {
-            immutable size_t newIndex = item.hash % newBucketCount;
-            item.next = buckets[newIndex];
-            buckets[newIndex] = item;
-        }
-    }
-
-    size_t intern(const(ubyte)[] bytes, uint hash) pure nothrow @trusted
-    {
-        ubyte[] mem = allocate(bytes.length);
-        mem[] = bytes[];
-        Item* item = cast(Item*) allocate(Item.sizeof).ptr;
-        item.index = items.length;
-        item.str = cast(string) mem;
-        item.hash = hash;
-        item.next = buckets[hash % buckets.length];
-        immutable bool checkLoadFactor = item.next !is null;
-        buckets[hash % buckets.length] = item;
-        items ~= item;
-        if (checkLoadFactor && (cast(float) items.length / cast(float) buckets.length) > 0.75)
-            rehash();
-        return item.index;
-    }
-
-    const(Item)* find(const(ubyte)[] bytes, uint hash) pure nothrow const @safe
-    {
-        import std.algorithm;
-        immutable size_t index = hash % buckets.length;
-        for (const(Item)* item = buckets[index]; item !is null; item = item.next)
-        {
-            if (item.hash == hash && bytes.equal(item.str))
-                return item;
-        }
-        return null;
-    }
-
-    ubyte[] allocate(size_t byteCount) pure nothrow @trusted
-    {
-        import core.memory;
-        if (byteCount > (blockSize / 4))
-        {
-            ubyte* mem = cast(ubyte*) GC.malloc(byteCount, GC.BlkAttr.NO_SCAN);
-            return mem[0 .. byteCount];
-        }
-        foreach (ref block; blocks)
-        {
-            immutable size_t oldUsed = block.used;
-            immutable size_t end = oldUsed + byteCount;
-            if (end > block.bytes.length)
-                continue;
-            block.used = end;
-            return block.bytes[oldUsed .. end];
-        }
-        blocks ~= Block(
-            (cast(ubyte*) GC.malloc(blockSize, GC.BlkAttr.NO_SCAN))[0 .. blockSize],
-            byteCount);
-        return blocks[$ - 1].bytes[0 .. byteCount];
-    }
-
-    static uint hashBytes(const(ubyte)[] data) pure nothrow @safe
+    string _intern(const(ubyte)[] bytes, uint hash) pure nothrow @trusted
+    {
+        import core.atomic;
+        import core.memory;
+        shared ubyte[] mem;
+        shared(Node*)* oldBucketRoot = &buckets[hash % buckets.length];
+        while (true)
+        {
+            bool found;
+            shared(Node)* s = find(bytes, hash, found);
+            shared(Node)* n = s is null ? null : s.next;
+            if (found)
+                return cast(string) s.str;
+            if (mem.length == 0)
+            {
+                mem = allocate(bytes.length);
+                mem[] = bytes[];
+            }
+            shared(Node)* node = new shared Node(mem, hash, null);
+            if (s is null && cas(oldBucketRoot, *oldBucketRoot, node))
+                break;
+            node.next = s.next;
+            if (cas(&s.next, n, node))
+                break;
+        }
+        return cast(string) mem;
+    }
+
+    shared(Node)* find(const(ubyte)[] bytes, uint hash, ref bool found) pure nothrow @trusted
+    {
+        import std.algorithm;
+        immutable size_t index = hash % buckets.length;
+        shared(Node)* node = buckets[index];
+        while (node !is null)
+        {
+            if (node.hash >= hash && bytes.equal(cast(ubyte[]) node.str))
+            {
+                found = true;
+                return node;
+            }
+            node = node.next;
+        }
+        return node;
+    }
+
+    static uint hashBytes(const(ubyte)[] data) pure nothrow @trusted
     {
         uint hash = 0;
         foreach (b; data)
@@ -978,23 +893,65 @@ private:
         return hash;
     }

-    static struct Item
-    {
-        size_t index;
-        string str;
-        uint hash;
-        Item* next;
-    }
+    shared(ubyte[]) allocate(immutable size_t numBytes) pure nothrow @trusted
+    in
+    {
+        assert (numBytes != 0);
+    }
+    body
+    {
+        import core.atomic;
+        import core.memory;
+        if (numBytes > (blockSize / 4))
+            return cast(shared) (cast(ubyte*) GC.malloc(numBytes, GC.BlkAttr.NO_SCAN))[0 .. numBytes];
+        shared(Block)* r = rootBlock;
+        while (true)
+        {
+            while (r !is null)
+            {
+                while (true)
+                {
+                    immutable size_t available = r.bytes.length;
+                    immutable size_t oldUsed = atomicLoad(r.used);
+                    immutable size_t newUsed = oldUsed + numBytes;
+                    if (newUsed > available)
+                        break;
+                    if (cas(&r.used, oldUsed, newUsed))
+                        return r.bytes[oldUsed .. newUsed];
+                }
+                r = r.next;
+            }
+            if (cas(&allocating, false, true))
+            {
+                shared(Block)* b = new shared Block(
+                    cast(shared) (cast(ubyte*) GC.malloc(blockSize, GC.BlkAttr.NO_SCAN))[0 .. blockSize],
+                    numBytes,
+                    r);
+                atomicStore(rootBlock, b);
+                atomicStore(allocating, false);
+                r = rootBlock;
+                return b.bytes[0 .. numBytes];
+            }
+        }
+    }

-    static struct Block
+    static shared struct Node
+    {
+        ubyte[] str;
+        uint hash;
+        shared(Node)* next;
+    }
+
+    static shared struct Block
     {
         ubyte[] bytes;
         size_t used;
+        shared(Block)* next;
     }

     static enum blockSize = 1024 * 16;

-    public static immutable uint[] sbox = [
+    static immutable uint[] sbox = [
         0xF53E1837, 0x5F14C86B, 0x9EE3964C, 0xFA796D53,
         0x32223FC3, 0x4D82BC98, 0xA0C7FA62, 0x63E2C982,
         0x24994A5B, 0x1ECE7BEE, 0x292B38EF, 0xD5CD4E56,
@@ -1061,9 +1018,7 @@ private:
         0x3C034CBA, 0xACDA62FC, 0x11923B8B, 0x45EF170A,
     ];

-    Item*[] items;
-    Item*[] buckets;
-    Block[] blocks;
-    debug size_t memoryRequested;
-    debug uint rehashCount;
+    shared bool allocating;
+    shared(Node)*[] buckets;
+    shared(Block)* rootBlock;
 }
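Since the new code replaces the old documentation comment with a joke, a note on the scheme: each bucket is a singly linked list that is only ever grown, so readers traverse it without locking, and writers publish a node by compare-and-swap on the bucket head or on a node's next pointer, retrying if they lose the race. allocate() likewise bumps a block's used counter with cas(), and only the creation of a fresh block is serialized through the allocating flag, which is the "nearly" in "nearly lock free". A stripped-down sketch of the insert-if-absent pattern, with illustrative types rather than the commit's own:

    import core.atomic : cas;

    shared struct Bag
    {
        static shared struct Node
        {
            string str;
            shared(Node)* next;
        }

        // A single bucket; the real cache indexes an array of these by hash.
        shared(Node)* head;

        // Walk the chain looking for the string; if absent, try to CAS a new
        // node in as the head. A failed CAS means another thread changed the
        // head first, so loop and re-walk.
        string intern(string s) @trusted
        {
            while (true)
            {
                for (shared(Node)* n = head; n !is null; n = n.next)
                    if (cast(string) n.str == s)
                        return cast(string) n.str;
                shared(Node)* oldHead = head;
                if (cas(&head, oldHead, new shared Node(s, oldHead)))
                    return s;
            }
        }
    }

Two threads can race past the walk and both build a node for the same string; the loser's CAS fails, it re-walks, finds the winner's node, and its own node is simply never linked. That trades a little garbage under contention for never blocking a reader.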