String cache improvements

This commit is contained in:
Hackerpilot 2014-01-16 18:46:18 -08:00
parent a3f9be1e12
commit 281b46eea2
4 changed files with 176 additions and 59 deletions

View File

@ -20,7 +20,7 @@ void printCtags(File output, string[] fileNames)
{
string[] tags;
LexerConfig config;
StringCache cache;
StringCache* cache = new StringCache;
foreach (fileName; fileNames)
{
File f = File(fileName);

9
main.d
View File

@ -93,7 +93,7 @@ int main(string[] args)
return 1;
}
StringCache cache;
StringCache* cache = new StringCache;
if (tokenDump || highlight)
{
@ -111,10 +111,8 @@ int main(string[] args)
}
else if (tokenDump)
{
while (!tokens.empty)
foreach (token; tokens)
{
auto token = tokens.front();
tokens.popFront();
writeln("«", token.text is null ? str(token.type) : token.text,
"» ", token.index, " ", token.line, " ", token.column, " ",
token.comment);
@ -152,11 +150,14 @@ int main(string[] args)
ulong count;
foreach (f; expandArgs(args, recursive))
{
    import core.memory;
    // Automatic collections are paused while a single file is lexed and
    // counted.
    GC.disable();
    // GC.disable()/GC.enable() calls must be balanced. The scope guard
    // re-enables the collector on every exit path from this iteration,
    // including an exception thrown while reading, lexing or printing `f`.
    scope (exit) GC.enable();
    auto tokens = byToken!(ubyte[])(readFile(f));
    if (tokenCount)
        count += printTokenCount(stdout, f, tokens);
    else
        count += printLineCount(stdout, f, tokens);
}
writefln("total:\t%d", count);
}

View File

@ -50,13 +50,13 @@ private enum dynamicTokens = [
"dstringLiteral", "stringLiteral", "wstringLiteral", "scriptLine"
];
public alias TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens) IdType;
public alias tokenStringRepresentation!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens) str;
public alias IdType = TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens);
public alias str = tokenStringRepresentation!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens);
public template tok(string token)
{
alias TokenId!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens, token) tok;
alias tok = TokenId!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens, token);
}
enum extraFields = q{
private enum extraFields = q{
string comment;
int opCmp(size_t i) const pure nothrow @safe {
@ -65,7 +65,7 @@ enum extraFields = q{
return 0;
}
};
public alias stdx.lexer.TokenStructure!(IdType, extraFields) Token;
public alias Token = stdx.lexer.TokenStructure!(IdType, extraFields);
/**
* Configure string lexing behavior
@ -115,17 +115,17 @@ public struct LexerConfig
public auto byToken(R)(R range)
{
LexerConfig config;
StringCache cache;
StringCache* cache = new StringCache;
return byToken(range, config, cache);
}
public auto byToken(R)(R range, StringCache cache)
public auto byToken(R)(R range, StringCache* cache)
{
LexerConfig config;
return DLexer!(R)(range, config, cache);
}
public auto byToken(R)(R range, const LexerConfig config, StringCache cache)
public auto byToken(R)(R range, const LexerConfig config, StringCache* cache)
{
return DLexer!(R)(range, config, cache);
}
@ -437,12 +437,13 @@ public struct DLexer(R)
mixin Lexer!(R, IdType, Token, lexIdentifier, staticTokens,
dynamicTokens, pseudoTokens, pseudoTokenHandlers, possibleDefaultTokens);
private alias typeof(range).Mark Mark;
private alias Mark = typeof(range).Mark;
this(R range, const LexerConfig config, StringCache cache)
this(R range, const LexerConfig config, StringCache* cache)
{
this.range = LexerRange!(typeof(buffer(range)))(buffer(range));
this.config = config;
this.cache = cache;
popFront();
}
@ -1432,8 +1433,8 @@ public struct DLexer(R)
if (c >= '[' && c <= '^') return true;
if (c >= '{' && c <= '~') return true;
if (c == '`') return true;
// if (c & 0x80 && (range.lookahead(3).startsWith("\u2028")
// || range.lookahead(3).startsWith("\u2029"))) return true;
// if (c & 0x80 && (range.lookahead(3) == "\u2028"
// || range.lookahead(3) == "\u2029")) return true;
return false;
}
@ -1452,6 +1453,6 @@ public struct DLexer(R)
}
StringCache cache;
StringCache* cache;
LexerConfig config;
}

View File

@ -20,6 +20,13 @@ import std.math;
import dpick.buffer.buffer;
import dpick.buffer.traits;
/**
* Template for determining the type used for a token type. Selects the smallest
* unsigned integral type that is able to hold the value
* staticTokens.length + dynamicTokens.length + possibleDefaultTokens.length.
* For example if there are 20 static tokens, 30 dynamic tokens, and 10 possible
* default tokens, this template will alias itself to ubyte, as
* 20 + 30 + 10 < ubyte.max.
*/
template TokenIdType(alias staticTokens, alias dynamicTokens,
alias possibleDefaultTokens)
{
@ -33,6 +40,9 @@ template TokenIdType(alias staticTokens, alias dynamicTokens,
static assert (false);
}
/**
* Looks up the string representation of the given token type.
*/
string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens, alias possibleDefaultTokens)(IdType type) @property
{
if (type == 0)
@ -47,18 +57,41 @@ string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens
return null;
}
/**
* Generates the token type identifier for the given symbol. There are two
* special cases:
* $(UL
* $(LI If symbol is "", then the token identifier will be 0)
* $(LI If symbol is "\0", then the token identifier will be the maximum
* valid token type identifier)
* )
* In all cases this template will alias itself to a constant of type IdType.
* Examples:
* ---
* enum string[] staticTokens = ["+", "-", "*", "/"];
* enum string[] dynamicTokens = ["number"];
* enum string[] possibleDefaultTokens = [];
* alias IdType = TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens);
* template tok(string symbol)
* {
* alias tok = TokenId!(IdType, staticTokens, dynamicTokens,
* possibleDefaultTokens, symbol);
* }
* IdType plus = tok!"+";
* ---
*/
template TokenId(IdType, alias staticTokens, alias dynamicTokens,
alias possibleDefaultTokens, string symbol)
{
static if (symbol == "")
{
enum id = 0;
alias id TokenId;
alias TokenId = id;
}
else static if (symbol == "\0")
{
enum id = 1 + staticTokens.length + dynamicTokens.length + possibleDefaultTokens.length;
alias id TokenId;
alias TokenId = id;
}
else
{
@ -66,7 +99,7 @@ template TokenId(IdType, alias staticTokens, alias dynamicTokens,
static if (i >= 0)
{
enum id = i + 1;
alias id TokenId;
alias TokenId = id;
}
else
{
@ -75,7 +108,7 @@ template TokenId(IdType, alias staticTokens, alias dynamicTokens,
{
enum id = ii + staticTokens.length + 1;
static assert (id >= 0 && id < IdType.max, "Invalid token: " ~ symbol);
alias id TokenId;
alias TokenId = id;
}
else
{
@ -84,24 +117,43 @@ template TokenId(IdType, alias staticTokens, alias dynamicTokens,
? i + staticTokens.length + possibleDefaultTokens.length + dynamicId + 1
: -1;
static assert (id >= 0 && id < IdType.max, "Invalid token: " ~ symbol);
alias id TokenId;
alias TokenId = id;
}
}
}
}
/**
* The token that is returned by the lexer.
* Params:
* IDType = The D type of the "type" token type field.
* extraFields = A string containing D code for any extra fields that should
* be included in the token structure body. This string is passed
* directly to a mixin statement.
*/
struct TokenStructure(IDType, string extraFields = "")
{
public:
/**
* == overload for the token type: compares this token's type to the given type.
*/
bool opEquals(IDType type) const pure nothrow @safe
{
return this.type == type;
}
/**
* Constructs a token of the given type; only the type field is assigned.
*/
this(IDType type)
{
this.type = type;
}
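/**
* Constructs a token from its type, its text and its position in the source
* (line, column and index).
*/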
this(IDType type, string text, size_t line, size_t column, size_t index)
{
this.text = text;
@ -111,11 +163,31 @@ struct TokenStructure(IDType, string extraFields = "")
this.index = index;
}
/**
* The text of the token. May be null.
*/
string text;
/**
* The line on which the token occurs.
*/
size_t line;
/**
* The column at which the token occurs.
*/
size_t column;
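/**
* The index of the token in the source.
* NOTE(review): presumably an offset from the start of the input; confirm the
* unit (bytes vs. code points).
*/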
size_t index;
/**
* The type of the token; this is the value compared by opEquals.
*/
IDType type;
mixin (extraFields);
}
@ -223,21 +295,21 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
static string escape(string input)
{
string rVal;
string retVal;
foreach (ubyte c; cast(ubyte[]) input)
{
switch (c)
{
case '\\': rVal ~= `\\`; break;
case '"': rVal ~= `\"`; break;
case '\'': rVal ~= `\'`; break;
case '\t': rVal ~= `\t`; break;
case '\n': rVal ~= `\n`; break;
case '\r': rVal ~= `\r`; break;
default: rVal ~= c; break;
case '\\': retVal ~= `\\`; break;
case '"': retVal ~= `\"`; break;
case '\'': retVal ~= `\'`; break;
case '\t': retVal ~= `\t`; break;
case '\n': retVal ~= `\n`; break;
case '\r': retVal ~= `\r`; break;
default: retVal ~= c; break;
}
}
return rVal;
return retVal;
}
Token advance() pure
@ -262,10 +334,10 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
*/
static T[] stupidToArray(R, T = ElementType!R)(R range)
{
T[] rVal;
T[] retVal;
foreach (v; range)
rVal ~= v;
return rVal;
retVal ~= v;
return retVal;
}
LexerRange!(typeof(buffer(R.init))) range;
@ -302,20 +374,56 @@ struct LexerRange(BufferType) if (isBuffer!BufferType)
size_t line;
}
/**
* The string cache should be used within lexer implementations for several
* reasons:
* $(UL
* $(LI Reducing memory consumption.)
* $(LI Increasing performance in token comparisons)
* $(LI Correctly creating immutable token text if the lexing source is not
* immutable)
* )
*/
struct StringCache
{
public:
/**
* Caches the given bytes and returns the cached string in one step; a
* shorthand for get(cache(bytes)).
* Params: bytes = the string to cache. Must not be empty, as required by the
*     in-contract of cache().
* Returns: the cached string
* Examples:
* ---
* StringCache cache;
* ubyte[] str = ['a', 'b', 'c'];
* string s = cache.cacheGet(str);
* assert(s == "abc");
* ---
*/
string cacheGet(const(ubyte[]) bytes) pure nothrow @safe
{
    // Store (or find) the bytes first, then resolve the resulting key.
    immutable size_t key = cache(bytes);
    return get(key);
}
/**
* Caches a string.
* Params: bytes = the string to cache
* Returns: A key that can be used to retrieve the cached string
* Examples:
* ---
* StringCache cache;
* ubyte[] bytes = ['a', 'b', 'c'];
* size_t first = cache.cache(bytes);
* size_t second = cache.cache(bytes);
* assert (first == second);
* ---
*/
size_t cache(const(ubyte)[] bytes) pure nothrow @safe
in
{
assert (bytes.length > 0);
}
out (retVal)
{
assert (retVal < items.length);
}
body
{
immutable uint hash = hashBytes(bytes);
@ -325,12 +433,21 @@ public:
return found.index;
}
/**
* Gets a cached string based on its key.
* Params: index = the key
* Returns: the cached string
*/
string get(size_t index) const pure nothrow @safe
in
{
assert (items.length > index);
assert (items[index] !is null);
}
out (retVal)
{
assert (retVal !is null);
}
body
{
return items[index].str;
@ -345,7 +462,7 @@ private:
item.str = allocate(bytes);
item.index = items.length;
items ~= item;
buckets[hash % bucketCount] ~= item;
buckets[hash % buckets.length] ~= item;
return item.index;
}
@ -361,9 +478,9 @@ private:
}
string allocate(const(ubyte)[] bytes) pure nothrow @trusted
out (rVal)
out (retVal)
{
assert (rVal == bytes);
assert (retVal == bytes);
}
body
{
@ -391,23 +508,6 @@ private:
return cast(string) blocks[$ - 1].bytes[0 .. bytes.length];
}
Item*[] items;
Item*[][bucketCount] buckets;
Block[] blocks;
struct Item
{
size_t index;
string str;
uint hash;
}
struct Block
{
ubyte[] bytes;
size_t used;
}
static uint hashBytes(const(ubyte)[] data) pure nothrow @safe
{
uint hash = 0;
@ -419,8 +519,21 @@ private:
return hash;
}
enum pageSize = 4096 * 1024;
enum bucketCount = 2048;
/// One cached string together with the data used to look it up.
static struct Item
{
/// Position of this item in `items`; this is the key handed back to callers.
size_t index;
/// The cached string, as produced by allocate().
string str;
/// NOTE(review): presumably the hashBytes() value of `str`; the assignment
/// is not visible in this view, so confirm.
uint hash;
}
/// A chunk of storage for cached strings; allocate() returns slices of the
/// last block's `bytes`.
static struct Block
{
/// Backing storage that cached strings are sliced from.
ubyte[] bytes;
/// NOTE(review): presumably the number of bytes of `bytes` already handed
/// out; confirm against allocate().
size_t used;
}
/// 4 MiB. NOTE(review): presumably the size of each Block's storage; its use
/// is not visible in this view, so confirm.
static enum pageSize = 4096 * 1024;
/// Number of hash buckets; fixes the length of the `buckets` array.
static enum bucketCount = 2048;
static enum uint[] sbox = [
0xF53E1837, 0x5F14C86B, 0x9EE3964C, 0xFA796D53,
@ -488,6 +601,8 @@ private:
0xA0B38F96, 0x51D39199, 0x37A6AD75, 0xDF84EE41,
0x3C034CBA, 0xACDA62FC, 0x11923B8B, 0x45EF170A,
];
Item*[] items;
Item*[][bucketCount] buckets;
Block[] blocks;
}