String cache improvements
This commit is contained in:
parent
a3f9be1e12
commit
281b46eea2
2
ctags.d
2
ctags.d
|
@ -20,7 +20,7 @@ void printCtags(File output, string[] fileNames)
|
|||
{
|
||||
string[] tags;
|
||||
LexerConfig config;
|
||||
StringCache cache;
|
||||
StringCache* cache = new StringCache;
|
||||
foreach (fileName; fileNames)
|
||||
{
|
||||
File f = File(fileName);
|
||||
|
|
9
main.d
9
main.d
|
@ -93,7 +93,7 @@ int main(string[] args)
|
|||
return 1;
|
||||
}
|
||||
|
||||
StringCache cache;
|
||||
StringCache* cache = new StringCache;
|
||||
|
||||
if (tokenDump || highlight)
|
||||
{
|
||||
|
@ -111,10 +111,8 @@ int main(string[] args)
|
|||
}
|
||||
else if (tokenDump)
|
||||
{
|
||||
while (!tokens.empty)
|
||||
foreach (token; tokens)
|
||||
{
|
||||
auto token = tokens.front();
|
||||
tokens.popFront();
|
||||
writeln("«", token.text is null ? str(token.type) : token.text,
|
||||
"» ", token.index, " ", token.line, " ", token.column, " ",
|
||||
token.comment);
|
||||
|
@ -152,11 +150,14 @@ int main(string[] args)
|
|||
ulong count;
|
||||
foreach (f; expandArgs(args, recursive))
|
||||
{
|
||||
import core.memory;
|
||||
GC.disable();
|
||||
auto tokens = byToken!(ubyte[])(readFile(f));
|
||||
if (tokenCount)
|
||||
count += printTokenCount(stdout, f, tokens);
|
||||
else
|
||||
count += printLineCount(stdout, f, tokens);
|
||||
GC.enable();
|
||||
}
|
||||
writefln("total:\t%d", count);
|
||||
}
|
||||
|
|
|
@ -50,13 +50,13 @@ private enum dynamicTokens = [
|
|||
"dstringLiteral", "stringLiteral", "wstringLiteral", "scriptLine"
|
||||
];
|
||||
|
||||
public alias TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens) IdType;
|
||||
public alias tokenStringRepresentation!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens) str;
|
||||
public alias IdType = TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens);
|
||||
public alias str = tokenStringRepresentation!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens);
|
||||
public template tok(string token)
|
||||
{
|
||||
alias TokenId!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens, token) tok;
|
||||
alias tok = TokenId!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens, token);
|
||||
}
|
||||
enum extraFields = q{
|
||||
private enum extraFields = q{
|
||||
string comment;
|
||||
|
||||
int opCmp(size_t i) const pure nothrow @safe {
|
||||
|
@ -65,7 +65,7 @@ enum extraFields = q{
|
|||
return 0;
|
||||
}
|
||||
};
|
||||
public alias stdx.lexer.TokenStructure!(IdType, extraFields) Token;
|
||||
public alias Token = stdx.lexer.TokenStructure!(IdType, extraFields);
|
||||
|
||||
/**
|
||||
* Configure string lexing behavior
|
||||
|
@ -115,17 +115,17 @@ public struct LexerConfig
|
|||
public auto byToken(R)(R range)
|
||||
{
|
||||
LexerConfig config;
|
||||
StringCache cache;
|
||||
StringCache* cache = new StringCache;
|
||||
return byToken(range, config, cache);
|
||||
}
|
||||
|
||||
public auto byToken(R)(R range, StringCache cache)
|
||||
public auto byToken(R)(R range, StringCache* cache)
|
||||
{
|
||||
LexerConfig config;
|
||||
return DLexer!(R)(range, config, cache);
|
||||
}
|
||||
|
||||
public auto byToken(R)(R range, const LexerConfig config, StringCache cache)
|
||||
public auto byToken(R)(R range, const LexerConfig config, StringCache* cache)
|
||||
{
|
||||
return DLexer!(R)(range, config, cache);
|
||||
}
|
||||
|
@ -437,12 +437,13 @@ public struct DLexer(R)
|
|||
mixin Lexer!(R, IdType, Token, lexIdentifier, staticTokens,
|
||||
dynamicTokens, pseudoTokens, pseudoTokenHandlers, possibleDefaultTokens);
|
||||
|
||||
private alias typeof(range).Mark Mark;
|
||||
private alias Mark = typeof(range).Mark;
|
||||
|
||||
this(R range, const LexerConfig config, StringCache cache)
|
||||
this(R range, const LexerConfig config, StringCache* cache)
|
||||
{
|
||||
this.range = LexerRange!(typeof(buffer(range)))(buffer(range));
|
||||
this.config = config;
|
||||
this.cache = cache;
|
||||
popFront();
|
||||
}
|
||||
|
||||
|
@ -1432,8 +1433,8 @@ public struct DLexer(R)
|
|||
if (c >= '[' && c <= '^') return true;
|
||||
if (c >= '{' && c <= '~') return true;
|
||||
if (c == '`') return true;
|
||||
// if (c & 0x80 && (range.lookahead(3).startsWith("\u2028")
|
||||
// || range.lookahead(3).startsWith("\u2029"))) return true;
|
||||
// if (c & 0x80 && (range.lookahead(3) == "\u2028"
|
||||
// || range.lookahead(3) == "\u2029")) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -1452,6 +1453,6 @@ public struct DLexer(R)
|
|||
|
||||
}
|
||||
|
||||
StringCache cache;
|
||||
StringCache* cache;
|
||||
LexerConfig config;
|
||||
}
|
||||
|
|
197
stdx/lexer.d
197
stdx/lexer.d
|
@ -20,6 +20,13 @@ import std.math;
|
|||
import dpick.buffer.buffer;
|
||||
import dpick.buffer.traits;
|
||||
|
||||
/**
|
||||
* Template for determining the type used for a token type. Selects the smallest
|
||||
* unsigned integral type that is able to hold the value
|
||||
* staticTokens.length + dynamicTokens.length. For example if there are 20
|
||||
* static tokens, 30 dynamic tokens, and 10 possible default tokens, this
|
||||
* template will alias itself to ubyte, as 20 + 30 + 10 < ubyte.max.
|
||||
*/
|
||||
template TokenIdType(alias staticTokens, alias dynamicTokens,
|
||||
alias possibleDefaultTokens)
|
||||
{
|
||||
|
@ -33,6 +40,9 @@ template TokenIdType(alias staticTokens, alias dynamicTokens,
|
|||
static assert (false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Looks up the string representation of the given token type.
|
||||
*/
|
||||
string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens, alias possibleDefaultTokens)(IdType type) @property
|
||||
{
|
||||
if (type == 0)
|
||||
|
@ -47,18 +57,41 @@ string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens
|
|||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates the token type identifier for the given symbol. There are two
|
||||
* special cases:
|
||||
* $(UL
|
||||
* $(LI If symbol is "", then the token identifier will be 0)
|
||||
* $(LI If symbol is "\0", then the token identifier will be the maximum
|
||||
* valid token type identifier)
|
||||
* )
|
||||
* In all cases this template will alias itself to a constant of type IdType.
|
||||
* Examples:
|
||||
* ---
|
||||
* enum string[] staticTokens = ["+", "-", "*", "/"];
|
||||
* enum string[] dynamicTokens = ["number"];
|
||||
* enum string[] possibleDefaultTokens = [];
|
||||
* alias IdType = TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens);
|
||||
* template tok(string symbol)
|
||||
* {
|
||||
* alias tok = TokenId!(IdType, staticTokens, dynamicTokens,
|
||||
* possibleDefaultTokens, symbol);
|
||||
* }
|
||||
* IdType plus = tok!"+";
|
||||
* ---
|
||||
*/
|
||||
template TokenId(IdType, alias staticTokens, alias dynamicTokens,
|
||||
alias possibleDefaultTokens, string symbol)
|
||||
{
|
||||
static if (symbol == "")
|
||||
{
|
||||
enum id = 0;
|
||||
alias id TokenId;
|
||||
alias TokenId = id;
|
||||
}
|
||||
else static if (symbol == "\0")
|
||||
{
|
||||
enum id = 1 + staticTokens.length + dynamicTokens.length + possibleDefaultTokens.length;
|
||||
alias id TokenId;
|
||||
alias TokenId = id;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -66,7 +99,7 @@ template TokenId(IdType, alias staticTokens, alias dynamicTokens,
|
|||
static if (i >= 0)
|
||||
{
|
||||
enum id = i + 1;
|
||||
alias id TokenId;
|
||||
alias TokenId = id;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -75,7 +108,7 @@ template TokenId(IdType, alias staticTokens, alias dynamicTokens,
|
|||
{
|
||||
enum id = ii + staticTokens.length + 1;
|
||||
static assert (id >= 0 && id < IdType.max, "Invalid token: " ~ symbol);
|
||||
alias id TokenId;
|
||||
alias TokenId = id;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -84,24 +117,43 @@ template TokenId(IdType, alias staticTokens, alias dynamicTokens,
|
|||
? i + staticTokens.length + possibleDefaultTokens.length + dynamicId + 1
|
||||
: -1;
|
||||
static assert (id >= 0 && id < IdType.max, "Invalid token: " ~ symbol);
|
||||
alias id TokenId;
|
||||
alias TokenId = id;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The token that is returned by the lexer.
|
||||
* Params:
|
||||
* IDType = The D type of the "type" token type field.
|
||||
* extraFields = A string containing D code for any extra fields that should
|
||||
* be included in the token structure body. This string is passed
|
||||
* directly to a mixin statement.
|
||||
*/
|
||||
struct TokenStructure(IDType, string extraFields = "")
|
||||
{
|
||||
public:
|
||||
|
||||
/**
|
||||
* == overload for the the token type.
|
||||
*/
|
||||
bool opEquals(IDType type) const pure nothrow @safe
|
||||
{
|
||||
return this.type == type;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
this(IDType type)
|
||||
{
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
this(IDType type, string text, size_t line, size_t column, size_t index)
|
||||
{
|
||||
this.text = text;
|
||||
|
@ -111,11 +163,31 @@ struct TokenStructure(IDType, string extraFields = "")
|
|||
this.index = index;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
string text;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
size_t line;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
size_t column;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
size_t index;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
IDType type;
|
||||
|
||||
mixin (extraFields);
|
||||
}
|
||||
|
||||
|
@ -223,21 +295,21 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
|
|||
|
||||
static string escape(string input)
|
||||
{
|
||||
string rVal;
|
||||
string retVal;
|
||||
foreach (ubyte c; cast(ubyte[]) input)
|
||||
{
|
||||
switch (c)
|
||||
{
|
||||
case '\\': rVal ~= `\\`; break;
|
||||
case '"': rVal ~= `\"`; break;
|
||||
case '\'': rVal ~= `\'`; break;
|
||||
case '\t': rVal ~= `\t`; break;
|
||||
case '\n': rVal ~= `\n`; break;
|
||||
case '\r': rVal ~= `\r`; break;
|
||||
default: rVal ~= c; break;
|
||||
case '\\': retVal ~= `\\`; break;
|
||||
case '"': retVal ~= `\"`; break;
|
||||
case '\'': retVal ~= `\'`; break;
|
||||
case '\t': retVal ~= `\t`; break;
|
||||
case '\n': retVal ~= `\n`; break;
|
||||
case '\r': retVal ~= `\r`; break;
|
||||
default: retVal ~= c; break;
|
||||
}
|
||||
}
|
||||
return rVal;
|
||||
return retVal;
|
||||
}
|
||||
|
||||
Token advance() pure
|
||||
|
@ -262,10 +334,10 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
|
|||
*/
|
||||
static T[] stupidToArray(R, T = ElementType!R)(R range)
|
||||
{
|
||||
T[] rVal;
|
||||
T[] retVal;
|
||||
foreach (v; range)
|
||||
rVal ~= v;
|
||||
return rVal;
|
||||
retVal ~= v;
|
||||
return retVal;
|
||||
}
|
||||
|
||||
LexerRange!(typeof(buffer(R.init))) range;
|
||||
|
@ -302,20 +374,56 @@ struct LexerRange(BufferType) if (isBuffer!BufferType)
|
|||
size_t line;
|
||||
}
|
||||
|
||||
/**
|
||||
* The string cache should be used within lexer implementations for several
|
||||
* reasons:
|
||||
* $(UL
|
||||
* $(LI Reducing memory consumption.)
|
||||
* $(LI Increasing performance in token comparisons)
|
||||
* $(LI Correctly creating immutable token text if the lexing source is not
|
||||
* immutable)
|
||||
* )
|
||||
*/
|
||||
struct StringCache
|
||||
{
|
||||
public:
|
||||
|
||||
/**
|
||||
* Equivalent to calling cache() and get().
|
||||
* ---
|
||||
* StringCache cache;
|
||||
* ubyte[] str = ['a', 'b', 'c'];
|
||||
* string s = cache.get(cache.cache(str));
|
||||
* assert(s == "abc");
|
||||
* ---
|
||||
*/
|
||||
string cacheGet(const(ubyte[]) bytes) pure nothrow @safe
|
||||
{
|
||||
return get(cache(bytes));
|
||||
}
|
||||
|
||||
/**
|
||||
* Caches a string.
|
||||
* Params: bytes = the string to cache
|
||||
* Returns: A key that can be used to retrieve the cached string
|
||||
* Examples:
|
||||
* ---
|
||||
* StringCache cache;
|
||||
* ubyte[] bytes = ['a', 'b', 'c'];
|
||||
* size_t first = cache.cache(bytes);
|
||||
* size_t second = cache.cache(bytes);
|
||||
* assert (first == second);
|
||||
* ---
|
||||
*/
|
||||
size_t cache(const(ubyte)[] bytes) pure nothrow @safe
|
||||
in
|
||||
{
|
||||
assert (bytes.length > 0);
|
||||
}
|
||||
out (retVal)
|
||||
{
|
||||
assert (retVal < items.length);
|
||||
}
|
||||
body
|
||||
{
|
||||
immutable uint hash = hashBytes(bytes);
|
||||
|
@ -325,12 +433,21 @@ public:
|
|||
return found.index;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a cached string based on its key.
|
||||
* Params: index = the key
|
||||
* Returns: the cached string
|
||||
*/
|
||||
string get(size_t index) const pure nothrow @safe
|
||||
in
|
||||
{
|
||||
assert (items.length > index);
|
||||
assert (items[index] !is null);
|
||||
}
|
||||
out (retVal)
|
||||
{
|
||||
assert (retVal !is null);
|
||||
}
|
||||
body
|
||||
{
|
||||
return items[index].str;
|
||||
|
@ -345,7 +462,7 @@ private:
|
|||
item.str = allocate(bytes);
|
||||
item.index = items.length;
|
||||
items ~= item;
|
||||
buckets[hash % bucketCount] ~= item;
|
||||
buckets[hash % buckets.length] ~= item;
|
||||
return item.index;
|
||||
}
|
||||
|
||||
|
@ -361,9 +478,9 @@ private:
|
|||
}
|
||||
|
||||
string allocate(const(ubyte)[] bytes) pure nothrow @trusted
|
||||
out (rVal)
|
||||
out (retVal)
|
||||
{
|
||||
assert (rVal == bytes);
|
||||
assert (retVal == bytes);
|
||||
}
|
||||
body
|
||||
{
|
||||
|
@ -391,23 +508,6 @@ private:
|
|||
return cast(string) blocks[$ - 1].bytes[0 .. bytes.length];
|
||||
}
|
||||
|
||||
Item*[] items;
|
||||
Item*[][bucketCount] buckets;
|
||||
Block[] blocks;
|
||||
|
||||
struct Item
|
||||
{
|
||||
size_t index;
|
||||
string str;
|
||||
uint hash;
|
||||
}
|
||||
|
||||
struct Block
|
||||
{
|
||||
ubyte[] bytes;
|
||||
size_t used;
|
||||
}
|
||||
|
||||
static uint hashBytes(const(ubyte)[] data) pure nothrow @safe
|
||||
{
|
||||
uint hash = 0;
|
||||
|
@ -419,8 +519,21 @@ private:
|
|||
return hash;
|
||||
}
|
||||
|
||||
enum pageSize = 4096 * 1024;
|
||||
enum bucketCount = 2048;
|
||||
static struct Item
|
||||
{
|
||||
size_t index;
|
||||
string str;
|
||||
uint hash;
|
||||
}
|
||||
|
||||
static struct Block
|
||||
{
|
||||
ubyte[] bytes;
|
||||
size_t used;
|
||||
}
|
||||
|
||||
static enum pageSize = 4096 * 1024;
|
||||
static enum bucketCount = 2048;
|
||||
|
||||
static enum uint[] sbox = [
|
||||
0xF53E1837, 0x5F14C86B, 0x9EE3964C, 0xFA796D53,
|
||||
|
@ -488,6 +601,8 @@ private:
|
|||
0xA0B38F96, 0x51D39199, 0x37A6AD75, 0xDF84EE41,
|
||||
0x3C034CBA, 0xACDA62FC, 0x11923B8B, 0x45EF170A,
|
||||
];
|
||||
|
||||
Item*[] items;
|
||||
Item*[][bucketCount] buckets;
|
||||
Block[] blocks;
|
||||
}
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue