String cache improvements

2014-01-16 18:46:18 -08:00 · 2014-01-16 18:46:18 -08:00 · 281b46eea2
parent a3f9be1e12
commit 281b46eea2
4 changed files with 176 additions and 59 deletions
--- a/ctags.d
+++ b/ctags.d
@ -20,7 +20,7 @@ void printCtags(File output, string[] fileNames)
 {
 	string[] tags;
 	LexerConfig config;
-	StringCache cache;
+	StringCache* cache = new StringCache;
 	foreach (fileName; fileNames)
 	{
 		File f = File(fileName);
--- a/main.d
+++ b/main.d
@ -93,7 +93,7 @@ int main(string[] args)
 		return 1;
 	}

-	StringCache cache;
+	StringCache* cache = new StringCache;

 	if (tokenDump || highlight)
 	{
@ -111,10 +111,8 @@ int main(string[] args)
 		}
 		else if (tokenDump)
 		{
-			while (!tokens.empty)
+			foreach (token; tokens)
 			{
-				auto token = tokens.front();
-				tokens.popFront();
 				writeln("«", token.text is null ? str(token.type) : token.text,
 					"» ", token.index, " ", token.line, " ", token.column, " ",
 					token.comment);
@ -152,11 +150,14 @@ int main(string[] args)
 				ulong count;
 				foreach (f; expandArgs(args, recursive))
 				{
+					import core.memory;
+					GC.disable();
 					auto tokens = byToken!(ubyte[])(readFile(f));
 					if (tokenCount)
 						count += printTokenCount(stdout, f, tokens);
 					else
 						count += printLineCount(stdout, f, tokens);
+					GC.enable();
 				}
 				writefln("total:\t%d", count);
 			}
--- a/stdx/d/lexer.d
+++ b/stdx/d/lexer.d
@ -50,13 +50,13 @@ private enum dynamicTokens = [
 	"dstringLiteral", "stringLiteral", "wstringLiteral", "scriptLine"
 ];

-public alias TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens) IdType;
-public alias tokenStringRepresentation!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens) str;
+public alias IdType = TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens);
+public alias str = tokenStringRepresentation!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens);
 public template tok(string token)
 {
-  alias TokenId!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens, token) tok;
+  alias tok = TokenId!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens, token);
 }
-enum extraFields = q{
+private enum extraFields = q{
    string comment;

    int opCmp(size_t i) const pure nothrow @safe {
@ -65,7 +65,7 @@ enum extraFields = q{
        return 0;
    }
 };
-public alias stdx.lexer.TokenStructure!(IdType, extraFields) Token;
+public alias Token = stdx.lexer.TokenStructure!(IdType, extraFields);

 /**
 * Configure string lexing behavior
@ -115,17 +115,17 @@ public struct LexerConfig
 public auto byToken(R)(R range)
 {
    LexerConfig config;
-	StringCache cache;
+	StringCache* cache = new StringCache;
    return byToken(range, config, cache);
 }

-public auto byToken(R)(R range, StringCache cache)
+public auto byToken(R)(R range, StringCache* cache)
 {
 	LexerConfig config;
 	return DLexer!(R)(range, config, cache);
 }

-public auto byToken(R)(R range, const LexerConfig config, StringCache cache)
+public auto byToken(R)(R range, const LexerConfig config, StringCache* cache)
 {
 	return DLexer!(R)(range, config, cache);
 }
@ -437,12 +437,13 @@ public struct DLexer(R)
 	mixin Lexer!(R, IdType, Token, lexIdentifier, staticTokens,
 		dynamicTokens, pseudoTokens, pseudoTokenHandlers, possibleDefaultTokens);

-	private alias typeof(range).Mark Mark;
+	private alias Mark = typeof(range).Mark;

-	this(R range, const LexerConfig config, StringCache cache)
+	this(R range, const LexerConfig config, StringCache* cache)
 	{
 		this.range = LexerRange!(typeof(buffer(range)))(buffer(range));
        this.config = config;
+		this.cache = cache;
        popFront();
 	}

@ -1432,8 +1433,8 @@ public struct DLexer(R)
 		if (c >= '[' && c <= '^') return true;
 		if (c >= '{' && c <= '~') return true;
 		if (c == '`') return true;
-//		if (c & 0x80 && (range.lookahead(3).startsWith("\u2028")
-//			|| range.lookahead(3).startsWith("\u2029"))) return true;
+//		if (c & 0x80 && (range.lookahead(3) == "\u2028"
+//			|| range.lookahead(3) == "\u2029")) return true;
 		return false;
 	}

@ -1452,6 +1453,6 @@ public struct DLexer(R)

 	}

-	StringCache cache;
+	StringCache* cache;
 	LexerConfig config;
 }
--- a/stdx/lexer.d
+++ b/stdx/lexer.d
@ -20,6 +20,13 @@ import std.math;
 import dpick.buffer.buffer;
 import dpick.buffer.traits;

+/**
+ * Template for determining the type used for a token type. Selects the smallest
+ * unsigned integral type that is able to hold the value staticTokens.length +
+ * dynamicTokens.length + possibleDefaultTokens.length. For example, if there
+ * are 20 static tokens, 30 dynamic tokens, and 10 possible default tokens, this
+ * template will alias itself to ubyte, as 20 + 30 + 10 < ubyte.max.
+ */
 template TokenIdType(alias staticTokens, alias dynamicTokens,
 	alias possibleDefaultTokens)
 {
@ -33,6 +40,9 @@ template TokenIdType(alias staticTokens, alias dynamicTokens,
 		static assert (false);
 }

+/**
+ * Looks up the string representation of the given token type.
+ */
 string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens, alias possibleDefaultTokens)(IdType type) @property
 {
 	if (type == 0)
@ -47,18 +57,41 @@ string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens
 		return null;
 }

+/**
+ * Generates the token type identifier for the given symbol. There are two
+ * special cases:
+ * $(UL
+ *     $(LI If symbol is "", then the token identifier will be 0)
+ *     $(LI If symbol is "\0", then the token identifier will be the maximum
+ *         valid token type identifier)
+ * )
+ * In all cases this template will alias itself to a constant of type IdType.
+ * Examples:
+ * ---
+ * enum string[] staticTokens = ["+", "-", "*", "/"];
+ * enum string[] dynamicTokens = ["number"];
+ * enum string[] possibleDefaultTokens = [];
+ * alias IdType = TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens);
+ * template tok(string symbol)
+ * {
+ *     alias tok = TokenId!(IdType, staticTokens, dynamicTokens,
+ *         possibleDefaultTokens, symbol);
+ * }
+ * IdType plus = tok!"+";
+ * ---
+ */
 template TokenId(IdType, alias staticTokens, alias dynamicTokens,
 	alias possibleDefaultTokens, string symbol)
 {
 	static if (symbol == "")
 	{
 		enum id = 0;
-		alias id TokenId;
+		alias TokenId = id;
 	}
 	else static if (symbol == "\0")
 	{
 		enum id = 1 + staticTokens.length + dynamicTokens.length + possibleDefaultTokens.length;
-		alias id TokenId;
+		alias TokenId = id;
 	}
 	else
 	{
@ -66,7 +99,7 @@ template TokenId(IdType, alias staticTokens, alias dynamicTokens,
 		static if (i >= 0)
 		{
 			enum id = i + 1;
-			alias id TokenId;
+			alias TokenId = id;
 		}
 		else
 		{
@ -75,7 +108,7 @@ template TokenId(IdType, alias staticTokens, alias dynamicTokens,
 			{
 				enum id = ii + staticTokens.length + 1;
 				static assert (id >= 0 && id < IdType.max, "Invalid token: " ~ symbol);
-				alias id TokenId;
+				alias TokenId = id;
 			}
 			else
 			{
@ -84,24 +117,43 @@ template TokenId(IdType, alias staticTokens, alias dynamicTokens,
 					? i + staticTokens.length + possibleDefaultTokens.length + dynamicId + 1
 					: -1;
 				static assert (id >= 0 && id < IdType.max, "Invalid token: " ~ symbol);
-				alias id TokenId;
+				alias TokenId = id;
 			}
 		}
 	}
 }

+/**
+ * The token that is returned by the lexer.
+ * Params:
+ *     IDType = The D type of the "type" token type field.
+ *     extraFields = A string containing D code for any extra fields that should
+ *         be included in the token structure body. This string is passed
+ *         directly to a mixin statement.
+ */
 struct TokenStructure(IDType, string extraFields = "")
 {
+public:
+
+	/**
+	 * == overload for comparing a token directly with a token type.
+	 */
 	bool opEquals(IDType type) const pure nothrow @safe
 	{
 		return this.type == type;
 	}

+	/**
+	 * Constructs a token that has only its type set.
+	 */
 	this(IDType type)
 	{
 		this.type = type;
 	}

+	/**
+	 * Constructs a token from its type, text, line, column, and index.
+	 */
 	this(IDType type, string text, size_t line, size_t column, size_t index)
 	{
 		this.text = text;
@ -111,11 +163,31 @@ struct TokenStructure(IDType, string extraFields = "")
 		this.index = index;
 	}

+	/**
+	 * The text of the token.
+	 */
 	string text;
+
+	/**
+	 * The line on which the token occurs.
+	 */
 	size_t line;
+
+	/**
+	 * The column at which the token occurs.
+	 */
 	size_t column;
+
+	/**
+	 * The index of the token within the lexed input.
+	 */
 	size_t index;
+
+	/**
+	 * The token type.
+	 */
 	IDType type;
+
 	mixin (extraFields);
 }

@ -223,21 +295,21 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,

 	static string escape(string input)
 	{
-		string rVal;
+		string retVal;
 		foreach (ubyte c; cast(ubyte[]) input)
 		{
 			switch (c)
 			{
-			case '\\': rVal ~= `\\`; break;
-			case '"': rVal ~= `\"`; break;
-			case '\'': rVal ~= `\'`; break;
-			case '\t': rVal ~= `\t`; break;
-			case '\n': rVal ~= `\n`; break;
-			case '\r': rVal ~= `\r`; break;
-			default: rVal ~= c; break;
+			case '\\': retVal ~= `\\`; break;
+			case '"': retVal ~= `\"`; break;
+			case '\'': retVal ~= `\'`; break;
+			case '\t': retVal ~= `\t`; break;
+			case '\n': retVal ~= `\n`; break;
+			case '\r': retVal ~= `\r`; break;
+			default: retVal ~= c; break;
 			}
 		}
-		return rVal;
+		return retVal;
 	}

 	Token advance() pure
@ -262,10 +334,10 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 	 */
 	static T[] stupidToArray(R, T = ElementType!R)(R range)
 	{
-		T[] rVal;
+		T[] retVal;
 		foreach (v; range)
-			rVal ~= v;
-		return rVal;
+			retVal ~= v;
+		return retVal;
 	}

 	LexerRange!(typeof(buffer(R.init))) range;
@ -302,20 +374,56 @@ struct LexerRange(BufferType) if (isBuffer!BufferType)
 	size_t line;
 }

+/**
+ * The string cache should be used within lexer implementations for several
+ * reasons:
+ * $(UL
+ *     $(LI Reducing memory consumption.)
+ *     $(LI Increasing performance in token comparisons)
+ *     $(LI Correctly creating immutable token text if the lexing source is not
+ *     immutable)
+ * )
+ */
 struct StringCache
 {
 public:

+	/**
+	 * Equivalent to calling cache() and get().
+	 * ---
+	 * StringCache cache;
+	 * ubyte[] str = ['a', 'b', 'c'];
+	 * string s = cache.get(cache.cache(str));
+	 * assert(s == "abc");
+	 * ---
+	 */
 	string cacheGet(const(ubyte[]) bytes) pure nothrow @safe
 	{
 		return get(cache(bytes));
 	}

+	/**
+	 * Caches a string.
+	 * Params: bytes = the string to cache
+	 * Returns: A key that can be used to retrieve the cached string
+	 * Examples:
+	 * ---
+	 * StringCache cache;
+	 * ubyte[] bytes = ['a', 'b', 'c'];
+	 * size_t first = cache.cache(bytes);
+	 * size_t second = cache.cache(bytes);
+	 * assert (first == second);
+	 * ---
+	 */
 	size_t cache(const(ubyte)[] bytes) pure nothrow @safe
 	in
 	{
 		assert (bytes.length > 0);
 	}
+	out (retVal)
+	{
+		assert (retVal < items.length);
+	}
 	body
 	{
 		immutable uint hash = hashBytes(bytes);
@ -325,12 +433,21 @@ public:
 		return found.index;
 	}

+	/**
+	 * Gets a cached string based on its key.
+	 * Params: index = the key
+	 * Returns: the cached string
+	 */
 	string get(size_t index) const pure nothrow @safe
 	in
 	{
 		assert (items.length > index);
 		assert (items[index] !is null);
 	}
+	out (retVal)
+	{
+		assert (retVal !is null);
+	}
 	body
 	{
 		return items[index].str;
@ -345,7 +462,7 @@ private:
 		item.str = allocate(bytes);
 		item.index = items.length;
 		items ~= item;
-		buckets[hash % bucketCount] ~= item;
+		buckets[hash % buckets.length] ~= item;
 		return item.index;
 	}

@ -361,9 +478,9 @@ private:
 	}

 	string allocate(const(ubyte)[] bytes) pure nothrow @trusted
-	out (rVal)
+	out (retVal)
 	{
-		assert (rVal == bytes);
+		assert (retVal == bytes);
 	}
 	body
 	{
@ -391,23 +508,6 @@ private:
 		return cast(string) blocks[$ - 1].bytes[0 .. bytes.length];
 	}

-	Item*[] items;
-	Item*[][bucketCount] buckets;
-	Block[] blocks;
-
-	struct Item
-	{
-		size_t index;
-		string str;
-		uint hash;
-	}
-
-	struct Block
-	{
-		ubyte[] bytes;
-		size_t used;
-	}
-
 	static uint hashBytes(const(ubyte)[] data) pure nothrow @safe
    {
        uint hash = 0;
@ -419,8 +519,21 @@ private:
        return hash;
    }

-	enum pageSize = 4096 * 1024;
-	enum bucketCount = 2048;
+	static struct Item
+	{
+		size_t index;
+		string str;
+		uint hash;
+	}
+
+	static struct Block
+	{
+		ubyte[] bytes;
+		size_t used;
+	}
+
+	static enum pageSize = 4096 * 1024;
+	static enum bucketCount = 2048;

 	static enum uint[] sbox = [
 		0xF53E1837, 0x5F14C86B, 0x9EE3964C, 0xFA796D53,
@ -488,6 +601,8 @@ private:
 		0xA0B38F96, 0x51D39199, 0x37A6AD75, 0xDF84EE41,
 		0x3C034CBA, 0xACDA62FC, 0x11923B8B, 0x45EF170A,
 	];
+
+	Item*[] items;
+	Item*[][bucketCount] buckets;
+	Block[] blocks;
 }
-
-