Include string cache changes from @blackwhale

Hackerpilot 2013-02-08 06:35:41 -08:00
parent 61704db501
commit 62c27452ca
2 changed files with 2457 additions and 2363 deletions

View File

@@ -1,4 +1,4 @@
#dmd *.d std/d/*.d -release -inline -noboundscheck -O -w -wi -m64 -property -ofdscanner -L-lsqlite3 #-inline
#dmd *.d std/d/*.d -g -m64 -w -wi -property -ofdscanner -unittest
-#ldc2 -O3 *.d std/d/*.d -of=dscanner -release -vectorize -m64
+ldc2 -O2 *.d std/d/*.d -of=dscanner -release -vectorize -m64
-ldc2 *.d std/d/*.d -of=dscanner -unittest -m64 -g
+#ldc2 *.d std/d/*.d -of=dscanner -unittest -m64 -g

View File

@@ -1,110 +1,110 @@
// Written in the D programming language
/**
 * This module contains a range-based _lexer for the D programming language.
 *
 * For performance reasons the _lexer contained in this module operates only on
 * ASCII and UTF-8 encoded source code. If the use of other encodings is
 * desired, the source code must be converted to UTF-8 before passing it to this
 * _lexer.
 *
 * To use the _lexer, create a LexerConfig struct
 * ---
 * LexerConfig config;
 * config.iterStyle = IterationStyle.everything;
 * config.tokenStyle = TokenStyle.source;
 * config.versionNumber = 2061;
 * config.vendorString = "Lexer Example";
 * ---
 * Once you have configured the _lexer, call byToken$(LPAREN)$(RPAREN) on your
 * source code, passing in the configuration.
 * ---
 * auto source = "import std.stdio;"c;
 * auto tokens = byToken(source, config);
 * ---
 * The result of byToken$(LPAREN)$(RPAREN) is a forward range of tokens that can
 * be used easily with the algorithms from std.algorithm or iterated over with
 * $(D_KEYWORD foreach)
 * ---
 * assert (tokens.front.type == TokenType.import_);
 * assert (tokens.front.value == "import");
 * assert (tokens.front.line == 1);
 * assert (tokens.front.startIndex == 0);
 * ---
 *
 * Examples:
 *
 * Generate HTML markup of D code.
 * ---
 * module highlighter;
 *
 * import std.stdio;
 * import std.array;
 * import std.d.lexer;
 *
 * void writeSpan(string cssClass, string value)
 * {
 *     stdout.write(`<span class="`, cssClass, `">`, value.replace("&", "&amp;").replace("<", "&lt;"), `</span>`);
 * }
 *
 *
 * // http://ethanschoonover.com/solarized
 * void highlight(R)(R tokens)
 * {
 *     stdout.writeln(q"[<!DOCTYPE html>
 * <html>
 * <head>
 * <meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
 * </head>
 * <body>
 * <style type="text/css">
 * html { background-color: #fdf6e3; color: #002b36; }
 * .kwrd { color: #b58900; font-weight: bold; }
 * .com { color: #93a1a1; font-style: italic; }
 * .num { color: #dc322f; font-weight: bold; }
 * .str { color: #2aa198; font-style: italic; }
 * .op { color: #586e75; font-weight: bold; }
 * .type { color: #268bd2; font-weight: bold; }
 * .cons { color: #859900; font-weight: bold; }
 * </style>
 * <pre>]");
 *
 *     foreach (Token t; tokens)
 *     {
 *         if (isType(t.type))
 *             writeSpan("type", t.value);
 *         else if (isKeyword(t.type))
 *             writeSpan("kwrd", t.value);
 *         else if (t.type == TokenType.comment)
 *             writeSpan("com", t.value);
 *         else if (isStringLiteral(t.type))
 *             writeSpan("str", t.value);
 *         else if (isNumberLiteral(t.type))
 *             writeSpan("num", t.value);
 *         else if (isOperator(t.type))
 *             writeSpan("op", t.value);
 *         else
 *             stdout.write(t.value.replace("<", "&lt;"));
 *     }
 *     stdout.writeln("</pre>\n</body></html>");
 * }
 *
 * void main(string[] args)
 * {
 *     LexerConfig config;
 *     config.tokenStyle = TokenStyle.source;
 *     config.iterStyle = IterationStyle.everything;
 *     config.fileName = args[1];
 *     auto f = File(args[1]);
 *     (cast(ubyte[]) f.byLine(KeepTerminator.yes).join()).byToken(config).highlight();
 * }
 * ---
 *
 * Copyright: Brian Schott 2013
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Authors: Brian Schott
 * Source: $(PHOBOSSRC std/d/_lexer.d)
 */
module std.d.lexer;
@@ -123,8 +123,8 @@ import std.utf;
public:
/**
 * Represents a D token
 */
struct Token
{
    /**
@@ -186,9 +186,9 @@ struct Token
}
/**
 * Configure the behavior of the byToken() function. These flags may be
 * combined using a bitwise or.
 */
enum IterationStyle
{
    /// Only include code, not whitespace or comments
@@ -206,9 +206,9 @@ enum IterationStyle
}
/**
 * Configuration of the token lexing style. These flags may be combined with a
 * bitwise or.
 */
enum TokenStyle : uint
{
    /**
@@ -249,8 +249,8 @@ enum TokenStyle : uint
}
/**
 * Lexer configuration
 */
struct LexerConfig
{
    /**
@@ -296,13 +296,13 @@ struct LexerConfig
}
/**
 * Iterate over the given range of characters by D tokens.
 * Params:
 *     range = the range of characters
 *     config = the lexer configuration
 * Returns:
 *     an input range of tokens
 */
TokenRange!(R) byToken(R)(R range, LexerConfig config) if (isForwardRange!(R))
{
    auto r = TokenRange!(R)(range);
@@ -313,8 +313,8 @@ TokenRange!(R) byToken(R)(R range, LexerConfig config) if (isForwardRange!(R))
}
/**
 * Range of tokens. Use byToken$(LPAREN)$(RPAREN) to instantiate.
 */
struct TokenRange(R) if (isForwardRange!(R))
{
    /**
@@ -434,7 +434,10 @@ private:
        if (isWhite())
        {
-            lexWhitespace();
+            if (config.iterStyle & IterationStyle.includeWhitespace)
+                lexWhitespace!true();
+            else
+                lexWhitespace!false();
            return;
        }
@@ -667,15 +670,15 @@
        }
    }
-    void lexWhitespace()
+    void lexWhitespace(bool keep)()
    {
        current.type = TokenType.whitespace;
        while (!isEoF() && isWhite())
        {
-            keepChar();
+            static if (keep) keepChar();
+            else advanceRange();
        }
-        if (config.iterStyle & IterationStyle.includeWhitespace)
-            setTokenValue();
+        static if (keep) setTokenValue();
    }
    void lexComment()
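
The lexWhitespace change above turns the keep/skip decision into a compile-time template parameter: the caller checks config.iterStyle once and then dispatches to lexWhitespace!true or lexWhitespace!false, each of which compiles to a single branch-free body. A minimal standalone sketch of that static-if pattern (the names consume and wantWhitespace are made up for illustration and are not part of the lexer):
---
import std.stdio;

// `keep` is a template parameter, so each instantiation
// contains only one of the two branches below.
void consume(bool keep)()
{
    static if (keep)
        writeln("kept");    // compiled only into consume!true
    else
        writeln("skipped"); // compiled only into consume!false
}

void main()
{
    bool wantWhitespace = true;          // runtime flag, checked once
    if (wantWhitespace) consume!true();  // mirrors lexWhitespace!true()
    else consume!false();                // mirrors lexWhitespace!false()
}
---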
@@ -1828,89 +1831,89 @@ private:
}
/**
 * Returns: true if the token is an operator
 */
pure nothrow bool isOperator(const TokenType t)
{
    return t >= TokenType.assign && t <= TokenType.xorEquals;
}
/**
 * Returns: true if the token is a keyword
 */
pure nothrow bool isKeyword(const TokenType t)
{
    return t >= TokenType.bool_ && t <= TokenType.with_;
}
/**
 * Returns: true if the token is a built-in type
 */
pure nothrow bool isType(const TokenType t)
{
    return t >= TokenType.bool_ && t <= TokenType.wchar_;
}
/**
 * Returns: true if the token is an attribute
 */
pure nothrow bool isAttribute(const TokenType t)
{
    return t >= TokenType.align_ && t <= TokenType.static_;
}
/**
 * Returns: true if the token is a protection attribute
 */
pure nothrow bool isProtection(const TokenType t)
{
    return t >= TokenType.export_ && t <= TokenType.public_;
}
/**
 * Returns: true if the token is a compile-time constant such as ___DATE__
 */
pure nothrow bool isConstant(const TokenType t)
{
    return t >= TokenType.date && t <= TokenType.traits;
}
/**
 * Returns: true if the token is a string or number literal
 */
pure nothrow bool isLiteral(const TokenType t)
{
    return t >= TokenType.doubleLiteral && t <= TokenType.wstringLiteral;
}
/**
 * Returns: true if the token is a number literal
 */
pure nothrow bool isNumberLiteral(const TokenType t)
{
    return t >= TokenType.doubleLiteral && t <= TokenType.ulongLiteral;
}
/**
 * Returns: true if the token is a string literal
 */
pure nothrow bool isStringLiteral(const TokenType t)
{
    return t >= TokenType.dstringLiteral && t <= TokenType.wstringLiteral;
}
/**
 * Returns: true if the token is whitespace, a comment, a special token
 * sequence, or an identifier
 */
pure nothrow bool isMisc(const TokenType t)
{
    return t >= TokenType.comment && t <= TokenType.specialTokenSequence;
}
/**
 * Listing of all the tokens in the D language.
 */
enum TokenType: ushort
{
    assign, /// =
@@ -2124,9 +2127,9 @@ pure nothrow bool isRangeEoF(R)(ref R range)
}
/*
 * Slices of the above string to save memory. This array is automatically
 * generated.
 */
immutable(string[TokenType.max + 1]) tokenValues = [
    "=",
    "@",
@@ -2632,12 +2635,11 @@ string generateCaseTrie(string[] args ...)
    return printCaseStatements(t, "");
}
struct StringCache
{
    string get(const ubyte[] bytes)
    {
-        import std.stdio;
        size_t bucket;
        hash_t h;
        string* val = find(bytes, bucket, h);
@@ -2647,7 +2649,7 @@ struct StringCache
        }
        else
        {
-            auto s = (cast(char[]) bytes).idup;
+            auto s = putIntoCache(bytes);
            index[bucket] ~= s;
            return s;
        }
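
To make the intent of get() concrete after this change, here is a hedged usage sketch (not part of the commit, and assuming StringCache is visible to the calling code): requesting the same byte sequence twice returns the identical interned slice, because the second call is satisfied by find() rather than by a new allocation.
---
unittest
{
    StringCache cache;
    string a = cache.get(cast(const(ubyte)[]) "import");
    string b = cache.get(cast(const(ubyte)[]) "import");
    assert(a == b); // same characters
    assert(a is b); // same interned slice: the second call hit the cache
}
---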
@@ -2655,6 +2657,7 @@ struct StringCache
private:
+    import std.stdio;
    string* find(const ubyte[] data, out size_t bucket, out hash_t h)
    {
        h = hash(data);
@@ -2662,24 +2665,115 @@ private:
        foreach (i; 0 .. index[bucket].length)
        {
            if (index[bucket][i] == data)
            {
                return &index[bucket][i];
            }
        }
        return null;
    }
    static hash_t hash(const(ubyte)[] data)
    {
-        hash_t h = 5381;
-        int c;
-        size_t i;
-        while (i < data.length)
+        uint hash = 0;
+        foreach (b; data)
        {
-            c = data[i++];
-            h = ((h << 5) + h) ^ c;
+            hash ^= sbox[b];
+            hash *= 3;
        }
-        return h;
+        return hash;
    }
-    immutable mapSize = 997;
+    enum mapSize = 2048;
    string[][mapSize] index;
    // leave some slack for allocators/GC meta-data
    enum chunkSize = 16*1024 - size_t.sizeof*8;
    ubyte*[] chunkS;
    size_t next = chunkSize;
    string putIntoCache(const ubyte[] data)
    {
        import core.memory;
        if (next + data.length > chunkSize)
        {
            // avoid huge strings
            if (data.length > chunkSize/4)
                return (cast(char[]) data).idup;
            chunkS ~= cast(ubyte*) GC.malloc(chunkSize,
                GC.BlkAttr.NO_SCAN | GC.BlkAttr.NO_INTERIOR);
            next = 0;
        }
        auto slice = chunkS[$-1][next .. next + data.length];
        slice[] = data[];
        next += data.length;
        return cast(string) slice;
    }
}
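
A small worked number for the chunk allocator above, assuming a 64-bit build (the build commands in this commit pass -m64, so size_t.sizeof is 8):
---
// chunkSize = 16*1024 - size_t.sizeof*8
//           = 16384 - 64
//           = 16320 bytes per chunk, leaving 64 bytes of slack for the
//             allocator/GC bookkeeping mentioned in the comment above
static assert(16 * 1024 - 8 * 8 == 16_320);
---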
immutable uint[] sbox = [
0xF53E1837, 0x5F14C86B, 0x9EE3964C, 0xFA796D53,
0x32223FC3, 0x4D82BC98, 0xA0C7FA62, 0x63E2C982,
0x24994A5B, 0x1ECE7BEE, 0x292B38EF, 0xD5CD4E56,
0x514F4303, 0x7BE12B83, 0x7192F195, 0x82DC7300,
0x084380B4, 0x480B55D3, 0x5F430471, 0x13F75991,
0x3F9CF22C, 0x2FE0907A, 0xFD8E1E69, 0x7B1D5DE8,
0xD575A85C, 0xAD01C50A, 0x7EE00737, 0x3CE981E8,
0x0E447EFA, 0x23089DD6, 0xB59F149F, 0x13600EC7,
0xE802C8E6, 0x670921E4, 0x7207EFF0, 0xE74761B0,
0x69035234, 0xBFA40F19, 0xF63651A0, 0x29E64C26,
0x1F98CCA7, 0xD957007E, 0xE71DDC75, 0x3E729595,
0x7580B7CC, 0xD7FAF60B, 0x92484323, 0xA44113EB,
0xE4CBDE08, 0x346827C9, 0x3CF32AFA, 0x0B29BCF1,
0x6E29F7DF, 0xB01E71CB, 0x3BFBC0D1, 0x62EDC5B8,
0xB7DE789A, 0xA4748EC9, 0xE17A4C4F, 0x67E5BD03,
0xF3B33D1A, 0x97D8D3E9, 0x09121BC0, 0x347B2D2C,
0x79A1913C, 0x504172DE, 0x7F1F8483, 0x13AC3CF6,
0x7A2094DB, 0xC778FA12, 0xADF7469F, 0x21786B7B,
0x71A445D0, 0xA8896C1B, 0x656F62FB, 0x83A059B3,
0x972DFE6E, 0x4122000C, 0x97D9DA19, 0x17D5947B,
0xB1AFFD0C, 0x6EF83B97, 0xAF7F780B, 0x4613138A,
0x7C3E73A6, 0xCF15E03D, 0x41576322, 0x672DF292,
0xB658588D, 0x33EBEFA9, 0x938CBF06, 0x06B67381,
0x07F192C6, 0x2BDA5855, 0x348EE0E8, 0x19DBB6E3,
0x3222184B, 0xB69D5DBA, 0x7E760B88, 0xAF4D8154,
0x007A51AD, 0x35112500, 0xC9CD2D7D, 0x4F4FB761,
0x694772E3, 0x694C8351, 0x4A7E3AF5, 0x67D65CE1,
0x9287DE92, 0x2518DB3C, 0x8CB4EC06, 0xD154D38F,
0xE19A26BB, 0x295EE439, 0xC50A1104, 0x2153C6A7,
0x82366656, 0x0713BC2F, 0x6462215A, 0x21D9BFCE,
0xBA8EACE6, 0xAE2DF4C1, 0x2A8D5E80, 0x3F7E52D1,
0x29359399, 0xFEA1D19C, 0x18879313, 0x455AFA81,
0xFADFE838, 0x62609838, 0xD1028839, 0x0736E92F,
0x3BCA22A3, 0x1485B08A, 0x2DA7900B, 0x852C156D,
0xE8F24803, 0x00078472, 0x13F0D332, 0x2ACFD0CF,
0x5F747F5C, 0x87BB1E2F, 0xA7EFCB63, 0x23F432F0,
0xE6CE7C5C, 0x1F954EF6, 0xB609C91B, 0x3B4571BF,
0xEED17DC0, 0xE556CDA0, 0xA7846A8D, 0xFF105F94,
0x52B7CCDE, 0x0E33E801, 0x664455EA, 0xF2C70414,
0x73E7B486, 0x8F830661, 0x8B59E826, 0xBB8AEDCA,
0xF3D70AB9, 0xD739F2B9, 0x4A04C34A, 0x88D0F089,
0xE02191A2, 0xD89D9C78, 0x192C2749, 0xFC43A78F,
0x0AAC88CB, 0x9438D42D, 0x9E280F7A, 0x36063802,
0x38E8D018, 0x1C42A9CB, 0x92AAFF6C, 0xA24820C5,
0x007F077F, 0xCE5BC543, 0x69668D58, 0x10D6FF74,
0xBE00F621, 0x21300BBE, 0x2E9E8F46, 0x5ACEA629,
0xFA1F86C7, 0x52F206B8, 0x3EDF1A75, 0x6DA8D843,
0xCF719928, 0x73E3891F, 0xB4B95DD6, 0xB2A42D27,
0xEDA20BBF, 0x1A58DBDF, 0xA449AD03, 0x6DDEF22B,
0x900531E6, 0x3D3BFF35, 0x5B24ABA2, 0x472B3E4C,
0x387F2D75, 0x4D8DBA36, 0x71CB5641, 0xE3473F3F,
0xF6CD4B7F, 0xBF7D1428, 0x344B64D0, 0xC5CDFCB6,
0xFE2E0182, 0x2C37A673, 0xDE4EB7A3, 0x63FDC933,
0x01DC4063, 0x611F3571, 0xD167BFAF, 0x4496596F,
0x3DEE0689, 0xD8704910, 0x7052A114, 0x068C9EC5,
0x75D0E766, 0x4D54CC20, 0xB44ECDE2, 0x4ABC653E,
0x2C550A21, 0x1A52C0DB, 0xCFED03D0, 0x119BAFE2,
0x876A6133, 0xBC232088, 0x435BA1B2, 0xAE99BBFA,
0xBB4F08E4, 0xA62B5F49, 0x1DA4B695, 0x336B84DE,
0xDC813D31, 0x00C134FB, 0x397A98E6, 0x151F0E64,
0xD9EB3E69, 0xD3C7DF60, 0xD2F2C336, 0x2DDD067B,
0xBD122835, 0xB0B3BD3A, 0xB0D54E46, 0x8641F1E4,
0xA0B38F96, 0x51D39199, 0x37A6AD75, 0xDF84EE41,
0x3C034CBA, 0xACDA62FC, 0x11923B8B, 0x45EF170A,
];
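
One closing observation on the mapSize change (997 to 2048), offered as an assumption since the bucket computation itself is outside the visible hunks: with a power-of-two table, the bucket index can be derived by masking the hash rather than taking a modulo, as in this hedged sketch (bucketFor is a made-up name):
---
// With a power-of-two table size, masking replaces the modulo.
size_t bucketFor(uint h)
{
    enum mapSize = 2048;      // must be a power of two
    return h & (mapSize - 1); // equivalent to h % mapSize, but cheaper
}
---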