From c01c51a61ea4f2e6a94a64396be0abd9e8249bab Mon Sep 17 00:00:00 2001
From: Hackerpilot
Date: Sun, 19 Jan 2014 23:13:13 -0800
Subject: [PATCH] Back-end cleanup and optimization in the lexer

---
 .gitmodules    |   3 -
 build.sh       |  40 ++++++--
 ctags.d        |   2 +-
 datapicked     |   1 -
 main.d         |  14 +--
 perftest.sh    |   9 --
 stats.d        |   9 +-
 stdx/d/lexer.d | 252 ++++++++++++++++++++++++-------------------
 stdx/lexer.d   | 246 ++++++++++++++++++++++++++++++++---------------
 9 files changed, 344 insertions(+), 232 deletions(-)
 delete mode 100644 .gitmodules
 delete mode 160000 datapicked
 delete mode 100755 perftest.sh

diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index 3a7a14f..0000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "datapicked"]
-	path = datapicked
-	url = https://github.com/blackwhale/datapicked.git
diff --git a/build.sh b/build.sh
index be987c0..3a81079 100755
--- a/build.sh
+++ b/build.sh
@@ -1,4 +1,3 @@
-#dmd *.d stdx/d/*.d -release -inline -noboundscheck -O -w -wi -m64 -property -ofdscanner-dmd
 dmd\
 	main.d\
 	stats.d\
@@ -11,9 +10,36 @@ dmd\
 	style.d\
 	stdx/*.d\
 	stdx/d/*.d\
-	datapicked/dpick/buffer/*.d\
-	-Idatapicked\
-	-g -m64 -wi -ofdscanner
-#ldc2 main.d stats.d imports.d highlighter.d ctags.d astprinter.d formatter.d outliner.d stdx/*.d stdx/d/*.d -of=dscanner-ldc -m64 -oq
-#ldc2 *.d stdx/d/*.d -of=dscanner -unittest -m64 -g
-#/opt/gdc/bin/gdc -O3 -odscanner-gdc -fno-bounds-check -frelease -m64 *.d stdx/d/*.d
+	-ofdscanner\
+	-m64\
+	-O -release -noboundscheck
+
+#gdc\
+#	main.d\
+#	stats.d\
+#	imports.d\
+#	highlighter.d\
+#	ctags.d\
+#	astprinter.d\
+#	formatter.d\
+#	outliner.d\
+#	style.d\
+#	stdx/*.d\
+#	stdx/d/*.d\
+#	-O3 -frelease -fno-bounds-check\
+#	-odscanner\
+
+#ldc2\
+#	main.d\
+#	stats.d\
+#	imports.d\
+#	highlighter.d\
+#	ctags.d\
+#	astprinter.d\
+#	formatter.d\
+#	outliner.d\
+#	style.d\
+#	stdx/*.d\
+#	stdx/d/*.d\
+#	-O3 -release\
+#	-oq -of=dscanner\
diff --git a/ctags.d b/ctags.d
index a83a574..37677da 100644
--- a/ctags.d
+++ b/ctags.d
@@ -20,7 +20,7 @@ void printCtags(File output, string[] fileNames)
 {
 	string[] tags;
 	LexerConfig config;
-	StringCache* cache = new StringCache;
+	StringCache* cache = new StringCache(StringCache.defaultBucketCount);
 	foreach (fileName; fileNames)
 	{
 		File f = File(fileName);
diff --git a/datapicked b/datapicked
deleted file mode 160000
index f63a843..0000000
--- a/datapicked
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit f63a843e9c0ce8db7fd897684fe323697255d87d
diff --git a/main.d b/main.d
index 9e2b818..41d2210 100644
--- a/main.d
+++ b/main.d
@@ -10,15 +10,12 @@
 import std.array;
 import std.conv;
 import std.file;
 import std.getopt;
-import std.parallelism;
 import std.path;
-import std.regex;
 import std.stdio;
 import std.range;
 import stdx.lexer;
 import stdx.d.lexer;
 import stdx.d.parser;
-import dpick.buffer.buffer;
 import highlighter;
 import stats;
@@ -93,7 +90,7 @@ int main(string[] args)
 		return 1;
 	}
 
-	StringCache* cache = new StringCache;
+	StringCache* cache = new StringCache(StringCache.defaultBucketCount);
 
 	if (tokenDump || highlight)
 	{
@@ -151,13 +148,16 @@ int main(string[] args)
 		foreach (f; expandArgs(args, recursive))
 		{
 			import core.memory;
-			GC.disable();
-			auto tokens = byToken!(ubyte[])(readFile(f));
+			LexerConfig config;
+			config.whitespaceBehavior = WhitespaceBehavior.skip;
+			config.stringBehavior = StringBehavior.source;
+			config.commentBehavior = CommentBehavior.include;
+			auto tokens = byToken(readFile(f), config, cache);
 			if (tokenCount)
 				count += printTokenCount(stdout, f, tokens);
 			else
 				count += printLineCount(stdout, f, tokens);
-			GC.enable();
+			cache.printStats();
 		}
 		writefln("total:\t%d", count);
 	}
diff --git a/perftest.sh b/perftest.sh
deleted file mode 100755
index 1b78e6a..0000000
--- a/perftest.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-echo -e "file\tstd.d.lexer dmd\tstd.d.lexer ldc\tstd.d.lexer gdc\tdmd"
-for i in $(ls ../phobos/std/*.d); do
-	f=$(echo $i | sed "s/.*phobos\///")
-	dmdt=$(avgtime -q -r 200 ./dscanner-dmd --tokenCount $i | grep "Median" | sed "s/.*: //")
-	ldct=$(avgtime -q -r 200 ./dscanner-ldc --tokenCount $i | grep "Median" | sed "s/.*: //")
-	gdct=$(avgtime -q -r 200 ./dscanner-gdc --tokenCount $i | grep "Median" | sed "s/.*: //")
-	gcct=$(avgtime -q -r 200 ~/src/dmd-lexer/src/dmd $i | grep "Median" | sed "s/.*: //")
-	echo -e "${f}\t${dmdt}\t${ldct}\t${gdct}\t${gcct}"
-done
diff --git a/stats.d b/stats.d
index ee55ccb..d4a65c4 100644
--- a/stats.d
+++ b/stats.d
@@ -32,7 +32,12 @@ pure nothrow bool isLineOfCode(IdType t)
 
 ulong printTokenCount(Tokens)(File output, string fileName, ref Tokens tokens)
 {
-	ulong c = tokens.count!(a => true);
+
+	ulong c;
+	foreach (ref t; tokens)
+	{
+		c++;
+	}
 	output.writefln("%s:\t%d", fileName, c);
 	return c;
 }
@@ -40,7 +45,7 @@ ulong printTokenCount(Tokens)(File output, string fileName, ref Tokens tokens)
 ulong printLineCount(Tokens)(File output, string fileName, ref Tokens tokens)
 {
 	ulong count;
-	foreach (t; tokens)
+	foreach (ref t; tokens)
 	{
 		if (isLineOfCode(t.type))
 			++count;
diff --git a/stdx/d/lexer.d b/stdx/d/lexer.d
index 2052815..6cb620e 100644
--- a/stdx/d/lexer.d
+++ b/stdx/d/lexer.d
@@ -57,13 +57,13 @@ public template tok(string token)
 {
 	alias tok = TokenId!(IdType, staticTokens, dynamicTokens,
 		possibleDefaultTokens, token);
 }
 private enum extraFields = q{
-    string comment;
-    int opCmp(size_t i) const pure nothrow @safe {
-        if (index < i) return -1;
-        if (index > i) return 1;
-        return 0;
-    }
+	string comment;
+	int opCmp(size_t i) const pure nothrow @safe {
+		if (index < i) return -1;
+		if (index > i) return 1;
+		return 0;
+	}
 };
 public alias Token = stdx.lexer.TokenStructure!(IdType, extraFields);
 /**
  */
 public enum StringBehavior : ubyte
 {
-    /// Do not include quote characters, process escape sequences
-    compiler = 0b0000_0000,
-    /// Opening quotes, closing quotes, and string suffixes are included in the
-    /// string token
-    includeQuoteChars = 0b0000_0001,
-    /// String escape sequences are not replaced
-    notEscaped = 0b0000_0010,
-    /// Not modified at all. Useful for formatters or highlighters
-    source = includeQuoteChars | notEscaped
+	/// Do not include quote characters, process escape sequences
+	compiler = 0b0000_0000,
+	/// Opening quotes, closing quotes, and string suffixes are included in the
+	/// string token
+	includeQuoteChars = 0b0000_0001,
+	/// String escape sequences are not replaced
+	notEscaped = 0b0000_0010,
+	/// Not modified at all. Useful for formatters or highlighters
+	source = includeQuoteChars | notEscaped
 }
 /**
@@ -88,55 +88,28 @@
  */
 public enum WhitespaceBehavior : ubyte
 {
-    /// Whitespace is skipped
-    skip,
-    /// Whitespace is treated as a token
-    include
+	/// Whitespace is skipped
+	skip,
+	/// Whitespace is treated as a token
+	include
 }
 /**
  * Configure comment handling behavior
  */
 public enum CommentBehavior : ubyte
 {
-    /// Comments are attached to the non-whitespace token that follows them
-    attach,
-    /// Comments are tokens, and can be returned by calls to the token range's front()
-    include
+	/// Comments are attached to the non-whitespace token that follows them
+	attach,
+	/// Comments are tokens, and can be returned by calls to the token range's front()
+	include
 }
 
 public struct LexerConfig
 {
 	string fileName;
-    StringBehavior stringBehavior;
-    WhitespaceBehavior whitespaceBehavior;
-    CommentBehavior commentBehavior;
-}
-
-public auto byToken(R)(R range)
-{
-	LexerConfig config;
-	StringCache* cache = new StringCache;
-	return byToken(range, config, cache);
-}
-
-public auto byToken(R)(R range, StringCache* cache)
-{
-	LexerConfig config;
-	return DLexer!(R)(range, config, cache);
-}
-
-public auto byToken(R)(R range, const LexerConfig config, StringCache* cache)
-{
-	return DLexer!(R)(range, config, cache);
-}
-
-unittest
-{
-	import std.stdio;
-	auto source = cast(ubyte[]) q{ import std.stdio;}c;
-	auto tokens = byToken(source);
-	assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
-		tok!"identifier", tok!";"]));
+	StringBehavior stringBehavior;
+	WhitespaceBehavior whitespaceBehavior;
+	CommentBehavior commentBehavior;
 }
 
 public bool isBasicType(IdType type) nothrow pure @safe
@@ -396,11 +369,9 @@ public bool isProtection(IdType type) pure nothrow @safe
 	}
 }
 
-public struct DLexer(R)
+public struct DLexer
 {
-	import std.conv;
 	import core.vararg;
-	import dpick.buffer.buffer;
 
 	private enum pseudoTokenHandlers = [
 		"\"", "lexStringLiteral",
 		"`", "lexWysiwygString",
 		"//", "lexSlashSlashComment",
 		"/*", "lexSlashStarComment",
 		"/+", "lexSlashPlusComment",
 		".", "lexDot",
 		"'", "lexCharacterLiteral",
 		"0", "lexNumber",
 		"1", "lexNumber",
 		"2", "lexNumber",
 		"3", "lexNumber",
 		"4", "lexNumber",
 		"5", "lexNumber",
 		"6", "lexNumber",
 		"7", "lexNumber",
 		"8", "lexNumber",
 		"9", "lexNumber",
 		"q\"", "lexStringLiteral",
 		"q{", "lexTokenString",
 		"r\"", "lexWysiwygString",
 		"x\"", "lexHexString",
 		" ", "lexWhitespace",
 		"\t", "lexWhitespace",
 		"\r", "lexWhitespace",
 		"\n", "lexWhitespace",
 		"\u2028", "lexLongNewline",
 		"\u2029", "lexLongNewline",
 		"#!", "lexScriptLine",
 		"#line", "lexSpecialTokenSequence"
 	];
 
-	mixin Lexer!(R, IdType, Token, lexIdentifier, staticTokens,
+	mixin Lexer!(IdType, Token, lexIdentifier, staticTokens,
 		dynamicTokens, pseudoTokens, pseudoTokenHandlers, possibleDefaultTokens);
 
-	private alias Mark = typeof(range).Mark;
-
-	this(R range, const LexerConfig config, StringCache* cache)
+	this(ubyte[] range, const LexerConfig config, StringCache* cache)
 	{
-		this.range = LexerRange!(typeof(buffer(range)))(buffer(range));
-		this.config = config;
+		this.range = LexerRange(range);
+		this.config = config;
 		this.cache = cache;
-		popFront();
+		popFront();
 	}
 
-	private static bool isDocComment(string comment) pure nothrow @safe
-	{
-		return comment.length >= 3 && (comment[0 .. 3] == "///"
-			|| comment[0 .. 3] == "/**" || comment[0 .. 3] == "/++");
-	}
+	private static bool isDocComment(string comment) pure nothrow @safe
+	{
+		return comment.length >= 3 && (comment[0 .. 3] == "///"
+			|| comment[0 .. 3] == "/**" || comment[0 .. 3] == "/++");
+	}
 
-	public void popFront() pure
-	{
-		_popFront();
-		string comment = null;
-		switch (front.type)
-		{
-		case tok!"comment":
-			if (config.commentBehavior == CommentBehavior.attach)
-			{
-				import std.string;
-				if (isDocComment(front.text))
-					comment = comment == null ?
-						front.text : format("%s\n%s", comment, front.text);
-				do _popFront(); while (front == tok!"comment");
-				if (front == tok!"whitespace") goto case tok!"whitespace";
-			}
-			break;
-		case tok!"whitespace":
-			if (config.whitespaceBehavior == WhitespaceBehavior.skip)
-			{
-				do _popFront(); while (front == tok!"whitespace");
-				if (front == tok!"comment") goto case tok!"comment";
-			}
-			break;
-		default:
-			break;
-		}
-		_front.comment = comment;
-	}
+	public void popFront() pure
+	{
+		_popFront();
+		string comment = null;
+		switch (front.type)
+		{
+		case tok!"comment":
+			if (config.commentBehavior == CommentBehavior.attach)
+			{
+				import std.string;
+				if (isDocComment(front.text))
+					comment = comment == null ? front.text : format("%s\n%s", comment, front.text);
+				do _popFront(); while (front == tok!"comment");
+				if (front == tok!"whitespace") goto case tok!"whitespace";
+			}
+			break;
+		case tok!"whitespace":
+			if (config.whitespaceBehavior == WhitespaceBehavior.skip)
+			{
+				do _popFront(); while (front == tok!"whitespace");
+				if (front == tok!"comment") goto case tok!"comment";
+			}
+			break;
+		default:
+			break;
+		}
+		_front.comment = comment;
+	}
 
 	bool isWhitespace() pure /*const*/ nothrow
 	{
 		switch (range.front)
 		{
 		case ' ':
 		case '\r':
 		case '\t':
 			return true;
 		case 0xe2:
-			auto peek = range.lookahead(2);
+			auto peek = range.peek(2);
 			return peek.length == 2 && peek[0] == 0x80
 				&& (peek[1] == 0xa8 || peek[1] == 0xa9);
@@ -521,7 +490,7 @@
 			range.incrementLine();
 			return;
 		case 0xe2:
-			auto lookahead = range.lookahead(3);
+			auto lookahead = range.peek(3);
 			if (lookahead.length == 3 && lookahead[1] == 0x80
 				&& (lookahead[2] == 0xa8 || lookahead[2] == 0xa9))
 			{
@@ -564,7 +533,7 @@
 				range.popFront();
 				break;
 			case 0xe2:
-				auto lookahead = range.lookahead(3);
+				auto lookahead = range.peek(3);
 				if (lookahead.length != 3)
 					break loop;
 				if (lookahead[1] != 0x80)
@@ -590,10 +559,10 @@
 	Token lexNumber() pure nothrow
 	{
 		mixin (tokenStart);
-		auto lookahead = range.lookahead(2);
-		if (range.front == '0' && lookahead.length == 2)
+		if (range.canPeek(1) && range.front == '0')
 		{
-			switch (lookahead[1])
+			auto ahead = range.peek(1)[1];
+			switch (ahead)
 			{
 			case 'x':
 			case 'X':
@@ -619,7 +588,7 @@
 		return lexHex(mark, line, column, index);
 	}
 
-	Token lexHex(Mark mark, size_t line, size_t column, size_t index) pure nothrow
+	Token lexHex(size_t mark, size_t line, size_t column, size_t index) pure nothrow
 	{
 		IdType type = tok!"intLiteral";
 		bool foundDot;
@@ -654,7 +623,7 @@
 			case '.':
 				if (foundDot)
 					break hexLoop;
-				if (range.lookahead(1).length && range.lookahead(1)[0] == '.')
+				if (range.peek(1).length && range.peek(1)[0] == '.')
 					break hexLoop;
 				range.popFront();
 				foundDot = true;
@@ -674,7 +643,7 @@
 		return lexBinary(mark, line, column, index);
 	}
 
-	Token lexBinary(Mark mark, size_t line, size_t column, size_t index) pure nothrow
+	Token lexBinary(size_t mark, size_t line, size_t column, size_t index) pure nothrow
 	{
 		IdType type = tok!"intLiteral";
 		binaryLoop: while (!range.empty)
@@ -699,13 +668,13 @@
 			index);
 	}
 
-	Token lexDecimal()
+	Token lexDecimal() pure nothrow
 	{
 		mixin (tokenStart);
 		return lexDecimal(mark, line, column, index);
 	}
 
-	Token lexDecimal(Mark mark, size_t line, size_t column, size_t index) pure nothrow
+	Token lexDecimal(size_t mark, size_t line, size_t column, size_t index) pure nothrow
 	{
 		bool foundDot = range.front == '.';
 		IdType type = tok!"intLiteral";
@@ -748,7 +717,7 @@ public struct DLexer(R)
 		case '.':
 			if (foundDot) break decimalLoop;
-			auto lookahead = range.lookahead(2);
+			auto lookahead = range.peek(2);
 			if (lookahead.length == 2 && lookahead[1] == '.')
 				break decimalLoop;
 			else
@@ -1058,7 +1027,7 @@ public struct DLexer(R)
 			index);
 	}
 
-	void lexStringSuffix(ref IdType type) pure
+	void lexStringSuffix(ref IdType type) pure nothrow
 	{
 		if (range.empty)
 			type = tok!"stringLiteral";
@@ -1076,12 +1045,12 @@ public struct DLexer(R)
 
 	Token lexDelimitedString() pure nothrow
 	{
-        import std.traits;
+		import std.traits;
 		mixin (tokenStart);
 		range.popFront();
 		range.popFront();
-		Unqual!(ElementEncodingType!R) open;
-		Unqual!(ElementEncodingType!R) close;
+		ubyte open;
+		ubyte close;
 		switch (range.front)
 		{
 		case '<':
@@ -1109,8 +1078,8 @@ public struct DLexer(R)
 		}
 	}
 
-	Token lexNormalDelimitedString(Mark mark, size_t line, size_t column,
-		size_t index, ElementEncodingType!R open, ElementEncodingType!R close)
+	Token lexNormalDelimitedString(size_t mark, size_t line, size_t column,
+		size_t index, ubyte open, ubyte close)
 		pure nothrow
 	{
 		int depth = 1;
@@ -1144,7 +1113,7 @@ public struct DLexer(R)
 		return Token(type, cache.cacheGet(range.slice(mark)), line, column, index);
 	}
 
-	Token lexHeredocString(Mark mark, size_t line, size_t column, size_t index)
+	Token lexHeredocString(size_t mark, size_t line, size_t column, size_t index)
 		pure nothrow
 	{
 		import std.regex;
@@ -1158,7 +1127,7 @@ public struct DLexer(R)
 		if (isNewline())
 		{
 			popFrontWhitespaceAware();
-			if (range.lookahead(ident.text.length) == ident.text)
+			if (range.peek(ident.text.length) == ident.text)
 			{
 				foreach (i ; 0 .. ident.text.length)
 					range.popFront();
@@ -1395,18 +1364,20 @@ public struct DLexer(R)
 	Token lexIdentifier() pure nothrow
 	{
 		mixin (tokenStart);
-		while (!range.empty && !isSeparating(range.front))
+		uint hash = 0;
+		while (!range.empty && !isSeparating(0))
 		{
+			hash = StringCache.hashStep(range.front, hash);
 			range.popFront();
 		}
-		return Token(tok!"identifier", cache.cacheGet(range.slice(mark)), line,
+		return Token(tok!"identifier", cache.cacheGet(range.slice(mark), hash), line,
 			column, index);
 	}
 
 	Token lexDot() pure nothrow
 	{
 		mixin (tokenStart);
-		auto lookahead = range.lookahead(1);
+		auto lookahead = range.peek(1);
 		if (lookahead.length == 0)
 		{
 			range.popFront();
@@ -1447,22 +1418,25 @@ public struct DLexer(R)
 	{
 		if (range.front == '\n') return true;
 		if (range.front == '\r') return true;
-		auto lookahead = range.lookahead(3);
+		auto lookahead = range.peek(3);
 		if (lookahead.length == 0) return false;
 		if (lookahead == "\u2028" || lookahead == "\u2029") return true;
 		return false;
 	}
 
-	bool isSeparating(ElementType!R c) nothrow pure @safe
+	bool isSeparating(size_t offset) const pure nothrow @safe
 	{
+		auto r = range.save();
+		r.popFrontN(offset);
+		auto c = r.front;
 		if (c <= 0x2f) return true;
 		if (c >= ':' && c <= '@') return true;
 		if (c >= '[' && c <= '^') return true;
 		if (c >= '{' && c <= '~') return true;
 		if (c == '`') return true;
-//		if (c & 0x80 && (range.lookahead(3) == "\u2028"
-//			|| range.lookahead(3) == "\u2029")) return true;
+		if (c & 0x80 && (r.peek(3) == "\u2028"
+			|| r.peek(3) == "\u2029")) return true;
 		return false;
 	}
 
@@ -1470,17 +1444,43 @@ public struct DLexer(R)
 		size_t index = range.index;
 		size_t column = range.column;
 		size_t line = range.line;
-		const mark = range.mark();
+		auto mark = range.mark();
 	};
 
-	void error(...) pure {
+	void error(...) pure nothrow @safe {
 
 	}
 
-	void warning(...) pure {
+	void warning(...) pure nothrow @safe {
 
 	}
 
 	StringCache* cache;
 	LexerConfig config;
 }
+
+public auto byToken(ubyte[] range)
+{
+	LexerConfig config;
+	StringCache* cache = new StringCache(StringCache.defaultBucketCount);
+	return DLexer(range, config, cache);
+}
+
+public auto byToken(ubyte[] range, StringCache* cache)
+{
+	LexerConfig config;
+	return DLexer(range, config, cache);
+}
+
+public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache)
+{
+	return DLexer(range, config, cache);
+}
+unittest
+{
+	import std.stdio;
+	auto source = cast(ubyte[]) q{ import std.stdio;}c;
+	auto tokens = byToken(source);
+	assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
+		tok!"identifier", tok!";"]));
+}
diff --git a/stdx/lexer.d b/stdx/lexer.d
index 7f23be4..c8cd8f4 100644
--- a/stdx/lexer.d
+++ b/stdx/lexer.d
@@ -17,8 +17,6 @@ import std.range;
 import std.traits;
 import std.conv;
 import std.math;
-import dpick.buffer.buffer;
-import dpick.buffer.traits;
 
 /**
  * Template for determining the type used for a token type. Selects the smallest
@@ -191,12 +189,13 @@ public:
 	mixin (extraFields);
 }
 
-mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
+mixin template Lexer(IDType, Token, alias defaultTokenFunction,
 	alias staticTokens, alias dynamicTokens, alias pseudoTokens,
 	alias pseudoTokenHandlers, alias possibleDefaultTokens)
 {
 	static string generateCaseStatements(string[] tokens, size_t offset = 0)
 	{
+		import std.conv;
 		string code;
 		for (size_t i = 0; i < tokens.length; i++)
 		{
@@ -216,9 +215,9 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 				code ~= generateLeaf(tokens[i], indent ~ "  ");
 			else
 			{
-				code ~= indent ~ "    if (range.lookahead(" ~ text(tokens[i].length) ~ ").length == 0)\n";
+				code ~= indent ~ "    if (!range.canPeek(" ~ text(tokens[i].length) ~ "))\n";
 				code ~= indent ~ "        goto outer_default;\n";
-				code ~= indent ~ "    if (range.lookahead(" ~ text(tokens[i].length) ~ ") == \"" ~ escape(tokens[i]) ~ "\")\n";
+				code ~= indent ~ "    if (range.peek(" ~ text(tokens[i].length - 1) ~ ") == \"" ~ escape(tokens[i]) ~ "\")\n";
 				code ~= indent ~ "    {\n";
 				code ~= generateLeaf(tokens[i], indent ~ "        ");
 				code ~= indent ~ "    }\n";
@@ -228,11 +227,11 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 			}
 			else
 			{
-				code ~= indent ~ "    if (range.lookahead(" ~ text(offset + 2) ~ ").length == 0)\n";
+				code ~= indent ~ "    if (!range.canPeek(" ~ text(offset + 1) ~ "))\n";
 				code ~= indent ~ "    {\n";
 				code ~= generateLeaf(tokens[i][0 .. offset + 1], indent ~ "        ");
 				code ~= indent ~ "    }\n";
-				code ~= indent ~ "    switch (range.lookahead(" ~ text(offset + 2) ~ ")[" ~ text(offset + 1) ~ "])\n";
+				code ~= indent ~ "    switch (range.peek(" ~ text(offset + 1) ~ ")[" ~ text(offset + 1) ~ "])\n";
 				code ~= indent ~ "    {\n";
 				code ~= generateCaseStatements(tokens[i .. j], offset + 1);
 				code ~= indent ~ "    default:\n";
@@ -247,6 +246,7 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 
 	static string generateLeaf(string token, string indent)
 	{
+		import std.conv;
 		static assert (pseudoTokenHandlers.length % 2 == 0,
 			"Each pseudo-token must have a matching function name.");
 		string code;
@@ -262,7 +262,7 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 			code ~= indent ~ "return " ~ pseudoTokenHandlers[pseudoTokenHandlers.countUntil(token) + 1] ~ "();\n";
 		else if (possibleDefaultTokens.countUntil(token) >= 0)
 		{
-			code ~= indent ~ "if (range.lookahead(" ~ text(token.length + 1) ~ ").length == 0 || isSeparating(range.lookahead(" ~ text(token.length + 1) ~ ")[" ~ text(token.length) ~ "]))\n";
+			code ~= indent ~ "if (!range.canPeek(" ~ text(token.length + 1) ~ ") || isSeparating(" ~ text(token.length) ~ "))\n";
 			code ~= indent ~ "{\n";
 			if (token.length == 1)
 				code ~= indent ~ "    range.popFront();\n";
@@ -278,7 +278,7 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 		return code;
 	}
 
-	const(Token) front() pure nothrow const @property
+	ref const(Token) front() pure nothrow const @property
 	{
 		return _front;
 	}
@@ -312,7 +312,21 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 		return retVal;
 	}
 
-	Token advance() pure
+	/**
+	 * This only exists because the real array() can't be called at compile-time
+	 */
+	static string[] stupidToArray(R)(R range)
+	{
+		string[] retVal;
+		foreach (v; range)
+			retVal ~= v;
+		return retVal;
+	}
+
+
+	enum loopBody = generateCaseStatements(stupidToArray(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens)));
+
+	auto ref Token advance() pure
 	{
 		if (range.empty)
 			return Token(tok!"\0");
@@ -321,54 +335,87 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 		immutable size_t line = range.line;
 		lexerLoop: switch (range.front)
 		{
-		mixin(generateCaseStatements(stupidToArray(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens))));
-//		pragma(msg, generateCaseStatements(stupidToArray(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens))));
+		mixin(loopBody);
+		/+pragma(msg, loopBody);+/
 		outer_default:
 		default:
 			return defaultTokenFunction();
 		}
 	}
 
-	/**
-	 * This only exists because the real array() can't be called at compile-time
-	 */
-	static T[] stupidToArray(R, T = ElementType!R)(R range)
-	{
-		T[] retVal;
-		foreach (v; range)
-			retVal ~= v;
-		return retVal;
-	}
-
-	LexerRange!(typeof(buffer(R.init))) range;
+	LexerRange range;
 	Token _front;
 }
 
-struct LexerRange(BufferType) if (isBuffer!BufferType)
+struct LexerRange
 {
-	this(BufferType r)
+
+	this(const(ubyte)[] bytes, size_t index = 0, size_t column = 1, size_t line = 1) pure nothrow @safe
 	{
-		this.range = r;
-		index = 0;
-		column = 1;
-		line = 1;
+		this.bytes = bytes;
+		this.index = index;
+		this.column = column;
+		this.line = line;
 	}
 
-	void popFront() pure
+	size_t mark() const nothrow pure @safe
+	{
+		return index;
+	}
+
+	void seek(size_t m) nothrow pure @safe
+	{
+		index = m;
+	}
+
+	const(ubyte)[] slice(size_t m) const nothrow pure @safe
+	{
+		return bytes[m .. index];
+	}
+
+	bool empty() const nothrow pure @safe
+	{
+		return index >= bytes.length;
+	}
+
+	ubyte front() const nothrow pure @safe
+	{
+		return bytes[index];
+	}
+
+	const(ubyte)[] peek(size_t p) const nothrow pure @safe
+	{
+		return bytes[index .. index + p + 1];
+	}
+
+	bool canPeek(size_t p) const nothrow pure @safe
+	{
+		return index + p < bytes.length;
+	}
+
+	LexerRange save() const nothrow pure @safe
+	{
+		return LexerRange(bytes, index, column, line);
+	}
+
+	void popFront() pure nothrow @safe
 	{
 		index++;
 		column++;
-		range.popFront();
 	}
 
-	void incrementLine() pure nothrow
+	void popFrontN(size_t n) pure nothrow @safe
+	{
+		index += n;
+	}
+
+	void incrementLine() pure nothrow @safe
 	{
 		column = 1;
 		line++;
 	}
 
-	BufferType range;
-	alias range this;
+	const(ubyte)[] bytes;
 	size_t index;
 	size_t column;
 	size_t line;
 }
@@ -388,6 +435,13 @@ struct StringCache
 {
 public:
 
+	@disable this();
+
+	this(size_t bucketCount = defaultBucketCount)
+	{
+		buckets = new Item*[bucketCount];
+	}
+
 	/**
 	 * Equivalent to calling cache() and get().
 	 * ---
@@ -402,6 +456,11 @@ public:
 		return get(cache(bytes));
 	}
 
+	string cacheGet(const(ubyte[]) bytes, uint hash) pure nothrow @safe
+	{
+		return get(cache(bytes, hash));
+	}
+
 	/**
	 * Caches a string.
 	 * Params: bytes = the string to cache
@@ -416,6 +475,12 @@ public:
 	 * ---
 	 */
 	size_t cache(const(ubyte)[] bytes) pure nothrow @safe
+	{
+		immutable uint hash = hashBytes(bytes);
+		return cache(bytes, hash);
+	}
+
+	size_t cache(const(ubyte)[] bytes, uint hash) pure nothrow @safe
 	in
 	{
 		assert (bytes.length > 0);
@@ -426,7 +491,7 @@ public:
 	}
 	body
 	{
-		immutable uint hash = hashBytes(bytes);
+		memoryRequested += bytes.length;
 		const(Item)* found = find(bytes, hash);
 		if (found is null)
 			return intern(bytes, hash);
@@ -453,23 +518,58 @@ public:
 		return items[index].str;
 	}
 
+	void printStats()
+	{
+		import std.stdio;
+		writeln("Load Factor: ", cast(float) items.length / cast(float) buckets.length);
+		writeln("Memory used by blocks: ", blocks.length * blockSize);
+		writeln("Memory requested: ", memoryRequested);
+		writeln("rehashes: ", rehashCount);
+	}
+
+	static uint hashStep(ubyte b, uint h) pure nothrow @safe
+	{
+		return (h ^ sbox[b]) * 3;
+	}
+
+	static enum defaultBucketCount = 2048;
+
 private:
 
-	size_t intern(const(ubyte)[] bytes, uint hash) pure nothrow @safe
+	private void rehash() pure nothrow @safe
 	{
-		Item* item = new Item;
-		item.hash = hash;
-		item.str = allocate(bytes);
+		immutable size_t newBucketCount = items.length * 2;
+		buckets = new Item*[newBucketCount];
+		rehashCount++;
+		foreach (item; items)
+		{
+			immutable size_t newIndex = item.hash % newBucketCount;
+			item.next = buckets[newIndex];
+			buckets[newIndex] = item;
+		}
+	}
+
+	size_t intern(const(ubyte)[] bytes, uint hash) pure nothrow @trusted
+	{
+		ubyte[] mem = allocate(bytes.length);
+		mem[] = bytes[];
+		Item* item = cast(Item*) allocate(Item.sizeof).ptr;
 		item.index = items.length;
+		item.str = cast(string) mem;
+		item.hash = hash;
+		item.next = buckets[hash % buckets.length];
+		immutable bool checkLoadFactor = item.next !is null;
+		buckets[hash % buckets.length] = item;
 		items ~= item;
-		buckets[hash % buckets.length] ~= item;
+		if (checkLoadFactor && (cast(float) items.length / cast(float) buckets.length) > 0.75)
+			rehash();
 		return item.index;
 	}
 
 	const(Item)* find(const(ubyte)[] bytes, uint hash) pure nothrow const @safe
 	{
 		immutable size_t index = hash % buckets.length;
-		foreach (item; buckets[index])
+		for (const(Item)* item = buckets[index]; item !is null; item = item.next)
 		{
 			if (item.hash == hash && bytes.equal(item.str))
 				return item;
@@ -477,53 +577,46 @@ private:
 		return null;
 	}
 
-	string allocate(const(ubyte)[] bytes) pure nothrow @trusted
-	out (retVal)
-	{
-		assert (retVal == bytes);
-	}
-	body
+	ubyte[] allocate(size_t byteCount) pure nothrow @trusted
 	{
 		import core.memory;
-		if (bytes.length > (pageSize / 4))
+		if (byteCount > (blockSize / 4))
 		{
-			ubyte* memory = cast(ubyte*) GC.malloc(bytes.length, GC.BlkAttr.NO_SCAN);
-			memory[0 .. bytes.length] = bytes[];
-			return cast(string) memory[0..bytes.length];
+			ubyte* mem = cast(ubyte*) GC.malloc(byteCount, GC.BlkAttr.NO_SCAN);
+			return mem[0 .. byteCount];
 		}
 		foreach (ref block; blocks)
 		{
-			immutable size_t endIndex = block.used + bytes.length;
-			if (endIndex > block.bytes.length)
+			immutable size_t oldUsed = block.used;
+			immutable size_t end = oldUsed + byteCount;
+			if (end > block.bytes.length)
 				continue;
-			block.bytes[block.used .. endIndex] = bytes[];
-			string slice = cast(string) block.bytes[block.used .. endIndex];
-			block.used = endIndex;
-			return slice;
+			block.used = end;
+			return block.bytes[oldUsed .. end];
 		}
-		blocks.length = blocks.length + 1;
-		blocks[$ - 1].bytes = (cast(ubyte*) GC.malloc(pageSize, GC.BlkAttr.NO_SCAN))[0 .. pageSize];
-		blocks[$ - 1].bytes[0 .. bytes.length] = bytes[];
-		blocks[$ - 1].used = bytes.length;
-		return cast(string) blocks[$ - 1].bytes[0 .. bytes.length];
+		blocks ~= Block(
+			(cast(ubyte*) GC.malloc(blockSize, GC.BlkAttr.NO_SCAN))[0 .. blockSize],
+			byteCount);
+		return blocks[$ - 1].bytes[0 .. byteCount];
 	}
 
 	static uint hashBytes(const(ubyte)[] data) pure nothrow @safe
-	{
-		uint hash = 0;
-		foreach (b; data)
-		{
-			hash ^= sbox[b];
-			hash *= 3;
-		}
-		return hash;
-	}
+	{
+		uint hash = 0;
+		foreach (b; data)
+		{
+			hash ^= sbox[b];
+			hash *= 3;
+		}
+		return hash;
+	}
 
 	static struct Item
 	{
 		size_t index;
 		string str;
 		uint hash;
+		Item* next;
 	}
 
 	static struct Block
@@ -532,10 +625,9 @@ private:
 		size_t used;
 	}
 
-	static enum pageSize = 4096 * 1024;
-	static enum bucketCount = 2048;
+	static enum blockSize = 1024 * 16;
 
-	static enum uint[] sbox = [
+	public static immutable uint[] sbox = [
 		0xF53E1837, 0x5F14C86B, 0x9EE3964C, 0xFA796D53,
 		0x32223FC3, 0x4D82BC98, 0xA0C7FA62, 0x63E2C982,
 		0x24994A5B, 0x1ECE7BEE, 0x292B38EF, 0xD5CD4E56,
@@ -603,6 +695,8 @@ private:
 	];
 
 	Item*[] items;
-	Item*[][bucketCount] buckets;
+	Item*[] buckets;
 	Block[] blocks;
+	size_t memoryRequested;
+	uint rehashCount;
 }
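
Usage sketch (not part of the patch): after this change byToken no longer allocates a StringCache behind the caller's back; the three overloads in stdx/d/lexer.d take a ubyte[] directly, and the cache must be constructed with an explicit bucket count because the zero-argument constructor is @disable'd. A minimal caller, assuming the stdx modules above are on the import path and a file path is passed as the first argument, might look like:

    import std.file : read;
    import std.stdio : writeln;
    import stdx.d.lexer;

    void main(string[] args)
    {
        // The caller now owns the cache and its lifetime.
        StringCache* cache = new StringCache(StringCache.defaultBucketCount);

        LexerConfig config;
        config.stringBehavior = StringBehavior.source;       // leave string tokens untouched
        config.whitespaceBehavior = WhitespaceBehavior.skip; // do not emit whitespace tokens
        config.commentBehavior = CommentBehavior.attach;     // hang doc comments on the next token

        auto source = cast(ubyte[]) read(args[1]);
        foreach (ref token; byToken(source, config, cache))
            writeln(token.line, ":", token.column, "\t", token.text);
    }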
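A note on the new LexerRange contract, as it reads from the bodies above: peek(p) returns the p + 1 bytes starting at the current index (the current byte included), canPeek(p) guards indexing p bytes ahead, and mark()/slice()/seek() implement the token-slicing pattern used by the lexers. A sketch of that contract:

    unittest
    {
        import stdx.lexer;

        auto r = LexerRange(cast(const(ubyte)[]) "abc");
        auto m = r.mark();          // remember the start of a token
        assert(r.front == 'a');
        assert(r.canPeek(2));       // index + 2 is still in bounds
        assert(r.peek(1) == "ab");  // p + 1 bytes, current byte included
        r.popFront();
        r.popFront();
        assert(r.slice(m) == "ab"); // slice(m) is bytes[m .. index]
        r.seek(m);                  // rewinds the byte index (column/line are not adjusted)
        assert(r.front == 'a');
    }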
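The identifier fast path relies on one invariant: folding StringCache.hashStep over a byte sequence, as lexIdentifier now does while scanning, produces the same value as the whole-buffer hash that cache() computes internally, so cacheGet(bytes, hash) sees a consistent hash either way. A sketch of that equivalence, using only the public hashStep and sbox from the patch:

    unittest
    {
        import stdx.lexer;

        immutable ubyte[] ident = cast(immutable ubyte[]) "lexIdentifier";

        // Incremental form, as used while scanning identifier bytes.
        uint h = 0;
        foreach (b; ident)
            h = StringCache.hashStep(b, h);

        // Whole-buffer form, the same per-byte step (hash ^ sbox[b]) * 3.
        uint whole = 0;
        foreach (b; ident)
        {
            whole ^= StringCache.sbox[b];
            whole *= 3;
        }
        assert(h == whole);
    }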