Simplified lexer interface

2014-01-23 22:54:18 -08:00 · 2014-01-23 22:54:18 -08:00 · 2f78272fed
parent 76fc800d30
commit 2f78272fed
3 changed files with 24 additions and 34 deletions
--- a/stdx/d/gendoc.sh
+++ b/stdx/d/gendoc.sh
@@ -1,4 +0,0 @@
-dmd -c -D lexer.d ../../../d-programming-language.org/std.ddoc -Df../../../hackerpilot.github.com/experimental/std_lexer/phobos/lexer.html -I../..
-dmd -c -D ast.d ../../../d-programming-language.org/std.ddoc -Df../../../hackerpilot.github.com/experimental/std_lexer/phobos/ast.html -I../..
-dmd -c -D parser.d ../../../d-programming-language.org/std.ddoc -Df../../../hackerpilot.github.com/experimental/std_lexer/phobos/parser.html -I../..
-
--- a/stdx/d/lexer.d
+++ b/stdx/d/lexer.d
@@ -8,7 +8,7 @@ import std.range;
 import stdx.lexer;
 public import stdx.lexer : StringCache;

-private enum staticTokens = [
+private enum operators = [
 	",", ".", "..", "...", "/", "/=", "!", "!<", "!<=", "!<>", "!<>=", "!=",
 	"!>", "!>=", "$", "%", "%=", "&", "&&", "&=", "(", ")", "*", "*=", "+", "++",
 	"+=", "-", "--", "-=", ":", ";", "<", "<<", "<<=", "<=", "<>", "<>=", "=",
@@ -16,13 +16,7 @@ private enum staticTokens = [
 	"^=", "^^", "^^=", "{", "|", "|=", "||", "}", "~", "~="
 ];

-private enum pseudoTokens = [
-	"\"", "`", "//", "/*", "/+", ".", "'", "0", "1", "2", "3", "4", "5", "6",
-	"7", "8", "9", "q\"", "q{", "r\"", "x\"", " ", "\t", "\r", "\n", "#!",
-	"#line", "\u2028", "\u2029"
-];
-
-private enum possibleDefaultTokens = [
+private enum keywords = [
 	"abstract", "alias", "align", "asm", "assert", "auto", "body", "bool",
 	"break", "byte", "case", "cast", "catch", "cdouble", "cent", "cfloat",
 	"char", "class", "const", "continue", "creal", "dchar", "debug", "default",
@@ -82,11 +76,11 @@ private enum pseudoTokenHandlers = [
 	"#line", "lexSpecialTokenSequence"
 ];

-public alias IdType = TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens);
-public alias str = tokenStringRepresentation!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens);
+public alias IdType = TokenIdType!(operators, dynamicTokens, keywords);
+public alias str = tokenStringRepresentation!(IdType, operators, dynamicTokens, keywords);
 public template tok(string token)
 {
-  alias tok = TokenId!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens, token);
+  alias tok = TokenId!(IdType, operators, dynamicTokens, keywords, token);
 }
 private enum extraFields = q{
 	string comment;
@@ -405,8 +399,8 @@ public struct DLexer
 {
 	import core.vararg;

-	mixin Lexer!(IdType, Token, lexIdentifier, staticTokens,
-		dynamicTokens, pseudoTokens, pseudoTokenHandlers, possibleDefaultTokens);
+	mixin Lexer!(IdType, Token, lexIdentifier, isSeparating, operators,
+		dynamicTokens, pseudoTokenHandlers, keywords);

 	this(ubyte[] range, const LexerConfig config, StringCache* cache)
 	{
--- a/stdx/lexer.d
+++ b/stdx/lexer.d
@@ -11,13 +11,6 @@

 module stdx.lexer;

-import std.typecons;
-import std.algorithm;
-import std.range;
-import std.traits;
-import std.conv;
-import std.math;
-
 /**
 * Template for determining the type used for a token type. Selects the smallest
 * unsigned integral type that is able to hold the value
@@ -81,6 +74,7 @@ string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens
 template TokenId(IdType, alias staticTokens, alias dynamicTokens,
 	alias possibleDefaultTokens, string symbol)
 {
+	import std.algorithm;
 	static if (symbol == "")
 	{
 		enum id = 0;
@@ -190,10 +184,13 @@ public:
 }

 mixin template Lexer(IDType, Token, alias defaultTokenFunction,
-	alias staticTokens, alias dynamicTokens, alias pseudoTokens,
+	alias tokenSeparatingFunction, alias staticTokens, alias dynamicTokens,
 	alias pseudoTokenHandlers, alias possibleDefaultTokens)
 {

+	static assert (pseudoTokenHandlers.length % 2 == 0, "Each pseudo-token must"
+		~ " have a corresponding handler function name.");
+
 	static string generateMask(const ubyte[] arr)
 	{
 		import std.string;
@@ -211,26 +208,28 @@ mixin template Lexer(IDType, Token, alias defaultTokenFunction,
 		return format("0x%016x", ulong.max >> ((8 - l) * 8));
 	}

-	static string generateCaseStatements(string[] tokens)
+	static string generateCaseStatements()
 	{
 		import std.conv;
 		import std.string;
+		import std.range;

-
+		string[] pseudoTokens = stupidToArray(pseudoTokenHandlers.stride(2));
+		string[] allTokens = stupidToArray(sort(staticTokens ~ possibleDefaultTokens ~ pseudoTokens).uniq);
 		string code;
-		for (size_t i = 0; i < tokens.length; i++)
+		for (size_t i = 0; i < allTokens.length; i++)
 		{
 			size_t j = i + 1;
 			size_t o = i;
-			while (j < tokens.length && tokens[i][0] == tokens[j][0]) j++;
-			code ~= format("case 0x%02x:\n", cast(ubyte) tokens[i][0]);
-			code ~= printCase(tokens[i .. j]);
+			while (j < allTokens.length && allTokens[i][0] == allTokens[j][0]) j++;
+			code ~= format("case 0x%02x:\n", cast(ubyte) allTokens[i][0]);
+			code ~= printCase(allTokens[i .. j], pseudoTokens);
 			i = j - 1;
 		}
 		return code;
 	}

-	static string printCase(string[] tokens)
+	static string printCase(string[] tokens, string[] pseudoTokens)
 	{
 		string[] t = tokens;
 		string[] sortedTokens = stupidToArray(sort!"a.length > b.length"(t));
@@ -300,7 +299,7 @@ mixin template Lexer(IDType, Token, alias defaultTokenFunction,
 				// possible default
 				if (token.length <= 8)
 				{
-					code ~= "        if (isSeparating(" ~ text(token.length) ~ "))\n";
+					code ~= "        if (tokenSeparatingFunction(" ~ text(token.length) ~ "))\n";
 					code ~= "        {\n";
 					code ~= "            range.popFrontN(" ~ text(token.length) ~ ");\n";
 					code ~= "            return Token(tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n";
@@ -371,7 +370,7 @@ mixin template Lexer(IDType, Token, alias defaultTokenFunction,
 		return retVal;
 	}

-	enum tokenSearch = generateCaseStatements(stupidToArray(uniq(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens))));
+	enum tokenSearch = generateCaseStatements();

 	static ulong getFront(const ubyte[] arr) pure nothrow @trusted
 	{
@@ -625,6 +624,7 @@ private:

 	const(Item)* find(const(ubyte)[] bytes, uint hash) pure nothrow const @safe
 	{
+		import std.algorithm;
 		immutable size_t index = hash % buckets.length;
 		for (const(Item)* item = buckets[index]; item !is null; item = item.next)
 		{