Simplified lexer interface

This commit is contained in:
Hackerpilot 2014-01-23 22:54:18 -08:00
parent 76fc800d30
commit 2f78272fed
3 changed files with 24 additions and 34 deletions

View File

@ -1,4 +0,0 @@
dmd -c -D lexer.d ../../../d-programming-language.org/std.ddoc -Df../../../hackerpilot.github.com/experimental/std_lexer/phobos/lexer.html -I../..
dmd -c -D ast.d ../../../d-programming-language.org/std.ddoc -Df../../../hackerpilot.github.com/experimental/std_lexer/phobos/ast.html -I../..
dmd -c -D parser.d ../../../d-programming-language.org/std.ddoc -Df../../../hackerpilot.github.com/experimental/std_lexer/phobos/parser.html -I../..

View File

@ -8,7 +8,7 @@ import std.range;
import stdx.lexer; import stdx.lexer;
public import stdx.lexer : StringCache; public import stdx.lexer : StringCache;
private enum staticTokens = [ private enum operators = [
",", ".", "..", "...", "/", "/=", "!", "!<", "!<=", "!<>", "!<>=", "!=", ",", ".", "..", "...", "/", "/=", "!", "!<", "!<=", "!<>", "!<>=", "!=",
"!>", "!>=", "$", "%", "%=", "&", "&&", "&=", "(", ")", "*", "*=", "+", "++", "!>", "!>=", "$", "%", "%=", "&", "&&", "&=", "(", ")", "*", "*=", "+", "++",
"+=", "-", "--", "-=", ":", ";", "<", "<<", "<<=", "<=", "<>", "<>=", "=", "+=", "-", "--", "-=", ":", ";", "<", "<<", "<<=", "<=", "<>", "<>=", "=",
@ -16,13 +16,7 @@ private enum staticTokens = [
"^=", "^^", "^^=", "{", "|", "|=", "||", "}", "~", "~=" "^=", "^^", "^^=", "{", "|", "|=", "||", "}", "~", "~="
]; ];
private enum pseudoTokens = [ private enum keywords = [
"\"", "`", "//", "/*", "/+", ".", "'", "0", "1", "2", "3", "4", "5", "6",
"7", "8", "9", "q\"", "q{", "r\"", "x\"", " ", "\t", "\r", "\n", "#!",
"#line", "\u2028", "\u2029"
];
private enum possibleDefaultTokens = [
"abstract", "alias", "align", "asm", "assert", "auto", "body", "bool", "abstract", "alias", "align", "asm", "assert", "auto", "body", "bool",
"break", "byte", "case", "cast", "catch", "cdouble", "cent", "cfloat", "break", "byte", "case", "cast", "catch", "cdouble", "cent", "cfloat",
"char", "class", "const", "continue", "creal", "dchar", "debug", "default", "char", "class", "const", "continue", "creal", "dchar", "debug", "default",
@ -82,11 +76,11 @@ private enum pseudoTokenHandlers = [
"#line", "lexSpecialTokenSequence" "#line", "lexSpecialTokenSequence"
]; ];
public alias IdType = TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens); public alias IdType = TokenIdType!(operators, dynamicTokens, keywords);
public alias str = tokenStringRepresentation!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens); public alias str = tokenStringRepresentation!(IdType, operators, dynamicTokens, keywords);
public template tok(string token) public template tok(string token)
{ {
alias tok = TokenId!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens, token); alias tok = TokenId!(IdType, operators, dynamicTokens, keywords, token);
} }
private enum extraFields = q{ private enum extraFields = q{
string comment; string comment;
@ -405,8 +399,8 @@ public struct DLexer
{ {
import core.vararg; import core.vararg;
mixin Lexer!(IdType, Token, lexIdentifier, staticTokens, mixin Lexer!(IdType, Token, lexIdentifier, isSeparating, operators,
dynamicTokens, pseudoTokens, pseudoTokenHandlers, possibleDefaultTokens); dynamicTokens, pseudoTokenHandlers, keywords);
this(ubyte[] range, const LexerConfig config, StringCache* cache) this(ubyte[] range, const LexerConfig config, StringCache* cache)
{ {

View File

@ -11,13 +11,6 @@
module stdx.lexer; module stdx.lexer;
import std.typecons;
import std.algorithm;
import std.range;
import std.traits;
import std.conv;
import std.math;
/** /**
* Template for determining the type used for a token type. Selects the smallest * Template for determining the type used for a token type. Selects the smallest
* unsigned integral type that is able to hold the value * unsigned integral type that is able to hold the value
@ -81,6 +74,7 @@ string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens
template TokenId(IdType, alias staticTokens, alias dynamicTokens, template TokenId(IdType, alias staticTokens, alias dynamicTokens,
alias possibleDefaultTokens, string symbol) alias possibleDefaultTokens, string symbol)
{ {
import std.algorithm;
static if (symbol == "") static if (symbol == "")
{ {
enum id = 0; enum id = 0;
@ -190,10 +184,13 @@ public:
} }
mixin template Lexer(IDType, Token, alias defaultTokenFunction, mixin template Lexer(IDType, Token, alias defaultTokenFunction,
alias staticTokens, alias dynamicTokens, alias pseudoTokens, alias tokenSeparatingFunction, alias staticTokens, alias dynamicTokens,
alias pseudoTokenHandlers, alias possibleDefaultTokens) alias pseudoTokenHandlers, alias possibleDefaultTokens)
{ {
static assert (pseudoTokenHandlers.length % 2 == 0, "Each pseudo-token must"
~ " have a corresponding handler function name.");
static string generateMask(const ubyte[] arr) static string generateMask(const ubyte[] arr)
{ {
import std.string; import std.string;
@ -211,26 +208,28 @@ mixin template Lexer(IDType, Token, alias defaultTokenFunction,
return format("0x%016x", ulong.max >> ((8 - l) * 8)); return format("0x%016x", ulong.max >> ((8 - l) * 8));
} }
static string generateCaseStatements(string[] tokens) static string generateCaseStatements()
{ {
import std.conv; import std.conv;
import std.string; import std.string;
import std.range;
string[] pseudoTokens = stupidToArray(pseudoTokenHandlers.stride(2));
string[] allTokens = stupidToArray(sort(staticTokens ~ possibleDefaultTokens ~ pseudoTokens).uniq);
string code; string code;
for (size_t i = 0; i < tokens.length; i++) for (size_t i = 0; i < allTokens.length; i++)
{ {
size_t j = i + 1; size_t j = i + 1;
size_t o = i; size_t o = i;
while (j < tokens.length && tokens[i][0] == tokens[j][0]) j++; while (j < allTokens.length && allTokens[i][0] == allTokens[j][0]) j++;
code ~= format("case 0x%02x:\n", cast(ubyte) tokens[i][0]); code ~= format("case 0x%02x:\n", cast(ubyte) allTokens[i][0]);
code ~= printCase(tokens[i .. j]); code ~= printCase(allTokens[i .. j], pseudoTokens);
i = j - 1; i = j - 1;
} }
return code; return code;
} }
static string printCase(string[] tokens) static string printCase(string[] tokens, string[] pseudoTokens)
{ {
string[] t = tokens; string[] t = tokens;
string[] sortedTokens = stupidToArray(sort!"a.length > b.length"(t)); string[] sortedTokens = stupidToArray(sort!"a.length > b.length"(t));
@ -300,7 +299,7 @@ mixin template Lexer(IDType, Token, alias defaultTokenFunction,
// possible default // possible default
if (token.length <= 8) if (token.length <= 8)
{ {
code ~= " if (isSeparating(" ~ text(token.length) ~ "))\n"; code ~= " if (tokenSeparatingFunction(" ~ text(token.length) ~ "))\n";
code ~= " {\n"; code ~= " {\n";
code ~= " range.popFrontN(" ~ text(token.length) ~ ");\n"; code ~= " range.popFrontN(" ~ text(token.length) ~ ");\n";
code ~= " return Token(tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; code ~= " return Token(tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n";
@ -371,7 +370,7 @@ mixin template Lexer(IDType, Token, alias defaultTokenFunction,
return retVal; return retVal;
} }
enum tokenSearch = generateCaseStatements(stupidToArray(uniq(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens)))); enum tokenSearch = generateCaseStatements();
static ulong getFront(const ubyte[] arr) pure nothrow @trusted static ulong getFront(const ubyte[] arr) pure nothrow @trusted
{ {
@ -625,6 +624,7 @@ private:
const(Item)* find(const(ubyte)[] bytes, uint hash) pure nothrow const @safe const(Item)* find(const(ubyte)[] bytes, uint hash) pure nothrow const @safe
{ {
import std.algorithm;
immutable size_t index = hash % buckets.length; immutable size_t index = hash % buckets.length;
for (const(Item)* item = buckets[index]; item !is null; item = item.next) for (const(Item)* item = buckets[index]; item !is null; item = item.next)
{ {