Back-end cleanup and optimization in the lexer

This commit is contained in:
Hackerpilot 2014-01-19 23:13:13 -08:00
parent 24a0c1bc2b
commit c01c51a61e
9 changed files with 344 additions and 232 deletions

.gitmodules

@@ -1,3 +0,0 @@
-[submodule "datapicked"]
-	path = datapicked
-	url = https://github.com/blackwhale/datapicked.git


@@ -1,4 +1,3 @@
-#dmd *.d stdx/d/*.d -release -inline -noboundscheck -O -w -wi -m64 -property -ofdscanner-dmd
 dmd\
 main.d\
 stats.d\

@@ -11,9 +10,36 @@ dmd\
 style.d\
 stdx/*.d\
 stdx/d/*.d\
-datapicked/dpick/buffer/*.d\
--Idatapicked\
--g -m64 -wi -ofdscanner
-#ldc2 main.d stats.d imports.d highlighter.d ctags.d astprinter.d formatter.d outliner.d stdx/*.d stdx/d/*.d -of=dscanner-ldc -m64 -oq
-#ldc2 *.d stdx/d/*.d -of=dscanner -unittest -m64 -g
-#/opt/gdc/bin/gdc -O3 -odscanner-gdc -fno-bounds-check -frelease -m64 *.d stdx/d/*.d
+-ofdscanner\
+-m64\
+-O -release -noboundscheck
+#gdc\
+# main.d\
+# stats.d\
+# imports.d\
+# highlighter.d\
+# ctags.d\
+# astprinter.d\
+# formatter.d\
+# outliner.d\
+# style.d\
+# stdx/*.d\
+# stdx/d/*.d\
+# -O3 -frelease -fno-bounds-check\
+# -odscanner\
+#ldc2\
+# main.d\
+# stats.d\
+# imports.d\
+# highlighter.d\
+# ctags.d\
+# astprinter.d\
+# formatter.d\
+# outliner.d\
+# style.d\
+# stdx/*.d\
+# stdx/d/*.d\
+# -O3 -release\
+# -oq -of=dscanner\


@@ -20,7 +20,7 @@ void printCtags(File output, string[] fileNames)
 {
 	string[] tags;
 	LexerConfig config;
-	StringCache* cache = new StringCache;
+	StringCache* cache = new StringCache(StringCache.defaultBucketCount);
 	foreach (fileName; fileNames)
 	{
 		File f = File(fileName);
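StringCache's no-argument constructor is gone in this commit (it is @disable'd in the StringCache changes below), so every call site now passes a bucket count explicitly. A minimal sketch of the new construction pattern, using the defaultBucketCount (2048) that the call sites in this commit pass:

	import stdx.lexer;

	// Sizes the intern table's bucket array up front; the table still
	// rehashes on its own once the load factor passes 0.75.
	StringCache* cache = new StringCache(StringCache.defaultBucketCount);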

@@ -1 +0,0 @@
-Subproject commit f63a843e9c0ce8db7fd897684fe323697255d87d

main.d

@@ -10,15 +10,12 @@ import std.array;
 import std.conv;
 import std.file;
 import std.getopt;
-import std.parallelism;
 import std.path;
-import std.regex;
 import std.stdio;
 import std.range;
 import stdx.lexer;
 import stdx.d.lexer;
 import stdx.d.parser;
-import dpick.buffer.buffer;
 import highlighter;
 import stats;

@@ -93,7 +90,7 @@ int main(string[] args)
 		return 1;
 	}

-	StringCache* cache = new StringCache;
+	StringCache* cache = new StringCache(StringCache.defaultBucketCount);

 	if (tokenDump || highlight)
 	{

@@ -151,13 +148,16 @@ int main(string[] args)
 		foreach (f; expandArgs(args, recursive))
 		{
 			import core.memory;
-			GC.disable();
-			auto tokens = byToken!(ubyte[])(readFile(f));
+			LexerConfig config;
+			config.whitespaceBehavior = WhitespaceBehavior.skip;
+			config.stringBehavior = StringBehavior.source;
+			config.commentBehavior = CommentBehavior.include;
+			auto tokens = byToken(readFile(f), config, cache);
 			if (tokenCount)
 				count += printTokenCount(stdout, f, tokens);
 			else
 				count += printLineCount(stdout, f, tokens);
-			GC.enable();
+			cache.printStats();
 		}
 		writefln("total:\t%d", count);
 	}
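For context, the token range returned by the new byToken is consumed the same way as before, but front() now returns by reference (see the lexer template changes below), so iterating with foreach (ref t; ...) avoids copying each Token. A minimal sketch under those assumptions; countTokens is an illustrative helper, not part of this commit:

	import stdx.lexer;
	import stdx.d.lexer;

	ulong countTokens(ubyte[] source, StringCache* cache)
	{
		LexerConfig config;
		config.whitespaceBehavior = WhitespaceBehavior.skip;
		config.stringBehavior = StringBehavior.source;
		ulong n;
		foreach (ref t; byToken(source, config, cache))
			n++;
		return n;
	}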


@@ -1,9 +0,0 @@
-echo -e "file\tstd.d.lexer dmd\tstd.d.lexer ldc\tstd.d.lexer gdc\tdmd"
-for i in $(ls ../phobos/std/*.d); do
-	f=$(echo $i | sed "s/.*phobos\///")
-	dmdt=$(avgtime -q -r 200 ./dscanner-dmd --tokenCount $i | grep "Median" | sed "s/.*: //")
-	ldct=$(avgtime -q -r 200 ./dscanner-ldc --tokenCount $i | grep "Median" | sed "s/.*: //")
-	gdct=$(avgtime -q -r 200 ./dscanner-gdc --tokenCount $i | grep "Median" | sed "s/.*: //")
-	gcct=$(avgtime -q -r 200 ~/src/dmd-lexer/src/dmd $i | grep "Median" | sed "s/.*: //")
-	echo -e "${f}\t${dmdt}\t${ldct}\t${gdct}\t${gcct}"
-done


@@ -32,7 +32,12 @@ pure nothrow bool isLineOfCode(IdType t)
 ulong printTokenCount(Tokens)(File output, string fileName, ref Tokens tokens)
 {
-	ulong c = tokens.count!(a => true);
+	ulong c;
+	foreach (ref t; tokens)
+	{
+		c++;
+	}
 	output.writefln("%s:\t%d", fileName, c);
 	return c;
 }

@@ -40,7 +45,7 @@ ulong printTokenCount(Tokens)(File output, string fileName, ref Tokens tokens)
 ulong printLineCount(Tokens)(File output, string fileName, ref Tokens tokens)
 {
 	ulong count;
-	foreach (t; tokens)
+	foreach (ref t; tokens)
 	{
 		if (isLineOfCode(t.type))
 			++count;


@@ -57,13 +57,13 @@ public template tok(string token)
 	alias tok = TokenId!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens, token);
 }

 private enum extraFields = q{
 	string comment;

 	int opCmp(size_t i) const pure nothrow @safe {
 		if (index < i) return -1;
 		if (index > i) return 1;
 		return 0;
 	}
 };

 public alias Token = stdx.lexer.TokenStructure!(IdType, extraFields);

@@ -72,15 +72,15 @@ public alias Token = stdx.lexer.TokenStructure!(IdType, extraFields);
  */
 public enum StringBehavior : ubyte
 {
 	/// Do not include quote characters, process escape sequences
 	compiler = 0b0000_0000,
 	/// Opening quotes, closing quotes, and string suffixes are included in the
 	/// string token
 	includeQuoteChars = 0b0000_0001,
 	/// String escape sequences are not replaced
 	notEscaped = 0b0000_0010,
 	/// Not modified at all. Useful for formatters or highlighters
 	source = includeQuoteChars | notEscaped
 }

 /**

@@ -88,55 +88,28 @@ public enum StringBehavior : ubyte
  */
 public enum WhitespaceBehavior : ubyte
 {
 	/// Whitespace is skipped
 	skip,
 	/// Whitespace is treated as a token
 	include
 }

 /**
  * Configure comment handling behavior
  */
 public enum CommentBehavior : ubyte
 {
 	/// Comments are attached to the non-whitespace token that follows them
 	attach,
 	/// Comments are tokens, and can be returned by calls to the token range's front()
 	include
 }

 public struct LexerConfig
 {
 	string fileName;
 	StringBehavior stringBehavior;
 	WhitespaceBehavior whitespaceBehavior;
 	CommentBehavior commentBehavior;
 }
-
-public auto byToken(R)(R range)
-{
-	LexerConfig config;
-	StringCache* cache = new StringCache;
-	return byToken(range, config, cache);
-}
-
-public auto byToken(R)(R range, StringCache* cache)
-{
-	LexerConfig config;
-	return DLexer!(R)(range, config, cache);
-}
-
-public auto byToken(R)(R range, const LexerConfig config, StringCache* cache)
-{
-	return DLexer!(R)(range, config, cache);
-}
-
-unittest
-{
-	import std.stdio;
-	auto source = cast(ubyte[]) q{ import std.stdio;}c;
-	auto tokens = byToken(source);
-	assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
-		tok!"identifier", tok!";"]));
-}

 public bool isBasicType(IdType type) nothrow pure @safe

@@ -396,11 +369,9 @@ public bool isProtection(IdType type) pure nothrow @safe
 	}
 }

-public struct DLexer(R)
+public struct DLexer
 {
-	import std.conv;
 	import core.vararg;
-	import dpick.buffer.buffer;

 	private enum pseudoTokenHandlers = [
 		"\"", "lexStringLiteral",

@@ -434,53 +405,51 @@ public struct DLexer(R)
 		"#line", "lexSpecialTokenSequence"
 	];

-	mixin Lexer!(R, IdType, Token, lexIdentifier, staticTokens,
+	mixin Lexer!(IdType, Token, lexIdentifier, staticTokens,
 		dynamicTokens, pseudoTokens, pseudoTokenHandlers, possibleDefaultTokens);

-	private alias Mark = typeof(range).Mark;
-
-	this(R range, const LexerConfig config, StringCache* cache)
+	this(ubyte[] range, const LexerConfig config, StringCache* cache)
 	{
-		this.range = LexerRange!(typeof(buffer(range)))(buffer(range));
+		this.range = LexerRange(range);
 		this.config = config;
 		this.cache = cache;
 		popFront();
 	}

 	private static bool isDocComment(string comment) pure nothrow @safe
 	{
 		return comment.length >= 3 && (comment[0 .. 3] == "///"
 			|| comment[0 .. 3] == "/**" || comment[0 .. 3] == "/++");
 	}

 	public void popFront() pure
 	{
 		_popFront();
 		string comment = null;
 		switch (front.type)
 		{
 		case tok!"comment":
 			if (config.commentBehavior == CommentBehavior.attach)
 			{
 				import std.string;
 				if (isDocComment(front.text))
 					comment = comment == null ? front.text : format("%s\n%s", comment, front.text);
 				do _popFront(); while (front == tok!"comment");
 				if (front == tok!"whitespace") goto case tok!"whitespace";
 			}
 			break;
 		case tok!"whitespace":
 			if (config.whitespaceBehavior == WhitespaceBehavior.skip)
 			{
 				do _popFront(); while (front == tok!"whitespace");
 				if (front == tok!"comment") goto case tok!"comment";
 			}
 			break;
 		default:
 			break;
 		}
 		_front.comment = comment;
 	}

 	bool isWhitespace() pure /*const*/ nothrow

@@ -493,7 +462,7 @@ public struct DLexer(R)
 		case '\t':
 			return true;
 		case 0xe2:
-			auto peek = range.lookahead(2);
+			auto peek = range.peek(2);
 			return peek.length == 2
 				&& peek[0] == 0x80
 				&& (peek[1] == 0xa8 || peek[1] == 0xa9);

@@ -521,7 +490,7 @@ public struct DLexer(R)
 			range.incrementLine();
 			return;
 		case 0xe2:
-			auto lookahead = range.lookahead(3);
+			auto lookahead = range.peek(3);
 			if (lookahead.length == 3 && lookahead[1] == 0x80
 				&& (lookahead[2] == 0xa8 || lookahead[2] == 0xa9))
 			{

@@ -564,7 +533,7 @@ public struct DLexer(R)
 			range.popFront();
 			break;
 		case 0xe2:
-			auto lookahead = range.lookahead(3);
+			auto lookahead = range.peek(3);
 			if (lookahead.length != 3)
 				break loop;
 			if (lookahead[1] != 0x80)

@@ -590,10 +559,10 @@ public struct DLexer(R)
 	Token lexNumber() pure nothrow
 	{
 		mixin (tokenStart);
-		auto lookahead = range.lookahead(2);
-		if (range.front == '0' && lookahead.length == 2)
+		if (range.canPeek(1) && range.front == '0')
 		{
-			switch (lookahead[1])
+			auto ahead = range.peek(1)[1];
+			switch (ahead)
 			{
 			case 'x':
 			case 'X':

@@ -619,7 +588,7 @@ public struct DLexer(R)
 		return lexHex(mark, line, column, index);
 	}

-	Token lexHex(Mark mark, size_t line, size_t column, size_t index) pure nothrow
+	Token lexHex(size_t mark, size_t line, size_t column, size_t index) pure nothrow
 	{
 		IdType type = tok!"intLiteral";
 		bool foundDot;

@@ -654,7 +623,7 @@ public struct DLexer(R)
 		case '.':
 			if (foundDot)
 				break hexLoop;
-			if (range.lookahead(1).length && range.lookahead(1)[0] == '.')
+			if (range.peek(1).length && range.peek(1)[0] == '.')
 				break hexLoop;
 			range.popFront();
 			foundDot = true;

@@ -674,7 +643,7 @@ public struct DLexer(R)
 		return lexBinary(mark, line, column, index);
 	}

-	Token lexBinary(Mark mark, size_t line, size_t column, size_t index) pure nothrow
+	Token lexBinary(size_t mark, size_t line, size_t column, size_t index) pure nothrow
 	{
 		IdType type = tok!"intLiteral";
 		binaryLoop: while (!range.empty)

@@ -699,13 +668,13 @@ public struct DLexer(R)
 			index);
 	}

-	Token lexDecimal()
+	Token lexDecimal() pure nothrow
 	{
 		mixin (tokenStart);
 		return lexDecimal(mark, line, column, index);
 	}

-	Token lexDecimal(Mark mark, size_t line, size_t column, size_t index) pure nothrow
+	Token lexDecimal(size_t mark, size_t line, size_t column, size_t index) pure nothrow
 	{
 		bool foundDot = range.front == '.';
 		IdType type = tok!"intLiteral";

@@ -748,7 +717,7 @@ public struct DLexer(R)
 		case '.':
 			if (foundDot)
 				break decimalLoop;
-			auto lookahead = range.lookahead(2);
+			auto lookahead = range.peek(2);
 			if (lookahead.length == 2 && lookahead[1] == '.')
 				break decimalLoop;
 			else

@@ -1058,7 +1027,7 @@ public struct DLexer(R)
 			index);
 	}

-	void lexStringSuffix(ref IdType type) pure
+	void lexStringSuffix(ref IdType type) pure nothrow
 	{
 		if (range.empty)
 			type = tok!"stringLiteral";

@@ -1076,12 +1045,12 @@ public struct DLexer(R)
 	Token lexDelimitedString() pure nothrow
 	{
 		import std.traits;
 		mixin (tokenStart);
 		range.popFront();
 		range.popFront();
-		Unqual!(ElementEncodingType!R) open;
-		Unqual!(ElementEncodingType!R) close;
+		ubyte open;
+		ubyte close;
 		switch (range.front)
 		{
 		case '<':

@@ -1109,8 +1078,8 @@ public struct DLexer(R)
 		}
 	}

-	Token lexNormalDelimitedString(Mark mark, size_t line, size_t column,
-		size_t index, ElementEncodingType!R open, ElementEncodingType!R close)
+	Token lexNormalDelimitedString(size_t mark, size_t line, size_t column,
+		size_t index, ubyte open, ubyte close)
 		pure nothrow
 	{
 		int depth = 1;

@@ -1144,7 +1113,7 @@ public struct DLexer(R)
 		return Token(type, cache.cacheGet(range.slice(mark)), line, column, index);
 	}

-	Token lexHeredocString(Mark mark, size_t line, size_t column, size_t index)
+	Token lexHeredocString(size_t mark, size_t line, size_t column, size_t index)
 		pure nothrow
 	{
 		import std.regex;

@@ -1158,7 +1127,7 @@ public struct DLexer(R)
 		if (isNewline())
 		{
 			popFrontWhitespaceAware();
-			if (range.lookahead(ident.text.length) == ident.text)
+			if (range.peek(ident.text.length) == ident.text)
 			{
 				foreach (i ; 0 .. ident.text.length)
 					range.popFront();

@@ -1395,18 +1364,20 @@ public struct DLexer(R)
 	Token lexIdentifier() pure nothrow
 	{
 		mixin (tokenStart);
-		while (!range.empty && !isSeparating(range.front))
+		uint hash = 0;
+		while (!range.empty && !isSeparating(0))
 		{
+			hash = StringCache.hashStep(range.front, hash);
 			range.popFront();
 		}
-		return Token(tok!"identifier", cache.cacheGet(range.slice(mark)), line,
+		return Token(tok!"identifier", cache.cacheGet(range.slice(mark), hash), line,
 			column, index);
 	}

 	Token lexDot() pure nothrow
 	{
 		mixin (tokenStart);
-		auto lookahead = range.lookahead(1);
+		auto lookahead = range.peek(1);
 		if (lookahead.length == 0)
 		{
 			range.popFront();

@@ -1447,22 +1418,25 @@ public struct DLexer(R)
 	{
 		if (range.front == '\n') return true;
 		if (range.front == '\r') return true;
-		auto lookahead = range.lookahead(3);
+		auto lookahead = range.peek(3);
 		if (lookahead.length == 0) return false;
 		if (lookahead == "\u2028" || lookahead == "\u2029")
 			return true;
 		return false;
 	}

-	bool isSeparating(ElementType!R c) nothrow pure @safe
+	bool isSeparating(size_t offset) const pure nothrow @safe
 	{
+		auto r = range.save();
+		r.popFrontN(offset);
+		auto c = r.front;
 		if (c <= 0x2f) return true;
 		if (c >= ':' && c <= '@') return true;
 		if (c >= '[' && c <= '^') return true;
 		if (c >= '{' && c <= '~') return true;
 		if (c == '`') return true;
-//		if (c & 0x80 && (range.lookahead(3) == "\u2028"
-//			|| range.lookahead(3) == "\u2029")) return true;
+		if (c & 0x80 && (r.peek(3) == "\u2028"
+			|| range.peek(3) == "\u2029")) return true;
 		return false;
 	}

@@ -1470,17 +1444,43 @@ public struct DLexer(R)
 		size_t index = range.index;
 		size_t column = range.column;
 		size_t line = range.line;
-		const mark = range.mark();
+		auto mark = range.mark();
 	};

-	void error(...) pure {
+	void error(...) pure nothrow @safe {

 	}

-	void warning(...) pure {
+	void warning(...) pure nothrow @safe {

 	}

 	StringCache* cache;
 	LexerConfig config;
 }

+public auto byToken(ubyte[] range)
+{
+	LexerConfig config;
+	StringCache* cache = new StringCache(StringCache.defaultBucketCount);
+	return DLexer(range, config, cache);
+}
+
+public auto byToken(ubyte[] range, StringCache* cache)
+{
+	LexerConfig config;
+	return DLexer(range, config, cache);
+}
+
+public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache)
+{
+	return DLexer(range, config, cache);
+}
+
+unittest
+{
+	import std.stdio;
+	auto source = cast(ubyte[]) q{ import std.stdio;}c;
+	auto tokens = byToken(source);
+	assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
+		tok!"identifier", tok!";"]));
+}
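Because DLexer is no longer templated on the input range and works on ubyte[] directly, the byToken overloads above can share a single StringCache across many inputs. A sketch of that pattern; lexAll is illustrative, not part of this commit:

	import stdx.lexer;
	import stdx.d.lexer;

	void lexAll(ubyte[][] sources)
	{
		// One cache interns identifier and string text for every source,
		// so a name that appears in many modules is stored once.
		StringCache* cache = new StringCache(StringCache.defaultBucketCount);
		foreach (source; sources)
			foreach (ref t; byToken(source, cache))
			{
				// t.text points into the cache; t.line, t.column and
				// t.index come from LexerRange's bookkeeping.
			}
	}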


@@ -17,8 +17,6 @@ import std.range;
 import std.traits;
 import std.conv;
 import std.math;
-import dpick.buffer.buffer;
-import dpick.buffer.traits;

 /**
  * Template for determining the type used for a token type. Selects the smallest

@@ -191,12 +189,13 @@ public:
 	mixin (extraFields);
 }

-mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
+mixin template Lexer(IDType, Token, alias defaultTokenFunction,
 	alias staticTokens, alias dynamicTokens, alias pseudoTokens,
 	alias pseudoTokenHandlers, alias possibleDefaultTokens)
 {
 	static string generateCaseStatements(string[] tokens, size_t offset = 0)
 	{
+		import std.conv;
 		string code;
 		for (size_t i = 0; i < tokens.length; i++)
 		{

@@ -216,9 +215,9 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 				code ~= generateLeaf(tokens[i], indent ~ " ");
 			else
 			{
-				code ~= indent ~ " if (range.lookahead(" ~ text(tokens[i].length) ~ ").length == 0)\n";
+				code ~= indent ~ " if (!range.canPeek(" ~ text(tokens[i].length) ~ "))\n";
 				code ~= indent ~ " goto outer_default;\n";
-				code ~= indent ~ " if (range.lookahead(" ~ text(tokens[i].length) ~ ") == \"" ~ escape(tokens[i]) ~ "\")\n";
+				code ~= indent ~ " if (range.peek(" ~ text(tokens[i].length - 1) ~ ") == \"" ~ escape(tokens[i]) ~ "\")\n";
 				code ~= indent ~ " {\n";
 				code ~= generateLeaf(tokens[i], indent ~ " ");
 				code ~= indent ~ " }\n";

@@ -228,11 +227,11 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 			}
 			else
 			{
-				code ~= indent ~ " if (range.lookahead(" ~ text(offset + 2) ~ ").length == 0)\n";
+				code ~= indent ~ " if (!range.canPeek(" ~ text(offset + 1) ~ "))\n";
 				code ~= indent ~ " {\n";
 				code ~= generateLeaf(tokens[i][0 .. offset + 1], indent ~ " ");
 				code ~= indent ~ " }\n";
-				code ~= indent ~ " switch (range.lookahead(" ~ text(offset + 2) ~ ")[" ~ text(offset + 1) ~ "])\n";
+				code ~= indent ~ " switch (range.peek(" ~ text(offset + 1) ~ ")[" ~ text(offset + 1) ~ "])\n";
 				code ~= indent ~ " {\n";
 				code ~= generateCaseStatements(tokens[i .. j], offset + 1);
 				code ~= indent ~ " default:\n";

@@ -247,6 +246,7 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 	static string generateLeaf(string token, string indent)
 	{
+		import std.conv;
 		static assert (pseudoTokenHandlers.length % 2 == 0,
 			"Each pseudo-token must have a matching function name.");
 		string code;

@@ -262,7 +262,7 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 			code ~= indent ~ "return " ~ pseudoTokenHandlers[pseudoTokenHandlers.countUntil(token) + 1] ~ "();\n";
 		else if (possibleDefaultTokens.countUntil(token) >= 0)
 		{
-			code ~= indent ~ "if (range.lookahead(" ~ text(token.length + 1) ~ ").length == 0 || isSeparating(range.lookahead(" ~ text(token.length + 1) ~ ")[" ~ text(token.length) ~ "]))\n";
+			code ~= indent ~ "if (!range.canPeek(" ~ text(token.length + 1) ~ ") || isSeparating(" ~ text(token.length) ~ "))\n";
 			code ~= indent ~ "{\n";
 			if (token.length == 1)
 				code ~= indent ~ " range.popFront();\n";

@@ -278,7 +278,7 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 		return code;
 	}

-	const(Token) front() pure nothrow const @property
+	ref const(Token) front() pure nothrow const @property
 	{
 		return _front;
 	}

@@ -312,7 +312,21 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 		return retVal;
 	}

-	Token advance() pure
+	/**
+	 * This only exists because the real array() can't be called at compile-time
+	 */
+	static string[] stupidToArray(R)(R range)
+	{
+		string[] retVal;
+		foreach (v; range)
+			retVal ~= v;
+		return retVal;
+	}
+
+	enum loopBody = generateCaseStatements(stupidToArray(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens)));
+
+	auto ref Token advance() pure
 	{
 		if (range.empty)
 			return Token(tok!"\0");

@@ -321,54 +335,87 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 		immutable size_t column = range.column;
 		immutable size_t line = range.line;
 		lexerLoop: switch (range.front)
 		{
-		mixin(generateCaseStatements(stupidToArray(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens))));
-		// pragma(msg, generateCaseStatements(stupidToArray(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens))));
+		mixin(loopBody);
+		/+pragma(msg, loopBody);+/
 		outer_default:
 		default:
 			return defaultTokenFunction();
 		}
 	}

-	/**
-	 * This only exists because the real array() can't be called at compile-time
-	 */
-	static T[] stupidToArray(R, T = ElementType!R)(R range)
-	{
-		T[] retVal;
-		foreach (v; range)
-			retVal ~= v;
-		return retVal;
-	}
-
-	LexerRange!(typeof(buffer(R.init))) range;
+	LexerRange range;
 	Token _front;
 }

-struct LexerRange(BufferType) if (isBuffer!BufferType)
+struct LexerRange
 {
-	this(BufferType r)
+	this(const(ubyte)[] bytes, size_t index = 0, size_t column = 1, size_t line = 1) pure nothrow @safe
 	{
-		this.range = r;
-		index = 0;
-		column = 1;
-		line = 1;
+		this.bytes = bytes;
+		this.index = index;
+		this.column = column;
+		this.line = line;
 	}

-	void popFront() pure
+	size_t mark() const nothrow pure @safe
+	{
+		return index;
+	}
+
+	void seek(size_t m) nothrow pure @safe
+	{
+		index = m;
+	}
+
+	const(ubyte)[] slice(size_t m) const nothrow pure @safe
+	{
+		return bytes[m .. index];
+	}
+
+	bool empty() const nothrow pure @safe
+	{
+		return index >= bytes.length;
+	}
+
+	ubyte front() const nothrow pure @safe
+	{
+		return bytes[index];
+	}
+
+	const(ubyte)[] peek(size_t p) const nothrow pure @safe
+	{
+		return bytes[index .. index + p + 1];
+	}
+
+	bool canPeek(size_t p) const nothrow pure @safe
+	{
+		return index + p < bytes.length;
+	}
+
+	LexerRange save() const nothrow pure @safe
+	{
+		return LexerRange(bytes, index, column, line);
+	}
+
+	void popFront() pure nothrow @safe
 	{
 		index++;
 		column++;
-		range.popFront();
 	}

-	void incrementLine() pure nothrow
+	void popFrontN(size_t n) pure nothrow @safe
+	{
+		index += n;
+	}
+
+	void incrementLine() pure nothrow @safe
 	{
 		column = 1;
 		line++;
 	}

-	BufferType range;
-	alias range this;
+	const(ubyte)[] bytes;
 	size_t index;
 	size_t column;
 	size_t line;
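LexerRange replaces the dpick buffer abstraction with a plain byte slice plus index/column/line counters. mark/seek/slice give the lexer cheap backtracking and token-text extraction, while peek(p) returns the next p + 1 bytes and canPeek bounds-checks it. A small illustrative sketch of those operations:

	import stdx.lexer;

	auto r = LexerRange(cast(const(ubyte)[]) "import std;");
	assert (r.canPeek(5) && r.peek(5) == "import"); // bytes[0 .. 6]
	auto m = r.mark();              // a mark is just the current index
	while (!r.empty && r.front != ' ')
		r.popFront();
	assert (r.slice(m) == "import"); // bytes[m .. index]
	r.seek(m);                      // rewind to re-scan from the mark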
@@ -388,6 +435,13 @@ struct StringCache
 {
 public:

+	@disable this();
+
+	this(size_t bucketCount = defaultBucketCount)
+	{
+		buckets = new Item*[bucketCount];
+	}
+
 	/**
 	 * Equivalent to calling cache() and get().
 	 * ---

@@ -402,6 +456,11 @@ public:
 		return get(cache(bytes));
 	}

+	string cacheGet(const(ubyte[]) bytes, uint hash) pure nothrow @safe
+	{
+		return get(cache(bytes, hash));
+	}
+
 	/**
 	 * Caches a string.
 	 * Params: bytes = the string to cache

@@ -416,6 +475,12 @@ public:
 	 * ---
 	 */
 	size_t cache(const(ubyte)[] bytes) pure nothrow @safe
+	{
+		immutable uint hash = hashBytes(bytes);
+		return cache(bytes, hash);
+	}
+
+	size_t cache(const(ubyte)[] bytes, uint hash) pure nothrow @safe
 	in
 	{
 		assert (bytes.length > 0);

@@ -426,7 +491,7 @@ public:
 	}
 	body
 	{
-		immutable uint hash = hashBytes(bytes);
+		memoryRequested += bytes.length;
 		const(Item)* found = find(bytes, hash);
 		if (found is null)
 			return intern(bytes, hash);

@@ -453,23 +518,58 @@ public:
 		return items[index].str;
 	}

+	void printStats()
+	{
+		import std.stdio;
+		writeln("Load Factor: ", cast(float) items.length / cast(float) buckets.length);
+		writeln("Memory used by blocks: ", blocks.length * blockSize);
+		writeln("Memory requsted: ", memoryRequested);
+		writeln("rehashes: ", rehashCount);
+	}
+
+	static uint hashStep(ubyte b, uint h) pure nothrow @safe
+	{
+		return (h ^ sbox[b]) * 3;
+	}
+
+	static enum defaultBucketCount = 2048;
+
 private:

-	size_t intern(const(ubyte)[] bytes, uint hash) pure nothrow @safe
+	private void rehash() pure nothrow @safe
 	{
-		Item* item = new Item;
-		item.hash = hash;
-		item.str = allocate(bytes);
+		immutable size_t newBucketCount = items.length * 2;
+		buckets = new Item*[newBucketCount];
+		rehashCount++;
+		foreach (item; items)
+		{
+			immutable size_t newIndex = item.hash % newBucketCount;
+			item.next = buckets[newIndex];
+			buckets[newIndex] = item;
+		}
+	}
+
+	size_t intern(const(ubyte)[] bytes, uint hash) pure nothrow @trusted
+	{
+		ubyte[] mem = allocate(bytes.length);
+		mem[] = bytes[];
+		Item* item = cast(Item*) allocate(Item.sizeof).ptr;
 		item.index = items.length;
+		item.str = cast(string) mem;
+		item.hash = hash;
+		item.next = buckets[hash % buckets.length];
+		immutable bool checkLoadFactor = item.next !is null;
+		buckets[hash % buckets.length] = item;
 		items ~= item;
-		buckets[hash % buckets.length] ~= item;
+		if (checkLoadFactor && (cast(float) items.length / cast(float) buckets.length) > 0.75)
+			rehash();
 		return item.index;
 	}

 	const(Item)* find(const(ubyte)[] bytes, uint hash) pure nothrow const @safe
 	{
 		immutable size_t index = hash % buckets.length;
-		foreach (item; buckets[index])
+		for (const(Item)* item = buckets[index]; item !is null; item = item.next)
 		{
 			if (item.hash == hash && bytes.equal(item.str))
 				return item;

@@ -477,53 +577,46 @@ private:
 		return null;
 	}

-	string allocate(const(ubyte)[] bytes) pure nothrow @trusted
-	out (retVal)
-	{
-		assert (retVal == bytes);
-	}
-	body
+	ubyte[] allocate(size_t byteCount) pure nothrow @trusted
 	{
 		import core.memory;
-		if (bytes.length > (pageSize / 4))
+		if (byteCount > (blockSize / 4))
 		{
-			ubyte* memory = cast(ubyte*) GC.malloc(bytes.length, GC.BlkAttr.NO_SCAN);
-			memory[0 .. bytes.length] = bytes[];
-			return cast(string) memory[0..bytes.length];
+			ubyte* mem = cast(ubyte*) GC.malloc(byteCount, GC.BlkAttr.NO_SCAN);
+			return mem[0 .. byteCount];
 		}
 		foreach (ref block; blocks)
 		{
-			immutable size_t endIndex = block.used + bytes.length;
-			if (endIndex > block.bytes.length)
+			immutable size_t oldUsed = block.used;
+			immutable size_t end = oldUsed + byteCount;
+			if (end > block.bytes.length)
 				continue;
-			block.bytes[block.used .. endIndex] = bytes[];
-			string slice = cast(string) block.bytes[block.used .. endIndex];
-			block.used = endIndex;
-			return slice;
+			block.used = end;
+			return block.bytes[oldUsed .. end];
 		}
-		blocks.length = blocks.length + 1;
-		blocks[$ - 1].bytes = (cast(ubyte*) GC.malloc(pageSize, GC.BlkAttr.NO_SCAN))[0 .. pageSize];
-		blocks[$ - 1].bytes[0 .. bytes.length] = bytes[];
-		blocks[$ - 1].used = bytes.length;
-		return cast(string) blocks[$ - 1].bytes[0 .. bytes.length];
+		blocks ~= Block(
+			(cast(ubyte*) GC.malloc(blockSize, GC.BlkAttr.NO_SCAN))[0 .. blockSize],
+			byteCount);
+		return blocks[$ - 1].bytes[0 .. byteCount];
 	}

 	static uint hashBytes(const(ubyte)[] data) pure nothrow @safe
 	{
 		uint hash = 0;
 		foreach (b; data)
 		{
 			hash ^= sbox[b];
 			hash *= 3;
 		}
 		return hash;
 	}

 	static struct Item
 	{
 		size_t index;
 		string str;
 		uint hash;
+		Item* next;
 	}

 	static struct Block

@@ -532,10 +625,9 @@ private:
 		size_t used;
 	}

-	static enum pageSize = 4096 * 1024;
-	static enum bucketCount = 2048;

-	static enum uint[] sbox = [
+	static enum blockSize = 1024 * 16;
+
+	public static immutable uint[] sbox = [
 		0xF53E1837, 0x5F14C86B, 0x9EE3964C, 0xFA796D53,
 		0x32223FC3, 0x4D82BC98, 0xA0C7FA62, 0x63E2C982,
 		0x24994A5B, 0x1ECE7BEE, 0x292B38EF, 0xD5CD4E56,

@@ -603,6 +695,8 @@ private:
 	];

 	Item*[] items;
-	Item*[][bucketCount] buckets;
+	Item*[] buckets;
 	Block[] blocks;
+	size_t memoryRequested;
+	uint rehashCount;
 }
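The identifier fast path above leans on hashStep being exactly the per-byte step of hashBytes: folding hashStep over a byte sequence from an initial hash of 0 yields the same value hashBytes computes, which is why lexIdentifier can hash while it scans and hand the finished hash to cacheGet(bytes, hash) instead of re-reading the token text. A self-contained sketch of the equivalence; the stand-in sbox here is illustrative, the real table is StringCache.sbox:

	// Stand-in substitution box; arbitrary constants for demonstration only.
	immutable uint[256] sbox = () {
		uint[256] t;
		foreach (i, ref v; t)
			v = cast(uint) i * 2654435761u;
		return t;
	}();

	uint hashStep(ubyte b, uint h) pure nothrow @safe
	{
		return (h ^ sbox[b]) * 3;
	}

	uint hashBytes(const(ubyte)[] data) pure nothrow @safe
	{
		uint hash = 0;
		foreach (b; data)
			hash = hashStep(b, hash); // same as: hash ^= sbox[b]; hash *= 3;
		return hash;
	}

	unittest
	{
		auto bytes = cast(const(ubyte)[]) "identifier";
		uint incremental = 0;
		foreach (b; bytes)
			incremental = hashStep(b, incremental); // hash one byte per popFront
		assert (incremental == hashBytes(bytes));
	}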