Back-end cleanup and optimization in the lexer

commit c01c51a61e
parent 24a0c1bc2b
Author: Hackerpilot
Date:   2014-01-19 23:13:13 -08:00

9 changed files with 344 additions and 232 deletions

.gitmodules (vendored, 3 lines changed)

@@ -1,3 +0,0 @@
-[submodule "datapicked"]
-    path = datapicked
-    url = https://github.com/blackwhale/datapicked.git

(build script)

@@ -1,4 +1,3 @@
-#dmd *.d stdx/d/*.d -release -inline -noboundscheck -O -w -wi -m64 -property -ofdscanner-dmd
 dmd\
 main.d\
 stats.d\
@@ -11,9 +10,36 @@ dmd\
 style.d\
 stdx/*.d\
 stdx/d/*.d\
-datapicked/dpick/buffer/*.d\
--Idatapicked\
--g -m64 -wi -ofdscanner -O -release -noboundscheck
-#ldc2 main.d stats.d imports.d highlighter.d ctags.d astprinter.d formatter.d outliner.d stdx/*.d stdx/d/*.d -of=dscanner-ldc -m64 -oq
-#ldc2 *.d stdx/d/*.d -of=dscanner -unittest -m64 -g
-#/opt/gdc/bin/gdc -O3 -odscanner-gdc -fno-bounds-check -frelease -m64 *.d stdx/d/*.d
+-ofdscanner\
+-m64\
+#gdc\
+# main.d\
+# stats.d\
+# imports.d\
+# highlighter.d\
+# ctags.d\
+# astprinter.d\
+# formatter.d\
+# outliner.d\
+# style.d\
+# stdx/*.d\
+# stdx/d/*.d\
+# -O3 -frelease -fno-bounds-check\
+# -odscanner\
+#ldc2\
+# main.d\
+# stats.d\
+# imports.d\
+# highlighter.d\
+# ctags.d\
+# astprinter.d\
+# formatter.d\
+# outliner.d\
+# style.d\
+# stdx/*.d\
+# stdx/d/*.d\
+# -O3 -release\
+# -oq -of=dscanner\

ctags.d

@@ -20,7 +20,7 @@ void printCtags(File output, string[] fileNames)
 {
     string[] tags;
     LexerConfig config;
-    StringCache* cache = new StringCache;
+    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
     foreach (fileName; fileNames)
     {
         File f = File(fileName);

datapicked (submodule, deleted)

@@ -1 +0,0 @@
-Subproject commit f63a843e9c0ce8db7fd897684fe323697255d87d

main.d (14 lines changed)

@@ -10,15 +10,12 @@ import std.array;
 import std.conv;
 import std.file;
 import std.getopt;
-import std.parallelism;
 import std.path;
-import std.regex;
 import std.stdio;
 import std.range;
 import stdx.lexer;
 import stdx.d.lexer;
 import stdx.d.parser;
-import dpick.buffer.buffer;
 import highlighter;
 import stats;
@@ -93,7 +90,7 @@ int main(string[] args)
         return 1;
     }
-    StringCache* cache = new StringCache;
+    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
     if (tokenDump || highlight)
     {
@@ -151,13 +148,16 @@ int main(string[] args)
         foreach (f; expandArgs(args, recursive))
         {
             import core.memory;
-            GC.disable();
-            auto tokens = byToken!(ubyte[])(readFile(f));
+            LexerConfig config;
+            config.whitespaceBehavior = WhitespaceBehavior.skip;
+            config.stringBehavior = StringBehavior.source;
+            config.commentBehavior = CommentBehavior.include;
+            auto tokens = byToken(readFile(f), config, cache);
             if (tokenCount)
                 count += printTokenCount(stdout, f, tokens);
             else
                 count += printLineCount(stdout, f, tokens);
-            GC.enable();
+            cache.printStats();
         }
         writefln("total:\t%d", count);
     }
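
The hunk above also shows the new calling convention: byToken is no longer
instantiated with a range type, and the caller supplies both a LexerConfig and
a caller-owned StringCache. A minimal sketch of that usage, assuming the
stdx.d.lexer module from this commit (the helper name is illustrative):

    import stdx.d.lexer;
    import std.stdio : writeln;

    // Hypothetical helper, not part of this commit.
    void printTokenTypes(ubyte[] sourceCode)
    {
        // One cache, sized with the new explicit bucket count, can be
        // shared across every file that gets lexed.
        StringCache* cache = new StringCache(StringCache.defaultBucketCount);
        LexerConfig config;
        config.whitespaceBehavior = WhitespaceBehavior.skip;
        config.stringBehavior = StringBehavior.source;
        config.commentBehavior = CommentBehavior.include;
        foreach (ref token; byToken(sourceCode, config, cache))
            writeln(token.type); // numeric IdType of each token
    }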

(benchmark script, deleted)

@@ -1,9 +0,0 @@
-echo -e "file\tstd.d.lexer dmd\tstd.d.lexer ldc\tstd.d.lexer gdc\tdmd"
-for i in $(ls ../phobos/std/*.d); do
-	f=$(echo $i | sed "s/.*phobos\///")
-	dmdt=$(avgtime -q -r 200 ./dscanner-dmd --tokenCount $i | grep "Median" | sed "s/.*: //")
-	ldct=$(avgtime -q -r 200 ./dscanner-ldc --tokenCount $i | grep "Median" | sed "s/.*: //")
-	gdct=$(avgtime -q -r 200 ./dscanner-gdc --tokenCount $i | grep "Median" | sed "s/.*: //")
-	gcct=$(avgtime -q -r 200 ~/src/dmd-lexer/src/dmd $i | grep "Median" | sed "s/.*: //")
-	echo -e "${f}\t${dmdt}\t${ldct}\t${gdct}\t${gcct}"
-done

stats.d

@@ -32,7 +32,12 @@ pure nothrow bool isLineOfCode(IdType t)
 ulong printTokenCount(Tokens)(File output, string fileName, ref Tokens tokens)
 {
-    ulong c = tokens.count!(a => true);
+    ulong c;
+    foreach (ref t; tokens)
+    {
+        c++;
+    }
     output.writefln("%s:\t%d", fileName, c);
     return c;
 }
@@ -40,7 +45,7 @@ ulong printTokenCount(Tokens)(File output, string fileName, ref Tokens tokens)
 ulong printLineCount(Tokens)(File output, string fileName, ref Tokens tokens)
 {
     ulong count;
-    foreach (t; tokens)
+    foreach (ref t; tokens)
     {
         if (isLineOfCode(t.type))
             ++count;
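
The counting loop above replaces tokens.count!(a => true). Together with the
ref front() introduced in stdx/lexer.d below, iterating with foreach (ref t; ...)
reads each token in place instead of copying it. A minimal sketch of the
pattern (the function name is illustrative):

    // Counting by ref: no per-element Token copy is made, since front()
    // now returns ref const(Token).
    ulong countAll(Tokens)(ref Tokens tokens)
    {
        ulong c;
        foreach (ref t; tokens)
            ++c;
        return c;
    }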

stdx/d/lexer.d

@@ -112,33 +112,6 @@ public struct LexerConfig
     CommentBehavior commentBehavior;
 }
 
-public auto byToken(R)(R range)
-{
-    LexerConfig config;
-    StringCache* cache = new StringCache;
-    return byToken(range, config, cache);
-}
-
-public auto byToken(R)(R range, StringCache* cache)
-{
-    LexerConfig config;
-    return DLexer!(R)(range, config, cache);
-}
-
-public auto byToken(R)(R range, const LexerConfig config, StringCache* cache)
-{
-    return DLexer!(R)(range, config, cache);
-}
-
-unittest
-{
-    import std.stdio;
-    auto source = cast(ubyte[]) q{ import std.stdio;}c;
-    auto tokens = byToken(source);
-    assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
-        tok!"identifier", tok!";"]));
-}
-
 public bool isBasicType(IdType type) nothrow pure @safe
 {
     switch (type)
@@ -396,11 +369,9 @@ public bool isProtection(IdType type) pure nothrow @safe
     }
 }
 
-public struct DLexer(R)
+public struct DLexer
 {
-    import std.conv;
     import core.vararg;
-    import dpick.buffer.buffer;
 
     private enum pseudoTokenHandlers = [
         "\"", "lexStringLiteral",
@@ -434,14 +405,12 @@ public struct DLexer(R)
         "#line", "lexSpecialTokenSequence"
     ];
 
-    mixin Lexer!(R, IdType, Token, lexIdentifier, staticTokens,
+    mixin Lexer!(IdType, Token, lexIdentifier, staticTokens,
         dynamicTokens, pseudoTokens, pseudoTokenHandlers, possibleDefaultTokens);
 
-    private alias Mark = typeof(range).Mark;
-
-    this(R range, const LexerConfig config, StringCache* cache)
+    this(ubyte[] range, const LexerConfig config, StringCache* cache)
     {
-        this.range = LexerRange!(typeof(buffer(range)))(buffer(range));
+        this.range = LexerRange(range);
         this.config = config;
        this.cache = cache;
         popFront();
@@ -493,7 +462,7 @@ public struct DLexer(R)
         case '\t':
             return true;
         case 0xe2:
-            auto peek = range.lookahead(2);
+            auto peek = range.peek(2);
             return peek.length == 2
                 && peek[0] == 0x80
                 && (peek[1] == 0xa8 || peek[1] == 0xa9);
@@ -521,7 +490,7 @@ public struct DLexer(R)
             range.incrementLine();
             return;
         case 0xe2:
-            auto lookahead = range.lookahead(3);
+            auto lookahead = range.peek(3);
             if (lookahead.length == 3 && lookahead[1] == 0x80
                 && (lookahead[2] == 0xa8 || lookahead[2] == 0xa9))
             {
@@ -564,7 +533,7 @@ public struct DLexer(R)
                 range.popFront();
                 break;
             case 0xe2:
-                auto lookahead = range.lookahead(3);
+                auto lookahead = range.peek(3);
                 if (lookahead.length != 3)
                     break loop;
                 if (lookahead[1] != 0x80)
@@ -590,10 +559,10 @@ public struct DLexer(R)
     Token lexNumber() pure nothrow
     {
         mixin (tokenStart);
-        auto lookahead = range.lookahead(2);
-        if (range.front == '0' && lookahead.length == 2)
+        if (range.canPeek(1) && range.front == '0')
         {
-            switch (lookahead[1])
+            auto ahead = range.peek(1)[1];
+            switch (ahead)
             {
             case 'x':
             case 'X':
@@ -619,7 +588,7 @@ public struct DLexer(R)
         return lexHex(mark, line, column, index);
     }
 
-    Token lexHex(Mark mark, size_t line, size_t column, size_t index) pure nothrow
+    Token lexHex(size_t mark, size_t line, size_t column, size_t index) pure nothrow
     {
         IdType type = tok!"intLiteral";
         bool foundDot;
@@ -654,7 +623,7 @@ public struct DLexer(R)
             case '.':
                 if (foundDot)
                     break hexLoop;
-                if (range.lookahead(1).length && range.lookahead(1)[0] == '.')
+                if (range.peek(1).length && range.peek(1)[0] == '.')
                     break hexLoop;
                 range.popFront();
                 foundDot = true;
@@ -674,7 +643,7 @@ public struct DLexer(R)
         return lexBinary(mark, line, column, index);
     }
 
-    Token lexBinary(Mark mark, size_t line, size_t column, size_t index) pure nothrow
+    Token lexBinary(size_t mark, size_t line, size_t column, size_t index) pure nothrow
     {
         IdType type = tok!"intLiteral";
         binaryLoop: while (!range.empty)
@@ -699,13 +668,13 @@ public struct DLexer(R)
             index);
     }
 
-    Token lexDecimal()
+    Token lexDecimal() pure nothrow
     {
         mixin (tokenStart);
         return lexDecimal(mark, line, column, index);
     }
 
-    Token lexDecimal(Mark mark, size_t line, size_t column, size_t index) pure nothrow
+    Token lexDecimal(size_t mark, size_t line, size_t column, size_t index) pure nothrow
     {
         bool foundDot = range.front == '.';
         IdType type = tok!"intLiteral";
@@ -748,7 +717,7 @@ public struct DLexer(R)
             case '.':
                 if (foundDot)
                     break decimalLoop;
-                auto lookahead = range.lookahead(2);
+                auto lookahead = range.peek(2);
                 if (lookahead.length == 2 && lookahead[1] == '.')
                     break decimalLoop;
                 else
@@ -1058,7 +1027,7 @@ public struct DLexer(R)
             index);
     }
 
-    void lexStringSuffix(ref IdType type) pure
+    void lexStringSuffix(ref IdType type) pure nothrow
     {
         if (range.empty)
             type = tok!"stringLiteral";
@@ -1080,8 +1049,8 @@ public struct DLexer(R)
         mixin (tokenStart);
         range.popFront();
         range.popFront();
-        Unqual!(ElementEncodingType!R) open;
-        Unqual!(ElementEncodingType!R) close;
+        ubyte open;
+        ubyte close;
         switch (range.front)
         {
         case '<':
@@ -1109,8 +1078,8 @@ public struct DLexer(R)
         }
     }
 
-    Token lexNormalDelimitedString(Mark mark, size_t line, size_t column,
-        size_t index, ElementEncodingType!R open, ElementEncodingType!R close)
+    Token lexNormalDelimitedString(size_t mark, size_t line, size_t column,
+        size_t index, ubyte open, ubyte close)
         pure nothrow
     {
         int depth = 1;
@@ -1144,7 +1113,7 @@ public struct DLexer(R)
         return Token(type, cache.cacheGet(range.slice(mark)), line, column, index);
     }
 
-    Token lexHeredocString(Mark mark, size_t line, size_t column, size_t index)
+    Token lexHeredocString(size_t mark, size_t line, size_t column, size_t index)
         pure nothrow
     {
         import std.regex;
@@ -1158,7 +1127,7 @@ public struct DLexer(R)
         if (isNewline())
         {
             popFrontWhitespaceAware();
-            if (range.lookahead(ident.text.length) == ident.text)
+            if (range.peek(ident.text.length) == ident.text)
             {
                 foreach (i ; 0 .. ident.text.length)
                     range.popFront();
@@ -1395,18 +1364,20 @@ public struct DLexer(R)
     Token lexIdentifier() pure nothrow
     {
         mixin (tokenStart);
-        while (!range.empty && !isSeparating(range.front))
+        uint hash = 0;
+        while (!range.empty && !isSeparating(0))
         {
+            hash = StringCache.hashStep(range.front, hash);
             range.popFront();
         }
-        return Token(tok!"identifier", cache.cacheGet(range.slice(mark)), line,
+        return Token(tok!"identifier", cache.cacheGet(range.slice(mark), hash), line,
             column, index);
     }
 
     Token lexDot() pure nothrow
     {
         mixin (tokenStart);
-        auto lookahead = range.lookahead(1);
+        auto lookahead = range.peek(1);
         if (lookahead.length == 0)
         {
             range.popFront();
@@ -1447,22 +1418,25 @@ public struct DLexer(R)
     {
         if (range.front == '\n') return true;
         if (range.front == '\r') return true;
-        auto lookahead = range.lookahead(3);
+        auto lookahead = range.peek(3);
         if (lookahead.length == 0) return false;
         if (lookahead == "\u2028" || lookahead == "\u2029")
             return true;
         return false;
     }
 
-    bool isSeparating(ElementType!R c) nothrow pure @safe
+    bool isSeparating(size_t offset) const pure nothrow @safe
     {
+        auto r = range.save();
+        r.popFrontN(offset);
+        auto c = r.front;
         if (c <= 0x2f) return true;
         if (c >= ':' && c <= '@') return true;
         if (c >= '[' && c <= '^') return true;
         if (c >= '{' && c <= '~') return true;
         if (c == '`') return true;
-//        if (c & 0x80 && (range.lookahead(3) == "\u2028"
-//            || range.lookahead(3) == "\u2029")) return true;
+        if (c & 0x80 && (r.peek(3) == "\u2028"
+            || range.peek(3) == "\u2029")) return true;
         return false;
    }
 
@@ -1470,17 +1444,43 @@ public struct DLexer(R)
         size_t index = range.index;
         size_t column = range.column;
         size_t line = range.line;
-        const mark = range.mark();
+        auto mark = range.mark();
     };
 
-    void error(...) pure {
+    void error(...) pure nothrow @safe {
     }
 
-    void warning(...) pure {
+    void warning(...) pure nothrow @safe {
     }
 
     StringCache* cache;
     LexerConfig config;
 }
 
+public auto byToken(ubyte[] range)
+{
+    LexerConfig config;
+    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
+    return DLexer(range, config, cache);
+}
+
+public auto byToken(ubyte[] range, StringCache* cache)
+{
+    LexerConfig config;
+    return DLexer(range, config, cache);
+}
+
+public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache)
+{
+    return DLexer(range, config, cache);
+}
+
+unittest
+{
+    import std.stdio;
+    auto source = cast(ubyte[]) q{ import std.stdio;}c;
+    auto tokens = byToken(source);
+    assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
+        tok!"identifier", tok!";"]));
+}
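
One detail worth noting in lexIdentifier above: the hash is folded one byte at
a time with StringCache.hashStep while the lexer is already scanning, and
cacheGet(range.slice(mark), hash) reuses it, so interning never re-reads the
identifier's bytes. A sketch of the equivalence this relies on, assuming
hashBytes (whose body is not shown in this diff) folds bytes the same way:

    // Illustrative only: stepping the hash during the scan should match
    // what hashBytes computes over the finished slice.
    uint scanHash(const(ubyte)[] ident)
    {
        uint hash = 0;
        foreach (b; ident)
            hash = StringCache.hashStep(b, hash); // (hash ^ sbox[b]) * 3
        return hash;
    }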

stdx/lexer.d

@@ -17,8 +17,6 @@ import std.range;
 import std.traits;
 import std.conv;
 import std.math;
-import dpick.buffer.buffer;
-import dpick.buffer.traits;
 
 /**
  * Template for determining the type used for a token type. Selects the smallest
@@ -191,12 +189,13 @@ public:
     mixin (extraFields);
 }
 
-mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
+mixin template Lexer(IDType, Token, alias defaultTokenFunction,
     alias staticTokens, alias dynamicTokens, alias pseudoTokens,
     alias pseudoTokenHandlers, alias possibleDefaultTokens)
 {
     static string generateCaseStatements(string[] tokens, size_t offset = 0)
     {
+        import std.conv;
         string code;
         for (size_t i = 0; i < tokens.length; i++)
         {
@@ -216,9 +215,9 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
                     code ~= generateLeaf(tokens[i], indent ~ "    ");
                 else
                 {
-                    code ~= indent ~ "    if (range.lookahead(" ~ text(tokens[i].length) ~ ").length == 0)\n";
+                    code ~= indent ~ "    if (!range.canPeek(" ~ text(tokens[i].length) ~ "))\n";
                     code ~= indent ~ "        goto outer_default;\n";
-                    code ~= indent ~ "    if (range.lookahead(" ~ text(tokens[i].length) ~ ") == \"" ~ escape(tokens[i]) ~ "\")\n";
+                    code ~= indent ~ "    if (range.peek(" ~ text(tokens[i].length - 1) ~ ") == \"" ~ escape(tokens[i]) ~ "\")\n";
                     code ~= indent ~ "    {\n";
                     code ~= generateLeaf(tokens[i], indent ~ "        ");
                     code ~= indent ~ "    }\n";
@@ -228,11 +227,11 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
             }
             else
             {
-                code ~= indent ~ "    if (range.lookahead(" ~ text(offset + 2) ~ ").length == 0)\n";
+                code ~= indent ~ "    if (!range.canPeek(" ~ text(offset + 1) ~ "))\n";
                 code ~= indent ~ "    {\n";
                 code ~= generateLeaf(tokens[i][0 .. offset + 1], indent ~ "        ");
                 code ~= indent ~ "    }\n";
-                code ~= indent ~ "    switch (range.lookahead(" ~ text(offset + 2) ~ ")[" ~ text(offset + 1) ~ "])\n";
+                code ~= indent ~ "    switch (range.peek(" ~ text(offset + 1) ~ ")[" ~ text(offset + 1) ~ "])\n";
                 code ~= indent ~ "    {\n";
                 code ~= generateCaseStatements(tokens[i .. j], offset + 1);
                 code ~= indent ~ "    default:\n";
@@ -247,6 +246,7 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
     static string generateLeaf(string token, string indent)
     {
+        import std.conv;
         static assert (pseudoTokenHandlers.length % 2 == 0,
             "Each pseudo-token must have a matching function name.");
         string code;
@@ -262,7 +262,7 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
             code ~= indent ~ "return " ~ pseudoTokenHandlers[pseudoTokenHandlers.countUntil(token) + 1] ~ "();\n";
         else if (possibleDefaultTokens.countUntil(token) >= 0)
         {
-            code ~= indent ~ "if (range.lookahead(" ~ text(token.length + 1) ~ ").length == 0 || isSeparating(range.lookahead(" ~ text(token.length + 1) ~ ")[" ~ text(token.length) ~ "]))\n";
+            code ~= indent ~ "if (!range.canPeek(" ~ text(token.length + 1) ~ ") || isSeparating(" ~ text(token.length) ~ "))\n";
             code ~= indent ~ "{\n";
             if (token.length == 1)
                 code ~= indent ~ "    range.popFront();\n";
@@ -278,7 +278,7 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
         return code;
     }
 
-    const(Token) front() pure nothrow const @property
+    ref const(Token) front() pure nothrow const @property
     {
         return _front;
     }
@@ -312,7 +312,21 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
         return retVal;
     }
 
-    Token advance() pure
+    /**
+     * This only exists because the real array() can't be called at compile-time
+     */
+    static string[] stupidToArray(R)(R range)
+    {
+        string[] retVal;
+        foreach (v; range)
+            retVal ~= v;
+        return retVal;
+    }
+
+    enum loopBody = generateCaseStatements(stupidToArray(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens)));
+
+    auto ref Token advance() pure
     {
         if (range.empty)
             return Token(tok!"\0");
@@ -321,54 +335,87 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
         immutable size_t line = range.line;
         lexerLoop: switch (range.front)
         {
-        mixin(generateCaseStatements(stupidToArray(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens))));
-//        pragma(msg, generateCaseStatements(stupidToArray(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens))));
+        mixin(loopBody);
+        /+pragma(msg, loopBody);+/
         outer_default:
         default:
             return defaultTokenFunction();
         }
     }
 
-    /**
-     * This only exists because the real array() can't be called at compile-time
-     */
-    static T[] stupidToArray(R, T = ElementType!R)(R range)
-    {
-        T[] retVal;
-        foreach (v; range)
-            retVal ~= v;
-        return retVal;
-    }
-
-    LexerRange!(typeof(buffer(R.init))) range;
+    LexerRange range;
     Token _front;
 }
 
-struct LexerRange(BufferType) if (isBuffer!BufferType)
+struct LexerRange
 {
-    this(BufferType r)
+    this(const(ubyte)[] bytes, size_t index = 0, size_t column = 1, size_t line = 1) pure nothrow @safe
     {
-        this.range = r;
-        index = 0;
-        column = 1;
-        line = 1;
+        this.bytes = bytes;
+        this.index = index;
+        this.column = column;
+        this.line = line;
    }
 
-    void popFront() pure
+    size_t mark() const nothrow pure @safe
+    {
+        return index;
+    }
+
+    void seek(size_t m) nothrow pure @safe
+    {
+        index = m;
+    }
+
+    const(ubyte)[] slice(size_t m) const nothrow pure @safe
+    {
+        return bytes[m .. index];
+    }
+
+    bool empty() const nothrow pure @safe
+    {
+        return index >= bytes.length;
+    }
+
+    ubyte front() const nothrow pure @safe
+    {
+        return bytes[index];
+    }
+
+    const(ubyte)[] peek(size_t p) const nothrow pure @safe
+    {
+        return bytes[index .. index + p + 1];
+    }
+
+    bool canPeek(size_t p) const nothrow pure @safe
+    {
+        return index + p < bytes.length;
+    }
+
+    LexerRange save() const nothrow pure @safe
+    {
+        return LexerRange(bytes, index, column, line);
+    }
+
+    void popFront() pure nothrow @safe
     {
         index++;
         column++;
-        range.popFront();
     }
 
-    void incrementLine() pure nothrow
+    void popFrontN(size_t n) pure nothrow @safe
+    {
+        index += n;
+    }
+
+    void incrementLine() pure nothrow @safe
     {
         column = 1;
         line++;
     }
 
-    BufferType range;
-    alias range this;
+    const(ubyte)[] bytes;
     size_t index;
     size_t column;
     size_t line;
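
A note on the two new lookahead primitives, since their conventions differ by
one: peek(p) returns the current byte plus p bytes of lookahead, i.e.
bytes[index .. index + p + 1], while canPeek(p) checks that index + p is still
in bounds. This is why the generated matcher above asks for
peek(tokens[i].length - 1) when comparing against an n-byte token. A small
sketch of the semantics (buffer contents illustrative):

    auto r = LexerRange(cast(const(ubyte)[]) "while");
    assert (r.front == 'w');
    assert (r.canPeek(4));         // index + 4 is the last valid byte
    assert (!r.canPeek(5));        // one past the end
    assert (r.peek(4) == "while"); // current byte plus four more
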
@@ -388,6 +435,13 @@ struct StringCache
 {
 public:
 
+    @disable this();
+
+    this(size_t bucketCount = defaultBucketCount)
+    {
+        buckets = new Item*[bucketCount];
+    }
+
     /**
      * Equivalent to calling cache() and get().
      * ---
@@ -402,6 +456,11 @@ public:
         return get(cache(bytes));
     }
 
+    string cacheGet(const(ubyte[]) bytes, uint hash) pure nothrow @safe
+    {
+        return get(cache(bytes, hash));
+    }
+
     /**
      * Caches a string.
      * Params: bytes = the string to cache
@@ -416,6 +475,12 @@ public:
      * ---
      */
     size_t cache(const(ubyte)[] bytes) pure nothrow @safe
+    {
+        immutable uint hash = hashBytes(bytes);
+        return cache(bytes, hash);
+    }
+
+    size_t cache(const(ubyte)[] bytes, uint hash) pure nothrow @safe
     in
     {
         assert (bytes.length > 0);
@@ -426,7 +491,7 @@ public:
     }
     body
     {
-        immutable uint hash = hashBytes(bytes);
+        memoryRequested += bytes.length;
         const(Item)* found = find(bytes, hash);
         if (found is null)
             return intern(bytes, hash);
@@ -453,23 +518,58 @@ public:
         return items[index].str;
     }
 
+    void printStats()
+    {
+        import std.stdio;
+        writeln("Load Factor: ", cast(float) items.length / cast(float) buckets.length);
+        writeln("Memory used by blocks: ", blocks.length * blockSize);
+        writeln("Memory requested: ", memoryRequested);
+        writeln("rehashes: ", rehashCount);
+    }
+
+    static uint hashStep(ubyte b, uint h) pure nothrow @safe
+    {
+        return (h ^ sbox[b]) * 3;
+    }
+
+    static enum defaultBucketCount = 2048;
+
 private:
 
-    size_t intern(const(ubyte)[] bytes, uint hash) pure nothrow @safe
+    private void rehash() pure nothrow @safe
+    {
+        immutable size_t newBucketCount = items.length * 2;
+        buckets = new Item*[newBucketCount];
+        rehashCount++;
+        foreach (item; items)
+        {
+            immutable size_t newIndex = item.hash % newBucketCount;
+            item.next = buckets[newIndex];
+            buckets[newIndex] = item;
+        }
+    }
+
+    size_t intern(const(ubyte)[] bytes, uint hash) pure nothrow @trusted
     {
-        Item* item = new Item;
-        item.hash = hash;
-        item.str = allocate(bytes);
+        ubyte[] mem = allocate(bytes.length);
+        mem[] = bytes[];
+        Item* item = cast(Item*) allocate(Item.sizeof).ptr;
         item.index = items.length;
+        item.str = cast(string) mem;
+        item.hash = hash;
+        item.next = buckets[hash % buckets.length];
+        immutable bool checkLoadFactor = item.next !is null;
+        buckets[hash % buckets.length] = item;
         items ~= item;
-        buckets[hash % buckets.length] ~= item;
+        if (checkLoadFactor && (cast(float) items.length / cast(float) buckets.length) > 0.75)
+            rehash();
         return item.index;
     }
 
     const(Item)* find(const(ubyte)[] bytes, uint hash) pure nothrow const @safe
     {
         immutable size_t index = hash % buckets.length;
-        foreach (item; buckets[index])
+        for (const(Item)* item = buckets[index]; item !is null; item = item.next)
         {
             if (item.hash == hash && bytes.equal(item.str))
                 return item;
@@ -477,35 +577,27 @@ private:
         return null;
     }
 
-    string allocate(const(ubyte)[] bytes) pure nothrow @trusted
-    out (retVal)
-    {
-        assert (retVal == bytes);
-    }
-    body
+    ubyte[] allocate(size_t byteCount) pure nothrow @trusted
     {
         import core.memory;
-        if (bytes.length > (pageSize / 4))
+        if (byteCount > (blockSize / 4))
         {
-            ubyte* memory = cast(ubyte*) GC.malloc(bytes.length, GC.BlkAttr.NO_SCAN);
-            memory[0 .. bytes.length] = bytes[];
-            return cast(string) memory[0 .. bytes.length];
+            ubyte* mem = cast(ubyte*) GC.malloc(byteCount, GC.BlkAttr.NO_SCAN);
+            return mem[0 .. byteCount];
         }
         foreach (ref block; blocks)
         {
-            immutable size_t endIndex = block.used + bytes.length;
-            if (endIndex > block.bytes.length)
+            immutable size_t oldUsed = block.used;
+            immutable size_t end = oldUsed + byteCount;
+            if (end > block.bytes.length)
                 continue;
-            block.bytes[block.used .. endIndex] = bytes[];
-            string slice = cast(string) block.bytes[block.used .. endIndex];
-            block.used = endIndex;
-            return slice;
+            block.used = end;
+            return block.bytes[oldUsed .. end];
         }
-        blocks.length = blocks.length + 1;
-        blocks[$ - 1].bytes = (cast(ubyte*) GC.malloc(pageSize, GC.BlkAttr.NO_SCAN))[0 .. pageSize];
-        blocks[$ - 1].bytes[0 .. bytes.length] = bytes[];
-        blocks[$ - 1].used = bytes.length;
-        return cast(string) blocks[$ - 1].bytes[0 .. bytes.length];
+        blocks ~= Block(
+            (cast(ubyte*) GC.malloc(blockSize, GC.BlkAttr.NO_SCAN))[0 .. blockSize],
+            byteCount);
+        return blocks[$ - 1].bytes[0 .. byteCount];
     }
 
     static uint hashBytes(const(ubyte)[] data) pure nothrow @safe
@@ -524,6 +616,7 @@ private:
         size_t index;
         string str;
         uint hash;
+        Item* next;
     }
 
     static struct Block
@@ -532,10 +625,9 @@ private:
         size_t used;
     }
 
-    static enum pageSize = 4096 * 1024;
-    static enum bucketCount = 2048;
+    static enum blockSize = 1024 * 16;
 
-    static enum uint[] sbox = [
+    public static immutable uint[] sbox = [
         0xF53E1837, 0x5F14C86B, 0x9EE3964C, 0xFA796D53,
         0x32223FC3, 0x4D82BC98, 0xA0C7FA62, 0x63E2C982,
         0x24994A5B, 0x1ECE7BEE, 0x292B38EF, 0xD5CD4E56,
@@ -603,6 +695,8 @@ private:
     ];
 
     Item*[] items;
-    Item*[][bucketCount] buckets;
+    Item*[] buckets;
     Block[] blocks;
+    size_t memoryRequested;
+    uint rehashCount;
 }
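
Taken together, the StringCache changes above replace the fixed Item*[][2048]
bucket table with singly linked chains through Item.next, double the bucket
array whenever the load factor passes 0.75, and carve both Item nodes and
string bytes out of shared 16 KiB blocks instead of GC-allocating each entry.
A hypothetical usage sketch of the reworked interface:

    // Default construction is disabled; the bucket count is now explicit.
    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
    string a = cache.cacheGet(cast(const(ubyte)[]) "identifier");
    string b = cache.cacheGet(cast(const(ubyte)[]) "identifier");
    assert (a is b);    // both point at the same interned bytes
    cache.printStats(); // load factor, block memory, rehash count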