Back-end cleanup and optimization in the lexer

Hackerpilot 2014-01-19 23:13:13 -08:00
parent 24a0c1bc2b
commit c01c51a61e
9 changed files with 344 additions and 232 deletions

.gitmodules

@ -1,3 +0,0 @@
[submodule "datapicked"]
path = datapicked
url = https://github.com/blackwhale/datapicked.git

(build script)

@ -1,4 +1,3 @@
#dmd *.d stdx/d/*.d -release -inline -noboundscheck -O -w -wi -m64 -property -ofdscanner-dmd
dmd\
main.d\
stats.d\
@ -11,9 +10,36 @@ dmd\
style.d\
stdx/*.d\
stdx/d/*.d\
datapicked/dpick/buffer/*.d\
-Idatapicked\
-g -m64 -wi -ofdscanner
#ldc2 main.d stats.d imports.d highlighter.d ctags.d astprinter.d formatter.d outliner.d stdx/*.d stdx/d/*.d -of=dscanner-ldc -m64 -oq
#ldc2 *.d stdx/d/*.d -of=dscanner -unittest -m64 -g
#/opt/gdc/bin/gdc -O3 -odscanner-gdc -fno-bounds-check -frelease -m64 *.d stdx/d/*.d
-ofdscanner\
-m64\
-O -release -noboundscheck
#gdc\
# main.d\
# stats.d\
# imports.d\
# highlighter.d\
# ctags.d\
# astprinter.d\
# formatter.d\
# outliner.d\
# style.d\
# stdx/*.d\
# stdx/d/*.d\
# -O3 -frelease -fno-bounds-check\
# -odscanner\
#ldc2\
# main.d\
# stats.d\
# imports.d\
# highlighter.d\
# ctags.d\
# astprinter.d\
# formatter.d\
# outliner.d\
# style.d\
# stdx/*.d\
# stdx/d/*.d\
# -O3 -release\
# -oq -of=dscanner\

ctags.d

@ -20,7 +20,7 @@ void printCtags(File output, string[] fileNames)
{
string[] tags;
LexerConfig config;
StringCache* cache = new StringCache;
StringCache* cache = new StringCache(StringCache.defaultBucketCount);
foreach (fileName; fileNames)
{
File f = File(fileName);

datapicked (deleted submodule)
@ -1 +0,0 @@
Subproject commit f63a843e9c0ce8db7fd897684fe323697255d87d

main.d

@ -10,15 +10,12 @@ import std.array;
import std.conv;
import std.file;
import std.getopt;
import std.parallelism;
import std.path;
import std.regex;
import std.stdio;
import std.range;
import stdx.lexer;
import stdx.d.lexer;
import stdx.d.parser;
import dpick.buffer.buffer;
import highlighter;
import stats;
@ -93,7 +90,7 @@ int main(string[] args)
return 1;
}
StringCache* cache = new StringCache;
StringCache* cache = new StringCache(StringCache.defaultBucketCount);
if (tokenDump || highlight)
{
@ -151,13 +148,16 @@ int main(string[] args)
foreach (f; expandArgs(args, recursive))
{
import core.memory;
GC.disable();
auto tokens = byToken!(ubyte[])(readFile(f));
LexerConfig config;
config.whitespaceBehavior = WhitespaceBehavior.skip;
config.stringBehavior = StringBehavior.source;
config.commentBehavior = CommentBehavior.include;
auto tokens = byToken(readFile(f), config, cache);
if (tokenCount)
count += printTokenCount(stdout, f, tokens);
else
count += printLineCount(stdout, f, tokens);
GC.enable();
cache.printStats();
}
writefln("total:\t%d", count);
}
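
A minimal sketch of the calling convention after this change (the helper name countTokensIn is hypothetical; byToken, LexerConfig, and StringCache come from this commit):

import stdx.d.lexer;

ulong countTokensIn(ubyte[] source)
{
    // The caller now owns both the configuration and the string cache;
    // byToken no longer allocates a StringCache behind the caller's back.
    LexerConfig config;
    config.whitespaceBehavior = WhitespaceBehavior.skip;
    config.stringBehavior = StringBehavior.source;
    config.commentBehavior = CommentBehavior.include;
    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
    ulong count;
    foreach (ref token; byToken(source, config, cache))
        count++;
    return count;
}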

(benchmark script)

@ -1,9 +0,0 @@
echo -e "file\tstd.d.lexer dmd\tstd.d.lexer ldc\tstd.d.lexer gdc\tdmd"
for i in $(ls ../phobos/std/*.d); do
f=$(echo $i | sed "s/.*phobos\///")
dmdt=$(avgtime -q -r 200 ./dscanner-dmd --tokenCount $i | grep "Median" | sed "s/.*: //")
ldct=$(avgtime -q -r 200 ./dscanner-ldc --tokenCount $i | grep "Median" | sed "s/.*: //")
gdct=$(avgtime -q -r 200 ./dscanner-gdc --tokenCount $i | grep "Median" | sed "s/.*: //")
gcct=$(avgtime -q -r 200 ~/src/dmd-lexer/src/dmd $i | grep "Median" | sed "s/.*: //")
echo -e "${f}\t${dmdt}\t${ldct}\t${gdct}\t${gcct}"
done

stats.d

@ -32,7 +32,12 @@ pure nothrow bool isLineOfCode(IdType t)
ulong printTokenCount(Tokens)(File output, string fileName, ref Tokens tokens)
{
ulong c = tokens.count!(a => true);
ulong c;
foreach (ref t; tokens)
{
c++;
}
output.writefln("%s:\t%d", fileName, c);
return c;
}
@ -40,7 +45,7 @@ ulong printTokenCount(Tokens)(File output, string fileName, ref Tokens tokens)
ulong printLineCount(Tokens)(File output, string fileName, ref Tokens tokens)
{
ulong count;
foreach (t; tokens)
foreach (ref t; tokens)
{
if (isLineOfCode(t.type))
++count;
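
Replacing std.algorithm's count with the explicit foreach (ref t; tokens) loops above means each Token is iterated by reference rather than copied per element; given the commit's stated optimization focus, avoiding per-token struct copies appears to be the intent.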

stdx/d/lexer.d

@ -57,13 +57,13 @@ public template tok(string token)
alias tok = TokenId!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens, token);
}
private enum extraFields = q{
string comment;
int opCmp(size_t i) const pure nothrow @safe {
if (index < i) return -1;
if (index > i) return 1;
return 0;
}
};
public alias Token = stdx.lexer.TokenStructure!(IdType, extraFields);
@ -72,15 +72,15 @@ public alias Token = stdx.lexer.TokenStructure!(IdType, extraFields);
*/
public enum StringBehavior : ubyte
{
/// Do not include quote characters, process escape sequences
compiler = 0b0000_0000,
/// Opening quotes, closing quotes, and string suffixes are included in the
/// string token
includeQuoteChars = 0b0000_0001,
/// String escape sequences are not replaced
notEscaped = 0b0000_0010,
/// Not modified at all. Useful for formatters or highlighters
source = includeQuoteChars | notEscaped
}
/**
@ -88,55 +88,28 @@ public enum StringBehavior : ubyte
*/
public enum WhitespaceBehavior : ubyte
{
/// Whitespace is skipped
skip,
/// Whitespace is treated as a token
include
}
/**
* Configure comment handling behavior
*/
public enum CommentBehavior : ubyte
{
/// Comments are attached to the non-whitespace token that follows them
attach,
/// Comments are tokens, and can be returned by calls to the token range's front()
include
}
public struct LexerConfig
{
string fileName;
StringBehavior stringBehavior;
WhitespaceBehavior whitespaceBehavior;
CommentBehavior commentBehavior;
}
public auto byToken(R)(R range)
{
LexerConfig config;
StringCache* cache = new StringCache;
return byToken(range, config, cache);
}
public auto byToken(R)(R range, StringCache* cache)
{
LexerConfig config;
return DLexer!(R)(range, config, cache);
}
public auto byToken(R)(R range, const LexerConfig config, StringCache* cache)
{
return DLexer!(R)(range, config, cache);
}
unittest
{
import std.stdio;
auto source = cast(ubyte[]) q{ import std.stdio;}c;
auto tokens = byToken(source);
assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
tok!"identifier", tok!";"]));
}
public bool isBasicType(IdType type) nothrow pure @safe
@ -396,11 +369,9 @@ public bool isProtection(IdType type) pure nothrow @safe
}
}
public struct DLexer(R)
public struct DLexer
{
import std.conv;
import core.vararg;
import dpick.buffer.buffer;
private enum pseudoTokenHandlers = [
"\"", "lexStringLiteral",
@ -434,53 +405,51 @@ public struct DLexer(R)
"#line", "lexSpecialTokenSequence"
];
mixin Lexer!(R, IdType, Token, lexIdentifier, staticTokens,
mixin Lexer!(IdType, Token, lexIdentifier, staticTokens,
dynamicTokens, pseudoTokens, pseudoTokenHandlers, possibleDefaultTokens);
private alias Mark = typeof(range).Mark;
this(R range, const LexerConfig config, StringCache* cache)
this(ubyte[] range, const LexerConfig config, StringCache* cache)
{
this.range = LexerRange!(typeof(buffer(range)))(buffer(range));
this.config = config;
this.range = LexerRange(range);
this.config = config;
this.cache = cache;
popFront();
popFront();
}
private static bool isDocComment(string comment) pure nothrow @safe
{
return comment.length >= 3 && (comment[0 .. 3] == "///"
|| comment[0 .. 3] == "/**" || comment[0 .. 3] == "/++");
}
public void popFront() pure
{
_popFront();
string comment = null;
switch (front.type)
{
case tok!"comment":
if (config.commentBehavior == CommentBehavior.attach)
{
import std.string;
if (isDocComment(front.text))
comment = comment == null ? front.text : format("%s\n%s", comment, front.text);
do _popFront(); while (front == tok!"comment");
if (front == tok!"whitespace") goto case tok!"whitespace";
}
break;
case tok!"whitespace":
if (config.whitespaceBehavior == WhitespaceBehavior.skip)
{
do _popFront(); while (front == tok!"whitespace");
if (front == tok!"comment") goto case tok!"comment";
}
break;
default:
break;
}
_front.comment = comment;
}
bool isWhitespace() pure /*const*/ nothrow
@ -493,7 +462,7 @@ public struct DLexer(R)
case '\t':
return true;
case 0xe2:
auto peek = range.lookahead(2);
auto peek = range.peek(2);
return peek.length == 2
&& peek[0] == 0x80
&& (peek[1] == 0xa8 || peek[1] == 0xa9);
@ -521,7 +490,7 @@ public struct DLexer(R)
range.incrementLine();
return;
case 0xe2:
auto lookahead = range.lookahead(3);
auto lookahead = range.peek(3);
if (lookahead.length == 3 && lookahead[1] == 0x80
&& (lookahead[2] == 0xa8 || lookahead[2] == 0xa9))
{
@ -564,7 +533,7 @@ public struct DLexer(R)
range.popFront();
break;
case 0xe2:
auto lookahead = range.lookahead(3);
auto lookahead = range.peek(3);
if (lookahead.length != 3)
break loop;
if (lookahead[1] != 0x80)
@ -590,10 +559,10 @@ public struct DLexer(R)
Token lexNumber() pure nothrow
{
mixin (tokenStart);
auto lookahead = range.lookahead(2);
if (range.front == '0' && lookahead.length == 2)
if (range.canPeek(1) && range.front == '0')
{
switch (lookahead[1])
auto ahead = range.peek(1)[1];
switch (ahead)
{
case 'x':
case 'X':
@ -619,7 +588,7 @@ public struct DLexer(R)
return lexHex(mark, line, column, index);
}
Token lexHex(Mark mark, size_t line, size_t column, size_t index) pure nothrow
Token lexHex(size_t mark, size_t line, size_t column, size_t index) pure nothrow
{
IdType type = tok!"intLiteral";
bool foundDot;
@ -654,7 +623,7 @@ public struct DLexer(R)
case '.':
if (foundDot)
break hexLoop;
if (range.lookahead(1).length && range.lookahead(1)[0] == '.')
if (range.peek(1).length && range.peek(1)[0] == '.')
break hexLoop;
range.popFront();
foundDot = true;
@ -674,7 +643,7 @@ public struct DLexer(R)
return lexBinary(mark, line, column, index);
}
Token lexBinary(Mark mark, size_t line, size_t column, size_t index) pure nothrow
Token lexBinary(size_t mark, size_t line, size_t column, size_t index) pure nothrow
{
IdType type = tok!"intLiteral";
binaryLoop: while (!range.empty)
@ -699,13 +668,13 @@ public struct DLexer(R)
index);
}
Token lexDecimal()
Token lexDecimal() pure nothrow
{
mixin (tokenStart);
return lexDecimal(mark, line, column, index);
}
Token lexDecimal(Mark mark, size_t line, size_t column, size_t index) pure nothrow
Token lexDecimal(size_t mark, size_t line, size_t column, size_t index) pure nothrow
{
bool foundDot = range.front == '.';
IdType type = tok!"intLiteral";
@ -748,7 +717,7 @@ public struct DLexer(R)
case '.':
if (foundDot)
break decimalLoop;
auto lookahead = range.lookahead(2);
auto lookahead = range.peek(2);
if (lookahead.length == 2 && lookahead[1] == '.')
break decimalLoop;
else
@ -1058,7 +1027,7 @@ public struct DLexer(R)
index);
}
void lexStringSuffix(ref IdType type) pure
void lexStringSuffix(ref IdType type) pure nothrow
{
if (range.empty)
type = tok!"stringLiteral";
@ -1076,12 +1045,12 @@ public struct DLexer(R)
Token lexDelimitedString() pure nothrow
{
import std.traits;
mixin (tokenStart);
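// The two popFront() calls below skip the opening q" of a delimited string literal.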
range.popFront();
range.popFront();
Unqual!(ElementEncodingType!R) open;
Unqual!(ElementEncodingType!R) close;
ubyte open;
ubyte close;
switch (range.front)
{
case '<':
@ -1109,8 +1078,8 @@ public struct DLexer(R)
}
}
Token lexNormalDelimitedString(Mark mark, size_t line, size_t column,
size_t index, ElementEncodingType!R open, ElementEncodingType!R close)
Token lexNormalDelimitedString(size_t mark, size_t line, size_t column,
size_t index, ubyte open, ubyte close)
pure nothrow
{
int depth = 1;
@ -1144,7 +1113,7 @@ public struct DLexer(R)
return Token(type, cache.cacheGet(range.slice(mark)), line, column, index);
}
Token lexHeredocString(Mark mark, size_t line, size_t column, size_t index)
Token lexHeredocString(size_t mark, size_t line, size_t column, size_t index)
pure nothrow
{
import std.regex;
@ -1158,7 +1127,7 @@ public struct DLexer(R)
if (isNewline())
{
popFrontWhitespaceAware();
if (range.lookahead(ident.text.length) == ident.text)
if (range.peek(ident.text.length) == ident.text)
{
foreach (i ; 0 .. ident.text.length)
range.popFront();
@ -1395,18 +1364,20 @@ public struct DLexer(R)
Token lexIdentifier() pure nothrow
{
mixin (tokenStart);
while (!range.empty && !isSeparating(range.front))
uint hash = 0;
while (!range.empty && !isSeparating(0))
{
hash = StringCache.hashStep(range.front, hash);
range.popFront();
}
return Token(tok!"identifier", cache.cacheGet(range.slice(mark)), line,
return Token(tok!"identifier", cache.cacheGet(range.slice(mark), hash), line,
column, index);
}
Token lexDot() pure nothrow
{
mixin (tokenStart);
auto lookahead = range.lookahead(1);
auto lookahead = range.peek(1);
if (lookahead.length == 0)
{
range.popFront();
@ -1447,22 +1418,25 @@ public struct DLexer(R)
{
if (range.front == '\n') return true;
if (range.front == '\r') return true;
auto lookahead = range.lookahead(3);
auto lookahead = range.peek(3);
if (lookahead.length == 0) return false;
if (lookahead == "\u2028" || lookahead == "\u2029")
return true;
return false;
}
bool isSeparating(ElementType!R c) nothrow pure @safe
bool isSeparating(size_t offset) const pure nothrow @safe
{
auto r = range.save();
r.popFrontN(offset);
auto c = r.front;
if (c <= 0x2f) return true;
if (c >= ':' && c <= '@') return true;
if (c >= '[' && c <= '^') return true;
if (c >= '{' && c <= '~') return true;
if (c == '`') return true;
// if (c & 0x80 && (range.lookahead(3) == "\u2028"
// || range.lookahead(3) == "\u2029")) return true;
if (c & 0x80 && (r.peek(3) == "\u2028"
|| r.peek(3) == "\u2029")) return true;
return false;
}
@ -1470,17 +1444,43 @@ public struct DLexer(R)
size_t index = range.index;
size_t column = range.column;
size_t line = range.line;
const mark = range.mark();
auto mark = range.mark();
};
void error(...) pure {
void error(...) pure nothrow @safe {
}
void warning(...) pure {
void warning(...) pure nothrow @safe {
}
StringCache* cache;
LexerConfig config;
}
public auto byToken(ubyte[] range)
{
LexerConfig config;
StringCache* cache = new StringCache(StringCache.defaultBucketCount);
return DLexer(range, config, cache);
}
public auto byToken(ubyte[] range, StringCache* cache)
{
LexerConfig config;
return DLexer(range, config, cache);
}
public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache)
{
return DLexer(range, config, cache);
}
unittest
{
import std.stdio;
auto source = cast(ubyte[]) q{ import std.stdio;}c;
auto tokens = byToken(source);
assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
tok!"identifier", tok!";"]));
}
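
The relocated unittest checks token types only. A hedged sketch of the CommentBehavior.attach path implemented in popFront above (the snippet and asserts are illustrative, not part of the commit):

unittest
{
    LexerConfig config;
    config.commentBehavior = CommentBehavior.attach;
    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
    auto source = cast(ubyte[]) "/// Returns nothing\nvoid f();";
    auto tokens = byToken(source, config, cache);
    // The doc comment token is consumed and its text is attached, via the
    // comment field declared in extraFields, to the next real token.
    assert (tokens.front == tok!"void");
    assert (tokens.front.comment == "/// Returns nothing");
}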

stdx/lexer.d

@ -17,8 +17,6 @@ import std.range;
import std.traits;
import std.conv;
import std.math;
import dpick.buffer.buffer;
import dpick.buffer.traits;
/**
* Template for determining the type used for a token type. Selects the smallest
@ -191,12 +189,13 @@ public:
mixin (extraFields);
}
mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
mixin template Lexer(IDType, Token, alias defaultTokenFunction,
alias staticTokens, alias dynamicTokens, alias pseudoTokens,
alias pseudoTokenHandlers, alias possibleDefaultTokens)
{
static string generateCaseStatements(string[] tokens, size_t offset = 0)
{
import std.conv;
string code;
for (size_t i = 0; i < tokens.length; i++)
{
@ -216,9 +215,9 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
code ~= generateLeaf(tokens[i], indent ~ " ");
else
{
code ~= indent ~ " if (range.lookahead(" ~ text(tokens[i].length) ~ ").length == 0)\n";
code ~= indent ~ " if (!range.canPeek(" ~ text(tokens[i].length) ~ "))\n";
code ~= indent ~ " goto outer_default;\n";
code ~= indent ~ " if (range.lookahead(" ~ text(tokens[i].length) ~ ") == \"" ~ escape(tokens[i]) ~ "\")\n";
code ~= indent ~ " if (range.peek(" ~ text(tokens[i].length - 1) ~ ") == \"" ~ escape(tokens[i]) ~ "\")\n";
code ~= indent ~ " {\n";
code ~= generateLeaf(tokens[i], indent ~ " ");
code ~= indent ~ " }\n";
@ -228,11 +227,11 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
}
else
{
code ~= indent ~ " if (range.lookahead(" ~ text(offset + 2) ~ ").length == 0)\n";
code ~= indent ~ " if (!range.canPeek(" ~ text(offset + 1) ~ "))\n";
code ~= indent ~ " {\n";
code ~= generateLeaf(tokens[i][0 .. offset + 1], indent ~ " ");
code ~= indent ~ " }\n";
code ~= indent ~ " switch (range.lookahead(" ~ text(offset + 2) ~ ")[" ~ text(offset + 1) ~ "])\n";
code ~= indent ~ " switch (range.peek(" ~ text(offset + 1) ~ ")[" ~ text(offset + 1) ~ "])\n";
code ~= indent ~ " {\n";
code ~= generateCaseStatements(tokens[i .. j], offset + 1);
code ~= indent ~ " default:\n";
@ -247,6 +246,7 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
static string generateLeaf(string token, string indent)
{
import std.conv;
static assert (pseudoTokenHandlers.length % 2 == 0,
"Each pseudo-token must have a matching function name.");
string code;
@ -262,7 +262,7 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
code ~= indent ~ "return " ~ pseudoTokenHandlers[pseudoTokenHandlers.countUntil(token) + 1] ~ "();\n";
else if (possibleDefaultTokens.countUntil(token) >= 0)
{
code ~= indent ~ "if (range.lookahead(" ~ text(token.length + 1) ~ ").length == 0 || isSeparating(range.lookahead(" ~ text(token.length + 1) ~ ")[" ~ text(token.length) ~ "]))\n";
code ~= indent ~ "if (!range.canPeek(" ~ text(token.length + 1) ~ ") || isSeparating(" ~ text(token.length) ~ "))\n";
code ~= indent ~ "{\n";
if (token.length == 1)
code ~= indent ~ " range.popFront();\n";
@ -278,7 +278,7 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
return code;
}
const(Token) front() pure nothrow const @property
ref const(Token) front() pure nothrow const @property
{
return _front;
}
@ -312,7 +312,21 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
return retVal;
}
Token advance() pure
/**
* This only exists because the real array() can't be called at compile-time
*/
static string[] stupidToArray(R)(R range)
{
string[] retVal;
foreach (v; range)
retVal ~= v;
return retVal;
}
enum loopBody = generateCaseStatements(stupidToArray(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens)));
auto ref Token advance() pure
{
if (range.empty)
return Token(tok!"\0");
@ -321,54 +335,87 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
immutable size_t line = range.line;
lexerLoop: switch (range.front)
{
mixin(generateCaseStatements(stupidToArray(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens))));
// pragma(msg, generateCaseStatements(stupidToArray(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens))));
mixin(loopBody);
/+pragma(msg, loopBody);+/
outer_default:
default:
return defaultTokenFunction();
}
}
/**
* This only exists because the real array() can't be called at compile-time
*/
static T[] stupidToArray(R, T = ElementType!R)(R range)
{
T[] retVal;
foreach (v; range)
retVal ~= v;
return retVal;
}
LexerRange!(typeof(buffer(R.init))) range;
LexerRange range;
Token _front;
}
struct LexerRange(BufferType) if (isBuffer!BufferType)
struct LexerRange
{
this(BufferType r)
this(const(ubyte)[] bytes, size_t index = 0, size_t column = 1, size_t line = 1) pure nothrow @safe
{
this.range = r;
index = 0;
column = 1;
line = 1;
this.bytes = bytes;
this.index = index;
this.column = column;
this.line = line;
}
void popFront() pure
size_t mark() const nothrow pure @safe
{
return index;
}
void seek(size_t m) nothrow pure @safe
{
index = m;
}
const(ubyte)[] slice(size_t m) const nothrow pure @safe
{
return bytes[m .. index];
}
bool empty() const nothrow pure @safe
{
return index >= bytes.length;
}
ubyte front() const nothrow pure @safe
{
return bytes[index];
}
const(ubyte)[] peek(size_t p) const nothrow pure @safe
{
return bytes[index .. index + p + 1];
}
bool canPeek(size_t p) const nothrow pure @safe
{
return index + p < bytes.length;
}
LexerRange save() const nothrow pure @safe
{
return LexerRange(bytes, index, column, line);
}
void popFront() pure nothrow @safe
{
index++;
column++;
range.popFront();
}
void incrementLine() pure nothrow
void popFrontN(size_t n) pure nothrow @safe
{
index += n;
}
void incrementLine() pure nothrow @safe
{
column = 1;
line++;
}
BufferType range;
alias range this;
const(ubyte)[] bytes;
size_t index;
size_t column;
size_t line;
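
The new LexerRange is a plain cursor over const(ubyte)[]. One subtlety: peek(p) includes the current byte and returns p + 1 bytes, and bounds checking is left to canPeek. A small usage sketch under those definitions (illustrative, not part of the commit):

unittest
{
    auto r = LexerRange(cast(const(ubyte)[]) "abc");
    assert (r.front == 'a');
    assert (r.canPeek(2));        // a byte exists two positions ahead
    assert (r.peek(1) == "ab");   // p + 1 bytes, starting at the cursor
    immutable m = r.mark();       // remembers index 0
    r.popFrontN(2);               // advances index only, not column/line
    assert (r.front == 'c');
    assert (r.slice(m) == "ab");  // bytes between the mark and the cursor
}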
@ -388,6 +435,13 @@ struct StringCache
{
public:
@disable this();
this(size_t bucketCount = defaultBucketCount)
{
buckets = new Item*[bucketCount];
}
/**
* Equivalent to calling cache() and get().
* ---
@ -402,6 +456,11 @@ public:
return get(cache(bytes));
}
string cacheGet(const(ubyte[]) bytes, uint hash) pure nothrow @safe
{
return get(cache(bytes, hash));
}
/**
* Caches a string.
* Params: bytes = the string to cache
@ -416,6 +475,12 @@ public:
* ---
*/
size_t cache(const(ubyte)[] bytes) pure nothrow @safe
{
immutable uint hash = hashBytes(bytes);
return cache(bytes, hash);
}
size_t cache(const(ubyte)[] bytes, uint hash) pure nothrow @safe
in
{
assert (bytes.length > 0);
@ -426,7 +491,7 @@ public:
}
body
{
immutable uint hash = hashBytes(bytes);
memoryRequested += bytes.length;
const(Item)* found = find(bytes, hash);
if (found is null)
return intern(bytes, hash);
@ -453,23 +518,58 @@ public:
return items[index].str;
}
void printStats()
{
import std.stdio;
writeln("Load Factor: ", cast(float) items.length / cast(float) buckets.length);
writeln("Memory used by blocks: ", blocks.length * blockSize);
writeln("Memory requsted: ", memoryRequested);
writeln("rehashes: ", rehashCount);
}
static uint hashStep(ubyte b, uint h) pure nothrow @safe
{
return (h ^ sbox[b]) * 3;
}
static enum defaultBucketCount = 2048;
private:
size_t intern(const(ubyte)[] bytes, uint hash) pure nothrow @safe
private void rehash() pure nothrow @safe
{
Item* item = new Item;
item.hash = hash;
item.str = allocate(bytes);
immutable size_t newBucketCount = items.length * 2;
buckets = new Item*[newBucketCount];
rehashCount++;
foreach (item; items)
{
immutable size_t newIndex = item.hash % newBucketCount;
item.next = buckets[newIndex];
buckets[newIndex] = item;
}
}
size_t intern(const(ubyte)[] bytes, uint hash) pure nothrow @trusted
{
ubyte[] mem = allocate(bytes.length);
mem[] = bytes[];
Item* item = cast(Item*) allocate(Item.sizeof).ptr;
item.index = items.length;
item.str = cast(string) mem;
item.hash = hash;
item.next = buckets[hash % buckets.length];
immutable bool checkLoadFactor = item.next !is null;
buckets[hash % buckets.length] = item;
items ~= item;
buckets[hash % buckets.length] ~= item;
if (checkLoadFactor && (cast(float) items.length / cast(float) buckets.length) > 0.75)
rehash();
return item.index;
}
const(Item)* find(const(ubyte)[] bytes, uint hash) pure nothrow const @safe
{
immutable size_t index = hash % buckets.length;
foreach (item; buckets[index])
for (const(Item)* item = buckets[index]; item !is null; item = item.next)
{
if (item.hash == hash && bytes.equal(item.str))
return item;
@ -477,53 +577,46 @@ private:
return null;
}
string allocate(const(ubyte)[] bytes) pure nothrow @trusted
out (retVal)
{
assert (retVal == bytes);
}
body
ubyte[] allocate(size_t byteCount) pure nothrow @trusted
{
import core.memory;
if (bytes.length > (pageSize / 4))
if (byteCount > (blockSize / 4))
{
ubyte* memory = cast(ubyte*) GC.malloc(bytes.length, GC.BlkAttr.NO_SCAN);
memory[0 .. bytes.length] = bytes[];
return cast(string) memory[0..bytes.length];
ubyte* mem = cast(ubyte*) GC.malloc(byteCount, GC.BlkAttr.NO_SCAN);
return mem[0 .. byteCount];
}
foreach (ref block; blocks)
{
immutable size_t endIndex = block.used + bytes.length;
if (endIndex > block.bytes.length)
immutable size_t oldUsed = block.used;
immutable size_t end = oldUsed + byteCount;
if (end > block.bytes.length)
continue;
block.bytes[block.used .. endIndex] = bytes[];
string slice = cast(string) block.bytes[block.used .. endIndex];
block.used = endIndex;
return slice;
block.used = end;
return block.bytes[oldUsed .. end];
}
blocks.length = blocks.length + 1;
blocks[$ - 1].bytes = (cast(ubyte*) GC.malloc(pageSize, GC.BlkAttr.NO_SCAN))[0 .. pageSize];
blocks[$ - 1].bytes[0 .. bytes.length] = bytes[];
blocks[$ - 1].used = bytes.length;
return cast(string) blocks[$ - 1].bytes[0 .. bytes.length];
blocks ~= Block(
(cast(ubyte*) GC.malloc(blockSize, GC.BlkAttr.NO_SCAN))[0 .. blockSize],
byteCount);
return blocks[$ - 1].bytes[0 .. byteCount];
}
static uint hashBytes(const(ubyte)[] data) pure nothrow @safe
{
uint hash = 0;
foreach (b; data)
{
hash ^= sbox[b];
hash *= 3;
}
return hash;
}
static struct Item
{
size_t index;
string str;
uint hash;
Item* next;
}
static struct Block
@ -532,10 +625,9 @@ private:
size_t used;
}
static enum pageSize = 4096 * 1024;
static enum bucketCount = 2048;
static enum blockSize = 1024 * 16;
static enum uint[] sbox = [
public static immutable uint[] sbox = [
0xF53E1837, 0x5F14C86B, 0x9EE3964C, 0xFA796D53,
0x32223FC3, 0x4D82BC98, 0xA0C7FA62, 0x63E2C982,
0x24994A5B, 0x1ECE7BEE, 0x292B38EF, 0xD5CD4E56,
@ -603,6 +695,8 @@ private:
];
Item*[] items;
Item*[][bucketCount] buckets;
Item*[] buckets;
Block[] blocks;
size_t memoryRequested;
uint rehashCount;
}
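
StringCache now supports incremental hashing: hashBytes() is exactly hashStep() folded over the input, so lexIdentifier can accumulate the hash while scanning and pass it to the two-argument cache()/cacheGet() overloads, skipping a second pass over the bytes; intern() additionally rehashes once a chain collides and the load factor exceeds 0.75. A sketch of the hashing equivalence (this unittest is mine, assuming only the definitions above):

unittest
{
    auto bytes = cast(const(ubyte)[]) "identifier";
    // Fold hashStep over the input, as lexIdentifier does per character.
    uint h = 0;
    foreach (b; bytes)
        h = StringCache.hashStep(b, h);
    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
    // The precomputed hash must agree with the internal hashBytes() result,
    // so both overloads intern and return the same string.
    assert (cache.cacheGet(bytes, h) == cache.cacheGet(bytes));
}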