Back-end cleanup and optimization in the lexer

parent 24a0c1bc2b
commit c01c51a61e

.gitmodules

@@ -1,3 +0,0 @@
-[submodule "datapicked"]
-    path = datapicked
-    url = https://github.com/blackwhale/datapicked.git
build.sh (40)

@@ -1,4 +1,3 @@
-#dmd *.d stdx/d/*.d -release -inline -noboundscheck -O -w -wi -m64 -property -ofdscanner-dmd
 dmd\
     main.d\
     stats.d\
@@ -11,9 +10,36 @@ dmd\
     style.d\
     stdx/*.d\
     stdx/d/*.d\
-    datapicked/dpick/buffer/*.d\
-    -Idatapicked\
-    -g -m64 -wi -ofdscanner
-#ldc2 main.d stats.d imports.d highlighter.d ctags.d astprinter.d formatter.d outliner.d stdx/*.d stdx/d/*.d -of=dscanner-ldc -m64 -oq
-#ldc2 *.d stdx/d/*.d -of=dscanner -unittest -m64 -g
-#/opt/gdc/bin/gdc -O3 -odscanner-gdc -fno-bounds-check -frelease -m64 *.d stdx/d/*.d
+    -ofdscanner\
+    -m64\
+    -O -release -noboundscheck
+
+#gdc\
+#    main.d\
+#    stats.d\
+#    imports.d\
+#    highlighter.d\
+#    ctags.d\
+#    astprinter.d\
+#    formatter.d\
+#    outliner.d\
+#    style.d\
+#    stdx/*.d\
+#    stdx/d/*.d\
+#    -O3 -frelease -fno-bounds-check\
+#    -odscanner\
+
+#ldc2\
+#    main.d\
+#    stats.d\
+#    imports.d\
+#    highlighter.d\
+#    ctags.d\
+#    astprinter.d\
+#    formatter.d\
+#    outliner.d\
+#    style.d\
+#    stdx/*.d\
+#    stdx/d/*.d\
+#    -O3 -release\
+#    -oq -of=dscanner\
ctags.d (2)

@@ -20,7 +20,7 @@ void printCtags(File output, string[] fileNames)
 {
     string[] tags;
     LexerConfig config;
-    StringCache* cache = new StringCache;
+    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
     foreach (fileName; fileNames)
     {
         File f = File(fileName);
datapicked (deleted submodule)

@@ -1 +0,0 @@
-Subproject commit f63a843e9c0ce8db7fd897684fe323697255d87d
main.d (14)

@@ -10,15 +10,12 @@ import std.array;
 import std.conv;
 import std.file;
 import std.getopt;
 import std.parallelism;
 import std.path;
-import std.regex;
 import std.stdio;
-import std.range;
 import stdx.lexer;
 import stdx.d.lexer;
 import stdx.d.parser;
-import dpick.buffer.buffer;
 
 import highlighter;
 import stats;
@@ -93,7 +90,7 @@ int main(string[] args)
         return 1;
     }
 
-    StringCache* cache = new StringCache;
+    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
 
     if (tokenDump || highlight)
     {
@@ -151,13 +148,16 @@ int main(string[] args)
     foreach (f; expandArgs(args, recursive))
     {
         import core.memory;
-        GC.disable();
-        auto tokens = byToken!(ubyte[])(readFile(f));
+        LexerConfig config;
+        config.whitespaceBehavior = WhitespaceBehavior.skip;
+        config.stringBehavior = StringBehavior.source;
+        config.commentBehavior = CommentBehavior.include;
+        auto tokens = byToken(readFile(f), config, cache);
         if (tokenCount)
             count += printTokenCount(stdout, f, tokens);
         else
             count += printLineCount(stdout, f, tokens);
-        GC.enable();
+        cache.printStats();
     }
     writefln("total:\t%d", count);
 }
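Taken together, the main.d changes switch callers over to the new non-templated entry point. A minimal hedged sketch of the resulting calling convention (fileNames is a hypothetical local standing in for the expanded argument list; readFile is main.d's existing helper returning ubyte[]; everything else is named in the diffs on this page): one StringCache is allocated once and threaded through every byToken call, so identifiers are interned across all files rather than per lexer instance.

    // Hedged usage sketch, not code from the commit itself.
    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
    LexerConfig config;
    config.whitespaceBehavior = WhitespaceBehavior.skip;
    config.stringBehavior = StringBehavior.source;
    config.commentBehavior = CommentBehavior.include;
    foreach (fileName; fileNames) // fileNames: assumed list of D source paths
    {
        auto tokens = byToken(readFile(fileName), config, cache);
        // consume the token range here, e.g. count tokens or lines
    }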
(deleted benchmark script; file name not shown in this capture)

@@ -1,9 +0,0 @@
-echo -e "file\tstd.d.lexer dmd\tstd.d.lexer ldc\tstd.d.lexer gdc\tdmd"
-for i in $(ls ../phobos/std/*.d); do
-    f=$(echo $i | sed "s/.*phobos\///")
-    dmdt=$(avgtime -q -r 200 ./dscanner-dmd --tokenCount $i | grep "Median" | sed "s/.*: //")
-    ldct=$(avgtime -q -r 200 ./dscanner-ldc --tokenCount $i | grep "Median" | sed "s/.*: //")
-    gdct=$(avgtime -q -r 200 ./dscanner-gdc --tokenCount $i | grep "Median" | sed "s/.*: //")
-    gcct=$(avgtime -q -r 200 ~/src/dmd-lexer/src/dmd $i | grep "Median" | sed "s/.*: //")
-    echo -e "${f}\t${dmdt}\t${ldct}\t${gdct}\t${gcct}"
-done
stats.d (9)

@@ -32,7 +32,12 @@ pure nothrow bool isLineOfCode(IdType t)
 
 ulong printTokenCount(Tokens)(File output, string fileName, ref Tokens tokens)
 {
-    ulong c = tokens.count!(a => true);
+    ulong c;
+    foreach (ref t; tokens)
+    {
+        c++;
+    }
     output.writefln("%s:\t%d", fileName, c);
     return c;
 }
@@ -40,7 +45,7 @@ ulong printTokenCount(Tokens)(File output, string fileName, ref Tokens tokens)
 ulong printLineCount(Tokens)(File output, string fileName, ref Tokens tokens)
 {
     ulong count;
-    foreach (t; tokens)
+    foreach (ref t; tokens)
     {
         if (isLineOfCode(t.type))
             ++count;
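A plausible reading of these stats.d changes: front() on the lexer range now returns ref const(Token) (see the stdx/lexer.d hunk further down), so iterating with foreach (ref t; ...) binds each element by reference, and the hand-written loop avoids both the std.algorithm count template and a struct copy per token. A hedged sketch of the pattern in isolation (countByRef is a hypothetical name, not part of the commit):

    // Counts elements without copying them out of the range; relies on
    // the range's front() returning by reference, as Lexer's now does.
    ulong countByRef(Range)(ref Range r)
    {
        ulong n;
        foreach (ref t; r)
            n++;
        return n;
    }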
stdx/d/lexer.d (252)

@@ -57,13 +57,13 @@ public template tok(string token)
     alias tok = TokenId!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens, token);
 }
 private enum extraFields = q{
     string comment;
 
     int opCmp(size_t i) const pure nothrow @safe {
         if (index < i) return -1;
         if (index > i) return 1;
         return 0;
     }
 };
 public alias Token = stdx.lexer.TokenStructure!(IdType, extraFields);
@@ -72,15 +72,15 @@ public alias Token = stdx.lexer.TokenStructure!(IdType, extraFields);
 */
 public enum StringBehavior : ubyte
 {
     /// Do not include quote characters, process escape sequences
     compiler = 0b0000_0000,
     /// Opening quotes, closing quotes, and string suffixes are included in the
     /// string token
     includeQuoteChars = 0b0000_0001,
     /// String escape sequences are not replaced
     notEscaped = 0b0000_0010,
     /// Not modified at all. Useful for formatters or highlighters
     source = includeQuoteChars | notEscaped
 }
 
 /**
@@ -88,55 +88,28 @@ public enum StringBehavior : ubyte
 */
 public enum WhitespaceBehavior : ubyte
 {
     /// Whitespace is skipped
     skip,
     /// Whitespace is treated as a token
     include
 }
 /**
 * Configure comment handling behavior
 */
 public enum CommentBehavior : ubyte
 {
     /// Comments are attached to the non-whitespace token that follows them
     attach,
     /// Comments are tokens, and can be returned by calls to the token range's front()
     include
 }
 
 public struct LexerConfig
 {
     string fileName;
     StringBehavior stringBehavior;
     WhitespaceBehavior whitespaceBehavior;
     CommentBehavior commentBehavior;
 }
 
-public auto byToken(R)(R range)
-{
-    LexerConfig config;
-    StringCache* cache = new StringCache;
-    return byToken(range, config, cache);
-}
-
-public auto byToken(R)(R range, StringCache* cache)
-{
-    LexerConfig config;
-    return DLexer!(R)(range, config, cache);
-}
-
-public auto byToken(R)(R range, const LexerConfig config, StringCache* cache)
-{
-    return DLexer!(R)(range, config, cache);
-}
-
-unittest
-{
-    import std.stdio;
-    auto source = cast(ubyte[]) q{ import std.stdio;}c;
-    auto tokens = byToken(source);
-    assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
-        tok!"identifier", tok!";"]));
-    StringBehavior stringBehavior;
-    WhitespaceBehavior whitespaceBehavior;
-    CommentBehavior commentBehavior;
-}
 
 public bool isBasicType(IdType type) nothrow pure @safe
@@ -396,11 +369,9 @@ public bool isProtection(IdType type) pure nothrow @safe
     }
 }
 
-public struct DLexer(R)
+public struct DLexer
 {
     import std.conv;
     import core.vararg;
-    import dpick.buffer.buffer;
 
     private enum pseudoTokenHandlers = [
         "\"", "lexStringLiteral",
@@ -434,53 +405,51 @@ public struct DLexer(R)
         "#line", "lexSpecialTokenSequence"
     ];
 
-    mixin Lexer!(R, IdType, Token, lexIdentifier, staticTokens,
+    mixin Lexer!(IdType, Token, lexIdentifier, staticTokens,
         dynamicTokens, pseudoTokens, pseudoTokenHandlers, possibleDefaultTokens);
 
-    private alias Mark = typeof(range).Mark;
-
-    this(R range, const LexerConfig config, StringCache* cache)
+    this(ubyte[] range, const LexerConfig config, StringCache* cache)
     {
-        this.range = LexerRange!(typeof(buffer(range)))(buffer(range));
+        this.range = LexerRange(range);
         this.config = config;
         this.cache = cache;
         popFront();
     }
 
     private static bool isDocComment(string comment) pure nothrow @safe
     {
         return comment.length >= 3 && (comment[0 .. 3] == "///"
             || comment[0 .. 3] == "/**" || comment[0 .. 3] == "/++");
     }
 
     public void popFront() pure
     {
         _popFront();
         string comment = null;
         switch (front.type)
         {
         case tok!"comment":
             if (config.commentBehavior == CommentBehavior.attach)
             {
                 import std.string;
                 if (isDocComment(front.text))
                     comment = comment == null ? front.text : format("%s\n%s", comment, front.text);
                 do _popFront(); while (front == tok!"comment");
                 if (front == tok!"whitespace") goto case tok!"whitespace";
             }
             break;
         case tok!"whitespace":
             if (config.whitespaceBehavior == WhitespaceBehavior.skip)
             {
                 do _popFront(); while (front == tok!"whitespace");
                 if (front == tok!"comment") goto case tok!"comment";
             }
             break;
         default:
             break;
         }
         _front.comment = comment;
     }
 
     bool isWhitespace() pure /*const*/ nothrow
@@ -493,7 +462,7 @@ public struct DLexer(R)
         case '\t':
             return true;
         case 0xe2:
-            auto peek = range.lookahead(2);
+            auto peek = range.peek(2);
             return peek.length == 2
                 && peek[0] == 0x80
                 && (peek[1] == 0xa8 || peek[1] == 0xa9);
@@ -521,7 +490,7 @@ public struct DLexer(R)
             range.incrementLine();
             return;
         case 0xe2:
-            auto lookahead = range.lookahead(3);
+            auto lookahead = range.peek(3);
             if (lookahead.length == 3 && lookahead[1] == 0x80
                 && (lookahead[2] == 0xa8 || lookahead[2] == 0xa9))
             {
@@ -564,7 +533,7 @@ public struct DLexer(R)
             range.popFront();
             break;
         case 0xe2:
-            auto lookahead = range.lookahead(3);
+            auto lookahead = range.peek(3);
             if (lookahead.length != 3)
                 break loop;
             if (lookahead[1] != 0x80)
@@ -590,10 +559,10 @@ public struct DLexer(R)
     Token lexNumber() pure nothrow
     {
         mixin (tokenStart);
-        auto lookahead = range.lookahead(2);
-        if (range.front == '0' && lookahead.length == 2)
+        if (range.canPeek(1) && range.front == '0')
         {
-            switch (lookahead[1])
+            auto ahead = range.peek(1)[1];
+            switch (ahead)
             {
             case 'x':
             case 'X':
@@ -619,7 +588,7 @@ public struct DLexer(R)
         return lexHex(mark, line, column, index);
     }
 
-    Token lexHex(Mark mark, size_t line, size_t column, size_t index) pure nothrow
+    Token lexHex(size_t mark, size_t line, size_t column, size_t index) pure nothrow
     {
         IdType type = tok!"intLiteral";
         bool foundDot;
@@ -654,7 +623,7 @@ public struct DLexer(R)
         case '.':
             if (foundDot)
                 break hexLoop;
-            if (range.lookahead(1).length && range.lookahead(1)[0] == '.')
+            if (range.peek(1).length && range.peek(1)[0] == '.')
                 break hexLoop;
             range.popFront();
             foundDot = true;
@@ -674,7 +643,7 @@ public struct DLexer(R)
         return lexBinary(mark, line, column, index);
     }
 
-    Token lexBinary(Mark mark, size_t line, size_t column, size_t index) pure nothrow
+    Token lexBinary(size_t mark, size_t line, size_t column, size_t index) pure nothrow
     {
         IdType type = tok!"intLiteral";
         binaryLoop: while (!range.empty)
@@ -699,13 +668,13 @@ public struct DLexer(R)
             index);
     }
 
-    Token lexDecimal()
+    Token lexDecimal() pure nothrow
     {
         mixin (tokenStart);
         return lexDecimal(mark, line, column, index);
     }
 
-    Token lexDecimal(Mark mark, size_t line, size_t column, size_t index) pure nothrow
+    Token lexDecimal(size_t mark, size_t line, size_t column, size_t index) pure nothrow
     {
         bool foundDot = range.front == '.';
         IdType type = tok!"intLiteral";
@@ -748,7 +717,7 @@ public struct DLexer(R)
         case '.':
             if (foundDot)
                 break decimalLoop;
-            auto lookahead = range.lookahead(2);
+            auto lookahead = range.peek(2);
             if (lookahead.length == 2 && lookahead[1] == '.')
                 break decimalLoop;
             else
@@ -1058,7 +1027,7 @@ public struct DLexer(R)
             index);
     }
 
-    void lexStringSuffix(ref IdType type) pure
+    void lexStringSuffix(ref IdType type) pure nothrow
     {
         if (range.empty)
             type = tok!"stringLiteral";
@@ -1076,12 +1045,12 @@ public struct DLexer(R)
 
     Token lexDelimitedString() pure nothrow
     {
         import std.traits;
         mixin (tokenStart);
         range.popFront();
-        Unqual!(ElementEncodingType!R) open;
-        Unqual!(ElementEncodingType!R) close;
+        ubyte open;
+        ubyte close;
         switch (range.front)
         {
         case '<':
@@ -1109,8 +1078,8 @@ public struct DLexer(R)
         }
     }
 
-    Token lexNormalDelimitedString(Mark mark, size_t line, size_t column,
-        size_t index, ElementEncodingType!R open, ElementEncodingType!R close)
+    Token lexNormalDelimitedString(size_t mark, size_t line, size_t column,
+        size_t index, ubyte open, ubyte close)
         pure nothrow
     {
         int depth = 1;
@@ -1144,7 +1113,7 @@ public struct DLexer(R)
         return Token(type, cache.cacheGet(range.slice(mark)), line, column, index);
     }
 
-    Token lexHeredocString(Mark mark, size_t line, size_t column, size_t index)
+    Token lexHeredocString(size_t mark, size_t line, size_t column, size_t index)
         pure nothrow
     {
         import std.regex;
@@ -1158,7 +1127,7 @@ public struct DLexer(R)
         if (isNewline())
         {
             popFrontWhitespaceAware();
-            if (range.lookahead(ident.text.length) == ident.text)
+            if (range.peek(ident.text.length) == ident.text)
             {
                 foreach (i ; 0 .. ident.text.length)
                     range.popFront();
@@ -1395,18 +1364,20 @@ public struct DLexer(R)
     Token lexIdentifier() pure nothrow
     {
         mixin (tokenStart);
-        while (!range.empty && !isSeparating(range.front))
+        uint hash = 0;
+        while (!range.empty && !isSeparating(0))
         {
+            hash = StringCache.hashStep(range.front, hash);
             range.popFront();
         }
-        return Token(tok!"identifier", cache.cacheGet(range.slice(mark)), line,
+        return Token(tok!"identifier", cache.cacheGet(range.slice(mark), hash), line,
             column, index);
     }
 
     Token lexDot() pure nothrow
     {
         mixin (tokenStart);
-        auto lookahead = range.lookahead(1);
+        auto lookahead = range.peek(1);
         if (lookahead.length == 0)
         {
             range.popFront();
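The lexIdentifier change above is the optimization the commit title points at: the identifier's hash is folded byte by byte while scanning, then handed to cacheGet(slice, hash) so StringCache does not have to re-walk the bytes. hashStep applies the same per-byte step as hashBytes ((h ^ sbox[b]) * 3, starting from 0), so the two agree. A hedged illustration (hashBytes is private to StringCache, so treat this as module-internal test code; ident is a hypothetical local):

    const(ubyte)[] ident = cast(const(ubyte)[]) "front";
    uint incremental = 0;
    foreach (b; ident)
        incremental = StringCache.hashStep(b, incremental); // one byte per step
    assert (incremental == StringCache.hashBytes(ident));   // same fold, single pass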
@@ -1447,22 +1418,25 @@ public struct DLexer(R)
     {
         if (range.front == '\n') return true;
         if (range.front == '\r') return true;
-        auto lookahead = range.lookahead(3);
+        auto lookahead = range.peek(3);
         if (lookahead.length == 0) return false;
         if (lookahead == "\u2028" || lookahead == "\u2029")
             return true;
         return false;
     }
 
-    bool isSeparating(ElementType!R c) nothrow pure @safe
+    bool isSeparating(size_t offset) const pure nothrow @safe
     {
+        auto r = range.save();
+        r.popFrontN(offset);
+        auto c = r.front;
         if (c <= 0x2f) return true;
         if (c >= ':' && c <= '@') return true;
         if (c >= '[' && c <= '^') return true;
         if (c >= '{' && c <= '~') return true;
         if (c == '`') return true;
-//        if (c & 0x80 && (range.lookahead(3) == "\u2028"
-//            || range.lookahead(3) == "\u2029")) return true;
+        if (c & 0x80 && (r.peek(3) == "\u2028"
+            || range.peek(3) == "\u2029")) return true;
         return false;
     }
@@ -1470,17 +1444,43 @@ public struct DLexer(R)
         size_t index = range.index;
         size_t column = range.column;
         size_t line = range.line;
-        const mark = range.mark();
+        auto mark = range.mark();
     };
 
-    void error(...) pure {
+    void error(...) pure nothrow @safe {
 
     }
 
-    void warning(...) pure {
+    void warning(...) pure nothrow @safe {
 
     }
 
     StringCache* cache;
     LexerConfig config;
 }
 
+public auto byToken(ubyte[] range)
+{
+    LexerConfig config;
+    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
+    return DLexer(range, config, cache);
+}
+
+public auto byToken(ubyte[] range, StringCache* cache)
+{
+    LexerConfig config;
+    return DLexer(range, config, cache);
+}
+
+public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache)
+{
+    return DLexer(range, config, cache);
+}
+unittest
+{
+    import std.stdio;
+    auto source = cast(ubyte[]) q{ import std.stdio;}c;
+    auto tokens = byToken(source);
+    assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
+        tok!"identifier", tok!";"]));
+}
stdx/lexer.d (246)

@@ -17,8 +17,6 @@ import std.range;
 import std.traits;
 import std.conv;
 import std.math;
-import dpick.buffer.buffer;
-import dpick.buffer.traits;
 
 /**
 * Template for determining the type used for a token type. Selects the smallest
@@ -191,12 +189,13 @@ public:
     mixin (extraFields);
 }
 
-mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
+mixin template Lexer(IDType, Token, alias defaultTokenFunction,
     alias staticTokens, alias dynamicTokens, alias pseudoTokens,
     alias pseudoTokenHandlers, alias possibleDefaultTokens)
 {
     static string generateCaseStatements(string[] tokens, size_t offset = 0)
     {
+        import std.conv;
         string code;
         for (size_t i = 0; i < tokens.length; i++)
         {
@@ -216,9 +215,9 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
                 code ~= generateLeaf(tokens[i], indent ~ " ");
             else
             {
-                code ~= indent ~ " if (range.lookahead(" ~ text(tokens[i].length) ~ ").length == 0)\n";
+                code ~= indent ~ " if (!range.canPeek(" ~ text(tokens[i].length) ~ "))\n";
                 code ~= indent ~ " goto outer_default;\n";
-                code ~= indent ~ " if (range.lookahead(" ~ text(tokens[i].length) ~ ") == \"" ~ escape(tokens[i]) ~ "\")\n";
+                code ~= indent ~ " if (range.peek(" ~ text(tokens[i].length - 1) ~ ") == \"" ~ escape(tokens[i]) ~ "\")\n";
                 code ~= indent ~ " {\n";
                 code ~= generateLeaf(tokens[i], indent ~ " ");
                 code ~= indent ~ " }\n";
@@ -228,11 +227,11 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
             }
             else
             {
-                code ~= indent ~ " if (range.lookahead(" ~ text(offset + 2) ~ ").length == 0)\n";
+                code ~= indent ~ " if (!range.canPeek(" ~ text(offset + 1) ~ "))\n";
                 code ~= indent ~ " {\n";
                 code ~= generateLeaf(tokens[i][0 .. offset + 1], indent ~ " ");
                 code ~= indent ~ " }\n";
-                code ~= indent ~ " switch (range.lookahead(" ~ text(offset + 2) ~ ")[" ~ text(offset + 1) ~ "])\n";
+                code ~= indent ~ " switch (range.peek(" ~ text(offset + 1) ~ ")[" ~ text(offset + 1) ~ "])\n";
                 code ~= indent ~ " {\n";
                 code ~= generateCaseStatements(tokens[i .. j], offset + 1);
                 code ~= indent ~ " default:\n";
@@ -247,6 +246,7 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 
     static string generateLeaf(string token, string indent)
     {
+        import std.conv;
         static assert (pseudoTokenHandlers.length % 2 == 0,
             "Each pseudo-token must have a matching function name.");
         string code;
@@ -262,7 +262,7 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
             code ~= indent ~ "return " ~ pseudoTokenHandlers[pseudoTokenHandlers.countUntil(token) + 1] ~ "();\n";
         else if (possibleDefaultTokens.countUntil(token) >= 0)
         {
-            code ~= indent ~ "if (range.lookahead(" ~ text(token.length + 1) ~ ").length == 0 || isSeparating(range.lookahead(" ~ text(token.length + 1) ~ ")[" ~ text(token.length) ~ "]))\n";
+            code ~= indent ~ "if (!range.canPeek(" ~ text(token.length + 1) ~ ") || isSeparating(" ~ text(token.length) ~ "))\n";
             code ~= indent ~ "{\n";
             if (token.length == 1)
                 code ~= indent ~ " range.popFront();\n";
@@ -278,7 +278,7 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
         return code;
     }
 
-    const(Token) front() pure nothrow const @property
+    ref const(Token) front() pure nothrow const @property
     {
         return _front;
     }
@@ -312,7 +312,21 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
         return retVal;
     }
 
-    Token advance() pure
+    /**
+     * This only exists because the real array() can't be called at compile-time
+     */
+    static string[] stupidToArray(R)(R range)
+    {
+        string[] retVal;
+        foreach (v; range)
+            retVal ~= v;
+        return retVal;
+    }
+
+    enum loopBody = generateCaseStatements(stupidToArray(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens)));
+
+    auto ref Token advance() pure
     {
         if (range.empty)
             return Token(tok!"\0");
@@ -321,54 +335,87 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
         immutable size_t line = range.line;
         lexerLoop: switch (range.front)
         {
-        mixin(generateCaseStatements(stupidToArray(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens))));
-        // pragma(msg, generateCaseStatements(stupidToArray(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens))));
+        mixin(loopBody);
+        /+pragma(msg, loopBody);+/
         outer_default:
         default:
             return defaultTokenFunction();
         }
     }
 
-    /**
-     * This only exists because the real array() can't be called at compile-time
-     */
-    static T[] stupidToArray(R, T = ElementType!R)(R range)
-    {
-        T[] retVal;
-        foreach (v; range)
-            retVal ~= v;
-        return retVal;
-    }
-
-    LexerRange!(typeof(buffer(R.init))) range;
+    LexerRange range;
     Token _front;
 }
 
-struct LexerRange(BufferType) if (isBuffer!BufferType)
+struct LexerRange
 {
-    this(BufferType r)
+    this(const(ubyte)[] bytes, size_t index = 0, size_t column = 1, size_t line = 1) pure nothrow @safe
     {
-        this.range = r;
-        index = 0;
-        column = 1;
-        line = 1;
+        this.bytes = bytes;
+        this.index = index;
+        this.column = column;
+        this.line = line;
     }
 
-    void popFront() pure
+    size_t mark() const nothrow pure @safe
     {
+        return index;
+    }
+
+    void seek(size_t m) nothrow pure @safe
+    {
+        index = m;
+    }
+
+    const(ubyte)[] slice(size_t m) const nothrow pure @safe
+    {
+        return bytes[m .. index];
+    }
+
+    bool empty() const nothrow pure @safe
+    {
+        return index >= bytes.length;
+    }
+
+    ubyte front() const nothrow pure @safe
+    {
+        return bytes[index];
+    }
+
+    const(ubyte)[] peek(size_t p) const nothrow pure @safe
+    {
+        return bytes[index .. index + p + 1];
+    }
+
+    bool canPeek(size_t p) const nothrow pure @safe
+    {
+        return index + p < bytes.length;
+    }
+
+    LexerRange save() const nothrow pure @safe
+    {
+        return LexerRange(bytes, index, column, line);
+    }
+
+    void popFront() pure nothrow @safe
+    {
         index++;
         column++;
-        range.popFront();
     }
 
-    void incrementLine() pure nothrow
+    void popFrontN(size_t n) pure nothrow @safe
+    {
+        index += n;
+    }
+
+    void incrementLine() pure nothrow @safe
     {
         column = 1;
         line++;
     }
 
-    BufferType range;
-    alias range this;
+    const(ubyte)[] bytes;
     size_t index;
     size_t column;
     size_t line;
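The rewritten LexerRange drops the dpick buffer abstraction for a plain const(ubyte)[] plus an index, and the lookahead API changes shape with it: peek(p) slices bytes[index .. index + p + 1], i.e. the current byte plus p bytes of lookahead, and canPeek(p) guards that index + p is still in bounds. A hedged usage sketch of the code above (all locals hypothetical):

    auto r = LexerRange(cast(const(ubyte)[]) "abc");
    assert (r.front == 'a');
    assert (r.peek(1) == cast(const(ubyte)[]) "ab"); // current byte + 1 ahead
    assert (r.canPeek(2));                           // index 2 ('c') exists
    assert (!r.canPeek(3));                          // one past the end
    const m = r.mark();                              // mark is just the index
    r.popFrontN(2);
    assert (r.slice(m) == cast(const(ubyte)[]) "ab"); // bytes[m .. index]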
@@ -388,6 +435,13 @@ struct StringCache
 {
 public:
 
+    @disable this();
+
+    this(size_t bucketCount = defaultBucketCount)
+    {
+        buckets = new Item*[bucketCount];
+    }
+
     /**
     * Equivalent to calling cache() and get().
     * ---
@@ -402,6 +456,11 @@ public:
         return get(cache(bytes));
     }
 
+    string cacheGet(const(ubyte[]) bytes, uint hash) pure nothrow @safe
+    {
+        return get(cache(bytes, hash));
+    }
+
     /**
     * Caches a string.
     * Params: bytes = the string to cache
@@ -416,6 +475,12 @@ public:
     * ---
     */
     size_t cache(const(ubyte)[] bytes) pure nothrow @safe
+    {
+        immutable uint hash = hashBytes(bytes);
+        return cache(bytes, hash);
+    }
+
+    size_t cache(const(ubyte)[] bytes, uint hash) pure nothrow @safe
     in
     {
         assert (bytes.length > 0);
@@ -426,7 +491,7 @@ public:
     }
     body
     {
-        immutable uint hash = hashBytes(bytes);
+        memoryRequested += bytes.length;
         const(Item)* found = find(bytes, hash);
         if (found is null)
             return intern(bytes, hash);
@@ -453,23 +518,58 @@ public:
         return items[index].str;
     }
 
+    void printStats()
+    {
+        import std.stdio;
+        writeln("Load Factor: ", cast(float) items.length / cast(float) buckets.length);
+        writeln("Memory used by blocks: ", blocks.length * blockSize);
+        writeln("Memory requsted: ", memoryRequested);
+        writeln("rehashes: ", rehashCount);
+    }
+
+    static uint hashStep(ubyte b, uint h) pure nothrow @safe
+    {
+        return (h ^ sbox[b]) * 3;
+    }
+
+    static enum defaultBucketCount = 2048;
+
 private:
 
-    size_t intern(const(ubyte)[] bytes, uint hash) pure nothrow @safe
+    private void rehash() pure nothrow @safe
     {
-        Item* item = new Item;
-        item.hash = hash;
-        item.str = allocate(bytes);
+        immutable size_t newBucketCount = items.length * 2;
+        buckets = new Item*[newBucketCount];
+        rehashCount++;
+        foreach (item; items)
+        {
+            immutable size_t newIndex = item.hash % newBucketCount;
+            item.next = buckets[newIndex];
+            buckets[newIndex] = item;
+        }
+    }
+
+    size_t intern(const(ubyte)[] bytes, uint hash) pure nothrow @trusted
+    {
+        ubyte[] mem = allocate(bytes.length);
+        mem[] = bytes[];
+        Item* item = cast(Item*) allocate(Item.sizeof).ptr;
         item.index = items.length;
+        item.str = cast(string) mem;
+        item.hash = hash;
+        item.next = buckets[hash % buckets.length];
+        immutable bool checkLoadFactor = item.next !is null;
+        buckets[hash % buckets.length] = item;
         items ~= item;
-        buckets[hash % buckets.length] ~= item;
+        if (checkLoadFactor && (cast(float) items.length / cast(float) buckets.length) > 0.75)
+            rehash();
         return item.index;
     }
 
     const(Item)* find(const(ubyte)[] bytes, uint hash) pure nothrow const @safe
     {
         immutable size_t index = hash % buckets.length;
-        foreach (item; buckets[index])
+        for (const(Item)* item = buckets[index]; item !is null; item = item.next)
         {
             if (item.hash == hash && bytes.equal(item.str))
                 return item;
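The storage change here: buckets goes from a fixed-size table of per-bucket arrays (Item*[][bucketCount]) to singly linked chains through the new next pointer on Item, and rehash() grows the bucket array to twice the item count once a collision is observed and the load factor exceeds 0.75. The public interning API is unchanged apart from the explicit constructor; a hedged usage sketch (name is a hypothetical local):

    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
    const(ubyte)[] name = cast(const(ubyte)[]) "identifier";
    size_t id = cache.cache(name);    // hash, then chain into a bucket
    string a = cache.get(id);         // stable interned copy
    string b = cache.cacheGet(name);  // find() walks the chain to the same Item
    assert (a is b);                  // identical interned memory, not a copy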
@@ -477,53 +577,46 @@ private:
         return null;
     }
 
-    string allocate(const(ubyte)[] bytes) pure nothrow @trusted
-    out (retVal)
-    {
-        assert (retVal == bytes);
-    }
-    body
+    ubyte[] allocate(size_t byteCount) pure nothrow @trusted
     {
         import core.memory;
-        if (bytes.length > (pageSize / 4))
+        if (byteCount > (blockSize / 4))
         {
-            ubyte* memory = cast(ubyte*) GC.malloc(bytes.length, GC.BlkAttr.NO_SCAN);
-            memory[0 .. bytes.length] = bytes[];
-            return cast(string) memory[0..bytes.length];
+            ubyte* mem = cast(ubyte*) GC.malloc(byteCount, GC.BlkAttr.NO_SCAN);
+            return mem[0 .. byteCount];
         }
         foreach (ref block; blocks)
         {
-            immutable size_t endIndex = block.used + bytes.length;
-            if (endIndex > block.bytes.length)
+            immutable size_t oldUsed = block.used;
+            immutable size_t end = oldUsed + byteCount;
+            if (end > block.bytes.length)
                 continue;
-            block.bytes[block.used .. endIndex] = bytes[];
-            string slice = cast(string) block.bytes[block.used .. endIndex];
-            block.used = endIndex;
-            return slice;
+            block.used = end;
+            return block.bytes[oldUsed .. end];
         }
-        blocks.length = blocks.length + 1;
-        blocks[$ - 1].bytes = (cast(ubyte*) GC.malloc(pageSize, GC.BlkAttr.NO_SCAN))[0 .. pageSize];
-        blocks[$ - 1].bytes[0 .. bytes.length] = bytes[];
-        blocks[$ - 1].used = bytes.length;
-        return cast(string) blocks[$ - 1].bytes[0 .. bytes.length];
+        blocks ~= Block(
+            (cast(ubyte*) GC.malloc(blockSize, GC.BlkAttr.NO_SCAN))[0 .. blockSize],
+            byteCount);
+        return blocks[$ - 1].bytes[0 .. byteCount];
     }
 
     static uint hashBytes(const(ubyte)[] data) pure nothrow @safe
     {
         uint hash = 0;
         foreach (b; data)
         {
            hash ^= sbox[b];
            hash *= 3;
         }
         return hash;
     }
 
     static struct Item
     {
         size_t index;
         string str;
         uint hash;
+        Item* next;
     }
 
     static struct Block
@@ -532,10 +625,9 @@ private:
         size_t used;
     }
 
-    static enum pageSize = 4096 * 1024;
-    static enum bucketCount = 2048;
+    static enum blockSize = 1024 * 16;
 
-    static enum uint[] sbox = [
+    public static immutable uint[] sbox = [
         0xF53E1837, 0x5F14C86B, 0x9EE3964C, 0xFA796D53,
         0x32223FC3, 0x4D82BC98, 0xA0C7FA62, 0x63E2C982,
         0x24994A5B, 0x1ECE7BEE, 0x292B38EF, 0xD5CD4E56,
@@ -603,6 +695,8 @@ private:
     ];
 
     Item*[] items;
-    Item*[][bucketCount] buckets;
+    Item*[] buckets;
     Block[] blocks;
+    size_t memoryRequested;
+    uint rehashCount;
 }