From c01c51a61ea4f2e6a94a64396be0abd9e8249bab Mon Sep 17 00:00:00 2001
From: Hackerpilot
Date: Sun, 19 Jan 2014 23:13:13 -0800
Subject: [PATCH] Back-end cleanup and optimization in the lexer

---
 .gitmodules    |   3 -
 build.sh       |  40 ++++++--
 ctags.d        |   2 +-
 datapicked     |   1 -
 main.d         |  14 +--
 perftest.sh    |   9 --
 stats.d        |   9 +-
 stdx/d/lexer.d | 252 ++++++++++++++++++++++++-------------------
 stdx/lexer.d   | 246 ++++++++++++++++++++++++++++++++---------------
 9 files changed, 344 insertions(+), 232 deletions(-)
 delete mode 100644 .gitmodules
 delete mode 160000 datapicked
 delete mode 100755 perftest.sh

diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index 3a7a14f..0000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "datapicked"]
-	path = datapicked
-	url = https://github.com/blackwhale/datapicked.git
diff --git a/build.sh b/build.sh
index be987c0..3a81079 100755
--- a/build.sh
+++ b/build.sh
@@ -1,4 +1,3 @@
-#dmd *.d stdx/d/*.d -release -inline -noboundscheck -O -w -wi -m64 -property -ofdscanner-dmd
 dmd\
 	main.d\
 	stats.d\
@@ -11,9 +10,36 @@ dmd\
 	style.d\
 	stdx/*.d\
 	stdx/d/*.d\
-	datapicked/dpick/buffer/*.d\
-	-Idatapicked\
-	-g -m64 -wi -ofdscanner
-#ldc2 main.d stats.d imports.d highlighter.d ctags.d astprinter.d formatter.d outliner.d stdx/*.d stdx/d/*.d -of=dscanner-ldc -m64 -oq
-#ldc2 *.d stdx/d/*.d -of=dscanner -unittest -m64 -g
-#/opt/gdc/bin/gdc -O3 -odscanner-gdc -fno-bounds-check -frelease -m64 *.d stdx/d/*.d
+	-ofdscanner\
+	-m64\
+	-O -release -noboundscheck
+
+#gdc\
+#	main.d\
+#	stats.d\
+#	imports.d\
+#	highlighter.d\
+#	ctags.d\
+#	astprinter.d\
+#	formatter.d\
+#	outliner.d\
+#	style.d\
+#	stdx/*.d\
+#	stdx/d/*.d\
+#	-O3 -frelease -fno-bounds-check\
+#	-odscanner\
+
+#ldc2\
+#	main.d\
+#	stats.d\
+#	imports.d\
+#	highlighter.d\
+#	ctags.d\
+#	astprinter.d\
+#	formatter.d\
+#	outliner.d\
+#	style.d\
+#	stdx/*.d\
+#	stdx/d/*.d\
+#	-O3 -release\
+#	-oq -of=dscanner\
diff --git a/ctags.d b/ctags.d
index a83a574..37677da 100644
--- a/ctags.d
+++ b/ctags.d
@@ -20,7 +20,7 @@ void printCtags(File output, string[] fileNames)
 {
 	string[] tags;
 	LexerConfig config;
-	StringCache* cache = new StringCache;
+	StringCache* cache = new StringCache(StringCache.defaultBucketCount);
 	foreach (fileName; fileNames)
 	{
 		File f = File(fileName);
diff --git a/datapicked b/datapicked
deleted file mode 160000
index f63a843..0000000
--- a/datapicked
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit f63a843e9c0ce8db7fd897684fe323697255d87d
diff --git a/main.d b/main.d
index 9e2b818..41d2210 100644
--- a/main.d
+++ b/main.d
@@ -10,15 +10,12 @@
 import std.array;
 import std.conv;
 import std.file;
 import std.getopt;
-import std.parallelism;
 import std.path;
-import std.regex;
 import std.stdio;
 import std.range;
 import stdx.lexer;
 import stdx.d.lexer;
 import stdx.d.parser;
-import dpick.buffer.buffer;
 import highlighter;
 import stats;
@@ -93,7 +90,7 @@ int main(string[] args)
 		return 1;
 	}
 
-	StringCache* cache = new StringCache;
+	StringCache* cache = new StringCache(StringCache.defaultBucketCount);
 
 	if (tokenDump || highlight)
 	{
@@ -151,13 +148,16 @@ int main(string[] args)
 		foreach (f; expandArgs(args, recursive))
 		{
 			import core.memory;
-			GC.disable();
-			auto tokens = byToken!(ubyte[])(readFile(f));
+			LexerConfig config;
+			config.whitespaceBehavior = WhitespaceBehavior.skip;
+			config.stringBehavior = StringBehavior.source;
+			config.commentBehavior = CommentBehavior.include;
+			auto tokens = byToken(readFile(f), config, cache);
 			if (tokenCount)
 				count += printTokenCount(stdout, f, tokens);
 			else
 				count += printLineCount(stdout, f, tokens);
-			GC.enable();
+			cache.printStats();
 		}
 		writefln("total:\t%d", count);
 	}
diff --git a/perftest.sh b/perftest.sh
deleted file mode 100755
index 1b78e6a..0000000
--- a/perftest.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-echo -e "file\tstd.d.lexer dmd\tstd.d.lexer ldc\tstd.d.lexer gdc\tdmd"
-for i in $(ls ../phobos/std/*.d); do
-	f=$(echo $i | sed "s/.*phobos\///")
-	dmdt=$(avgtime -q -r 200 ./dscanner-dmd --tokenCount $i | grep "Median" | sed "s/.*: //")
-	ldct=$(avgtime -q -r 200 ./dscanner-ldc --tokenCount $i | grep "Median" | sed "s/.*: //")
-	gdct=$(avgtime -q -r 200 ./dscanner-gdc --tokenCount $i | grep "Median" | sed "s/.*: //")
-	gcct=$(avgtime -q -r 200 ~/src/dmd-lexer/src/dmd $i | grep "Median" | sed "s/.*: //")
-	echo -e "${f}\t${dmdt}\t${ldct}\t${gdct}\t${gcct}"
-done
diff --git a/stats.d b/stats.d
index ee55ccb..d4a65c4 100644
--- a/stats.d
+++ b/stats.d
@@ -32,7 +32,12 @@ pure nothrow bool isLineOfCode(IdType t)
 
 ulong printTokenCount(Tokens)(File output, string fileName, ref Tokens tokens)
 {
-	ulong c = tokens.count!(a => true);
+
+	ulong c;
+	foreach (ref t; tokens)
+	{
+		c++;
+	}
 	output.writefln("%s:\t%d", fileName, c);
 	return c;
 }
@@ -40,7 +45,7 @@ ulong printTokenCount(Tokens)(File output, string fileName, ref Tokens tokens)
 ulong printLineCount(Tokens)(File output, string fileName, ref Tokens tokens)
 {
 	ulong count;
-	foreach (t; tokens)
+	foreach (ref t; tokens)
 	{
 		if (isLineOfCode(t.type))
 			++count;
diff --git a/stdx/d/lexer.d b/stdx/d/lexer.d
index 2052815..6cb620e 100644
--- a/stdx/d/lexer.d
+++ b/stdx/d/lexer.d
@@ -57,13 +57,13 @@ public template tok(string token)
 {
 	alias tok = TokenId!(IdType, staticTokens, dynamicTokens,
 		possibleDefaultTokens, token);
 }
 private enum extraFields = q{
-    string comment;
-    int opCmp(size_t i) const pure nothrow @safe {
-        if (index < i) return -1;
-        if (index > i) return 1;
-        return 0;
-    }
+	string comment;
+	int opCmp(size_t i) const pure nothrow @safe {
+		if (index < i) return -1;
+		if (index > i) return 1;
+		return 0;
+	}
 };
 public alias Token = stdx.lexer.TokenStructure!(IdType, extraFields);
 /**
  */
 public enum StringBehavior : ubyte
 {
-    /// Do not include quote characters, process escape sequences
-    compiler = 0b0000_0000,
-    /// Opening quotes, closing quotes, and string suffixes are included in the
-    /// string token
-    includeQuoteChars = 0b0000_0001,
-    /// String escape sequences are not replaced
-    notEscaped = 0b0000_0010,
-    /// Not modified at all. Useful for formatters or highlighters
-    source = includeQuoteChars | notEscaped
+	/// Do not include quote characters, process escape sequences
+	compiler = 0b0000_0000,
+	/// Opening quotes, closing quotes, and string suffixes are included in the
+	/// string token
+	includeQuoteChars = 0b0000_0001,
+	/// String escape sequences are not replaced
+	notEscaped = 0b0000_0010,
+	/// Not modified at all. Useful for formatters or highlighters
+	source = includeQuoteChars | notEscaped
 }
 /**
@@ -88,55 +88,28 @@
  */
 public enum WhitespaceBehavior : ubyte
 {
-    /// Whitespace is skipped
-    skip,
-    /// Whitespace is treated as a token
-    include
+	/// Whitespace is skipped
+	skip,
+	/// Whitespace is treated as a token
+	include
 }
 /**
  * Configure comment handling behavior
  */
 public enum CommentBehavior : ubyte
 {
-    /// Comments are attached to the non-whitespace token that follows them
-    attach,
-    /// Comments are tokens, and can be returned by calls to the token range's front()
-    include
+	/// Comments are attached to the non-whitespace token that follows them
+	attach,
+	/// Comments are tokens, and can be returned by calls to the token range's front()
+	include
 }
 
 public struct LexerConfig
 {
 	string fileName;
-    StringBehavior stringBehavior;
-    WhitespaceBehavior whitespaceBehavior;
-    CommentBehavior commentBehavior;
-}
-
-public auto byToken(R)(R range)
-{
-	LexerConfig config;
-	StringCache* cache = new StringCache;
-	return byToken(range, config, cache);
-}
-
-public auto byToken(R)(R range, StringCache* cache)
-{
-	LexerConfig config;
-	return DLexer!(R)(range, config, cache);
-}
-
-public auto byToken(R)(R range, const LexerConfig config, StringCache* cache)
-{
-	return DLexer!(R)(range, config, cache);
-}
-
-unittest
-{
-	import std.stdio;
-	auto source = cast(ubyte[]) q{ import std.stdio;}c;
-	auto tokens = byToken(source);
-	assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
-		tok!"identifier", tok!";"]));
+	StringBehavior stringBehavior;
+	WhitespaceBehavior whitespaceBehavior;
+	CommentBehavior commentBehavior;
 }
 
 public bool isBasicType(IdType type) nothrow pure @safe
@@ -396,11 +369,9 @@ public bool isProtection(IdType type) pure nothrow @safe
 	}
 }
 
-public struct DLexer(R)
+public struct DLexer
 {
-	import std.conv;
 	import core.vararg;
-	import dpick.buffer.buffer;
 
 	private enum pseudoTokenHandlers = [
 		"\"", "lexStringLiteral",
 		"`", "lexWysiwygString",
 		"//", "lexSlashSlashComment",
 		"/*", "lexSlashStarComment",
 		"/+", "lexSlashPlusComment",
 		".", "lexDot",
 		"'", "lexCharacterLiteral",
 		"0", "lexNumber",
 		"1", "lexNumber",
 		"2", "lexNumber",
 		"3", "lexNumber",
 		"4", "lexNumber",
 		"5", "lexNumber",
 		"6", "lexNumber",
 		"7", "lexNumber",
 		"8", "lexNumber",
 		"9", "lexNumber",
 		"q\"", "lexStringLiteral",
 		"q{", "lexTokenString",
 		"r\"", "lexWysiwygString",
 		"x\"", "lexHexString",
 		" ", "lexWhitespace",
 		"\t", "lexWhitespace",
 		"\r", "lexWhitespace",
 		"\n", "lexWhitespace",
 		"\u2028", "lexLongNewline",
 		"\u2029", "lexLongNewline",
 		"#!", "lexScriptLine",
 		"#line", "lexSpecialTokenSequence"
 	];
 
-	mixin Lexer!(R, IdType, Token, lexIdentifier, staticTokens,
+	mixin Lexer!(IdType, Token, lexIdentifier, staticTokens,
 		dynamicTokens, pseudoTokens, pseudoTokenHandlers, possibleDefaultTokens);
 
-	private alias Mark = typeof(range).Mark;
-
-	this(R range, const LexerConfig config, StringCache* cache)
+	this(ubyte[] range, const LexerConfig config, StringCache* cache)
 	{
-		this.range = LexerRange!(typeof(buffer(range)))(buffer(range));
-		this.config = config;
+		this.range = LexerRange(range);
+		this.config = config;
 		this.cache = cache;
-		popFront();
+		popFront();
 	}
 
-	private static bool isDocComment(string comment) pure nothrow @safe
-	{
-		return comment.length >= 3 && (comment[0 .. 3] == "///"
-			|| comment[0 .. 3] == "/**" || comment[0 .. 3] == "/++");
-	}
+	private static bool isDocComment(string comment) pure nothrow @safe
+	{
+		return comment.length >= 3 && (comment[0 .. 3] == "///"
+			|| comment[0 .. 3] == "/**" || comment[0 .. 3] == "/++");
+	}
 
-	public void popFront() pure
-	{
-		_popFront();
-		string comment = null;
-		switch (front.type)
-		{
-		case tok!"comment":
-			if (config.commentBehavior == CommentBehavior.attach)
-			{
-				import std.string;
-				if (isDocComment(front.text))
-					comment = comment == null ?
-						front.text : format("%s\n%s", comment, front.text);
-				do _popFront(); while (front == tok!"comment");
-				if (front == tok!"whitespace") goto case tok!"whitespace";
-			}
-			break;
-		case tok!"whitespace":
-			if (config.whitespaceBehavior == WhitespaceBehavior.skip)
-			{
-				do _popFront(); while (front == tok!"whitespace");
-				if (front == tok!"comment") goto case tok!"comment";
-			}
-			break;
-		default:
-			break;
-		}
-		_front.comment = comment;
-	}
+	public void popFront() pure
+	{
+		_popFront();
+		string comment = null;
+		switch (front.type)
+		{
+		case tok!"comment":
+			if (config.commentBehavior == CommentBehavior.attach)
+			{
+				import std.string;
+				if (isDocComment(front.text))
+					comment = comment == null ? front.text : format("%s\n%s", comment, front.text);
+				do _popFront(); while (front == tok!"comment");
+				if (front == tok!"whitespace") goto case tok!"whitespace";
+			}
+			break;
+		case tok!"whitespace":
+			if (config.whitespaceBehavior == WhitespaceBehavior.skip)
+			{
+				do _popFront(); while (front == tok!"whitespace");
+				if (front == tok!"comment") goto case tok!"comment";
+			}
+			break;
+		default:
+			break;
+		}
+		_front.comment = comment;
+	}
 
 	bool isWhitespace() pure /*const*/ nothrow
 	{
 		switch (range.front)
 		{
 		case ' ':
 		case '\r':
 		case '\t':
 			return true;
 		case 0xe2:
-			auto peek = range.lookahead(2);
+			auto peek = range.peek(2);
 			return peek.length == 2 && peek[0] == 0x80
 				&& (peek[1] == 0xa8 || peek[1] == 0xa9);
@@ -521,7 +490,7 @@
 			range.incrementLine();
 			return;
 		case 0xe2:
-			auto lookahead = range.lookahead(3);
+			auto lookahead = range.peek(3);
 			if (lookahead.length == 3 && lookahead[1] == 0x80
 				&& (lookahead[2] == 0xa8 || lookahead[2] == 0xa9))
 			{
@@ -564,7 +533,7 @@
 				range.popFront();
 				break;
 			case 0xe2:
-				auto lookahead = range.lookahead(3);
+				auto lookahead = range.peek(3);
 				if (lookahead.length != 3)
 					break loop;
 				if (lookahead[1] != 0x80)
@@ -590,10 +559,10 @@
 	Token lexNumber() pure nothrow
 	{
 		mixin (tokenStart);
-		auto lookahead = range.lookahead(2);
-		if (range.front == '0' && lookahead.length == 2)
+		if (range.canPeek(1) && range.front == '0')
 		{
-			switch (lookahead[1])
+			auto ahead = range.peek(1)[1];
+			switch (ahead)
 			{
 			case 'x':
 			case 'X':
@@ -619,7 +588,7 @@
 		return lexHex(mark, line, column, index);
 	}
 
-	Token lexHex(Mark mark, size_t line, size_t column, size_t index) pure nothrow
+	Token lexHex(size_t mark, size_t line, size_t column, size_t index) pure nothrow
 	{
 		IdType type = tok!"intLiteral";
 		bool foundDot;
@@ -654,7 +623,7 @@
 			case '.':
 				if (foundDot)
 					break hexLoop;
-				if (range.lookahead(1).length && range.lookahead(1)[0] == '.')
+				if (range.peek(1).length && range.peek(1)[0] == '.')
 					break hexLoop;
 				range.popFront();
 				foundDot = true;
@@ -674,7 +643,7 @@
 		return lexBinary(mark, line, column, index);
 	}
 
-	Token lexBinary(Mark mark, size_t line, size_t column, size_t index) pure nothrow
+	Token lexBinary(size_t mark, size_t line, size_t column, size_t index) pure nothrow
 	{
 		IdType type = tok!"intLiteral";
 		binaryLoop: while (!range.empty)
@@ -699,13 +668,13 @@
 			index);
 	}
 
-	Token lexDecimal()
+	Token lexDecimal() pure nothrow
 	{
 		mixin (tokenStart);
 		return lexDecimal(mark, line, column, index);
 	}
 
-	Token lexDecimal(Mark mark, size_t line, size_t column, size_t index) pure nothrow
+	Token lexDecimal(size_t mark, size_t line, size_t column, size_t index) pure nothrow
 	{
 		bool foundDot = range.front == '.';
 		IdType type = tok!"intLiteral";
@@ -748,7 +717,7 @@ public struct DLexer(R)
 		case '.':
 			if (foundDot) break decimalLoop;
-			auto lookahead = range.lookahead(2);
+			auto lookahead = range.peek(2);
 			if (lookahead.length == 2 && lookahead[1] == '.')
 				break decimalLoop;
 			else
@@ -1058,7 +1027,7 @@ public struct DLexer(R)
 			index);
 	}
 
-	void lexStringSuffix(ref IdType type) pure
+	void lexStringSuffix(ref IdType type) pure nothrow
 	{
 		if (range.empty)
 			type = tok!"stringLiteral";
@@ -1076,12 +1045,12 @@ public struct DLexer(R)
 
 	Token lexDelimitedString() pure nothrow
 	{
-        import std.traits;
+		import std.traits;
 		mixin (tokenStart);
 		range.popFront();
 		range.popFront();
-		Unqual!(ElementEncodingType!R) open;
-		Unqual!(ElementEncodingType!R) close;
+		ubyte open;
+		ubyte close;
 		switch (range.front)
 		{
 		case '<':
@@ -1109,8 +1078,8 @@ public struct DLexer(R)
 		}
 	}
 
-	Token lexNormalDelimitedString(Mark mark, size_t line, size_t column,
-		size_t index, ElementEncodingType!R open, ElementEncodingType!R close)
+	Token lexNormalDelimitedString(size_t mark, size_t line, size_t column,
+		size_t index, ubyte open, ubyte close)
 		pure nothrow
 	{
 		int depth = 1;
@@ -1144,7 +1113,7 @@ public struct DLexer(R)
 		return Token(type, cache.cacheGet(range.slice(mark)), line, column, index);
 	}
 
-	Token lexHeredocString(Mark mark, size_t line, size_t column, size_t index)
+	Token lexHeredocString(size_t mark, size_t line, size_t column, size_t index)
 		pure nothrow
 	{
 		import std.regex;
@@ -1158,7 +1127,7 @@ public struct DLexer(R)
 		if (isNewline())
 		{
 			popFrontWhitespaceAware();
-			if (range.lookahead(ident.text.length) == ident.text)
+			if (range.peek(ident.text.length) == ident.text)
 			{
 				foreach (i ; 0 .. ident.text.length)
 					range.popFront();
@@ -1395,18 +1364,20 @@ public struct DLexer(R)
 	Token lexIdentifier() pure nothrow
 	{
 		mixin (tokenStart);
-		while (!range.empty && !isSeparating(range.front))
+		uint hash = 0;
+		while (!range.empty && !isSeparating(0))
 		{
+			hash = StringCache.hashStep(range.front, hash);
 			range.popFront();
 		}
-		return Token(tok!"identifier", cache.cacheGet(range.slice(mark)), line,
+		return Token(tok!"identifier", cache.cacheGet(range.slice(mark), hash), line,
 			column, index);
 	}
 
 	Token lexDot() pure nothrow
 	{
 		mixin (tokenStart);
-		auto lookahead = range.lookahead(1);
+		auto lookahead = range.peek(1);
 		if (lookahead.length == 0)
 		{
 			range.popFront();
@@ -1447,22 +1418,25 @@ public struct DLexer(R)
 	{
 		if (range.front == '\n') return true;
 		if (range.front == '\r') return true;
-		auto lookahead = range.lookahead(3);
+		auto lookahead = range.peek(3);
 		if (lookahead.length == 0) return false;
 		if (lookahead == "\u2028" || lookahead == "\u2029") return true;
 		return false;
 	}
 
-	bool isSeparating(ElementType!R c) nothrow pure @safe
+	bool isSeparating(size_t offset) const pure nothrow @safe
 	{
+		auto r = range.save();
+		r.popFrontN(offset);
+		auto c = r.front;
 		if (c <= 0x2f) return true;
 		if (c >= ':' && c <= '@') return true;
 		if (c >= '[' && c <= '^') return true;
 		if (c >= '{' && c <= '~') return true;
 		if (c == '`') return true;
-//		if (c & 0x80 && (range.lookahead(3) == "\u2028"
-//			|| range.lookahead(3) == "\u2029")) return true;
+		if (c & 0x80 && (r.peek(3) == "\u2028"
+			|| r.peek(3) == "\u2029")) return true;
 		return false;
 	}
 
@@ -1470,17 +1444,43 @@ public struct DLexer(R)
 		size_t index = range.index;
 		size_t column = range.column;
 		size_t line = range.line;
-		const mark = range.mark();
+		auto mark = range.mark();
 	};
 
-	void error(...) pure {
+	void error(...) pure nothrow @safe {
 
 	}
 
-	void warning(...) pure {
+	void warning(...) pure nothrow @safe {
 
 	}
 
 	StringCache* cache;
 	LexerConfig config;
 }
+
+public auto byToken(ubyte[] range)
+{
+	LexerConfig config;
+	StringCache* cache = new StringCache(StringCache.defaultBucketCount);
+	return DLexer(range, config, cache);
+}
+
+public auto byToken(ubyte[] range, StringCache* cache)
+{
+	LexerConfig config;
+	return DLexer(range, config, cache);
+}
+
+public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache)
+{
+	return DLexer(range, config, cache);
+}
+unittest
+{
+	import std.stdio;
+	auto source = cast(ubyte[]) q{ import std.stdio;}c;
+	auto tokens = byToken(source);
+	assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
+		tok!"identifier", tok!";"]));
+}
diff --git a/stdx/lexer.d b/stdx/lexer.d
index 7f23be4..c8cd8f4 100644
--- a/stdx/lexer.d
+++ b/stdx/lexer.d
@@ -17,8 +17,6 @@ import std.range;
 import std.traits;
 import std.conv;
 import std.math;
-import dpick.buffer.buffer;
-import dpick.buffer.traits;
 
 /**
  * Template for determining the type used for a token type. Selects the smallest
@@ -191,12 +189,13 @@ public:
 	mixin (extraFields);
 }
 
-mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
+mixin template Lexer(IDType, Token, alias defaultTokenFunction,
 	alias staticTokens, alias dynamicTokens, alias pseudoTokens,
 	alias pseudoTokenHandlers, alias possibleDefaultTokens)
 {
 	static string generateCaseStatements(string[] tokens, size_t offset = 0)
 	{
+		import std.conv;
 		string code;
 		for (size_t i = 0; i < tokens.length; i++)
 		{
@@ -216,9 +215,9 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 				code ~= generateLeaf(tokens[i], indent ~ "  ");
 			else
 			{
-				code ~= indent ~ "    if (range.lookahead(" ~ text(tokens[i].length) ~ ").length == 0)\n";
+				code ~= indent ~ "    if (!range.canPeek(" ~ text(tokens[i].length) ~ "))\n";
 				code ~= indent ~ "        goto outer_default;\n";
-				code ~= indent ~ "    if (range.lookahead(" ~ text(tokens[i].length) ~ ") == \"" ~ escape(tokens[i]) ~ "\")\n";
+				code ~= indent ~ "    if (range.peek(" ~ text(tokens[i].length - 1) ~ ") == \"" ~ escape(tokens[i]) ~ "\")\n";
 				code ~= indent ~ "    {\n";
 				code ~= generateLeaf(tokens[i], indent ~ "        ");
 				code ~= indent ~ "    }\n";
@@ -228,11 +227,11 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 			}
 			else
 			{
-				code ~= indent ~ "    if (range.lookahead(" ~ text(offset + 2) ~ ").length == 0)\n";
+				code ~= indent ~ "    if (!range.canPeek(" ~ text(offset + 1) ~ "))\n";
 				code ~= indent ~ "    {\n";
 				code ~= generateLeaf(tokens[i][0 .. offset + 1], indent ~ "        ");
 				code ~= indent ~ "    }\n";
-				code ~= indent ~ "    switch (range.lookahead(" ~ text(offset + 2) ~ ")[" ~ text(offset + 1) ~ "])\n";
+				code ~= indent ~ "    switch (range.peek(" ~ text(offset + 1) ~ ")[" ~ text(offset + 1) ~ "])\n";
 				code ~= indent ~ "    {\n";
 				code ~= generateCaseStatements(tokens[i .. j], offset + 1);
 				code ~= indent ~ "    default:\n";
@@ -247,6 +246,7 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 
 	static string generateLeaf(string token, string indent)
 	{
+		import std.conv;
 		static assert (pseudoTokenHandlers.length % 2 == 0,
 			"Each pseudo-token must have a matching function name.");
 		string code;
@@ -262,7 +262,7 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 			code ~= indent ~ "return " ~ pseudoTokenHandlers[pseudoTokenHandlers.countUntil(token) + 1] ~ "();\n";
 		else if (possibleDefaultTokens.countUntil(token) >= 0)
 		{
-			code ~= indent ~ "if (range.lookahead(" ~ text(token.length + 1) ~ ").length == 0 || isSeparating(range.lookahead(" ~ text(token.length + 1) ~ ")[" ~ text(token.length) ~ "]))\n";
+			code ~= indent ~ "if (!range.canPeek(" ~ text(token.length + 1) ~ ") || isSeparating(" ~ text(token.length) ~ "))\n";
 			code ~= indent ~ "{\n";
 			if (token.length == 1)
 				code ~= indent ~ "    range.popFront();\n";
@@ -278,7 +278,7 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 		return code;
 	}
 
-	const(Token) front() pure nothrow const @property
+	ref const(Token) front() pure nothrow const @property
 	{
 		return _front;
 	}
@@ -312,7 +312,21 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 		return retVal;
 	}
 
-	Token advance() pure
+	/**
+	 * This only exists because the real array() can't be called at compile-time
+	 */
+	static string[] stupidToArray(R)(R range)
+	{
+		string[] retVal;
+		foreach (v; range)
+			retVal ~= v;
+		return retVal;
+	}
+
+
+	enum loopBody = generateCaseStatements(stupidToArray(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens)));
+
+	auto ref Token advance() pure
 	{
 		if (range.empty)
 			return Token(tok!"\0");
@@ -321,54 +335,87 @@ mixin template Lexer(R, IDType, Token, alias defaultTokenFunction,
 		immutable size_t line = range.line;
 		lexerLoop: switch (range.front)
 		{
-		mixin(generateCaseStatements(stupidToArray(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens))));
-//		pragma(msg, generateCaseStatements(stupidToArray(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens))));
+		mixin(loopBody);
+		/+pragma(msg, loopBody);+/
 		outer_default:
 		default:
 			return defaultTokenFunction();
 		}
 	}
 
-	/**
-	 * This only exists because the real array() can't be called at compile-time
-	 */
-	static T[] stupidToArray(R, T = ElementType!R)(R range)
-	{
-		T[] retVal;
-		foreach (v; range)
-			retVal ~= v;
-		return retVal;
-	}
-
-	LexerRange!(typeof(buffer(R.init))) range;
+	LexerRange range;
 	Token _front;
 }
 
-struct LexerRange(BufferType) if (isBuffer!BufferType)
+struct LexerRange
 {
-	this(BufferType r)
+
+	this(const(ubyte)[] bytes, size_t index = 0, size_t column = 1, size_t line = 1) pure nothrow @safe
 	{
-		this.range = r;
-		index = 0;
-		column = 1;
-		line = 1;
+		this.bytes = bytes;
+		this.index = index;
+		this.column = column;
+		this.line = line;
 	}
 
-	void popFront() pure
+	size_t mark() const nothrow pure @safe
+	{
+		return index;
+	}
+
+	void seek(size_t m) nothrow pure @safe
+	{
+		index = m;
+	}
+
+	const(ubyte)[] slice(size_t m) const nothrow pure @safe
+	{
+		return bytes[m .. index];
+	}
+
+	bool empty() const nothrow pure @safe
+	{
+		return index >= bytes.length;
+	}
+
+	ubyte front() const nothrow pure @safe
+	{
+		return bytes[index];
+	}
+
+	const(ubyte)[] peek(size_t p) const nothrow pure @safe
+	{
+		return bytes[index .. index + p + 1];
+	}
+
+	bool canPeek(size_t p) const nothrow pure @safe
+	{
+		return index + p < bytes.length;
+	}
+
+	LexerRange save() const nothrow pure @safe
+	{
+		return LexerRange(bytes, index, column, line);
+	}
+
+	void popFront() pure nothrow @safe
 	{
 		index++;
 		column++;
-		range.popFront();
 	}
 
-	void incrementLine() pure nothrow
+	void popFrontN(size_t n) pure nothrow @safe
+	{
+		index += n;
+	}
+
+	void incrementLine() pure nothrow @safe
 	{
 		column = 1;
 		line++;
 	}
 
-	BufferType range;
-	alias range this;
+	const(ubyte)[] bytes;
 	size_t index;
 	size_t column;
 	size_t line;
 }
@@ -388,6 +435,13 @@ struct StringCache
 {
 public:
 
+	@disable this();
+
+	this(size_t bucketCount = defaultBucketCount)
+	{
+		buckets = new Item*[bucketCount];
+	}
+
 	/**
 	 * Equivalent to calling cache() and get().
 	 * ---
@@ -402,6 +456,11 @@ public:
 		return get(cache(bytes));
 	}
 
+	string cacheGet(const(ubyte[]) bytes, uint hash) pure nothrow @safe
+	{
+		return get(cache(bytes, hash));
+	}
+
 	/**
	 * Caches a string.
 	 * Params: bytes = the string to cache
@@ -416,6 +475,12 @@ public:
 	 * ---
 	 */
 	size_t cache(const(ubyte)[] bytes) pure nothrow @safe
+	{
+		immutable uint hash = hashBytes(bytes);
+		return cache(bytes, hash);
+	}
+
+	size_t cache(const(ubyte)[] bytes, uint hash) pure nothrow @safe
 	in
 	{
 		assert (bytes.length > 0);
@@ -426,7 +491,7 @@ public:
 	}
 	body
 	{
-		immutable uint hash = hashBytes(bytes);
+		memoryRequested += bytes.length;
 		const(Item)* found = find(bytes, hash);
 		if (found is null)
 			return intern(bytes, hash);
@@ -453,23 +518,58 @@ public:
 		return items[index].str;
 	}
 
+	void printStats()
+	{
+		import std.stdio;
+		writeln("Load Factor: ", cast(float) items.length / cast(float) buckets.length);
+		writeln("Memory used by blocks: ", blocks.length * blockSize);
+		writeln("Memory requested: ", memoryRequested);
+		writeln("rehashes: ", rehashCount);
+	}
+
+	static uint hashStep(ubyte b, uint h) pure nothrow @safe
+	{
+		return (h ^ sbox[b]) * 3;
+	}
+
+	static enum defaultBucketCount = 2048;
+
 private:
 
-	size_t intern(const(ubyte)[] bytes, uint hash) pure nothrow @safe
+	private void rehash() pure nothrow @safe
 	{
-		Item* item = new Item;
-		item.hash = hash;
-		item.str = allocate(bytes);
+		immutable size_t newBucketCount = items.length * 2;
+		buckets = new Item*[newBucketCount];
+		rehashCount++;
+		foreach (item; items)
+		{
+			immutable size_t newIndex = item.hash % newBucketCount;
+			item.next = buckets[newIndex];
+			buckets[newIndex] = item;
+		}
+	}
+
+	size_t intern(const(ubyte)[] bytes, uint hash) pure nothrow @trusted
+	{
+		ubyte[] mem = allocate(bytes.length);
+		mem[] = bytes[];
+		Item* item = cast(Item*) allocate(Item.sizeof).ptr;
 		item.index = items.length;
+		item.str = cast(string) mem;
+		item.hash = hash;
+		item.next = buckets[hash % buckets.length];
+		immutable bool checkLoadFactor = item.next !is null;
+		buckets[hash % buckets.length] = item;
 		items ~= item;
-		buckets[hash % buckets.length] ~= item;
+		if (checkLoadFactor && (cast(float) items.length / cast(float) buckets.length) > 0.75)
+			rehash();
 		return item.index;
 	}
 
 	const(Item)* find(const(ubyte)[] bytes, uint hash) pure nothrow const @safe
 	{
 		immutable size_t index = hash % buckets.length;
-		foreach (item; buckets[index])
+		for (const(Item)* item = buckets[index]; item !is null; item = item.next)
 		{
 			if (item.hash == hash && bytes.equal(item.str))
 				return item;
@@ -477,53 +577,46 @@ private:
 		return null;
 	}
 
-	string allocate(const(ubyte)[] bytes) pure nothrow @trusted
-	out (retVal)
-	{
-		assert (retVal == bytes);
-	}
-	body
+	ubyte[] allocate(size_t byteCount) pure nothrow @trusted
 	{
 		import core.memory;
-		if (bytes.length > (pageSize / 4))
+		if (byteCount > (blockSize / 4))
 		{
-			ubyte* memory = cast(ubyte*) GC.malloc(bytes.length, GC.BlkAttr.NO_SCAN);
-			memory[0 .. bytes.length] = bytes[];
-			return cast(string) memory[0..bytes.length];
+			ubyte* mem = cast(ubyte*) GC.malloc(byteCount, GC.BlkAttr.NO_SCAN);
+			return mem[0 .. byteCount];
 		}
 		foreach (ref block; blocks)
 		{
-			immutable size_t endIndex = block.used + bytes.length;
-			if (endIndex > block.bytes.length)
+			immutable size_t oldUsed = block.used;
+			immutable size_t end = oldUsed + byteCount;
+			if (end > block.bytes.length)
 				continue;
-			block.bytes[block.used .. endIndex] = bytes[];
-			string slice = cast(string) block.bytes[block.used .. endIndex];
-			block.used = endIndex;
-			return slice;
+			block.used = end;
+			return block.bytes[oldUsed .. end];
 		}
-		blocks.length = blocks.length + 1;
-		blocks[$ - 1].bytes = (cast(ubyte*) GC.malloc(pageSize, GC.BlkAttr.NO_SCAN))[0 .. pageSize];
-		blocks[$ - 1].bytes[0 .. bytes.length] = bytes[];
-		blocks[$ - 1].used = bytes.length;
-		return cast(string) blocks[$ - 1].bytes[0 .. bytes.length];
+		blocks ~= Block(
+			(cast(ubyte*) GC.malloc(blockSize, GC.BlkAttr.NO_SCAN))[0 .. blockSize],
+			byteCount);
+		return blocks[$ - 1].bytes[0 .. byteCount];
 	}
 
 	static uint hashBytes(const(ubyte)[] data) pure nothrow @safe
-	{
-		uint hash = 0;
-		foreach (b; data)
-		{
-			hash ^= sbox[b];
-			hash *= 3;
-		}
-		return hash;
-	}
+	{
+		uint hash = 0;
+		foreach (b; data)
+		{
+			hash ^= sbox[b];
+			hash *= 3;
+		}
+		return hash;
+	}
 
 	static struct Item
 	{
 		size_t index;
 		string str;
 		uint hash;
+		Item* next;
 	}
 
 	static struct Block
@@ -532,10 +625,9 @@ private:
 		size_t used;
 	}
 
-	static enum pageSize = 4096 * 1024;
-	static enum bucketCount = 2048;
+	static enum blockSize = 1024 * 16;
 
-	static enum uint[] sbox = [
+	public static immutable uint[] sbox = [
 		0xF53E1837, 0x5F14C86B, 0x9EE3964C, 0xFA796D53,
 		0x32223FC3, 0x4D82BC98, 0xA0C7FA62, 0x63E2C982,
 		0x24994A5B, 0x1ECE7BEE, 0x292B38EF, 0xD5CD4E56,
@@ -603,6 +695,8 @@ private:
 	];
 
 	Item*[] items;
-	Item*[][bucketCount] buckets;
+	Item*[] buckets;
 	Block[] blocks;
+	size_t memoryRequested;
+	uint rehashCount;
 }
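
Usage sketch (not part of the patch): after this change byToken no longer allocates a StringCache behind the caller's back; the three overloads in stdx/d/lexer.d take a ubyte[] directly, and the cache must be constructed with an explicit bucket count because the zero-argument constructor is @disable'd. A minimal caller, assuming the stdx modules above are on the import path and a file path is passed as the first argument, might look like:

    import std.file : read;
    import std.stdio : writeln;
    import stdx.d.lexer;

    void main(string[] args)
    {
        // The caller now owns the cache and its lifetime.
        StringCache* cache = new StringCache(StringCache.defaultBucketCount);

        LexerConfig config;
        config.stringBehavior = StringBehavior.source;       // leave string tokens untouched
        config.whitespaceBehavior = WhitespaceBehavior.skip; // do not emit whitespace tokens
        config.commentBehavior = CommentBehavior.attach;     // hang doc comments on the next token

        auto source = cast(ubyte[]) read(args[1]);
        foreach (ref token; byToken(source, config, cache))
            writeln(token.line, ":", token.column, "\t", token.text);
    }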
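A note on the new LexerRange contract, as it reads from the bodies above: peek(p) returns the p + 1 bytes starting at the current index (the current byte included), canPeek(p) guards indexing p bytes ahead, and mark()/slice()/seek() implement the token-slicing pattern used by the lexers. A sketch of that contract:

    unittest
    {
        import stdx.lexer;

        auto r = LexerRange(cast(const(ubyte)[]) "abc");
        auto m = r.mark();          // remember the start of a token
        assert(r.front == 'a');
        assert(r.canPeek(2));       // index + 2 is still in bounds
        assert(r.peek(1) == "ab");  // p + 1 bytes, current byte included
        r.popFront();
        r.popFront();
        assert(r.slice(m) == "ab"); // slice(m) is bytes[m .. index]
        r.seek(m);                  // rewinds the byte index (column/line are not adjusted)
        assert(r.front == 'a');
    }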
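The identifier fast path relies on one invariant: folding StringCache.hashStep over a byte sequence, as lexIdentifier now does while scanning, produces the same value as the whole-buffer hash that cache() computes internally, so cacheGet(bytes, hash) sees a consistent hash either way. A sketch of that equivalence, using only the public hashStep and sbox from the patch:

    unittest
    {
        import stdx.lexer;

        immutable ubyte[] ident = cast(immutable ubyte[]) "lexIdentifier";

        // Incremental form, as used while scanning identifier bytes.
        uint h = 0;
        foreach (b; ident)
            h = StringCache.hashStep(b, h);

        // Whole-buffer form, the same per-byte step (hash ^ sbox[b]) * 3.
        uint whole = 0;
        foreach (b; ident)
        {
            whole ^= StringCache.sbox[b];
            whole *= 3;
        }
        assert(h == whole);
    }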