From a060dabde7141d123dbc02b3d676ef64ec86dd76 Mon Sep 17 00:00:00 2001 From: Hackerpilot Date: Tue, 21 Jan 2014 19:48:03 -0800 Subject: [PATCH 1/2] Initial work on the lexer optimization --- build.sh | 2 +- main.d | 2 +- stdx/d/lexer.d | 2 + stdx/lexer.d | 166 ++++++++++++++++++++++++++----------------------- 4 files changed, 92 insertions(+), 80 deletions(-) diff --git a/build.sh b/build.sh index 69001e9..22e9531 100755 --- a/build.sh +++ b/build.sh @@ -12,7 +12,7 @@ dmd\ analysis/*.d\ -ofdscanner\ -m64\ - -O -release -noboundscheck + -O -release -noboundscheck -inline #gdc\ # main.d\ diff --git a/main.d b/main.d index 1d11354..9ab99f3 100644 --- a/main.d +++ b/main.d @@ -112,7 +112,7 @@ int main(string[] args) foreach (token; tokens) { writeln("«", token.text is null ? str(token.type) : token.text, - "» ", token.index, " ", token.line, " ", token.column, " ", + "» ", token.text !is null, " ", token.index, " ", token.line, " ", token.column, " ", token.comment); } return 0; diff --git a/stdx/d/lexer.d b/stdx/d/lexer.d index 82a4dcf..645977d 100644 --- a/stdx/d/lexer.d +++ b/stdx/d/lexer.d @@ -1361,6 +1361,8 @@ public struct DLexer Token lexIdentifier() pure nothrow { + import std.stdio; + debug(1) try { writeln("lexIdentifier"); } catch (Exception e) {} mixin (tokenStart); uint hash = 0; while (!range.empty && !isSeparating(0)) diff --git a/stdx/lexer.d b/stdx/lexer.d index fc20635..1676391 100644 --- a/stdx/lexer.d +++ b/stdx/lexer.d @@ -193,88 +193,90 @@ mixin template Lexer(IDType, Token, alias defaultTokenFunction, alias staticTokens, alias dynamicTokens, alias pseudoTokens, alias pseudoTokenHandlers, alias possibleDefaultTokens) { - static string generateCaseStatements(string[] tokens, size_t offset = 0) + static string generateCaseStatements(string[] tokens) { import std.conv; + import std.string; + + static string generateMask(const ubyte[] arr) + { + ulong u; + for (size_t i = 0; i < arr.length && i < 8; i++) + { + u |= (cast(ulong) arr[i]) << (i * 8); + } 
+ return format("0x%016x", u); + } + + static string generateByteMask(size_t l) + { + return format("0x%016x", ulong.max >> ((8 - l) * 8)); + } + string code; for (size_t i = 0; i < tokens.length; i++) { - auto indent = ""; - foreach (k; 0 .. offset) - indent ~= " "; - size_t j = i + 1; - - if (offset < tokens[i].length) + immutable mask = generateMask(cast (const ubyte[]) tokens[i]); + if (tokens[i].length >= 8) + code ~= "if (frontBytes == " ~ mask ~ ")\n"; + else + code ~= "if ((frontBytes & " ~ generateByteMask(tokens[i].length) ~ ") == " ~ mask ~ ")\n"; + code ~= "{\n"; + if (staticTokens.countUntil(tokens[i]) >= 0) { - while (j < tokens.length && offset < tokens[j].length - && tokens[i][offset] == tokens[j][offset]) j++; - code ~= indent ~ "case " ~ text(cast(ubyte) tokens[i][offset]) ~ ":\n"; - if (i + 1 >= j) + if (tokens[i].length <= 8) { - if (offset + 1 == tokens[i].length) - code ~= generateLeaf(tokens[i], indent ~ " "); - else - { - code ~= indent ~ " if (!range.canPeek(" ~ text(tokens[i].length) ~ "))\n"; - code ~= indent ~ " goto outer_default;\n"; - code ~= indent ~ " if (range.peek(" ~ text(tokens[i].length - 1) ~ ") == \"" ~ escape(tokens[i]) ~ "\")\n"; - code ~= indent ~ " {\n"; - code ~= generateLeaf(tokens[i], indent ~ " "); - code ~= indent ~ " }\n"; - code ~= indent ~ " else\n"; - code ~= indent ~ " goto outer_default;\n"; - } + code ~= " range.popFrontN(" ~ text(tokens[i].length) ~ ");\n"; + code ~= " return Token(tok!\"" ~ escape(tokens[i]) ~ "\", null, line, column, index);\n"; } else { - code ~= indent ~ " if (!range.canPeek(" ~ text(offset + 1) ~ "))\n"; - code ~= indent ~ " {\n"; - code ~= generateLeaf(tokens[i][0 .. offset + 1], indent ~ " "); - code ~= indent ~ " }\n"; - code ~= indent ~ " switch (range.peek(" ~ text(offset + 1) ~ ")[" ~ text(offset + 1) ~ "])\n"; - code ~= indent ~ " {\n"; - code ~= generateCaseStatements(tokens[i .. j], offset + 1); - code ~= indent ~ " default:\n"; - code ~= generateLeaf(tokens[i][0 .. 
offset + 1], indent ~ " "); - code ~= indent ~ " }\n"; + code ~= " assert (false); // " ~ escape(tokens[i]) ~ "\n"; } } - i = j - 1; - } - return code; - } + else if (pseudoTokens.countUntil(tokens[i]) >= 0) + { + if (tokens[i].length < 8) + { + code ~= " return " + ~ pseudoTokenHandlers[pseudoTokenHandlers.countUntil(tokens[i]) + 1] + ~ "();\n"; + } + else + { + code ~= " if (range.peek(" ~ text(tokens[i].length) ~ ") == \"" ~ escape(tokens[i]) ~"\")\n"; + code ~= " return " + ~ pseudoTokenHandlers[pseudoTokenHandlers.countUntil(tokens[i]) + 1] + ~ "();\n"; + } + } + else + { + // possible default + if (tokens[i].length < 8) + { + code ~= " if (isSeparating(" ~ text(tokens[i].length) ~ "))\n"; + code ~= " {\n"; + code ~= " range.popFrontN(" ~ text(tokens[i].length) ~ ");\n"; + code ~= " return Token(tok!\"" ~ escape(tokens[i]) ~ "\", null, line, column, index);\n"; + code ~= " }\n"; + code ~= " else\n"; + code ~= " goto defaultHandler;\n"; + } + else + { + code ~= " if (range.peek(" ~ text(tokens[i].length) ~ ") == \"" ~ escape(tokens[i]) ~"\" && isSeparating(" ~ text(tokens[i].length) ~ "))\n"; + code ~= " {\n"; + code ~= " range.popFrontN(" ~ text(tokens[i].length) ~ ");\n"; + code ~= " return Token(tok!\"" ~ escape(tokens[i]) ~ "\", null, line, column, index);\n"; + code ~= " }\n"; + code ~= " else\n"; + code ~= " goto defaultHandler;\n"; + } + } + code ~= "}\n"; - static string generateLeaf(string token, string indent) - { - import std.conv; - static assert (pseudoTokenHandlers.length % 2 == 0, - "Each pseudo-token must have a matching function name."); - string code; - if (staticTokens.countUntil(token) >= 0) - { - if (token.length == 1) - code ~= indent ~ "range.popFront();\n"; - else - code ~= indent ~ "range.popFrontN(" ~ text(token.length) ~ ");\n"; - code ~= indent ~ "return Token(tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; } - else if (pseudoTokens.countUntil(token) >= 0) - code ~= indent ~ "return " ~ 
pseudoTokenHandlers[pseudoTokenHandlers.countUntil(token) + 1] ~ "();\n"; - else if (possibleDefaultTokens.countUntil(token) >= 0) - { - code ~= indent ~ "if (!range.canPeek(" ~ text(token.length + 1) ~ ") || isSeparating(" ~ text(token.length) ~ "))\n"; - code ~= indent ~ "{\n"; - if (token.length == 1) - code ~= indent ~ " range.popFront();\n"; - else - code ~= indent ~ " range.popFrontN(" ~ text(token.length) ~ ");\n"; - code ~= indent ~ " return Token(tok!\"" ~ escape(token) ~"\", null, line, column, index);\n"; - code ~= indent ~ "}\n"; - code ~= indent ~ "else\n"; - code ~= indent ~ " goto outer_default;\n"; - } - else - code ~= indent ~ "goto outer_default;\n"; return code; } @@ -323,8 +325,17 @@ mixin template Lexer(IDType, Token, alias defaultTokenFunction, return retVal; } + enum tokenSearch = generateCaseStatements(stupidToArray(sort!"a.length > b.length"(staticTokens ~ pseudoTokens ~ possibleDefaultTokens))); - enum loopBody = generateCaseStatements(stupidToArray(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens))); + static ulong getFront(const ubyte[] arr) pure nothrow @trusted + { + import std.stdio; + immutable importantBits = *(cast (ulong*) arr.ptr); + immutable filler = ulong.max >> ((8 - arr.length) * 8); + + debug(1) try { writefln("0x%016x", importantBits & filler); } catch (Exception e) {} + return importantBits & filler; + } Token advance() pure { @@ -333,14 +344,11 @@ mixin template Lexer(IDType, Token, alias defaultTokenFunction, immutable size_t index = range.index; immutable size_t column = range.column; immutable size_t line = range.line; - lexerLoop: switch (range.front) - { - mixin(loopBody); - /+pragma(msg, loopBody);+/ - outer_default: - default: - return defaultTokenFunction(); - } + immutable ulong frontBytes = getFront(range.peek(7)); + mixin(tokenSearch); + pragma(msg, tokenSearch); + defaultHandler: + return defaultTokenFunction(); } LexerRange range; @@ -385,7 +393,9 @@ struct LexerRange const(ubyte)[] peek(size_t p) 
const nothrow pure @safe { - return bytes[index .. index + p + 1]; + return index + p + 1 > bytes.length + ? bytes[index .. $] + : bytes[index .. index + p + 1]; } bool canPeek(size_t p) const nothrow pure @safe From 4ec5af9093b238612036c350f577b0000c9bb577 Mon Sep 17 00:00:00 2001 From: Hackerpilot Date: Tue, 21 Jan 2014 23:26:23 -0800 Subject: [PATCH 2/2] Lots of optimization. Updated GDC portion of build script --- build.sh | 6 +- main.d | 3 +- stats.d | 1 - stdx/d/lexer.d | 41 ++++++------ stdx/lexer.d | 171 ++++++++++++++++++++++++++++++------------------- 5 files changed, 132 insertions(+), 90 deletions(-) diff --git a/build.sh b/build.sh index 22e9531..5076df3 100755 --- a/build.sh +++ b/build.sh @@ -11,7 +11,7 @@ dmd\ stdx/d/*.d\ analysis/*.d\ -ofdscanner\ - -m64\ + -m64 -g\ -O -release -noboundscheck -inline #gdc\ @@ -23,9 +23,9 @@ dmd\ # astprinter.d\ # formatter.d\ # outliner.d\ -# style.d\ # stdx/*.d\ # stdx/d/*.d\ +# analysis/*.d\ # -O3 -frelease -fno-bounds-check\ # -odscanner\ @@ -38,8 +38,8 @@ dmd\ # astprinter.d\ # formatter.d\ # outliner.d\ -# style.d\ # stdx/*.d\ # stdx/d/*.d\ +# analysis/*.d\ # -O3 -release\ # -oq -of=dscanner\ diff --git a/main.d b/main.d index 9ab99f3..4427959 100644 --- a/main.d +++ b/main.d @@ -152,7 +152,7 @@ int main(string[] args) ulong count; foreach (f; expandArgs(args, recursive)) { - import core.memory; + LexerConfig config; config.whitespaceBehavior = WhitespaceBehavior.skip; config.stringBehavior = StringBehavior.source; @@ -162,7 +162,6 @@ int main(string[] args) count += printTokenCount(stdout, f, tokens); else count += printLineCount(stdout, f, tokens); - cache.printStats(); } writefln("total:\t%d", count); } diff --git a/stats.d b/stats.d index d4a65c4..533a948 100644 --- a/stats.d +++ b/stats.d @@ -32,7 +32,6 @@ pure nothrow bool isLineOfCode(IdType t) ulong printTokenCount(Tokens)(File output, string fileName, ref Tokens tokens) { - ulong c; foreach (ref t; tokens) { diff --git a/stdx/d/lexer.d 
b/stdx/d/lexer.d index 645977d..09d0ee0 100644 --- a/stdx/d/lexer.d +++ b/stdx/d/lexer.d @@ -425,7 +425,6 @@ public struct DLexer public void popFront() pure { _popFront(); - string comment = null; switch (front.type) { case tok!"comment": @@ -433,7 +432,11 @@ public struct DLexer { import std.string; if (isDocComment(front.text)) - comment = comment == null ? front.text : format("%s\n%s", comment, front.text); + { + _front.comment = _front.comment == null + ? front.text + : format("%s\n%s", _front.comment, front.text); + } do _popFront(); while (front == tok!"comment"); if (front == tok!"whitespace") goto case tok!"whitespace"; } @@ -448,7 +451,6 @@ public struct DLexer default: break; } - _front.comment = comment; } @@ -715,17 +717,16 @@ public struct DLexer lexExponent(type); break decimalLoop; case '.': - if (foundDot || !range.canPeek(1) || range.peek(1)[1] == '.') + if (foundDot || !range.canPeek(1) || range.peekAt(1) == '.') break decimalLoop; else { - auto lookahead = range.peek(1); // The following bit of silliness tries to tell the // difference between "int dot identifier" and // "double identifier". - if (lookahead.length == 2) + if (range.canPeek(1)) { - switch (lookahead[1]) + switch (range.peekAt(1)) { case '0': .. 
case '9': goto doubleLiteral; @@ -1362,7 +1363,6 @@ public struct DLexer Token lexIdentifier() pure nothrow { import std.stdio; - debug(1) try { writeln("lexIdentifier"); } catch (Exception e) {} mixin (tokenStart); uint hash = 0; while (!range.empty && !isSeparating(0)) @@ -1418,25 +1418,28 @@ public struct DLexer { if (range.front == '\n') return true; if (range.front == '\r') return true; - auto lookahead = range.peek(3); - if (lookahead.length == 0) return false; - if (lookahead == "\u2028" || lookahead == "\u2029") - return true; - return false; + return (range.front & 0x80) && range.canPeek(2) + && (range.peek(2) == "\u2028" || range.peek(2) == "\u2029"); } - bool isSeparating(size_t offset) const pure nothrow @safe + bool isSeparating(size_t offset) pure nothrow @safe { - auto r = range.save(); - r.popFrontN(offset); - auto c = r.front; + if (!range.canPeek(offset)) return true; /* end of input always terminates a token */ + auto c = range.peekAt(offset); + if (c >= 'A' && c <= 'Z') return false; + if (c >= 'a' && c <= 'z') return false; if (c <= 0x2f) return true; if (c >= ':' && c <= '@') return true; if (c >= '[' && c <= '^') return true; if (c >= '{' && c <= '~') return true; if (c == '`') return true; - if (c & 0x80 && (r.peek(3) == "\u2028" - || range.peek(3) == "\u2029")) return true; + if (c & 0x80) + { + auto r = range; + r.popFrontN(offset); /* advance the copy only; the lexer's own range must not move */ + return (r.canPeek(2) && (r.peek(2) == "\u2028" + || r.peek(2) == "\u2029")); + } return false; } diff --git a/stdx/lexer.d b/stdx/lexer.d index 1676391..d84f04c 100644 --- a/stdx/lexer.d +++ b/stdx/lexer.d @@ -193,90 +193,130 @@ mixin template Lexer(IDType, Token, alias defaultTokenFunction, alias staticTokens, alias dynamicTokens, alias pseudoTokens, alias pseudoTokenHandlers, alias possibleDefaultTokens) { + + static string generateMask(const ubyte[] arr) + { + import std.string; + ulong u; + for (size_t i = 0; i < arr.length && i < 8; i++) + { + u |= (cast(ulong) arr[i]) << (i * 8); + } + return format("0x%016x", u); + } + + static string
generateByteMask(size_t l) + { + import std.string; + return format("0x%016x", ulong.max >> ((8 - l) * 8)); + } + static string generateCaseStatements(string[] tokens) { import std.conv; import std.string; - static string generateMask(const ubyte[] arr) - { - ulong u; - for (size_t i = 0; i < arr.length && i < 8; i++) - { - u |= (cast(ulong) arr[i]) << (i * 8); - } - return format("0x%016x", u); - } - - static string generateByteMask(size_t l) - { - return format("0x%016x", ulong.max >> ((8 - l) * 8)); - } string code; for (size_t i = 0; i < tokens.length; i++) { - immutable mask = generateMask(cast (const ubyte[]) tokens[i]); - if (tokens[i].length >= 8) - code ~= "if (frontBytes == " ~ mask ~ ")\n"; - else - code ~= "if ((frontBytes & " ~ generateByteMask(tokens[i].length) ~ ") == " ~ mask ~ ")\n"; - code ~= "{\n"; - if (staticTokens.countUntil(tokens[i]) >= 0) + size_t j = i + 1; + size_t o = i; + while (j < tokens.length && tokens[i][0] == tokens[j][0]) j++; + code ~= format("case 0x%02x:\n", cast(ubyte) tokens[i][0]); + code ~= printCase(tokens[i .. 
j]); + i = j - 1; + } + return code; + } + + static string printCase(string[] tokens) + { + string[] t = tokens; + string[] sortedTokens = stupidToArray(sort!"a.length > b.length"(t)); + import std.conv; + + if (tokens.length == 1 && tokens[0].length == 1) + { + if (staticTokens.countUntil(tokens[0]) >= 0) { - if (tokens[i].length <= 8) + return " range.popFront();\n" + ~ " return Token(tok!\"" ~ escape(tokens[0]) ~ "\", null, line, column, index);\n"; + } + else if (pseudoTokens.countUntil(tokens[0]) >= 0) + { + return " return " + ~ pseudoTokenHandlers[pseudoTokenHandlers.countUntil(tokens[0]) + 1] + ~ "();\n"; + } + } + + string code; + + foreach (i, token; sortedTokens) + { + immutable mask = generateMask(cast (const ubyte[]) token); + if (token.length >= 8) + code ~= " if (frontBytes == " ~ mask ~ ")\n"; + else + code ~= " if ((frontBytes & " ~ generateByteMask(token.length) ~ ") == " ~ mask ~ ")\n"; + code ~= " {\n"; + if (staticTokens.countUntil(token) >= 0) + { + if (token.length <= 8) { - code ~= " range.popFrontN(" ~ text(tokens[i].length) ~ ");\n"; - code ~= " return Token(tok!\"" ~ escape(tokens[i]) ~ "\", null, line, column, index);\n"; + code ~= " range.popFrontN(" ~ text(token.length) ~ ");\n"; + code ~= " return Token(tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; } else { - code ~= " assert (false); // " ~ escape(tokens[i]) ~ "\n"; + code ~= " pragma(msg, \"long static tokens not supported\"); // " ~ escape(token) ~ "\n"; } } - else if (pseudoTokens.countUntil(tokens[i]) >= 0) + else if (pseudoTokens.countUntil(token) >= 0) { - if (tokens[i].length < 8) + if (token.length < 8) { - code ~= " return " - ~ pseudoTokenHandlers[pseudoTokenHandlers.countUntil(tokens[i]) + 1] + code ~= " return " + ~ pseudoTokenHandlers[pseudoTokenHandlers.countUntil(token) + 1] ~ "();\n"; } else { - code ~= " if (range.peek(" ~ text(tokens[i].length) ~ ") == \"" ~ escape(tokens[i]) ~"\")\n"; - code ~= " return " - ~ 
pseudoTokenHandlers[pseudoTokenHandlers.countUntil(tokens[i]) + 1] + code ~= " if (range.peek(" ~ text(token.length - 1) ~ ") == \"" ~ escape(token) ~"\")\n"; /* peek(p) returns p + 1 bytes, so length - 1 yields exactly the token's bytes */ + code ~= " return " + ~ pseudoTokenHandlers[pseudoTokenHandlers.countUntil(token) + 1] ~ "();\n"; } } else { // possible default - if (tokens[i].length < 8) + if (token.length < 8) { - code ~= " if (isSeparating(" ~ text(tokens[i].length) ~ "))\n"; - code ~= " {\n"; - code ~= " range.popFrontN(" ~ text(tokens[i].length) ~ ");\n"; - code ~= " return Token(tok!\"" ~ escape(tokens[i]) ~ "\", null, line, column, index);\n"; - code ~= " }\n"; - code ~= " else\n"; - code ~= " goto defaultHandler;\n"; + code ~= " if (isSeparating(" ~ text(token.length) ~ "))\n"; + code ~= " {\n"; + code ~= " range.popFrontN(" ~ text(token.length) ~ ");\n"; + code ~= " return Token(tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; + code ~= " }\n"; + code ~= " else\n"; + code ~= " goto default;\n"; } else { - code ~= " if (range.peek(" ~ text(tokens[i].length) ~ ") == \"" ~ escape(tokens[i]) ~"\" && isSeparating(" ~ text(tokens[i].length) ~ "))\n"; - code ~= " {\n"; - code ~= " range.popFrontN(" ~ text(tokens[i].length) ~ ");\n"; - code ~= " return Token(tok!\"" ~ escape(tokens[i]) ~ "\", null, line, column, index);\n"; - code ~= " }\n"; - code ~= " else\n"; - code ~= " goto defaultHandler;\n"; + code ~= " if (range.peek(" ~ text(token.length - 1) ~ ") == \"" ~ escape(token) ~"\" && isSeparating(" ~ text(token.length) ~ "))\n"; /* compare exactly token.length bytes; isSeparating then inspects the byte after the token */ + code ~= " {\n"; + code ~= " range.popFrontN(" ~ text(token.length) ~ ");\n"; + code ~= " return Token(tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; + code ~= " }\n"; + code ~= " else\n"; + code ~= " goto default;\n"; } } - code ~= "}\n"; - + code ~= " }\n"; } + code ~= " else\n"; + code ~= " goto default;\n"; return code; } @@ -325,15 +365,13 @@ mixin template Lexer(IDType, Token, alias defaultTokenFunction, return retVal; } - enum tokenSearch =
generateCaseStatements(stupidToArray(sort!"a.length > b.length"(staticTokens ~ pseudoTokens ~ possibleDefaultTokens))); + enum tokenSearch = generateCaseStatements(stupidToArray(sort(staticTokens ~ pseudoTokens ~ possibleDefaultTokens))); static ulong getFront(const ubyte[] arr) pure nothrow @trusted { import std.stdio; immutable importantBits = *(cast (ulong*) arr.ptr); immutable filler = ulong.max >> ((8 - arr.length) * 8); - - debug(1) try { writefln("0x%016x", importantBits & filler); } catch (Exception e) {} return importantBits & filler; } @@ -345,10 +383,13 @@ mixin template Lexer(IDType, Token, alias defaultTokenFunction, immutable size_t column = range.column; immutable size_t line = range.line; immutable ulong frontBytes = getFront(range.peek(7)); + switch (frontBytes & 0x00000000_000000ff) + { mixin(tokenSearch); - pragma(msg, tokenSearch); - defaultHandler: - return defaultTokenFunction(); + /+pragma(msg, tokenSearch);+/ + default: + return defaultTokenFunction(); + } } LexerRange range; @@ -398,16 +439,16 @@ struct LexerRange : bytes[index .. 
index + p + 1]; } + ubyte peekAt(size_t offset) const nothrow pure @safe + { + return bytes[index + offset]; + } + bool canPeek(size_t p) const nothrow pure @safe { return index + p < bytes.length; } - LexerRange save() const nothrow pure @safe - { - return LexerRange(bytes, index, column, line); - } - void popFront() pure nothrow @safe { index++; @@ -501,7 +542,7 @@ public: } body { - memoryRequested += bytes.length; + debug memoryRequested += bytes.length; const(Item)* found = find(bytes, hash); if (found is null) return intern(bytes, hash); @@ -528,7 +569,7 @@ public: return items[index].str; } - void printStats() + debug void printStats() { import std.stdio; writeln("Load Factor: ", cast(float) items.length / cast(float) buckets.length); @@ -550,7 +591,7 @@ private: { immutable size_t newBucketCount = items.length * 2; buckets = new Item*[newBucketCount]; - rehashCount++; + debug rehashCount++; foreach (item; items) { immutable size_t newIndex = item.hash % newBucketCount; @@ -707,6 +748,6 @@ private: Item*[] items; Item*[] buckets; Block[] blocks; - size_t memoryRequested; - uint rehashCount; + debug size_t memoryRequested; + debug uint rehashCount; }