SIMD speed hacks

This commit is contained in:
Hackerpilot 2014-06-09 16:23:56 -07:00
parent 1fd6a7d932
commit 19777a5931
1 changed files with 200 additions and 6 deletions

View File

@ -435,7 +435,7 @@ public struct DLexer
public void popFront() pure
{
_popFront();
string comment = null;
string comment;
switch (front.type)
{
case tok!"comment":
@ -538,6 +538,72 @@ public struct DLexer
Token lexWhitespace() pure nothrow
{
mixin (tokenStart);
static if (__VERSION__ > 2065) version (D_InlineAsm_X86_64) while (index + 16 <= range.bytes.length)
{
ulong startAddr = (cast(ulong) range.bytes.ptr) + index;
enum space = (cast(ulong) ' ') * 0x0101010101010101L;
enum tab = (cast(ulong) '\t') * 0x0101010101010101L;
enum cr = (cast(ulong) '\r') * 0x0101010101010101L;
enum lf = (cast(ulong) '\n') * 0x0101010101010101L;
ulong charsSkipped;
ulong lineIncrement;
asm
{
mov R10, space;
mov R11, tab;
mov R12, cr;
mov R13, lf;
mov R8, startAddr;
movdqu XMM0, [R8];
mov R9, line;
// space pattern
movq XMM1, R10;
shufpd XMM1, XMM1, 0;
pcmpeqb XMM1, XMM0;
// tab pattern
movq XMM2, R11;
shufpd XMM2, XMM2, 0;
pcmpeqb XMM2, XMM0;
// CR pattern
movq XMM3, R12;
shufpd XMM3, XMM3, 0;
pcmpeqb XMM3, XMM0;
// LF pattern
movq XMM4, R13;
shufpd XMM4, XMM4, 0;
pcmpeqb XMM4, XMM0;
// Bit mask-of newlines to r10
pmovmskb R10, XMM4;
// and the masks together
por XMM1, XMM2;
por XMM1, XMM3;
por XMM1, XMM4;
pmovmskb RAX, XMM1;
not RAX;
bsf RCX, RAX;
mov charsSkipped, RCX;
mov RBX, 1;
inc CL;
shl RBX, CL;
sub RBX, 1;
and R10, RBX;
popcnt R10, R10;
mov lineIncrement, R10;
}
range.incrementLine(lineIncrement);
range.popFrontN(charsSkipped);
if (charsSkipped < 16)
goto end;
index += 16;
}
loop: do
{
switch (range.front)
@ -575,6 +641,7 @@ public struct DLexer
break loop;
}
} while (!range.empty);
end:
string text = config.whitespaceBehavior == WhitespaceBehavior.skip
? null : cache.intern(range.slice(mark));
return Token(tok!"whitespace", text, line, column, index);
@ -920,8 +987,89 @@ public struct DLexer
{
mixin (tokenStart);
IdType type = tok!"comment";
range.popFront();
range.popFront();
range.popFrontN(2);
static if (__VERSION__ > 2065) version (D_InlineAsm_X86_64) while (range.index + 16 <= range.bytes.length)
{
ulong startAddress = cast(ulong) range.bytes.ptr + range.index;
enum slash = (cast(ulong) '/') * 0x0101010101010101L;
enum star = (cast(ulong) '*') * 0x0101010101010101L;
enum lf = (cast(ulong) '\n') * 0x0101010101010101L;
ulong charsSkipped;
ulong newlineCount;
bool done;
asm
{
mov RAX, startAddress;
movdqu XMM0, [RAX];
mov R10, lf;
movq XMM2, R10;
shufpd XMM2, XMM2, 0;
pcmpeqb XMM2, XMM0;
pmovmskb R15, XMM2;
mov R10, star;
movq XMM3, R10;
shufpd XMM3, XMM3, 0;
pcmpeqb XMM3, XMM0;
pmovmskb R8, XMM3;
mov R10, slash;
movq XMM4, R10;
shufpd XMM4, XMM4, 0;
pcmpeqb XMM4, XMM0;
pmovmskb R9, XMM4;
loop:
cmp R8, 0;
je notFound;
cmp R9, 0;
je notFound;
bsf RAX, R8; // stIndex
bsf RBX, R9; // slIndex
mov RDX, RAX;
inc RDX;
cmp RDX, RBX;
je found;
cmp RAX, RBX;
jae maskSlash;
maskStar:
mov RCX, RAX;
mov R10, 1;
shl R10, CL;
xor R8, R10;
jmp loop;
maskSlash:
mov RCX, RBX;
mov R10, 1;
shl R10, CL;
xor R9, R10;
jmp loop;
notFound:
mov R14, 16;
mov charsSkipped, R14;
popcnt R14, R15;
mov newlineCount, R14;
jmp asmEnd;
found:
inc RBX;
mov charsSkipped, RBX;
mov RAX, 1;
mov done, AL;
mov RCX, RBX;
mov RBX, 1;
shl RBX, CL;
dec RBX;
and R15, RBX;
popcnt R14, R15;
mov newlineCount, R14;
asmEnd:
nop;
}
range.popFrontN(charsSkipped);
range.incrementLine(newlineCount);
if (done)
goto end;
}
while (!range.empty)
{
if (range.front == '*')
@ -936,6 +1084,7 @@ public struct DLexer
else
popFrontWhitespaceAware();
}
end:
return Token(type, cache.intern(range.slice(mark)), line, column,
index);
}
@ -944,14 +1093,59 @@ public struct DLexer
{
mixin (tokenStart);
IdType type = tok!"comment";
range.popFront();
range.popFront();
range.popFrontN(2);
static if (__VERSION__ > 2065) version (D_InlineAsm_X86_64) while (range.index + 16 <= range.bytes.length)
{
ulong startAddress = cast(ulong) range.bytes.ptr + range.index;
enum cr = (cast(ulong) '\r') * 0x0101010101010101L;
enum lf = (cast(ulong) '\n') * 0x0101010101010101L;
ulong charsSkipped;
asm
{
mov RAX, startAddress;
movdqu XMM0, [RAX];
mov R10, cr;
movq XMM1, R10;
shufpd XMM1, XMM1, 0;
pcmpeqb XMM1, XMM0;
mov R10, lf;
movq XMM2, R10;
shufpd XMM2, XMM2, 0;
pcmpeqb XMM2, XMM0;
por XMM1, XMM2;
pmovmskb RBX, XMM1;
bsf RCX, RBX;
mov RDX, 16;
cmp RBX, 0;
cmove RCX, RDX;
mov charsSkipped, RCX;
}
if (charsSkipped < 16)
{
index += charsSkipped;
column += charsSkipped;
range.popFrontN(charsSkipped);
goto end;
}
else
{
assert (charsSkipped == 16);
index += 16;
column += 16;
range.popFrontN(16);
}
}
while (!range.empty)
{
if (range.front == '\r' || range.front == '\n')
break;
range.popFront();
}
end:
return Token(type, cache.intern(range.slice(mark)), line, column,
index);
}
@ -1750,7 +1944,7 @@ public:
/**
* The default bucket count for the string cache.
*/
static enum defaultBucketCount = 2048;
static enum defaultBucketCount = 4096;
size_t allocated() pure nothrow @safe @property
{