mirror of
https://github.com/dlang/dmd.git
synced 2025-04-27 13:40:11 +03:00
193 lines
4.8 KiB
D
193 lines
4.8 KiB
D
/**
|
|
* Benchmark for array ops.
|
|
*
|
|
* Copyright: Copyright Martin Nowak 2016 -.
|
|
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
|
|
* Authors: Martin Nowak
|
|
*/
|
|
import core.cpuid, std.algorithm, std.meta, std.stdio, std.string, std.range;
|
|
import std.datetime.stopwatch : benchmark, StopWatch, AutoStart;
|
|
|
|
/**
 * Measures the best-case time of the array operation `op` on short slices.
 *
 * For each slice length 1, 2, 4, ..., 32 the operation is executed on 64
 * different slices of the operand arrays and the whole batch is timed. The
 * batch is repeated 31 times and the smallest time is kept.
 *
 * Params:
 *      T = element type of the operand arrays
 *      op = array operation written in terms of `a`, `b`, `c` and `scalar`,
 *           e.g. `"a += b * scalar"`
 *
 * Returns:
 *      For every slice length, the smallest batch time in nanoseconds
 *      divided by 1024.
 */
float[6] getLatencies(T, string op)()
{
    // Element count of each operand array. The largest slice end touched
    // below is 63 * 32 + 63 + 32 = 2111, so this is comfortably large enough.
    enum N = (64 * (1 << 6) + 64) * T.sizeof;
    auto a = Array!T(N), b = Array!T(N), c = Array!T(N);
    float[6] latencies = float.max;

    // Statement that gets timed. "scalar" has to be substituted first because
    // it contains the letters 'a' and 'c', which the later substitutions
    // would otherwise rewrite as well.
    enum stmt = op
        .replace("scalar", "s")
        .replace("a", "a[off .. off + len]")
        .replace("b", "b[off .. off + len]")
        .replace("c", "c[off .. off + len]");

    foreach (i, ref latency; latencies)
    {
        auto len = 1 << i;
        foreach (_; 1 .. 32)
        {
            // Reset the operands before every batch.
            a[] = 24;
            b[] = 4;
            c[] = 2;
            __gshared T s = 2; // scalar, use __gshared to avoid const-folding
            auto sw = StopWatch(AutoStart.yes);
            foreach (k; size_t(0) .. size_t(64))
            {
                // Consecutive slices are separated by a one element gap.
                immutable off = k * len + k;
                mixin(stmt ~ ";");
            }
            immutable nsecs = sw.peek.total!"nsecs";
            // Converting the integral nanosecond count to float is inexact
            // for large values, which would trap because main() runs the
            // benchmark with FPU exceptions unmasked. Mask them for the
            // conversion, as getThroughput does.
            runMasked({ latency = min(latency, nsecs); });
        }
    }
    float[6] res = latencies[] / 1024;
    return res;
}
|
|
|
|
/**
 * Measures the throughput of the array operation `op` on large working sets.
 *
 * For each of four working-set sizes (8 KiB, 32 KiB, 512 KiB and 32 MiB per
 * operand array) the operation is executed on 64 slices that together cover
 * the working set. Each batch is timed 3 times and the smallest time is kept.
 *
 * Params:
 *      T = element type of the operand arrays
 *      op = array operation written in terms of `a`, `b`, `c` and `scalar`
 *
 * Returns:
 *      For every working-set size, its size in bytes divided by the smallest
 *      batch time in nanoseconds.
 */
float[4] getThroughput(T, string op)()
{
    enum KiB = 1024;
    enum MiB = 1024 * KiB;

    // Element count of each operand array: 40 MiB worth of elements plus 64.
    enum N = (40 * MiB + 64 * T.sizeof) / T.sizeof;
    auto a = Array!T(N), b = Array!T(N), c = Array!T(N);
    __gshared T s = 2; // scalar, use __gshared to avoid const-folding

    // Working-set sizes expressed in elements.
    size_t[4] lengths = [
        8 * KiB / T.sizeof, 32 * KiB / T.sizeof, 512 * KiB / T.sizeof, 32 * MiB / T.sizeof
    ];
    float[4] best = float.max;

    // Statement that gets timed. "scalar" is substituted first because it
    // contains the letters 'a' and 'c'.
    enum stmt = op
        .replace("scalar", "s")
        .replace("a", "a[off .. off + len]")
        .replace("b", "b[off .. off + len]")
        .replace("c", "c[off .. off + len]");

    foreach (idx, total; lengths)
    {
        auto len = total / 64;
        foreach (rep; 0 .. 3)
        {
            // Reset the operands before every batch.
            a[] = 24;
            b[] = 4;
            c[] = 2;
            auto sw = StopWatch(AutoStart.yes);
            foreach (k; size_t(0) .. size_t(64))
            {
                // Consecutive slices are separated by a one element gap.
                immutable off = k * len + k;
                mixin(stmt ~ ";");
            }
            immutable elapsed = sw.peek.total!"nsecs";
            // The integer to float conversion runs with FPU exceptions masked.
            runMasked({ best[idx] = min(best[idx], elapsed); });
        }
    }

    float[4] result = void;
    // The division runs with FPU exceptions masked as well.
    runMasked({ result = T.sizeof * lengths[] / best[]; });
    return result;
}
|
|
|
|
/**
 * Builds the list of array operations to benchmark.
 *
 * For every assignment operator (`+=`, `-=`, `*=`, `/=`) the list contains
 * the forms `a op= b` and `a op= scalar`, followed by `a op= b op2 c` and
 * `a op= b op2 scalar` for every binary operator `op2`.
 *
 * Returns: the 40 operation strings in generation order.
 */
string[] genOps()
{
    static immutable string[] operators = ["+", "-", "*", "/"];

    string[] result;
    foreach (assignOp; operators)
    {
        string lhs = "a " ~ assignOp ~ "= ";
        result ~= lhs ~ "b";
        result ~= lhs ~ "scalar";
        foreach (binOp; operators)
        {
            string rhs = "b " ~ binOp ~ " ";
            result ~= lhs ~ rhs ~ "c";
            result ~= lhs ~ rhs ~ "scalar";
        }
    }
    return result;
}
|
|
|
|
/**
 * Benchmarks `op` for every element type and writes one comma separated
 * line per type: type name, operation, the six latencies and the four
 * throughputs.
 */
void runOp(string op)()
{
    alias ElementTypes = AliasSeq!(ubyte, ushort, uint, ulong, byte, short, int, long,
            float, double);

    foreach (T; ElementTypes)
    {
        auto latencies = getLatencies!(T, op);
        auto throughputs = getThroughput!(T, op);
        writefln("%s, %s, %(%.2f, %), %(%s, %)", T.stringof, op, latencies, throughputs);
    }
}
|
|
|
|
/**
 * Owning wrapper around a `malloc`ed block holding `n` elements of `T`.
 *
 * The elements are not initialised. The block is released with `free` when
 * the struct is destroyed. Through `alias this` an `Array` can be used
 * wherever the slice `ary` could be.
 */
struct Array(T)
{
    import core.exception : onOutOfMemoryError;
    import core.stdc.stdlib : free, malloc;

    /// Allocates uninitialised storage for `n` elements.
    this(size_t n)
    {
        auto p = cast(T*) malloc(T.sizeof * n);
        // malloc(0) may legitimately return null, so only a failed non-empty
        // request is treated as an error.
        if (p is null && n != 0)
            onOutOfMemoryError();
        ary = p[0 .. n];
    }

    // The destructor frees `ary`, so a copy would free the same block twice.
    @disable this(this);

    /// Releases the storage; `free(null)` is a no-op for a default instance.
    ~this()
    {
        free(ary.ptr);
    }

    /// The allocated elements.
    T[] ary;
    alias ary this;
}
|
|
|
|
// Select the SSE implementation of the FPU exception helpers on x86 targets;
// any other target is rejected at compile time.
version (X86)
{
    version = SSE;
}
else version (X86_64)
{
    version = SSE;
}
else
{
    static assert(0, "unimplemented");
}
|
|
|
|
version (SSE)
{
    // Bit layout of the MXCSR register, see
    // http://softpixel.com/~cwright/programming/simd/sse.php
    enum FPU_EXCEPTION_FLAGS = 0x3F; // bits 0 .. 5
    enum FPU_EXCEPTION_MASKS = FPU_EXCEPTION_FLAGS << 7; // bits 7 .. 12

    /// Returns the current contents of the MXCSR register.
    uint mxcsr()
    {
        uint current = void;
        asm
        {
            stmxcsr current;
        }
        return current;
    }

    /// Loads `val` into the MXCSR register.
    void mxcsr(uint val)
    {
        asm
        {
            ldmxcsr val;
        }
    }

    /// Sets all exception mask bits.
    void maskFPUExceptions()
    {
        mxcsr(mxcsr() | FPU_EXCEPTION_MASKS);
    }

    /// Clears all exception mask bits.
    void unmaskFPUExceptions()
    {
        mxcsr(mxcsr() & ~FPU_EXCEPTION_MASKS);
    }

    /// Returns the exception flag bits that are currently set.
    uint FPUExceptionFlags()
    {
        return mxcsr() & FPU_EXCEPTION_FLAGS;
    }

    /// Resets all exception flag bits.
    void clearFPUExceptionFlags()
    {
        mxcsr(mxcsr() & ~FPU_EXCEPTION_FLAGS);
    }
}
|
|
|
|
/**
 * Calls `dg` with the FPU exceptions masked.
 *
 * No exception flag may be set on entry. After `dg` has run, the exception
 * flags are cleared and the exceptions are unmasked again; this also
 * happens when `dg` throws.
 *
 * Params:
 *      dg = code to run while the exceptions are masked
 */
void runMasked(scope void delegate() dg)
{
    assert(FPUExceptionFlags == 0);
    maskFPUExceptions;
    // Restore the unmasked state on every exit path. Without this, a throwing
    // `dg` would leave the exceptions masked and stale flags set, tripping
    // the assert on the next call.
    scope (exit)
    {
        clearFPUExceptionFlags;
        unmaskFPUExceptions;
    }
    dg();
}
|
|
|
|
/**
 * Prints the header line, then benchmarks every operation produced by
 * `genOps`. FPU exceptions are unmasked for the duration of the run and
 * masked again before returning.
 */
void main()
{
    unmaskFPUExceptions;

    // Header columns: latency1 .. latency32 and one throughput column per
    // working-set size.
    auto sliceLengths = iota(6).map!(i => 1 << i);
    auto workingSets = ["8KB", "32KB", "512KB", "32768KB"];
    writefln("type, op, %(latency%s, %), %-(throughput%s, %)", sliceLengths, workingSets);

    // The operation list is turned into a compile-time sequence so that each
    // operation can be passed to runOp as a template argument.
    enum opList = "AliasSeq!(%(%s, %))".format(genOps);
    foreach (op; mixin(opList))
        runOp!op;

    maskFPUExceptions;
}
|