/**
 * Benchmark for array ops.
 *
 * Copyright: Copyright Martin Nowak 2016 -.
 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Authors:   Martin Nowak
 */
import core.cpuid, std.algorithm, std.meta, std.stdio, std.string, std.range;
import std.datetime.stopwatch : benchmark, StopWatch, AutoStart;

/**
 * Rewrites an abstract array-op expression such as `"a += b * scalar"` into
 * the statement text that the benchmark loops mix in.
 *
 * `scalar` becomes the variable `s`; the operands `a`, `b` and `c` become the
 * slices `x[off .. off + len]`. The generated text therefore requires the
 * names `a`, `b`, `c`, `s`, `off` and `len` to be in scope at the mixin site.
 *
 * "scalar" must be replaced first, because it contains the letters 'a' and
 * 'c' that the later replacements would otherwise hit.
 *
 * Params:
 *     op = expression built from `a`, `b`, `c`, `scalar` and operators
 * Returns: the expression with all operands substituted (no trailing `;`)
 */
private string sliceOperands(string op)
{
    return op
        .replace("scalar", "s")
        .replace("a", "a[off .. off + len]")
        .replace("b", "b[off .. off + len]")
        .replace("c", "c[off .. off + len]");
}

/**
 * Times `op` on short slices of 1, 2, 4, 8, 16 and 32 elements of type `T`.
 *
 * For every slice length the op is executed 64 times (on disjoint slices)
 * per timed run; the run is repeated 31 times and the minimum is kept.
 *
 * Returns: for each of the six slice lengths, the minimum measured
 *          nanoseconds of one 64-op run divided by 1024.
 */
float[6] getLatencies(T, string op)()
{
    enum N = (64 * (1 << 6) + 64) * T.sizeof;
    auto a = Array!T(N), b = Array!T(N), c = Array!T(N);
    float[6] latencies = float.max;
    foreach (i, ref latency; latencies)
    {
        auto len = 1 << i;
        foreach (_; 1 .. 32)
        {
            // Reset the operands so repeated runs start from identical data.
            a[] = 24;
            b[] = 4;
            c[] = 2;
            __gshared T s = 2; // scalar, use __gshared to avoid const-folding
            auto sw = StopWatch(AutoStart.yes);
            foreach (k; size_t(0) .. size_t(64))
            {
                // Slices [off .. off + len] are spaced len + 1 apart, so the
                // 64 operations never touch the same elements.
                immutable size_t off = k * len + k;
                mixin(sliceOperands(op) ~ ";");
            }
            latency = min(latency, sw.peek.total!"nsecs");
        }
    }
    float[6] res = latencies[] / 1024;
    return res;
}

/**
 * Measures the throughput of `op` for working sets of 8 KB, 32 KB, 512 KB
 * and 32 MB per operand array of element type `T`.
 *
 * Each timed run applies the op to 64 slices that together cover the working
 * set; the run is repeated 3 times and the minimum time is kept.
 *
 * Returns: for each working-set size, `T.sizeof * elements / nanoseconds`,
 *          using the minimum measured time.
 */
float[4] getThroughput(T, string op)()
{
    enum N = (40 * 1024 * 1024 + 64 * T.sizeof) / T.sizeof;
    auto a = Array!T(N), b = Array!T(N), c = Array!T(N);
    float[4] latencies = float.max;
    size_t[4] lengths = [
        8 * 1024 / T.sizeof,
        32 * 1024 / T.sizeof,
        512 * 1024 / T.sizeof,
        32 * 1024 * 1024 / T.sizeof
    ];
    foreach (i, ref latency; latencies)
    {
        auto len = lengths[i] / 64;
        foreach (_; 1 .. 4)
        {
            // Reset the operands so repeated runs start from identical data.
            a[] = 24;
            b[] = 4;
            c[] = 2;
            __gshared T s = 2; // scalar, use __gshared to avoid const-folding
            auto sw = StopWatch(AutoStart.yes);
            foreach (k; size_t(0) .. size_t(64))
            {
                // Slices [off .. off + len] are spaced len + 1 apart, so the
                // 64 operations never touch the same elements.
                immutable size_t off = k * len + k;
                mixin(sliceOperands(op) ~ ";");
            }
            immutable nsecs = sw.peek.total!"nsecs";
            // NOTE(review): run masked presumably because the integer -> float
            // conversion may be inexact while main() has FP exceptions unmasked.
            runMasked({latency = min(latency, nsecs);});
        }
    }
    float[4] throughputs = void;
    runMasked({throughputs = T.sizeof * lengths[] / latencies[];});
    return throughputs;
}

/**
 * Generates the list of benchmarked expressions.
 *
 * For every assignment operator `+=`, `-=`, `*=`, `/=` this yields
 * `a op= b`, `a op= scalar`, and for every binary operator `+ - * /`
 * additionally `a op= b op2 c` and `a op= b op2 scalar`.
 *
 * Returns: the 40 expression strings, grouped by assignment operator
 */
string[] genOps()
{
    string[] ops;
    foreach (op1; ["+", "-", "*", "/"])
    {
        ops ~= "a " ~ op1 ~ "= b";
        ops ~= "a " ~ op1 ~ "= scalar";
        foreach (op2; ["+", "-", "*", "/"])
        {
            ops ~= "a " ~ op1 ~ "= b " ~ op2 ~ " c";
            ops ~= "a " ~ op1 ~ "= b " ~ op2 ~ " scalar";
        }
    }
    return ops;
}

/**
 * Benchmarks `op` for every supported element type and prints one CSV row
 * per type: type name, op, six latency values, four throughput values.
 */
void runOp(string op)()
{
    foreach (T; AliasSeq!(ubyte, ushort, uint, ulong, byte, short, int, long, float, double))
        writefln("%s, %s, %(%.2f, %), %(%s, %)", T.stringof, op,
            getLatencies!(T, op), getThroughput!(T, op));
}

/**
 * Minimal owning array backed by `malloc`/`free`.
 *
 * Behaves like a `T[]` through `alias this`. The elements are left
 * uninitialized by the constructor.
 */
struct Array(T)
{
    import core.stdc.stdlib : free, malloc;

    /**
     * Allocates room for `n` elements.
     *
     * Calls `onOutOfMemoryError` if the allocation fails instead of slicing
     * a null pointer.
     */
    this(size_t n)
    {
        import core.exception : onOutOfMemoryError;

        auto p = cast(T*) malloc(T.sizeof * n);
        // malloc(0) is allowed to return null, so only n != 0 is a failure.
        if (p is null && n != 0)
            onOutOfMemoryError();
        ary = p[0 .. n];
    }

    // The destructor frees `ary`; a bitwise copy would free it twice.
    @disable this(this);

    ~this()
    {
        free(ary.ptr);
    }

    T[] ary;
    alias ary this;
}

version (X86)
    version = SSE;
else version (X86_64)
    version = SSE;
else
    static assert(0, "unimplemented");

version (SSE)
{
    /// Reads the MXCSR control/status register.
    uint mxcsr()
    {
        uint ret = void;
        asm
        {
            stmxcsr ret;
        }
        return ret;
    }

    /// Writes the MXCSR control/status register.
    void mxcsr(uint val)
    {
        asm
        {
            ldmxcsr val;
        }
    }

    // http://softpixel.com/~cwright/programming/simd/sse.php
    /// MXCSR bits 7 .. 12: exception mask bits.
    enum FPU_EXCEPTION_MASKS = 1 << 12 | 1 << 11 | 1 << 10 | 1 << 9 | 1 << 8 | 1 << 7;
    /// MXCSR bits 0 .. 5: exception status flags.
    enum FPU_EXCEPTION_FLAGS = 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0;

    /// Sets all exception mask bits in MXCSR.
    void maskFPUExceptions()
    {
        mxcsr = mxcsr | FPU_EXCEPTION_MASKS;
    }

    /// Clears all exception mask bits in MXCSR.
    void unmaskFPUExceptions()
    {
        mxcsr = mxcsr & ~FPU_EXCEPTION_MASKS;
    }

    /// Returns the currently set exception status flags of MXCSR.
    uint FPUExceptionFlags()
    {
        return mxcsr & FPU_EXCEPTION_FLAGS;
    }

    /// Clears all exception status flags in MXCSR.
    void clearFPUExceptionFlags()
    {
        mxcsr = mxcsr & ~FPU_EXCEPTION_FLAGS;
    }
}

/**
 * Runs `dg` with all MXCSR exception mask bits set, then clears the
 * exception flags and clears the mask bits again.
 *
 * The restore happens in a `scope (exit)` so that a throwing delegate cannot
 * leave the exceptions masked or stale flags behind.
 *
 * Params:
 *     dg = code to run while the exception mask bits are set
 */
void runMasked(scope void delegate() dg)
{
    assert(FPUExceptionFlags == 0);
    maskFPUExceptions;
    scope (exit)
    {
        // Clear the flags first so that unmasking does not expose stale ones.
        clearFPUExceptionFlags;
        unmaskFPUExceptions;
    }
    dg();
}

/**
 * Prints the CSV header and then one row per (type, op) combination.
 *
 * The MXCSR exception mask bits are cleared for the duration of the
 * benchmark and set again at the end.
 */
void main()
{
    unmaskFPUExceptions;

    writefln("type, op, %(latency%s, %), %-(throughput%s, %)",
        iota(6).map!(i => 1 << i), ["8KB", "32KB", "512KB", "32768KB"]);
    // Turn the op list into a compile-time sequence so each op can be passed
    // as a template argument.
    foreach (op; mixin("AliasSeq!(%(%s, %))".format(genOps)))
        runOp!op;

    maskFPUExceptions;
}