mirror of
https://github.com/dlang/phobos.git
synced 2025-05-11 23:05:34 +03:00
2305 lines
45 KiB
D
2305 lines
45 KiB
D
|
|
/***************************
|
|
* D programming language http://www.digitalmars.com/d/
|
|
* Runtime support for float array operations.
|
|
* Based on code originally written by Burton Radons.
|
|
* Placed in public domain.
|
|
*/
|
|
|
|
import std.cpuid;
|
|
|
|
version (unittest)
{
    /* Test-only feature switches.
     *
     * `cpuid` is stepped from 0 up to (but excluding) CPUID_MAX by the unit
     * tests.  Each wrapper below reports its instruction set as available for
     * exactly one value of `cpuid`, and only if the hardware really has it.
     * With cpuid == 0 every wrapper returns false.
     */
    int cpuid;
    const int CPUID_MAX = 5;

    bool mmx()
    {
        if (cpuid != 1)
            return false;
        return std.cpuid.mmx();
    }

    bool sse()
    {
        if (cpuid != 2)
            return false;
        return std.cpuid.sse();
    }

    bool sse2()
    {
        if (cpuid != 3)
            return false;
        return std.cpuid.sse2();
    }

    bool amd3dnow()
    {
        if (cpuid != 4)
            return false;
        return std.cpuid.amd3dnow();
    }
}
else
{
    /* Outside of unit tests the feature queries forward straight to
     * std.cpuid.
     */
    import std.cpuid;

    alias std.cpuid.mmx mmx;
    alias std.cpuid.sse sse;
    alias std.cpuid.sse2 sse2;
    alias std.cpuid.amd3dnow amd3dnow;
}
|
|
|
|
//version = log;
|
|
|
|
/* Returns true when slices a and b do not share any element, i.e. when one
 * of them ends at or before the address where the other begins.
 */
bool disjoint(T)(T[] a, T[] b)
{
    auto aStop = a.ptr + a.length;      // one past the last element of a
    if (aStop <= b.ptr)
        return true;                    // a lies entirely before b

    auto bStop = b.ptr + b.length;      // one past the last element of b
    return bStop <= a.ptr;              // b lies entirely before a?
}
|
|
|
|
alias float T;
|
|
|
|
extern (C):
|
|
|
|
/* ======================================================================== */
|
|
|
|
/***********************
 * Computes:
 *      a[] = b[] + c[]
 *
 * Note the parameter order: destination first, then the right operand c,
 * then the left operand b.
 *
 * Params:
 *      a = destination slice
 *      c = right operand
 *      b = left operand
 * Returns:
 *      a
 */

T[] _arraySliceSliceAddSliceAssign_f(T[] a, T[] c, T[] b)
in
{
    // Equal lengths, and no two of the three slices may overlap.
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceAddSliceAssign_f()\n");
    auto dst = a.ptr;
    auto dstEnd = dst + a.length;
    auto lhs = b.ptr;
    auto rhs = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE path; noted by the original author as 834% faster
        if (sse() && b.length >= 16)
        {
            version (log) printf("\tsse unaligned\n");

            // End of the part that is a whole number of 16-float groups
            auto simdEnd = dst + (b.length & ~15);

            // Unaligned loads/stores, 16 floats (64 bytes) per iteration
            asm
            {
                mov EAX, lhs;           // left operand
                mov ECX, rhs;           // right operand
                mov ESI, dst;           // destination operand
                mov EDI, simdEnd;       // end comparison

                align 8;
            sseLoop:
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                movups XMM4, [ECX];
                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                add ESI, 64;
                addps XMM0, XMM4;
                addps XMM1, XMM5;
                addps XMM2, XMM6;
                addps XMM3, XMM7;
                add ECX, 64;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb sseLoop;

                // Hand the advanced pointers back for the scalar tail
                mov dst, ESI;
                mov lhs, EAX;
                mov rhs, ECX;
            }
        }
        else
        // 3DNow! path; noted by the original author as only 13% faster
        if (amd3dnow() && b.length >= 8)
        {
            version (log) printf("\tamd3dnow\n");

            // End of the part that is a whole number of 8-float groups
            auto simdEnd = dst + (b.length & ~7);

            // 8 floats (32 bytes) per iteration
            asm
            {
                mov ESI, dst;           // destination operand
                mov EDI, simdEnd;       // end comparison
                mov EAX, lhs;           // left operand
                mov ECX, rhs;           // right operand

                align 4;
            loop3dnow:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfadd MM0, [ECX];
                pfadd MM1, [ECX+8];
                pfadd MM2, [ECX+16];
                pfadd MM3, [ECX+24];
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ECX, 32;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb loop3dnow;

                emms;
                mov dst, ESI;
                mov lhs, EAX;
                mov rhs, ECX;
            }
        }
    }

    // Whatever the vector paths left over is done one element at a time
    version (log) if (dst < dstEnd) printf("\tbase\n");
    for (; dst < dstEnd; ++dst, ++lhs, ++rhs)
        *dst = *lhs + *rhs;

    return a;
}
|
|
|
|
|
|
// Checks c[] = a[] + b[] for every cpuid setting, with the slices first
// starting at the allocation and then shifted by one element.
unittest
{
    printf("_arraySliceSliceAddSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int skew = 0; skew < 2; ++skew)
        {
            const int dim = 67;

            // skew == 1 drops the first element, misaligning the slice
            T[] a = (new T[dim + skew])[skew .. dim + skew];
            T[] b = (new T[dim + skew])[skew .. dim + skew];
            T[] c = (new T[dim + skew])[skew .. dim + skew];

            for (int k = 0; k < dim; ++k)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            c[] = a[] + b[];

            for (int k = 0; k < dim; ++k)
            {
                if (c[k] != cast(T)(a[k] + b[k]))
                {
                    printf("[%d]: %g != %g + %g\n", k, c[k], a[k], b[k]);
                    assert(0);
                }
            }
        }
    }
}
|
|
|
|
/* ======================================================================== */
|
|
|
|
/***********************
 * Computes:
 *      a[] = b[] - c[]
 *
 * Note the parameter order: destination first, then the right operand c,
 * then the left operand b.
 *
 * Params:
 *      a = destination slice
 *      c = right operand (subtrahend)
 *      b = left operand (minuend)
 * Returns:
 *      a
 */

T[] _arraySliceSliceMinSliceAssign_f(T[] a, T[] c, T[] b)
in
{
    // Equal lengths, and no two of the three slices may overlap.
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto dst = a.ptr;
    auto dstEnd = dst + a.length;
    auto lhs = b.ptr;
    auto rhs = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE path; noted by the original author as 834% faster
        if (sse() && b.length >= 16)
        {
            // End of the part that is a whole number of 16-float groups
            auto simdEnd = dst + (b.length & ~15);

            // Unaligned loads/stores, 16 floats (64 bytes) per iteration
            asm
            {
                mov EAX, lhs;           // left operand
                mov ECX, rhs;           // right operand
                mov ESI, dst;           // destination operand
                mov EDI, simdEnd;       // end comparison

                align 8;
            sseLoop:
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                movups XMM4, [ECX];
                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                add ESI, 64;
                subps XMM0, XMM4;
                subps XMM1, XMM5;
                subps XMM2, XMM6;
                subps XMM3, XMM7;
                add ECX, 64;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb sseLoop;

                // Hand the advanced pointers back for the scalar tail
                mov dst, ESI;
                mov lhs, EAX;
                mov rhs, ECX;
            }
        }
        else
        // 3DNow! path; noted by the original author as only 13% faster
        if (amd3dnow() && b.length >= 8)
        {
            // End of the part that is a whole number of 8-float groups
            auto simdEnd = dst + (b.length & ~7);

            // 8 floats (32 bytes) per iteration
            asm
            {
                mov ESI, dst;           // destination operand
                mov EDI, simdEnd;       // end comparison
                mov EAX, lhs;           // left operand
                mov ECX, rhs;           // right operand

                align 4;
            loop3dnow:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfsub MM0, [ECX];
                pfsub MM1, [ECX+8];
                pfsub MM2, [ECX+16];
                pfsub MM3, [ECX+24];
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ECX, 32;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb loop3dnow;

                emms;
                mov dst, ESI;
                mov lhs, EAX;
                mov rhs, ECX;
            }
        }
    }

    // Whatever the vector paths left over is done one element at a time
    for (; dst < dstEnd; ++dst, ++lhs, ++rhs)
        *dst = *lhs - *rhs;

    return a;
}
|
|
|
|
|
|
// Checks c[] = a[] - b[] for every cpuid setting, with the slices first
// starting at the allocation and then shifted by one element.
unittest
{
    printf("_arraySliceSliceMinSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];
            a = a[j .. dim + j];        // j == 1 misaligns the slice by one element
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    // Format fixed: the original had a stray 'd' after the
                    // second %g, which printed e.g. "3d - 10".
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
|
|
|
|
/* ======================================================================== */
|
|
|
|
/***********************
 * Computes:
 *      a[] = b[] + value
 *
 * Params:
 *      a     = destination slice
 *      value = scalar added to every element of b
 *      b     = source slice
 * Returns:
 *      a
 */

T[] _arraySliceExpAddSliceAssign_f(T[] a, T value, T[] b)
in
{
    // Same length, and source and destination may not overlap.
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_f()\n");
    auto dst = a.ptr;
    auto dstEnd = dst + a.length;
    auto src = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE path; noted by the original author as 665% faster
        if (sse() && a.length >= 16)
        {
            // End of the part that is a whole number of 16-float groups
            auto simdEnd = dst + (a.length & ~15);

            // Unaligned loads/stores, 16 floats (64 bytes) per iteration
            asm
            {
                mov EAX, src;
                mov ESI, dst;
                mov EDI, simdEnd;
                movss XMM4, value;
                shufps XMM4, XMM4, 0;   // replicate value into all four lanes

                align 8;
            sseLoop:
                add ESI, 64;
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                addps XMM0, XMM4;
                addps XMM1, XMM4;
                addps XMM2, XMM4;
                addps XMM3, XMM4;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb sseLoop;

                // Hand the advanced pointers back for the scalar tail
                mov dst, ESI;
                mov src, EAX;
            }
        }
        else
        // 3DNow! path; noted by the original author as 69% faster
        if (amd3dnow() && a.length >= 8)
        {
            // End of the part that is a whole number of 8-float groups
            auto simdEnd = dst + (a.length & ~7);

            // Two copies of value's bit pattern packed into 64 bits
            ulong bits = *cast(uint *) &value;
            ulong pair = bits | (bits << 32L);

            // 8 floats (32 bytes) per iteration
            asm
            {
                mov ESI, dst;
                mov EDI, simdEnd;
                mov EAX, src;
                movq MM4, qword ptr [pair];

                align 8;
            loop3dnow:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfadd MM0, MM4;
                pfadd MM1, MM4;
                pfadd MM2, MM4;
                pfadd MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb loop3dnow;

                emms;
                mov dst, ESI;
                mov src, EAX;
            }
        }
    }

    // Whatever the vector paths left over is done one element at a time
    for (; dst < dstEnd; ++dst, ++src)
        *dst = *src + value;

    return a;
}
|
|
|
|
// Checks c[] = a[] + 6 for every cpuid setting, with the slices first
// starting at the allocation and then shifted by one element.
unittest
{
    printf("_arraySliceExpAddSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int skew = 0; skew < 2; ++skew)
        {
            const int dim = 67;

            // skew == 1 drops the first element, misaligning the slice
            T[] a = (new T[dim + skew])[skew .. dim + skew];
            T[] b = (new T[dim + skew])[skew .. dim + skew];
            T[] c = (new T[dim + skew])[skew .. dim + skew];

            for (int k = 0; k < dim; ++k)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            c[] = a[] + 6;

            for (int k = 0; k < dim; ++k)
            {
                if (c[k] != cast(T)(a[k] + 6))
                {
                    printf("[%d]: %g != %g + 6\n", k, c[k], a[k]);
                    assert(0);
                }
            }
        }
    }
}
|
|
|
|
/* ======================================================================== */
|
|
|
|
/***********************
 * Computes:
 *      a[] += value
 *
 * Params:
 *      a     = slice updated in place
 *      value = scalar added to every element
 * Returns:
 *      a
 */

T[] _arrayExpSliceAddass_f(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE version is 302% faster
        if (sse() && a.length >= 16)
        {
            // Scalar prologue: step up to the next 16-byte boundary so that
            // the aligned moves (movaps) below are legal.
            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
            while (aptr < n)
                *aptr++ += value;

            // The loop below consumes 16 floats (64 bytes) per iteration and
            // only tests its bound afterwards, so it may be given whole
            // groups of 16 floats only.  Rounding aend down to a 16-byte
            // boundary, as was done previously, let the final (or even the
            // first) iteration write past the end of the slice whenever the
            // aligned remainder was not a multiple of 64 bytes.
            n = aptr + ((aend - aptr) & ~15);
            if (aptr < n)

            // Aligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, value;
                shufps XMM4, XMM4, 0;   // replicate value into all four lanes

                align 8;
            startsseloopa:
                movaps XMM0, [ESI];
                movaps XMM1, [ESI+16];
                movaps XMM2, [ESI+32];
                movaps XMM3, [ESI+48];
                add ESI, 64;
                addps XMM0, XMM4;
                addps XMM1, XMM4;
                addps XMM2, XMM4;
                addps XMM3, XMM4;
                movaps [ESI+ 0-64], XMM0;
                movaps [ESI+16-64], XMM1;
                movaps [ESI+32-64], XMM2;
                movaps [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
        else
        // 3DNow! version is 63% faster
        if (amd3dnow() && a.length >= 8)
        {
            // End of the part that is a whole number of 8-float groups
            auto n = aptr + (a.length & ~7);

            // Two copies of value's bit pattern packed into 64 bits
            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

            // 8 floats (32 bytes) per iteration
            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [v];

                align 8;
            start3dnow:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfadd MM0, MM4;
                pfadd MM1, MM4;
                pfadd MM2, MM4;
                pfadd MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                cmp ESI, EDI;
                jb start3dnow;

                emms;
                mov dword ptr [aptr], ESI;
            }
        }
    }

    // Whatever the vector paths left over is done one element at a time
    while (aptr < aend)
        *aptr++ += value;

    return a;
}
|
|
|
|
// Checks c[] += 6 against a saved copy of c, for every cpuid setting, with
// the slices first starting at the allocation and then shifted by one element.
unittest
{
    printf("_arrayExpSliceAddass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int skew = 0; skew < 2; ++skew)
        {
            const int dim = 67;

            // skew == 1 drops the first element, misaligning the slice
            T[] a = (new T[dim + skew])[skew .. dim + skew];
            T[] b = (new T[dim + skew])[skew .. dim + skew];
            T[] c = (new T[dim + skew])[skew .. dim + skew];

            for (int k = 0; k < dim; ++k)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            a[] = c[];          // keep the values from before the update
            c[] += 6;

            for (int k = 0; k < dim; ++k)
            {
                if (c[k] != cast(T)(a[k] + 6))
                {
                    printf("[%d]: %g != %g + 6\n", k, c[k], a[k]);
                    assert(0);
                }
            }
        }
    }
}
|
|
|
|
/* ======================================================================== */
|
|
|
|
/***********************
 * Computes:
 *      a[] += b[]
 *
 * Params:
 *      a = slice updated in place
 *      b = slice whose elements are added
 * Returns:
 *      a
 */

T[] _arraySliceSliceAddass_f(T[] a, T[] b)
in
{
    // Same length, and the two slices may not overlap.
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_f()\n");
    auto dst = a.ptr;
    auto dstEnd = dst + a.length;
    auto src = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE path; noted by the original author as 468% faster
        if (sse() && a.length >= 16)
        {
            // End of the part that is a whole number of 16-float groups
            auto simdEnd = dst + (a.length & ~15);

            // Unaligned loads/stores, 16 floats (64 bytes) per iteration
            asm
            {
                mov ECX, src;           // right operand
                mov ESI, dst;           // destination operand
                mov EDI, simdEnd;       // end comparison

                align 8;
            sseLoop:
                movups XMM0, [ESI];
                movups XMM1, [ESI+16];
                movups XMM2, [ESI+32];
                movups XMM3, [ESI+48];
                add ESI, 64;
                movups XMM4, [ECX];
                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                add ECX, 64;
                addps XMM0, XMM4;
                addps XMM1, XMM5;
                addps XMM2, XMM6;
                addps XMM3, XMM7;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb sseLoop;

                // Hand the advanced pointers back for the scalar tail
                mov dst, ESI;
                mov src, ECX;
            }
        }
        else
        // 3DNow! path; noted by the original author as 57% faster
        if (amd3dnow() && a.length >= 8)
        {
            // End of the part that is a whole number of 8-float groups
            auto simdEnd = dst + (a.length & ~7);

            // 8 floats (32 bytes) per iteration
            asm
            {
                mov ESI, dword ptr [dst];       // destination operand
                mov EDI, dword ptr [simdEnd];   // end comparison
                mov ECX, dword ptr [src];       // right operand

                align 4;
            loop3dnow:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfadd MM0, [ECX];
                pfadd MM1, [ECX+8];
                pfadd MM2, [ECX+16];
                pfadd MM3, [ECX+24];
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add ECX, 32;
                cmp ESI, EDI;
                jb loop3dnow;

                emms;
                mov dword ptr [dst], ESI;
                mov dword ptr [src], ECX;
            }
        }
    }

    // Whatever the vector paths left over is done one element at a time
    for (; dst < dstEnd; ++dst, ++src)
        *dst += *src;

    return a;
}
|
|
|
|
// Checks c[] += b[] against a saved copy of c, for every cpuid setting, with
// the slices first starting at the allocation and then shifted by one element.
unittest
{
    printf("_arraySliceSliceAddass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int skew = 0; skew < 2; ++skew)
        {
            const int dim = 67;

            // skew == 1 drops the first element, misaligning the slice
            T[] a = (new T[dim + skew])[skew .. dim + skew];
            T[] b = (new T[dim + skew])[skew .. dim + skew];
            T[] c = (new T[dim + skew])[skew .. dim + skew];

            for (int k = 0; k < dim; ++k)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            a[] = c[];          // keep the values from before the update
            c[] += b[];

            for (int k = 0; k < dim; ++k)
            {
                if (c[k] != cast(T)(a[k] + b[k]))
                {
                    printf("[%d]: %g != %g + %g\n", k, c[k], a[k], b[k]);
                    assert(0);
                }
            }
        }
    }
}
|
|
|
|
/* ======================================================================== */
|
|
|
|
/***********************
 * Computes:
 *      a[] = b[] - value
 *
 * Params:
 *      a     = destination slice
 *      value = scalar subtracted from every element of b
 *      b     = source slice
 * Returns:
 *      a
 */

T[] _arraySliceExpMinSliceAssign_f(T[] a, T value, T[] b)
in
{
    // Same length, and source and destination may not overlap.
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_f()\n");
    auto dst = a.ptr;
    auto dstEnd = dst + a.length;
    auto src = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE path; noted by the original author as 622% faster
        if (sse() && a.length >= 16)
        {
            // End of the part that is a whole number of 16-float groups
            auto simdEnd = dst + (a.length & ~15);

            // Unaligned loads/stores, 16 floats (64 bytes) per iteration
            asm
            {
                mov EAX, src;
                mov ESI, dst;
                mov EDI, simdEnd;
                movss XMM4, value;
                shufps XMM4, XMM4, 0;   // replicate value into all four lanes

                align 8;
            sseLoop:
                add ESI, 64;
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                subps XMM0, XMM4;
                subps XMM1, XMM4;
                subps XMM2, XMM4;
                subps XMM3, XMM4;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb sseLoop;

                // Hand the advanced pointers back for the scalar tail
                mov dst, ESI;
                mov src, EAX;
            }
        }
        else
        // 3DNow! path; noted by the original author as 67% faster
        if (amd3dnow() && a.length >= 8)
        {
            // End of the part that is a whole number of 8-float groups
            auto simdEnd = dst + (a.length & ~7);

            // Two copies of value, loaded as one 64-bit operand below
            T[2] pair;

            pair[0] = pair[1] = value;

            // 8 floats (32 bytes) per iteration
            asm
            {
                mov ESI, dword ptr [dst];
                mov EDI, dword ptr [simdEnd];
                mov EAX, dword ptr [src];
                movq MM4, qword ptr [pair];

                align 8;
            loop3dnow:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfsub MM0, MM4;
                pfsub MM1, MM4;
                pfsub MM2, MM4;
                pfsub MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb loop3dnow;

                emms;
                mov dword ptr [dst], ESI;
                mov dword ptr [src], EAX;
            }
        }
    }

    // Whatever the vector paths left over is done one element at a time
    for (; dst < dstEnd; ++dst, ++src)
        *dst = *src - value;

    return a;
}
|
|
|
|
// Checks c[] = a[] - 6 for every cpuid setting, with the slices first
// starting at the allocation and then shifted by one element.
unittest
{
    printf("_arraySliceExpMinSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int skew = 0; skew < 2; ++skew)
        {
            const int dim = 67;

            // skew == 1 drops the first element, misaligning the slice
            T[] a = (new T[dim + skew])[skew .. dim + skew];
            T[] b = (new T[dim + skew])[skew .. dim + skew];
            T[] c = (new T[dim + skew])[skew .. dim + skew];

            for (int k = 0; k < dim; ++k)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            c[] = a[] - 6;

            for (int k = 0; k < dim; ++k)
            {
                if (c[k] != cast(T)(a[k] - 6))
                {
                    printf("[%d]: %g != %g - 6\n", k, c[k], a[k]);
                    assert(0);
                }
            }
        }
    }
}
|
|
|
|
/* ======================================================================== */
|
|
|
|
/***********************
 * Computes:
 *      a[] = value - b[]
 *
 * Params:
 *      a     = destination slice
 *      b     = source slice, subtracted from value element by element
 *      value = scalar minuend
 * Returns:
 *      a
 */

T[] _arrayExpSliceMinSliceAssign_f(T[] a, T[] b, T value)
in
{
    // Same length, and source and destination may not overlap.
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_f()\n");
    auto dst = a.ptr;
    auto dstEnd = dst + a.length;
    auto src = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE path; noted by the original author as 690% faster
        if (sse() && a.length >= 16)
        {
            // End of the part that is a whole number of 16-float groups
            auto simdEnd = dst + (a.length & ~15);

            // Unaligned loads/stores, 16 floats (64 bytes) per iteration
            asm
            {
                mov EAX, src;
                mov ESI, dst;
                mov EDI, simdEnd;
                movss XMM4, value;
                shufps XMM4, XMM4, 0;   // replicate value into all four lanes

                align 8;
            sseLoop:
                add ESI, 64;
                // XMM5/XMM6 receive fresh copies of value, because the
                // subtraction overwrites its left operand.
                movaps XMM5, XMM4;
                movaps XMM6, XMM4;
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                subps XMM5, XMM0;
                subps XMM6, XMM1;
                movups [ESI+ 0-64], XMM5;
                movups [ESI+16-64], XMM6;
                movaps XMM5, XMM4;
                movaps XMM6, XMM4;
                subps XMM5, XMM2;
                subps XMM6, XMM3;
                movups [ESI+32-64], XMM5;
                movups [ESI+48-64], XMM6;
                cmp ESI, EDI;
                jb sseLoop;

                // Hand the advanced pointers back for the scalar tail
                mov dst, ESI;
                mov src, EAX;
            }
        }
        else
        // 3DNow! path; noted by the original author as 67% faster
        if (amd3dnow() && a.length >= 8)
        {
            // End of the part that is a whole number of 8-float groups
            auto simdEnd = dst + (a.length & ~7);

            // Two copies of value's bit pattern packed into 64 bits
            ulong bits = *cast(uint *) &value;
            ulong pair = bits | (bits << 32L);

            // 8 floats (32 bytes) per iteration
            asm
            {
                mov ESI, dst;
                mov EDI, simdEnd;
                mov EAX, src;
                movq MM4, qword ptr [pair];

                align 8;
            loop3dnow:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                // reverse subtract: MMn = MM4 - MMn
                pfsubr MM0, MM4;
                pfsubr MM1, MM4;
                pfsubr MM2, MM4;
                pfsubr MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb loop3dnow;

                emms;
                mov dst, ESI;
                mov src, EAX;
            }
        }
    }

    // Whatever the vector paths left over is done one element at a time
    for (; dst < dstEnd; ++dst, ++src)
        *dst = value - *src;

    return a;
}
|
|
|
|
// Checks c[] = 6 - a[] for every cpuid setting, with the slices first
// starting at the allocation and then shifted by one element.
unittest
{
    printf("_arrayExpSliceMinSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int skew = 0; skew < 2; ++skew)
        {
            const int dim = 67;

            // skew == 1 drops the first element, misaligning the slice
            T[] a = (new T[dim + skew])[skew .. dim + skew];
            T[] b = (new T[dim + skew])[skew .. dim + skew];
            T[] c = (new T[dim + skew])[skew .. dim + skew];

            for (int k = 0; k < dim; ++k)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            c[] = 6 - a[];

            for (int k = 0; k < dim; ++k)
            {
                if (c[k] != cast(T)(6 - a[k]))
                {
                    printf("[%d]: %g != 6 - %g\n", k, c[k], a[k]);
                    assert(0);
                }
            }
        }
    }
}
|
|
|
|
/* ======================================================================== */
|
|
|
|
/***********************
 * Computes:
 *      a[] -= value
 *
 * Params:
 *      a     = slice updated in place
 *      value = scalar subtracted from every element
 * Returns:
 *      a
 */

T[] _arrayExpSliceMinass_f(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE version is 304% faster
        if (sse() && a.length >= 16)
        {
            // Scalar prologue: step up to the next 16-byte boundary so that
            // the aligned moves (movaps) below are legal.
            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
            while (aptr < n)
                *aptr++ -= value;

            // The loop below consumes 16 floats (64 bytes) per iteration and
            // only tests its bound afterwards, so it may be given whole
            // groups of 16 floats only.  Rounding aend down to a 16-byte
            // boundary, as was done previously, let the final (or even the
            // first) iteration write past the end of the slice whenever the
            // aligned remainder was not a multiple of 64 bytes.
            n = aptr + ((aend - aptr) & ~15);
            if (aptr < n)

            // Aligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, value;
                shufps XMM4, XMM4, 0;   // replicate value into all four lanes

                align 8;
            startsseloopa:
                movaps XMM0, [ESI];
                movaps XMM1, [ESI+16];
                movaps XMM2, [ESI+32];
                movaps XMM3, [ESI+48];
                add ESI, 64;
                subps XMM0, XMM4;
                subps XMM1, XMM4;
                subps XMM2, XMM4;
                subps XMM3, XMM4;
                movaps [ESI+ 0-64], XMM0;
                movaps [ESI+16-64], XMM1;
                movaps [ESI+32-64], XMM2;
                movaps [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
        else
        // 3DNow! version is 63% faster
        if (amd3dnow() && a.length >= 8)
        {
            // End of the part that is a whole number of 8-float groups
            auto n = aptr + (a.length & ~7);

            // Two copies of value's bit pattern packed into 64 bits
            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

            // 8 floats (32 bytes) per iteration
            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [v];

                align 8;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfsub MM0, MM4;
                pfsub MM1, MM4;
                pfsub MM2, MM4;
                pfsub MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                cmp ESI, EDI;
                jb start;

                emms;
                mov dword ptr [aptr], ESI;
            }
        }
    }

    // Whatever the vector paths left over is done one element at a time
    while (aptr < aend)
        *aptr++ -= value;

    return a;
}
|
|
|
|
// Checks c[] -= 6 against a saved copy of c, for every cpuid setting, with
// the slices first starting at the allocation and then shifted by one element.
unittest
{
    // Banner corrected to the real function name (was "_arrayExpSliceminass_f").
    printf("_arrayExpSliceMinass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];
            a = a[j .. dim + j];        // j == 1 misaligns the slice by one element
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];          // keep the values from before the update
            c[] -= 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
|
|
|
|
/* ======================================================================== */
|
|
|
|
/***********************
 * Computes:
 *      a[] -= b[]
 *
 * Params:
 *      a = slice updated in place
 *      b = slice whose elements are subtracted
 * Returns:
 *      a
 */

T[] _arraySliceSliceMinass_f(T[] a, T[] b)
in
{
    // Same length, and the two slices may not overlap.
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_f()\n");
    auto dst = a.ptr;
    auto dstEnd = dst + a.length;
    auto src = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE path; noted by the original author as 468% faster
        if (sse() && a.length >= 16)
        {
            // End of the part that is a whole number of 16-float groups
            auto simdEnd = dst + (a.length & ~15);

            // Unaligned loads/stores, 16 floats (64 bytes) per iteration
            asm
            {
                mov ECX, src;           // right operand
                mov ESI, dst;           // destination operand
                mov EDI, simdEnd;       // end comparison

                align 8;
            sseLoop:
                movups XMM0, [ESI];
                movups XMM1, [ESI+16];
                movups XMM2, [ESI+32];
                movups XMM3, [ESI+48];
                add ESI, 64;
                movups XMM4, [ECX];
                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                add ECX, 64;
                subps XMM0, XMM4;
                subps XMM1, XMM5;
                subps XMM2, XMM6;
                subps XMM3, XMM7;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb sseLoop;

                // Hand the advanced pointers back for the scalar tail
                mov dst, ESI;
                mov src, ECX;
            }
        }
        else
        // 3DNow! path; noted by the original author as 57% faster
        if (amd3dnow() && a.length >= 8)
        {
            // End of the part that is a whole number of 8-float groups
            auto simdEnd = dst + (a.length & ~7);

            // 8 floats (32 bytes) per iteration
            asm
            {
                mov ESI, dword ptr [dst];       // destination operand
                mov EDI, dword ptr [simdEnd];   // end comparison
                mov ECX, dword ptr [src];       // right operand

                align 4;
            loop3dnow:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfsub MM0, [ECX];
                pfsub MM1, [ECX+8];
                pfsub MM2, [ECX+16];
                pfsub MM3, [ECX+24];
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add ECX, 32;
                cmp ESI, EDI;
                jb loop3dnow;

                emms;
                mov dst, ESI;
                mov src, ECX;
            }
        }
    }

    // Whatever the vector paths left over is done one element at a time
    for (; dst < dstEnd; ++dst, ++src)
        *dst -= *src;

    return a;
}
|
|
|
|
// Checks c[] -= b[] against a saved copy of c, for every cpuid setting, with
// the slices first starting at the allocation and then shifted by one element.
//
// This test previously duplicated the scalar test (c[] -= 6, with the
// "_arrayExpSliceMinass_f" banner), so the slice-minus-slice operation was
// never exercised.
unittest
{
    printf("_arraySliceSliceMinass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];
            a = a[j .. dim + j];        // j == 1 misaligns the slice by one element
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];          // keep the values from before the update
            c[] -= b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
|
|
|
|
/* ======================================================================== */
|
|
|
|
/***********************
 * Computes:
 *      a[] = b[] * value
 *
 * Params:
 *      a     = destination slice
 *      value = scalar every element of b is multiplied by
 *      b     = source slice
 * Returns:
 *      a
 */

T[] _arraySliceExpMulSliceAssign_f(T[] a, T value, T[] b)
in
{
    // Same length, and source and destination may not overlap.
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMulSliceAssign_f()\n");
    auto dst = a.ptr;
    auto dstEnd = dst + a.length;
    auto src = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE path; noted by the original author as 607% faster
        if (sse() && a.length >= 16)
        {
            // End of the part that is a whole number of 16-float groups
            auto simdEnd = dst + (a.length & ~15);

            // Unaligned loads/stores, 16 floats (64 bytes) per iteration
            asm
            {
                mov EAX, src;
                mov ESI, dst;
                mov EDI, simdEnd;
                movss XMM4, value;
                shufps XMM4, XMM4, 0;   // replicate value into all four lanes

                align 8;
            sseLoop:
                add ESI, 64;
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                mulps XMM0, XMM4;
                mulps XMM1, XMM4;
                mulps XMM2, XMM4;
                mulps XMM3, XMM4;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb sseLoop;

                // Hand the advanced pointers back for the scalar tail
                mov dst, ESI;
                mov src, EAX;
            }
        }
        else
        // 3DNow! path; noted by the original author as 69% faster
        if (amd3dnow() && a.length >= 8)
        {
            // End of the part that is a whole number of 8-float groups
            auto simdEnd = dst + (a.length & ~7);

            // Two copies of value's bit pattern packed into 64 bits
            ulong bits = *cast(uint *) &value;
            ulong pair = bits | (bits << 32L);

            // 8 floats (32 bytes) per iteration
            asm
            {
                mov ESI, dword ptr [dst];
                mov EDI, dword ptr [simdEnd];
                mov EAX, dword ptr [src];
                movq MM4, qword ptr [pair];

                align 8;
            loop3dnow:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfmul MM0, MM4;
                pfmul MM1, MM4;
                pfmul MM2, MM4;
                pfmul MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb loop3dnow;

                emms;
                mov dst, ESI;
                mov src, EAX;
            }
        }
    }

    // Whatever the vector paths left over is done one element at a time
    for (; dst < dstEnd; ++dst, ++src)
        *dst = *src * value;

    return a;
}
|
|
|
|
// Checks c[] = a[] * 6 for every cpuid setting, with the slices first
// starting at the allocation and then shifted by one element.
unittest
{
    printf("_arraySliceExpMulSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int skew = 0; skew < 2; ++skew)
        {
            const int dim = 67;

            // skew == 1 drops the first element, misaligning the slice
            T[] a = (new T[dim + skew])[skew .. dim + skew];
            T[] b = (new T[dim + skew])[skew .. dim + skew];
            T[] c = (new T[dim + skew])[skew .. dim + skew];

            for (int k = 0; k < dim; ++k)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            c[] = a[] * 6;

            for (int k = 0; k < dim; ++k)
            {
                if (c[k] != cast(T)(a[k] * 6))
                {
                    printf("[%d]: %g != %g * 6\n", k, c[k], a[k]);
                    assert(0);
                }
            }
        }
    }
}
|
|
|
|
/* ======================================================================== */
|
|
|
|
/***********************
 * Computes:
 *      a[] = b[] * c[]
 *
 * Note the parameter order: destination first, then the right operand c,
 * then the left operand b.
 *
 * Params:
 *      a = destination slice
 *      c = right operand
 *      b = left operand
 * Returns:
 *      a
 */

T[] _arraySliceSliceMulSliceAssign_f(T[] a, T[] c, T[] b)
in
{
    // Equal lengths, and no two of the three slices may overlap.
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceMulSliceAssign_f()\n");
    auto dst = a.ptr;
    auto dstEnd = dst + a.length;
    auto lhs = b.ptr;
    auto rhs = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE path; noted by the original author as 833% faster
        if (sse() && a.length >= 16)
        {
            // End of the part that is a whole number of 16-float groups
            auto simdEnd = dst + (a.length & ~15);

            // Unaligned loads/stores, 16 floats (64 bytes) per iteration
            asm
            {
                mov EAX, lhs;           // left operand
                mov ECX, rhs;           // right operand
                mov ESI, dst;           // destination operand
                mov EDI, simdEnd;       // end comparison

                align 8;
            sseLoop:
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add ESI, 64;
                movups XMM4, [ECX];
                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                add EAX, 64;
                mulps XMM0, XMM4;
                mulps XMM1, XMM5;
                mulps XMM2, XMM6;
                mulps XMM3, XMM7;
                add ECX, 64;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb sseLoop;

                // Hand the advanced pointers back for the scalar tail
                mov dst, ESI;
                mov lhs, EAX;
                mov rhs, ECX;
            }
        }
        else
        // 3DNow! path; noted by the original author as only 13% faster
        if (amd3dnow() && a.length >= 8)
        {
            // End of the part that is a whole number of 8-float groups
            auto simdEnd = dst + (a.length & ~7);

            // 8 floats (32 bytes) per iteration
            asm
            {
                mov ESI, dword ptr [dst];       // destination operand
                mov EDI, dword ptr [simdEnd];   // end comparison
                mov EAX, dword ptr [lhs];       // left operand
                mov ECX, dword ptr [rhs];       // right operand

                align 4;
            loop3dnow:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfmul MM0, [ECX];
                pfmul MM1, [ECX+8];
                pfmul MM2, [ECX+16];
                pfmul MM3, [ECX+24];
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ECX, 32;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb loop3dnow;

                emms;
                mov dst, ESI;
                mov lhs, EAX;
                mov rhs, ECX;
            }
        }
    }

    // Whatever the vector paths left over is done one element at a time
    for (; dst < dstEnd; ++dst, ++lhs, ++rhs)
        *dst = *lhs * *rhs;

    return a;
}
|
|
|
|
unittest
{
    printf("_arraySliceSliceMulSliceAssign_f unittest\n");

    const int dim = 67;

    // Run once per simulated CPU variant
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // offset 0: buffers as allocated (aligned on 16 byte boundary);
        // offset 1: shifted by one element to misalign them
        for (int offset = 0; offset < 2; ++offset)
        {
            T[] a = (new T[dim + offset])[offset .. dim + offset];
            T[] b = (new T[dim + offset])[offset .. dim + offset];
            T[] c = (new T[dim + offset])[offset .. dim + offset];

            for (int k = 0; k < dim; ++k)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            c[] = a[] * b[];

            for (int k = 0; k < dim; ++k)
            {
                if (c[k] == cast(T)(a[k] * b[k]))
                    continue;

                printf("[%d]: %g != %g * %g\n", k, c[k], a[k], b[k]);
                assert(0);
            }
        }
    }
}
|
|
|
|
/* ======================================================================== */
|
|
|
|
/***********************
 * Scales a float slice in place:
 *      a[] *= value
 *
 * Params:
 *      a     = slice whose elements are multiplied in place
 *      value = scalar multiplier
 * Returns:
 *      a
 */

T[] _arrayExpSliceMulass_f(T[] a, T value)
{
    //printf("_arrayExpSliceMulass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE version is 303% faster
        if (sse() && a.length >= 16)
        {
            // Scalar head: advance to the next 16 byte boundary so the
            // vector loop can use movaps. The pointer moves in T.sizeof
            // steps, so this lands on the boundary only when a.ptr is
            // itself T-aligned (then at most 3 elements are processed here).
            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
            while (aptr < n)
                *aptr++ *= value;

            // The vector loop consumes 16 floats (64 bytes) per pass and
            // tests its bound only afterwards, so the bound must lie a whole
            // number of 64 byte chunks past aptr. Rounding aend down to a
            // 16 byte boundary (as was done previously) let the last pass
            // run up to 48 bytes beyond the end of the slice.
            n = aptr + ((aend - aptr) & ~15);
            if (aptr < n)
            {
                // Aligned case
                asm
                {
                    mov     ESI, aptr;
                    mov     EDI, n;
                    movss   XMM4, value;
                    shufps  XMM4, XMM4, 0;      // broadcast value to all 4 lanes

                    align 8;
                startsseloopa:
                    movaps  XMM0, [ESI];
                    movaps  XMM1, [ESI+16];
                    movaps  XMM2, [ESI+32];
                    movaps  XMM3, [ESI+48];
                    add     ESI, 64;
                    mulps   XMM0, XMM4;
                    mulps   XMM1, XMM4;
                    mulps   XMM2, XMM4;
                    mulps   XMM3, XMM4;
                    movaps  [ESI+ 0-64], XMM0;
                    movaps  [ESI+16-64], XMM1;
                    movaps  [ESI+32-64], XMM2;
                    movaps  [ESI+48-64], XMM3;
                    cmp     ESI, EDI;
                    jb      startsseloopa;

                    mov     aptr, ESI;          // hand the cursor to the scalar tail
                }
            }
        }
        else
        // 3DNow! version is 63% faster
        if (amd3dnow() && a.length >= 8)
        {
            // 8 floats (32 bytes) per pass
            auto n = aptr + (a.length & ~7);

            // Duplicate the bit pattern of value into both halves of a qword
            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

            asm
            {
                mov     ESI, dword ptr [aptr];
                mov     EDI, dword ptr [n];
                movq    MM4, qword ptr [v];

                align 8;
            start:
                movq    MM0, [ESI];
                movq    MM1, [ESI+8];
                movq    MM2, [ESI+16];
                movq    MM3, [ESI+24];
                pfmul   MM0, MM4;
                pfmul   MM1, MM4;
                pfmul   MM2, MM4;
                pfmul   MM3, MM4;
                movq    [ESI], MM0;
                movq    [ESI+8], MM1;
                movq    [ESI+16], MM2;
                movq    [ESI+24], MM3;
                add     ESI, 32;
                cmp     ESI, EDI;
                jb      start;

                emms;                           // done with the MMX registers
                mov     dword ptr [aptr], ESI;
            }
        }
    }

    // Scalar tail (or the whole slice when no vector path was taken)
    while (aptr < aend)
        *aptr++ *= value;

    return a;
}
|
|
|
|
unittest
{
    printf("_arrayExpSliceMulass_f unittest\n");

    const int dim = 67;

    // Run once per simulated CPU variant
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // offset 0: buffers as allocated (aligned on 16 byte boundary);
        // offset 1: shifted by one element to misalign them
        for (int offset = 0; offset < 2; ++offset)
        {
            T[] a = (new T[dim + offset])[offset .. dim + offset];
            T[] b = (new T[dim + offset])[offset .. dim + offset];
            T[] c = (new T[dim + offset])[offset .. dim + offset];

            for (int k = 0; k < dim; ++k)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            // keep the unscaled values in a, then scale c in place
            a[] = c[];
            c[] *= 6;

            for (int k = 0; k < dim; ++k)
            {
                if (c[k] == cast(T)(a[k] * 6))
                    continue;

                printf("[%d]: %g != %g * 6\n", k, c[k], a[k]);
                assert(0);
            }
        }
    }
}
|
|
|
|
/* ======================================================================== */
|
|
|
|
/***********************
 * In-place element-wise multiply of one float slice by another:
 *      a[] *= b[]
 *
 * Params:
 *      a = destination and left operand
 *      b = right operand
 * Returns:
 *      a
 */

T[] _arraySliceSliceMulass_f(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMulass_f()\n");
    auto dst    = a.ptr;
    auto rhs    = b.ptr;
    auto dstEnd = dst + a.length;

    version (D_InlineAsm_X86)
    {
        if (sse() && a.length >= 16)
        {
            // SSE version is 525% faster.
            // One pass handles 16 floats (64 bytes).
            auto simdEnd = dst + (a.length & ~15);

            // Unaligned case: every load and store uses movups
            asm
            {
                mov     ESI, dst;               // destination cursor
                mov     EDI, simdEnd;           // loop bound
                mov     ECX, rhs;               // right operand cursor

                align 8;
            sseLoop:
                movups  XMM0, [ESI];
                movups  XMM1, [ESI+16];
                movups  XMM2, [ESI+32];
                movups  XMM3, [ESI+48];
                add     ESI, 64;
                movups  XMM4, [ECX];
                movups  XMM5, [ECX+16];
                movups  XMM6, [ECX+32];
                movups  XMM7, [ECX+48];
                add     ECX, 64;
                mulps   XMM0, XMM4;
                mulps   XMM1, XMM5;
                mulps   XMM2, XMM6;
                mulps   XMM3, XMM7;
                // ESI has already been advanced, hence the negative offsets
                movups  [ESI-64], XMM0;
                movups  [ESI-48], XMM1;
                movups  [ESI-32], XMM2;
                movups  [ESI-16], XMM3;
                cmp     ESI, EDI;
                jb      sseLoop;

                // publish the cursors for the scalar tail below
                mov     dst, ESI;
                mov     rhs, ECX;
            }
        }
        else if (amd3dnow() && a.length >= 8)
        {
            // 3DNow! version is 57% faster.
            // One pass handles 8 floats (32 bytes).
            auto simdEnd = dst + (a.length & ~7);

            asm
            {
                mov     ESI, dst;               // destination cursor
                mov     EDI, simdEnd;           // loop bound
                mov     ECX, rhs;               // right operand cursor

                align 4;
            amdLoop:
                movq    MM0, [ESI];
                movq    MM1, [ESI+8];
                movq    MM2, [ESI+16];
                movq    MM3, [ESI+24];
                pfmul   MM0, [ECX];
                pfmul   MM1, [ECX+8];
                pfmul   MM2, [ECX+16];
                pfmul   MM3, [ECX+24];
                movq    [ESI], MM0;
                movq    [ESI+8], MM1;
                movq    [ESI+16], MM2;
                movq    [ESI+24], MM3;
                add     ESI, 32;
                add     ECX, 32;
                cmp     ESI, EDI;
                jb      amdLoop;

                emms;                           // done with the MMX registers
                mov     dst, ESI;
                mov     rhs, ECX;
            }
        }
    }

    // Scalar tail (or the whole slice when no vector path was taken)
    for (; dst < dstEnd; ++dst, ++rhs)
        *dst *= *rhs;

    return a;
}
|
|
|
|
unittest
{
    // Exercises a[] *= b[] (slice-by-slice multiply-assign). This test used
    // to be a verbatim copy of the scalar `c[] *= 6` test, which left
    // _arraySliceSliceMulass_f without any coverage.
    printf("_arraySliceSliceMulass_f unittest\n");

    // Run once per simulated CPU variant
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // keep the original values of c in a, then multiply c by b in place
            a[] = c[];
            c[] *= b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
|
|
|
|
/* ======================================================================== */
|
|
|
|
/***********************
 * Divides every element of a float slice by a scalar and stores the
 * result in a destination slice:
 *      a[] = b[] / value
 *
 * The division is carried out as a multiplication by 1 / value.
 *
 * Params:
 *      a     = destination
 *      value = scalar divisor
 *      b     = dividend slice
 * Returns:
 *      a
 */

T[] _arraySliceExpDivSliceAssign_f(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpDivSliceAssign_f()\n");
    auto dst    = a.ptr;
    auto src    = b.ptr;
    auto dstEnd = dst + a.length;

    /* Multiplying by the reciprocal is faster than dividing, at the
     * price of a less accurate answer.
     */
    T inverse = cast(T)1 / value;

    version (D_InlineAsm_X86)
    {
        if (sse() && a.length >= 16)
        {
            // SSE version is 587% faster.
            // One pass handles 16 floats (64 bytes).
            auto simdEnd = dst + (a.length & ~15);

            // Unaligned case: every load and store uses movups
            asm
            {
                mov     EAX, src;               // source cursor
                mov     ESI, dst;               // destination cursor
                mov     EDI, simdEnd;           // loop bound
                movss   XMM4, inverse;
                shufps  XMM4, XMM4, 0;          // broadcast 1/value to all 4 lanes

                align 8;
            sseLoop:
                add     ESI, 64;
                movups  XMM0, [EAX];
                movups  XMM1, [EAX+16];
                movups  XMM2, [EAX+32];
                movups  XMM3, [EAX+48];
                add     EAX, 64;
                mulps   XMM0, XMM4;
                mulps   XMM1, XMM4;
                mulps   XMM2, XMM4;
                mulps   XMM3, XMM4;
                // ESI has already been advanced, hence the negative offsets
                movups  [ESI-64], XMM0;
                movups  [ESI-48], XMM1;
                movups  [ESI-32], XMM2;
                movups  [ESI-16], XMM3;
                cmp     ESI, EDI;
                jb      sseLoop;

                // publish the cursors for the scalar tail below
                mov     dst, ESI;
                mov     src, EAX;
            }
        }
        else if (amd3dnow() && a.length >= 8)
        {
            // 3DNow! version is 72% faster.
            // One pass handles 8 floats (32 bytes).
            auto simdEnd = dst + (a.length & ~7);

            // Two copies of the reciprocal, loaded as one qword below
            T[2] pair = void;
            pair[0] = pair[1] = inverse;

            asm
            {
                mov     ESI, dst;               // destination cursor
                mov     EDI, simdEnd;           // loop bound
                mov     EAX, src;               // source cursor
                movq    MM4, qword ptr [pair];

                align 8;
            amdLoop:
                movq    MM0, [EAX];
                movq    MM1, [EAX+8];
                movq    MM2, [EAX+16];
                movq    MM3, [EAX+24];
                pfmul   MM0, MM4;
                pfmul   MM1, MM4;
                pfmul   MM2, MM4;
                pfmul   MM3, MM4;
                movq    [ESI], MM0;
                movq    [ESI+8], MM1;
                movq    [ESI+16], MM2;
                movq    [ESI+24], MM3;
                add     ESI, 32;
                add     EAX, 32;
                cmp     ESI, EDI;
                jb      amdLoop;

                emms;                           // done with the MMX registers
                mov     dst, ESI;
                mov     src, EAX;
            }
        }
    }

    // Scalar tail (or the whole slice when no vector path was taken)
    for (; dst < dstEnd; ++dst, ++src)
        *dst = *src * inverse;

    return a;
}
|
|
|
|
unittest
{
    printf("_arraySliceExpDivSliceAssign_f unittest\n");

    const int dim = 67;

    // Run once per simulated CPU variant
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // offset 0: buffers as allocated (aligned on 16 byte boundary);
        // offset 1: shifted by one element to misalign them
        for (int offset = 0; offset < 2; ++offset)
        {
            T[] a = (new T[dim + offset])[offset .. dim + offset];
            T[] b = (new T[dim + offset])[offset .. dim + offset];
            T[] c = (new T[dim + offset])[offset .. dim + offset];

            for (int k = 0; k < dim; ++k)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            c[] = a[] / 8;

            for (int k = 0; k < dim; ++k)
            {
                if (c[k] == cast(T)(a[k] / 8))
                    continue;

                printf("[%d]: %g != %g / 8\n", k, c[k], a[k]);
                assert(0);
            }
        }
    }
}
|
|
|
|
/* ======================================================================== */
|
|
|
|
/***********************
 * Divides a float slice in place by a scalar:
 *      a[] /= value
 *
 * The division is carried out as a multiplication by 1 / value.
 *
 * Params:
 *      a     = slice whose elements are divided in place
 *      value = scalar divisor
 * Returns:
 *      a
 */

T[] _arrayExpSliceDivass_f(T[] a, T value)
{
    //printf("_arrayExpSliceDivass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    /* Multiplying by the reciprocal is faster, but does
     * not produce as accurate an answer.
     */
    T recip = cast(T)1 / value;

    version (D_InlineAsm_X86)
    {
        // SSE version is 245% faster
        if (sse() && a.length >= 16)
        {
            // Scalar head: advance to the next 16 byte boundary so the
            // vector loop can use movaps. The pointer moves in T.sizeof
            // steps, so this lands on the boundary only when a.ptr is
            // itself T-aligned (then at most 3 elements are processed here).
            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
            while (aptr < n)
                *aptr++ *= recip;

            // The vector loop consumes 16 floats (64 bytes) per pass and
            // tests its bound only afterwards, so the bound must lie a whole
            // number of 64 byte chunks past aptr. Rounding aend down to a
            // 16 byte boundary (as was done previously) let the last pass
            // run up to 48 bytes beyond the end of the slice.
            n = aptr + ((aend - aptr) & ~15);
            if (aptr < n)
            {
                // Aligned case
                asm
                {
                    mov     ESI, aptr;
                    mov     EDI, n;
                    movss   XMM4, recip;
                    //movss XMM4, value
                    //rcpss XMM4, XMM4
                    shufps  XMM4, XMM4, 0;      // broadcast recip to all 4 lanes

                    align 8;
                startsseloopa:
                    movaps  XMM0, [ESI];
                    movaps  XMM1, [ESI+16];
                    movaps  XMM2, [ESI+32];
                    movaps  XMM3, [ESI+48];
                    add     ESI, 64;
                    mulps   XMM0, XMM4;
                    mulps   XMM1, XMM4;
                    mulps   XMM2, XMM4;
                    mulps   XMM3, XMM4;
                    //divps XMM0, XMM4;
                    //divps XMM1, XMM4;
                    //divps XMM2, XMM4;
                    //divps XMM3, XMM4;
                    movaps  [ESI+ 0-64], XMM0;
                    movaps  [ESI+16-64], XMM1;
                    movaps  [ESI+32-64], XMM2;
                    movaps  [ESI+48-64], XMM3;
                    cmp     ESI, EDI;
                    jb      startsseloopa;

                    mov     aptr, ESI;          // hand the cursor to the scalar tail
                }
            }
        }
        else
        // 3DNow! version is 57% faster
        if (amd3dnow() && a.length >= 8)
        {
            // 8 floats (32 bytes) per pass
            auto n = aptr + (a.length & ~7);

            // Two copies of the reciprocal, loaded as one qword below
            T[2] w = void;

            w[0] = w[1] = recip;

            asm
            {
                mov     ESI, dword ptr [aptr];
                mov     EDI, dword ptr [n];
                movq    MM4, qword ptr [w];

                align 8;
            start:
                movq    MM0, [ESI];
                movq    MM1, [ESI+8];
                movq    MM2, [ESI+16];
                movq    MM3, [ESI+24];
                pfmul   MM0, MM4;
                pfmul   MM1, MM4;
                pfmul   MM2, MM4;
                pfmul   MM3, MM4;
                movq    [ESI], MM0;
                movq    [ESI+8], MM1;
                movq    [ESI+16], MM2;
                movq    [ESI+24], MM3;
                add     ESI, 32;
                cmp     ESI, EDI;
                jb      start;

                emms;                           // done with the MMX registers
                mov     dword ptr [aptr], ESI;
            }
        }
    }

    // Scalar tail (or the whole slice when no vector path was taken)
    while (aptr < aend)
        *aptr++ *= recip;

    return a;
}
|
|
|
|
unittest
{
    printf("_arrayExpSliceDivass_f unittest\n");

    const int dim = 67;

    // Run once per simulated CPU variant
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // offset 0: buffers as allocated (aligned on 16 byte boundary);
        // offset 1: shifted by one element to misalign them
        for (int offset = 0; offset < 2; ++offset)
        {
            T[] a = (new T[dim + offset])[offset .. dim + offset];
            T[] b = (new T[dim + offset])[offset .. dim + offset];
            T[] c = (new T[dim + offset])[offset .. dim + offset];

            for (int k = 0; k < dim; ++k)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            // keep the undivided values in a, then divide c in place
            a[] = c[];
            c[] /= 8;

            for (int k = 0; k < dim; ++k)
            {
                if (c[k] == cast(T)(a[k] / 8))
                    continue;

                printf("[%d]: %g != %g / 8\n", k, c[k], a[k]);
                assert(0);
            }
        }
    }
}
|
|
|
|
|
|
/* ======================================================================== */
|
|
|
|
/***********************
 * Subtracts a scaled float slice from another in place:
 *      a[] -= b[] * value
 *
 * Forwards to _arraySliceExpMulSliceAddass_f with the scalar negated.
 *
 * Params:
 *      a     = slice updated in place
 *      value = scalar factor applied to b
 *      b     = slice that is scaled and subtracted
 * Returns:
 *      whatever _arraySliceExpMulSliceAddass_f returns
 */

T[] _arraySliceExpMulSliceMinass_f(T[] a, T value, T[] b)
{
    // a -= b * value  is computed as  a += b * (-value)
    T negated = -value;
    return _arraySliceExpMulSliceAddass_f(a, negated, b);
}
|
|
|
|
/***********************
 * Adds a scaled float slice to another in place:
 *      a[] += b[] * value
 *
 * Params:
 *      a     = slice updated in place
 *      value = scalar factor applied to b
 *      b     = slice that is scaled and added
 * Returns:
 *      a
 */

T[] _arraySliceExpMulSliceAddass_f(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    auto dst = a.ptr;
    auto src = b.ptr;

    // Plain scalar loop over the whole slice; this routine has no
    // vectorised path in front of it.
    for (auto dstEnd = dst + a.length; dst < dstEnd; ++dst, ++src)
        *dst += *src * value;

    return a;
}
|
|
|
|
unittest
{
    printf("_arraySliceExpMulSliceAddass_f unittest\n");

    const int dim = 67;

    // Only a single CPU setting is used for this test
    cpuid = 1;
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // Single pass with offset 0; the shifted (offset 1) layout used by
        // the other tests in this file is not run here.
        for (int offset = 0; offset < 1; ++offset)
        {
            T[] a = (new T[dim + offset])[offset .. dim + offset];
            T[] b = (new T[dim + offset])[offset .. dim + offset];
            T[] c = (new T[dim + offset])[offset .. dim + offset];

            for (int k = 0; k < dim; ++k)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            // keep the original values of c in b, then accumulate into c
            b[] = c[];
            c[] += a[] * 6;

            for (int k = 0; k < dim; ++k)
            {
                //printf("[%d]: %g ?= %g + %g * 6\n", k, c[k], b[k], a[k]);
                if (c[k] == cast(T)(b[k] + a[k] * 6))
                    continue;

                printf("[%d]: %g ?= %g + %g * 6\n", k, c[k], b[k], a[k]);
                assert(0);
            }
        }
    }
}
|
|
|
|
|