/***************************
 * D programming language http://www.digitalmars.com/d/
 * Runtime support for float array operations.
 * Based on code originally written by Burton Radons.
 * Placed in public domain.
 */

import std.cpuid;
import std.c.stdio;     // printf, used by the unit tests and version (log) tracing

version (unittest)
{
    /* This is so unit tests will test every CPU variant
     */
    int cpuid;
    const int CPUID_MAX = 5;
    bool mmx()      { return cpuid == 1 && std.cpuid.mmx(); }
    bool sse()      { return cpuid == 2 && std.cpuid.sse(); }
    bool sse2()     { return cpuid == 3 && std.cpuid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && std.cpuid.amd3dnow(); }
}
else
{
    alias std.cpuid.mmx mmx;
    alias std.cpuid.sse sse;
    alias std.cpuid.sse2 sse2;
    alias std.cpuid.amd3dnow amd3dnow;
}

//version = log;

bool disjoint(T)(T[] a, T[] b)
{
    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
}

alias float T;

extern (C):

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + c[]
 */

T[] _arraySliceSliceAddSliceAssign_f(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceAddSliceAssign_f()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 834% faster
        if (sse() && b.length >= 16)
        {
            version (log) printf("\tsse unaligned\n");
            auto n = aptr + (b.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                movups XMM4, [ECX];
                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                add ESI, 64;
                addps XMM0, XMM4;
                addps XMM1, XMM5;
                addps XMM2, XMM6;
                addps XMM3, XMM7;
                add ECX, 64;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
        else
        // 3DNow! version is only 13% faster
        if (amd3dnow() && b.length >= 8)
        {
            version (log) printf("\tamd3dnow\n");
            auto n = aptr + (b.length & ~7);

            asm
            {
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand

                align 4;
            start3dnow:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfadd MM0, [ECX];
                pfadd MM1, [ECX+8];
                pfadd MM2, [ECX+16];
                pfadd MM3, [ECX+24];
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ECX, 32;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb start3dnow;

                emms;   // leave MMX state clean for subsequent x87 code
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Handle remainder
    version (log) if (aptr < aend) printf("\tbase\n");
    while (aptr < aend)
        *aptr++ = *bptr++ + *cptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceAddSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
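/* Illustrative sketch, not part of the original runtime: how the disjoint()
 * precondition behaves on touching versus overlapping slices. Guarded with
 * version (none) so it is compiled out.
 */
version (none) unittest
{
    T[] buf = new T[8];
    assert(disjoint(buf[0 .. 4], buf[4 .. 8]));     // adjacent, no overlap
    assert(!disjoint(buf[0 .. 5], buf[4 .. 8]));    // both slices share buf[4]
}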
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - c[]
 */

T[] _arraySliceSliceMinSliceAssign_f(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 834% faster
        if (sse() && b.length >= 16)
        {
            auto n = aptr + (b.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                movups XMM4, [ECX];
                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                add ESI, 64;
                subps XMM0, XMM4;
                subps XMM1, XMM5;
                subps XMM2, XMM6;
                subps XMM3, XMM7;
                add ECX, 64;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
        else
        // 3DNow! version is only 13% faster
        if (amd3dnow() && b.length >= 8)
        {
            auto n = aptr + (b.length & ~7);

            asm
            {
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand

                align 4;
            start3dnow:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfsub MM0, [ECX];
                pfsub MM1, [ECX+8];
                pfsub MM2, [ECX+16];
                pfsub MM3, [ECX+24];
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ECX, 32;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb start3dnow;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = *bptr++ - *cptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMinSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
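/* Illustrative sketch, not part of the original runtime: the SSE paths above
 * only vectorize the largest multiple of 16 floats (length & ~15, i.e. four
 * 16-byte XMM registers per loop iteration); the scalar loop finishes the
 * rest. Compiled out via version (none).
 */
version (none) unittest
{
    size_t len = 67;            // same length the unit tests use
    size_t vec = len & ~15;     // 64 elements run through the SIMD loop
    assert(vec == 64);
    assert(len - vec == 3);     // 3 elements are handled by the scalar tail
}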
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + value
 */

T[] _arraySliceExpAddSliceAssign_f(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_f()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 665% faster
        if (sse() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, value;
                shufps XMM4, XMM4, 0;

                align 8;
            startsseloop:
                add ESI, 64;
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                addps XMM0, XMM4;
                addps XMM1, XMM4;
                addps XMM2, XMM4;
                addps XMM3, XMM4;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        else
        // 3DNow! version is 69% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);
            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movq MM4, qword ptr [v];

                align 8;
            start3dnow:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfadd MM0, MM4;
                pfadd MM1, MM4;
                pfadd MM2, MM4;
                pfadd MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb start3dnow;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ + value;

    return a;
}

unittest
{
    printf("_arraySliceExpAddSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += value
 */

T[] _arrayExpSliceAddass_f(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE version is 302% faster
        if (sse() && a.length >= 16)
        {
            // align pointer
            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
            while (aptr < n)
                *aptr++ += value;
            n = cast(T*)((cast(uint)aend) & ~15);
            if (aptr < n)

                // Aligned case
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movss XMM4, value;
                    shufps XMM4, XMM4, 0;

                    align 8;
                startsseloopa:
                    movaps XMM0, [ESI];
                    movaps XMM1, [ESI+16];
                    movaps XMM2, [ESI+32];
                    movaps XMM3, [ESI+48];
                    add ESI, 64;
                    addps XMM0, XMM4;
                    addps XMM1, XMM4;
                    addps XMM2, XMM4;
                    addps XMM3, XMM4;
                    movaps [ESI+ 0-64], XMM0;
                    movaps [ESI+16-64], XMM1;
                    movaps [ESI+32-64], XMM2;
                    movaps [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsseloopa;

                    mov aptr, ESI;
                }
        }
        else
        // 3DNow! version is 63% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);
            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [v];

                align 8;
            start3dnow:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfadd MM0, MM4;
                pfadd MM1, MM4;
                pfadd MM2, MM4;
                pfadd MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                cmp ESI, EDI;
                jb start3dnow;

                emms;
                mov dword ptr [aptr], ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += value;

    return a;
}

unittest
{
    printf("_arrayExpSliceAddass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] += 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
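/* Illustrative sketch, not part of the original runtime: the 3DNow! paths
 * with a scalar operand broadcast the float into both 32-bit halves of a
 * 64-bit MMX register by duplicating its raw bit pattern, which is what
 * `w | (w << 32)` above computes.
 */
version (none) unittest
{
    T value = 6;
    ulong w = *cast(uint *)&value;  // raw 32-bit pattern of the float
    ulong v = w | (w << 32);        // duplicated into both halves
    assert(cast(uint)v == cast(uint)(v >> 32));
}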
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += b[]
 */

T[] _arraySliceSliceAddass_f(T[] a, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_f()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 468% faster
        if (sse() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movups XMM0, [ESI];
                movups XMM1, [ESI+16];
                movups XMM2, [ESI+32];
                movups XMM3, [ESI+48];
                add ESI, 64;
                movups XMM4, [ECX];
                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                add ECX, 64;
                addps XMM0, XMM4;
                addps XMM1, XMM5;
                addps XMM2, XMM6;
                addps XMM3, XMM7;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
        else
        // 3DNow! version is 57% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, dword ptr [aptr]; // destination operand
                mov EDI, dword ptr [n];    // end comparison
                mov ECX, dword ptr [bptr]; // right operand

                align 4;
            start3dnow:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfadd MM0, [ECX];
                pfadd MM1, [ECX+8];
                pfadd MM2, [ECX+16];
                pfadd MM3, [ECX+24];
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add ECX, 32;
                cmp ESI, EDI;
                jb start3dnow;

                emms;
                mov dword ptr [aptr], ESI;
                mov dword ptr [bptr], ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceAddass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] += b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
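/* Illustrative sketch, not part of the original runtime: the op-assign SSE
 * paths round the destination pointer up to a 16-byte boundary with
 * (addr + 15) & ~15 so the hot loop can use aligned movaps; the unaligned
 * head and the tail go through the scalar loop.
 */
version (none) unittest
{
    uint addr = 0x1004;                 // hypothetical misaligned address
    uint aligned = (addr + 15) & ~15;
    assert(aligned == 0x1010);          // rounded up to the next boundary
    assert((aligned & 15) == 0);        // 16-byte aligned
}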
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - value
 */

T[] _arraySliceExpMinSliceAssign_f(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_f()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 622% faster
        if (sse() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, value;
                shufps XMM4, XMM4, 0;

                align 8;
            startsseloop:
                add ESI, 64;
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                subps XMM0, XMM4;
                subps XMM1, XMM4;
                subps XMM2, XMM4;
                subps XMM3, XMM4;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        else
        // 3DNow! version is 67% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);
            T[2] w;
            w[0] = w[1] = value;

            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                mov EAX, dword ptr [bptr];
                movq MM4, qword ptr [w];

                align 8;
            start3dnow:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfsub MM0, MM4;
                pfsub MM1, MM4;
                pfsub MM2, MM4;
                pfsub MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb start3dnow;

                emms;
                mov dword ptr [aptr], ESI;
                mov dword ptr [bptr], EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ - value;

    return a;
}

unittest
{
    printf("_arraySliceExpMinSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = value - b[]
 */

T[] _arrayExpSliceMinSliceAssign_f(T[] a, T[] b, T value)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_f()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 690% faster
        if (sse() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, value;
                shufps XMM4, XMM4, 0;

                align 8;
            startsseloop:
                add ESI, 64;
                movaps XMM5, XMM4;
                movaps XMM6, XMM4;
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                subps XMM5, XMM0;
                subps XMM6, XMM1;
                movups [ESI+ 0-64], XMM5;
                movups [ESI+16-64], XMM6;
                movaps XMM5, XMM4;
                movaps XMM6, XMM4;
                subps XMM5, XMM2;
                subps XMM6, XMM3;
                movups [ESI+32-64], XMM5;
                movups [ESI+48-64], XMM6;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        else
        // 3DNow! version is 67% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);
            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movq MM4, qword ptr [v];

                align 8;
            start3dnow:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfsubr MM0, MM4;
                pfsubr MM1, MM4;
                pfsubr MM2, MM4;
                pfsubr MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb start3dnow;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = value - *bptr++;

    return a;
}

unittest
{
    printf("_arrayExpSliceMinSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = 6 - a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(6 - a[i]))
                {
                    printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
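/* Illustrative note, not part of the original runtime: value - b[] needs a
 * reversed subtraction. 3DNow! has pfsubr for that, but SSE subps only
 * computes dst - src, which is why the SSE path above copies the broadcast
 * value (movaps XMM5, XMM4) and subtracts the loaded data into the copy.
 */
version (none) unittest
{
    T value = 6, b = 9;
    assert(value - b == -3);    // the reversed operand order both paths compute
}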
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= value
 */

T[] _arrayExpSliceMinass_f(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE version is 304% faster
        if (sse() && a.length >= 16)
        {
            // align pointer
            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
            while (aptr < n)
                *aptr++ -= value;
            n = cast(T*)((cast(uint)aend) & ~15);
            if (aptr < n)

                // Aligned case
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movss XMM4, value;
                    shufps XMM4, XMM4, 0;

                    align 8;
                startsseloopa:
                    movaps XMM0, [ESI];
                    movaps XMM1, [ESI+16];
                    movaps XMM2, [ESI+32];
                    movaps XMM3, [ESI+48];
                    add ESI, 64;
                    subps XMM0, XMM4;
                    subps XMM1, XMM4;
                    subps XMM2, XMM4;
                    subps XMM3, XMM4;
                    movaps [ESI+ 0-64], XMM0;
                    movaps [ESI+16-64], XMM1;
                    movaps [ESI+32-64], XMM2;
                    movaps [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsseloopa;

                    mov aptr, ESI;
                }
        }
        else
        // 3DNow! version is 63% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);
            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [v];

                align 8;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfsub MM0, MM4;
                pfsub MM1, MM4;
                pfsub MM2, MM4;
                pfsub MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                cmp ESI, EDI;
                jb start;

                emms;
                mov dword ptr [aptr], ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= value;

    return a;
}

unittest
{
    printf("_arrayExpSliceMinass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] -= 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[]
 */

T[] _arraySliceSliceMinass_f(T[] a, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_f()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 468% faster
        if (sse() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movups XMM0, [ESI];
                movups XMM1, [ESI+16];
                movups XMM2, [ESI+32];
                movups XMM3, [ESI+48];
                add ESI, 64;
                movups XMM4, [ECX];
                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                add ECX, 64;
                subps XMM0, XMM4;
                subps XMM1, XMM5;
                subps XMM2, XMM6;
                subps XMM3, XMM7;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
        else
        // 3DNow! version is 57% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, dword ptr [aptr]; // destination operand
                mov EDI, dword ptr [n];    // end comparison
                mov ECX, dword ptr [bptr]; // right operand

                align 4;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfsub MM0, [ECX];
                pfsub MM1, [ECX+8];
                pfsub MM2, [ECX+16];
                pfsub MM3, [ECX+24];
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add ECX, 32;
                cmp ESI, EDI;
                jb start;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMinass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] -= b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * value
 */

T[] _arraySliceExpMulSliceAssign_f(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMulSliceAssign_f()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 607% faster
        if (sse() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, value;
                shufps XMM4, XMM4, 0;

                align 8;
            startsseloop:
                add ESI, 64;
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                mulps XMM0, XMM4;
                mulps XMM1, XMM4;
                mulps XMM2, XMM4;
                mulps XMM3, XMM4;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        else
        // 3DNow! version is 69% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);
            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                mov EAX, dword ptr [bptr];
                movq MM4, qword ptr [v];

                align 8;
            start:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfmul MM0, MM4;
                pfmul MM1, MM4;
                pfmul MM2, MM4;
                pfmul MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb start;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ * value;

    return a;
}

unittest
{
    printf("_arraySliceExpMulSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * c[]
 */

T[] _arraySliceSliceMulSliceAssign_f(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceMulSliceAssign_f()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 833% faster
        if (sse() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add ESI, 64;
                movups XMM4, [ECX];
                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                add EAX, 64;
                mulps XMM0, XMM4;
                mulps XMM1, XMM5;
                mulps XMM2, XMM6;
                mulps XMM3, XMM7;
                add ECX, 64;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
        else
        // 3DNow! version is only 13% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, dword ptr [aptr]; // destination operand
                mov EDI, dword ptr [n];    // end comparison
                mov EAX, dword ptr [bptr]; // left operand
                mov ECX, dword ptr [cptr]; // right operand

                align 4;
            start:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfmul MM0, [ECX];
                pfmul MM1, [ECX+8];
                pfmul MM2, [ECX+16];
                pfmul MM3, [ECX+24];
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ECX, 32;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb start;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ * *cptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMulSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= value
 */

T[] _arrayExpSliceMulass_f(T[] a, T value)
{
    //printf("_arrayExpSliceMulass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE version is 303% faster
        if (sse() && a.length >= 16)
        {
            // align pointer
            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
            while (aptr < n)
                *aptr++ *= value;
            n = cast(T*)((cast(uint)aend) & ~15);
            if (aptr < n)

                // Aligned case
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movss XMM4, value;
                    shufps XMM4, XMM4, 0;

                    align 8;
                startsseloopa:
                    movaps XMM0, [ESI];
                    movaps XMM1, [ESI+16];
                    movaps XMM2, [ESI+32];
                    movaps XMM3, [ESI+48];
                    add ESI, 64;
                    mulps XMM0, XMM4;
                    mulps XMM1, XMM4;
                    mulps XMM2, XMM4;
                    mulps XMM3, XMM4;
                    movaps [ESI+ 0-64], XMM0;
                    movaps [ESI+16-64], XMM1;
                    movaps [ESI+32-64], XMM2;
                    movaps [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsseloopa;

                    mov aptr, ESI;
                }
        }
        else
        // 3DNow! version is 63% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);
            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [v];

                align 8;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfmul MM0, MM4;
                pfmul MM1, MM4;
                pfmul MM2, MM4;
                pfmul MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                cmp ESI, EDI;
                jb start;

                emms;
                mov dword ptr [aptr], ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ *= value;

    return a;
}

unittest
{
    printf("_arrayExpSliceMulass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] *= 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= b[]
 */

T[] _arraySliceSliceMulass_f(T[] a, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMulass_f()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 525% faster
        if (sse() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movups XMM0, [ESI];
                movups XMM1, [ESI+16];
                movups XMM2, [ESI+32];
                movups XMM3, [ESI+48];
                add ESI, 64;
                movups XMM4, [ECX];
                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                add ECX, 64;
                mulps XMM0, XMM4;
                mulps XMM1, XMM5;
                mulps XMM2, XMM6;
                mulps XMM3, XMM7;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
        else
        // 3DNow! version is 57% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, dword ptr [aptr]; // destination operand
                mov EDI, dword ptr [n];    // end comparison
                mov ECX, dword ptr [bptr]; // right operand

                align 4;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfmul MM0, [ECX];
                pfmul MM1, [ECX+8];
                pfmul MM2, [ECX+16];
                pfmul MM3, [ECX+24];
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add ECX, 32;
                cmp ESI, EDI;
                jb start;

                emms;
                mov dword ptr [aptr], ESI;
                mov dword ptr [bptr], ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ *= *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMulass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] *= b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] / value
 */

T[] _arraySliceExpDivSliceAssign_f(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpDivSliceAssign_f()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    /* Multiplying by the reciprocal is faster, but does
     * not produce as accurate an answer.
     */
    T recip = cast(T)1 / value;

    version (D_InlineAsm_X86)
    {
        // SSE version is 587% faster
        if (sse() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, recip;
                //movss XMM4, value
                //rcpss XMM4, XMM4
                shufps XMM4, XMM4, 0;

                align 8;
            startsseloop:
                add ESI, 64;
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                mulps XMM0, XMM4;
                mulps XMM1, XMM4;
                mulps XMM2, XMM4;
                mulps XMM3, XMM4;
                //divps XMM0, XMM4;
                //divps XMM1, XMM4;
                //divps XMM2, XMM4;
                //divps XMM3, XMM4;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        else
        // 3DNow! version is 72% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);
            T[2] w = void;
            w[0] = recip;
            w[1] = recip;

            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                mov EAX, dword ptr [bptr];
                movq MM4, qword ptr [w];

                align 8;
            start:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfmul MM0, MM4;
                pfmul MM1, MM4;
                pfmul MM2, MM4;
                pfmul MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb start;

                emms;
                mov dword ptr [aptr], ESI;
                mov dword ptr [bptr], EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ * recip;

    return a;
}

unittest
{
    printf("_arraySliceExpDivSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] / 8;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] / 8))
                {
                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
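/* Illustrative sketch of the accuracy remark above, not part of the original
 * runtime: forming 1/value rounds once and the multiply rounds again, so
 * x * recip can differ from x / value by a unit in the last place when the
 * divisor is not a power of two (the unit tests divide by 8, where the
 * reciprocal is exact). Compiled out via version (none).
 */
version (none) unittest
{
    T value = 3;
    T recip = cast(T)1 / value;
    int diffs;
    for (int i = 1; i < 1000; i++)
    {
        T x = cast(T)i;
        if (x / value != x * recip)
            diffs++;    // counts inputs where the two methods disagree
    }
    printf("%d of 999 inputs differ between x / 3 and x * (1 / 3)\n", diffs);
}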
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] /= value
 */

T[] _arrayExpSliceDivass_f(T[] a, T value)
{
    //printf("_arrayExpSliceDivass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    /* Multiplying by the reciprocal is faster, but does
     * not produce as accurate an answer.
     */
    T recip = cast(T)1 / value;

    version (D_InlineAsm_X86)
    {
        // SSE version is 245% faster
        if (sse() && a.length >= 16)
        {
            // align pointer
            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
            while (aptr < n)
                *aptr++ *= recip;
            n = cast(T*)((cast(uint)aend) & ~15);
            if (aptr < n)

                // Aligned case
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movss XMM4, recip;
                    //movss XMM4, value
                    //rcpss XMM4, XMM4
                    shufps XMM4, XMM4, 0;

                    align 8;
                startsseloopa:
                    movaps XMM0, [ESI];
                    movaps XMM1, [ESI+16];
                    movaps XMM2, [ESI+32];
                    movaps XMM3, [ESI+48];
                    add ESI, 64;
                    mulps XMM0, XMM4;
                    mulps XMM1, XMM4;
                    mulps XMM2, XMM4;
                    mulps XMM3, XMM4;
                    //divps XMM0, XMM4;
                    //divps XMM1, XMM4;
                    //divps XMM2, XMM4;
                    //divps XMM3, XMM4;
                    movaps [ESI+ 0-64], XMM0;
                    movaps [ESI+16-64], XMM1;
                    movaps [ESI+32-64], XMM2;
                    movaps [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsseloopa;

                    mov aptr, ESI;
                }
        }
        else
        // 3DNow! version is 57% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);
            T[2] w = void;
            w[0] = w[1] = recip;

            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [w];

                align 8;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                pfmul MM0, MM4;
                pfmul MM1, MM4;
                pfmul MM2, MM4;
                pfmul MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                cmp ESI, EDI;
                jb start;

                emms;
                mov dword ptr [aptr], ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ *= recip;

    return a;
}

unittest
{
    printf("_arrayExpSliceDivass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] /= 8;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] / 8))
                {
                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[] * value
 */

T[] _arraySliceExpMulSliceMinass_f(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAddass_f(a, -value, b);
}

/***********************
 * Computes:
 *      a[] += b[] * value
 */

T[] _arraySliceExpMulSliceAddass_f(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    // The entire array is handled by the scalar loop (no SIMD path yet)
    while (aptr < aend)
        *aptr++ += *bptr++ * value;

    return a;
}

unittest
{
    printf("_arraySliceExpMulSliceAddass_f unittest\n");
    cpuid = 1;
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 1; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // (only the aligned case is exercised here)
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = c[];
            c[] += a[] * 6;

            for (int i = 0; i < dim; i++)
            {
                //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
                if (c[i] != cast(T)(b[i] + a[i] * 6))
                {
                    printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
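/* Illustrative sketch, not part of the original runtime: forwarding the
 * Minass entry point to the Addass routine with -value is exact, because
 * IEEE-754 negation is exact and b * (-v) == -(b * v) bit-for-bit, so
 * a += b * (-v) equals a -= b * v.
 */
version (none) unittest
{
    T a1 = 5, a2 = 5, b = 7, v = 6;
    a1 -= b * v;        // direct form
    a2 += b * (-v);     // forwarded form used above
    assert(a1 == a2);
}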