mirror of
https://github.com/dlang/phobos.git
synced 2025-04-26 05:00:35 +03:00
1361 lines
41 KiB
D
1361 lines
41 KiB
D
/** Optimised asm arbitrary precision arithmetic ('bignum')
|
|
* routines for X86 processors.
|
|
*
|
|
* All functions operate on arrays of uints, stored LSB first.
|
|
* If there is a destination array, it will be the first parameter.
|
|
* Currently, all of these functions are subject to change, and are
|
|
* intended for internal use only.
|
|
* The symbol [#] indicates an array of machine words which is to be
|
|
* interpreted as a multi-byte number.
|
|
*/
|
|
|
|
/* Copyright Don Clugston 2008 - 2010.
|
|
* Distributed under the Boost Software License, Version 1.0.
|
|
* (See accompanying file LICENSE_1_0.txt or copy at
|
|
* http://www.boost.org/LICENSE_1_0.txt)
|
|
*/
|
|
/**
|
|
* In simple terms, there are 3 modern x86 microarchitectures:
|
|
* (a) the P6 family (Pentium Pro, PII, PIII, PM, Core), produced by Intel;
|
|
* (b) the K6, Athlon, and AMD64 families, produced by AMD; and
|
|
* (c) the Pentium 4, produced by Marketing.
|
|
*
|
|
* This code has been optimised for the Intel P6 family.
|
|
* Generally the code remains near-optimal for Intel Core2/Corei7, after
|
|
* translating EAX-> RAX, etc, since all these CPUs use essentially the same
|
|
* pipeline, and are typically limited by memory access.
|
|
* The code uses techniques described in Agner Fog's superb Pentium manuals
|
|
* available at www.agner.org.
|
|
* Not optimised for AMD, which can do two memory loads per cycle (Intel
|
|
* CPUs can only do one). Despite this, performance is superior on AMD.
|
|
* Performance is dreadful on P4.
|
|
*
|
|
* Timing results (cycles per int)
|
|
* --Intel Pentium-- --AMD--
|
|
* PM P4 Core2 K7
|
|
* +,- 2.25 15.6 2.25 1.5
|
|
* <<,>> 2.0 6.6 2.0 5.0
|
|
* (<< MMX) 1.7 5.3 1.5 1.2
|
|
* * 5.0 15.0 4.0 4.3
|
|
* mulAdd 5.7 19.0 4.9 4.0
|
|
* div 30.0 32.0 32.0 22.4
|
|
* mulAcc(32) 6.5 20.0 5.4 4.9
|
|
*
|
|
* mulAcc(32) is multiplyAccumulate() for a 32*32 multiply. Thus it includes
|
|
* function call overhead.
|
|
* The timing for Div is quite unpredictable, but it's probably too slow
|
|
* to be useful. On 64-bit processors, these times should
|
|
* halve if run in 64-bit mode, except for the MMX functions.
|
|
*/
|
|
|
|
module std.internal.math.biguintx86;
|
|
|
|
@system:
|
|
pure:
|
|
nothrow:
|
|
|
|
/*
|
|
Naked asm is used throughout, because:
|
|
(a) it frees up the EBP register
|
|
(b) compiler bugs prevent the use of .ptr when a frame pointer is used.
|
|
*/
|
|
|
|
version (D_InlineAsm_X86)
|
|
{
|
|
|
|
private:
|
|
|
|
/* Duplicate string s, with n times, substituting index for '@'.
|
|
*
|
|
* Each instance of '@' in s is replaced by 0,1,...n-1. This is a helper
|
|
* function for some of the asm routines.
|
|
*/
|
|
string indexedLoopUnroll(int n, string s) pure @safe
|
|
{
|
|
string u;
|
|
for (int i = 0; i<n; ++i)
|
|
{
|
|
string nstr= (i>9 ? ""~ cast(char)('0'+i/10) : "") ~ cast(char)('0' + i%10);
|
|
|
|
int last = 0;
|
|
for (int j = 0; j<s.length; ++j)
|
|
{
|
|
if (s[j]=='@')
|
|
{
|
|
u ~= s[last .. j] ~ nstr;
|
|
last = j+1;
|
|
}
|
|
}
|
|
if (last<s.length) u = u ~ s[last..$];
|
|
|
|
}
|
|
return u;
|
|
}
|
|
@safe unittest
|
|
{
|
|
assert(indexedLoopUnroll(3, "@*23;")=="0*23;1*23;2*23;");
|
|
}
|
|
|
|
public:
|
|
|
|
alias BigDigit = uint; // A Bignum is an array of BigDigits. Usually the machine word size.
|
|
|
|
// Limits for when to switch between multiplication algorithms.
|
|
enum : int { KARATSUBALIMIT = 18 } // Minimum value for which Karatsuba is worthwhile.
|
|
enum : int { KARATSUBASQUARELIMIT=26 } // Minimum value for which square Karatsuba is worthwhile
|
|
|
|
/** Multi-byte addition or subtraction
|
|
* dest[#] = src1[#] + src2[#] + carry (0 or 1).
|
|
* or dest[#] = src1[#] - src2[#] - carry (0 or 1).
|
|
* Returns carry or borrow (0 or 1).
|
|
* Set op == '+' for addition, '-' for subtraction.
|
|
*/
|
|
uint multibyteAddSub(char op)(uint[] dest, const uint [] src1, const uint []
|
|
src2, uint carry) pure @safe @nogc
|
|
{
|
|
// Timing:
|
|
// Pentium M: 2.25/int
|
|
// P6 family, Core2 have a partial flags stall when reading the carry flag in
|
|
// an ADC, SBB operation after an operation such as INC or DEC which
|
|
// modifies some, but not all, flags. We avoid this by storing carry into
|
|
// a resister (AL), and restoring it after the branch.
|
|
|
|
enum { LASTPARAM = 4*4 } // 3* pushes + return address.
|
|
asm pure nothrow @nogc @trusted {
|
|
naked;
|
|
push EDI;
|
|
push EBX;
|
|
push ESI;
|
|
mov ECX, [ESP + LASTPARAM + 4*4]; // dest.length;
|
|
mov EDX, [ESP + LASTPARAM + 3*4]; // src1.ptr
|
|
mov ESI, [ESP + LASTPARAM + 1*4]; // src2.ptr
|
|
mov EDI, [ESP + LASTPARAM + 5*4]; // dest.ptr
|
|
// Carry is in EAX
|
|
// Count UP to zero (from -len) to minimize loop overhead.
|
|
lea EDX, [EDX + 4*ECX]; // EDX = end of src1.
|
|
lea ESI, [ESI + 4*ECX]; // EBP = end of src2.
|
|
lea EDI, [EDI + 4*ECX]; // EDI = end of dest.
|
|
|
|
neg ECX;
|
|
add ECX, 8;
|
|
jb L2; // if length < 8 , bypass the unrolled loop.
|
|
L_unrolled:
|
|
shr AL, 1; // get carry from EAX
|
|
}
|
|
mixin(" asm pure nothrow @nogc @trusted {"
|
|
~ indexedLoopUnroll( 8,
|
|
"mov EAX, [@*4-8*4+EDX+ECX*4];"
|
|
~ ( op == '+' ? "adc" : "sbb" ) ~ " EAX, [@*4-8*4+ESI+ECX*4];"
|
|
~ "mov [@*4-8*4+EDI+ECX*4], EAX;")
|
|
~ "}");
|
|
asm pure nothrow @nogc @trusted {
|
|
setc AL; // save carry
|
|
add ECX, 8;
|
|
ja L_unrolled;
|
|
L2: // Do the residual 1 .. 7 ints.
|
|
|
|
sub ECX, 8;
|
|
jz done;
|
|
L_residual:
|
|
shr AL, 1; // get carry from EAX
|
|
}
|
|
mixin(" asm pure nothrow @nogc @trusted {"
|
|
~ indexedLoopUnroll( 1,
|
|
"mov EAX, [@*4+EDX+ECX*4];"
|
|
~ ( op == '+' ? "adc" : "sbb" ) ~ " EAX, [@*4+ESI+ECX*4];"
|
|
~ "mov [@*4+EDI+ECX*4], EAX;") ~ "}");
|
|
asm pure nothrow @nogc @trusted {
|
|
setc AL; // save carry
|
|
add ECX, 1;
|
|
jnz L_residual;
|
|
done:
|
|
and EAX, 1; // make it O or 1.
|
|
pop ESI;
|
|
pop EBX;
|
|
pop EDI;
|
|
ret 6*4;
|
|
}
|
|
}
|
|
|
|
@safe unittest
|
|
{
|
|
uint [] a = new uint[40];
|
|
uint [] b = new uint[40];
|
|
uint [] c = new uint[40];
|
|
for (int i=0; i<a.length; ++i)
|
|
{
|
|
if (i&1) a[i]=0x8000_0000 + i;
|
|
else a[i]=i;
|
|
b[i]= 0x8000_0003;
|
|
}
|
|
c[19]=0x3333_3333;
|
|
uint carry = multibyteAddSub!('+')(c[0 .. 18], a[0 .. 18], b[0 .. 18], 0);
|
|
assert(carry == 1);
|
|
assert(c[0]==0x8000_0003);
|
|
assert(c[1]==4);
|
|
assert(c[19]==0x3333_3333); // check for overrun
|
|
for (int i=0; i<a.length; ++i)
|
|
{
|
|
a[i]=b[i]=c[i]=0;
|
|
}
|
|
a[8]=0x048D159E;
|
|
b[8]=0x048D159E;
|
|
a[10]=0x1D950C84;
|
|
b[10]=0x1D950C84;
|
|
a[5] =0x44444444;
|
|
carry = multibyteAddSub!('-')(a[0 .. 12], a[0 .. 12], b[0 .. 12], 0);
|
|
assert(a[11]==0);
|
|
for (int i=0; i<10; ++i) if (i != 5) assert(a[i]==0);
|
|
|
|
for (int q=3; q<36;++q)
|
|
{
|
|
for (int i=0; i<a.length; ++i)
|
|
{
|
|
a[i]=b[i]=c[i]=0;
|
|
}
|
|
a[q-2]=0x040000;
|
|
b[q-2]=0x040000;
|
|
carry = multibyteAddSub!('-')(a[0 .. q], a[0 .. q], b[0 .. q], 0);
|
|
assert(a[q-2]==0);
|
|
}
|
|
}
|
|
|
|
/** dest[#] += carry, or dest[#] -= carry.
|
|
* op must be '+' or '-'
|
|
* Returns final carry or borrow (0 or 1)
|
|
*/
|
|
uint multibyteIncrementAssign(char op)(uint[] dest, uint carry) pure @safe @nogc
|
|
{
|
|
enum { LASTPARAM = 1*4 } // 0* pushes + return address.
|
|
asm pure nothrow @nogc @trusted {
|
|
naked;
|
|
mov ECX, [ESP + LASTPARAM + 0*4]; // dest.length;
|
|
mov EDX, [ESP + LASTPARAM + 1*4]; // dest.ptr
|
|
// EAX = carry
|
|
L1: ;
|
|
}
|
|
static if (op=='+')
|
|
asm pure nothrow @nogc @trusted { add [EDX], EAX; }
|
|
else
|
|
asm pure nothrow @nogc @trusted { sub [EDX], EAX; }
|
|
asm pure nothrow @nogc @trusted {
|
|
mov EAX, 1;
|
|
jnc L2;
|
|
add EDX, 4;
|
|
dec ECX;
|
|
jnz L1;
|
|
mov EAX, 2;
|
|
L2: dec EAX;
|
|
ret 2*4;
|
|
}
|
|
}
|
|
|
|
/** dest[#] = src[#] << numbits
|
|
* numbits must be in the range 1 .. 31
|
|
* Returns the overflow
|
|
*/
|
|
uint multibyteShlNoMMX(uint [] dest, const uint [] src, uint numbits) pure @safe @nogc
|
|
{
|
|
// Timing: Optimal for P6 family.
|
|
// 2.0 cycles/int on PPro .. PM (limited by execution port p0)
|
|
// 5.0 cycles/int on Athlon, which has 7 cycles for SHLD!!
|
|
enum { LASTPARAM = 4*4 } // 3* pushes + return address.
|
|
asm pure nothrow @nogc @trusted {
|
|
naked;
|
|
push ESI;
|
|
push EDI;
|
|
push EBX;
|
|
mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
|
|
mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
|
|
mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
|
|
mov ECX, EAX; // numbits;
|
|
|
|
mov EAX, [-4+ESI + 4*EBX];
|
|
mov EDX, 0;
|
|
shld EDX, EAX, CL;
|
|
push EDX; // Save return value
|
|
cmp EBX, 1;
|
|
jz L_last;
|
|
mov EDX, [-4+ESI + 4*EBX];
|
|
test EBX, 1;
|
|
jz L_odd;
|
|
sub EBX, 1;
|
|
L_even:
|
|
mov EDX, [-4+ ESI + 4*EBX];
|
|
shld EAX, EDX, CL;
|
|
mov [EDI+4*EBX], EAX;
|
|
L_odd:
|
|
mov EAX, [-8+ESI + 4*EBX];
|
|
shld EDX, EAX, CL;
|
|
mov [-4+EDI + 4*EBX], EDX;
|
|
sub EBX, 2;
|
|
jg L_even;
|
|
L_last:
|
|
shl EAX, CL;
|
|
mov [EDI], EAX;
|
|
pop EAX; // pop return value
|
|
pop EBX;
|
|
pop EDI;
|
|
pop ESI;
|
|
ret 4*4;
|
|
}
|
|
}
|
|
|
|
/** dest[#] = src[#] >> numbits
|
|
* numbits must be in the range 1 .. 31
|
|
* This version uses MMX.
|
|
*/
|
|
uint multibyteShl(uint [] dest, const uint [] src, uint numbits) pure @safe @nogc
|
|
{
|
|
// Timing:
|
|
// K7 1.2/int. PM 1.7/int P4 5.3/int
|
|
enum { LASTPARAM = 4*4 } // 3* pushes + return address.
|
|
asm pure nothrow @nogc @trusted {
|
|
naked;
|
|
push ESI;
|
|
push EDI;
|
|
push EBX;
|
|
mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
|
|
mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
|
|
mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
|
|
|
|
movd MM3, EAX; // numbits = bits to shift left
|
|
xor EAX, 63;
|
|
align 16;
|
|
inc EAX;
|
|
movd MM4, EAX ; // 64-numbits = bits to shift right
|
|
|
|
// Get the return value into EAX
|
|
and EAX, 31; // EAX = 32-numbits
|
|
movd MM2, EAX; // 32-numbits
|
|
movd MM1, [ESI+4*EBX-4];
|
|
psrlq MM1, MM2;
|
|
movd EAX, MM1; // EAX = return value
|
|
test EBX, 1;
|
|
jz L_even;
|
|
L_odd:
|
|
cmp EBX, 1;
|
|
jz L_length1;
|
|
|
|
// deal with odd lengths
|
|
movq MM1, [ESI+4*EBX-8];
|
|
psrlq MM1, MM2;
|
|
movd [EDI +4*EBX-4], MM1;
|
|
sub EBX, 1;
|
|
L_even: // It's either singly or doubly even
|
|
movq MM2, [ESI + 4*EBX - 8];
|
|
psllq MM2, MM3;
|
|
sub EBX, 2;
|
|
jle L_last;
|
|
movq MM1, MM2;
|
|
add EBX, 2;
|
|
test EBX, 2;
|
|
jz L_onceeven;
|
|
sub EBX, 2;
|
|
|
|
// MAIN LOOP -- 128 bytes per iteration
|
|
L_twiceeven: // here MM2 is the carry
|
|
movq MM0, [ESI + 4*EBX-8];
|
|
psrlq MM0, MM4;
|
|
movq MM1, [ESI + 4*EBX-8];
|
|
psllq MM1, MM3;
|
|
por MM2, MM0;
|
|
movq [EDI +4*EBX], MM2;
|
|
L_onceeven: // here MM1 is the carry
|
|
movq MM0, [ESI + 4*EBX-16];
|
|
psrlq MM0, MM4;
|
|
movq MM2, [ESI + 4*EBX-16];
|
|
por MM1, MM0;
|
|
movq [EDI +4*EBX-8], MM1;
|
|
psllq MM2, MM3;
|
|
sub EBX, 4;
|
|
jg L_twiceeven;
|
|
L_last:
|
|
movq [EDI +4*EBX], MM2;
|
|
L_alldone:
|
|
emms; // NOTE: costs 6 cycles on Intel CPUs
|
|
pop EBX;
|
|
pop EDI;
|
|
pop ESI;
|
|
ret 4*4;
|
|
|
|
L_length1:
|
|
// length 1 is a special case
|
|
movd MM1, [ESI];
|
|
psllq MM1, MM3;
|
|
movd [EDI], MM1;
|
|
jmp L_alldone;
|
|
}
|
|
}
|
|
|
|
void multibyteShr(uint [] dest, const uint [] src, uint numbits) pure @safe @nogc
|
|
{
|
|
enum { LASTPARAM = 4*4 } // 3* pushes + return address.
|
|
asm pure nothrow @nogc @trusted {
|
|
naked;
|
|
push ESI;
|
|
push EDI;
|
|
push EBX;
|
|
mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
|
|
mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
|
|
align 16;
|
|
mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
|
|
lea EDI, [EDI + 4*EBX]; // EDI = end of dest
|
|
lea ESI, [ESI + 4*EBX]; // ESI = end of src
|
|
neg EBX; // count UP to zero.
|
|
|
|
movd MM3, EAX; // numbits = bits to shift right
|
|
xor EAX, 63;
|
|
inc EAX;
|
|
movd MM4, EAX ; // 64-numbits = bits to shift left
|
|
|
|
test EBX, 1;
|
|
jz L_even;
|
|
L_odd:
|
|
// deal with odd lengths
|
|
and EAX, 31; // EAX = 32-numbits
|
|
movd MM2, EAX; // 32-numbits
|
|
cmp EBX, -1;
|
|
jz L_length1;
|
|
|
|
movq MM0, [ESI+4*EBX];
|
|
psrlq MM0, MM3;
|
|
movd [EDI +4*EBX], MM0;
|
|
add EBX, 1;
|
|
L_even:
|
|
movq MM2, [ESI + 4*EBX];
|
|
psrlq MM2, MM3;
|
|
|
|
movq MM1, MM2;
|
|
add EBX, 4;
|
|
cmp EBX, -2+4;
|
|
jz L_last;
|
|
// It's either singly or doubly even
|
|
sub EBX, 2;
|
|
test EBX, 2;
|
|
jnz L_onceeven;
|
|
add EBX, 2;
|
|
|
|
// MAIN LOOP -- 128 bytes per iteration
|
|
L_twiceeven: // here MM2 is the carry
|
|
movq MM0, [ESI + 4*EBX-8];
|
|
psllq MM0, MM4;
|
|
movq MM1, [ESI + 4*EBX-8];
|
|
psrlq MM1, MM3;
|
|
por MM2, MM0;
|
|
movq [EDI +4*EBX-16], MM2;
|
|
L_onceeven: // here MM1 is the carry
|
|
movq MM0, [ESI + 4*EBX];
|
|
psllq MM0, MM4;
|
|
movq MM2, [ESI + 4*EBX];
|
|
por MM1, MM0;
|
|
movq [EDI +4*EBX-8], MM1;
|
|
psrlq MM2, MM3;
|
|
add EBX, 4;
|
|
jl L_twiceeven;
|
|
L_last:
|
|
movq [EDI +4*EBX-16], MM2;
|
|
L_alldone:
|
|
emms; // NOTE: costs 6 cycles on Intel CPUs
|
|
pop EBX;
|
|
pop EDI;
|
|
pop ESI;
|
|
ret 4*4;
|
|
|
|
L_length1:
|
|
// length 1 is a special case
|
|
movd MM1, [ESI+4*EBX];
|
|
psrlq MM1, MM3;
|
|
movd [EDI +4*EBX], MM1;
|
|
jmp L_alldone;
|
|
|
|
}
|
|
}
|
|
|
|
/** dest[#] = src[#] >> numbits
|
|
* numbits must be in the range 1 .. 31
|
|
*/
|
|
void multibyteShrNoMMX(uint [] dest, const uint [] src, uint numbits) pure @safe @nogc
|
|
{
|
|
// Timing: Optimal for P6 family.
|
|
// 2.0 cycles/int on PPro .. PM (limited by execution port p0)
|
|
// Terrible performance on AMD64, which has 7 cycles for SHRD!!
|
|
enum { LASTPARAM = 4*4 } // 3* pushes + return address.
|
|
asm pure nothrow @nogc @trusted {
|
|
naked;
|
|
push ESI;
|
|
push EDI;
|
|
push EBX;
|
|
mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
|
|
mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
|
|
mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
|
|
mov ECX, EAX; // numbits;
|
|
|
|
lea EDI, [EDI + 4*EBX]; // EDI = end of dest
|
|
lea ESI, [ESI + 4*EBX]; // ESI = end of src
|
|
neg EBX; // count UP to zero.
|
|
mov EAX, [ESI + 4*EBX];
|
|
cmp EBX, -1;
|
|
jz L_last;
|
|
mov EDX, [ESI + 4*EBX];
|
|
test EBX, 1;
|
|
jz L_odd;
|
|
add EBX, 1;
|
|
L_even:
|
|
mov EDX, [ ESI + 4*EBX];
|
|
shrd EAX, EDX, CL;
|
|
mov [-4 + EDI+4*EBX], EAX;
|
|
L_odd:
|
|
mov EAX, [4 + ESI + 4*EBX];
|
|
shrd EDX, EAX, CL;
|
|
mov [EDI + 4*EBX], EDX;
|
|
add EBX, 2;
|
|
jl L_even;
|
|
L_last:
|
|
shr EAX, CL;
|
|
mov [-4 + EDI], EAX;
|
|
|
|
pop EBX;
|
|
pop EDI;
|
|
pop ESI;
|
|
ret 4*4;
|
|
}
|
|
}
|
|
|
|
@safe unittest
|
|
{
|
|
|
|
uint [] aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
|
|
multibyteShr(aa[0..$-1], aa, 4);
|
|
assert(aa[0] == 0x6122_2222 && aa[1]==0xA455_5555
|
|
&& aa[2]==0xD899_9999 && aa[3]==0x0BCC_CCCC);
|
|
|
|
aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
|
|
multibyteShr(aa[2..$-1], aa[2..$-1], 4);
|
|
assert(aa[0] == 0x1222_2223 && aa[1]==0x4555_5556
|
|
&& aa[2]==0xD899_9999 && aa[3]==0x0BCC_CCCC);
|
|
|
|
aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
|
|
multibyteShr(aa[0..$-2], aa, 4);
|
|
assert(aa[1]==0xA455_5555 && aa[2]==0x0899_9999);
|
|
assert(aa[0]==0x6122_2222);
|
|
assert(aa[3]==0xBCCC_CCCD);
|
|
|
|
|
|
aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
|
|
uint r = multibyteShl(aa[2 .. 4], aa[2 .. 4], 4);
|
|
assert(aa[0] == 0xF0FF_FFFF && aa[1]==0x1222_2223
|
|
&& aa[2]==0x5555_5560 && aa[3]==0x9999_99A4 && aa[4]==0xBCCC_CCCD);
|
|
assert(r == 8);
|
|
|
|
aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
|
|
r = multibyteShl(aa[1 .. 4], aa[1 .. 4], 4);
|
|
assert(aa[0] == 0xF0FF_FFFF
|
|
&& aa[2]==0x5555_5561);
|
|
assert(aa[3]==0x9999_99A4 && aa[4]==0xBCCC_CCCD);
|
|
assert(r == 8);
|
|
assert(aa[1]==0x2222_2230);
|
|
|
|
aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
|
|
r = multibyteShl(aa[0 .. 4], aa[1 .. 5], 31);
|
|
}
|
|
|
|
/** dest[#] = src[#] * multiplier + carry.
|
|
* Returns carry.
|
|
*/
|
|
uint multibyteMul(uint[] dest, const uint[] src, uint multiplier, uint carry)
|
|
pure @safe
|
|
{
|
|
// Timing: definitely not optimal.
|
|
// Pentium M: 5.0 cycles/operation, has 3 resource stalls/iteration
|
|
// Fastest implementation found was 4.6 cycles/op, but not worth the complexity.
|
|
|
|
enum { LASTPARAM = 4*4 } // 4* pushes + return address.
|
|
// We'll use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
|
|
// when initializing variables to zero.
|
|
version (D_PIC)
|
|
{
|
|
enum { zero = 0 }
|
|
}
|
|
else
|
|
{
|
|
static immutable int zero = 0;
|
|
}
|
|
asm pure nothrow @nogc @trusted {
|
|
naked;
|
|
push ESI;
|
|
push EDI;
|
|
push EBX;
|
|
|
|
mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr
|
|
mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length
|
|
mov ESI, [ESP + LASTPARAM + 4*2]; // src.ptr
|
|
align 16;
|
|
lea EDI, [EDI + 4*EBX]; // EDI = end of dest
|
|
lea ESI, [ESI + 4*EBX]; // ESI = end of src
|
|
mov ECX, EAX; // [carry]; -- last param is in EAX.
|
|
neg EBX; // count UP to zero.
|
|
test EBX, 1;
|
|
jnz L_odd;
|
|
add EBX, 1;
|
|
L1:
|
|
mov EAX, [-4 + ESI + 4*EBX];
|
|
mul int ptr [ESP+LASTPARAM]; //[multiplier];
|
|
add EAX, ECX;
|
|
mov ECX, zero;
|
|
mov [-4+EDI + 4*EBX], EAX;
|
|
adc ECX, EDX;
|
|
L_odd:
|
|
mov EAX, [ESI + 4*EBX]; // p2
|
|
mul int ptr [ESP+LASTPARAM]; //[multiplier]; // p0*3,
|
|
add EAX, ECX;
|
|
mov ECX, zero;
|
|
adc ECX, EDX;
|
|
mov [EDI + 4*EBX], EAX;
|
|
add EBX, 2;
|
|
jl L1;
|
|
|
|
mov EAX, ECX; // get final carry
|
|
|
|
pop EBX;
|
|
pop EDI;
|
|
pop ESI;
|
|
ret 5*4;
|
|
}
|
|
}
|
|
|
|
@system unittest
|
|
{
|
|
uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
|
|
multibyteMul(aa[1 .. 4], aa[1 .. 4], 16, 0);
|
|
assert(aa[0] == 0xF0FF_FFFF && aa[1] == 0x2222_2230 &&
|
|
aa[2]==0x5555_5561 && aa[3]==0x9999_99A4 && aa[4]==0x0BCCC_CCCD);
|
|
}
|
|
|
|
// The inner multiply-and-add loop, together with the Even entry point.
|
|
// Multiples by M_ADDRESS which should be "ESP+LASTPARAM" or "ESP". OP must be "add" or "sub"
|
|
// This is the most time-critical code in the BigInt library.
|
|
// It is used by both MulAdd, multiplyAccumulate, and triangleAccumulate
|
|
string asmMulAdd_innerloop(string OP, string M_ADDRESS) pure @safe {
|
|
// The bottlenecks in this code are extremely complicated. The MUL, ADD, and ADC
|
|
// need 4 cycles on each of the ALUs units p0 and p1. So we use memory load
|
|
// (unit p2) for initializing registers to zero.
|
|
// There are also dependencies between the instructions, and we run up against the
|
|
// ROB-read limit (can only read 2 registers per cycle).
|
|
// We also need the number of uops in the loop to be a multiple of 3.
|
|
// The only available execution unit for this is p3 (memory write). Unfortunately we can't do that
|
|
// if Position-Independent Code is required.
|
|
|
|
// Register usage
|
|
// ESI = end of src
|
|
// EDI = end of dest
|
|
// EBX = index. Counts up to zero (in steps of 2).
|
|
// EDX:EAX = scratch, used in multiply.
|
|
// ECX = carry1.
|
|
// EBP = carry2.
|
|
// ESP = points to the multiplier.
|
|
|
|
// The first member of 'dest' which will be modified is [EDI+4*EBX].
|
|
// EAX must already contain the first member of 'src', [ESI+4*EBX].
|
|
|
|
version (D_PIC) { bool using_PIC = true; } else { bool using_PIC = false; }
|
|
return "
|
|
// Entry point for even length
|
|
add EBX, 1;
|
|
mov EBP, ECX; // carry
|
|
|
|
mul int ptr [" ~ M_ADDRESS ~ "]; // M
|
|
mov ECX, 0;
|
|
|
|
add EBP, EAX;
|
|
mov EAX, [ESI+4*EBX];
|
|
adc ECX, EDX;
|
|
|
|
mul int ptr [" ~ M_ADDRESS ~ "]; // M
|
|
" ~ OP ~ " [-4+EDI+4*EBX], EBP;
|
|
mov EBP, zero;
|
|
|
|
adc ECX, EAX;
|
|
mov EAX, [4+ESI+4*EBX];
|
|
|
|
adc EBP, EDX;
|
|
add EBX, 2;
|
|
jnl L_done;
|
|
L1:
|
|
mul int ptr [" ~ M_ADDRESS ~ "];
|
|
" ~ OP ~ " [-8+EDI+4*EBX], ECX;
|
|
adc EBP, EAX;
|
|
mov ECX, zero;
|
|
mov EAX, [ESI+4*EBX];
|
|
adc ECX, EDX;
|
|
" ~
|
|
(using_PIC ? "" : " mov storagenop, EDX; ") // make #uops in loop a multiple of 3, can't do this in PIC mode.
|
|
~ "
|
|
mul int ptr [" ~ M_ADDRESS ~ "];
|
|
" ~ OP ~ " [-4+EDI+4*EBX], EBP;
|
|
mov EBP, zero;
|
|
|
|
adc ECX, EAX;
|
|
mov EAX, [4+ESI+4*EBX];
|
|
|
|
adc EBP, EDX;
|
|
add EBX, 2;
|
|
jl L1;
|
|
L_done: " ~ OP ~ " [-8+EDI+4*EBX], ECX;
|
|
adc EBP, 0;
|
|
";
|
|
// final carry is now in EBP
|
|
}
|
|
|
|
string asmMulAdd_enter_odd(string OP, string M_ADDRESS) pure @safe
|
|
{
|
|
return "
|
|
mul int ptr [" ~M_ADDRESS ~"];
|
|
mov EBP, zero;
|
|
add ECX, EAX;
|
|
mov EAX, [4+ESI+4*EBX];
|
|
|
|
adc EBP, EDX;
|
|
add EBX, 2;
|
|
jl L1;
|
|
jmp L_done;
|
|
";
|
|
}
|
|
|
|
version (D_PIC) {} else
|
|
{
|
|
// This cannot be declared inside the relevant functions because __gshared is
|
|
// not allowed in @safe functions. It cannot be regular shared either since
|
|
// that gives an access violation on Win32 for some reason.
|
|
private __gshared int storagenopTriangleAccumulate;
|
|
private __gshared int storagenopMulAdd;
|
|
private __gshared int storagenopMultiplyAccumulate;
|
|
}
|
|
|
|
/**
|
|
* dest[#] += src[#] * multiplier OP carry(0 .. FFFF_FFFF).
|
|
* where op == '+' or '-'
|
|
* Returns carry out of MSB (0 .. FFFF_FFFF).
|
|
*/
|
|
uint multibyteMulAdd(char op)(uint [] dest, const uint [] src, uint
|
|
multiplier, uint carry) pure @safe @nogc {
|
|
// Timing: This is the most time-critical bignum function.
|
|
// Pentium M: 5.4 cycles/operation, still has 2 resource stalls + 1load block/iteration
|
|
|
|
// The main loop is pipelined and unrolled by 2,
|
|
// so entry to the loop is also complicated.
|
|
|
|
// Register usage
|
|
// EDX:EAX = multiply
|
|
// EBX = counter
|
|
// ECX = carry1
|
|
// EBP = carry2
|
|
// EDI = dest
|
|
// ESI = src
|
|
|
|
enum string OP = (op=='+')? "add" : "sub";
|
|
version (D_PIC)
|
|
{
|
|
enum { zero = 0 }
|
|
}
|
|
else
|
|
{
|
|
// use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
|
|
// when initializing registers to zero.
|
|
static immutable int zero = 0;
|
|
// use p3/p4 units
|
|
alias storagenop = storagenopMulAdd; // write-only
|
|
}
|
|
|
|
enum { LASTPARAM = 5*4 } // 4* pushes + return address.
|
|
asm pure nothrow @nogc @trusted {
|
|
naked;
|
|
|
|
push ESI;
|
|
push EDI;
|
|
push EBX;
|
|
push EBP;
|
|
mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr
|
|
mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length
|
|
align 16;
|
|
nop;
|
|
mov ESI, [ESP + LASTPARAM + 4*2]; // src.ptr
|
|
lea EDI, [EDI + 4*EBX]; // EDI = end of dest
|
|
lea ESI, [ESI + 4*EBX]; // ESI = end of src
|
|
mov EBP, 0;
|
|
mov ECX, EAX; // ECX = input carry.
|
|
neg EBX; // count UP to zero.
|
|
mov EAX, [ESI+4*EBX];
|
|
test EBX, 1;
|
|
jnz L_enter_odd;
|
|
}
|
|
// Main loop, with entry point for even length
|
|
mixin("asm pure nothrow @nogc @trusted {" ~ asmMulAdd_innerloop(OP, "ESP+LASTPARAM") ~ "}");
|
|
asm pure nothrow @nogc @trusted {
|
|
mov EAX, EBP; // get final carry
|
|
pop EBP;
|
|
pop EBX;
|
|
pop EDI;
|
|
pop ESI;
|
|
ret 5*4;
|
|
}
|
|
L_enter_odd:
|
|
mixin("asm pure nothrow @nogc @trusted {" ~ asmMulAdd_enter_odd(OP, "ESP+LASTPARAM") ~ "}");
|
|
}
|
|
|
|
@system unittest
|
|
{
|
|
|
|
uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
|
|
uint [] bb = [0x1234_1234, 0xF0F0_F0F0, 0x00C0_C0C0, 0xF0F0_F0F0, 0xC0C0_C0C0];
|
|
multibyteMulAdd!('+')(bb[1..$-1], aa[1..$-2], 16, 5);
|
|
assert(bb[0] == 0x1234_1234 && bb[4] == 0xC0C0_C0C0);
|
|
assert(bb[1] == 0x2222_2230 + 0xF0F0_F0F0+5 && bb[2] == 0x5555_5561+0x00C0_C0C0+1
|
|
&& bb[3] == 0x9999_99A4+0xF0F0_F0F0 );
|
|
}
|
|
|
|
/**
|
|
Sets result[#] = result[0 .. left.length] + left[#] * right[#]
|
|
|
|
It is defined in this way to allow cache-efficient multiplication.
|
|
This function is equivalent to:
|
|
----
|
|
for (int i = 0; i< right.length; ++i)
|
|
{
|
|
dest[left.length + i] = multibyteMulAdd(dest[i .. left.length+i],
|
|
left, right[i], 0);
|
|
}
|
|
----
|
|
*/
|
|
void multibyteMultiplyAccumulate(uint [] dest, const uint[] left,
|
|
const uint [] right) pure @safe @nogc {
|
|
// Register usage
|
|
// EDX:EAX = used in multiply
|
|
// EBX = index
|
|
// ECX = carry1
|
|
// EBP = carry2
|
|
// EDI = end of dest for this pass through the loop. Index for outer loop.
|
|
// ESI = end of left. never changes
|
|
// [ESP] = M = right[i] = multiplier for this pass through the loop.
|
|
// right.length is changed into dest.ptr+dest.length
|
|
version (D_PIC)
|
|
{
|
|
enum { zero = 0 }
|
|
}
|
|
else
|
|
{
|
|
// use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
|
|
// when initializing registers to zero.
|
|
static immutable int zero = 0;
|
|
// use p3/p4 units
|
|
alias storagenop = storagenopMultiplyAccumulate; // write-only
|
|
}
|
|
|
|
enum { LASTPARAM = 6*4 } // 4* pushes + local + return address.
|
|
asm pure nothrow @nogc @trusted {
|
|
naked;
|
|
|
|
push ESI;
|
|
push EDI;
|
|
align 16;
|
|
push EBX;
|
|
push EBP;
|
|
push EAX; // local variable M
|
|
mov EDI, [ESP + LASTPARAM + 4*5]; // dest.ptr
|
|
mov EBX, [ESP + LASTPARAM + 4*2]; // left.length
|
|
mov ESI, [ESP + LASTPARAM + 4*3]; // left.ptr
|
|
lea EDI, [EDI + 4*EBX]; // EDI = end of dest for first pass
|
|
|
|
mov EAX, [ESP + LASTPARAM + 4*0]; // right.length
|
|
lea EAX, [EDI + 4*EAX];
|
|
mov [ESP + LASTPARAM + 4*0], EAX; // last value for EDI
|
|
|
|
lea ESI, [ESI + 4*EBX]; // ESI = end of left
|
|
mov EAX, [ESP + LASTPARAM + 4*1]; // right.ptr
|
|
mov EAX, [EAX];
|
|
mov [ESP], EAX; // M
|
|
outer_loop:
|
|
mov EBP, 0;
|
|
mov ECX, 0; // ECX = input carry.
|
|
neg EBX; // count UP to zero.
|
|
mov EAX, [ESI+4*EBX];
|
|
test EBX, 1;
|
|
jnz L_enter_odd;
|
|
}
|
|
// -- Inner loop, with even entry point
|
|
mixin("asm pure nothrow @nogc @trusted { " ~ asmMulAdd_innerloop("add", "ESP") ~ "}");
|
|
asm pure nothrow @nogc @trusted {
|
|
mov [-4+EDI+4*EBX], EBP;
|
|
add EDI, 4;
|
|
cmp EDI, [ESP + LASTPARAM + 4*0]; // is EDI = &dest[$]?
|
|
jz outer_done;
|
|
mov EAX, [ESP + LASTPARAM + 4*1]; // right.ptr
|
|
mov EAX, [EAX+4]; // get new M
|
|
mov [ESP], EAX; // save new M
|
|
add int ptr [ESP + LASTPARAM + 4*1], 4; // right.ptr
|
|
mov EBX, [ESP + LASTPARAM + 4*2]; // left.length
|
|
jmp outer_loop;
|
|
outer_done:
|
|
pop EAX;
|
|
pop EBP;
|
|
pop EBX;
|
|
pop EDI;
|
|
pop ESI;
|
|
ret 6*4;
|
|
}
|
|
L_enter_odd:
|
|
mixin("asm pure nothrow @nogc @trusted {" ~ asmMulAdd_enter_odd("add", "ESP") ~ "}");
|
|
}
|
|
|
|
/** dest[#] /= divisor.
|
|
* overflow is the initial remainder, and must be in the range 0 .. divisor-1.
|
|
* divisor must not be a power of 2 (use right shift for that case;
|
|
* A division by zero will occur if divisor is a power of 2).
|
|
* Returns the final remainder
|
|
*
|
|
* Based on public domain code by Eric Bainville.
|
|
* (http://www.bealto.com/) Used with permission.
|
|
*/
|
|
uint multibyteDivAssign(uint [] dest, uint divisor, uint overflow) pure @safe @nogc
|
|
{
|
|
// Timing: limited by a horrible dependency chain.
|
|
// Pentium M: 18 cycles/op, 8 resource stalls/op.
|
|
// EAX, EDX = scratch, used by MUL
|
|
// EDI = dest
|
|
// CL = shift
|
|
// ESI = quotient
|
|
// EBX = remainderhi
|
|
// EBP = remainderlo
|
|
// [ESP-4] = mask
|
|
// [ESP] = kinv (2^64 /divisor)
|
|
enum { LASTPARAM = 5*4 } // 4* pushes + return address.
|
|
enum { LOCALS = 2*4} // MASK, KINV
|
|
asm pure nothrow @nogc @trusted {
|
|
naked;
|
|
|
|
push ESI;
|
|
push EDI;
|
|
push EBX;
|
|
push EBP;
|
|
|
|
mov EDI, [ESP + LASTPARAM + 4*2]; // dest.ptr
|
|
mov EBX, [ESP + LASTPARAM + 4*1]; // dest.length
|
|
|
|
// Loop from msb to lsb
|
|
lea EDI, [EDI + 4*EBX];
|
|
mov EBP, EAX; // rem is the input remainder, in 0 .. divisor-1
|
|
// Build the pseudo-inverse of divisor k: 2^64/k
|
|
// First determine the shift in ecx to get the max number of bits in kinv
|
|
xor ECX, ECX;
|
|
mov EAX, [ESP + LASTPARAM]; //divisor;
|
|
mov EDX, 1;
|
|
kinv1:
|
|
inc ECX;
|
|
ror EDX, 1;
|
|
shl EAX, 1;
|
|
jnc kinv1;
|
|
dec ECX;
|
|
// Here, ecx is a left shift moving the msb of k to bit 32
|
|
|
|
mov EAX, 1;
|
|
shl EAX, CL;
|
|
dec EAX;
|
|
ror EAX, CL ; //ecx bits at msb
|
|
push EAX; // MASK
|
|
|
|
// Then divide 2^(32+cx) by divisor (edx already ok)
|
|
xor EAX, EAX;
|
|
div int ptr [ESP + LASTPARAM + LOCALS-4*1]; //divisor;
|
|
push EAX; // kinv
|
|
align 16;
|
|
L2:
|
|
// Get 32 bits of quotient approx, multiplying
|
|
// most significant word of (rem*2^32+input)
|
|
mov EAX, [ESP+4]; //MASK;
|
|
and EAX, [EDI - 4];
|
|
or EAX, EBP;
|
|
rol EAX, CL;
|
|
mov EBX, EBP;
|
|
mov EBP, [EDI - 4];
|
|
mul int ptr [ESP]; //KINV;
|
|
|
|
shl EAX, 1;
|
|
rcl EDX, 1;
|
|
|
|
// Multiply by k and subtract to get remainder
|
|
// Subtraction must be done on two words
|
|
mov EAX, EDX;
|
|
mov ESI, EDX; // quot = high word
|
|
mul int ptr [ESP + LASTPARAM+LOCALS]; //divisor;
|
|
sub EBP, EAX;
|
|
sbb EBX, EDX;
|
|
jz Lb; // high word is 0, goto adjust on single word
|
|
|
|
// Adjust quotient and remainder on two words
|
|
Ld: inc ESI;
|
|
sub EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
|
|
sbb EBX, 0;
|
|
jnz Ld;
|
|
|
|
// Adjust quotient and remainder on single word
|
|
Lb: cmp EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
|
|
jc Lc; // rem in 0 .. divisor-1, OK
|
|
sub EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
|
|
inc ESI;
|
|
jmp Lb;
|
|
|
|
// Store result
|
|
Lc:
|
|
mov [EDI - 4], ESI;
|
|
lea EDI, [EDI - 4];
|
|
dec int ptr [ESP + LASTPARAM + 4*1+LOCALS]; // len
|
|
jnz L2;
|
|
|
|
pop EAX; // discard kinv
|
|
pop EAX; // discard mask
|
|
|
|
mov EAX, EBP; // return final remainder
|
|
pop EBP;
|
|
pop EBX;
|
|
pop EDI;
|
|
pop ESI;
|
|
ret 3*4;
|
|
}
|
|
}
|
|
|
|
@safe unittest
|
|
{
|
|
uint [] aa = new uint[101];
|
|
for (int i=0; i<aa.length; ++i) aa[i] = 0x8765_4321 * (i+3);
|
|
uint overflow = multibyteMul(aa, aa, 0x8EFD_FCFB, 0x33FF_7461);
|
|
uint r = multibyteDivAssign(aa, 0x8EFD_FCFB, overflow);
|
|
for (int i=0; i<aa.length-1; ++i) assert(aa[i] == 0x8765_4321 * (i+3));
|
|
assert(r == 0x33FF_7461);
|
|
}
|
|
|
|
// Set dest[2*i .. 2*i+1]+=src[i]*src[i]
|
|
void multibyteAddDiagonalSquares(uint [] dest, const uint [] src) pure @safe @nogc
|
|
{
|
|
/* Unlike mulAdd, the carry is only 1 bit,
|
|
since FFFF*FFFF+FFFF_FFFF = 1_0000_0000.
|
|
Note also that on the last iteration, no carry can occur.
|
|
As for multibyteAdd, we save & restore carry flag through the loop.
|
|
|
|
The timing is entirely dictated by the dependency chain. We could
|
|
improve it by moving the mov EAX after the adc [EDI], EAX. Probably not worthwhile.
|
|
*/
|
|
enum { LASTPARAM = 4*5 } // 4* pushes + return address.
|
|
asm pure nothrow @nogc @trusted {
|
|
naked;
|
|
push ESI;
|
|
push EDI;
|
|
push EBX;
|
|
push ECX;
|
|
mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
|
|
mov EBX, [ESP + LASTPARAM + 4*0]; //src.length;
|
|
mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
|
|
lea EDI, [EDI + 8*EBX]; // EDI = end of dest
|
|
lea ESI, [ESI + 4*EBX]; // ESI = end of src
|
|
neg EBX; // count UP to zero.
|
|
xor ECX, ECX; // initial carry = 0.
|
|
L1:
|
|
mov EAX, [ESI + 4*EBX];
|
|
mul EAX, EAX;
|
|
shr CL, 1; // get carry
|
|
adc [EDI + 8*EBX], EAX;
|
|
adc [EDI + 8*EBX + 4], EDX;
|
|
setc CL; // save carry
|
|
inc EBX;
|
|
jnz L1;
|
|
|
|
pop ECX;
|
|
pop EBX;
|
|
pop EDI;
|
|
pop ESI;
|
|
ret 4*4;
|
|
}
|
|
}
|
|
|
|
@safe unittest
|
|
{
|
|
uint [] aa = new uint[13];
|
|
uint [] bb = new uint[6];
|
|
for (int i=0; i<aa.length; ++i) aa[i] = 0x8000_0000;
|
|
for (int i=0; i<bb.length; ++i) bb[i] = i;
|
|
aa[$-1]= 7;
|
|
multibyteAddDiagonalSquares(aa[0..$-1], bb);
|
|
assert(aa[$-1]==7);
|
|
for (int i=0; i<bb.length; ++i) { assert(aa[2*i]==0x8000_0000+i*i); assert(aa[2*i+1]==0x8000_0000); }
|
|
}
|
|
|
|
void multibyteTriangleAccumulateD(uint[] dest, uint[] x) pure @safe @nogc
|
|
{
|
|
for (int i = 0; i < x.length-3; ++i)
|
|
{
|
|
dest[i+x.length] = multibyteMulAdd!('+')(
|
|
dest[i+i+1 .. i+x.length], x[i+1..$], x[i], 0);
|
|
}
|
|
ulong c = cast(ulong)(x[$-3]) * x[$-2] + dest[$-5];
|
|
dest[$-5] = cast(uint) c;
|
|
c >>= 32;
|
|
c += cast(ulong)(x[$-3]) * x[$-1] + dest[$-4];
|
|
dest[$-4] = cast(uint) c;
|
|
c >>= 32;
|
|
length2:
|
|
c += cast(ulong)(x[$-2]) * x[$-1];
|
|
dest[$-3] = cast(uint) c;
|
|
c >>= 32;
|
|
dest[$-2] = cast(uint) c;
|
|
}
|
|
|
|
//dest += src[0]*src[1...$] + src[1]*src[2..$] + ... + src[$-3]*src[$-2..$]+ src[$-2]*src[$-1]
|
|
// assert(dest.length = src.length*2);
|
|
// assert(src.length >= 3);
|
|
void multibyteTriangleAccumulateAsm(uint[] dest, const uint[] src) pure @safe @nogc
|
|
{
|
|
// Register usage
|
|
// EDX:EAX = used in multiply
|
|
// EBX = index
|
|
// ECX = carry1
|
|
// EBP = carry2
|
|
// EDI = end of dest for this pass through the loop. Index for outer loop.
|
|
// ESI = end of src. never changes
|
|
// [ESP] = M = src[i] = multiplier for this pass through the loop.
|
|
// dest.length is changed into dest.ptr+dest.length
|
|
version (D_PIC)
|
|
{
|
|
enum { zero = 0 }
|
|
}
|
|
else
|
|
{
|
|
// use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
|
|
// when initializing registers to zero.
|
|
static immutable int zero = 0;
|
|
// use p3/p4 units
|
|
alias storagenop = storagenopTriangleAccumulate; // write-only
|
|
}
|
|
|
|
enum { LASTPARAM = 6*4 } // 4* pushes + local + return address.
|
|
asm pure nothrow @nogc @trusted {
|
|
naked;
|
|
|
|
push ESI;
|
|
push EDI;
|
|
align 16;
|
|
push EBX;
|
|
push EBP;
|
|
push EAX; // local variable M= src[i]
|
|
mov EDI, [ESP + LASTPARAM + 4*3]; // dest.ptr
|
|
mov EBX, [ESP + LASTPARAM + 4*0]; // src.length
|
|
mov ESI, [ESP + LASTPARAM + 4*1]; // src.ptr
|
|
|
|
lea ESI, [ESI + 4*EBX]; // ESI = end of left
|
|
add int ptr [ESP + LASTPARAM + 4*1], 4; // src.ptr, used for getting M
|
|
|
|
// local variable [ESP + LASTPARAM + 4*2] = last value for EDI
|
|
lea EDI, [EDI + 4*EBX]; // EDI = end of dest for first pass
|
|
|
|
lea EAX, [EDI + 4*EBX-3*4]; // up to src.length - 3
|
|
mov [ESP + LASTPARAM + 4*2], EAX; // last value for EDI = &dest[src.length*2 -3]
|
|
|
|
cmp EBX, 3;
|
|
jz length_is_3;
|
|
|
|
// We start at src[1], not src[0].
|
|
dec EBX;
|
|
mov [ESP + LASTPARAM + 4*0], EBX;
|
|
|
|
outer_loop:
|
|
mov EBX, [ESP + LASTPARAM + 4*0]; // src.length
|
|
mov EBP, 0;
|
|
mov ECX, 0; // ECX = input carry.
|
|
dec int ptr [ESP + LASTPARAM + 4*0]; // Next time, the length will be shorter by 1.
|
|
neg EBX; // count UP to zero.
|
|
|
|
mov EAX, [ESI + 4*EBX - 4*1]; // get new M
|
|
mov [ESP], EAX; // save new M
|
|
|
|
mov EAX, [ESI+4*EBX];
|
|
test EBX, 1;
|
|
jnz L_enter_odd;
|
|
}
|
|
// -- Inner loop, with even entry point
|
|
mixin("asm pure nothrow @nogc @trusted { " ~ asmMulAdd_innerloop("add", "ESP") ~ "}");
|
|
asm pure nothrow @nogc @trusted {
|
|
mov [-4+EDI+4*EBX], EBP;
|
|
add EDI, 4;
|
|
cmp EDI, [ESP + LASTPARAM + 4*2]; // is EDI = &dest[$-3]?
|
|
jnz outer_loop;
|
|
length_is_3:
|
|
mov EAX, [ESI - 4*3];
|
|
mul EAX, [ESI - 4*2];
|
|
mov ECX, 0;
|
|
add [EDI-2*4], EAX; // ECX:dest[$-5] += x[$-3] * x[$-2]
|
|
adc ECX, EDX;
|
|
|
|
mov EAX, [ESI - 4*3];
|
|
mul EAX, [ESI - 4*1]; // x[$-3] * x[$-1]
|
|
add EAX, ECX;
|
|
mov ECX, 0;
|
|
adc EDX, 0;
|
|
// now EDX: EAX = c + x[$-3] * x[$-1]
|
|
add [EDI-1*4], EAX; // ECX:dest[$-4] += (EDX:EAX)
|
|
adc ECX, EDX; // ECX holds dest[$-3], it acts as carry for the last row
|
|
// do length == 2
|
|
mov EAX, [ESI - 4*2];
|
|
mul EAX, [ESI - 4*1];
|
|
add ECX, EAX;
|
|
adc EDX, 0;
|
|
mov [EDI - 0*4], ECX; // dest[$-2:$-3] = c + x[$-2] * x[$-1];
|
|
mov [EDI + 1*4], EDX;
|
|
|
|
pop EAX;
|
|
pop EBP;
|
|
pop EBX;
|
|
pop EDI;
|
|
pop ESI;
|
|
ret 4*4;
|
|
}
|
|
L_enter_odd:
|
|
mixin("asm pure nothrow @nogc @trusted {" ~ asmMulAdd_enter_odd("add", "ESP") ~ "}");
|
|
}
|
|
|
|
@safe unittest
|
|
{
|
|
uint [] aa = new uint[200];
|
|
uint [] a = aa[0 .. 100];
|
|
uint [] b = new uint [100];
|
|
aa[] = 761;
|
|
a[] = 0;
|
|
b[] = 0;
|
|
a[3] = 6;
|
|
b[0]=1;
|
|
b[1] = 17;
|
|
b[50 .. 100]=78;
|
|
multibyteTriangleAccumulateAsm(a, b[0 .. 50]);
|
|
uint [] c = new uint[100];
|
|
c[] = 0;
|
|
c[1] = 17;
|
|
c[3] = 6;
|
|
assert(a[]==c[]);
|
|
assert(a[0]==0);
|
|
aa[] = 0xFFFF_FFFF;
|
|
a[] = 0;
|
|
b[] = 0;
|
|
b[0]= 0xbf6a1f01;
|
|
b[1]= 0x6e38ed64;
|
|
b[2]= 0xdaa797ed;
|
|
b[3] = 0;
|
|
|
|
multibyteTriangleAccumulateAsm(a[0 .. 8], b[0 .. 4]);
|
|
assert(a[1]==0x3a600964);
|
|
assert(a[2]==0x339974f6);
|
|
assert(a[3]==0x46736fce);
|
|
assert(a[4]==0x5e24a2b4);
|
|
|
|
b[3] = 0xe93ff9f4;
|
|
b[4] = 0x184f03;
|
|
a[]=0;
|
|
multibyteTriangleAccumulateAsm(a[0 .. 14], b[0 .. 7]);
|
|
assert(a[3]==0x79fff5c2);
|
|
assert(a[4]==0xcf384241);
|
|
assert(a[5]== 0x4a17fc8);
|
|
assert(a[6]==0x4d549025);
|
|
}
|
|
|
|
|
|
void multibyteSquare(BigDigit[] result, const BigDigit [] x) pure @safe @nogc
|
|
{
|
|
if (x.length < 4)
|
|
{
|
|
// Special cases, not worth doing triangular.
|
|
result[x.length] = multibyteMul(result[0 .. x.length], x, x[0], 0);
|
|
multibyteMultiplyAccumulate(result[1..$], x, x[1..$]);
|
|
return;
|
|
}
|
|
// Do half a square multiply.
|
|
// dest += src[0]*src[1...$] + src[1]*src[2..$] + ... + src[$-3]*src[$-2..$]+ src[$-2]*src[$-1]
|
|
result[x.length] = multibyteMul(result[1 .. x.length], x[1..$], x[0], 0);
|
|
multibyteTriangleAccumulateAsm(result[2..$], x[1..$]);
|
|
// Multiply by 2
|
|
result[$-1] = multibyteShlNoMMX(result[1..$-1], result[1..$-1], 1);
|
|
// And add the diagonal elements
|
|
result[0] = 0;
|
|
multibyteAddDiagonalSquares(result, x);
|
|
}
|
|
|
|
version (BignumPerformanceTest)
|
|
{
|
|
import core.stdc.stdio;
|
|
int clock() { asm { push EBX; xor EAX, EAX; cpuid; pop EBX; rdtsc; } }
|
|
|
|
__gshared uint [2200] X1;
|
|
__gshared uint [2200] Y1;
|
|
__gshared uint [4000] Z1;
|
|
|
|
void testPerformance() pure
|
|
{
|
|
// The performance results at the top of this file were obtained using
|
|
// a Windows device driver to access the CPU performance counters.
|
|
// The code below is less accurate but more widely usable.
|
|
// The value for division is quite inconsistent.
|
|
for (int i=0; i<X1.length; ++i) { X1[i]=i; Y1[i]=i; Z1[i]=i; }
|
|
int t, t0;
|
|
multibyteShl(Z1[0 .. 2000], X1[0 .. 2000], 7);
|
|
t0 = clock();
|
|
multibyteShl(Z1[0 .. 1000], X1[0 .. 1000], 7);
|
|
t = clock();
|
|
multibyteShl(Z1[0 .. 2000], X1[0 .. 2000], 7);
|
|
auto shltime = (clock() - t) - (t - t0);
|
|
t0 = clock();
|
|
multibyteShr(Z1[2 .. 1002], X1[4 .. 1004], 13);
|
|
t = clock();
|
|
multibyteShr(Z1[2 .. 2002], X1[4 .. 2004], 13);
|
|
auto shrtime = (clock() - t) - (t - t0);
|
|
t0 = clock();
|
|
multibyteAddSub!('+')(Z1[0 .. 1000], X1[0 .. 1000], Y1[0 .. 1000], 0);
|
|
t = clock();
|
|
multibyteAddSub!('+')(Z1[0 .. 2000], X1[0 .. 2000], Y1[0 .. 2000], 0);
|
|
auto addtime = (clock() - t) - (t-t0);
|
|
t0 = clock();
|
|
multibyteMul(Z1[0 .. 1000], X1[0 .. 1000], 7, 0);
|
|
t = clock();
|
|
multibyteMul(Z1[0 .. 2000], X1[0 .. 2000], 7, 0);
|
|
auto multime = (clock() - t) - (t - t0);
|
|
multibyteMulAdd!('+')(Z1[0 .. 2000], X1[0 .. 2000], 217, 0);
|
|
t0 = clock();
|
|
multibyteMulAdd!('+')(Z1[0 .. 1000], X1[0 .. 1000], 217, 0);
|
|
t = clock();
|
|
multibyteMulAdd!('+')(Z1[0 .. 2000], X1[0 .. 2000], 217, 0);
|
|
auto muladdtime = (clock() - t) - (t - t0);
|
|
multibyteMultiplyAccumulate(Z1[0 .. 64], X1[0 .. 32], Y1[0 .. 32]);
|
|
t = clock();
|
|
multibyteMultiplyAccumulate(Z1[0 .. 64], X1[0 .. 32], Y1[0 .. 32]);
|
|
auto accumtime = clock() - t;
|
|
t0 = clock();
|
|
multibyteDivAssign(Z1[0 .. 2000], 217, 0);
|
|
t = clock();
|
|
multibyteDivAssign(Z1[0 .. 1000], 37, 0);
|
|
auto divtime = (t - t0) - (clock() - t);
|
|
t= clock();
|
|
multibyteSquare(Z1[0 .. 64], X1[0 .. 32]);
|
|
auto squaretime = clock() - t;
|
|
|
|
printf("-- BigInt asm performance (cycles/int) --\n");
|
|
printf("Add: %.2f\n", addtime/1000.0);
|
|
printf("Shl: %.2f\n", shltime/1000.0);
|
|
printf("Shr: %.2f\n", shrtime/1000.0);
|
|
printf("Mul: %.2f\n", multime/1000.0);
|
|
printf("MulAdd: %.2f\n", muladdtime/1000.0);
|
|
printf("Div: %.2f\n", divtime/1000.0);
|
|
printf("MulAccum32: %.2f*n*n (total %d)\n", accumtime/(32.0*32.0), accumtime);
|
|
printf("Square32: %.2f*n*n (total %d)\n\n", squaretime/(32.0*32.0), squaretime);
|
|
}
|
|
|
|
static this()
|
|
{
|
|
testPerformance();
|
|
}
|
|
}
|
|
|
|
} // version (D_InlineAsm_X86)
|