/** Optimised asm arbitrary precision arithmetic ('bignum') * routines for X86 processors. * * All functions operate on arrays of uints, stored LSB first. * If there is a destination array, it will be the first parameter. * Currently, all of these functions are subject to change, and are * intended for internal use only. * The symbol [#] indicates an array of machine words which is to be * interpreted as a multi-byte number. */ /* Copyright Don Clugston 2008 - 2010. * Distributed under the Boost Software License, Version 1.0. * (See accompanying file LICENSE_1_0.txt or copy at * http://www.boost.org/LICENSE_1_0.txt) */ /** * In simple terms, there are 3 modern x86 microarchitectures: * (a) the P6 family (Pentium Pro, PII, PIII, PM, Core), produced by Intel; * (b) the K6, Athlon, and AMD64 families, produced by AMD; and * (c) the Pentium 4, produced by Marketing. * * This code has been optimised for the Intel P6 family. * Generally the code remains near-optimal for Intel Core2/Corei7, after * translating EAX-> RAX, etc, since all these CPUs use essentially the same * pipeline, and are typically limited by memory access. * The code uses techniques described in Agner Fog's superb Pentium manuals * available at www.agner.org. * Not optimised for AMD, which can do two memory loads per cycle (Intel * CPUs can only do one). Despite this, performance is superior on AMD. * Performance is dreadful on P4. * * Timing results (cycles per int) * --Intel Pentium-- --AMD-- * PM P4 Core2 K7 * +,- 2.25 15.6 2.25 1.5 * <<,>> 2.0 6.6 2.0 5.0 * (<< MMX) 1.7 5.3 1.5 1.2 * * 5.0 15.0 4.0 4.3 * mulAdd 5.7 19.0 4.9 4.0 * div 30.0 32.0 32.0 22.4 * mulAcc(32) 6.5 20.0 5.4 4.9 * * mulAcc(32) is multiplyAccumulate() for a 32*32 multiply. Thus it includes * function call overhead. * The timing for Div is quite unpredictable, but it's probably too slow * to be useful. On 64-bit processors, these times should * halve if run in 64-bit mode, except for the MMX functions. */ module std.internal.math.biguintx86; @system: pure: nothrow: /* Naked asm is used throughout, because: (a) it frees up the EBP register (b) compiler bugs prevent the use of .ptr when a frame pointer is used. */ version (D_InlineAsm_X86) { private: /* Duplicate string s, with n times, substituting index for '@'. * * Each instance of '@' in s is replaced by 0,1,...n-1. This is a helper * function for some of the asm routines. */ string indexedLoopUnroll(int n, string s) pure @safe { string u; for (int i = 0; i9 ? ""~ cast(char)('0'+i/10) : "") ~ cast(char)('0' + i%10); int last = 0; for (int j = 0; j> numbits * numbits must be in the range 1 .. 31 * This version uses MMX. */ uint multibyteShl(uint [] dest, const uint [] src, uint numbits) pure @safe { // Timing: // K7 1.2/int. PM 1.7/int P4 5.3/int enum { LASTPARAM = 4*4 } // 3* pushes + return address. asm pure nothrow @trusted { naked; push ESI; push EDI; push EBX; mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr; mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length; mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr; movd MM3, EAX; // numbits = bits to shift left xor EAX, 63; align 16; inc EAX; movd MM4, EAX ; // 64-numbits = bits to shift right // Get the return value into EAX and EAX, 31; // EAX = 32-numbits movd MM2, EAX; // 32-numbits movd MM1, [ESI+4*EBX-4]; psrlq MM1, MM2; movd EAX, MM1; // EAX = return value test EBX, 1; jz L_even; L_odd: cmp EBX, 1; jz L_length1; // deal with odd lengths movq MM1, [ESI+4*EBX-8]; psrlq MM1, MM2; movd [EDI +4*EBX-4], MM1; sub EBX, 1; L_even: // It's either singly or doubly even movq MM2, [ESI + 4*EBX - 8]; psllq MM2, MM3; sub EBX, 2; jle L_last; movq MM1, MM2; add EBX, 2; test EBX, 2; jz L_onceeven; sub EBX, 2; // MAIN LOOP -- 128 bytes per iteration L_twiceeven: // here MM2 is the carry movq MM0, [ESI + 4*EBX-8]; psrlq MM0, MM4; movq MM1, [ESI + 4*EBX-8]; psllq MM1, MM3; por MM2, MM0; movq [EDI +4*EBX], MM2; L_onceeven: // here MM1 is the carry movq MM0, [ESI + 4*EBX-16]; psrlq MM0, MM4; movq MM2, [ESI + 4*EBX-16]; por MM1, MM0; movq [EDI +4*EBX-8], MM1; psllq MM2, MM3; sub EBX, 4; jg L_twiceeven; L_last: movq [EDI +4*EBX], MM2; L_alldone: emms; // NOTE: costs 6 cycles on Intel CPUs pop EBX; pop EDI; pop ESI; ret 4*4; L_length1: // length 1 is a special case movd MM1, [ESI]; psllq MM1, MM3; movd [EDI], MM1; jmp L_alldone; } } void multibyteShr(uint [] dest, const uint [] src, uint numbits) pure @safe { enum { LASTPARAM = 4*4 } // 3* pushes + return address. asm pure nothrow @trusted { naked; push ESI; push EDI; push EBX; mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr; mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length; align 16; mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr; lea EDI, [EDI + 4*EBX]; // EDI = end of dest lea ESI, [ESI + 4*EBX]; // ESI = end of src neg EBX; // count UP to zero. movd MM3, EAX; // numbits = bits to shift right xor EAX, 63; inc EAX; movd MM4, EAX ; // 64-numbits = bits to shift left test EBX, 1; jz L_even; L_odd: // deal with odd lengths and EAX, 31; // EAX = 32-numbits movd MM2, EAX; // 32-numbits cmp EBX, -1; jz L_length1; movq MM0, [ESI+4*EBX]; psrlq MM0, MM3; movd [EDI +4*EBX], MM0; add EBX, 1; L_even: movq MM2, [ESI + 4*EBX]; psrlq MM2, MM3; movq MM1, MM2; add EBX, 4; cmp EBX, -2+4; jz L_last; // It's either singly or doubly even sub EBX, 2; test EBX, 2; jnz L_onceeven; add EBX, 2; // MAIN LOOP -- 128 bytes per iteration L_twiceeven: // here MM2 is the carry movq MM0, [ESI + 4*EBX-8]; psllq MM0, MM4; movq MM1, [ESI + 4*EBX-8]; psrlq MM1, MM3; por MM2, MM0; movq [EDI +4*EBX-16], MM2; L_onceeven: // here MM1 is the carry movq MM0, [ESI + 4*EBX]; psllq MM0, MM4; movq MM2, [ESI + 4*EBX]; por MM1, MM0; movq [EDI +4*EBX-8], MM1; psrlq MM2, MM3; add EBX, 4; jl L_twiceeven; L_last: movq [EDI +4*EBX-16], MM2; L_alldone: emms; // NOTE: costs 6 cycles on Intel CPUs pop EBX; pop EDI; pop ESI; ret 4*4; L_length1: // length 1 is a special case movd MM1, [ESI+4*EBX]; psrlq MM1, MM3; movd [EDI +4*EBX], MM1; jmp L_alldone; } } /** dest[#] = src[#] >> numbits * numbits must be in the range 1 .. 31 */ void multibyteShrNoMMX(uint [] dest, const uint [] src, uint numbits) pure @safe { // Timing: Optimal for P6 family. // 2.0 cycles/int on PPro .. PM (limited by execution port p0) // Terrible performance on AMD64, which has 7 cycles for SHRD!! enum { LASTPARAM = 4*4 } // 3* pushes + return address. asm pure nothrow @trusted { naked; push ESI; push EDI; push EBX; mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr; mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length; mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr; mov ECX, EAX; // numbits; lea EDI, [EDI + 4*EBX]; // EDI = end of dest lea ESI, [ESI + 4*EBX]; // ESI = end of src neg EBX; // count UP to zero. mov EAX, [ESI + 4*EBX]; cmp EBX, -1; jz L_last; mov EDX, [ESI + 4*EBX]; test EBX, 1; jz L_odd; add EBX, 1; L_even: mov EDX, [ ESI + 4*EBX]; shrd EAX, EDX, CL; mov [-4 + EDI+4*EBX], EAX; L_odd: mov EAX, [4 + ESI + 4*EBX]; shrd EDX, EAX, CL; mov [EDI + 4*EBX], EDX; add EBX, 2; jl L_even; L_last: shr EAX, CL; mov [-4 + EDI], EAX; pop EBX; pop EDI; pop ESI; ret 4*4; } } @safe unittest { uint [] aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; multibyteShr(aa[0..$-1], aa, 4); assert(aa[0] == 0x6122_2222 && aa[1]==0xA455_5555 && aa[2]==0xD899_9999 && aa[3]==0x0BCC_CCCC); aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; multibyteShr(aa[2..$-1], aa[2..$-1], 4); assert(aa[0] == 0x1222_2223 && aa[1]==0x4555_5556 && aa[2]==0xD899_9999 && aa[3]==0x0BCC_CCCC); aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; multibyteShr(aa[0..$-2], aa, 4); assert(aa[1]==0xA455_5555 && aa[2]==0x0899_9999); assert(aa[0]==0x6122_2222); assert(aa[3]==0xBCCC_CCCD); aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; uint r = multibyteShl(aa[2 .. 4], aa[2 .. 4], 4); assert(aa[0] == 0xF0FF_FFFF && aa[1]==0x1222_2223 && aa[2]==0x5555_5560 && aa[3]==0x9999_99A4 && aa[4]==0xBCCC_CCCD); assert(r == 8); aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; r = multibyteShl(aa[1 .. 4], aa[1 .. 4], 4); assert(aa[0] == 0xF0FF_FFFF && aa[2]==0x5555_5561); assert(aa[3]==0x9999_99A4 && aa[4]==0xBCCC_CCCD); assert(r == 8); assert(aa[1]==0x2222_2230); aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; r = multibyteShl(aa[0 .. 4], aa[1 .. 5], 31); } /** dest[#] = src[#] * multiplier + carry. * Returns carry. */ uint multibyteMul(uint[] dest, const uint[] src, uint multiplier, uint carry) pure @safe { // Timing: definitely not optimal. // Pentium M: 5.0 cycles/operation, has 3 resource stalls/iteration // Fastest implementation found was 4.6 cycles/op, but not worth the complexity. enum { LASTPARAM = 4*4 } // 4* pushes + return address. // We'll use p2 (load unit) instead of the overworked p0 or p1 (ALU units) // when initializing variables to zero. version (D_PIC) { enum { zero = 0 } } else { static immutable int zero = 0; } asm pure nothrow @trusted { naked; push ESI; push EDI; push EBX; mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length mov ESI, [ESP + LASTPARAM + 4*2]; // src.ptr align 16; lea EDI, [EDI + 4*EBX]; // EDI = end of dest lea ESI, [ESI + 4*EBX]; // ESI = end of src mov ECX, EAX; // [carry]; -- last param is in EAX. neg EBX; // count UP to zero. test EBX, 1; jnz L_odd; add EBX, 1; L1: mov EAX, [-4 + ESI + 4*EBX]; mul int ptr [ESP+LASTPARAM]; //[multiplier]; add EAX, ECX; mov ECX, zero; mov [-4+EDI + 4*EBX], EAX; adc ECX, EDX; L_odd: mov EAX, [ESI + 4*EBX]; // p2 mul int ptr [ESP+LASTPARAM]; //[multiplier]; // p0*3, add EAX, ECX; mov ECX, zero; adc ECX, EDX; mov [EDI + 4*EBX], EAX; add EBX, 2; jl L1; mov EAX, ECX; // get final carry pop EBX; pop EDI; pop ESI; ret 5*4; } } @system unittest { uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; multibyteMul(aa[1 .. 4], aa[1 .. 4], 16, 0); assert(aa[0] == 0xF0FF_FFFF && aa[1] == 0x2222_2230 && aa[2]==0x5555_5561 && aa[3]==0x9999_99A4 && aa[4]==0x0BCCC_CCCD); } // The inner multiply-and-add loop, together with the Even entry point. // Multiples by M_ADDRESS which should be "ESP+LASTPARAM" or "ESP". OP must be "add" or "sub" // This is the most time-critical code in the BigInt library. // It is used by both MulAdd, multiplyAccumulate, and triangleAccumulate string asmMulAdd_innerloop(string OP, string M_ADDRESS) pure @safe { // The bottlenecks in this code are extremely complicated. The MUL, ADD, and ADC // need 4 cycles on each of the ALUs units p0 and p1. So we use memory load // (unit p2) for initializing registers to zero. // There are also dependencies between the instructions, and we run up against the // ROB-read limit (can only read 2 registers per cycle). // We also need the number of uops in the loop to be a multiple of 3. // The only available execution unit for this is p3 (memory write). Unfortunately we can't do that // if Position-Independent Code is required. // Register usage // ESI = end of src // EDI = end of dest // EBX = index. Counts up to zero (in steps of 2). // EDX:EAX = scratch, used in multiply. // ECX = carry1. // EBP = carry2. // ESP = points to the multiplier. // The first member of 'dest' which will be modified is [EDI+4*EBX]. // EAX must already contain the first member of 'src', [ESI+4*EBX]. version (D_PIC) { bool using_PIC = true; } else { bool using_PIC = false; } return " // Entry point for even length add EBX, 1; mov EBP, ECX; // carry mul int ptr [" ~ M_ADDRESS ~ "]; // M mov ECX, 0; add EBP, EAX; mov EAX, [ESI+4*EBX]; adc ECX, EDX; mul int ptr [" ~ M_ADDRESS ~ "]; // M " ~ OP ~ " [-4+EDI+4*EBX], EBP; mov EBP, zero; adc ECX, EAX; mov EAX, [4+ESI+4*EBX]; adc EBP, EDX; add EBX, 2; jnl L_done; L1: mul int ptr [" ~ M_ADDRESS ~ "]; " ~ OP ~ " [-8+EDI+4*EBX], ECX; adc EBP, EAX; mov ECX, zero; mov EAX, [ESI+4*EBX]; adc ECX, EDX; " ~ (using_PIC ? "" : " mov storagenop, EDX; ") // make #uops in loop a multiple of 3, can't do this in PIC mode. ~ " mul int ptr [" ~ M_ADDRESS ~ "]; " ~ OP ~ " [-4+EDI+4*EBX], EBP; mov EBP, zero; adc ECX, EAX; mov EAX, [4+ESI+4*EBX]; adc EBP, EDX; add EBX, 2; jl L1; L_done: " ~ OP ~ " [-8+EDI+4*EBX], ECX; adc EBP, 0; "; // final carry is now in EBP } string asmMulAdd_enter_odd(string OP, string M_ADDRESS) pure @safe { return " mul int ptr [" ~M_ADDRESS ~"]; mov EBP, zero; add ECX, EAX; mov EAX, [4+ESI+4*EBX]; adc EBP, EDX; add EBX, 2; jl L1; jmp L_done; "; } version (D_PIC) {} else { // This cannot be declared inside the relevant functions because __gshared is // not allowed in @safe functions. It cannot be regular shared either since // that gives an access violation on Win32 for some reason. private __gshared int storagenopTriangleAccumulate; private __gshared int storagenopMulAdd; private __gshared int storagenopMultiplyAccumulate; } /** * dest[#] += src[#] * multiplier OP carry(0 .. FFFF_FFFF). * where op == '+' or '-' * Returns carry out of MSB (0 .. FFFF_FFFF). */ uint multibyteMulAdd(char op)(uint [] dest, const uint [] src, uint multiplier, uint carry) pure @safe { // Timing: This is the most time-critical bignum function. // Pentium M: 5.4 cycles/operation, still has 2 resource stalls + 1load block/iteration // The main loop is pipelined and unrolled by 2, // so entry to the loop is also complicated. // Register usage // EDX:EAX = multiply // EBX = counter // ECX = carry1 // EBP = carry2 // EDI = dest // ESI = src enum string OP = (op=='+')? "add" : "sub"; version (D_PIC) { enum { zero = 0 } } else { // use p2 (load unit) instead of the overworked p0 or p1 (ALU units) // when initializing registers to zero. static immutable int zero = 0; // use p3/p4 units alias storagenop = storagenopMulAdd; // write-only } enum { LASTPARAM = 5*4 } // 4* pushes + return address. asm pure nothrow @trusted { naked; push ESI; push EDI; push EBX; push EBP; mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length align 16; nop; mov ESI, [ESP + LASTPARAM + 4*2]; // src.ptr lea EDI, [EDI + 4*EBX]; // EDI = end of dest lea ESI, [ESI + 4*EBX]; // ESI = end of src mov EBP, 0; mov ECX, EAX; // ECX = input carry. neg EBX; // count UP to zero. mov EAX, [ESI+4*EBX]; test EBX, 1; jnz L_enter_odd; } // Main loop, with entry point for even length mixin("asm pure nothrow @trusted {" ~ asmMulAdd_innerloop(OP, "ESP+LASTPARAM") ~ "}"); asm pure nothrow @trusted { mov EAX, EBP; // get final carry pop EBP; pop EBX; pop EDI; pop ESI; ret 5*4; } L_enter_odd: mixin("asm pure nothrow @trusted {" ~ asmMulAdd_enter_odd(OP, "ESP+LASTPARAM") ~ "}"); } @system unittest { uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; uint [] bb = [0x1234_1234, 0xF0F0_F0F0, 0x00C0_C0C0, 0xF0F0_F0F0, 0xC0C0_C0C0]; multibyteMulAdd!('+')(bb[1..$-1], aa[1..$-2], 16, 5); assert(bb[0] == 0x1234_1234 && bb[4] == 0xC0C0_C0C0); assert(bb[1] == 0x2222_2230 + 0xF0F0_F0F0+5 && bb[2] == 0x5555_5561+0x00C0_C0C0+1 && bb[3] == 0x9999_99A4+0xF0F0_F0F0 ); } /** Sets result[#] = result[0 .. left.length] + left[#] * right[#] It is defined in this way to allow cache-efficient multiplication. This function is equivalent to: ---- for (int i = 0; i< right.length; ++i) { dest[left.length + i] = multibyteMulAdd(dest[i .. left.length+i], left, right[i], 0); } ---- */ void multibyteMultiplyAccumulate(uint [] dest, const uint[] left, const uint [] right) pure @safe { // Register usage // EDX:EAX = used in multiply // EBX = index // ECX = carry1 // EBP = carry2 // EDI = end of dest for this pass through the loop. Index for outer loop. // ESI = end of left. never changes // [ESP] = M = right[i] = multiplier for this pass through the loop. // right.length is changed into dest.ptr+dest.length version (D_PIC) { enum { zero = 0 } } else { // use p2 (load unit) instead of the overworked p0 or p1 (ALU units) // when initializing registers to zero. static immutable int zero = 0; // use p3/p4 units alias storagenop = storagenopMultiplyAccumulate; // write-only } enum { LASTPARAM = 6*4 } // 4* pushes + local + return address. asm pure nothrow @trusted { naked; push ESI; push EDI; align 16; push EBX; push EBP; push EAX; // local variable M mov EDI, [ESP + LASTPARAM + 4*5]; // dest.ptr mov EBX, [ESP + LASTPARAM + 4*2]; // left.length mov ESI, [ESP + LASTPARAM + 4*3]; // left.ptr lea EDI, [EDI + 4*EBX]; // EDI = end of dest for first pass mov EAX, [ESP + LASTPARAM + 4*0]; // right.length lea EAX, [EDI + 4*EAX]; mov [ESP + LASTPARAM + 4*0], EAX; // last value for EDI lea ESI, [ESI + 4*EBX]; // ESI = end of left mov EAX, [ESP + LASTPARAM + 4*1]; // right.ptr mov EAX, [EAX]; mov [ESP], EAX; // M outer_loop: mov EBP, 0; mov ECX, 0; // ECX = input carry. neg EBX; // count UP to zero. mov EAX, [ESI+4*EBX]; test EBX, 1; jnz L_enter_odd; } // -- Inner loop, with even entry point mixin("asm pure nothrow @trusted { " ~ asmMulAdd_innerloop("add", "ESP") ~ "}"); asm pure nothrow @trusted { mov [-4+EDI+4*EBX], EBP; add EDI, 4; cmp EDI, [ESP + LASTPARAM + 4*0]; // is EDI = &dest[$]? jz outer_done; mov EAX, [ESP + LASTPARAM + 4*1]; // right.ptr mov EAX, [EAX+4]; // get new M mov [ESP], EAX; // save new M add int ptr [ESP + LASTPARAM + 4*1], 4; // right.ptr mov EBX, [ESP + LASTPARAM + 4*2]; // left.length jmp outer_loop; outer_done: pop EAX; pop EBP; pop EBX; pop EDI; pop ESI; ret 6*4; } L_enter_odd: mixin("asm pure nothrow @trusted {" ~ asmMulAdd_enter_odd("add", "ESP") ~ "}"); } /** dest[#] /= divisor. * overflow is the initial remainder, and must be in the range 0 .. divisor-1. * divisor must not be a power of 2 (use right shift for that case; * A division by zero will occur if divisor is a power of 2). * Returns the final remainder * * Based on public domain code by Eric Bainville. * (http://www.bealto.com/) Used with permission. */ uint multibyteDivAssign(uint [] dest, uint divisor, uint overflow) pure @safe { // Timing: limited by a horrible dependency chain. // Pentium M: 18 cycles/op, 8 resource stalls/op. // EAX, EDX = scratch, used by MUL // EDI = dest // CL = shift // ESI = quotient // EBX = remainderhi // EBP = remainderlo // [ESP-4] = mask // [ESP] = kinv (2^64 /divisor) enum { LASTPARAM = 5*4 } // 4* pushes + return address. enum { LOCALS = 2*4} // MASK, KINV asm pure nothrow @trusted { naked; push ESI; push EDI; push EBX; push EBP; mov EDI, [ESP + LASTPARAM + 4*2]; // dest.ptr mov EBX, [ESP + LASTPARAM + 4*1]; // dest.length // Loop from msb to lsb lea EDI, [EDI + 4*EBX]; mov EBP, EAX; // rem is the input remainder, in 0 .. divisor-1 // Build the pseudo-inverse of divisor k: 2^64/k // First determine the shift in ecx to get the max number of bits in kinv xor ECX, ECX; mov EAX, [ESP + LASTPARAM]; //divisor; mov EDX, 1; kinv1: inc ECX; ror EDX, 1; shl EAX, 1; jnc kinv1; dec ECX; // Here, ecx is a left shift moving the msb of k to bit 32 mov EAX, 1; shl EAX, CL; dec EAX; ror EAX, CL ; //ecx bits at msb push EAX; // MASK // Then divide 2^(32+cx) by divisor (edx already ok) xor EAX, EAX; div int ptr [ESP + LASTPARAM + LOCALS-4*1]; //divisor; push EAX; // kinv align 16; L2: // Get 32 bits of quotient approx, multiplying // most significant word of (rem*2^32+input) mov EAX, [ESP+4]; //MASK; and EAX, [EDI - 4]; or EAX, EBP; rol EAX, CL; mov EBX, EBP; mov EBP, [EDI - 4]; mul int ptr [ESP]; //KINV; shl EAX, 1; rcl EDX, 1; // Multiply by k and subtract to get remainder // Subtraction must be done on two words mov EAX, EDX; mov ESI, EDX; // quot = high word mul int ptr [ESP + LASTPARAM+LOCALS]; //divisor; sub EBP, EAX; sbb EBX, EDX; jz Lb; // high word is 0, goto adjust on single word // Adjust quotient and remainder on two words Ld: inc ESI; sub EBP, [ESP + LASTPARAM+LOCALS]; //divisor; sbb EBX, 0; jnz Ld; // Adjust quotient and remainder on single word Lb: cmp EBP, [ESP + LASTPARAM+LOCALS]; //divisor; jc Lc; // rem in 0 .. divisor-1, OK sub EBP, [ESP + LASTPARAM+LOCALS]; //divisor; inc ESI; jmp Lb; // Store result Lc: mov [EDI - 4], ESI; lea EDI, [EDI - 4]; dec int ptr [ESP + LASTPARAM + 4*1+LOCALS]; // len jnz L2; pop EAX; // discard kinv pop EAX; // discard mask mov EAX, EBP; // return final remainder pop EBP; pop EBX; pop EDI; pop ESI; ret 3*4; } } @safe unittest { uint [] aa = new uint[101]; for (int i=0; i>= 32; c += cast(ulong)(x[$-3]) * x[$-1] + dest[$-4]; dest[$-4] = cast(uint) c; c >>= 32; length2: c += cast(ulong)(x[$-2]) * x[$-1]; dest[$-3] = cast(uint) c; c >>= 32; dest[$-2] = cast(uint) c; } //dest += src[0]*src[1...$] + src[1]*src[2..$] + ... + src[$-3]*src[$-2..$]+ src[$-2]*src[$-1] // assert(dest.length = src.length*2); // assert(src.length >= 3); void multibyteTriangleAccumulateAsm(uint[] dest, const uint[] src) pure @safe { // Register usage // EDX:EAX = used in multiply // EBX = index // ECX = carry1 // EBP = carry2 // EDI = end of dest for this pass through the loop. Index for outer loop. // ESI = end of src. never changes // [ESP] = M = src[i] = multiplier for this pass through the loop. // dest.length is changed into dest.ptr+dest.length version (D_PIC) { enum { zero = 0 } } else { // use p2 (load unit) instead of the overworked p0 or p1 (ALU units) // when initializing registers to zero. static immutable int zero = 0; // use p3/p4 units alias storagenop = storagenopTriangleAccumulate; // write-only } enum { LASTPARAM = 6*4 } // 4* pushes + local + return address. asm pure nothrow @trusted { naked; push ESI; push EDI; align 16; push EBX; push EBP; push EAX; // local variable M= src[i] mov EDI, [ESP + LASTPARAM + 4*3]; // dest.ptr mov EBX, [ESP + LASTPARAM + 4*0]; // src.length mov ESI, [ESP + LASTPARAM + 4*1]; // src.ptr lea ESI, [ESI + 4*EBX]; // ESI = end of left add int ptr [ESP + LASTPARAM + 4*1], 4; // src.ptr, used for getting M // local variable [ESP + LASTPARAM + 4*2] = last value for EDI lea EDI, [EDI + 4*EBX]; // EDI = end of dest for first pass lea EAX, [EDI + 4*EBX-3*4]; // up to src.length - 3 mov [ESP + LASTPARAM + 4*2], EAX; // last value for EDI = &dest[src.length*2 -3] cmp EBX, 3; jz length_is_3; // We start at src[1], not src[0]. dec EBX; mov [ESP + LASTPARAM + 4*0], EBX; outer_loop: mov EBX, [ESP + LASTPARAM + 4*0]; // src.length mov EBP, 0; mov ECX, 0; // ECX = input carry. dec int ptr [ESP + LASTPARAM + 4*0]; // Next time, the length will be shorter by 1. neg EBX; // count UP to zero. mov EAX, [ESI + 4*EBX - 4*1]; // get new M mov [ESP], EAX; // save new M mov EAX, [ESI+4*EBX]; test EBX, 1; jnz L_enter_odd; } // -- Inner loop, with even entry point mixin("asm pure nothrow @trusted { " ~ asmMulAdd_innerloop("add", "ESP") ~ "}"); asm pure nothrow @trusted { mov [-4+EDI+4*EBX], EBP; add EDI, 4; cmp EDI, [ESP + LASTPARAM + 4*2]; // is EDI = &dest[$-3]? jnz outer_loop; length_is_3: mov EAX, [ESI - 4*3]; mul EAX, [ESI - 4*2]; mov ECX, 0; add [EDI-2*4], EAX; // ECX:dest[$-5] += x[$-3] * x[$-2] adc ECX, EDX; mov EAX, [ESI - 4*3]; mul EAX, [ESI - 4*1]; // x[$-3] * x[$-1] add EAX, ECX; mov ECX, 0; adc EDX, 0; // now EDX: EAX = c + x[$-3] * x[$-1] add [EDI-1*4], EAX; // ECX:dest[$-4] += (EDX:EAX) adc ECX, EDX; // ECX holds dest[$-3], it acts as carry for the last row // do length == 2 mov EAX, [ESI - 4*2]; mul EAX, [ESI - 4*1]; add ECX, EAX; adc EDX, 0; mov [EDI - 0*4], ECX; // dest[$-2:$-3] = c + x[$-2] * x[$-1]; mov [EDI + 1*4], EDX; pop EAX; pop EBP; pop EBX; pop EDI; pop ESI; ret 4*4; } L_enter_odd: mixin("asm pure nothrow @trusted {" ~ asmMulAdd_enter_odd("add", "ESP") ~ "}"); } @safe unittest { uint [] aa = new uint[200]; uint [] a = aa[0 .. 100]; uint [] b = new uint [100]; aa[] = 761; a[] = 0; b[] = 0; a[3] = 6; b[0]=1; b[1] = 17; b[50 .. 100]=78; multibyteTriangleAccumulateAsm(a, b[0 .. 50]); uint [] c = new uint[100]; c[] = 0; c[1] = 17; c[3] = 6; assert(a[]==c[]); assert(a[0]==0); aa[] = 0xFFFF_FFFF; a[] = 0; b[] = 0; b[0]= 0xbf6a1f01; b[1]= 0x6e38ed64; b[2]= 0xdaa797ed; b[3] = 0; multibyteTriangleAccumulateAsm(a[0 .. 8], b[0 .. 4]); assert(a[1]==0x3a600964); assert(a[2]==0x339974f6); assert(a[3]==0x46736fce); assert(a[4]==0x5e24a2b4); b[3] = 0xe93ff9f4; b[4] = 0x184f03; a[]=0; multibyteTriangleAccumulateAsm(a[0 .. 14], b[0 .. 7]); assert(a[3]==0x79fff5c2); assert(a[4]==0xcf384241); assert(a[5]== 0x4a17fc8); assert(a[6]==0x4d549025); } void multibyteSquare(BigDigit[] result, const BigDigit [] x) pure @safe { if (x.length < 4) { // Special cases, not worth doing triangular. result[x.length] = multibyteMul(result[0 .. x.length], x, x[0], 0); multibyteMultiplyAccumulate(result[1..$], x, x[1..$]); return; } // Do half a square multiply. // dest += src[0]*src[1...$] + src[1]*src[2..$] + ... + src[$-3]*src[$-2..$]+ src[$-2]*src[$-1] result[x.length] = multibyteMul(result[1 .. x.length], x[1..$], x[0], 0); multibyteTriangleAccumulateAsm(result[2..$], x[1..$]); // Multiply by 2 result[$-1] = multibyteShlNoMMX(result[1..$-1], result[1..$-1], 1); // And add the diagonal elements result[0] = 0; multibyteAddDiagonalSquares(result, x); } version (BignumPerformanceTest) { import core.stdc.stdio; int clock() { asm { push EBX; xor EAX, EAX; cpuid; pop EBX; rdtsc; } } __gshared uint [2200] X1; __gshared uint [2200] Y1; __gshared uint [4000] Z1; void testPerformance() pure { // The performance results at the top of this file were obtained using // a Windows device driver to access the CPU performance counters. // The code below is less accurate but more widely usable. // The value for division is quite inconsistent. for (int i=0; i