From c09debbe32b2b05bc4712a06a3c0c79722e3779d Mon Sep 17 00:00:00 2001
From: Don Clugston <dclugston@googlemail.com>
Date: Mon, 30 Jun 2008 14:47:37 +0000
Subject: [PATCH] Restored bigint (fixed so that it compiles). I've updated my
 asm code so that it uses dynamic arrays throughout, instead of a mix of
 pointers and arrays. It's possible that pointers-only would be better. None
 of the asm code is hooked up yet.

---
 std/bigint.d | 2982 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 2979 insertions(+), 3 deletions(-)

diff --git a/std/bigint.d b/std/bigint.d
index bd90fbd27..39ab092e0 100644
--- a/std/bigint.d
+++ b/std/bigint.d
@@ -1,4 +1,2980 @@
-// This is just a placeholder
-// bigint coming soon
+// Written in the D programming language.
 
-module std.bigint;
+/**
+Provides a BigInt struct for multiprecision integer arithmetic.
+
+The internal representation is binary, not decimal.
+
+All relevant operators are overloaded.
+
+Example:
+----------------------------------------------------
+        BigInt a = "9588669891916142";
+        BigInt b = "7452469135154800";
+        auto c = a * b;
+        assert(c == "71459266416693160362545788781600");
+        auto d = b * a;
+        assert(d == "71459266416693160362545788781600");
+        assert(d == c);
+        d = c * "794628672112";
+        assert(d == "56783581982794522489042432639320434378739200");
+        auto e = c + d;
+        assert(e == "56783581982865981755459125799682980167520800");
+        auto f = d + c;
+        assert(f == e);
+        auto g = f - c;
+        assert(g == d);
+        g = f - d;
+        assert(g == c);
+        e = 12345678;
+        g = c + e;
+        auto h = g / b;
+        auto i = g % b;
+        assert(h == a);
+        assert(i == e);
+----------------------------------------------------
+
+Authors: Janice Caron
+
+Date: 2008.05.18
+
+License: Public Domain
+
+
+Macros:
+    WIKI=Phobos/StdBigint
+*/
+
+module bigint;
+import std.string       : format;
+import std.stdio        : writef, writefln;
+import std.algorithm    : min, max, swap;
+import std.traits       : isIntegral;
+
+alias uint Digit; /// alias for uint
+
+/******************
+ * Struct representing a multiprecision integer
+ */
+struct BigInt
+{
+    Digits digits = [ cast(Digit)0 ];
+
+    static const BigInt ZERO = { [ cast(Digit)0 ] };
+    static const BigInt ONE = { [ cast(Digit)1 ] };
+
+    ///
+    void opAssign(const BigInt n)
+    {
+        digits = n.digits;
+    }
+
+    ///
+    void opAssign(int n)
+    {
+        digits = cast(Digits)( [ cast(Digit)n ] );
+    }
+
+    ///
+    void opAssign(uint n)
+    {
+        static if(BIG_ENDIAN) { auto a = [ cast(Digit)0, n ]; }
+        else                  { auto a = [ cast(Digit)n, 0 ]; }
+        Big b = bigInt(a);
+        digits = b.digits;
+    }
+
+    ///
+    void opAssign(long n)
+    {
+        static if(BIG_ENDIAN) { auto a = [ cast(Digit)(n>>32), cast(Digit)n ]; }
+        else                  { auto a = [ cast(Digit)n, cast(Digit)(n>>32) ]; }
+        Big b = bigInt(a);
+        digits = b.digits;
+    }
+
+    ///
+    void opAssign(ulong n)
+    {
+        static if(BIG_ENDIAN) { auto a = [ cast(Digit)0, cast(Digit)(n>>32), cast(Digit)n ]; }
+        else                  { auto a = [ cast(Digit)n, cast(Digit)(n>>32), cast(Digit)0 ]; }
+        Big b = bigInt(a);
+        digits = b.digits;
+    }
+
+    ///
+    void opAssign(string s)
+    {
+        Big b = fromString(s);
+        digits = b.digits;
+    }
+
+    ///
+    static BigInt opCall(T)(T n)
+    {
+        BigInt r;
+        r.opAssign(n);
+        return r;
+    }
+
+    // Convert TO other types
+
+    ///
+    void castTo(out BigInt r) const
+    {
+        r.digits = digits;
+    }
+
+    ///
+    void castTo(out int r) const
+    {
+        r = cast(int)tail(digits,1u)[0];
+    }
+
+    ///
+    void castTo(out uint r) const
+    {
+        r = cast(uint)tail(digits,1u)[0];
+    }
+
+    ///
+    void castTo(out long r) const
+    {
+        ulong t;
+        castTo(t);
+        r = cast(long)t;
+    }
+
+    ///
+    void castTo(out ulong r) const
+    {
+        mixin(setUp("x","*this"));
+        r = peek(xp);
+        xp = next(xp);
+        if (xp != xe) r += cast(ulong)(peek(xp)) << 32;
+    }
+
+    ///
+    void castTo(out string r) const
+    {
+        r = decimal(*this);
+    }
+
+    // Unary operator overloads
+
+    ///
+    BigInt opPos() const
+    {
+        BigInt r;
+        r.digits = digits;
+        return r;
+    }
+
+    ///
+    BigInt opNeg() const
+    {
+        return neg(*this);
+    }
+
+    ///
+    BigInt opCom() const
+    {
+        return com(*this);
+    }
+
+    ///
+    BigInt opPostInc()
+    {
+        BigInt n = *this;
+        opAddAssign(1);
+        return n;
+    }
+
+    ///
+    BigInt opPostDec()
+    {
+        BigInt n = *this;
+        opSubAssign(1);
+        return n;
+    }
+
+    // Binary operator overloads
+
+    ///
+    BigInt opAdd(T)(T n) const
+    {
+        return opAdd(BigInt(n));
+    }
+
+    ///
+    BigInt opAdd(T:int)(T n) const
+    {
+        return add(*this,cast(Digit)n);
+    }
+
+    ///
+    BigInt opAdd(T:const(BigInt))(T n) const
+    {
+        return add(*this,n);
+    }
+
+    ///
+    void opAddAssign(T)(T n)
+    {
+        auto r = opAdd(n);
+        digits = r.digits;
+    }
+
+    ///
+    BigInt opSub(T)(T n) const
+    {
+        return opSub(BigInt(n));
+    }
+
+    ///
+    BigInt opSub(T:int)(T n) const
+    {
+        return sub(*this,cast(Digit)n);
+    }
+
+    ///
+    BigInt opSub(T:const(BigInt))(T n) const
+    {
+        return sub(*this,n);
+    }
+
+    ///
+    void opSubAssign(T)(T n)
+    {
+        auto r = opSub(n);
+        digits = r.digits;
+    }
+
+    ///
+    BigInt opMul(T)(T n) const
+    {
+        return opMul(BigInt(n));
+    }
+
+    ///
+    BigInt opMul(T:int)(T n) const
+    {
+        if (cast(int)n == int.min) return opMul(BigInt(n));
+        int xs = sgn;
+        if (xs == 0 || n == 0) return BigInt.ZERO;
+        int ys = n > 0 ? 1 : -1;
+        auto x = abs;
+        auto y = n > 0 ? n : -n;
+        auto r = mul(x,y);
+        return (xs == ys) ? r : -r;
+    }
+
+    ///
+    BigInt opMul(T:const(BigInt))(T n) const
+    {
+        int xs = sgn;
+        int ys = n.sgn;
+        if (xs == 0 || ys == 0) return BigInt.ZERO;
+        auto x = abs;
+        auto y = n.abs;
+        auto r = mul(x,y);
+        return (xs == ys) ? r : -r;
+    }
+
+    ///
+    void opMulAssign(T)(T n)
+    {
+        auto r = opMul(n);
+        digits = r.digits;
+    }
+
+    /*
+        Here's how the signs work
+         7 /  3 = 2
+         7 %  3 = 1
+         7 / -3 = -2
+         7 % -3 = 1
+        -7 /  3 = -2
+        -7 %  3 = -1
+        -7 / -3 = 2
+        -7 % -3 = -1
+    */
+
+    ///
+    BigInt opDiv(T)(T n) const
+    {
+        return opDiv(BigInt(n));
+    }
+
+    ///
+    BigInt opDiv(T:int)(T n) const
+    {
+        if (n == 0) throw new Exception("Divide by zero");
+        if (cast(int)n == int.min) return opDiv(BigInt(n));
+        int xs = sgn;
+        int ys = n > 0 ? 1 : -1;
+        if (xs == 0) return BigInt.ZERO;
+        auto x = abs;
+        auto y = n > 0 ? n : -n;
+        auto r = div(x,y);
+        return (xs == ys) ? r.q : -r.q;
+    }
+
+    ///
+    BigInt opDiv(T:const(BigInt))(T n) const
+    {
+        int xs = sgn;
+        int ys = n.sgn;
+        if (ys == 0) throw new Exception("Divide by zero");
+        if (xs == 0) return BigInt.ZERO;
+        auto x = abs;
+        auto y = n.abs;
+        auto r = div(x,y);
+        return (xs == ys) ? r.q : -r.q;
+    }
+
+    ///
+    void opDivAssign(T)(T n)
+    {
+        auto r = opDiv(n);
+        digits = r.digits;
+    }
+
+    ///
+    BigInt opMod(T)(T n) const
+    {
+        return opMod(BigInt(n));
+    }
+
+    ///
+    int opMod(T:int)(T n) const
+    {
+        if (n == 0) throw new Exception("Divide by zero");
+        int xs = sgn;
+        if (xs == 0) return n;
+        auto x = abs;
+        auto y = n > 0 ? n : -n;
+        auto r = div(x,y);
+        return (xs == 1) ? r.r : -r.r;
+    }
+
+    ///
+    BigInt opMod(T:const(BigInt))(T n) const
+    {
+        int xs = sgn;
+        int ys = n.sgn;
+        if (ys == 0) throw new Exception("Divide by zero");
+        if (xs == 0) return n;
+        auto x = abs;
+        auto y = n.abs;
+        auto r = div(x,y);
+        assert(r.r.abs < n.abs);
+        return (xs == 1) ? r.r : -r.r;
+    }
+
+    ///
+    void opModAssign(T:int)(T n)
+    {
+        auto r = opMod(BigInt(n));
+        digits = r.digits;
+    }
+
+    ///
+    void opModAssign(T)(T n)
+    {
+        auto r = opMod(n);
+        digits = r.digits;
+    }
+
+    ///
+    BigInt opAnd(T)(T n) const
+    {
+        return opAnd(BigInt(n));
+    }
+
+    ///
+    BigInt opAnd(T:int)(T n) const
+    {
+        return and(*this,cast(Digit)n);
+    }
+
+    ///
+    uint opAnd(T:uint)(T n) const
+    {
+        uint t;
+        castTo(t);
+        return t & n;
+    }
+
+    ///
+    BigInt opAnd(T:const(BigInt))(T n) const
+    {
+        return and(*this,n);
+    }
+
+    ///
+    void opAndAssign(T:uint)(T n)
+    {
+        auto r = opAnd(BigInt(n));
+        digits = r.digits;
+    }
+
+    ///
+    void opAndAssign(T)(T n)
+    {
+        auto r = opAnd(n);
+        digits = r.digits;
+    }
+
+    ///
+    BigInt opOr(T)(T n) const
+    {
+        return opOr(BigInt(n));
+    }
+
+    ///
+    BigInt opOr(T:int)(T n) const
+    {
+        return or(*this,cast(Digit)n);
+    }
+
+    ///
+    BigInt opOr(T:const(BigInt))(T n) const
+    {
+        return or(*this,n);
+    }
+
+    ///
+    void opOrAssign(T)(T n)
+    {
+        auto r = opOr(n);
+        digits = r.digits;
+    }
+
+    ///
+    BigInt opXor(T)(T n) const
+    {
+        return opXor(BigInt(n));
+    }
+
+    ///
+    BigInt opXor(T:int)(T n) const
+    {
+        return xor(*this,cast(Digit)n);
+    }
+
+    ///
+    BigInt opXor(T:const(BigInt))(T n) const
+    {
+        return xor(*this,n);
+    }
+
+    ///
+    void opXorAssign(T)(T n)
+    {
+        auto r = opXor(n);
+        digits = r.digits;
+    }
+
+    ///
+    BigInt opShl(uint n) const
+    {
+        uint hi = n >> 5;
+        uint lo = n & 0x1F;
+        Big r = *this;
+        if (lo != 0) r = shl(r,lo);
+        if (hi != 0) r = shlDigits(r,hi);
+        return r;
+    }
+
+    ///
+    void opShlAssign(uint n)
+    {
+        auto r = opShl(n);
+        digits = r.digits;
+    }
+
+    ///
+    BigInt opShr(uint n) const
+    {
+        uint hi = n >> 5;
+        uint lo = n & 0x1F;
+        Big r = *this;
+        if (lo != 0) r = shr(r,lo).q;
+        if (hi != 0) r = shrDigits(r,hi);
+        return r;
+    }
+
+    ///
+    void opShrAssign(uint n)
+    {
+        auto r = opShr(n);
+        digits = r.digits;
+    }
+    
+    ///
+    BigInt opUShr(T)(T n) const
+    {
+        if (sgn >= 0) return opShr(n);
+        else throw new Exception(">>> cannot be applied to negative numbers");
+    }
+    
+    ///
+    void opUShrAssign(T)(T n)
+    {
+        if (sgn >= 0) opShrAssign(n);
+        else throw new Exception(">>>= cannot be applied to negative numbers");
+    }
+
+    ///
+    int opEquals(T)(T n) const
+    {
+        return opEquals(BigInt(n));
+    }
+
+    ///
+    int opEquals(T:int)(T n) const
+    {
+        return digits.length == 1 && digits[0] == n;
+    }
+
+    ///
+    int opEquals(T:const(BigInt))(T n) const
+    {
+        return digits == n.digits;
+    }
+
+    ///
+    int opCmp(T)(T n) const
+    {
+        return opCmp(BigInt(n));
+    }
+
+    ///
+    int opCmp(T:int)(T n) const
+    {
+        int t = cmp(*this,n);
+        return t == 0 ? 0 : (t > 0 ? 1 : -1);
+    }
+
+    ///
+    int opCmp(T:const(BigInt))(T n) const
+    {
+        int t = cmp(*this,n);
+        return t == 0 ? 0 : (t > 0 ? 1 : -1);
+    }
+
+    ///
+    string toString() const
+    {
+        return decimal(*this);
+    }
+
+    ///
+    hash_t toHash() const
+    {
+        hash_t h = 0;
+        foreach(Digit d;digits) { h += d; }
+        return h;
+    }
+
+    private int sgn() const
+    {
+        int t = cmp(*this,0);
+        return t == 0 ? 0 : (t > 0 ? 1 : -1);
+    }
+
+    private BigInt abs() const
+    {
+        return sgn >= 0 ? opPos : opNeg;
+    }
+}
+
+// ----------- EVERYTHING PRIVATE BEYOND THIS POINT -----------
+private:
+
+// Aliases and Typedefs
+
+alias BigInt                Big;
+alias invariant(Digit)[]    Digits;
+typedef Digit[]             DownArray;
+typedef Digit*              DownPtr;
+alias int                   SignedDigit;
+alias long                  SignedWideDigit;
+alias Digit                 Unused;
+typedef Digit[]             UpArray;
+typedef Digit*              UpPtr;
+alias ulong                 WideDigit;
+
+struct Big_Digit { Big q; Digit r; }
+struct Big_Big{ Big q; Big r; }
+
+// Endianness
+
+// The constant BIG_ENDIAN determines the ordering of digits within arrays.
+// If BIG_ENDIAN is true, then bigints are stored most significant digit first.
+// If BIG_ENDIAN is true, then bigints are stored most significant digit last.
+
+// Note that this does not necessarily have to be the same as the endianness
+// of the platform architecture.
+
+// Setting BIG_ENDIAN opposite to platform endianness allows unittests
+// to run in reverse endianness. (And they still pass).
+
+version(BigEndian) { enum bool BIG_ENDIAN = true;  }
+else               { enum bool BIG_ENDIAN = false; }
+
+// String conversion
+
+void parseError()
+{
+    throw new Exception("Parse Error");
+}
+
+Big fromString(string s)
+{
+    if (s.length == 0) parseError();
+    if (s[0] == '-')
+    {
+        return -fromString(s[1..$]);
+    }
+    if (s.length > 2 && s[0] == '0' && (s[1] == 'x' || s[1] == 'X'))
+    {
+        return fromHex(s[2..$]);
+    }
+    return fromDecimal(s);
+}
+
+Big fromDecimal(string s)
+{
+    bool invalid = true;
+    Big r = Big.ZERO;
+    foreach(char c;s)
+    {
+        if (c == '_') continue;
+        if (c < '0' || c > '9') parseError();
+        invalid = false;
+        //r = 10 * r + (c - '0');
+        r *= 10;
+        r += (c - '0');
+    }
+    if (invalid) parseError();
+    return r;
+}
+
+Big fromHex(string s)
+{
+    bool invalid = true;
+    Big r = Big.ZERO;
+    foreach(char c;s)
+    {
+        switch(c)
+        {
+        case '_':
+            continue;
+
+        case '0','1','2','3','4','5','6','7','8','9':
+            r = (r << 4) + (c - '0');
+            invalid = false;
+            break;
+
+        case 'A','B','C','D','E','F':
+            r = (r << 4) + (c - 'A' + 10);
+            invalid = false;
+            break;
+
+        case 'a','b','c','d','e','f':
+            r = (r << 4) + (c - 'A' + 10);
+            invalid = false;
+            break;
+
+        default:
+            parseError();
+        }
+    }
+    if (invalid) parseError();
+    return r;
+}
+
+string decimal(Big b)
+{
+    if (b < 0) return "-" ~ decimal(-b);
+
+    if (b < 10)
+    {
+        int n;
+        b.castTo(n);
+        char c = cast(char)(n + '0');
+        return [ c ];
+    }
+
+    auto t = div(b,10);
+    auto r = cast(string)(decimal(t.q) ~ [ cast(char)(t.r + '0') ]);
+    return r;
+}
+
+// Shrinking
+
+Big bigInt(DownArray a)
+{
+    if (a.length == 0) return Big.ZERO;
+
+    Big r;
+
+    if (a.length == 1)
+    {
+        r.digits = cast(Digits)a;
+    }
+    else
+    {
+        auto xp = begin(a);
+        auto xe = end(a);
+        auto d1 = peek(xp);
+        xp = next(xp);
+        auto s = signOf(d1);
+        while (xp != xe)
+        {
+            if (d1 != s) break;
+            auto d2 = peek(xp);
+            if (signOf(d2) != s) break;
+            xp = next(xp);
+            d1 = d2;
+        }
+        r.digits = freezeRange(xp, xe);
+    }
+    return r;
+}
+
+static if(BIG_ENDIAN)
+{
+    alias UpArray       BwdArray;
+    alias UpPtr         BwdPtr;
+    alias DownArray     FwdArray;
+    alias DownPtr       FwdPtr;
+
+    Digit[] join(Digit[] t, Digit[] u) { return t ~ u; }
+    T head(T)(T t,size_t n) { return cast(T)(t[0..n]); }
+    T tail(T)(T t,size_t n) { return cast(T)(t[$-n..$]); }
+}
+else
+{
+    alias DownArray     BwdArray;
+    alias DownPtr       BwdPtr;
+    alias UpArray       FwdArray;
+    alias UpPtr         FwdPtr;
+
+    Digit[] join(Digit[] t, Digit[] u) { return u ~ t; }
+    T head(T)(T t,size_t n) { return cast(T)(t[$-n..$]); }
+    T tail(T)(T t,size_t n) { return cast(T)(t[0..n]); }
+}
+
+// Really simple functions
+
+FwdPtr advance(FwdPtr p,size_t n) { return p + n; }
+BwdPtr advance(BwdPtr p,size_t n) { return p - n; }
+
+Digit begin(Digit a) { return a; }
+FwdPtr begin(FwdArray a) { return cast(FwdPtr)(a.ptr); }
+BwdPtr begin(BwdArray a) { return cast(BwdPtr)(a.ptr + a.length - 1); }
+
+Big bigInt(Digit a) { Digit[] t; t.length = 1; t[0] = a; return bigInt(cast(DownArray)t); }
+Big bigInt(Digits a) { return bigInt(cast(DownArray)a); }
+Big bigInt(Digit[] a) { return bigInt(cast(DownArray)a); }
+Big bigInt(UpArray a) { return bigInt(cast(DownArray)a); }
+
+Digit downArray(Digit a) { return a; }
+DownArray downArray(Digit[] a) { return cast(DownArray)a; }
+DownArray downArray(Big a) { return cast(DownArray)(a.digits); }
+
+Digit end(Digit a) { return a; }
+FwdPtr end(FwdArray a) { return cast(FwdPtr)(a.ptr + a.length); }
+BwdPtr end(BwdArray a) { return cast(BwdPtr)(a.ptr + - 1); }
+
+Digit first(Digit a) { return a; }
+Digit first(FwdArray a) { return a[0]; }
+Digit first(BwdArray a) { return a[$-1]; }
+
+Digits freezeRange(FwdPtr p, FwdPtr q) { return cast(Digits)((p-1)[0..(q-p+1)]); }
+Digits freezeRange(BwdPtr p, BwdPtr q) { return cast(Digits)((q+1)[0..(p-q+1)]); }
+
+Digit last(Digit a) { return a; }
+Digit last(FwdArray a) { return a[$-1]; }
+Digit last(BwdArray a) { return a[0]; }
+
+Digit lsd(Digit a) { return a; }
+Digit lsd(DownArray a) { return last(a); }
+Digit lsd(UpArray a) { return first(a); }
+
+Digit msd(Digit a) { return a; }
+Digit msd(DownArray a) { return first(a); }
+Digit msd(UpArray a) { return last(a); }
+
+size_t lengthOf(Digit a) { return 1; }
+size_t lengthOf(Big a) { return a.digits.length; }
+size_t lengthOf(DownArray a) { return a.length; }
+size_t lengthOf(UpArray a) { return a.length; }
+
+Digit next(Digit d) { return d; }
+FwdPtr next(FwdPtr p) { return p + 1; }
+BwdPtr next(BwdPtr p) { return p - 1; }
+
+Digit peek(Digit d) { return d; }
+Digit peek(Digit* p) { return *p; }
+
+void poke(DownPtr p,Digit d) { *p = d; }
+void poke(DownPtr p,WideDigit d) { *p = cast(Digit)d; }
+void poke(DownPtr p,SignedWideDigit d) { *p = cast(Digit)d; }
+void poke(UpPtr p,Digit d) { *p = d; }
+void poke(UpPtr p,WideDigit d) { *p = cast(Digit)d; }
+void poke(UpPtr p,SignedWideDigit d) { *p = cast(Digit)d; }
+
+Big shrink(Big a) { return bigInt(cast(DownArray)(a.digits)); }
+
+FwdArray slice(FwdPtr ptr, size_t len) { return cast(FwdArray)(ptr[0..len]); }
+BwdArray slice(BwdPtr ptr, size_t len) { return cast(BwdArray)((ptr-len+1)[0..len]); }
+
+Digit signOf(SignedDigit d) { return d < 0 ? -1 : 0; }
+
+Digit upArray(Digit a) { return a; }
+UpArray upArray(Digit[] a) { return cast(UpArray)a; }
+UpArray upArray(in Big a) { return cast(UpArray)(a.digits); }
+
+// Core functions
+
+WideDigit addCore(Digit x,Digit y,WideDigit c) { return (c + x) + y; }
+
+Digit andCore(Digit x,Digit y,Digit c) { return x & y; }
+
+WideDigit divCore(Digit x,Digit y,WideDigit c)
+{
+    c <<= 32;
+    c += x;
+    WideDigit r = c % y;
+    c /= y;
+    c += r << 32;
+    return c;
+}
+
+WideDigit shlCore(Digit x,Digit y,WideDigit c) { return c + (cast(WideDigit)x << y); }
+
+WideDigit shrCore(Digit x,Digit y,WideDigit c) { return c + (x >> y) + (cast(WideDigit)x << (64-y)); }
+
+WideDigit mulCore(Digit x,Digit y,WideDigit c) { return c + (cast(WideDigit)x * y); }
+
+WideDigit subCore(Digit x,Digit y,WideDigit c) { return (c + x) - y; }
+
+Digit orCore(Digit x,Digit y,Digit c) { return x | y; }
+
+Digit xorCore(Digit x,Digit y,Digit c) { return x ^ y; }
+
+// Update functions
+
+Digit updateDigit(Digit c) { return c; }
+
+WideDigit updateShr(WideDigit c) { return cast(SignedWideDigit)c >> 32; }
+
+WideDigit updateUShr(WideDigit c) { return c >> 32; }
+
+// Helper functions
+
+int cmp(DownPtr xp, DownPtr xe, DownPtr yp)
+{
+    while (xp != xe)
+    {
+        auto xd = peek(xp);
+        auto yd = peek(yp);
+        if (xd < yd) return -1;
+        if (xd > yd) return 1;
+        xp = next(xp);
+        yp = next(yp);
+    }
+    return 0;
+}
+
+void mulInner(Big a, UpPtr rp, WideDigit y)
+{
+    WideDigit c;
+    mixin(setUp("x","a"));
+
+    while (xp != xe)
+    {
+        c += y * peek(xp) + peek(rp);
+        poke(rp,c);
+        xp = next(xp);
+        rp = next(rp);
+        c = updateShr(c);
+    }
+
+    mixin(runOnce(   "mulCore","updateShr","xs","y"));
+}
+
+void divInner(DownPtr xp, DownPtr cachePtr, size_t len)
+{
+    Digit result;
+
+    debug // sanity checking
+    {
+        DownArray _divisor = slice(cachePtr,32*len);
+        _divisor = tail(_divisor,len);
+    }
+
+    DownPtr rp = xp;
+    xp = next(xp);
+    DownPtr xe = advance(xp,len);
+
+    debug // sanity checking
+    {
+        DownArray _remainder = slice(xp,len);  // will be modified in-place
+        DownArray _original = cast(DownArray)_remainder.dup;  // but we'll keep this one
+    }
+
+    for (Digit mask=0x80000000; mask!=0; mask>>=1)
+    {
+        int t = cmp(xp,xe,cachePtr);
+        if (t >= 0)
+        {
+            debug // sanity checking
+            {
+                DownArray _after = slice(xp,len);   // will be modified in-place
+                DownArray _before = cast(DownArray)_after.dup;     // but we'll keep this one
+                DownArray _test = slice(cachePtr,len);
+            }
+
+            result += mask;
+            Digit carry = subInPlace(xp,cachePtr,len);
+            debug
+            {
+                BigInt before = bigInt(_before);
+                BigInt test = bigInt(_test);
+                BigInt after = bigInt(_after);
+
+                assert(after + test == before);
+                assert(carry == 0);
+            }
+        }
+        cachePtr = advance(cachePtr,len);
+    }
+
+    debug // sanity checking
+    {
+        // in theory, quotient * _divisor + _remainder == _original
+        BigInt quotient = result;
+        BigInt divisor = bigInt(_divisor);
+        BigInt remainder = bigInt(_remainder);
+        BigInt original = bigInt(_original);
+
+        assert(quotient * divisor + remainder == original);
+    }
+
+    poke(rp,result);
+}
+
+Digit subInPlace(DownPtr downPtrX, DownPtr downPtrY, size_t len)
+{
+    UpPtr xp = cast(UpPtr)(advance(downPtrX,len-1));
+    UpPtr yp = cast(UpPtr)(advance(downPtrY,len-1));
+    UpPtr xe = advance(xp,len);
+
+    SignedWideDigit c;
+
+    while (xp != xe)
+    {
+        c += peek(xp);
+        c -= peek(yp);
+        poke(xp,c);
+        c >>= 32;
+        xp = next(xp);
+        yp = next(yp);
+    }
+
+    return cast(Digit)c;
+}
+
+DownPtr makeDivCache(DownArray y)
+{
+    // Pad with a leading zero
+    auto paddedY = cast(UpArray)(join([Digit.init],y));
+
+    auto upCache = cast(UpArray)new Digit[32 * paddedY.length];
+    auto rp = begin(upCache);
+
+    // Fill upCache by successively leftshifting x by one bit
+    for (int i=0; i<32; ++i)
+    {
+        auto xp = begin(paddedY);
+        auto xe = end(paddedY);
+
+        WideDigit c;
+
+        // Shift lefy by one bit
+        while (xp != xe)
+        {
+            Digit xd = peek(xp);
+            poke(rp,xd);
+            c += xd;
+            c += xd;
+            poke(xp,c);
+            c >>= 32;
+            xp = next(xp);
+            rp = next(rp);
+        }
+    }
+
+    auto downCache = cast(DownArray)upCache;
+
+    static if(false) // make true to display cache
+    {
+        for (int j=0; j<32; ++j)
+        {
+            writefln("bit %02d: ",31-j,hex(downCache[paddedY.length*j..paddedY.length*(j+1)]));
+        }
+    }
+
+    return begin(downCache);
+}
+
+// Mixins
+
+string runOnce(string core, string updater, string xp, string yp)
+{
+    return
+    "{
+        auto xd = peek("~xp~");
+        auto yd = peek("~yp~");
+        c = "~core~"(xd,yd,c);
+        poke(rp,c);
+        "~xp~" = next("~xp~");
+        "~yp~" = next("~yp~");
+        rp = next(rp);
+        c = "~updater~"(c);
+    }";
+}
+
+string runTo(string dest, string core, string updater, string xp, string yp)
+{
+    string s = runOnce(core,updater,xp,yp);
+    return
+    "
+        static if(isIntegral!(typeof("~dest~"))) {"~s~"}
+        else
+        {
+            while("~dest[0..1]~"p!="~dest~") {"~s~"}
+        }
+    ";
+}
+
+string setDown(string x,string a)
+{
+    return
+    "
+        auto "~x~" = downArray("~a~");
+        auto "~x~"p = begin("~x~");
+        auto "~x~"e = end("~x~");
+        auto "~x~"s = signOf(msd("~x~"));
+    ";
+}
+
+string setUp(string x,string a)
+{
+    return
+    "
+        auto "~x~" = upArray("~a~");
+        auto "~x~"p = begin("~x~");
+        auto "~x~"e = end("~x~");
+        auto "~x~"s = signOf(msd("~x~"));
+    ";
+}
+
+// BigInt functions
+
+Big neg(Big b)
+{
+    auto r = upArray(new Digit[lengthOf(b) + 1]);
+    auto rp = begin(r);
+
+    SignedDigit a;
+    WideDigit c;
+
+    mixin(setUp("x","a"));
+    mixin(setUp("y","b"));
+
+    mixin(runOnce(   "subCore","updateShr","xp","yp"));
+    mixin(runTo("ye","subCore","updateShr","xs","yp"));
+    mixin(runOnce(   "subCore","updateShr","xs","ys"));
+
+    return bigInt(r);
+}
+
+Big com(Big a)
+{
+    auto r = upArray(new Digit[lengthOf(a)]);
+    auto rp = begin(r);
+
+    Digit c;
+
+    mixin(setUp("x","a"));
+    Digit ys = uint.max;
+
+    mixin(runTo("xe","xorCore","updateDigit","xp","ys"));
+
+    return bigInt(r);
+}
+
+Big add(Other)(Big a,Other b)
+{
+    static if(is(Other==BigInt)) if (lengthOf(a) < lengthOf(b))
+    {
+    swap(a,b);
+    }
+    auto r = upArray(new Digit[max(lengthOf(a),lengthOf(b)) + 1]);
+    auto rp = begin(r);
+
+    WideDigit c;
+
+    mixin(setUp("x","a"));
+    mixin(setUp("y","b"));
+
+    mixin(runTo("ye","addCore","updateShr","xp","yp"));
+    mixin(runTo("xe","addCore","updateShr","xp","ys"));
+    mixin(runOnce(   "addCore","updateShr","xs","ys"));
+
+    return bigInt(r);
+}
+
+Big sub(Big a,Big b)
+{
+    auto r = upArray(new Digit[max(lengthOf(a),lengthOf(b)) + 1]);
+    auto rp = begin(r);
+    auto re = advance(rp,min(lengthOf(a),lengthOf(b)));
+
+    WideDigit c;
+
+    mixin(setUp("x","a"));
+    mixin(setUp("y","b"));
+
+    mixin(runTo("re","subCore","updateShr","xp","yp"));
+    if (lengthOf(x) >= lengthOf(y))
+    {
+        mixin(runTo("xe","subCore","updateShr","xp","ys"));
+    }
+    else
+    {
+        mixin(runTo("ye","subCore","updateShr","xs","yp"));
+    }
+    mixin(runOnce("subCore","updateShr","xs","ys"));
+
+    return bigInt(r);
+}
+
+Big sub(Big a, Digit b)
+{
+    auto r = upArray(new Digit[lengthOf(a) + 1]);
+    auto rp = begin(r);
+
+    WideDigit c;
+
+    mixin(setUp("x","a"));
+    mixin(setUp("y","b"));
+
+    mixin(runOnce(   "subCore","updateShr","xp","yp"));
+    mixin(runTo("xe","subCore","updateShr","xp","ys"));
+    mixin(runOnce(   "subCore","updateShr","xs","ys"));
+
+    return bigInt(r);
+}
+
+Big mul(Big a, Big b) // a and b must be positive
+{
+    auto r = upArray(new Digit[lengthOf(a) + lengthOf(b)]);
+    auto rp = begin(r);
+
+    mixin(setUp("y","b"));
+    while (yp != ye)
+    {
+        mulInner(a,rp,peek(yp));
+        yp = next(yp);
+        rp = next(rp);
+    }
+
+    return bigInt(r);
+}
+
+Big mul(Big a, Digit b) // a and b must be positive
+{
+    auto r = upArray(new Digit[lengthOf(a) + 1]);
+    auto rp = begin(r);
+
+    WideDigit c;
+
+    mixin(setUp("x","a"));
+
+    mixin(runTo("xe","mulCore","updateShr","xp","b"));
+    poke(rp,c);
+
+    return bigInt(r);
+}
+
+Big_Big div(Big a, Big b) // a and b must be positive
+{
+    auto lenX = lengthOf(a);
+    auto lenY = lengthOf(b) + 1;
+
+    auto r = cast(DownArray)join(new Digit[lenY], cast(Digit[])a.digits);
+    auto rp = begin(r);
+    auto re = advance(rp,lenX);
+
+    auto y = downArray(b);
+    auto cache = makeDivCache(y);
+
+    while (rp != re)
+    {
+        divInner(rp, cache, lenY);
+        rp = next(rp);
+    }
+
+    Big quotient  = bigInt(cast(DownArray)(head(r,lenX)));
+    Big remainder = bigInt(cast(DownArray)(tail(r,lenY)));
+
+    return Big_Big(quotient,remainder);
+}
+
+Big_Digit div(Big a, Digit b) // a and b must be positive
+{
+    auto r = downArray(new Digit[lengthOf(a)]);
+    auto rp = begin(r);
+
+    WideDigit c;
+
+    mixin(setDown("x","a"));
+
+    mixin(runTo("xe","divCore","updateUShr","xp","b"));
+
+    return Big_Digit(bigInt(r),cast(Digit)c);
+}
+
+Big and(Other)(Big a, Other b)
+{
+    static if(is(Other==BigInt)) if (lengthOf(a) < lengthOf(b)) { swap(a,b); }
+    auto r = upArray(new Digit[max(lengthOf(a),lengthOf(b))]);
+    auto rp = begin(r);
+
+    Digit c;
+
+    mixin(setUp("x","a"));
+    mixin(setUp("y","b"));
+
+    mixin(runTo("ye","andCore","updateDigit","xp","yp"));
+    mixin(runTo("xe","andCore","updateDigit","xp","ys"));
+
+    return bigInt(r);
+}
+
+Big or(Other)(Big a, Other b)
+{
+    static if(is(Other==BigInt)) if (lengthOf(a) < lengthOf(b)) { swap(a,b); }
+    auto r = upArray(new Digit[max(lengthOf(a),lengthOf(b))]);
+    auto rp = begin(r);
+
+    Digit c;
+
+    mixin(setUp("x","a"));
+    mixin(setUp("y","b"));
+
+    mixin(runTo("ye","orCore","updateDigit","xp","yp"));
+    mixin(runTo("xe","orCore","updateDigit","xp","ys"));
+
+    return bigInt(r);
+}
+
+Big xor(Other)(Big a, Other b)
+{
+    static if(is(Other==BigInt)) if (lengthOf(a) < lengthOf(b)) { swap(a,b); }
+    auto r = upArray(new Digit[max(lengthOf(a),lengthOf(b))]);
+    auto rp = begin(r);
+
+    Digit c;
+
+    mixin(setUp("x","a"));
+    mixin(setUp("y","b"));
+
+    mixin(runTo("ye","xorCore","updateDigit","xp","yp"));
+    mixin(runTo("xe","xorCore","updateDigit","xp","ys"));
+
+    return bigInt(r);
+}
+
+Big shl(Big a, Digit b)
+{
+    auto r = upArray(new Digit[lengthOf(a) + 1]);
+    auto rp = begin(r);
+
+    WideDigit c;
+
+    mixin(setUp("x","a"));
+
+    mixin(runTo("xe","shlCore","updateShr","xp","b"));
+    poke(rp,c);
+
+    return bigInt(r);
+}
+
+Big shlDigits(Big a, uint n)
+{
+    Big b;
+    b.digits = cast(Digits)join(a.digits.dup, new Digit[n]);
+    return b;
+}
+
+Big_Digit shr(Big a, Digit b)
+{
+    auto r = downArray(new Digit[lengthOf(a)]);
+    auto rp = begin(r);
+
+    mixin(setDown("x","a"));
+
+    WideDigit c = (signOf(msd(x)) << (32-b)) & uint.max;
+
+    mixin(runTo("xe","shrCore","updateUShr","xp","b"));
+
+    return Big_Digit(bigInt(r),cast(Digit)(c >> (32-b)));
+}
+
+Big shrDigits(Big a, uint n)
+{
+    if (lengthOf(a) < n)
+    {
+        return bigInt(signOf(msd(downArray(a))));
+    }
+    Big b;
+    b.digits = cast(Digits)head(a.digits, a.digits.length-n);
+    return b;
+}
+
+int cmp(Big a, Big b) // assumes a and b are both shrunk
+{
+    mixin(setDown("x","a"));
+    mixin(setDown("y","b"));
+
+    if (xs != ys) return xs - ys;
+
+    if (lengthOf(x) > lengthOf(y)) return cast(SignedDigit)xs < 0 ? -1 : 1;
+    if (lengthOf(x) < lengthOf(y)) return cast(SignedDigit)ys < 0 ? 1 : -1;
+
+    return cmp(xp,xe,yp);
+}
+
+int cmp(Big a, Digit b) // assumes a is shrunk
+{
+    mixin(setDown("x","a"));
+    Digit ys = signOf(b);
+
+    if (xs != ys) return xs - ys;
+
+    if (lengthOf(x) > 1) return cast(SignedDigit)xs < 0 ? -1 : 1;
+
+    return peek(xp) - b;
+}
+
+// Debugging functions
+
+Big makeBig(Digits array...)
+{
+    Big r;
+    static if(BIG_ENDIAN)
+    {
+        r.digits = array;
+    }
+    else
+    {
+        r.digits = cast(Digits)(array.dup.reverse);
+    }
+    return r;
+}
+
+string hex(Big x)
+{
+    return "\r" ~ hex(x.digits);
+}
+
+string hex(in Digit[] x)
+{
+    string r;
+    static if(BIG_ENDIAN)
+    {
+        auto array = x;
+    }
+    else
+    {
+        auto array = x.dup.reverse;
+    }
+    for (int i=array.length; i<4; ++i)
+    {
+        r ~= "----------, ";
+    }
+    foreach(d;array)
+    {
+        r ~= format("0x%08X, ",d);
+    }
+    return r;
+}
+
+string dump(string name)
+{
+    return
+    "{  writef(\"(%d) "~name~" = \",__LINE__);
+        static if(is(typeof("~name~") == Digit)) writefln(\"Digit %08X\","~name~");
+        else static if(is(typeof("~name~") == WideDigit)) writefln(\"WideDigit %016X\","~name~");
+        else static if(is(typeof("~name~") == SignedDigit)) writefln(\"SignedDigit %08X\","~name~");
+        else static if(is(typeof("~name~") == SignedWideDigit)) writefln(\"SignedWideDigit %016X\","~name~");
+        else writefln(typeof("~name~").stringof,\" \","~name~");
+    }";
+}
+
+void diag(int line = __LINE__, string file = __FILE__)
+{
+   writefln("%s(%d) executed.", file, line);
+}
+
+// Unittests
+
+unittest
+{
+    // This block of unittests demonstrates that we can shrink arrays correctly
+    {
+        auto a = makeBig( 0x00000000 );
+        auto r = shrink(a);
+        assert(r.digits == a.digits, hex(r));
+            }{
+        auto a = makeBig( 0xFFFFFFFF );
+        auto r = shrink(a);
+        assert(r.digits == a.digits, hex(r));
+            }{
+        auto a = makeBig( 0x44444444 );
+        auto r = shrink(a);
+        assert(r.digits == a.digits, hex(r));
+            }{
+        auto a = makeBig( 0xCCCCCCCC );
+        auto r = shrink(a);
+        assert(r.digits == a.digits, hex(r));
+            }{
+        auto a = makeBig( 0x00000000, 0x00000000, 0x44444444, 0x44444444 );
+        auto b = makeBig(                         0x44444444, 0x44444444 );
+        auto r = shrink(a);
+        assert(r.digits == b.digits, hex(r));
+            }{
+        auto a = makeBig( 0x00000000, 0x00000000, 0xCCCCCCCC, 0xCCCCCCCC );
+        auto b = makeBig(             0x00000000, 0xCCCCCCCC, 0xCCCCCCCC );
+        auto r = shrink(a);
+        assert(r.digits == b.digits, hex(r));
+            }{
+        auto a = makeBig( 0xFFFFFFFF, 0xFFFFFFFF, 0x44444444, 0x44444444 );
+        auto b = makeBig(             0xFFFFFFFF, 0x44444444, 0x44444444 );
+        auto r = shrink(a);
+        assert(r.digits == b.digits, hex(r));
+    }
+
+    // This block of unittests demonstrates that neg(Big) works
+    {
+        auto x = makeBig( 0x66666666, 0x66666660 );
+        auto z = makeBig( 0x99999999, 0x999999A0 );
+        auto r = neg(x);
+        assert(r.digits == z.digits, hex(r));
+            }{
+        auto x = makeBig(             0x80000000, 0x00000000 );
+        auto z = makeBig( 0x00000000, 0x80000000, 0x00000000 );
+        auto r = neg(x);
+        assert(r.digits == z.digits, hex(r));
+            }{
+        auto x = makeBig( 0x00000000, 0x80000000, 0x00000000 );
+        auto z = makeBig(             0x80000000, 0x00000000 );
+        auto r = neg(x);
+        assert(r.digits == z.digits, hex(r));
+    }
+
+    // This block of unittests demonstrates that com(Big) works
+    {
+        auto x = makeBig( 0x01234567, 0x89ABCDEF );
+        auto z = makeBig( 0xFEDCBA98, 0x76543210 );
+        auto r = com(x);
+        assert(r.digits == z.digits, hex(r));
+    }
+
+    // This block of unittests demonstrates that add(Big,Big) works
+    {
+        auto x = makeBig(             0x66666666, 0x66666660 );
+        auto y = makeBig(             0x77777777, 0x77777770 );
+        auto z = makeBig( 0x00000000, 0xDDDDDDDD, 0xDDDDDDD0 );
+        auto r = add(x,y);
+        assert(r.digits == z.digits, hex(r));
+            }{
+        auto x = makeBig(             0x99999999, 0x99999990 );
+        auto y = makeBig(             0xAAAAAAAA, 0xAAAAAAA0 );
+        auto z = makeBig( 0xFFFFFFFF, 0x44444444, 0x44444430 );
+        auto r = add(x,y);
+        assert(r.digits == z.digits, hex(r));
+            }{
+        auto x = makeBig( 0xEEEEEEEE, 0xEEEEEEE0 );
+        auto y = makeBig( 0x66666666, 0x66666660 );
+        auto z = makeBig( 0x55555555, 0x55555540 );
+        auto r = add(x,y);
+        assert(r.digits == z.digits, hex(r));
+            }{
+        auto x = makeBig( 0x99999999, 0x99999990 );
+        auto y = makeBig( 0x66666666, 0x66666660 );
+        auto z = makeBig(             0xFFFFFFF0 );
+        auto r = add(x,y);
+        assert(r.digits == z.digits, hex(r));
+    }
+
+    // This block of unittests demonstrates that add(Big,int) works
+    {
+        auto x = makeBig( 0x66666666, 0x66666660 );
+        auto y =                      0x77777770  ;
+        auto z = makeBig( 0x66666666, 0xDDDDDDD0 );
+        auto r = add(x,y);
+        assert(r.digits == z.digits, hex(r));
+            }{
+        auto x = makeBig( 0x99999999, 0x99999990 );
+        auto y =                      0xAAAAAAA0  ;
+        auto z = makeBig( 0x99999999, 0x44444430 );
+        auto r = add(x,y);
+        assert(r.digits == z.digits, hex(r));
+            }{
+        auto x = makeBig( 0xEEEEEEEE, 0xEEEEEEE0 );
+        auto y =                      0x66666660  ;
+        auto z = makeBig( 0xEEEEEEEF, 0x55555540 );
+        auto r = add(x,y);
+        assert(r.digits == z.digits, hex(r));
+            }{
+        auto x = makeBig( 0x99999999, 0x99999990 );
+        auto y =                      0x66666660 ;
+        auto z = makeBig( 0x99999999, 0xFFFFFFF0 );
+        auto r = add(x,y);
+        assert(r.digits == z.digits, hex(r));
+    }
+
+    // This block of unittests demonstrates that sub(Big,Big) works
+    {
+        auto x = makeBig( 0x22222222, 0x22222222, 0x22222222 );
+        auto y = makeBig(             0x11111111, 0x11111111 );
+        auto z = makeBig( 0x22222222, 0x11111111, 0x11111111 );
+        auto r = sub(x,y);
+        assert(r.digits == z.digits, hex(r));
+            }{
+        auto x = makeBig(             0x22222222, 0x22222222 );
+        auto y = makeBig( 0x11111111, 0x11111111, 0x11111111 );
+        auto z = makeBig( 0xEEEEEEEF, 0x11111111, 0x11111111 );
+        auto r = sub(x,y);
+        assert(r.digits == z.digits, hex(r));
+    }
+
+    // This block of unittests demonstrates that sub(Big,int) works
+    {
+        auto x = makeBig( 0x22222222, 0x22222222 );
+        auto y =                      0x11111111  ;
+        auto z = makeBig( 0x22222222, 0x11111111 );
+        auto r = sub(x,y);
+        assert(r.digits == z.digits, hex(r));
+            }{
+        auto x = makeBig( 0x22222222, 0x22222222 );
+        auto y =                      0x80000000  ;
+        auto z = makeBig( 0x22222222, 0xA2222222 );
+        auto r = sub(x,y);
+        assert(r.digits == z.digits, hex(r));
+    }
+
+    // This block of unittests demonstrates that mul(Big,Big) works
+    {
+        auto x = makeBig(                         0x01111111, 0x11111111 );
+        auto y = makeBig(                         0x01111111, 0x11111111 );
+        auto z = makeBig( 0x00012345, 0x6789ABCD, 0xEFEDCBA9, 0x87654321 );
+        auto r = mul(x,y);
+        assert(r.digits == z.digits, hex(r));
+    }
+
+    // This block of unittests demonstrates that mul(Big,uint) works
+    {
+        auto x = makeBig(             0x11111111, 0x11111111 );
+        auto y =                                  0x11111111  ;
+        auto z = makeBig( 0x01234567, 0x88888888, 0x87654321 );
+        auto r = mul(x,y);
+        assert(r.digits == z.digits, hex(r));
+    }
+
+    // This block of unittests demonstrates that div(Big,Big) works
+    {
+        auto x = makeBig( 0x00000014 );
+        auto y = makeBig( 0x00000007 );
+        auto z = makeBig( 0x00000002 );
+        auto w = makeBig( 0x00000006 );
+        auto t = div(x,y);
+        assert(t.q.digits == z.digits, hex(t.q));
+        assert(t.r.digits == w.digits, hex(t.r));
+            }{
+        auto x = makeBig( 0x00012345, 0x6789ABCD, 0xEFEDCBA9, 0x87654321 );
+        auto y = makeBig(                         0x01111111, 0x11111111 );
+        auto z = makeBig(                         0x01111111, 0x11111111 );
+        auto w = makeBig(                                     0x00000000 );
+        auto t = div(x,y);
+        assert(t.q.digits == z.digits, hex(t.q));
+        assert(t.r.digits == w.digits, hex(t.r));
+            }{
+        auto x = makeBig( 0x00012345, 0x6789ABCD, 0xEFEDCBA9, 0x98765432 );
+        auto y = makeBig(                         0x01111111, 0x11111111 );
+        auto z = makeBig(                         0x01111111, 0x11111111 );
+        auto w = makeBig(                                     0x11111111 );
+        auto t = div(x,y);
+        assert(t.q.digits == z.digits, hex(t.q));
+        assert(t.r.digits == w.digits, hex(t.r));
+    }
+
+    // This block of unittests demonstrates that div(Big,uint) works
+    {
+        auto x = makeBig( 0x01234567, 0x89ABCDEF );
+        auto y =                      0x01234567  ;
+        auto z = makeBig( 0x00000001, 0x00000079 );
+        auto t = div(x,y);
+        assert(t.q.digits == z.digits, hex(t.q));
+        assert(t.r == 0x00000040, format("remainder = %08X",t.r));
+            }{
+        auto x = makeBig( 0x00000000, 0xAB54A98C, 0xEB1F0AD2 );
+        auto y =                                  0x0000000A  ;
+        auto z = makeBig(             0x112210F4, 0x7DE98115 );
+        auto t = div(x,y);
+        assert(t.q.digits == z.digits, hex(t.q));
+        assert(t.r == 0x00000000, format("remainder = %08X",t.r));
+    }
+
+    // This block of unittests demonstrates that and(Big,Big) works
+    {
+        auto x = makeBig( 0x01234567, 0x89ABCDEF );
+        auto y = makeBig( 0X33333333, 0X33333333 );
+        auto z = makeBig( 0x01230123, 0x01230123 );
+        auto r = and(x,y);
+        assert(r.digits == z.digits, hex(r));
+            }{
+        auto x = makeBig( 0x01234567, 0x89ABCDEF );
+        auto y = makeBig(             0X33333333 );
+        auto z = makeBig(             0x01230123 );
+        auto r = and(x,y);
+        assert(r.digits == z.digits, hex(r));
+            }{
+        auto x = makeBig( 0x01234567, 0x89ABCDEF );
+        auto y = makeBig(             0XCCCCCCCC );
+        auto z = makeBig( 0x01234567, 0x8888CCCC );
+        auto r = and(x,y);
+        assert(r.digits == z.digits, hex(r));
+    }
+
+    // This block of unittests demonstrates that and(Big,int) works
+    {
+        auto x = makeBig( 0x01234567, 0x89ABCDEF );
+        auto y =                      0X33333333  ;
+        auto z = makeBig(             0x01230123 );
+        auto r = and(x,y);
+        assert(r.digits == z.digits, hex(r));
+            }{
+        auto x = makeBig( 0x01234567, 0x89ABCDEF );
+        auto y =                      0XCCCCCCCC  ;
+        auto z = makeBig( 0x01234567, 0x8888CCCC );
+        auto r = and(x,y);
+        assert(r.digits == z.digits, hex(r));
+    }
+
+    // This block of unittests demonstrates that or(Big,Big) works
+    {
+        auto x = makeBig( 0x01234567, 0x89ABCDEF );
+        auto y = makeBig( 0X33333333, 0X33333333 );
+        auto z = makeBig( 0x33337777, 0xBBBBFFFF );
+        auto r = or(x,y);
+        assert(r.digits == z.digits, hex(r));
+            }{
+        auto x = makeBig( 0x01234567, 0x89ABCDEF );
+        auto y = makeBig(             0X33333333 );
+        auto z = makeBig( 0x01234567, 0xBBBBFFFF );
+        auto r = or(x,y);
+        assert(r.digits == z.digits, hex(r));
+            }{
+        auto x = makeBig( 0x01234567, 0x89ABCDEF );
+        auto y = makeBig(             0XCCCCCCCC );
+        auto z = makeBig(             0xCDEFCDEF );
+        auto r = or(x,y);
+        assert(r.digits == z.digits, hex(r));
+    }
+
+    // This block of unittests demonstrates that or(Big,int) works
+    {
+        auto x = makeBig( 0x01234567, 0x89ABCDEF );
+        auto y =                      0X33333333  ;
+        auto z = makeBig( 0x01234567, 0xBBBBFFFF );
+        auto r = or(x,y);
+        assert(r.digits == z.digits, hex(r));
+            }{
+        auto x = makeBig( 0x01234567, 0x89ABCDEF );
+        auto y =                      0XCCCCCCCC  ;
+        auto z = makeBig(             0xCDEFCDEF );
+        auto r = or(x,y);
+        assert(r.digits == z.digits, hex(r));
+    }
+
+    // This block of unittests demonstrates that xor(Big,int) works
+    {
+        auto x = makeBig( 0x01234567, 0x89ABCDEF );
+        auto y = makeBig( 0X33333333, 0X33333333 );
+        auto z = makeBig( 0x32107654, 0xBA98FEDC );
+        auto r = xor(x,y);
+        assert(r.digits == z.digits, hex(r));
+            }{
+        auto x = makeBig( 0x01234567, 0x89ABCDEF );
+        auto y = makeBig(             0X33333333 );
+        auto z = makeBig( 0x01234567, 0xBA98FEDC );
+        auto r = xor(x,y);
+        assert(r.digits == z.digits, hex(r));
+            }{
+        auto x = makeBig( 0x01234567, 0x89ABCDEF );
+        auto y = makeBig(             0XCCCCCCCC );
+        auto z = makeBig( 0xFEDCBA98, 0x45670123 );
+        auto r = xor(x,y);
+        assert(r.digits == z.digits, hex(r));
+    }
+
+    // This block of unittests demonstrates that xor(Big,int) works
+    {
+        auto x = makeBig( 0x01234567, 0x89ABCDEF );
+        auto y =                      0X33333333  ;
+        auto z = makeBig( 0x01234567, 0xBA98FEDC );
+        auto r = xor(x,y);
+        assert(r.digits == z.digits, hex(r));
+    }
+
+    // This block of unittests demonstrates that shl(Big,uint) works
+    {
+        Big x = makeBig(             0x01234567, 0x89ABCDEF );
+        Big z = makeBig( 0x00000123, 0x456789AB, 0xCDEF0000 );
+        Big r = shl(x,16);
+        assert(r.digits == z.digits, hex(r));
+    }
+
+    // This block of unittests demonstrates that shlDigits(Big,uint) works
+    {
+        Big x = makeBig(                         0x01234567, 0x89ABCDEF );
+        Big z = makeBig( 0x01234567, 0x89ABCDEF, 0x00000000, 0x00000000 );
+        Big r = shlDigits(x,2);
+        assert(r.digits == z.digits, hex(r));
+    }
+
+    // This block of unittests demonstrates that shr(Big,uint) works
+    {
+        Big x = makeBig( 0x00000123, 0x456789AB, 0xCDEF4444 );
+        Big z = makeBig(             0x12345678, 0x9ABCDEF4 );
+        auto t = shr(x,12);
+        assert(t.q.digits == z.digits, hex(t.q));
+        assert(t.r == 0x00000444, format("remainder = %08X",t.r));
+            }{
+        auto x = makeBig( 0x80000000, 0x00000000 );
+        auto z = makeBig( 0xFFFF8000, 0x00000000 );
+        auto t = shr(x,16);
+        assert(t.q.digits == z.digits, hex(t.q));
+        assert(t.r == 0x00000000, format("remainder = %08X",t.r));
+    }
+
+    // This block of unittests demonstrates that shrDigits(Big,uint) works
+    {
+        Big x = makeBig( 0x01234567, 0x89ABCDEF, 0xFEDCBA98, 0x76543210 );
+        Big z = makeBig(                         0x01234567, 0x89ABCDEF );
+        Big r = shrDigits(x,2);
+        assert(r.digits == z.digits, hex(r));
+    }
+
+    // This block of unittests demonstrates that cmp(Big,Big) works
+    {
+        Big x = makeBig( 0x11111111, 0x11111111, 0x11111111 );
+        Big y = makeBig(             0x11111111, 0x11111111 );
+        assert(cmp(x,y) > 0);
+            }{
+        Big x = makeBig(             0x11111111, 0x11111111 );
+        Big y = makeBig( 0x11111111, 0x11111111, 0x11111111 );
+        assert(cmp(x,y) < 0);
+            }{
+        Big x = makeBig( 0xEEEEEEEE, 0xEEEEEEEE, 0xEEEEEEEE );
+        Big y = makeBig(             0xEEEEEEEE, 0xEEEEEEEE );
+        assert(cmp(x,y) < 0);
+            }{
+        Big x = makeBig(             0xEEEEEEEE, 0xEEEEEEEE );
+        Big y = makeBig( 0xEEEEEEEE, 0xEEEEEEEE, 0xEEEEEEEE );
+        assert(cmp(x,y) > 0);
+            }{
+        Big x = makeBig( 0x33333333, 0x22222222, 0xEEEEEEEE );
+        Big y = makeBig( 0x33333333, 0x11111111, 0xEEEEEEEE );
+        assert(cmp(x,y) > 0);
+            }{
+        Big x = makeBig( 0x33333333, 0x11111111, 0xEEEEEEEE );
+        Big y = makeBig( 0x33333333, 0x22222222, 0xEEEEEEEE );
+        assert(cmp(x,y) < 0);
+            }{
+        Big x = makeBig( 0x33333333, 0x11111111, 0xEEEEEEEE );
+        Big y = makeBig( 0xEEEEEEEE, 0x22222222, 0xEEEEEEEE );
+        assert(cmp(x,y) > 0);
+            }{
+        Big x = makeBig( 0x01234567, 0x88888888, 0x76543210 );
+        Big y = makeBig( 0x01234567, 0x88888888, 0x76543210 );
+        assert(cmp(x,y) == 0);
+    }
+
+    // This block of unittests demonstrates that cmp(Big,uint) works
+    {
+        Big x = makeBig( 0x11111111, 0x11111111, 0x11111111 );
+        Digit y =                                0x11111111  ;
+        assert(cmp(x,y) > 0);
+            }{
+        Big x = makeBig( 0xEEEEEEEE, 0xEEEEEEEE, 0xEEEEEEEE );
+        Digit y =                                0xEEEEEEEE ;
+        assert(cmp(x,y) < 0);
+            }{
+        Big x = makeBig( 0x22222222 );
+        Digit y =        0x11111111  ;
+        assert(cmp(x,y) > 0);
+            }{
+        Big x = makeBig( 0x11111111 );
+        Digit y =        0x22222222  ;
+        assert(cmp(x,y) < 0);
+            }{
+        Big x = makeBig( 0x76543210 );
+        Digit y =        0x76543210  ;
+        assert(cmp(x,y) == 0);
+    }
+
+    // This block of unittests demonstrates that fromString(string) works
+    {
+        Big r = fromString("123");
+        Big z = makeBig( 0x0000007B );
+        assert(r.digits == z.digits, hex(r));
+            }{
+        Big r = fromString("12_345_678_901_234_567_890");
+        Big z = makeBig( 0x00000000, 0xAB54A98C, 0xEB1F0AD2 );
+        assert(r.digits == z.digits, hex(r));
+            }{
+        Big r = fromString("-12_345_678_901_234_567_890");
+        Big z = makeBig( 0xFFFFFFFF, 0x54AB5673, 0x14E0F52E );
+        assert(r.digits == z.digits, hex(r));
+            }{
+        Big r = fromString("0x0123_4567_89AB_CDEF");
+        Big z = makeBig( 0x01234567, 0x89ABCDEF );
+        assert(r.digits == z.digits, hex(r));
+            }{
+        Big r = fromString("-0x0123_4567_89AB_CDEF");
+        Big z = makeBig( 0xFEDCBA98, 0x76543211 );
+        assert(r.digits == z.digits, hex(r));
+    }
+
+    // This block of unittests demonstrates that decimal(Big) works
+    {
+        Big x = makeBig( 0x0000007B );
+        string r = decimal(x);
+        assert(r == "123", r);
+            }{
+        Big x = makeBig( 0x00000000, 0xAB54A98C, 0xEB1F0AD2 );
+        string r = decimal(x);
+        assert(r == "12345678901234567890", r);
+            }{
+        Big x = makeBig( 0xFFFFFFFF, 0x54AB5673, 0x14E0F52E );
+        string r = decimal(x);
+        assert(r == "-12345678901234567890", r);
+            }{
+        Big x = makeBig( 0x01234567, 0x89ABCDEF );
+        string r = decimal(x);
+        assert(r == "81985529216486895", r);
+            }{
+        Big x = makeBig( 0xFEDCBA98, 0x76543211 );
+        string r = decimal(x);
+        assert(r == "-81985529216486895", r);
+    }
+
+    // This block of unittests demonstrates that opAssign works
+    {
+        Big z = makeBig( 0x00000064 );
+        Big r;
+        r.opAssign(z);
+        assert(z.digits == r.digits, hex(r));
+            //
+        r.opAssign( cast(int)100 );
+        assert(z.digits == r.digits, hex(r));
+            //
+        r.opAssign( cast(uint)100 );
+        assert(z.digits == r.digits, hex(r));
+            //
+        r.opAssign( cast(long)100 );
+        assert(z.digits == r.digits, hex(r));
+            //
+        r.opAssign( cast(ulong)100 );
+        assert(z.digits == r.digits, hex(r));
+            //
+        r.opAssign( "100" );
+        assert(z.digits == r.digits, hex(r));
+            }{
+        Big r;
+        r.opAssign( cast(long)0xFEDCBA9876543210 );
+        Big z = makeBig( 0xFEDCBA98, 0x76543210 );
+        assert(z.digits == r.digits, hex(r));
+            }{
+        Big r;
+        r.opAssign( cast(ulong)0xFEDCBA9876543210 );
+        Big z = makeBig( 0x00000000, 0xFEDCBA98, 0x76543210 );
+        assert(z.digits == r.digits, hex(r));
+    }
+
+    // This block of unittests demonstrates that static opCall works
+    {
+        Big z = makeBig( 0x00000064 );
+        Big r = BigInt(z);
+        assert(z.digits == r.digits, hex(r));
+            //
+        r = BigInt( cast(int)100 );
+        assert(z.digits == r.digits, hex(r));
+            //
+        r = BigInt( cast(uint)100 );
+        assert(z.digits == r.digits, hex(r));
+            //
+        r = BigInt( cast(long)100 );
+        assert(z.digits == r.digits, hex(r));
+            //
+        r = BigInt( cast(ulong)100 );
+        assert(z.digits == r.digits, hex(r));
+            //
+        r = BigInt( "100" );
+        assert(z.digits == r.digits, hex(r));
+            }{
+        Big r = BigInt( cast(long)0xFEDCBA9876543210 );
+        Big z = makeBig( 0xFEDCBA98, 0x76543210 );
+        assert(z.digits == r.digits, hex(r));
+            }{
+        Big r = BigInt( cast(ulong)0xFEDCBA9876543210 );
+        Big z = makeBig( 0x00000000, 0xFEDCBA98, 0x76543210 );
+        assert(z.digits == r.digits, hex(r));
+    }
+
+    // This block of unittests demonstrates that castTo works
+    {
+        BigInt z = makeBig( 0x00000000, 0x89ABCDEF, 0x89ABCDEF );
+        BigInt r;
+        z.castTo(r);
+        assert(z.digits == r.digits, hex(r));
+            //
+        int i;
+        z.castTo(i);
+        assert(i == -1985229329);
+            //
+        uint j;
+        z.castTo(j);
+        assert(j == 2309737967);
+            //
+        long k;
+        z.castTo(k);
+        assert(k == -8526495040805286417);
+            //
+        ulong l;
+        z.castTo(l);
+        assert(l == 9920249032904265199u);
+            //
+        string s;
+        z.castTo(s);
+        assert(s == "9920249032904265199");
+    }
+
+    // This block of unittests demonstrates that opEquals and opCmp work
+    {
+        BigInt x = makeBig( 0x00000000, 0x89ABCDEF, 0x89ABCDEF );
+        assert(x  > BigInt("9920249032904265198"));
+        assert(x == BigInt("9920249032904265199"));
+        assert(x  < BigInt("9920249032904265200"));
+            //
+        assert(x.opEquals(BigInt("9920249032904265199")));
+        assert(x.opCmp(BigInt("9920249032904265199") == 0));
+            //
+        BigInt y = 42;
+        assert(y  > 41);
+        assert(y == 42);
+        assert(y  < 43);
+    }
+
+    // This block of unittests demonstrates that opNeg works
+    {
+        BigInt x = "100000000000000";
+        BigInt y = "-100000000000000";
+        assert(x == -y);
+        assert(-x == y);
+        assert(x == -(-x));
+    }
+
+    // This block of unittests demonstrates that opPos works
+    {
+        BigInt x = "100000000000000";
+        assert(x == +x);
+    }
+
+    // This block of unittests demonstrates that opCom works
+    {
+        BigInt x = "100000000000000";
+        BigInt y = "-100000000000001";
+        assert(x == ~y);
+        assert(~x == y);
+        assert(x == ~(~x));
+    }
+
+    // This block of unittests demonstrates that opPostInc and opPostDec work
+    {
+        BigInt x = "100000000000000";
+        BigInt y = x++;
+        assert(y == BigInt("100000000000000"));
+        assert(x == BigInt("100000000000001"));
+            }{
+        BigInt x = "100000000000000";
+        BigInt y = x--;
+        assert(y == BigInt("100000000000000"));
+        assert(x == BigInt( "99999999999999"));
+    }
+
+    // This block of unittests demonstrates that opAdd works
+    {
+        BigInt x = "100000000000000";
+        BigInt y = x + 42;
+        assert(y == BigInt("100000000000042"));
+        BigInt z = x + x;
+        assert(z == BigInt("200000000000000"));
+            }{
+        BigInt x = "100000000000000";
+        BigInt y = x + -42;
+        assert(y == BigInt("99999999999958"));
+        BigInt z = x + -(x + x);
+        assert(z == BigInt("-100000000000000"));
+            }{
+        BigInt x = "100000000000000";
+        x += 42;
+        assert(x == BigInt("100000000000042"));
+    }
+
+    // This block of unittests demonstrates that opSub works
+    {
+        BigInt x = "100000000000000";
+        BigInt y = x - 42;
+        assert(y == BigInt("99999999999958"));
+        BigInt z = x - x;
+        assert(z == BigInt.init);
+            }{
+        BigInt x = "100000000000000";
+        BigInt y = x - -42;
+        assert(y == BigInt("100000000000042"));
+        BigInt z = x - -(x + x);
+        assert(z == BigInt("300000000000000"));
+            }{
+        BigInt x = "100000000000000";
+        x -= 42;
+        assert(x == BigInt("99999999999958"));
+    }
+
+    // This block of unittests demonstrates that opMul works
+    {
+        BigInt a = "9588669891916142";
+        BigInt b = "7452469135154800";
+        auto c = a * b;
+        assert(c == "71459266416693160362545788781600");
+            }{
+        BigInt a = "-9588669891916142";
+        BigInt b = "7452469135154800";
+        auto c = a * b;
+        assert(c == "-71459266416693160362545788781600");
+            }{
+        BigInt a = "9588669891916142";
+        BigInt b = "-7452469135154800";
+        auto c = a * b;
+        assert(c == "-71459266416693160362545788781600");
+            }{
+        BigInt a = "-9588669891916142";
+        BigInt b = "-7452469135154800";
+        auto c = a * b;
+        assert(c == "71459266416693160362545788781600");
+            }{
+        BigInt a = "10000000000000000000";
+        a *= -4;
+        assert(a == "-40000000000000000000");
+    }
+
+    // This block of unittests demonstrates that opDiv works
+    {
+        BigInt a = "10000000000000000";
+        BigInt b = "7";
+        auto c = a / b;
+        assert(c == "1428571428571428");
+            }{
+        BigInt a = "-10000000000000000";
+        BigInt b = "7";
+        auto c = a / b;
+        assert(c == "-1428571428571428");
+            }{
+        BigInt a = "10000000000000000";
+        BigInt b = "-7";
+        auto c = a / b;
+        assert(c == "-1428571428571428");
+            }{
+        BigInt a = "-10000000000000000";
+        BigInt b = "-7";
+        auto c = a / b;
+        assert(c == "1428571428571428");
+            }{
+        BigInt a = "10000000000000000";
+        a /= -7;
+        assert(a == "-1428571428571428");
+    }
+
+    // This block of unittests demonstrates that opMod works
+    {
+        BigInt a = "10000000000000000";
+        BigInt b = "7";
+        auto c = a % b;
+        assert(c == 4);
+            }{
+        BigInt a = "-10000000000000000";
+        BigInt b = "7";
+        auto c = a % b;
+        assert(c == -4);
+            }{
+        BigInt a = "10000000000000000";
+        BigInt b = "-7";
+        auto c = a % b;
+        assert(c == 4);
+            }{
+        BigInt a = "-10000000000000000";
+        BigInt b = "-7";
+        auto c = a % b;
+        assert(c == -4);
+            }{
+        BigInt a = "10000000000000000";
+        a %= -7;
+        assert(a == 4);
+            }{
+        BigInt a = "10000000000000000";
+        a %= BigInt("0x80000000");
+            }{
+        BigInt a = "10000000000000000";
+        int i = 0x80000000;
+        a %= i;
+    }
+
+    // This block of unittests demonstrates that opAnd works
+    {
+        BigInt x = "0xCCCCCCCC";
+        auto y = x & 0xAAAAAAAA;
+        assert(y == 0x88888888);
+        static assert(is(typeof(y) == uint));
+            }{
+        BigInt x = "0xCCCCCCCC";
+        x &= 0xAAAAAAAA;
+        assert(x == 0x88888888);
+    }
+
+    // This block of unittests demonstrates that opOr works
+    {
+        BigInt x = "0xCCCCCCCC";
+        auto y = x | 0xAAAAAAAA;
+        assert(y == 0xEEEEEEEE);
+            }{
+        BigInt x = "0xCCCCCCCC";
+        x |= 0xAAAAAAAA;
+        assert(x == 0xEEEEEEEE);
+    }
+
+    // This block of unittests demonstrates that opXor works
+    {
+        BigInt x = "0xCCCCCCCC";
+        auto y = x ^ 0xAAAAAAAA;
+        assert(y == 0x66666666);
+            }{
+        BigInt x = "0xCCCCCCCC";
+        x ^= 0xAAAAAAAA;
+        assert(x == 0x66666666);
+    }
+
+
+    // This block of unittests demonstrates that opShl works
+    {
+        BigInt x = "0x1234567";
+        BigInt y = x << 80;
+        assert(y == BigInt("0x123456700000000000000000000"));
+            }{
+        BigInt x = "0x1234567";
+        x <<= 80;
+        assert(x == BigInt("0x123456700000000000000000000"));
+    }
+
+    // This block of unittests demonstrates that opShr works
+    {
+        BigInt x = "0x1234567FFFFFFFFFFFFFFFFFFFF";
+        BigInt y = x >> 80;
+        assert(y == BigInt("0x1234567"));
+            }{
+        BigInt x = "0x1234567FFFFFFFFFFFFFFFFFFFF";
+        x >>= 80;
+        assert(x == BigInt("0x1234567"));
+    }
+
+    // This block of unittests demonstrates that toString works
+    {
+        string s = "128649024696729866742487649";
+        BigInt x = s;
+        string t = x.toString;
+        assert(t == s);
+    }
+
+    // This block of unittests demonstrates that sgn and abs work
+    {
+        BigInt x = 1000;
+        assert(x.sgn == 1);
+        assert(x.abs == 1000);
+            }{
+        BigInt x = -1000;
+        assert(x.sgn == -1);
+        assert(x.abs == 1000);
+            }{
+        BigInt x = 0;
+        assert(x.sgn == 0);
+        assert(x.abs == 0);
+    }
+
+    // Silly ad hoc test
+    {
+        BigInt a = "9588669891916142";
+        BigInt b = "7452469135154800";
+        auto c = a * b;                                             // c = a.b
+        assert(c == "71459266416693160362545788781600");
+        auto d = b * a;                                             // d = a.b
+        assert(d == "71459266416693160362545788781600");
+        assert(d == c);
+        d = c * "794628672112";                                     // d = 794628672112.a.b
+        assert(d == "56783581982794522489042432639320434378739200");
+        auto e = c + d;                                             // e = 794628672113.a.b
+        assert(e == "56783581982865981755459125799682980167520800");
+        auto f = d + c;                                             // f = 794628672113.a.b
+        assert(f == e);
+        auto g = f - c;                                             // g = 794628672112.a.b
+        assert(g == d);
+        g = f - d;                                                  // g = a.b
+        assert(g == c);
+        e = 12345678;                                               // e = 12345678
+        g = c + e;                                                  // g = a.b + e
+        auto h = g / b;                                             // h = a
+        auto i = g % b;                                             // i = e
+        assert(h == a);
+        assert(i == e);
+    }
+}
+
+
+/** Optimised asm arbitrary precision arithmetic ('bignum') 
+ * routines for X86 processors.
+ *
+ * All functions operate on arrays of uints, stored LSB first.
+ * If there is a destination array, it will be the first parameter.
+ * Currently, all of these functions are subject to change, and are
+ * intended for internal use only.
+ *
+ * Author: Don Clugston
+ * Date: May 2008.
+ *
+ * License: Public Domain
+ *
+ * In simple terms, there are 3 modern x86 microarchitectures:
+ * (a) the P6 family (Pentium Pro, PII, PIII, PM, Core), produced by Intel;
+ * (b) the K6, Athlon, and AMD64 families, produced by AMD; and
+ * (c) the Pentium 4, produced by Marketing.
+ *
+ * This code has been optimised for the Intel P6 family, except that it only
+ * uses the basic instruction set (doesn't use MMX, SSE, SSE2)
+ * Generally the code remains near-optimal for Core2, after translating
+ * EAX-> RAX, etc, since all these CPUs use essentially the same pipeline.
+ * The code uses techniques described in Agner Fog's superb manuals available
+ * at www.agner.org.
+ * Not optimal for AMD64, which can do two memory loads per cycle (Intel
+ * CPUs can only do one).
+ *
+ *  Timing results (cycles per int)
+ *         PentiumM Core2
+ *  +,-      2.25   2.25
+ *  &,|,^    2.0    2.0
+ *  <<,>>    2.0    2.0
+ *  cmp      2.0    2.0
+ *  *        5.0
+ *  mulAdd   5.4
+ *  div     18.0
+ */
+private:
+// BUG: These asm routines do not work for GDC.
+// It seems that GDC is completely ignoring the parameter passing ABI!
+// GDC passes the final parameter on the stack, instead of in EAX.
+version(D_InlineAsm_X86) {
+/* Duplicate string s, with n times, substituting index for '@'.
+ *
+ * Each instance of '@' in s is replaced by 0,1,...n-1. This is a helper
+ * function for some of the asm routines.
+ */
+char [] indexedLoopUnroll(int n, invariant char [] s)
+{
+    char [] u;
+    for (int i = 0; i<n; ++i) {
+        char [] nstr= ((i>9 ? ""~ cast(char)('0'+i/10) : "") ~ cast(char)('0' + i%10)).dup;
+        
+        int last = 0;
+        for (int j = 0; j<s.length; ++j) {
+            if (s[j]=='@') {
+                u ~= s[last..j] ~ nstr;
+                last = j+1;
+            }
+        }
+        if (last<s.length) u = u ~ s[last..$];
+        
+    }
+    return u;    
+}
+unittest
+{
+assert(indexedLoopUnroll(3, "@*23;")=="0*23;1*23;2*23;");
+}
+
+/** Multi-byte addition or subtraction
+ *    dest[] = src1[] + src2[] + carry (0 or 1).
+ * or dest[] = src1[] - src2[] - carry (0 or 1).
+ * Returns carry or borrow (0 or 1).
+ * Set op == '+' for addition, '-' for subtraction.
+ */
+uint multibyteAddSub(char op)(uint[] dest, uint [] src1, uint [] src2, uint carry)
+{
+    // Timing:
+    // Pentium M: 2.25/int
+    // P6 family, Core2 have a partial flags stall when reading the carry flag in
+    // an ADC, SBB operation after an operation such as INC or DEC which
+    // modifies some, but not all, flags. We avoid this by storing carry into
+    // a resister (AL), and restoring it after the branch.
+    // * Count UP to zero (from -len) to minimize loop overhead.
+
+    enum { UNROLLFACTOR = 8 };
+    enum { LASTPARAM = 4*4 } // 3* pushes + return address.
+    asm {
+        naked;
+        push EDI;
+        push EBX;
+        push ESI;
+        align 16; // This aligns L1 on a 16-byte boundary.
+        mov ECX, [ESP + LASTPARAM + 4*4]; // dest.length;
+        mov EDX, [ESP + LASTPARAM + 3*4]; // src1.ptr
+        mov EDI, [ESP + LASTPARAM + 5*4]; // dest.ptr
+             // Carry is automatically in EAX
+        mov ESI, [ESP + LASTPARAM + 1*4]; // src2.ptr
+        lea EDX, [EDX + 4*ECX]; // EDX = end of src1.
+        lea EDI, [EDI + 4*ECX]; // EDI = end of dest.
+        lea ESI, [ESI + 4*ECX]; // EBP = end of src2.        
+        neg ECX;
+        and ECX, 0xFFFF_FFF8;
+        jz L2; // length <8
+    L1:
+        shr AL, 1; // get carry from the lsb of AL
+    }
+    mixin(" asm {"
+    ~ indexedLoopUnroll( UNROLLFACTOR, "
+        mov EAX, [@*4+EDX+ECX*4];
+        "~ (op=='+'?"adc" : "sbb") ~ " EAX, [@*4+ESI+ECX*4];
+        mov [@*4+EDI+ECX*4], EAX;
+        ") ~ "}");
+    asm {
+        setc AL; // Save carry into the lsb of AL
+        add ECX, UNROLLFACTOR;        
+        jnz L1;
+L2:
+        mov ECX, [ESP + LASTPARAM + 2*4]; // dest.length;
+        and ECX, 7;
+        jz done; // divisible by 8 -- no residue
+        neg ECX;
+L3: // Do the residual 1..7 ints.
+        shr AL, 1; // get carry from EAX
+        mov EAX, [EDX+ECX*4];
+    }
+    static if (op=='+')
+    {
+        asm { adc EAX, [ESI+ECX*4]; }
+    }
+    else 
+    {
+        asm { sbb EAX, [ESI+ECX*4]; }
+    }
+    asm {
+        mov [EDI+ECX*4], EAX;       
+        setc AL; // save carry
+        add ECX, 1;
+        jnz L3;                
+done:
+        and EAX, 1; // make it O or 1.
+        pop ESI;
+        pop EBX;
+        pop EDI;
+        ret 6*4;
+    } 
+}
+
+unittest
+{
+    uint [] a = new uint[20];
+    uint [] b = new uint[20];
+    uint [] c = new uint[20];
+    for (int i=0; i<a.length; ++i)
+    {
+        if (i&1) a[i]=0x8000_0000 + i;
+        else a[i]=i;
+        b[i]= 0x8000_0003;
+    }
+    c[19]=0x3333_3333;
+    uint carry = multibyteAddSub!('+')(c[0..18], a[0..18], b[0..18], 0);
+    assert(carry==1);
+    assert(c[0]==0x8000_0003);
+    assert(c[1]==4);
+    assert(c[19]==0x3333_3333); // check for overrun
+}
+
+/** dest[] += carry, or dest[] -= carry.
+ *  op must be '+' or '-'
+ *  Returns final carry or borrow (0 or 1)
+ */
+uint multibyteIncrement(char op)(uint[] dest, uint carry)
+{
+    enum { LASTPARAM = 1*4 } // 0* pushes + return address.
+    asm {
+        naked;
+        mov ECX, [ESP + LASTPARAM + 0*4]; // dest.length;
+        mov EDX, [ESP + LASTPARAM + 1*4]; // dest.ptr
+        // EAX  = carry
+L1: ;
+    }
+    static if (op=='+')
+        asm { add [EDX], EAX; }
+    else 
+        asm { sub [EDX], EAX; }    
+    asm {
+        mov EAX, 1;
+        jnc L2;
+        add EDX, 4;        
+        dec ECX;
+        jnz L1;
+        mov EAX, 2;
+L2:     dec EAX;
+        ret 2*4;
+    }
+}
+
+/** Dest[] = src1[] op src2[]
+*   where op == '&' or '|' or '^'
+*/
+uint multibyteLogical(char op)(uint [] dest, uint [] src1, uint [] src2)
+{
+    // PM: 2 cycles/operation. Limited by execution unit p2.
+    // (AMD64 could reach 1.5 cycles/operation since it has TWO read ports.
+    // On Core2, we could use SSE2 with 128-bit reads).
+    enum { LASTPARAM = 3*4 } // 2* pushes + return address.
+    asm {
+        naked;
+        push EDI;
+        push ESI;        
+        mov EDI, [ESP + LASTPARAM + 4*5]; // dest.ptr
+        mov ECX, [ESP + LASTPARAM + 4*4]; // dest.length;
+        mov EDX, [ESP + LASTPARAM + 4*3]; // src1.ptr;
+        mov ESI, [ESP + LASTPARAM + 4*1]; // src2.ptr
+        lea EDI, [EDI + 4*ECX]; // EDI = end of dest.
+        lea EDX, [EDX + 4*ECX]; // EDX = end of src1.
+        lea ESI, [ESI + 4*ECX]; // ESI = end of src2.
+        neg ECX;
+L1:
+        mov EAX, [EDX+ECX*4];
+    }
+    static if (op=='&') asm {        and EAX, [ESI+ECX*4]; }
+    else   if (op=='|') asm {        or  EAX, [ESI+ECX*4]; }
+    else   if (op=='^') asm {        xor EAX, [ESI+ECX*4]; }
+    asm {
+        mov [EDI + ECX *4], EAX;
+        add ECX, 1;
+        jl L1;
+        
+        pop ESI;
+        pop EDI;
+        ret 6*4;
+    } 
+}
+
+unittest
+{
+    uint [] bb = [0x0F0F_0F0F, 0xF0F0_F0F0, 0x0F0F_0F0F, 0xF0F0_F0F0];
+    for (int qqq=0; qqq<3; ++qqq) {
+        uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];    
+        
+        switch(qqq) {
+        case 0:
+            multibyteLogical!('&')(aa[1..3], aa[1..3], bb[0..4]);
+            assert(aa[1]==0x0202_0203 && aa[2]==0x4050_5050 && aa[3]== 0x8999_999A);
+            break;
+        case 1:
+            multibyteLogical!('|')(aa[1..2], aa[1..2], bb[0..3]);
+            assert(aa[1]==0x1F2F_2F2F && aa[2]==0x4555_5556 && aa[3]== 0x8999_999A);
+            break;
+        case 2:
+            multibyteLogical!('^')(aa[1..2], aa[1..2], bb[0..3]);
+            assert(aa[1]==0x1D2D_2D2C && aa[2]==0x4555_5556 && aa[3]== 0x8999_999A);
+            break;
+        }
+        
+        assert(aa[0]==0xF0FF_FFFF);
+    }
+}
+    
+/** dest[] = src[] << numbits
+ *  numbits must be in the range 1..31
+ */
+void multibyteShl(uint [] dest, uint [] src, uint numbits)
+{
+    // Timing: Optimal for P6 family.
+    // 2.0 cycles/int on PPro..PM (limited by execution port p0)
+    // Terrible performance on AMD64, which has 7 cycles for SHLD!!
+    enum { LASTPARAM = 4*4 } // 3* pushes + return address.
+    asm {
+        naked;
+        push ESI;
+        push EDI;
+        push EBX;
+        mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
+        mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
+        mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
+        mov ECX, EAX; // numbits;
+
+        mov EAX, [-4+ESI + 4*EBX];
+        cmp EBX, 1;
+        jz L_last;
+        mov EDX, [-4+ESI + 4*EBX];
+        test EBX, 1;
+        jz L_odd;
+        sub EBX, 1;        
+L_even:
+        mov EDX, [-4+ ESI + 4*EBX];
+        shld EAX, EDX, CL;
+        mov [EDI+4*EBX], EAX;
+L_odd:
+        mov EAX, [-8+ESI + 4*EBX];
+        shld EDX, EAX, CL;
+        mov [-4+EDI + 4*EBX], EDX;        
+        sub EBX, 2;
+        jg L_even;
+L_last:
+        shl EAX, CL;
+        mov [EDI], EAX;
+
+        pop EBX;
+        pop EDI;
+        pop ESI;
+        ret 4*4;
+     }
+}
+
+/** dest[] = src[] >> numbits
+ *  numbits must be in the range 1..31
+ */
+void multibyteShr(uint [] dest, uint [] src, uint numbits)
+{
+    // Timing: Optimal for P6 family.
+    // 2.0 cycles/int on PPro..PM (limited by execution port p0)
+    // Terrible performance on AMD64, which has 7 cycles for SHRD!!
+    enum { LASTPARAM = 4*4 } // 3* pushes + return address.
+    asm {
+        naked;
+        push ESI;
+        push EDI;
+        push EBX;
+        mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
+        mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
+        mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
+        mov ECX, EAX; // numbits;
+
+        lea EDI, [EDI + 4*EBX]; // EDI = end of dest
+        lea ESI, [ESI + 4*EBX]; // ESI = end of src
+        neg EBX;                // count UP to zero.
+        mov EAX, [ESI + 4*EBX];
+        cmp EBX, -1;
+        jz L_last;
+        mov EDX, [ESI + 4*EBX];
+        test EBX, 1;
+        jz L_odd;
+        add EBX, 1;        
+L_even:
+        mov EDX, [ ESI + 4*EBX];
+        shrd EAX, EDX, CL;
+        mov [-4 + EDI+4*EBX], EAX;
+L_odd:
+        mov EAX, [4 + ESI + 4*EBX];
+        shrd EDX, EAX, CL;
+        mov [EDI + 4*EBX], EDX;        
+        add EBX, 2;
+        jl L_even;
+L_last:
+        shr EAX, CL;
+        mov [-4 + EDI], EAX;
+        
+        pop EBX;
+        pop EDI;
+        pop ESI;
+        ret 4*4;
+     }
+}
+
+unittest
+{
+    uint [] aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+    multibyteShr(aa[0..$-2], aa, 4);
+	assert(aa[0]==0x6122_2222 && aa[1]==0xA455_5555 && aa[2]==0x0899_9999);
+	assert(aa[3]==0xBCCC_CCCD);
+
+    aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+    multibyteShr(aa[0..$-1], aa, 4);
+	assert(aa[0] == 0x6122_2222 && aa[1]==0xA455_5555 
+	    && aa[2]==0xD899_9999 && aa[3]==0x0BCC_CCCC);
+
+    aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+    multibyteShl(aa[1..4], aa[1..$], 4);
+	assert(aa[0] == 0xF0FF_FFFF && aa[1] == 0x2222_2230 
+	    && aa[2]==0x5555_5561 && aa[3]==0x9999_99A4 && aa[4]==0x0BCCC_CCCD);
+}
+
+/** dest[] = src[] * multiplier + carry.
+ * Returns carry.
+ */
+uint multibyteMul(uint[] dest, uint[] src, uint multiplier, uint carry)
+{
+    // Timing: definitely not optimal.
+    // Pentium M: 5.0 cycles/operation, has 3 resource stalls/iteration
+    // Fastest implementation found was 4.6 cycles/op, but not worth the complexity.
+
+    enum { LASTPARAM = 4*4 } // 4* pushes + return address.
+    // We'll use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
+    // when initializing variables to zero.
+    version(D_PIC)
+    {
+        enum { zero = 0 }
+    }
+    else
+    {
+        static int zero = 0;
+    }
+    asm {
+        naked;      
+        push ESI;
+        push EDI;
+        push EBX;
+        
+        mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr
+        mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length
+        mov ESI, [ESP + LASTPARAM + 4*2];  // src.ptr
+        align 16;
+        lea EDI, [EDI + 4*EBX]; // EDI = end of dest
+        lea ESI, [ESI + 4*EBX]; // ESI = end of src
+        mov ECX, EAX; // [carry]; -- last param is in EAX.
+        neg EBX;                // count UP to zero.
+        test EBX, 1;
+        jnz L_odd;
+        add EBX, 1;
+ L1:
+        mov EAX, [-4 + ESI + 4*EBX];
+        mul int ptr [ESP+LASTPARAM]; //[multiplier];
+        add EAX, ECX;
+        mov ECX, zero;
+        mov [-4+EDI + 4*EBX], EAX;
+        adc ECX, EDX;
+L_odd:        
+        mov EAX, [ESI + 4*EBX];  // p2
+        mul int ptr [ESP+LASTPARAM]; //[multiplier]; // p0*3, 
+        add EAX, ECX;
+        mov ECX, zero;
+        adc ECX, EDX;
+        mov [EDI + 4*EBX], EAX;
+        add EBX, 2;
+        jl L1;
+        
+        mov EAX, ECX; // get final carry
+
+        pop EBX;
+        pop EDI;
+        pop ESI;
+        ret 5*4;
+     }
+}
+
+unittest
+{
+    uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+    multibyteMul(aa[1..4], aa[1..4], 16, 0);
+	assert(aa[0] == 0xF0FF_FFFF && aa[1] == 0x2222_2230 && aa[2]==0x5555_5561 && aa[3]==0x9999_99A4 && aa[4]==0x0BCCC_CCCD);
+}
+
+/**
+ * dest[] += src[] * multiplier + carry(0..FFFF_FFFF).
+ * Returns carry out of MSB (0..FFFF_FFFF).
+ */
+uint multibyteMulAdd(uint [] dest, uint[] src, uint multiplier, uint carry)
+{
+    // Timing: This is the most time-critical bignum function.
+    // Pentium M: 5.4 cycles/operation, still has 2 resource stalls + 1 load block/iteration
+    
+    // The bottlenecks in this code are extremely complicated. The MUL, ADD, and ADC
+    // need 4 cycles on each of the ALUs units p0 and p1. So we use memory load 
+    // (unit p2) for initializing registers to zero.
+    // There are also dependencies between the instructions, and we run up against the
+    // ROB-read limit (can only read 2 registers per cycle).
+    // We also need the number of uops in the loop to be a multiple of 3.
+    // The only available execution unit for this is p3 (memory write)
+    
+    // The main loop is pipelined and unrolled by 2, so entry to the loop is also complicated.
+    
+    version(D_PIC) {
+        enum { zero = 0 }
+    } else {
+        // use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
+        // when initializing registers to zero.
+        static int zero = 0;
+        // use p3/p4 units 
+        static int storagenop; // write-only
+    }
+    
+    enum { LASTPARAM = 5*4 } // 4* pushes + return address.
+    asm {
+        naked;
+        
+        push ESI;
+        push EDI;
+        push EBX;
+        push EBP;
+        mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr
+        mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length
+        align 16;
+        nop;
+        mov ESI, [ESP + LASTPARAM + 4*2];  // src.ptr
+        lea EDI, [EDI + 4*EBX]; // EDI = end of dest
+        lea ESI, [ESI + 4*EBX]; // ESI = end of src
+        mov EBP, 0;
+        mov ECX, EAX; // ECX = input carry.
+        neg EBX;                // count UP to zero.
+        mov EAX, [ESI+4*EBX];
+        test EBX, 1;
+        jnz L_enter_odd;
+        // Entry point for even length
+        add EBX, 1;
+        mov EBP, ECX; // carry
+        
+        mul int ptr [ESP+LASTPARAM];
+        mov ECX, 0;
+ 
+        add EBP, EAX;
+        mov EAX, [ESI+4*EBX];
+        adc ECX, EDX;
+
+        mul int ptr [ESP+LASTPARAM];
+
+        add [-4+EDI+4*EBX], EBP;
+        mov EBP, zero;
+    
+        adc ECX, EAX;
+        mov EAX, [4+ESI+4*EBX];
+    
+        adc EBP, EDX;    
+        add EBX, 2;
+        jnl L_done;
+        // Main loop
+L1:
+        mul int ptr [ESP+LASTPARAM];
+        add [-8+EDI+4*EBX], ECX;
+        mov ECX, zero;
+ 
+        adc EBP, EAX;
+        mov EAX, [ESI+4*EBX];
+        
+        adc ECX, EDX;
+    }
+    version(D_PIC) {} else {
+     asm {
+        mov storagenop, EDX; // make #uops in loop a multiple of 3
+     }
+    }
+    asm {        
+        mul int ptr [ESP+LASTPARAM];
+        add [-4+EDI+4*EBX], EBP;
+        mov EBP, zero;
+    
+        adc ECX, EAX;
+        mov EAX, [4+ESI+4*EBX];
+    
+        adc EBP, EDX;    
+        add EBX, 2;
+        jl L1;
+L_done:
+        add [-8+EDI+4*EBX], ECX;
+        mov EAX, EBP; // get final carry
+        adc EAX, 0;
+        pop EBP;
+        pop EBX;
+        pop EDI;
+        pop ESI;
+        ret 5*4;
+        
+L_enter_odd:
+        mul int ptr [ESP+LASTPARAM];
+        mov EBP, zero;   
+        add ECX, EAX;
+        mov EAX, [4+ESI+4*EBX];
+    
+        adc EBP, EDX;    
+        add EBX, 2;
+        jl L1;
+        jmp L_done;
+
+     }
+}
+
+unittest {
+    
+    uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
+    uint [] bb = [0x1234_1234, 0xF0F0_F0F0, 0x00C0_C0C0, 0xF0F0_F0F0, 0xC0C0_C0C0];
+    multibyteMulAdd(bb[1..$-1], aa[1..$], 16, 5);
+	assert(bb[0] == 0x1234_1234 && bb[4] == 0xC0C0_C0C0);
+    assert(bb[1] == 0x2222_2230 + 0xF0F0_F0F0+5 && bb[2] == 0x5555_5561+0x00C0_C0C0+1
+	    && bb[3] == 0x9999_99A4+0xF0F0_F0F0 );
+}
+
+/**  dest[] /= divisor.
+ * overflow is the initial remainder, and must be in the range 0..divisor-1.
+ * divisor must not be a power of 2 (use right shift for that case;
+ * A division by zero will occur if divisor is a power of 2).
+ * Returns the final remainder
+ *
+ * Based on public domain code by Eric Bainville. 
+ * (http://www.bealto.com/) Used with permission.
+ */
+uint multibyteDiv(uint [] dest, uint divisor, uint overflow)
+{
+    // Timing: limited by a horrible dependency chain.
+    // Pentium M: 18 cycles/op, 8 resource stalls/op.
+    // EAX, EDX = scratch, used by MUL
+    // EDI = dest
+    // CL = shift
+    // ESI = quotient
+    // EBX = remainderhi
+    // EBP = remainderlo
+    // [ESP-4] = mask
+    // [ESP] = kinv (2^64 /divisor)
+    enum { LASTPARAM = 5*4 } // 4* pushes + return address.
+    enum { LOCALS = 2*4} // MASK, KINV
+    asm {
+        naked;
+        
+        push ESI;
+        push EDI;
+        push EBX;
+        push EBP;
+        
+        mov EDI, [ESP + LASTPARAM + 4*2]; // dest.ptr
+        mov EBX, [ESP + LASTPARAM + 4*1]; // dest.length
+
+        // Loop from msb to lsb
+        lea     EDI, [EDI + 4*EBX];        
+        mov EBP, EAX; // rem is the input remainder, in 0..divisor-1
+        // Build the pseudo-inverse of divisor k: 2^64/k
+        // First determine the shift in ecx to get the max number of bits in kinv
+        xor     ECX, ECX;
+        mov     EAX, [ESP + LASTPARAM]; //divisor;
+        mov     EDX, 1;
+kinv1:
+        inc     ECX;
+        ror     EDX, 1;
+        shl     EAX, 1;
+        jnc     kinv1;
+        dec     ECX;
+        // Here, ecx is a left shift moving the msb of k to bit 32
+        
+        mov     EAX, 1;
+        shl     EAX, CL;
+        dec     EAX;
+        ror     EAX, CL ; //ecx bits at msb
+        push    EAX;
+        
+        
+        // Then divide 2^(32+cx) by divisor (edx already ok)
+        xor     EAX, EAX;
+        div     int ptr [ESP + LASTPARAM +  LOCALS-4*1]; //divisor;
+        push    EAX; // kinv        
+        // Here kinv has 64 bits
+
+        align   16;
+L2:
+        // Get 32 bits of quotient approx, multiplying
+        // most significant word of (rem*2^32+input)
+        mov     EAX, [ESP+4]; //MASK;
+        and     EAX, [EDI - 4];
+        or      EAX, EBP;
+        rol     EAX, CL;
+        mov     EBX, EBP;
+        mov     EBP, [EDI - 4];
+        mul     int ptr [ESP]; //KINV;
+                
+        shl     EAX, 1;
+        rcl     EDX, 1;
+        
+        // Multiply by k and subtract to get remainder
+        // Subtraction must be done on two words
+        mov     EAX, EDX;
+        mov     ESI, EDX; // quot = high word
+        mul     int ptr [ESP + LASTPARAM+LOCALS]; //divisor;
+        sub     EBP, EAX;
+        sbb     EBX, EDX;   
+        jz      Lb;  // high word is 0, goto adjust on single word
+
+        // Adjust quotient and remainder on two words
+Ld:     inc     ESI;
+        sub     EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
+        sbb     EBX, 0;
+        jnz     Ld;
+        
+        // Adjust quotient and remainder on single word
+Lb:     cmp     EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
+        jc      Lc; // rem in 0..divisor-1, OK        
+        sub     EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
+        inc     ESI;
+        jmp     Lb;
+        
+        // Store result
+Lc:
+        mov     [EDI - 4], ESI;
+        lea     EDI, [EDI - 4];
+        dec     int ptr [ESP + LASTPARAM + 4*1+LOCALS]; // len
+        jnz	L2;
+        
+        pop EAX; // discard kinv
+        pop EAX; // discard mask
+        
+        mov     EAX, EBP; // return final remainder
+        pop     EBP;
+        pop     EBX;
+        pop     EDI;
+        pop     ESI;        
+        ret     3*4;
+    }
+}
+
+unittest {
+    uint [] aa = new uint[101];
+    for (int i=0; i<aa.length; ++i) aa[i] = 0x8765_4321 * (i+3);
+    uint overflow = multibyteMul(aa, aa, 0x8EFD_FCFB, 0x33FF_7461);
+    uint r = multibyteDiv(aa, 0x8EFD_FCFB, overflow);
+    for (int i=0; i<aa.length-1; ++i) assert(aa[i] == 0x8765_4321 * (i+3));
+    assert(r==0x33FF_7461);
+
+}
+
+} // version(D_InlineAsm_X86)