* exp() is now 3 times faster.

* sinh, cosh, tanh now use D implementations rather than the C standard library.
* tan() now uses the C library if asm is unavailable.
This commit is contained in:
Don Clugston 2009-02-23 09:11:43 +00:00
parent 4fd01b9517
commit ae4fd3b9f7

View file

@ -69,6 +69,18 @@ private import std.string;
private import std.c.math;
private import std.traits;
version(GNU){
// GDC can't actually do inline asm.
} else version(D_InlineAsm_X86) {
version = Naked_D_InlineAsm_X86;
} else version(LDC) {
import ldc.intrinsics;
version(X86)
{
version = LDC_X86;
}
}
private:
/*
@ -377,6 +389,7 @@ unittest{
real tan(real x)
{
version(Naked_D_InlineAsm_X86) {
asm
{
fld x[EBP] ; // load theta
@ -408,7 +421,10 @@ trigerr:
return real.nan;
Lret:
;
;
} else {
return stdc.math.tanl(x);
}
}
unittest
@ -534,7 +550,12 @@ real atan2(real y, real x) { return std.c.math.atan2l(y,x); }
* $(TR $(TD $(PLUSMN)$(INFIN)) $(TD $(PLUSMN)0.0) $(TD no) )
* )
*/
real cosh(real x) { return std.c.math.coshl(x); }
real cosh(real x) {
// cosh = (exp(x)+exp(-x))/2.
// The naive implementation works correctly.
real y = exp(x);
return (y + 1.0/y) * 0.5;
}
/***********************************
* Calculates the hyperbolic sine of x.
@ -545,7 +566,18 @@ real cosh(real x) { return std.c.math.coshl(x); }
* $(TR $(TD $(PLUSMN)$(INFIN)) $(TD $(PLUSMN)$(INFIN)) $(TD no))
* )
*/
real sinh(real x) { return std.c.math.sinhl(x); }
real sinh(real x)
{
// sinh(x) = (exp(x)-exp(-x))/2;
// Very large arguments could cause an overflow, but
// the maximum value of x for which exp(x) + exp(-x)) != exp(x)
// is x = 0.5 * (real.mant_dig) * LN2. // = 22.1807 for real80.
if (fabs(x) > real.mant_dig * LN2) {
return copysign(0.5 * exp(fabs(x)), x);
}
real y = expm1(x);
return 0.5 * y / (y+1) * (y+2);
}
/***********************************
* Calculates the hyperbolic tangent of x.
@ -556,11 +588,15 @@ real sinh(real x) { return std.c.math.sinhl(x); }
* $(TR $(TD $(PLUSMN)$(INFIN)) $(TD $(PLUSMN)1.0) $(TD no))
* )
*/
real tanh(real x) { return std.c.math.tanhl(x); }
//real acosh(real x) { return std.c.math.acoshl(x); }
//real asinh(real x) { return std.c.math.asinhl(x); }
//real atanh(real x) { return std.c.math.atanhl(x); }
real tanh(real x)
{
// tanh(x) = (exp(x) - exp(-x))/(exp(x)+exp(-x))
if (fabs(x) > real.mant_dig * LN2) {
return copysign(1, x);
}
real y = expm1(2*x);
return y / (y + 2);
}
/***********************************
* Calculates the inverse hyperbolic cosine of x.
@ -742,7 +778,96 @@ creal sqrt(creal z)
* $(TR $(TD -$(INFIN)) $(TD +0.0) )
* )
*/
real exp(real x) { return std.c.math.expl(x); }
real exp(real x)
{
version(Naked_D_InlineAsm_X86) {
/* exp() for x87 80-bit reals, IEEE754-2008 conformant.
* Author: Don Clugston.
*
* exp(x) = 2^(rndint(y))* 2^(y-rndint(y)) where y = LN2*x.
* The trick for high performance is to avoid the fscale(28cycles on core2),
* frndint(19 cycles), leaving f2xm1(19 cycles) as the only slow instruction.
*
* We can do frndint by using fist. BUT we can't use it for huge numbers,
* because it will set the Invalid Operation flag is overflow or NaN occurs.
* Fortunately, whenever this happens the result would be zero or infinity.
*
* We can perform fscale by directly poking into the exponent. BUT this doesn't
* work for the (very rare) cases where the result is subnormal. So we fall back
* to the slow method in that case.
*/
enum { PARAMSIZE = (real.sizeof+3)&(0xFFFF_FFFC) } // always a multiple of 4
asm {
naked;
fld real ptr [ESP+4] ; // x
mov AX, [ESP+4+8]; // AX = exponent and sign
sub ESP, 12+8; // Create scratch space on the stack
// [ESP,ESP+2] = scratchint
// [ESP+4..+6, +8..+10, +10] = scratchreal
// set scratchreal mantissa = 1.0
mov dword ptr [ESP+8], 0;
mov dword ptr [ESP+8+4], 0x80000000;
and AX, 0x7FFF; // drop sign bit
cmp AX, 0x401D; // avoid InvalidException in fist
jae L_extreme;
fldl2e;
fmul ; // y = x*log2(e)
fist dword ptr [ESP]; // scratchint = rndint(y)
fisub dword ptr [ESP]; // y - rndint(y)
// and now set scratchreal exponent
mov EAX, [ESP];
add EAX, 0x3fff;
jle short L_subnormal;
cmp EAX,0x8000;
jge short L_overflow;
mov [ESP+8+8],AX;
L_normal:
f2xm1;
fld1;
fadd ; // 2^(y-rndint(z))
fld real ptr [ESP+8] ; // 2^rndint(y)
add ESP,12+8;
fmulp ST(1), ST;
ret PARAMSIZE;
L_subnormal:
// Result will be subnormal.
// In this rare case, the simple poking method doesn't work.
// The speed doesn't matter, so use the slow fscale method.
fild dword ptr [ESP]; // scratchint
fld1;
fscale;
fstp real ptr [ESP+8]; // scratchreal = 2^scratchint
fstp ST(0),ST; // drop scratchint
jmp L_normal;
L_extreme: // Extreme exponent. X is very large positive, very
// large negative, infinity, or NaN.
fxam;
fstsw AX;
test AX, 0x0400; // NaN_or_zero, but we already know x!=0
jz L_was_nan; // if x is NaN, returns x
// set scratchreal = real.min
// squaring it will return 0, setting underflow flag
mov word ptr [ESP+8+8], 1;
test AX, 0x0200;
jnz L_waslargenegative;
L_overflow:
// Set scratchreal = real.max.
// squaring it will create infinity, and set overflow flag.
mov word ptr [ESP+8+8], 0x7FFE;
L_waslargenegative:
fstp ST(0), ST;
fld real ptr [ESP+8]; // load scratchreal
fmul ST(0), ST; // square it, to create havoc!
L_was_nan:
add ESP,12+8;
ret PARAMSIZE;
}
} else {
return std.c.math.expl(x);
}
}
/**********************
* Calculates 2$(SUP x).