Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.

Dismiss

Fast Power Bug ?

4 views

Skip to first unread message

Skybuck Flying

unread,

Feb 20, 2023, 1:32:43 AM2/20/23

FastPower(Single, Single) does not work with Base = 0,474733531475067 eg and Exponent = 150, should return zero.
The same checks as in rtl before Result := FastExp2(AExponent * FastLog2(ABase)); make it work as it should.

function FastPower(const ABase, AExponent: Single): Single;
begin
Result := FastExp2(AExponent * FastLog2(ABase));
end;

// 32 bit implementation

function FastExp2(const A: Single): Single; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Round Positive (=Round Down)
stmxcsr [OldFlags]
movss xmm0, [A]
mov ecx, [OldFlags]
xorps xmm1, xmm1
and ecx, SSE_ROUND_MASK
movss xmm3, xmm0
or ecx, SSE_ROUND_DOWN
movss xmm5, xmm0
mov [NewFlags], ecx

movss xmm1, DWORD [SSE_EXP2_F1]
ldmxcsr [NewFlags]

// Z := A - RoundDown(A)
cvtps2dq xmm3, xmm3
addss xmm1, xmm5 // A + 121.2740575
cvtdq2ps xmm3, xmm3
movss xmm2, DWORD [SSE_EXP2_F2]
subss xmm0, xmm3

movss xmm3, DWORD [SSE_EXP2_F3]
movss xmm4, DWORD [SSE_EXP2_F4]
subss xmm3, xmm0 // (4.84252568 - Z)
mulss xmm0, xmm4 // 1.49012907 * Z
divss xmm2, xmm3
movss xmm5, DWORD [SSE_EXP2_F5]
addss xmm1, xmm2 // A + 121.2740575 + 27.7280233 / (4.84252568 - Z)
subss xmm1, xmm0 // A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z
mulss xmm1, xmm5 // (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
cvtps2dq xmm1, xmm1

// Restore rounding mode
ldmxcsr [OldFlags]

movss [Result], xmm1
end;

function FastLog2(const A: Single): Single; assembler;
asm
movss xmm0, [A]
movss xmm2, DWORD [SSE_MASK_FRACTION]
movss xmm1, xmm0

// MX.I := (VX.I and $007FFFFF) or $3F000000
movss xmm3, DWORD [SSE_LOG2_I1]
pand xmm0, xmm2
cvtdq2ps xmm1, xmm1
movss xmm4, DWORD [SSE_LOG2_F1]
por xmm0, xmm3

movss xmm2, DWORD [SSE_LOG2_F2]
mulss xmm1, xmm4 // VX.I * 1.1920928955078125e-7
movss xmm3, DWORD [SSE_LOG2_F3]
subss xmm1, xmm2 // Result - 124.22551499
mulss xmm3, xmm0
movss xmm4, DWORD [SSE_LOG2_F5]
subss xmm1, xmm3 // Result - 124.22551499 - 1.498030302 * MX.S
movss xmm2, DWORD [SSE_LOG2_F4]
addss xmm0, xmm4
divss xmm2, xmm0
subss xmm1, xmm2 // Result - 124.22551499 - 1.498030302 * MX.S - 1.72587999 / (0.3520887068 + MX.S)

movss [Result], xmm1
end;

// 64 bit implementation

function FastExp2(const A: Single): Single; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Round Positive (=Round Down)
stmxcsr [OldFlags]
mov ecx, [OldFlags]
xorps xmm1, xmm1
and ecx, SSE_ROUND_MASK
movss xmm3, xmm0
or ecx, SSE_ROUND_DOWN
movss xmm5, xmm0
mov [NewFlags], ecx

movss xmm1, DWORD [SSE_EXP2_F1]
ldmxcsr [NewFlags]

// Z := A - RoundDown(A)
cvtps2dq xmm3, xmm3
addss xmm1, xmm5 // A + 121.2740575
cvtdq2ps xmm3, xmm3
movss xmm2, DWORD [SSE_EXP2_F2]
subss xmm0, xmm3

movss xmm3, DWORD [SSE_EXP2_F3]
movss xmm4, DWORD [SSE_EXP2_F4]
subss xmm3, xmm0 // (4.84252568 - Z)
mulss xmm0, xmm4 // 1.49012907 * Z
divss xmm2, xmm3
movss xmm5, DWORD [SSE_EXP2_F5]
addss xmm1, xmm2 // A + 121.2740575 + 27.7280233 / (4.84252568 - Z)
subss xmm1, xmm0 // A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z
mulss xmm1, xmm5 // (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
cvtps2dq xmm1, xmm1

// Restore rounding mode
ldmxcsr [OldFlags]

movss xmm0, xmm1
end;

function FastLog2(const A: Single): Single; assembler;
asm
movss xmm2, DWORD [SSE_MASK_FRACTION]
movss xmm1, xmm0

// MX.I := (VX.I and $007FFFFF) or $3F000000
movss xmm3, DWORD [SSE_LOG2_I1]
pand xmm0, xmm2
cvtdq2ps xmm1, xmm1
movss xmm4, DWORD [SSE_LOG2_F1]
por xmm0, xmm3

movss xmm2, DWORD [SSE_LOG2_F2]
mulss xmm1, xmm4 // VX.I * 1.1920928955078125e-7
movss xmm3, DWORD [SSE_LOG2_F3]
subss xmm1, xmm2 // Result - 124.22551499
mulss xmm3, xmm0
movss xmm4, DWORD [SSE_LOG2_F5]
subss xmm1, xmm3 // Result - 124.22551499 - 1.498030302 * MX.S
movss xmm2, DWORD [SSE_LOG2_F4]
addss xmm0, xmm4
divss xmm2, xmm0
subss xmm1, xmm2 // Result - 124.22551499 - 1.498030302 * MX.S - 1.72587999 / (0.3520887068 + MX.S)

movss xmm0, xmm1
end;

Any idea what bug could be ?

Bye,
Skybuck.

Skybuck Flying

unread,

Feb 20, 2023, 1:45:47 AM2/20/23

Proper test program, not sure what to make of this...

program TestProgram;

{$APPTYPE CONSOLE}

{$R *.res}

{

TestProgram for FastPower

version 0.01 created on 20 feb 2023 by Skybuck Flying ! ;)

}

uses
System.SysUtils,
System.Math,
System.Math.Vectors,
System.Types;

const
{ SSE rounding modes (bits in MXCSR register) }
SSE_ROUND_MASK = $FFFF9FFF;
SSE_ROUND_NEAREST = $00000000;
SSE_ROUND_DOWN = $00002000;
SSE_ROUND_UP = $00004000;
SSE_ROUND_TRUNC = $00006000;

{ These constants fit in a single XMM register. These values represent
sign-bits as used by 32-bit floating-point values.
XOR'ing a floating-point value with $80000000 swaps the sign.
XOR'ing a floating-point value with $00000000 leaves the value unchanged. }
SSE_MASK_SIGN: array [0..3] of UInt32 = ($80000000, $80000000, $80000000, $80000000);
SSE_MASK_NPNP: array [0..3] of UInt32 = ($80000000, $00000000, $80000000, $00000000);
SSE_MASK_PNPN: array [0..3] of UInt32 = ($00000000, $80000000, $00000000, $80000000);
SSE_MASK_0FFF: array [0..3] of UInt32 = ($FFFFFFFF, $FFFFFFFF, $FFFFFFFF, $00000000);

{ These constants mask off an element of the binary representation of a
32-bit floating-point value. }
SSE_MASK_FRACTION: array [0..3] of UInt32 = ($007FFFFF, $007FFFFF, $007FFFFF, $007FFFFF);
SSE_MASK_EXPONENT: array [0..3] of UInt32 = ($7F800000, $7F800000, $7F800000, $7F800000);
SSE_MASK_ABS_VAL : array [0..3] of UInt32 = ($7FFFFFFF, $7FFFFFFF, $7FFFFFFF, $7FFFFFFF);

{ Commonly used floating-point values }
SSE_ONE_HALF : array [0..3] of Single = (0.5, 0.5, 0.5, 0.5);
SSE_ONE : array [0..3] of Single = (1, 1, 1, 1);
SSE_TWO : array [0..3] of Single = (2, 2, 2, 2);
SSE_THREE : array [0..3] of Single = (3, 3, 3, 3);
SSE_PI_OVER_180 : array [0..3] of Single = (Pi / 180, Pi / 180, Pi / 180, Pi / 180);
SSE_180_OVER_PI : array [0..3] of Single = (180 / Pi, 180 / Pi, 180 / Pi, 180 / Pi);
SSE_NEG_INFINITY: array [0..3] of Single = (NegInfinity, NegInfinity, NegInfinity, NegInfinity);
SSE_PI_OVER_4 : array [0..3] of Single = (Pi / 4, Pi / 4, Pi / 4, Pi / 4);

{ Commonly used integer values }
SSE_INT_ONE : array [0..3] of Integer = (1, 1, 1, 1);
SSE_INT_NOT_ONE : array [0..3] of Cardinal = ($FFFFFFFE, $FFFFFFFE, $FFFFFFFE, $FFFFFFFE);
SSE_INT_TWO : array [0..3] of Integer = (2, 2, 2, 2);
SSE_INT_FOUR : array [0..3] of Integer = (4, 4, 4, 4);

{ Constants for approximating trigonometric functions }
SSE_FOPI: array [0..3] of Single = (1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516);
SSE_SINCOF_P0: array [0..3] of Single = (-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4);
SSE_SINCOF_P1: array [0..3] of Single = (8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3);
SSE_SINCOF_P2: array [0..3] of Single = (-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1);
SSE_COSCOF_P0: array [0..3] of Single = (2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005);
SSE_COSCOF_P1: array [0..3] of Single = (-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003);
SSE_COSCOF_P2: array [0..3] of Single = (4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002);

SSE_EXP_A1 : array [0..3] of Single = (12102203.1615614, 12102203.1615614, 12102203.1615614, 12102203.1615614);
SSE_EXP_A2 : array [0..3] of Single = (1065353216, 1065353216, 1065353216, 1065353216);
SSE_EXP_CST: array [0..3] of Single = (2139095040, 2139095040, 2139095040, 2139095040);
SSE_EXP_F1 : array [0..3] of Single = (0.509964287281036376953125, 0.509964287281036376953125, 0.509964287281036376953125, 0.509964287281036376953125);
SSE_EXP_F2 : array [0..3] of Single = (0.3120158612728118896484375, 0.3120158612728118896484375, 0.3120158612728118896484375, 0.3120158612728118896484375);
SSE_EXP_F3 : array [0..3] of Single = (0.1666135489940643310546875, 0.1666135489940643310546875, 0.1666135489940643310546875, 0.1666135489940643310546875);
SSE_EXP_F4 : array [0..3] of Single = (-2.12528370320796966552734375e-3, -2.12528370320796966552734375e-3, -2.12528370320796966552734375e-3, -2.12528370320796966552734375e-3);
SSE_EXP_F5 : array [0..3] of Single = (1.3534179888665676116943359375e-2, 1.3534179888665676116943359375e-2, 1.3534179888665676116943359375e-2, 1.3534179888665676116943359375e-2);
SSE_EXP_I1 : array [0..3] of UInt32 = ($3F800000, $3F800000, $3F800000, $3F800000);

SSE_LN_CST: array [0..3] of Single = (-89.93423858, -89.93423858, -89.93423858, -89.93423858);
SSE_LN_F1 : array [0..3] of Single = (3.3977745, 3.3977745, 3.3977745, 3.3977745);
SSE_LN_F2 : array [0..3] of Single = (2.2744832, 2.2744832, 2.2744832, 2.2744832);
SSE_LN_F3 : array [0..3] of Single = (0.024982445, 0.024982445, 0.024982445, 0.024982445);
SSE_LN_F4 : array [0..3] of Single = (0.24371102, 0.24371102, 0.24371102, 0.24371102);
SSE_LN_F5 : array [0..3] of Single = (0.69314718055995, 0.69314718055995, 0.69314718055995, 0.69314718055995);

SSE_LOG2_I1: array [0..3] of UInt32 = ($3F000000, $3F000000, $3F000000, $3F000000);
SSE_LOG2_F1: array [0..3] of Single = (1.1920928955078125e-7, 1.1920928955078125e-7, 1.1920928955078125e-7, 1.1920928955078125e-7);
SSE_LOG2_F2: array [0..3] of Single = (124.22551499, 124.22551499, 124.22551499, 124.22551499);
SSE_LOG2_F3: array [0..3] of Single = (1.498030302, 1.498030302, 1.498030302, 1.498030302);
SSE_LOG2_F4: array [0..3] of Single = (1.72587999, 1.72587999, 1.72587999, 1.72587999);
SSE_LOG2_F5: array [0..3] of Single = (0.3520887068, 0.3520887068, 0.3520887068, 0.3520887068);

SSE_EXP2_F1: array [0..3] of Single = (121.2740575, 121.2740575, 121.2740575, 121.2740575);
SSE_EXP2_F2: array [0..3] of Single = (27.7280233, 27.7280233, 27.7280233, 27.7280233);
SSE_EXP2_F3: array [0..3] of Single = (4.84252568, 4.84252568, 4.84252568, 4.84252568);
SSE_EXP2_F4: array [0..3] of Single = (1.49012907, 1.49012907, 1.49012907, 1.49012907);
SSE_EXP2_F5: array [0..3] of Single = ($00800000, $00800000, $00800000, $00800000);

function FastPower(const ABase, AExponent: Single): Single;
begin
Result := FastExp2(AExponent * FastLog2(ABase));
end;

procedure Main;
var
vSingle1, vSingle2 : single;
begin
writeln('program started');

// FastPower(Single, Single) does not work with Base = 0,474733531475067 eg and Exponent = 150, should return zero.
// The same checks as in rtl before Result := FastExp2(AExponent * FastLog2(ABase)); make it work as it should.

vSingle1 := 0.474733531475067;
vSingle2 := 150;

writeln( FastPower( vSingle1, vSingle2 ) : 16 : 16 );

writeln('program finished');
end;

begin
try
Main;
except
on E: Exception do
Writeln(E.ClassName, ': ', E.Message);
end;
ReadLn;
end.

Bye,
Skybuck.

Skybuck Flying

unread,

Feb 20, 2023, 9:51:23 PM2/20/23

Original author of bug report has replied to shed more light on this:

"
Actually I don't remember where i faced the problem. But rtl definitely returns zero.
System.Math.Power(0.474733531475067, 150) = 0
while
neslib.FastMath.FastPower(0.474733531475067, 150) returns -3,35626519513237e+28
"

Bye for now,
Skybuck.

0 new messages