Why is trunc() so slow ?

Bonita Montero

unread,

Jan 3, 2021, 12:44:32 PM1/3/21

to

I've just wondered why trunc() of the MSVCRT is so slow.

So I've written my own trunc():

#include <cassert>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <limits>
#include <random>
#include <vector>

using namespace std;
using namespace chrono;

// Hand-written replacements for trunc(), one overload per floating-point
// width. Both are defined after main().
double ftrunc( double d );
float ftrunc( float f );

// Benchmark driver: builds a table of random doubles, checks that ftrunc()
// agrees with the library trunc() on every one of them, then times both
// functions over roughly 1e9 calls each and prints the average nanoseconds
// per call.
int main()
{
    std::mt19937_64 rg;
    std::uniform_int_distribution<std::uint64_t> mantissaValues( 0, (std::uint64_t{1} << 52) - 1 );
    // The biased exponent spans 3 * 52 consecutive values starting at
    // 0x3FF - 52, i.e. equal shares of
    //   - purely fractional numbers,
    //   - mantissas with integral parts and fractions,
    //   - mantissas without fractions.
    std::uniform_int_distribution<unsigned> exponentValues( 0x3FF - 52, 0x3FF + 2 * 52 - 1 );
    std::uniform_int_distribution<unsigned> signValues( 0, 1 );

    std::vector<double> vd( 1'000 );
    for( double &d : vd )
    {
        // Draw in the order mantissa, exponent, sign and pack the fields
        // into the binary64 layout by hand. memcpy keeps the bit copy
        // well-defined, unlike reading through a bitfield union.
        std::uint64_t const mantissa = mantissaValues( rg );
        std::uint64_t const exponent = exponentValues( rg );
        std::uint64_t const sign     = signValues( rg );
        std::uint64_t const bits     = (sign << 63) | (exponent << 52) | mantissa;
        static_assert( sizeof d == sizeof bits, "double must be 64 bits wide" );
        std::memcpy( &d, &bits, sizeof d );
    }

    // Sanity check; compiled out when NDEBUG is defined.
    for( double d : vd )
        assert(ftrunc( d ) == std::trunc( d ));

    // volatile sink so the optimizer cannot drop the calls being timed
    double volatile dummyValue;
    std::size_t const turns = 1'000'000'000 / vd.size();

    // steady_clock is monotonic, so the measured interval cannot be
    // disturbed by wall-clock adjustments.
    using clock = std::chrono::steady_clock;

    clock::time_point start = clock::now();
    for( std::size_t i = turns; i; --i )
        for( double d : vd )
            dummyValue = ftrunc( d );
    double ns = static_cast<double>(
        std::chrono::duration_cast<std::chrono::nanoseconds>( clock::now() - start ).count() );
    std::cout << "ftrunc: " << ns / (vd.size() * turns) << std::endl;

    start = clock::now();
    for( std::size_t i = turns; i; --i )
        for( double d : vd )
            dummyValue = std::trunc( d );
    ns = static_cast<double>(
        std::chrono::duration_cast<std::chrono::nanoseconds>( clock::now() - start ).count() );
    std::cout << "trunc: " << ns / (vd.size() * turns) << std::endl;
}

// Rounds d toward zero by clearing the fraction bits of its IEEE-754
// binary64 representation with integer operations.
//
//   |d| < 1.0              -> +0.0 / -0.0 (sign of d is kept)
//   1.0 <= |d| < 2^52      -> fraction bits of the mantissa are masked out
//   2^52 <= |d| < infinity -> already integral, returned unchanged
//   infinity / NaN         -> d + d (infinity stays infinity; a signalling
//                             NaN goes through an arithmetic operation so it
//                             can raise its exception)
//
// Two code paths are provided, selected at compile time by sizeof(size_t):
// one working on the whole 64-bit pattern and one working on two 32-bit
// halves for targets with 32-bit registers. The bit pattern is moved in and
// out of the double with memcpy, so no union type punning and no byte-order
// assumption are involved.
double ftrunc( double d )
{
    static_assert( sizeof(double) == sizeof(std::uint64_t), "sizeof(double) not equal to sizeof(uint64_t)" );
    static_assert( std::numeric_limits<double>::is_iec559, "double must be IEEE-754" );
    // assume size_t is our CPU's native register-width
    static_assert( sizeof(std::size_t) == sizeof(std::uint64_t) || sizeof(std::size_t) == sizeof(std::uint32_t), "register-width must be 32 or 64 bit" );

    constexpr unsigned MANTISSA_BITS = 52;
    constexpr unsigned EXP_BIAS      = 0x3FF;
    constexpr unsigned INF_NAN_BASE  = 0x7FF;

    std::uint64_t bits;
    std::memcpy( &bits, &d, sizeof bits );

    if constexpr( sizeof(std::size_t) == sizeof(std::uint64_t) )
    {
        // we have 64 bit registers
        constexpr std::uint64_t EXP_MASK                = std::uint64_t{0x7FF} << MANTISSA_BITS;
        constexpr std::uint64_t SIGN_MASK               = std::uint64_t{0x800} << MANTISSA_BITS;
        constexpr std::uint64_t MIN_INTEGRAL_DIGITS_EXP = std::uint64_t{EXP_BIAS} << MANTISSA_BITS;
        constexpr std::uint64_t MIN_INTEGRAL_ONLY_EXP   = std::uint64_t{EXP_BIAS + MANTISSA_BITS} << MANTISSA_BITS;
        constexpr std::uint64_t INF_NAN_EXP             = std::uint64_t{INF_NAN_BASE} << MANTISSA_BITS;
        constexpr std::uint64_t MANTISSA_MASK           = 0x000FFFFFFFFFFFFFu;

        // The exponent field is compared in place (still shifted), which
        // saves a shift on the common paths.
        std::uint64_t const exp = bits & EXP_MASK;
        if( exp < MIN_INTEGRAL_DIGITS_EXP )
        {
            // below +/-1.0: return +/-0.0
            bits &= SIGN_MASK;
        }
        else if( exp < MIN_INTEGRAL_ONLY_EXP )
        {
            // there are fraction digits to mask out; the unbiased exponent
            // (0..51) is the number of mantissa bits that are integral
            unsigned const shift = static_cast<unsigned>(exp >> MANTISSA_BITS) - EXP_BIAS;
            bits &= ~(MANTISSA_MASK >> shift);
        }
        else if( exp < INF_NAN_EXP )
        {
            // value is integral
            return d;
        }
        else
        {
            // infinite, NaN, SNaN
            // raise exception on SNaN if necessary
            return d + d;
        }
    }
    else
    {
        // we have 32 bit registers: work on the upper and lower half
        constexpr unsigned HI_MANTISSA_BITS = 20;
        constexpr std::uint32_t EXP_MASK                = std::uint32_t{0x7FFu} << HI_MANTISSA_BITS;
        constexpr std::uint32_t SIGN_MASK               = std::uint32_t{0x800u} << HI_MANTISSA_BITS;
        constexpr std::uint32_t MIN_INTEGRAL_DIGITS_EXP = std::uint32_t{EXP_BIAS} << HI_MANTISSA_BITS;
        constexpr std::uint32_t MAX_INTEGRAL32_EXP      = std::uint32_t{EXP_BIAS + HI_MANTISSA_BITS} << HI_MANTISSA_BITS;
        constexpr std::uint32_t MIN_INTEGRAL_ONLY_EXP   = std::uint32_t{EXP_BIAS + MANTISSA_BITS} << HI_MANTISSA_BITS;
        constexpr std::uint32_t INF_NAN_EXP             = std::uint32_t{INF_NAN_BASE} << HI_MANTISSA_BITS;
        constexpr std::uint32_t HI_MANTISSA_MASK        = 0x000FFFFFu;
        constexpr std::uint32_t LO_MANTISSA_MASK        = 0xFFFFFFFFu;

        std::uint32_t hi = static_cast<std::uint32_t>(bits >> 32);
        std::uint32_t lo = static_cast<std::uint32_t>(bits);

        std::uint32_t const exp = hi & EXP_MASK;
        if( exp < MIN_INTEGRAL_DIGITS_EXP )
        {
            // below +/-1.0: return +/-0.0
            hi &= SIGN_MASK;
            lo = 0;
        }
        else if( exp < MIN_INTEGRAL_ONLY_EXP )
        {
            // there are fraction digits to mask out
            unsigned const shift = static_cast<unsigned>(exp >> HI_MANTISSA_BITS) - EXP_BIAS;
            if( exp <= MAX_INTEGRAL32_EXP )
            {
                // the integral/fraction boundary lies in the upper dword:
                // mask its fraction bits and zero the lower dword
                hi &= ~(HI_MANTISSA_MASK >> shift);
                lo = 0;
            }
            else
            {
                // the boundary lies in the lower dword (shift is 21..51
                // here), so only the lower dword has fraction bits
                lo &= ~(LO_MANTISSA_MASK >> (shift - HI_MANTISSA_BITS));
            }
        }
        else if( exp < INF_NAN_EXP )
        {
            // value is integral
            return d;
        }
        else
        {
            // infinite, NaN, SNaN
            // raise exception on SNaN if necessary
            return d + d;
        }
        bits = (std::uint64_t{hi} << 32) | lo;
    }

    std::memcpy( &d, &bits, sizeof d );
    return d;
}

// Rounds f toward zero by clearing the fraction bits of its IEEE-754
// binary32 representation with integer operations.
//
//   |f| < 1.0              -> +0.0 / -0.0 (sign of f is kept)
//   1.0 <= |f| < 2^23      -> fraction bits of the mantissa are masked out
//   2^23 <= |f| < infinity -> already integral, returned unchanged
//   infinity / NaN         -> f + f (infinity stays infinity; a signalling
//                             NaN goes through an arithmetic operation so it
//                             can raise its exception)
//
// The bit pattern is moved in and out of the float with memcpy, which keeps
// the conversion well-defined (no union type punning).
float ftrunc( float f )
{
    static_assert( sizeof(float) == sizeof(std::uint32_t), "sizeof(float) not equal to sizeof(uint32_t)" );
    static_assert( std::numeric_limits<float>::is_iec559, "float must be IEEE-754" );

    constexpr unsigned MANTISSA_BITS = 23;
    constexpr unsigned EXP_BIAS      = 0x7F;
    constexpr unsigned INF_NAN_BASE  = 0xFF;
    constexpr std::uint32_t EXP_MASK                = std::uint32_t{0xFF} << MANTISSA_BITS;
    constexpr std::uint32_t SIGN_MASK               = std::uint32_t{0x100} << MANTISSA_BITS;
    constexpr std::uint32_t MIN_INTEGRAL_DIGITS_EXP = std::uint32_t{EXP_BIAS} << MANTISSA_BITS;
    constexpr std::uint32_t MIN_INTEGRAL_ONLY_EXP   = std::uint32_t{EXP_BIAS + MANTISSA_BITS} << MANTISSA_BITS;
    constexpr std::uint32_t INF_NAN_EXP             = std::uint32_t{INF_NAN_BASE} << MANTISSA_BITS;
    constexpr std::uint32_t MANTISSA_MASK           = 0x007FFFFFu;

    std::uint32_t bits;
    std::memcpy( &bits, &f, sizeof bits );

    // The exponent field is compared in place (still shifted).
    std::uint32_t const exp = bits & EXP_MASK;
    if( exp < MIN_INTEGRAL_DIGITS_EXP )
    {
        // below +/-1.0: return +/-0.0
        bits &= SIGN_MASK;
    }
    else if( exp < MIN_INTEGRAL_ONLY_EXP )
    {
        // there are fraction digits to mask out; the unbiased exponent
        // (0..22) is the number of mantissa bits that are integral
        unsigned const shift = static_cast<unsigned>(exp >> MANTISSA_BITS) - EXP_BIAS;
        bits &= ~(MANTISSA_MASK >> shift);
    }
    else if( exp < INF_NAN_EXP )
    {
        // value is integral
        return f;
    }
    else
    {
        // infinite, NaN, SNaN
        // raise exception on SNaN if necessary
        return f + f;
    }

    std::memcpy( &f, &bits, sizeof f );
    return f;
}

With 32 bit code my ftrunc is 1,9 times as fast in my benchmark.
With 64 bit code it is about 4,9 times as fast in the same benchmark.
So what's the magic behind MS' trunc() implementation I miss ?
It couldn't be only that my ftrunc() is inlined and constants might
be kept in registers; I think this wouldn't make a difference like
that for the 64-bit-code.
#

Bo Persson

unread,

Jan 3, 2021, 1:00:06 PM1/3/21

to

On 2021-01-03 at 18:44, Bonita Montero wrote:
> I've just wondered why trunc() of the MSVCRT is so slow.
>
> So I've written my own trunc():
>

>

> With 32 bit code my ftrunc is 1,9 times as fast in my benchmark.
> With 32 bit code it is about 4,9 times as fast in the same benchmark.
> So what's the magic behind MS' trunc() implementation I miss ?
> It couldn't be only that my ftrunc() is inlined and constants might
> be kept in registers; I think this wouldn't make a difference like
> that for the 64-bit-code.

Are you statically linking to the C runtime, or are you calling a
DLL-version of the function?

Bo Persson

Bonita Montero

unread,

Jan 3, 2021, 1:00:55 PM1/3/21

to

> With 32 bit code my ftrunc is 1,9 times as fast in my benchmark.
> With 32 bit code it is about 4,9 times as fast in the same benchmark.

64

Bonita Montero

unread,

Jan 3, 2021, 1:01:48 PM1/3/21

to

>> With 32 bit code my ftrunc is 1,9 times as fast in my benchmark.
>> With 32 bit code it is about 4,9 times as fast in the same benchmark.
>> So what's the magic behind MS' trunc() implementation I miss ?
>> It couldn't be only that my ftrunc() is inlined and constants might
>> be kept in registers; I think this wouldn't make a difference like
>> that for the 64-bit-code.

> Are you statically linking to the C runtime, or are you calling a
> DLL-version of the function?

It wouldn't make a big difference unless the code gets inlined.
But even then, I bet it wouldn't be 4,9 times faster.

Bonita Montero

unread,

Jan 3, 2021, 1:14:49 PM1/3/21

to

> Are you statically linking to the C runtime, or are you calling a
> DLL-version of the function?

I just tested it with linking it with /MT and /MD. The statically
linked version is 21% slower; I would have expected the opposite
if ever there's a difference.

Christian Gollwitzer

unread,

Jan 4, 2021, 3:18:20 AM1/4/21

to

Am 03.01.21 um 18:44 schrieb Bonita Montero:

> I've just wondered why trunc() of the MSVCRT is so slow.
>
> So I've written my own trunc():

> So what's the magic behind MS' trunc() implementation I miss ?

Look at the compiled assembly. I would expect that the library version
uses SSE code to do the trunc, while yours runs at the integer ALU.

Christian