How many x86 DIVs can your CPU handle at once ?

Bonita Montero

unread,

Mar 25, 2023, 10:13:23 AM3/25/23

to

I had some performance issues because of a lot of modulo-operations
with self-built hashing where the number of buckets coudln't be a
power of two. So I asked myself what's the throughput of a modulo
operation where the hash occupies the whole bit-span, i.e. the most
siginificant bit is always set and the modoulo-operand is very small
so that the subtract-and-shift engine keeps running for a lot of
cycles.
I'm XOR-ing the result of the modulo-3-operation with the next value
to make the next calculation dependent of the one before; most CPU
time should be spent by the division itself so the extra XOR-overhead
shouldn't be significant. All other operations will be executed OoO
-parallel on your CPU so that the division and the XOR-operation
should run back-to-back.
With the below code I do some interleaving with multiple chains
of the described XOR-DIVs through an unrolling helper lambda.
This allows the CPU to execute multiple DIVs at once.
Surprisingly my small HP EliteDesk 800 G2 Linux PC with a Skylake
CPU gets a higher throughput than my Zen2 AMD CPU when two or more
DIV-chains are executed OoO-parallel. The Skylake-CPU can handle
two DIVs at once whereas the Zen2-CPU can handle only one.

#include <iostream>
#include <random>
#include <chrono>
#include <vector>
#include <atomic>
#include <utility>
#include <type_traits>
#include <limits>

using namespace std;
using namespace chrono;

int main()
{
uint64_t sum, mod = 3;
constexpr size_t
N = 0x400 / sizeof(uint64_t),
ROUNDS = 1'000'000;
mt19937_64 mt;
uniform_int_distribution<uint64_t> uid(
numeric_limits<int64_t>::min(), -1 );
vector<uint64_t> counters( N );
for( uint64_t &c : counters )
c = uid( mt );
auto testPar = [&]<size_t Par>() mutable -> double
{
uint64_t prevs[Par] = { 0 };
auto start = high_resolution_clock::now();
auto unroll = []<size_t ... Indices, typename Fn>(
index_sequence<Indices ...>, Fn fn )
{
(fn.template operator ()<Indices>(), ...);
};
for( size_t r = ROUNDS; r--; )
for( size_t i = 0; i + Par < N; i += Par )
unroll( make_index_sequence<Par>(),
[&]<size_t I>()
{
prevs[I] = (counters[i + I] ^ prevs[I]) % mod;
} );
double ns = duration_cast<nanoseconds>(
high_resolution_clock::now() - start ).count() / ((double)ROUNDS * N);
uint64_t sumPrevs = 0;
unroll( make_index_sequence<Par>(), [&]<size_t I>() {
sumPrevs += prevs[I]; } );
sum = sumPrevs;
return ns;
};
using test_lambda_t = remove_reference_t<decltype(testPar)>;
static struct
{
size_t par;
double (test_lambda_t::*fn)();
} const tests[] =
{
{ 1, &test_lambda_t::template operator ()<1> },
{ 2, &test_lambda_t::template operator ()<2> },
{ 3, &test_lambda_t::template operator ()<3> },
{ 4, &test_lambda_t::template operator ()<4> }
};
for( auto const &test : tests )
cout << test.par << ": " << (testPar.*test.fn)() << endl;
}

Show me your results (-std:c++20 / -std=c++20) !

James Taylor

unread,

Mar 25, 2023, 9:32:16 PM3/25/23

to

On 25/03/2023 14:14, Bonita Montero wrote:
>
> Show me your results (-std:c++20 / -std=c++20) !
>
>

Sure if you can post a program that compiles in VS 2022 - 17.5.3

Bonita Montero

unread,

Mar 25, 2023, 11:05:34 PM3/25/23

to

Am 26.03.2023 um 03:30 schrieb James Taylor:

> Sure if you can post a program that compiles in VS 2022 - 17.5.3

It's C++20. Maybe this works :

#include <iostream>
#include <random>
#include <chrono>
#include <vector>

#include <utility>
#include <atomic>

using namespace std;
using namespace chrono;

atomic_uint64_t aSum, aDenom = 3;

int main()
{

constexpr size_t
N = 0x400 / sizeof(uint64_t),
ROUNDS = 1'000'000;
mt19937_64 mt;

uniform_int_distribution<uint64_t> uid( (uint64_t)1 << 63, -1 );

vector<uint64_t> counters( N );
for( uint64_t &c : counters )
c = uid( mt );

auto unroll = []<size_t ... Indices, typename Fn>(
index_sequence<Indices ...>, Fn fn )
{
(fn.template operator ()<Indices>(), ...);
};

auto testPar = [&]<size_t Par>( integral_constant<size_t, Par> ) -> double

{
uint64_t prevs[Par] = { 0 };

auto seq = make_index_sequence<Par>();

auto start = high_resolution_clock::now();

uint64_t denom = ::aDenom.load( memory_order::relaxed );

for( size_t r = ROUNDS; r--; )
for( size_t i = 0; i + Par < N; i += Par )

unroll( seq, [&]<size_t I>() { prevs[I] = (counters[i + I] ^
prevs[I]) % denom; } );

double ns = duration_cast<nanoseconds>( high_resolution_clock::now() -
start ).count() / ((double)ROUNDS * N);

uint64_t sum = 0;
unroll( seq, [&]<size_t I>() { sum += prevs[I]; } );
::aSum.store( sum, memory_order::relaxed );
return ns;
};
unroll( make_index_sequence<4>(),
[&]<size_t I>()
{
cout << I << ": " << testPar( integral_constant<size_t, I + 1>() ) <<
endl;
} );
}