Here's my code:
#define NOMINMAX
#if defined(_MSC_VER)
#include <Windows.h>
#endif
#include <iostream>
#include <chrono>
#include <vector>
#include <random>
#include <limits>
#include <immintrin.h>
using namespace std;
using namespace chrono;
/// Benchmark kernel for the 128-bit packed-double divide (_mm_div_pd).
///
/// Performs `rounds` complete passes over the arrays; in every pass each of
/// the `n` elements of `results` is overwritten with
/// `dividends[i] / divisors[i]` (two doubles per element).
///
/// @param rounds     number of full passes over the arrays
/// @param results    output array with at least `n` elements
/// @param dividends  input array with at least `n` elements
/// @param divisors   input array with at least `n` elements
/// @param n          number of vector elements processed per pass
/// @return total number of divide intrinsics issued, i.e. `rounds * n`
uint64_t DIVPD( uint64_t rounds, __m128d *results, __m128d *dividends,
                __m128d *divisors, size_t n )
{
    uint64_t passesLeft = rounds;
    while( passesLeft != 0 )
    {
        for( size_t idx = 0; idx < n; ++idx )
        {
            __m128d const quotient = _mm_div_pd( dividends[idx], divisors[idx] );
            results[idx] = quotient;
        }
        --passesLeft;
    }
    return rounds * n;
}
uint64_t VDIVPD( uint64_t rounds, __m256d *results, __m256d *dividends,
__m256d *divisors, size_t n )
{
for( uint64_t r = rounds; r; --r )
for( size_t i = 0; i != n; ++i )
results[i] = _mm256_div_pd( dividends[i], divisors[i] );
return rounds * n;
}
/// Benchmark driver: fills the operand arrays with random doubles, times the
/// 128-bit (DIVPD) and the 256-bit (VDIVPD) divide kernels, and prints for each
/// the average number of nanoseconds per divide intrinsic (one line per kernel,
/// DIVPD first).
int main()
{
#if defined(_MSC_VER)
    // Raise scheduling priority so other processes disturb the timing less.
    SetPriorityClass( GetCurrentProcess(), HIGH_PRIORITY_CLASS );
    SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_HIGHEST );
#endif
    std::uint64_t const ROUNDS = 1'000'000;
    std::size_t const SIZE = 1'000;

    std::random_device rd;
    std::normal_distribution<double> nd( 0.0,
        static_cast<double>( std::numeric_limits<std::int64_t>::max() ) / 2 );

    std::vector<__m128d> fResults( SIZE ), fDividends( SIZE ), fDivisors( SIZE );
    std::vector<__m256d> dResults( SIZE ), dDividends( SIZE ), dDivisors( SIZE );

    for( std::size_t i = 0; i != SIZE; ++i )
    {
        // Draw the values into named locals first so the order of the random
        // draws is well defined (function-argument evaluation order is not).
        double const v0 = nd( rd );
        double const v1 = nd( rd );
        double const v2 = nd( rd );
        double const v3 = nd( rd );
        // Use the set intrinsics instead of the MSVC-only .m128d_f64 /
        // .m256d_f64 union members so the code also builds with GCC/Clang.
        // The "setr" variants take the lowest lane first, matching the
        // original element indices [0], [1], ...
        fDividends[i] = _mm_setr_pd( v0, v1 );
        fDivisors[i]  = _mm_setr_pd( v2, v3 );
        dDividends[i] = _mm256_setr_pd( v0, v1, v2, v3 );

        double const d0 = nd( rd );
        double const d1 = nd( rd );
        double const d2 = nd( rd );
        double const d3 = nd( rd );
        dDivisors[i] = _mm256_setr_pd( d0, d1, d2, d3 );
    }

    // steady_clock is monotonic, which is what interval measurement needs;
    // high_resolution_clock may alias the adjustable system clock.
    using Clock = std::chrono::steady_clock;

    Clock::time_point start = Clock::now();
    std::uint64_t ops = DIVPD( ROUNDS, fResults.data(), fDividends.data(),
                               fDivisors.data(), SIZE );
    std::uint64_t ns = static_cast<std::uint64_t>(
        std::chrono::duration_cast<std::chrono::nanoseconds>(
            Clock::now() - start ).count() );
    // endl (with its flush) is intentional: show the first result before the
    // second, equally long, measurement starts.
    std::cout << static_cast<double>( ns ) / ops << std::endl;

    start = Clock::now();
    ops = VDIVPD( ROUNDS, dResults.data(), dDividends.data(),
                  dDivisors.data(), SIZE );
    ns = static_cast<std::uint64_t>(
        std::chrono::duration_cast<std::chrono::nanoseconds>(
            Clock::now() - start ).count() );
    std::cout << static_cast<double>( ns ) / ops << std::endl;
}
On my 1800X, VDIVPD is exactly half as fast as DIVPD (each instruction takes
twice as long) because a 256-bit AVX data word is pumped through a single 128-bit FPU.