This modified program shows at which point parallel sorting
becomes more efficient than single-threaded sorting. For my
Zen2-CPU this is 0x1000 elements.
#include <iostream>
#include <algorithm>
#include <execution>
#include <vector>
#include <random>
#include <chrono>
#include <charconv>
#if defined(_MSC_VER)
#pragma warning(disable: 4996)
#endif
using namespace std;
using namespace chrono;
int main( int argc, char **argv )
{
if( argc < 2 )
return EXIT_FAILURE;
size_t upper;
bool b16 = strnicmp( argv[1], "0x", 2 ) == 0;
argv[1] += b16 * 2;
if( from_chars_result fcr = from_chars( argv[1], argv[1] + strlen(
argv[1] ), upper, b16 ? 16 : 10 ); (bool)
fcr.ec || *fcr.ptr )
return EXIT_FAILURE;
vector<size_t> vs;
vs.reserve( upper );
size_t size = 1;
do
{
size = size * 2 <= upper ? size * 2 : upper;
vs.resize( size );
mt19937_64 mt;
uniform_int_distribution<size_t> uid( 0, -1 );
auto perf = [&]( auto fn )
{
for( size_t &s : vs )
s = uid( mt );
auto start = high_resolution_clock::now();
fn();
return duration_cast<nanoseconds>( high_resolution_clock::now() -
start ).count() / 1.0e9;
};
double
tParallel = perf( [&]() { sort( execution::parallel_policy(),
vs.begin(), vs.end() ); } ),
tSerial = perf( [&]() { sort( vs.begin(), vs.end() ); } );
cout << "0x" << hex << size << ": " << tSerial / tParallel << endl;
} while( size < upper );
}