I've written a little program that tests the throughput of fetch_add
on an increasing number of processors in your system and, if you choose
it, it prints the throughput of compare_exchange_weak. On my Ryzen
Threadripper 3990X (Zen 2) / Windows 10 (SMT disabled) the fetch_add
timings are linear with an increasing number of threads, and the
compare_exchange_weak timings are linear in the beginning but become
exponential at the end.
I'd like to see your results:
#include <iostream>
#include <cstring>
#include <atomic>
#include <charconv>
#include <thread>
#include <vector>
#include <semaphore>
#include <chrono>
#include <algorithm>
#include <functional>
using namespace std;
using namespace chrono;
// Benchmark: measures the average per-operation latency of an atomic
// fetch_add (default) or of a compare_exchange_weak CAS loop (first
// argument "xchg") while an increasing number of threads hammer one
// shared 64-bit atomic counter.
//
// Usage:
//   a.out [xchg] <n-threads>
//   a.out [xchg] <from-threads> <to-threads>
// For each thread count it prints the average nanoseconds per atomic op.
// Returns EXIT_FAILURE on malformed command-line arguments.
int main( int argc, char **argv )
{
    if( argc < 2 )
        return EXIT_FAILURE;
    // An optional leading "xchg" selects the CAS benchmark; treating the
    // bool as 0/1 shifts all following positional arguments by one.
    bool xchg = strcmp( argv[1], "xchg" ) == 0;
    if( argc - xchg < 2 )
        return EXIT_FAILURE;
    // Parses a complete decimal string; (unsigned)-1 (UINT_MAX) signals failure.
    auto parseValue = []( char const *str ) -> unsigned
    {
        unsigned value;
        from_chars_result fcr = from_chars( str, str + strlen( str ), value );
        // reject both parse errors and trailing garbage after the number
        if( fcr.ec != errc() || *fcr.ptr )
            return -1;
        return value;
    };
    unsigned fromThreads, toThreads;
    if( argc - xchg == 2 )
    {
        // single count given: benchmark exactly that many threads
        if( (fromThreads = toThreads = parseValue( argv[1 + xchg] )) == (unsigned)-1 )
            return EXIT_FAILURE;
    }
    else
    {
        // range given: benchmark every count from fromThreads to toThreads
        if( (fromThreads = parseValue( argv[1 + xchg] )) == (unsigned)-1
            || (toThreads = parseValue( argv[2 + xchg] )) == (unsigned)-1 )
            return EXIT_FAILURE;
    }
    // Clamp both bounds to the hardware thread count; 0 means "unknown",
    // in which case the user-supplied values are kept as-is.
    unsigned hc = thread::hardware_concurrency();
    hc = hc ? hc : toThreads;
    toThreads = min( toThreads, hc );
    fromThreads = min( fromThreads, hc );
    if( fromThreads > toThreads )
        swap( fromThreads, toThreads );
    for( unsigned nThreads = fromThreads; nThreads <= toThreads; ++nThreads )
    {
        atomic_uint readyCountDown( nThreads ); // workers not yet at the start line
        binary_semaphore semReady( 0 );         // last ready worker signals main
        counting_semaphore semRun( 0 );         // main releases all workers at once
        atomic_uint synch( nThreads );          // spin barrier for a tight start
        atomic_uint64_t aui64( 0 );             // the shared counter under contention
        atomic_uint64_t nsSum( 0 );             // summed wall time of all workers
        auto theThread = [&]( function<void()> &addFn, size_t n )
        {
            // Report readiness; the last arriving worker wakes the main thread.
            if( readyCountDown.fetch_sub( 1, memory_order_relaxed ) == 1 )
                semReady.release();
            semRun.acquire();
            // Busy-wait barrier so all workers enter the timed loop together.
            if( synch.fetch_sub( 1, memory_order_relaxed ) != 1 )
                while( synch.load( memory_order_relaxed ) );
            // steady_clock: monotonic, the right clock for interval timing.
            auto start = steady_clock::now();
            for( ; n; --n )
                addFn();
            nsSum.fetch_add( (int64_t)duration_cast<nanoseconds>(
                steady_clock::now() - start ).count(), memory_order_relaxed );
        };
        vector<jthread> threads;
        threads.reserve( nThreads );
        static size_t const TURNS = 10'000'000;
        auto fetchAddFn = [&]() { aui64.fetch_add( 1, memory_order_relaxed ); };
        auto cmpXchgFn = [&]()
        {
            // Emulate fetch_add with a CAS retry loop; compare_exchange_weak
            // reloads the current value into ref on failure.
            uint64_t ref = aui64.load( memory_order_relaxed );
            while( !aui64.compare_exchange_weak( ref, ref + 1,
                memory_order_relaxed ) );
        };
        function<void()> xchgFn;
        if( !xchg )
            xchgFn = fetchAddFn;
        else
            xchgFn = cmpXchgFn;
        for( unsigned t = 0; t != nThreads; ++t )
            // std::ref is required: the thread constructor decay-copies its
            // arguments, but theThread takes the functor by lvalue reference.
            threads.emplace_back( theThread, ref( xchgFn ), TURNS );
        semReady.acquire();         // wait until every worker is parked on semRun
        semRun.release( nThreads ); // fire the starting gun for all workers
        // Join explicitly: nsSum may only be read after all workers finished;
        // the jthread destructors would join too late, after the output below.
        for( jthread &thr : threads )
            thr.join();
        double ns = (double)(int64_t)nsSum.load( memory_order_relaxed );
        ns = ns / ((double)TURNS * (int)nThreads);
        cout << ns << endl;
    }
}
The timings are important for every kind of synchronization on your PC.
The program can be called like this:
./a.out <n-threads>                      - tests fetch_add with n-threads
./a.out <from-threads> <to-threads>      - tests fetch_add ranging from
                                           from-threads to to-threads
./a.out xchg <n-threads>                 - tests compare_exchange_weak with
                                           n-threads
./a.out xchg <from-threads> <to-threads> - tests compare_exchange_weak
                                           ranging from from-threads to
                                           to-threads