I was researching a bit to get a contention-free LRU list that scales
linearly with the number of processors. That work is finished now and I
have a nearly perfect solution for the problem. Afterwards I wondered
what the cost of cache-line swapping between cores might be, so I wrote
a little program:
#if defined(_MSC_VER)
    #include <Windows.h>
#elif defined(__unix__)
    #include <sys/sysinfo.h>
    #include <sched.h>
    #include <pthread.h>
#endif
#include <iostream>
#include <thread>
#include <cstddef>
#include <cstdint>
#include <atomic>
#include <functional>
#include <chrono>
#include <vector>
#include <cstdlib>
#include <cmath>
#include <array>

unsigned getNumberOfProcessors();

using namespace std;
using namespace chrono;

int main( int argc, char **argv )
{
    if( argc < 2 )
        return -1;
    // argv[1] is the base frequency in GHz; despite the name this is
    // the length of one clock cycle in seconds
    double nsPerClockCycle = 1.0 / (atof( argv[1] ) * 1.0e9);
    // worker that increments the shared counter with fetch_add (LOCK XADD on x86)
    auto thrXadd = []( uint8_t volatile &run, size_t adds,
                       atomic<uint64_t> &atm, atomic<size_t> &misses )
    {
        while( !run );
        for( size_t i = 0; i != adds; ++i )
            atm.fetch_add( 1 );
    };
    // worker that increments the shared counter with a CMPXCHG retry loop,
    // counting failed attempts; each iteration also does one extra fetch_add
    auto thrXchg = []( uint8_t volatile &run, size_t adds,
                       atomic<uint64_t> &atm, atomic<size_t> &misses )
    {
        while( !run );
        size_t missed = 0;
        for( size_t i = 0; i != adds; ++i )
        {
            uint64_t v;
            for( ; ; )
            {
                v = (uint64_t)atm;
                if( atm.compare_exchange_weak( v, v + 1 ) )
                    break;
                else
                    ++missed;
            }
            atm.fetch_add( 1 );
        }
        misses.fetch_add( missed );
    };
    using threadfunc = void (*)( uint8_t volatile &, size_t,
                                 atomic<uint64_t> &, atomic<size_t> & );
    array<threadfunc, 2> atf;
    array<char const *, 2> threadDescr;
    size_t const ADDS = 10'000'000;
    unsigned nProcessors = getNumberOfProcessors();
    atf[0] = thrXadd;
    atf[1] = thrXchg;
    threadDescr[0] = "xadd-thread";
    threadDescr[1] = "cmpxchg-thread";
    for( size_t m = 0; m != atf.size(); ++m )
    {
        cout << threadDescr[m] << ":" << endl;
        for( unsigned nThreads = 1; nThreads <= nProcessors; ++nThreads )
        {
            atomic<size_t> misses( 0 );
            uint8_t run = false;
            atomic<uint64_t> atm( 0 );
            vector<thread> threads;
            for( unsigned i = 0; i != nThreads; ++i )
            {
                threads.emplace_back( atf[m], ref( run ), ADDS, ref( atm ), ref( misses ) );
                // pin each thread to its own logical core
#if defined(_MSC_VER)
                SetThreadAffinityMask( threads[i].native_handle(), (DWORD_PTR)1 << i );
#elif defined(__unix__)
                cpu_set_t cpuset;
                CPU_ZERO( &cpuset );
                CPU_SET( i, &cpuset );
                pthread_setaffinity_np( threads[i].native_handle(), sizeof cpuset, &cpuset );
#endif
            }
            time_point<high_resolution_clock> start = high_resolution_clock::now();
            run = true;
            for( unsigned i = 0; i != nThreads; ++i )
                threads[i].join();
            uint64_t ns = (uint64_t)duration_cast<nanoseconds>(
                high_resolution_clock::now() - start ).count();
            // wall-clock seconds per increment, counting the increments of all threads
            double nsPerAdd = (double)ns / nThreads / ADDS / 1.0e9;
            cout << "threads: " << nThreads << " cycles: " << nsPerAdd / nsPerClockCycle
                 << " misses-ratio: " << (int)(100.0 * (size_t)misses / nThreads / ADDS)
                 << "%" << endl;
        }
        cout << endl;
    }
}

unsigned getNumberOfProcessors()
{
#if defined(_MSC_VER)
    SYSTEM_INFO si;
    GetSystemInfo( &si );
    return (unsigned)si.dwNumberOfProcessors;
#elif defined(__unix__)
    return (unsigned)get_nprocs();
#endif
}
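By the way, if you don't need the OS-specific calls,
std::thread::hardware_concurrency() reports the same number portably.
A minimal sketch of such a replacement (the name and the fallback to 1
are my own, since hardware_concurrency() may return 0 when the value
isn't computable):

unsigned getNumberOfProcessorsPortable()
{
    // may legitimately return 0 if the number of hardware threads is unknown
    unsigned n = std::thread::hardware_concurrency();
    return n ? n : 1;
}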
(I tested the above program on Windows and on the Windows Subsystem for
Linux. Under Windows I can see in Process Explorer how the threads
spread over the cores. But under WSL / Linux the threads don't get
their proper affinity through pthread_setaffinity_np, although it
always returns zero, i.e. success. Can anyone verify this behaviour on
native Linux?)
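One way to check whether the affinity really took effect would be to
read the mask back after setting it, and to call sched_getcpu() inside
the worker to see where the thread actually runs. A minimal sketch for
the read-back; verifyAffinity is my own helper name and I haven't tried
it under WSL:

#if defined(__unix__)
#include <pthread.h>
#include <sched.h>

// returns true if the requested CPU is part of the thread's current affinity mask
bool verifyAffinity( pthread_t handle, unsigned cpu )
{
    cpu_set_t check;
    CPU_ZERO( &check );
    if( pthread_getaffinity_np( handle, sizeof check, &check ) != 0 )
        return false;
    return CPU_ISSET( cpu, &check ) != 0;
}
#endif

Calling it with threads[i].native_handle() and i right after the
pthread_setaffinity_np call would at least show whether the kernel
stored the mask; printing sched_getcpu() from inside the worker shows
on which CPU the thread really ends up.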
You must give the base frequency of your processor in GHz as the only
command-line parameter.
I have run the numbers for my Ryzen 7 1800X:
xadd-thread:
threads: 1 cycles: 24.7371 misses-ratio: 0%
threads: 2 cycles: 30.3089 misses-ratio: 0%
threads: 3 cycles: 38.7209 misses-ratio: 0%
threads: 4 cycles: 45.6885 misses-ratio: 0%
threads: 5 cycles: 45.7617 misses-ratio: 0%
threads: 6 cycles: 47.0349 misses-ratio: 0%
threads: 7 cycles: 45.5177 misses-ratio: 0%
threads: 8 cycles: 46.274 misses-ratio: 0%
threads: 9 cycles: 48.4246 misses-ratio: 0%
threads: 10 cycles: 49.6137 misses-ratio: 0%
threads: 11 cycles: 54.3124 misses-ratio: 0%
threads: 12 cycles: 57.5544 misses-ratio: 0%
threads: 13 cycles: 60.3801 misses-ratio: 0%
threads: 14 cycles: 61.889 misses-ratio: 0%
threads: 15 cycles: 62.8714 misses-ratio: 0%
threads: 16 cycles: 64.9949 misses-ratio: 0%
cmpxchg-thread:
threads: 1 cycles: 48.1681 misses-ratio: 0%
threads: 2 cycles: 98.0831 misses-ratio: 150%
threads: 3 cycles: 121.076 misses-ratio: 150%
threads: 4 cycles: 190.618 misses-ratio: 252%
threads: 5 cycles: 190.002 misses-ratio: 244%
threads: 6 cycles: 300.953 misses-ratio: 433%
threads: 7 cycles: 293.362 misses-ratio: 416%
threads: 8 cycles: 417.765 misses-ratio: 665%
threads: 9 cycles: 387.265 misses-ratio: 596%
threads: 10 cycles: 387.103 misses-ratio: 550%
threads: 11 cycles: 373.895 misses-ratio: 503%
threads: 12 cycles: 391.558 misses-ratio: 484%
threads: 13 cycles: 384.176 misses-ratio: 462%
threads: 14 cycles: 405.095 misses-ratio: 440%
threads: 15 cycles: 387.216 misses-ratio: 411%
threads: 16 cycles: 454.743 misses-ratio: 475%
So here are both results as a graph:
https://ibin.co/4vykmbJ5OGpw.png
So 24 clock cycles is the single-threaded value here; for comparison, a
single memory access that hits the L1 cache takes about 4 clock cycles.
Under full contention the values rise by about 40 clock cycles on my
Ryzen. That's pretty good if you consider that the Ryzens have this
brain-damaged CCX concept, where the cores are grouped in clusters of
four and the clusters are connected by a slower interconnect than the
cores within each cluster.
But the CMPXCHG solution, which essentially does the same thing, really
hurts. First, even a single thread has only about half the throughput
of the XADD solution. Then the CMPXCHG variant climbs to about 450
clock cycles on average per successful increment when it is contended
by all 16 threads on the CPU. The misses-ratio gives the number of
failed CMPXCHGs per successful one. So with 16 threads a misses-ratio
of about 475% means almost five failed attempts, i.e. almost six
CMPXCHGs in total, for every increment that succeeds; that's really bad.
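If you're stuck with a CMPXCHG loop for some reason, the usual
mitigation is to give the core a pause hint between failed attempts
instead of immediately hammering the contended cache line again. A
minimal sketch, assuming x86 and the _mm_pause() intrinsic; casIncrement
is my own name and I haven't run it through the benchmark above:

#include <atomic>
#include <cstdint>
#if defined(_MSC_VER)
    #include <intrin.h>
#else
    #include <immintrin.h>
#endif

// increment via a CMPXCHG loop; compare_exchange_weak reloads the current
// value into v on failure, and _mm_pause() backs off before the next attempt
inline void casIncrement( std::atomic<uint64_t> &atm )
{
    uint64_t v = atm.load( std::memory_order_relaxed );
    while( !atm.compare_exchange_weak( v, v + 1 ) )
        _mm_pause();
}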