I was researching a bit to get a contention-free LRU list that scales
linearly with the number of processors. That work is finished now and I
have a nearly perfect solution for the problem. Afterwards I wondered
what the cost of cache-line swapping between cores might be, so I wrote
a little program:
#if defined(_MSC_VER)
    #include <Windows.h>
#elif defined(__unix__)
    #include <sys/sysinfo.h>
    #include <sched.h>
    #include <pthread.h>
#endif
#include <iostream>
#include <thread>
#include <cstddef>
#include <cstdint>
#include <atomic>
#include <functional>
#include <chrono>
#include <vector>
#include <cstdlib>
#include <cmath>
#include <array>

unsigned getNumberOfProcessors();

using namespace std;
using namespace chrono;

int main( int argc, char **argv )
{
    if( argc < 2 )
        return -1;
    // argv[1] is the base frequency in GHz; despite the name this is
    // the length of one clock cycle in seconds
    double nsPerClockCycle = 1.0 / (atof( argv[1] ) * 1.0e9);
    // worker that increments the shared counter with fetch_add (LOCK XADD on x86)
    auto thrXadd = []( uint8_t volatile &run, size_t adds,
                       atomic<uint64_t> &atm, atomic<size_t> &misses )
    {
        while( !run );
        for( size_t i = 0; i != adds; ++i )
            atm.fetch_add( 1 );
    };
    // worker that increments the shared counter with a CMPXCHG retry loop,
    // counting failed attempts; each iteration also does one extra fetch_add
    auto thrXchg = []( uint8_t volatile &run, size_t adds,
                       atomic<uint64_t> &atm, atomic<size_t> &misses )
    {
        while( !run );
        size_t missed = 0;
        for( size_t i = 0; i != adds; ++i )
        {
            uint64_t v;
            for( ; ; )
            {
                v = (uint64_t)atm;
                if( atm.compare_exchange_weak( v, v + 1 ) )
                    break;
                else
                    ++missed;
            }
            atm.fetch_add( 1 );
        }
        misses.fetch_add( missed );
    };
    using threadfunc = void (*)( uint8_t volatile &, size_t,
                                 atomic<uint64_t> &, atomic<size_t> & );
    array<threadfunc, 2> atf;
    array<char const *, 2> threadDescr;
    size_t const ADDS = 10'000'000;
    unsigned nProcessors = getNumberOfProcessors();
    atf[0] = thrXadd;
    atf[1] = thrXchg;
    threadDescr[0] = "xadd-thread";
    threadDescr[1] = "cmpxchg-thread";
    for( size_t m = 0; m != atf.size(); ++m )
    {
        cout << threadDescr[m] << ":" << endl;
        for( unsigned nThreads = 1; nThreads <= nProcessors; ++nThreads )
        {
            atomic<size_t> misses( 0 );
            uint8_t run = false;
            atomic<uint64_t> atm( 0 );
            vector<thread> threads;
            for( unsigned i = 0; i != nThreads; ++i )
            {
                threads.emplace_back( atf[m], ref( run ), ADDS, ref( atm ), ref( misses ) );
                // pin each thread to its own logical core
#if defined(_MSC_VER)
                SetThreadAffinityMask( threads[i].native_handle(), (DWORD_PTR)1 << i );
#elif defined(__unix__)
                cpu_set_t cpuset;
                CPU_ZERO( &cpuset );
                CPU_SET( i, &cpuset );
                pthread_setaffinity_np( threads[i].native_handle(), sizeof cpuset, &cpuset );
#endif
            }
            time_point<high_resolution_clock> start = high_resolution_clock::now();
            run = true;
            for( unsigned i = 0; i != nThreads; ++i )
                threads[i].join();
            uint64_t ns = (uint64_t)duration_cast<nanoseconds>(
                high_resolution_clock::now() - start ).count();
            // wall-clock seconds per increment, counting the increments of all threads
            double nsPerAdd = (double)ns / nThreads / ADDS / 1.0e9;
            cout << "threads: " << nThreads << " cycles: " << nsPerAdd / nsPerClockCycle
                 << " misses-ratio: " << (int)(100.0 * (size_t)misses / nThreads / ADDS)
                 << "%" << endl;
        }
        cout << endl;
    }
}

unsigned getNumberOfProcessors()
{
#if defined(_MSC_VER)
    SYSTEM_INFO si;
    GetSystemInfo( &si );
    return (unsigned)si.dwNumberOfProcessors;
#elif defined(__unix__)
    return (unsigned)get_nprocs();
#endif
}
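By the way, if you don't need the OS-specific calls,
std::thread::hardware_concurrency() reports the same number portably.
A minimal sketch of such a replacement (the name and the fallback to 1
are my own, since hardware_concurrency() may return 0 when the value
isn't computable):

unsigned getNumberOfProcessorsPortable()
{
    // may legitimately return 0 if the number of hardware threads is unknown
    unsigned n = std::thread::hardware_concurrency();
    return n ? n : 1;
}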
(I tested the above program on Windows and on the Windows Subsystem for
Linux. Under Windows I can see in Process Explorer how the threads
spread over the cores. But under WSL / Linux the threads don't get
their proper affinity through pthread_setaffinity_np, although it
always returns zero, i.e. success. Can anyone verify this behaviour on
native Linux?)
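One way to check whether the affinity really took effect would be to
read the mask back after setting it, and to call sched_getcpu() inside
the worker to see where the thread actually runs. A minimal sketch for
the read-back; verifyAffinity is my own helper name and I haven't tried
it under WSL:

#if defined(__unix__)
#include <pthread.h>
#include <sched.h>

// returns true if the requested CPU is part of the thread's current affinity mask
bool verifyAffinity( pthread_t handle, unsigned cpu )
{
    cpu_set_t check;
    CPU_ZERO( &check );
    if( pthread_getaffinity_np( handle, sizeof check, &check ) != 0 )
        return false;
    return CPU_ISSET( cpu, &check ) != 0;
}
#endif

Calling it with threads[i].native_handle() and i right after the
pthread_setaffinity_np call would at least show whether the kernel
stored the mask; printing sched_getcpu() from inside the worker shows
on which CPU the thread really ends up.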
You must give the base frequency of your processor in GHz as the only
command-line parameter.
I have run the numbers for my Ryzen 7 1800X:
xadd-thread:
threads: 1 cycles: 24.7371 misses-ratio: 0%
threads: 2 cycles: 30.3089 misses-ratio: 0%
threads: 3 cycles: 38.7209 misses-ratio: 0%
threads: 4 cycles: 45.6885 misses-ratio: 0%
threads: 5 cycles: 45.7617 misses-ratio: 0%
threads: 6 cycles: 47.0349 misses-ratio: 0%
threads: 7 cycles: 45.5177 misses-ratio: 0%
threads: 8 cycles: 46.274 misses-ratio: 0%
threads: 9 cycles: 48.4246 misses-ratio: 0%
threads: 10 cycles: 49.6137 misses-ratio: 0%
threads: 11 cycles: 54.3124 misses-ratio: 0%
threads: 12 cycles: 57.5544 misses-ratio: 0%
threads: 13 cycles: 60.3801 misses-ratio: 0%
threads: 14 cycles: 61.889 misses-ratio: 0%
threads: 15 cycles: 62.8714 misses-ratio: 0%
threads: 16 cycles: 64.9949 misses-ratio: 0%
cmpxchg-thread:
threads: 1 cycles: 48.1681 misses-ratio: 0%
threads: 2 cycles: 98.0831 misses-ratio: 150%
threads: 3 cycles: 121.076 misses-ratio: 150%
threads: 4 cycles: 190.618 misses-ratio: 252%
threads: 5 cycles: 190.002 misses-ratio: 244%
threads: 6 cycles: 300.953 misses-ratio: 433%
threads: 7 cycles: 293.362 misses-ratio: 416%
threads: 8 cycles: 417.765 misses-ratio: 665%
threads: 9 cycles: 387.265 misses-ratio: 596%
threads: 10 cycles: 387.103 misses-ratio: 550%
threads: 11 cycles: 373.895 misses-ratio: 503%
threads: 12 cycles: 391.558 misses-ratio: 484%
threads: 13 cycles: 384.176 misses-ratio: 462%
threads: 14 cycles: 405.095 misses-ratio: 440%
threads: 15 cycles: 387.216 misses-ratio: 411%
threads: 16 cycles: 454.743 misses-ratio: 475%
So here are both results as a graph:
https://ibin.co/4vykmbJ5OGpw.png
So 24 clock cycles is the single-threaded value here; for comparison, a
single memory access that hits the L1 cache takes about 4 clock cycles.
Under full contention the values rise by about 40 clock cycles on my
Ryzen. That's pretty good if you consider that the Ryzens have this
brain-damaged CCX concept, where the cores are grouped in clusters of
four and the clusters are connected by a slower interconnect than the
cores within each cluster.
But the CMPXCHG solution, which essentially does the same thing, really
hurts. First, even a single thread has only about half the throughput
of the XADD solution. Then the CMPXCHG variant climbs to about 450
clock cycles on average per successful increment when it is contended
by all 16 threads on the CPU. The misses-ratio gives the number of
failed CMPXCHGs per successful one. So with 16 threads a misses-ratio
of about 475% means almost five failed attempts, i.e. almost six
CMPXCHGs in total, for every increment that succeeds; that's really bad.
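If you're stuck with a CMPXCHG loop for some reason, the usual
mitigation is to give the core a pause hint between failed attempts
instead of immediately hammering the contended cache line again. A
minimal sketch, assuming x86 and the _mm_pause() intrinsic; casIncrement
is my own name and I haven't run it through the benchmark above:

#include <atomic>
#include <cstdint>
#if defined(_MSC_VER)
    #include <intrin.h>
#else
    #include <immintrin.h>
#endif

// increment via a CMPXCHG loop; compare_exchange_weak reloads the current
// value into v on failure, and _mm_pause() backs off before the next attempt
inline void casIncrement( std::atomic<uint64_t> &atm )
{
    uint64_t v = atm.load( std::memory_order_relaxed );
    while( !atm.compare_exchange_weak( v, v + 1 ) )
        _mm_pause();
}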