> I could redesign my "cacheline ping-pong" code to use TSX-RTM without
> having tested this on my PC; so could anyone here run the numbers on
> that?
So here's the code:
#if defined(_MSC_VER)
#include <Windows.h>
#include <intrin.h>
#elif defined(__unix__)
#include <sys/sysinfo.h>
#include <sched.h>
#include <pthread.h>
#include <immintrin.h>
#endif
#include <iostream>
#include <thread>
#include <cstddef>
#include <atomic>
#include <functional>
#include <chrono>
#include <vector>
#include <cstdlib>
#include <cmath>
#include <array>
unsigned getNumberOfProcessors();
bool hasTSX();
using namespace std;
using namespace chrono;
// atomic fetch-and-add ("lock xadd" on x86)
inline
size_t fetchAdd( size_t volatile &v, size_t a )
{
#if defined(_MSC_VER)
    #if defined(_M_X64)
    return (size_t)_InterlockedExchangeAdd64( &(__int64 &)v, (__int64)a );
    #elif defined(_M_IX86)
    return (size_t)_InterlockedExchangeAdd( &(long &)v, (long)a );
    #else
        #error unsupported architecture
    #endif
#elif defined(__GNUC__) || defined(__clang__)
    return __sync_fetch_and_add( &v, a );
#else
    #error unsupported architecture
#endif
}
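// Aside, for illustration only (not used by the benchmark): a portable
// std::atomic version compiles to the same "lock xadd" on x86; the name
// fetchAddStd is mine.
inline
size_t fetchAddStd( atomic<size_t> &v, size_t a )
{
    return v.fetch_add( a );
}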
// atomic compare-and-swap: returns the previous value of v
inline
size_t compareExchange( size_t volatile &v, size_t c, size_t x )
{
#if defined(_MSC_VER)
    #if defined(_M_X64)
    return (size_t)_InterlockedCompareExchange64( &(__int64 &)v,
        (__int64)x, (__int64)c );
    #elif defined(_M_IX86)
    return (size_t)_InterlockedCompareExchange( &(long &)v, (long)x,
        (long)c );
    #else
        #error unsupported architecture
    #endif
#elif defined(__GNUC__) || defined(__clang__)
    return __sync_val_compare_and_swap( &v, c, x );
#else
    #error unsupported architecture
#endif
}
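// Likewise for illustration only: the CAS-increment retry loop that
// thrXchg builds below, written with std::atomic; compare_exchange_weak
// reloads cmp with the current value on failure. casAdd is my name.
inline
size_t casAdd( atomic<size_t> &v, size_t a )
{
    size_t cmp = v.load();
    while( !v.compare_exchange_weak( cmp, cmp + a ) );
    return cmp;
}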
// transactional fetch-and-add; on abort control returns from _xbegin()
// with a status != _XBEGIN_STARTED, and _xend() must only execute inside
// a transaction, so check the status and retry
inline
void rtmFetchAdd( size_t volatile &v, size_t a )
{
    for( ; ; )
        if( _xbegin() == _XBEGIN_STARTED )
        {
            v += a;
            _xend();
            return;
        }
}
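// RTM gives no forward-progress guarantee, so a transaction can abort
// forever under heavy contention; production code would bound the
// retries and fall back to the locked primitive. A sketch (hypothetical
// helper, not used by the benchmark):
inline
void rtmFetchAddWithFallback( size_t volatile &v, size_t a )
{
    for( int tries = 10; tries; --tries )
        if( _xbegin() == _XBEGIN_STARTED )
        {
            v += a;
            _xend();
            return;
        }
    fetchAdd( v, a ); // guaranteed progress
}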
int main( int argc, char **argv )
{
    if( argc < 2 )
        return -1;
    // argv[1]: CPU clock frequency in GHz
    double secondsPerClockCycle = 1.0 / (atof( argv[1] ) * 1.0e9);
    auto thrXadd = []( uint8_t volatile &run, size_t adds,
        size_t volatile &atm, atomic<size_t> &misses )
    {
        while( !run );
        for( size_t i = adds; i; --i )
            fetchAdd( atm, 1 );
    };
    auto thrXchg = []( uint8_t volatile &run, size_t adds,
        size_t volatile &atm, atomic<size_t> &misses )
    {
        while( !run );
        size_t missed = 0;
        for( size_t i = adds, cmp = atm; i; --i )
        {
            for( size_t res; ; )
                if( (res = compareExchange( atm, cmp, cmp + 1 )) == cmp )
                {
                    ++cmp;
                    break;
                }
                else
                {
                    cmp = res;
                    ++missed;
                }
        }
        misses.fetch_add( missed );
    };
    auto rtmAdd = []( uint8_t volatile &run, size_t adds,
        size_t volatile &atm, atomic<size_t> &misses )
    {
        while( !run );
        for( size_t i = adds; i; --i )
            rtmFetchAdd( atm, 1 );
    };
    using threadfunc = void (*)( uint8_t volatile &, size_t,
        size_t volatile &, atomic<size_t> & );
    array<threadfunc, 3> atf;
    array<char const *, 3> threadDescr;
    size_t nTests;
    size_t const ADDS = 10'000'000;
    unsigned nProcessors = getNumberOfProcessors();
    atf[0] = thrXadd;
    atf[1] = thrXchg;
    atf[2] = rtmAdd;
    threadDescr[0] = "xadd-thread";
    threadDescr[1] = "cmpxchg-thread";
    threadDescr[2] = "rtm-thread";
    // skip the RTM test when the CPU has no TSX support
    nTests = hasTSX() ? atf.size() : atf.size() - 1;
    for( size_t m = 0; m != nTests; ++m )
    {
        cout << threadDescr[m] << ":" << endl;
        for( unsigned nThreads = 1; nThreads <= nProcessors; ++nThreads )
        {
            atomic<size_t> misses( 0 );
            uint8_t run = false;
            size_t atm = 0;
            vector<thread> threads;
            for( unsigned i = 0; i != nThreads; ++i )
            {
                threads.emplace_back( atf[m], ref( run ), ADDS,
                    ref( atm ), ref( misses ) );
                // pin each thread to its own logical CPU
#if defined(_MSC_VER)
                SetThreadAffinityMask( threads[i].native_handle(),
                    (DWORD_PTR)1 << i );
#elif defined(__unix__)
                cpu_set_t cpuset;
                CPU_ZERO( &cpuset );
                CPU_SET( i, &cpuset );
                pthread_setaffinity_np( threads[i].native_handle(),
                    sizeof cpuset, &cpuset );
#endif
            }
            time_point<high_resolution_clock> start = high_resolution_clock::now();
            run = true;
            for( unsigned i = 0; i != nThreads; ++i )
                threads[i].join();
            uint64_t ns = (uint64_t)duration_cast<nanoseconds>(
                high_resolution_clock::now() - start ).count();
            double secondsPerAdd = (double)ns / nThreads / ADDS / 1.0e9;
            cout << "threads: " << nThreads
                 << " cycles: " << secondsPerAdd / secondsPerClockCycle
                 << " misses-ratio: "
                 << (int)(100.0 * (size_t)misses / nThreads / ADDS) << "%" << endl;
        }
        cout << endl;
    }
}
unsigned getNumberOfProcessors()
{
#if defined(_MSC_VER)
    SYSTEM_INFO si;
    GetSystemInfo( &si );
    return (unsigned)si.dwNumberOfProcessors;
#elif defined(__unix__)
    return (unsigned)get_nprocs();
#endif
}
bool hasTSX()
{
#if defined(_MSC_VER)
    int regs[4];
    __cpuidex( regs, 7, 0 );
    // CPUID.(EAX=7,ECX=0):EBX bit 11 signals RTM support
    return regs[1] & (1 << 11);
#else
    // no detection here yet; see the note below the code
    return true;
#endif
}
With GCC the code has to be compiled with -mrtm. I was too lazy to code
proper RTM detection for Linux, so on Linux machines the code will crash
without RTM support. But RTM is what I'm after here; so could someone
with a TSX-/RTM-enabled CPU please compile this on their machine and
paste the output here? What I'm interested in is whether rtmFetchAdd()
is faster than compareExchange() or fetchAdd().
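For reference, RTM detection with GCC/Clang could look roughly like the
sketch below, assuming <cpuid.h> with __get_cpuid_count is available
(i.e. a reasonably recent compiler); it is a drop-in replacement for the
MSVC-only branch of hasTSX() above:

#include <cpuid.h>

// CPUID.(EAX=7,ECX=0):EBX bit 11 signals RTM support
bool hasTSX()
{
    unsigned eax, ebx, ecx, edx;
    if( !__get_cpuid_count( 7, 0, &eax, &ebx, &ecx, &edx ) )
        return false;
    return ebx & (1u << 11);
}

A build and run on Linux would then be something like this (file name
and clock frequency are placeholders; the argument is the CPU clock in
GHz):

g++ -O2 -mrtm -pthread pingpong.cpp -o pingpong
./pingpong 3.6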