Now, here is a version that uses my own semaphore class:
#if defined(_MSC_VER)
#include <Windows.h>
#elif defined(__unix__)
#include <unistd.h>
#include <sched.h>
#include <sys/resource.h>
#include <sys/types.h>
#endif
#include <iostream>
#include <atomic>
#include <mutex>
#include <condition_variable>
#include <chrono>
#include <cstdlib>
#include <vector>
#include <thread>
#include "semaphore.h"
using namespace std;
using namespace chrono;
// Benchmark entry point.
//
// For every thread count from 1 up to the number of logical processors, this
// starts that many threads, pins each one to its own processor, lets them all
// start together and has each perform ROUNDS increments on one shared atomic
// counter. The increments are done once with a compare-exchange loop
// (printed under "xchg:") and once with fetch_add (printed under "xadd:").
// Per thread count it prints the average nanoseconds per increment and, for
// the compare-exchange variant, the average number of failed attempts per
// successful increment.
//
// On Windows two optional command-line arguments (user name, password) are
// used to log on as, and impersonate, another user before the process
// priority is raised.
//
// Returns EXIT_FAILURE if the processor count cannot be determined.
#if defined(_MSC_VER)
int wmain( int argc, wchar_t **argv )
#else
int main( int argc, char **argv )
#endif
{
#if !defined(_MSC_VER)
    unsigned processors = thread::hardware_concurrency();
    if( !processors )
        return EXIT_FAILURE;
#else
    // first call only asks for the required buffer size
    DWORD dwLength = 0;
    if( !GetLogicalProcessorInformationEx( RelationGroup, nullptr, &dwLength ) &&
        GetLastError() != ERROR_INSUFFICIENT_BUFFER )
        return EXIT_FAILURE;
    vector<char> buf( dwLength );
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pSlpie =
        (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)buf.data();
    if( !GetLogicalProcessorInformationEx( RelationGroup, pSlpie, &dwLength ) )
        return EXIT_FAILURE;
    // total number of active processors over all processor groups
    unsigned processors = 0;
    for( unsigned i = 0; i != pSlpie->Group.ActiveGroupCount; ++i )
        processors += pSlpie->Group.GroupInfo[i].ActiveProcessorCount;
    // maps a global processor index to the index of the processor group
    // that contains it
    auto getProcessorGroup = [pSlpie, processors]( unsigned processor ) -> unsigned
    {
        processor %= processors;
        BYTE pCount;
        unsigned group = 0;
        for( ; group != pSlpie->Group.ActiveGroupCount
               && processor >= (pCount =
                      pSlpie->Group.GroupInfo[group].ActiveProcessorCount);
             ++group, processor -= pCount );
        return group;
    };
#endif
#if defined(_MSC_VER)
    // log on as a different user to gain higher scheduling priorities
    HANDLE hToken;
    if( argc >= 3 && LogonUserW( argv[1], nullptr, argv[2],
            LOGON32_LOGON_INTERACTIVE, LOGON32_PROVIDER_DEFAULT, &hToken ) )
        (void)ImpersonateLoggedOnUser( hToken );
    // try both classes; the return values are ignored, so whichever call
    // succeeds last determines the resulting priority class
    HANDLE hCurrentProcess = GetCurrentProcess();
    SetPriorityClass( hCurrentProcess, HIGH_PRIORITY_CLASS );
    SetPriorityClass( hCurrentProcess, REALTIME_PRIORITY_CLASS );
#elif defined(__unix__)
    // lower the nice value step by step until -20 or until the call fails
    for( int nice = 0; nice != -21 && setpriority( PRIO_PROCESS, getpid(),
            nice ) == 0; --nice );
#endif
    uint64_t const ROUNDS = 1'000'000;
    // runs the whole thread-count sweep; xadd selects fetch_add (true) or
    // the compare-exchange loop (false)
    auto benchmark = [&]( bool xadd )
    {
        semaphore semReady,
                  semStart;
        uint64_t count;
        // Explicitly start at zero: before C++20 a default-constructed
        // std::atomic is left uninitialised, and the worker threads read it.
        atomic<uint64_t> atomicValue( 0 );
        mutex mtx;              // guards totalTicks and totalFails
        uint64_t totalTicks;
        uint64_t totalFails;
        auto atomicThread = [&]()
        {
            using hrc_tp = time_point<high_resolution_clock>;
            // report readiness, then block until the main thread starts all
            // workers at once
            semReady.forced_release();
            semStart.forced_wait();
            uint64_t n = count;
            uint64_t ref;
            hrc_tp start = high_resolution_clock::now();
            uint64_t fails = 0;
            if( !xadd )
                for( ; n; --n )
                    // on failure compare_exchange_weak reloads ref, so the
                    // loop only has to count the failed attempt
                    for( ref = atomicValue; !atomicValue.compare_exchange_weak( ref,
                            ref + 1, memory_order_relaxed ); ++fails );
            else
                for( ; n; --n )
                    atomicValue.fetch_add( 1, memory_order_relaxed );
            uint64_t ns = duration_cast<nanoseconds>(
                high_resolution_clock::now() - start ).count();
            lock_guard<mutex> lock( mtx );
            totalTicks += ns;
            totalFails += fails;
        };
        for( unsigned t = 1; t <= processors; ++t )
        {
            vector<thread> vt;
            vt.reserve( t );
            count = ROUNDS;
            totalTicks = 0;
            totalFails = 0;
            for( unsigned i = 0; i != t; ++i )
            {
                vt.emplace_back( atomicThread );
#if defined(_MSC_VER)
                GROUP_AFFINITY ga;
                HANDLE hThread;
                hThread = vt.back().native_handle();
                // NOTE(review): the shift is only well-defined while i is
                // smaller than the bit width of DWORD_PTR - confirm the
                // behaviour on machines with more processors than that.
                SetThreadAffinityMask( hThread, (DWORD_PTR)1 << i );
                GetThreadGroupAffinity( hThread, &ga );
                ga.Group = getProcessorGroup( i );
                SetThreadGroupAffinity( hThread, &ga, nullptr );
                SetThreadPriority( hThread, THREAD_PRIORITY_HIGHEST );
                SetThreadPriority( hThread, THREAD_PRIORITY_TIME_CRITICAL );
#elif defined(__unix__)
                // pin thread i to processor i
                cpu_set_t cpuSet;
                CPU_ZERO( &cpuSet );
                CPU_SET( i, &cpuSet );
                pthread_setaffinity_np( vt.back().native_handle(), sizeof cpuSet,
                    &cpuSet );
#endif
            }
            // wait until every worker has reported in, then release them all
            for( unsigned i = 0; i != t; ++i )
                semReady.forced_wait();
            semStart.release( t );
            for( thread &thr : vt )
                thr.join();
            // averages per thread and per round
            double ns = (double)(int64_t)totalTicks / (int)t / ROUNDS;
            double failsPerSucc = (double)(int64_t)totalFails / (int)t / ROUNDS;
            cout << t << " threads: " << ns << "ns";
            if( !xadd )
                cout << ", avg fails: " << failsPerSucc;
            cout << endl;
        }
    };
    cout << "xchg:" << endl;
    benchmark( false );
    cout << "xadd:" << endl;
    benchmark( true );
}
// semaphore.h:
#pragma once
#if defined(_MSC_VER)
#include <Windows.h>
#include <intrin.h>
#elif defined(__unix__)
#include <semaphore.h>
#include <pthread.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <sys/stat.h>
#include <climits>
#endif
#include <new>
#include <cstdint>
#include <cassert>
#include <system_error>
// Counting semaphore wrapping the platform primitive: a Win32 semaphore
// handle under MSVC, and on Unix either a System V semaphore set (when
// SYSV_SEMAPHORE is defined) or an unnamed POSIX semaphore.
// Objects are neither copy-constructible nor copy-assignable.
class semaphore
{
public:
    // Creates the semaphore with a count of zero.
    semaphore();
    semaphore( semaphore const & ) = delete;
    void operator =( semaphore const & ) = delete;
    ~semaphore();

    // Performs one wait on the underlying primitive.
    // Returns true if that wait succeeded, false otherwise.
    bool wait();

    // Repeats wait() until it reports success.
    void forced_wait();

    // Raises the semaphore count by `count`.
    // Returns the number of units that could NOT be released (0 on success).
    unsigned release( unsigned count = 1 );

    // Repeats release() until nothing is left to release.
    void forced_release( unsigned count = 1 );

private:
#if defined(_MSC_VER)
    HANDLE m_hSem;              // Win32 semaphore handle
#elif defined(__unix__)
    #if defined(SYSV_SEMAPHORE)
    // argument union passed to semctl()
    union semun
    {
        int val;
        semid_ds *buf;
        unsigned short *array;
        seminfo *__buf;
    };
    int m_semid;                // identifier of the System V semaphore set
    #else
    sem_t m_sem;                // unnamed POSIX semaphore
    #endif
#else
    #error need platform-specific semaphore!
#endif
};
#if defined(_MSC_VER)
// Windows variant: creates an unnamed semaphore with an initial count of 0
// and a maximum count of 0x7FFFFFFF.
// Throws std::system_error carrying GetLastError() if creation fails.
inline
semaphore::semaphore()
{
    m_hSem = CreateSemaphore( nullptr, 0, 0x7FFFFFFF, nullptr );
    if( m_hSem != NULL )
        return;
    std::error_code const failure( (int)GetLastError(), std::system_category() );
    throw std::system_error( failure, "creating semaphore failed" );
}
// Windows variant: closes the semaphore handle; a failing CloseHandle is
// only detected by the assertion.
inline
semaphore::~semaphore()
{
    BOOL const closed = CloseHandle( m_hSem );
    assert(closed);
    (void)closed;   // keeps NDEBUG builds free of an unused-variable warning
}
inline
bool semaphore::wait()
{
bool success = WaitForSingleObject( m_hSem, INFINITE ) == WAIT_OBJECT_0;
assert(success);
return success;
}
// Windows variant: raises the semaphore count by `count` in a single call.
// Returns 0 on success, or `count` (nothing was released) on failure.
inline
unsigned semaphore::release( unsigned count )
{
    // ReleaseSemaphore requires a release count greater than zero; releasing
    // nothing is treated as a successful no-op, as in the Unix variant.
    if( count == 0 )
        return 0;
    BOOL success = ReleaseSemaphore( m_hSem, (LONG)count, nullptr );
    assert(success);
    return success ? 0 : count;
}
#elif defined(__unix__)
// Unix variant: creates the semaphore with a count of zero.
// With SYSV_SEMAPHORE a private one-element System V semaphore set is
// created and explicitly set to 0; otherwise an unnamed POSIX semaphore is
// initialised with sem_init.
// Throws std::system_error (carrying errno) if creation or initialisation
// fails.
inline
semaphore::semaphore()
{
    using namespace std;
#if defined(SYSV_SEMAPHORE)
    if( (m_semid = semget( IPC_PRIVATE, 1, IPC_CREAT | S_IRWXU )) == -1 )
        throw system_error( error_code( errno, system_category() ),
            "creating semaphore failed" );
    semun su;
    su.val = 0;
    if( semctl( m_semid, 0, SETVAL, su ) == -1 )
    {
        // save errno before the cleanup call can overwrite it
        int errNo = errno;
        int ret = semctl( m_semid, 0, IPC_RMID );
        assert(ret != -1);
        (void)ret;  // only inspected by the assertion
        throw system_error( error_code( errNo, system_category() ),
            "setting semaphore to zero after creation failed" );
    }
#else
    if( sem_init( &m_sem, 0, 0 ) != 0 )
        throw system_error( error_code( errno, system_category() ),
            "creating semaphore failed" );
#endif
}
// Unix variant: removes the System V semaphore set or destroys the POSIX
// semaphore; a failure is only detected by the assertion.
inline
semaphore::~semaphore()
{
#if defined(SYSV_SEMAPHORE)
    int const rc = semctl( m_semid, 0, IPC_RMID );
    assert(rc != -1);
#else
    int const rc = sem_destroy( &m_sem );
    assert(rc == 0);
#endif
    (void)rc;   // keeps NDEBUG builds free of an unused-variable warning
}
// Unix variant: decrements the semaphore by one, blocking while its count
// is zero. A wait interrupted by a signal (EINTR) is restarted.
// Returns true if the semaphore was acquired, false on any other error.
inline
bool semaphore::wait()
{
#if defined(SYSV_SEMAPHORE)
    int ret;
    do
    {
        sembuf sb;
        sb.sem_op = -1;
        sb.sem_num = 0;
        sb.sem_flg = 0;
        ret = semop( m_semid, &sb, 1 );
        // semop reports failure as -1; the reason (EINTR) is in errno
    } while( ret == -1 && errno == EINTR );
    assert(ret == 0);
    return ret == 0;
#else
    int ret;
    do
        ret = sem_wait( &m_sem );
    // sem_wait reports failure as -1; the reason (EINTR) is in errno
    while( ret == -1 && errno == EINTR );
    assert(ret == 0);
    return ret == 0;
#endif
}
// Unix variant: raises the semaphore count by `count`.
// With SYSV_SEMAPHORE the release is done in chunks of at most SHRT_MAX,
// because sembuf::sem_op is a short; otherwise sem_post is called once per
// unit. A semop interrupted by a signal (EINTR) is restarted.
// Returns 0 when everything was released, otherwise the number of units
// that are still outstanding.
inline
unsigned semaphore::release( unsigned count )
{
#if defined(SYSV_SEMAPHORE)
    while( count != 0 )
    {
        // largest amount that fits into one semop call
        unsigned short const chunk = (unsigned short)(
            count > (unsigned)SHRT_MAX ? (unsigned)SHRT_MAX : count);
        int ret;
        do
        {
            sembuf sb;
            sb.sem_op = (short)chunk;
            sb.sem_num = 0;
            sb.sem_flg = 0;
            ret = semop( m_semid, &sb, 1 );
            // semop reports failure as -1; the reason (EINTR) is in errno
        } while( ret == -1 && errno == EINTR );
        if( ret != 0 )
        {
            assert(false);
            return count;
        }
        count -= chunk;
    }
    return 0;
#else
    for( ; count; --count )
        if( sem_post( &m_sem ) != 0 )
        {
            assert(false);
            return count;
        }
    return 0;
#endif
}
#else
#error need platform-specific semaphore
#endif
// Keeps calling wait() until one call reports success.
inline
void semaphore::forced_wait()
{
    for( ; ; )
        if( wait() )
            return;
}
// Keeps calling release() with the still outstanding amount until release()
// reports that nothing is left.
inline
void semaphore::forced_release( unsigned count )
{
    unsigned outstanding = count;
    while( outstanding != 0 )
        outstanding = release( outstanding );
}