Bonita Montero
unread,Jun 17, 2023, 12:22:51 PM6/17/23You do not have permission to delete messages in this group
Either email addresses are anonymous for this group or you need the view member email addresses permission to view the original message
to
I wanted to test how much time it takes for a thread to signal
a semaphore to another thread and to wait to be signalled back.
That's essential for mutexes when they're contended. I tested
this under Windows 11 with a Ryzen 9 7950X system.
I tested different combinations of logical cores. The first
thread is always fixed on the first core and the other thread
is varying. I print the X2 APIC ID along with the result.
The fastest result I get is about 20,000 clock cycles for one
switch to the other thread. I think that's enormous.
A similar benchmark written for Linux using POSIX semaphores
gives about 8,000 clock cycles per switch on a 3990X system.
That's a huge difference since the CPU is a Zen2-CPU with a
much lower clock rate than the 7950X Zen4 system.
#include <Windows.h>
#include <intrin.h>
#include <atomic>
#include <charconv>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <latch>
#include <system_error>
#include <thread>
using namespace std;
using namespace chrono;
int main( int argc, char **argv )
{
static auto errTerm = []( bool succ, char const *what )
{
if( succ )
return;
cerr << what << endl;
terminate();
};
int regs[4];
__cpuid( regs, 0 );
errTerm( (unsigned)regs[0] >= 0xB, "max CPUID below 0xB" );
bool fPrio = SetPriorityClass( GetCurrentProcess(),
REALTIME_PRIORITY_CLASS )
|| SetPriorityClass( GetCurrentProcess(), HIGH_PRIORITY_CLASS );
errTerm( fPrio, "can't set process priority class" );
unsigned nCPUs = jthread::hardware_concurrency();
for( unsigned cpuB = 1; cpuB != nCPUs; ++cpuB )
{
auto init = []( HANDLE &hSem, bool set )
{
hSem = CreateSemaphoreA( nullptr, set, 1, nullptr );
errTerm( hSem, "can't create semaphore" );
};
HANDLE hSemA, hSemB;
init( hSemA, false );
init( hSemB, true );
atomic_int64_t tSum( 0 );
latch latRun( 3 );
auto flipThread = [&]( HANDLE hSemMe, HANDLE hSemYou, size_t n,
uint32_t *pX2ApicId )
{
latRun.arrive_and_wait();
auto start = high_resolution_clock::now();
for( ; n--; )
errTerm( WaitForSingleObject( hSemMe, INFINITE ) == WAIT_OBJECT_0,
"can't wait for semaphore" ),
errTerm( ReleaseSemaphore( hSemYou, 1, nullptr ), "can't post
semaphore" );
tSum.fetch_add( duration_cast<nanoseconds>(
high_resolution_clock::now() - start ).count(), memory_order::relaxed );
if( !pX2ApicId )
return;
int regs[4];
__cpuidex( regs, 0xB, 0 );
*pX2ApicId = regs[3];
};
constexpr size_t ROUNDS = 10'000;
uint32_t x2ApicId;
jthread
thrA( flipThread, hSemA, hSemB, ROUNDS, nullptr ),
thrB( flipThread, hSemB, hSemA, ROUNDS, &x2ApicId );
errTerm( SetThreadAffinityMask( thrA.native_handle(), 1 ), "can't set
CPU affinity" );
errTerm( SetThreadAffinityMask( thrB.native_handle(), (DWORD_PTR)1 <<
cpuB ), "can't set CPU affinity" );
latRun.arrive_and_wait();
thrA.join();
thrB.join();
cout << x2ApicId << ": " << (double)tSum.load( memory_order::relaxed )
/ (2.0 * ROUNDS) << endl;
};
}