I've edited the
en.wikipedia.org article about false sharing.
The source code and the short paragraph below are mine.
After writing the code for the Wikipedia article I asked myself
whether false sharing is a noteworthy problem when the cache line is
shared among threads on the same SMT core.
Under Windows and Linux, logical cores are usually enumerated so that
all physical cores come first and their SMT siblings follow afterwards
in the same order. So it's easy to pin two threads to the same SMT
core. The source below does exactly that (it works on Windows and
Linux, unfortunately not under WSL) and measures the impact of false
sharing versus unshared access to separate cache lines. Additionally
it differentiates between pure atomic access and access to volatile
variables (this gives warnings because volatile is partially
deprecated with C++20, but it actually still works).
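If you want to check the sibling enumeration on your own Linux box,
here's a minimal sketch that reads the topology from sysfs (the sysfs
path is real; the little program itself is just an illustration):

#include <fstream>
#include <iostream>
#include <string>

// Prints the SMT siblings of logical CPU 0, e.g. "0,64" on a
// 64-core / 128-thread CPU: CPU 0 and CPU 64 share a physical core.
int main()
{
    std::ifstream f(
        "/sys/devices/system/cpu/cpu0/topology/thread_siblings_list" );
    std::string siblings;
    if( std::getline( f, siblings ) )
        std::cout << "SMT siblings of CPU 0: " << siblings << "\n";
}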
Here are the results for my Ryzen Threadripper 3990X system. I had to
disable half of the cores of my CPU in the BIOS: with more than 64
logical processors Windows partitions them into processor groups, and
a plain affinity mask can only address the up to 64 logical cores of a
single group. I could have used the processor-group APIs to attach
each thread to the first processor of each of the two processor
groups, but disabling half the cores in the BIOS was simpler for me.
shared, atomic: 100%
shared, volatile: 65.5%
unshared, atomic: 27.3%
unshared, volatile: 9.7%
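For reference, the processor-group variant I mentioned would have
looked roughly like this; just a sketch with a made-up helper name
(pinToGroup), untested since I took the BIOS route instead:

#include <Windows.h>

// Pin a thread to the first logical processor of the given processor
// group; with more than 64 logical CPUs Windows splits them into
// groups of up to 64, which a plain affinity mask can't reach across.
bool pinToGroup( HANDLE thread, WORD group )
{
    GROUP_AFFINITY ga = {};
    ga.Group = group;
    ga.Mask = 1; // first logical processor of that group
    return SetThreadGroupAffinity( thread, &ga, nullptr );
}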
#if defined(_WIN32)
#include <Windows.h>
#elif defined(__unix__)
#include <pthread.h>
#include <sched.h>
#endif
#include <iostream>
#include <thread>
#include <new>
#include <atomic>
#include <cstdlib>
#include <cstdint>
#include <semaphore>
#include <chrono>
#include <functional>
#include <concepts>
#include <iterator>
#include <cmath>
using namespace std;
using namespace chrono;
void setThreadAffinityAndPrio( jthread::native_handle_type threadHandle, unsigned cpu );
int main()
{
    // number of logical CPUs; CPU i and CPU i + hc / 2 are SMT
    // siblings with the usual enumeration
    unsigned hc = jthread::hardware_concurrency();
    if( !hc )
        exit( EXIT_FAILURE );
    using atomic_type = atomic<int>;
    using volatile_type = int volatile;
    auto bench = [&]<bool FalseSharing, typename Type>() -> double
        requires same_as<Type, atomic_type> || same_as<Type, volatile_type>
    {
        constexpr size_t
#if defined(__cpp_lib_hardware_interference_size)
            CL_SIZE = hardware_destructive_interference_size,
#else
            CL_SIZE = 64,
#endif
            // with false sharing b is packed right behind a into the
            // same cache line, otherwise b gets a cache line of its own
            SECOND_ALIGN = FalseSharing ? sizeof(Type) : CL_SIZE;
        atomic<unsigned> readyCountDown( 2 );
        binary_semaphore semReady( 0 );
        counting_semaphore semRun( 0 );
        atomic<unsigned> synch( 2 );
        struct
        {
            alignas(CL_SIZE) Type a;
            alignas(SECOND_ALIGN) Type b;
        } sharedOrNot;
        atomic<int64_t> nsSum( 0 );
        auto theThread = [&]( Type *value )
        {
            // rendezvous: the second thread to arrive wakes up main
            if( !--readyCountDown )
                semReady.release();
            semRun.acquire();
            // spin so that both threads start the timed loop together
            if( --synch )
                while( synch );
            auto start = high_resolution_clock::now();
            for( size_t r = 10'000'000; r--; )
                ++*value;
            nsSum += duration_cast<nanoseconds>(
                high_resolution_clock::now() - start ).count();
        };
        jthread threads[2] = { jthread( theThread, &sharedOrNot.a ),
            jthread( theThread, &sharedOrNot.b ) };
        semReady.acquire();
        // pin thread 0 to logical CPU 0 and thread 1 to its SMT
        // sibling, which is logical CPU hc / 2
        for( unsigned i = 0; i != 2; ++i )
            setThreadAffinityAndPrio( threads[i].native_handle(), hc / 2 * i );
        semRun.release( 2 );
        for( jthread &jt : threads )
            jt.join();
        return (double)nsSum;
    };
    using bench_t = decltype(bench);
    using bench_fn = decltype(&bench_t::template operator ()<true, atomic_type>);
    // table of pointers to the specializations of the lambda's call
    // operator, each paired with a description for the output
    static struct bench_and_explain
    {
        bench_fn fn;
        char const *explain;
    } const benchFns[] =
    {
        { &bench_t::template operator ()<true, atomic_type>, "shared, atomic" },
        { &bench_t::template operator ()<true, volatile_type>, "shared, volatile" },
        { &bench_t::template operator ()<false, atomic_type>, "unshared, atomic" },
        { &bench_t::template operator ()<false, volatile_type>, "unshared, volatile" }
    };
    constexpr size_t N_BENCHES = size( benchFns );
    double timings[N_BENCHES];
    for( size_t i = 0; i != N_BENCHES; ++i )
        timings[i] = (bench.*(benchFns[i].fn))();
    for( size_t i = 0; i != N_BENCHES; ++i )
    {
        double pct = i ? trunc( timings[i] / timings[0] * 1000.0 ) / 10.0 : 100;
        cout << benchFns[i].explain << ": " << pct << "%" << endl;
    }
}
void setThreadAffinityAndPrio( jthread::native_handle_type threadHandle, unsigned cpu )
{
#if defined(_WIN32)
    bool succ =
        SetThreadAffinityMask( threadHandle, (DWORD_PTR)1 << cpu )
        && (SetThreadPriority( threadHandle, THREAD_PRIORITY_TIME_CRITICAL )
            || SetThreadPriority( threadHandle, THREAD_PRIORITY_HIGHEST ));
    if( !succ )
#elif defined(__unix__)
    // on POSIX only the affinity is set, the priority is left alone
    cpu_set_t cpuSet;
    CPU_ZERO(&cpuSet);
    CPU_SET(cpu, &cpuSet);
    if( pthread_setaffinity_np( threadHandle, sizeof cpuSet, &cpuSet ) )
#endif
        exit( EXIT_FAILURE ); // shared error exit for both platforms
}
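If you want to reproduce this: something like
    g++ -std=c++20 -O2 -pthread falsesharing.cpp
should build it under Linux (pick your own filename, of course); with
MSVC, /std:c++20 plus optimizations should do.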