Am 28.01.2024 um 11:32 schrieb Marcel Mueller:
> Reverse memory access is typically slower simply because the
> last data of a cache line (after a cache miss) arrives at last.
I tested this, and for all offsets within a cache line I get the
same timing on all three of my computers:
#include <atomic>
#include <chrono>
#include <cstddef>
#include <iostream>
#include <new>      // __cpp_lib_hardware_interference_size, hardware_*_interference_size
#include <vector>
using namespace std;
using namespace chrono;
#if defined(__cpp_lib_hardware_interference_size)
// Library-reported cache-line estimate. The feature macro and the constant
// are declared in <new>; without including it the macro is never defined and
// this branch is silently dead, falling through to the hard-coded 64 below.
// NOTE(review): hardware_constructive_interference_size is the *maximum* size
// for true sharing; hardware_destructive_interference_size (false-sharing
// avoidance) can be larger (e.g. 128 on some targets) — confirm which one is
// intended to stand for "cache-line size" here.
constexpr std::size_t CL_SIZE = std::hardware_constructive_interference_size;
#else
// Fallback: 64 bytes is the cache-line size on x86 and most ARM cores.
constexpr std::size_t CL_SIZE = 64;
#endif
atomic_char aSum;
int main()
{
constexpr size_t
BLOCK_SIZE = 1ull << 20,
BLOCKS = BLOCK_SIZE / CL_SIZE,
ROUNDS = 1000;
vector<char> block( BLOCK_SIZE );
for( size_t offset = 0; offset != CL_SIZE; ++offset )
{
auto start = high_resolution_clock::now();
char sum = 0;
for( size_t round = ROUNDS; round--; )
for( size_t i = offset; i < BLOCK_SIZE; sum += block[i], i += CL_SIZE );
::aSum.store( sum, memory_order_relaxed );
cout << offset << ": " <<
duration_cast<nanoseconds>(high_resolution_clock::now() - start).count()
/ ((double)BLOCKS * ROUNDS) << endl;
}
}