> If the second page misses in the TLB, there will be a TLB
> fill penalty which for an application running in a virtual
> machine will require 23 memory accesses to walk the page
> table (anywhere from 3 to 5 for bare-metal translation table
> walks depending on page size).
I'm measuring the penalty while crossing a page-boundary where
both pages are in the TLB - so this is irrelevant here.
> On a TLB Hit, there still may be a latency hit to obtain
> the cache line for the first line of the second page.
On all three of my PCs, the times for an access crossing a page-boundary,
an aligned access and an unaligned access within a page are all the
same.
I first thought that, for the access crossing a page-boundary, the
CPU checks for duplicate loads in the queue of outstanding
OoO-loads and satisfies them all from the same load. So I modified
my code a bit to have a configurable number of accesses to different
page-boundaries:
#if defined(_WIN32)
#include <Windows.h>
#elif defined(__unix__)
#include <sys/mman.h>
#include <unistd.h>
#endif
#include <atomic>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string_view>
#include <type_traits>
#include <vector>
using namespace std;
using namespace chrono;
using namespace chrono_literals;
// Value type that the benchmark loads from memory (8 bytes wide).
using T = std::uint64_t;

// Global sink: the benchmark stores its accumulated load sum here.
// Starts out as zero.
std::atomic<T> aSum{ 0 };
// Micro-benchmark comparing three kinds of loads of type T:
//   * loads that straddle a page boundary (start one byte before it),
//   * loads that start exactly on a page boundary (aligned baseline),
//   * misaligned loads that stay inside one page (start one byte after it).
// Prints the cost of the first and the last kind relative to the aligned
// baseline, in percent (positive = slower than the baseline).
//
// Returns 0 on success and 1 if the test pages could not be allocated.
int main()
{
#if defined(_WIN32)
    // Pin the thread to the first logical CPU and raise its priority as far
    // as the system allows, to reduce scheduling noise in the measurement.
    SetThreadAffinityMask( GetCurrentThread(), 1 );
    if( !SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL ) )
        SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_HIGHEST );
#endif
    // T           - scalar type that is loaded from every probed address
    // NBoundaries - number of distinct page boundaries that are probed
    // Returns true when the benchmark ran, false when allocation failed.
    auto bench = [&]<typename T, size_t NBoundaries>()
        requires is_scalar_v<T>
    {
        // probe() divides by the number of addresses, so there must be some.
        static_assert( NBoundaries > 0, "at least one page boundary is required" );

        // Repeatedly loads a T from every address in addrs until at least
        // 250 ms of pure load time have been accumulated.
        // Returns the average time per load in nanoseconds.
        auto probe = [&]( vector<void *> const &addrs ) -> double
        {
            T sum = 0;
            size_t loads = 0;
            nanoseconds nsSum( 0 );
            // Roughly one million loads are executed between two clock reads
            // so that the cost of reading the clock is negligible.
            size_t const
                interval = 1'000'000 / addrs.size(),
                loadsPerInterval = interval * addrs.size();
            do
            {
                auto start = high_resolution_clock::now();
                for( size_t r = interval; r--; )
                    for( void *p : addrs )
                        // Relaxed atomic load from a possibly misaligned
                        // address; presumably chosen so that the compiler
                        // keeps every single load.
                        sum += reinterpret_cast<atomic<T> *>( p )->load( memory_order_relaxed );
                nsSum += duration_cast<nanoseconds>( high_resolution_clock::now() - start );
                loads += loadsPerInterval;
            } while( nsSum < 250ms );
            // Publish the sum to the global sink so the loads have an
            // observable effect.
            ::aSum = sum;
            // Time per load, not loads per time: larger means slower.
            return static_cast<double>( nsSum.count() ) / static_cast<double>( loads );
        };

        // Queries the page size of the system.
        auto getPageSize = []() -> size_t
        {
#if defined(_WIN32)
            SYSTEM_INFO si;
            GetSystemInfo( &si );
            return si.dwPageSize;
#elif defined(__unix__)
            return static_cast<size_t>( sysconf( _SC_PAGESIZE ) );
#endif
        };
        size_t const pageSize = getPageSize();

        // Allocates nPages contiguous readable and writable pages.
        // Returns nullptr on failure on every platform.
        auto allocPages = [&]( size_t nPages ) -> void *
        {
#if defined(_WIN32)
            return VirtualAlloc( nullptr, nPages * pageSize, MEM_RESERVE | MEM_COMMIT,
                PAGE_READWRITE );
#elif defined(__unix__)
            void *mem = mmap( nullptr, nPages * pageSize, PROT_READ | PROT_WRITE,
                MAP_SHARED | MAP_ANONYMOUS, -1, 0 );
            // mmap reports failure with MAP_FAILED rather than a null pointer.
            return mem != MAP_FAILED ? mem : nullptr;
#endif
        };

        // NBoundaries + 1 pages contain NBoundaries interior page boundaries.
        void *p = allocPages( NBoundaries + 1 );
        if( !p )
        {
            cerr << "page allocation failed" << endl;
            return false;
        }
        char *const base = static_cast<char *>( p );

        vector<void *> addrs;
        addrs.reserve( NBoundaries );
        // times[0]: crossing a boundary, times[1]: aligned, times[2]: misaligned
        // within a page.
        double times[3];
        for( ptrdiff_t offset = -1; offset <= 1; ++offset )
        {
            addrs.clear();
            // One address per distinct boundary: boundary b lies at the start
            // of page b + 1, and offset shifts the load relative to it.
            for( size_t b = 0; b != NBoundaries; ++b )
                addrs.emplace_back( base + (b + 1) * pageSize + offset );
            times[offset + 1] = probe( addrs );
        }

        // Relative difference of tRel to tBase in percent, rounded to the
        // nearest integer (also correct for negative differences).
        auto pct = []( double tRel, double tBase ) -> int
        {
            return static_cast<int>( lround( (tRel / tBase - 1.0) * 100.0 ) );
        };
        cout << "crossing page boundaries: " << pct( times[0], times[1] ) << "%" << endl;
        cout << "within page: " << pct( times[2], times[1] ) << "%" << endl;
        return true;
    };
    return bench.operator ()<T, 64>() ? 0 : 1;
}
But this code also gives me the same access-times for unaligned accesses
within a page, aligned accesses and accesses crossing a page-boundary
on all three of my PCs.
> Both of these will only hit when the relevant conditions
> exist, so they'll be difficult to measure without access
> to the TLB and cache flush instruction(s) on your target
> architecture.
The TLB privilege-checks are done on every access, even when the
address is already covered by a TLB entry.