There are two instructions called PDEP and PEXT on x86, introduced with Intel Haswell (BMI2), which scatter or gather bits of a source operand into a destination operand according to a mask:
https://www.felixcloutier.com/x86/pdep
https://www.felixcloutier.com/x86/pext
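To make clear what I mean by scatter and gather, here is a plain C++ sketch of the semantics as I understand them. The helper names pext_ref/pdep_ref are just mine for illustration; this is of course not how the hardware implements the instructions:

#include <cstdint>
#include <cstdio>

// pext_ref: gather the source bits selected by the mask into the low bits of the result
uint64_t pext_ref( uint64_t src, uint64_t mask )
{
    uint64_t dst = 0;
    for( unsigned k = 0; mask; mask &= mask - 1, ++k ) // k counts consumed mask bits
    {
        uint64_t lowest = mask & ~(mask - 1); // isolate the lowest set mask bit
        if( src & lowest )
            dst |= uint64_t( 1 ) << k;        // pack it into the next low result bit
    }
    return dst;
}

// pdep_ref: scatter the low source bits to the positions selected by the mask
uint64_t pdep_ref( uint64_t src, uint64_t mask )
{
    uint64_t dst = 0;
    for( unsigned k = 0; mask; mask &= mask - 1, ++k )
    {
        uint64_t lowest = mask & ~(mask - 1); // next mask position
        if( (src >> k) & 1 )
            dst |= lowest;                    // deposit the next low source bit there
    }
    return dst;
}

int main()
{
    // mask 0x5500 selects bits 8, 10, 12, 14 of 0xFF00 -> prints f
    printf( "%llx\n", (unsigned long long)pext_ref( 0xFF00, 0x5500 ) );
    // scattering the low 4 bits of 0xF back to those positions -> prints 5500
    printf( "%llx\n", (unsigned long long)pdep_ref( 0xF, 0x5500 ) );
}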
On agner.org I found data on the latencies and throughputs of these instructions, but it doesn't match what I measured. On my Zen2 Ryzen Threadripper 3990X I compiled the code below with MSVC++ 2022 and Intel C++ 2022 and got some strange results. I checked the generated code of both compilers and it is what I expect, i.e. essentially the same as what you would write by hand in assembly. With both compilers I measured a throughput of about 34 ns per instruction, which on my machine corresponds to roughly 150 clock cycles per instruction; that's far from what the agner.org tables report.
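The cycle figure is just the measured time multiplied by the clock rate; I'm assuming the 3990X runs near its single-core boost of about 4.3 GHz here, so the number is only an estimate:

#include <cstdio>

int main()
{
    double nsPerOp = 34.0; // measured average time per instruction
    double ghz = 4.3;      // assumed single-core boost clock of the 3990X
    printf( "%g cycles per op\n", nsPerOp * ghz ); // about 146 cycles
}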
#include <iostream>
#include <vector>
#include <chrono>
#include <random>
#include <cstdint>
#include <atomic>
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__) || defined(__llvm__)
#include <immintrin.h>
#endif
using namespace std;
using namespace chrono;
atomic_uint64_t aSum( 0 ); // global sink for the benchmark results
int main()
{
    constexpr size_t
        N = 0x1000,
        ROUNDS = 10'000;
    vector<uint64_t> data( N, 0 );
    mt19937_64 mt;
    uniform_int_distribution<uint64_t> uid( 0, -1 ); // 0 .. UINT64_MAX, -1 wraps around
    for( uint64_t &d : data )
        d = uid( mt );
    auto pdep = []( uint64_t data, uint64_t mask ) -> uint64_t { return _pdep_u64( data, mask ); };
    auto pext = []( uint64_t data, uint64_t mask ) -> uint64_t { return _pext_u64( data, mask ); };
    // run ROUNDS passes over the data and return the average time per call in ns
    auto bench = [&]<typename Permute>( Permute permute ) -> double
    {
        uint64_t sum = 0;
        auto start = high_resolution_clock::now();
        constexpr uint64_t MASK = 0x5555555555555555u;
        for( size_t r = ROUNDS; r--; )
            for( uint64_t d : data )
                sum += permute( d, MASK );
        double ns = (double)(int64_t)duration_cast<nanoseconds>(
            high_resolution_clock::now() - start ).count() / ((double)N * ROUNDS);
        ::aSum = sum; // keep the result observable so the loop isn't optimized away
        return ns;
    };
    cout << bench( pdep ) << endl;
    cout << bench( pext ) << endl;
}
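Since the independent additions above should measure throughput rather than latency, I also tried a variant where each call depends on the previous result, which should be bound by latency instead. This is just a sketch meant to be dropped into main() next to bench (benchLatency is my own name, and it reuses data, N, ROUNDS and aSum from above):

    // latency-bound variant: every call has to wait for the previous result
    auto benchLatency = [&]<typename Permute>( Permute permute ) -> double
    {
        constexpr uint64_t MASK = 0x5555555555555555u;
        uint64_t chain = 0;
        auto start = high_resolution_clock::now();
        for( size_t r = ROUNDS; r--; )
            for( uint64_t d : data )
                chain = permute( d ^ chain, MASK ); // feed the result into the next operand
        ::aSum = chain;
        return (double)(int64_t)duration_cast<nanoseconds>(
            high_resolution_clock::now() - start ).count() / ((double)N * ROUNDS);
    };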
Please report your numbers.