> I suspect the results will be highly dependent on details, like the
> exact chip you are using, and where you draw the line between "small
> blocks" and "big blocks".
Here's a little benchmark that compares rep movsq with AVX copying
(without loop unrolling!):
C++-Code:
#include <Windows.h>
#include <iostream>
#include <cstring>
#include <cstdint>
#include <chrono>
#include <intrin.h>
using namespace std;
using namespace chrono;
extern "C" void fAvx( __m256 *src, __m256 *dst, size_t size, size_t
repts );
extern "C" void fMovs( __m256 *src, __m256 *dst, size_t size, size_t
repts );
int main()
{
size_t const PAGE = 4096,
ROUNDS = 100'000;
char *pPage = (char *)VirtualAlloc( nullptr, 2 * PAGE,
MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE );
__m256 *src = (__m256 *)pPage,
*dst = (__m256 *)(pPage + PAGE);
memset( pPage, 0, 2 * PAGE );
using timestamp = time_point<high_resolution_clock>;
for( size_t size = 1; size <= (PAGE / 32); ++size )
{
timestamp start = high_resolution_clock::now();
fAvx( src, dst, size, ROUNDS );
uint64_t avxNs = (uint64_t)duration_cast<nanoseconds>(
high_resolution_clock::now() - start ).count();;
start = high_resolution_clock::now();
fMovs( src, dst, size, ROUNDS );
uint64_t movsNs = (uint64_t)duration_cast<nanoseconds>(
high_resolution_clock::now() - start ).count();;
cout << "size: " << size << "\tavx:\t" << avxNs / 1.0E6 <<
"\tmovs\t" << movsNs / 1.0E6 << endl;
}
}
Asm-Code:
_TEXT SEGMENT
; void fAvx( __m256 *src, __m256 *dst, size_t count, size_t repts );
; Copies count 32-byte blocks from src to dst with one unaligned 256-bit
; load/store pair per block (no unrolling), and repeats the whole copy
; repts times. Does nothing if count or repts is zero.
; rcx: src
; rdx: dst
; r8: count
; r9: repts
fAvx PROC
    test    r9, r9
    jz      zero
    test    r8, r8
    jz      zero
    mov     rax, r8
    shl     rax, 5              ; count * 32 = bytes per pass
    add     rax, rcx            ; rax = end of the source range
    sub     rdx, rcx            ; rdx = dst - src, so [rcx+rdx] addresses dst
    mov     r10, rcx            ; keep the start values for the repeat loop
    mov     r11, rdx
    jmp     avxLoop
reptLoop:
    mov     rcx, r10
    mov     rdx, r11
avxLoop:
    vmovups ymm0, [rcx]
    vmovups [rcx+rdx], ymm0
    add     rcx, 32
    cmp     rcx, rax
    jne     avxLoop
    dec     r9
    jnz     reptLoop
    ; Clear the upper halves of the YMM registers before returning to the
    ; caller, which may execute legacy SSE code; this avoids AVX/SSE
    ; transition penalties.
    vzeroupper
zero:
    ret
fAvx ENDP
; void fMovs( __m256 *src, __m256 *dst, size_t count, size_t repts );
; Copies count 32-byte blocks from src to dst with rep movsq and repeats
; the whole copy repts times. Does nothing if repts is zero; a count of
; zero makes rep movsq copy nothing.
; rcx: src
; rdx: dst
; r8: count
; r9: repts
; rsi and rdi are non-volatile in the Windows x64 calling convention, so
; they are saved in a declared prolog (PROC FRAME / .pushreg / .endprolog)
; which gives the function valid unwind data.
fMovs PROC FRAME
    push    rsi
    .pushreg rsi
    push    rdi
    .pushreg rdi
    .endprolog
    test    r9, r9
    jz      done
    mov     r10, rcx            ; keep src and dst for the repeat loop
    mov     r11, rdx
    lea     rdx, [r8 * 4]       ; qwords per pass: 4 per 32-byte block
reptLoop:
    mov     rsi, r10
    mov     rdi, r11
    mov     rcx, rdx
    rep movsq
    dec     r9
    jnz     reptLoop
done:
    pop     rdi
    pop     rsi
    ret
fMovs ENDP
_TEXT ENDS
END
That's the relative speedup of AVX over rep movsq:
size: 1 1383,79%
size: 2 737,12%
size: 3 433,35%
size: 4 342,41%
size: 5 283,20%
size: 6 431,57%
size: 7 351,47%
size: 8 340,53%
size: 9 314,24%
size: 10 325,57%
size: 11 270,96%
size: 12 327,83%
size: 13 296,13%
size: 14 275,73%
size: 15 284,19%
size: 16 317,27%
size: 17 331,54%
size: 18 266,05%
size: 19 287,00%
size: 20 281,83%
size: 21 276,17%
size: 22 261,85%
size: 23 263,01%
size: 24 251,48%
size: 25 247,98%
size: 26 237,64%
size: 27 239,66%
size: 28 187,04%
size: 29 185,92%
size: 30 189,09%
size: 31 168,90%
size: 32 179,31%
size: 33 220,31%
size: 34 192,71%
size: 35 207,33%
size: 36 214,69%
size: 37 156,90%
size: 38 169,47%
size: 39 184,87%
size: 40 159,98%
size: 41 175,79%
size: 42 156,60%
size: 43 162,29%
size: 44 155,36%
size: 45 158,09%
size: 46 164,42%
size: 47 154,88%
size: 48 164,17%
size: 49 155,84%
size: 50 157,59%
size: 51 148,29%
size: 52 152,67%
size: 53 139,59%
size: 54 149,78%
size: 55 140,99%
size: 56 146,94%
size: 57 142,01%
size: 58 148,15%
size: 59 141,62%
size: 60 152,89%
size: 61 152,00%
size: 62 149,20%
size: 63 150,13%
size: 64 150,45%
size: 65 140,96%
size: 66 132,11%
size: 67 142,80%
size: 68 135,96%
size: 69 146,18%
size: 70 140,17%
size: 71 139,63%
size: 72 139,22%
size: 73 131,02%
size: 74 145,43%
size: 75 138,23%
size: 76 132,02%
size: 77 142,05%
size: 78 135,97%
size: 79 136,52%
size: 80 138,93%
size: 81 136,06%
size: 82 138,59%
size: 83 139,08%
size: 84 134,50%
size: 85 136,64%
size: 86 134,28%
size: 87 133,35%
size: 88 129,82%
size: 89 138,07%
size: 90 132,57%
size: 91 125,16%
size: 92 138,73%
size: 93 135,70%
size: 94 131,55%
size: 95 126,62%
size: 96 134,87%
size: 97 130,83%
size: 98 129,21%
size: 99 126,70%
size: 100 133,07%
size: 101 129,39%
size: 102 129,12%
size: 103 125,27%
size: 104 124,14%
size: 105 131,78%
size: 106 132,87%
size: 107 131,40%
size: 108 128,29%
size: 109 122,95%
size: 110 121,13%
size: 111 121,73%
size: 112 126,26%
size: 113 130,87%
size: 114 131,31%
size: 115 124,70%
size: 116 119,53%
size: 117 121,42%
size: 118 120,34%
size: 119 125,65%
size: 120 124,95%
size: 121 130,36%
size: 122 128,35%
size: 123 128,25%
size: 124 127,47%
size: 125 124,28%
size: 126 124,14%
size: 127 122,69%
size: 128 122,76%
So movsq is never faster.
Here's the result graphically:
https://app.unsee.cc/#45f34f42
So it's also exactly the opposite of what Melzzz said: movsq becomes
more competitive as the block size rises.