"timprince" <
tpr...@computer.org> wrote in message
news:9k9p6n...@mid.individual.net...
> I should have said that the first option which would enable gfortran to
> use movups was the -mtune=barcelona.
> The original core 2 duo would perform better by splitting misaligned loads
> rather than using movups, so it was difficult for compilers to do the
> right thing consistently, without an alignment assertion such as you
> wanted. Mis-informing the compiler by setting a specific CPU type rather
> than native could work to your advantage if you had sufficient aligned
> loads not recognized by the compiler.
Can recent versions of ifort do this? Here is a benchmark: to run
it, all you have to do is post a disassembly of the compiler output.
It comes from a paper
http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=6066463
which looks kind of like an SSE version of kissfft. The Fortran
version is:
! File: hll128.f90
! Compile with:
!   gfortran -c hll128.f90
! (-fno-range-check is not required: the sign-bit mask below is built with
! ibset rather than by converting an out-of-range BOZ literal.)

!> In-place 128-point complex FFT written as four-wide array operations.
!!
!! Every statement in the butterfly loops works on a four-element
!! single-precision section so that it can map onto a single SSE
!! instruction. The one-operation-per-statement layout is deliberate: the
!! routine is meant to show how well a compiler vectorises this pattern.
!!
!! Stages, in order:
!!   1. levels logn down to 3: butterflies with twiddle factors read
!!      through pTwiddle,
!!   2. blocks of four: butterflies whose only "multiply" is one sign flip,
!!   3. pairs: add/subtract only,
!!   4. reordering of x and y through the permutation table pBinv.
!!
!! @param x        first component array (N = 128 values), updated in place;
!!                 it is combined with y as the real part of a complex value
!! @param y        second component array (N = 128 values), updated in place
!! @param pTwiddle C pointers to the twiddle tables. For level = logn..3,
!!                 entries 2*(logn-level)+1 and 2*(logn-level)+2 point to
!!                 2**(level-1) floats each (pCosa and pSina respectively).
!!                 NOTE(review): presumably cos/sin of 2*pi*k/2**level;
!!                 the table contents are not visible here.
!! @param pBinv    N byte-sized indices; x(i),y(i) are swapped with
!!                 x(j),y(j) when j = pBinv(i), read as unsigned, exceeds i
subroutine fft128(x,y,pTwiddle,pBinv) bind(C,name='fft128')
  use, intrinsic :: iso_c_binding, only: c_float, c_int, c_int8_t, c_ptr, &
                                         c_f_pointer
  implicit none
  integer, parameter :: logn = 7
  integer, parameter :: N = 2**logn
  real(c_float), intent(inout) :: x(0:N-1), y(0:N-1)
  type(c_ptr), intent(in) :: pTwiddle(2*(logn-2))
  integer(c_int8_t), intent(in) :: pBinv(0:N-1)

  ! Integer whose only set bit is the most significant one, i.e. the bit
  ! that overlays the sign of a c_float of the same width.
  integer(c_int), parameter :: sign_bit = ibset(0_c_int, bit_size(0_c_int)-1)
  ! XOR mask that negates lane 2 of a four-float vector and nothing else.
  integer(c_int), parameter :: signer(0:3) = &
    [0_c_int, 0_c_int, sign_bit, 0_c_int]

  ! Four-wide temporaries; each stands in for one vector register.
  real(c_float) :: m0(0:3), m1(0:3), m2(0:3), m3(0:3)
  real(c_float) :: m4(0:3), m5(0:3), m6(0:3)
  real(c_float), pointer :: pCosa(:), pSina(:)
  real(c_float) :: tmp
  integer :: level, half, blk, lo, hi
  integer :: i, j, k

  ! Stage 1: butterflies with twiddle factors, half-block size 64 down to 4.
  do level = logn, 3, -1
    half = 2**(level-1)
    call c_f_pointer(pTwiddle(2*(logn-level)+1), pCosa, [half])
    call c_f_pointer(pTwiddle(2*(logn-level)+2), pSina, [half])
    do blk = 0, N-1, 2*half
      do k = 0, half-1, 4
        lo = blk + k          ! first element in the lower half of the block
        hi = lo + half        ! matching element in the upper half
        m0 = x(lo:lo+3)
        m1 = x(hi:hi+3)
        m2 = y(lo:lo+3)
        m3 = y(hi:hi+3)
        ! Lower half receives the sums.
        m4 = m0+m1
        m5 = m2+m3
        x(lo:lo+3) = m4
        y(lo:lo+3) = m5
        ! Upper half receives (difference) * (cos - i*sin).
        m4 = m0-m1
        m5 = m2-m3
        m0 = pCosa(k+1:k+4)
        m1 = pSina(k+1:k+4)
        m2 = m0*m4
        m3 = m1*m5
        m6 = m2+m3
        x(hi:hi+3) = m6
        m2 = m0*m5
        m3 = m1*m4
        m6 = m2-m3
        y(hi:hi+3) = m6
      end do
    end do
  end do

  ! Stage 2: blocks of four. Elements 0,1 get sums; element 2 gets the plain
  ! difference; element 3 gets the difference times -i, which is a swap of
  ! components plus one sign change.
  do blk = 0, N-1, 4
    m0 = x(blk:blk+3)
    m1 = y(blk:blk+3)
    m2 = [m0(0),m1(0),m0(1),m1(1)]
    m3 = [m0(2),m1(2),m0(3),m1(3)]
    m4 = m2+m3
    m5 = m2-m3
    ! Equivalent to m5(2) = -m5(2), expressed as a whole-vector XOR of the
    ! sign bit so that it can become a single instruction.
    m5 = transfer(ieor(transfer(m5,signer),signer),m5)
    m0 = [m4(0),m4(2),m5(0),m5(3)]
    m1 = [m4(1),m4(3),m5(1),m5(2)]
    x(blk:blk+3) = m0
    y(blk:blk+3) = m1
  end do

  ! Stage 3: pairs. Each (even, odd) pair becomes (sum, difference).
  do blk = 0, N-1, 4
    m0 = x(blk:blk+3)
    m1 = y(blk:blk+3)
    m2 = [m0(0),m0(2),m1(0),m1(2)]
    m3 = [m0(1),m0(3),m1(1),m1(3)]
    m4 = m2+m3
    m5 = m2-m3
    m0 = [m4(0),m5(0),m4(1),m5(1)]
    m1 = [m4(2),m5(2),m4(3),m5(3)]
    x(blk:blk+3) = m0
    y(blk:blk+3) = m1
  end do

  ! Stage 4: apply the permutation. Swapping only when j > i ensures each
  ! pair is exchanged exactly once.
  do i = 0, N-1
    ! c_int8_t is signed; mask to recover the byte as an unsigned index.
    j = iand(int(pBinv(i)), 255)
    if (j > i) then
      tmp = x(i)
      x(i) = x(j)
      x(j) = tmp
      tmp = y(i)
      y(i) = y(j)
      y(j) = tmp
    end if
  end do
end subroutine fft128
And the hoped-for disassembly might look something like:
format MS64 coff
; File: fasm128a.asm
; Assembled with: fasm fasm128a.asm
; See test file fft128a.f90
;
; fft128 -- hand-written SSE counterpart of the Fortran fft128 routine:
; an in-place 128-point transform on two separate single-precision
; arrays x and y.
;
; Arguments as received in registers (Microsoft x64 convention, matching
; the MS64 COFF output format):
;   rcx = x         128 floats, updated in place
;   rdx = y         128 floats, updated in place
;   r8  = pTwiddle  table of pointers, consumed as (cos, sin) pairs,
;                   one pair per pass of point8_outer
;   r9  = pBinv     128 byte-sized indices used for the final reordering
;
; Every vector access uses movaps, which requires a 16-byte-aligned
; address, so x, y and all twiddle tables must be 16-byte aligned.
; rbx, rsi, rdi, rbp and r12 are saved and restored; rax, rcx, rdx,
; r8, r9, r10 and xmm0-xmm5 are overwritten.
section 'CODE' code readable executable align 16
align 16
public fft128
fft128:
; Save registers
push rbx
push rsi
push rdi
push rbp
push r12
; The five pushes lowered rsp by 40 bytes, so the return address is at
; [rsp+40] and [rsp+48]..[rsp+79] is the 32-byte home area the caller
; reserves for register arguments. It is used here as scratch storage.
mov [rsp+72], rcx ; x
mov [rsp+64], rdx ; y
; r8 stays live in its register for the whole first stage, so it is not
; spilled:
; mov [rsp+56], r8 ; pTwiddle
mov [rsp+48], r9 ; pBinv
; r10 = half-block size in bytes (256 bytes = 64 floats on the first pass)
mov r10, 256
; r12 = number of blocks processed per pass
mov r12, 1
; ---- Stage 1: butterflies with twiddle factors ----
; One pass per level; at the bottom of the loop r10 is halved and r12 is
; doubled.
point8_outer:
mov rsi, [r8] ; cos array
mov rdi, [r8+8] ; sin array
; Bias the table pointers by r10 so they can be indexed with the negative
; counter rbp, which runs from -r10 up to 0.
add rsi, r10
add rdi, r10
; rax/rdx -> end of the lower half of the first block of x/y,
; rcx/rbx -> end of the upper half of the first block of x/y.
mov rax, [rsp+72] ; x
lea rcx, [rax+2*r10]
add rax, r10
mov rdx, [rsp+64] ; y
lea rbx, [rdx+2*r10]
add rdx, r10
; r9 = blocks still to do in this pass
mov r9, r12
point8_middle:
; rbp = -r10 (xor on ebp also clears the upper half of rbp)
xor ebp, ebp
sub rbp, r10
; Here i indexes the lower half of the block and j the upper half.
point8_inner:
movaps xmm0, [rax+rbp] ; x(i:i+3)
movaps xmm1, [rcx+rbp] ; x(j:j+3)
movaps xmm2, [rdx+rbp] ; y(i:i+3)
movaps xmm3, [rbx+rbp] ; y(j:j+3)
movaps xmm4, xmm0 ; x(i:i+3)
addps xmm4, xmm1 ; x(i:i+3)+x(j:j+3)
movaps [rax+rbp], xmm4
movaps xmm4, xmm2 ; y(i:i+3)
addps xmm4, xmm3 ; y(i:i+3)+y(j:j+3)
movaps [rdx+rbp], xmm4
subps xmm0, xmm1 ; x(i:i+3)-x(j:j+3)
subps xmm2, xmm3 ; y(i:i+3)-y(j:j+3)
movaps xmm1, [rsi+rbp] ; cos(theta)
movaps xmm3, [rdi+rbp] ; sin(theta)
movaps xmm4, xmm0 ; x(i:i+3)-x(j:j+3)
mulps xmm4, xmm1 ; cos(theta)*(x(i:i+3)-x(j:j+3))
mulps xmm0, xmm3 ; sin(theta)*(x(i:i+3)-x(j:j+3))
mulps xmm3, xmm2 ; sin(theta)*(y(i:i+3)-y(j:j+3))
mulps xmm2, xmm1 ; cos(theta)*(y(i:i+3)-y(j:j+3))
addps xmm4, xmm3 ; cos(theta)*(x(i:i+3)-x(j:j+3))
; +sin(theta)*(y(i:i+3)-y(j:j+3))
movaps [rcx+rbp], xmm4
subps xmm2, xmm0 ; cos(theta)*(y(i:i+3)-y(j:j+3))
; -sin(theta)*(x(i:i+3)-x(j:j+3))
movaps [rbx+rbp], xmm2
; Advance four floats; keep looping while the index is still negative.
add rbp, 16
js point8_inner
; Step all four data pointers to the next block (2*r10 bytes further on).
lea rax, [rax+2*r10]
lea rcx, [rcx+2*r10]
lea rdx, [rdx+2*r10]
lea rbx, [rbx+2*r10]
; Repeat until the block counter reaches zero.
sub r9, 1
ja point8_middle
; Next level: next (cos, sin) pointer pair, half the block size, twice as
; many blocks. The last pass runs with r10 = 16 bytes (four floats).
add r8, 16
shr r10, 1
shl r12, 1
cmp r10, 8
ja point8_outer
; ---- Stage 2: blocks of four ----
; Build xmm5 = sign-bit mask in lane 2 only: movd places 80000000h in
; lane 0 and zeroes the rest, the 8-byte shift moves it up two lanes.
mov eax, 80000000h
movd xmm5, eax
pslldq xmm5, 8
; rax/rdx -> one past the end of x/y (512 bytes = 128 floats); rbp is a
; negative byte index that counts up to zero.
mov rax, [rsp+72] ; x
mov rdx, [rsp+64] ; y
mov rbp, -512
sub rax, rbp
sub rdx, rbp
point4:
movaps xmm0, [rax+rbp]
movaps xmm1, [rdx+rbp]
; xmm2 = [x0,y0,x1,y1], xmm0 = [x2,y2,x3,y3]
movaps xmm2, xmm0
unpcklps xmm2, xmm1
unpckhps xmm0, xmm1
; xmm4 = sums, xmm2 = differences, then negate lane 2 of the differences
movaps xmm4, xmm2
addps xmm4, xmm0
subps xmm2, xmm0
xorps xmm2, xmm5
; new x = [sum0, sum2, diff0, diff3]; new y = [sum1, sum3, diff1, diff2]
movaps xmm0, xmm4
shufps xmm0, xmm2, 0c8h ; 11001000
shufps xmm4, xmm2, 9dh ; 10011101
movaps [rax+rbp], xmm0
movaps [rdx+rbp], xmm4
add rbp, 16
js point4
; ---- Stage 3: pairs ----
; rax/rdx still point one past the end of x/y; only the index is reset.
mov rbp, -512
point2:
movaps xmm0, [rax+rbp]
movaps xmm1, [rdx+rbp]
; xmm2 = [x0,x2,y0,y2] (even elements), xmm0 = [x1,x3,y1,y3] (odd elements)
movaps xmm2, xmm0
shufps xmm2, xmm1, 88h ; 10001000
shufps xmm0, xmm1, 0ddh ; 11011101
; xmm4 = sums, xmm2 = differences
movaps xmm4, xmm2
addps xmm4, xmm0
subps xmm2, xmm0
; Interleave sums and differences: low halves go back to x, high halves
; go back to y.
movaps xmm0, xmm4
unpcklps xmm4, xmm2
unpckhps xmm0, xmm2
movaps [rax+rbp], xmm4
movaps [rdx+rbp], xmm0
add rbp, 16
js point2
; ---- Stage 4: reorder through the pBinv table ----
mov rcx, [rsp+48] ; pBinv
mov rax, [rsp+72] ; x
mov rdx, [rsp+64] ; y
; rbp = element index i, starting at 0
xor ebp, ebp
bitrev:
; rbx = pBinv[i], zero-extended, i.e. read as an unsigned byte
movzx ebx, byte[rcx+rbp]
; Swap only when i < pBinv[i], so each pair is exchanged exactly once.
cmp ebp, ebx
jnb bitskip
; Swap x[i] with x[j] and y[i] with y[j] as raw 32-bit words.
mov esi, [rax+4*rbp]
mov edi, [rax+4*rbx]
mov [rax+4*rbx], esi
mov [rax+4*rbp], edi
mov esi, [rdx+4*rbp]
mov edi, [rdx+4*rbx]
mov [rdx+4*rbx], esi
mov [rdx+4*rbp], edi
bitskip:
inc ebp
cmp ebp, 128
jb bitrev
; Restore registers
; (the cleanup label is not referenced anywhere in this listing)
cleanup:
pop r12
pop rbp
pop rdi
pop rsi
pop rbx
ret
align 16
public _rdtsc
; _rdtsc -- return the processor time-stamp counter as one 64-bit value
; in rax. Takes no arguments; overwrites rax and rdx.
_rdtsc:
; rdtsc delivers the counter split across edx (high 32 bits) and eax
; (low 32 bits); in 64-bit mode the upper halves of rax and rdx are cleared.
rdtsc
; Move the high half into bits 63..32 and merge it with the low half.
shl rdx, 32
or rax, rdx
ret
Just a brief glance at the disassembly should suffice to determine
whether the compiler is getting the idea.