nbody bench take two

MelissA

unread,

Apr 12, 2012, 3:32:28 AM4/12/12

to

I have posted first in alt.lang.asm , but I'll try here
too.
Program is translated from this Java program:
http://shootout.alioth.debian.org/u64q/program.php?test=nbody&lang=java&id=2

First assembler version ran for 35 seconds while
this one, optimized runs for 23 seconds on q6600 @ 2.4GHz

I have vectorized calculations , used sse3 changed registers,
used movapd instead of movsd for register to register sse move, tried
everything but can't get any faster.
I have checked gcc's output it does not use packed SSE2 instructions,
rather single but is faster.
Intel Fortran program has best optimization it executes at 15seconds.

Program follows:

format ELF64

SIZEOFBODY equ 64
struc body {
.x dq ?
.y dq ?
.z dq ?
.filler dq ?
.vx dq ?
.vy dq ?
.vz dq ?
.mass dq ?
}

macro init_body b, x,y,z,vx,vy,vz,mass{
mov rax,x
mov [b#.x],rax
mov rax,y
mov [b#.y],rax
mov rax,z
mov [b#.z],rax

mov rax, vx
movq xmm0,rax
mulsd xmm0,[DAYS_PER_YEAR]
movsd [b#.vx],xmm0

mov rax,vy
movq xmm0,rax
mulsd xmm0,[DAYS_PER_YEAR]
movsd [b#.vy],xmm0

mov rax,vz
movq xmm0,rax
mulsd xmm0,[DAYS_PER_YEAR]
movsd [b#.vz],xmm0

mov rax,mass
movq xmm0,rax
mulsd xmm0,[SOLAR_MASS]
movsd [b#.mass],xmm0
}

virtual at 0
oBody body
end virtual

; this macro needs to be optimized
; rest of program does not executes in a loop
macro advance
{
; xmm15 holds dt
local .L0,.L1,.L2,.L3
mov ecx,5 ; ecx - > i
mov rbx,sun
.L0:
dec ecx
jz .L2
mov r9, rcx ; r9 -> j
lea rdx, [rbx+SIZEOFBODY]
.L1:

movapd xmm0,dqword[rbx + oBody.x]
movsd xmm2,[rbx + oBody.z]
movapd xmm1,dqword[rdx + oBody.x]

subpd xmm0,xmm1 ; dx,dy -> xmm0
subsd xmm2,[rdx + oBody.z] ; dz -> xmm2

movapd xmm3,xmm0
movapd xmm5,xmm2

mulpd xmm3,xmm3
mulsd xmm5,xmm5

haddpd xmm3,xmm3
addsd xmm3,xmm5 ; dsquared -> xmm3

sqrtsd xmm4, xmm3 ; distance -> xmm4

mulsd xmm3,xmm4
movapd xmm5, xmm15
; most slowest instruction takes almost 50% of execution time
divsd xmm5,xmm3 ; mag -> xmm5

;-----------------------------------------------
movsd xmm6, [rdx + oBody.mass]
mulsd xmm6, xmm5 ; precompute bodies[j].mass * mag
movddup xmm6,xmm6

movapd xmm3, dqword[rbx + oBody.vx]
movapd xmm8, xmm0
mulpd xmm8, xmm6
subpd xmm3,xmm8
movapd dqword[rbx + oBody.vx],xmm3
; iBody.vx -= dx * bodies[j].mass * mag;

movsd xmm3, [rbx + oBody.vz]
movapd xmm9, xmm2
mulsd xmm9, xmm6
subsd xmm3,xmm9
movsd [rbx + oBody.vz],xmm3
; ----------------------------------------------
movsd xmm6, [rbx + oBody.mass]
mulsd xmm6, xmm5 ; precompute iBody.mass * mag
movddup xmm6,xmm6

movapd xmm3, dqword[rdx + oBody.vx]
mulpd xmm0, xmm6
addpd xmm3, xmm0
movapd dqword[rdx + oBody.vx], xmm3
; bodies[j].vx += dx * iBody.mass * mag;

movsd xmm3, [rdx + oBody.vz]
mulsd xmm2, xmm6
addsd xmm3, xmm2
movsd [rdx + oBody.vz], xmm3
;-----------------------------------------
add rdx,SIZEOFBODY
dec r9
jnz .L1
add rbx,SIZEOFBODY
jmp .L0
.L2:
mov rbx,sun
mov ecx,5
.L3:
movapd xmm0, dqword[rbx + oBody.x]
movsd xmm2, [rbx + oBody.z]

movapd xmm3 , xmm15
movddup xmm3,xmm3
mulpd xmm3, dqword[rbx + oBody.vx]
addpd xmm0, xmm3
movapd dqword[rbx + oBody.x], xmm0

movapd xmm3 , xmm15
mulsd xmm3, [rbx + oBody.vz]
addsd xmm2, xmm3
movsd [rbx + oBody.z], xmm2

add rbx,SIZEOFBODY
dec ecx
jnz .L3

}

section '.text' executable align 16
extrn printf
extrn atoi
public main

main:
mov qword[n],1
; rdi - > argc , rsi -> argv
cmp rdi,2
jl .begin
mov rdi,qword[rsi+8] ; argv[1] -> rdi
call atoi
mov qword[n],rax

mov eax,0
mov rdi, argv
mov rsi,[n]
sub rsp,8
call printf
add rsp,8
.begin:
sub rsp,8
mov eax,2
mov rdi,message

; init solar mass
movsd xmm0, qword[PI]
movsd xmm1,xmm0
mulsd xmm0,qword[SOLAR_MASS]
mulsd xmm0,xmm1
movsd [SOLAR_MASS],xmm0
call printf

; init bodies
init_body sun,0f,0f,0f,0f,0f,0f,1f

init_body jupiter,4.84143144246472090e+00, \
-1.16032004402742839e+00,\
-1.03622044471123109e-01,\
1.66007664274403694e-03, \
7.69901118419740425e-03, \
-6.90460016972063023e-05,\
9.54791938424326609e-04;
mov rbx,jupiter
call print_body

init_body saturn,8.34336671824457987e+00, \
4.12479856412430479e+00, \
-4.03523417114321381e-01,\
-2.76742510726862411e-03,\
4.99852801234917238e-03, \
2.30417297573763929e-05, \
2.85885980666130812e-04;
mov rbx,saturn
call print_body

init_body uranus,1.28943695621391310e+01, \
-1.51111514016986312e+01,\
-2.23307578892655734e-01,\
2.96460137564761618e-03, \
2.37847173959480950e-03, \
-2.96589568540237556e-05,\
4.36624404335156298e-05
mov rbx,uranus
call print_body

init_body neptune,1.53796971148509165e+01, \
-2.59193146099879641e+01,\
1.79258772950371181e-01, \
2.68067772490389322e-03, \
1.62824170038242295e-03, \
-9.51592254519715870e-05,\
5.15138902046611451e-05;
mov rbx,neptune
call print_body

pxor xmm0,xmm0
pxor xmm1,xmm1
pxor xmm2,xmm2

virtual at rbx
.oBody body
end virtual

mov rbx,sun
mov ecx,5
; init
; ----------------------------------
.L0:
movsd xmm3, [.oBody.vx]
mulsd xmm3, [.oBody.mass]
addsd xmm0, xmm3

movsd xmm3, [.oBody.vy]
mulsd xmm3, [.oBody.mass]
addsd xmm1, xmm3

movsd xmm3, [.oBody.vz]
mulsd xmm3, [.oBody.mass]
addsd xmm2, xmm3

add rbx, SIZEOFBODY ;
dec ecx
jnz .L0

mov rbx,sun
call offset_momentum
call print_body
; ----------------------------------------
call energy
call print_energy

mov r8, [n]
mov rax,0.01
movq xmm15,rax
.L1:
advance
dec r8
jnz .L1

call energy
call print_energy

add rsp,8
xor eax,eax
ret

; px xmm0 , py xmm1 , pz xmm2
offset_momentum:
virtual at rbx
.oBody body
end virtual

mov rax,0x8000000000000000
movq xmm3, rax

xorpd xmm0,xmm3
xorpd xmm1,xmm3
xorpd xmm2,xmm3
divsd xmm0,[SOLAR_MASS]
divsd xmm1,[SOLAR_MASS]
divsd xmm2,[SOLAR_MASS]
movsd [.oBody.vx],xmm0
movsd [.oBody.vy],xmm1
movsd [.oBody.vz],xmm2
ret

print_body:
virtual at rbx
.oBody body
end virtual
sub rsp,8
mov eax,7
mov rdi,bmsg
movq xmm0,[.oBody.x]
movq xmm1,[.oBody.y]
movq xmm2,[.oBody.z]
movq xmm3,[.oBody.vx]
movq xmm4,[.oBody.vy]
movq xmm5,[.oBody.vz]
movq xmm6,[.oBody.mass]
call printf
add rsp,8
ret
; xmm0 resulting energy
energy:
virtual at rbx
.iBody body
end virtual
virtual at rdx
.jBody body
end virtual
mov rbx, sun
mov ecx, 5
mov rax,0.0
movq xmm0, rax
mov rax,0.5
.L0:

movsd xmm1, [.iBody.vx]
mulsd xmm1,xmm1

movsd xmm2, [.iBody.vy]
mulsd xmm2,xmm2

movsd xmm3, [.iBody.vz]
mulsd xmm3,xmm3

addsd xmm1, xmm2
addsd xmm1, xmm3

mulsd xmm1, [.iBody.mass]

movq xmm2, rax
mulsd xmm2, xmm1

addsd xmm0, xmm2

dec ecx
jz .L2

lea rdx, [rbx+SIZEOFBODY]
push rcx
.L1:
movsd xmm1, [.iBody.x]
subsd xmm1, [.jBody.x]

movsd xmm2, [.iBody.y]
subsd xmm2, [.jBody.y]

movsd xmm3, [.iBody.z]
subsd xmm3, [.jBody.z]

mulsd xmm1,xmm1
mulsd xmm2,xmm2
mulsd xmm3,xmm3

addsd xmm1, xmm2
addsd xmm1, xmm3

sqrtsd xmm1,xmm1

movsd xmm2, [.iBody.mass]
mulsd xmm2, [.jBody.mass]
divsd xmm2, xmm1

subsd xmm0, xmm2
add rdx, SIZEOFBODY
dec ecx
jnz .L1

add rbx, SIZEOFBODY
pop rcx
jmp .L0
.L2:
ret

print_energy:
sub rsp,8
mov eax,1
mov rdi, msg
call printf
add rsp, 8
ret

section '.data' writeable align 16

message db 'Hello World %2.9f %2.9f !',0xa,0
bmsg db 'x: %.9f',0xa,'y: %.9f',0xa,'z: %.9f',0xa, \
'vx: %.9f',0xa,'vy: %.9f',0xa,'vz: %.9f',0xa, \
'mass: %.9f',0xa,0xa,0
msg db '%.9f',0xa,0
argv db 'argv : %d',0xa,0
align 8
PI dq 3.141592653589793
SOLAR_MASS dq 4.0
DAYS_PER_YEAR dq 365.24

section '.bss' writeable align 16

sun body
jupiter body
saturn body
uranus body
neptune body

n rq 1

MelissA

unread,

Apr 12, 2012, 9:05:55 PM4/12/12

to

I finally got it ;)
Trick is to separate loop in three parts so
that calculation of magnitude can be vectorized.
I followed Fortran program this time and got
16 seconds on q6...@2.4GHz.

So number one program is now 2 seconds ahead ...

Program follows:

format ELF64

SIZEOFBODY equ 64

SIZEOFDIFF equ 24
struc diff{
.dx dq ?
.dy dq ?
.dz dq ?

virtual at 0
r diff
end virtual

macro advance
{
; xmm15 holds dt

local .L0,.L1,.L2,.L3,.L4,.L5
mov ecx,4 ; ecx - > i
mov rsi,rr
mov rbx,sun
.L0:
mov r9d, ecx ; r9 -> j

lea rdx, [rbx + SIZEOFBODY]
.L1:

movapd xmm0,dqword[rbx + oBody.x]

movsd xmm1,[rbx + oBody.z]

subpd xmm0, dqword[rdx + oBody.x]; dx,dy -> xmm0
subsd xmm1,[rdx + oBody.z] ; dz -> xmm2

movupd dqword[rsi+r.dx],xmm0
movsd [rsi+r.dz],xmm1

add rsi, SIZEOFDIFF
add rdx, SIZEOFBODY
dec r9d

jnz .L1
add rbx, SIZEOFBODY

dec ecx
jnz .L0
;-----------------------------------
mov ecx,5
mov rsi,rr
mov rdi,mag
.L2:
movlpd xmm3,[rsi+r.dx]
movlpd xmm4,[rsi+r.dy]
movlpd xmm5,[rsi+r.dz]

add rsi,SIZEOFDIFF

movhpd xmm3,[rsi+r.dx]
movhpd xmm4,[rsi+r.dy]
movhpd xmm5,[rsi+r.dz]

mulpd xmm3,xmm3
mulpd xmm4,xmm4
mulpd xmm5,xmm5

addpd xmm3,xmm4
addpd xmm3,xmm5 ; dsquared -> xmm3

sqrtpd xmm4, xmm3 ; distance -> xmm4

mulpd xmm3,xmm4
movapd xmm5, xmm15
movddup xmm5,xmm5

; most slowest instruction takes almost 50% of execution time

divpd xmm5,xmm3 ; mag -> xmm5
movupd dqword[rdi],xmm5
add rdi,16
add rsi,SIZEOFDIFF
dec ecx
jnz .L2
;-----------------------------------------------
mov ecx,4
mov rbx,sun
mov rsi,rr
mov rdi,mag
.L3:
mov r9d, ecx
lea rdx, [rbx+SIZEOFBODY]
.L4:

movsd xmm6, [rdx + oBody.mass]

mulsd xmm6, [rdi] ; precompute bodies[j].mass * mag
movddup xmm6,xmm6

movupd xmm10,dqword[rsi+r.dx]
movsd xmm11,[rsi+r.dz]

movapd xmm3, dqword[rbx + oBody.vx]

movapd xmm8, xmm10

mulpd xmm8, xmm6
subpd xmm3,xmm8
movapd dqword[rbx + oBody.vx],xmm3
; iBody.vx -= dx * bodies[j].mass * mag;

movsd xmm3, [rbx + oBody.vz]

movapd xmm9, xmm11

mulsd xmm9, xmm6
subsd xmm3,xmm9
movsd [rbx + oBody.vz],xmm3
; ----------------------------------------------
movsd xmm6, [rbx + oBody.mass]

mulsd xmm6, [rdi] ; precompute iBody.mass * mag

movddup xmm6,xmm6

movapd xmm3, dqword[rdx + oBody.vx]

movapd xmm0, xmm10

mulpd xmm0, xmm6
addpd xmm3, xmm0
movapd dqword[rdx + oBody.vx], xmm3
; bodies[j].vx += dx * iBody.mass * mag;

movsd xmm3, [rdx + oBody.vz]

movapd xmm2, xmm11

mulsd xmm2, xmm6
addsd xmm3, xmm2
movsd [rdx + oBody.vz], xmm3
;-----------------------------------------
add rdx,SIZEOFBODY

add rsi,SIZEOFDIFF
add rdi,8
dec r9d
jnz .L4

add rbx,SIZEOFBODY
dec ecx
jnz .L3

mov rbx,sun
mov ecx,5
.L5:

movapd xmm0, dqword[rbx + oBody.x]
movsd xmm2, [rbx + oBody.z]

movapd xmm3 , xmm15
movddup xmm3,xmm3
mulpd xmm3, dqword[rbx + oBody.vx]
addpd xmm0, xmm3
movapd dqword[rbx + oBody.x], xmm0

movapd xmm3 , xmm15
mulsd xmm3, [rbx + oBody.vz]
addsd xmm2, xmm3
movsd [rbx + oBody.z], xmm2

add rbx,SIZEOFBODY
dec ecx

jnz .L5

rr rq 30
mag rq 10

n rq 1

Alex Mizrahi

unread,

Apr 13, 2012, 3:46:43 AM4/13/12

to

> I finally got it ;)
> Trick is to separate loop in three parts so
> that calculation of magnitude can be vectorized.
> I followed Fortran program this time and got
> 16 seconds on q6...@2.4GHz.

> So number one program is now 2 seconds ahead ...

I have an idea how to parallelize it, i.e. run on two or more cores.
It looks like none of shootout programs can do it. It's not possible
with high-level synchronization primitives, but with busy wait it
_might_ work.

I've measured that it takes about 100 cycles for data to propagate from
one core to another, so if you have 800 cycles total you can split it
into 400 in each thread and 100 for synchronization for 1.6 speedup. (I
don't remember concrete numbers, but I once calculated them at least
approximately.)

Splitting work among two cores is fairly complex. We have 10 interactions:

(0,1), (0,2), (0, 3), (0, 4)
(1,2), (1, 3), (1, 4)
(2, 3), (2, 4)
(3, 4)

So, say, thread 1 will do interactions with body 0 while thread 2 will
do the rest. Then they share information with each other and compute
missing values.
I don't remember a concrete scheme I came up with, it was fairly complex
and was rotating (i.e. threads do different things on even and odd
iterations), but it was kinda plausible.

For four threads it's easier (if we aim at 2x speedup rather than 4x
speedup): thread 1 computes all interactions for body 1, thread 2
computes all interactions for body 2 and so on. Then they share
resulting coordinates with each other. Additionally they share
interactions of their body with body 0, and thread 1 adds them all to
compute coordinates of body 0 and share it.

So, each thread computes only 4 out of 10 interactions and computes
coordinates/velocity only for one body. So we could get speedup slightly
higher than 2x. But thread 1 does more work, and there might be some
synchronization overhead, so it might around 2x, maybe somewhat less.

But I would be very surprised if there won't be any speedup for 4 thread
implementation.

Bernhard Schornak

unread,

Apr 13, 2012, 6:00:23 AM4/13/12

to

mov rax, x
mov rbx, y
mov rcx, z
mov [b#.x], rax
mov [b#.y], rbx
mov [b#.z], rcx

Preloading three registers is faster than 'shuffling' all
data through one and the same register. Your version exe-
cutes in 24 clocks, my replacement needs only 7 clocks to
perform the same task. One cycle is lost after preloading
those parameters, 'mov reg, mem' takes 4 clocks. Might be
worth a thought to preload another parameter required for
later calculations while the processor is waiting for in-
coming data (the 4th execution pipe is idle, anyway).

> mov rax, vx
> movq xmm0,rax
> mulsd xmm0,[DAYS_PER_YEAR]
> movsd [b#.vx],xmm0
>
> mov rax,vy
> movq xmm0,rax
> mulsd xmm0,[DAYS_PER_YEAR]
> movsd [b#.vy],xmm0
>
> mov rax,vz
> movq xmm0,rax
> mulsd xmm0,[DAYS_PER_YEAR]
> movsd [b#.vz],xmm0
>
> mov rax,mass
> movq xmm0,rax
> mulsd xmm0,[SOLAR_MASS]
> movsd [b#.mass],xmm0

movddup xmm3, [DAYS_PER_YEAR] ; 4
movsd xmm4, [SOLAR_MASS] ; 4
movdqa xmm0, vx ; 4
movsd xmm1, vz ; 4
movsd xmm2, mass ; 4
mulpd xmm0, xmm3 ; 6
mulsd xmm1, xmm3 ; 6
mulsd xmm2, xmm4 ; 6
movdqa [b#.vx], xmm0 ; 4
movsd [b#.vz], xmm1 ; 4
movsd [b#.mass], xmm2 ; 4

> }

The same in short. Saves 5 lines and gains some speed. ;)

This

mov GPR, [mem]
movd XMM, REG

needs 12 clocks to execute, while

movq XMM, [mem]

does the same thing in 4 clocks...

See AMD 'Software Optimisation Guide' (PDF # 47414), page
171...174. SQRTSD/SQRTPD = 38, DIVSD/DIVPD = 27 clocks on
a FX-8150.

<snip>

Preloading all required parameters from memory into unused
(11 GP, 7 XMM) registers saves some clocks. Choose another
instruction order to avoid dependencies - a coarse example
how to resolve them (partially...):

movq xmm0, dqword[rbx + oBody.vx] ; 4
movq xmm1, [rbx + oBody.vz] ; 4
movddup xmm3, xmm15 ; 2
movddup xmm4, xmm15 ; 2
mulpd xmm3, xmm0 ; 6
mulsd xmm4, xmm1 ; 6
addpd xmm0, xmm3 ; 2
addsd xmm2, xmm4 ; 2
movapd dqword[rbx + oBody.x], xmm0 ; 4
movsd [rbx + oBody.z], xmm2 ; 4

This still isn't truly optimised, but it shows how to make
your code slightly faster. Inserting some more -unrelated-
instructions between loading and using a register might be
required to 'hide' latencies.

Greetings from Augsburg

Bernhard Schornak

MelissA

unread,

Apr 13, 2012, 8:39:23 AM4/13/12

to

On Fri, 13 Apr 2012 12:00:23 +0200
Bernhard Schornak <scho...@nospicedham.web.de> wrote:

>
> Preloading all required parameters from memory into unused
> (11 GP, 7 XMM) registers saves some clocks. Choose another
> instruction order to avoid dependencies - a coarse example
> how to resolve them (partially...):
>
>
> movq xmm0, dqword[rbx + oBody.vx] ; 4
> movq xmm1, [rbx + oBody.vz] ; 4
> movddup xmm3, xmm15 ; 2
> movddup xmm4, xmm15 ; 2
> mulpd xmm3, xmm0 ; 6
> mulsd xmm4, xmm1 ; 6
> addpd xmm0, xmm3 ; 2
> addsd xmm2, xmm4 ; 2
> movapd dqword[rbx + oBody.x], xmm0 ; 4
> movsd [rbx + oBody.z], xmm2 ; 4
>
>
> This still isn't truly optimised, but it shows how to make
> your code slightly faster. Inserting some more -unrelated-
> instructions between loading and using a register might be
> required to 'hide' latencies.
>
>

Thanks Bernard for all suggestions, I have rearranged macro advance and
now program executes at 14 seconds same as number one Fortran ;)
Didn;t change macro init_body as it does not influence execution
speed noticeably, but thanks for all suggestions.

Program follows:

format ELF64

SIZEOFBODY equ 64

SIZEOFDIFF equ 32

struc diff{
.dx dq ?
.dy dq ?

.dz dq ?

.filler dq ?
}
struc body {
.x dq ?
.y dq ?
.z dq ?
.filler dq ?
.vx dq ?
.vy dq ?
.vz dq ?
.mass dq ?
}

macro init_body b, x,y,z,vx,vy,vz,mass{
mov rax,x
mov [b#.x],rax
mov rax,y
mov [b#.y],rax
mov rax,z
mov [b#.z],rax

mov rax, vx
movq xmm0,rax
mulsd xmm0,[DAYS_PER_YEAR]
movsd [b#.vx],xmm0

mov rax,vy
movq xmm0,rax
mulsd xmm0,[DAYS_PER_YEAR]
movsd [b#.vy],xmm0

mov rax,vz
movq xmm0,rax
mulsd xmm0,[DAYS_PER_YEAR]
movsd [b#.vz],xmm0

mov rax,mass
movq xmm0,rax
mulsd xmm0,[SOLAR_MASS]
movsd [b#.mass],xmm0
}

virtual at 0
oBody body
end virtual

virtual at 0
r diff
end virtual

macro advance
{
; xmm15 holds dt

local .L0,.L1,.L2,.L3,.L4,.L5
mov ecx,4 ; ecx - > i
mov rsi,rr
mov rbx,sun
.L0:

mov r9d, ecx ; r9 -> j

lea rdx, [rbx + SIZEOFBODY]
.L1:

movapd xmm0,dqword[rbx + oBody.x]

movsd xmm1,[rbx + oBody.z]

subpd xmm0, dqword[rdx + oBody.x]; dx,dy -> xmm0

subsd xmm1,[rdx + oBody.z] ; dz -> xmm2

movapd dqword[rsi+r.dx],xmm0

movsd [rsi+r.dz],xmm1

add rsi, SIZEOFDIFF
add rdx, SIZEOFBODY
dec r9d

jnz .L1
add rbx, SIZEOFBODY

dec ecx
jnz .L0
;-----------------------------------
mov ecx,5
mov rsi,rr
mov rdi,mag
.L2:

movlpd xmm3,[rsi+r.dx]
movlpd xmm4,[rsi+r.dy]
movlpd xmm5,[rsi+r.dz]

movhpd xmm3,[rsi+r.dx+SIZEOFDIFF]
movhpd xmm4,[rsi+r.dy+SIZEOFDIFF]
movhpd xmm5,[rsi+r.dz+SIZEOFDIFF]

movddup xmm6,xmm15

mulpd xmm3,xmm3
mulpd xmm4,xmm4
mulpd xmm5,xmm5

addpd xmm3,xmm4
addpd xmm3,xmm5 ; dsquared -> xmm3

sqrtpd xmm4, xmm3 ; distance -> xmm4

mulpd xmm3,xmm4
divpd xmm6,xmm3 ; mag -> xmm5
movapd dqword[rdi],xmm6

add rdi,16
add rsi,2*SIZEOFDIFF

dec ecx
jnz .L2
;-----------------------------------------------
mov ecx,4
mov rbx,sun
mov rsi,rr
mov rdi,mag
.L3:
mov r9d, ecx
lea rdx, [rbx+SIZEOFBODY]
.L4:

movsd xmm6, [rdx + oBody.mass]

mulsd xmm6, [rdi] ; precompute bodies[j].mass * mag
movddup xmm6,xmm6

movapd xmm10,dqword[rsi+r.dx]
movsd xmm11,[rsi+r.dz]

movapd xmm3, dqword[rbx + oBody.vx]

movsd xmm4, [rbx + oBody.vz]

movapd xmm8, xmm10
movapd xmm9, xmm11
mulpd xmm8, xmm6
mulsd xmm9, xmm6
subpd xmm3,xmm8
subsd xmm4,xmm9

movapd dqword[rbx + oBody.vx],xmm3
; iBody.vx -= dx * bodies[j].mass * mag;

movsd [rbx + oBody.vz],xmm4
; ----------------------------------------------
movsd xmm7, [rbx + oBody.mass]
mulsd xmm7, [rdi] ; precompute iBody.mass * mag
movddup xmm7,xmm7

movapd xmm3, dqword[rdx + oBody.vx]

movsd xmm4, [rdx + oBody.vz]

movapd xmm0, xmm10
movapd xmm2, xmm11
mulpd xmm0, xmm7
mulsd xmm2, xmm7
addpd xmm3, xmm0
addsd xmm4, xmm2

movapd dqword[rdx + oBody.vx], xmm3
; bodies[j].vx += dx * iBody.mass * mag;

movsd [rdx + oBody.vz], xmm4

;-----------------------------------------
add rdx,SIZEOFBODY
add rsi,SIZEOFDIFF
add rdi,8
dec r9d
jnz .L4

add rbx,SIZEOFBODY
dec ecx
jnz .L3

mov rbx,sun
mov ecx,5
.L5:

movapd xmm0, dqword[rbx + oBody.x]

movsd xmm1, [rbx + oBody.z]

movddup xmm2 , xmm15
movapd xmm3, xmm15

mulpd xmm2,dqword[rbx + oBody.vx]

mulsd xmm3, [rbx + oBody.vz]

addpd xmm0, xmm2
addsd xmm1, xmm3

movapd dqword[rbx + oBody.x], xmm0

movsd [rbx + oBody.z], xmm1

add rbx,SIZEOFBODY
dec ecx
jnz .L5

rr rq 40

MelissA

unread,

Apr 13, 2012, 8:43:14 AM4/13/12

to

On Fri, 13 Apr 2012 10:46:43 +0300
Alex Mizrahi <alex.m...@nospicedham.gmail.com> wrote:

> > I finally got it ;)
> > Trick is to separate loop in three parts so
> > that calculation of magnitude can be vectorized.
> > I followed Fortran program this time and got
> > 16 seconds on q6...@2.4GHz.
>
> > So number one program is now 2 seconds ahead ...
>
> I have an idea how to parallelize it, i.e. run on two or more cores.
> It looks like none of shootout programs can do it. It's not possible
> with high-level synchronization primitives, but with busy wait it
> _might_ work.
>
> I've measured that it takes about 100 cycles for data to propagate
> from one core to another, so if you have 800 cycles total you can
> split it into 400 in each thread and 100 for synchronization for 1.6
> speedup. (I don't remember concrete numbers, but I once calculated
> them at least approximately.)

Thanks for suggestion but I think parallelizing and threads would be
very difficult task for me ;)
Anyway program now executes at 14 secs (Fortran program speed) so
if you wish you can further workout how your idea can be
implemented in assembly on Linux.

Nathan

unread,

Apr 13, 2012, 10:22:54 AM4/13/12

to

On Apr 13, 8:39 am, MelissA <me...@nospicedham.a.com> wrote:
>
> Program follows:
>

<snip, snip>

You still have the 'printf' library call in there -- it contributes to
the duration of your program. Hidden inside it is a system call
(probably 'write'). You definitely want to avoid multiple system
calls because they include the costly 'context switch'... the
transition from USER mode to KERNEL mode and then another transition
back to USER mode when the system function is finished.

I noticed that the candidates for the other languages do not display
results to the screen. If you intend a "fair" and "honest"
comparison, then you should eliminate the 'results display' from you
ASM version.

Nathan.
--
http://clax.inspiretomorrow.net

Thomas Boell

unread,

Apr 14, 2012, 11:06:56 AM4/14/12

to

On Fri, 13 Apr 2012 14:39:23 +0200
MelissA <me...@nospicedham.a.com> wrote:

> Thanks Bernard for all suggestions, I have rearranged macro advance and
> now program executes at 14 seconds same as number one Fortran ;)
> Didn;t change macro init_body as it does not influence execution
> speed noticeably, but thanks for all suggestions.

That sounds about right for a well-optimized SIMD assembly program. :)
No wonder they are "not interested" in an assembly implementation.

Thomas Boell

unread,

Apr 14, 2012, 11:11:25 AM4/14/12

to

On Fri, 13 Apr 2012 07:22:54 -0700 (PDT)
Nathan <nathan...@nospicedham.gmail.com> wrote:

> On Apr 13, 8:39 am, MelissA <me...@nospicedham.a.com> wrote:
> >
> > Program follows:
> >
>
> <snip, snip>
>
> You still have the 'printf' library call in there -- it contributes to
> the duration of your program.

Not really. printf is not called in the main loop. The time it takes is
negligible compared to the many seconds spent in the main loop.

>Hidden inside it is a system call
> (probably 'write'). You definitely want to avoid multiple system
> calls because they include the costly 'context switch'... the
> transition from USER mode to KERNEL mode and then another transition
> back to USER mode when the system function is finished.
>
> I noticed that the candidates for the other languages do not display
> results to the screen. If you intend a "fair" and "honest"
> comparison, then you should eliminate the 'results display' from you
> ASM version.

Huh? They do. The programs need to output the results so they can be
checked for correctness.

Bernhard Schornak

unread,

Apr 14, 2012, 12:03:20 PM4/14/12

to

Being curious, I copied the GCC version from the linked site
and compiled it. Those 50,000,000 iterations are executed in
7.4 seconds on a FX-8150 @ 3.6 GHz (less than 30% CPU load).

That's more than 2.5 times faster than the published result,
leaving some doubts about the reliability of the entire test
suite... ;)

MelissA

unread,

Apr 14, 2012, 1:16:39 PM4/14/12

to

On Sat, 14 Apr 2012 18:03:20 +0200

Well, if you try assembler version you will surely get even better
timing.
Just grab fasm and try it. I'm interested how this assembler program
fares on AMD FX as gcc does not vectorize rather use scalar SSE
instructions.

Greetings!

Isaac Gouy

unread,

Apr 14, 2012, 1:40:40 PM4/14/12

to

On Apr 14, 9:03 am, Bernhard Schornak <schor...@nospicedham.web.de>
wrote:
-snip-

> Being curious, I copied the GCC version from the linked site
> and compiled it. Those 50,000,000 iterations are executed in
> 7.4 seconds on a FX-8150 @ 3.6 GHz (less than 30% CPU load).
>
> That's more than 2.5 times faster than the published result,
> leaving some doubts about the reliability of the entire test
> suite... ;)

Maybe your computer is faster :-)

Nathan

unread,

Apr 14, 2012, 2:02:13 PM4/14/12

to

On Apr 14, 11:11 am, Thomas Boell <tbo...@nospicedham.domain.invalid>
wrote:

> On Fri, 13 Apr 2012 07:22:54 -0700 (PDT)
>

> Nathan <nathancba...@nospicedham.gmail.com> wrote:
> > On Apr 13, 8:39 am, MelissA <me...@nospicedham.a.com> wrote:
>
> > > Program follows:
>
> > <snip, snip>
>
> > You still have the 'printf' library call in there -- it contributes to
> > the duration of your program.
>
> Not really. printf is not called in the main loop. The time it takes is
> negligible compared to the many seconds spent in the main loop.
>

Screen scroll of a term window is NOT negligible when you are
measuring a very short time interval. Perhaps it would be better to
modify each program to do 10 iterations? Or maybe do the 'time stuff'
inside the program instead of using an external timer??

>
> > I noticed that the candidates for the other languages do not display
> > results to the screen. If you intend a "fair" and "honest"
> > comparison, then you should eliminate the 'results display' from you
> > ASM version.
>
> Huh? They do. The programs need to output the results so they can be
> checked for correctness.

They do not. Take a careful look at Melissa's console capture (quoted
below); see if you can spot the difference.

,---
| quoted from other thread
,---
Interestingly I have under clocked my e8400 to 2.4GHz
and benchmark runs much faster there
bmaxa@maxa:~/shootout$ time ./nbodycpp 50000000
-0.169075164
-0.169059907

real 0m10.781s
user 0m10.733s
sys 0m0.016s
bmaxa@maxa:~/shootout$ time ./nbodyf 50000000
-0.169075164
-0.169059907

real 0m28.651s
user 0m28.530s
sys 0m0.032s
bmaxa@maxa:~/shootout$ time java nbody 50000000
-0.169075164
-0.169059907

real 0m14.551s
user 0m13.053s
sys 0m0.028s
bmaxa@maxa:~/shootout$ time ./nbody 50000000
argv : 50000000
Hello World 39.478417604 3.141592654 !
x: 4.841431442
y: -1.160320044
z: -0.103622044
vx: 0.606326393
vy: 2.811986845
vz: -0.025218362
mass: 0.037693675

x: 8.343366718
y: 4.124798564
z: -0.403523417
vx: -1.010774346
vy: 1.825662371
vz: 0.008415761
mass: 0.011286326

x: 12.894369562
y: -15.111151402
z: -0.223307579
vx: 1.082791006
vy: 0.868713018
vz: -0.010832637
mass: 0.001723724

x: 15.379697115
y: -25.919314610
z: 0.179258773
vx: 0.979090732
vy: 0.594698999
vz: -0.034755956
mass: 0.002033687

x: 0.000000000
y: 0.000000000
z: 0.000000000
vx: -0.000387663
vy: -0.003275359
vz: 0.000023936
mass: 39.478417604

-0.169075164
-0.169059907

real 0m24.776s
user 0m24.950s
sys 0m0.040s
`---

Nathan.

Thomas Boell

unread,

Apr 14, 2012, 11:59:58 PM4/14/12

to

On Sat, 14 Apr 2012 11:02:13 -0700 (PDT)
Nathan <nathan...@gmail.com> wrote:

> On Apr 14, 11:11 am, Thomas Boell <tbo...@nospicedham.domain.invalid>
> wrote:
> > On Fri, 13 Apr 2012 07:22:54 -0700 (PDT)
> >
> > Nathan <nathancba...@nospicedham.gmail.com> wrote:
> > > On Apr 13, 8:39 am, MelissA <me...@nospicedham.a.com> wrote:
> >
> > > > Program follows:
> >
> > > <snip, snip>
> >
> > > You still have the 'printf' library call in there -- it contributes to
> > > the duration of your program.
> >
> > Not really. printf is not called in the main loop. The time it takes is
> > negligible compared to the many seconds spent in the main loop.
> >
>
> Screen scroll of a term window is NOT negligible when you are
> measuring a very short time interval. Perhaps it would be better to
> modify each program to do 10 iterations? Or maybe do the 'time stuff'
> inside the program instead of using an external timer??

He is not measuring a "very short time interval".

> > > I noticed that the candidates for the other languages do not display
> > > results to the screen. If you intend a "fair" and "honest"
> > > comparison, then you should eliminate the 'results display' from you
> > > ASM version.
> >
> > Huh? They do. The programs need to output the results so they can be
> > checked for correctness.
>
> They do not. Take a careful look at Melissa's console capture (quoted
> below); see if you can spot the difference.

The programs do output their results, and your quote shows that.

Oh, her program outputs about 45 lines more. Yes, that could make up
for the time difference.

If she's outputting to a line printer.

Nathan

unread,

Apr 15, 2012, 2:39:39 AM4/15/12

to

On Apr 14, 11:59 pm, Thomas Boell <tbo...@nospicedham.domain.invalid>
wrote:

>
> Oh, her program outputs about 45 lines more. Yes, that could make up
> for the time difference.
>
> If she's outputting to a line printer.

Some 'modern' Linux varieties don't allow easy access to the regular
terminal console. They provide a various assortment of GUI
(graphical) representations of the 'term' instead. Some of these can
be a bit slow.

Nathan.

Bernhard Schornak

unread,

Apr 16, 2012, 5:02:20 AM4/16/12

to

My environment is AS (and GCC for executables) on a 64 bit
Windoze 7 platform. Feeding fasm with your code (replacing
ELF64 with PE64) throws multiple errors. Seems it does not
know how to handle those 'extern' statements ('printf' and
'atoi'). GCC automatically links them against its standard
libraries if <stdlib.h> and <stdio.h> are included...

As I didn't find any libraries in fasm. there is no way to
test your code. I'm coding a tool to convert LETNi to AT&T
style syntax for my SRE editor at the moment, but it takes
some days. Requires a lot of work to evaluate and reformat
simple text fragments... ;)

Bernhard Schornak

unread,

Apr 16, 2012, 5:02:28 AM4/16/12

to

Probably... ;)

James Van Buskirk

unread,

Apr 16, 2012, 9:49:07 AM4/16/12

to

"Bernhard Schornak" <scho...@nospicedham.web.de> wrote in message
news:jmgn6t$hqm$1...@dont-email.me...

> My environment is AS (and GCC for executables) on a 64 bit
> Windoze 7 platform. Feeding fasm with your code (replacing
> ELF64 with PE64) throws multiple errors. Seems it does not
> know how to handle those 'extern' statements ('printf' and
> 'atoi'). GCC automatically links them against its standard
> libraries if <stdlib.h> and <stdio.h> are included...

> As I didn't find any libraries in fasm. there is no way to
> test your code. I'm coding a tool to convert LETNi to AT&T
> style syntax for my SRE editor at the moment, but it takes
> some days. Requires a lot of work to evaluate and reformat
> simple text fragments... ;)

You need to link with msvcrt.lib or maybe libc.lib to get printf
and atoi. Here is an example with msvcrt.lib:

C:\Asm\FASM\EXAMPLES\WIN64\my_example>type ex1.asm
format MS64 COFF

extrn printf
extrn ExitProcess

section 'CODE' code readable executable align 16

align 16
public _main
_main:
sub rsp, 40
mov rcx, mess
call printf
mov rcx, 0
call ExitProcess

section 'DATA' data readable writeable align 16
mess db 'Hello, world!', 0ah, 0

C:\Asm\FASM\EXAMPLES\WIN64\my_example>fasm ex1.asm
flat assembler version 1.67.18 (1631193 kilobytes memory)
3 passes, 282 bytes.

C:\Asm\FASM\EXAMPLES\WIN64\my_example>link ex1.obj /subsystem:console
/defaultli
b:kernel32.lib /defaultlib:msvcrt /entry:_main
Microsoft (R) Incremental Linker Version 8.00.40310.39
Copyright (C) Microsoft Corporation. All rights reserved.

C:\Asm\FASM\EXAMPLES\WIN64\my_example>ex1
Hello, world!

You can probably get ld.exe to work with this too, but it's
harder, at least for me, to use. You need a 64-bit link.exe
and environmental variables set up correctly for the above
example to work. You can get that through some MS SDK or
other. The bigger problem is that the nbody code uses
a 64-bit Linux calling convention and on Windows you are
going to be using a different calling convention. That will
require a careful examination and modification of the code.

--
write(*,*) transfer((/17.392111325966148d0,6.5794487871554595D-85, &
6.0134700243160014d-154/),(/'x'/)); end

James Van Buskirk

unread,

Apr 16, 2012, 1:47:02 PM4/16/12

to

"James Van Buskirk" <not_...@nospicedham.comcast.net> wrote in message
news:jmh80g$7gp$1...@dont-email.me...

> You can probably get ld.exe to work with this too, but it's
> harder, at least for me, to use.

OK, ld.exe is really easy, too:

C:\Asm\FASM\EXAMPLES\WIN64\my_example>type ex2.asm

format MS64 COFF

extrn printf
extrn ExitProcess

section 'CODE' code readable executable align 16

align 16

public main

main:
sub rsp, 40
mov rcx, mess
call printf
mov rcx, 0
call ExitProcess

section 'DATA' data readable writeable align 16
mess db 'Hello, world!', 0ah, 0

C:\Asm\FASM\EXAMPLES\WIN64\my_example>fasm ex2.asm
flat assembler version 1.67.18 (1681917 kilobytes memory)
3 passes, 282 bytes.

C:\Asm\FASM\EXAMPLES\WIN64\my_example>gcc ex2.obj -oex2

C:\Asm\FASM\EXAMPLES\WIN64\my_example>ex2
Hello, world!

So you can link the nbody code via:

C:\Asm\FASM\EXAMPLES\WIN64\nbody>fasm nbody.asm
flat assembler version 1.67.18 (1680861 kilobytes memory)
3 passes, 3285 bytes.

C:\Asm\FASM\EXAMPLES\WIN64\nbody>gcc nbody.obj -onbody

But as already pointed out it will just crash because it's
written to the wrong calling convention.

Bernhard Schornak

unread,

Apr 17, 2012, 4:02:46 AM4/17/12

to

"fasm test.asm test.exe"

throws this error:

"test.asm [199]:
extrn printf
error: illegal instruction."

Obviously, fasm stumbles upon the external 'printf' and
cannot resolve it. If fasm doesn't provide a stdlib for
Windoze, there's no way to compile that code.

No problem to replace RAX/RDI with RCX/RDX... ;)

MelissA

unread,

Apr 17, 2012, 5:17:01 AM4/17/12

to

Problem is that of you want to make pe executable directly
extrn/public directives want work. In that case you have to import
functions differently.

You have to make coff object file instead, and than link.

>
> No problem to replace RAX/RDI with RCX/RDX... ;)

I guess that argc,argv parameter passing is also different.

>
>
> Greetings from Augsburg
>
> Bernhard Schornak

Greetings !

James Van Buskirk

unread,

Apr 17, 2012, 7:43:14 PM4/17/12

to

"Bernhard Schornak" <scho...@nospicedham.web.de> wrote in message

news:jmj834$prt$1...@dont-email.me...

> "fasm test.asm test.exe"

> throws this error:

> "test.asm [199]:
> extrn printf
> error: illegal instruction."

> Obviously, fasm stumbles upon the external 'printf' and
> cannot resolve it. If fasm doesn't provide a stdlib for
> Windoze, there's no way to compile that code.

> No problem to replace RAX/RDI with RCX/RDX... ;)

OK, I couldn't reproduce your issue and I'm back from Exercise Period
so I thought I would take another look. I'm not sure whether test.asm
was the topical nbody code or my ex2.asm example, but I couldn't get
either to reproduce your error statement even with "format PE64" in
use. However, I downloaded the latest version of fasm (1.70), and
with this I can get to your error statement.

C:\Asm\FASM\EXAMPLES\WIN64\my_example>type ex3.asm
;format MS64 COFF
format PE64

extrn printf
extrn ExitProcess

section 'CODE' code readable executable align 16

align 16
public main
main:
sub rsp, 40
mov rcx, mess
call printf
mov rcx, 0
call ExitProcess

section 'DATA' data readable writeable align 16
mess db 'Hello, world!', 0ah, 0

C:\Asm\FASM\EXAMPLES\WIN64\my_example>fasm ex3.asm ex3.exe
flat assembler version 1.70 (1621897 kilobytes memory)
ex3.asm [4]:

extrn printf
error: illegal instruction.

So we can see that with the new fasm v. 1.70, the "format PE64"
causes the error. But we don't want that format because it
just makes things needlessly complicated. Instead use
"format MS64 coff":

C:\Asm\FASM\EXAMPLES\WIN64\my_example>type ex2.asm
format MS64 COFF

extrn printf
extrn ExitProcess

section 'CODE' code readable executable align 16

align 16
public main
main:
sub rsp, 40
mov rcx, mess
call printf
mov rcx, 0
call ExitProcess

section 'DATA' data readable writeable align 16
mess db 'Hello, world!', 0ah, 0

C:\Asm\FASM\EXAMPLES\WIN64\my_example>fasm ex2.asm

flat assembler version 1.70 (1622057 kilobytes memory)

3 passes, 282 bytes.

C:\Asm\FASM\EXAMPLES\WIN64\my_example>gcc ex2.obj -oex2

C:\Asm\FASM\EXAMPLES\WIN64\my_example>ex2
Hello, world!

This means you will need an extra step to link the resulting
*.obj file with ld.exe as shown above, but that should be
painless for you since you are working with as.exe and likely
have the gcc driver already installed, right? Hopefully this
gets you over the hump of creating a valid executable; adjusting
the resulting nbody.asm code for the 64-bit Windows calling
convention is up to you.

Bernhard Schornak

unread,

Apr 19, 2012, 4:27:08 AM4/19/12

to

Following all steps, fasm emits this message:

"test.asm [202]:
align 16
error: section is not aligned enough."

Seems, fasm's Linux and Windoze versions are not
compatible. (Tried with align 16...2097152 - all
of them failed.)

Probably, it is faster to convert the given code
to AT&T syntax than to figure out how fasm might
be convinced to work properly.

Greetings from Augsburg

Bernhard Schornak

P.S.: "test.asm" is the posted "nbody.asm" code.
I copied it to a file without any changes.

Bernhard Schornak

unread,

Apr 19, 2012, 4:27:46 AM4/19/12

to

Evaluating the GCC version, there is not much to
improve, anyway. The only possible improvement I
see is to rearrange the structures in a way that
two square roots are calculated simultaneously -
SQRTPD has the same latency as SQRTSD (should be
about 30...40 percent faster).

Shouldn't be a problem to do the calculation for
two planets simultaneously and 'interleave' both
calculations. We have 16 XMM and 8 MMX registers
to store frequently used parameters... ;)

Terje Mathisen

unread,

Apr 19, 2012, 5:38:31 AM4/19/12

to

Bernhard Schornak wrote:
> Evaluating the GCC version, there is not much to
> improve, anyway. The only possible improvement I
> see is to rearrange the structures in a way that
> two square roots are calculated simultaneously -
> SQRTPD has the same latency as SQRTSD (should be
> about 30...40 percent faster).
>
> Shouldn't be a problem to do the calculation for
> two planets simultaneously and 'interleave' both
> calculations. We have 16 XMM and 8 MMX registers
> to store frequently used parameters... ;)

There is at least one very obvious optimization here:

Start by calculating invsqrt(squared_distance)!

I.e. all the force terms are proportional to some power of the inverse
distance between a pair of bodies, so if you start with an inverse
square root and then calculate the second and third power of that
result, you have all the scaling factors you need.

You can do this for 4 such distances in parallel by using the 4-way
inverse square root lookup, do one NR iteration in float and then split
the results into two pairs of doubles before the final NR iteration.

This results in about 48-50 significant mantissa bits, if you need more
than this do one more NR iteration.

The final result should be to replace 4 divisions and 4 square roots
with ~20-40 cycles of SSE code.

I've only glanced at the reference (Java?) code but this approach seemed
like a very obvious idea!

Terje
PS. Many years ago I doubled the speed of a computational fluid
chemistry code by the exact same approach; calculating three inverse
square roots in parallel using regular C code with a table lookup
instead of the SSE lookup opcode. :-)
--
- <Terje.Mathisen at tmsw.no>
"almost all programming can be viewed as an exercise in caching"

James Van Buskirk

unread,

Apr 19, 2012, 10:52:06 AM4/19/12

to

"Bernhard Schornak" <scho...@nospicedham.web.de> wrote in message

news:jmoi8k$ie8$1...@dont-email.me...

> Following all steps, fasm emits this message:

> "test.asm [202]:
> align 16
> error: section is not aligned enough."

> Seems, fasm's Linux and Windoze versions are not
> compatible. (Tried with align 16...2097152 - all
> of them failed.)

> Probably, it is faster to convert the given code
> to AT&T syntax than to figure out how fasm might
> be convinced to work properly.

> P.S.: "test.asm" is the posted "nbody.asm" code.
> I copied it to a file without any changes.

That's really a mystery because when I took the code from:

http://groups.google.com/group/alt.lang.asm/msg/3a1286c201645155?hl=en&dmode=source

Then assemble it with fasm for Windows:

C:\Asm\FASM\EXAMPLES\WIN64\nbody>fasm nbody.asm

flat assembler version 1.70 (1611367 kilobytes memory)
3 passes, 5015 bytes.

C:\Asm\FASM\EXAMPLES\WIN64\nbody>gcc nbody.o -onbody.exe

Modifying the first line from:

format ELF64

to:

format MS64 coff

I get the results:

C:\Asm\FASM\EXAMPLES\WIN64\nbody>fasm nbody.asm

flat assembler version 1.70 (1612307 kilobytes memory)
3 passes, 3285 bytes.

C:\Asm\FASM\EXAMPLES\WIN64\nbody>gcc nbody.obj -onbody.exe

So I can't tell what you are doing differently. Do you have all the
code? The last line should be:

n rq 1

MelissA

unread,

Apr 19, 2012, 1:52:52 PM4/19/12

to

On Thu, 19 Apr 2012 10:27:46 +0200

That is what I did, so got Fortran time.
I have calculated magnitude in
following way:

mov ecx,5
mov rsi,rr
mov rdi,mag
.L2:

movlpd xmm3,[rsi+r.dx]
movlpd xmm4,[rsi+r.dy]
movlpd xmm5,[rsi+r.dz]

movhpd xmm3,[rsi+r.dx+SIZEOFDIFF]
movhpd xmm4,[rsi+r.dy+SIZEOFDIFF]
movhpd xmm5,[rsi+r.dz+SIZEOFDIFF]

movddup xmm6,xmm15

mulpd xmm3,xmm3
mulpd xmm4,xmm4
mulpd xmm5,xmm5

addpd xmm3,xmm4
addpd xmm3,xmm5 ; dsquared -> xmm3

sqrtpd xmm4, xmm3 ; distance -> xmm4

mulpd xmm3,xmm4
divpd xmm6,xmm3 ; mag -> xmm5
movapd dqword[rdi],xmm6

add rdi,16
add rsi,2*SIZEOFDIFF
dec ecx
jnz .L2

> Shouldn't be a problem to do the calculation for
> two planets simultaneously and 'interleave' both
> calculations. We have 16 XMM and 8 MMX registers
> to store frequently used parameters... ;)

;)

>
>
> Greetings from Augsburg
>
> Bernhard Schornak

Greetings!

Bernhard Schornak

unread,

Apr 21, 2012, 2:48:23 AM4/21/12

to

It is. Following your last post, the first line was
changed to "format MS64 COFF" before I tried again.
My 'make.cmd':

"fasm test.asm test.obj
gcc test.obj -o test.exe"

Giving up with fasm - finishing the 'LETNi to AT&T'
converter surely takes less time than learning fasm
syntax to compile just one single file written in a
'foreign language'... ;)

Bernhard Schornak

unread,

Apr 21, 2012, 2:48:39 AM4/21/12

to

MelissA wrote:

> On Thu, 19 Apr 2012 10:27:46 +0200
> Bernhard Schornak<scho...@nospicedham.web.de> wrote:

<snip>

Terje's reply should help to improve your code. Currently,
it spends 65 clocks with waiting for the results of SQRTPD
and DIVPD. Using reciprocals (as suggested in my 1st post)
could reduce latency to less than 30 clocks. I do not have
a clue how large Terje's lookup table might be, but: Using
lookup tables reduced latency even more (some calculations
were superfluous in this case).

I'm busy with my 'LETNi to AT&T' conversion function, so I
just know your code from the output emitted by the current
pre-beta version. No time to dig deeper into the matter at
the moment... ;)

Bernhard Schornak

unread,

Apr 21, 2012, 2:49:05 AM4/21/12

to

Well, I just have 'low level' math knowledge, but I
know AMD's Optimisation Guide by heart. As the guide
states, replacing square roots and/or divisions with
reciprocal multiplications gains some speed, because
two packed multiplies can be executed simultaneously
in P0/P1, while square roots and divisions block all
pipes until the result is present. We could do about
12 multiplications in the same time it takes to cal-
culate a packed square root, or nine multiplications
while a packed division is calculated.

Do you remember how large your lookup table was? I'm
just curious... ;)

Terje Mathisen

unread,

Apr 22, 2012, 7:02:26 AM4/22/12

to

Bernhard Schornak wrote:

> Terje Mathisen wrote:
>> PS. Many years ago I doubled the speed of a computational fluid
>> chemistry code by the
>> exact same approach; calculating three inverse square roots in
>> parallel using regular C
>> code with a table lookup instead of the SSE lookup opcode. :-)
>
> Well, I just have 'low level' math knowledge, but I
> know AMD's Optimisation Guide by heart. As the guide
> states, replacing square roots and/or divisions with
> reciprocal multiplications gains some speed, because
> two packed multiplies can be executed simultaneously
> in P0/P1, while square roots and divisions block all
> pipes until the result is present. We could do about
> 12 multiplications in the same time it takes to cal-
> culate a packed square root, or nine multiplications
> while a packed division is calculated.
>
> Do you remember how large your lookup table was? I'm
> just curious... ;)

I'm not sure, but probably 11-12 bits of index, consisting of the bottom
exponent bit plus the top mantissa bits, with either 8-bit or 16-bit
table values. The rest of the exponent part is of course a simple matter
of a shift and subtract. (The hidden mantissa bit is of course always 1
so it doesn't need to be part of the lookup.)

The NR iteration gives a little bit more than twice the precision per
iteration, so 2-3 (fixed/unrolled) loops gives more or less the same
results as FDIV + FSQRT.

Terje

Terje Mathisen

unread,

Apr 22, 2012, 8:41:11 AM4/22/12

to

Bernhard Schornak wrote:
> Terje's reply should help to improve your code. Currently,
> it spends 65 clocks with waiting for the results of SQRTPD
> and DIVPD. Using reciprocals (as suggested in my 1st post)
> could reduce latency to less than 30 clocks. I do not have
> a clue how large Terje's lookup table might be, but: Using
> lookup tables reduced latency even more (some calculations
> were superfluous in this case).

The huge step is to use the simd 4-way lookup opcode, as I wrote that
allows you to get 4 (more or less double prec) results in just a few clocks.

In the current case I would calculate as many such values in parallel as
possible, i.e 3x4 or so in 32-bit mode.

With 10 (?) planetary bodies you get 45 reciprocal distances.

Terje

>
> I'm busy with my 'LETNi to AT&T' conversion function, so I
> just know your code from the output emitted by the current
> pre-beta version. No time to dig deeper into the matter at
> the moment... ;)
>
>
> Greetings from Augsburg
>
> Bernhard Schornak

Bernhard Schornak

unread,

Apr 23, 2012, 7:48:24 AM4/23/12

to

Terje Mathisen wrote:

> Bernhard Schornak wrote:
>> Terje's reply should help to improve your code. Currently,
>> it spends 65 clocks with waiting for the results of SQRTPD
>> and DIVPD. Using reciprocals (as suggested in my 1st post)
>> could reduce latency to less than 30 clocks. I do not have
>> a clue how large Terje's lookup table might be, but: Using
>> lookup tables reduced latency even more (some calculations
>> were superfluous in this case).
>
> The huge step is to use the simd 4-way lookup opcode, as I wrote that allows you to get 4
> (more or less double prec) results in just a few clocks.
>
> In the current case I would calculate as many such values in parallel as possible, i.e 3x4
> or so in 32-bit mode.
>
> With 10 (?) planetary bodies you get 45 reciprocal distances.

Only the outer four planets Jupiter, Saturn, Uranus and
Neptune are calculated for n years (with n = 50,000,000
for the contest).

Bernhard Schornak

unread,

Apr 23, 2012, 7:49:20 AM4/23/12

to

Terje Mathisen wrote:

> Bernhard Schornak wrote:
>> Terje's reply should help to improve your code. Currently,
>> it spends 65 clocks with waiting for the results of SQRTPD
>> and DIVPD. Using reciprocals (as suggested in my 1st post)
>> could reduce latency to less than 30 clocks. I do not have
>> a clue how large Terje's lookup table might be, but: Using
>> lookup tables reduced latency even more (some calculations
>> were superfluous in this case).
>
> The huge step is to use the simd 4-way lookup opcode, as I wrote that allows you to get 4
> (more or less double prec) results in just a few clocks.
>
> In the current case I would calculate as many such values in parallel as possible, i.e 3x4
> or so in 32-bit mode.
>
> With 10 (?) planetary bodies you get 45 reciprocal distances.

Only the outer four planets Jupiter, Saturn, Uranus and
Neptune are calculated for n years (with n = 50,000,000
for the contest).