Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

nbody computer language shootout bench

60 views
Skip to first unread message

Melissa

unread,
Apr 10, 2012, 8:57:52 AM4/10/12
to
Hi, this is my attempt to implement this Java program
in assembly:
http://shootout.alioth.debian.org/u64q/program.php?test=nbody&lang=java&id=2

My program executes in 35 seconds, while the Java version takes 22 seconds ;)

The only place that needs optimisation is the macro advance.

Thanks for any help.
; nbody.asm
;------------------------------------------------
format ELF64

; Size in bytes of one body record: seven qword fields, 7 * 8 = 56.
; Must be kept in sync with the field list of struc body below; the loops
; step from one record to the next by adding this value.
SIZEOFBODY equ 56
; Layout of one body record. Every field is one qword; the code stores and
; loads them with scalar double-precision SSE2 instructions.
struc body {
.x dq ? ; position, x component
.y dq ? ; position, y component
.z dq ? ; position, z component
.vx dq ? ; velocity, x component
.vy dq ? ; velocity, y component
.vz dq ? ; velocity, z component
.mass dq ? ; mass
}

macro init_body b, x,y,z,vx,vy,vz,mass{
; Fill the body record labelled b from seven floating-point literals.
;   x, y, z     are stored unchanged as positions
;   vx, vy, vz  are multiplied by DAYS_PER_YEAR before being stored
;   mass        is multiplied by SOLAR_MASS before being stored
; Clobbers rax and xmm0.

        ; velocities, scaled by DAYS_PER_YEAR
        mov     rax, vx
        movq    xmm0, rax
        mulsd   xmm0, qword [DAYS_PER_YEAR]
        movsd   qword [b#.vx], xmm0

        mov     rax, vy
        movq    xmm0, rax
        mulsd   xmm0, qword [DAYS_PER_YEAR]
        movsd   qword [b#.vy], xmm0

        mov     rax, vz
        movq    xmm0, rax
        mulsd   xmm0, qword [DAYS_PER_YEAR]
        movsd   qword [b#.vz], xmm0

        ; positions: the literal bit pattern is copied as is
        mov     rax, x
        mov     qword [b#.x], rax
        mov     rax, y
        mov     qword [b#.y], rax
        mov     rax, z
        mov     qword [b#.z], rax

        ; mass, scaled by SOLAR_MASS
        mov     rax, mass
        movq    xmm0, rax
        mulsd   xmm0, qword [SOLAR_MASS]
        movsd   qword [b#.mass], xmm0
}

; Map a body record at address 0 so that oBody.x, oBody.vx, ... assemble to
; the byte offset of each field; used as [reg + oBody.field] displacements.
virtual at 0
oBody body
end virtual

macro advance dt
{
; Advance the five body records stored contiguously from label sun by one
; time step dt (a floating-point literal).
;
; Pass 1 (.L0/.L1): for every pair i < j
;     mag        = dt / (dsquared * sqrt(dsquared))
;     v[i]      -= d * mass[j] * mag
;     v[j]      += d * mass[i] * mag
; Pass 2 (.L3): for every body, position += dt * velocity.
;
; Register roles: rax = bit pattern of dt, rbx = body i, rdx = body j,
;                 ecx = outer counter, r9 = inner counter.
; Clobbers: rax, rbx, rcx, rdx, r9, xmm0-xmm6, flags.
; Register-to-register copies use movapd rather than movsd: movsd merges into
; the destination and so depends on its old value. Only the low qword of each
; xmm register is ever stored, so results are unchanged.
local .L0,.L1,.L2,.L3
        mov     ecx,5                   ; ecx -> i (number of bodies)
        mov     rax,dt
        mov     rbx,sun
.L0:
        dec     ecx
        jz      .L2                     ; last body has no partner left
        mov     r9, rcx                 ; r9 -> j, bodies remaining after body i
        lea     rdx, [rbx+SIZEOFBODY]   ; first partner is the record after i
.L1:
        movsd   xmm0,[rbx + oBody.x]
        movsd   xmm1,[rbx + oBody.y]
        movsd   xmm2,[rbx + oBody.z]

        subsd   xmm0,[rdx + oBody.x]    ; dx -> xmm0
        subsd   xmm1,[rdx + oBody.y]    ; dy -> xmm1
        subsd   xmm2,[rdx + oBody.z]    ; dz -> xmm2

        movapd  xmm3,xmm0
        movapd  xmm4,xmm1
        movapd  xmm5,xmm2

        mulsd   xmm3,xmm3
        mulsd   xmm4,xmm4
        mulsd   xmm5,xmm5

        addsd   xmm3,xmm4
        addsd   xmm3,xmm5               ; dsquared -> xmm3

        sqrtsd  xmm4, xmm3              ; distance -> xmm4

        mulsd   xmm3,xmm4               ; dsquared * distance
        movq    xmm5, rax
        divsd   xmm5,xmm3               ; mag -> xmm5
; ----------------------------------------------
; body i: dx, dy, dz are still needed below, so work on copies
        movsd   xmm6, [rdx + oBody.mass]
        mulsd   xmm6, xmm5              ; precompute bodies[j].mass * mag

        movsd   xmm3, [rbx + oBody.vx]
        movapd  xmm4, xmm0
        mulsd   xmm4, xmm6
        subsd   xmm3,xmm4
        movsd   [rbx + oBody.vx],xmm3   ; iBody.vx -= dx * bodies[j].mass * mag

        movsd   xmm3, [rbx + oBody.vy]
        movapd  xmm4, xmm1
        mulsd   xmm4, xmm6
        subsd   xmm3,xmm4
        movsd   [rbx + oBody.vy],xmm3   ; iBody.vy -= dy * bodies[j].mass * mag

        movsd   xmm3, [rbx + oBody.vz]
        movapd  xmm4, xmm2
        mulsd   xmm4, xmm6
        subsd   xmm3,xmm4
        movsd   [rbx + oBody.vz],xmm3   ; iBody.vz -= dz * bodies[j].mass * mag
; ----------------------------------------------
; body j: dx, dy, dz are dead after this, so scale them in place
        movsd   xmm6, [rbx + oBody.mass]
        mulsd   xmm6, xmm5              ; precompute iBody.mass * mag

        movsd   xmm3, [rdx + oBody.vx]
        mulsd   xmm0, xmm6
        addsd   xmm3, xmm0
        movsd   [rdx + oBody.vx], xmm3  ; bodies[j].vx += dx * iBody.mass * mag

        movsd   xmm3, [rdx + oBody.vy]
        mulsd   xmm1, xmm6
        addsd   xmm3, xmm1
        movsd   [rdx + oBody.vy], xmm3  ; bodies[j].vy += dy * iBody.mass * mag

        movsd   xmm3, [rdx + oBody.vz]
        mulsd   xmm2, xmm6
        addsd   xmm3, xmm2
        movsd   [rdx + oBody.vz], xmm3  ; bodies[j].vz += dz * iBody.mass * mag
;-----------------------------------------
        add     rdx,SIZEOFBODY          ; next partner j
        dec     r9
        jnz     .L1
        add     rbx,SIZEOFBODY          ; next body i
        jmp     .L0
.L2:
; Pass 2: move every body along its updated velocity
        mov     rbx,sun
        mov     ecx,5
.L3:
        movsd   xmm0, [rbx + oBody.x]
        movsd   xmm1, [rbx + oBody.y]
        movsd   xmm2, [rbx + oBody.z]

        movq    xmm3 , rax
        mulsd   xmm3, [rbx + oBody.vx]
        addsd   xmm0, xmm3
        movsd   [rbx + oBody.x], xmm0   ; x += dt * vx

        movq    xmm3 , rax
        mulsd   xmm3, [rbx + oBody.vy]
        addsd   xmm1, xmm3
        movsd   [rbx + oBody.y], xmm1   ; y += dt * vy

        movq    xmm3 , rax
        mulsd   xmm3, [rbx + oBody.vz]
        addsd   xmm2, xmm3
        movsd   [rbx + oBody.z], xmm2   ; z += dt * vz

        add     rbx,SIZEOFBODY
        dec     ecx
        jnz     .L3

}

section '.text' executable align 16
extrn printf
extrn atoi
public main

;-----------------------------------------------------------------------
; int main(int argc, char **argv)
; ABI:   System V AMD64
; In:    edi = argc, rsi = argv
; Out:   eax = 0
; Steps: n = atoi(argv[1]) when an argument is given, otherwise 1;
;        scale SOLAR_MASS, initialise and print the bodies, offset the sun's
;        momentum, print the energy, run n steps of advance, print the energy.
; Stack: rbx is pushed on entry. That preserves the callee-saved register the
;        body uses as a record pointer, and leaves rsp 16-byte aligned at
;        every call below, including the call to atoi.
;-----------------------------------------------------------------------
main:
        push    rbx
        mov     qword[n],1              ; default step count
        cmp     edi,2                   ; argc is a 32-bit int
        jl      .begin
        mov     rdi,qword[rsi+8]        ; argv[1] -> rdi
        call    atoi
        movsxd  rax,eax                 ; atoi returns a 32-bit int: widen before the qword store
        mov     qword[n],rax

        xor     eax,eax                 ; variadic call with no vector arguments
        mov     rdi, argv
        mov     rsi,[n]
        call    printf
.begin:
        mov     eax,2                   ; two vector arguments for printf
        mov     rdi,message

        ; init solar mass: SOLAR_MASS = PI * SOLAR_MASS * PI
        movsd   xmm0, qword[PI]
        movsd   xmm1,xmm0
        mulsd   xmm0,qword[SOLAR_MASS]
        mulsd   xmm0,xmm1
        movsd   [SOLAR_MASS],xmm0
        call    printf                  ; prints the new SOLAR_MASS and PI

        ; init bodies
        init_body sun,0f,0f,0f,0f,0f,0f,1f

        init_body jupiter,4.84143144246472090e+00, \
                  -1.16032004402742839e+00,\
                  -1.03622044471123109e-01,\
                  1.66007664274403694e-03, \
                  7.69901118419740425e-03, \
                  -6.90460016972063023e-05,\
                  9.54791938424326609e-04
        mov     rbx,jupiter
        call    print_body

        init_body saturn,8.34336671824457987e+00, \
                  4.12479856412430479e+00, \
                  -4.03523417114321381e-01,\
                  -2.76742510726862411e-03,\
                  4.99852801234917238e-03, \
                  2.30417297573763929e-05, \
                  2.85885980666130812e-04
        mov     rbx,saturn
        call    print_body

        init_body uranus,1.28943695621391310e+01, \
                  -1.51111514016986312e+01,\
                  -2.23307578892655734e-01,\
                  2.96460137564761618e-03, \
                  2.37847173959480950e-03, \
                  -2.96589568540237556e-05,\
                  4.36624404335156298e-05
        mov     rbx,uranus
        call    print_body

        init_body neptune,1.53796971148509165e+01, \
                  -2.59193146099879641e+01,\
                  1.79258772950371181e-01, \
                  2.68067772490389322e-03, \
                  1.62824170038242295e-03, \
                  -9.51592254519715870e-05,\
                  5.15138902046611451e-05
        mov     rbx,neptune
        call    print_body

        ; total momentum: px -> xmm0, py -> xmm1, pz -> xmm2
        pxor    xmm0,xmm0
        pxor    xmm1,xmm1
        pxor    xmm2,xmm2

virtual at rbx
.oBody body
end virtual

        mov     rbx,sun
        mov     ecx,5
; ----------------------------------
.L0:
        movsd   xmm3, [.oBody.vx]
        mulsd   xmm3, [.oBody.mass]
        addsd   xmm0, xmm3              ; px += vx * mass

        movsd   xmm3, [.oBody.vy]
        mulsd   xmm3, [.oBody.mass]
        addsd   xmm1, xmm3              ; py += vy * mass

        movsd   xmm3, [.oBody.vz]
        mulsd   xmm3, [.oBody.mass]
        addsd   xmm2, xmm3              ; pz += vz * mass

        add     rbx, SIZEOFBODY
        dec     ecx
        jnz     .L0

        mov     rbx,sun
        call    offset_momentum         ; consumes px, py, pz in xmm0-xmm2
        call    print_body
; ----------------------------------------
        call    energy
        call    print_energy

        mov     r8, [n]
        test    r8, r8
        jle     .done                   ; n <= 0: run no steps instead of wrapping the counter
.L1:
        advance 0.01                    ; does not touch r8
        dec     r8
        jnz     .L1
.done:
        call    energy
        call    print_energy

        pop     rbx
        xor     eax,eax
        ret

;-----------------------------------------------------------------------
; offset_momentum
; In:    xmm0 = px, xmm1 = py, xmm2 = pz, rbx = address of a body record
; Out:   the record's vx, vy, vz are set to -px, -py, -pz, each divided by
;        SOLAR_MASS
; Clobb: rax, xmm0-xmm3
;-----------------------------------------------------------------------
offset_momentum:
        mov     rax, 0x8000000000000000 ; sign bit of a double
        movq    xmm3, rax

        xorpd   xmm0, xmm3              ; flip the sign of px
        divsd   xmm0, qword [SOLAR_MASS]
        movsd   qword [rbx + oBody.vx], xmm0

        xorpd   xmm1, xmm3              ; flip the sign of py
        divsd   xmm1, qword [SOLAR_MASS]
        movsd   qword [rbx + oBody.vy], xmm1

        xorpd   xmm2, xmm3              ; flip the sign of pz
        divsd   xmm2, qword [SOLAR_MASS]
        movsd   qword [rbx + oBody.vz], xmm2
        ret

;-----------------------------------------------------------------------
; print_body
; In:    rbx = address of a body record
; Does:  printf(bmsg, x, y, z, vx, vy, vz, mass)
; Stack: the 8-byte adjustment makes rsp 16-byte aligned at the printf call
;        when the caller's rsp was aligned at its own call.
;-----------------------------------------------------------------------
print_body:
        sub     rsp, 8
        movq    xmm6, qword [rbx + oBody.mass]
        movq    xmm5, qword [rbx + oBody.vz]
        movq    xmm4, qword [rbx + oBody.vy]
        movq    xmm3, qword [rbx + oBody.vx]
        movq    xmm2, qword [rbx + oBody.z]
        movq    xmm1, qword [rbx + oBody.y]
        movq    xmm0, qword [rbx + oBody.x]
        mov     rdi, bmsg
        mov     eax, 7                  ; seven vector arguments for the variadic call
        call    printf
        add     rsp, 8
        ret
;-----------------------------------------------------------------------
; energy
; Walks the five body records starting at sun and accumulates
;     sum over i of  0.5 * mass[i] * (vx^2 + vy^2 + vz^2)
;   - sum over i<j of mass[i] * mass[j] / distance(i, j)
; Out:   xmm0 = resulting energy
; Clobb: rax, rbx, rcx, rdx, xmm1-xmm3, flags
;-----------------------------------------------------------------------
energy:
        mov     rbx, sun                ; rbx -> body i
        mov     ecx, 5                  ; bodies still to visit
        mov     rax, 0.0
        movq    xmm0, rax               ; running total
        mov     rax, 0.5                ; kept in rax for the whole walk
.next_body:
        ; kinetic part of body i
        movsd   xmm1, [rbx + oBody.vx]
        movsd   xmm2, [rbx + oBody.vy]
        movsd   xmm3, [rbx + oBody.vz]
        mulsd   xmm1, xmm1
        mulsd   xmm2, xmm2
        mulsd   xmm3, xmm3
        addsd   xmm1, xmm2
        addsd   xmm1, xmm3
        mulsd   xmm1, [rbx + oBody.mass]
        movq    xmm2, rax
        mulsd   xmm2, xmm1
        addsd   xmm0, xmm2

        dec     ecx
        jz      .finished               ; last body has no partner left

        lea     rdx, [rbx + SIZEOFBODY] ; rdx -> body j, first record after i
        push    rcx                     ; ecx doubles as the inner counter
.next_pair:
        ; potential part of pair (i, j)
        movsd   xmm1, [rbx + oBody.x]
        movsd   xmm2, [rbx + oBody.y]
        movsd   xmm3, [rbx + oBody.z]
        subsd   xmm1, [rdx + oBody.x]
        subsd   xmm2, [rdx + oBody.y]
        subsd   xmm3, [rdx + oBody.z]

        mulsd   xmm1, xmm1
        mulsd   xmm2, xmm2
        mulsd   xmm3, xmm3
        addsd   xmm1, xmm2
        addsd   xmm1, xmm3
        sqrtsd  xmm1, xmm1              ; distance

        movsd   xmm2, [rbx + oBody.mass]
        mulsd   xmm2, [rdx + oBody.mass]
        divsd   xmm2, xmm1
        subsd   xmm0, xmm2

        add     rdx, SIZEOFBODY
        dec     ecx
        jnz     .next_pair

        pop     rcx
        add     rbx, SIZEOFBODY
        jmp     .next_body
.finished:
        ret

;-----------------------------------------------------------------------
; print_energy
; In:    xmm0 = value to print
; Does:  printf(msg, xmm0)
;-----------------------------------------------------------------------
print_energy:
        sub     rsp, 8                  ; realign rsp for the call
        mov     rdi, msg
        mov     eax, 1                  ; one vector argument for the variadic call
        call    printf
        add     rsp, 8
        ret

; Initialised data: printf format strings and the physical constants.
section '.data' writeable align 16

; Format used by main to print the scaled SOLAR_MASS and PI.
message db 'Hello World %2.9f %2.9f !',0xa,0
; Format used by print_body: the seven fields of one body record.
bmsg db 'x: %.9f',0xa,'y: %.9f',0xa,'z: %.9f',0xa, \
'vx: %.9f',0xa,'vy: %.9f',0xa,'vz: %.9f',0xa, \
'mass: %.9f',0xa,0xa,0
; Format used by print_energy.
msg db '%.9f',0xa,0
; Format used by main to echo the parsed step count.
argv db 'argv : %d',0xa,0
align 8
PI dq 3.141592653589793
; Starts as 4.0; main overwrites it with PI * 4.0 * PI before any body is
; initialised, so init_body and offset_momentum see the scaled value.
SOLAR_MASS dq 4.0
; Factor applied by init_body to the velocity literals.
DAYS_PER_YEAR dq 365.24

; Uninitialised data: the body records and the step count.
section '.bss' writeable align 16


; The five records must stay contiguous and in this order: advance, energy
; and main walk them from sun in SIZEOFBODY steps.
sun body
jupiter body
saturn body
uranus body
neptune body

; Number of advance steps; set by main from argv[1], default 1.
n rq 1

Rod Pemberton

unread,
Apr 10, 2012, 10:09:13 AM4/10/12
to
"Melissa" <d...@ne.invalid> wrote in message news:jm1aog$730$1...@solani.org...
> Hi , this is my attempt to implement in assembly
> this java program: [link]
>
> My program executes in 35 seconds , while java in 22 seconds ;)
>

The times of benchmarks are very cpu, operating system, and code dependent.

If you're submitting to them for timing, then the assembly should win, if
it's optimized. Assembly doesn't have the language overhead of C or Fortran
or Java or any other high-level language, but still has access to the same
floating point or MMX or SSE instructions to do the work.

I.e., the benchmarks must be run on the same type of cpu, at the same system
frequency, with the same memory and I/O subsystem, and using the same OS.
Supposedly, their times are for Intel Q6600 quad-core using x64 Ubuntu
Linux. They don't say what speed the cpu is running, however. I.e., for
your code to be properly timed, you'd have to submit it to them and have
them time it, since it seems they haven't provided sufficient information to
recreate their setup. Most of the results seem to be inline with using
equivalent code, based on what I've seen from other benchmarks. For some of
the results, it seems like people not only translated the code, but went
through and attempted to optimize it for more speed too. Of course, the
opposite can happen too. Sometimes novice users of a language port the code
and end up with worse code, causing some languages to rank lower.
IMO, if the tested code is "identical" or as equivalent as possible, the
results you should see are assembly in 1st, C in 2nd, C++ in 3rd, etc.
Java seems to be ranked quite a bit higher than I would expect it ... It's
usually ranked in the lower 3rd of benchmarks. Fortran is ranked far, far
higher than I would expect it. The Fortran used happens to be by Intel
which may be a factor in that, i.e., coded by an Intel expert and/or
pseudo-Fortran. It's a horrid language. It should have much more overhead
than appears to be the situation. It looks like someone did some extensive
code optimization on the Fortran which would explain its rank, but it's
been decades since I coded in it and can't be entirely sure of that. Most
modern dynamic languages like Ruby, Perl, Python etc should be down near the
bottom, where they are. They are typically coded in C and emitted as C, or
sometimes C++. Most users of those types of languages claim they are far
faster than they are usually ranked ...

> Only place that needs optimisations is macro advance.
>

Sorry, I'm not familiar with floating point, MMX, or SSE etc. Someone will
usually respond here, but if not you could try a post to comp.lang.asm.x86.
It's a moderated Usenet group, so you'll have to wait for post approval.


Rod Pemberton



MelissA

unread,
Apr 10, 2012, 1:37:40 PM4/10/12
to
On Tue, 10 Apr 2012 10:09:13 -0400
"Rod Pemberton" <do_no...@notemailnot.cmm> wrote:

> "Melissa" <d...@ne.invalid> wrote in message
> news:jm1aog$730$1...@solani.org...
> > Hi , this is my attempt to implement in assembly
> > this java program: [link]
> >
> > My program executes in 35 seconds , while java in 22 seconds ;)
> >
>
> The times of benchmarks are very cpu, operating system, and code
> dependent.
>
> If you're submitting to them for timing, then the assembly should
> win, if it's optimized. Assembly doesn't have the language overhead
> of C or Fortran or Java or any other high-level language, but still
> has access to the same floating point or MMX or SSE instructions to
> do the work.
>
> I.e., the benchmarks must be run on the same type of cpu, at the same
> system frequency, with the same memory and I/O subsystem, and using
> the same OS. Supposedly, their times are for Intel Q6600 quad-core
> using x64 Ubuntu Linux. They don't say what speed the cpu is
> running, however.

It seems that they are running a Q6600 @ 2.4 GHz.
I have one at work, and on it my program executes in 35 seconds,
while Java takes 22 seconds and C++ takes
20 seconds.

Interestingly I have under clocked my e8400 to 2.4GHz
and benchmark runs much faster there
bmaxa@maxa:~/shootout$ time ./nbodycpp 50000000
-0.169075164
-0.169059907

real 0m10.781s
user 0m10.733s
sys 0m0.016s
bmaxa@maxa:~/shootout$ time ./nbodyf 50000000
-0.169075164
-0.169059907

real 0m28.651s
user 0m28.530s
sys 0m0.032s
bmaxa@maxa:~/shootout$ time java nbody 50000000
-0.169075164
-0.169059907

real 0m14.551s
user 0m13.053s
sys 0m0.028s
bmaxa@maxa:~/shootout$ time ./nbody 50000000
argv : 50000000
Hello World 39.478417604 3.141592654 !
x: 4.841431442
y: -1.160320044
z: -0.103622044
vx: 0.606326393
vy: 2.811986845
vz: -0.025218362
mass: 0.037693675

x: 8.343366718
y: 4.124798564
z: -0.403523417
vx: -1.010774346
vy: 1.825662371
vz: 0.008415761
mass: 0.011286326

x: 12.894369562
y: -15.111151402
z: -0.223307579
vx: 1.082791006
vy: 0.868713018
vz: -0.010832637
mass: 0.001723724

x: 15.379697115
y: -25.919314610
z: 0.179258773
vx: 0.979090732
vy: 0.594698999
vz: -0.034755956
mass: 0.002033687

x: 0.000000000
y: 0.000000000
z: 0.000000000
vx: -0.000387663
vy: -0.003275359
vz: 0.000023936
mass: 39.478417604

-0.169075164
-0.169059907

real 0m24.776s
user 0m24.950s
sys 0m0.040s

Seems that SSE code runs much faster on e8400 than on q6600

I.e., for your code to be properly timed, you'd
> have to submit it to them and have them time it, since it seems they
> haven't provided sufficient information to recreate their setup.

I asked them they are not interested in assembler.

> Most of the results seem to be inline with using equivalent code,
> based on what I've seen from other benchmarks. For some of the
> results, it seems like people not only translated the code, but went
> through and attempted to optimize it for more speed too. Of course,
> the opposite can happen too. Sometimes novice users of a language
> port the code and end up with worse code, causing some languages to
> rank lower. IMO, if the tested code is "identical" or as equivalent
> as possible, the results you should see are assembly in 1st, C in
> 2nd, C++ in 3rd, etc. Java seems to be ranked quite a bit higher than
> I would expect it ... It's usually ranked in the lower 3rd of
> benchmarks. Fortran is ranked far, far higher than I would expect
> it.

Yes, as you can see here, gfortran is the slowest of all that I have
tried on the E8400 ;) and my program is the second slowest ;)
Intel Fortran seems to be very well optimized for their processors.


>
> > Only place that needs optimisations is macro advance.
> >
>
> Sorry, I'm not familiar with floating point, MMX, or SSE etc.
> Someone will usually respond here, but if not you could try a post to
> comp.lang.asm.x86. It's a moderated Usenet group, so you'll have to
> wait for post approval.

Thanks , I will.
I have looked at the code gcc generates, and besides loop unrolling I can't see
any other trick that it uses.
My program is much shorter, yet more than twice as slow.


Nathan

unread,
Apr 11, 2012, 2:34:53 AM4/11/12
to
On Apr 10, 1:37 pm, MelissA <me...@a.com> wrote:

What happens when you redirect the output??

change this:

> bmaxa@maxa:~/shootout$ time ./nbody 50000000

to:

bmaxa@maxa:~/shootout$ time ./nbody 50000000 >/dev/null

Nathan.
--
http://clax.inspiretomorrow.net

MelissA

unread,
Apr 11, 2012, 6:02:42 AM4/11/12
to
On Tue, 10 Apr 2012 23:34:53 -0700 (PDT)
Nathan <nathan...@gmail.com> wrote:

> On Apr 10, 1:37 pm, MelissA <me...@a.com> wrote:
>
> What happens when you redirect the output??
>
> change this:
>
> > bmaxa@maxa:~/shootout$ time ./nbody 50000000
>
> to:
>
> bmaxa@maxa:~/shootout$ time ./nbody 50000000 >/dev/null
>
> Nathan.


Same. Nothing is printed in the loop. In the meantime I sped up the program by
several seconds just by eliminating a temporary register ;)
bmaxa@maxa:~/shootout$ diff nbody.asm nbody1.asm
109,111c109,110
< movsd xmm4, xmm0
< mulsd xmm4, xmm6
< addsd xmm3, xmm4
---
> mulsd xmm0, xmm6
> addsd xmm3, xmm0
115,117c114,115
< movsd xmm4, xmm1
< mulsd xmm4, xmm6
< addsd xmm3, xmm4
---
> mulsd xmm1, xmm6
> addsd xmm3, xmm1
121,123c119,120
< movsd xmm4, xmm2
< mulsd xmm4, xmm6
< addsd xmm3, xmm4
---
> mulsd xmm2, xmm6
> addsd xmm3, xmm2
bmaxa@maxa:~/shootout$

Offending slow macro/proc is now:
mulsd xmm0, xmm6
addsd xmm3, xmm0
movsd [rdx + oBody.vx], xmm3 ; bodies[j].vx += dx * iBody.mass * mag;

movsd xmm3, [rdx + oBody.vy]
mulsd xmm1, xmm6
addsd xmm3, xmm1
movsd [rdx + oBody.vy], xmm3

movsd xmm3, [rdx + oBody.vz]
mulsd xmm2, xmm6
addsd xmm3, xmm2

Isaac Gouy

unread,
Apr 11, 2012, 1:10:52 PM4/11/12
to
On Apr 10, 7:09 am, "Rod Pemberton" <do_not_h...@notemailnot.cmm>
wrote:

-snip-
> IMO, if the tested code is "identical" or as equivalent as possible, the
> results you should see are assembly in 1st, C in 2nd, C++ in 3rd, etc.
> Java seems to be ranked quite a bit higher than I would expect it ...  It's
> usually ranked in the lower 3rd of benchmarks.

It's very easy to show Java performing badly on toy benchmarks and
microbenchmarks - just transliterate C or C++ code into Java putting
all the code into a single main method and then only let the program
run for a few thousandths of a second :-)

But if the code is structured like ordinary Java code, and the program
runs for longer than it takes to startup a JVM, toy benchmarks and
microbenchmarks seem to take about twice as long as C or C++ code (or
less).

> Fortran is ranked far, far higher than I would expect it.  The Fortran used
> happens to be by Intel which may be a factor in that, i.e., coded by an Intel expert
> and/or psuedo-Fortran.

iirc coded by programmers who are paid to program Fortran.

My guess is that not all compilers are created equal ;-)



Nomen Nescio

unread,
Apr 11, 2012, 4:47:43 PM4/11/12
to
"Rod Pemberton" <do_no...@notemailnot.cmm> wrote:

> "Melissa" <d...@ne.invalid> wrote in message news:jm1aog$730$1...@solani.org...
> > Hi , this is my attempt to implement in assembly
> > this java program: [link]
> >
> > My program executes in 35 seconds , while java in 22 seconds ;)

L O L...something is terribly TERRIBLY WRONG HERE!!!!!!!!!!!!!!!


> usually ranked in the lower 3rd of benchmarks. Fortran is ranked far, far
> higher than I would expect it.

Why, because it's not C?

> The Fortran used happens to be by Intel which may be a factor in that,
> i.e., coded by an Intel expert and/or psuedo-Fortran.

Intel Fortran, C and C++ always beat gcc in any contest on Intel chips. They have the best
optimizations since they have all the details.

> It's a horrid language.

Says who? It's the language with the longest history of optimisation around.

> It should have much more overhead than appears to be the situation.

Why?

MelissA

unread,
Apr 11, 2012, 6:16:44 PM4/11/12
to
On Wed, 11 Apr 2012 12:02:42 +0200
MelissA <me...@a.com> wrote:

I have sped up the program further by vectorizing
the x and y calculations and using some more registers.
Strangely, just using more registers gives a 10% speed-up.
Now the program executes in 23 seconds on a Q6600 @ 2.4 GHz.

New version of macro advance follows:
(struct body has a filler before vx so that
movapd can be used instead of movupd)
; Size in bytes of the padded body record: eight qword fields, 8 * 8 = 64.
SIZEOFBODY equ 64
; Padded layout. With the filler, the x/y pair sits at offset 0 and the vx/vy
; pair at offset 32, so both are 16-byte aligned whenever the record itself is.
; The macro below loads and stores those pairs with movapd.
; NOTE(review): assumes every record starts on a 16-byte boundary; confirm the
; section holding the bodies is aligned accordingly.
struc body {
.x dq ? ; position, x component
.y dq ? ; position, y component
.z dq ? ; position, z component
.filler dq ? ; padding only, not read by the macro below
.vx dq ? ; velocity, x component
.vy dq ? ; velocity, y component
.vz dq ? ; velocity, z component
.mass dq ? ; mass
}

macro advance
{
; xmm15 holds dt
local .L0,.L1,.L2,.L3
mov ecx,5 ; ecx - > i
mov rbx,sun
.L0:
dec ecx
jz .L2
mov r9, rcx ; r9 -> j
lea rdx, [rbx+SIZEOFBODY]
.L1:

movapd xmm0,dqword[rbx + oBody.x]
movsd xmm2,[rbx + oBody.z]
movapd xmm1,dqword[rdx + oBody.x]

subpd xmm0,xmm1 ; dx,dy -> xmm0
subsd xmm2,[rdx + oBody.z] ; dz -> xmm2

movapd xmm3,xmm0
movapd xmm5,xmm2

mulpd xmm3,xmm3
mulsd xmm5,xmm5

haddpd xmm3,xmm3
addsd xmm3,xmm5 ; dsquared -> xmm3

sqrtsd xmm4, xmm3 ; distance -> xmm4

mulsd xmm3,xmm4
movapd xmm5, xmm15
; most slowest instruction takes almost 50% of execution time
divsd xmm5,xmm3 ; mag -> xmm5

;-----------------------------------------------
movsd xmm6, [rdx + oBody.mass]
mulsd xmm6, xmm5 ; precompute bodies[j].mass * mag
movddup xmm6,xmm6

movapd xmm3, dqword[rbx + oBody.vx]
movapd xmm8, xmm0
mulpd xmm8, xmm6
subpd xmm3,xmm8
movapd dqword[rbx + oBody.vx],xmm3 ; iBody.vx -= dx * bodies[j].mass * mag;

movsd xmm3, [rbx + oBody.vz]
movapd xmm9, xmm2
mulsd xmm9, xmm6
subsd xmm3,xmm9
movsd [rbx + oBody.vz],xmm3
; ----------------------------------------------
movsd xmm6, [rbx + oBody.mass]
mulsd xmm6, xmm5 ; precompute iBody.mass * mag
movddup xmm6,xmm6

movapd xmm3, dqword[rdx + oBody.vx]
mulpd xmm0, xmm6
addpd xmm3, xmm0
movapd dqword[rdx + oBody.vx], xmm3 ; bodies[j].vx += dx * iBody.mass * mag;

movsd xmm3, [rdx + oBody.vz]
mulsd xmm2, xmm6
addsd xmm3, xmm2
movsd [rdx + oBody.vz], xmm3
;-----------------------------------------
add rdx,SIZEOFBODY
dec r9
jnz .L1
add rbx,SIZEOFBODY
jmp .L0
.L2:
mov rbx,sun
mov ecx,5
.L3:
movapd xmm0, dqword[rbx + oBody.x]
movsd xmm2, [rbx + oBody.z]

movapd xmm3 , xmm15
movddup xmm3,xmm3
mulpd xmm3, dqword[rbx + oBody.vx]
addpd xmm0, xmm3
movapd dqword[rbx + oBody.x], xmm0

movapd xmm3 , xmm15

Rod Pemberton

unread,
Apr 11, 2012, 7:14:49 PM4/11/12
to
"Nomen Nescio" <nob...@dizum.com> wrote in message
news:1afe1fd66bf1580a...@dizum.com...
> "Rod Pemberton" <do_no...@notemailnot.cmm> wrote:
...

> > usually ranked in the lower 3rd of benchmarks. Fortran is
> > ranked far, far higher than I would expect it.
>
> Why, because it's not C?
>

The guy living under the bridge knows me too well ... ;-)

Why do I always have to justify things for you? If you seriously believe
I'm wrong instead of just attempting to annoy me, why can't you post a valid
counter argument?

Because, it's a freakin' crap language, IMO. Almost all other high-level
languages have far more overhead than C. Fortran did well with numbers.
However, the rest of the language sucked as compared to other languages:
syntax sucked, control flow sucked, overhead sucked, character I/O sucked,
file I/O sucked, system interface sucked, documentation sucked. It just
sucked, except for floating point, where it's better than C numerically.
Maybe, it has changed a bit over the past three decades ... In the past few
years, a few have told me it has. However, the flaws inherent in the system
are never worked out, be it C, C++, Fortran, Forth, Pascal, US government,
Christianity, whatever ... The original flaws are kept - because they are
beloved by users, believers, programmers, whomever - and then more abhorrent
flaws are added onto the original mess, repeatedly, until eventually
everyone screams at the resultant nightmare. However, they are all
dependent on the nightmare and so choose to live in hell. It's optimism and
hope and do-goodism gone awry. Too many cooks spoil ... No one ever starts
with a clean slate or fixes the inherent problems once a language has been
established via a large user base. When they do, they meet with lackluster
success. Why? Because the advantages of the fixes aren't sufficient to
justify anyone changing or learning or improving anything. Just look at the
D language, US government laws, pick any topic, ...


> > It's a horrid language.
>
> Says who?

Says me.

Do I now need to get permission from you to express my well founded
opinions? Why?

Hey, if you like it, that's your business. Some people will literally eat
anything, and others like being tied up and beaten ... Not me! So, to
each, his own. However, not everyone is masochistic enough to enjoy the
chains and bruises of Fortran, so to speak.

> It's the language with the longest history of optimisation around.

Oh, I don't doubt that. To me, it's a bit like my Volkswagen Super Beetle.
It can be rebuilt forever since parts will always be available. However, it
also needs to be rebuilt forever since it can't ever be permanently fixed
... Some designs are just flawed forever. Evolution is working hard to fix
some of that ... When people step up and attempt to fix the problems
themselves, the timid world screams in horror at the supposed monstrosities:
Science, eugenics, pesticides, genetically modified foods, fetal tissue
research, ... Then, it's decades of hostile fights, horrendous laws, and
nightmares dragging the abundance of ignorant and stupid people into
the future.


Rod Pemberton
PS. I thought you said you weren't ever talking to me again since I
apparently offended you in c.l.f., in part by being something I've never
been: manic-depressive. I'm very stoic, BTW.






Robert Wessel

unread,
Apr 11, 2012, 8:31:55 PM4/11/12
to
On Wed, 11 Apr 2012 19:14:49 -0400, "Rod Pemberton"
<do_no...@notemailnot.cmm> wrote:

>"Nomen Nescio" <nob...@dizum.com> wrote in message
>news:1afe1fd66bf1580a...@dizum.com...
>> "Rod Pemberton" <do_no...@notemailnot.cmm> wrote:
>...
>
>> > usually ranked in the lower 3rd of benchmarks. Fortran is
>> > ranked far, far higher than I would expect it.
>>
>> Why, because it's not C?
>>
>
>The guy living under the bridge knows me too well ... ;-)
>
>Why do I always have to justify things for you? If you seriously believe
>I'm wrong instead of just attempting to annoy me, why can't you post a valid
>counter argument?
>
>Because, it's a freakin' crap language, IMO. Almost all other high-level
>languages have far more overhead than C. Fortran did well with numbers.
>However, the rest of the language sucked as compared to other languages:
>syntax sucked, control flow sucked, overhead sucked, character I/O sucked,
>file I/O sucked, system interface sucked, documentation sucked. It just
>sucked, except for floating point, where it's better than C numerically.


Which is why it's still used by the HPC community. Whatever other
crap Fortran is lugging around, it's good at the kind of math done in
the HPC world, especially stuff that can be vectorized by a compiler
(and pretty much nothing beats Fortran compilers at that).

And, in case it escaped your notice, that's exactly what this
benchmark program is. So why the surprise that Fortran is actually
good at it?

MelissA

unread,
Apr 12, 2012, 3:15:43 AM4/12/12
to
On Wed, 11 Apr 2012 22:47:43 +0200 (CEST)
Nomen Nescio <nob...@dizum.com> wrote:

> "Rod Pemberton" <do_no...@notemailnot.cmm> wrote:
>
> > "Melissa" <d...@ne.invalid> wrote in message
> > news:jm1aog$730$1...@solani.org...
> > > Hi , this is my attempt to implement in assembly
> > > this java program: [link]
> > >
> > > My program executes in 35 seconds , while java in 22 seconds ;)
>
> L O L...something is terribly TERRIBLY WRONG HERE!!!!!!!!!!!!!!!

It's not wrong, program prints correct result ;)
Just compare first and last version 35 vs 23 seconds.
If I comment out div instruction in macro advance
program executes in 15 seconds ;)
It happens that Fortran optimization uses
some magic ;)



Thomas Boell

unread,
Apr 12, 2012, 6:57:10 AM4/12/12
to
Did you test your program, the Java program, and the Fortran program on
the same machine?

MelissA

unread,
Apr 12, 2012, 6:59:54 AM4/12/12
to
Yes. On q6600 @ 2.4GHz centos 6. Result of Java and c++ seems to be same
as on shootout site.

MelissA

unread,
Apr 12, 2012, 7:04:23 AM4/12/12
to
On Thu, 12 Apr 2012 12:59:54 +0200
Oh, I don't have Intel Fortran. I tested just Java, C++ and asm.
gfortran is slower than my slowest version of program.

Fritz Wuehler

unread,
Apr 12, 2012, 7:20:09 AM4/12/12
to
MelissA <me...@a.com> wrote:

> On Wed, 11 Apr 2012 22:47:43 +0200 (CEST)
> Nomen Nescio <nob...@dizum.com> wrote:
>
> > "Rod Pemberton" <do_no...@notemailnot.cmm> wrote:
> >
> > > "Melissa" <d...@ne.invalid> wrote in message
> > > news:jm1aog$730$1...@solani.org...
> > > > Hi , this is my attempt to implement in assembly
> > > > this java program: [link]
> > > >
> > > > My program executes in 35 seconds , while java in 22 seconds ;)
> >
> > L O L...something is terribly TERRIBLY WRONG HERE!!!!!!!!!!!!!!!
>
> It's not wrong, program prints correct result ;)

So what?

> Just compare first and last version 35 vs 23 seconds.
> If I comment out div instruction in macro advance
> program executes in 15 seconds ;)

You have some seriously messed up code I am surprised Rod didnt fix it.

> It happens that Fortran optimization uses
> some magic ;)

I don't doubt that but Java was faster than your assembly.

!!!!!!!!!!!something is terribly TERRIBLY WRONG HERE!!!!!!!!!!!!!!!

Thomas Boell

unread,
Apr 12, 2012, 8:09:16 AM4/12/12
to
On Thu, 12 Apr 2012 13:20:09 +0200
Fritz Wuehler <fr...@spamexpire-201204.rodent.frell.theremailer.net>
wrote:

> MelissA <me...@a.com> wrote:
> [...]
> > Just compare first and last version 35 vs 23 seconds.
> > If I comment out div instruction in macro advance
> > program executes in 15 seconds ;)
>
> You have some seriously messed up code I am surprised Rod didnt fix it.
>
> > It happens that Fortran optimization uses
> > some magic ;)
>
> I don't doubt that but Java was faster than your assembly.
>
> !!!!!!!!!!!something is terribly TERRIBLY WRONG HERE!!!!!!!!!!!!!!!

Why don't you write something better? Should be terribly easy.

Robert Redelmeier

unread,
Apr 12, 2012, 10:29:02 AM4/12/12
to
Rod Pemberton <do_no...@notemailnot.cmm> wrote in part:
> Because, it's a freakin' crap language, IMO. Almost all other
> high-level languages have far more overhead than C. Fortran did
> well with numbers. However, the rest of the language sucked
> as compared to other languages: syntax sucked, control flow
> sucked, overhead sucked, character I/O sucked, file I/O sucked,
> system interface sucked, documentation sucked. It just sucked,
> except for floating point, where it's better than C numerically.
> Maybe, it has changed a bit over the past three decades ...

Try _six_ decades. Many of FORTRAN's "features"/flaws are
legacies and a result of preserving backwards compatibility.

Glass half full, or half empty? In everything there are
things to like and things to dislike. Which do you focus on?
Personally, I think it higher priority to take advantage of
strengths with avoiding weaknesses subordinate.

The very limitations of FORTRAN control flow, especially
around DO - loops are things that make vectorization easier
which keeps FORTRAN very viable for numeric processing.

-- Robert


MelissA

unread,
Apr 12, 2012, 4:20:33 PM4/12/12
to
I wouldn't ask for help if it is *that* easy ;)

Fritz Wuehler

unread,
Apr 12, 2012, 9:03:46 PM4/12/12
to
I could write it in Perl and it would be faster than her assembly

L O L L O L L O L L O L L O L L O L L O L L O L L O L L O L
>

Branimir Maksimovic

unread,
Apr 12, 2012, 9:18:32 PM4/12/12
to
On Fri, 13 Apr 2012 03:03:46 +0200
Perl needs 23 minutes...

http://shootout.alioth.debian.org/u64q/program.php?test=nbody&lang=perl&id=1

My new version executes at 16 seconds which is just two seconds slower
than fortran ;)
Posted new version on comp.lang.asm.x86 and here so it will be delayed
a bit ;)


Nomen Nescio

unread,
Apr 15, 2012, 3:15:09 PM4/15/12
to
"Rod Pemberton" <do_no...@notemailnot.cmm> wrote:

> "Nomen Nescio" <nob...@dizum.com> wrote in message
> news:1afe1fd66bf1580a...@dizum.com...
> > "Rod Pemberton" <do_no...@notemailnot.cmm> wrote:
> ...
>
> > > usually ranked in the lower 3rd of benchmarks. Fortran is
> > > ranked far, far higher than I would expect it.
> >
> > Why, because it's not C?
> >
>
> The guy living under the bridge knows me too well ... ;-)

I saw you living in a van down by the river!

> Why do I always have to justify things for you?

Because every time you post something not 100% related to C you're usually
wrong. Somebody has to point it out, that's usenet!

> If you seriously believe I'm wrong instead of just attempting to annoy me,
> why can't you post a valid counter argument?

Hmm that's some seriously good advice that applies to pretty much every post
where you have ever typed the word "mainframe". Anyway I had to establish
what the problem was before I could post a valid counter argument. Now that
you confirmed my suspicion I will.

> Because, it's a freakin' crap language, IMO. Almost all other high-level
> languages have far more overhead than C.

True at compilation time, probably not generally true at runtime unless
you're talking about scripting languages.

> Fortran did well with numbers. However, the rest of the language sucked as
> compared to other languages: syntax sucked, control flow sucked, overhead
> sucked, character I/O sucked, file I/O sucked, system interface sucked,
> documentation sucked.

On the implementation YOU knew about. On real machines (yeah mainframes) it
was very efficient and they spent tons of money on improving it. A lot of
the optimisation techniques we have today are based on work they did with
FORTRAN.

Even if older non-mainframe versions of FORTRAN/Fortran may or may not have
"sucked" today almost everything that runs on Linux/UNIX uses libc so the
character io, file io, system interface etc are all identical to C.
Everything uses libc. And the documentation? Well IBM's doc was always great
and from what I saw there are plenty of UNIX/Linux compilers and the doc is
pretty good too. Not GCC shit, I mean from vendors like Intel, etc.

> It just sucked, except for floating point, where it's better than C
> numerically. Maybe, it has changed a bit over the past three decades ...

Yeah almost everything has but your opinions ;-)

> In the past few years, a few have told me it has. However, the flaws
> inherent in the system are never worked out, be it C, C++, Fortran, Forth,
> Pascal, US government, Christianity, whatever ... The original flaws are
> kept - because they are beloved by users, believers, programmers, whomever
> - and then more abhorrent flaws are added onto the original mess,
> repeatedly, until eventually everyone screams at the resultant nightmare.

This paragraph deserves a usenet sticky!

> However, they are all dependent on the nightmare and so choose to live in
> hell. It's optimism and hope and do-goodism gone awry. Too many cooks
> spoil ... No one ever starts with a clean slate or fixes the inherent
> problems once a language has been established via a large user base. When
> they do, they meet with lackluster success. Why? Because the advantages
> of the fixes aren't sufficient to justify anyone changing or learning or
> improving anything. Just look at the D language, US government laws, pick
> any topic, ...

I agree with that too. There is one thing they did with a clean slate and it
still works great so many decades later, the IBM mainframe! And assembler,
PL/I, COBOL, etc. Yeah even FORTRAN.

> > It's the language with the longest history of optimisation around.
>
> Oh, I don't doubt that. To me, it's a bit like my Volkswagen Super
> Beetle.

No, it isnt. FORTRAN was designed, from the beginning. They spent tons of
research and money inventing all sorts of optimisations. The beetle was shit
from the beginning and no amount of rustoleum or Wal Mart fans can fix it.

Fritz Wuehler

unread,
Apr 15, 2012, 5:05:01 PM4/15/12
to
"Rod Pemberton" <do_no...@notemailnot.cmm> wrote:

> "Nomen Nescio" <nob...@dizum.com> wrote in message
> news:1afe1fd66bf1580a...@dizum.com...
> > "Rod Pemberton" <do_no...@notemailnot.cmm> wrote:
> ...
>
> > > usually ranked in the lower 3rd of benchmarks. Fortran is
> > > ranked far, far higher than I would expect it.
> >
> > Why, because it's not C?
> >
>
> The guy living under the bridge knows me too well ... ;-)

I saw you living in a van down by the river!

> Why do I always have to justify things for you?

Because every time you post something not 100% related to C you're usually
wrong. Somebody has to point it out, that's usenet!

> If you seriously believe I'm wrong instead of just attempting to annoy me,
> why can't you post a valid counter argument?

Hmm, that's some seriously good advice that applies to pretty much every post
where you have ever typed the word "mainframe". Anyway, I had to establish
what the problem was before I could post a valid counter argument. Now that
you confirmed my suspicion, I will.

> Because, it's a freakin' crap language, IMO. Almost all other high-level
> languages have far more overhead than C.

True at compilation time, probably not generally true at runtime unless
you're talking about scripting languages.

> Fortran did well with numbers. However, the rest of the language sucked as
> compared to other languages: syntax sucked, control flow sucked, overhead
> sucked, character I/O sucked, file I/O sucked, system interface sucked,
> documentation sucked.

On the implementation YOU knew about. On real machines (yeah mainframes) it
was very efficient and they spent tons of money on improving it. A lot of
the optimisation techniques we have today are based on work they did with
FORTRAN.

Even if older non-mainframe versions of FORTRAN/Fortran may or may not have
"sucked", today almost everything that runs on Linux/UNIX uses libc, so the
character I/O, file I/O, system interface, etc. are all identical to C.
Everything uses libc. And the documentation? Well IBM's doc was always great
and from what I saw there are plenty of UNIX/Linux compilers and the doc is
pretty good too. Not GCC shit, I mean from vendors like Intel, etc.

> It just sucked, except for floating point, where it's better than C
> numerically. Maybe, it has changed a bit over the past three decades ...

Yeah almost everything has but your opinions ;-)

> In the past few years, a few have told me it has. However, the flaws
> inherent in the system are never worked out, be it C, C++, Fortran, Forth,
> Pascal, US government, Christianity, whatever ... The original flaws are
> kept - because they are beloved by users, believers, programmers, whomever
> - and then more abhorrent flaws are added onto the original mess,
> repeatedly, until eventually everyone screams at the resultant nightmare.

This paragraph deserves a usenet sticky!

> However, they are all dependent on the nightmare and so choose to live in
> hell. It's optimism and hope and do-goodism gone awry. Too many cooks
> spoil ... No one ever starts with a clean slate or fixes the inherent
> problems once a language has been established via a large user base. When
> they do, they meet with lackluster success. Why? Because the advantages
> of the fixes aren't sufficient to justify anyone changing or learning or
> improving anything. Just look at the D language, US government laws, pick
> any topic, ...

I agree with that too. There is one thing they did with a clean slate and it
still works great so many decades later, the IBM mainframe! And assembler,
PL/I, COBOL, etc. Yeah even FORTRAN.

> > It's the language with the longest history of optimisation around.
>
> Oh, I don't doubt that. To me, it's a bit like my Volkswagen Super
> Beetle.

0 new messages