My adventures with parallel programming with Julia continue. Here is a different issue from other threads: My parallel function is 8300x slower than my serial function even though I am running on 4 processes on a multi-core machine.
I have Julia 0.3.8. Here is my program in its entirety (not very long).
function main()
nbig::Int16 = 7
nbod::Int16 = nbig
bod = Float64[
0 1 2 3 4 5 6 # x position
0 0 0 0 0 0 0 # y position
0 0 0 0 0 0 0 # z position
0 0 0 0 0 0 0 # x velocity
0 0 0 0 0 0 0 # y velocity
0 0 0 0 0 0 0 # z velocity
1 1 1 1 1 1 1 # Mass
]
a = zeros(3,nbod)
@time for k = 1:1000
gravity_1!(bod, nbig, nbod, a)
end
println(a[1,:])
@time for k = 1:1000
gravity_2!(bod, nbig, nbod, a)
end
println(a[1,:])
end
function gravity_1!(bod, nbig, nbod, a)
for i = 1:nbod
a[1,i] = 0.0
a[2,i] = 0.0
a[3,i] = 0.0
end
@inbounds for i = 1:nbig
for j = (i + 1):nbod
dx = bod[1,j] - bod[1,i]
dy = bod[2,j] - bod[2,i]
dz = bod[3,j] - bod[3,i]
s_1 = 1.0 / sqrt(dx*dx+dy*dy+dz*dz)
s_3 = s_1 * s_1 * s_1
tmp1 = s_3 * bod[7,i]
tmp2 = s_3 * bod[7,j]
a[1,j] = a[1,j] - tmp1*dx
a[2,j] = a[2,j] - tmp1*dy
a[3,j] = a[3,j] - tmp1*dz
a[1,i] = a[1,i] + tmp2*dx
a[2,i] = a[2,i] + tmp2*dy
a[3,i] = a[3,i] + tmp2*dz
end
end
return a
end
function gravity_2!(bod, nbig, nbod, a)
for i = 1:nbod
a[1,i] = 0.0
a[2,i] = 0.0
a[3,i] = 0.0
end
@inbounds @sync @parallel for i = 1:nbig
for j = (i + 1):nbod
dx = bod[1,j] - bod[1,i]
dy = bod[2,j] - bod[2,i]
dz = bod[3,j] - bod[3,i]
s_1 = 1.0 / sqrt(dx*dx+dy*dy+dz*dz)
s_3 = s_1 * s_1 * s_1
tmp1 = s_3 * bod[7,i]
tmp2 = s_3 * bod[7,j]
a[1,j] = a[1,j] - tmp1*dx
a[2,j] = a[2,j] - tmp1*dy
a[3,j] = a[3,j] - tmp1*dz
a[1,i] = a[1,i] + tmp2*dx
a[2,i] = a[2,i] + tmp2*dy
a[3,i] = a[3,i] + tmp2*dz
end
end
return a
end
So this is a straight forward N-body gravity calculation. Yes, I realize that gravity_2!() is wrong, but that's fine. Right now I'm just talking about the CPU time. When I run this on my computer I get:
So, the serial version takes 0.000475 seconds and the parallel takes 3.95 seconds. Furthermore, the parallel version is calling the garbage collector. I suspect that the problem has something to do with the memory access. Maybe the parallel code is wasting a lot of time copying variables in memory. But whatever the reason, this is bad. The documentation says that @parallel is supposed to be fast, even for very small loops, but that's not what I'm seeing. A non-buggy implementation will be even slower.
Have I missed something? Is there an obvious error in how I'm using the parallel constructs?
I would appreciate any guidance you may offer.
Daniel.