please find below some code that calculates the spherical distance between random points on a sphere. It typically gives me a 5x speedup on our 28 processor machine.
To get even more speedup I would love to parallelize the outer loop in the sdist function, which essentially fills a triangle matrix.
from numba import njit
import numpy
import math
import time
import os
@njit(cache=False,parallel=False)
def tdist(dists,lats,lons,weight):
x=numpy.cos(lats*math.pi/180.)*numpy.cos(lons*math.pi/180.)
y=numpy.cos(lats*math.pi/180.)*numpy.sin(lons*math.pi/180.)
z=numpy.sin(lats*math.pi/180.)
sdist(dists, x, y, z)
dists[:]=numpy.arccos(dists*0.999999)
return
@njit(cache=False,parallel=True)
def pdist(dists,lats,lons,weight):
x=numpy.cos(lats*math.pi/180.)*numpy.cos(lons*math.pi/180.)
y=numpy.cos(lats*math.pi/180.)*numpy.sin(lons*math.pi/180.)
z=numpy.sin(lats*math.pi/180.)
sdist(dists, x, y, z)
dists[:]=numpy.arccos(dists*0.999999)
return
@njit()
def sdist(dists,x,y,z):
id=0
for l in range(x.shape[0]):
for k in range(l,x.shape[0]):
dists[id]=x[l]*x[k]+y[l]*y[k]+z[l]*z[k]
id+=1
return
n=3000
lats=numpy.random.rand(n)*180.-90.
lons=numpy.random.rand(n)*360.
dists=numpy.empty((n+1)*n/2,numpy.float64)
dists2=dists.copy()
for r in range(20):
t=time.time()
tdist(dists,lats,lons,1)
t1=time.time()
pdist(dists2,lats,lons,1)
print ('serial: {:5.4f}s parallel: {:5.4f}s speedup: {:5.2f}'.format(t1-t,time.time()-t1,(t1-t)/(time.time()-t1)))
assert(numpy.sum(dists)==numpy.sum(dists2))
t=time.time()
for r in range(20):
tdist(dists,lats,lons,1)
t1=time.time()
for r in range(20):
pdist(dists2,lats,lons,1)
print ('sequential serial: {:5.4f}s parallel: {:5.4f}s speedup: {:5.2f}'.format(t1-t,time.time()-t1,(t1-t)/(time.time()-t1)))
assert(numpy.sum(dists)==numpy.sum(dists2))