I have a convolution function that I have translated to Cython (with great effect), and since it is relatively easy to parallelize, I thought that I would do so. While the single-threaded cython code works fine, I noticed that the parallelized version gives me a different result each time I run it (given the same input) and I'm not sure why. Perhaps someone here might have an idea of what's going on and what I can do to fix it? The parallelized code almost works right, and is much faster, so I would really like to figure this one out.
def grid_1d(DTYPE_t[:] u, CTYPE_t[:] vis, \
double du, int Nu, double umin, int W, double alpha, bool hermitianize):
cdef int N = u.shape[0]
cdef double Du = W*du
cdef Py_ssize_t indx, undx, kndx
cdef double uval, uref, tu, gcf_val
cdef CTYPE_t visval
cdef double temp = 0.
cdef DTYPE_t [:] ug = \
np.zeros(N*W, dtype=DTYPE)
cdef CTYPE_t [:] visg = \
np.zeros(N*W, dtype=CTYPE)
cdef CTYPE_t [:] gv = \
np.zeros(Nu, dtype=CTYPE) # output array
# From Beatty et al. (2005)
cdef double beta = get_beta(W, alpha)
cdef int hcheck = 0
if hermitianize:
hcheck = 1
# do convolution
for indx in prange(N, nogil=True, schedule='guided'):
visval = vis[indx]
uval = u[indx]
uref = ceil((uval - 0.5*W*du)/du)*du
for undx in range(W):
tu = uref + undx*du
kndx = indx*W + undx
# just returns the convolution kernel value
gcf_val = gcf_kaiser(tu-uval, Du, beta)
visg[kndx] = visval*gcf_val
ug[kndx] = tu
cdef Py_ssize_t undx2 = 0
# sample onto regular grid
for indx in prange(N*W, nogil=True, schedule='guided'):
temp = (ug[indx] - umin)/du + 0.5
undx2 = int(temp)
if (undx2>=0 and undx2<Nu):
gv[undx2] = gv[undx2] + visg[indx]
if hcheck:
temp = (-1.*ug[indx] - umin)/du + 0.5
undx = int(temp)
if (undx2>=0 and undx2<Nu):
gv[undx2] = gv[undx2] + visg[indx].conjugate()
return gv