 1 @cuda.jit(device=True)
 2 def dev_sum(a, b):
 3     return a + b
 4
 5 @cuda.jit
 6 def cu_sum(a, num_to_sums, b):
 7     sa = cuda.shared.array(shape=(512,), dtype=float64)
 8     tx = cuda.threadIdx.x
 9     bx = cuda.blockIdx.x
10     bw = cuda.blockDim.x
11     i = tx + bx * bw
12
13     sa[tx] = 0
14     if i < a.shape[0]:
15         num_choices = 200
16         num_to_sum = num_to_sums[i]
17         l_a = cuda.local.array(shape=num_choices, dtype=float64)
18
19         l_sum = float64(0.)
20         for j in xrange(num_to_sum):
21             l_a[j] = math.exp(a[i, j])
22             l_sum += l_a[j]
23
24         l_b = float64(0.)
25         # uncommenting line 26 means results match those expected (i.e. from NumPy)
26         #l_one = l_sum / l_sum
27         for j in xrange(num_to_sum):
28             # fails unless line 26 is uncommented
29             l_b += math.log(l_a[j]) * l_sum / l_sum
30
31         sa[tx] = l_b
32     cuda.syncthreads()
33     if tx == 0:
34         # use the first thread of each block to perform the final reduction
35         s = sa[tx]
36         for j in range(1, bw):
37             s = dev_sum(s, sa[j])
38         b[bx] = s
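Mathematically, the inner loops should amount to a plain sum: `log(exp(x)) == x` and `l_sum / l_sum == 1`, so each row's contribution is simply the sum of its first `num_to_sum` entries. For reference, a minimal host-side sketch of the value each row is expected to produce (the helper name `expected_row_sums` is introduced here purely for illustration and is not part of the original code):

import numpy as np

def expected_row_sums(a, num_to_sums):
    # log(exp(x)) == x, so each row's result is just the sum of its first
    # num_to_sums[i] entries; the kernel's per-block partial sums should
    # add up to the total of these per-row values.
    return np.array([a[i, :num_to_sums[i]].sum() for i in range(a.shape[0])])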
import math
import numpy as np
from numba import cuda

def test_cu_sum(self):
    n = 1000  # total length of vector to be summed
    np.random.seed(seed=1112)
    a = np.random.uniform(low=-100, high=100, size=(n, 200)).astype(np.float64)
    num_to_sums = np.random.randint(1, 200, n)

    # NumPy reference: add up a different number of items from each of the n rows
    n_dim = 0
    numpy_sum_a = np.float64(0.)
    for num in num_to_sums:
        for i in xrange(num):
            numpy_sum_a += np.log(np.exp(a[n_dim, i]))
        n_dim += 1

    d_a = cuda.to_device(a)
    grid_size = n
    threads = 512
    threads_per_block = (threads, 1)
    blocks_per_grid = int(math.ceil(grid_size / float(threads)))
    b = np.zeros((blocks_per_grid,), dtype=np.float64)
    d_b = cuda.to_device(b)
    d_num_to_sums = cuda.to_device(num_to_sums)

    cu_sum[blocks_per_grid, threads_per_block](d_a, d_num_to_sums, d_b)
    d_b.to_host()

    sum_a = 0
    for result_index in xrange(blocks_per_grid):
        sum_a += b[result_index]
    self.assertAlmostEqual(sum_a, numpy_sum_a)
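The test uses the older Numba idioms (`xrange` inside the kernel, the in-place `d_b.to_host()` copy). On a current Numba release the same transfer, launch, and copy-back would look roughly like the sketch below; this is an assumption about the newer API, not the code the question was run against, and it reuses `a`, `num_to_sums`, `blocks_per_grid`, and `threads` from the test above.

import numpy as np
from numba import cuda

# Assumes cu_sum has been rewritten to use range instead of xrange.
d_a = cuda.to_device(a)
d_num_to_sums = cuda.to_device(num_to_sums)
d_b = cuda.device_array(blocks_per_grid, dtype=np.float64)

cu_sum[blocks_per_grid, threads](d_a, d_num_to_sums, d_b)

b = d_b.copy_to_host()  # replaces the in-place d_b.to_host() pattern
sum_a = b.sum()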