numba CUDA performance advice

Martin Langer

Aug 4, 2017, 9:18:26 AM
to Numba Public Discussion - Public
Hi,

I'm looking for some advice on an implementation of some parallel code of mine:

import math
import time
import numpy as np

import numba
from numba import cuda


def generate_iteration_operator(lin1, lin2, cub1, cub2, coupling, order):
    exponent = 2**order + 1
    
    def iteration_operator(point, output):
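        # One iteration of the map: read (p1, p2, q1, q2) from point and
        # write the updated state back into output.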
        def boundary_condition(x):
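            # Wrap the argument periodically into the interval [-0.5, 0.5).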
            temp_mod = (x + 0.5) % 1.0
            if temp_mod < 0.0:
                return temp_mod + 0.5
            else:
                return temp_mod - 0.5
        
        p1 = point[0]
        p2 = point[1]
        q1 = point[2]
        q2 = point[3]

        new_q1 = boundary_condition(q1 + p1)
        new_q2 = boundary_condition(q2 + p2)

        coupling_func = coupling * (1.0 - math.cos(2*math.pi*(new_q1+new_q2)))

        arg1 = 2 * new_q1
        arg2 = 2 * new_q2

        dV1 = (lin1*arg1 - (lin1+cub1)*arg1**exponent + cub1*arg1*arg1*arg1)
        dV2 = (lin2*arg2 - (lin2+cub2)*arg2**exponent + cub2*arg2*arg2*arg2)
        
        new_p1 = boundary_condition(p1 - dV1 + coupling_func)
        new_p2 = boundary_condition(p2 - dV2 + coupling_func)

        output[0] = new_p1
        output[1] = new_p2
        output[2] = new_q1
        output[3] = new_q2

    return iteration_operator


def generate_cuda_kernel(py_iter_operator, n_init_conds):
    # Compile the Python iteration operator as an inlined CUDA device function.
    dev_operator = cuda.jit(py_iter_operator, device=True, inline=True)
    
    # One thread per initial condition, 32 threads per block.
    threadsperblock = 32
    blockspergrid = (n_init_conds + (threadsperblock - 1)) // threadsperblock
    chunk_size = 6

    @cuda.jit
    def parallel_computation(mem_array):
        i = cuda.grid(1)
        
        if i < mem_array.shape[0]:
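            # Per-thread local buffer: advance chunk_size states locally before
            # writing them back, to limit global memory traffic.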
            buffer = cuda.local.array((chunk_size, 4), numba.float64)
            n_chunks = mem_array.shape[1] // chunk_size
            n_remainder = mem_array.shape[1] % chunk_size
            
            for k in range(4):
                buffer[0, k] = mem_array[i, 0, k]
            
            # Advance the orbit chunk-wise inside the local buffer.
            for chunk_id in range(n_chunks):
                for iter_idx in range(chunk_size-1):
                    dev_operator(buffer[iter_idx], buffer[iter_idx+1])
               
                chunk_idx = chunk_id*chunk_size
                for j in range(chunk_size):
                    for k in range(4):
                        mem_array[i, chunk_idx+j, k] = buffer[j, k]
    
                # Compute the first state of the next chunk from the last buffered state.
                dev_operator(buffer[chunk_size-1], buffer[0])
        
            # Handle the remaining iterations that do not fill a whole chunk.
            for iter_idx in range(n_remainder-1):
                dev_operator(buffer[iter_idx], buffer[iter_idx+1])

            remainder_offset = n_chunks*chunk_size
            for j in range(n_remainder):
                for k in range(4):
                    mem_array[i, remainder_offset+j, k] = buffer[j, k]
    
    # Bind the launch configuration so the configured kernel can be called directly.
    kernel = parallel_computation[blockspergrid, threadsperblock]

    return kernel


def meas_time(method, n, *args):
    start = time.time()
    for _ in range(n):
        method(*args)
    end = time.time()
    duration = (end - start) / n
    return "execution time: {} ms".format(round(100*duration))


mapping = generate_iteration_operator(1.04, 1.35, 0.1, 0.1, 0.3, 5)
init_conds = 32*32
orbit_length = 10000

mem_array = np.zeros((init_conds, orbit_length, 4))
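# Every orbit starts from the same initial state in this example.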
mem_array[:,0,:] = np.array([0.001, 0.009, 0.006, 0.003])

cuda_kernel = generate_cuda_kernel(mapping, init_conds)
cuda_kernel(mem_array)
print("computed: {} orbits with length {}".format(init_conds, orbit_length))
print(meas_time(cuda_kernel, 10, mem_array))

I don't have any experience with CUDA, but I know that global memory accesses are expensive,
so I improved performance a little by performing the iterations chunk-wise in a local buffer.
My problem:
The code is not really fast (not much better than a prange version with Numba; a rough sketch of that baseline is below). Does anyone have advice or experience on how to do this better?
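
For reference, the prange baseline I compare against looks roughly like this (a simplified sketch, not the exact code I benchmark; it just reuses the same iteration operator compiled with numba.njit):

cpu_operator = numba.njit(mapping)

@numba.njit(parallel=True)
def cpu_computation(mem_array):
    # One orbit per initial condition, same memory layout as in the CUDA version.
    for i in numba.prange(mem_array.shape[0]):
        for t in range(mem_array.shape[1] - 1):
            cpu_operator(mem_array[i, t], mem_array[i, t + 1])

print(meas_time(cpu_computation, 10, mem_array.copy()))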

Sincerely,
Martin

PS:
I get a strange error if I don't import numba before explicitly importing cuda via
from numba import cuda
Is this a bug? The two import orderings I mean are sketched below.
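
Only the second ordering fails for me:

import numba
from numba import cuda    # works

from numba import cuda    # without a preceding "import numba", this raises the error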
