Using CUDA-aware MPI / GPUDirect with Numba


kirchen...@googlemail.com

Feb 25, 2018, 4:57:46 AM
to Numba Public Discussion - Public
Hi,

I managed to get a working solution for sending Numba GPU arrays directly over MPI (via mpi4py) on top of a CUDA-aware MPI implementation, and I wanted to share it with you.
See example code below.

Cheers,
Manuel

#!/usr/bin/env python
"""
Example adapted from user lebedov:


https://gist.github.com/lebedov/8514d3456a94a6c73e6d


Demo of how to pass GPU memory managed by numba to mpi4py.


Notes
-----
This code can be used to perform peer-to-peer communication of data via
NVIDIA's GPUDirect technology if mpi4py has been built against a
CUDA-enabled MPI implementation.
"""

import sys


import numpy as np
from mpi4py import MPI
from numba import cuda


comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

if rank == 0:
    # Build a host array, copy it to the GPU, and send it directly from device memory.
    ary = np.arange(100, 200, 10, dtype=np.double)
    gpu_ary = cuda.to_device(ary)
    print(('before (%i): ' % rank)+str(ary))
    # Wrap the raw device pointer in an MPI buffer; a CUDA-aware MPI can
    # transfer from the device address without an explicit host copy.
    comm.Send(
        [MPI.memory.fromaddress(
            gpu_ary.device_ctypes_pointer.value,
            gpu_ary.alloc_size),
         MPI.DOUBLE], dest=1)
    print('sent')
    print(('after  (%i): ' % rank)+str(gpu_ary.copy_to_host()))
elif rank == 1:
    # Receive straight into a device array of matching size and dtype.
    ary = np.zeros(10, dtype=np.double)
    gpu_ary = cuda.to_device(ary)
    print(('before (%i): ' % rank)+str(ary))
    comm.Recv(
        [MPI.memory.fromaddress(
            gpu_ary.device_ctypes_pointer.value,
            gpu_ary.alloc_size),
         MPI.DOUBLE], source=0)
    print('received')
    print(('after  (%i): ' % rank)+str(gpu_ary.copy_to_host()))
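
One thing this script does not do (the pycuda version below does, via drv.Device(rank)) is pin each rank to its own GPU; both ranks use the default device. A minimal sketch of per-rank device selection with Numba, assuming at least one GPU per rank and inserted before the arrays are created, would be:

# Sketch only, not part of the original example: give each MPI rank its own
# GPU before any device allocations (assumes >= comm.Get_size() GPUs per node).
cuda.select_device(rank)
gpu_ary = cuda.to_device(ary)   # subsequent allocations land on that device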



kirchen...@googlemail.com

Feb 25, 2018, 5:17:38 AM
to Numba Public Discussion - Public, kirchen...@googlemail.com
For completeness, here is the same example for pycuda.

#!/usr/bin/env python


"""
Example adapted from user lebedov:


https://gist.github.com/lebedov/8514d3456a94a6c73e6d


Demo of how to pass GPU memory managed by pycuda to mpi4py.



Notes
-----
This code can be used to perform peer-to-peer communication of data via
NVIDIA's GPUDirect technology if mpi4py has been built against a
CUDA-enabled MPI implementation.
"""

import atexit
import sys


import pycuda


import numpy as np
from mpi4py import MPI


import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
drv.init()

comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

N_gpu = drv.Device.count()
if N_gpu < 2:
    sys.stdout.write('at least 2 GPUs required')
else:
    # One GPU per rank: create a context on it and make sure the context is
    # popped (and MPI finalized) when the interpreter exits.
    dev = drv.Device(rank)
    ctx = dev.make_context()
    atexit.register(ctx.pop)
    atexit.register(MPI.Finalize)

    if rank == 0:
        x_gpu = gpuarray.arange(100, 200, 10, dtype=np.double)
        print(('before (%i): ' % rank)+str(x_gpu))
        # Hand the raw device pointer to MPI; a CUDA-aware build transfers
        # the data directly from GPU memory.
        comm.Send(
            [MPI.memory.fromaddress(x_gpu.ptr, x_gpu.nbytes),
             MPI.DOUBLE], dest=1)
        print('sent')
        print(('after  (%i): ' % rank)+str(x_gpu))
    elif rank == 1:
        x_gpu = gpuarray.zeros(10, dtype=np.double)
        print(('before (%i): ' % rank)+str(x_gpu))
        comm.Recv(
            [MPI.memory.fromaddress(x_gpu.ptr, x_gpu.nbytes),
             MPI.DOUBLE], source=0)
        print('received')
        print(('after  (%i): ' % rank)+str(x_gpu))
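
In both scripts the pattern is identical: wrap the raw device pointer and the byte count in an MPI.memory buffer and pass that to Send/Recv, so a CUDA-aware MPI can move the data without staging it through the host first. A small hypothetical helper (the name is just an illustration, not something from the examples above) that covers both cases:

def device_buffer(ptr, nbytes):
    # Wrap a raw CUDA device pointer as an MPI buffer; no copy is made, so
    # the device allocation must stay alive until the MPI call completes.
    #   Numba device array:  ptr = gpu_ary.device_ctypes_pointer.value, nbytes = gpu_ary.alloc_size
    #   pycuda GPUArray:     ptr = x_gpu.ptr,                           nbytes = x_gpu.nbytes
    return MPI.memory.fromaddress(ptr, nbytes)

# e.g. comm.Send([device_buffer(x_gpu.ptr, x_gpu.nbytes), MPI.DOUBLE], dest=1)

Both scripts are launched like any other mpi4py program, e.g. mpirun -n 2 python <script>.py, provided mpi4py was built against a CUDA-aware MPI (for Open MPI, a build configured with --with-cuda).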