Using CUDA-aware MPI / GPUDirect with Numba


kirchen...@googlemail.com

Feb 25, 2018, 4:57:46 AM
to Numba Public Discussion - Public
Hi,

I managed to get a working solution for sending Numba GPU arrays directly over MPI (via mpi4py) on top of a CUDA-aware MPI implementation, and I wanted to share it with you.
See example code below.

Cheers,
Manuel

#!/usr/bin/env python
"""
Example adapted from user lebedov:


https://gist.github.com/lebedov/8514d3456a94a6c73e6d


Demo of how to pass GPU memory managed by numba to mpi4py.


Notes
-----
This code can be used to perform peer-to-peer communication of data via
NVIDIA's GPUDirect technology if mpi4py has been built against a
CUDA-enabled MPI implementation.
"""

import sys


import numpy as np
from mpi4py import MPI
from numba import cuda


comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

if rank == 0:
    # Build a host array, copy it to the GPU, and send it directly from device memory.
    ary = np.arange(100, 200, 10, dtype=np.double)
    gpu_ary = cuda.to_device(ary)
    print(('before (%i): ' % rank)+str(ary))
    # Wrap the raw device pointer in an MPI buffer; a CUDA-aware MPI can
    # transfer from the device address without an explicit host copy.
    comm.Send(
        [MPI.memory.fromaddress(
            gpu_ary.device_ctypes_pointer.value,
            gpu_ary.alloc_size),
         MPI.DOUBLE], dest=1)
    print('sent')
    print(('after  (%i): ' % rank)+str(gpu_ary.copy_to_host()))
elif rank == 1:
    # Receive straight into a device array of matching size and dtype.
    ary = np.zeros(10, dtype=np.double)
    gpu_ary = cuda.to_device(ary)
    print(('before (%i): ' % rank)+str(ary))
    comm.Recv(
        [MPI.memory.fromaddress(
            gpu_ary.device_ctypes_pointer.value,
            gpu_ary.alloc_size),
         MPI.DOUBLE], source=0)
    print('received')
    print(('after  (%i): ' % rank)+str(gpu_ary.copy_to_host()))
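
One thing this script does not do (the pycuda version below does, via drv.Device(rank)) is pin each rank to its own GPU; both ranks use the default device. A minimal sketch of per-rank device selection with Numba, assuming at least one GPU per rank and inserted before the arrays are created, would be:

# Sketch only, not part of the original example: give each MPI rank its own
# GPU before any device allocations (assumes >= comm.Get_size() GPUs per node).
cuda.select_device(rank)
gpu_ary = cuda.to_device(ary)   # subsequent allocations land on that device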



kirchen...@googlemail.com

Feb 25, 2018, 5:17:38 AM
to Numba Public Discussion - Public, kirchen...@googlemail.com
For completeness, here is the same example for pycuda.

#!/usr/bin/env python


"""
Example adapted from user lebedov:


https://gist.github.com/lebedov/8514d3456a94a6c73e6d


Demo of how to pass GPU memory managed by pycuda to mpi4py.



Notes
-----
This code can be used to perform peer-to-peer communication of data via
NVIDIA's GPUDirect technology if mpi4py has been built against a
CUDA-enabled MPI implementation.
"""

import atexit
import sys


import pycuda


import numpy as np
from mpi4py import MPI


import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
drv.init()

comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

N_gpu = drv.Device.count()
if N_gpu < 2:
    sys.stdout.write('at least 2 GPUs required')
else:
    # One GPU per rank: create a context on it and make sure the context is
    # popped (and MPI finalized) when the interpreter exits.
    dev = drv.Device(rank)
    ctx = dev.make_context()
    atexit.register(ctx.pop)
    atexit.register(MPI.Finalize)

    if rank == 0:
        x_gpu = gpuarray.arange(100, 200, 10, dtype=np.double)
        print(('before (%i): ' % rank)+str(x_gpu))
        # Hand the raw device pointer to MPI; a CUDA-aware build transfers
        # the data directly from GPU memory.
        comm.Send(
            [MPI.memory.fromaddress(x_gpu.ptr, x_gpu.nbytes),
             MPI.DOUBLE], dest=1)
        print('sent')
        print(('after  (%i): ' % rank)+str(x_gpu))
    elif rank == 1:
        x_gpu = gpuarray.zeros(10, dtype=np.double)
        print(('before (%i): ' % rank)+str(x_gpu))
        comm.Recv(
            [MPI.memory.fromaddress(x_gpu.ptr, x_gpu.nbytes),
             MPI.DOUBLE], source=0)
        print('received')
        print(('after  (%i): ' % rank)+str(x_gpu))
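
In both scripts the pattern is identical: wrap the raw device pointer and the byte count in an MPI.memory buffer and pass that to Send/Recv, so a CUDA-aware MPI can move the data without staging it through the host first. A small hypothetical helper (the name is just an illustration, not something from the examples above) that covers both cases:

def device_buffer(ptr, nbytes):
    # Wrap a raw CUDA device pointer as an MPI buffer; no copy is made, so
    # the device allocation must stay alive until the MPI call completes.
    #   Numba device array:  ptr = gpu_ary.device_ctypes_pointer.value, nbytes = gpu_ary.alloc_size
    #   pycuda GPUArray:     ptr = x_gpu.ptr,                           nbytes = x_gpu.nbytes
    return MPI.memory.fromaddress(ptr, nbytes)

# e.g. comm.Send([device_buffer(x_gpu.ptr, x_gpu.nbytes), MPI.DOUBLE], dest=1)

Both scripts are launched like any other mpi4py program, e.g. mpirun -n 2 python <script>.py, provided mpi4py was built against a CUDA-aware MPI (for Open MPI, a build configured with --with-cuda).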