Dear all,
I recently encountered an overflow error when using Scatterv to distribute a large array.
I did make a segmentation of the array, but it seems the displacement arg will itself be overflowed:
eri_mo = mpi.scatter(eri_sliced, root=0, data=eri_data)
File "/home/zhcui/program/mpi4pyscf/mpi4pyscf/tools/mpi.py", line 273, in scatter
comm.Scatterv([sendbuf, counts_seg, displs+p0, mpi_dtype],
File "mpi4py/MPI/Comm.pyx", line 626, in mpi4py.MPI.Comm.Scatterv
File "mpi4py/MPI/msgbuffer.pxi", line 538, in mpi4py.MPI._p_msg_cco.for_scatter
File "mpi4py/MPI/msgbuffer.pxi", line 440, in mpi4py.MPI._p_msg_cco.for_cco_send
File "mpi4py/MPI/msgbuffer.pxi", line 313, in mpi4py.MPI.message_vector
File "mpi4py/MPI/asarray.pxi", line 22, in mpi4py.MPI.chkarray
File "mpi4py/MPI/asarray.pxi", line 15, in mpi4py.MPI.getarray
OverflowError: value too large to convert to int
My wrapper is like the following:
def prange(start, stop, step):
nsteps = (stop - start + step - 1) // step
nsteps = max(comm.allgather(nsteps))
for i in range(nsteps):
i0 = min(stop, start + i * step)
i1 = min(stop, i0 + step)
yield i0, i1
def scatter(sendbuf, root=0, data=None):
if rank == root:
mpi_dtype = numpy.result_type(*sendbuf).char
shape = comm.scatter([x.shape for x in sendbuf])
counts = numpy.asarray([x.size for x in sendbuf])
comm.bcast((mpi_dtype, counts))
if data is None:
sendbuf = [numpy.asarray(x, mpi_dtype).ravel() for x in sendbuf]
sendbuf = numpy.hstack(sendbuf)
else:
sendbuf = numpy.asarray(data, order='C')
else:
shape = comm.scatter(None)
mpi_dtype, counts = comm.bcast(None)
displs = numpy.append(0, numpy.cumsum(counts[:-1]))
recvbuf = numpy.empty(numpy.prod(shape), dtype=mpi_dtype)
#DONOT use lib.prange. lib.prange may terminate early in some processes
for p0, p1 in prange(0, numpy.max(counts), BLKSIZE):
counts_seg = _segment_counts(counts, p0, p1)
comm.Scatterv([sendbuf, counts_seg, displs+p0, mpi_dtype],
[recvbuf[p0:p1], mpi_dtype], root)
return recvbuf.reshape(shape)
It seems that the data is already chuncked, but the displs may be larger than INT size.
Is there a way to avoid that? Thanks!
Best,
Zhihao