Hi,
I posted earlier for advice on memory usage. I was using MAGMA's testing code, specifically testing_dsyevd. I just figured out that the testing code also allocates a big chunk of memory to store another copy of the matrix for diagonalization in LAPACK.
I decided to write my own small test (well, to copy/paste the MAGMA test and modify :) ). I removed the LAPACK memory allocation and this allowed me to test some slightly larger matrices. The code is attached and also pasted below.
In my code, I run magma_dsyevdx_m to get both the eigenvalues and eigenvectors of a large matrix, and I run on four Nvidia A100 GPUs. I'm getting a core dump for matrices of dimension N larger than about 92,000.
When I run my code for a matrix of size 92k, everything works fine:
$ ./magma-test 92000
% MAGMA 2.6.1 64-bit magma_int_t, 64-bit pointer.
Compiled with CUDA support for 8.0
% CUDA runtime 11030, driver 11030. OpenMP threads 32.
% device 0: NVIDIA A100-PCIE-40GB, 1410.0 MHz clock, 40536.2 MiB memory, capability 8.0
% device 1: NVIDIA A100-PCIE-40GB, 1410.0 MHz clock, 40536.2 MiB memory, capability 8.0
% device 2: NVIDIA A100-PCIE-40GB, 1410.0 MHz clock, 40536.2 MiB memory, capability 8.0
% device 3: NVIDIA A100-PCIE-40GB, 1410.0 MHz clock, 40536.2 MiB memory, capability 8.0
% Wed Sep 22 08:33:49 2021
% ngpu = 4
% N CPU Time (sec) GPU Time (sec) |S-S_magma| |A-USU^H| |I-U^H U|
%===========================================================================
Workspace query complete. info = 0, lwork = 16928552001, liwork = 460003
Finished allocating memory.
Finished filling matrix.
92000 --- 826.4779 --- --- --- ok
However, when I increase to 93k:
$./magma-test 93000
% MAGMA 2.6.1 64-bit magma_int_t, 64-bit pointer.
Compiled with CUDA support for 8.0
% CUDA runtime 11030, driver 11030. OpenMP threads 32.
% device 0: NVIDIA A100-PCIE-40GB, 1410.0 MHz clock, 40536.2 MiB memory, capability 8.0
% device 1: NVIDIA A100-PCIE-40GB, 1410.0 MHz clock, 40536.2 MiB memory, capability 8.0
% device 2: NVIDIA A100-PCIE-40GB, 1410.0 MHz clock, 40536.2 MiB memory, capability 8.0
% device 3: NVIDIA A100-PCIE-40GB, 1410.0 MHz clock, 40536.2 MiB memory, capability 8.0
% Wed Sep 22 09:06:34 2021
% ngpu = 4
% N CPU Time (sec) GPU Time (sec) |S-S_magma| |A-USU^H| |I-U^H U|
%===========================================================================
Workspace query complete. info = 0, lwork = 17298558001, liwork = 465003
Finished allocating memory.
Finished filling matrix.
Aborted (core dumped)
The use of resources is obviously not very different in these two cases, so I haven't been able to suss out a reasonable explanation yet. In both cases, I have around 45-50 GB free system memory and am using < 20 GB out of the 40 GB per GPU.
Any advice would be great! Thanks!
Cheers,
tom
magma-test.cpp:
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
// includes, project
#include "magma_v2.h"
#include "magma_lapack.h"
#include "magma_operators.h"
#include <gsl/gsl_math.h>
#include <gsl/gsl_rng.h>
#include <gsl/gsl_sf.h>
// GSL generator type and generator instance. main() assigns both and draws
// the random matrix entries from r.
const gsl_rng_type * T;
gsl_rng * r;
int main( int argc, char** argv)
{
magma_int_t N = atoi(argv[1]);;
magma_init();
magma_print_environment();
real_Double_t gpu_time;
double *h_R,*h_work, aux_work[1];
double *w1;
magma_int_t *iwork, aux_iwork[1];
magma_int_t Nfound, info, lwork, liwork, lda;
int status = 0;
struct timeval t1;
gettimeofday(&t1, NULL);
gsl_rng_env_setup();
T = gsl_rng_default;
gsl_rng_default_seed = t1.tv_usec * t1.tv_usec;
r = gsl_rng_alloc (T);
// pass ngpu = -1 to test multi-GPU code using 1 gpu
magma_int_t abs_ngpu = 4;
printf("%% ngpu = %lld\n", (long long) abs_ngpu);
printf("%% N CPU Time (sec) GPU Time (sec) |S-S_magma| |A-USU^H| |I-U^H U|\n");
printf("%%============================================================================\n");
lda = N;
Nfound = N;
magma_int_t il = 0, iu = N;
double vl = 0, vu = N;
// query for workspace sizes
magma_dsyevdx_m( abs_ngpu, MagmaVec, MagmaRangeAll, MagmaLower,
N, NULL, lda,
vl, vu, il, iu,
&Nfound, w1,
aux_work, -1,
aux_iwork, -1,
&info );
lwork = (magma_int_t) MAGMA_D_REAL( aux_work[0] );
liwork = aux_iwork[0];
printf("Workspace query complete. info = %lld, lwork = %lld, liwork = %lld\n", info, lwork, liwork);
/* Allocate host memory for the matrix */
magma_dmalloc_cpu( &w1, N );
magma_imalloc_cpu( &iwork, liwork );
magma_dmalloc_pinned( &h_R, N*lda );
magma_dmalloc_pinned( &h_work, lwork );
printf("Finished allocating memory.\n");
for(int j = 0; j < N; j++)
for(int i = j; i < N; i++)
h_R[i + j*N] = 2.0 - 4.0 * gsl_rng_uniform(r);
printf("Finished filling matrix.\n");
/* ====================================================================
Performs operation using MAGMA
=================================================================== */
gpu_time = magma_wtime();
magma_dsyevdx_m( abs_ngpu, MagmaVec, MagmaRangeAll, MagmaLower,
N, h_R, lda,
vl, vu, il, iu,
&Nfound, w1,
h_work, lwork,
iwork, liwork,
&info );
gpu_time = magma_wtime() - gpu_time;
if (info != 0) {
printf("magma_dsyevd returned error %lld: %s.\n",
(long long) info, magma_strerror( info ));
}
bool okay = true;
printf("%5lld --- %9.4f --- ",
(long long) N, gpu_time);
// print error checks
printf(" --- --- ");
printf(" %s\n", (okay ? "ok" : "failed"));
status += !okay;
if(N < 100)
for(int i = 0; i < N; i++)
printf("%f\n",w1[i]);
magma_free_cpu( w1 );
magma_free_cpu( iwork );
magma_free_pinned( h_R );
magma_free_pinned( h_work );
fflush( stdout );
magma_finalize();
return status;
}