Execution of simple MAGMA inversion matrix distributed on 2 GPU cards


henry wasker

Oct 15, 2021, 2:06:47 PM
to MAGMA User
Starting from the examples in the MAGMA documentation, I am trying to write a simple program that inverts a large matrix using two RTX A6000 GPU cards at the same time, connected by NVLink. Ideally, I would like to combine the power of the two cards.

With only one GPU card, I can invert a matrix of size ~ 50,000 x 50,000.

For my code, I need to invert a matrix of size 120,000 x 120,000, so I wonder whether it is possible to use both cards simultaneously to carry out this inversion.

Here are my compilation flags:

CXX = nvcc -O3
LAPACK = /opt/intel/oneapi/mkl/latest/lib/intel64
MAGMA = /usr/local/magma
INCLUDE_CUDA=/usr/local/cuda/include
LIBCUDA=/usr/local/cuda/lib64
CXXFLAGS = -c -I${MAGMA}/include -I${INCLUDE_CUDA} -lpthread
LDFLAGS = -L${LAPACK} -lmkl_intel_lp64 -L${LIBCUDA} -lcuda -lcudart -lcublas -L${MAGMA}/lib -lmagma -lpthread
SOURCES = example_double_MAGMA_NVIDIA.cpp
EXECUTABLE = main_magma_double_example.exe

and here is the latest version of my attempt at this code:

#include <stdio.h>
#include <cuda.h>
#include "magma_v2.h"
#include "magma_lapack.h"
#define min(a,b) (((a)<(b))?(a):(b))

int main(int argc, char** argv)
{
    magma_init();                              // initialize Magma
    int num_gpus = 2;
    magma_setdevice(0);
    magma_queue_t queues[num_gpus];
    for (int dev = 0; dev < num_gpus; ++dev) {
        magma_queue_create(dev, &queues[dev]);
    }
    magma_int_t err;
    real_Double_t cpu_time, gpu_time;
    magma_int_t m = 8192, n = 8192;            // a,r - mxn matrices
    magma_int_t mm = m*n;
    magma_int_t nrhs = 100;                    // b - nxnrhs, c - mxnrhs matrices
    magma_int_t *ipiv;                         // array of indices of interchanged rows
    magma_int_t n2 = m*n;                      // size of a,r
    magma_int_t nnrhs = n*nrhs;                // size of b
    magma_int_t mnrhs = m*nrhs;                // size of c
    double *a, *r;                             // a,r - mxn matrices on the host
    double *b, *c;                             // b - nxnrhs, c - mxnrhs matrices on the host
    double *dwork;                             // dwork - workspace
    magmaDouble_ptr d_la[num_gpus];
    double alpha = 1.0, beta = 0.0;            // alpha=1, beta=0
    magma_int_t ldwork;                        // size of dwork
    ldwork = m * magma_get_dgetri_nb(m);       // optimal block size

    // 4.3 LU decomposition and solving general linear systems
    magma_int_t n_local;
    magma_int_t ione = 1, info;
    magma_int_t i, min_mn = min(m,n), nb;
    magma_int_t ldn_local;                     // mxldn_local - size of the part of a
    magma_int_t ISEED[4] = {0,0,0,1};          // on i-th device
    nb = magma_get_dgetrf_nb(m,n);             // optimal block size for dgetrf

    // allocate memory on cpu
    ipiv = (magma_int_t*) malloc(min_mn*sizeof(magma_int_t));   // host memory for ipiv
    err = magma_dmalloc_cpu(&a, n2);           // host memory for a
    err = magma_dmalloc_pinned(&r, n2);        // host memory for r
    err = magma_dmalloc_pinned(&b, nnrhs);     // host memory for b
    err = magma_dmalloc_pinned(&c, mnrhs);     // host memory for c

    // allocate device memory on num_gpus devices
    for (i = 0; i < num_gpus; i++) {
        n_local = ((n/nb)/num_gpus)*nb;
        if (i < (n/nb) % num_gpus)
            n_local += nb;
        else if (i == (n/nb) % num_gpus)
            n_local += n % nb;
        ldn_local = ((n_local+31)/32)*32;
        magma_setdevice(i);
        err = magma_dmalloc(&d_la[i], m*ldn_local);   // device memory on i-th device
    }
    magma_setdevice(0);

    lapackf77_dlarnv(&ione, ISEED, &mm, a);    // randomize a

    // copy the corresponding parts of the matrix a to num_gpus devices
    magma_dsetmatrix_1D_col_bcyclic(num_gpus, m, n, nb, a, m, d_la, m, queues);

    // MAGMA
    // LU decomposition on num_gpus devices with partial pivoting
    // and row interchanges, row i is interchanged with row ipiv(i)
    gpu_time = magma_sync_wtime(NULL);
    magma_dgetrf_mgpu(num_gpus, m, n, d_la, m, ipiv, &info);
    magma_dgetri_gpu(m, a, m, ipiv, dwork, ldwork, &info);
    gpu_time = magma_sync_wtime(NULL) - gpu_time;
    printf("magma_dgetrf_mgpu time: %7.5f sec.\n", gpu_time);

    // print part of the result from dgetrf_mgpu and dgetri
    printf("upper left corner of a^-1*a:\n");
    magma_dprint(4, 4, a, m);

    free(ipiv);                                // free host memory
    free(a);                                   // free host memory
    magma_free_pinned(r);                      // free host memory
    magma_free_pinned(b);                      // free host memory
    magma_free_pinned(c);                      // free host memory
    for (i = 0; i < num_gpus; i++) {
        magma_free(d_la[i]);                   // free device memory
    }
    for (int dev = 0; dev < num_gpus; ++dev) {
        magma_queue_destroy(queues[dev]);
    }
    magma_finalize();
}

Everything compiles fine, but I get the following errors at execution:

CUBLAS error: memory mapping error (11) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:162
CUBLAS error: memory mapping error (11) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:172
CUBLAS error: memory mapping error (11) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:162
CUBLAS error: memory mapping error (11) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:172
CUDA runtime error: an illegal memory access was encountered (700) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:173
CUBLAS error: memory mapping error (11) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:162
CUDA runtime error: an illegal memory access was encountered (700) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:163
...

However, I think I have initialized the variable d_la correctly, but it seems some coding errors remain.

If someone could help me fix this runtime error, that would be great.

Any help is welcome,

Regards

Mark Gates

Oct 18, 2021, 2:21:14 PM
to henry wasker, MAGMA User
Is an explicit inverse really needed for your application, or are you really solving a system of equations, Ax = b? If you just need to solve a system of equations, doing a factorization and solve (gesv or getrf + getrs) is generally faster and more accurate than doing an explicit inverse and multiplying.
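A minimal single-GPU sketch of this factor-and-solve approach (an illustration, not code from the thread; it assumes the matrix and right-hand sides fit in one card's memory, and error checking is omitted):

    #include <stdio.h>
    #include <stdlib.h>
    #include "magma_v2.h"

    int main() {
        magma_init();
        magma_int_t n = 8192, nrhs = 100, lda = n, ldb = n, info;
        magma_int_t *ipiv = (magma_int_t*) malloc(n * sizeof(magma_int_t));
        double *A, *B;                               // host matrices, column-major
        magma_dmalloc_pinned(&A, lda * n);
        magma_dmalloc_pinned(&B, ldb * nrhs);
        // ... fill A and B ...
        magmaDouble_ptr dA, dB;                      // device copies
        magma_dmalloc(&dA, lda * n);
        magma_dmalloc(&dB, ldb * nrhs);
        magma_queue_t queue;
        magma_queue_create(0, &queue);
        magma_dsetmatrix(n, n, A, lda, dA, lda, queue);
        magma_dsetmatrix(n, nrhs, B, ldb, dB, ldb, queue);
        // LU factorization + solve; the solution X overwrites dB
        magma_dgesv_gpu(n, nrhs, dA, lda, ipiv, dB, ldb, &info);
        magma_dgetmatrix(n, nrhs, dB, ldb, B, ldb, queue);  // copy X back to the host
        printf("info = %lld\n", (long long) info);
        magma_queue_destroy(queue);
        magma_free(dA);  magma_free(dB);
        magma_free_pinned(A);  magma_free_pinned(B);
        free(ipiv);
        magma_finalize();
    }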

If you really do need an inverse, as some applications do:
I noticed in your code that magma_dgetri_gpu is taking `a` on the CPU, but expects `a` on the GPU. Unfortunately it appears MAGMA does not have a CPU interface (magma_dgetri), nor a multi-GPU interface (magma_dgetri_mgpu) to use.
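One rough workaround sketch on the poster's variables (an untested assumption, not a confirmed fix): gather the factors produced by magma_dgetrf_mgpu back to the host, push the whole matrix to GPU 0, and run the single-GPU inverse there. This only works if the full n x n matrix fits in one card's memory, and the exact magma_dgetmatrix_1D_col_bcyclic argument order should be checked against your MAGMA headers.

    // after magma_dgetrf_mgpu(num_gpus, m, n, d_la, m, ipiv, &info);

    // collect the 1D block-cyclic pieces back into the host array a
    magma_dgetmatrix_1D_col_bcyclic(num_gpus, m, n, nb, d_la, m, a, m, queues);

    magma_setdevice(0);
    magmaDouble_ptr d_a, d_work;
    magma_dmalloc(&d_a, m*n);                 // whole matrix on GPU 0
    magma_dmalloc(&d_work, ldwork);           // device workspace, ldwork = n * magma_get_dgetri_nb(n)
    magma_dsetmatrix(m, n, a, m, d_a, m, queues[0]);

    magma_dgetri_gpu(n, d_a, m, ipiv, d_work, ldwork, &info);  // inverse on GPU 0

    magma_dgetmatrix(m, n, d_a, m, a, m, queues[0]);           // copy A^{-1} back to the host
    magma_free(d_a);
    magma_free(d_work);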

Mark

henry wasker

Oct 19, 2021, 3:04:49 PM
to MAGMA User, mga...@icl.utk.edu, MAGMA User, henry wasker
Hello Mark !

Thanks for your quick answer.

You advise me to use the p?getri and p?getrf functions from Matrix Inversion: ScaLAPACK Computational Routines.

1) But there are only a few examples on the web that show how to use routines such as pdgetri and pdgetrf.

I think the solution to my issue is potentially to use these ScaLAPACK functions.

2) Will my code become an MPI code with these functions (maybe I am dreaming ;) )? By the way, is it possible to convert it to MPI easily?

If someone could give me hints or clues on how to implement them in my case (the function matrix_inverse_lapack)...

Best regards, Henry

Mark Gates

Oct 21, 2021, 2:49:20 PM
to henry wasker, MAGMA User
A potential problem is 50000^2 > 2^31, so if you use 32-bit int instead of 64-bit int64_t, computing the size of the matrix will overflow and yield a negative size. In this case, MAGMA and BLAS/LAPACK must be configured with ILP64 to use 64-bit integers. The largest size that fits in signed 32-bit is about 46000 x 46000.
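A quick sketch of the overflow Mark describes:

    #include <stdio.h>
    #include <stdint.h>

    int main() {
        int     n32 = 50000;
        int64_t n64 = 50000;
        // 50000 * 50000 = 2.5e9 > 2^31 - 1, so the 32-bit product overflows
        // (undefined behavior; in practice it typically wraps to a negative value).
        printf("32-bit product: %d\n",   n32 * n32);
        printf("64-bit product: %lld\n", (long long)(n64 * n64));   // 2500000000
        return 0;
    }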

Mark

henry wasker

Oct 21, 2021, 2:55:16 PM
to MAGMA User, mga...@icl.utk.edu
In C++, I need a function that can invert large matrices (at the maximum, I need to invert a `120,000 x 120,000` matrix).

First, I tried to do this on the workstation at work with the `LAPACKE` `dgetri` and `dgetrf` routines from the `Intel OneAPI` framework.

The workstation has 1 TB of RAM and two RTX A6000 GPU cards with 48 GB each.

Currently, the maximum that I can invert is roughly a `50,000 x 50,000` matrix. Above this size, the 1 TB of RAM is full and I get the following error:

    terminate called after throwing an instance of 'std::bad_alloc'
      what():  std::bad_alloc
    Command terminated by signal 6

I was told to use the `p?getri` and `p?getrf` functions from Matrix Inversion: ScaLAPACK Computational Routines.

But there are only a few examples on the web that show how to use routines such as `pdgetri` and `pdgetrf`.

I think the solution to my issue is potentially to use these ScaLAPACK functions.

I have found an example of matrix inversion with `pdgetrf` and `pdgetri` from this link (Inverse matrix):


    #include <mpi.h>
    #include <cstdio>
    #include <cassert>
    //#include "block_cyclic_mat.h"
    #include <mkl.h>
    #include "mkl_scalapack.h"
    #include <vector>
    #include <memory>
   
    using namespace std;
   
    static double getri_flops(int N)
    {
        // From:
        // https://icl.cs.utk.edu/svn/scalapack-dev/scalapack/trunk/TESTING/LIN/pdinvdriver.f    
        // Factorization: 2/3 N^3 - 1/2 N^2
        // Inverse      : 4/3 N^3 - N^2
   
        return ((2.0/3.0 * N * N * N) - (1.0/2.0 * N * N) +
            (4.0/3.0 * N * N * N) - (N * N))/1024.0/1024.0/1024.0;
    }
   
    ///  n_global: the order of the matrix
    static void inv_driver(int n_global)
    {
   
        auto grid = make_shared<blacs_grid_t>();
   
    //// self code
    //n_global = 3;
    //double *aaa = new double(n_global*n_global);
    //for (int i = 0; i < 9; i++)
    //{
    // aaa[i] = i + 1;
    //}
    //aaa[8] = 10;
    //auto a = block_cyclic_mat_t::createWithArray(grid, n_global, n_global, aaa);
   
   
        // Create a NxN random matrix A
        auto a = block_cyclic_mat_t::random(grid, n_global, n_global);        
   
        // Create a NxN matrix to hold A^{-1}
        auto ai = block_cyclic_mat_t::constant(grid, n_global, n_global);
   
        // Copy A to A^{-1} since it will be overwritten during factorization
        copy_n(a->local_data(), a->local_size(), ai->local_data());
   
        MPI_Barrier (MPI_COMM_WORLD);
   
        double t0 = MPI_Wtime();
       
        // Factorize A
        int ia = 1, ja = 1;
        vector<int> ipiv(a->local_rows() + a->row_block_size() + 100);
        int info;
   
        // The name should be read as D-GE-TRF:
        // the leading D means the matrix is of type double,
        // GE means it is a General matrix,
        // and TRF means a triangular factorization, i.e. the usual LU decomposition.
        pdgetrf_(n_global, n_global,
            ai->local_data(), ia, ja, ai->descriptor(),
            ipiv.data(),
            info);
        assert(info == 0);
        double t_factor = MPI_Wtime() - t0;
   
        // Compute A^{-1} based on the LU factorization
   
        // Compute workspace for double and integer work arrays on each process
        int lwork  = 10;
        int liwork = 10;
        vector<double>     work (lwork);
        vector<int> iwork(liwork);
   
        lwork = liwork = -1;  
   
        // Workspace query: compute the required values of lwork and liwork
        pdgetri_(n_global,
            ai->local_data(), ia, ja, ai->descriptor(),
            ipiv.data(),
            work.data(), lwork, iwork.data(), liwork, info);
        assert(info == 0);
        lwork  = static_cast<int>(work[0]);
        liwork = static_cast<size_t>(iwork[0]);
        work.resize(lwork);
        iwork.resize(liwork);
   
        // Now compute the inverse
        t0 = MPI_Wtime();
        pdgetri_(n_global,
            ai->local_data(), ia, ja, ai->descriptor(),
            ipiv.data(),
            work.data(), lwork, iwork.data(), liwork, info);
        assert(info == 0);
        double t_solve = MPI_Wtime() - t0;
   
        // Verify that the inverse is correct using A*A^{-1} = I
        auto identity = block_cyclic_mat_t::diagonal(grid, n_global, n_global);
   
        // Compute I = A * A^{-1} - I and verify that the ||I|| is small    
        char nein = 'N';
        double alpha = 1.0, beta = -1.0;
        pdgemm_(nein, nein, n_global, n_global, n_global, alpha,
            a->local_data() , ia, ja, a->descriptor(),
            ai->local_data(), ia, ja, ai->descriptor(),
            beta,
            identity->local_data(), ia, ja, identity->descriptor());
   
        // Compute 1-norm of the result
        char norm='1';
        work.resize(identity->local_cols());
        double err = pdlange_(norm, n_global, n_global,
            identity->local_data(), ia, ja, identity->descriptor(), work.data());
   
        double t_total = t_factor + t_solve;
        double t_glob;
        MPI_Reduce(&t_total, &t_glob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
   
        if (grid->iam() == 0)
        {
            double gflops = getri_flops(n_global)/t_glob/grid->nprocs();
            printf("\n"
                "MATRIX INVERSE BENCHMARK SUMMARY\n"
                "================================\n"
                "N = %d\tNP = %d\tNP_ROW = %d\tNP_COL = %d\n"
                "Time for PxGETRF + PxGETRI = %10.7f seconds\tGflops/Proc = %10.7f, Error = %f\n",
                n_global, grid->nprocs(), grid->nprows(), grid->npcols(),
                t_glob, gflops, err);fflush(stdout);
        }
    }
   
    int main(int argc, char** argv)
    {
        MPI_Init(&argc, &argv);
        int n_global = 4096;
   
        if (argc > 1)
        {
            n_global = int(atol(argv[1])); // string to long integer
        }
   
        inv_driver(n_global);
        MPI_Finalize();
    }

 

I tried to compile this source as follows:


    mpicxx -I/opt/intel/oneapi/mpi/latest/include -L${MKLROOT}/lib/intel64 -lmkl_scalapack_ilp64 -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core -lmkl_blacs_intelmpi_ilp64 -liomp5 -lpthread -lm -ldl main.cpp

But I get the following errors :

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    main.cpp: In function ‘void inv_driver(int)’:
    main.cpp:27:29: error: ‘blacs_grid_t’ was not declared in this scope
         auto grid = make_shared<blacs_grid_t>();
                                 ^~~~~~~~~~~~
    main.cpp:27:29: note: suggested alternative: ‘clockid_t’
         auto grid = make_shared<blacs_grid_t>();
                                 ^~~~~~~~~~~~
                                 clockid_t
    main.cpp:27:43: error: no matching function for call to ‘make_shared<<expression error> >()’
         auto grid = make_shared<blacs_grid_t>();
                                               ^
    In file included from /usr/include/c++/8/memory:81,
                     from main.cpp:8:
    /usr/include/c++/8/bits/shared_ptr.h:718:5: note: candidate: ‘template<class _Tp, class ... _Args> std::shared_ptr<_Tp> std::make_shared(_Args&& ...)’
         make_shared(_Args&&... __args)
         ^~~~~~~~~~~
    /usr/include/c++/8/bits/shared_ptr.h:718:5: note:   template argument deduction/substitution failed:
    main.cpp:27:43: error: template argument 1 is invalid
         auto grid = make_shared<blacs_grid_t>();
                                               ^
    main.cpp:41:14: error: ‘block_cyclic_mat_t’ has not been declared
         auto a = block_cyclic_mat_t::random(grid, n_global, n_global);
                  ^~~~~~~~~~~~~~~~~~
    main.cpp:44:15: error: ‘block_cyclic_mat_t’ has not been declared
         auto ai = block_cyclic_mat_t::constant(grid, n_global, n_global);
                   ^~~~~~~~~~~~~~~~~~
    main.cpp:47:5: error: ‘copy_n’ was not declared in this scope
         copy_n(a->local_data(), a->local_size(), ai->local_data());
         ^~~~~~
    main.cpp:47:5: note: suggested alternative: ‘ctpcon’
         copy_n(a->local_data(), a->local_size(), ai->local_data());
         ^~~~~~
         ctpcon
    main.cpp:100:21: error: ‘block_cyclic_mat_t’ has not been declared
         auto identity = block_cyclic_mat_t::diagonal(grid, n_global, n_global);
                         ^~~~~~~~~~~~~~~~~~
    main.cpp:105:5: error: ‘pdgemm_’ was not declared in this scope
         pdgemm_(nein, nein, n_global, n_global, n_global, alpha,
         ^~~~~~~
    main.cpp:105:5: note: suggested alternative: ‘pdgels_’
         pdgemm_(nein, nein, n_global, n_global, n_global, alpha,
         ^~~~~~~
         pdgels_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

I couldn't find the header `"block_cyclic_mat.h"` in my OneAPI installation.

Moreover, I get many instances of the same error about `block_cyclic_mat_t`: `"error: 'block_cyclic_mat_t' has not been declared"`.

I also get errors such as `"'blacs_grid_t' was not declared in this scope"` and `"error: no matching function for call to 'make_shared<<expression error> >()'"`.

Does anyone see what's wrong and, if so, how to work around these compilation errors?

Best regards.

henry wasker

Oct 21, 2021, 5:44:36 PM
to Mark Gates, MAGMA User
Hi !

Here is the command that I use for compiling MAGMA-2.6.1 :

cmake -DUSE_FORTRAN=OFF -DGPU_TARGET=Ampere -DLAPACK_LIBRARIES= /opt/intel/oneapi/mkl/latest/lib/intel64/ -DMAGMA_ENABLE_CUDA=ON ../

Which flag should I add to configure with ILP64 and use 64-bit integers?

Best regards

Mark Gates

Oct 22, 2021, 10:27:10 AM
to henry wasker, MAGMA User
Hmm, it appears the CMake build doesn't have ILP64 support, though it should be easy to add. The Makefile supports it, e.g., see make.inc.mkl-gcc-ilp64

Mark

henry wasker

Oct 24, 2021, 1:33:45 PM
to MAGMA User, mga...@icl.utk.edu, MAGMA User, henry wasker

Hello,

In C++, I need a function that can invert large matrices (at the maximum, I need to invert a 120,000 x 120,000 matrix).

First, I tried to do this on the workstation at work with the LAPACKE dgetri and dgetrf routines from the Intel OneAPI framework.

The workstation has 1 TB of RAM, two RTX A6000 GPU cards with 48 GB each, and a 60-core AMD EPYC processor.

Currently, the maximum that I can invert is roughly a 50,000 x 50,000 matrix. Above this size, the 1 TB of RAM is full and I get the following error:

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

terminate called after throwing an instance of 'std::bad_alloc'
  what():  std::bad_alloc
Command terminated by signal 6
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Here is the function I implemented to invert a matrix:

--------------------------------------------------------------------------------------------------------------------------------

// Inversion Matrix : passing Matrixes by Reference
void matrix_inverse_lapack(vector<vector<double>> const &F_matrix, vector<vector<double>> &F_output) {

  // Index for loop and arrays
  int i, j, ip, idx;

  // Size of F_matrix
  int N = F_matrix.size();
  cout << "m = " << N << endl;

  int *IPIV = new int[N];

  // Statement of main array to inverse
  double *arr = new double[N*N];

  for (i = 0; i<N; i++){
    for (j = 0; j<N; j++){
      idx = i*N + j;
      arr[idx] = F_matrix[i][j];
    }
  }

  // LAPACKE routines
  int info1 = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, arr, N, IPIV);
  int info2 = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N, arr, N, IPIV);

  for (i = 0; i<N; i++){
    for (j = 0; j<N; j++){
      idx = i*N + j;
      F_output[i][j] = arr[idx];
    }
  }

  delete[] IPIV;
  delete[] arr;
}

--------------------------------------------------------------------------------------------------------------------------------

I was told to use the p?getri and p?getrf functions from Matrix Inversion: ScaLAPACK Computational Routines.

But there are only a few examples on the web that show how to use routines such as pdgetri and pdgetrf.

I think the solution to my issue is potentially to use these ScaLAPACK functions.


I have found an example of matrix inversion with pdgetrf and pdgetri from this link (Inverse matrix):

----------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

I tried to compile this source as follows:

  mpiicpc -I/opt/intel/oneapi/mpi/latest/include -L${MKLROOT}/lib/intel64 -lmkl_scalapack_ilp64 -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core -lmkl_blacs_intelmpi_ilp64 -liomp5 -lpthread -lm -ldl main.cpp

But I get the following errors :

--------------------------------------------------------------------------------------------------------------------------------------------------------------------
main.cpp(27): error: identifier "blacs_grid_t" is undefined

      auto grid = make_shared<blacs_grid_t>();
                              ^

main.cpp(41): error: name followed by "::" must be a class or namespace name

      auto a = block_cyclic_mat_t::random(grid, n_global, n_global);
               ^

main.cpp(44): error: name followed by "::" must be a class or namespace name

      auto ai = block_cyclic_mat_t::constant(grid, n_global, n_global);
                ^

main.cpp(47): error: identifier "copy_n" is undefined

      copy_n(a->local_data(), a->local_size(), ai->local_data());
      ^

main.cpp(62): error: argument of type "int" is incompatible with parameter of type "const int *"
      pdgetrf_(n_global, n_global,
               ^

main.cpp(62): error: argument of type "int" is incompatible with parameter of type "const int *"
      pdgetrf_(n_global, n_global,
                         ^

main.cpp(63): error: argument of type "int" is incompatible with parameter of type "const int *"

          ai->local_data(), ia, ja, ai->descriptor(),
                            ^

main.cpp(63): error: argument of type "int" is incompatible with parameter of type "const int *"

          ai->local_data(), ia, ja, ai->descriptor(),
                                ^

main.cpp(65): error: argument of type "int" is incompatible with parameter of type "int *"
          info);
          ^

main.cpp(80): error: argument of type "int" is incompatible with parameter of type "const int *"
      pdgetri_(n_global,
               ^

main.cpp(81): error: argument of type "int" is incompatible with parameter of type "const int *"

          ai->local_data(), ia, ja, ai->descriptor(),
                            ^

main.cpp(81): error: argument of type "int" is incompatible with parameter of type "const int *"

          ai->local_data(), ia, ja, ai->descriptor(),
                                ^

main.cpp(83): error: argument of type "int" is incompatible with parameter of type "const int *"

          work.data(), lwork, iwork.data(), liwork, info);
                       ^

main.cpp(83): error: argument of type "int" is incompatible with parameter of type "const int *"

          work.data(), lwork, iwork.data(), liwork, info);
                                            ^

main.cpp(83): error: argument of type "int" is incompatible with parameter of type "int *"

          work.data(), lwork, iwork.data(), liwork, info);
                                                    ^

main.cpp(92): error: argument of type "int" is incompatible with parameter of type "const int *"
      pdgetri_(n_global,
               ^

main.cpp(93): error: argument of type "int" is incompatible with parameter of type "const int *"

          ai->local_data(), ia, ja, ai->descriptor(),
                            ^

main.cpp(93): error: argument of type "int" is incompatible with parameter of type "const int *"

          ai->local_data(), ia, ja, ai->descriptor(),
                                ^

main.cpp(95): error: argument of type "int" is incompatible with parameter of type "const int *"

          work.data(), lwork, iwork.data(), liwork, info);
                       ^

main.cpp(95): error: argument of type "int" is incompatible with parameter of type "const int *"

          work.data(), lwork, iwork.data(), liwork, info);
                                            ^

main.cpp(95): error: argument of type "int" is incompatible with parameter of type "int *"

          work.data(), lwork, iwork.data(), liwork, info);
                                                    ^

main.cpp(100): error: name followed by "::" must be a class or namespace name

      auto identity = block_cyclic_mat_t::diagonal(grid, n_global, n_global);
                      ^

main.cpp(105): error: identifier "pdgemm_" is undefined

      pdgemm_(nein, nein, n_global, n_global, n_global, alpha,
      ^

main.cpp(114): error: argument of type "char" is incompatible with parameter of type "const char *"

      double err = pdlange_(norm, n_global, n_global,
                            ^

main.cpp(114): error: argument of type "int" is incompatible with parameter of type "const int *"

      double err = pdlange_(norm, n_global, n_global,
                                  ^

main.cpp(114): error: argument of type "int" is incompatible with parameter of type "const int *"

      double err = pdlange_(norm, n_global, n_global,
                                            ^

main.cpp(115): error: argument of type "int" is incompatible with parameter of type "const int *"

          identity->local_data(), ia, ja, identity->descriptor(), work.data());
                                  ^

main.cpp(115): error: argument of type "int" is incompatible with parameter of type "const int *"

          identity->local_data(), ia, ja, identity->descriptor(), work.data());
                                      ^

compilation aborted for main.cpp (code 2)

henry wasker

Nov 5, 2021, 10:11:15 PM
to MAGMA User, mga...@icl.utk.edu
Hi !

I haven't received any answer about a potential workaround to invert large matrices with the p?getri and p?getrf functions from Matrix Inversion: ScaLAPACK Computational Routines.

Has anyone already used this kind of ScaLAPACK function?

Best regards

Mark Gates

Nov 8, 2021, 12:09:59 PM
to henry wasker, MAGMA User
The errors you show appear related to the example code you found, not to ScaLAPACK itself (and also unrelated to MAGMA). For instance, ScaLAPACK does not define blacs_grid_t, so that must be something the example code was defining to make its own C++ interface around ScaLAPACK & BLACS.

As an alternative, the SLATE library provides distributed & GPU-accelerated routines to replace ScaLAPACK's functionality, including LU factorization, solve, and inverse. See the README for links to documentation, including SLATE's User Guide.

The SLATE testers also call ScaLAPACK for reference solutions, so you can look at them for examples of that. To use ScaLAPACK from these examples in your own code, you would need to extract various pieces. For instance, SLATE has lightweight wrappers in test/scalapack_wrappers.hh around ScaLAPACK functions to make them precision-independent and give them C++ calling conventions, hence scalapack_pgetrf instead of pdgetrf. We have examples of scalapack_pgetrf, _pgetrs, _pgesv in test/test_gesv.cc. However, we do not have an example of _pgetri, though its use should be similar to other ScaLAPACK routines. See test/test_gels.cc for an example workspace query.

Mark

--
Innovative Computing Laboratory
University of Tennessee, Knoxville

Stanimire Tomov

Nov 8, 2021, 12:27:07 PM
to MAGMA User, henry....@gmail.com, mga...@icl.utk.edu
For ScaLAPACK you have to figure out how to put the matrix in a 2D block-cyclic distribution and then call the routines. I see an example on this page from IBM.

Your code from Oct 21 may also work, but you need the C++ wrappers around ScaLAPACK - I googled and found a repo that uses the classes you are trying to use, but you have commented out their definitions, e.g., I see a

//#include "block_cyclic_mat.h"

in your code.

The LAPACKE code from Oct 24 should probably also work after minor modifications. It currently fails at

double *arr = new double[N*N];

because N is declared as an "int" and N*N goes out of range for int. Try declaring it as "long long int". Do the same for these variables:

// Index for loop and arrays
int i, j, ip, idx;

If the code still fails, e.g., in pdgetri, it means you have to link with a LAPACK that supports 64-bit integers, in which case ipiv also has to be "long long int".

If you fix the CPU code, you can gradually change it to use the GPUs, e.g., MAGMA would be able to factor a large matrix like this using a single GPU, but you have to link with a 64-bit-int installation of MAGMA, as Mark suggested before. The easiest way will be to use the Makefiles until we figure out how cmake can support that.

If you do that, you can just replace LAPACKE_dgetrf by magma_dgetrf. MAGMA will internally use one GPU with an out-of-memory algorithm that will factor the matrix, even if it is large and does not fit into the memory of the GPU (a rough sketch of this replacement follows below).
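As a rough illustration of that replacement (a sketch under assumptions, not code from the thread: it assumes an ILP64 build of MAGMA/MKL so that magma_int_t is 64-bit, and column-major storage, whereas the LAPACKE call above used LAPACK_ROW_MAJOR):

    #include <stdlib.h>
    #include "magma_v2.h"

    // Sketch: the factorization step of matrix_inverse_lapack done with MAGMA.
    void factor_with_magma(double *arr, long long N_ll) {
        magma_init();
        magma_int_t N = (magma_int_t) N_ll;
        magma_int_t info;
        magma_int_t *ipiv = (magma_int_t*) malloc(N * sizeof(magma_int_t));

        // Hybrid CPU+GPU LU factorization; for matrices larger than the GPU memory,
        // MAGMA falls back to an out-of-memory algorithm internally.
        // Note: arr is assumed to be stored column-major (Fortran order) here.
        magma_dgetrf(N, N, arr, N, ipiv, &info);

        // The explicit inverse would still need magma_dgetri_gpu with the factors
        // copied to a single GPU, as discussed earlier in the thread.
        free(ipiv);
        magma_finalize();
    }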
Thanks,
Stan

henry wasker

May 27, 2022, 3:53:46 PM
to MAGMA User, to...@icl.utk.edu, henry wasker, mga...@icl.utk.edu
Hi Stan !

When you say: "The easiest way will be to use the Makefiles until we figure out how cmake can support that. If you do that, you can just replace LAPACKE_dgetrf by magma_dgetrf. MAGMA will internally use one GPU with an out-of-memory algorithm that will factor the matrix, even if it is large and does not fit into the memory of the GPU."

Does it mean that I have to find the appropriate Makefile flags to be able to use magma_dgetrf instead of LAPACKE_dgetrf?

And for the second sentence, where you say that "MAGMA will internally use one GPU with an out-of-memory algorithm that will factor the matrix": does it mean that if my matrix exceeds 48 GB, MAGMA will be able to spill the rest onto the second A6000 GPU, or into RAM, and perform the inversion of the full matrix?

Please let me know which flags to use to build MAGMA correctly in my case.

Currently, I do:

$ mkdir build && cd build
$ cmake -DUSE_FORTRAN=ON  \
-DGPU_TARGET=Ampere \
-DLAPACK_LIBRARIES="/opt/intel/oneapi/intelpython/latest/lib/liblapack.so" \
-DMAGMA_ENABLE_CUDA=ON ..
$ cmake --build . --config Release

I would be grateful if you could help me; it is frustrating not to be able to use the power of the two GPU cards with 48 GB each.

Best regards

Mark Gates

Jun 13, 2022, 10:48:40 AM
to henry wasker, MAGMA User
Hi Henry,

Since you're asking about GPUs, I assume you do not want to use ScaLAPACK or LAPACK[E], which do not have GPU support.

For MAGMA
magma_dgetrf should do a multi-GPU LU factorization. It can even exceed the available GPU memory. For multi-GPUs, the matrix will be evenly distributed using a 1D block-cyclic distribution across the GPUs.
magma_dgetri_gpu however is limited to a single GPU, and the matrix must be in that GPU's memory. There is no multi-GPU inverse in MAGMA that I am aware of.
For large matrices (n > 45000 or so), you need to use an ILP64 (int64) library for matrix dimensions. (Actually, when lda*n > 2^31 for any matrix or workspace.) As Stan said, the solution we've tested is to use the Makefile with one of the ILP64 options. See make.inc-examples/*ilp64*
These basically link with an ILP64 vendor BLAS / LAPACK library such as
    -lmkl_gf_ilp64 -lmkl_gnu_thread -lmkl_core
and define an ILP64 flag such as
    -DMAGMA_ILP64
or
    -DMKL_ILP64
The MKL flag is useful if your application includes Intel MKL headers. Your application also needs to build with the MAGMA_ILP64 flag (or MKL_ILP64) when it includes magma_v2.h.
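As a quick sanity check of an ILP64 build (a sketch; compile it with the same -DMAGMA_ILP64 or -DMKL_ILP64 flag used for the library):

    #include <stdio.h>
    #include "magma_v2.h"

    int main() {
        // With an ILP64 build, magma_int_t is 8 bytes and lda*n products no longer wrap.
        printf("sizeof(magma_int_t) = %zu\n", sizeof(magma_int_t));   // expect 8
        magma_int_t n = 120000;
        printf("n*n = %lld\n", (long long)(n * n));                   // 14400000000
        return 0;
    }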

You can also probably use CMake, but I haven't tested this. Something like:
    magma/build> cmake -DCMAKE_CXX_FLAGS='-DMKL_ILP64' -DBLA_VENDOR=Intel10_64ilp ..
Again, your application also needs to build with the MAGMA_ILP64 flag (or MKL_ILP64).

For SLATE
No special support is required. It doesn't need an ILP64 BLAS / LAPACK library and should run across whatever GPUs you have available. (There may be some issue in the tester calling reference ScaLAPACK with a local matrix larger than 45000 x 45000. Just run the tester with `--ref n` to disable ScaLAPACK, which is now the default.) Here's an example:
A typical block size for GPUs is nb = 512. It requires MPI, but is fine to run on 1 node (1 MPI rank).
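For reference, a rough sketch of what a distributed, GPU-targeted solve looks like with SLATE's C++ API (assumptions: a p x q process grid and the slate::Matrix / slate::gesv interfaces described in the SLATE User Guide; exact option names may differ between versions):

    #include <mpi.h>
    #include <slate/slate.hh>

    int main(int argc, char** argv) {
        MPI_Init(&argc, &argv);
        int64_t n = 120000, nrhs = 1, nb = 512;   // nb = 512 as suggested above
        int p = 1, q = 1;                         // 1 MPI rank is fine

        // Block-cyclic distributed matrices; local tiles allocated on this rank.
        slate::Matrix<double> A(n, n,    nb, p, q, MPI_COMM_WORLD);
        slate::Matrix<double> B(n, nrhs, nb, p, q, MPI_COMM_WORLD);
        A.insertLocalTiles();
        B.insertLocalTiles();
        // ... fill A and B ...

        slate::Pivots pivots;
        // LU factorization + solve, offloaded to the available GPUs.
        slate::gesv(A, pivots, B, {{slate::Option::Target, slate::Target::Devices}});

        MPI_Finalize();
        return 0;
    }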

Mark

henry wasker

Jun 13, 2022, 4:41:44 PM
to MAGMA User, henry wasker, to...@icl.utk.edu, mga...@icl.utk.edu
Hi ,

Is there really no one who could help me find the right flags for building MAGMA to get the behavior mentioned by Stan, that is, if one GPU is full, the second GPU (or maybe the RAM) takes over?

Any help would be appreciated; thanks in advance.

Best regards, Henry