I have found an example of matrix inversion from this
[link Inverse matrix], using `pdgetrf` and `pdgetri`:
  #include <mpi.h>
  #include <cstdio>
  #include <cassert>
  //#include "block_cyclic_mat.h"
  #include <mkl.h>
  #include "mkl_scalapack.h"
  #include <vector>
  #include <memory>

  using namespace std;

  static double getri_flops(int N)
  {
    // From:
    // https://icl.cs.utk.edu/svn/scalapack-dev/scalapack/trunk/TESTING/LIN/pdinvdriver.f
    // Factorization: 2/3 N^3 - 1/2 N^2
    // Inverse      : 4/3 N^3 - N^2

    return ((2.0/3.0 * N * N * N) - (1.0/2.0 * N * N) +
      (4.0/3.0 * N * N * N) - (N * N))/1024.0/1024.0/1024.0;
  }

  ///  n_global: the order of the matrix
  static void inv_driver(int n_global)
  {

    auto grid = make_shared<blacs_grid_t>();

  //// self code
  //n_global = 3;
  //double *aaa = new double[n_global*n_global];
  //for (int i = 0; i < 9; i++)
  //{
  // aaa[i] = i + 1;
  //}
  //aaa[8] = 10;
  //auto a = block_cyclic_mat_t::createWithArray(grid, n_global, n_global, aaa);

    // Create an NxN random matrix A
    auto a = block_cyclic_mat_t::random(grid, n_global, n_global);

    // Create an NxN matrix to hold A^{-1}
    auto ai = block_cyclic_mat_t::constant(grid, n_global, n_global);

    // Copy A to A^{-1} since it will be overwritten during factorization
    copy_n(a->local_data(), a->local_size(), ai->local_data());

    MPI_Barrier(MPI_COMM_WORLD);

    double t0 = MPI_Wtime();

    // Factorize A
    int ia = 1, ja = 1;
    vector<int> ipiv(a->local_rows() + a->row_block_size() + 100);
    int info;

    // The routine name breaks down as D-GE-TRF:
    //   D   - the matrix entries are double precision
    //   GE  - the matrix is a general matrix
    //   TRF - triangular factorization, i.e. the usual LU decomposition
    pdgetrf_(n_global, n_global,
      ai->local_data(), ia, ja, ai->descriptor(),
      ipiv.data(),
      info);
    assert(info == 0);
    double t_factor = MPI_Wtime() - t0;

    // Compute A^{-1} based on the LU factorization

    // Compute workspace for double and integer work arrays on each process
    int lwork  = 10;
    int liwork = 10;
    vector<double> work (lwork);
    vector<int>    iwork(liwork);

    lwork = liwork = -1;

    // Workspace query: determine the required values of lwork and liwork
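    // (Setting lwork = liwork = -1 is the usual LAPACK/ScaLAPACK workspace query:
    //  the routine returns the required sizes in work[0] and iwork[0] instead of
    //  actually computing the inverse.)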
    pdgetri_(n_global,
      ai->local_data(), ia, ja, ai->descriptor(),
      ipiv.data(),
      work.data(), lwork, iwork.data(), liwork, info);
    assert(info == 0);
    lwork  = static_cast<int>(work[0]);
    liwork = static_cast<int>(iwork[0]);
    work.resize(lwork);
    iwork.resize(liwork);

    // Now compute the inverse
    t0 = MPI_Wtime();
    pdgetri_(n_global,
      ai->local_data(), ia, ja, ai->descriptor(),
      ipiv.data(),
      work.data(), lwork, iwork.data(), liwork, info);
    assert(info == 0);
    double t_solve = MPI_Wtime() - t0;

    // Verify that the inverse is correct using A*A^{-1} = I
    auto identity = block_cyclic_mat_t::diagonal(grid, n_global, n_global);

    // Compute I = A * A^{-1} - I and verify that ||I|| is small
    char nein = 'N';
    double alpha = 1.0, beta = -1.0;
    pdgemm_(nein, nein, n_global, n_global, n_global, alpha,
      a->local_data() , ia, ja, a->descriptor(),
      ai->local_data(), ia, ja, ai->descriptor(),
      beta,
      identity->local_data(), ia, ja, identity->descriptor());

    // Compute the 1-norm of the result
    char norm = '1';
    work.resize(identity->local_cols());
    double err = pdlange_(norm, n_global, n_global,
      identity->local_data(), ia, ja, identity->descriptor(), work.data());

    double t_total = t_factor + t_solve;
    double t_glob;
    MPI_Reduce(&t_total, &t_glob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (grid->iam() == 0)
    {
      double gflops = getri_flops(n_global)/t_glob/grid->nprocs();
      printf("\n"
        "MATRIX INVERSE BENCHMARK SUMMARY\n"
        "================================\n"
        "N = %d\tNP = %d\tNP_ROW = %d\tNP_COL = %d\n"
        "Time for PxGETRF + PxGETRI = %10.7f seconds\tGflops/Proc = %10.7f, Error = %f\n",
        n_global, grid->nprocs(), grid->nprows(), grid->npcols(),
        t_glob, gflops, err);
      fflush(stdout);
    }
  }

  int main(int argc, char** argv)
  {
    MPI_Init(&argc, &argv);
    int n_global = 4096;

    if (argc > 1)
    {
      n_global = int(atol(argv[1])); // stol: converts a string to a long integer
    }

    inv_driver(n_global);
    MPI_Finalize();
  }

I tried to compile this source in the following way:
  mpicxx -I/opt/intel/oneapi/mpi/latest/include -L${MKLROOT}/lib/intel64 -lmkl_scalapack_ilp64 -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core -lmkl_blacs_intelmpi_ilp64 -liomp5 -lpthread -lm -ldl main.cpp
But I get the following errors:
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
  main.cpp: In function ‘void inv_driver(int)’:
  main.cpp:27:29: error: ‘blacs_grid_t’ was not declared in this scope
     auto grid = make_shared<blacs_grid_t>();
                 ^~~~~~~~~~~~
  main.cpp:27:29: note: suggested alternative: ‘clockid_t’
     auto grid = make_shared<blacs_grid_t>();
                 ^~~~~~~~~~~~
                 clockid_t
  main.cpp:27:43: error: no matching function for call to ‘make_shared<<expression error> >()’
     auto grid = make_shared<blacs_grid_t>();
                        ^
  In file included from /usr/include/c++/8/memory:81,
           from main.cpp:8:
  /usr/include/c++/8/bits/shared_ptr.h:718:5: note: candidate: ‘template<class _Tp, class ... _Args> std::shared_ptr<_Tp> std::make_shared(_Args&& ...)’
     make_shared(_Args&&... __args)
     ^~~~~~~~~~~
  /usr/include/c++/8/bits/shared_ptr.h:718:5: note:  template argument deduction/substitution failed:
  main.cpp:27:43: error: template argument 1 is invalid
     auto grid = make_shared<blacs_grid_t>();
                        ^
  main.cpp:41:14: error: ‘block_cyclic_mat_t’ has not been declared
     auto a = block_cyclic_mat_t::random(grid, n_global, n_global);
         ^~~~~~~~~~~~~~~~~~
  main.cpp:44:15: error: ‘block_cyclic_mat_t’ has not been declared
     auto ai = block_cyclic_mat_t::constant(grid, n_global, n_global);
          ^~~~~~~~~~~~~~~~~~
  main.cpp:47:5: error: ‘copy_n’ was not declared in this scope
     copy_n(a->local_data(), a->local_size(), ai->local_data());
     ^~~~~~
  main.cpp:47:5: note: suggested alternative: ‘ctpcon’
     copy_n(a->local_data(), a->local_size(), ai->local_data());
     ^~~~~~
     ctpcon
  main.cpp:100:21: error: ‘block_cyclic_mat_t’ has not been declared
     auto identity = block_cyclic_mat_t::diagonal(grid, n_global, n_global);
             ^~~~~~~~~~~~~~~~~~
  main.cpp:105:5: error: ‘pdgemm_’ was not declared in this scope
     pdgemm_(nein, nein, n_global, n_global, n_global, alpha,
     ^~~~~~~
  main.cpp:105:5: note: suggested alternative: ‘pdgels_’
     pdgemm_(nein, nein, n_global, n_global, n_global, alpha,
     ^~~~~~~
     pdgels_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
I couldn't find the header `block_cyclic_mat.h` anywhere in my oneAPI installation.
Moreover, I get many errors of the same kind about `block_cyclic_mat_t`: `error: 'block_cyclic_mat_t' has not been declared`.
I also get errors saying `'blacs_grid_t' was not declared in this scope` and `error: no matching function for call to 'make_shared<<expression error> >()'`.
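For what it's worth, below is a rough sketch of what I assume the missing `blacs_grid_t` helper is supposed to provide, just to show what the example seems to expect. This is purely my own guess: it wraps the Fortran-style BLACS routines shipped with MKL's BLACS library, the prototypes are assumptions on my part, and the integer type would have to match the ILP64/LP64 interface that is actually linked:

  // blacs_grid_stub.h - my own guess at the missing helper, NOT the original header.
  // It wraps the Fortran-style BLACS routines from MKL's BLACS library.
  #pragma once
  #include <cmath>
  #include <mkl.h>   // for MKL_INT (must match the mkl_*_ilp64 / _lp64 libraries)

  extern "C" {
    // Assumed prototypes: BLACS routines take all arguments by pointer.
    void blacs_pinfo_(MKL_INT* mypnum, MKL_INT* nprocs);
    void blacs_get_(MKL_INT* context, MKL_INT* what, MKL_INT* val);
    void blacs_gridinit_(MKL_INT* context, const char* layout, MKL_INT* nprow, MKL_INT* npcol);
    void blacs_gridinfo_(MKL_INT* context, MKL_INT* nprow, MKL_INT* npcol, MKL_INT* myrow, MKL_INT* mycol);
    void blacs_gridexit_(MKL_INT* context);
  }

  class blacs_grid_t
  {
  public:
    blacs_grid_t()
    {
      blacs_pinfo_(&iam_, &nprocs_);
      // Choose a process grid as close to square as possible
      nprow_ = static_cast<MKL_INT>(std::sqrt(static_cast<double>(nprocs_)));
      while (nprocs_ % nprow_ != 0) --nprow_;
      npcol_ = nprocs_ / nprow_;
      MKL_INT zero = 0;
      blacs_get_(&zero, &zero, &context_);                 // default system context
      blacs_gridinit_(&context_, "R", &nprow_, &npcol_);   // row-major process grid
      blacs_gridinfo_(&context_, &nprow_, &npcol_, &myrow_, &mycol_);
    }
    ~blacs_grid_t() { blacs_gridexit_(&context_); }

    MKL_INT iam()     const { return iam_; }
    MKL_INT nprocs()  const { return nprocs_; }
    MKL_INT nprows()  const { return nprow_; }
    MKL_INT npcols()  const { return npcol_; }
    MKL_INT context() const { return context_; }

  private:
    MKL_INT iam_ = 0, nprocs_ = 1, context_ = 0;
    MKL_INT nprow_ = 1, npcol_ = 1, myrow_ = 0, mycol_ = 0;
  };

I only mention this to show what the example seems to rely on; the real `blacs_grid_t` and `block_cyclic_mat_t` presumably come with the example code itself rather than with oneAPI.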
Does anyone see what's wrong and, if so, how to get around these compilation errors?
Best regards.