Problem with CUDA backend

73 views
Skip to first unread message

Amir Hossein Jafari Matin

unread,
Oct 26, 2020, 6:04:03 AM10/26/20
to PyFR Mailing List
Dear PyFR Developers,

I am trying to run a simulation with CUDA backend on 12 GPUs. However I get the following error:
Traceback (most recent call last):
  File "/rwthfs/rz/cluster/home/am073751/PyFR/ENV4/lib/python3.7/site-packages/pyfr/util.py", line 33, in __call__
    res = cache[key]
KeyError: (<function CUDAKernelProvider._build_kernel at 0x2b76ca4d9268>, b'\x80\x03X\x08\x00\x00\x00mpicfluxq\x00X3&\x00\x00\n\n// AoSoA macros\n#define SOA_SZ 32\n#define SOA_IX(a, v, nv) ((((a) / SOA_SZ)*(nv) + (v))*SOA_SZ + (a) % SOA_SZ)\n\n// Typedefs\ntypedef double fpdtype_t;\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n__global__ void mpicflux(int _nx, fpdtype_t* __restrict__ gradul_v, const int* __restrict__ gradul_vix, const int* __restrict__ gradul_vrstri, const fpdtype_t* __restrict__ magnl_v, const fpdtype_t* __restrict__ nl_v, int ldnl, fpdtype_t* __restrict__ ul_v, const int* __restrict__ ul_vix, fpdtype_t* __restrict__ ur_v)\n               {\n                   int _x = blockIdx.x*blockDim.x + threadIdx.x;\n                   #define X_IDX (_x)\n                   #define X_IDX_AOSOA(v, nv) SOA_IX(X_IDX, v, nv)\n                   if (_x < _nx)\n                   {\n                       \n    // Perform the Riemann solve\n    fpdtype_t ficomm[5], fvcomm;\n    {\n\n    // Compute the left and right fluxes + velocities and pressures\n    fpdtype_t fl_[3][5], fr_[3][5];\n    fpdtype_t vl_[3], vr_[3];\n    fpdtype_t pl_, pr_;\n\n    {\n\n    fpdtype_t invrho__ = 1.0/ul_v[ul_vix[X_IDX] + SOA_SZ*(0)], E__ = ul_v[ul_vix[X_IDX] + SOA_SZ*(4)];\n\n    // Compute the velocities\n    fpdtype_t rhov__[3];\n    rhov__[0] = ul_v[ul_vix[X_IDX] + SOA_SZ*(1)];\n    vl_[0] = invrho__*rhov__[0];\n    rhov__[1] = ul_v[ul_vix[X_IDX] + SOA_SZ*(2)];\n    vl_[1] = invrho__*rhov__[1];\n    rhov__[2] = ul_v[ul_vix[X_IDX] + SOA_SZ*(3)];\n    vl_[2] = invrho__*rhov__[2];\n\n    // Compute the pressure\n    pl_ = 0.3999999999999999*(E__ - 0.5*invrho__*((rhov__[0])*(rhov__[0]) + (rhov__[1])*(rhov__[1]) + (rhov__[2])*(rhov__[2])));\n\n    // Density and energy fluxes\n    fl_[0][0] = rhov__[0];\n    fl_[0][4] = (E__ + pl_)*vl_[0];\n    fl_[1][0] = rhov__[1];\n    fl_[1][4] = (E__ + pl_)*vl_[1];\n    fl_[2][0] = rhov__[2];\n    fl_[2][4] = (E__ + pl_)*vl_[2];\n\n    // Momentum fluxes\n    fl_[0][1] = rhov__[0]*vl_[0] + pl_;\n    fl_[0][2] = rhov__[0]*vl_[1];\n    fl_[0][3] = rhov__[0]*vl_[2];\n    fl_[1][1] = rhov__[1]*vl_[0];\n    fl_[1][2] = rhov__[1]*vl_[1] + pl_;\n    fl_[1][3] = rhov__[1]*vl_[2];\n    fl_[2][1] = rhov__[2]*vl_[0];\n    fl_[2][2] = rhov__[2]*vl_[1];\n    fl_[2][3] = rhov__[2]*vl_[2] + pl_;\n\n};\n    {\n\n    fpdtype_t invrho__ = 1.0/ur_v[_nx*(0) + X_IDX], E__ = ur_v[_nx*(4) + X_IDX];\n\n    // Compute the velocities\n    fpdtype_t rhov__[3];\n    rhov__[0] = ur_v[_nx*(1) + X_IDX];\n    vr_[0] = invrho__*rhov__[0];\n    rhov__[1] = ur_v[_nx*(2) + X_IDX];\n    vr_[1] = invrho__*rhov__[1];\n    rhov__[2] = ur_v[_nx*(3) + X_IDX];\n    vr_[2] = invrho__*rhov__[2];\n\n    // Compute the pressure\n    pr_ = 0.3999999999999999*(E__ - 0.5*invrho__*((rhov__[0])*(rhov__[0]) + (rhov__[1])*(rhov__[1]) + (rhov__[2])*(rhov__[2])));\n\n    // Density and energy fluxes\n    fr_[0][0] = rhov__[0];\n    fr_[0][4] = (E__ + pr_)*vr_[0];\n    fr_[1][0] = rhov__[1];\n    fr_[1][4] = (E__ + pr_)*vr_[1];\n    fr_[2][0] = rhov__[2];\n    fr_[2][4] = (E__ + pr_)*vr_[2];\n\n    // Momentum fluxes\n    fr_[0][1] = rhov__[0]*vr_[0] + pr_;\n    fr_[0][2] = rhov__[0]*vr_[1];\n    fr_[0][3] = rhov__[0]*vr_[2];\n    fr_[1][1] = rhov__[1]*vr_[0];\n    fr_[1][2] = rhov__[1]*vr_[1] + pr_;\n    fr_[1][3] = rhov__[1]*vr_[2];\n    fr_[2][1] = rhov__[2]*vr_[0];\n    fr_[2][2] = rhov__[2]*vr_[1];\n    fr_[2][3] = rhov__[2]*vr_[2] + pr_;\n\n};\n\n    // Sum the left and right velocities and take the normal\n    fpdtype_t nv_ = ((nl_v[ldnl*(0) + X_IDX])*(vl_[0] + vr_[0]) + (nl_v[ldnl*(1) + X_IDX])*(vl_[1] + vr_[1]) + (nl_v[ldnl*(2) + X_IDX])*(vl_[2] + vr_[2]));\n\n    // Estimate the maximum wave speed / 2\n    fpdtype_t a_ = sqrt(0.35*(pl_ + pr_)/(ul_v[ul_vix[X_IDX] + SOA_SZ*(0)] + ur_v[_nx*(0) + X_IDX]))\n                + 0.25*fabs(nv_);\n\n    // Output\n    ficomm[0] = 0.5*(nl_v[ldnl*(0) + X_IDX]*(fl_[0][0] + fr_[0][0]) + nl_v[ldnl*(1) + X_IDX]*(fl_[1][0] + fr_[1][0]) + nl_v[ldnl*(2) + X_IDX]*(fl_[2][0] + fr_[2][0]))\n             + a_*(ul_v[ul_vix[X_IDX] + SOA_SZ*(0)] - ur_v[_nx*(0) + X_IDX]);\n    ficomm[1] = 0.5*(nl_v[ldnl*(0) + X_IDX]*(fl_[0][1] + fr_[0][1]) + nl_v[ldnl*(1) + X_IDX]*(fl_[1][1] + fr_[1][1]) + nl_v[ldnl*(2) + X_IDX]*(fl_[2][1] + fr_[2][1]))\n             + a_*(ul_v[ul_vix[X_IDX] + SOA_SZ*(1)] - ur_v[_nx*(1) + X_IDX]);\n    ficomm[2] = 0.5*(nl_v[ldnl*(0) + X_IDX]*(fl_[0][2] + fr_[0][2]) + nl_v[ldnl*(1) + X_IDX]*(fl_[1][2] + fr_[1][2]) + nl_v[ldnl*(2) + X_IDX]*(fl_[2][2] + fr_[2][2]))\n             + a_*(ul_v[ul_vix[X_IDX] + SOA_SZ*(2)] - ur_v[_nx*(2) + X_IDX]);\n    ficomm[3] = 0.5*(nl_v[ldnl*(0) + X_IDX]*(fl_[0][3] + fr_[0][3]) + nl_v[ldnl*(1) + X_IDX]*(fl_[1][3] + fr_[1][3]) + nl_v[ldnl*(2) + X_IDX]*(fl_[2][3] + fr_[2][3]))\n             + a_*(ul_v[ul_vix[X_IDX] + SOA_SZ*(3)] - ur_v[_nx*(3) + X_IDX]);\n    ficomm[4] = 0.5*(nl_v[ldnl*(0) + X_IDX]*(fl_[0][4] + fr_[0][4]) + nl_v[ldnl*(1) + X_IDX]*(fl_[1][4] + fr_[1][4]) + nl_v[ldnl*(2) + X_IDX]*(fl_[2][4] + fr_[2][4]))\n             + a_*(ul_v[ul_vix[X_IDX] + SOA_SZ*(4)] - ur_v[_nx*(4) + X_IDX]);\n\n};\n\n    fpdtype_t fvl[3][5] = {{0}};\n    {\n\n    fpdtype_t rho_  = ul_v[ul_vix[X_IDX] + SOA_SZ*(0)];\n    fpdtype_t rhou_ = ul_v[ul_vix[X_IDX] + SOA_SZ*(1)], rhov_ = ul_v[ul_vix[X_IDX] + SOA_SZ*(2)], rhow_ = ul_v[ul_vix[X_IDX] + SOA_SZ*(3)];\n    fpdtype_t E_    = ul_v[ul_vix[X_IDX] + SOA_SZ*(4)];\n\n    fpdtype_t rcprho_ = 1.0/rho_;\n    fpdtype_t u_ = rcprho_*rhou_, v_ = rcprho_*rhov_, w_ = rcprho_*rhow_;\n\n    fpdtype_t rho_x_ = gradul_v[gradul_vix[X_IDX] + gradul_vrstri[X_IDX]*(0) + SOA_SZ*(0)];\n    fpdtype_t rho_y_ = gradul_v[gradul_vix[X_IDX] + gradul_vrstri[X_IDX]*(1) + SOA_SZ*(0)];\n    fpdtype_t rho_z_ = gradul_v[gradul_vix[X_IDX] + gradul_vrstri[X_IDX]*(2) + SOA_SZ*(0)];\n\n    // Velocity derivatives (rho_*grad[u_,v_,w_])\n    fpdtype_t u_x_ = gradul_v[gradul_vix[X_IDX] + gradul_vrstri[X_IDX]*(0) + SOA_SZ*(1)] - u_*rho_x_;\n    fpdtype_t u_y_ = gradul_v[gradul_vix[X_IDX] + gradul_vrstri[X_IDX]*(1) + SOA_SZ*(1)] - u_*rho_y_;\n    fpdtype_t u_z_ = gradul_v[gradul_vix[X_IDX] + gradul_vrstri[X_IDX]*(2) + SOA_SZ*(1)] - u_*rho_z_;\n    fpdtype_t v_x_ = gradul_v[gradul_vix[X_IDX] + gradul_vrstri[X_IDX]*(0) + SOA_SZ*(2)] - v_*rho_x_;\n    fpdtype_t v_y_ = gradul_v[gradul_vix[X_IDX] + gradul_vrstri[X_IDX]*(1) + SOA_SZ*(2)] - v_*rho_y_;\n    fpdtype_t v_z_ = gradul_v[gradul_vix[X_IDX] + gradul_vrstri[X_IDX]*(2) + SOA_SZ*(2)] - v_*rho_z_;\n    fpdtype_t w_x_ = gradul_v[gradul_vix[X_IDX] + gradul_vrstri[X_IDX]*(0) + SOA_SZ*(3)] - w_*rho_x_;\n    fpdtype_t w_y_ = gradul_v[gradul_vix[X_IDX] + gradul_vrstri[X_IDX]*(1) + SOA_SZ*(3)] - w_*rho_y_;\n    fpdtype_t w_z_ = gradul_v[gradul_vix[X_IDX] + gradul_vrstri[X_IDX]*(2) + SOA_SZ*(3)] - w_*rho_z_;\n\n    fpdtype_t E_x_ = gradul_v[gradul_vix[X_IDX] + gradul_vrstri[X_IDX]*(0) + SOA_SZ*(4)];\n    fpdtype_t E_y_ = gradul_v[gradul_vix[X_IDX] + gradul_vrstri[X_IDX]*(1) + SOA_SZ*(4)];\n    fpdtype_t E_z_ = gradul_v[gradul_vix[X_IDX] + gradul_vrstri[X_IDX]*(2) + SOA_SZ*(4)];\n\n    // Compute the temperature and viscosity\n    fpdtype_t cpT_ = 1.4*(rcprho_*E_ - 0.5*(u_*u_ + v_*v_ + w_*w_));\n    fpdtype_t Trat_ = 0.4*cpT_;\n    fpdtype_t mu_c_ = 4.59985915432e-05*Trat_*sqrt(Trat_)\n                   / (cpT_ + 1.270376356);\n\n    // Compute temperature derivatives (c_v*dT/d[x,y,z])\n    fpdtype_t T_x_ = rcprho_*(E_x_ - (rcprho_*rho_x_*E_ + u_*u_x_ + v_*v_x_ + w_*w_x_));\n    fpdtype_t T_y_ = rcprho_*(E_y_ - (rcprho_*rho_y_*E_ + u_*u_y_ + v_*v_y_ + w_*w_y_));\n    fpdtype_t T_z_ = rcprho_*(E_z_ - (rcprho_*rho_z_*E_ + u_*u_z_ + v_*v_z_ + w_*w_z_));\n\n    // Negated stress tensor elements\n    fpdtype_t t_xx_ = -2*mu_c_*rcprho_*(u_x_ - 0.3333333333333333*(u_x_ + v_y_ + w_z_));\n    fpdtype_t t_yy_ = -2*mu_c_*rcprho_*(v_y_ - 0.3333333333333333*(u_x_ + v_y_ + w_z_));\n    fpdtype_t t_zz_ = -2*mu_c_*rcprho_*(w_z_ - 0.3333333333333333*(u_x_ + v_y_ + w_z_));\n    fpdtype_t t_xy_ = -mu_c_*rcprho_*(v_x_ + u_y_);\n    fpdtype_t t_xz_ = -mu_c_*rcprho_*(u_z_ + w_x_);\n    fpdtype_t t_yz_ = -mu_c_*rcprho_*(w_y_ + v_z_);\n\n    fvl[0][1] += t_xx_;     fvl[1][1] += t_xy_;     fvl[2][1] += t_xz_;\n    fvl[0][2] += t_xy_;     fvl[1][2] += t_yy_;     fvl[2][2] += t_yz_;\n    fvl[0][3] += t_xz_;     fvl[1][3] += t_yz_;     fvl[2][3] += t_zz_;\n\n    fvl[0][4] += u_*t_xx_ + v_*t_xy_ + w_*t_xz_ + -mu_c_*1.9444444444444444*T_x_;\n    fvl[1][4] += u_*t_xy_ + v_*t_yy_ + w_*t_yz_ + -mu_c_*1.9444444444444444*T_y_;\n    fvl[2][4] += u_*t_xz_ + v_*t_yz_ + w_*t_zz_ + -mu_c_*1.9444444444444444*T_z_;\n\n};\n    {\n\n\n};\n\n\n    fvcomm = nl_v[ldnl*(0) + X_IDX]*fvl[0][0] + nl_v[ldnl*(1) + X_IDX]*fvl[1][0] + nl_v[ldnl*(2) + X_IDX]*fvl[2][0];\n    fvcomm += 0.1*(ul_v[ul_vix[X_IDX] + SOA_SZ*(0)] - ur_v[_nx*(0) + X_IDX]);\n\n    ul_v[ul_vix[X_IDX] + SOA_SZ*(0)] = magnl_v[X_IDX]*(ficomm[0] + fvcomm);\n    fvcomm = nl_v[ldnl*(0) + X_IDX]*fvl[0][1] + nl_v[ldnl*(1) + X_IDX]*fvl[1][1] + nl_v[ldnl*(2) + X_IDX]*fvl[2][1];\n    fvcomm += 0.1*(ul_v[ul_vix[X_IDX] + SOA_SZ*(1)] - ur_v[_nx*(1) + X_IDX]);\n\n    ul_v[ul_vix[X_IDX] + SOA_SZ*(1)] = magnl_v[X_IDX]*(ficomm[1] + fvcomm);\n    fvcomm = nl_v[ldnl*(0) + X_IDX]*fvl[0][2] + nl_v[ldnl*(1) + X_IDX]*fvl[1][2] + nl_v[ldnl*(2) + X_IDX]*fvl[2][2];\n    fvcomm += 0.1*(ul_v[ul_vix[X_IDX] + SOA_SZ*(2)] - ur_v[_nx*(2) + X_IDX]);\n\n    ul_v[ul_vix[X_IDX] + SOA_SZ*(2)] = magnl_v[X_IDX]*(ficomm[2] + fvcomm);\n    fvcomm = nl_v[ldnl*(0) + X_IDX]*fvl[0][3] + nl_v[ldnl*(1) + X_IDX]*fvl[1][3] + nl_v[ldnl*(2) + X_IDX]*fvl[2][3];\n    fvcomm += 0.1*(ul_v[ul_vix[X_IDX] + SOA_SZ*(3)] - ur_v[_nx*(3) + X_IDX]);\n\n    ul_v[ul_vix[X_IDX] + SOA_SZ*(3)] = magnl_v[X_IDX]*(ficomm[3] + fvcomm);\n    fvcomm = nl_v[ldnl*(0) + X_IDX]*fvl[0][4] + nl_v[ldnl*(1) + X_IDX]*fvl[1][4] + nl_v[ldnl*(2) + X_IDX]*fvl[2][4];\n    fvcomm += 0.1*(ul_v[ul_vix[X_IDX] + SOA_SZ*(4)] - ur_v[_nx*(4) + X_IDX]);\n\n    ul_v[ul_vix[X_IDX] + SOA_SZ*(4)] = magnl_v[X_IDX]*(ficomm[4] + fvcomm);\n\n                   }\n                   #undef X_IDX\n                   #undef X_IDX_AOSOA\n               }\n\nq\x01]q\x02(cnumpy\nint32\nq\x03cnumpy\nint64\nq\x04h\x04h\x04h\x04h\x04h\x03h\x04h\x04h\x04e\x87q\x05.', b'\x80\x03}q\x00.')

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/rwthfs/rz/cluster/home/am073751/PyFR/ENV4/bin/pyfr", line 11, in <module>
    sys.exit(main())
  File "/rwthfs/rz/cluster/home/am073751/PyFR/ENV4/lib/python3.7/site-packages/pyfr/__main__.py", line 112, in main
    args.process(args)
  File "/rwthfs/rz/cluster/home/am073751/PyFR/ENV4/lib/python3.7/site-packages/pyfr/__main__.py", line 263, in process_restart
    _process_common(args, mesh, soln, cfg)
  File "/rwthfs/rz/cluster/home/am073751/PyFR/ENV4/lib/python3.7/site-packages/pyfr/__main__.py", line 226, in _process_common
    solver = get_solver(backend, rallocs, mesh, soln, cfg)
  File "/rwthfs/rz/cluster/home/am073751/PyFR/ENV4/lib/python3.7/site-packages/pyfr/solvers/__init__.py", line 16, in get_solver
    return get_integrator(backend, systemcls, rallocs, mesh, initsoln, cfg)
  File "/rwthfs/rz/cluster/home/am073751/PyFR/ENV4/lib/python3.7/site-packages/pyfr/integrators/__init__.py", line 36, in get_integrator
    return integrator(backend, systemcls, rallocs, mesh, initsoln, cfg)
  File "/rwthfs/rz/cluster/home/am073751/PyFR/ENV4/lib/python3.7/site-packages/pyfr/integrators/std/controllers.py", line 80, in __init__
    super().__init__(*args, **kwargs)
  File "/rwthfs/rz/cluster/home/am073751/PyFR/ENV4/lib/python3.7/site-packages/pyfr/integrators/std/controllers.py", line 14, in __init__
    super().__init__(*args, **kwargs)
  File "/rwthfs/rz/cluster/home/am073751/PyFR/ENV4/lib/python3.7/site-packages/pyfr/integrators/std/steppers.py", line 159, in __init__
    super().__init__(*args, **kwargs)
  File "/rwthfs/rz/cluster/home/am073751/PyFR/ENV4/lib/python3.7/site-packages/pyfr/integrators/std/base.py", line 19, in __init__
    nregs=self.nregs, cfg=cfg)
  File "/rwthfs/rz/cluster/home/am073751/PyFR/ENV4/lib/python3.7/site-packages/pyfr/solvers/base/system.py", line 64, in __init__
    self._gen_kernels(eles, int_inters, mpi_inters, bc_inters)
  File "/rwthfs/rz/cluster/home/am073751/PyFR/ENV4/lib/python3.7/site-packages/pyfr/solvers/base/system.py", line 193, in _gen_kernels
    kernels[pn, kn].append(kgetter())
  File "/rwthfs/rz/cluster/home/am073751/PyFR/ENV4/lib/python3.7/site-packages/pyfr/solvers/navstokes/inters.py", line 69, in <lambda>
    magnl=self._mag_pnorm_lhs, nl=self._norm_pnorm_lhs
  File "/rwthfs/rz/cluster/home/am073751/PyFR/ENV4/lib/python3.7/site-packages/pyfr/backends/base/backend.py", line 163, in kernel
    return kern(*args, **kwargs)
  File "/rwthfs/rz/cluster/home/am073751/PyFR/ENV4/lib/python3.7/site-packages/pyfr/backends/base/kernels.py", line 162, in kernel_meth
    fun = self._build_kernel(name, src, list(it.chain(*argt)))
  File "/rwthfs/rz/cluster/home/am073751/PyFR/ENV4/lib/python3.7/site-packages/pyfr/util.py", line 35, in __call__
    res = cache[key] = self.func(*args, **kwargs)
  File "/rwthfs/rz/cluster/home/am073751/PyFR/ENV4/lib/python3.7/site-packages/pyfr/backends/cuda/provider.py", line 20, in _build_kernel
    fun = compiler.SourceModule(src).get_function(name)
  File "/rwthfs/rz/cluster/home/am073751/PyFR/ENV4/lib/python3.7/site-packages/pycuda/compiler.py", line 294, in __init__
    self.module = module_from_buffer(cubin)
pycuda._driver.LogicError: cuModuleLoadDataEx failed: device kernel image is invalid - error   : Binary format for key='0', ident='' is not recognized
application called MPI_Abort(MPI_COMM_WORLD, 1) - process 7

 The same case works just fine with OpenMP backend. I was wondering if you could help me with the error.


Regards 
Amir

Freddie Witherden

unread,
Oct 26, 2020, 8:40:41 AM10/26/20
to pyfrmai...@googlegroups.com
Hi,

On 26/10/2020 05:04, Amir Hossein Jafari Matin wrote:
>
>  The same case works just fine with OpenMP backend. I was wondering if
> you could help me with the error.

In my experience this error comes about when the version of nvcc that is
being used to compile our run-time generated kernels is newer than the
version of CUDA that PyCUDA was linked against.

To this end, can you try running any of the PyCUDA examples from the
PyCUDA website?

Regards, Freddie.

Amir Hossein Jafari Matin

unread,
Oct 26, 2020, 12:42:33 PM10/26/20
to PyFR Mailing List
Hi,

thanks for your kind reply.
It did work as I changed the version of CUDA.

Regards
Amir

Reply all
Reply to author
Forward
0 new messages