Help with a probable PEBCAK bug

63 views
Skip to first unread message

Ryan Iley

unread,
Dec 16, 2024, 12:12:41 AM (12/16/24)
to ArrayFire Users
Hey ArrayFire-Users,

I'm trying out ArrayFire in my hobby project.  I thought I understood the programming model, but something is amiss.  Let me demonstrate with a code sample.  (I'm developing this on my laptop, but intend to run it on a proper GPU someday, hence my attraction to ArrayFire.)

#include <arrayfire.h>
// Minimal reproducer: builds the same 3x3 difference matrix from iota(3) in
// two ways and prints both so the outputs can be compared side by side.
// argc/argv are accepted but not used.
int main(int argc, char **argv) {
  // Prints the ArrayFire version / backend / device banner before any output.
  af::info();
  { // Broken.
    // Single fused expression: both tile() results are temporaries that are
    // only ever consumed by the subtraction handed to af::print().
    // NOTE(review): per the post this prints all zeros on the author's
    // OpenCL setup (ArrayFire v3.9.0); the author expected the same matrix
    // as the block below.
    af::array a = af::iota(af::dim4(3));
    af::print("broken attempt",
              af::tile(a, 1, 3) - af::tile(a.T(), 3, 1));
  }
  { // Works as I expected the above to work.
    // Same operands as above, but named as b and c, with c explicitly
    // evaluated before the subtraction is printed.
    af::array a = af::iota(af::dim4(3));
    af::array b = af::tile(a, 1, 3);
    af::array c = af::tile(a.T(), 3, 1);
    // Forcing c here is the only functional difference from the block above.
    af::eval(c);
    af::print("working attempt", b - c);
  }
  return 0;
}

And output.

$ uname -a
Linux my-laptop 6.6.63-1-lts #1 SMP PREEMPT_DYNAMIC Fri, 22 Nov 2024 15:39:56 +0000 x86_64 GNU/Linux
$ make && ./my_main
ArrayFire v3.9.0 (OpenCL, 64-bit Linux, build default)
[0] AMD: gfx1103, 2048 MB -- OpenCL 2.0  -- Device driver 3625.0 (HSA1.1,LC) -- FP64 Support: True
-1- INTEL: AMD Ryzen 7 7840HS w/ Radeon 780M Graphics     , 62095 MB -- OpenCL 3.0 (Build 0) -- Device driver 2024.17.3.0.08_160000 -- FP64 Support: True
broken attempt
[3 3 1 1]
   Offset: 0
   Strides: [1 3 9 9]
    0.0000     0.0000     0.0000
    0.0000     0.0000     0.0000
    0.0000     0.0000     0.0000

working attempt
[3 3 1 1]
   Offset: 0
   Strides: [1 3 9 9]
    0.0000    -1.0000    -2.0000
    1.0000     0.0000    -1.0000
    2.0000     1.0000     0.0000


This sure seems to me like a bug in code other than mine.  My understanding is that af::print() calls af::eval() on its input, and I would expect af::eval(b-c) to recursively eval the entire expression tree (or something smarter, that works).  My goal is to write a smoothed particle hydrodynamics simulation, currently using a cubic spline kernel function -- 20+ operators in one invocation.  I find it difficult to believe I'm intended to break each one into a temp variable and call af::eval() on it.

Am I doing something wrong?  It seems unlikely that I'd find a bug in my first line of code using this library.

Thanks!

Ryan Iley

unread,
Dec 21, 2024, 9:57:49 AM (12/21/24)
to ArrayFire Users
I've refined my approach.

My main method now contains the following.  In the version that works, the eval is uncommented.  I run this twice, with AF_TRACE=all AF_JIT_KERNEL_TRACE=stdout.
// Refined reproducer block from the author's main(): run twice under
// AF_TRACE=all AF_JIT_KERNEL_TRACE=stdout, once with the af::eval(c) line
// below uncommented (correct output) and once with it commented out
// (all-zero output).
{
  // Log markers bracket the block so its trace output can be located.
  LOG(INFO) << "start block";

  af::array a = af::iota(af::dim4(3));
  af::array b = af::tile(a, 1, 3);
  af::array c = af::tile(a.T(), 3, 1);
  // Toggle under test: the post reports correct results only when the next
  // line is uncommented.
  // af::eval(c);
  af::print("difference matrix", b-c);
  LOG(INFO) << "end block";
}

When I uncomment af::eval(c), I get this: (I elided 20 or so lines of "failed to load libafoneapi", "libcublas", "libafcuda"; it did find "libafopencl" and "libforge" and I don't have nvidia or intel hardware on this machine)

I0000 00:00:1734790114.212265    6640 my_main.cc:206] start block
[mem][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/memory.cpp:216 ] nativeAlloc: 1 KB 0x5e498840a3a0

// JIT-generated kernel, as dumped by AF_JIT_KERNEL_TRACE. It appears in the
// trace only for the run where af::eval(c) is uncommented, before the
// "difference matrix" header.
// NOTE(review): presumably this is the kernel that materializes c -- confirm.
// Reads one input buffer (in0) and writes one output buffer (out0).
__kernel void KER13942457609286270440(
__global float *in0, KParam iInfo0,
__global float *out0, int offset0,
KParam oInfo){
    int id0 = get_global_id(0);
    int id1 = get_global_id(1);
    const int id0End = oInfo.dims[0];
    const int id1End = oInfo.dims[1];
    // Work-items outside the output's first two dimensions do nothing.
    if ((id0 < id0End) & (id1 < id1End)) {
        const int id2 = get_global_id(2);
#define id3 0
        const int ostrides1 = oInfo.strides[1];
        // Linear output index built from the output strides.
        int idx = (int)oInfo.strides[0]*id0 + ostrides1*id1 + (int)oInfo.strides[2]*id2;
out0 += offset0;


// Input index: each coordinate is multiplied by (coord < input dim), so a
// coordinate at or beyond the input's extent on an axis contributes 0.
int idx0 = id0*(id0<iInfo0.dims[0])*iInfo0.strides[0] + id1*(id1<iInfo0.dims[1])*iInfo0.strides[1] + id2*(id2<iInfo0.dims[2])*iInfo0.strides[2] + id3*(id3<iInfo0.dims[3])*iInfo0.strides[3] + iInfo0.offset;
float val0 = in0[idx0];
// __noop is defined outside this dump; presumably an identity -- confirm.
float val1 = __noop(val0);
out0[idx] = val1;

    }
}
[jit][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/compile_module.cpp:264 ] {13696084684904111006 : loaded from /home/my_user/.arrayfire/KER13696084684904111006_CL_4098_GFX1103_AF_39.bin for gfx1103 }
[kernel][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/jit.cpp:460 ] Launching : Dims: [3,3,1,1] Global: [32,3,1] Local: [32,1,1] threads: 96
difference matrix

[3 3 1 1]
   Offset: 0
   Strides: [1 3 9 9]
[mem][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/memory.cpp:216 ] nativeAlloc: 1 KB 0x5e49896478b0
[mem][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/memory.cpp:216 ] nativeAlloc: 1 KB 0x5e4989647b70

// JIT-generated kernel from the run with af::eval(c) uncommented (the run
// that prints the expected matrix). It takes two distinct input buffers,
// in0 and in2, and writes __sub of values loaded from each into out0.
__kernel void KER5443687041980553984(
__global float *in0, KParam iInfo0,
__global float *in2, KParam iInfo2,
__global float *out0, int offset0,
KParam oInfo){
    int id0 = get_global_id(0);
    int id1 = get_global_id(1);
    const int id0End = oInfo.dims[0];
    const int id1End = oInfo.dims[1];
    // Work-items outside the output's first two dimensions do nothing.
    if ((id0 < id0End) & (id1 < id1End)) {
        const int id2 = get_global_id(2);
#define id3 0
        const int ostrides1 = oInfo.strides[1];
        // Linear output index built from the output strides.
        int idx = (int)oInfo.strides[0]*id0 + ostrides1*id1 + (int)oInfo.strides[2]*id2;
out0 += offset0;


// Each input gets its own index, computed from its own dims/strides/offset;
// a coordinate at or beyond an input's extent on an axis contributes 0.
int idx0 = id0*(id0<iInfo0.dims[0])*iInfo0.strides[0] + id1*(id1<iInfo0.dims[1])*iInfo0.strides[1] + id2*(id2<iInfo0.dims[2])*iInfo0.strides[2] + id3*(id3<iInfo0.dims[3])*iInfo0.strides[3] + iInfo0.offset;
int idx2 = id0*(id0<iInfo2.dims[0])*iInfo2.strides[0] + id1*(id1<iInfo2.dims[1])*iInfo2.strides[1] + id2*(id2<iInfo2.dims[2])*iInfo2.strides[2] + id3*(id3<iInfo2.dims[3])*iInfo2.strides[3] + iInfo2.offset;
float val0 = in0[idx0];
// __noop is defined outside this dump; presumably an identity -- confirm.
float val1 = __noop(val0);
// Second operand is loaded from the separate buffer in2.
float val2 = in2[idx2];
float val3 = __sub(val1, val2);
out0[idx] = val3;

    }
}
[jit][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/compile_module.cpp:264 ] {6102516970022899895  : loaded from /home/my_user/.arrayfire/KER6102516970022899895_CL_4098_GFX1103_AF_39.bin for gfx1103 }
[kernel][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/jit.cpp:460 ] Launching : Dims: [3,3,1,1] Global: [32,3,1] Local: [32,1,1] threads: 96
[jit][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/compile_module.cpp:264 ] {12058074024761959428 : loaded from /home/my_user/.arrayfire/KER12058074024761959428_CL_4098_GFX1103_AF_39.bin for gfx1103 }
[kernel][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/Kernel.hpp:34 ] Launching transpose

    0.0000    -1.0000    -2.0000
    1.0000     0.0000    -1.0000
    2.0000     1.0000     0.0000

I0000 00:00:1734790114.679734    6640 my_main.cc:212] end block

When I comment out af::eval(c), I get this:

I0000 00:00:1734790114.212265    6640 my_main.cc:206] start block
difference matrix

[3 3 1 1]
   Offset: 0
   Strides: [1 3 9 9]
[mem][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/memory.cpp:216 ] nativeAlloc: 1 KB 0x5c05f5575b30
[mem][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/memory.cpp:216 ] nativeAlloc: 1 KB 0x5c05f430dc50

// JIT-generated kernel from the run with af::eval(c) commented out (the run
// that prints all zeros). Unlike the two-input kernel in the other run, it
// has a single input buffer, in0, and a single input index, idx0.
__kernel void KER8398990534882206871(
__global float *in0, KParam iInfo0,
__global float *out0, int offset0,
KParam oInfo){
    int id0 = get_global_id(0);
    int id1 = get_global_id(1);
    const int id0End = oInfo.dims[0];
    const int id1End = oInfo.dims[1];
    // Work-items outside the output's first two dimensions do nothing.
    if ((id0 < id0End) & (id1 < id1End)) {
        const int id2 = get_global_id(2);
#define id3 0
        const int ostrides1 = oInfo.strides[1];
        // Linear output index built from the output strides.
        int idx = (int)oInfo.strides[0]*id0 + ostrides1*id1 + (int)oInfo.strides[2]*id2;
out0 += offset0;


// Only one input index is computed; a coordinate at or beyond the input's
// extent on an axis contributes 0.
int idx0 = id0*(id0<iInfo0.dims[0])*iInfo0.strides[0] + id1*(id1<iInfo0.dims[1])*iInfo0.strides[1] + id2*(id2<iInfo0.dims[2])*iInfo0.strides[2] + id3*(id3<iInfo0.dims[3])*iInfo0.strides[3] + iInfo0.offset;
float val0 = in0[idx0];
// Both operands of __sub below are derived from the same loaded value, val0.
// NOTE(review): if __noop is an identity and __sub a subtraction (both are
// defined outside this dump), val3 is val0 - val0 for every element, which
// matches the all-zero matrix printed for this run.
float val1 = __noop(val0);
float val2 = __noop(val0);
float val3 = __sub(val1, val2);
out0[idx] = val3;

    }
}
[jit][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/compile_module.cpp:264 ] {15673618099317739243 : loaded from /home/my_user/.arrayfire/KER15673618099317739243_CL_4098_GFX1103_AF_39.bin for gfx1103 }
[kernel][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/jit.cpp:460 ] Launching : Dims: [3,3,1,1] Global: [32,3,1] Local: [32,1,1] threads: 96
[jit][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/compile_module.cpp:264 ] {12058074024761959428 : loaded from /home/my_user/.arrayfire/KER12058074024761959428_CL_4098_GFX1103_AF_39.bin for gfx1103 }
[kernel][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/Kernel.hpp:34 ] Launching transpose

    0.0000     0.0000     0.0000
    0.0000     0.0000     0.0000
    0.0000     0.0000     0.0000

I0000 00:00:1734790114.699193    6640 my_main.cc:212] end block

(I find it much easier to view the difference in a diff tool.)

In short, it seems when af::eval(c) is commented out, the respective kernel isn't run.

I am confounded.  Could someone help?
Reply all
Reply to author
Forward
0 new messages