I've refined my approach.
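My main method now contains the following block. In the version that works, the af::eval(c) call is uncommented. I ran it twice (once with the call uncommented, once with it commented out), both times with the environment variables AF_TRACE=all and AF_JIT_KERNEL_TRACE=stdout set.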
{
LOG(INFO) << "start block";
af::array a = af::iota(af::dim4(3));
af::array b = af::tile(a, 1, 3);
af::array c = af::tile(a.T(), 3, 1);
// af::eval(c);
af::print("difference matrix", b-c);
LOG(INFO) << "end block";
}
When I uncomment af::eval(c), I get the output below. (I elided 20 or so "failed to load" lines for libafoneapi, libcublas, and libafcuda; it did find libafopencl and libforge. I don't have NVIDIA or Intel hardware on this machine.)
I0000 00:00:1734790114.212265 6640 my_main.cc:206] start block
[mem][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/memory.cpp:216 ] nativeAlloc: 1 KB 0x5e498840a3a0
__kernel void KER13942457609286270440(
__global float *in0, KParam iInfo0,
__global float *out0, int offset0,
KParam oInfo){
int id0 = get_global_id(0);
int id1 = get_global_id(1);
const int id0End = oInfo.dims[0];
const int id1End = oInfo.dims[1];
if ((id0 < id0End) & (id1 < id1End)) {
const int id2 = get_global_id(2);
#define id3 0
const int ostrides1 = oInfo.strides[1];
int idx = (int)oInfo.strides[0]*id0 + ostrides1*id1 + (int)oInfo.strides[2]*id2;
out0 += offset0;
int idx0 = id0*(id0<iInfo0.dims[0])*iInfo0.strides[0] + id1*(id1<iInfo0.dims[1])*iInfo0.strides[1] + id2*(id2<iInfo0.dims[2])*iInfo0.strides[2] + id3*(id3<iInfo0.dims[3])*iInfo0.strides[3] + iInfo0.offset;
float val0 = in0[idx0];
float val1 = __noop(val0);
out0[idx] = val1;
}
}
[jit][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/compile_module.cpp:264 ] {13696084684904111006 : loaded from /home/my_user/.arrayfire/KER13696084684904111006_CL_4098_GFX1103_AF_39.bin for gfx1103 }
[kernel][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/jit.cpp:460 ] Launching : Dims: [3,3,1,1] Global: [32,3,1] Local: [32,1,1] threads: 96
difference matrix
[3 3 1 1]
Offset: 0
Strides: [1 3 9 9]
[mem][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/memory.cpp:216 ] nativeAlloc: 1 KB 0x5e49896478b0
[mem][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/memory.cpp:216 ] nativeAlloc: 1 KB 0x5e4989647b70
__kernel void KER5443687041980553984(
__global float *in0, KParam iInfo0,
__global float *in2, KParam iInfo2,
__global float *out0, int offset0,
KParam oInfo){
int id0 = get_global_id(0);
int id1 = get_global_id(1);
const int id0End = oInfo.dims[0];
const int id1End = oInfo.dims[1];
if ((id0 < id0End) & (id1 < id1End)) {
const int id2 = get_global_id(2);
#define id3 0
const int ostrides1 = oInfo.strides[1];
int idx = (int)oInfo.strides[0]*id0 + ostrides1*id1 + (int)oInfo.strides[2]*id2;
out0 += offset0;
int idx0 = id0*(id0<iInfo0.dims[0])*iInfo0.strides[0] + id1*(id1<iInfo0.dims[1])*iInfo0.strides[1] + id2*(id2<iInfo0.dims[2])*iInfo0.strides[2] + id3*(id3<iInfo0.dims[3])*iInfo0.strides[3] + iInfo0.offset;
int idx2 = id0*(id0<iInfo2.dims[0])*iInfo2.strides[0] + id1*(id1<iInfo2.dims[1])*iInfo2.strides[1] + id2*(id2<iInfo2.dims[2])*iInfo2.strides[2] + id3*(id3<iInfo2.dims[3])*iInfo2.strides[3] + iInfo2.offset;
float val0 = in0[idx0];
float val1 = __noop(val0);
float val2 = in2[idx2];
float val3 = __sub(val1, val2);
out0[idx] = val3;
}
}
[jit][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/compile_module.cpp:264 ] {6102516970022899895 : loaded from /home/my_user/.arrayfire/KER6102516970022899895_CL_4098_GFX1103_AF_39.bin for gfx1103 }
[kernel][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/jit.cpp:460 ] Launching : Dims: [3,3,1,1] Global: [32,3,1] Local: [32,1,1] threads: 96
[jit][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/compile_module.cpp:264 ] {12058074024761959428 : loaded from /home/my_user/.arrayfire/KER12058074024761959428_CL_4098_GFX1103_AF_39.bin for gfx1103 }
[kernel][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/Kernel.hpp:34 ] Launching transpose
0.0000 -1.0000 -2.0000
1.0000 0.0000 -1.0000
2.0000 1.0000 0.0000
I0000 00:00:1734790114.679734 6640 my_main.cc:212] end block
When I comment out af::eval(c), I get this:
I0000 00:00:1734790114.212265 6640 my_main.cc:206] start block
difference matrix
[3 3 1 1]
Offset: 0
Strides: [1 3 9 9]
[mem][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/memory.cpp:216 ] nativeAlloc: 1 KB 0x5c05f5575b30
[mem][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/memory.cpp:216 ] nativeAlloc: 1 KB 0x5c05f430dc50
__kernel void KER8398990534882206871(
__global float *in0, KParam iInfo0,
__global float *out0, int offset0,
KParam oInfo){
int id0 = get_global_id(0);
int id1 = get_global_id(1);
const int id0End = oInfo.dims[0];
const int id1End = oInfo.dims[1];
if ((id0 < id0End) & (id1 < id1End)) {
const int id2 = get_global_id(2);
#define id3 0
const int ostrides1 = oInfo.strides[1];
int idx = (int)oInfo.strides[0]*id0 + ostrides1*id1 + (int)oInfo.strides[2]*id2;
out0 += offset0;
int idx0 = id0*(id0<iInfo0.dims[0])*iInfo0.strides[0] + id1*(id1<iInfo0.dims[1])*iInfo0.strides[1] + id2*(id2<iInfo0.dims[2])*iInfo0.strides[2] + id3*(id3<iInfo0.dims[3])*iInfo0.strides[3] + iInfo0.offset;
float val0 = in0[idx0];
float val1 = __noop(val0);
float val2 = __noop(val0);
float val3 = __sub(val1, val2);
out0[idx] = val3;
}
}
[jit][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/compile_module.cpp:264 ] {15673618099317739243 : loaded from /home/my_user/.arrayfire/KER15673618099317739243_CL_4098_GFX1103_AF_39.bin for gfx1103 }
[kernel][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/jit.cpp:460 ] Launching : Dims: [3,3,1,1] Global: [32,3,1] Local: [32,1,1] threads: 96
[jit][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/compile_module.cpp:264 ] {12058074024761959428 : loaded from /home/my_user/.arrayfire/KER12058074024761959428_CL_4098_GFX1103_AF_39.bin for gfx1103 }
[kernel][1734790114][6640] [ /usr/src/debug/arrayfire/arrayfire-full-v3.9.0/src/backend/opencl/Kernel.hpp:34 ] Launching transpose
0.0000 0.0000 0.0000
0.0000 0.0000 0.0000
0.0000 0.0000 0.0000
I0000 00:00:1734790114.699193 6640 my_main.cc:212] end block
(I find it much easier to view the difference in a diff tool.)
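In short, it seems that when af::eval(c) is commented out, the corresponding kernel isn't run. Instead, the single JIT kernel that is generated takes only one input buffer (in0) and computes both operands of __sub from the same val0, so every element of the result is zero.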
I am confounded. Could someone help?