My idea is to split each xy plane into 16x16 tiles and to launch 18x18 threads per GPU block (the 16x16 interior plus a one-point halo on each side), so that every thread loads exactly one element and the whole tile lands in shared memory in a single step.
However, the code below does not work as I expected.
Could a TA help with lab 1?
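For completeness, the constants and the indexing macro that the code relies on look roughly like this (written from memory, so the definitions in the lab's header may differ slightly):

#define BLOCK_SIZE_CORE   16  // interior points computed per block in x and y
#define BLOCK_SIZE_TOTAL  18  // interior plus a one-point halo on each side

// Assumed row-major indexing into the flat nx*ny*nz array
#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + (_nx) * ((_j) + (_ny) * (_k)))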
In main.cu:
dim3 block_new(BLOCK_SIZE_TOTAL, BLOCK_SIZE_TOTAL);
dim3 grid_new((nx-2)/BLOCK_SIZE_CORE, (ny-2)/BLOCK_SIZE_CORE);
block2D_opt_2<<<grid_new, block_new>>>(fac, d_A0, d_Anext, nx, ny, nz);
In kernel1.2.cu:
__global__ void block2D_opt_2(float fac, float *A0, float *Anext,
                              int nx, int ny, int nz)
{
    // Optimization 2: 2D shared-memory tiling in the xy plane
    int inx = threadIdx.x;
    int iny = threadIdx.y;
    // Coordinates in the original nx * ny plane (adjacent blocks overlap by the halo)
    int i = blockIdx.x * (blockDim.x - 2) + inx;
    int j = blockIdx.y * (blockDim.y - 2) + iny;

    __shared__ float sh_A0[BLOCK_SIZE_TOTAL][BLOCK_SIZE_TOTAL];
    sh_A0[iny][inx] = 0.0f;
    __syncthreads();

    // Only threads mapping to interior grid points write a result
    bool w_region = (i > 0) && (j > 0) && (i < nx - 1) && (j < ny - 1);

    // March through the z dimension, reloading the xy tile at each slice
    for (int k = 1; k < nz - 1; k++) {
        sh_A0[iny][inx] = A0[Index3D(nx, ny, i, j, k)];
        __syncthreads();

        if (w_region)
        {
            Anext[Index3D(nx, ny, i, j, k)] =
                A0[Index3D(nx, ny, i, j, k + 1)] +
                A0[Index3D(nx, ny, i, j, k - 1)] +
                sh_A0[iny + 1][inx    ] +
                sh_A0[iny - 1][inx    ] +
                sh_A0[iny    ][inx + 1] +
                sh_A0[iny    ][inx - 1]
                - 6.0f * sh_A0[iny][inx] / (fac * fac);
        }
        __syncthreads();
    }
}
Xiaowei