I am currently trying to write a Computation which does the following tasks:
- Take 2 arrays in1 (real, 512*512) and in2(real, 512*512) as inputs
- Apply zero-padding to both inputs to obtain in1_pad (real, 1024*1024) and in2_pad(real, 1024*1024)
- Use in1_pad and in2_pad as inputs to feed combine_complex() thus getting in1_pad+1j*in2_pad
- Compute the FFT of in1_pad+1j*in2_pad
To achieve this, I first wrote a Transformation which performs the zero-padding operation:
def zero_padding(arr_in, nrows_zp, ncols_zp):
assert not reikna.cluda.dtypes.is_complex(arr_in.dtype)
arr_in_type = Type(dtype=arr_in.dtype, shape=arr_in.shape)
arr_out_type = Type(dtype=arr_in.dtype, shape=(nrows_zp, ncols_zp))
nrows, ncols = arr_in.shape
return Transformation(
[Parameter('output', Annotation(arr_out_type, 'o')),
Parameter('input', Annotation(arr_in_type, 'i')),
],
"""
## get 2D indice
VSIZE_T idx_col = threadIdx.x + blockDim.x * blockIdx.x;
##VSIZE_T idx_col = get_global_id(0);
VSIZE_T idx_row = threadIdx.y + blockDim.y * blockIdx.y;
##VSIZE_T idx_row = get_global_id(1);
if (idx_col<${ncols} & idx_row<${nrows})
${output.store_same}(${input.load_same});
else
${output.store_same}(0);
""",
render_kwds=dict(nrows=nrows, ncols=ncols),
)
This Transformation works as expected (I convert it to a PureParallel object to test it).
To implement the whole algorithm, I built a class inherited from Computation:
class TwoFFTForPriceOfOne(Computation):
def __init__(self, x, y, nrows_pad=1024, ncols_pad=2048):
# only for 2D inputs
assert x.ndim == 2
# ensure input images are real ones
assert( not reikna.cluda.dtypes.is_complex(x.dtype) and not
reikna.cluda.dtypes.is_complex(x.dtype))
# array type
float_type = Type(np.float32, x.shape)
cplx_type_zp = Type(complex_for(x.dtype), (nrows_pad, ncols_pad))
# input of to_complex_trf is the output of zero_pad
zero_pad_x = zero_padding(x, nrows_zp=nrows_pad, ncols_zp=ncols_pad)
zero_pad_y = zero_padding(y, nrows_zp=nrows_pad, ncols_zp=ncols_pad)
# 2 real --> 1 complex
to_complex_trf = combine_complex(cplx_type_zp)
# FFT
fft = cudaFFT.FFT(cplx_type_zp)
# FFT input is the output of to_complex_trf
fft.parameter.input.connect(to_complex_trf, to_complex_trf.output,
input_real=to_complex_trf.real,
input_imag=to_complex_trf.imag)
# connects both padding blocks
fft.parameter.input_real.connect(zero_pad_x, zero_pad_x.output,
input1=zero_pad_x.input)
fft.parameter.input_imag.connect(zero_pad_y, zero_pad_y.output,
input2=zero_pad_y.input)
self._fft = fft
Computation.__init__(self,
[Parameter('output1', Annotation(cplx_type_zp, 'o')),
Parameter('input1', Annotation(float_type, 'i')),
Parameter('input2', Annotation(float_type, 'i')),
])
def _build_plan(self, plan_factory, device_params, output1,
input1, input2):
print(self._fft.signature.parameters)
plan = plan_factory()
plan.computation_call(self._fft, output1, input1, input2)
return plan
if __name__ == '__main__':
plt.close('all')
plt.ion()
nrows_pad = 1024
ncols_pad = 2048
# inputs
in1 = misc.lena().astype(np.float32)
in2 = misc.lena().astype(np.float32)
api = cluda.cuda_api()
thr = api.Thread.create()
# send inputs to GPU device
in1_device = thr.to_device(in1)
in2_device = thr.to_device(in2)
out1_device_pad = thr.array(shape=(nrows_pad, ncols_pad),
dtype=complex_for(in1.dtype))
out2_device_pad = thr.array(shape=(nrows_pad, ncols_pad),
dtype=complex_for(in1.dtype))
# inputs should NOT be complex
comp = TwoFFTForPriceOfOne(in1, in2, nrows_pad=nrows_pad,
ncols_pad=ncols_pad)
comp_c = comp.compile(thr)
print(comp_c.signature.parameters)
comp_c(input1=in1_device, input2=in2_device,
output1=out1_device_pad)
# CPU
res = np.fft.fft2(in1 + 1j * in2, (nrows_pad, ncols_pad))
fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True, sharey=True)
ax1.imshow(np.absolute(res))
ax2.imshow(np.absolute(out1_device_pad.get()))
thr.release()
If I remove the zero-padding blocks within the Computation (i.e. only combining the two inputs as a complex array and then computing the FFT, it yields the same result as a Numpy implementation. However, it seems that the zero-padding Transformation (which is OK is used on its own) is not correctly connected within the constructor of TwoFFTForPriceOfOne.
Indeed, I get no error but the output result significantly differ from the Numpy one.
If I compute the inverse FFT of the Reikna output (using Numpy), it seems that the rows are concatenated on the first row.
res = np.fft.fft2(in1 + 1j * in2, (nrows_pad, ncols_pad))
fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True, sharey=True)
ax1.imshow(np.real(np.fft.ifft2(res)))
ax2.imshow(np.real(np.fft.ifft2(out1_device_pad.get())))
However, I do not know what the difference between directly calling CUDA/OpenCL built-in stuff (get_global_id / threadIdx,...) and calling Reikna built-in variable (${idxs})?