#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <vector>

#include <cub/cub.cuh>
#include <thrust/complex.h>
// reduction functor
// Binary addition functor: returns a + b for any type T that provides operator+.
// Callable from both host and device code, so the same functor can be used for a
// device-side reduction and for a host-side reference computation.
struct CustomSum
{
    template <typename T>
    __host__ __device__ __forceinline__
    T operator()(const T &a, const T &b) const {
        return a + b;
    }
};
// Prints a diagnostic and terminates the process if a CUDA runtime or CUB call
// did not return cudaSuccess. `what` names the operation for the error message.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        std::fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
    }
}

// Sums an array of complex<float> values on the device with
// cub::DeviceReduce::Reduce and a custom addition functor, then checks the
// result against the analytically known sum.
// Returns EXIT_SUCCESS when the device result matches, EXIT_FAILURE otherwise.
int main() {
    typedef thrust::complex<float> Complex;

    // Every element is (1, -1), so the expected sum is (num_items, -num_items).
    // 2^20 is exactly representable in float, which keeps the reference exact.
    const int num_items = 1 << 20;
    const Complex element(1.0f, -1.0f);
    const std::vector<Complex> h_in(num_items, element);
    const size_t in_bytes = static_cast<size_t>(num_items) * sizeof(Complex);

    // Allocate and fill the device input; allocate the single-element output.
    Complex *d_in = NULL;
    Complex *d_out = NULL;
    checkCuda(cudaMalloc(&d_in, in_bytes), "cudaMalloc(d_in)");
    checkCuda(cudaMalloc(&d_out, sizeof(Complex)), "cudaMalloc(d_out)");
    checkCuda(cudaMemcpy(d_in, h_in.data(), in_bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(host -> d_in)");

    CustomSum sum_op;
    // The seed of the reduction must be the additive identity; a
    // default-initialized complex would leave it indeterminate.
    const Complex init(0.0f, 0.0f);

    // First call with a NULL workspace only reports the required size.
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    checkCuda(cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
                                        d_in, d_out, num_items, sum_op, init),
              "cub::DeviceReduce::Reduce (size query)");

    // Allocate temporary storage
    checkCuda(cudaMalloc(&d_temp_storage, temp_storage_bytes), "cudaMalloc(d_temp_storage)");

    // Run reduction
    checkCuda(cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
                                        d_in, d_out, num_items, sum_op, init),
              "cub::DeviceReduce::Reduce (run)");
    checkCuda(cudaGetLastError(), "reduction kernel launch");

    // The blocking copy back to the host also surfaces asynchronous execution errors.
    Complex h_out(0.0f, 0.0f);
    checkCuda(cudaMemcpy(&h_out, d_out, sizeof(Complex), cudaMemcpyDeviceToHost),
              "cudaMemcpy(d_out -> host)");

    checkCuda(cudaFree(d_temp_storage), "cudaFree(d_temp_storage)");
    checkCuda(cudaFree(d_out), "cudaFree(d_out)");
    checkCuda(cudaFree(d_in), "cudaFree(d_in)");

    // Compare against the known sum with a small relative tolerance.
    const float expected_re = static_cast<float>(num_items);
    const float expected_im = -static_cast<float>(num_items);
    const float tol = 1e-5f * static_cast<float>(num_items);
    const bool ok = std::fabs(h_out.real() - expected_re) <= tol &&
                    std::fabs(h_out.imag() - expected_im) <= tol;

    std::printf("sum = (%f, %f), expected (%f, %f): %s\n",
                h_out.real(), h_out.imag(), expected_re, expected_im,
                ok ? "OK" : "MISMATCH");
    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}