The multiply is executed on the GPU whenever the memory space of A maps to the CUDA backend — which is the default — but the user can select a different backend to run it elsewhere.
The current design of CUSP does not allow the user to explicitly set the dimensions of the kernels. This simplifies the implementation and is in agreement with the design of Thrust, the core library of primitives CUSP uses in many algorithms. You could always introduce your own implementation of the algorithms to add this information through the use of execution policies.
#include <cusp/array1d.h>
#include <cusp/csr_matrix.h>
#include <cusp/gallery/poisson.h>
#include <cusp/multiply.h>

#include <algorithm>
#include <iostream>
// Custom execution-policy tag. Deriving from cusp::cuda::execution_policy
// (CRTP on my_policy) makes cusp::multiply(exec, ...) dispatch, via tag-based
// overload resolution, to a user-supplied multiply() overload for this tag
// instead of CUSP's built-in CUDA implementation.
struct my_policy : public cusp::cuda::execution_policy<my_policy>{};
// SpMV overload selected by cusp::multiply(exec, A, x, y) for the my_policy
// tag: computes y = A * x for a CSR matrix by launching CUSP's internal
// spmv_csr_vector_kernel with an explicitly chosen launch configuration.
//
// exec : execution policy; also carries the CUDA stream to launch on
// A    : CSR sparse matrix (read-only)
// x    : dense input vector (read-only); presumably length A.num_cols — confirm
// y    : dense output vector, overwritten with the product
template<typename IndexType, typename ValueType, typename MemorySpace>
void multiply(my_policy& exec,
const cusp::csr_matrix<IndexType,ValueType,MemorySpace> & A,
const cusp::array1d<ValueType,MemorySpace> & x,
cusp::array1d<ValueType,MemorySpace> & y)
{
typedef cusp::csr_matrix<IndexType,ValueType,MemorySpace> MatrixType;
typedef cusp::array1d<ValueType,MemorySpace> VectorType1;
typedef cusp::array1d<ValueType,MemorySpace> VectorType2;
// Semiring-style functors for the generalized SpMV: each output entry is
// seeded by initialize, terms are combined with '*' and accumulated with '+'.
typedef cusp::constant_functor<ValueType> UnaryFunction;
typedef thrust::multiplies<ValueType> BinaryFunction1;
typedef thrust::plus<ValueType> BinaryFunction2;
using namespace cusp::system::cuda;
using namespace cusp::system::cuda::detail;
UnaryFunction initialize(0); // seed every output entry from the constant 0
BinaryFunction1 combine;
BinaryFunction2 reduce;
// Iterator types are needed to spell out the kernel template instantiation
// below; they must match the argument types passed at the launch site.
typedef typename MatrixType::row_offsets_array_type::const_iterator RowIterator;
typedef typename MatrixType::column_indices_array_type::const_iterator ColumnIterator;
typedef typename MatrixType::values_array_type::const_iterator ValueIterator1;
typedef typename VectorType1::const_iterator ValueIterator2;
typedef typename VectorType2::iterator ValueIterator3;
// Hand-picked launch configuration: 8 threads cooperate per row-vector, so a
// 128-thread block covers 16 rows (VECTORS_PER_BLOCK). These are compile-time
// template arguments of the kernel, not runtime launch parameters.
const size_t THREADS_PER_BLOCK = 128;
const size_t THREADS_PER_VECTOR = 8;
const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
// Cap the grid at the occupancy limit for this exact kernel instantiation
// (last argument: 0 bytes of dynamic shared memory).
const size_t MAX_BLOCKS = cusp::system::cuda::detail::max_active_blocks(
spmv_csr_vector_kernel<RowIterator, ColumnIterator, ValueIterator1, ValueIterator2, ValueIterator3,
UnaryFunction, BinaryFunction1, BinaryFunction2,
VECTORS_PER_BLOCK, THREADS_PER_VECTOR>, THREADS_PER_BLOCK, (size_t) 0);
// Enough blocks to cover all rows, but never more than the occupancy cap.
const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, VECTORS_PER_BLOCK));
std::cout << "Calling my multiply with " << THREADS_PER_BLOCK << " threads and "
<< THREADS_PER_VECTOR << " threads-per-vector" << std::endl;
std::cout << "Mapping yielded " << NUM_BLOCKS << " blocks" << std::endl;
// Launch on the stream carried inside the execution policy.
// NOTE(review): no cudaGetLastError()/sync after the launch — launch-config
// errors will only surface at the next synchronizing CUDA call. Also note
// NUM_BLOCKS == 0 for an empty matrix would be an invalid launch; callers
// are expected to pass a non-empty A.
cudaStream_t s = stream(thrust::detail::derived_cast(exec));
spmv_csr_vector_kernel<RowIterator, ColumnIterator, ValueIterator1, ValueIterator2, ValueIterator3,
UnaryFunction, BinaryFunction1, BinaryFunction2,
VECTORS_PER_BLOCK, THREADS_PER_VECTOR> <<<NUM_BLOCKS, THREADS_PER_BLOCK, 0, s>>>
(A.num_rows, A.row_offsets.begin(), A.column_indices.begin(), A.values.begin(), x.begin(), y.begin(),
initialize, combine, reduce);
}
// Driver: builds a 1000x1000 5-point Poisson matrix on the device and runs
// the custom multiply through the my_policy execution-policy tag.
int main(int argc, char** argv)
{
    typedef int IndexType;
    typedef double ValueType;
    typedef cusp::device_memory MemorySpace;

    cusp::csr_matrix<IndexType,ValueType,MemorySpace> A;

    if (argc == 1)
    {
        std::cout << "Using default matrix (5-pt Laplacian stencil)" << std::endl;
        cusp::gallery::poisson5pt(A, 1000, 1000);
    }
    else
    {
        // BUG FIX: the original left A empty when any argument was supplied,
        // so N == 0 and the kernel was launched with an invalid zero-block
        // grid. Command-line input is not supported; warn and fall back.
        std::cout << "Ignoring command-line arguments; "
                  << "using default matrix (5-pt Laplacian stencil)" << std::endl;
        cusp::gallery::poisson5pt(A, 1000, 1000);
    }

    const size_t N = A.num_rows;

    // x: input vector (all zeros), b: output vector (initialized to ones,
    // overwritten by the multiply).
    cusp::array1d<ValueType, MemorySpace> x(N,0);
    cusp::array1d<ValueType, MemorySpace> b(N,1);

    // Tag dispatch routes this call to the custom multiply() overload above.
    my_policy exec;
    cusp::multiply(exec, A, x, b);

    return 0;
}