#include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh> // A stateful callback functor that maintains a running prefix to be applied
// during consecutive scan operations.
struct BlockPrefixCallbackOp
{
// Running prefix
int running_total;
// Constructor
__device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
// Callback operator to be entered by the first warp of threads in the block.
// Thread-0 is responsible for returning a value for seeding the block-wide scan.
__device__ int operator()(int block_aggregate)
{
int old_prefix = running_total;
running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
return old_prefix;
}
};
__global__ void ExampleKernel(int *d_data, int num_items, ...)
{
// Specialize BlockScan for a 1D block of 128 threads
// Allocate shared memory for BlockScan
__shared__ typename BlockScan::TempStorage temp_storage;
// Initialize running total
BlockPrefixCallbackOp prefix_op(INT_MIN);
// Have the block iterate over segments of items
for (int block_offset = 0; block_offset < num_items; block_offset += 128)
{
// Load a segment of consecutive items that are blocked across threads
int thread_data = d_data[block_offset];
// Collectively compute the block-wide exclusive prefix max scan
thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); __syncthreads();
// Store scanned items to output segment
d_data[block_offset] = thread_data;
}