The average runtime of the following kernel with all allocation, copies, and casts is 4ms:
// Scales each of the three uint16 components of a column by a factor looked
// up from `table`, keyed by the sum of the three components.
//
// Launch layout: 1D grid, one thread per column; launch at least
// ceil(N / blockDim.x) blocks — the tail is bounds-checked here.
//
// Preconditions:
//  - `A` and `A_prime` each hold 3*N contiguous uint16 values (3 per column).
//  - `table[V]` is read with V = sum of a column's three components, which can
//    reach 3*65535. NOTE(review): the host code only allocates 2048 table
//    entries, so callers must guarantee the input values keep V in range —
//    confirm this invariant; it is not enforced here.
__global__ void testKernel(
    const uint16_t* __restrict__ table,   // read-only lookup table
    const uint16_t* __restrict__ A,       // input, 3 values per column
    uint16_t* __restrict__ A_prime,       // output, 3 values per column
    uint32_t N)                           // number of columns
{
    uint32_t column = blockDim.x * blockIdx.x + threadIdx.x;
    if (column >= N)
    {
        return;
    }
    // Each column occupies 3 consecutive uint16 elements.
    const uint32_t base = 3u * column;
    const uint32_t V = A[base] + A[base + 1] + A[base + 2];
    const uint32_t V_prime = table[V];
    // Products are computed in uint32 and truncated to uint16 on store,
    // matching the original behavior (wrap-around on overflow).
    A_prime[base]     = static_cast<uint16_t>(A[base]     * V_prime);
    A_prime[base + 1] = static_cast<uint16_t>(A[base + 1] * V_prime);
    A_prime[base + 2] = static_cast<uint16_t>(A[base + 2] * V_prime);
}
// Host driver: uploads the (zero-initialized) input and lookup table, runs
// testKernel once over `columns` columns, and copies the result back.
// Every CUDA call is checked; on the first failure the function logs the
// error, frees whatever was allocated, and returns.
// NOTE: device buffers are allocated and freed on every call — for repeated
// timing runs, hoisting the allocations out would remove cudaMalloc overhead
// from the measurement.
void test()
{
    static constexpr size_t columns = 3145728;
    // Static so repeated calls reuse the same host buffers instead of
    // reallocating ~18 MiB per iteration.
    static std::vector<uint16_t> input(3 * columns);
    static std::vector<uint16_t> output(3 * columns);
    static std::vector<uint16_t> table(2048);

    uint16_t* d_input = nullptr;
    uint16_t* d_output = nullptr;
    uint16_t* d_table = nullptr;

    // Log the failing call and release all device buffers
    // (cudaFree(nullptr) is a harmless no-op).
    auto bail = [&](const char* what, cudaError_t e) {
        std::cerr << what << ": " << cudaGetErrorString(e) << std::endl;
        cudaFree(d_input);
        cudaFree(d_output);
        cudaFree(d_table);
    };

    cudaError_t e;
    if ((e = cudaMalloc(&d_input, input.size() * sizeof(uint16_t))) != cudaSuccess) { bail("cudaMalloc(d_input)", e); return; }
    if ((e = cudaMalloc(&d_output, output.size() * sizeof(uint16_t))) != cudaSuccess) { bail("cudaMalloc(d_output)", e); return; }
    if ((e = cudaMalloc(&d_table, table.size() * sizeof(uint16_t))) != cudaSuccess) { bail("cudaMalloc(d_table)", e); return; }

    if ((e = cudaMemcpy(d_input, input.data(), input.size() * sizeof(uint16_t), cudaMemcpyHostToDevice)) != cudaSuccess) { bail("cudaMemcpy H2D input", e); return; }
    if ((e = cudaMemcpy(d_table, table.data(), table.size() * sizeof(uint16_t), cudaMemcpyHostToDevice)) != cudaSuccess) { bail("cudaMemcpy H2D table", e); return; }

    const int threadsPerBlock = 256;
    // Ceil-divide so a partial tail block is still launched when `columns`
    // is not a multiple of the block size (the kernel bounds-checks it).
    const int blocksPerGrid =
        static_cast<int>((columns + threadsPerBlock - 1) / threadsPerBlock);
    testKernel<<<blocksPerGrid, threadsPerBlock>>>(
        d_table, d_input, d_output, static_cast<uint32_t>(columns));

    // Launch-configuration errors surface via cudaGetLastError();
    // in-kernel execution faults surface at the synchronize.
    if ((e = cudaGetLastError()) != cudaSuccess) { bail("kernel launch", e); return; }
    if ((e = cudaDeviceSynchronize()) != cudaSuccess) { bail("kernel execution", e); return; }

    if ((e = cudaMemcpy(output.data(), d_output, output.size() * sizeof(uint16_t), cudaMemcpyDeviceToHost)) != cudaSuccess) { bail("cudaMemcpy D2H output", e); return; }

    cudaFree(d_input);
    cudaFree(d_output);
    cudaFree(d_table);
}
int main(int argc, char * argv[])
{
for (int i = 0; i < 100; ++i)
{
QElapsedTimer timer;
timer.start();
test();
std::cout << timer.elapsed() << std::endl;
}
return 0;