Hey guys,
I'm doing some experiments with reduction kernels and comparing the resulting runtimes with the runtime of cudppReduce.
However, I've encountered an interesting problem which I couldn't solve by myself. Maybe one of you has a solution for me.
The problem is that, in the case of cudppReduce running on a Tesla C2075, I get the same runtime — approx. 0.0001 seconds — independent of the element data type and the number
of elements (> 1000000). When I run the same binary on a GeForce GTX 480, which is installed in the same system and is at the same time used for display management, the
resulting runtimes differ depending on the data type and the number of elements to be processed, as expected. I've executed the same experiments on a laptop equipped with a
GeForce GTX 460M as well and got the same behavior as with the Tesla C2075 card. My own kernels behave as expected: when I increase the number of elements, the resulting
runtime increases as well. I have looked at the code of the cudppReduce function
and it looks the same as my own kernel, so the difference in runtime behavior must come from somewhere else — maybe some optimization
by the compiler or something deeper in the CUDPP library, I don't know. I hope you can help me with this problem.
Here is the code of my cudpp implementation:
// Returns the current wall-clock time in seconds with microsecond
// resolution, based on gettimeofday().
double tstamp()
{
    timeval now;
    gettimeofday(&now, NULL);
    double seconds = static_cast<double>(now.tv_sec);
    double micros  = static_cast<double>(now.tv_usec);
    return seconds + micros * 1e-6;
}
void ReduceDataUsingCUDPP(double* pSource_p,double* pTarget_p,double InitValue_p,int ElemCount_p)
{
CUDPPConfiguration config;
CUDPPHandle cudppLibHandle = 0;
CUDPPHandle reducePlan = 0;
cudppCreate(&cudppLibHandle);
config.algorithm = CUDPP_REDUCE;
config.datatype = CUDPP_DOUBLE;
config.op = CUDPP_ADD;
config.options = CUDPP_OPTION_FORWARD;
if(cudppPlan(cudppLibHandle,&reducePlan,config,ElemCount_p,1,0) != CUDPP_SUCCESS)
{
std::cout << "ERROR IN BUILDING A REDUCE PLAN";
exit(1);
}
if(cudppReduce(reducePlan,pTarget_p,pSource_p,ElemCount_p) != CUDPP_SUCCESS)
{
std::cout << "CUDPP REDUCTION FAILED";
exit(1);
}
cudppDestroy(reducePlan);
cudppDestroy(cudppLibHandle);
}
// Runs TEST_RUN_COUNT reductions of ElemCount_p elements on both the CPU
// (CPUComputeReduction) and the GPU (ReduceDataUsingCUDPP), cross-checks
// the results and prints the average runtimes of both paths.
// NOTE(review): the GPU interval also includes CUDPP plan creation and
// destruction, so it overstates the pure kernel time; move the plan out of
// the timed region for a fair comparison.
template < typename DATA_T >
void ComputeReductionUsingCUDPP(int ElemCount_p)
{
    std::vector< DATA_T > hostSource(ElemCount_p,0);
    double t0,t1;
    // BUG FIX: use double accumulators -- the original float locals silently
    // narrowed the double timestamps and lost precision when accumulating.
    double locCPUTime,globCPUTime;
    double locGPUTime,globGPUTime;
    DATA_T* pDevSource;
    DATA_T* pDevTarget;
    DATA_T hostResult = DATA_T(0);
    DATA_T devResult;
    int correctResults = 0;

    globCPUTime = 0.0;
    globGPUTime = 0.0;

    cudaMalloc((void**)&pDevSource,sizeof(DATA_T) * ElemCount_p);
    cudaMalloc((void**)&pDevTarget,sizeof(DATA_T));

    for(int i = 0; i < TEST_RUN_COUNT; ++i)
    {
        InitVector(hostSource,false);
        cudaMemcpy(pDevSource,&hostSource[0],sizeof(DATA_T) * ElemCount_p,cudaMemcpyHostToDevice);
        cudaMemset(pDevTarget,0,sizeof(DATA_T));

        // NOTE(review): this interval is only meaningful if the reduction
        // synchronizes the device before returning; a bare kernel launch is
        // asynchronous and would make t1-t0 measure launch overhead only.
        t0 = tstamp();
        ReduceDataUsingCUDPP(pDevSource,pDevTarget,DATA_T(0),ElemCount_p);
        t1 = tstamp();
        locGPUTime = t1 - t0;

        t0 = tstamp();
        hostResult = CPUComputeReduction< DATA_T,ADD >(hostSource);
        t1 = tstamp();
        locCPUTime = t1 - t0;

        // Blocking copy of the single-element result back to the host.
        cudaMemcpy(&devResult,pDevTarget,sizeof(DATA_T),cudaMemcpyDeviceToHost);

        // BUG FIX: the GPU reduces in a different order than the sequential
        // CPU loop, so floating-point results are not bit-identical in
        // general -- compare with a relative tolerance instead of ==.
        double diff = double(hostResult) - double(devResult);
        if(diff < 0.0) diff = -diff;
        double ref = double(hostResult);
        if(ref < 0.0) ref = -ref;
        if(diff <= 1e-6 * (ref + 1.0))
            ++correctResults;

        globCPUTime += locCPUTime;
        globGPUTime += locGPUTime;
    }

    std::cout << "-------------------------- CUDPP RESULTS ----------------------------- " << std::endl;
    std::cout << correctResults << " OF " << TEST_RUN_COUNT << std::endl;
    if(correctResults == TEST_RUN_COUNT) std::cout << "ALL RESULTS ARE CORRECT" << std::endl;
    std::cout << "HOST TIME: " << ((globCPUTime) / TEST_RUN_COUNT) << " SEC" << std::endl;
    std::cout << "DEVICE TIME: " << ((globGPUTime) / TEST_RUN_COUNT) << " SEC" << std::endl;
    std::cout << "-------------------------- CUDPP RESULTS ----------------------------- " << std::endl;
    std::cout << std::endl << std::endl;

    cudaFree(pDevSource);
    // BUG FIX: pDevTarget was leaked in the original -- only pDevSource was freed.
    cudaFree(pDevTarget);
}
// Entry point: reads a type selector and an element count (from argv or
// interactively) and runs the CUDPP reduction benchmark on device 0.
int main(int argc,char* argv[])
{
    int type,elemCount;

    cudaSetDevice(0);

    // Parameters come from the command line (argv[1] = type,
    // argv[2] = element count) or are prompted for interactively.
    if(argc > 2)
    {
        type = atoi(argv[1]);
        elemCount = atoi(argv[2]);
    }
    else
    {
        std::cout << "TYPE: ";
        std::cin >> type;
        std::cout << "ELEMENT COUNT: ";
        std::cin >> elemCount;
    }

    // BUG FIX: validate the count -- atoi() returns 0 on unparsable input,
    // and a non-positive count would be passed straight into the buffer
    // allocations downstream.
    if(elemCount <= 0)
    {
        std::cout << "INVALID ELEMENT COUNT" << std::endl;
        return 1;
    }

    // NOTE(review): 'type' is read but never used -- this snippet always
    // instantiates the double variant. Presumably the full program
    // dispatches on it; confirm against the complete source.
    (void)type;
    ComputeReductionUsingCUDPP< double >(elemCount);
    return 0;
}