Hi Isaac. I'm happy to send you my code. It is the sample code that
you provide in the repository. I'm copying it here so that other users
can see it too if they want. The code is:
#include <gmac/cuda.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#define VECTOR_SIZE 1024
#define BLOCK_SIZE 256
/*
 * Element-wise vector addition: c[i] = a[i] + b[i] for every i < size.
 *
 * Expected launch layout: a 1D grid of 1D blocks, one thread per element.
 * The grid may contain more threads than elements; the surplus threads
 * fall through the bounds check and do nothing.
 *
 * c    - output vector, written at indices [0, size)
 * a, b - input vectors, read at indices [0, size)
 * size - number of elements to process
 */
__global__ void vecAdd(float *c, const float *a, const float *b,
                       size_t size)
{
    /* Widen to size_t before multiplying: the index is then never a signed
     * int that could wrap or go negative for very large vectors, and the
     * comparison against `size` is between values of the same type. */
    size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size) {
        c[i] = a[i] + b[i];
    }
}
/* Print a diagnostic and terminate the program when a required step failed.
 * Used instead of assert() so that the checked call is still executed when
 * the file is compiled with NDEBUG (assert() would remove the call itself). */
static void vecAddRequire(bool ok, const char *what)
{
    if (!ok) {
        fprintf(stderr, "vecAdd sample: %s failed\n", what);
        exit(EXIT_FAILURE);
    }
}

/* Return true when `got` equals `want` within a small relative tolerance.
 * Bit-exact equality between host and device float results is not assumed.
 * A NaN on either side compares as "not equal". */
static bool vecAddNearlyEqual(float got, float want)
{
    float diff = got - want;
    if (diff < 0.0f) diff = -diff;
    float scale = (want < 0.0f) ? -want : want;
    return diff <= 1e-5f * scale + 1e-5f;
}

/*
 * Sample driver: allocates three vectors with gmacMalloc, fills the two
 * inputs with random values, launches vecAdd on them, waits for completion
 * and verifies the output on the host.
 *
 * Returns EXIT_SUCCESS when every element matches, EXIT_FAILURE otherwise
 * (including when any GMAC call reports an error).
 */
int main(int argc, char * argv[])
{
    (void)argc;
    (void)argv;

    const size_t bytes = VECTOR_SIZE * sizeof(float);
    float *a = NULL, *b = NULL, *c = NULL;

    /* 1- Allocate the input vectors */
    vecAddRequire(gmacMalloc((void **)&a, bytes) == gmacSuccess,
                  "gmacMalloc(a)");
    vecAddRequire(gmacMalloc((void **)&b, bytes) == gmacSuccess,
                  "gmacMalloc(b)");

    /* 2- Initialize the input vectors */
    for (int i = 0; i < VECTOR_SIZE; i++) {
        a[i] = 1.0f * rand();
        b[i] = 1.0f * rand();
    }

    /* 3- Allocate the output vector */
    vecAddRequire(gmacMalloc((void **)&c, bytes) == gmacSuccess,
                  "gmacMalloc(c)");

    /* 4- Invoke the kernel (grid size rounded up so every element is covered) */
    dim3 block(BLOCK_SIZE);
    dim3 grid((VECTOR_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE);
    vecAdd<<<grid, block>>>(c, a, b, VECTOR_SIZE);

    /* 5- Wait for kernel completion */
    vecAddRequire(gmacThreadSynchronize() == gmacSuccess,
                  "gmacThreadSynchronize");

    /* Check the result: compare, do not assign. The previous
     * assert(c[i] = a[i] + b[i]) overwrote the output with the expected
     * value and therefore could never detect a wrong result. */
    int mismatches = 0;
    for (int i = 0; i < VECTOR_SIZE; i++) {
        const float expected = a[i] + b[i];
        if (!vecAddNearlyEqual(c[i], expected)) {
            /* Report only the first few mismatches to keep output readable. */
            if (mismatches < 10) {
                fprintf(stderr, "vecAdd sample: mismatch at %d: got %f, expected %f\n",
                        i, c[i], expected);
            }
            mismatches++;
        }
    }

    /* 6- Free shared structures */
    vecAddRequire(gmacFree(a) == gmacSuccess, "gmacFree(a)");
    vecAddRequire(gmacFree(b) == gmacSuccess, "gmacFree(b)");
    vecAddRequire(gmacFree(c) == gmacSuccess, "gmacFree(c)");

    return (mismatches == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}
I don't know whether this is something to consider, but the CUDA compute
capability of my graphics card is 1.1. I can also test the code at work,
where I have a C2050 card (Fermi).