// Variant of myStruct whose first member is a two-component float vector,
// followed by four-component float vectors and a four-component int vector.
// NOTE(review): a two-component member placed directly before four-component
// vectors may leave those vectors at an offset that is not a multiple of their
// own size - confirm how the host and device layouts line up.
struct myStruct
{
// Two-component float vector.
boost::compute::float2_ ab;
// Four-component float vectors.
boost::compute::float4_ c;
boost::compute::float4_ d;
//Here some more float4_ members
// Four-component int vector.
boost::compute::int4_ e;
};
// Variant of myStruct whose first member is a two-component int vector,
// followed by four-component float vectors and a four-component int vector.
// NOTE(review): a two-component member placed directly before four-component
// vectors may leave those vectors at an offset that is not a multiple of their
// own size - confirm how the host and device layouts line up.
struct myStruct
{
// Two-component int vector.
boost::compute::int2_ ab;
// Four-component float vectors.
boost::compute::float4_ c;
boost::compute::float4_ d;
//Here some more float4_ members
// Four-component int vector.
boost::compute::int4_ e;
};
// Test record: an int pair followed by two four-component float vectors.
// The member order is id_attributes, c, d.
struct myStruct
{
    // Holds (10 + x, x) for the x passed to the constructor.
    boost::compute::int2_ id_attributes;
    // Initialised to (0, 0, 0, 1e7).
    boost::compute::float4_ c;
    // Initialised to (10, 10, 10, 1e4).
    boost::compute::float4_ d;

    // Builds a record from a single int; every other value is a fixed constant.
    myStruct(int x)
        : id_attributes(10 + x, x)
        , c(0.0f, 0.0f, 0.0f, 1.0e7f)
        , d(10.0f, 10.0f, 10.0f, 1.0e4f)
    {
    }
};
// Host-side container of myStruct records.
typedef std::vector<myStruct> HostData;
// Registers myStruct (members id_attributes, c, d) with Boost.Compute under
// the OpenCL type name "myStruct".
BOOST_COMPUTE_ADAPT_STRUCT(myStruct, myStruct, (id_attributes, c, d))
// Device-side container of myStruct records.
typedef boost::compute::vector<myStruct> GPUData;
// Expands to the type definition string for myStruct followed by a newline.
// The expansion is parenthesized so it behaves as a single operand wherever
// the macro is used, instead of depending on the precedence of neighbouring
// operators.
#define get_typedef (boost::compute::type_definition<myStruct>() + "\n")
// Bundles a device, a context and a command queue, each obtained from the
// corresponding boost::compute::system default accessor.
struct CContext {
    // Members are initialised in declaration order: device, context, queue.
    CContext()
        : device(boost::compute::system::default_device())
        , context(boost::compute::system::default_context())
        , queue(boost::compute::system::default_queue())
    {
    }

    boost::compute::device device;
    boost::compute::context context;
    boost::compute::command_queue queue;
};
void myStructKernel()
{
CContext context;
HostData data;
data.push_back(myStruct(1));
data.push_back(myStruct(2));
data.push_back(myStruct(3));
//Print A
GPUData gpudata(data.size(), context.context);
boost::compute::copy(data.begin(), data.end(), gpudata.begin(), context.queue);
//Print B
std::string source = BOOST_COMPUTE_STRINGIZE_SOURCE(
__kernel void custom_kernel(__global const myStruct *input, __global myStruct *output)
{
const uint i = get_global_id(0);
const __global myStruct *testy = &input[i];
output[i].d.xyz = testy->c.xyz;
float3 newval = testy->c.xyz - (float3)(5.f);
output[i].c.xyz = newval;
}
);
source = get_typedef + source;
boost::compute::program program = boost::compute::program::build_with_source(source, context.context);
boost::compute::kernel custom_kernel = program.create_kernel("custom_kernel");
custom_kernel.set_arg(0, gpudata);
custom_kernel.set_arg(1, gpudata);
context.queue.enqueue_1d_range_kernel(custom_kernel, 0, gpudata.size(), 1);
context.queue.finish();
//Print C
boost::compute::copy(gpudata.begin(), gpudata.end(), data.begin(), context.queue);
//Print D
}
//Print A (Host)
id/attributes: 11/1
c: 0/0/0/1e+007
d: 0/0/0/1e+007
--------------------------------
id/attributes: 12/2
c: 0/0/0/1e+007
d: 0/0/0/1e+007
--------------------------------
id/attributes: 13/3
c: 0/0/0/1e+007
d: 0/0/0/1e+007
--------------------------------
//Print B (Device after Copy)
id/attributes: 11/1
c: 0/0/0/1e+007
d: 0/0/0/1e+007
--------------------------------
id/attributes: 12/2
c: 0/0/0/1e+007
d: 0/0/0/1e+007
--------------------------------
id/attributes: 13/3
c: 0/0/0/1e+007
d: 0/0/0/1e+007
--------------------------------
//Print C (Device after Kernel)
id/attributes: -1063256064/-1063256064
c: -5/0/1.54143e-044/1.4013e-045
d: -5/0/1.54143e-044/1.4013e-045
--------------------------------
id/attributes: 12/2
c: -5/-5/-5/1e+007
d: -5/-5/-5/1e+007
--------------------------------
id/attributes: -1063256064/-1063256064
c: -5/0/1.82169e-044/4.2039e-045
d: -5/0/1.82169e-044/4.2039e-045
--------------------------------
//Print D (Host after copy)
id/attributes: -1063256064/-1063256064
c: -5/0/1.54143e-044/1.4013e-045
d: -5/0/1.54143e-044/1.4013e-045
--------------------------------
id/attributes: 12/2
c: -5/-5/-5/1e+007
d: -5/-5/-5/1e+007
--------------------------------
id/attributes: -1063256064/-1063256064
c: -5/0/1.82169e-044/4.2039e-045
d: -5/0/1.82169e-044/4.2039e-045
--------------------------------
//Print A (Host)
id/attributes: 11/1
c: 0/0/0/1e+007
d: 0/0/0/1e+007
--------------------------------
id/attributes: 12/2
c: 0/0/0/1e+007
d: 0/0/0/1e+007
--------------------------------
id/attributes: 13/3
c: 0/0/0/1e+007
d: 0/0/0/1e+007
--------------------------------
//Print B (Device after Copy)
id/attributes: 11/1
c: 0/0/0/1e+007
d: 0/0/0/1e+007
--------------------------------
id/attributes: 12/2
c: 0/0/0/1e+007
d: 0/0/0/1e+007
--------------------------------
id/attributes: 13/3
c: 0/0/0/1e+007
d: 0/0/0/1e+007
--------------------------------
//Print C (Device after Kernel)
id/attributes: 11/1
c: -5/-5/-5/1e+007
d: -5/-5/-5/1e+007
--------------------------------
id/attributes: 12/2
c: -5/-5/-5/1e+007
d: -5/-5/-5/1e+007
--------------------------------
id/attributes: 13/3
c: -5/-5/-5/1e+007
d: -5/-5/-5/1e+007
--------------------------------
//Print D (Host after copy)
id/attributes: 11/1
c: -5/-5/-5/1e+007
d: -5/-5/-5/1e+007
--------------------------------
id/attributes: 12/2
c: -5/-5/-5/1e+007
d: -5/-5/-5/1e+007
--------------------------------
id/attributes: 13/3
c: -5/-5/-5/1e+007
d: -5/-5/-5/1e+007
--------------------------------
//Print A (Host)
id/attributes: 11/1
c: 0/0/0/1e+007
d: 10/10/10/10000
--------------------------------
id/attributes: 12/2
c: 0/0/0/1e+007
d: 10/10/10/10000
--------------------------------
id/attributes: 13/3
c: 0/0/0/1e+007
d: 10/10/10/10000
--------------------------------
//Print B (Device after Copy)
id/attributes: 11/1
c: 0/0/0/1e+007
d: 10/10/10/10000
--------------------------------
id/attributes: 12/2
c: 0/0/0/1e+007
d: 10/10/10/10000
--------------------------------
id/attributes: 13/3
c: 0/0/0/1e+007
d: 10/10/10/10000
--------------------------------
//Print C (Device after Kernel)
id/attributes: 11/1
c: -5/-5/-5/1e+007
d: 0/0/0/10000
--------------------------------
id/attributes: 12/2
c: -5/-5/-5/1e+007
d: 0/0/0/10000
--------------------------------
id/attributes: 13/3
c: -5/-5/-5/1e+007
d: 0/0/0/10000
--------------------------------
//Print D (Host after copy)
id/attributes: 11/1
c: -5/-5/-5/1e+007
d: 0/0/0/10000
--------------------------------
id/attributes: 12/2
c: -5/-5/-5/1e+007
d: 0/0/0/10000
--------------------------------
id/attributes: 13/3
c: -5/-5/-5/1e+007
d: 0/0/0/10000
--------------------------------
//Print A (Host)
id/attributes: 11/1
c: 0/0/0/1e+007
d: 10/10/10/10000
--------------------------------
id/attributes: 12/2
c: 0/0/0/1e+007
d: 10/10/10/10000
--------------------------------
id/attributes: 13/3
c: 0/0/0/1e+007
d: 10/10/10/10000
--------------------------------
//Print B (Device after Copy)
id/attributes: 11/1
c: 0/0/0/1e+007
d: 10/10/10/10000
--------------------------------
id/attributes: 12/2
c: 0/0/0/1e+007
d: 10/10/10/10000
--------------------------------
id/attributes: 13/3
c: 0/0/0/1e+007
d: 10/10/10/10000
--------------------------------
//Print C (Device after Kernel)
id/attributes: -1063256064/-1063256064
c: -5/0/1.54143e-044/1.4013e-045
d: 0/10/10/10000
--------------------------------
id/attributes: 12/2
c: -5/-5/-5/1e+007
d: 0/0/0/10000
--------------------------------
id/attributes: -1063256064/-1063256064
c: -5/0/1.82169e-044/4.2039e-045
d: 0/10/10/10000
--------------------------------
//Print D (Host after copy)
id/attributes: -1063256064/-1063256064
c: -5/0/1.54143e-044/1.4013e-045
d: 0/10/10/10000
--------------------------------
id/attributes: 12/2
c: -5/-5/-5/1e+007
d: 0/0/0/10000
--------------------------------
id/attributes: -1063256064/-1063256064
c: -5/0/1.82169e-044/4.2039e-045
d: 0/10/10/10000
--------------------------------
Thanks.
I installed the latest driver and the problem still persists. :-/ I will install the Intel SDK and see if that makes a difference!
And yes, currently I am simply trying out what is possible and what is not. I haven't worked with GPU-targeting code before (i.e. C99 and OpenCL), so this is all basic research to make some design choices later.
I guess you mean struct of arrays probably fits the vectorization pattern of GPUs better (and helps for easier parallelization). So you say it's valid to have a struct with boost::compute::vectors (or rather global pointers to the buffers)?
And I assume you mean that the private memory variables are faster to execute than picking them from the global array?
Cheers so far
OK. Let me know then.
I mean that when you use a normal, typical struct, then each time you want to use any value from that struct, you have to load the whole struct. You want to read 2 ints, that is 8 bytes, and you end up reading 16 bytes or whatever the size of your struct is, more than 8 bytes... or you can try reading only those 2 ints, but then you have strided memory access. Either way, your performance is damaged.
In this case, the struct-of-arrays technique would really be a struct of boost::compute::vectors (or boost::compute::buffers, whatever).
Yes. Read the value into private memory, use it, and save it later. Don't read the same value from a buffer twice if you really, really don't need to.
Am Freitag, 29. Juli 2016 11:41:58 UTC+2 schrieb Jakub Szuppe: OK. Let me know then.
With the Intel SDK it magically works! :)
I mean that when you use a normal, typical struct, then each time you want to use any value from that struct, you have to load the whole struct. You want to read 2 ints, that is 8 bytes, and you end up reading 16 bytes or whatever the size of your struct is, more than 8 bytes... or you can try reading only those 2 ints, but then you have strided memory access. Either way, your performance is damaged.
Ah, I see. Sure thing. Totally following Einstein's "You gotta do things as simple as possible. But not simpler".
In this case, the struct-of-arrays technique would really be a struct of boost::compute::vectors (or boost::compute::buffers, whatever).
Excellent. Cheers for that info.
Yes. Read the value into private memory, use it, and save it later. Don't read the same value from a buffer twice if you really, really don't need to.
Will do!
Thanks for all your input, Jakub. This is all really helpful. I will probably have more questions in the next few weeks, but currently it seems I will be able to use Boost.Compute for my commercial projects in the future.