/* Benchmark kernel: accumulate the dot product of a[0..511] and b[0..511]
 * into i (declared volatile in the initialization snippet below, so the
 * loop is not optimised away). */
i = 0;
for(j = 0; j < 512; j++) {
i += a[j] * b[j];
}
Initialization is done as follows:
/* i is volatile, presumably so the compiler cannot optimise the benchmark
 * loop above away entirely. */
volatile float i;
float a[512], b[512];
int j;
/* Fill both arrays with 0.0, 0.1, 0.2, ... (double expression j * 0.1,
 * truncated to float on assignment). */
for(j = 0; j < 512; j++) {
a[j] = j * 0.1;
b[j] = j * 0.1;
}
Thanks
Dirk
Here's a simple NEON version, unrolled 8 times:
/*
 * NEON dot product of a[0..n-1] and b[0..n-1], processing 8 floats per
 * iteration in two parallel quad-register accumulators (q8, q9).
 *
 * Preconditions: n must be a positive multiple of 8, and both arrays
 * must hold at least n floats -- the loop body executes at least once
 * (subs/bgt is a do-while), so n == 0 or a non-multiple of 8 reads
 * past the end of the arrays.
 *
 * Fixes over the original: the clobber list now includes "cc" (subs
 * writes the condition flags read by bgt) and "memory" (the asm reads
 * *a and *b, which the register-only constraints do not tell the
 * compiler about -- without it, stores to the arrays could legally be
 * reordered past or eliminated around the asm).
 */
float
vmac_neon(const float *a, const float *b, unsigned n)
{
    float s = 0;
    asm ("vmov.f32 q8, #0.0            \n\t"  /* accumulator 0 = 0 */
         "vmov.f32 q9, #0.0            \n\t"  /* accumulator 1 = 0 */
         "1:                           \n\t"
         "subs %3, %3, #8              \n\t"  /* n -= 8; sets flags for bgt */
         "vld1.32 {d0,d1,d2,d3}, [%1]! \n\t"  /* load 8 floats from a, post-inc */
         "vld1.32 {d4,d5,d6,d7}, [%2]! \n\t"  /* load 8 floats from b, post-inc */
         "vmla.f32 q8, q0, q2          \n\t"  /* q8 += a[0..3] * b[0..3] */
         "vmla.f32 q9, q1, q3          \n\t"  /* q9 += a[4..7] * b[4..7] */
         "bgt 1b                       \n\t"
         "vadd.f32 q8, q8, q9          \n\t"  /* combine the two accumulators */
         "vpadd.f32 d0, d16, d17       \n\t"  /* pairwise add: 4 lanes -> 2 */
         "vadd.f32 %0, s0, s1          \n\t"  /* final horizontal add -> s */
         : "=w"(s), "+r"(a), "+r"(b), "+r"(n)
         :
         : "q0", "q1", "q2", "q3", "q8", "q9", "cc", "memory");
    return s;
}
For comparison, I used this C function:
/* Reference C dot product of a[0..n-1] and b[0..n-1]; accumulates in
 * sequential order, for comparison against the NEON version. */
float
vmac_c(const float *a, const float *b, unsigned n)
{
    float sum = 0;
    const float *end = a + n;

    while (a < end)
        sum += *a++ * *b++;
    return sum;
}
Using gcc csl 2007q3 with flags -O3 -fomit-frame-pointer
-mfloat-abi=softfp -mfpu=neon -mcpu=cortex-a8 -ftree-vectorize
-ffast-math, the NEON version is about twice as fast as the C version.
Dropping -ffast-math makes the C version about 7 times slower, and
gives a slightly different result (differing in the 8th decimal
digit).
I was surprised to see that gcc actually managed to vectorise the code
a bit, even if hand-crafted assembler easily outperforms it.
--
Måns Rullgård
ma...@mansr.com