/* multiply-accumulate: sum of a[j] * b[j] over 512 elements */
i = 0;
for (j = 0; j < 512; j++) {
    i += a[j] * b[j];
}
Initialization is done by:
volatile float i;
float a[512], b[512];
int j;
for(j = 0; j < 512; j++) {
    a[j] = j * 0.1;
    b[j] = j * 0.1;
}
Thanks
Dirk
Here's a simple NEON version, unrolled 8 times:
float
vmac_neon(const float *a, const float *b, unsigned n)
{
    float s = 0;
    asm ("vmov.f32  q8, #0.0                  \n\t"
         "vmov.f32  q9, #0.0                  \n\t"
         "1:                                  \n\t"
         "subs      %3, %3, #8                \n\t"
         "vld1.32   {d0,d1,d2,d3}, [%1]!      \n\t"
         "vld1.32   {d4,d5,d6,d7}, [%2]!      \n\t"
         "vmla.f32  q8, q0, q2                \n\t"
         "vmla.f32  q9, q1, q3                \n\t"
         "bgt       1b                        \n\t"
         "vadd.f32  q8, q8, q9                \n\t"
         "vpadd.f32 d0, d16, d17              \n\t"
         "vadd.f32  %0, s0, s1                \n\t"
         : "=w"(s), "+r"(a), "+r"(b), "+r"(n)
         :: "q0", "q1", "q2", "q3", "q8", "q9");
    return s;
}
For comparison, I used this C function:
float
vmac_c(const float *a, const float *b, unsigned n)
{
    float s = 0;
    unsigned i;
    for(i = 0; i < n; i++) {
        s += a[i] * b[i];
    }
    return s;
}
Using the CodeSourcery (CSL) gcc 2007q3 release with the flags -O3
-fomit-frame-pointer -mfloat-abi=softfp -mfpu=neon -mcpu=cortex-a8
-ftree-vectorize -ffast-math, the NEON version is about twice as fast
as the C version.
Dropping -ffast-math makes the C version about 7 times slower and
gives a slightly different result (differing in the 8th decimal
digit): without -ffast-math gcc may not reorder the floating-point
additions, so the reduction stays scalar, and the reordered sums also
round differently.
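
For what it's worth, a trivial harness along these lines can be used
to time the two functions. This is only a sketch (the 100000-call
loop and clock_gettime() timing are arbitrary choices, not what
produced the numbers above); link with -lrt on older glibc.

#include <stdio.h>
#include <time.h>

float vmac_neon(const float *a, const float *b, unsigned n);
float vmac_c(const float *a, const float *b, unsigned n);

static long long
ns_diff(struct timespec t0, struct timespec t1)
{
    return (t1.tv_sec - t0.tv_sec) * 1000000000LL
         + (t1.tv_nsec - t0.tv_nsec);
}

int
main(void)
{
    static float a[512], b[512];
    struct timespec t0, t1;
    volatile float s = 0;
    int j, iter;

    /* same initialization as in the original question */
    for (j = 0; j < 512; j++) {
        a[j] = j * 0.1;
        b[j] = j * 0.1;
    }

    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (iter = 0; iter < 100000; iter++)
        s = vmac_c(a, b, 512);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    printf("C:    %f  %lld ns/call\n", s, ns_diff(t0, t1) / 100000);

    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (iter = 0; iter < 100000; iter++)
        s = vmac_neon(a, b, 512);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    printf("NEON: %f  %lld ns/call\n", s, ns_diff(t0, t1) / 100000);

    return 0;
}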
I was surprised to see that gcc actually managed to vectorise the
code a bit, even though hand-crafted assembler easily outperforms it.
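
Roughly the same unrolled multiply-accumulate can also be written
with NEON intrinsics from arm_neon.h instead of raw assembler. The
sketch below is only an illustration (the name vmac_intrin and the
two-accumulator layout simply mirror the asm version above); gcc's
own vectorised output will not necessarily look like this.

#include <arm_neon.h>

float
vmac_intrin(const float *a, const float *b, unsigned n)
{
    float32x4_t acc0 = vdupq_n_f32(0);
    float32x4_t acc1 = vdupq_n_f32(0);
    float32x2_t sum;
    unsigned i;

    /* n is assumed to be a multiple of 8, as in vmac_neon() */
    for (i = 0; i < n; i += 8) {
        acc0 = vmlaq_f32(acc0, vld1q_f32(a + i),     vld1q_f32(b + i));
        acc1 = vmlaq_f32(acc1, vld1q_f32(a + i + 4), vld1q_f32(b + i + 4));
    }

    /* horizontal sum of the two accumulators */
    acc0 = vaddq_f32(acc0, acc1);
    sum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
    sum = vpadd_f32(sum, sum);
    return vget_lane_f32(sum, 0);
}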
-- 
Måns Rullgård
ma...@mansr.com