We've been using ISPC to generate optimised implementations of various math routines to superb effect, typically beating our hand-written intrinsics versions by 5-10%. So, firstly, many thanks!
However, we've seen an odd code-generation pattern in the ARM NEON code generated by ISPC:
.text
.globl BinauralReverb_privProcessA4_ARM32HF_NEON_I32X4
.align 2
.type BinauralReverb_privProcessA4_ARM32HF_NEON_I32X4,%function
@ -----------------------------------------------------------------------
@ Compiler-generated ARM32 NEON listing (unmodified instruction stream).
@
@ What the visible code does:
@   1. q4 = vec[r1] * scalar + splat(scalar2) * vec[r2], stored to [r0].
@   2. For each of four 16-byte vectors at r3 + 0/64/128/192: multiply
@      element-wise by the vector q6, pairwise-add into s0/s1, call
@      add_f32, add lane k of q4 to the value left in s0, and store the
@      result to [r0 + 4*k].
@   3. r0 is set to 0 before returning.
@
@ Register aliases used below: q4 = d8:d9 = s16..s19, q5 = d10:d11,
@ q6 = d12:d13, q8 = d16:d17, q9 = d18:d19, s0/s1 = the two lanes of d0.
@
@ Stack: the two pushes below move sp down by 16 + 48 = 64 bytes, so
@ [sp, #64], [sp, #68] and [sp, #72] are the first three words above sp
@ as it was on entry -- presumably stack-passed pointer arguments
@ (TODO confirm against the caller / prototype, not shown here).
@
@ add_f32 is not shown here; from its use it presumably returns
@ s0 + s1 in s0 (TODO confirm).
@ -----------------------------------------------------------------------
BinauralReverb_privProcessA4_ARM32HF_NEON_I32X4: @ @BinauralReverb_privProcessA4_ARM32HF_NEON_I32X4
.fnstart
@ BB#0: @ %allocas
@ Save r4, r5, r11, lr (16 bytes) and d8-d13 (48 bytes). q4, q5 and q6
@ live in d8-d13 and are read again after the bl calls below.
push {r4, r5, r11, lr}
vpush {d8, d9, d10, d11, d12, d13}
@ r4 = incoming r0; every store in this function goes through r4.
mov r4, r0
@ r0 = word at [sp, #68]; it is dereferenced as a float pointer below.
ldr r0, [sp, #68]
@ q8 = 4 floats from [r1] (128-bit alignment hint).
vld1.64 {d16, d17}, [r1:128]
@ r5 = incoming r3; base address for the four row loads further down.
mov r5, r3
@ q9 = 4 floats from [r2].
vld1.64 {d18, d19}, [r2:128]
@ q5 = 0; used as the accumulator for the last row's multiply.
vmov.i32 q5, #0x0
@ s0 = float at [r0], duplicated across d0, then q4 = q8 * that scalar.
vldr s0, [r0]
ldr r0, [sp, #72]
vdup.32 d0, d0[0]
vmul.f32 q4, q8, d0[0]
@ q8 = the 32-bit value at [r0] (pointer from [sp, #72]) replicated
@ into all four lanes.
vld1.32 {d16[], d17[]}, [r0:32]
ldr r0, [sp, #64]
@ q4 += q8 * q9 (fused multiply-add).
vfma.f32 q4, q8, q9
@ q6 = 4 floats from the pointer at [sp, #64]; reused for all four rows.
vld1.64 {d12, d13}, [r0:128]
@ Row 0: q9 = 0 + vec[r5] * q6.
vmov.i32 q9, #0x0
vld1.64 {d16, d17}, [r5:128]
vfma.f32 q9, q8, q6
@ Store all four lanes of q4 to [r4]; each lane is overwritten below
@ once its row sum has been added.
vst1.64 {d8, d9}, [r4:128]
@ Pairwise add: s0 = d18[0] + d18[1], s1 = d19[0] + d19[1].
vpadd.f32 d0, d18, d19
@ Out-of-line call that is the subject of this report.
bl add_f32
@ Row 1 product (vec[r5 + 64] * q6) is computed before row 0's result
@ is consumed; q8/q9 are reloaded / re-zeroed after each call.
add r0, r5, #64
vmov.i32 q9, #0x0
vld1.64 {d16, d17}, [r0:128]
vfma.f32 q9, q8, q6
@ out[0] = s0 (left by add_f32) + lane 0 of q4.
vadd.f32 s0, s0, s16
vstr s0, [r4]
vpadd.f32 d0, d18, d19
bl add_f32
@ Row 2 product: vec[r5 + 128] * q6.
add r0, r5, #128
vmov.i32 q9, #0x0
vld1.64 {d16, d17}, [r0:128]
vfma.f32 q9, q8, q6
@ out[1] = s0 + lane 1 of q4.
vadd.f32 s0, s0, s17
vstr s0, [r4, #4]
vpadd.f32 d0, d18, d19
bl add_f32
add r0, r5, #192
@ out[2] = s0 + lane 2 of q4.
vadd.f32 s0, s0, s18
vstr s0, [r4, #8]
@ Row 3 product: vec[r5 + 192] * q6, accumulated into q5 (zeroed at
@ the top of the function and not written in between).
vld1.64 {d16, d17}, [r0:128]
vfma.f32 q5, q8, q6
vpadd.f32 d0, d10, d11
bl add_f32
@ out[3] = s0 + lane 3 of q4.
vadd.f32 s0, s0, s19
@ r0 = 0 on exit (presumably the function's return value).
mov r0, #0
vstr s0, [r4, #12]
@ Restore the registers saved on entry; popping pc returns to the caller.
vpop {d8, d9, d10, d11, d12, d13}
pop {r4, r5, r11, pc}
.Lfunc_end_0_6:
.size BinauralReverb_privProcessA4_ARM32HF_NEON_I32X4, .Lfunc_end_0_6-BinauralReverb_privProcessA4_ARM32HF_NEON_I32X4
.cantunwind
.fnend
Note the repeated "bl add_f32" calls. This is implemented by ISPC as:
There appears to be no good reason why a single vadd.f32 instruction isn't simply inlined here, rather than being reached through a branch-and-link call.
... and ...
... and the above function is indeed using reduce_add().
It looks like reduce_add() causes the NEON target's LLVM IR to generate a non-inlineable add_f32 function. Is there a good reason why this LLVM IR isn't marked alwaysinline?