Hi!
I was trying to speed up solve-field performance on a Cortex-A53 ARM processor and came across architecture-dependent SIMD/vector code in the ctmf_helper function.
Variants exist for SSE2, MMX and ALTIVEC, but there is nothing for ARM.
I looked up the arm64 NEON docs and added a similar section, and it sped up image2xy by 20% and my solve-field runs by 10%. I don't know enough to say whether it is always safe,
or whether it needs other changes (alignment in malloc etc.), but it works for my case.
--------------------------------------
#include <altivec.h>
+#elif defined(USE_ARM64)
+#include <arm_neon.h>
#endif
--------------
+#elif defined(USE_ARM64)
+static inline void histogram_add( const uint16_t x[16], uint16_t y[16] )
+{
+ *(uint16x4_t*) &y[0] = vadd_u16( *(uint16x4_t*) &y[0], *(uint16x4_t*) &x[0] );
+ *(uint16x4_t*) &y[4] = vadd_u16( *(uint16x4_t*) &y[4], *(uint16x4_t*) &x[4] );
+ *(uint16x4_t*) &y[8] = vadd_u16( *(uint16x4_t*) &y[8], *(uint16x4_t*) &x[8] );
+ *(uint16x4_t*) &y[12] = vadd_u16( *(uint16x4_t*) &y[12], *(uint16x4_t*) &x[12] );
+}
#else
---------------------
+#elif defined(USE_ARM64)
+static inline void histogram_sub( const uint16_t x[16], uint16_t y[16] )
+{
+ *(uint16x4_t*) &y[0] = vsub_u16( *(uint16x4_t*) &y[0], *(uint16x4_t*) &x[0] );
+ *(uint16x4_t*) &y[4] = vsub_u16( *(uint16x4_t*) &y[4], *(uint16x4_t*) &x[4] );
+ *(uint16x4_t*) &y[8] = vsub_u16( *(uint16x4_t*) &y[8], *(uint16x4_t*) &x[8] );
+ *(uint16x4_t*) &y[12] = vsub_u16( *(uint16x4_t*) &y[12], *(uint16x4_t*) &x[12] );
+}
#else
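On the alignment question: if the pointer casts above turn out to be a problem, an alternative sketch (not part of the patch above, and not benchmarked) would be to use explicit vld1q_u16 / vst1q_u16 loads and stores on 128-bit vectors, since those intrinsics make no alignment assumptions about the uint16_t arrays:
--------------
#include <arm_neon.h>
#include <stdint.h>

/* Same 16-bin histogram update, but with explicit loads/stores
 * instead of casting the uint16_t arrays to vector pointers. */
static inline void histogram_add( const uint16_t x[16], uint16_t y[16] )
{
    vst1q_u16( &y[0], vaddq_u16( vld1q_u16( &y[0] ), vld1q_u16( &x[0] ) ) );
    vst1q_u16( &y[8], vaddq_u16( vld1q_u16( &y[8] ), vld1q_u16( &x[8] ) ) );
}

static inline void histogram_sub( const uint16_t x[16], uint16_t y[16] )
{
    vst1q_u16( &y[0], vsubq_u16( vld1q_u16( &y[0] ), vld1q_u16( &x[0] ) ) );
    vst1q_u16( &y[8], vsubq_u16( vld1q_u16( &y[8] ), vld1q_u16( &x[8] ) ) );
}
--------------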
Thought I would pass it along, and ask whether any other performance changes or compiler flags are known for ARM platforms.