/*******/ #include <arm_neon.h> void neon_add(float * __restrict out, float * __restrict a, float * __restrict b) { float32x2_t tmp1, tmp2; tmp1 = vset_lane_f32(*a, tmp1, 0); tmp2 = vset_lane_f32(*b, tmp2, 0); tmp1 = vadd_f32(tmp1, tmp2); *out = vget_lane_f32(tmp1, 0); } /*******/ 00000000 <neon_add>: 0: e5913000 ldr r3, [r1] 4: eddf0b07 vldr d16, [pc, #28] ; 28 <neon_add+0x28> 8: e5922000 ldr r2, [r2] c: eddf1b05 vldr d17, [pc, #20] ; 28 <neon_add+0x28> 10: ee003b90 vmov.32 d16[0], r3 14: ee012b90 vmov.32 d17[0], r2 18: f2400da1 vadd.f32 d16, d16, d17 1c: f4c0080f vst1.32 {d16[0]}, [r0] 20: e12fff1e bx lr 24: e1a00000 nop (mov r0,r0) gcc fails to use a single instruction vld1.32 {d16[0]}, [r1] instead of 0: e5913000 ldr r3, [r1] 4: eddf0b07 vldr d16, [pc, #28] ; 28 <neon_add+0x28> 10: ee003b90 vmov.32 d16[0], r3
though trunk of a recent vintage generates the following bit of code which is slightly better. There's no reason why we can't expand in a better manner and generate the vld1.f32. Marking as an enhancement. mov r3, #0 ldr r2, [r2, #0] @ float vdup.32 d16, r3 ldr r3, [r1, #0] @ float vmov d17, d16 @ v2sf vmov.32 d17[0], r2 vmov.32 d16[0], r3 vadd.f32 d16, d16, d17 vst1.32 {d16[0]}, [r0] bx lr
(In reply to comment #1) > mov r3, #0 > vdup.32 d16, r3 Also maybe "veor.32 d16, d16, d16" here? Or drop this NEON register initialization completely because it is a redundant operation and was not explicitly requested in the original C code? After all, from IHI0042D_aapcs.pdf: "The FPSCR is the only status register that may be accessed by conforming code. It is a global register with the following properties: * The condition code bits (28-31), the cumulative saturation (QC) bit (27) and the cumulative exception-status bits (0-4) are not preserved across a public interface." and from ARM ARM: "Advanced SIMD arithmetic always uses untrapped exception handling" Tracking the cumulative exception-status bits may be tricky in general (using ununitialized value for NEON arithmetics can set them arbitrarily), but as long as they are not used in any way in the function itself, they are irrelevant.
The whole point of submitting this PR was to find an efficient way to use NEON instructions to operate on any arbitrary scalar floating point values in order to overcome Cortex-A8 VFP Lite inherent slowness (maybe make it transparent via wrapping it into a C++ class and use operator overloading). Using 'vdup_n_f32' to load a single floating point value seems to be better than 'vset_lane_f32' here because we don't have to deal with uninitialized part of the register. But 'vdup_n_f32' suffers from the similar performance issues (VLD1 instruction is not used directly) and results in redundant instructions emitted when the value is loaded from memory. Optimistically, something like this should have been used instead of 'vdup_n_f32' in this case: static inline float32x2_t vdup_n_f32_mem(float *p) { float32x2_t result; asm ("vld1.f32 {%P0[]}, [%1, :32]" : "=w" (result) : "r" (p) : "memory"); return result; } If wonder if it is possible to check at compile time whether the operand comes from memory or from a register? Something similar to '__builtin_constant_p' builtin-function? Or use multiple alternatives feature for inline assembly constraints to emit either VMOV or VLD1? Anything else?
(In reply to comment #3) > Or use multiple alternatives feature for inline assembly constraints to emit either VMOV or VLD1? Well, this kind of works :) But is very ugly and fragile: /***************************************/ #include <arm_neon.h> /* Override a slow 'vdup_n_f32' intrinsic with something better */ static inline float32x2_t vdup_n_f32_fast(float x) { float32x2_t result; asm ( ".set vdup_n_f32_fast_CODE_EMITTED,0\n" ".irp regname,r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14\n" ".ifeqs \"\\regname\", \"%1\"\n" " vdup.32 %P0, %1\n" " .set vdup_n_f32_fast_CODE_EMITTED,1\n" ".endif\n" ".ifeqs \"[\\regname, #0]\", \"%1\"\n" " vld1.f32 {%P0[]}, [\\regname, :32]\n" " .set vdup_n_f32_fast_CODE_EMITTED,1\n" ".endif\n" ".endr\n" ".if vdup_n_f32_fast_CODE_EMITTED == 0\n" ".error \"Fixme: icky macros from 'vdup_n_f32_fast' failed\"\n" ".endif\n" : "=w,w" (result) : "r,Q" (x) : "memory"); return result; } #define vdup_n_f32(x) vdup_n_f32_fast(x) /* Now let's test it for accessing data in registers */ float neon_add_regs(float a, float b) { float32x2_t tmp1, tmp2; tmp1 = vdup_n_f32(a); tmp2 = vdup_n_f32(b); tmp1 = vadd_f32(tmp1, tmp2); return vget_lane_f32(tmp1, 0); } /* ... and in memory */ void neon_add_mem(float * __restrict out, float * __restrict a, float * __restrict b) { float32x2_t tmp1, tmp2; tmp1 = vdup_n_f32(*a); tmp2 = vdup_n_f32(*b); tmp1 = vadd_f32(tmp1, tmp2); *out = vget_lane_f32(tmp1, 0); } /***************************************/ $ objdump -d test.o 00000000 <neon_add_mem>: 0: f4e10c9f vld1.32 {d16[]}, [r1, :32] 4: f4e21c9f vld1.32 {d17[]}, [r2, :32] 8: f2400da1 vadd.f32 d16, d16, d17 c: f4c0080f vst1.32 {d16[0]}, [r0] 10: e12fff1e bx lr 00000014 <neon_add_regs>: 14: ee800b90 vdup.32 d16, r0 18: ee811b90 vdup.32 d17, r1 1c: f2400da1 vadd.f32 d16, d16, d17 20: ee100b90 vmov.32 r0, d16[0] 24: e12fff1e bx lr
This seems to have improved a lot. Thanks for your hard work. .cpu cortex-a8 .eabi_attribute 27, 3 .eabi_attribute 28, 1 .fpu neon .eabi_attribute 20, 1 .eabi_attribute 21, 1 .eabi_attribute 23, 3 .eabi_attribute 24, 1 .eabi_attribute 25, 1 .eabi_attribute 26, 1 .eabi_attribute 30, 2 .eabi_attribute 34, 1 .eabi_attribute 18, 4 .file "test.c" .text .align 2 .global neon_add .type neon_add, %function neon_add: @ args = 0, pretend = 0, frame = 0 @ frame_needed = 0, uses_anonymous_args = 0 @ link register save eliminated. vmov.f32 d16, #0.0 @ v2sf vmov d17, d16 @ v2sf vld1.32 {d16[0]}, [r1] vld1.32 {d17[0]}, [r2] vadd.f32 d16, d16, d17 vst1.32 {d16[0]}, [r0] bx lr .size neon_add, .-neon_add .ident "GCC: (GNU) 4.8.0 20121209 (experimental)"