[Bug target/43364] Suboptimal code for the use of ARM NEON intrinsic "vset_lane_f32"
siarhei dot siamashka at gmail dot com
gcc-bugzilla@gcc.gnu.org
Tue Jun 15 20:35:00 GMT 2010
------- Comment #4 from siarhei dot siamashka at gmail dot com 2010-06-15 20:34 -------
(In reply to comment #3)
> Or use multiple alternatives feature for inline assembly constraints to emit either VMOV or VLD1?
Well, this kind of works :) But it is very ugly and fragile:
/***************************************/
#include <arm_neon.h>
/* Override a slow 'vdup_n_f32' intrinsic with something better */
/*
 * Broadcast the scalar `x` into every lane of a 64-bit NEON vector.
 *
 * The asm statement offers the compiler two operand alternatives for `x`:
 *   - "r": `x` is in a core register  -> emit VDUP.32 from that register;
 *   - "Q": `x` is in memory addressed by a single base register
 *          -> emit VLD1 (load to all lanes) straight from that address.
 *
 * The asm template itself cannot choose an instruction per alternative, so
 * the choice is made at assembly time: the assembler loops over candidate
 * register names with `.irp` and string-compares each one against the text
 * the compiler substituted for %1.  The symbol
 * vdup_n_f32_fast_CODE_EMITTED records whether any comparison matched; if
 * none did, `.error` aborts the assembly instead of silently emitting
 * nothing.
 *
 * The candidate list contains both the numeric spellings (r0..r14) and the
 * alias spellings sl/fp/ip/sp/lr, because the compiler may print r10..r14
 * under those aliases; without them an operand placed in e.g. `ip` would
 * match nothing and trip the `.error`.
 *
 * The memory operand is matched in two textual forms, "[rN, #0]" and
 * "[rN]".  NOTE(review): which form is printed depends on the GCC version
 * (older releases are understood to print the "#0" form) -- confirm against
 * the compilers you target.  Only one form can match a given operand, so
 * exactly one instruction is emitted.
 *
 * This remains inherently fragile: it depends on the exact operand text the
 * compiler produces.
 */
static inline float32x2_t vdup_n_f32_fast(float x)
{
    float32x2_t result;
    asm (
        ".set vdup_n_f32_fast_CODE_EMITTED,0\n"
        ".irp regname,r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,sl,fp,ip,sp,lr\n"
        /* Register alternative: operand text is the bare register name. */
        ".ifeqs \"\\regname\", \"%1\"\n"
        " vdup.32 %P0, %1\n"
        " .set vdup_n_f32_fast_CODE_EMITTED,1\n"
        ".endif\n"
        /* Memory alternative, printed with an explicit zero offset. */
        ".ifeqs \"[\\regname, #0]\", \"%1\"\n"
        " vld1.f32 {%P0[]}, [\\regname, :32]\n"
        " .set vdup_n_f32_fast_CODE_EMITTED,1\n"
        ".endif\n"
        /* Memory alternative, printed as a bare base register. */
        ".ifeqs \"[\\regname]\", \"%1\"\n"
        " vld1.f32 {%P0[]}, [\\regname, :32]\n"
        " .set vdup_n_f32_fast_CODE_EMITTED,1\n"
        ".endif\n"
        ".endr\n"
        /* Fail loudly if the operand text matched none of the patterns. */
        ".if vdup_n_f32_fast_CODE_EMITTED == 0\n"
        ".error \"Fixme: icky macros from 'vdup_n_f32_fast' failed\"\n"
        ".endif\n"
        : "=w,w" (result) : "r,Q" (x) : "memory");
    return result;
}
/*
 * Redirect every later use of vdup_n_f32(...) in this file to
 * vdup_n_f32_fast(...).
 * NOTE(review): presumably <arm_neon.h> declares vdup_n_f32 as a function
 * rather than a macro, otherwise this would be a macro redefinition --
 * confirm for the toolchain in use.
 */
#define vdup_n_f32(x) vdup_n_f32_fast(x)
/* Now let's test it for accessing data in registers */
/*
 * Add two scalars through the NEON unit: each argument is broadcast into a
 * 64-bit vector with vdup_n_f32, the vectors are added lane-wise, and lane 0
 * of the sum is returned.
 */
float neon_add_regs(float a, float b)
{
    const float32x2_t lhs = vdup_n_f32(a);
    const float32x2_t rhs = vdup_n_f32(b);

    return vget_lane_f32(vadd_f32(lhs, rhs), 0);
}
/* ... and in memory */
/*
 * Memory-operand variant of the addition test: the floats pointed to by `a`
 * and `b` are each broadcast into a 64-bit vector with vdup_n_f32, the
 * vectors are added lane-wise, and lane 0 of the sum is stored to `*out`.
 */
void neon_add_mem(float * __restrict out,
                  float * __restrict a,
                  float * __restrict b)
{
    const float32x2_t lhs = vdup_n_f32(*a);
    const float32x2_t rhs = vdup_n_f32(*b);
    const float32x2_t sum = vadd_f32(lhs, rhs);

    *out = vget_lane_f32(sum, 0);
}
/***************************************/
$ objdump -d test.o
00000000 <neon_add_mem>:
0: f4e10c9f vld1.32 {d16[]}, [r1, :32]
4: f4e21c9f vld1.32 {d17[]}, [r2, :32]
8: f2400da1 vadd.f32 d16, d16, d17
c: f4c0080f vst1.32 {d16[0]}, [r0]
10: e12fff1e bx lr
00000014 <neon_add_regs>:
14: ee800b90 vdup.32 d16, r0
18: ee811b90 vdup.32 d17, r1
1c: f2400da1 vadd.f32 d16, d16, d17
20: ee100b90 vmov.32 r0, d16[0]
24: e12fff1e bx lr
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43364
More information about the Gcc-bugs
mailing list