[Bug target/43364] Suboptimal code for the use of ARM NEON intrinsic "vset_lane_f32"

Tue Jun 15 20:35:00 GMT 2010

------- Comment #4 from siarhei dot siamashka at gmail dot com  2010-06-15 20:34 -------
(In reply to comment #3)
> Or use multiple alternatives feature for inline assembly constraints to emit either VMOV or VLD1?

Well, this kind of works :) But it is very ugly and fragile:

/***************************************/
#include <arm_neon.h>

/* Override a slow 'vdup_n_f32' intrinsic with something better */

/*
 * Duplicate the scalar 'x' into a float32x2_t using hand-written inline asm.
 *
 * The asm statement offers the compiler two constraint alternatives for the
 * input operand (see the constraint strings at the end of the statement):
 *   - "r": 'x' is supplied in a register, handled by the "vdup.32" line;
 *   - "Q": 'x' is supplied as a memory operand, handled by the "vld1.f32" line.
 * The output uses "w" in both alternatives.
 *
 * Because a single asm template has to serve both alternatives, the template
 * is a small assembler-macro program: it compares the text that the compiler
 * substituted for %1 against every spelling it knows about and emits the
 * matching instruction. This relies entirely on how the compiler prints the
 * operand; if %1 is printed in any form not listed below (a register name
 * outside r0..r14, or a memory operand not spelled exactly "[rN, #0]"),
 * nothing matches and assembly stops with the .error message.
 *
 * Returns the vector written by whichever instruction was emitted.
 */
static inline float32x2_t vdup_n_f32_fast(float x)
{
    float32x2_t result;
    asm (
    /* Assembler-level flag: stays 0 until one of the two instructions is emitted. */
    ".set vdup_n_f32_fast_CODE_EMITTED,0\n"
    /* Repeat the body once per candidate register name r0..r14. */
    ".irp regname,r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14\n"
    /* Case 1: operand text is exactly the register name -> register form. */
    ".ifeqs \"\\regname\", \"%1\"\n"
    "    vdup.32 %P0, %1\n"
    "    .set vdup_n_f32_fast_CODE_EMITTED,1\n"
    ".endif\n"
    /* Case 2: operand text is exactly "[rN, #0]" -> load from that base
       register into all lanes (the "[]" lane syntax), with a :32 qualifier
       on the address. */
    ".ifeqs \"[\\regname, #0]\", \"%1\"\n"
    "    vld1.f32 {%P0[]}, [\\regname, :32]\n"
    "    .set vdup_n_f32_fast_CODE_EMITTED,1\n"
    ".endif\n"
    ".endr\n"
    /* Neither case matched for any register: fail the build loudly instead
       of silently emitting no instruction. */
    ".if vdup_n_f32_fast_CODE_EMITTED == 0\n"
    ".error \"Fixme: icky macros from 'vdup_n_f32_fast' failed\"\n"
    ".endif\n"
    /* Operand 0 is the result, operand 1 is 'x'; the statement also declares
       a "memory" clobber. */
    : "=w,w" (result) : "r,Q" (x) : "memory");
    return result;
}

/* Function-like macro: from this point on, every use of vdup_n_f32(x) expands
   to vdup_n_f32_fast(x), so the test functions below exercise the inline-asm
   replacement instead of the intrinsic of the same name. */
#define vdup_n_f32(x) vdup_n_f32_fast(x)

/* Now let's test it for accessing data in registers */

/*
 * Test case for scalar arguments passed by value: duplicate each argument
 * into a two-lane vector, add the vectors, and return lane 0 of the sum.
 */
float neon_add_regs(float a, float b)
{
    const float32x2_t lhs = vdup_n_f32(a);
    const float32x2_t rhs = vdup_n_f32(b);
    const float32x2_t sum = vadd_f32(lhs, rhs);

    return vget_lane_f32(sum, 0);
}

/* ... and in memory */

/*
 * Test case for operands read through pointers: duplicate *a and *b into
 * two-lane vectors, add the vectors, and store lane 0 of the sum to *out.
 */
void neon_add_mem(float * __restrict out,
                  float * __restrict a,
                  float * __restrict b)
{
    const float32x2_t lhs = vdup_n_f32(*a);
    const float32x2_t rhs = vdup_n_f32(*b);
    const float32x2_t sum = vadd_f32(lhs, rhs);

    *out = vget_lane_f32(sum, 0);
}
/***************************************/

$ objdump -d test.o

00000000 <neon_add_mem>:
   0:   f4e10c9f        vld1.32 {d16[]}, [r1, :32]
   4:   f4e21c9f        vld1.32 {d17[]}, [r2, :32]
   8:   f2400da1        vadd.f32        d16, d16, d17
   c:   f4c0080f        vst1.32 {d16[0]}, [r0]
  10:   e12fff1e        bx      lr

00000014 <neon_add_regs>:
  14:   ee800b90        vdup.32 d16, r0
  18:   ee811b90        vdup.32 d17, r1
  1c:   f2400da1        vadd.f32        d16, d16, d17
  20:   ee100b90        vmov.32 r0, d16[0]
  24:   e12fff1e        bx      lr

-- 

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43364