Bug 43364 - Suboptimal code for the use of ARM NEON intrinsic "vset_lane_f32"
Summary: Suboptimal code for the use of ARM NEON intrinsic "vset_lane_f32"
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: target
Version: 4.4.3
Importance: P3 enhancement
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks: 47562
 
Reported: 2010-03-14 13:02 UTC by Siarhei Siamashka
Modified: 2012-12-10 02:12 UTC
CC: 2 users

See Also:
Host: arm-unknown-linux-gnueabi
Target: arm-unknown-linux-gnueabi
Build: arm-unknown-linux-gnueabi
Known to work:
Known to fail: 4.4.3, 4.5.0
Last reconfirmed: 2010-03-15 11:58:16


Description Siarhei Siamashka 2010-03-14 13:02:19 UTC
/*******/
#include <arm_neon.h>

void neon_add(float * __restrict out, float * __restrict a, float * __restrict b)
{
    float32x2_t tmp1, tmp2;
    tmp1 = vset_lane_f32(*a, tmp1, 0);
    tmp2 = vset_lane_f32(*b, tmp2, 0);
    tmp1 = vadd_f32(tmp1, tmp2);
    *out = vget_lane_f32(tmp1, 0);
}
/*******/

00000000 <neon_add>:
   0:   e5913000        ldr     r3, [r1]
   4:   eddf0b07        vldr    d16, [pc, #28]  ; 28 <neon_add+0x28>
   8:   e5922000        ldr     r2, [r2]
   c:   eddf1b05        vldr    d17, [pc, #20]  ; 28 <neon_add+0x28>
  10:   ee003b90        vmov.32 d16[0], r3
  14:   ee012b90        vmov.32 d17[0], r2
  18:   f2400da1        vadd.f32        d16, d16, d17
  1c:   f4c0080f        vst1.32 {d16[0]}, [r0]
  20:   e12fff1e        bx      lr
  24:   e1a00000        nop                     (mov r0,r0)


gcc fails to use a single instruction

   vld1.32 {d16[0]}, [r1]

instead of

   0:   e5913000        ldr     r3, [r1]
   4:   eddf0b07        vldr    d16, [pc, #28]  ; 28 <neon_add+0x28>
  10:   ee003b90        vmov.32 d16[0], r3
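
In other words, the whole function could in principle be compiled down to something like this (a hand-written sketch of the hoped-for code, not actual compiler output):

   vld1.32 {d16[0]}, [r1]
   vld1.32 {d17[0]}, [r2]
   vadd.f32        d16, d16, d17
   vst1.32 {d16[0]}, [r0]
   bx      lr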
Comment 1 Ramana Radhakrishnan 2010-03-15 11:58:15 UTC
Confirmed, though trunk of a recent vintage generates the following bit of code, which is slightly better. There's no reason why we can't expand this in a better manner and generate the vld1.f32. Marking as an enhancement.

        mov     r3, #0
        ldr     r2, [r2, #0]    @ float
        vdup.32 d16, r3
        ldr     r3, [r1, #0]    @ float
        vmov    d17, d16  @ v2sf
        vmov.32 d17[0], r2
        vmov.32 d16[0], r3
        vadd.f32        d16, d16, d17
        vst1.32 {d16[0]}, [r0]
        bx      lr
Comment 2 Siarhei Siamashka 2010-04-12 05:26:16 UTC
(In reply to comment #1)
>         mov     r3, #0
>         vdup.32 d16, r3

Also maybe "veor.32 d16, d16, d16" here?

Or drop this NEON register initialization completely because it is a redundant operation and was not explicitly requested in the original C code?

After all, from IHI0042D_aapcs.pdf:
"The FPSCR is the only status register that may be accessed by conforming code. It is a global register with the following properties:
* The condition code bits (28-31), the cumulative saturation (QC) bit (27) and the cumulative exception-status bits (0-4) are not preserved across a public interface."

and from ARM ARM:
"Advanced SIMD arithmetic always uses untrapped exception handling"

Tracking the cumulative exception-status bits may be tricky in general (using an uninitialized value for NEON arithmetic can set them arbitrarily), but as long as they are not used in any way in the function itself, they are irrelevant.
Comment 3 Siarhei Siamashka 2010-06-15 20:14:32 UTC
The whole point of submitting this PR was to find an efficient way of using NEON instructions to operate on arbitrary scalar floating point values, in order to overcome the inherent slowness of VFP Lite on the Cortex-A8 (and maybe make this transparent by wrapping it into a C++ class with operator overloading).

Using 'vdup_n_f32' to load a single floating point value seems better than 'vset_lane_f32' here, because we don't have to deal with the uninitialized part of the register. But 'vdup_n_f32' suffers from similar performance issues (the VLD1 instruction is not used directly) and results in redundant instructions being emitted when the value is loaded from memory. Ideally, something like this would be used instead of 'vdup_n_f32' in this case:

static inline float32x2_t vdup_n_f32_mem(float *p)
{
    float32x2_t result;
    /* VLD1 in its "load to all lanes" form reads *p straight from memory
       into a NEON D register, avoiding the ARM-to-NEON register transfer. */
    asm ("vld1.f32 {%P0[]}, [%1, :32]" : "=w" (result) : "r" (p) : "memory");
    return result;
}
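
For illustration, the original example rewritten on top of this helper might look like this (a hypothetical sketch: 'neon_add_vld1' is just an example name, and it assumes <arm_neon.h> and the helper above are in scope):

void neon_add_vld1(float * __restrict out,
                   float * __restrict a,
                   float * __restrict b)
{
    /* Each scalar goes straight from memory into a D register via VLD1. */
    float32x2_t tmp1 = vdup_n_f32_mem(a);
    float32x2_t tmp2 = vdup_n_f32_mem(b);
    tmp1 = vadd_f32(tmp1, tmp2);
    /* Store lane 0 of the result. */
    *out = vget_lane_f32(tmp1, 0);
}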

I wonder if it is possible to check at compile time whether the operand comes from memory or from a register? Something similar to the '__builtin_constant_p' built-in function? Or use the multiple-alternatives feature of inline assembly constraints to emit either VMOV or VLD1? Anything else?
Comment 4 Siarhei Siamashka 2010-06-15 20:34:58 UTC
(In reply to comment #3)
> Or use the multiple-alternatives feature of inline assembly constraints to emit either VMOV or VLD1?

Well, this kind of works :) But it is very ugly and fragile:

/***************************************/
#include <arm_neon.h>

/* Override a slow 'vdup_n_f32' intrinsic with something better */

static inline float32x2_t vdup_n_f32_fast(float x)
{
    float32x2_t result;
    asm (
    /* String-compare the assembler expansion of operand %1 against every
       core register name (and the corresponding memory operand form) to
       decide at assembly time whether to emit VDUP or VLD1. */
    ".set vdup_n_f32_fast_CODE_EMITTED,0\n"
    ".irp regname,r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14\n"
    ".ifeqs \"\\regname\", \"%1\"\n"
    "    vdup.32 %P0, %1\n"
    "    .set vdup_n_f32_fast_CODE_EMITTED,1\n"
    ".endif\n"
    ".ifeqs \"[\\regname, #0]\", \"%1\"\n"
    "    vld1.f32 {%P0[]}, [\\regname, :32]\n"
    "    .set vdup_n_f32_fast_CODE_EMITTED,1\n"
    ".endif\n"
    ".endr\n"
    ".if vdup_n_f32_fast_CODE_EMITTED == 0\n"
    ".error \"Fixme: icky macros from 'vdup_n_f32_fast' failed\"\n"
    ".endif\n"
    /* Two constraint alternatives: operand 1 is either a core register
       ("r", handled by the VDUP branch) or a memory reference ("Q",
       handled by the VLD1 branch). */
    : "=w,w" (result) : "r,Q" (x) : "memory");
    return result;
}

#define vdup_n_f32(x) vdup_n_f32_fast(x)

/* Now let's test it for accessing data in registers */

float neon_add_regs(float a, float b)
{
    float32x2_t tmp1, tmp2;
    tmp1 = vdup_n_f32(a);
    tmp2 = vdup_n_f32(b);
    tmp1 = vadd_f32(tmp1, tmp2);
    return vget_lane_f32(tmp1, 0);
}

/* ... and in memory */

void neon_add_mem(float * __restrict out,
                  float * __restrict a,
                  float * __restrict b)
{
    float32x2_t tmp1, tmp2;
    tmp1 = vdup_n_f32(*a);
    tmp2 = vdup_n_f32(*b);
    tmp1 = vadd_f32(tmp1, tmp2);
    *out = vget_lane_f32(tmp1, 0);
}
/***************************************/

$ objdump -d test.o

00000000 <neon_add_mem>:
   0:   f4e10c9f        vld1.32 {d16[]}, [r1, :32]
   4:   f4e21c9f        vld1.32 {d17[]}, [r2, :32]
   8:   f2400da1        vadd.f32        d16, d16, d17
   c:   f4c0080f        vst1.32 {d16[0]}, [r0]
  10:   e12fff1e        bx      lr

00000014 <neon_add_regs>:
  14:   ee800b90        vdup.32 d16, r0
  18:   ee811b90        vdup.32 d17, r1
  1c:   f2400da1        vadd.f32        d16, d16, d17
  20:   ee100b90        vmov.32 r0, d16[0]
  24:   e12fff1e        bx      lr
Comment 5 Siarhei Siamashka 2012-12-10 02:12:05 UTC
This seems to have improved a lot. Thanks for your hard work.

	.cpu cortex-a8
	.eabi_attribute 27, 3
	.eabi_attribute 28, 1
	.fpu neon
	.eabi_attribute 20, 1
	.eabi_attribute 21, 1
	.eabi_attribute 23, 3
	.eabi_attribute 24, 1
	.eabi_attribute 25, 1
	.eabi_attribute 26, 1
	.eabi_attribute 30, 2
	.eabi_attribute 34, 1
	.eabi_attribute 18, 4
	.file	"test.c"
	.text
	.align	2
	.global	neon_add
	.type	neon_add, %function
neon_add:
	@ args = 0, pretend = 0, frame = 0
	@ frame_needed = 0, uses_anonymous_args = 0
	@ link register save eliminated.
	vmov.f32	d16, #0.0  @ v2sf
	vmov	d17, d16  @ v2sf
	vld1.32	{d16[0]}, [r1]
	vld1.32	{d17[0]}, [r2]
	vadd.f32	d16, d16, d17
	vst1.32	{d16[0]}, [r0]
	bx	lr
	.size	neon_add, .-neon_add
	.ident	"GCC: (GNU) 4.8.0 20121209 (experimental)"