[Bug target/63173] performance problem with simd intrinsics vld2_dup_* on aarch64-none-elf

venkataramanan.kumar at amd dot com gcc-bugzilla@gcc.gnu.org
Tue Oct 14 06:20:00 GMT 2014


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63173

--- Comment #2 from Venkataramanan <venkataramanan.kumar at amd dot com> ---
Changed the test case to work with the latest GCC trunk.

#include <arm_neon.h>
/*
 * Reduced test case for this report.
 *
 * Loads an int16x4x2_t from each input pointer with vld2_dup_s16, adds the
 * two results element-wise (val[0] with val[0], val[1] with val[1]) using
 * vqadd_s16, and returns the pair of sums by value.
 *
 * pDataA, pDataB: source pointers for the two vld2_dup_s16 loads; both are
 *                 __restrict-qualified.
 *
 * Keep the statements exactly as written: the assembly listing quoted after
 * this function was generated from this text.
 */
int16x4x2_t foo(int16_t * __restrict pDataA,
                                 int16_t *  __restrict pDataB)
{
        int16x4x2_t DataA, DataB, DataC;

        /* Each of these loads shows up as an ld2r in the quoted listing. */
        DataA = vld2_dup_s16(pDataA);
        DataB = vld2_dup_s16(pDataB);

        /* Saturating adds; they show up as the two sqadd instructions. */
        DataC.val[0] = vqadd_s16( DataA.val[0], DataB.val[0] );
        DataC.val[1] = vqadd_s16( DataA.val[1], DataB.val[1] );

        return DataC;
}

Still seeing loads and stores via memory: each ld2r result is stored to the stack with st1 and then reloaded with ldr.

 foo:
        sub     sp, sp, #16
        // Start of user assembly
// 11788 "/home/venkataramanan-kumar/work/pr62308/builds/destdir/x86_64-unknown-linux-gnu/lib/gcc/aarch64-none-elf/5.0.0/include/arm_neon.h" 1
        ld2r {v16.4h, v17.4h}, [x0]
        st1 {v16.4h, v17.4h}, [sp]

// 0 "" 2
        // End of user assembly
        ldr     d0, [sp]
        ldr     d1, [sp, 8]
        // Start of user assembly
// 11788 "/home/venkataramanan-kumar/work/pr62308/builds/destdir/x86_64-unknown-linux-gnu/lib/gcc/aarch64-none-elf/5.0.0/include/arm_neon.h" 1
        ld2r {v16.4h, v17.4h}, [x1]
        st1 {v16.4h, v17.4h}, [sp]

// 0 "" 2
        // End of user assembly
        ldr     d3, [sp]
        ldr     d2, [sp, 8]
        add     sp, sp, 16
        sqadd   v0.4h, v0.4h, v3.4h
        sqadd   v1.4h, v1.4h, v2.4h
        ret
        .size   foo, .-foo
        .ident  "GCC: (Linaro GCC 2014.10) 5.0.0 20140930 (experimental)"



More information about the Gcc-bugs mailing list