[Bug target/103272] New: failure to use vld20/vld21 to vectorize for ARM MVE

clyon at gcc dot gnu.org gcc-bugzilla@gcc.gnu.org
Tue Nov 16 10:07:48 GMT 2021


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103272

            Bug ID: 103272
           Summary: failure to use vld20/vld21 to vectorize for ARM MVE
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: clyon at gcc dot gnu.org
  Target Milestone: ---

With current GCC trunk, compiling the following with -mcpu=cortex-m55 -mfpu=auto:

#include <stdint.h>

typedef struct {
  int16_t v1;
  int16_t v2;
} data;

void test (data* restrict d, data* restrict x,
           data* restrict y, uint32_t L) {
  for (uint32_t i = 0; i < L*16; i++) {
     d[i].v1 = x[i].v1*y[i].v1;
     d[i].v2 = x[i].v2*y[i].v2;
   }
}

GCC generates:
test:
        lsls    r3, r3, #4
        beq     .L9
        lsls    r3, r3, #2
        push    {lr}
        sub     lr, r3, #16
        lsr     lr, lr, #4
        add     lr, lr, #1
        dls     lr, lr
.L3:
        vldrh.16        q3, [r2], #16
        vldrh.16        q2, [r1], #16
        vmul.i16        q3, q3, q2
        vstrh.16        q3, [r0], #16
        le      lr, .L3
        ldr     pc, [sp], #4
.L9:
        bx      lr


while LLVM generates:
test:
        push    {r7, lr}
        mov     r7, sp
        mov.w   r12, #0
        cmp.w   r12, r3, lsl #4
        it      eq
        popeq   {r7, pc}
        mvn     r12, #7
        add.w   r12, r12, r3, lsl #4
        movs    r3, #1
        add.w   lr, r3, r12, lsr #3
.LBB0_2:
        vld20.16        {q0, q1}, [r1]
        vld20.16        {q2, q3}, [r2]
        vld21.16        {q0, q1}, [r1]!
        vld21.16        {q2, q3}, [r2]!
        vmul.i16        q0, q2, q0
        vmul.i16        q1, q3, q1
        vst20.16        {q0, q1}, [r0]
        vst21.16        {q0, q1}, [r0]!
        le      lr, .LBB0_2
        pop     {r7, pc}


On the other hand, GCC does a better job of vectorizing the samples included
in the testsuite (gcc.target/arm/simd/mve-vld2.c).


More information about the Gcc-bugs mailing list