[Bug middle-end/91753] New: Bad register allocation of multi-register types

Thu Sep 12 12:57:00 GMT 2019

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91753

            Bug ID: 91753
           Summary: Bad register allocation of multi-register types
           Product: gcc
           Version: 10.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: wilco at gcc dot gnu.org
  Target Milestone: ---

The following example shows that register allocation for types that require
multiple registers is far from optimal:

#include <stdint.h>

#include <arm_neon.h>
/*
 * Reproducer: load four 16-byte vectors from `values`, run each of them
 * through the four-vector table lookup vqtbx4q_u8 `volume` times, and store
 * the results back to the locations they were loaded from.
 *
 *   table  - four-vector lookup table, passed by value and never modified.
 *   values - buffer whose bytes at offsets 0..63 are read and then rewritten.
 *   volume - iteration count; the loop body is skipped when volume <= 0.
 *
 * `table` is the only multi-register value in the function and it is
 * loop-invariant: every lookup in the loop reads the same four vectors.
 */
void neon_transform_nada(const uint8x16x4_t table, uint8_t * values, int
volume) {
  /* x1..x4 are the four consecutive 16-byte chunks at values + 0/16/32/48. */
  uint8x16_t x1 = vld1q_u8(values + 0);
  uint8x16_t x2 = vld1q_u8(values + 16);
  uint8x16_t x3 = vld1q_u8(values + 16*2);
  uint8x16_t x4 = vld1q_u8(values + 16*3);
  for(int i = 0; i  <  volume; i++) {
          /*
           * Each vector is passed as both the first and the last argument and
           * is overwritten by the result, so every iteration depends on the
           * previous one. The four chains x1..x4 are independent of each
           * other and share only `table`.
           */
          x1 = vqtbx4q_u8(x1, table,x1);
          x2 = vqtbx4q_u8(x2, table,x2);
          x3 = vqtbx4q_u8(x3, table,x3);
          x4 = vqtbx4q_u8(x4, table,x4);
   }
  /* Write the four results back over the input chunks. */
  vst1q_u8(values + 0,    x1);
  vst1q_u8(values + 16,   x2);
  vst1q_u8(values + 16*2, x3);
  vst1q_u8(values + 16*3, x4);
}

With -O2/-O3, the loop makes four copies of the table (16 vector moves) on
every iteration:

// Compiler output (AArch64) for neon_transform_nada.
// Register roles as used in this listing:
//   x0        base address of the four 16-byte loads/stores (values)
//   w1        loop bound (volume)
//   w2        loop counter (i)
//   v0-v3     the table on entry
//   v24-v27   copies of the table made before the loop
//   v28-v31   the four vectors being transformed (q31/q30/q29/q28 = x1..x4)
neon_transform_nada:
        // Load all four vectors; skip the loop entirely when volume <= 0.
        cmp     w1, 0
        ldp     q31, q30, [x0]
        ldp     q29, q28, [x0, 32]
        ble     .L2
        // Copy the table out of v0-v3: v25 = v0, v27 = v1, v24 = v2, v26 = v3.
        // w2 = 0 initialises the loop counter.
        mov     v27.16b, v1.16b
        mov     w2, 0
        mov     v26.16b, v3.16b
        mov     v25.16b, v0.16b
        mov     v24.16b, v2.16b
        // Align the loop head to 8 bytes, using at most 7 bytes of padding.
        .p2align 3,,7
.L3:
        // Loop body. The 16 vector moves below build four separate copies of
        // the table in {v0-v3}, {v20-v23}, {v16-v19} and {v4-v7}, one copy
        // per tbx. Nothing in the loop writes v24-v27, and the tbx
        // destinations (v28-v31) are not table registers, so every copy holds
        // the same loop-invariant value: all 16 moves are redundant. v0-v3
        // are only ever rewritten with the values they held on entry.
        //
        // The counter update (add) and the compare that feeds the closing
        // bne are interleaved with the moves.
        mov     v0.16b, v25.16b
        add     w2, w2, 1
        mov     v20.16b, v25.16b
        cmp     w1, w2
        mov     v16.16b, v25.16b
        mov     v4.16b, v25.16b
        mov     v1.16b, v27.16b
        mov     v21.16b, v27.16b
        mov     v17.16b, v27.16b
        mov     v5.16b, v27.16b
        mov     v2.16b, v24.16b
        mov     v22.16b, v24.16b
        mov     v18.16b, v24.16b
        mov     v6.16b, v24.16b
        mov     v3.16b, v26.16b
        mov     v23.16b, v26.16b
        mov     v19.16b, v26.16b
        mov     v7.16b, v26.16b
        // One table lookup per vector; each uses its own copy of the table.
        tbx     v31.16b, {v0.16b - v3.16b}, v31.16b
        tbx     v30.16b, {v20.16b - v23.16b}, v30.16b
        tbx     v29.16b, {v16.16b - v19.16b}, v29.16b
        tbx     v28.16b, {v4.16b - v7.16b}, v28.16b
        // Repeat until w2 == w1.
        bne     .L3
.L2:
        // Store the four vectors back to where they were loaded from.
        stp     q31, q30, [x0]
        stp     q29, q28, [x0, 32]
        ret

With -O1 it looks a lot better, but there are still 4 redundant moves in the
loop:

// Compiler output (AArch64) for neon_transform_nada.
// Register roles as used in this listing:
//   x0        base address of the four 16-byte loads/stores (values)
//   w1        loop bound (volume)
//   w2        loop counter (i)
//   v0-v3     the table on entry; never written in this listing
//   v4-v7     per-iteration copy of the table
//   v16-v19   the four vectors being transformed (q19/q18/q17/q16 = x1..x4)
neon_transform_nada:
        // Load the four 16-byte vectors at offsets 0, 16, 32 and 48.
        ldr     q19, [x0]
        ldr     q18, [x0, 16]
        ldr     q17, [x0, 32]
        ldr     q16, [x0, 48]
        // Skip the loop entirely when volume <= 0.
        cmp     w1, 0
        ble     .L2
        mov     w2, 0
.L3:
        // Copy the table from v0-v3 into v4-v7 on every iteration. Neither
        // register group is written anywhere else in the loop, so the copy
        // is loop-invariant: these are the 4 redundant moves.
        mov     v4.16b, v0.16b
        mov     v5.16b, v1.16b
        mov     v6.16b, v2.16b
        mov     v7.16b, v3.16b
        // One table lookup per vector, all sharing the single copy in v4-v7.
        tbx     v19.16b, {v4.16b - v7.16b}, v19.16b
        tbx     v18.16b, {v4.16b - v7.16b}, v18.16b
        tbx     v17.16b, {v4.16b - v7.16b}, v17.16b
        tbx     v16.16b, {v4.16b - v7.16b}, v16.16b
        // i++; repeat until w2 == w1.
        add     w2, w2, 1
        cmp     w1, w2
        bne     .L3
.L2:
        // Store the four vectors back to where they were loaded from.
        str     q19, [x0]
        str     q18, [x0, 16]
        str     q17, [x0, 32]
        str     q16, [x0, 48]
        ret