[Bug middle-end/91753] New: Bad register allocation of multi-register types
wilco at gcc dot gnu.org
gcc-bugzilla@gcc.gnu.org
Thu Sep 12 12:57:00 GMT 2019
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91753
Bug ID: 91753
Summary: Bad register allocation of multi-register types
Product: gcc
Version: 10.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: wilco at gcc dot gnu.org
Target Milestone: ---
The following example shows that register allocation for types that require
multiple registers is far from optimal:
#include <stdint.h>
#include <arm_neon.h>
/*
 * Reproducer for the register-allocation problem described in this report.
 *
 * Loads four 16-byte vectors from `values` (byte offsets 0, 16, 32 and 48),
 * applies vqtbx4q_u8 to each of them `volume` times using the same
 * four-register `table`, and stores the four results back to the same
 * offsets. `values` must therefore reference at least 64 accessible bytes.
 *
 * `table` is never modified in this function and is shared by all four
 * lookups in every iteration.
 */
void neon_transform_nada(const uint8x16x4_t table, uint8_t * values, int
volume) {
/* Four independent 16-byte vectors; each is carried across loop iterations. */
uint8x16_t x1 = vld1q_u8(values + 0);
uint8x16_t x2 = vld1q_u8(values + 16);
uint8x16_t x3 = vld1q_u8(values + 16*2);
uint8x16_t x4 = vld1q_u8(values + 16*3);
/* The body is skipped entirely when volume <= 0 (signed comparison). */
for(int i = 0; i < volume; i++) {
/* Each call passes the same vector as both the first and the third argument
   of the intrinsic, so the result of one iteration is the input of the next.
   The compiler output quoted in this report implements each call as one TBX
   instruction with a four-register table list. */
x1 = vqtbx4q_u8(x1, table,x1);
x2 = vqtbx4q_u8(x2, table,x2);
x3 = vqtbx4q_u8(x3, table,x3);
x4 = vqtbx4q_u8(x4, table,x4);
}
/* Write the transformed vectors back over the original 64 bytes. */
vst1q_u8(values + 0, x1);
vst1q_u8(values + 16, x2);
vst1q_u8(values + 16*2, x3);
vst1q_u8(values + 16*3, x4);
}
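With -O2/-O3, every loop iteration executes 16 vector moves that build four
separate copies of the same table, one copy per TBX: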
// GCC -O2/-O3 output for neon_transform_nada, quoted verbatim from the report.
// Register use visible in this listing:
//   x0       base address of the four 16-byte loads/stores (`values`)
//   w1       loop bound (`volume`)
//   w2       loop counter, counts up from 0
//   v0-v3    the four table registers on entry (overwritten inside the loop)
//   v24-v27  copy of the table that stays live across the whole loop
//   v28-v31  the four data vectors being transformed
neon_transform_nada:
cmp w1, 0
// Load the four data vectors from x0+0 .. x0+63.
ldp q31, q30, [x0]
ldp q29, q28, [x0, 32]
// Signed w1 <= 0: skip the loop; the vectors are stored back unchanged.
ble .L2
// Move the table out of v0-v3 into v24-v27. Note the mapping:
// v25 = v0, v27 = v1, v24 = v2, v26 = v3.
mov v27.16b, v1.16b
mov w2, 0
mov v26.16b, v3.16b
mov v25.16b, v0.16b
mov v24.16b, v2.16b
// Align the loop head to 8 bytes, using at most 7 bytes of padding.
.p2align 3,,7
.L3:
// The 16 vector moves in this loop body rebuild four separate copies of the
// same table on every iteration, one register group per TBX below:
// {v0-v3}, {v20-v23}, {v16-v19} and {v4-v7}. Their sources (v24-v27) are
// never written inside the loop, so all four copies hold identical values.
mov v0.16b, v25.16b
add w2, w2, 1
mov v20.16b, v25.16b
// Loop-exit compare; the mov/tbx instructions that follow do not alter the
// flags, so the bne at the bottom of the loop still tests this result.
cmp w1, w2
mov v16.16b, v25.16b
mov v4.16b, v25.16b
mov v1.16b, v27.16b
mov v21.16b, v27.16b
mov v17.16b, v27.16b
mov v5.16b, v27.16b
mov v2.16b, v24.16b
mov v22.16b, v24.16b
mov v18.16b, v24.16b
mov v6.16b, v24.16b
mov v3.16b, v26.16b
mov v23.16b, v26.16b
mov v19.16b, v26.16b
mov v7.16b, v26.16b
// One TBX per data vector. The destination register is also the index
// operand, and each TBX reads its own copy of the table.
tbx v31.16b, {v0.16b - v3.16b}, v31.16b
tbx v30.16b, {v20.16b - v23.16b}, v30.16b
tbx v29.16b, {v16.16b - v19.16b}, v29.16b
tbx v28.16b, {v4.16b - v7.16b}, v28.16b
// Repeat until w2 == w1.
bne .L3
.L2:
// Store the four data vectors back to x0+0 .. x0+63.
stp q31, q30, [x0]
stp q29, q28, [x0, 32]
ret
With -O1 it looks a lot better, but there are still 4 redundant moves per
iteration (v0-v3 are copied to v4-v7 even though nothing in the loop modifies
them):
// GCC -O1 output for neon_transform_nada, quoted verbatim from the report.
// Register use visible in this listing:
//   x0       base address of the four 16-byte loads/stores (`values`)
//   w1       loop bound (`volume`)
//   w2       loop counter, counts up from 0
//   v0-v3    the four table registers on entry (never written in this listing)
//   v4-v7    per-iteration copy of the table, shared by all four TBX
//   v16-v19  the four data vectors being transformed
neon_transform_nada:
// Load the four data vectors from x0+0, +16, +32 and +48.
ldr q19, [x0]
ldr q18, [x0, 16]
ldr q17, [x0, 32]
ldr q16, [x0, 48]
// Signed w1 <= 0: skip the loop; the vectors are stored back unchanged.
cmp w1, 0
ble .L2
mov w2, 0
.L3:
// These are the four moves the report calls redundant: v0-v3 are copied to
// v4-v7 on every iteration although no instruction here writes v0-v3.
mov v4.16b, v0.16b
mov v5.16b, v1.16b
mov v6.16b, v2.16b
mov v7.16b, v3.16b
// All four TBX share the single {v4-v7} copy. The destination register is
// also the index operand.
tbx v19.16b, {v4.16b - v7.16b}, v19.16b
tbx v18.16b, {v4.16b - v7.16b}, v18.16b
tbx v17.16b, {v4.16b - v7.16b}, v17.16b
tbx v16.16b, {v4.16b - v7.16b}, v16.16b
// Advance the counter and repeat until w2 == w1.
add w2, w2, 1
cmp w1, w2
bne .L3
.L2:
// Store the four data vectors back to x0+0, +16, +32 and +48.
str q19, [x0]
str q18, [x0, 16]
str q17, [x0, 32]
str q16, [x0, 48]
ret
More information about the Gcc-bugs
mailing list