[Bug c/108412] New: Negative optimization of GCSE && LOOP INVARIANTS
juzhe.zhong at rivai dot ai
gcc-bugzilla@gcc.gnu.org
Mon Jan 16 02:04:04 GMT 2023
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108412
Bug ID: 108412
Summary: Negative optimization of GCSE && LOOP INVARIANTS
Product: gcc
Version: 13.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
Assignee: unassigned at gcc dot gnu.org
Reporter: juzhe.zhong at rivai dot ai
Target Milestone: ---
Consider the following case:
/* Reproducer for the reported codegen problem.
   For each i in [0, n), performs 14 RVV load/store pairs that copy a vector
   from `in` to `out` at byte offsets i + 1 through i + 14.
   `in` and `out` are restrict-qualified; `l` and `m` are not used in the body.
   Pointer arithmetic on `void *` relies on the GNU C extension that treats
   the pointee size as 1.  */
void f (void * restrict in, void * restrict out, int l, int n, int m)
{
/* Vector length handed to every load/store intrinsic below.  */
int vl = 32;
for (int i = 0; i < n; i++)
{
/* Each pair below loads from in + i + k and stores the loaded value to
   out + i + k, for k = 1 .. 14.  Every address has the form
   (base + k) + i, so each (base + k) is invariant across iterations.  */
vint8mf8_t v1 = __riscv_vle8_v_i8mf8 (in + i + 1, vl);
__riscv_vse8_v_i8mf8 (out + i + 1, v1, vl);
vint8mf8_t v2 = __riscv_vle8_v_i8mf8 (in + i + 2, vl);
__riscv_vse8_v_i8mf8 (out + i + 2, v2, vl);
vint8mf8_t v3 = __riscv_vle8_v_i8mf8 (in + i + 3, vl);
__riscv_vse8_v_i8mf8 (out + i + 3, v3, vl);
vint8mf8_t v4 = __riscv_vle8_v_i8mf8 (in + i + 4, vl);
__riscv_vse8_v_i8mf8 (out + i + 4, v4, vl);
vint8mf8_t v5 = __riscv_vle8_v_i8mf8 (in + i + 5, vl);
__riscv_vse8_v_i8mf8 (out + i + 5, v5, vl);
vint8mf8_t v6 = __riscv_vle8_v_i8mf8 (in + i + 6, vl);
__riscv_vse8_v_i8mf8 (out + i + 6, v6, vl);
vint8mf8_t v7 = __riscv_vle8_v_i8mf8 (in + i + 7, vl);
__riscv_vse8_v_i8mf8 (out + i + 7, v7, vl);
vint8mf8_t v8 = __riscv_vle8_v_i8mf8 (in + i + 8, vl);
__riscv_vse8_v_i8mf8 (out + i + 8, v8, vl);
vint8mf8_t v9 = __riscv_vle8_v_i8mf8 (in + i + 9, vl);
__riscv_vse8_v_i8mf8 (out + i + 9, v9, vl);
vint8mf8_t v10 = __riscv_vle8_v_i8mf8 (in + i + 10, vl);
__riscv_vse8_v_i8mf8 (out + i + 10, v10, vl);
vint8mf8_t v11 = __riscv_vle8_v_i8mf8 (in + i + 11, vl);
__riscv_vse8_v_i8mf8 (out + i + 11, v11, vl);
vint8mf8_t v12 = __riscv_vle8_v_i8mf8 (in + i + 12, vl);
__riscv_vse8_v_i8mf8 (out + i + 12, v12, vl);
vint8mf8_t v13 = __riscv_vle8_v_i8mf8 (in + i + 13, vl);
__riscv_vse8_v_i8mf8 (out + i + 13, v13, vl);
vint8mf8_t v14 = __riscv_vle8_v_i8mf8 (in + i + 14, vl);
__riscv_vse8_v_i8mf8 (out + i + 14, v14, vl);
}
}
-O3 ASM:
f:
# Prologue: reserve a 192-byte stack frame and spill a3 (the loop bound that
# is compared against the counter at the bottom of .L3) to 28(sp).
addi sp,sp,-192
sw a3,28(sp)
# Skip everything when a3 <= 0.
ble a3,zero,.L1
# Hoisted invariants: compute a0 + k and a1 + k for k = 1 .. 14 once, and
# store each of the 28 results to its own stack slot.
addi a5,a0,1
sw a5,76(sp)
addi a5,a1,1
sw a5,80(sp)
addi a5,a0,2
sw a5,84(sp)
addi a5,a1,2
sw a5,88(sp)
addi a5,a0,3
sw a5,92(sp)
addi a5,a1,3
sw a5,96(sp)
addi a5,a0,4
sw a5,100(sp)
addi a5,a1,4
sw a5,104(sp)
addi a5,a0,5
sw a5,108(sp)
addi a5,a1,5
sw a5,112(sp)
addi a5,a0,6
sw a5,116(sp)
addi a5,a1,6
sw a5,120(sp)
addi a5,a0,7
sw a5,124(sp)
addi a5,a1,7
sw a5,128(sp)
addi a5,a0,8
sw a5,132(sp)
addi a5,a1,8
sw a5,136(sp)
addi a5,a0,9
sw a5,140(sp)
addi a5,a1,9
sw a5,32(sp)
addi a5,a0,10
sw a5,36(sp)
addi a5,a1,10
sw a5,40(sp)
# Save s0-s11 (interleaved with the a0 + 11 computation), because the loop
# body below uses all of them to hold addresses.
sw s0,188(sp)
addi a5,a0,11
sw s1,184(sp)
sw s2,180(sp)
sw s3,176(sp)
sw s4,172(sp)
sw s5,168(sp)
sw s6,164(sp)
sw s7,160(sp)
sw s8,156(sp)
sw s9,152(sp)
sw s10,148(sp)
sw s11,144(sp)
sw a5,44(sp)
addi a5,a1,11
sw a5,48(sp)
addi a5,a0,12
sw a5,52(sp)
addi a5,a1,12
sw a5,56(sp)
addi a5,a0,13
sw a5,60(sp)
addi a5,a1,13
sw a5,64(sp)
addi a5,a0,14
sw a5,68(sp)
addi a5,a1,14
sw a5,72(sp)
# a4 = loop counter, starting at 0; a5 = 32 is the vector length given to
# vsetvli (e8, mf8).
li a4,0
li a5,32
vsetvli zero,a5,e8,mf8,ta,ma
.L3:
# Loop body, part 1: reload every hoisted base address from the stack and add
# the counter a4 to form the 28 effective addresses for this iteration.
lw a3,76(sp)
lw t2,60(sp)
add s11,a3,a4
lw a3,80(sp)
add t2,t2,a4
add t0,a3,a4
lw a3,84(sp)
add s10,a3,a4
lw a3,88(sp)
add t6,a3,a4
lw a3,92(sp)
add s9,a3,a4
lw a3,96(sp)
add t5,a3,a4
lw a3,100(sp)
add s8,a3,a4
lw a3,104(sp)
add t4,a3,a4
lw a3,108(sp)
add s7,a3,a4
lw a3,112(sp)
add t3,a3,a4
lw a3,116(sp)
add s6,a3,a4
lw a3,120(sp)
add t1,a3,a4
lw a3,124(sp)
add s5,a3,a4
lw a3,128(sp)
add a7,a3,a4
lw a3,132(sp)
add s4,a3,a4
lw a3,136(sp)
add a6,a3,a4
lw a3,140(sp)
add s3,a3,a4
lw a3,32(sp)
add a0,a3,a4
lw a3,36(sp)
add s2,a3,a4
lw a3,40(sp)
add a1,a3,a4
lw a3,44(sp)
add s1,a3,a4
lw a3,48(sp)
add a2,a3,a4
lw a3,52(sp)
add s0,a3,a4
# The last four effective addresses do not get a register of their own; they
# are written back to the stack at 12..24(sp) and reloaded further down.
lw a3,56(sp)
add a3,a3,a4
sw a3,12(sp)
lw a3,64(sp)
add a3,a3,a4
sw a3,16(sp)
lw a3,68(sp)
add a3,a3,a4
sw a3,20(sp)
lw a3,72(sp)
add a3,a3,a4
sw a3,24(sp)
# Loop body, part 2: the 14 vector load/store pairs, all through v24.
vle8.v v24,0(s11)
vse8.v v24,0(t0)
vle8.v v24,0(s10)
vse8.v v24,0(t6)
vle8.v v24,0(s9)
vse8.v v24,0(t5)
vle8.v v24,0(s8)
vse8.v v24,0(t4)
vle8.v v24,0(s7)
vse8.v v24,0(t3)
vle8.v v24,0(s6)
vse8.v v24,0(t1)
vle8.v v24,0(s5)
vse8.v v24,0(a7)
vle8.v v24,0(s4)
vse8.v v24,0(a6)
vle8.v v24,0(s3)
vse8.v v24,0(a0)
vle8.v v24,0(s2)
vse8.v v24,0(a1)
vle8.v v24,0(s1)
vse8.v v24,0(a2)
# Remaining pairs use the addresses spilled to 12..24(sp); the counter
# increment is scheduled in between.
lw a3,12(sp)
vle8.v v24,0(s0)
addi a4,a4,1
vse8.v v24,0(a3)
lw a3,16(sp)
vle8.v v24,0(t2)
vse8.v v24,0(a3)
lw a3,20(sp)
vle8.v v24,0(a3)
lw a3,24(sp)
vse8.v v24,0(a3)
# Reload the loop bound from its spill slot and repeat until the counter
# reaches it.
lw a3,28(sp)
bne a3,a4,.L3
# Epilogue: restore s0-s11.
lw s0,188(sp)
lw s1,184(sp)
lw s2,180(sp)
lw s3,176(sp)
lw s4,172(sp)
lw s5,168(sp)
lw s6,164(sp)
lw s7,160(sp)
lw s8,156(sp)
lw s9,152(sp)
lw s10,148(sp)
lw s11,144(sp)
.L1:
# Release the stack frame and return.
addi sp,sp,192
jr ra
The generated code is quite ugly.
However, if we compile with -O3 -fno-gcse -fno-schedule-insns -fno-move-loop-invariants,
the ASM is much better, and is the same as LLVM's:
f:
# No stack frame is set up in this version. Skip the loop when a3 <= 0.
ble a3,zero,.L1
# a5 = loop counter, starting at 0.
li a5,0
# NOTE(review): the vector length comes from a4 here, whereas the C source
# sets vl = 32 and the -O3 listing loads 32 into a5. This listing may have
# been produced from a slightly different source; confirm.
vsetvli zero,a4,e8,mf8,ta,ma
.L3:
# Each load/store address is recomputed in a2 as (a0 or a1) + k + a5 right
# before use, for k = 1 .. 14; only a2 is needed as an address register.
addi a2,a0,1
add a2,a2,a5
vle8.v v24,0(a2)
addi a2,a1,1
add a2,a2,a5
vse8.v v24,0(a2)
addi a2,a0,2
add a2,a2,a5
vle8.v v24,0(a2)
addi a2,a1,2
add a2,a2,a5
vse8.v v24,0(a2)
addi a2,a0,3
add a2,a2,a5
vle8.v v24,0(a2)
addi a2,a1,3
add a2,a2,a5
vse8.v v24,0(a2)
addi a2,a0,4
add a2,a2,a5
vle8.v v24,0(a2)
addi a2,a1,4
add a2,a2,a5
vse8.v v24,0(a2)
addi a2,a0,5
add a2,a2,a5
vle8.v v24,0(a2)
addi a2,a1,5
add a2,a2,a5
vse8.v v24,0(a2)
addi a2,a0,6
add a2,a2,a5
vle8.v v24,0(a2)
addi a2,a1,6
add a2,a2,a5
vse8.v v24,0(a2)
addi a2,a0,7
add a2,a2,a5
vle8.v v24,0(a2)
addi a2,a1,7
add a2,a2,a5
vse8.v v24,0(a2)
addi a2,a0,8
add a2,a2,a5
vle8.v v24,0(a2)
addi a2,a1,8
add a2,a2,a5
vse8.v v24,0(a2)
addi a2,a0,9
add a2,a2,a5
vle8.v v24,0(a2)
addi a2,a1,9
add a2,a2,a5
vse8.v v24,0(a2)
addi a2,a0,10
add a2,a2,a5
vle8.v v24,0(a2)
addi a2,a1,10
add a2,a2,a5
vse8.v v24,0(a2)
addi a2,a0,11
add a2,a2,a5
vle8.v v24,0(a2)
addi a2,a1,11
add a2,a2,a5
vse8.v v24,0(a2)
addi a2,a0,12
add a2,a2,a5
vle8.v v24,0(a2)
addi a2,a1,12
add a2,a2,a5
vse8.v v24,0(a2)
addi a2,a0,13
add a2,a2,a5
vle8.v v24,0(a2)
addi a2,a1,13
add a2,a2,a5
vse8.v v24,0(a2)
addi a2,a0,14
add a2,a2,a5
vle8.v v24,0(a2)
addi a2,a1,14
add a2,a2,a5
vse8.v v24,0(a2)
# Advance the counter and repeat until it equals the bound in a3.
addi a5,a5,1
bne a3,a5,.L3
.L1:
ret
Currently, the RVV support doesn't have any cost model. I am not sure whether this issue is
related to that. I may need someone to help me fix it. Thanks.
More information about the Gcc-bugs
mailing list