Bug 108412 - RISC-V: Negative optimization of GCSE && LOOP INVARIANTS
Summary: RISC-V: Negative optimization of GCSE && LOOP INVARIANTS
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: rtl-optimization
Version: 13.0
Importance: P3 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks:
 
Reported: 2023-01-16 02:04 UTC by JuzheZhong
Modified: 2023-12-21 08:25 UTC
CC: 2 users

See Also:
Host:
Target: riscv
Build:
Known to work:
Known to fail:
Last reconfirmed:


Description JuzheZhong 2023-01-16 02:04:04 UTC
Consider the following case:

#include <riscv_vector.h>

void f (void * restrict in, void * restrict out, int l, int n, int m)
{
  int vl = 32;
  for (int i = 0; i < n; i++)
    {
      vint8mf8_t v1 = __riscv_vle8_v_i8mf8 (in + i + 1, vl);
      __riscv_vse8_v_i8mf8 (out + i + 1, v1, vl);

      vint8mf8_t v2 = __riscv_vle8_v_i8mf8 (in + i + 2, vl);
      __riscv_vse8_v_i8mf8 (out + i + 2, v2, vl);

      vint8mf8_t v3 = __riscv_vle8_v_i8mf8 (in + i + 3, vl);
      __riscv_vse8_v_i8mf8 (out + i + 3, v3, vl);

      vint8mf8_t v4 = __riscv_vle8_v_i8mf8 (in + i + 4, vl);
      __riscv_vse8_v_i8mf8 (out + i + 4, v4, vl);

      vint8mf8_t v5 = __riscv_vle8_v_i8mf8 (in + i + 5, vl);
      __riscv_vse8_v_i8mf8 (out + i + 5, v5, vl);

      vint8mf8_t v6 = __riscv_vle8_v_i8mf8 (in + i + 6, vl);
      __riscv_vse8_v_i8mf8 (out + i + 6, v6, vl);

      vint8mf8_t v7 = __riscv_vle8_v_i8mf8 (in + i + 7, vl);
      __riscv_vse8_v_i8mf8 (out + i + 7, v7, vl);

      vint8mf8_t v8 = __riscv_vle8_v_i8mf8 (in + i + 8, vl);
      __riscv_vse8_v_i8mf8 (out + i + 8, v8, vl);

      vint8mf8_t v9 = __riscv_vle8_v_i8mf8 (in + i + 9, vl);
      __riscv_vse8_v_i8mf8 (out + i + 9, v9, vl);

      vint8mf8_t v10 = __riscv_vle8_v_i8mf8 (in + i + 10, vl);
      __riscv_vse8_v_i8mf8 (out + i + 10, v10, vl);

      vint8mf8_t v11 = __riscv_vle8_v_i8mf8 (in + i + 11, vl);
      __riscv_vse8_v_i8mf8 (out + i + 11, v11, vl);

      vint8mf8_t v12 = __riscv_vle8_v_i8mf8 (in + i + 12, vl);
      __riscv_vse8_v_i8mf8 (out + i + 12, v12, vl);

      vint8mf8_t v13 = __riscv_vle8_v_i8mf8 (in + i + 13, vl);
      __riscv_vse8_v_i8mf8 (out + i + 13, v13, vl);

      vint8mf8_t v14 = __riscv_vle8_v_i8mf8 (in + i + 14, vl);
      __riscv_vse8_v_i8mf8 (out + i + 14, v14, vl);
    }
}
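
For reference, a command line along these lines should reproduce both listings below, with the test case saved as bug.c (the cross-compiler name and the -march/-mabi strings are assumptions on my side; the 32-bit sw/lw pointer spills in the output suggest an rv32 configuration with the vector extension):

riscv32-unknown-elf-gcc -march=rv32gcv -mabi=ilp32d -O3 -S bug.c
riscv32-unknown-elf-gcc -march=rv32gcv -mabi=ilp32d -O3 \
    -fno-gcse -fno-schedule-insns -fno-move-loop-invariants -S bug.c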

-O3 ASM:
f:
	addi	sp,sp,-192
	sw	a3,28(sp)
	ble	a3,zero,.L1
	addi	a5,a0,1
	sw	a5,76(sp)
	addi	a5,a1,1
	sw	a5,80(sp)
	addi	a5,a0,2
	sw	a5,84(sp)
	addi	a5,a1,2
	sw	a5,88(sp)
	addi	a5,a0,3
	sw	a5,92(sp)
	addi	a5,a1,3
	sw	a5,96(sp)
	addi	a5,a0,4
	sw	a5,100(sp)
	addi	a5,a1,4
	sw	a5,104(sp)
	addi	a5,a0,5
	sw	a5,108(sp)
	addi	a5,a1,5
	sw	a5,112(sp)
	addi	a5,a0,6
	sw	a5,116(sp)
	addi	a5,a1,6
	sw	a5,120(sp)
	addi	a5,a0,7
	sw	a5,124(sp)
	addi	a5,a1,7
	sw	a5,128(sp)
	addi	a5,a0,8
	sw	a5,132(sp)
	addi	a5,a1,8
	sw	a5,136(sp)
	addi	a5,a0,9
	sw	a5,140(sp)
	addi	a5,a1,9
	sw	a5,32(sp)
	addi	a5,a0,10
	sw	a5,36(sp)
	addi	a5,a1,10
	sw	a5,40(sp)
	sw	s0,188(sp)
	addi	a5,a0,11
	sw	s1,184(sp)
	sw	s2,180(sp)
	sw	s3,176(sp)
	sw	s4,172(sp)
	sw	s5,168(sp)
	sw	s6,164(sp)
	sw	s7,160(sp)
	sw	s8,156(sp)
	sw	s9,152(sp)
	sw	s10,148(sp)
	sw	s11,144(sp)
	sw	a5,44(sp)
	addi	a5,a1,11
	sw	a5,48(sp)
	addi	a5,a0,12
	sw	a5,52(sp)
	addi	a5,a1,12
	sw	a5,56(sp)
	addi	a5,a0,13
	sw	a5,60(sp)
	addi	a5,a1,13
	sw	a5,64(sp)
	addi	a5,a0,14
	sw	a5,68(sp)
	addi	a5,a1,14
	sw	a5,72(sp)
	li	a4,0
	li	a5,32
	vsetvli	zero,a5,e8,mf8,ta,ma
.L3:
	lw	a3,76(sp)
	lw	t2,60(sp)
	add	s11,a3,a4
	lw	a3,80(sp)
	add	t2,t2,a4
	add	t0,a3,a4
	lw	a3,84(sp)
	add	s10,a3,a4
	lw	a3,88(sp)
	add	t6,a3,a4
	lw	a3,92(sp)
	add	s9,a3,a4
	lw	a3,96(sp)
	add	t5,a3,a4
	lw	a3,100(sp)
	add	s8,a3,a4
	lw	a3,104(sp)
	add	t4,a3,a4
	lw	a3,108(sp)
	add	s7,a3,a4
	lw	a3,112(sp)
	add	t3,a3,a4
	lw	a3,116(sp)
	add	s6,a3,a4
	lw	a3,120(sp)
	add	t1,a3,a4
	lw	a3,124(sp)
	add	s5,a3,a4
	lw	a3,128(sp)
	add	a7,a3,a4
	lw	a3,132(sp)
	add	s4,a3,a4
	lw	a3,136(sp)
	add	a6,a3,a4
	lw	a3,140(sp)
	add	s3,a3,a4
	lw	a3,32(sp)
	add	a0,a3,a4
	lw	a3,36(sp)
	add	s2,a3,a4
	lw	a3,40(sp)
	add	a1,a3,a4
	lw	a3,44(sp)
	add	s1,a3,a4
	lw	a3,48(sp)
	add	a2,a3,a4
	lw	a3,52(sp)
	add	s0,a3,a4
	lw	a3,56(sp)
	add	a3,a3,a4
	sw	a3,12(sp)
	lw	a3,64(sp)
	add	a3,a3,a4
	sw	a3,16(sp)
	lw	a3,68(sp)
	add	a3,a3,a4
	sw	a3,20(sp)
	lw	a3,72(sp)
	add	a3,a3,a4
	sw	a3,24(sp)
	vle8.v	v24,0(s11)
	vse8.v	v24,0(t0)
	vle8.v	v24,0(s10)
	vse8.v	v24,0(t6)
	vle8.v	v24,0(s9)
	vse8.v	v24,0(t5)
	vle8.v	v24,0(s8)
	vse8.v	v24,0(t4)
	vle8.v	v24,0(s7)
	vse8.v	v24,0(t3)
	vle8.v	v24,0(s6)
	vse8.v	v24,0(t1)
	vle8.v	v24,0(s5)
	vse8.v	v24,0(a7)
	vle8.v	v24,0(s4)
	vse8.v	v24,0(a6)
	vle8.v	v24,0(s3)
	vse8.v	v24,0(a0)
	vle8.v	v24,0(s2)
	vse8.v	v24,0(a1)
	vle8.v	v24,0(s1)
	vse8.v	v24,0(a2)
	lw	a3,12(sp)
	vle8.v	v24,0(s0)
	addi	a4,a4,1
	vse8.v	v24,0(a3)
	lw	a3,16(sp)
	vle8.v	v24,0(t2)
	vse8.v	v24,0(a3)
	lw	a3,20(sp)
	vle8.v	v24,0(a3)
	lw	a3,24(sp)
	vse8.v	v24,0(a3)
	lw	a3,28(sp)
	bne	a3,a4,.L3
	lw	s0,188(sp)
	lw	s1,184(sp)
	lw	s2,180(sp)
	lw	s3,176(sp)
	lw	s4,172(sp)
	lw	s5,168(sp)
	lw	s6,164(sp)
	lw	s7,160(sp)
	lw	s8,156(sp)
	lw	s9,152(sp)
	lw	s10,148(sp)
	lw	s11,144(sp)
.L1:
	addi	sp,sp,192
	jr	ra

The generated code is quite poor: all 28 address bases (in+1 … in+14 and out+1 … out+14) are hoisted out of the loop, which exceeds the available GPRs, so they are spilled in the prologue and reloaded from the stack on every iteration (the sketch below illustrates what the loop has effectively been transformed into).
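
A minimal C-level sketch of that transformation (illustrative only, with hypothetical variable names; only the first two of the fourteen load/store pairs are written out):

#include <riscv_vector.h>

/* Illustrative sketch only: roughly what GCSE/loop-invariant motion turns
   the loop into.  All the in+k / out+k bases become loop invariants, there
   are not enough GPRs to keep them all live, and the reloads show up as the
   lw a3,N(sp) instructions inside .L3 in the -O3 listing above.  */
void f_hoisted (void * restrict in, void * restrict out, int l, int n, int m)
{
  int vl = 32;
  char *in1 = (char *) in + 1, *out1 = (char *) out + 1;
  char *in2 = (char *) in + 2, *out2 = (char *) out + 2;
  /* ... likewise up to in14/out14, 28 invariant addresses in total ... */
  for (int i = 0; i < n; i++)
    {
      vint8mf8_t v1 = __riscv_vle8_v_i8mf8 (in1 + i, vl);
      __riscv_vse8_v_i8mf8 (out1 + i, v1, vl);
      vint8mf8_t v2 = __riscv_vle8_v_i8mf8 (in2 + i, vl);
      __riscv_vse8_v_i8mf8 (out2 + i, v2, vl);
      /* ... */
    }
}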

However, if we compile with -O3 -fno-gcse -fno-schedule-insns -fno-move-loop-invariants,

the ASM is much better and matches what LLVM produces:
f:
	ble	a3,zero,.L1
	li	a5,0
	vsetvli	zero,a4,e8,mf8,ta,ma
.L3:
	addi	a2,a0,1
	add	a2,a2,a5
	vle8.v	v24,0(a2)
	addi	a2,a1,1
	add	a2,a2,a5
	vse8.v	v24,0(a2)
	addi	a2,a0,2
	add	a2,a2,a5
	vle8.v	v24,0(a2)
	addi	a2,a1,2
	add	a2,a2,a5
	vse8.v	v24,0(a2)
	addi	a2,a0,3
	add	a2,a2,a5
	vle8.v	v24,0(a2)
	addi	a2,a1,3
	add	a2,a2,a5
	vse8.v	v24,0(a2)
	addi	a2,a0,4
	add	a2,a2,a5
	vle8.v	v24,0(a2)
	addi	a2,a1,4
	add	a2,a2,a5
	vse8.v	v24,0(a2)
	addi	a2,a0,5
	add	a2,a2,a5
	vle8.v	v24,0(a2)
	addi	a2,a1,5
	add	a2,a2,a5
	vse8.v	v24,0(a2)
	addi	a2,a0,6
	add	a2,a2,a5
	vle8.v	v24,0(a2)
	addi	a2,a1,6
	add	a2,a2,a5
	vse8.v	v24,0(a2)
	addi	a2,a0,7
	add	a2,a2,a5
	vle8.v	v24,0(a2)
	addi	a2,a1,7
	add	a2,a2,a5
	vse8.v	v24,0(a2)
	addi	a2,a0,8
	add	a2,a2,a5
	vle8.v	v24,0(a2)
	addi	a2,a1,8
	add	a2,a2,a5
	vse8.v	v24,0(a2)
	addi	a2,a0,9
	add	a2,a2,a5
	vle8.v	v24,0(a2)
	addi	a2,a1,9
	add	a2,a2,a5
	vse8.v	v24,0(a2)
	addi	a2,a0,10
	add	a2,a2,a5
	vle8.v	v24,0(a2)
	addi	a2,a1,10
	add	a2,a2,a5
	vse8.v	v24,0(a2)
	addi	a2,a0,11
	add	a2,a2,a5
	vle8.v	v24,0(a2)
	addi	a2,a1,11
	add	a2,a2,a5
	vse8.v	v24,0(a2)
	addi	a2,a0,12
	add	a2,a2,a5
	vle8.v	v24,0(a2)
	addi	a2,a1,12
	add	a2,a2,a5
	vse8.v	v24,0(a2)
	addi	a2,a0,13
	add	a2,a2,a5
	vle8.v	v24,0(a2)
	addi	a2,a1,13
	add	a2,a2,a5
	vse8.v	v24,0(a2)
	addi	a2,a0,14
	add	a2,a2,a5
	vle8.v	v24,0(a2)
	addi	a2,a1,14
	add	a2,a2,a5
	vse8.v	v24,0(a2)
	addi	a5,a5,1
	bne	a3,a5,.L3
.L1:
	ret

Currently, the RVV support does not have any cost model; I am not sure whether this behaviour is related to that. I may need someone to help me fix it. Thanks.
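
For what it is worth, the RTL passes involved here (GCSE, loop-invariant motion, scheduling) consult the target's RTL cost hooks and register-pressure information rather than a vectorizer cost model, so one place such tuning would plug in is TARGET_RTX_COSTS. The fragment below is only a sketch of that hook's shape, not existing riscv.cc code; the function name and the particular costing decision are made up for illustration:

/* Sketch only: hypothetical rtx_costs hook fragment.  The idea would be to
   tell the RTL optimizers that a reg + small-constant address computation
   is a single cheap, rematerializable addi, so keeping 28 of them live
   across the loop is not worth the register pressure.  */
static bool
example_rtx_costs (rtx x, machine_mode mode, int outer_code,
		   int opno, int *total, bool speed)
{
  if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
    {
      *total = COSTS_N_INSNS (1);	/* one addi */
      return true;			/* cost fully computed */
    }
  return false;				/* fall back to generic costing */
}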
Comment 1 JuzheZhong 2023-01-16 02:23:38 UTC
(In reply to JuzheZhong from comment #0)
> [...]
> 
> However, if we try -O3 -fno-gcse -fno-schedule-insns
> -fno-move-loop-invariants
> 
> ASM is much better, same as LLVM:
> f:
> 	ble	a3,zero,.L1
> 	li	a5,0
> .L3:
>       li a4,32
> 	vsetvli	zero,a4,e8,mf8,ta,ma
> 	addi	a2,a0,1
> 	add	a2,a2,a5
> 	vle8.v	v24,0(a2)
> 	addi	a2,a1,1
> 	add	a2,a2,a5
> 	vse8.v	v24,0(a2)
> 	addi	a2,a0,2
> 	add	a2,a2,a5
> 	vle8.v	v24,0(a2)
> 	addi	a2,a1,2
> 	add	a2,a2,a5
> 	vse8.v	v24,0(a2)
> 	addi	a2,a0,3
> 	add	a2,a2,a5
> 	vle8.v	v24,0(a2)
> 	addi	a2,a1,3
> 	add	a2,a2,a5
> 	vse8.v	v24,0(a2)
> 	addi	a2,a0,4
> 	add	a2,a2,a5
> 	vle8.v	v24,0(a2)
> 	addi	a2,a1,4
> 	add	a2,a2,a5
> 	vse8.v	v24,0(a2)
> 	addi	a2,a0,5
> 	add	a2,a2,a5
> 	vle8.v	v24,0(a2)
> 	addi	a2,a1,5
> 	add	a2,a2,a5
> 	vse8.v	v24,0(a2)
> 	addi	a2,a0,6
> 	add	a2,a2,a5
> 	vle8.v	v24,0(a2)
> 	addi	a2,a1,6
> 	add	a2,a2,a5
> 	vse8.v	v24,0(a2)
> 	addi	a2,a0,7
> 	add	a2,a2,a5
> 	vle8.v	v24,0(a2)
> 	addi	a2,a1,7
> 	add	a2,a2,a5
> 	vse8.v	v24,0(a2)
> 	addi	a2,a0,8
> 	add	a2,a2,a5
> 	vle8.v	v24,0(a2)
> 	addi	a2,a1,8
> 	add	a2,a2,a5
> 	vse8.v	v24,0(a2)
> 	addi	a2,a0,9
> 	add	a2,a2,a5
> 	vle8.v	v24,0(a2)
> 	addi	a2,a1,9
> 	add	a2,a2,a5
> 	vse8.v	v24,0(a2)
> 	addi	a2,a0,10
> 	add	a2,a2,a5
> 	vle8.v	v24,0(a2)
> 	addi	a2,a1,10
> 	add	a2,a2,a5
> 	vse8.v	v24,0(a2)
> 	addi	a2,a0,11
> 	add	a2,a2,a5
> 	vle8.v	v24,0(a2)
> 	addi	a2,a1,11
> 	add	a2,a2,a5
> 	vse8.v	v24,0(a2)
> 	addi	a2,a0,12
> 	add	a2,a2,a5
> 	vle8.v	v24,0(a2)
> 	addi	a2,a1,12
> 	add	a2,a2,a5
> 	vse8.v	v24,0(a2)
> 	addi	a2,a0,13
> 	add	a2,a2,a5
> 	vle8.v	v24,0(a2)
> 	addi	a2,a1,13
> 	add	a2,a2,a5
> 	vse8.v	v24,0(a2)
> 	addi	a2,a0,14
> 	add	a2,a2,a5
> 	vle8.v	v24,0(a2)
> 	addi	a2,a1,14
> 	add	a2,a2,a5
> 	vse8.v	v24,0(a2)
> 	addi	a5,a5,1
> 	bne	a3,a5,.L3
> .L1:
> 	ret


If we disable hoisting, the code is better overall. However, it also means the li a4,32 is not hoisted outside the loop, so the VSETVL pass cannot hoist the vsetvli instruction outside the loop either.
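
For comparison, the shape we would like (a hand-written sketch, not compiler output) keeps the scalar address code inside the loop but still hoists both the vl constant and the vsetvli:

f:
	ble	a3,zero,.L1
	li	a5,0
	li	a4,32
	vsetvli	zero,a4,e8,mf8,ta,ma
.L3:
	# ...the 14 vle8.v/vse8.v pairs with their addi/add address code...
	addi	a5,a5,1
	bne	a3,a5,.L3
.L1:
	ret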
Comment 2 Andrew Pinski 2023-01-16 02:49:44 UTC
-fsched-pressure is definitely something to look into for RISC-V to improve pre-RA scheduling.
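As a sketch of what that could look like, this is the pattern some backends use in their option-override hook to turn on pressure-aware scheduling by default (it is not existing riscv.cc code, and the exact macro spelling may differ between GCC versions):

/* Sketch: enable register-pressure-aware pre-RA scheduling unless the
   user explicitly passed -fsched-pressure / -fno-sched-pressure.  */
if (!OPTION_SET_P (flag_sched_pressure))
  flag_sched_pressure = 1;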
Comment 3 Robin Dapp 2023-08-24 09:28:59 UTC
I played around a bit with the scheduling model and the pressure-aware scheduling.  -fsched-pressure alone does not seem to help because the problem is indeed the latency of the vector loads and stores.  The scheduler will try to keep dependent loads and stores apart (for the number of cycles specified) and then, after realizing there is nothing to put in between, lump everything together at the end of the sequence.  That is a well-known but unfortunate property of scheduling.

I will need to think of something, but this is not resolved for now.
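
One way to look at what the pressure-aware scheduler is doing with these latencies is to read the first scheduling pass dump, along these lines (compiler name and -march/-mabi strings assumed, as before):

riscv32-unknown-elf-gcc -march=rv32gcv -mabi=ilp32d -O3 \
    -fsched-pressure -fdump-rtl-sched1 -S bug.c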
Comment 4 JuzheZhong 2023-12-21 08:24:49 UTC
This issue is fixed when we use -mtune=sifive-u74, so it is no longer a problem.