In this example GCC generates 4 permutations for foo (while none is required). On the positive side the code for bar (which is a more realistic use case) seems optimal. float x[1024]; float y[1024]; float z[1024]; void foo() { for (int i=0; i<512; ++i) x[1023-i] += y[1023-i]*z[512-i]; } void bar() { for (int i=0; i<512; ++i) x[1023-i] += y[i]*z[i+512]; } c++ -Ofast -march=haswell -S revloop.cc; cat revloop.s __Z3foov: LFB0: vmovdqa LC0(%rip), %ymm2 xorl %eax, %eax leaq 4064+_x(%rip), %rdx leaq 4064+_y(%rip), %rsi leaq 2020+_z(%rip), %rcx .align 4,0x90 L2: vpermd (%rdx,%rax), %ymm2, %ymm0 vpermd (%rcx,%rax), %ymm2, %ymm1 vpermd (%rsi,%rax), %ymm2, %ymm3 vfmadd231ps %ymm1, %ymm3, %ymm0 vpermd %ymm0, %ymm2, %ymm0 vmovaps %ymm0, (%rdx,%rax) subq $32, %rax cmpq $-2048, %rax jne L2 vzeroupper ret LFE0: .section __TEXT,__text_cold,regular,pure_instructions LCOLDE1: .text LHOTE1: .section __TEXT,__text_cold,regular,pure_instructions LCOLDB2: .text LHOTB2: .align 4,0x90 .globl __Z3barv __Z3barv: LFB1: vmovdqa LC0(%rip), %ymm1 leaq 2048+_z(%rip), %rdx leaq _y(%rip), %rcx leaq 4064+_x(%rip), %rax leaq 4096+_z(%rip), %rsi .align 4,0x90 L6: vmovaps (%rdx), %ymm2 addq $32, %rdx vpermd (%rax), %ymm1, %ymm0 addq $32, %rcx vfmadd231ps -32(%rcx), %ymm2, %ymm0 subq $32, %rax vpermd %ymm0, %ymm1, %ymm0 vmovaps %ymm0, 32(%rax) cmpq %rsi, %rdx jne L6 vzeroupper ret LFE1:
If I write it "reverse" void foo2() { for (int i=511; i>=0; --i) x[1023-i] += y[1023-i]*z[512-i]; } it's OK __Z4foo2v: LFB1: leaq 2048+_x(%rip), %rdx xorl %eax, %eax leaq 4+_z(%rip), %rsi leaq 2048+_y(%rip), %rcx .align 4,0x90 L6: vmovaps (%rdx,%rax), %ymm1 vmovups (%rsi,%rax), %ymm0 vfmadd132ps (%rcx,%rax), %ymm1, %ymm0 vmovaps %ymm0, (%rdx,%rax) addq $32, %rax cmpq $2048, %rax jne L6 vzeroupper ret
Confirmed. We fail to detect that all DRs are accessed "reverse", which is the case where we can drop the permutes. We also fail to reverse the positive vectors if they happen to be fewer in number: float x[1024]; float y[1024]; float z[1024]; void foo() { for (int i=0; i<512; ++i) x[i] += y[1023-i]*z[512-i]; } produces .L2: vpermd (%rdx), %ymm1, %ymm0 subq $32, %rdx vpermd (%rcx), %ymm1, %ymm2 addq $32, %rax vfmadd213ps -32(%rax), %ymm2, %ymm0 subq $32, %rcx vmovaps %ymm0, -32(%rax) cmpq $z-28, %rdx jne .L2 instead of permuting the result before storing it.
Possibly easier is the case of a reduction, where permutations are clearly irrelevant. int f(int*arr,int size){ int sum=0; for(int i = 0; i < size; i++){ sum += arr[size-1-i]; } return sum; } We still have a VEC_PERM_EXPR in the hot loop before accumulating. (by the way, we accumulate in a variable of type "vector(4) int", while I would expect "vector(4) unsigned int" for overflow reasons)
*** Bug 112892 has been marked as a duplicate of this bug. ***
*** Bug 115819 has been marked as a duplicate of this bug. ***
Just the simple: ``` void foo (int *__restrict A, int n) { for (int i = n; i > 0; --i) { A[i] += 1; } } ``` Produces the double PERM here. From PR 115819.
*** Bug 116337 has been marked as a duplicate of this bug. ***
*** Bug 79934 has been marked as a duplicate of this bug. ***