In this example GCC generates 4 permutations for foo (while none is required). On the positive side the code for bar (which is a more realistic use case) seems optimal. float x[1024]; float y[1024]; float z[1024]; void foo() { for (int i=0; i<512; ++i) x[1023-i] += y[1023-i]*z[512-i]; } void bar() { for (int i=0; i<512; ++i) x[1023-i] += y[i]*z[i+512]; } c++ -Ofast -march=haswell -S revloop.cc; cat revloop.s __Z3foov: LFB0: vmovdqa LC0(%rip), %ymm2 xorl %eax, %eax leaq 4064+_x(%rip), %rdx leaq 4064+_y(%rip), %rsi leaq 2020+_z(%rip), %rcx .align 4,0x90 L2: vpermd (%rdx,%rax), %ymm2, %ymm0 vpermd (%rcx,%rax), %ymm2, %ymm1 vpermd (%rsi,%rax), %ymm2, %ymm3 vfmadd231ps %ymm1, %ymm3, %ymm0 vpermd %ymm0, %ymm2, %ymm0 vmovaps %ymm0, (%rdx,%rax) subq $32, %rax cmpq $-2048, %rax jne L2 vzeroupper ret LFE0: .section __TEXT,__text_cold,regular,pure_instructions LCOLDE1: .text LHOTE1: .section __TEXT,__text_cold,regular,pure_instructions LCOLDB2: .text LHOTB2: .align 4,0x90 .globl __Z3barv __Z3barv: LFB1: vmovdqa LC0(%rip), %ymm1 leaq 2048+_z(%rip), %rdx leaq _y(%rip), %rcx leaq 4064+_x(%rip), %rax leaq 4096+_z(%rip), %rsi .align 4,0x90 L6: vmovaps (%rdx), %ymm2 addq $32, %rdx vpermd (%rax), %ymm1, %ymm0 addq $32, %rcx vfmadd231ps -32(%rcx), %ymm2, %ymm0 subq $32, %rax vpermd %ymm0, %ymm1, %ymm0 vmovaps %ymm0, 32(%rax) cmpq %rsi, %rdx jne L6 vzeroupper ret LFE1:
If I write it "reverse" void foo2() { for (int i=511; i>=0; --i) x[1023-i] += y[1023-i]*z[512-i]; } it's OK __Z4foo2v: LFB1: leaq 2048+_x(%rip), %rdx xorl %eax, %eax leaq 4+_z(%rip), %rsi leaq 2048+_y(%rip), %rcx .align 4,0x90 L6: vmovaps (%rdx,%rax), %ymm1 vmovups (%rsi,%rax), %ymm0 vfmadd132ps (%rcx,%rax), %ymm1, %ymm0 vmovaps %ymm0, (%rdx,%rax) addq $32, %rax cmpq $2048, %rax jne L6 vzeroupper ret
Confirmed. We fail to detect that all DRs are accessed "reverse", which is the case where we can drop the permutes. We also fail to reverse the positive vectors if they happen to be fewer in number: float x[1024]; float y[1024]; float z[1024]; void foo() { for (int i=0; i<512; ++i) x[i] += y[1023-i]*z[512-i]; } produces .L2: vpermd (%rdx), %ymm1, %ymm0 subq $32, %rdx vpermd (%rcx), %ymm1, %ymm2 addq $32, %rax vfmadd213ps -32(%rax), %ymm2, %ymm0 subq $32, %rcx vmovaps %ymm0, -32(%rax) cmpq $z-28, %rdx jne .L2 instead of permuting the result before storing it.
Possibly easier is the case of a reduction, where permutations are clearly irrelevant. int f(int*arr,int size){ int sum=0; for(int i = 0; i < size; i++){ sum += arr[size-1-i]; } return sum; } We still have a VEC_PERM_EXPR in the hot loop before accumulating. (by the way, we accumulate in a variable of type "vector(4) int", while I would expect "vector(4) unsigned int" for overflow reasons)
*** Bug 112892 has been marked as a duplicate of this bug. ***
*** Bug 115819 has been marked as a duplicate of this bug. ***
Just the simple: ``` void foo (int *__restrict A, int n) { for (int i = n; i > 0; --i) { A[i] += 1; } } ``` Produces the double PERM here. From PR 115819.
*** Bug 116337 has been marked as a duplicate of this bug. ***
*** Bug 79934 has been marked as a duplicate of this bug. ***