55723 – loop vectorization inefficient in presence of multiple identical conditions

Bug 55723 - loop vectorization inefficient in presence of multiple identical conditions

Summary: loop vectorization inefficient in presence of multiple identical conditions

Status:	UNCONFIRMED

Alias:	None

Product:	gcc
Classification:	Unclassified
Component:	tree-optimization (show other bugs)
Version:	4.8.0

Importance:	P3 normal
Target Milestone:	---
Assignee:	Not yet assigned to anyone

URL:
Keywords:	missed-optimization

Depends on:
Blocks:	vectorizer
	Show dependency tree / graph

Reported:	2012-12-17 18:57 UTC by vincenzo Innocente
Modified:	2021-10-01 02:51 UTC (History)
CC List:	0 users

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed:

Attachments
Add an attachment (proposed patch, testcase, etc.)

Note You need to log in before you can comment on or make changes to this bug.

Description vincenzo Innocente 2012-12-17 18:57:54 UTC

In the following code, basic-block vectorization seems to be more efficient than standard loop vectorization (I measure it to be 20% better).
Is the loop vectorizer computing the polynomial twice?


gcc version 4.8.0 20121215 (experimental) [trunk revision 194522] (GCC) 



cat AtanT.cc;
// GCC vector-extension type: a 16-byte vector of float (four lanes).
typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;

// Polynomial approximation of the arctangent of t. Float may be a scalar
// float or a GCC vector type such as float32x4_t (element-wise evaluation).
//
// Inputs above 0.4142135623730950f (numerically tan(pi/8)) are mapped to
// (t-1)/(t+1) before the polynomial is evaluated, and PIO4F (numerically
// pi/4) is added back afterwards for exactly those inputs. The same
// condition is therefore tested twice, once before and once after the
// polynomial; that repeated condition is the subject of this report.
template<typename Float>
inline
Float atan(Float t) {
  constexpr float PIO4F = 0.7853981633974483096f;

  // First select: reduce the argument when t is above the threshold.
  Float z= (t > 0.4142135623730950f) ? (t-1.0f)/(t+1.0f) : t;
  
  // Odd polynomial in z, evaluated in nested (Horner) form on z2 = z*z.
  Float z2 = z * z;
  Float ret =
    ((( 8.05374449538e-2f * z2
	- 1.38776856032E-1f) * z2
      + 1.99777106478E-1f) * z2
     - 3.33329491539E-1f) * z2 * z
    + z;
  
  // Second select (same condition as the first): move back in place.
  return ( t > 0.4142135623730950f ) ? ret+PIO4F : ret;
}

// Input (va) and output (vb) for computeV: 1024 explicit four-float vectors.
float32x4_t va[1024];
float32x4_t vb[1024];

// Input (a) and output (b) for computeL: the same number of floats
// (4*1024), stored as plain scalars.
float a[4*1024];
float b[4*1024];

// Applies atan to each of the 1024 float32x4_t elements of va and stores the
// results in vb. The code is already written in terms of explicit vectors;
// this is the variant the report measures as faster.
void computeV() {
  for (int i=0;i!=1024;++i)
    vb[i]=atan(va[i]);
}

//inline
// Applies atan to each of the 4*1024 scalar floats of a and stores the
// results in b. This scalar loop is the one left to the compiler's loop
// vectorizer, and the one the report finds to be slower.
void computeL() {
  for (int i=0;i!=4*1024;++i)
    b[i]=atan(a[i]);
}
Vincenzos-MacBook-Pro:floatPrec innocent$ c++ -std=c++11 -Ofast -march=corei7 -S AtanT.cc; cat AtanT.s
	.text
	.align 4,0x90
	.globl __Z8computeVv
__Z8computeVv:
LFB1:
	movaps	LC1(%rip), %xmm4
	leaq	_va(%rip), %rcx
	xorl	%eax, %eax
	movaps	LC0(%rip), %xmm10
	leaq	_vb(%rip), %rdx
	movaps	LC2(%rip), %xmm9
	movaps	LC3(%rip), %xmm8
	movaps	LC4(%rip), %xmm7
	movaps	LC5(%rip), %xmm6
	movaps	LC6(%rip), %xmm5
	.align 4,0x90
L3:
	movaps	(%rcx,%rax), %xmm1
	movaps	%xmm1, %xmm3
	movaps	%xmm1, %xmm2
	addps	%xmm4, %xmm3
	subps	%xmm4, %xmm2
	rcpps	%xmm3, %xmm0
	mulps	%xmm0, %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm0, %xmm0
	subps	%xmm3, %xmm0
	movaps	%xmm1, %xmm3
	mulps	%xmm0, %xmm2
	movaps	%xmm10, %xmm0
	cmpltps	%xmm1, %xmm0
	blendvps	%xmm0, %xmm2, %xmm3
	movaps	%xmm3, %xmm2
	mulps	%xmm3, %xmm2
	movaps	%xmm2, %xmm1
	mulps	%xmm9, %xmm1
	subps	%xmm8, %xmm1
	mulps	%xmm2, %xmm1
	addps	%xmm7, %xmm1
	mulps	%xmm2, %xmm1
	subps	%xmm6, %xmm1
	mulps	%xmm2, %xmm1
	addps	%xmm4, %xmm1
	mulps	%xmm3, %xmm1
	movaps	%xmm1, %xmm2
	addps	%xmm5, %xmm2
	blendvps	%xmm0, %xmm2, %xmm1
	movaps	%xmm1, (%rdx,%rax)
	addq	$16, %rax
	cmpq	$16384, %rax
	jne	L3
	rep; ret
LFE1:
	.align 4,0x90
	.globl __Z8computeLv
__Z8computeLv:
LFB2:
	movaps	LC1(%rip), %xmm5
	leaq	_a(%rip), %rcx
	xorl	%eax, %eax
	movaps	LC0(%rip), %xmm11
	leaq	_b(%rip), %rdx
	movaps	LC2(%rip), %xmm9
	movaps	LC7(%rip), %xmm8
	movaps	LC4(%rip), %xmm7
	movaps	LC8(%rip), %xmm6
	movaps	LC6(%rip), %xmm10
	.align 4,0x90
L7:
	movaps	(%rcx,%rax), %xmm0
	movaps	%xmm0, %xmm3
	movaps	%xmm0, %xmm1
	addps	%xmm5, %xmm3
	subps	%xmm5, %xmm1
	rcpps	%xmm3, %xmm2
	mulps	%xmm2, %xmm3
	mulps	%xmm2, %xmm3
	addps	%xmm2, %xmm2
	subps	%xmm3, %xmm2
	movaps	%xmm0, %xmm3
	mulps	%xmm0, %xmm3
	mulps	%xmm2, %xmm1
	movaps	%xmm1, %xmm4
	mulps	%xmm1, %xmm4
	movaps	%xmm4, %xmm2
	mulps	%xmm9, %xmm2
	addps	%xmm8, %xmm2
	mulps	%xmm4, %xmm2
	addps	%xmm7, %xmm2
	mulps	%xmm4, %xmm2
	addps	%xmm6, %xmm2
	mulps	%xmm4, %xmm2
	movaps	%xmm11, %xmm4
	cmpltps	%xmm0, %xmm4
	addps	%xmm5, %xmm2
	mulps	%xmm1, %xmm2
	movaps	%xmm3, %xmm1
	mulps	%xmm9, %xmm1
	addps	%xmm10, %xmm2
	addps	%xmm8, %xmm1
	mulps	%xmm3, %xmm1
	addps	%xmm7, %xmm1
	mulps	%xmm3, %xmm1
	addps	%xmm6, %xmm1
	mulps	%xmm3, %xmm1
	addps	%xmm5, %xmm1
	mulps	%xmm0, %xmm1
	movaps	%xmm4, %xmm0
	blendvps	%xmm0, %xmm2, %xmm1
	movaps	%xmm1, (%rdx,%rax)
	addq	$16, %rax
	cmpq	$16384, %rax
	jne	L7
	rep; ret

Comment 1 vincenzo Innocente 2012-12-17 19:25:37 UTC

Moving the second blend before the polynomial makes the two loops produce almost identical code.
This is not always possible, though.
Is this a bug in the loop optimizer?

// Variant of atan in which the conditional pi/4 offset is selected before the
// polynomial is evaluated rather than after it. Both selections still test
// the same condition (t > 0.4142135623730950f).
template<typename Float>
inline
Float atan(Float t) {
  constexpr float PIO4F = 0.7853981633974483096f;
  constexpr Float zero = {0};
  // First select: use (t-1)/(t+1) when t is above the threshold, else t.
  Float z= (t > 0.4142135623730950f) ? (t-1.0f)/(t+1.0f) : t;
  // Second select, hoisted ahead of the polynomial: ret starts as PIO4F for
  // inputs above the threshold and as zero otherwise.
  Float ret = ( t > 0.4142135623730950f ) ? zero+PIO4F : zero;

  // The polynomial in z is accumulated onto the pre-selected offset.
  Float z2 = z * z;
  ret +=
    ((( 8.05374449538e-2f * z2
	- 1.38776856032E-1f) * z2
      + 1.99777106478E-1f) * z2
     - 3.33329491539E-1f) * z2 * z
    + z;

  return ret;
}

Comment 2 vincenzo Innocente 2012-12-20 15:39:13 UTC

It seems that, in the presence of identical conditions, the vectorizer prefers to compute two "full" branches
and do just one blend.
This is not always the most efficient choice, as the benchmark in comment 1 demonstrates.

Another simple example:
for bar, the generated code contains two rsqrtps and one blend;
for foo, it contains one rsqrtps and two blends.

#include<cmath>
float a[1024];
float b[1024];


// For each of the 1024 elements: conditionally subtract 1 from the input,
// store its reciprocal square root in b[i], then conditionally subtract 1
// from the result. Both `if`s test the SAME condition (a[i] > 3.14f).
void bar(){
  for (int i=0;i!=1024;++i) {
    auto z = a[i];
    if (a[i] > 3.14f) z-=1.f;
    b[i] = 1.f/std::sqrt(z);
    if (a[i] > 3.14f) b[i]-=1.f;
  }
}

// Same computation as bar, except that the two `if`s test DIFFERENT
// conditions: a[i] > 3.14f before the reciprocal square root and
// a[i] > 1.f after it.
void foo(){
  for (int i=0;i!=1024;++i) {
    auto z = a[i];
    if (a[i] > 3.14f) z-=1.f;
    b[i] = 1.f/std::sqrt(z);
    if (a[i] > 1.f) b[i]-=1.f;
  }
}

c++ -std=c++11 -Ofast -march=corei7 -S twoif.cc -ftree-vectorizer-verbose=1  -ftree-loop-if-convert-stores; cat twoif.s | c++filt


bar():
LFB221:
	movaps	LC0(%rip), %xmm6
	leaq	signed char(%rip), %rax
	movaps	LC1(%rip), %xmm5
	leaq	bool(%rip), %rdx
	movaps	LC2(%rip), %xmm4
	leaq	4096+signed char(%rip), %rcx
	movaps	LC3(%rip), %xmm7
	.align 4,0x90
L3:
	movaps	(%rax), %xmm0
	addq	$16, %rax
	addq	$16, %rdx
	rsqrtps	%xmm0, %xmm3
	movaps	%xmm0, %xmm2
	subps	%xmm6, %xmm2
	rsqrtps	%xmm2, %xmm1
	mulps	%xmm1, %xmm2
	mulps	%xmm1, %xmm2
	mulps	%xmm4, %xmm1
	addps	%xmm5, %xmm2
	mulps	%xmm1, %xmm2
	movaps	%xmm3, %xmm1
	mulps	%xmm0, %xmm1
	subps	%xmm6, %xmm2
	mulps	%xmm3, %xmm1
	mulps	%xmm4, %xmm3
	addps	%xmm5, %xmm1
	mulps	%xmm3, %xmm1
	movaps	%xmm7, %xmm3
	cmpltps	%xmm0, %xmm3
	movaps	%xmm3, %xmm0
	blendvps	%xmm0, %xmm2, %xmm1
	movaps	%xmm1, -16(%rdx)
	cmpq	%rcx, %rax
	jne	L3
	rep; ret
LFE221:
	.align 4,0x90
	.globl foo()
foo():
LFB222:
	movaps	LC3(%rip), %xmm7
	leaq	signed char(%rip), %rax
	movaps	LC0(%rip), %xmm3
	leaq	bool(%rip), %rdx
	movaps	LC1(%rip), %xmm6
	leaq	4096+signed char(%rip), %rcx
	movaps	LC2(%rip), %xmm5
	.align 4,0x90
L7:
	movaps	(%rax), %xmm2
	movaps	%xmm7, %xmm0
	addq	$16, %rax
	addq	$16, %rdx
	movaps	%xmm2, %xmm1
	cmpltps	%xmm2, %xmm0
	movaps	%xmm2, %xmm4
	subps	%xmm3, %xmm1
	blendvps	%xmm0, %xmm1, %xmm4
	rsqrtps	%xmm4, %xmm0
	movaps	%xmm4, %xmm1
	mulps	%xmm0, %xmm1
	mulps	%xmm0, %xmm1
	mulps	%xmm5, %xmm0
	addps	%xmm6, %xmm1
	mulps	%xmm0, %xmm1
	movaps	%xmm3, %xmm0
	cmpltps	%xmm2, %xmm0
	movaps	%xmm1, %xmm4
	subps	%xmm3, %xmm4
	blendvps	%xmm0, %xmm4, %xmm1
	movaps	%xmm1, -16(%rdx)
	cmpq	%rcx, %rax
	jne	L7
	rep; ret