Bug 51062 - SLP vectorization of dot (inner) product
Summary: SLP vectorization of dot (inner) product
Status: NEW
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization
Version: 4.8.0
Importance: P3 enhancement
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks: vectorizer
Reported: 2011-11-09 14:44 UTC by vincenzo Innocente
Modified: 2021-08-16 05:24 UTC
CC List: 2 users

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed: 2011-11-09 00:00:00


Description vincenzo Innocente 2011-11-09 14:44:44 UTC
SLP is working nicely in 4.7.
The most needed missing bit is the ability to vectorize a dot product (using, for instance, _mm_dp_ps on SSE4).

Any chance to get this any time soon? (A hand-written sketch of the desired code follows the disassembly below.)

Small test here:
cat dot.cc 
struct V {
  float x,y,z,w;
};

V a;
V b;

float dot() {
  return a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;
}

V sum() {
  V v=a;
  v.x+=b.x; v.y+=b.y; v.z+=b.z; v.w+=b.w;
  return v; 
}

c++ -Ofast -c dot.cc -march=corei7
otool -X -t -v -V dot.o | c++filt
dot():
	movss	_b+0x00000004(%rip),%xmm0
	movss	_b(%rip),%xmm1
	mulss	_a+0x00000004(%rip),%xmm0
	mulss	_a(%rip),%xmm1
	addss	%xmm1,%xmm0
	movss	_b+0x00000008(%rip),%xmm1
	mulss	_a+0x00000008(%rip),%xmm1
	addss	%xmm1,%xmm0
	movss	_b+0x0000000c(%rip),%xmm1
	mulss	_a+0x0000000c(%rip),%xmm1
	addss	%xmm1,%xmm0
	ret
	nopl	(%rax)
sum():
	movaps	_b(%rip),%xmm0
	addps	_a(%rip),%xmm0
	movaps	%xmm0,0xc8(%rsp)
	movq	0xc8(%rsp),%rax
	movaps	%xmm0,0xe8(%rsp)
	movq	_a(%rsp),%xmm1
	movd	%rax,%xmm0
	ret
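
For reference, here is what a hand-written SSE4.1 version of dot() could look like (a sketch with a hypothetical helper name, not part of the original report; _mm_loadu_ps is used since struct V carries no 16-byte alignment guarantee):

#include <immintrin.h>

// Sketch: the single-instruction reduction one would like SLP to emit.
// _mm_dp_ps with mask 0xF1 multiplies all four lanes and places the
// sum in lane 0 of the result.
float dot_dpps() {
  __m128 va = _mm_loadu_ps(&a.x);
  __m128 vb = _mm_loadu_ps(&b.x);
  return _mm_cvtss_f32(_mm_dp_ps(va, vb, 0xF1));
}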
Comment 1 Richard Biener 2011-11-09 14:58:42 UTC
I think we do not perform pattern detection in SLP mode.  Ira?
Comment 2 Ira Rosen 2011-11-09 16:13:08 UTC
(In reply to comment #1)
> I think we do not perform pattern detection in SLP mode.  Ira?

Right. I actually had a patch for pattern detection in SLP ready a couple of hours after Stage 1 was over. But that patch doesn't handle dot product (and widen-sum), since these patterns look for a reduction, i.e., a loop, so it will need some additional work.
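
(For context: the loop shape that the existing dot-product pattern matches is a widening multiply accumulated by a loop reduction, as in the following minimal sketch with hypothetical names. The straight-line code in this report has no loop for the pattern matcher to key on.)

// Sketch of the reduction shape the loop vectorizer's dot-product
// pattern recognizes: a widening multiply feeding a sum reduction.
int dot_s16(const short *a, const short *b, int n) {
  int sum = 0;                  // reduction variable
  for (int i = 0; i < n; ++i)
    sum += a[i] * b[i];         // short * short widened and accumulated in int
  return sum;
}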
Comment 3 vincenzo Innocente 2012-11-30 13:53:41 UTC
In 4.8, using
typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;
typedef double __attribute__( ( vector_size( 32 ) ) ) float64x4_t;
the scalar product works well IF WRITTEN as a loop!

In the following, dot_product2 produces exactly the code I would have expected to be emitted by dot_product.
It would also be nice for the reduction of a single vector to emit a horizontal sum (see the sketch at the end)…


float dot_product(float32x4_t x, float32x4_t y) {
  float32x4_t res = x*y;
  float ret=0;
  for (int i=0;i!=4;++i) ret+=res[i];
  return ret;
}

float dot_product2(float32x4_t x, float32x4_t y) {
  float ret=0;
  for (int i=0;i!=4;++i) ret+=x[i]*y[i];
  return ret;
}


double dot_product(float64x4_t x, float64x4_t y) {
  float64x4_t res = x*y;
  double ret=0;
  for (int i=0;i!=4;++i) ret+=res[i];
  return ret;
}

double dot_product2(float64x4_t x, float64x4_t y) {
  double ret=0;
  for (int i=0;i!=4;++i) ret+=x[i]*y[i];
  return ret;
}

c++ -Ofast -ftree-vectorizer-verbose=2 -S cross.cc  -march=corei7-avx; cat cross.s | c++filt
dot_product(float __vector, float __vector):
LFB2:
	vmulps	%xmm1, %xmm0, %xmm1
	vmovaps	%xmm1, %xmm0
	vshufps	$85, %xmm1, %xmm1, %xmm2
	vaddss	%xmm0, %xmm2, %xmm0
	vunpckhps	%xmm1, %xmm1, %xmm2
	vshufps	$255, %xmm1, %xmm1, %xmm1
	vaddss	%xmm2, %xmm0, %xmm0
	vaddss	%xmm1, %xmm0, %xmm0
	ret
LFE2:
	.align 4,0x90
	.globl dot_product2(float __vector, float __vector)
dot_product2(float __vector, float __vector):
LFB3:
	vmulps	%xmm0, %xmm1, %xmm1
	vhaddps	%xmm1, %xmm1, %xmm0
	vhaddps	%xmm0, %xmm0, %xmm0
	ret
LFE3:
	.align 4,0x90
	.globl dot_product(double __vector, double __vector)
dot_product(double __vector, double __vector):
LFB4:
	vmulpd	%ymm1, %ymm0, %ymm1
	vmovapd	%xmm1, %xmm0
	vextractf128	$0x1, %ymm1, %xmm1
	vhaddpd	%xmm0, %xmm0, %xmm0
	vmovapd	%xmm1, %xmm2
	vunpckhpd	%xmm1, %xmm1, %xmm1
	vaddsd	%xmm2, %xmm0, %xmm0
	vaddsd	%xmm1, %xmm0, %xmm0
	vzeroupper
	ret
LFE4:
	.align 4,0x90
	.globl dot_product2(double __vector, double __vector)
dot_product2(double __vector, double __vector):
LFB5:
	vmulpd	%ymm0, %ymm1, %ymm1
	vhaddpd	%ymm1, %ymm1, %ymm1
	pushq	%rbp
LCFI0:
	movq	%rsp, %rbp
LCFI1:
	andq	$-32, %rsp
	addq	$16, %rsp
	vperm2f128	$1, %ymm1, %ymm1, %ymm0
	vaddpd	%ymm0, %ymm1, %ymm1
	vmovapd	%xmm1, %xmm0
	vzeroupper
	leave
LCFI2:
	ret
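
(For reference, the single-vector reduction asked for in comment 3, written in the same vector-extension style; a sketch that one would hope compiles down to a pair of vhaddps on this target:)

typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;

// Sketch: horizontal sum of one vector, written as a loop so the
// vectorizer can treat it as a reduction.
float hsum(float32x4_t x) {
  float ret = 0;
  for (int i = 0; i != 4; ++i)
    ret += x[i];
  return ret;
}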