SLP is working nicely in 4.7; the most needed missing bit is the ability to vectorize a dot product (using, for instance, _mm_dp_ps for SSE4). Any chance of getting this any time soon? A small test is here: cat dot.cc struct V { float x,y,z,w; }; V a; V b; float dot() { return a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w; } V sum() { V v=a; v.x+=b.x; v.y+=b.y; v.z+=b.z; v.w+=b.w; return v; } c++ -Ofast -c dot.cc -march=corei7 otool -X -t -v -V dot.o | c++filt dot(): movss _b+0x00000004(%rip),%xmm0 movss _b(%rip),%xmm1 mulss _a+0x00000004(%rip),%xmm0 mulss _a(%rip),%xmm1 addss %xmm1,%xmm0 movss _b+0x00000008(%rip),%xmm1 mulss _a+0x00000008(%rip),%xmm1 addss %xmm1,%xmm0 movss _b+0x0000000c(%rip),%xmm1 mulss _a+0x0000000c(%rip),%xmm1 addss %xmm1,%xmm0 ret nopl (%rax) sum(): movaps _b(%rip),%xmm0 addps _a(%rip),%xmm0 movaps %xmm0,0xc8(%rsp) movq 0xc8(%rsp),%rax movaps %xmm0,0xe8(%rsp) movq _a(%rsp),%xmm1 movd %rax,%xmm0 ret
I think we do not perform pattern detection in SLP mode. Ira?
(In reply to comment #1) > I think we do not perform pattern detection in SLP mode. Ira? Right. I actually had a patch for pattern detection in SLP ready a couple of hours after Stage 1 was over. But this patch doesn't handle dot product (and widen-sum), since these patterns look for a reduction, i.e., a loop, so it will need some additional work.
In 4.8, using typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t; typedef double __attribute__( ( vector_size( 32 ) ) ) float64x4_t; the scalar product works well IF WRITTEN as a loop! In the following, dot_product2 produces exactly the code I would have expected to be emitted by "dot_product". It would also be nice to have the reduction of a single vector emit a horizontal sum… float dot_product(float32x4_t x, float32x4_t y) { float32x4_t res = x*y; float ret=0; for (int i=0;i!=4;++i) ret+=res[i]; return ret; } float dot_product2(float32x4_t x, float32x4_t y) { float ret=0; for (int i=0;i!=4;++i) ret+=x[i]*y[i]; return ret; } double dot_product(float64x4_t x, float64x4_t y) { float64x4_t res = x*y; double ret=0; for (int i=0;i!=4;++i) ret+=res[i]; return ret; } double dot_product2(float64x4_t x, float64x4_t y) { double ret=0; for (int i=0;i!=4;++i) ret+=x[i]*y[i]; return ret; } c++ -Ofast -ftree-vectorizer-verbose=2 -S cross.cc -march=corei7-avx; cat cross.s | c++filt dot_product(float __vector, float __vector): LFB2: vmulps %xmm1, %xmm0, %xmm1 vmovaps %xmm1, %xmm0 vshufps $85, %xmm1, %xmm1, %xmm2 vaddss %xmm0, %xmm2, %xmm0 vunpckhps %xmm1, %xmm1, %xmm2 vshufps $255, %xmm1, %xmm1, %xmm1 vaddss %xmm2, %xmm0, %xmm0 vaddss %xmm1, %xmm0, %xmm0 ret LFE2: .align 4,0x90 .globl dot_product2(float __vector, float __vector) dot_product2(float __vector, float __vector): LFB3: vmulps %xmm0, %xmm1, %xmm1 vhaddps %xmm1, %xmm1, %xmm0 vhaddps %xmm0, %xmm0, %xmm0 ret LFE3: .align 4,0x90 .globl dot_product(double __vector, double __vector) dot_product(double __vector, double __vector): LFB4: vmulpd %ymm1, %ymm0, %ymm1 vmovapd %xmm1, %xmm0 vextractf128 $0x1, %ymm1, %xmm1 vhaddpd %xmm0, %xmm0, %xmm0 vmovapd %xmm1, %xmm2 vunpckhpd %xmm1, %xmm1, %xmm1 vaddsd %xmm2, %xmm0, %xmm0 vaddsd %xmm1, %xmm0, %xmm0 vzeroupper ret LFE4: .align 4,0x90 .globl dot_product2(double __vector, double __vector) dot_product2(double __vector, double __vector): LFB5: vmulpd %ymm0, %ymm1, %ymm1 
vhaddpd %ymm1, %ymm1, %ymm1 pushq %rbp LCFI0: movq %rsp, %rbp LCFI1: andq $-32, %rsp addq $16, %rsp vperm2f128 $1, %ymm1, %ymm1, %ymm0 vaddpd %ymm0, %ymm1, %ymm1 vmovapd %xmm1, %xmm0 vzeroupper leave LCFI2: ret