This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug tree-optimization/55723] New: SLP vectorization vs loop: SLP more efficient!
- From: "vincenzo.innocente at cern dot ch" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Mon, 17 Dec 2012 18:57:54 +0000
- Subject: [Bug tree-optimization/55723] New: SLP vectorization vs loop: SLP more efficient!
- Auto-submitted: auto-generated
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55723
Bug #: 55723
Summary: SLP vectorization vs loop: SLP more efficient!
Classification: Unclassified
Product: gcc
Version: 4.8.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
AssignedTo: unassigned@gcc.gnu.org
ReportedBy: vincenzo.innocente@cern.ch
in the following code, basic-block vectorization seems to be more efficient
than standard loop vectorization (I measure 20% better)
Is the loop vectorization computing the polynomial twice?
gcc version 4.8.0 20121215 (experimental) [trunk revision 194522] (GCC)
cat AtanT.cc;
// 128-bit SIMD vector of four floats (GCC vector-extension type):
// arithmetic, comparisons and ?: apply element-wise to all four lanes.
typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;
template<typename Float>
inline
// Polynomial approximation of arctangent (Cephes-style atanf).
// Works both for scalar float and for GCC vector types such as
// float32x4_t, because comparison, arithmetic and ?: are all
// element-wise on vector operands.
//
// For t above tan(pi/8) = 0.41421356..., the argument is reduced via
//   atan(t) = pi/4 + atan((t - 1)/(t + 1))
// so the odd polynomial only has to cover the small reduced range;
// the pi/4 is added back at the end under the same condition.
//
// NOTE(review): fix — the original had a second `return ret;` after the
// conditional return, which was unreachable dead code; removed.
Float atan(Float t) {
constexpr float PIO4F = 0.7853981633974483096f;
// Argument reduction: z = (t-1)/(t+1) where t > tan(pi/8), else z = t.
Float z= (t > 0.4142135623730950f) ? (t-1.0f)/(t+1.0f) : t;
Float z2 = z * z;
// Odd polynomial in z, evaluated in Horner form on the reduced range.
Float ret =
((( 8.05374449538e-2f * z2
- 1.38776856032E-1f) * z2
+ 1.99777106478E-1f) * z2
- 3.33329491539E-1f) * z2 * z
+ z;
// Undo the reduction: add pi/4 back where the reduced branch was taken.
return ( t > 0.4142135623730950f ) ? ret+PIO4F : ret;
}
// Test data for the two code paths (both cover 4096 floats total):
// va/vb: 1024 four-lane vectors, consumed by computeV (SLP / basic-block path).
float32x4_t va[1024];
float32x4_t vb[1024];
// a/b: plain scalar arrays, consumed by computeL (loop-vectorizer path).
float a[4*1024];
float b[4*1024];
// Element-wise atan over the explicit SIMD vectors: vb[k] = atan(va[k]).
// Each call already operates on a whole float32x4_t, so the compiler only
// needs basic-block (SLP) vectorization here, not loop vectorization.
void computeV() {
int k = 0;
while (k != 1024) {
vb[k] = atan(va[k]);
++k;
}
}
// Element-wise atan over the plain scalar arrays: b[k] = atan(a[k]).
// This path relies on the loop vectorizer to form SIMD code.
// (The author had a commented-out `inline` on the original.)
void computeL() {
for (int k = 0; k != 4 * 1024; ++k) {
b[k] = atan(a[k]);
}
}
Vincenzos-MacBook-Pro:floatPrec innocent$ c++ -std=c++11 -Ofast -march=corei7
-S AtanT.cc; cat AtanT.s
# computeV, SLP-vectorized: one polynomial chain per iteration, with the
# reduced/unreduced operand selected up front and the pi/4 fixup blended
# in at the end. (Darwin gas; quoted compiler output, code unmodified.)
.text
.align 4,0x90
.globl __Z8computeVv
__Z8computeVv:
LFB1:
# Hoist loop-invariant constants into registers (LC0..LC6 presumably the
# 0.41421356 threshold, 1.0, pi/4 and the polynomial coefficients —
# the constant pool is not shown in this excerpt).
movaps LC1(%rip), %xmm4
leaq _va(%rip), %rcx
xorl %eax, %eax
movaps LC0(%rip), %xmm10
leaq _vb(%rip), %rdx
movaps LC2(%rip), %xmm9
movaps LC3(%rip), %xmm8
movaps LC4(%rip), %xmm7
movaps LC5(%rip), %xmm6
movaps LC6(%rip), %xmm5
.align 4,0x90
L3:
# t = va[i]; compute t+1 (xmm3) and t-1 (xmm2).
movaps (%rcx,%rax), %xmm1
movaps %xmm1, %xmm3
movaps %xmm1, %xmm2
addps %xmm4, %xmm3
subps %xmm4, %xmm2
# 1/(t+1) via rcpps estimate + one Newton-Raphson refinement
# (r' = 2r - x*r*r); fast-math (-Ofast) permits this.
rcpps %xmm3, %xmm0
mulps %xmm0, %xmm3
mulps %xmm0, %xmm3
addps %xmm0, %xmm0
subps %xmm3, %xmm0
movaps %xmm1, %xmm3
mulps %xmm0, %xmm2
# Mask = (LC0 < t); select z = (t-1)/(t+1) or z = t per lane.
movaps %xmm10, %xmm0
cmpltps %xmm1, %xmm0
blendvps %xmm0, %xmm2, %xmm3
# Single Horner chain on z2 = z*z — the polynomial is evaluated once.
movaps %xmm3, %xmm2
mulps %xmm3, %xmm2
movaps %xmm2, %xmm1
mulps %xmm9, %xmm1
subps %xmm8, %xmm1
mulps %xmm2, %xmm1
addps %xmm7, %xmm1
mulps %xmm2, %xmm1
subps %xmm6, %xmm1
mulps %xmm2, %xmm1
addps %xmm4, %xmm1
mulps %xmm3, %xmm1
# Blend ret+pi/4 vs ret using the same mask, then store vb[i].
movaps %xmm1, %xmm2
addps %xmm5, %xmm2
blendvps %xmm0, %xmm2, %xmm1
movaps %xmm1, (%rdx,%rax)
# Advance 16 bytes (one float32x4_t); 1024 * 16 = 16384.
addq $16, %rax
cmpq $16384, %rax
jne L3
rep; ret
LFE1:
# computeL, loop-vectorized. Key observation (answers the reporter's
# question): the Horner polynomial is evaluated TWICE per iteration —
# once on z2 (reduced argument, xmm4) and once on t2 (xmm3), using the
# same coefficient registers xmm9/xmm8/xmm7/xmm6 — and the two results
# are blended at the end. The SLP version above evaluates it only once,
# which plausibly accounts for the measured ~20% difference.
# (Quoted compiler output, code unmodified.)
.align 4,0x90
.globl __Z8computeLv
__Z8computeLv:
LFB2:
# Loop-invariant constants (threshold, 1.0, pi/4, coefficients).
movaps LC1(%rip), %xmm5
leaq _a(%rip), %rcx
xorl %eax, %eax
movaps LC0(%rip), %xmm11
leaq _b(%rip), %rdx
movaps LC2(%rip), %xmm9
movaps LC7(%rip), %xmm8
movaps LC4(%rip), %xmm7
movaps LC8(%rip), %xmm6
movaps LC6(%rip), %xmm10
.align 4,0x90
L7:
# t = a[i..i+3]; t+1 in xmm3, t-1 in xmm1.
movaps (%rcx,%rax), %xmm0
movaps %xmm0, %xmm3
movaps %xmm0, %xmm1
addps %xmm5, %xmm3
subps %xmm5, %xmm1
# 1/(t+1): rcpps + one Newton-Raphson refinement step.
rcpps %xmm3, %xmm2
mulps %xmm2, %xmm3
mulps %xmm2, %xmm3
addps %xmm2, %xmm2
subps %xmm3, %xmm2
# t2 = t*t (xmm3) — kept for the SECOND polynomial evaluation below.
movaps %xmm0, %xmm3
mulps %xmm0, %xmm3
# z = (t-1)/(t+1) (xmm1), z2 = z*z (xmm4).
mulps %xmm2, %xmm1
movaps %xmm1, %xmm4
mulps %xmm1, %xmm4
# Horner chain #1: polynomial on z2 (reduced branch).
movaps %xmm4, %xmm2
mulps %xmm9, %xmm2
addps %xmm8, %xmm2
mulps %xmm4, %xmm2
addps %xmm7, %xmm2
mulps %xmm4, %xmm2
addps %xmm6, %xmm2
mulps %xmm4, %xmm2
# Mask = (LC0 < t), interleaved with finishing chain #1 (+ pi/4).
movaps %xmm11, %xmm4
cmpltps %xmm0, %xmm4
addps %xmm5, %xmm2
mulps %xmm1, %xmm2
# Horner chain #2: the SAME coefficients applied to t2 (direct branch).
movaps %xmm3, %xmm1
mulps %xmm9, %xmm1
addps %xmm10, %xmm2
addps %xmm8, %xmm1
mulps %xmm3, %xmm1
addps %xmm7, %xmm1
mulps %xmm3, %xmm1
addps %xmm6, %xmm1
mulps %xmm3, %xmm1
addps %xmm5, %xmm1
mulps %xmm0, %xmm1
# Blend the two fully-computed results per lane, store b[i..i+3].
movaps %xmm4, %xmm0
blendvps %xmm0, %xmm2, %xmm1
movaps %xmm1, (%rdx,%rax)
# 4096 floats * 4 bytes = 16384 bytes.
addq $16, %rax
cmpq $16384, %rax
jne L7
rep; ret