In the following code, basic-block vectorization seems to be more efficient than standard loop vectorization (I measure it as 20% better). Is the loop vectorization computing the polynomial twice? gcc version 4.8.0 20121215 (experimental) [trunk revision 194522] (GCC) cat AtanT.cc; typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t; template<typename Float> inline Float atan(Float t) { constexpr float PIO4F = 0.7853981633974483096f; Float z= (t > 0.4142135623730950f) ? (t-1.0f)/(t+1.0f) : t; Float z2 = z * z; Float ret = ((( 8.05374449538e-2f * z2 - 1.38776856032E-1f) * z2 + 1.99777106478E-1f) * z2 - 3.33329491539E-1f) * z2 * z + z; // move back in place return ( t > 0.4142135623730950f ) ? ret+PIO4F : ret; return ret; } float32x4_t va[1024]; float32x4_t vb[1024]; float a[4*1024]; float b[4*1024]; void computeV() { for (int i=0;i!=1024;++i) vb[i]=atan(va[i]); } //inline void computeL() { for (int i=0;i!=4*1024;++i) b[i]=atan(a[i]); } Vincenzos-MacBook-Pro:floatPrec innocent$ c++ -std=c++11 -Ofast -march=corei7 -S AtanT.cc; cat AtanT.s .text .align 4,0x90 .globl __Z8computeVv __Z8computeVv: LFB1: movaps LC1(%rip), %xmm4 leaq _va(%rip), %rcx xorl %eax, %eax movaps LC0(%rip), %xmm10 leaq _vb(%rip), %rdx movaps LC2(%rip), %xmm9 movaps LC3(%rip), %xmm8 movaps LC4(%rip), %xmm7 movaps LC5(%rip), %xmm6 movaps LC6(%rip), %xmm5 .align 4,0x90 L3: movaps (%rcx,%rax), %xmm1 movaps %xmm1, %xmm3 movaps %xmm1, %xmm2 addps %xmm4, %xmm3 subps %xmm4, %xmm2 rcpps %xmm3, %xmm0 mulps %xmm0, %xmm3 mulps %xmm0, %xmm3 addps %xmm0, %xmm0 subps %xmm3, %xmm0 movaps %xmm1, %xmm3 mulps %xmm0, %xmm2 movaps %xmm10, %xmm0 cmpltps %xmm1, %xmm0 blendvps %xmm0, %xmm2, %xmm3 movaps %xmm3, %xmm2 mulps %xmm3, %xmm2 movaps %xmm2, %xmm1 mulps %xmm9, %xmm1 subps %xmm8, %xmm1 mulps %xmm2, %xmm1 addps %xmm7, %xmm1 mulps %xmm2, %xmm1 subps %xmm6, %xmm1 mulps %xmm2, %xmm1 addps %xmm4, %xmm1 mulps %xmm3, %xmm1 movaps %xmm1, %xmm2 addps %xmm5, %xmm2 blendvps %xmm0, %xmm2, %xmm1 movaps %xmm1, (%rdx,%rax) 
addq $16, %rax cmpq $16384, %rax jne L3 rep; ret LFE1: .align 4,0x90 .globl __Z8computeLv __Z8computeLv: LFB2: movaps LC1(%rip), %xmm5 leaq _a(%rip), %rcx xorl %eax, %eax movaps LC0(%rip), %xmm11 leaq _b(%rip), %rdx movaps LC2(%rip), %xmm9 movaps LC7(%rip), %xmm8 movaps LC4(%rip), %xmm7 movaps LC8(%rip), %xmm6 movaps LC6(%rip), %xmm10 .align 4,0x90 L7: movaps (%rcx,%rax), %xmm0 movaps %xmm0, %xmm3 movaps %xmm0, %xmm1 addps %xmm5, %xmm3 subps %xmm5, %xmm1 rcpps %xmm3, %xmm2 mulps %xmm2, %xmm3 mulps %xmm2, %xmm3 addps %xmm2, %xmm2 subps %xmm3, %xmm2 movaps %xmm0, %xmm3 mulps %xmm0, %xmm3 mulps %xmm2, %xmm1 movaps %xmm1, %xmm4 mulps %xmm1, %xmm4 movaps %xmm4, %xmm2 mulps %xmm9, %xmm2 addps %xmm8, %xmm2 mulps %xmm4, %xmm2 addps %xmm7, %xmm2 mulps %xmm4, %xmm2 addps %xmm6, %xmm2 mulps %xmm4, %xmm2 movaps %xmm11, %xmm4 cmpltps %xmm0, %xmm4 addps %xmm5, %xmm2 mulps %xmm1, %xmm2 movaps %xmm3, %xmm1 mulps %xmm9, %xmm1 addps %xmm10, %xmm2 addps %xmm8, %xmm1 mulps %xmm3, %xmm1 addps %xmm7, %xmm1 mulps %xmm3, %xmm1 addps %xmm6, %xmm1 mulps %xmm3, %xmm1 addps %xmm5, %xmm1 mulps %xmm0, %xmm1 movaps %xmm4, %xmm0 blendvps %xmm0, %xmm2, %xmm1 movaps %xmm1, (%rdx,%rax) addq $16, %rax cmpq $16384, %rax jne L7 rep; ret
Moving the second blend before the polynomial makes the two loops produce almost identical code. This is not always possible, though. Is this a bug in the loop optimizer? template<typename Float> inline Float atan(Float t) { constexpr float PIO4F = 0.7853981633974483096f; constexpr Float zero = {0}; Float z= (t > 0.4142135623730950f) ? (t-1.0f)/(t+1.0f) : t; Float ret = ( t > 0.4142135623730950f ) ? zero+PIO4F : zero; Float z2 = z * z; ret += ((( 8.05374449538e-2f * z2 - 1.38776856032E-1f) * z2 + 1.99777106478E-1f) * z2 - 3.33329491539E-1f) * z2 * z + z; return ret; }
It seems that in the presence of identical conditions the vectorizer prefers to compute two "full" branches and do just one blend. This is not always the most efficient choice, as the benchmark in comment 1 demonstrates. Another simple example: bar gets two rsqrtps and one blend, while foo gets one rsqrtps and two blends. #include<cmath> float a[1024]; float b[1024]; void bar(){ for (int i=0;i!=1024;++i) { auto z = a[i]; if (a[i] > 3.14f) z-=1.f; b[i] = 1.f/std::sqrt(z); if (a[i] > 3.14f) b[i]-=1.f; } } void foo(){ for (int i=0;i!=1024;++i) { auto z = a[i]; if (a[i] > 3.14f) z-=1.f; b[i] = 1.f/std::sqrt(z); if (a[i] > 1.f) b[i]-=1.f; } } c++ -std=c++11 -Ofast -march=corei7 -S twoif.cc -ftree-vectorizer-verbose=1 -ftree-loop-if-convert-stores; cat twoif.s | c++filt bar(): LFB221: movaps LC0(%rip), %xmm6 leaq signed char(%rip), %rax movaps LC1(%rip), %xmm5 leaq bool(%rip), %rdx movaps LC2(%rip), %xmm4 leaq 4096+signed char(%rip), %rcx movaps LC3(%rip), %xmm7 .align 4,0x90 L3: movaps (%rax), %xmm0 addq $16, %rax addq $16, %rdx rsqrtps %xmm0, %xmm3 movaps %xmm0, %xmm2 subps %xmm6, %xmm2 rsqrtps %xmm2, %xmm1 mulps %xmm1, %xmm2 mulps %xmm1, %xmm2 mulps %xmm4, %xmm1 addps %xmm5, %xmm2 mulps %xmm1, %xmm2 movaps %xmm3, %xmm1 mulps %xmm0, %xmm1 subps %xmm6, %xmm2 mulps %xmm3, %xmm1 mulps %xmm4, %xmm3 addps %xmm5, %xmm1 mulps %xmm3, %xmm1 movaps %xmm7, %xmm3 cmpltps %xmm0, %xmm3 movaps %xmm3, %xmm0 blendvps %xmm0, %xmm2, %xmm1 movaps %xmm1, -16(%rdx) cmpq %rcx, %rax jne L3 rep; ret LFE221: .align 4,0x90 .globl foo() foo(): LFB222: movaps LC3(%rip), %xmm7 leaq signed char(%rip), %rax movaps LC0(%rip), %xmm3 leaq bool(%rip), %rdx movaps LC1(%rip), %xmm6 leaq 4096+signed char(%rip), %rcx movaps LC2(%rip), %xmm5 .align 4,0x90 L7: movaps (%rax), %xmm2 movaps %xmm7, %xmm0 addq $16, %rax addq $16, %rdx movaps %xmm2, %xmm1 cmpltps %xmm2, %xmm0 movaps %xmm2, %xmm4 subps %xmm3, %xmm1 blendvps %xmm0, %xmm1, %xmm4 rsqrtps %xmm4, %xmm0 movaps %xmm4, %xmm1 mulps %xmm0, %xmm1 mulps %xmm0, %xmm1 mulps 
%xmm5, %xmm0 addps %xmm6, %xmm1 mulps %xmm0, %xmm1 movaps %xmm3, %xmm0 cmpltps %xmm2, %xmm0 movaps %xmm1, %xmm4 subps %xmm3, %xmm4 blendvps %xmm0, %xmm4, %xmm1 movaps %xmm1, -16(%rdx) cmpq %rcx, %rax jne L7 rep; ret