This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug tree-optimization/55723] New: SLP vectorization vs loop: SLP more efficient!
- From: "vincenzo.innocente at cern dot ch" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Mon, 17 Dec 2012 18:57:54 +0000
- Subject: [Bug tree-optimization/55723] New: SLP vectorization vs loop: SLP more efficient!
- Auto-submitted: auto-generated
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55723
Bug #: 55723
Summary: SLP vectorization vs loop: SLP more efficient!
Classification: Unclassified
Product: gcc
Version: 4.8.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
AssignedTo: unassigned@gcc.gnu.org
ReportedBy: vincenzo.innocente@cern.ch
in the following code, basic-block vectorization seems to be more efficient
than standard loop vectorization (I measure 20% better)
Is the loop vectorization computing the polynomial twice?
gcc version 4.8.0 20121215 (experimental) [trunk revision 194522] (GCC)
cat AtanT.cc;
// 128-bit SIMD vector of four floats (GCC vector-extension type):
// arithmetic, comparisons and ?: apply element-wise to all four lanes.
typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;
template<typename Float>
inline
// Polynomial approximation of arctangent (Cephes-style atanf).
// Works both for scalar float and for GCC vector types such as
// float32x4_t, because comparison, arithmetic and ?: are all
// element-wise on vector operands.
//
// For t above tan(pi/8) = 0.41421356..., the argument is reduced via
//   atan(t) = pi/4 + atan((t - 1)/(t + 1))
// so the odd polynomial only has to cover the small reduced range;
// the pi/4 is added back at the end under the same condition.
//
// NOTE(review): fix — the original had a second `return ret;` after the
// conditional return, which was unreachable dead code; removed.
Float atan(Float t) {
constexpr float PIO4F = 0.7853981633974483096f;
// Argument reduction: z = (t-1)/(t+1) where t > tan(pi/8), else z = t.
Float z= (t > 0.4142135623730950f) ? (t-1.0f)/(t+1.0f) : t;
Float z2 = z * z;
// Odd polynomial in z, evaluated in Horner form on the reduced range.
Float ret =
((( 8.05374449538e-2f * z2
- 1.38776856032E-1f) * z2
+ 1.99777106478E-1f) * z2
- 3.33329491539E-1f) * z2 * z
+ z;
// Undo the reduction: add pi/4 back where the reduced branch was taken.
return ( t > 0.4142135623730950f ) ? ret+PIO4F : ret;
}
// Test data for the two code paths (both cover 4096 floats total):
// va/vb: 1024 four-lane vectors, consumed by computeV (SLP / basic-block path).
float32x4_t va[1024];
float32x4_t vb[1024];
// a/b: plain scalar arrays, consumed by computeL (loop-vectorizer path).
float a[4*1024];
float b[4*1024];
// Element-wise atan over the explicit SIMD vectors: vb[k] = atan(va[k]).
// Each call already operates on a whole float32x4_t, so the compiler only
// needs basic-block (SLP) vectorization here, not loop vectorization.
void computeV() {
int k = 0;
while (k != 1024) {
vb[k] = atan(va[k]);
++k;
}
}
// Element-wise atan over the plain scalar arrays: b[k] = atan(a[k]).
// This path relies on the loop vectorizer to form SIMD code.
// (The author had a commented-out `inline` on the original.)
void computeL() {
for (int k = 0; k != 4 * 1024; ++k) {
b[k] = atan(a[k]);
}
}
Vincenzos-MacBook-Pro:floatPrec innocent$ c++ -std=c++11 -Ofast -march=corei7
-S AtanT.cc; cat AtanT.s
# computeV, SLP-vectorized: one polynomial chain per iteration, with the
# reduced/unreduced operand selected up front and the pi/4 fixup blended
# in at the end. (Darwin gas; quoted compiler output, code unmodified.)
.text
.align 4,0x90
.globl __Z8computeVv
__Z8computeVv:
LFB1:
# Hoist loop-invariant constants into registers (LC0..LC6 presumably the
# 0.41421356 threshold, 1.0, pi/4 and the polynomial coefficients —
# the constant pool is not shown in this excerpt).
movaps LC1(%rip), %xmm4
leaq _va(%rip), %rcx
xorl %eax, %eax
movaps LC0(%rip), %xmm10
leaq _vb(%rip), %rdx
movaps LC2(%rip), %xmm9
movaps LC3(%rip), %xmm8
movaps LC4(%rip), %xmm7
movaps LC5(%rip), %xmm6
movaps LC6(%rip), %xmm5
.align 4,0x90
L3:
# t = va[i]; compute t+1 (xmm3) and t-1 (xmm2).
movaps (%rcx,%rax), %xmm1
movaps %xmm1, %xmm3
movaps %xmm1, %xmm2
addps %xmm4, %xmm3
subps %xmm4, %xmm2
# 1/(t+1) via rcpps estimate + one Newton-Raphson refinement
# (r' = 2r - x*r*r); fast-math (-Ofast) permits this.
rcpps %xmm3, %xmm0
mulps %xmm0, %xmm3
mulps %xmm0, %xmm3
addps %xmm0, %xmm0
subps %xmm3, %xmm0
movaps %xmm1, %xmm3
mulps %xmm0, %xmm2
# Mask = (LC0 < t); select z = (t-1)/(t+1) or z = t per lane.
movaps %xmm10, %xmm0
cmpltps %xmm1, %xmm0
blendvps %xmm0, %xmm2, %xmm3
# Single Horner chain on z2 = z*z — the polynomial is evaluated once.
movaps %xmm3, %xmm2
mulps %xmm3, %xmm2
movaps %xmm2, %xmm1
mulps %xmm9, %xmm1
subps %xmm8, %xmm1
mulps %xmm2, %xmm1
addps %xmm7, %xmm1
mulps %xmm2, %xmm1
subps %xmm6, %xmm1
mulps %xmm2, %xmm1
addps %xmm4, %xmm1
mulps %xmm3, %xmm1
# Blend ret+pi/4 vs ret using the same mask, then store vb[i].
movaps %xmm1, %xmm2
addps %xmm5, %xmm2
blendvps %xmm0, %xmm2, %xmm1
movaps %xmm1, (%rdx,%rax)
# Advance 16 bytes (one float32x4_t); 1024 * 16 = 16384.
addq $16, %rax
cmpq $16384, %rax
jne L3
rep; ret
LFE1:
# computeL, loop-vectorized. Key observation (answers the reporter's
# question): the Horner polynomial is evaluated TWICE per iteration —
# once on z2 (reduced argument, xmm4) and once on t2 (xmm3), using the
# same coefficient registers xmm9/xmm8/xmm7/xmm6 — and the two results
# are blended at the end. The SLP version above evaluates it only once,
# which plausibly accounts for the measured ~20% difference.
# (Quoted compiler output, code unmodified.)
.align 4,0x90
.globl __Z8computeLv
__Z8computeLv:
LFB2:
# Loop-invariant constants (threshold, 1.0, pi/4, coefficients).
movaps LC1(%rip), %xmm5
leaq _a(%rip), %rcx
xorl %eax, %eax
movaps LC0(%rip), %xmm11
leaq _b(%rip), %rdx
movaps LC2(%rip), %xmm9
movaps LC7(%rip), %xmm8
movaps LC4(%rip), %xmm7
movaps LC8(%rip), %xmm6
movaps LC6(%rip), %xmm10
.align 4,0x90
L7:
# t = a[i..i+3]; t+1 in xmm3, t-1 in xmm1.
movaps (%rcx,%rax), %xmm0
movaps %xmm0, %xmm3
movaps %xmm0, %xmm1
addps %xmm5, %xmm3
subps %xmm5, %xmm1
# 1/(t+1): rcpps + one Newton-Raphson refinement step.
rcpps %xmm3, %xmm2
mulps %xmm2, %xmm3
mulps %xmm2, %xmm3
addps %xmm2, %xmm2
subps %xmm3, %xmm2
# t2 = t*t (xmm3) — kept for the SECOND polynomial evaluation below.
movaps %xmm0, %xmm3
mulps %xmm0, %xmm3
# z = (t-1)/(t+1) (xmm1), z2 = z*z (xmm4).
mulps %xmm2, %xmm1
movaps %xmm1, %xmm4
mulps %xmm1, %xmm4
# Horner chain #1: polynomial on z2 (reduced branch).
movaps %xmm4, %xmm2
mulps %xmm9, %xmm2
addps %xmm8, %xmm2
mulps %xmm4, %xmm2
addps %xmm7, %xmm2
mulps %xmm4, %xmm2
addps %xmm6, %xmm2
mulps %xmm4, %xmm2
# Mask = (LC0 < t), interleaved with finishing chain #1 (+ pi/4).
movaps %xmm11, %xmm4
cmpltps %xmm0, %xmm4
addps %xmm5, %xmm2
mulps %xmm1, %xmm2
# Horner chain #2: the SAME coefficients applied to t2 (direct branch).
movaps %xmm3, %xmm1
mulps %xmm9, %xmm1
addps %xmm10, %xmm2
addps %xmm8, %xmm1
mulps %xmm3, %xmm1
addps %xmm7, %xmm1
mulps %xmm3, %xmm1
addps %xmm6, %xmm1
mulps %xmm3, %xmm1
addps %xmm5, %xmm1
mulps %xmm0, %xmm1
# Blend the two fully-computed results per lane, store b[i..i+3].
movaps %xmm4, %xmm0
blendvps %xmm0, %xmm2, %xmm1
movaps %xmm1, (%rdx,%rax)
# 4096 floats * 4 bytes = 16384 bytes.
addq $16, %rax
cmpq $16384, %rax
jne L7
rep; ret