Bug 63945 - Missing vectorization optimization
Summary: Missing vectorization optimization
Status: UNCONFIRMED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 4.9.1
Importance: P3 enhancement
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks: vectorizer
  Show dependency tree / graph
 
Reported: 2014-11-19 02:34 UTC by Wolfgang Bangerth
Modified: 2019-10-16 18:08 UTC (History)
CC List: 1 user (show)

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed:


Attachments

Note: You need to log in before you can comment on or make changes to this bug.
Description Wolfgang Bangerth 2014-11-19 02:34:21 UTC
(Reporting this for Bruno Turcksin <bruno.turcksin@gmail.com>.)

The loop in the following testcase is not vectorized; the vectorizer reports:

note: not vectorized: latch block not empty.
note: bad loop form.

The reason is that val is a member of the class, is evaluated in the if, and is used in the loop that should be vectorized. If these three conditions are satisfied the loop cannot be vectorized.

...............................

#include <vector>

class TEST  // Reproducer class: one const double member that test() reads.
{
  public :
    TEST();       // Defined out of line; initializes val to 2.
    void test();  // Defined out of line; contains the loop this report is about.

  private :
    const double val;  // Read in the `if` condition and inside the simd loop of test().
};

TEST::TEST()  // Default constructor: sets the const member val to 2.0.
  :
  val(2.)
{}

void TEST::test()  // Fills a and b with 1.0; then, if val < 100, computes c[i] = val*a[i] + b[i].
{
  const unsigned int n(1000);  // Size of each vector and trip count of both loops.
  std::vector<double> a(n);
  std::vector<double> b(n);
  std::vector<double> c(n);

  for (unsigned int i=0; i<n; ++i)  // Initialization loop: every element of a and b becomes 1.0.
  {
    a[i] = 1.;
    b[i] = 1.;
  }

  if (val<100.)  // Tests the class member; the report names this test, plus the use of val in the loop below, as what blocks vectorization.
  {
#pragma omp simd
    for (unsigned int i=0; i<n; ++i)  // The loop the report says is not vectorized.
      c[i] = val*a[i]+b[i];
  }
}
int main ()  // Entry point: constructs one TEST object and calls test() on it once.
{
  TEST a;
  a.test();
}
...................................
Comment 1 Witold Baryluk 2019-10-16 18:08:39 UTC
It does vectorize for me on gcc 9.2.1:

-march=skylake-avx512

aa.cpp:34:29: optimized: loop vectorized using 32 byte vectors
aa.cpp:25:27: optimized: loop vectorized using 32 byte vectors


  if (val<100.)
    1279:       c5 fb 10 0b             vmovsd (%rbx),%xmm1
    127d:       c5 fb 10 05 8b 0d 00    vmovsd 0xd8b(%rip),%xmm0        # 2010 <_IO_stdin_used+0x10>
    1284:       00 
    1285:       c5 f9 2f c1             vcomisd %xmm1,%xmm0
    1289:       76 2b                   jbe    12b6 <_ZN4TEST4testEv+0xc6>
    128b:       c4 e2 7d 19 c9          vbroadcastsd %xmm1,%ymm1
    1290:       31 c0                   xor    %eax,%eax
    1292:       66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
      c[i] = val*a[i]+b[i];
    1298:       c4 c1 7d 10 04 04       vmovupd (%r12,%rax,1),%ymm0
    129e:       c4 c2 f5 a8 44 05 00    vfmadd213pd 0x0(%r13,%rax,1),%ymm1,%ymm0
    12a5:       c5 fd 11 04 07          vmovupd %ymm0,(%rdi,%rax,1)
    for (unsigned int i=0; i<n; ++i)
    12aa:       48 83 c0 20             add    $0x20,%rax
    12ae:       48 3d 40 1f 00 00       cmp    $0x1f40,%rax
    12b4:       75 e2                   jne    1298 <_ZN4TEST4testEv+0xa8>
        ::operator delete(__p);
    12b6:       c5 f8 77                vzeroupper 


Similarly:

-march=knm

aa.cpp:34:29: optimized: loop vectorized using 64 byte vectors
aa.cpp:25:27: optimized: loop vectorized using 64 byte vectors

  if (val<100.)
    15bc:       31 c0                   xor    %eax,%eax
    15be:       66 90                   xchg   %ax,%ax
      c[i] = val*a[i]+b[i];
    15c0:       62 f1 fd 48 28 04 01    vmovapd (%rcx,%rax,1),%zmm0
    15c7:       62 f2 ed 48 a8 04 06    vfmadd213pd (%rsi,%rax,1),%zmm2,%zmm0
    15ce:       62 d1 fd 48 11 04 01    vmovupd %zmm0,(%r9,%rax,1)
    for (unsigned int i=0; i<n; ++i)
    15d5:       48 8d 40 40             lea    0x40(%rax),%rax   // (64 bytes, aka 8 * sizeof(double))
    15d9:       48 39 d0                cmp    %rdx,%rax
    15dc:       75 e2                   jne    15c0 <_ZN4TEST4testEv+0x3e0>

(plus a lot of handling for unaligned stack).

-march=znver2

aa.cpp:34:29: optimized: loop vectorized using 32 byte vectors
aa.cpp:25:27: optimized: loop vectorized using 32 byte vectors

  if (val<100.)
    1279:       c5 fb 10 0b             vmovsd (%rbx),%xmm1
    127d:       c5 fb 10 05 8b 0d 00    vmovsd 0xd8b(%rip),%xmm0        # 2010 <_IO_stdin_used+0x10>
    1284:       00 
    1285:       c5 f9 2f c1             vcomisd %xmm1,%xmm0
    1289:       76 33                   jbe    12be <_ZN4TEST4testEv+0xce>
    128b:       c4 e2 7d 19 c9          vbroadcastsd %xmm1,%ymm1
    1290:       31 c0                   xor    %eax,%eax
    1292:       66 66 2e 0f 1f 84 00    data16 nopw %cs:0x0(%rax,%rax,1)
    1299:       00 00 00 00 
    129d:       0f 1f 00                nopl   (%rax)
      c[i] = val*a[i]+b[i];
    12a0:       c4 c1 7d 10 04 04       vmovupd (%r12,%rax,1),%ymm0
    12a6:       c4 c2 f5 a8 44 05 00    vfmadd213pd 0x0(%r13,%rax,1),%ymm1,%ymm0
    12ad:       c5 fd 11 04 07          vmovupd %ymm0,(%rdi,%rax,1)
    for (unsigned int i=0; i<n; ++i)
    12b2:       48 83 c0 20             add    $0x20,%rax
    12b6:       48 3d 40 1f 00 00       cmp    $0x1f40,%rax   // 1f40 == 8000 bytes (1000 * sizeof(double))
    12bc:       75 e2                   jne    12a0 <_ZN4TEST4testEv+0xb0>

-march=core2

aa.cpp:34:29: optimized: loop vectorized using 16 byte vectors
aa.cpp:25:27: optimized: loop vectorized using 16 byte vectors

  if (val<100.)
    1276:       f2 0f 10 13             movsd  (%rbx),%xmm2
    127a:       f2 0f 10 05 8e 0d 00    movsd  0xd8e(%rip),%xmm0        # 2010 <_IO_stdin_used+0x10>
    1281:       00 
    1282:       66 0f 2f c2             comisd %xmm2,%xmm0
    1286:       76 40                   jbe    12c8 <_ZN4TEST4testEv+0xd8>
    1288:       31 c0                   xor    %eax,%eax
    128a:       66 0f 14 d2             unpcklpd %xmm2,%xmm2
    128e:       66 90                   xchg   %ax,%ax
      c[i] = val*a[i]+b[i];
    1290:       f3 0f 7e 44 05 00       movq   0x0(%rbp,%rax,1),%xmm0
    1296:       f3 41 0f 7e 0c 04       movq   (%r12,%rax,1),%xmm1
    129c:       66 0f 16 44 05 08       movhpd 0x8(%rbp,%rax,1),%xmm0
    12a2:       66 0f 59 c2             mulpd  %xmm2,%xmm0
    12a6:       66 41 0f 16 4c 04 08    movhpd 0x8(%r12,%rax,1),%xmm1
    12ad:       66 0f 58 c1             addpd  %xmm1,%xmm0
    12b1:       66 0f 13 04 07          movlpd %xmm0,(%rdi,%rax,1)
    12b6:       66 0f 17 44 07 08       movhpd %xmm0,0x8(%rdi,%rax,1)
    for (unsigned int i=0; i<n; ++i)
    12bc:       48 83 c0 10             add    $0x10,%rax
    12c0:       48 3d 40 1f 00 00       cmp    $0x1f40,%rax
    12c6:       75 c8                   jne    1290 <_ZN4TEST4testEv+0xa0>



It all looks pretty optimally vectorized to me.

The code can be made even better if you ensure proper alignment of the std::vector arrays, which might not be properly aligned at the moment.