Bug 63945

Summary: Missing vectorization optimization
Product: gcc Reporter: Wolfgang Bangerth <bangerth>
Component: tree-optimization Assignee: Not yet assigned to anyone <unassigned>
Status: RESOLVED FIXED    
Severity: enhancement CC: witold.baryluk+gcc
Priority: P3 Keywords: missed-optimization
Version: 4.9.1   
Target Milestone: 5.0   
Host: Target:
Build: Known to work:
Known to fail: Last reconfirmed:
Bug Depends on:    
Bug Blocks: 53947    

Description Wolfgang Bangerth 2014-11-19 02:34:21 UTC
(Reporting this for Bruno Turcksin <bruno.turcksin@gmail.com>.)

The loop in the following testcase cannot be vectorized; we get these diagnostics:

note: not vectorized: latch block not empty.
note: bad loop form.

The reason is that val is a member of the class, is evaluated in the if condition, and is used in the loop that should be vectorized. If all three of these conditions are satisfied, the loop cannot be vectorized.

...............................

#include <vector>

class TEST  // Minimal reproducer class for the missed-vectorization report.
{
  public :
    TEST();       // Initializes val.
    void test();  // Contains the loop reported as not vectorized.

  private :
    const double val;  // Member read both in the `if` condition and inside the simd loop of test().
};

TEST::TEST()
  :
  val(2.)  // val is fixed at 2.0, so the `val<100.` guard in test() holds at run time.
{}

void TEST::test()  // Fills a and b with 1.0, then computes c[i] = val*a[i]+b[i] under a guard on val.
{
  const unsigned int n(1000);
  std::vector<double> a(n);
  std::vector<double> b(n);
  std::vector<double> c(n);

  for (unsigned int i=0; i<n; ++i)  // Initialization loop for the two input vectors.
  {
    a[i] = 1.;
    b[i] = 1.;
  }

  if (val<100.)  // The member val is evaluated here ...
  {  // ... and read again in the loop below; the report names this combination as the trigger.
#pragma omp simd
    for (unsigned int i=0; i<n; ++i)  // Loop reported as "not vectorized: latch block not empty".
      c[i] = val*a[i]+b[i];  // c is never read after this loop within the function.
  }
}
int main ()  // Driver: constructs one TEST object and calls test() once.
{
  TEST a;
  a.test();
}
...................................
Comment 1 Witold Baryluk 2019-10-16 18:08:39 UTC
It does vectorize for me on gcc 9.2.1:

-march=skylake-avx512

aa.cpp:34:29: optimized: loop vectorized using 32 byte vectors
aa.cpp:25:27: optimized: loop vectorized using 32 byte vectors


  if (val<100.)
    1279:       c5 fb 10 0b             vmovsd (%rbx),%xmm1
    127d:       c5 fb 10 05 8b 0d 00    vmovsd 0xd8b(%rip),%xmm0        # 2010 <_IO_stdin_used+0x10>
    1284:       00 
    1285:       c5 f9 2f c1             vcomisd %xmm1,%xmm0
    1289:       76 2b                   jbe    12b6 <_ZN4TEST4testEv+0xc6>
    128b:       c4 e2 7d 19 c9          vbroadcastsd %xmm1,%ymm1
    1290:       31 c0                   xor    %eax,%eax
    1292:       66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
      c[i] = val*a[i]+b[i];
    1298:       c4 c1 7d 10 04 04       vmovupd (%r12,%rax,1),%ymm0
    129e:       c4 c2 f5 a8 44 05 00    vfmadd213pd 0x0(%r13,%rax,1),%ymm1,%ymm0
    12a5:       c5 fd 11 04 07          vmovupd %ymm0,(%rdi,%rax,1)
    for (unsigned int i=0; i<n; ++i)
    12aa:       48 83 c0 20             add    $0x20,%rax
    12ae:       48 3d 40 1f 00 00       cmp    $0x1f40,%rax
    12b4:       75 e2                   jne    1298 <_ZN4TEST4testEv+0xa8>
        ::operator delete(__p);
    12b6:       c5 f8 77                vzeroupper 


Similarly:

-march=knm

aa.cpp:34:29: optimized: loop vectorized using 64 byte vectors
aa.cpp:25:27: optimized: loop vectorized using 64 byte vectors

  if (val<100.)
    15bc:       31 c0                   xor    %eax,%eax
    15be:       66 90                   xchg   %ax,%ax
      c[i] = val*a[i]+b[i];
    15c0:       62 f1 fd 48 28 04 01    vmovapd (%rcx,%rax,1),%zmm0
    15c7:       62 f2 ed 48 a8 04 06    vfmadd213pd (%rsi,%rax,1),%zmm2,%zmm0
    15ce:       62 d1 fd 48 11 04 01    vmovupd %zmm0,(%r9,%rax,1)
    for (unsigned int i=0; i<n; ++i)
    15d5:       48 8d 40 40             lea    0x40(%rax),%rax   // (64 bytes, aka 8 * sizeof(double))
    15d9:       48 39 d0                cmp    %rdx,%rax
    15dc:       75 e2                   jne    15c0 <_ZN4TEST4testEv+0x3e0>

(plus a lot of handling for unaligned stack).

-march=znver2

aa.cpp:34:29: optimized: loop vectorized using 32 byte vectors
aa.cpp:25:27: optimized: loop vectorized using 32 byte vectors

  if (val<100.)
    1279:       c5 fb 10 0b             vmovsd (%rbx),%xmm1
    127d:       c5 fb 10 05 8b 0d 00    vmovsd 0xd8b(%rip),%xmm0        # 2010 <_IO_stdin_used+0x10>
    1284:       00 
    1285:       c5 f9 2f c1             vcomisd %xmm1,%xmm0
    1289:       76 33                   jbe    12be <_ZN4TEST4testEv+0xce>
    128b:       c4 e2 7d 19 c9          vbroadcastsd %xmm1,%ymm1
    1290:       31 c0                   xor    %eax,%eax
    1292:       66 66 2e 0f 1f 84 00    data16 nopw %cs:0x0(%rax,%rax,1)
    1299:       00 00 00 00 
    129d:       0f 1f 00                nopl   (%rax)
      c[i] = val*a[i]+b[i];
    12a0:       c4 c1 7d 10 04 04       vmovupd (%r12,%rax,1),%ymm0
    12a6:       c4 c2 f5 a8 44 05 00    vfmadd213pd 0x0(%r13,%rax,1),%ymm1,%ymm0
    12ad:       c5 fd 11 04 07          vmovupd %ymm0,(%rdi,%rax,1)
    for (unsigned int i=0; i<n; ++i)
    12b2:       48 83 c0 20             add    $0x20,%rax
    12b6:       48 3d 40 1f 00 00       cmp    $0x1f40,%rax   // 1f40 == 8000 bytes (1000 * sizeof(double))
    12bc:       75 e2                   jne    12a0 <_ZN4TEST4testEv+0xb0>

-march=core2

aa.cpp:34:29: optimized: loop vectorized using 16 byte vectors
aa.cpp:25:27: optimized: loop vectorized using 16 byte vectors

  if (val<100.)
    1276:       f2 0f 10 13             movsd  (%rbx),%xmm2
    127a:       f2 0f 10 05 8e 0d 00    movsd  0xd8e(%rip),%xmm0        # 2010 <_IO_stdin_used+0x10>
    1281:       00 
    1282:       66 0f 2f c2             comisd %xmm2,%xmm0
    1286:       76 40                   jbe    12c8 <_ZN4TEST4testEv+0xd8>
    1288:       31 c0                   xor    %eax,%eax
    128a:       66 0f 14 d2             unpcklpd %xmm2,%xmm2
    128e:       66 90                   xchg   %ax,%ax
      c[i] = val*a[i]+b[i];
    1290:       f3 0f 7e 44 05 00       movq   0x0(%rbp,%rax,1),%xmm0
    1296:       f3 41 0f 7e 0c 04       movq   (%r12,%rax,1),%xmm1
    129c:       66 0f 16 44 05 08       movhpd 0x8(%rbp,%rax,1),%xmm0
    12a2:       66 0f 59 c2             mulpd  %xmm2,%xmm0
    12a6:       66 41 0f 16 4c 04 08    movhpd 0x8(%r12,%rax,1),%xmm1
    12ad:       66 0f 58 c1             addpd  %xmm1,%xmm0
    12b1:       66 0f 13 04 07          movlpd %xmm0,(%rdi,%rax,1)
    12b6:       66 0f 17 44 07 08       movhpd %xmm0,0x8(%rdi,%rax,1)
    for (unsigned int i=0; i<n; ++i)
    12bc:       48 83 c0 10             add    $0x10,%rax
    12c0:       48 3d 40 1f 00 00       cmp    $0x1f40,%rax
    12c6:       75 c8                   jne    1290 <_ZN4TEST4testEv+0xa0>



It all looks pretty optimally vectorized to me.

The code could be made even better by ensuring proper alignment of the std::vector arrays, which might not be aligned at the moment.
Comment 2 Andrew Pinski 2025-09-01 07:02:51 UTC
Starting in GCC 15, the code is fully optimized away.

If you make sure c is used after the loop, then GCC 5 starts to vectorize the code, and it does so decently since GCC 8.