(Reporting this for Bruno Turcksin <bruno.turcksin@gmail.com>.) The loop in the following testcase cannot be vectorized; we get the error: note: not vectorized: latch block not empty. note: bad loop form. The reason is that val is a member of the class, is evaluated in the if, and is used in the loop that should be vectorized. If all three of these conditions are satisfied, the loop cannot be vectorized. ............................... #include <vector> class TEST { public : TEST(); void test(); private : const double val; }; TEST::TEST() : val(2.) {} void TEST::test() { const unsigned int n(1000); std::vector<double> a(n); std::vector<double> b(n); std::vector<double> c(n); for (unsigned int i=0; i<n; ++i) { a[i] = 1.; b[i] = 1.; } if (val<100.) { #pragma omp simd for (unsigned int i=0; i<n; ++i) c[i] = val*a[i]+b[i]; } } int main () { TEST a; a.test(); } ...................................
It does vectorize for me on gcc 9.2.1: -march=skylake-avx512 aa.cpp:34:29: optimized: loop vectorized using 32 byte vectors aa.cpp:25:27: optimized: loop vectorized using 32 byte vectors if (val<100.) 1279: c5 fb 10 0b vmovsd (%rbx),%xmm1 127d: c5 fb 10 05 8b 0d 00 vmovsd 0xd8b(%rip),%xmm0 # 2010 <_IO_stdin_used+0x10> 1284: 00 1285: c5 f9 2f c1 vcomisd %xmm1,%xmm0 1289: 76 2b jbe 12b6 <_ZN4TEST4testEv+0xc6> 128b: c4 e2 7d 19 c9 vbroadcastsd %xmm1,%ymm1 1290: 31 c0 xor %eax,%eax 1292: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1) c[i] = val*a[i]+b[i]; 1298: c4 c1 7d 10 04 04 vmovupd (%r12,%rax,1),%ymm0 129e: c4 c2 f5 a8 44 05 00 vfmadd213pd 0x0(%r13,%rax,1),%ymm1,%ymm0 12a5: c5 fd 11 04 07 vmovupd %ymm0,(%rdi,%rax,1) for (unsigned int i=0; i<n; ++i) 12aa: 48 83 c0 20 add $0x20,%rax 12ae: 48 3d 40 1f 00 00 cmp $0x1f40,%rax 12b4: 75 e2 jne 1298 <_ZN4TEST4testEv+0xa8> ::operator delete(__p); 12b6: c5 f8 77 vzeroupper Similarly: -march=knm aa.cpp:34:29: optimized: loop vectorized using 64 byte vectors aa.cpp:25:27: optimized: loop vectorized using 64 byte vectors if (val<100.) 15bc: 31 c0 xor %eax,%eax 15be: 66 90 xchg %ax,%ax c[i] = val*a[i]+b[i]; 15c0: 62 f1 fd 48 28 04 01 vmovapd (%rcx,%rax,1),%zmm0 15c7: 62 f2 ed 48 a8 04 06 vfmadd213pd (%rsi,%rax,1),%zmm2,%zmm0 15ce: 62 d1 fd 48 11 04 01 vmovupd %zmm0,(%r9,%rax,1) for (unsigned int i=0; i<n; ++i) 15d5: 48 8d 40 40 lea 0x40(%rax),%rax // (64 bytes, aka 8 * sizeof(double)) 15d9: 48 39 d0 cmp %rdx,%rax 15dc: 75 e2 jne 15c0 <_ZN4TEST4testEv+0x3e0> (plus a lot of handling for unaligned stack). -march=znver2 aa.cpp:34:29: optimized: loop vectorized using 32 byte vectors aa.cpp:25:27: optimized: loop vectorized using 32 byte vectors if (val<100.) 
1279: c5 fb 10 0b vmovsd (%rbx),%xmm1 127d: c5 fb 10 05 8b 0d 00 vmovsd 0xd8b(%rip),%xmm0 # 2010 <_IO_stdin_used+0x10> 1284: 00 1285: c5 f9 2f c1 vcomisd %xmm1,%xmm0 1289: 76 33 jbe 12be <_ZN4TEST4testEv+0xce> 128b: c4 e2 7d 19 c9 vbroadcastsd %xmm1,%ymm1 1290: 31 c0 xor %eax,%eax 1292: 66 66 2e 0f 1f 84 00 data16 nopw %cs:0x0(%rax,%rax,1) 1299: 00 00 00 00 129d: 0f 1f 00 nopl (%rax) c[i] = val*a[i]+b[i]; 12a0: c4 c1 7d 10 04 04 vmovupd (%r12,%rax,1),%ymm0 12a6: c4 c2 f5 a8 44 05 00 vfmadd213pd 0x0(%r13,%rax,1),%ymm1,%ymm0 12ad: c5 fd 11 04 07 vmovupd %ymm0,(%rdi,%rax,1) for (unsigned int i=0; i<n; ++i) 12b2: 48 83 c0 20 add $0x20,%rax 12b6: 48 3d 40 1f 00 00 cmp $0x1f40,%rax // 1f40 == 8000 bytes (1000 * sizeof(double)) 12bc: 75 e2 jne 12a0 <_ZN4TEST4testEv+0xb0> -march=core2 aa.cpp:34:29: optimized: loop vectorized using 16 byte vectors aa.cpp:25:27: optimized: loop vectorized using 16 byte vectors if (val<100.) 1276: f2 0f 10 13 movsd (%rbx),%xmm2 127a: f2 0f 10 05 8e 0d 00 movsd 0xd8e(%rip),%xmm0 # 2010 <_IO_stdin_used+0x10> 1281: 00 1282: 66 0f 2f c2 comisd %xmm2,%xmm0 1286: 76 40 jbe 12c8 <_ZN4TEST4testEv+0xd8> 1288: 31 c0 xor %eax,%eax 128a: 66 0f 14 d2 unpcklpd %xmm2,%xmm2 128e: 66 90 xchg %ax,%ax c[i] = val*a[i]+b[i]; 1290: f3 0f 7e 44 05 00 movq 0x0(%rbp,%rax,1),%xmm0 1296: f3 41 0f 7e 0c 04 movq (%r12,%rax,1),%xmm1 129c: 66 0f 16 44 05 08 movhpd 0x8(%rbp,%rax,1),%xmm0 12a2: 66 0f 59 c2 mulpd %xmm2,%xmm0 12a6: 66 41 0f 16 4c 04 08 movhpd 0x8(%r12,%rax,1),%xmm1 12ad: 66 0f 58 c1 addpd %xmm1,%xmm0 12b1: 66 0f 13 04 07 movlpd %xmm0,(%rdi,%rax,1) 12b6: 66 0f 17 44 07 08 movhpd %xmm0,0x8(%rdi,%rax,1) for (unsigned int i=0; i<n; ++i) 12bc: 48 83 c0 10 add $0x10,%rax 12c0: 48 3d 40 1f 00 00 cmp $0x1f40,%rax 12c6: 75 c8 jne 1290 <_ZN4TEST4testEv+0xa0> This all looks pretty optimally vectorized to me. The code can be made even better if you ensure proper alignment of the std::vector arrays, which might not be aligned at the moment.