Bug 57952

Summary: AVX/AVX2 no ymm registers used in a trivial reduction
Product: gcc
Component: target
Status: RESOLVED FIXED
Severity: normal
Priority: P3
Version: 4.9.0
Target Milestone: ---
Reporter: vincenzo Innocente <vincenzo.innocente>
Assignee: Not yet assigned to anyone <unassigned>
CC: jakub, mmokrejs
Known to work: 10.1.0

Description vincenzo Innocente 2013-07-22 08:43:53 UTC
In this quite trivial benchmark, gcc does not generate AVX/AVX2 instructions using ymm registers:
c++ -Ofast -S polyAVX.cpp -march=core-avx2 ; grep -c "ymm" polyAVX.s
0
clang++ -Ofast -S polyAVX.cpp -march=core-avx2 ; grep -c "ymm" polyAVX.s
73

same for -march=corei7-avx
gcc version 4.9.0 20130718 (experimental) [trunk revision 201034] (GCC) 


with an obvious effect on speed:
 c++ -Ofast polyAVX.cpp -march=core-avx2 ; time ./a.out 
0.508u 0.000s 0:00.50 100.0%	0+0k 0+0io 1pf+0w
clang++ -Ofast polyAVX.cpp -march=core-avx2 ; time ./a.out
0.257u 0.000s 0:00.25 100.0%	0+0k 0+0io 1pf+0w


cat polyAVX.cpp
//template<typename T>
typedef float T;
inline T polyHorner(T y) {
  return  T(0x2.p0) + y * (T(0x2.p0) + y * (T(0x1.p0) + y * (T(0x5.55523p-4) + y * (T(0x1.5554dcp-4) + y * (T(0x4.48f41p-8) + y * T(0xb.6ad4p-12)))))) ;
}

int main() {

    bool ret=true;
    float s =0;
    for (int k=0; k!=100; ++k) {
      float c = 1.f/1000000.f;
      for (int i=1; i<10000001; ++i) s+= polyHorner((float(i)+1.f)*c);
    }
    ret &= s!=0;


  return ret ? 0 : -1;


}
Comment 1 vincenzo Innocente 2013-07-23 07:20:23 UTC
I modified the benchmark a bit to add timing,
and the new version now vectorizes with YMM under AVX2, though still not with the old AVX.
If I remove the call to rdtsc(), it no longer uses YMM.
-fno-tree-pre does not help.

cat polyAVX.cpp 
//template<typename T>
typedef float T;
inline T polyHorner(T y) {
  return  T(0x2.p0) + y * (T(0x2.p0) + y * (T(0x1.p0) + y * (T(0x5.55523p-4) + y * (T(0x1.5554dcp-4) + y * (T(0x4.48f41p-8) + y * T(0xb.6ad4p-12)))))) ;
}

#include <x86intrin.h>
#include<iostream>

volatile unsigned long long rdtsc() {
    unsigned int taux=0;
    return __rdtscp(&taux);
  }

int main() {


  long long t=0;

    bool ret=true;
    float s =0;
    for (int k=0; k!=100; ++k) {
      float c =   1.f/10000000.f;
      t -=rdtsc();
      for (int i=1; i<10000001; ++i) s+= polyHorner((float(i)+float(k))*c);
      t	+=rdtsc();
    }
    ret &= s!=0;

  std::cout << t <<std::endl;

  return ret ? 0 : -1;


}
[innocent@vinavx2 vectorize]$ c++ -Ofast -S polyAVX.cpp -march=core-avx2 ; grep -c "ymm" polyAVX.s
28
[innocent@vinavx2 vectorize]$ c++ -Ofast -S polyAVX.cpp -march=corei7-avx ; grep -c "ymm" polyAVX.s
0
Comment 2 mmokrejs 2017-05-11 16:21:23 UTC
I see a similar problem with gcc-4.9.6: it is unable to use the 256-bit YMM registers and only uses the 128-bit XMM registers with SSE instructions.

Try https://people.eecs.berkeley.edu/~samw/research/code/stream/stream.c

$ gcc -march=native -o stream stream.c
$ objdump -d stream | grep ymm
$


icc does the job properly:

$ icc -qopenmp -O3 -xhost stream.c
$ objdump -d a.out | grep ymm
  401323:       c5 fd 10 00             vmovupd (%rax),%ymm0
  40132d:       c5 fd 10 48 20          vmovupd 0x20(%rax),%ymm1
  401332:       c5 7d 10 40 40          vmovupd 0x40(%rax),%ymm8
  401337:       c5 7d 10 48 60          vmovupd 0x60(%rax),%ymm9
  40133c:       c5 7d 10 1d 3c 65 00    vmovupd 0x653c(%rip),%ymm11        # 407880 <_IO_stdin_used+0x340>
  401344:       c5 fd 5c 50 f8          vsubpd -0x8(%rax),%ymm0,%ymm2
  401349:       c5 f5 5c 58 18          vsubpd 0x18(%rax),%ymm1,%ymm3
  40134e:       c5 3d 5c 50 38          vsubpd 0x38(%rax),%ymm8,%ymm10
  401353:       c5 35 5c 60 58          vsubpd 0x58(%rax),%ymm9,%ymm12
  401358:       c5 a5 59 e2             vmulpd %ymm2,%ymm11,%ymm4
  401361:       c5 a5 59 f3             vmulpd %ymm3,%ymm11,%ymm6
  401365:       c4 41 25 59 ea          vmulpd %ymm10,%ymm11,%ymm13
  40136a:       c4 41 25 59 fc          vmulpd %ymm12,%ymm11,%ymm15
  40136f:       c5 fd e6 ec             vcvttpd2dq %ymm4,%xmm5
  401373:       c5 fd e6 fe             vcvttpd2dq %ymm6,%xmm7
  401377:       c4 41 7d e6 f5          vcvttpd2dq %ymm13,%xmm14
  40137c:       c5 f5 ef c9             vpxor  %ymm1,%ymm1,%ymm1
  401380:       c4 41 7d e6 ef          vcvttpd2dq %ymm15,%xmm13
  401385:       c4 e2 7d 58 e2          vpbroadcastd %xmm2,%ymm4
  40139c:       c4 e3 55 38 c7 01       vinserti128 $0x1,%xmm7,%ymm5,%ymm0
  4013a2:       c4 e2 7d 3d d9          vpmaxsd %ymm1,%ymm0,%ymm3
  4013a7:       c4 e2 65 39 f4          vpminsd %ymm4,%ymm3,%ymm6
  4013ac:       c4 43 0d 38 fd 01       vinserti128 $0x1,%xmm13,%ymm14,%ymm15
  4013b2:       c4 e2 05 3d e9          vpmaxsd %ymm1,%ymm15,%ymm5
  4013b7:       c4 e2 55 39 fe          vpminsd %ymm6,%ymm5,%ymm7
  4013bc:       c4 c3 7d 39 f8 01       vextracti128 $0x1,%ymm7,%xmm8
  401caa:       c5 fd 57 c0             vxorpd %ymm0,%ymm0,%ymm0
  401cc8:       c5 fd 10 16             vmovupd (%rsi),%ymm2
  401cd0:       c5 fd 10 5e 20          vmovupd 0x20(%rsi),%ymm3
  401cd5:       c5 fd 10 66 40          vmovupd 0x40(%rsi),%ymm4
  401cda:       c5 fd 10 76 60          vmovupd 0x60(%rsi),%ymm6
  401ce7:       c4 e2 ed b8 0f          vfmadd231pd (%rdi),%ymm2,%ymm1
  401cec:       c4 e2 e5 b8 47 20       vfmadd231pd 0x20(%rdi),%ymm3,%ymm0
  401cf2:       c5 dd 59 6f 40          vmulpd 0x40(%rdi),%ymm4,%ymm5
  401cf7:       c5 cd 59 7f 60          vmulpd 0x60(%rdi),%ymm6,%ymm7
  401cfc:       c5 d5 58 c9             vaddpd %ymm1,%ymm5,%ymm1
  401d00:       c5 c5 58 c0             vaddpd %ymm0,%ymm7,%ymm0
  401d0d:       c5 f5 58 c0             vaddpd %ymm0,%ymm1,%ymm0
  401d11:       c4 e3 7d 19 c1 01       vextractf128 $0x1,%ymm0,%xmm1
  401d6a:       c5 fd 10 8c c3 c0 be    vmovupd 0x60bec0(%rbx,%rax,8),%ymm1
  401d73:       c5 f5 59 94 c3 c0 60    vmulpd 0x268660c0(%rbx,%rax,8),%ymm1,%ymm2
  401d7c:       c5 ed 58 c0             vaddpd %ymm0,%ymm2,%ymm0
  401d85:       c4 e3 7d 19 c1 01       vextractf128 $0x1,%ymm0,%xmm1
  401f9b:       c4 e2 7d 19 44 24 10    vbroadcastsd 0x10(%rsp),%ymm0
  401fa2:       c5 fd 10 8c d1 c0 be    vmovupd 0x60bec0(%rcx,%rdx,8),%ymm1
  401fab:       c4 e2 fd a8 8c d1 c0    vfmadd213pd 0x268660c0(%rcx,%rdx,8),%ymm0,%ymm1
  401fb5:       c5 fd 2b 8c d1 c0 02    vmovntpd %ymm1,0x4cac02c0(%rcx,%rdx,8)
  40213b:       c5 fd 10 84 d1 c0 60    vmovupd 0x268660c0(%rcx,%rdx,8),%ymm0
  402144:       c5 fd 58 8c d1 c0 be    vaddpd 0x60bec0(%rcx,%rdx,8),%ymm0,%ymm1
  40214d:       c5 fd 2b 8c d1 c0 02    vmovntpd %ymm1,0x4cac02c0(%rcx,%rdx,8)
  4022dd:       c4 e2 7d 19 44 24 10    vbroadcastsd 0x10(%rsp),%ymm0
  4022e4:       c5 fd 2b 84 d1 c0 02    vmovntpd %ymm0,0x4cac02c0(%rcx,%rdx,8)
  4025c2:       c5 fd 10 05 76 52 00    vmovupd 0x5276(%rip),%ymm0        # 407840 <_IO_stdin_used+0x300>
  4025de:       c5 fd 59 0e             vmulpd (%rsi),%ymm0,%ymm1
  4025e2:       c5 fd 11 0e             vmovupd %ymm1,(%rsi)
  402759:       c5 fd 10 15 ff 50 00    vmovupd 0x50ff(%rip),%ymm2        # 407860 <_IO_stdin_used+0x320>
  402761:       c5 fd 10 0d d7 50 00    vmovupd 0x50d7(%rip),%ymm1        # 407840 <_IO_stdin_used+0x300>
  402769:       c5 fd 57 c0             vxorpd %ymm0,%ymm0,%ymm0
  40276d:       c5 fd 2b 94 f8 c0 60    vmovntpd %ymm2,0x268660c0(%rax,%rdi,8)
  402776:       c5 fd 2b 8c f8 c0 be    vmovntpd %ymm1,0x60bec0(%rax,%rdi,8)
  40277f:       c5 fd 2b 84 f8 c0 02    vmovntpd %ymm0,0x4cac02c0(%rax,%rdi,8)
  4030b0:       c5 fd 10 84 c8 c0 60    vmovupd 0x268660c0(%rax,%rcx,8),%ymm0
  4030b9:       c5 fd 2b 84 c8 c0 02    vmovntpd %ymm0,0x4cac02c0(%rax,%rcx,8)
  4032f0:       c5 fd 10 44 24 08       vmovupd 0x8(%rsp),%ymm0
  4032fb:       c5 fd 10 4c 24 28       vmovupd 0x28(%rsp),%ymm1
  403301:       c5 7d 10 44 24 48       vmovupd 0x48(%rsp),%ymm8
  403307:       c5 7d 10 4c 24 68       vmovupd 0x68(%rsp),%ymm9
  40330d:       c5 7d 10 1d 6b 45 00    vmovupd 0x456b(%rip),%ymm11        # 407880 <_IO_stdin_used+0x340>
  403315:       c5 fd 5c 14 24          vsubpd (%rsp),%ymm0,%ymm2
  40331a:       c5 f5 5c 5c 24 20       vsubpd 0x20(%rsp),%ymm1,%ymm3
  403320:       c5 3d 5c 54 24 40       vsubpd 0x40(%rsp),%ymm8,%ymm10
  403326:       c5 35 5c 64 24 60       vsubpd 0x60(%rsp),%ymm9,%ymm12
  40332c:       c5 a5 59 e2             vmulpd %ymm2,%ymm11,%ymm4
  403334:       c5 a5 59 f3             vmulpd %ymm3,%ymm11,%ymm6
  403338:       c4 41 25 59 ea          vmulpd %ymm10,%ymm11,%ymm13
  40333d:       c4 41 25 59 fc          vmulpd %ymm12,%ymm11,%ymm15
  403342:       c5 fd e6 ec             vcvttpd2dq %ymm4,%xmm5
  403346:       c5 fd e6 fe             vcvttpd2dq %ymm6,%xmm7
  40334a:       c4 41 7d e6 f5          vcvttpd2dq %ymm13,%xmm14
  40334f:       c5 f5 ef c9             vpxor  %ymm1,%ymm1,%ymm1
  403353:       c4 41 7d e6 ef          vcvttpd2dq %ymm15,%xmm13
  403358:       c4 e2 7d 58 e2          vpbroadcastd %xmm2,%ymm4
  403368:       c4 e3 55 38 c7 01       vinserti128 $0x1,%xmm7,%ymm5,%ymm0
  40336e:       c4 e2 7d 3d d9          vpmaxsd %ymm1,%ymm0,%ymm3
  403373:       c4 e2 65 39 f4          vpminsd %ymm4,%ymm3,%ymm6
  403378:       c4 43 0d 38 fd 01       vinserti128 $0x1,%xmm13,%ymm14,%ymm15
  40337e:       c4 e2 05 3d e9          vpmaxsd %ymm1,%ymm15,%ymm5
  403383:       c4 e2 55 39 fe          vpminsd %ymm6,%ymm5,%ymm7
  403388:       c4 c3 7d 39 f8 01       vextracti128 $0x1,%ymm7,%xmm8
  403694:       c4 c1 7d 10 84 c0 c0    vmovupd 0x268660c0(%r8,%rax,8),%ymm0
  40369e:       c4 c1 7d 2b 84 c0 c0    vmovntpd %ymm0,0x4cac02c0(%r8,%rax,8)
  4038ba:       c5 fd 10 84 d1 c0 60    vmovupd 0x268660c0(%rcx,%rdx,8),%ymm0
  4038c3:       c5 fd 58 8c d1 c0 be    vaddpd 0x60bec0(%rcx,%rdx,8),%ymm0,%ymm1
  4038cc:       c5 fd 2b 8c d1 c0 02    vmovntpd %ymm1,0x4cac02c0(%rcx,%rdx,8)
  403b4a:       c4 e2 7d 19 c1          vbroadcastsd %xmm1,%ymm0
  403b4f:       c5 fd 10 94 d1 c0 be    vmovupd 0x60bec0(%rcx,%rdx,8),%ymm2
  403b58:       c4 e2 fd a8 94 d1 c0    vfmadd213pd 0x268660c0(%rcx,%rdx,8),%ymm0,%ymm2
  403b62:       c5 fd 2b 94 d1 c0 02    vmovntpd %ymm2,0x4cac02c0(%rcx,%rdx,8)
  403e2e:       c5 fd 57 c0             vxorpd %ymm0,%ymm0,%ymm0
  403e40:       c5 fd 10 14 dd c0 be    vmovupd 0x60bec0(,%rbx,8),%ymm2
  403e4d:       c5 fd 10 1c dd e0 be    vmovupd 0x60bee0(,%rbx,8),%ymm3
  403e56:       c5 fd 10 24 dd 00 bf    vmovupd 0x60bf00(,%rbx,8),%ymm4
  403e5f:       c5 fd 10 34 dd 20 bf    vmovupd 0x60bf20(,%rbx,8),%ymm6
  403e68:       c4 e2 ed b8 0c dd c0    vfmadd231pd 0x268660c0(,%rbx,8),%ymm2,%ymm1
  403e72:       c4 e2 e5 b8 04 dd e0    vfmadd231pd 0x268660e0(,%rbx,8),%ymm3,%ymm0
  403e7c:       c5 dd 59 2c dd 00 61    vmulpd 0x26866100(,%rbx,8),%ymm4,%ymm5
  403e85:       c5 cd 59 3c dd 20 61    vmulpd 0x26866120(,%rbx,8),%ymm6,%ymm7
  403e8e:       c5 d5 58 c9             vaddpd %ymm1,%ymm5,%ymm1
  403e92:       c5 c5 58 c0             vaddpd %ymm0,%ymm7,%ymm0
  403e9f:       c5 f5 58 c0             vaddpd %ymm0,%ymm1,%ymm0
  403ea3:       c4 e3 7d 19 c1 01       vextractf128 $0x1,%ymm0,%xmm1
  403efb:       c4 c1 7d 10 8c c1 c0    vmovupd 0x60bec0(%r9,%rax,8),%ymm1
  403f05:       c4 c1 75 59 94 c1 c0    vmulpd 0x268660c0(%r9,%rax,8),%ymm1,%ymm2
  403f0f:       c5 ed 58 c0             vaddpd %ymm0,%ymm2,%ymm0
  403f18:       c4 e3 7d 19 c1 01       vextractf128 $0x1,%ymm0,%xmm1
  404216:       c4 e2 7d 19 c1          vbroadcastsd %xmm1,%ymm0
  40421b:       c5 fd 2b 84 d1 c0 02    vmovntpd %ymm0,0x4cac02c0(%rcx,%rdx,8)
  404690:       c5 fe 6f 0e             vmovdqu (%rsi),%ymm1
  404694:       c5 fe 6f 56 20          vmovdqu 0x20(%rsi),%ymm2
  404699:       c5 fe 6f 5e 40          vmovdqu 0x40(%rsi),%ymm3
  40469e:       c5 fe 6f 66 60          vmovdqu 0x60(%rsi),%ymm4
  4046a3:       c5 fe 6f ae 80 00 00    vmovdqu 0x80(%rsi),%ymm5
  4046ab:       c5 fe 6f b6 a0 00 00    vmovdqu 0xa0(%rsi),%ymm6
  4046b3:       c5 fe 6f be c0 00 00    vmovdqu 0xc0(%rsi),%ymm7
  4046bb:       c5 7e 6f 86 e0 00 00    vmovdqu 0xe0(%rsi),%ymm8
  4046c3:       c5 fd 7f 0f             vmovdqa %ymm1,(%rdi)
  4046c7:       c5 fd 7f 57 20          vmovdqa %ymm2,0x20(%rdi)
  4046cc:       c5 fd 7f 5f 40          vmovdqa %ymm3,0x40(%rdi)
  4046d1:       c5 fd 7f 67 60          vmovdqa %ymm4,0x60(%rdi)
  4046d6:       c5 fd 7f af 80 00 00    vmovdqa %ymm5,0x80(%rdi)
  4046de:       c5 fd 7f b7 a0 00 00    vmovdqa %ymm6,0xa0(%rdi)
  4046e6:       c5 fd 7f bf c0 00 00    vmovdqa %ymm7,0xc0(%rdi)
  4046ee:       c5 7d 7f 87 e0 00 00    vmovdqa %ymm8,0xe0(%rdi)
  40475c:       c5 fe 6f 0e             vmovdqu (%rsi),%ymm1
  404760:       c5 fe 6f 56 20          vmovdqu 0x20(%rsi),%ymm2
  404765:       c5 fe 6f 5e 40          vmovdqu 0x40(%rsi),%ymm3
  40476a:       c5 fe 6f 66 60          vmovdqu 0x60(%rsi),%ymm4
  40476f:       c5 fe 6f ae 80 00 00    vmovdqu 0x80(%rsi),%ymm5
  404777:       c5 fe 6f b6 a0 00 00    vmovdqu 0xa0(%rsi),%ymm6
  40477f:       c5 fe 6f be c0 00 00    vmovdqu 0xc0(%rsi),%ymm7
  404787:       c5 7e 6f 86 e0 00 00    vmovdqu 0xe0(%rsi),%ymm8
  404796:       c5 fd e7 0f             vmovntdq %ymm1,(%rdi)
  40479a:       c5 fd e7 57 20          vmovntdq %ymm2,0x20(%rdi)
  40479f:       c5 fd e7 5f 40          vmovntdq %ymm3,0x40(%rdi)
  4047a4:       c5 fd e7 67 60          vmovntdq %ymm4,0x60(%rdi)
  4047a9:       c5 fd e7 af 80 00 00    vmovntdq %ymm5,0x80(%rdi)
  4047b1:       c5 fd e7 b7 a0 00 00    vmovntdq %ymm6,0xa0(%rdi)
  4047b9:       c5 fd e7 bf c0 00 00    vmovntdq %ymm7,0xc0(%rdi)
  4047c1:       c5 7d e7 87 e0 00 00    vmovntdq %ymm8,0xe0(%rdi)
  4048f0:       c5 fc 10 86 20 ff ff    vmovups -0xe0(%rsi),%ymm0
  4048f8:       c5 fc 29 87 20 ff ff    vmovaps %ymm0,-0xe0(%rdi)
  404900:       c5 fc 10 86 40 ff ff    vmovups -0xc0(%rsi),%ymm0
  404908:       c5 fc 29 87 40 ff ff    vmovaps %ymm0,-0xc0(%rdi)
  404910:       c5 fc 10 86 60 ff ff    vmovups -0xa0(%rsi),%ymm0
  404918:       c5 fc 29 87 60 ff ff    vmovaps %ymm0,-0xa0(%rdi)
  404920:       c5 fc 10 46 80          vmovups -0x80(%rsi),%ymm0
  404925:       c5 fc 29 47 80          vmovaps %ymm0,-0x80(%rdi)
  40492a:       c5 fc 10 46 a0          vmovups -0x60(%rsi),%ymm0
  40492f:       c5 fc 29 47 a0          vmovaps %ymm0,-0x60(%rdi)
  404934:       c5 fc 10 46 c0          vmovups -0x40(%rsi),%ymm0
  404939:       c5 fc 29 47 c0          vmovaps %ymm0,-0x40(%rdi)
  40493e:       c5 fc 10 46 e0          vmovups -0x20(%rsi),%ymm0
  404943:       c5 fc 29 47 e0          vmovaps %ymm0,-0x20(%rdi)
  404a40:       c5 fc 10 06             vmovups (%rsi),%ymm0
  404a44:       c5 fc 11 07             vmovups %ymm0,(%rdi)
  404a48:       c5 fc 10 44 0e e0       vmovups -0x20(%rsi,%rcx,1),%ymm0
  404a4e:       c5 fc 11 44 0f e0       vmovups %ymm0,-0x20(%rdi,%rcx,1)
  404a60:       c5 fc 10 06             vmovups (%rsi),%ymm0
  404a64:       c5 fc 11 07             vmovups %ymm0,(%rdi)
  404a68:       c5 fc 10 46 20          vmovups 0x20(%rsi),%ymm0
  404a6d:       c5 fc 11 47 20          vmovups %ymm0,0x20(%rdi)
  404a72:       c5 fc 10 44 0e e0       vmovups -0x20(%rsi,%rcx,1),%ymm0
  404a78:       c5 fc 11 44 0f e0       vmovups %ymm0,-0x20(%rdi,%rcx,1)
  404a90:       c5 fc 10 06             vmovups (%rsi),%ymm0
  404a94:       c5 fc 11 07             vmovups %ymm0,(%rdi)
  404a98:       c5 fc 10 46 20          vmovups 0x20(%rsi),%ymm0
  404a9d:       c5 fc 11 47 20          vmovups %ymm0,0x20(%rdi)
  404aa2:       c5 fc 10 46 40          vmovups 0x40(%rsi),%ymm0
  404aa7:       c5 fc 11 47 40          vmovups %ymm0,0x40(%rdi)
  404aac:       c5 fc 10 44 0e e0       vmovups -0x20(%rsi,%rcx,1),%ymm0
  404ab2:       c5 fc 11 44 0f e0       vmovups %ymm0,-0x20(%rdi,%rcx,1)
  404ac0:       c5 fc 10 06             vmovups (%rsi),%ymm0
  404ac4:       c5 fc 11 07             vmovups %ymm0,(%rdi)
  404ac8:       c5 fc 10 46 20          vmovups 0x20(%rsi),%ymm0
  404acd:       c5 fc 11 47 20          vmovups %ymm0,0x20(%rdi)
  404ad2:       c5 fc 10 46 40          vmovups 0x40(%rsi),%ymm0
  404ad7:       c5 fc 11 47 40          vmovups %ymm0,0x40(%rdi)
  404adc:       c5 fc 10 46 60          vmovups 0x60(%rsi),%ymm0
  404ae1:       c5 fc 11 47 60          vmovups %ymm0,0x60(%rdi)
  404ae6:       c5 fc 10 44 0e e0       vmovups -0x20(%rsi,%rcx,1),%ymm0
  404aec:       c5 fc 11 44 0f e0       vmovups %ymm0,-0x20(%rdi,%rcx,1)
  404b00:       c5 fc 10 06             vmovups (%rsi),%ymm0
  404b04:       c5 fc 11 07             vmovups %ymm0,(%rdi)
  404b08:       c5 fc 10 46 20          vmovups 0x20(%rsi),%ymm0
  404b0d:       c5 fc 11 47 20          vmovups %ymm0,0x20(%rdi)
  404b12:       c5 fc 10 46 40          vmovups 0x40(%rsi),%ymm0
  404b17:       c5 fc 11 47 40          vmovups %ymm0,0x40(%rdi)
  404b1c:       c5 fc 10 46 60          vmovups 0x60(%rsi),%ymm0
  404b21:       c5 fc 11 47 60          vmovups %ymm0,0x60(%rdi)
  404b26:       c5 fc 10 86 80 00 00    vmovups 0x80(%rsi),%ymm0
  404b2e:       c5 fc 11 87 80 00 00    vmovups %ymm0,0x80(%rdi)
  404b36:       c5 fc 10 44 0e e0       vmovups -0x20(%rsi,%rcx,1),%ymm0
  404b3c:       c5 fc 11 44 0f e0       vmovups %ymm0,-0x20(%rdi,%rcx,1)
  404b50:       c5 fc 10 06             vmovups (%rsi),%ymm0
  404b54:       c5 fc 11 07             vmovups %ymm0,(%rdi)
  404b58:       c5 fc 10 46 20          vmovups 0x20(%rsi),%ymm0
  404b5d:       c5 fc 11 47 20          vmovups %ymm0,0x20(%rdi)
  404b62:       c5 fc 10 46 40          vmovups 0x40(%rsi),%ymm0
  404b67:       c5 fc 11 47 40          vmovups %ymm0,0x40(%rdi)
  404b6c:       c5 fc 10 46 60          vmovups 0x60(%rsi),%ymm0
  404b71:       c5 fc 11 47 60          vmovups %ymm0,0x60(%rdi)
  404b76:       c5 fc 10 86 80 00 00    vmovups 0x80(%rsi),%ymm0
  404b7e:       c5 fc 11 87 80 00 00    vmovups %ymm0,0x80(%rdi)
  404b86:       c5 fc 10 86 a0 00 00    vmovups 0xa0(%rsi),%ymm0
  404b8e:       c5 fc 11 87 a0 00 00    vmovups %ymm0,0xa0(%rdi)
  404b96:       c5 fc 10 44 0e e0       vmovups -0x20(%rsi,%rcx,1),%ymm0
  404b9c:       c5 fc 11 44 0f e0       vmovups %ymm0,-0x20(%rdi,%rcx,1)
  404bb0:       c5 fc 10 06             vmovups (%rsi),%ymm0
  404bb4:       c5 fc 11 07             vmovups %ymm0,(%rdi)
  404bb8:       c5 fc 10 46 20          vmovups 0x20(%rsi),%ymm0
  404bbd:       c5 fc 11 47 20          vmovups %ymm0,0x20(%rdi)
  404bc2:       c5 fc 10 46 40          vmovups 0x40(%rsi),%ymm0
  404bc7:       c5 fc 11 47 40          vmovups %ymm0,0x40(%rdi)
  404bcc:       c5 fc 10 46 60          vmovups 0x60(%rsi),%ymm0
  404bd1:       c5 fc 11 47 60          vmovups %ymm0,0x60(%rdi)
  404bd6:       c5 fc 10 86 80 00 00    vmovups 0x80(%rsi),%ymm0
  404bde:       c5 fc 11 87 80 00 00    vmovups %ymm0,0x80(%rdi)
  404be6:       c5 fc 10 86 a0 00 00    vmovups 0xa0(%rsi),%ymm0
  404bee:       c5 fc 11 87 a0 00 00    vmovups %ymm0,0xa0(%rdi)
  404bf6:       c5 fc 10 86 c0 00 00    vmovups 0xc0(%rsi),%ymm0
  404bfe:       c5 fc 11 87 c0 00 00    vmovups %ymm0,0xc0(%rdi)
  404c06:       c5 fc 10 44 0e e0       vmovups -0x20(%rsi,%rcx,1),%ymm0
  404c0c:       c5 fc 11 44 0f e0       vmovups %ymm0,-0x20(%rdi,%rcx,1)
  404c20:       c5 fc 10 84 0e 00 ff    vmovups -0x100(%rsi,%rcx,1),%ymm0
  404c29:       c5 fc 11 84 0f 00 ff    vmovups %ymm0,-0x100(%rdi,%rcx,1)
  404c32:       c5 fc 10 84 0e 20 ff    vmovups -0xe0(%rsi,%rcx,1),%ymm0
  404c3b:       c5 fc 11 84 0f 20 ff    vmovups %ymm0,-0xe0(%rdi,%rcx,1)
  404c44:       c5 fc 10 84 0e 40 ff    vmovups -0xc0(%rsi,%rcx,1),%ymm0
  404c4d:       c5 fc 11 84 0f 40 ff    vmovups %ymm0,-0xc0(%rdi,%rcx,1)
  404c56:       c5 fc 10 84 0e 60 ff    vmovups -0xa0(%rsi,%rcx,1),%ymm0
  404c5f:       c5 fc 11 84 0f 60 ff    vmovups %ymm0,-0xa0(%rdi,%rcx,1)
  404c68:       c5 fc 10 44 0e 80       vmovups -0x80(%rsi,%rcx,1),%ymm0
  404c6e:       c5 fc 11 44 0f 80       vmovups %ymm0,-0x80(%rdi,%rcx,1)
  404c74:       c5 fc 10 44 0e a0       vmovups -0x60(%rsi,%rcx,1),%ymm0
  404c7a:       c5 fc 11 44 0f a0       vmovups %ymm0,-0x60(%rdi,%rcx,1)
  404c80:       c5 fc 10 44 0e c0       vmovups -0x40(%rsi,%rcx,1),%ymm0
  404c86:       c5 fc 11 44 0f c0       vmovups %ymm0,-0x40(%rdi,%rcx,1)
  404c8c:       c5 fc 10 44 0e e0       vmovups -0x20(%rsi,%rcx,1),%ymm0
  404c92:       c5 fc 11 44 0f e0       vmovups %ymm0,-0x20(%rdi,%rcx,1)
$
Comment 3 mmokrejs 2017-05-11 16:43:08 UTC
Weird: why can g++ do the task here, although probably less efficiently than icc?

$ g++ -O3 -march=core-avx-i -mtune=core-avx-i -mavx2 stream.c  ; objdump -d a.out | grep ymm
stream.c:106:48: warning: deprecated conversion from string constant to 'char*' [-Wwrite-strings]
     "Add:       ", "Triad:     ", "Dot:       "};
                                                ^
stream.c:106:48: warning: deprecated conversion from string constant to 'char*' [-Wwrite-strings]
stream.c:106:48: warning: deprecated conversion from string constant to 'char*' [-Wwrite-strings]
stream.c:106:48: warning: deprecated conversion from string constant to 'char*' [-Wwrite-strings]
stream.c:106:48: warning: deprecated conversion from string constant to 'char*' [-Wwrite-strings]
  4006ac:       c5 fd 28 0d 2c 11 00    vmovapd 0x112c(%rip),%ymm1        # 4017e0 <_ZL5label+0xa0>
  4006b4:       c5 fd 28 05 44 11 00    vmovapd 0x1144(%rip),%ymm0        # 401800 <_ZL5label+0xc0>
  4006c0:       c5 fd 29 88 80 61 ab    vmovapd %ymm1,0x4cab6180(%rax)
  4006cc:       c5 fd 29 80 60 c1 85    vmovapd %ymm0,0x2685c160(%rax)
  400738:       c5 fd 28 01             vmovapd (%rcx),%ymm0
  400740:       c5 fd 58 c0             vaddpd %ymm0,%ymm0,%ymm0
  400744:       c5 fd 29 41 e0          vmovapd %ymm0,-0x20(%rcx)
  400803:       c5 fd 28 15 15 10 00    vmovapd 0x1015(%rip),%ymm2        # 401820 <_ZL5label+0xe0>
  400817:       c5 fd 29 95 d0 fd ff    vmovapd %ymm2,-0x230(%rbp)
  4008d0:       c5 fd 28 95 d0 fd ff    vmovapd -0x230(%rbp),%ymm2
  400908:       c5 fd 29 10             vmovapd %ymm2,(%rax)
  40091c:       c5 fd 29 95 d0 fd ff    vmovapd %ymm2,-0x230(%rbp)
  40098c:       c5 fd 28 95 d0 fd ff    vmovapd -0x230(%rbp),%ymm2
  4009b8:       c5 fd 28 80 80 61 ab    vmovapd 0x4cab6180(%rax),%ymm0
  4009c4:       c5 fd 58 80 60 c1 85    vaddpd 0x2685c160(%rax),%ymm0,%ymm0
  4009cc:       c5 fd 29 80 60 21 60    vmovapd %ymm0,0x602160(%rax)
  4009e3:       c5 fd 29 95 d0 fd ff    vmovapd %ymm2,-0x230(%rbp)
  400a56:       c5 fd 28 95 d0 fd ff    vmovapd -0x230(%rbp),%ymm2
  400a80:       c5 ed 59 80 80 c1 85    vmulpd 0x2685c180(%rax),%ymm2,%ymm0
  400a88:       c5 fd 58 80 80 61 ab    vaddpd 0x4cab6180(%rax),%ymm0,%ymm0
  400a94:       c5 fd 29 80 60 21 60    vmovapd %ymm0,0x602160(%rax)
  400aab:       c5 fd 29 95 d0 fd ff    vmovapd %ymm2,-0x230(%rbp)
  400b1e:       c5 fd 28 95 d0 fd ff    vmovapd -0x230(%rbp),%ymm2
  400b77:       c5 fd 29 95 d0 fd ff    vmovapd %ymm2,-0x230(%rbp)
  400bab:       c5 fd 28 95 d0 fd ff    vmovapd -0x230(%rbp),%ymm2
  401006:       c4 e3 6d 18 95 28 ff    vinsertf128 $0x1,-0xd8(%rbp),%ymm2,%ymm2
  401018:       c4 e3 7d 18 85 48 ff    vinsertf128 $0x1,-0xb8(%rbp),%ymm0,%ymm0
  401022:       c5 ed 5c 95 10 ff ff    vsubpd -0xf0(%rbp),%ymm2,%ymm2
  40102a:       c5 fd 5c 85 30 ff ff    vsubpd -0xd0(%rbp),%ymm0,%ymm0
  401032:       c5 fd 28 25 66 07 00    vmovapd 0x766(%rip),%ymm4        # 4017a0 <_ZL5label+0x60>
  401044:       c5 ed 59 d4             vmulpd %ymm4,%ymm2,%ymm2
  401048:       c5 fd 59 c4             vmulpd %ymm4,%ymm0,%ymm0
  40104c:       c5 fd e6 d2             vcvttpd2dq %ymm2,%xmm2
  401058:       c5 fd e6 c0             vcvttpd2dq %ymm0,%xmm0
  40105c:       c4 e3 6d 38 d0 01       vinserti128 $0x1,%xmm0,%ymm2,%ymm2
  401062:       c4 e2 6d 3d d3          vpmaxsd %ymm3,%ymm2,%ymm2
  401067:       c4 e2 6d 39 15 50 07    vpminsd 0x750(%rip),%ymm2,%ymm2        # 4017c0 <_ZL5label+0x80>
  401096:       c4 e3 75 18 8d 68 ff    vinsertf128 $0x1,-0x98(%rbp),%ymm1,%ymm1
  4010a4:       c5 f5 5c 8d 50 ff ff    vsubpd -0xb0(%rbp),%ymm1,%ymm1
  4010b7:       c5 f5 59 cc             vmulpd %ymm4,%ymm1,%ymm1
  4010bb:       c5 fd e6 c9             vcvttpd2dq %ymm1,%xmm1
  4010d3:       c4 e3 7d 18 45 88 01    vinsertf128 $0x1,-0x78(%rbp),%ymm0,%ymm0
  4010da:       c5 fd 5c 85 70 ff ff    vsubpd -0x90(%rbp),%ymm0,%ymm0
  4010e2:       c5 fd 59 c4             vmulpd %ymm4,%ymm0,%ymm0
  4010e6:       c5 fd e6 c0             vcvttpd2dq %ymm0,%xmm0
  4010ea:       c4 e3 75 38 c0 01       vinserti128 $0x1,%xmm0,%ymm1,%ymm0
  4010f5:       c4 e2 7d 3d c3          vpmaxsd %ymm3,%ymm0,%ymm0
  4010fa:       c4 e2 6d 39 c0          vpminsd %ymm0,%ymm2,%ymm0
  4010ff:       c4 e3 7d 46 c8 01       vperm2i128 $0x1,%ymm0,%ymm0,%ymm1
  401105:       c4 e2 7d 39 c1          vpminsd %ymm1,%ymm0,%ymm0
  40110f:       c5 f5 73 d8 08          vpsrldq $0x8,%ymm0,%ymm1
  401114:       c4 e2 7d 39 c9          vpminsd %ymm1,%ymm0,%ymm1
  401119:       c5 fd 73 d9 04          vpsrldq $0x4,%ymm1,%ymm0
  40111e:       c4 e2 75 39 c0          vpminsd %ymm0,%ymm1,%ymm0
  4011e0:       c4 e2 7d 19 c0          vbroadcastsd %xmm0,%ymm0
  4011f0:       c5 fd 29 00             vmovapd %ymm0,(%rax)
  401218:       c5 fd 28 80 80 61 ab    vmovapd 0x4cab6180(%rax),%ymm0
  401224:       c5 fd 58 80 60 c1 85    vaddpd 0x2685c160(%rax),%ymm0,%ymm0
  40122c:       c5 fd 29 80 60 21 60    vmovapd %ymm0,0x602160(%rax)
  401240:       c4 e2 7d 19 c0          vbroadcastsd %xmm0,%ymm0
  401250:       c5 fd 59 88 80 c1 85    vmulpd 0x2685c180(%rax),%ymm0,%ymm1
  401258:       c5 f5 58 88 80 61 ab    vaddpd 0x4cab6180(%rax),%ymm1,%ymm1
  401264:       c5 fd 29 88 60 21 60    vmovapd %ymm1,0x602160(%rax)
$
Comment 4 Jakub Jelinek 2017-05-12 06:01:28 UTC
The reason why #c1 (as well as #c0) can only be vectorized using a vector length of 8 rather than 4 is that the loop iterator is cast to float and is therefore needed inside the loop in vector registers:

pr57952.C:21:20: note: op not supported by target.
pr57952.C:21:20: note: not vectorized: relevant stmt not supported: i_16 = i_41 + 1;
pr57952.C:21:20: note: bad operation or unsupported loop bound.

and AVX doesn't support V8SImode addition.

Now, perhaps we could have an optimization for this case: if all the iterator values can provably be represented exactly in the floating-point type, we could try to do what the programmer should have done, i.e. add a float iterator that is set to 1.0f, incremented in each iteration, and used instead of float(i).  But it won't work in this case, because you need 24 bits for the iterator and float only has a 23-bit mantissa.

  for (int k=0; k!=100; ++k) {
    float c = 1.f/10000000.f;
    float fi = 1.f;
    for (int i=1; i<10000001; ++i) { s+= polyHorner((fi+float(k))*c); fi += 1.f; }
  }

is vectorized just fine with -Ofast -mavx, with a vectorization factor of 8.
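
(An illustrative aside, not part of the original comment: the representability limit such an optimization would have to prove is the point at which a float counter can no longer count by ones, i.e. 2^24. A minimal check:)

  #include <iostream>

  int main() {
    float below = 16777215.f;   // 2^24 - 1, exactly representable
    float limit = 16777216.f;   // 2^24, still exact
    // 2^24 + 1 has no float representation and rounds back to 2^24,
    // so a float counter can no longer distinguish successive values.
    std::cout << (limit + 1.f == limit) << '\n';   // prints 1
    std::cout << (below + 1.f == limit) << '\n';   // prints 1 (still exact here)
  }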

As for #c2/#c3: GCC 4.9 is not supported anymore, and the dumps are too large to work out what exactly you mean by efficient and not efficient; both the ICC- and GCC-generated assemblies use %ymm as well as %xmm registers, depending on what exactly they need.
Comment 5 mmokrejs 2017-05-17 08:38:59 UTC
(In reply to Jakub Jelinek from comment #4)

> As for #c2/#c3, GCC 4.9 is not supported anymore and the dumps are too large
> to find out what exactly you mean by efficient and not efficient, both the
> ICC and GCC generated assemblies use both %ymm and %xmm registers depending
> on what exactly the need.

"gcc -march=native" or "g++ -march=native" does not emit a single instruction using ymm registers unless -O3 is used.

$ gcc -O3 -march=native -mavx2 stream.c  ; objdump -d a.out | grep ymm | wc -l
63
$ gcc -O2 -march=native -mavx2 stream.c  ; objdump -d a.out | grep ymm | wc -l
0
$ gcc -O2 -march=native stream.c  ; objdump -d a.out | grep ymm | wc -l
0
$ gcc -O3 -march=native stream.c  ; objdump -d a.out | grep ymm | wc -l
63
$

I am on Gentoo Linux, where 5.4.0 is still in testing only, and the same holds for 6.3 and 7.1. The 4.9 series is the last that is generally usable.
Comment 6 Jakub Jelinek 2017-05-17 08:45:46 UTC
(In reply to mmokrejs from comment #5)
> (In reply to Jakub Jelinek from comment #4)
> 
> > As for #c2/#c3, GCC 4.9 is not supported anymore and the dumps are too large
> > to find out what exactly you mean by efficient and not efficient, both the
> > ICC and GCC generated assemblies use both %ymm and %xmm registers depending
> > on what exactly the need.
> 
> "gcc -march=native" or "g++ march=native" do not insert a single instructing
> using ymm registers unless -O3 is used.
> 
> $ gcc -O3 -march=native -mavx2 stream.c  ; objdump -d a.out | grep ymm | wc
> -l
> 63
> $ gcc -O2 -march=native -mavx2 stream.c  ; objdump -d a.out | grep ymm | wc
> -l
> 0
> $ gcc -O2 -march=native stream.c  ; objdump -d a.out | grep ymm | wc -l
> 0
> $ gcc -O3 -march=native stream.c  ; objdump -d a.out | grep ymm | wc -l
> 63
> $

Of course, vectorization is only enabled by default for -O3/-Ofast, not at -O2; for vectorization at -O2 you need to use -O2 -ftree-vectorize.
Comment 7 mmokrejs 2017-05-17 09:03:01 UTC
(In reply to Jakub Jelinek from comment #6)
> > $ gcc -O3 -march=native stream.c  ; objdump -d a.out | grep ymm | wc -l
> > 63
> > $
> 
> Of course, vectorization is only enabled by default for -O3/-Ofast, not at
> -O2, for vectorization at -O2 you need to use -O2 -ftree-vectorize.

$ gcc -O2 -march=native -ftree-vectorize stream.c  ; objdump -d a.out | grep ymm | wc -l
60
$


Ah, thanks. Please update the manpage. It says nothing about the need for -O3 or -Ofast when using -march=native, -mavx or -mavx2.

<quote>
       -march=cpu-type
           Generate instructions for the machine type cpu-type.  In contrast to -mtune=cpu-type, which merely tunes the generated code for the specified cpu-type, -march=cpu-type allows
           GCC to generate code that may not run at all on processors other than the one indicated.  Specifying -march=cpu-type implies -mtune=cpu-type.

           The choices for cpu-type are:

           native
               This selects the CPU to generate code for at compilation time by determining the processor type of the compiling machine.  Using -march=native enables all instruction
               subsets supported by the local machine (hence the result might not run on different machines).  Using -mtune=native produces code optimized for the local machine under
               the constraints of the selected instruction set.
</quote>

<quote>
           sandybridge
               Intel Sandy Bridge CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, AVX, AES and PCLMUL instruction set support.

           ivybridge
               Intel Ivy Bridge CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, AVX, AES, PCLMUL, FSGSBASE, RDRND and F16C instruction set support.

           haswell
               Intel Haswell CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, AVX, AVX2, AES, PCLMUL, FSGSBASE, RDRND, FMA, BMI, BMI2 and F16C
               instruction set support.

           broadwell
               Intel Broadwell CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, AVX, AVX2, AES, PCLMUL, FSGSBASE, RDRND, FMA, BMI, BMI2, F16C,
               RDSEED, ADCX and PREFETCHW instruction set support.
</quote>


These entries have no description text at all:
<quote>
       -mavx
       -mno-avx
       -mavx2
       -mno-avx2
       -mavx512f
       -mno-avx512f
       -mavx512pf
       -mno-avx512pf
       -mavx512er
       -mno-avx512er
       -mavx512cd
       -mno-avx512cd
</quote>

There are hardly any links from within the manpage (notably the Intel x86/amd64 section) to -ftree-vectorize.
<quote>
       -ftree-vectorize
           Perform vectorization on trees. This flag enables -ftree-loop-vectorize and -ftree-slp-vectorize if not explicitly specified.

       -ftree-loop-vectorize
           Perform loop vectorization on trees. This flag is enabled by default at -O3 and when -ftree-vectorize is enabled.

       -ftree-slp-vectorize
           Perform basic block vectorization on trees. This flag is enabled by default at -O3 and when -ftree-vectorize is enabled.
</quote>
Comment 8 Jakub Jelinek 2017-05-17 13:25:45 UTC
Why should there be any references to that?  -march= is an ISA selection option: it says the compiler may use instructions from that ISA.
-ftree-vectorize is an optimization option (on by default at -O3 and -Ofast) which attempts to vectorize loops using the selected ISA where possible.  The reason it isn't on at -O2 is that it doesn't always improve code; it can make code slower as well, and it usually makes the code larger; it really depends on the loop etc.
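
(A minimal sketch for readers following along, not from the original thread: with GCC of that era, a simple loop like the one below is vectorized at -O3/-Ofast or with -O2 -ftree-vectorize, but stays scalar at plain -O2 even when -mavx2 is given, which is exactly the -march= vs. -ftree-vectorize split described above. The file name axpy.cpp and the function are made up for illustration; build e.g. with "g++ -O2 -ftree-vectorize -mavx2 -S axpy.cpp" and inspect the generated axpy.s.)

  // axpy.cpp: hypothetical example, not taken from stream.c or the reports above.
  // __restrict tells the compiler x and y do not alias, so the loop is a
  // straightforward SIMD candidate for the vectorizer.
  void axpy(float* __restrict y, const float* __restrict x, float a, int n) {
    for (int i = 0; i < n; ++i)
      y[i] += a * x[i];
  }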
Comment 9 mmokrejs 2017-05-17 14:23:35 UTC
Thank you, I updated https://wiki.gentoo.org/wiki/GCC_optimization and added the notes on -ftree-vectorize option.
Comment 10 Richard Biener 2021-06-08 14:13:16 UTC
I can now see this vectorized with GCC 10 and later.  Note that we are vectorizing the _outer_ loop here; we also manage to vectorize the inner loop on its own if I comment out the outer one, it just looks less efficient.

.L2:
        vmovdqa %ymm6, %ymm2
        movl    $10000000, %eax
        .p2align 4,,10
        .p2align 3
.L3:
        vmovdqa %ymm2, %ymm0
        vpaddd  %ymm6, %ymm2, %ymm2
        vcvtdq2ps       %ymm0, %ymm0
        vaddps  %ymm5, %ymm0, %ymm0
        vmulps  %ymm11, %ymm0, %ymm0
        vmovaps %ymm0, %ymm1
        vfmadd132ps     %ymm10, %ymm9, %ymm1
        vfmadd132ps     %ymm0, %ymm8, %ymm1
        vfmadd132ps     %ymm0, %ymm7, %ymm1
        vfmadd132ps     %ymm0, %ymm5, %ymm1
        vfmadd132ps     %ymm0, %ymm4, %ymm1
        vfmadd132ps     %ymm1, %ymm4, %ymm0
        vaddps  %ymm0, %ymm3, %ymm3
        decl    %eax
        jne     .L3
        incl    %edx
        cmpl    $12, %edx
        jne     .L2
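
(For reference, a sketch of the inner-loop-only variant referred to above; this is an assumption about what "comment out the outer one" means, i.e. the #c0 reproducer reduced to its inner reduction.)

  typedef float T;
  inline T polyHorner(T y) {
    return  T(0x2.p0) + y * (T(0x2.p0) + y * (T(0x1.p0) + y * (T(0x5.55523p-4) + y * (T(0x1.5554dcp-4) + y * (T(0x4.48f41p-8) + y * T(0xb.6ad4p-12)))))) ;
  }

  float sum_inner() {
    float s = 0.f;
    float c = 1.f/1000000.f;                 // same constant as in comment #0
    for (int i = 1; i < 10000001; ++i)       // inner reduction only, no k loop
      s += polyHorner((float(i) + 1.f) * c);
    return s;
  }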