Summary: | AVX/AVX2 no ymm registers used in a trivial reduction | ||
---|---|---|---|
Product: | gcc | Reporter: | vincenzo Innocente <vincenzo.innocente> |
Component: | target | Assignee: | Not yet assigned to anyone <unassigned> |
Status: | RESOLVED FIXED | ||
Severity: | normal | CC: | jakub, mmokrejs |
Priority: | P3 | ||
Version: | 4.9.0 | ||
Target Milestone: | --- | ||
Host: | Target: | ||
Build: | Known to work: | 10.1.0 | |
Known to fail: | Last reconfirmed: |
Description
vincenzo Innocente
2013-07-22 08:43:53 UTC
I modified the benchmark a bit, adding timing, and the new version now vectorizes with YMM on avx2, but still not with old avx. If I remove the call to rdtsc(), it does not use YMM anymore; -fno-tree-pre does not help. cat polyAVX.cpp //template<typename T> typedef float T; inline T polyHorner(T y) { return T(0x2.p0) + y * (T(0x2.p0) + y * (T(0x1.p0) + y * (T(0x5.55523p-4) + y * (T(0x1.5554dcp-4) + y * (T(0x4.48f41p-8) + y * T(0xb.6ad4p-12)))))) ; } #include <x86intrin.h> #include<iostream> volatile unsigned long long rdtsc() { unsigned int taux=0; return __rdtscp(&taux); } int main() { long long t=0; bool ret=true; float s =0; for (int k=0; k!=100; ++k) { float c = 1.f/10000000.f; t -=rdtsc(); for (int i=1; i<10000001; ++i) s+= polyHorner((float(i)+float(k))*c); t +=rdtsc(); } ret &= s!=0; std::cout << t <<std::endl; return ret ? 0 : -1; } [innocent@vinavx2 vectorize]$ c++ -Ofast -S polyAVX.cpp -march=core-avx2 ; grep -c "ymm" polyAVX.s 28 [innocent@vinavx2 vectorize]$ c++ -Ofast -S polyAVX.cpp -march=corei7-avx ; grep -c "ymm" polyAVX.s 0 I see a similar problem with gcc-4.9.6, which is unable to use 256-bit YMM registers. It only uses 128-bit XMM registers from SSE instructions. 
Try https://people.eecs.berkeley.edu/~samw/research/code/stream/stream.c $ gcc -march=native -o stream stream.c $ objdump -d stream | grep ymm $ icc does the job properly: $ icc -qopenmp -O3 -xhost stream.c $ objdump -d a.out | grep ymm 401323: c5 fd 10 00 vmovupd (%rax),%ymm0 40132d: c5 fd 10 48 20 vmovupd 0x20(%rax),%ymm1 401332: c5 7d 10 40 40 vmovupd 0x40(%rax),%ymm8 401337: c5 7d 10 48 60 vmovupd 0x60(%rax),%ymm9 40133c: c5 7d 10 1d 3c 65 00 vmovupd 0x653c(%rip),%ymm11 # 407880 <_IO_stdin_used+0x340> 401344: c5 fd 5c 50 f8 vsubpd -0x8(%rax),%ymm0,%ymm2 401349: c5 f5 5c 58 18 vsubpd 0x18(%rax),%ymm1,%ymm3 40134e: c5 3d 5c 50 38 vsubpd 0x38(%rax),%ymm8,%ymm10 401353: c5 35 5c 60 58 vsubpd 0x58(%rax),%ymm9,%ymm12 401358: c5 a5 59 e2 vmulpd %ymm2,%ymm11,%ymm4 401361: c5 a5 59 f3 vmulpd %ymm3,%ymm11,%ymm6 401365: c4 41 25 59 ea vmulpd %ymm10,%ymm11,%ymm13 40136a: c4 41 25 59 fc vmulpd %ymm12,%ymm11,%ymm15 40136f: c5 fd e6 ec vcvttpd2dq %ymm4,%xmm5 401373: c5 fd e6 fe vcvttpd2dq %ymm6,%xmm7 401377: c4 41 7d e6 f5 vcvttpd2dq %ymm13,%xmm14 40137c: c5 f5 ef c9 vpxor %ymm1,%ymm1,%ymm1 401380: c4 41 7d e6 ef vcvttpd2dq %ymm15,%xmm13 401385: c4 e2 7d 58 e2 vpbroadcastd %xmm2,%ymm4 40139c: c4 e3 55 38 c7 01 vinserti128 $0x1,%xmm7,%ymm5,%ymm0 4013a2: c4 e2 7d 3d d9 vpmaxsd %ymm1,%ymm0,%ymm3 4013a7: c4 e2 65 39 f4 vpminsd %ymm4,%ymm3,%ymm6 4013ac: c4 43 0d 38 fd 01 vinserti128 $0x1,%xmm13,%ymm14,%ymm15 4013b2: c4 e2 05 3d e9 vpmaxsd %ymm1,%ymm15,%ymm5 4013b7: c4 e2 55 39 fe vpminsd %ymm6,%ymm5,%ymm7 4013bc: c4 c3 7d 39 f8 01 vextracti128 $0x1,%ymm7,%xmm8 401caa: c5 fd 57 c0 vxorpd %ymm0,%ymm0,%ymm0 401cc8: c5 fd 10 16 vmovupd (%rsi),%ymm2 401cd0: c5 fd 10 5e 20 vmovupd 0x20(%rsi),%ymm3 401cd5: c5 fd 10 66 40 vmovupd 0x40(%rsi),%ymm4 401cda: c5 fd 10 76 60 vmovupd 0x60(%rsi),%ymm6 401ce7: c4 e2 ed b8 0f vfmadd231pd (%rdi),%ymm2,%ymm1 401cec: c4 e2 e5 b8 47 20 vfmadd231pd 0x20(%rdi),%ymm3,%ymm0 401cf2: c5 dd 59 6f 40 vmulpd 0x40(%rdi),%ymm4,%ymm5 401cf7: c5 cd 59 7f 60 vmulpd 
0x60(%rdi),%ymm6,%ymm7 401cfc: c5 d5 58 c9 vaddpd %ymm1,%ymm5,%ymm1 401d00: c5 c5 58 c0 vaddpd %ymm0,%ymm7,%ymm0 401d0d: c5 f5 58 c0 vaddpd %ymm0,%ymm1,%ymm0 401d11: c4 e3 7d 19 c1 01 vextractf128 $0x1,%ymm0,%xmm1 401d6a: c5 fd 10 8c c3 c0 be vmovupd 0x60bec0(%rbx,%rax,8),%ymm1 401d73: c5 f5 59 94 c3 c0 60 vmulpd 0x268660c0(%rbx,%rax,8),%ymm1,%ymm2 401d7c: c5 ed 58 c0 vaddpd %ymm0,%ymm2,%ymm0 401d85: c4 e3 7d 19 c1 01 vextractf128 $0x1,%ymm0,%xmm1 401f9b: c4 e2 7d 19 44 24 10 vbroadcastsd 0x10(%rsp),%ymm0 401fa2: c5 fd 10 8c d1 c0 be vmovupd 0x60bec0(%rcx,%rdx,8),%ymm1 401fab: c4 e2 fd a8 8c d1 c0 vfmadd213pd 0x268660c0(%rcx,%rdx,8),%ymm0,%ymm1 401fb5: c5 fd 2b 8c d1 c0 02 vmovntpd %ymm1,0x4cac02c0(%rcx,%rdx,8) 40213b: c5 fd 10 84 d1 c0 60 vmovupd 0x268660c0(%rcx,%rdx,8),%ymm0 402144: c5 fd 58 8c d1 c0 be vaddpd 0x60bec0(%rcx,%rdx,8),%ymm0,%ymm1 40214d: c5 fd 2b 8c d1 c0 02 vmovntpd %ymm1,0x4cac02c0(%rcx,%rdx,8) 4022dd: c4 e2 7d 19 44 24 10 vbroadcastsd 0x10(%rsp),%ymm0 4022e4: c5 fd 2b 84 d1 c0 02 vmovntpd %ymm0,0x4cac02c0(%rcx,%rdx,8) 4025c2: c5 fd 10 05 76 52 00 vmovupd 0x5276(%rip),%ymm0 # 407840 <_IO_stdin_used+0x300> 4025de: c5 fd 59 0e vmulpd (%rsi),%ymm0,%ymm1 4025e2: c5 fd 11 0e vmovupd %ymm1,(%rsi) 402759: c5 fd 10 15 ff 50 00 vmovupd 0x50ff(%rip),%ymm2 # 407860 <_IO_stdin_used+0x320> 402761: c5 fd 10 0d d7 50 00 vmovupd 0x50d7(%rip),%ymm1 # 407840 <_IO_stdin_used+0x300> 402769: c5 fd 57 c0 vxorpd %ymm0,%ymm0,%ymm0 40276d: c5 fd 2b 94 f8 c0 60 vmovntpd %ymm2,0x268660c0(%rax,%rdi,8) 402776: c5 fd 2b 8c f8 c0 be vmovntpd %ymm1,0x60bec0(%rax,%rdi,8) 40277f: c5 fd 2b 84 f8 c0 02 vmovntpd %ymm0,0x4cac02c0(%rax,%rdi,8) 4030b0: c5 fd 10 84 c8 c0 60 vmovupd 0x268660c0(%rax,%rcx,8),%ymm0 4030b9: c5 fd 2b 84 c8 c0 02 vmovntpd %ymm0,0x4cac02c0(%rax,%rcx,8) 4032f0: c5 fd 10 44 24 08 vmovupd 0x8(%rsp),%ymm0 4032fb: c5 fd 10 4c 24 28 vmovupd 0x28(%rsp),%ymm1 403301: c5 7d 10 44 24 48 vmovupd 0x48(%rsp),%ymm8 403307: c5 7d 10 4c 24 68 vmovupd 0x68(%rsp),%ymm9 40330d: c5 
7d 10 1d 6b 45 00 vmovupd 0x456b(%rip),%ymm11 # 407880 <_IO_stdin_used+0x340> 403315: c5 fd 5c 14 24 vsubpd (%rsp),%ymm0,%ymm2 40331a: c5 f5 5c 5c 24 20 vsubpd 0x20(%rsp),%ymm1,%ymm3 403320: c5 3d 5c 54 24 40 vsubpd 0x40(%rsp),%ymm8,%ymm10 403326: c5 35 5c 64 24 60 vsubpd 0x60(%rsp),%ymm9,%ymm12 40332c: c5 a5 59 e2 vmulpd %ymm2,%ymm11,%ymm4 403334: c5 a5 59 f3 vmulpd %ymm3,%ymm11,%ymm6 403338: c4 41 25 59 ea vmulpd %ymm10,%ymm11,%ymm13 40333d: c4 41 25 59 fc vmulpd %ymm12,%ymm11,%ymm15 403342: c5 fd e6 ec vcvttpd2dq %ymm4,%xmm5 403346: c5 fd e6 fe vcvttpd2dq %ymm6,%xmm7 40334a: c4 41 7d e6 f5 vcvttpd2dq %ymm13,%xmm14 40334f: c5 f5 ef c9 vpxor %ymm1,%ymm1,%ymm1 403353: c4 41 7d e6 ef vcvttpd2dq %ymm15,%xmm13 403358: c4 e2 7d 58 e2 vpbroadcastd %xmm2,%ymm4 403368: c4 e3 55 38 c7 01 vinserti128 $0x1,%xmm7,%ymm5,%ymm0 40336e: c4 e2 7d 3d d9 vpmaxsd %ymm1,%ymm0,%ymm3 403373: c4 e2 65 39 f4 vpminsd %ymm4,%ymm3,%ymm6 403378: c4 43 0d 38 fd 01 vinserti128 $0x1,%xmm13,%ymm14,%ymm15 40337e: c4 e2 05 3d e9 vpmaxsd %ymm1,%ymm15,%ymm5 403383: c4 e2 55 39 fe vpminsd %ymm6,%ymm5,%ymm7 403388: c4 c3 7d 39 f8 01 vextracti128 $0x1,%ymm7,%xmm8 403694: c4 c1 7d 10 84 c0 c0 vmovupd 0x268660c0(%r8,%rax,8),%ymm0 40369e: c4 c1 7d 2b 84 c0 c0 vmovntpd %ymm0,0x4cac02c0(%r8,%rax,8) 4038ba: c5 fd 10 84 d1 c0 60 vmovupd 0x268660c0(%rcx,%rdx,8),%ymm0 4038c3: c5 fd 58 8c d1 c0 be vaddpd 0x60bec0(%rcx,%rdx,8),%ymm0,%ymm1 4038cc: c5 fd 2b 8c d1 c0 02 vmovntpd %ymm1,0x4cac02c0(%rcx,%rdx,8) 403b4a: c4 e2 7d 19 c1 vbroadcastsd %xmm1,%ymm0 403b4f: c5 fd 10 94 d1 c0 be vmovupd 0x60bec0(%rcx,%rdx,8),%ymm2 403b58: c4 e2 fd a8 94 d1 c0 vfmadd213pd 0x268660c0(%rcx,%rdx,8),%ymm0,%ymm2 403b62: c5 fd 2b 94 d1 c0 02 vmovntpd %ymm2,0x4cac02c0(%rcx,%rdx,8) 403e2e: c5 fd 57 c0 vxorpd %ymm0,%ymm0,%ymm0 403e40: c5 fd 10 14 dd c0 be vmovupd 0x60bec0(,%rbx,8),%ymm2 403e4d: c5 fd 10 1c dd e0 be vmovupd 0x60bee0(,%rbx,8),%ymm3 403e56: c5 fd 10 24 dd 00 bf vmovupd 0x60bf00(,%rbx,8),%ymm4 403e5f: c5 fd 10 34 dd 20 bf 
vmovupd 0x60bf20(,%rbx,8),%ymm6 403e68: c4 e2 ed b8 0c dd c0 vfmadd231pd 0x268660c0(,%rbx,8),%ymm2,%ymm1 403e72: c4 e2 e5 b8 04 dd e0 vfmadd231pd 0x268660e0(,%rbx,8),%ymm3,%ymm0 403e7c: c5 dd 59 2c dd 00 61 vmulpd 0x26866100(,%rbx,8),%ymm4,%ymm5 403e85: c5 cd 59 3c dd 20 61 vmulpd 0x26866120(,%rbx,8),%ymm6,%ymm7 403e8e: c5 d5 58 c9 vaddpd %ymm1,%ymm5,%ymm1 403e92: c5 c5 58 c0 vaddpd %ymm0,%ymm7,%ymm0 403e9f: c5 f5 58 c0 vaddpd %ymm0,%ymm1,%ymm0 403ea3: c4 e3 7d 19 c1 01 vextractf128 $0x1,%ymm0,%xmm1 403efb: c4 c1 7d 10 8c c1 c0 vmovupd 0x60bec0(%r9,%rax,8),%ymm1 403f05: c4 c1 75 59 94 c1 c0 vmulpd 0x268660c0(%r9,%rax,8),%ymm1,%ymm2 403f0f: c5 ed 58 c0 vaddpd %ymm0,%ymm2,%ymm0 403f18: c4 e3 7d 19 c1 01 vextractf128 $0x1,%ymm0,%xmm1 404216: c4 e2 7d 19 c1 vbroadcastsd %xmm1,%ymm0 40421b: c5 fd 2b 84 d1 c0 02 vmovntpd %ymm0,0x4cac02c0(%rcx,%rdx,8) 404690: c5 fe 6f 0e vmovdqu (%rsi),%ymm1 404694: c5 fe 6f 56 20 vmovdqu 0x20(%rsi),%ymm2 404699: c5 fe 6f 5e 40 vmovdqu 0x40(%rsi),%ymm3 40469e: c5 fe 6f 66 60 vmovdqu 0x60(%rsi),%ymm4 4046a3: c5 fe 6f ae 80 00 00 vmovdqu 0x80(%rsi),%ymm5 4046ab: c5 fe 6f b6 a0 00 00 vmovdqu 0xa0(%rsi),%ymm6 4046b3: c5 fe 6f be c0 00 00 vmovdqu 0xc0(%rsi),%ymm7 4046bb: c5 7e 6f 86 e0 00 00 vmovdqu 0xe0(%rsi),%ymm8 4046c3: c5 fd 7f 0f vmovdqa %ymm1,(%rdi) 4046c7: c5 fd 7f 57 20 vmovdqa %ymm2,0x20(%rdi) 4046cc: c5 fd 7f 5f 40 vmovdqa %ymm3,0x40(%rdi) 4046d1: c5 fd 7f 67 60 vmovdqa %ymm4,0x60(%rdi) 4046d6: c5 fd 7f af 80 00 00 vmovdqa %ymm5,0x80(%rdi) 4046de: c5 fd 7f b7 a0 00 00 vmovdqa %ymm6,0xa0(%rdi) 4046e6: c5 fd 7f bf c0 00 00 vmovdqa %ymm7,0xc0(%rdi) 4046ee: c5 7d 7f 87 e0 00 00 vmovdqa %ymm8,0xe0(%rdi) 40475c: c5 fe 6f 0e vmovdqu (%rsi),%ymm1 404760: c5 fe 6f 56 20 vmovdqu 0x20(%rsi),%ymm2 404765: c5 fe 6f 5e 40 vmovdqu 0x40(%rsi),%ymm3 40476a: c5 fe 6f 66 60 vmovdqu 0x60(%rsi),%ymm4 40476f: c5 fe 6f ae 80 00 00 vmovdqu 0x80(%rsi),%ymm5 404777: c5 fe 6f b6 a0 00 00 vmovdqu 0xa0(%rsi),%ymm6 40477f: c5 fe 6f be c0 00 00 vmovdqu 
0xc0(%rsi),%ymm7 404787: c5 7e 6f 86 e0 00 00 vmovdqu 0xe0(%rsi),%ymm8 404796: c5 fd e7 0f vmovntdq %ymm1,(%rdi) 40479a: c5 fd e7 57 20 vmovntdq %ymm2,0x20(%rdi) 40479f: c5 fd e7 5f 40 vmovntdq %ymm3,0x40(%rdi) 4047a4: c5 fd e7 67 60 vmovntdq %ymm4,0x60(%rdi) 4047a9: c5 fd e7 af 80 00 00 vmovntdq %ymm5,0x80(%rdi) 4047b1: c5 fd e7 b7 a0 00 00 vmovntdq %ymm6,0xa0(%rdi) 4047b9: c5 fd e7 bf c0 00 00 vmovntdq %ymm7,0xc0(%rdi) 4047c1: c5 7d e7 87 e0 00 00 vmovntdq %ymm8,0xe0(%rdi) 4048f0: c5 fc 10 86 20 ff ff vmovups -0xe0(%rsi),%ymm0 4048f8: c5 fc 29 87 20 ff ff vmovaps %ymm0,-0xe0(%rdi) 404900: c5 fc 10 86 40 ff ff vmovups -0xc0(%rsi),%ymm0 404908: c5 fc 29 87 40 ff ff vmovaps %ymm0,-0xc0(%rdi) 404910: c5 fc 10 86 60 ff ff vmovups -0xa0(%rsi),%ymm0 404918: c5 fc 29 87 60 ff ff vmovaps %ymm0,-0xa0(%rdi) 404920: c5 fc 10 46 80 vmovups -0x80(%rsi),%ymm0 404925: c5 fc 29 47 80 vmovaps %ymm0,-0x80(%rdi) 40492a: c5 fc 10 46 a0 vmovups -0x60(%rsi),%ymm0 40492f: c5 fc 29 47 a0 vmovaps %ymm0,-0x60(%rdi) 404934: c5 fc 10 46 c0 vmovups -0x40(%rsi),%ymm0 404939: c5 fc 29 47 c0 vmovaps %ymm0,-0x40(%rdi) 40493e: c5 fc 10 46 e0 vmovups -0x20(%rsi),%ymm0 404943: c5 fc 29 47 e0 vmovaps %ymm0,-0x20(%rdi) 404a40: c5 fc 10 06 vmovups (%rsi),%ymm0 404a44: c5 fc 11 07 vmovups %ymm0,(%rdi) 404a48: c5 fc 10 44 0e e0 vmovups -0x20(%rsi,%rcx,1),%ymm0 404a4e: c5 fc 11 44 0f e0 vmovups %ymm0,-0x20(%rdi,%rcx,1) 404a60: c5 fc 10 06 vmovups (%rsi),%ymm0 404a64: c5 fc 11 07 vmovups %ymm0,(%rdi) 404a68: c5 fc 10 46 20 vmovups 0x20(%rsi),%ymm0 404a6d: c5 fc 11 47 20 vmovups %ymm0,0x20(%rdi) 404a72: c5 fc 10 44 0e e0 vmovups -0x20(%rsi,%rcx,1),%ymm0 404a78: c5 fc 11 44 0f e0 vmovups %ymm0,-0x20(%rdi,%rcx,1) 404a90: c5 fc 10 06 vmovups (%rsi),%ymm0 404a94: c5 fc 11 07 vmovups %ymm0,(%rdi) 404a98: c5 fc 10 46 20 vmovups 0x20(%rsi),%ymm0 404a9d: c5 fc 11 47 20 vmovups %ymm0,0x20(%rdi) 404aa2: c5 fc 10 46 40 vmovups 0x40(%rsi),%ymm0 404aa7: c5 fc 11 47 40 vmovups %ymm0,0x40(%rdi) 404aac: c5 fc 10 44 0e e0 
vmovups -0x20(%rsi,%rcx,1),%ymm0 404ab2: c5 fc 11 44 0f e0 vmovups %ymm0,-0x20(%rdi,%rcx,1) 404ac0: c5 fc 10 06 vmovups (%rsi),%ymm0 404ac4: c5 fc 11 07 vmovups %ymm0,(%rdi) 404ac8: c5 fc 10 46 20 vmovups 0x20(%rsi),%ymm0 404acd: c5 fc 11 47 20 vmovups %ymm0,0x20(%rdi) 404ad2: c5 fc 10 46 40 vmovups 0x40(%rsi),%ymm0 404ad7: c5 fc 11 47 40 vmovups %ymm0,0x40(%rdi) 404adc: c5 fc 10 46 60 vmovups 0x60(%rsi),%ymm0 404ae1: c5 fc 11 47 60 vmovups %ymm0,0x60(%rdi) 404ae6: c5 fc 10 44 0e e0 vmovups -0x20(%rsi,%rcx,1),%ymm0 404aec: c5 fc 11 44 0f e0 vmovups %ymm0,-0x20(%rdi,%rcx,1) 404b00: c5 fc 10 06 vmovups (%rsi),%ymm0 404b04: c5 fc 11 07 vmovups %ymm0,(%rdi) 404b08: c5 fc 10 46 20 vmovups 0x20(%rsi),%ymm0 404b0d: c5 fc 11 47 20 vmovups %ymm0,0x20(%rdi) 404b12: c5 fc 10 46 40 vmovups 0x40(%rsi),%ymm0 404b17: c5 fc 11 47 40 vmovups %ymm0,0x40(%rdi) 404b1c: c5 fc 10 46 60 vmovups 0x60(%rsi),%ymm0 404b21: c5 fc 11 47 60 vmovups %ymm0,0x60(%rdi) 404b26: c5 fc 10 86 80 00 00 vmovups 0x80(%rsi),%ymm0 404b2e: c5 fc 11 87 80 00 00 vmovups %ymm0,0x80(%rdi) 404b36: c5 fc 10 44 0e e0 vmovups -0x20(%rsi,%rcx,1),%ymm0 404b3c: c5 fc 11 44 0f e0 vmovups %ymm0,-0x20(%rdi,%rcx,1) 404b50: c5 fc 10 06 vmovups (%rsi),%ymm0 404b54: c5 fc 11 07 vmovups %ymm0,(%rdi) 404b58: c5 fc 10 46 20 vmovups 0x20(%rsi),%ymm0 404b5d: c5 fc 11 47 20 vmovups %ymm0,0x20(%rdi) 404b62: c5 fc 10 46 40 vmovups 0x40(%rsi),%ymm0 404b67: c5 fc 11 47 40 vmovups %ymm0,0x40(%rdi) 404b6c: c5 fc 10 46 60 vmovups 0x60(%rsi),%ymm0 404b71: c5 fc 11 47 60 vmovups %ymm0,0x60(%rdi) 404b76: c5 fc 10 86 80 00 00 vmovups 0x80(%rsi),%ymm0 404b7e: c5 fc 11 87 80 00 00 vmovups %ymm0,0x80(%rdi) 404b86: c5 fc 10 86 a0 00 00 vmovups 0xa0(%rsi),%ymm0 404b8e: c5 fc 11 87 a0 00 00 vmovups %ymm0,0xa0(%rdi) 404b96: c5 fc 10 44 0e e0 vmovups -0x20(%rsi,%rcx,1),%ymm0 404b9c: c5 fc 11 44 0f e0 vmovups %ymm0,-0x20(%rdi,%rcx,1) 404bb0: c5 fc 10 06 vmovups (%rsi),%ymm0 404bb4: c5 fc 11 07 vmovups %ymm0,(%rdi) 404bb8: c5 fc 10 46 20 vmovups 
0x20(%rsi),%ymm0 404bbd: c5 fc 11 47 20 vmovups %ymm0,0x20(%rdi) 404bc2: c5 fc 10 46 40 vmovups 0x40(%rsi),%ymm0 404bc7: c5 fc 11 47 40 vmovups %ymm0,0x40(%rdi) 404bcc: c5 fc 10 46 60 vmovups 0x60(%rsi),%ymm0 404bd1: c5 fc 11 47 60 vmovups %ymm0,0x60(%rdi) 404bd6: c5 fc 10 86 80 00 00 vmovups 0x80(%rsi),%ymm0 404bde: c5 fc 11 87 80 00 00 vmovups %ymm0,0x80(%rdi) 404be6: c5 fc 10 86 a0 00 00 vmovups 0xa0(%rsi),%ymm0 404bee: c5 fc 11 87 a0 00 00 vmovups %ymm0,0xa0(%rdi) 404bf6: c5 fc 10 86 c0 00 00 vmovups 0xc0(%rsi),%ymm0 404bfe: c5 fc 11 87 c0 00 00 vmovups %ymm0,0xc0(%rdi) 404c06: c5 fc 10 44 0e e0 vmovups -0x20(%rsi,%rcx,1),%ymm0 404c0c: c5 fc 11 44 0f e0 vmovups %ymm0,-0x20(%rdi,%rcx,1) 404c20: c5 fc 10 84 0e 00 ff vmovups -0x100(%rsi,%rcx,1),%ymm0 404c29: c5 fc 11 84 0f 00 ff vmovups %ymm0,-0x100(%rdi,%rcx,1) 404c32: c5 fc 10 84 0e 20 ff vmovups -0xe0(%rsi,%rcx,1),%ymm0 404c3b: c5 fc 11 84 0f 20 ff vmovups %ymm0,-0xe0(%rdi,%rcx,1) 404c44: c5 fc 10 84 0e 40 ff vmovups -0xc0(%rsi,%rcx,1),%ymm0 404c4d: c5 fc 11 84 0f 40 ff vmovups %ymm0,-0xc0(%rdi,%rcx,1) 404c56: c5 fc 10 84 0e 60 ff vmovups -0xa0(%rsi,%rcx,1),%ymm0 404c5f: c5 fc 11 84 0f 60 ff vmovups %ymm0,-0xa0(%rdi,%rcx,1) 404c68: c5 fc 10 44 0e 80 vmovups -0x80(%rsi,%rcx,1),%ymm0 404c6e: c5 fc 11 44 0f 80 vmovups %ymm0,-0x80(%rdi,%rcx,1) 404c74: c5 fc 10 44 0e a0 vmovups -0x60(%rsi,%rcx,1),%ymm0 404c7a: c5 fc 11 44 0f a0 vmovups %ymm0,-0x60(%rdi,%rcx,1) 404c80: c5 fc 10 44 0e c0 vmovups -0x40(%rsi,%rcx,1),%ymm0 404c86: c5 fc 11 44 0f c0 vmovups %ymm0,-0x40(%rdi,%rcx,1) 404c8c: c5 fc 10 44 0e e0 vmovups -0x20(%rsi,%rcx,1),%ymm0 404c92: c5 fc 11 44 0f e0 vmovups %ymm0,-0x20(%rdi,%rcx,1) $ Weird, why g++ can do the task although probably less efficiently than icc? 
$ g++ -O3 -march=core-avx-i -mtune=core-avx-i -mavx2 stream.c ; objdump -d a.out | grep ymm stream.c:106:48: warning: deprecated conversion from string constant to 'char*' [-Wwrite-strings] "Add: ", "Triad: ", "Dot: "}; ^ stream.c:106:48: warning: deprecated conversion from string constant to 'char*' [-Wwrite-strings] stream.c:106:48: warning: deprecated conversion from string constant to 'char*' [-Wwrite-strings] stream.c:106:48: warning: deprecated conversion from string constant to 'char*' [-Wwrite-strings] stream.c:106:48: warning: deprecated conversion from string constant to 'char*' [-Wwrite-strings] 4006ac: c5 fd 28 0d 2c 11 00 vmovapd 0x112c(%rip),%ymm1 # 4017e0 <_ZL5label+0xa0> 4006b4: c5 fd 28 05 44 11 00 vmovapd 0x1144(%rip),%ymm0 # 401800 <_ZL5label+0xc0> 4006c0: c5 fd 29 88 80 61 ab vmovapd %ymm1,0x4cab6180(%rax) 4006cc: c5 fd 29 80 60 c1 85 vmovapd %ymm0,0x2685c160(%rax) 400738: c5 fd 28 01 vmovapd (%rcx),%ymm0 400740: c5 fd 58 c0 vaddpd %ymm0,%ymm0,%ymm0 400744: c5 fd 29 41 e0 vmovapd %ymm0,-0x20(%rcx) 400803: c5 fd 28 15 15 10 00 vmovapd 0x1015(%rip),%ymm2 # 401820 <_ZL5label+0xe0> 400817: c5 fd 29 95 d0 fd ff vmovapd %ymm2,-0x230(%rbp) 4008d0: c5 fd 28 95 d0 fd ff vmovapd -0x230(%rbp),%ymm2 400908: c5 fd 29 10 vmovapd %ymm2,(%rax) 40091c: c5 fd 29 95 d0 fd ff vmovapd %ymm2,-0x230(%rbp) 40098c: c5 fd 28 95 d0 fd ff vmovapd -0x230(%rbp),%ymm2 4009b8: c5 fd 28 80 80 61 ab vmovapd 0x4cab6180(%rax),%ymm0 4009c4: c5 fd 58 80 60 c1 85 vaddpd 0x2685c160(%rax),%ymm0,%ymm0 4009cc: c5 fd 29 80 60 21 60 vmovapd %ymm0,0x602160(%rax) 4009e3: c5 fd 29 95 d0 fd ff vmovapd %ymm2,-0x230(%rbp) 400a56: c5 fd 28 95 d0 fd ff vmovapd -0x230(%rbp),%ymm2 400a80: c5 ed 59 80 80 c1 85 vmulpd 0x2685c180(%rax),%ymm2,%ymm0 400a88: c5 fd 58 80 80 61 ab vaddpd 0x4cab6180(%rax),%ymm0,%ymm0 400a94: c5 fd 29 80 60 21 60 vmovapd %ymm0,0x602160(%rax) 400aab: c5 fd 29 95 d0 fd ff vmovapd %ymm2,-0x230(%rbp) 400b1e: c5 fd 28 95 d0 fd ff vmovapd -0x230(%rbp),%ymm2 400b77: c5 fd 29 95 d0 
fd ff vmovapd %ymm2,-0x230(%rbp) 400bab: c5 fd 28 95 d0 fd ff vmovapd -0x230(%rbp),%ymm2 401006: c4 e3 6d 18 95 28 ff vinsertf128 $0x1,-0xd8(%rbp),%ymm2,%ymm2 401018: c4 e3 7d 18 85 48 ff vinsertf128 $0x1,-0xb8(%rbp),%ymm0,%ymm0 401022: c5 ed 5c 95 10 ff ff vsubpd -0xf0(%rbp),%ymm2,%ymm2 40102a: c5 fd 5c 85 30 ff ff vsubpd -0xd0(%rbp),%ymm0,%ymm0 401032: c5 fd 28 25 66 07 00 vmovapd 0x766(%rip),%ymm4 # 4017a0 <_ZL5label+0x60> 401044: c5 ed 59 d4 vmulpd %ymm4,%ymm2,%ymm2 401048: c5 fd 59 c4 vmulpd %ymm4,%ymm0,%ymm0 40104c: c5 fd e6 d2 vcvttpd2dq %ymm2,%xmm2 401058: c5 fd e6 c0 vcvttpd2dq %ymm0,%xmm0 40105c: c4 e3 6d 38 d0 01 vinserti128 $0x1,%xmm0,%ymm2,%ymm2 401062: c4 e2 6d 3d d3 vpmaxsd %ymm3,%ymm2,%ymm2 401067: c4 e2 6d 39 15 50 07 vpminsd 0x750(%rip),%ymm2,%ymm2 # 4017c0 <_ZL5label+0x80> 401096: c4 e3 75 18 8d 68 ff vinsertf128 $0x1,-0x98(%rbp),%ymm1,%ymm1 4010a4: c5 f5 5c 8d 50 ff ff vsubpd -0xb0(%rbp),%ymm1,%ymm1 4010b7: c5 f5 59 cc vmulpd %ymm4,%ymm1,%ymm1 4010bb: c5 fd e6 c9 vcvttpd2dq %ymm1,%xmm1 4010d3: c4 e3 7d 18 45 88 01 vinsertf128 $0x1,-0x78(%rbp),%ymm0,%ymm0 4010da: c5 fd 5c 85 70 ff ff vsubpd -0x90(%rbp),%ymm0,%ymm0 4010e2: c5 fd 59 c4 vmulpd %ymm4,%ymm0,%ymm0 4010e6: c5 fd e6 c0 vcvttpd2dq %ymm0,%xmm0 4010ea: c4 e3 75 38 c0 01 vinserti128 $0x1,%xmm0,%ymm1,%ymm0 4010f5: c4 e2 7d 3d c3 vpmaxsd %ymm3,%ymm0,%ymm0 4010fa: c4 e2 6d 39 c0 vpminsd %ymm0,%ymm2,%ymm0 4010ff: c4 e3 7d 46 c8 01 vperm2i128 $0x1,%ymm0,%ymm0,%ymm1 401105: c4 e2 7d 39 c1 vpminsd %ymm1,%ymm0,%ymm0 40110f: c5 f5 73 d8 08 vpsrldq $0x8,%ymm0,%ymm1 401114: c4 e2 7d 39 c9 vpminsd %ymm1,%ymm0,%ymm1 401119: c5 fd 73 d9 04 vpsrldq $0x4,%ymm1,%ymm0 40111e: c4 e2 75 39 c0 vpminsd %ymm0,%ymm1,%ymm0 4011e0: c4 e2 7d 19 c0 vbroadcastsd %xmm0,%ymm0 4011f0: c5 fd 29 00 vmovapd %ymm0,(%rax) 401218: c5 fd 28 80 80 61 ab vmovapd 0x4cab6180(%rax),%ymm0 401224: c5 fd 58 80 60 c1 85 vaddpd 0x2685c160(%rax),%ymm0,%ymm0 40122c: c5 fd 29 80 60 21 60 vmovapd %ymm0,0x602160(%rax) 401240: c4 e2 7d 19 c0 
vbroadcastsd %xmm0,%ymm0 401250: c5 fd 59 88 80 c1 85 vmulpd 0x2685c180(%rax),%ymm0,%ymm1 401258: c5 f5 58 88 80 61 ab vaddpd 0x4cab6180(%rax),%ymm1,%ymm1 401264: c5 fd 29 88 60 21 60 vmovapd %ymm1,0x602160(%rax) $ The reason why #c1 (as well as #c0) is only vectorized using vector length of 8 rather than 4 is that the loop iterator is cast to float and therefore needed inside of the loop in vector registers: pr57952.C:21:20: note: op not supported by target. pr57952.C:21:20: note: not vectorized: relevant stmt not supported: i_16 = i_41 + 1; pr57952.C:21:20: note: bad operation or unsupported loop bound. and AVX doesn't support V8SImode addition. Now, perhaps we could have an optimization that in that case if all the iterators can be provably exactly represented in the floating point value we could try to do what the programmer should have done, i.e. add a float iterator that is set to 1.0f and incremented in each iteration and used instead of float(i). But it won't work in this case, because you need 24 bits for the iterator and float only has 23 bit mantissa. for (int k=0; k!=100; ++k) { float c = 1.f/10000000.f; float fi = 1.f; for (int i=1; i<10000001; ++i) { s+= polyHorner((fi+float(k))*c); fi += 1.f; } } is vectorized with -Ofast -mavx just fine, with a vectorization factor of 8. As for #c2/#c3, GCC 4.9 is not supported anymore and the dumps are too large to find out what exactly you mean by efficient and not efficient, both the ICC and GCC generated assemblies use both %ymm and %xmm registers depending on what exactly they need. (In reply to Jakub Jelinek from comment #4) > As for #c2/#c3, GCC 4.9 is not supported anymore and the dumps are too large > to find out what exactly you mean by efficient and not efficient, both the > ICC and GCC generated assemblies use both %ymm and %xmm registers depending > on what exactly they need. "gcc -march=native" or "g++ -march=native" do not insert a single instruction using ymm registers unless -O3 is used. 
$ gcc -O3 -march=native -mavx2 stream.c ; objdump -d a.out | grep ymm | wc -l 63 $ gcc -O2 -march=native -mavx2 stream.c ; objdump -d a.out | grep ymm | wc -l 0 $ gcc -O2 -march=native stream.c ; objdump -d a.out | grep ymm | wc -l 0 $ gcc -O3 -march=native stream.c ; objdump -d a.out | grep ymm | wc -l 63 $ I am on Gentoo Linux where 5.4.0 is still in testing only, same for 6.3 and 7.1. The 4.9 series is the last which is generally usable. (In reply to mmokrejs from comment #5) > (In reply to Jakub Jelinek from comment #4) > > > As for #c2/#c3, GCC 4.9 is not supported anymore and the dumps are too large > > to find out what exactly you mean by efficient and not efficient, both the > > ICC and GCC generated assemblies use both %ymm and %xmm registers depending > > on what exactly the need. > > "gcc -march=native" or "g++ march=native" do not insert a single instructing > using ymm registers unless -O3 is used. > > $ gcc -O3 -march=native -mavx2 stream.c ; objdump -d a.out | grep ymm | wc > -l > 63 > $ gcc -O2 -march=native -mavx2 stream.c ; objdump -d a.out | grep ymm | wc > -l > 0 > $ gcc -O2 -march=native stream.c ; objdump -d a.out | grep ymm | wc -l > 0 > $ gcc -O3 -march=native stream.c ; objdump -d a.out | grep ymm | wc -l > 63 > $ Of course, vectorization is only enabled by default for -O3/-Ofast, not at -O2, for vectorization at -O2 you need to use -O2 -ftree-vectorize. (In reply to Jakub Jelinek from comment #6) > > $ gcc -O3 -march=native stream.c ; objdump -d a.out | grep ymm | wc -l > > 63 > > $ > > Of course, vectorization is only enabled by default for -O3/-Ofast, not at > -O2, for vectorization at -O2 you need to use -O2 -ftree-vectorize. $ gcc -O2 -march=native -ftree-vectorize stream.c ; objdump -d a.out | grep ymm | wc -l 60 $ Ah, thanks. Please update the manpage. It says nothing about the need to use -O3 or -Ofast interacting with -march=native or -mavx or -mavx2. <quote> -march=cpu-type Generate instructions for the machine type cpu-type. 
In contrast to -mtune=cpu-type, which merely tunes the generated code for the specified cpu-type, -march=cpu-type allows GCC to generate code that may not run at all on processors other than the one indicated. Specifying -march=cpu-type implies -mtune=cpu-type. The choices for cpu-type are: native This selects the CPU to generate code for at compilation time by determining the processor type of the compiling machine. Using -march=native enables all instruction subsets supported by the local machine (hence the result might not run on different machines). Using -mtune=native produces code optimized for the local machine under the constraints of the selected instruction set. </quote> <quote> sandybridge Intel Sandy Bridge CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, AVX, AES and PCLMUL instruction set support. ivybridge Intel Ivy Bridge CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, AVX, AES, PCLMUL, FSGSBASE, RDRND and F16C instruction set support. haswell Intel Haswell CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, AVX, AVX2, AES, PCLMUL, FSGSBASE, RDRND, FMA, BMI, BMI2 and F16C instruction set support. broadwell Intel Broadwell CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, AVX, AVX2, AES, PCLMUL, FSGSBASE, RDRND, FMA, BMI, BMI2, F16C, RDSEED, ADCX and PREFETCHW instruction set support. </quote> These entries have no description text at all: <quote> -mavx -mno-avx -mavx2 -mno-avx2 -mavx512f -mno-avx512f -mavx512pf -mno-avx512pf -mavx512er -mno-avx512er -mavx512cd -mno-avx512cd </quote> There are hardly any links from within the manpage (notably the intel x86/amd64 section) to -ftree-vectorize. <quote> -ftree-vectorize Perform vectorization on trees. This flag enables -ftree-loop-vectorize and -ftree-slp-vectorize if not explicitly specified. -ftree-loop-vectorize Perform loop vectorization on trees. 
This flag is enabled by default at -O3 and when -ftree-vectorize is enabled. -ftree-slp-vectorize Perform basic block vectorization on trees. This flag is enabled by default at -O3 and when -ftree-vectorize is enabled. </quote> Why should there be any references to that? -march= is an ISA selection option, it says the compiler may use the instructions from the ISA. -ftree-vectorize is an optimization option (default at -O3 and -Ofast), which attempts to vectorize loops using the selected ISA if possible. The reason it isn't on at -O2 is that it doesn't always improve code, it can make code slower as well, and usually makes the code larger; it really depends on the loop etc. Thank you, I updated https://wiki.gentoo.org/wiki/GCC_optimization and added the notes on -ftree-vectorize option. I can now see this vectorized with at least GCC 10 and up. Note we're vectorizing the _outer_ loop here but we also manage to vectorize the inner loop only if I comment out the outer one, it just looks less efficient. .L2: vmovdqa %ymm6, %ymm2 movl $10000000, %eax .p2align 4,,10 .p2align 3 .L3: vmovdqa %ymm2, %ymm0 vpaddd %ymm6, %ymm2, %ymm2 vcvtdq2ps %ymm0, %ymm0 vaddps %ymm5, %ymm0, %ymm0 vmulps %ymm11, %ymm0, %ymm0 vmovaps %ymm0, %ymm1 vfmadd132ps %ymm10, %ymm9, %ymm1 vfmadd132ps %ymm0, %ymm8, %ymm1 vfmadd132ps %ymm0, %ymm7, %ymm1 vfmadd132ps %ymm0, %ymm5, %ymm1 vfmadd132ps %ymm0, %ymm4, %ymm1 vfmadd132ps %ymm1, %ymm4, %ymm0 vaddps %ymm0, %ymm3, %ymm3 decl %eax jne .L3 incl %edx cmpl $12, %edx jne .L2 |