snip #include <array> #include <iostream> int main(const int argc, const char** argv) { using value_type = int; // type does not matter using array_type = std::array<value_type, 32>; array_type a, b; // simple init for (size_t i=0; i<a.size(); ++i) a[i] = argc; /* * copy's use only sse-registers and never higher */ b = a; // or for (size_t i=0; i<a.size(); ++i) b[i] = a[i]; for (size_t i=0; i<a.size(); ++i) std::cout<<(b[i])<<' '; return EXIT_SUCCESS; } gcc 8.3.1 and higher (and lower?) -O3 COLLECT_GCC=gcc-8 COLLECT_LTO_WRAPPER=/usr/lib64/gcc/x86_64-suse-linux/8/lto-wrapper OFFLOAD_TARGET_NAMES=hsa:nvptx-none Target: x86_64-suse-linux Configured with: ../configure --prefix=/usr --infodir=/usr/share/info --mandir=/usr/share/man --libdir=/usr/lib64 --libexecdir=/usr/lib64 --enable-languages=c,c++,objc,fortran,obj-c++,ada,go --enable-offload-targets=hsa,nvptx-none=/usr/nvptx-none, --without-cuda-driver --enable-checking=release --disable-werror --with-gxx-include-dir=/usr/include/c++/8 --enable-ssp --disable-libssp --disable-libvtv --disable-cet --disable-libcc1 --enable-plugin --with-bugurl=http://bugs.opensuse.org/ --with-pkgversion='SUSE Linux' --with-slibdir=/lib64 --with-system-zlib --enable-libstdcxx-allocator=new --disable-libstdcxx-pch --enable-version-specific-runtime-libs --with-gcc-major-version-only --enable-linker-build-id --enable-linux-futex --enable-gnu-indirect-function --program-suffix=-8 --without-system-libunwind --enable-multilib --with-arch-32=x86-64 --with-tune=generic --build=x86_64-suse-linux --host=x86_64-suse-linux Thread model: posix gcc version 8.3.1 20190226 [gcc-8-branch revision 269204] (SUSE Linux)
> copy's use only sse-registers and never higher What do you mean by that? Do you want AVX? Then you should let the compiler know that they are available (for instance -march=native).
Ah, I see, this is a DUP of PR 89226 then?
Am 15.05.19 um 21:20 schrieb glisse at gcc dot gnu.org: > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90492 > > --- Comment #1 from Marc Glisse <glisse at gcc dot gnu.org> --- >> copy's use only sse-registers and never higher > > What do you mean by that? Do you want AVX? Then you should let the compiler > know that they are available (for instance -march=native). > Yes, I am using -march=native on a Ryzen 7 2700 (which has AVX/AVX2); you can also compile with -march=skylake-avx512, but the copy operations use only SSE registers in all cases.
#include <array> #include <iostream> int main(const int argc, const char** argv) { using value_type = int64_t; using array_type = std::array<value_type, 16>; array_type a, b; for (size_t i=0; i<a.size(); ++i) a[i] = i; b = a; for (size_t i=0; i<a.size(); ++i) std::cout<<(b[i])<<' '; return EXIT_SUCCESS; } compile with gcc-9 and -O3 -march=skylake-avx512 -mtune=intel -mno-vzeroupper generates this code: 0000000000000000 <main>: 0: 55 push %rbp 1: 48 89 e5 mov %rsp,%rbp 4: 41 54 push %r12 6: 53 push %rbx 7: 48 83 e4 c0 and $0xffffffffffffffc0,%rsp b: 48 8d a4 24 c0 fe ff lea -0x140(%rsp),%rsp 12: ff 13: 62 f1 fd 48 6f 05 00 vmovdqa64 0x0(%rip),%zmm0 # 1d <main+0x1d> 1a: 00 00 00 19: R_X86_64_PC32 .rodata-0x4 1d: 48 8d 9c 24 c0 00 00 lea 0xc0(%rsp),%rbx 24: 00 25: 62 f1 fd 48 7f 44 24 vmovdqa64 %zmm0,0x40(%rsp) 2c: 01 2d: c5 f9 6f d0 vmovdqa %xmm0,%xmm2 31: 62 f1 fd 48 6f 05 00 vmovdqa64 0x0(%rip),%zmm0 # 3b <main+0x3b> 38: 00 00 00 37: R_X86_64_PC32 .rodata+0x3c 3b: 4c 8d a4 24 40 01 00 lea 0x140(%rsp),%r12 42: 00 43: 62 f1 fd 48 7f 44 24 vmovdqa64 %zmm0,0x80(%rsp) 4a: 02 4b: 62 f1 fd 08 6f 5c 24 vmovdqa64 0x50(%rsp),%xmm3 52: 05 53: 62 f1 fd 08 6f 64 24 vmovdqa64 0x60(%rsp),%xmm4 5a: 06 5b: 62 f1 fd 08 6f 6c 24 vmovdqa64 0x70(%rsp),%xmm5 62: 07 63: 62 f1 fd 08 6f 74 24 vmovdqa64 0x90(%rsp),%xmm6 6a: 09 6b: 62 f1 fd 08 6f 7c 24 vmovdqa64 0xa0(%rsp),%xmm7 72: 0a 73: 62 f1 fd 08 6f 4c 24 vmovdqa64 0xb0(%rsp),%xmm1 7a: 0b 7b: 62 f1 fd 08 7f 54 24 vmovdqa64 %xmm2,0xc0(%rsp) 82: 0c 83: 62 f1 fd 08 7f 5c 24 vmovdqa64 %xmm3,0xd0(%rsp) 8a: 0d 8b: 62 f1 fd 08 7f 64 24 vmovdqa64 %xmm4,0xe0(%rsp) 92: 0e 93: 62 f1 fd 08 7f 44 24 vmovdqa64 %xmm0,0x100(%rsp) 9a: 10 9b: 62 f1 fd 08 7f 6c 24 vmovdqa64 %xmm5,0xf0(%rsp) a2: 0f a3: 62 f1 fd 08 7f 74 24 vmovdqa64 %xmm6,0x110(%rsp) aa: 11 ab: 62 f1 fd 08 7f 7c 24 vmovdqa64 %xmm7,0x120(%rsp) b2: 12 b3: 62 f1 fd 08 7f 4c 24 vmovdqa64 %xmm1,0x130(%rsp) ba: 13 bb: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) c0: 48 8b 33 mov (%rbx),%rsi c3: bf 00 00 
00 00 mov $0x0,%edi c4: R_X86_64_32 std::cout c8: 48 83 c3 08 add $0x8,%rbx cc: e8 00 00 00 00 callq d1 <main+0xd1> cd: R_X86_64_PLT32 std::ostream& std::ostream::_M_insert<long>(long)-0x4 d1: 48 89 c7 mov %rax,%rdi d4: ba 01 00 00 00 mov $0x1,%edx d9: c6 44 24 3f 20 movb $0x20,0x3f(%rsp) de: 48 8d 74 24 3f lea 0x3f(%rsp),%rsi e3: e8 00 00 00 00 callq e8 <main+0xe8> e4: R_X86_64_PLT32 std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)-0x4 e8: 49 39 dc cmp %rbx,%r12 eb: 75 d3 jne c0 <main+0xc0> ed: 48 8d 65 f0 lea -0x10(%rbp),%rsp f1: 31 c0 xor %eax,%eax f3: 5b pop %rbx f4: 41 5c pop %r12 f6: 5d pop %rbp f7: c3 retq f8: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1) ff: 00
Confirmed. This is probably the generic block-copying (move_by_pieces) code expanding MEM[(char * {ref-all})&b] = MEM[(char * {ref-all})&a]; and the target directing it to use at most SSE regs: /* MOVE_MAX_PIECES is the number of bytes at a time which we can move efficiently, as opposed to MOVE_MAX which is the maximum number of bytes we can move with a single instruction. ??? We should use TImode in 32-bit mode and use OImode or XImode if they are available. But since by_pieces_ninsns determines the widest mode with MAX_FIXED_MODE_SIZE, we can only use TImode in 64-bit mode. */ #define MOVE_MAX_PIECES \ ((TARGET_64BIT \ && TARGET_SSE2 \ && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \ && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ ? GET_MODE_SIZE (TImode) : UNITS_PER_WORD) I am not sure whether the MAX_FIXED_MODE_SIZE comment is still valid.
The bug is still present in gcc 10.0.0 20191210. When can I expect this to be fixed?