[Bug target/94663] New: [missed optimization] _mm512_dpbusds_epi32 generates excess vmovdqa64
gcc at kheafield dot com
gcc-bugzilla@gcc.gnu.org
Sun Apr 19 19:57:04 GMT 2020
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94663
Bug ID: 94663
Summary: [missed optimization] _mm512_dpbusds_epi32 generates
excess vmovdqa64
Product: gcc
Version: 9.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: gcc at kheafield dot com
Target Milestone: ---
The _mm512_dpbusds_epi32 intrinsic generates extra vmovdqa64 instructions when
used inside a loop. The underlying instruction, vpdpbusds, adds to an
accumulator, so it is commonly used in loops. The compiler appears to be
unnecessarily using two registers for the accumulator by copying it.
Example:
#include "immintrin.h"
// Reproducer for the excess-move issue: runs the _mm512_dpbusds_epi32
// intrinsic over a[0..count) with two independent accumulators (one paired
// with b0, one with b1) and returns the difference of the two accumulators.
__m512i Slow(const __m512i *a, const __m512i b0, const __m512i b1, std::size_t
count) {
// Both accumulators start out as all-zero vectors.
__m512i c0 = _mm512_setzero_epi32();
__m512i c1 = _mm512_setzero_epi32();
for (std::size_t i = 0; i < count; ++i) {
// Each accumulator is passed in as the first argument and overwritten with
// the intrinsic's result, so it is carried from one iteration to the next.
c0 = _mm512_dpbusds_epi32(c0, a[i], b0);
c1 = _mm512_dpbusds_epi32(c1, a[i], b1);
}
// Do not optimize away: the return value depends on both accumulators.
return _mm512_sub_epi32(c0, c1);
}
When compiled with g++ -O3 -mavx512vnni example.cc -S, the main loop is:
.L3:
vmovdqa64 (%rdi), %zmm6
vmovdqa64 %zmm3, %zmm0
vmovdqa64 %zmm4, %zmm2
addq $64, %rdi
vpdpbusds %zmm5, %zmm6, %zmm0
vpdpbusds %zmm1, %zmm6, %zmm2
vmovdqa64 %zmm0, %zmm3
vmovdqa64 %zmm2, %zmm4
cmpq %rdi, %rax
jne .L3
It's copying accumulator zmm3 to zmm0, accumulating in zmm0, then copying back
to zmm3. It should have just used one register. The same happens for zmm4 and
zmm2.
Workaround: use inline assembly.
// Workaround variant: same loop shape as the intrinsic version, but each
// accumulation step is issued as a vpdpbusds instruction via GCC inline asm.
__m512i Fast(const __m512i *a, const __m512i b0, const __m512i b1, std::size_t
count) {
// Both accumulators start out as all-zero vectors.
__m512i c0 = _mm512_setzero_epi32();
__m512i c1 = _mm512_setzero_epi32();
for (std::size_t i = 0; i < count; ++i) {
// Operand constraints: "+x" makes the accumulator (%0) a read-write SSE
// register, "mx" lets a[i] (%1) be either a memory operand or a register,
// and "x" places b0/b1 (%2) in a register.
// NOTE(review): a[i] is the first assembler operand here, whereas in the
// compiler output quoted for the intrinsic version the register loaded from
// a[i] is the second operand -- confirm that both versions compute the same
// value before comparing them.
asm ("vpdpbusds %1, %2, %0" : "+x"(c0) : "mx"(a[i]), "x"(b0));
asm ("vpdpbusds %1, %2, %0" : "+x"(c1) : "mx"(a[i]), "x"(b1));
}
// Do not optimize away: the return value depends on both accumulators.
return _mm512_sub_epi32(c0, c1);
}
Here the generated code is better, with no extra moves:
.L10:
#APP
# 19 "example.cc" 1
vpdpbusds (%rdi), %zmm3, %zmm0
# 0 "" 2
# 20 "example.cc" 1
vpdpbusds (%rdi), %zmm1, %zmm2
# 0 "" 2
#NO_APP
addq $64, %rdi
cmpq %rax, %rdi
jne .L10
Reproduced on the following versions of g++:
g++ -v
Using built-in specs.
COLLECT_GCC=g++
COLLECT_LTO_WRAPPER=/usr/libexec/gcc/x86_64-pc-linux-gnu/9.2.0/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with:
/var/tmp/portage/sys-devel/gcc-9.2.0-r2/work/gcc-9.2.0/configure
--host=x86_64-pc-linux-gnu --build=x86_64-pc-linux-gnu --prefix=/usr
--bindir=/usr/x86_64-pc-linux-gnu/gcc-bin/9.2.0
--includedir=/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include
--datadir=/usr/share/gcc-data/x86_64-pc-linux-gnu/9.2.0
--mandir=/usr/share/gcc-data/x86_64-pc-linux-gnu/9.2.0/man
--infodir=/usr/share/gcc-data/x86_64-pc-linux-gnu/9.2.0/info
--with-gxx-include-dir=/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/g++-v9
--with-python-dir=/share/gcc-data/x86_64-pc-linux-gnu/9.2.0/python
--enable-languages=c,c++,fortran --enable-obsolete --enable-secureplt
--disable-werror --with-system-zlib --enable-nls --without-included-gettext
--enable-checking=release --with-bugurl=https://bugs.gentoo.org/
--with-pkgversion='Gentoo 9.2.0-r2 p3' --disable-esp --enable-libstdcxx-time
--enable-shared --enable-threads=posix --enable-__cxa_atexit
--enable-clocale=gnu --enable-multilib --with-multilib-list=m32,m64
--disable-altivec --disable-fixed-point --enable-targets=all --enable-libgomp
--disable-libmudflap --disable-libssp --disable-systemtap
--enable-vtable-verify --enable-lto --without-isl --enable-default-pie
--enable-default-ssp
Thread model: posix
gcc version 9.2.0 (Gentoo 9.2.0-r2 p3)
g++ -v
Using built-in specs.
COLLECT_GCC=g++
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/8/lto-wrapper
OFFLOAD_TARGET_NAMES=nvptx-none
OFFLOAD_TARGET_DEFAULT=1
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu
8.4.0-1ubuntu1~18.04' --with-bugurl=file:///usr/share/doc/gcc-8/README.Bugs
--enable-languages=c,ada,c++,go,brig,d,fortran,objc,obj-c++ --prefix=/usr
--with-gcc-major-version-only --program-suffix=-8
--program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id
--libexecdir=/usr/lib --without-included-gettext --enable-threads=posix
--libdir=/usr/lib --enable-nls --enable-clocale=gnu --enable-libstdcxx-debug
--enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new
--enable-gnu-unique-object --disable-vtable-verify --enable-libmpx
--enable-plugin --enable-default-pie --with-system-zlib
--with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch
--disable-werror --with-arch-32=i686 --with-abi=m64
--with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic
--enable-offload-targets=nvptx-none --without-cuda-driver
--enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu
--target=x86_64-linux-gnu
Thread model: posix
gcc version 8.4.0 (Ubuntu 8.4.0-1ubuntu1~18.04)
Full source code:
#include <immintrin.h>
#include <cstddef>
// Intrinsic-based version that exhibits the extra vmovdqa64 copies: runs
// _mm512_dpbusds_epi32 over a[0..count) with two independent accumulators
// (one paired with b0, one with b1) and returns their difference.
__m512i Slow(const __m512i *a, const __m512i b0, const __m512i b1, std::size_t
count) {
// Both accumulators start out as all-zero vectors.
__m512i c0 = _mm512_setzero_epi32();
__m512i c1 = _mm512_setzero_epi32();
for (std::size_t i = 0; i < count; ++i) {
// Each accumulator is passed in as the first argument and overwritten with
// the intrinsic's result, so it is carried from one iteration to the next.
c0 = _mm512_dpbusds_epi32(c0, a[i], b0);
c1 = _mm512_dpbusds_epi32(c1, a[i], b1);
}
// Do not optimize away: the return value depends on both accumulators.
return _mm512_sub_epi32(c0, c1);
}
// Inline-assembly workaround: same loop shape as Slow, but each accumulation
// step is issued as a vpdpbusds instruction via GCC inline asm.
__m512i Fast(const __m512i *a, const __m512i b0, const __m512i b1, std::size_t
count) {
// Both accumulators start out as all-zero vectors.
__m512i c0 = _mm512_setzero_epi32();
__m512i c1 = _mm512_setzero_epi32();
for (std::size_t i = 0; i < count; ++i) {
// Operand constraints: "+x" makes the accumulator (%0) a read-write SSE
// register, "mx" lets a[i] (%1) be either a memory operand or a register,
// and "x" places b0/b1 (%2) in a register.
// NOTE(review): a[i] is the first assembler operand here, whereas in the
// compiler output quoted in this report for Slow the register loaded from
// a[i] is the second operand -- confirm that Fast and Slow compute the same
// value before comparing them.
asm ("vpdpbusds %1, %2, %0" : "+x"(c0) : "mx"(a[i]), "x"(b0));
asm ("vpdpbusds %1, %2, %0" : "+x"(c1) : "mx"(a[i]), "x"(b1));
}
// Do not optimize away: the return value depends on both accumulators.
return _mm512_sub_epi32(c0, c1);
}
Command line: g++ -O3 -mavx512vnni -S example.cc
(The same happens with g++ 8.4.0 on a Cascade Lake CPU when compiling with
g++ -O3 -march=native -S example.cc.)
Output: none
More information about the Gcc-bugs
mailing list