[Bug target/94663] New: [missed optimization] _mm512_dpbusds_epi32 generates excess vmovdqa64

gcc at kheafield dot com gcc-bugzilla@gcc.gnu.org
Sun Apr 19 19:57:04 GMT 2020


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94663

            Bug ID: 94663
           Summary: [missed optimization] _mm512_dpbusds_epi32 generates
                    excess vmovdqa64
           Product: gcc
           Version: 9.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: gcc at kheafield dot com
  Target Milestone: ---

The _mm512_dpbusds_epi32 intrinsic generates extra vmovdqa64 instructions when
used inside a loop.  The underlying instruction, vpdpbusds, adds to an
accumulator, so it is commonly used in loops.  The compiler appears to be
unnecessarily using two registers for the accumulator by copying it.  

Example:

#include "immintrin.h"
// Reproducer: accumulates _mm512_dpbusds_epi32 results for a[0..count) against
// b0 and b1 into two separate accumulators and returns their difference.
// This is the intrinsic-based form whose loop shows the extra vmovdqa64 copies.
__m512i Slow(const __m512i *a, const __m512i b0, const __m512i b1, std::size_t
count) {
  __m512i c0 = _mm512_setzero_epi32();  // accumulator for the b0 products
  __m512i c1 = _mm512_setzero_epi32();  // accumulator for the b1 products
  for (std::size_t i = 0; i < count; ++i) {
    // Each accumulator is both an input and the result of the intrinsic.
    c0 = _mm512_dpbusds_epi32(c0, a[i], b0);
    c1 = _mm512_dpbusds_epi32(c1, a[i], b1);
  }
  // Both accumulators feed the return value so that neither is optimized away.
  return _mm512_sub_epi32(c0, c1);
}

When compiled with g++ -O3 -mavx512vnni example.cc -S, the main loop is:

.L3:
        vmovdqa64       (%rdi), %zmm6
        vmovdqa64       %zmm3, %zmm0
        vmovdqa64       %zmm4, %zmm2
        addq    $64, %rdi
        vpdpbusds       %zmm5, %zmm6, %zmm0
        vpdpbusds       %zmm1, %zmm6, %zmm2
        vmovdqa64       %zmm0, %zmm3
        vmovdqa64       %zmm2, %zmm4
        cmpq    %rdi, %rax
        jne     .L3

It's copying accumulator zmm3 to zmm0, accumulating in zmm0, then copying back
to zmm3.  It should have just used one register.  The same happens for zmm4 and
zmm2.  

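Workaround: use inline assembly.  (Note that in AT&T operand order the asm
below passes a[i] as the instruction's second source and b0/b1 as its first
source, which is the opposite of the argument order used with the intrinsic in
Slow, so the two functions do not compute the same values in general; the
comparison here is only about how the accumulator register is allocated.)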

// Workaround variant of Slow: issues vpdpbusds through inline asm so that each
// accumulator is a single read-write ("+x") operand updated in place.
// NOTE(review): in the AT&T template "%1, %2, %0", a[i] (%1, register or
// memory) is the instruction's second source and b0/b1 (%2) its first source,
// whereas Slow's generated code has a[i] as the first source. Confirm that this
// swap of the unsigned/signed operand roles is intended.
__m512i Fast(const __m512i *a, const __m512i b0, const __m512i b1, std::size_t
count) {
  __m512i c0 = _mm512_setzero_epi32();  // accumulator updated in place by the asm
  __m512i c1 = _mm512_setzero_epi32();  // accumulator updated in place by the asm
  for (std::size_t i = 0; i < count; ++i) {
    // "mx" lets the compiler pass a[i] either from memory or from a register.
    asm ("vpdpbusds %1, %2, %0" : "+x"(c0) : "mx"(a[i]), "x"(b0));
    asm ("vpdpbusds %1, %2, %0" : "+x"(c1) : "mx"(a[i]), "x"(b1));
  }
  // Both accumulators feed the return value so that neither is optimized away.
  return _mm512_sub_epi32(c0, c1);
}

Here, the generated code is better, with no extra moves.  

.L10:
#APP
# 19 "example.cc" 1
        vpdpbusds (%rdi), %zmm3, %zmm0
# 0 "" 2
# 20 "example.cc" 1
        vpdpbusds (%rdi), %zmm1, %zmm2
# 0 "" 2
#NO_APP
        addq    $64, %rdi
        cmpq    %rax, %rdi
        jne     .L10

Reproduced on the following versions of g++:

g++ -v 
Using built-in specs.
COLLECT_GCC=g++
COLLECT_LTO_WRAPPER=/usr/libexec/gcc/x86_64-pc-linux-gnu/9.2.0/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with:
/var/tmp/portage/sys-devel/gcc-9.2.0-r2/work/gcc-9.2.0/configure
--host=x86_64-pc-linux-gnu --build=x86_64-pc-linux-gnu --prefix=/usr
--bindir=/usr/x86_64-pc-linux-gnu/gcc-bin/9.2.0
--includedir=/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include
--datadir=/usr/share/gcc-data/x86_64-pc-linux-gnu/9.2.0
--mandir=/usr/share/gcc-data/x86_64-pc-linux-gnu/9.2.0/man
--infodir=/usr/share/gcc-data/x86_64-pc-linux-gnu/9.2.0/info
--with-gxx-include-dir=/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/g++-v9
--with-python-dir=/share/gcc-data/x86_64-pc-linux-gnu/9.2.0/python
--enable-languages=c,c++,fortran --enable-obsolete --enable-secureplt
--disable-werror --with-system-zlib --enable-nls --without-included-gettext
--enable-checking=release --with-bugurl=https://bugs.gentoo.org/
--with-pkgversion='Gentoo 9.2.0-r2 p3' --disable-esp --enable-libstdcxx-time
--enable-shared --enable-threads=posix --enable-__cxa_atexit
--enable-clocale=gnu --enable-multilib --with-multilib-list=m32,m64
--disable-altivec --disable-fixed-point --enable-targets=all --enable-libgomp
--disable-libmudflap --disable-libssp --disable-systemtap
--enable-vtable-verify --enable-lto --without-isl --enable-default-pie
--enable-default-ssp
Thread model: posix
gcc version 9.2.0 (Gentoo 9.2.0-r2 p3) 

g++ -v
Using built-in specs.
COLLECT_GCC=g++
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/8/lto-wrapper
OFFLOAD_TARGET_NAMES=nvptx-none
OFFLOAD_TARGET_DEFAULT=1
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu
8.4.0-1ubuntu1~18.04' --with-bugurl=file:///usr/share/doc/gcc-8/README.Bugs
--enable-languages=c,ada,c++,go,brig,d,fortran,objc,obj-c++ --prefix=/usr
--with-gcc-major-version-only --program-suffix=-8
--program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id
--libexecdir=/usr/lib --without-included-gettext --enable-threads=posix
--libdir=/usr/lib --enable-nls --enable-clocale=gnu --enable-libstdcxx-debug
--enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new
--enable-gnu-unique-object --disable-vtable-verify --enable-libmpx
--enable-plugin --enable-default-pie --with-system-zlib
--with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch
--disable-werror --with-arch-32=i686 --with-abi=m64
--with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic
--enable-offload-targets=nvptx-none --without-cuda-driver
--enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu
--target=x86_64-linux-gnu
Thread model: posix
gcc version 8.4.0 (Ubuntu 8.4.0-1ubuntu1~18.04) 

Full source code:
#include <immintrin.h>
#include <cstddef>

__m512i Slow(const __m512i *a, const __m512i b0, const __m512i b1, std::size_t
count) {
  __m512i c0 = _mm512_setzero_epi32();  // accumulator for the b0 products
  __m512i c1 = _mm512_setzero_epi32();  // accumulator for the b1 products
  for (std::size_t i = 0; i < count; ++i) {
    c0 = _mm512_dpbusds_epi32(c0, a[i], b0);  // c0 is both input and result
    c1 = _mm512_dpbusds_epi32(c1, a[i], b1);  // c1 is both input and result
  }
  // Both accumulators feed the return value so that neither is optimized away.
  return _mm512_sub_epi32(c0, c1);
}

__m512i Fast(const __m512i *a, const __m512i b0, const __m512i b1, std::size_t
count) {
  __m512i c0 = _mm512_setzero_epi32();  // accumulator updated in place by the asm
  __m512i c1 = _mm512_setzero_epi32();  // accumulator updated in place by the asm
  for (std::size_t i = 0; i < count; ++i) {
    asm ("vpdpbusds %1, %2, %0" : "+x"(c0) : "mx"(a[i]), "x"(b0));  // "+x": read-write
    asm ("vpdpbusds %1, %2, %0" : "+x"(c1) : "mx"(a[i]), "x"(b1));  // "mx": mem or reg
  }
  // Both accumulators feed the return value so that neither is optimized away.
  return _mm512_sub_epi32(c0, c1);
}

Command line: g++ -O3 -mavx512vnni -S example.cc
(It also happens with g++ -O3 -march=native -S example.cc on a Cascade Lake CPU
with g++ 8.4.0).  
Compiler output (warnings/errors): none


More information about the Gcc-bugs mailing list