[Bug target/93930] New: Unnecessary broadcast instructions for AVX512

fredrik987 at gmail dot com gcc-bugzilla@gcc.gnu.org
Tue Feb 25 17:59:00 GMT 2020


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93930

            Bug ID: 93930
           Summary: Unnecessary broadcast instructions for AVX512
           Product: gcc
           Version: 9.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: fredrik987 at gmail dot com
  Target Milestone: ---

Created attachment 47908
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=47908&action=edit
Test case

When compiled with "-Ofast -march=skylake-avx512", the code below produces
unnecessary broadcast instructions for AVX512. This occurs with gcc trunk,
9.2 and 8.3, but not with 7.5.

Most constants are read from memory via vbroadcastss, except for two, which
are read as scalars and then broadcast again on every iteration of the loop.
With gcc 7.5, all constants are read via vbroadcastss.

The problem seems to be more frequent for larger functions.

 ---

Compiler output for gcc 9.2:

        ...
.L3:
        vmovaps zmm0, ZMMWORD PTR [rdi]
        add     rdi, 64
        vmovaps zmm3, zmm0
        vmovaps zmm1, zmm0
        vmulps  zmm2, zmm0, zmm0
        vfmadd132ps     zmm3, zmm11, zmm12
        vfmadd132ps     zmm1, zmm13, zmm14
        vmovaps zmm4, zmm0
        vfmadd132ps     zmm4, zmm7, zmm8
        sub     rsi, -128
        vfmadd132ps     zmm1, zmm3, zmm2
        vmovaps zmm3, zmm0
        vfmadd132ps     zmm3, zmm9, zmm10
        vfmadd132ps     zmm3, zmm4, zmm2
        vbroadcastss    zmm4, xmm15         <--- Broadcast within loop
        vmulps  zmm3, zmm3, zmm1
        vmovaps ZMMWORD PTR [rsi-128], zmm3
        vbroadcastss    zmm3, xmm16         <--- Broadcast within loop
        vfmadd132ps     zmm3, zmm4, zmm0
        vfmadd132ps     zmm0, zmm5, zmm6
        vfmadd132ps     zmm0, zmm3, zmm2
        vmulps  zmm1, zmm1, zmm0
        vmovaps ZMMWORD PTR [rsi-64], zmm1
        cmp     rdi, rax
        jne     .L3
        ...

 ---

#include <immintrin.h>

/*
 * Evaluates, per lane, y * (x * a + b) + (x * c + d) with y = x * x, where
 * a, b, c and d are the constants 11, 12, 13 and 14 broadcast to every lane.
 *
 * Part of the reproducer: f, g and h are identical except for their
 * constants, so the three together give the loop in test() twelve distinct
 * broadcast constants.
 */
static __m512 f(__m512 x)
{
    /* Each set1 call requests one scalar constant replicated across the vector. */
    __m512 a = _mm512_set1_ps(11);
    __m512 b = _mm512_set1_ps(12);
    __m512 c = _mm512_set1_ps(13);
    __m512 d = _mm512_set1_ps(14);

    __m512 y = _mm512_mul_ps(x, x);

    /* Outer fmadd: y * (x * a + b) + (x * c + d). */
    return _mm512_fmadd_ps(y, _mm512_fmadd_ps(x, a, b), _mm512_fmadd_ps(x, c,
d));
}

/*
 * Evaluates, per lane, y * (x * a + b) + (x * c + d) with y = x * x, where
 * a, b, c and d are the constants 21, 22, 23 and 24 broadcast to every lane.
 *
 * Same expression shape as f and h; only the constants differ.
 */
static __m512 g(__m512 x)
{
    /* Each set1 call requests one scalar constant replicated across the vector. */
    __m512 a = _mm512_set1_ps(21);
    __m512 b = _mm512_set1_ps(22);
    __m512 c = _mm512_set1_ps(23);
    __m512 d = _mm512_set1_ps(24);

    __m512 y = _mm512_mul_ps(x, x);

    /* Outer fmadd: y * (x * a + b) + (x * c + d). */
    return _mm512_fmadd_ps(y, _mm512_fmadd_ps(x, a, b), _mm512_fmadd_ps(x, c,
d));
}

/*
 * Evaluates, per lane, y * (x * a + b) + (x * c + d) with y = x * x, where
 * a, b, c and d are the constants 31, 32, 33 and 34 broadcast to every lane.
 *
 * Same expression shape as f and g; only the constants differ. test() uses
 * this result as the common factor of both values it stores.
 */
static __m512 h(__m512 x)
{
    /* Each set1 call requests one scalar constant replicated across the vector. */
    __m512 a = _mm512_set1_ps(31);
    __m512 b = _mm512_set1_ps(32);
    __m512 c = _mm512_set1_ps(33);
    __m512 d = _mm512_set1_ps(34);

    __m512 y = _mm512_mul_ps(x, x);

    /* Outer fmadd: y * (x * a + b) + (x * c + d). */
    return _mm512_fmadd_ps(y, _mm512_fmadd_ps(x, a, b), _mm512_fmadd_ps(x, c,
d));
}

/*
 * Reproducer entry point: for each of the n vectors read from x, stores
 * f(u) * h(u) and then g(u) * h(u) to y.
 *
 * x is read n times and y is written 2 * n times, so y must have room for
 * twice as many vectors as x supplies. Nothing is read or written when
 * n <= 0.
 *
 * NOTE(review): the quoted gcc output uses vmovaps for these accesses, so x
 * and y are presumably expected to be 64-byte aligned -- confirm.
 */
void test(__m512 *x, __m512 *y, int n)
{
    for (int i = 0; i < n; i++) {
        __m512 u = *x++;
        /* h(u) is computed once and shared by both products below. */
        __m512 v = h(u);

        /* Two consecutive stores per input vector. */
        *y++ = _mm512_mul_ps(f(u), v);
        *y++ = _mm512_mul_ps(g(u), v);
    }
}


More information about the Gcc-bugs mailing list