[PATCH] [i386] Remove pass_cpb which is related to enable avx512 embedded broadcast from constant pool.

H.J. Lu hjl.tools@gmail.com
Fri Aug 6 12:26:08 GMT 2021


On Thu, Aug 5, 2021 at 11:21 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Wed, Jul 14, 2021 at 8:38 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Tue, Jul 13, 2021 at 9:35 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Wed, Jul 14, 2021 at 10:34 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > >
> > > > By optimizing vector movement to broadcast in ix86_expand_vector_move
> > > > during pass_expand, pass_reload/LRA can automatically generate an avx512
> > > > embedded broadcast, pass_cpb is not needed.
> > > >
> > > > Considering that in the absence of avx512f, broadcast from memory is
> > > > still slightly faster than loading the entire memory, so always enable
> > > > broadcast.
> > > >
> > > > benchmark:
> > > > https://gitlab.com/x86-benchmarks/microbenchmark/-/tree/vaddps/broadcast
> > > >
> > > > The performance diff
> > > >
> > > > strategy    : cycles
> > > > memory      : 1046611188
> > > > memory      : 1255420817
> > > > memory      : 1044720793
> > > > memory      : 1253414145
> > > > average     : 1097868397
> > > >
> > > > broadcast   : 1044430688
> > > > broadcast   : 1044477630
> > > > broadcast   : 1253554603
> > > > broadcast   : 1044561934
> > > > average     : 1096756213
> > > >
> > > > But however broadcast has larger size.
> > > >
> > > > the size diff
> > > >
> > > > size broadcast.o
> > > >    text    data     bss     dec     hex filename
> > > >     137       0       0     137      89 broadcast.o
> > > >
> > > > size memory.o
> > > >    text    data     bss     dec     hex filename
> > > >     115       0       0     115      73 memory.o
> > > >
> > > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > >         * config/i386/i386-expand.c
> > > >         (ix86_broadcast_from_integer_constant): Rename to ..
> > > >         (ix86_broadcast_from_constant): .. this, and extend it to
> > > >         handle float mode.
> > > >         (ix86_expand_vector_move): Extend to float mode.
> > > >         * config/i386/i386-features.c
> > > >         (replace_constant_pool_with_broadcast): Remove.
> > > >         (remove_partial_avx_dependency_gate): Ditto.
> > > >         (constant_pool_broadcast): Ditto.
> > > >         (class pass_constant_pool_broadcast): Ditto.
> > > >         (make_pass_constant_pool_broadcast): Ditto.
> > > >         (remove_partial_avx_dependency): Adjust gate.
> > > >         * config/i386/i386-passes.def: Remove pass_constant_pool_broadcast.
> > > >         * config/i386/i386-protos.h
> > > >         (make_pass_constant_pool_broadcast): Remove.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > >         * gcc.target/i386/fuse-caller-save-xmm.c: Adjust testcase.
> > > > ---
> > > >  gcc/config/i386/i386-expand.c                 |  29 +++-
> > > >  gcc/config/i386/i386-features.c               | 157 +-----------------
> > > >  gcc/config/i386/i386-passes.def               |   1 -
> > > >  gcc/config/i386/i386-protos.h                 |   1 -
> > > >  .../gcc.target/i386/fuse-caller-save-xmm.c    |   2 +-
> > > >  5 files changed, 26 insertions(+), 164 deletions(-)
> > > >
> > > > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
> > > > index 69ea79e6123..ba870145acd 100644
> > > > --- a/gcc/config/i386/i386-expand.c
> > > > +++ b/gcc/config/i386/i386-expand.c
> > > > @@ -453,8 +453,10 @@ ix86_expand_move (machine_mode mode, rtx operands[])
> > > >    emit_insn (gen_rtx_SET (op0, op1));
> > > >  }
> > > >
> > > > +/* OP is a memref of CONST_VECTOR, return scalar constant mem
> > > > +   if CONST_VECTOR is a vec_duplicate, else return NULL.  */
> > > >  static rtx
> > > > -ix86_broadcast_from_integer_constant (machine_mode mode, rtx op)
> > > > +ix86_broadcast_from_constant (machine_mode mode, rtx op)
> > > >  {
> > > >    int nunits = GET_MODE_NUNITS (mode);
> > > >    if (nunits < 2)
> > > > @@ -462,7 +464,8 @@ ix86_broadcast_from_integer_constant (machine_mode mode, rtx op)
> > > >
> > > >    /* Don't use integer vector broadcast if we can't move from GPR to SSE
> > > >       register directly.  */
> > > > -  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
> > > > +  if (!TARGET_INTER_UNIT_MOVES_TO_VEC
> > > > +      && INTEGRAL_MODE_P (mode))
> > > >      return nullptr;
> > > >
> > > >    /* Convert CONST_VECTOR to a non-standard SSE constant integer
> > > > @@ -470,12 +473,17 @@ ix86_broadcast_from_integer_constant (machine_mode mode, rtx op)
> > > >    if (!(TARGET_AVX2
> > > >         || (TARGET_AVX
> > > >             && (GET_MODE_INNER (mode) == SImode
> > > > -               || GET_MODE_INNER (mode) == DImode)))
> > > > +               || GET_MODE_INNER (mode) == DImode))
> > > > +       || FLOAT_MODE_P (mode))
> > > >        || standard_sse_constant_p (op, mode))
> > > >      return nullptr;
> > > >
> > > > -  /* Don't broadcast from a 64-bit integer constant in 32-bit mode.  */
> > > > -  if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT)
> > > > +  /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
> > > > +     We can still put 64-bit integer constant in memory when
> > > > +     avx512 embed broadcast is available.  */
> > > > +  if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
> > > > +      && (!TARGET_AVX512F
> > > > +         || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
> > > >      return nullptr;
> > > >
> > > >    if (GET_MODE_INNER (mode) == TImode)
> > > > @@ -561,17 +569,20 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
> > > >
> > > >    if (can_create_pseudo_p ()
> > > >        && GET_MODE_SIZE (mode) >= 16
> > > > -      && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
> > > > +      && VECTOR_MODE_P (mode)
> > > >        && (MEM_P (op1)
> > > >           && SYMBOL_REF_P (XEXP (op1, 0))
> > > >           && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
> > > >      {
> > > > -      rtx first = ix86_broadcast_from_integer_constant (mode, op1);
> > > > +      rtx first = ix86_broadcast_from_constant (mode, op1);
> > > >        if (first != nullptr)
> > > >         {
> > > >           /* Broadcast to XMM/YMM/ZMM register from an integer
> > > > -            constant.  */
> > > > -         op1 = ix86_gen_scratch_sse_rtx (mode);
> > > > +            constant or scalar mem.  */
> > > > +         op1 = gen_reg_rtx (mode);
> > > I've changed ix86_gen_scratch_sse_rtx to gen_reg_rtx to let LRA/reload
> > > make the final decision for register usage, would that make sense?
> >
> > Hard registers are used for 2 purposes:
> >
> > 1.  Prevent stack realignment when the original code doesn't use vector
> > registers, which is the same for memcpy and memset.
> > 2.  Prevent combine to convert constant broadcast to load from constant
> > pool.
> >
> W/ gcc version 12.0.0 20210806 (experimental) (GCC) and replaced
> ix86_gen_scratch_sse_rtx with gen_reg_rtx i didn't observe the failure
> related to upper cases.

Did you run full GCC testsuite with

RUNTESTFLAGS="--target_board='unix{-m32,}'"

My partial results show

FAIL: gcc.target/i386/pieces-memset-3.c scan-assembler-not and[^\n\r]*%[re]sp
FAIL: gcc.target/i386/pieces-memset-37.c scan-assembler-not and[^\n\r]*%[re]sp
FAIL: gcc.target/i386/pieces-memset-37.c scan-assembler-not %[re]bp
FAIL: gcc.target/i386/pieces-memset-39.c scan-assembler-not and[^\n\r]*%[re]sp
FAIL: gcc.target/i386/pieces-memset-39.c scan-assembler-not %[re]bp
FAIL: gcc.target/i386/pieces-memset-3.c scan-assembler-not and[^\n\r]*%[re]sp
FAIL: gcc.target/i386/pieces-memset-3.c scan-assembler-not %[re]bp
FAIL: gcc.target/i386/pieces-memset-37.c scan-assembler-not and[^\n\r]*%[re]sp
FAIL: gcc.target/i386/pieces-memset-37.c scan-assembler-not %[re]bp
FAIL: gcc.target/i386/pieces-memset-39.c scan-assembler-not and[^\n\r]*%[re]sp
FAIL: gcc.target/i386/pieces-memset-39.c scan-assembler-not %[re]bp
FAIL: gcc.target/i386/pr90773-14.c scan-assembler-times movd[\\t
]+%xmm[0-9]+, 16\\(%[^,]+\\) 1
FAIL: gcc.target/i386/pr90773-17.c scan-assembler-times vmovd[\\t
]+%xmm[0-9]+, 15\\(%[^,]+\\) 1
FAIL: gcc.target/i386/pr90773-14.c scan-assembler-times movd[\\t
]+%xmm[0-9]+, 16\\(%[^,]+\\) 1
FAIL: gcc.target/i386/pr90773-17.c scan-assembler-times vmovd[\\t
]+%xmm[0-9]+, 15\\(%[^,]+\\) 1
FAIL: gcc.target/i386/pr90773-5.c scan-assembler-times movq[\\t
]+%xmm[0-9]+, 13\\(%[^,]+\\) 1
FAIL: gcc.target/i386/pr100865-10b.c scan-assembler-not vzeroupper
FAIL: gcc.target/i386/pr100865-11b.c scan-assembler-times
vmovdqa64[\\t ]%xmm[0-9]+,  16
FAIL: gcc.target/i386/pr100865-12b.c scan-assembler-times
vmovdqa64[\\t ]%xmm[0-9]+,  16
FAIL: gcc.target/i386/pr100865-4b.c scan-assembler-not vzeroupper
FAIL: gcc.target/i386/pr100865-6b.c scan-assembler-times vmovdqu32[\\t
]%ymm[0-9]+,  8
FAIL: gcc.target/i386/pr100865-6b.c scan-assembler-not vzeroupper
FAIL: gcc.target/i386/pr100865-7b.c scan-assembler-times vmovdqu64[\\t
]%ymm[0-9]+,  16
FAIL: gcc.target/i386/pr100865-7b.c scan-assembler-not vzeroupper
FAIL: gcc.target/i386/pr100865-8a.c scan-assembler-times
(?:vpbroadcastd|vpshufd)[\\t ]+[^\n]*, %xmm[0-9]+ 1
FAIL: gcc.target/i386/pr100865-8b.c scan-assembler-times vmovdqa64[\\t
]%xmm[0-9]+,  16
FAIL: gcc.target/i386/pr100865-8c.c scan-assembler-times vpshufd[\\t
]+[^\n]*, %xmm[0-9]+ 1
FAIL: gcc.target/i386/pr100865-9b.c scan-assembler-times vmovdqa64[\\t
]%xmm[0-9]+,  16
FAIL: gcc.target/i386/pr100865-9c.c scan-assembler-times vpshufd[\\t
]+[^\n]*, %xmm[0-9]+ 1
FAIL: gcc.target/i386/eh_return-1.c (internal compiler error)
FAIL: gcc.target/i386/eh_return-1.c (test for excess errors)
FAIL: gcc.target/i386/eh_return-2.c (internal compiler error)
FAIL: gcc.target/i386/eh_return-2.c (test for excess errors)

with

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index aea224ab235..61e7e94581c 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -23338,6 +23338,7 @@ ix86_optab_supported_p (int op, machine_mode
mode1, machine_mode,
 rtx
 ix86_gen_scratch_sse_rtx (machine_mode mode)
 {
+  return gen_reg_rtx (mode);
   if (TARGET_SSE && !lra_in_progress)
     {
       unsigned int regno;

> only failure like
> gcc.target/i386/pr100865-10b.c scan-assembler-not vzeroupper
> gcc.target/i386/pr100865-4b.c scan-assembler-not vzeroupper
> gcc.target/i386/pr100865-6b.c scan-assembler-not vzeroupper
> I guess it's related to xmm31 usage.
>
> Have you check in all patches rely on this part?
>
>
-- 
H.J.


More information about the Gcc-patches mailing list