[PATCH] move x86 to use gather/scatter internal functions

Hongtao Liu crazylht@gmail.com
Wed Aug 18 11:38:29 GMT 2021


On Wed, Aug 18, 2021 at 7:37 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Wed, Aug 18, 2021 at 7:30 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Wed, Aug 18, 2021 at 6:28 PM Richard Biener <rguenther@suse.de> wrote:
> > >
> > > On Wed, 18 Aug 2021, Richard Biener wrote:
> > >
> > > >
> > > > So in the end I seem to be able to combine AVX & AVX512 arriving
> > > > at the following which passes basic testing.  I will now see to
> > > > teach the vectorizer the required "promotion" to handle
> > > > mask_gather_loadv4dfv4si and mask_gather_loadv4sfv4di.
> > > >
> > > > Meanwhile, do you see any hole in the below?  If not I'll
> > > > do mask_scatter_store accordingly (that will be simpler since
> > > > AVX doesn't have scatter).
> > >
> > > There seems to be one more complication ... we have
> > >
> > > (define_expand "avx2_gatherdi<mode>"
> > >   [(parallel [(set (match_operand:VEC_GATHER_MODE 0 "register_operand")
> > >                    (unspec:VEC_GATHER_MODE
> > >                      [(match_operand:<VEC_GATHER_SRCDI> 1
> > > "register_operand")
> > >                       (mem:<ssescalarmode>
> > >                         (match_par_dup 6
> > >                           [(match_operand 2 "vsib_address_operand")
> > >                            (match_operand:<VEC_GATHER_IDXDI>
> > >                               3 "register_operand")
> > >
> > > but VEC_GATHER_IDXDI is
> > >
> > > (define_mode_attr VEC_GATHER_IDXDI
> > >                       [(V2DI "V2DI") (V4DI "V4DI") (V8DI "V8DI")
> > >                        (V2DF "V2DI") (V4DF "V4DI") (V8DF "V8DI")
> > >                        (V4SI "V2DI") (V8SI "V4DI") (V16SI "V8DI")
> > >                        (V4SF "V2DI") (V8SF "V4DI") (V16SF "V8DI")])
> > >
> > > I'd have expected (V4SF "V4DI") for example, or (V8SF "V8DI").
> > VEX.128 version: For dword indices, the instruction will gather four
> > single-precision floating-point values. For
> > qword indices, the instruction will gather two values and zero the
> > upper 64 bits of the destination.
> > VEX.256 version: For dword indices, the instruction will gather eight
> > single-precision floating-point values. For
> > qword indices, the instruction will gather four values and zero the
> > upper 128 bits of the destination.
> >
> > So, for expander name, it should be v2sfv2di and v4sfv4di for IDXDI
> > under avx2, and v8sfv8di under avx512.
> >
> > ----cut pattern----
> > (define_insn "*avx2_gatherdi<VEC_GATHER_MODE:mode>_2"
> >   [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x")
> > (unspec:VEC_GATHER_MODE
> >   [(pc)
> >    (match_operator:<ssescalarmode> 6 "vsib_mem_operator"
> >      [(unspec:P
> > [(match_operand:P 2 "vsib_address_operand" "Tv")
> > (match_operand:<VEC_GATHER_IDXDI> 3 "register_operand" "x")
> > (match_operand:SI 5 "const1248_operand" "n")]
> > UNSPEC_VSIBADDR)])
> >    (mem:BLK (scratch))
> >    (match_operand:<VEC_GATHER_SRCDI> 4 "register_operand" "1")]
> >   UNSPEC_GATHER))
> >    (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))]
> >   "TARGET_AVX2"
> > {
> >   if (<VEC_GATHER_MODE:MODE>mode != <VEC_GATHER_SRCDI>mode)
> >     return "%M2v<sseintprefix>gatherq<ssemodesuffix>\t{%4, %6,
> > %x0|%x0, %6, %4}";
> > ----cut end---------------
> > We are using the trick of the operand modifier %x0 to force print xmm.
> (define_mode_attr VEC_GATHER_SRCDI
>       [(V2DI "V2DI") (V4DI "V4DI") (V8DI "V8DI")
>        (V2DF "V2DF") (V4DF "V4DF") (V8DF "V8DF")
>        (V4SI "V4SI") (V8SI "V4SI") (V16SI "V8SI")
>        (V4SF "V4SF") (V8SF "V4SF") (V16SF "V8SF")])
>
> (define_insn "*avx2_gathersi<VEC_GATHER_MODE:mode>"
>   [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x")
> (unspec:VEC_GATHER_MODE
>   [(match_operand:VEC_GATHER_MODE 2 "register_operand" "0")
>    (match_operator:<ssescalarmode> 7 "vsib_mem_operator"
>      [(unspec:P
> [(match_operand:P 3 "vsib_address_operand" "Tv")
> (match_operand:<VEC_GATHER_IDXSI> 4 "register_operand" "x")
> (match_operand:SI 6 "const1248_operand" "n")]
> UNSPEC_VSIBADDR)])
>    (mem:BLK (scratch))
>    (match_operand:VEC_GATHER_MODE 5 "register_operand" "1")]
>   UNSPEC_GATHER))
>    (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))]
>   "TARGET_AVX2"
>   "%M3v<sseintprefix>gatherd<ssemodesuffix>\t{%1, %7, %0|%0, %7, %1}"
>
> Or only print operands[1] which has mode VEC_GATHER_SRCDI as index.

Typo should be operands[2] in the below one
(define_insn "*avx2_gatherdi<VEC_GATHER_MODE:mode>"
  [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x")
(unspec:VEC_GATHER_MODE
  [(match_operand:<VEC_GATHER_SRCDI> 2 "register_operand" "0")
   (match_operator:<ssescalarmode> 7 "vsib_mem_operator"
     [(unspec:P
[(match_operand:P 3 "vsib_address_operand" "Tv")
(match_operand:<VEC_GATHER_IDXDI> 4 "register_operand" "x")
(match_operand:SI 6 "const1248_operand" "n")]
UNSPEC_VSIBADDR)])
   (mem:BLK (scratch))
   (match_operand:<VEC_GATHER_SRCDI> 5 "register_operand" "1")]
  UNSPEC_GATHER))
   (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))]
  "TARGET_AVX2"
  "%M3v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %7, %2|%2, %7, %5}"

> >
> >
> >
> > --------------cut from i386-expand.c-----------
> > case IX86_BUILTIN_GATHER3DIV16SI:
> >   if (target == NULL_RTX)
> >     target = gen_reg_rtx (V8SImode);
> >   emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
> >   break;
> > case IX86_BUILTIN_GATHER3DIV8SF:
> > case IX86_BUILTIN_GATHERDIV8SF:
> >   if (target == NULL_RTX)
> >     target = gen_reg_rtx (V4SFmode);
> >   emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
> > -----------cut from i386-expand.c-----------
> >
> > and do ugly things when expand builtin
> >
> > > From the ISA docs I conclude that vgatherqps is not supported for
> > > a %zmm destination but V8SF and V8DI is a supported combination?
> > >
> > > (define_mode_attr VEC_GATHER_IDXSI
> > >                       [(V2DI "V4SI") (V4DI "V4SI") (V8DI "V8SI")
> > >                        (V2DF "V4SI") (V4DF "V4SI") (V8DF "V8SI")
> > >                        (V4SI "V4SI") (V8SI "V8SI") (V16SI "V16SI")
> > >                        (V4SF "V4SI") (V8SF "V8SI") (V16SF "V16SI")])
> > >
> > > looks like what I expect.  So shouldn't we use a special
> > > VEC_GATHERDI_MODE iterator instead that drops V16{SI,SF} and adjust
> > > VEC_GATHER_IDXDI accordingly?  Even
> > >
> > > (define_expand "<avx512>_gatherdi<mode>"
> > >   [(parallel [(set (match_operand:VI48F 0 "register_operand")
> > >                    (unspec:VI48F
> > >
> > > "supports" V16SI and V16SF gathers, leading to broken patterns?
> > >
> > > I can test if anything goes wrong fixing VEC_GATHER_IDXDI to
> > >
> > > (define_mode_attr VEC_GATHER_IDXDI
> > >                       [(V2DI "V2DI") (V4DI "V4DI") (V8DI "V8DI")
> > >                        (V2DF "V2DI") (V4DF "V4DI") (V8DF "V8DI")
> > >                        (V4SI "V4DI") (V8SI "V8DI") (V16SI "V8DI")
> > >                        (V4SF "V4DI") (V8SF "V8DI") (V16SF "V8DI")])
> > >
> > > Thanks,
> > > Richard.
> > >
> > > > Thanks,
> > > > Richard.
> > > >
> > > > From 706d3ac96b384eaad249cc83ec542ec643e21a4c Mon Sep 17 00:00:00 2001
> > > > From: Richard Biener <rguenther@suse.de>
> > > > Date: Tue, 17 Aug 2021 15:14:38 +0200
> > > > Subject: [PATCH] move x86 to use gather/scatter internal functions
> > > > To: gcc-patches@gcc.gnu.org
> > > >
> > > > This moves the x86 backend to use standard pattern names for
> > > > mask_gather_load and mask_scatter_store rather than using the
> > > > builtin_{gather,scatter} target hooks.
> > > >
> > > > The patch starts with mask_gather_load convering AVX and AVX512
> > > > and omits gather_load which doesn't match an actual instruction.
> > > > The vectorizer will appropriately set up a all-ones mask when
> > > > necessary.
> > > >
> > > > The vectorizer still lacks support for unpacking, thus
> > > > mixed size gathers like mask_gather_loadv4dfv4si.
> > > >
> > > > 2021-08-17  Richard Biener  <rguenther@suse.de>
> > > >
> > > >       * config/i386/sse.md
> > > >       (vec_gather_idxsi, vec_gather_idxdi): New lower-case
> > > >       mode attributes for VEC_GATHER_IDX{SI,DI}.
> > > >       (VEC_GATHER_MODEX): New mode iterator.
> > > >       (mask_gather_load<mode><vec_gather_idxsi>): New expander.
> > > >       (mask_gather_load<mode><vec_gather_idxdi>): Likewise.
> > > > ---
> > > >  gcc/config/i386/sse.md | 53 ++++++++++++++++++++++++++++++++++++++++++
> > > >  1 file changed, 53 insertions(+)
> > > >
> > > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > > index 13889687793..6ae826c268a 100644
> > > > --- a/gcc/config/i386/sse.md
> > > > +++ b/gcc/config/i386/sse.md
> > > > @@ -23640,12 +23640,22 @@
> > > >                      (V2DF "V4SI") (V4DF "V4SI") (V8DF "V8SI")
> > > >                      (V4SI "V4SI") (V8SI "V8SI") (V16SI "V16SI")
> > > >                      (V4SF "V4SI") (V8SF "V8SI") (V16SF "V16SI")])
> > > > +(define_mode_attr vec_gather_idxsi
> > > > +                   [(V2DI "v4si") (V4DI "v4si") (V8DI "v8si")
> > > > +                    (V2DF "v4si") (V4DF "v4si") (V8DF "v8si")
> > > > +                    (V4SI "v4si") (V8SI "v8si") (V16SI "v16si")
> > > > +                    (V4SF "v4si") (V8SF "v8si") (V16SF "v16si")])
> > > >
> > > >  (define_mode_attr VEC_GATHER_IDXDI
> > > >                     [(V2DI "V2DI") (V4DI "V4DI") (V8DI "V8DI")
> > > >                      (V2DF "V2DI") (V4DF "V4DI") (V8DF "V8DI")
> > > >                      (V4SI "V2DI") (V8SI "V4DI") (V16SI "V8DI")
> > > >                      (V4SF "V2DI") (V8SF "V4DI") (V16SF "V8DI")])
> > > > +(define_mode_attr vec_gather_idxdi
> > > > +                   [(V2DI "v2di") (V4DI "v4di") (V8DI "v8di")
> > > > +                    (V2DF "v2di") (V4DF "v4di") (V8DF "v8di")
> > > > +                    (V4SI "v2di") (V8SI "v4di") (V16SI "v8di")
> > > > +                    (V4SF "v2di") (V8SF "v4di") (V16SF "v8di")])
> > > >
> > > >  (define_mode_attr VEC_GATHER_SRCDI
> > > >                     [(V2DI "V2DI") (V4DI "V4DI") (V8DI "V8DI")
> > > > @@ -23653,6 +23663,30 @@
> > > >                      (V4SI "V4SI") (V8SI "V4SI") (V16SI "V8SI")
> > > >                      (V4SF "V4SF") (V8SF "V4SF") (V16SF "V8SF")])
> > > >
> > > > +(define_mode_iterator VEC_GATHER_MODEX
> > > > +  [V2DI V2DF V4DI V4DF V4SI V4SF V8SI V8SF
> > > > +   (V8DI "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
> > > > +   (V16SI "TARGET_AVX512F") (V16SF "TARGET_AVX512F")])
> > > > +
> > > > +(define_expand "mask_gather_load<mode><vec_gather_idxsi>"
> > > > +  [(match_operand:VEC_GATHER_MODEX 0 "register_operand")
> > > > +   (match_operand 1 "vsib_address_operand")
> > > > +   (match_operand:<VEC_GATHER_IDXSI> 2 "register_operand")
> > > > +   (match_operand:SI 3 "const0_operand")
> > > > +   (match_operand:SI 4 "const1248_operand")
> > > > +   (match_operand 5 "register_operand")]
> > > > +  "TARGET_AVX2 && TARGET_USE_GATHER"
> > > > +{
> > > > +  rtx scratch = gen_reg_rtx (<MODE>mode);
> > > > +  emit_move_insn (scratch, CONST0_RTX(<MODE>mode));
> > > > +  if (GET_MODE_SIZE (<MODE>mode) != 32 && !TARGET_AVX512VL)
> > > > +    operands[5] = gen_lowpart_SUBREG (<MODE>mode, operands[5]);
> > > > +  emit_insn (gen_<avx2_avx512>_gathersi<mode> (operands[0], scratch,
> > > > +                                            operands[1], operands[2],
> > > > +                                            operands[5], operands[4]));
> > > > +  DONE;
> > > > +})
> > > > +
> > > >  (define_expand "avx2_gathersi<mode>"
> > > >    [(parallel [(set (match_operand:VEC_GATHER_MODE 0 "register_operand")
> > > >                  (unspec:VEC_GATHER_MODE
> > > > @@ -23714,6 +23748,25 @@
> > > >     (set_attr "prefix" "vex")
> > > >     (set_attr "mode" "<sseinsnmode>")])
> > > >
> > > > +(define_expand "mask_gather_load<mode><vec_gather_idxdi>"
> > > > +  [(match_operand:VEC_GATHER_MODEX 0 "register_operand")
> > > > +   (match_operand 1 "vsib_address_operand")
> > > > +   (match_operand:<VEC_GATHER_IDXDI> 2 "register_operand")
> > > > +   (match_operand:SI 3 "const0_operand")
> > > > +   (match_operand:SI 4 "const1248_operand")
> > > > +   (match_operand 5 "register_operand")]
> > > > +  "TARGET_AVX2 && TARGET_USE_GATHER"
> > > > +{
> > > > +  rtx scratch = gen_reg_rtx (<MODE>mode);
> > > > +  emit_move_insn (scratch, CONST0_RTX(<MODE>mode));
> > > > +  if (GET_MODE_SIZE (<MODE>mode) != 32 && !TARGET_AVX512VL)
> > > > +    operands[5] = gen_lowpart_SUBREG (<VEC_GATHER_SRCDI>mode, operands[5]);
> > > > +  emit_insn (gen_<avx2_avx512>_gatherdi<mode> (operands[0], scratch,
> > > > +                                            operands[1], operands[2],
> > > > +                                            operands[5], operands[4]));
> > > > +  DONE;
> > > > +})
> > > > +
> > > >  (define_expand "avx2_gatherdi<mode>"
> > > >    [(parallel [(set (match_operand:VEC_GATHER_MODE 0 "register_operand")
> > > >                  (unspec:VEC_GATHER_MODE
> > > >
> > >
> > > --
> > > Richard Biener <rguenther@suse.de>
> > > SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
> > > Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)
> >
> >
> >
> > --
> > BR,
> > Hongtao
>
>
>
> --
> BR,
> Hongtao



-- 
BR,
Hongtao


More information about the Gcc-patches mailing list