[PATCH 1/2] middle-end Teach CSE to be able to do vector extracts.

Tamar Christina Tamar.Christina@arm.com
Mon Nov 1 08:43:33 GMT 2021


Mailing list got lost somewhere, Archiving OK.

> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Friday, October 29, 2021 4:52 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: jeffreyalaw@gmail.com; rguenther@suse.de; nd <nd@arm.com>
> Subject: Re: [PATCH 1/2] middle-end Teach CSE to be able to do vector
> extracts.
> 
> Sorry for the slow review.
> 
> Tamar Christina <Tamar.Christina@arm.com> writes:
> > [this time with patch]
> >
> > Hi all,
> >
> > This is a new version which has the rewrite Richard S requested And
> > also handles when lowpart_subreg fails.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> >         * cse.c (add_to_set): New.
> >         (find_sets_in_insn): Register constants in sets.
> >         (canonicalize_insn): Use auto_vec instead.
> >         (cse_insn): Try materializing using vec_dup.
> >         * rtl.h (simplify_context::simplify_gen_vec_select,
> >         simplify_gen_vec_select): New.
> >         * simplify-rtx.c (simplify_context::simplify_gen_vec_select): New.
> >
> > --- inline copy of patch ---
> >
> > diff --git a/gcc/cse.c b/gcc/cse.c
> > index
> >
> 4c3988ee430e99cff74c32cdf9b6382505edd415..2c0442484117317e553c92f48fa
> c
> > 24a0b55063bd 100644
> > --- a/gcc/cse.c
> > +++ b/gcc/cse.c
> > @@ -44,6 +44,7 @@ along with GCC; see the file COPYING3.  If not see
> > #include "regs.h"
> >  #include "function-abi.h"
> >  #include "rtlanal.h"
> > +#include "expr.h"
> >
> >  /* The basic idea of common subexpression elimination is to go
> >     through the code, keeping a record of expressions that would @@
> > -4240,13 +4241,23 @@ try_back_substitute_reg (rtx set, rtx_insn *insn)
> >      }
> >  }
> >
> >
> > +
> 
> Seems like excessive whitespace.
> 
> > +/* Add an entry containing RTL X into SETS.  */ static inline void
> > +add_to_set (vec<struct set> *sets, rtx x) {
> > +  struct set entry = {};
> > +  entry.rtl = x;
> > +  sets->safe_push (entry);
> > +}
> > +
> >  /* Record all the SETs in this instruction into SETS_PTR,
> >     and return the number of recorded sets.  */  static int
> > -find_sets_in_insn (rtx_insn *insn, struct set **psets)
> > +find_sets_in_insn (rtx_insn *insn, vec<struct set> *psets)
> >  {
> > -  struct set *sets = *psets;
> > -  int n_sets = 0;
> > +  vec<struct set> sets = *psets;
> 
> Is this needed?  It looks like you convert all uses to pset (which is good).
> 
> > +
> >    rtx x = PATTERN (insn);
> >
> >    if (GET_CODE (x) == SET)
> > @@ -4266,8 +4277,25 @@ find_sets_in_insn (rtx_insn *insn, struct set
> **psets)
> >          someplace else, so it isn't worth cse'ing.  */
> >        else if (GET_CODE (SET_SRC (x)) == CALL)
> >         ;
> > +      else if (GET_CODE (SET_SRC (x)) == CONST_VECTOR
> > +              && GET_MODE_CLASS (GET_MODE (SET_SRC (x))) !=
> MODE_VECTOR_BOOL)
> > +       {
> > +         /* First register the vector itself.  */
> > +         add_to_set (psets, x);
> > +         rtx src = SET_SRC (x);
> > +         /* Go over the constants of the CONST_VECTOR in forward order, to
> > +            put them in the same order in the SETS array.  */
> > +         for (unsigned i = 0; i < const_vector_encoded_nelts (src) ; i++)
> > +           {
> > +             /* These are templates and don't actually get emitted but are
> > +                used to tell CSE how to get to a particular constant.  */
> > +             rtx y = simplify_gen_vec_select (SET_DEST (x), i);
> > +             gcc_assert (y);
> > +             add_to_set (psets, gen_rtx_SET (y, CONST_VECTOR_ELT
> > + (src, i)));
> 
> For the record: it looks like everything that uses set::rtl only really cares
> about the SET_DEST & SET_SRC individually, so in principle we could save
> creating some garbage SETs by splitting it into dest and src fields.  I don't think
> that's important enough to be a requirement though.
> 
> > +           }
> > +       }
> >        else
> > -       sets[n_sets++].rtl = x;
> > +       add_to_set (psets, x);
> >      }
> >    else if (GET_CODE (x) == PARALLEL)
> >      {
> > @@ -4288,12 +4316,12 @@ find_sets_in_insn (rtx_insn *insn, struct set
> **psets)
> >               else if (GET_CODE (SET_SRC (y)) == CALL)
> >                 ;
> >               else
> > -               sets[n_sets++].rtl = y;
> > +               add_to_set (psets, y);
> >             }
> >         }
> >      }
> >
> > -  return n_sets;
> > +  return sets.length ();
> >  }
> >
> >
> >  /* Subroutine of canonicalize_insn.  X is an ASM_OPERANDS in INSN.
> > */ @@ -4341,9 +4369,10 @@ canon_asm_operands (rtx x, rtx_insn *insn)
> >     see canon_reg.  */
> >
> >  static void
> > -canonicalize_insn (rtx_insn *insn, struct set **psets, int n_sets)
> > +canonicalize_insn (rtx_insn *insn, vec<struct set> *psets)
> >  {
> > -  struct set *sets = *psets;
> > +  vec<struct set> sets = *psets;
> > +  int n_sets = sets.length ();
> >    rtx tem;
> >    rtx x = PATTERN (insn);
> >    int i;
> > @@ -4502,13 +4531,6 @@ cse_insn (rtx_insn *insn)
> >    int src_eqv_in_memory = 0;
> >    unsigned src_eqv_hash = 0;
> >
> > -  struct set *sets = (struct set *) 0;
> > -
> > -  if (GET_CODE (x) == SET)
> > -    sets = XALLOCA (struct set);
> > -  else if (GET_CODE (x) == PARALLEL)
> > -    sets = XALLOCAVEC (struct set, XVECLEN (x, 0));
> > -
> >    this_insn = insn;
> >
> >    /* Find all regs explicitly clobbered in this insn, @@ -4517,10
> > +4539,11 @@ cse_insn (rtx_insn *insn)
> >    invalidate_from_sets_and_clobbers (insn);
> >
> >    /* Record all the SETs in this instruction.  */
> > -  n_sets = find_sets_in_insn (insn, &sets);
> > +  auto_vec<struct set, 8> sets;
> > +  n_sets = find_sets_in_insn (insn, (vec<struct set>*)&sets);
> >
> >    /* Substitute the canonical register where possible.  */
> > -  canonicalize_insn (insn, &sets, n_sets);
> > +  canonicalize_insn (insn, (vec<struct set>*)&sets);
> >
> >    /* If this insn has a REG_EQUAL note, store the equivalent value in
> SRC_EQV,
> >       if different, or if the DEST is a STRICT_LOW_PART/ZERO_EXTRACT.
> > The @@ -4986,6 +5009,27 @@ cse_insn (rtx_insn *insn)
> >           src_related_is_const_anchor = src_related != NULL_RTX;
> >         }
> >
> > +      /* Try to re-materialize a vec_dup with an existing constant.   */
> > +      rtx src_elt;
> > +      if ((!src_eqv_here || CONSTANT_P (src_eqv_here))
> > +         && const_vec_duplicate_p (src, &src_elt))
> > +       {
> > +          machine_mode const_mode = GET_MODE_INNER (GET_MODE
> (src));
> > +          struct table_elt *related_elt
> > +               = lookup (src_elt, HASH (src_elt, const_mode), const_mode);
> > +          if (related_elt)
> > +           {
> > +             for (related_elt = related_elt->first_same_value;
> > +                  related_elt; related_elt = related_elt->next_same_value)
> > +               if (REG_P (related_elt->exp))
> > +                 {
> > +                   src_eqv_here
> > +                       = gen_rtx_VEC_DUPLICATE (GET_MODE (src),
> > +                                                related_elt->exp);
> 
> I wondered whether we should compare the cost of the new vec_duplicate
> against any existing constant src_eqv_here, but I guess that isn't necessary
> because of:
> 
>       if (src_const == 0
> 	  && (CONSTANT_P (src_folded)
> 	      /* Consider (minus (label_ref L1) (label_ref L2)) as
> 		 "constant" here so we will record it. This allows us
> 		 to fold switch statements when an ADDR_DIFF_VEC is used.
> */
> 	      || (GET_CODE (src_folded) == MINUS
> 		  && GET_CODE (XEXP (src_folded, 0)) == LABEL_REF
> 		  && GET_CODE (XEXP (src_folded, 1)) == LABEL_REF)))
> 	src_const = src_folded, src_const_elt = elt;
>       else if (src_const == 0 && src_eqv_here && CONSTANT_P (src_eqv_here))
> 	src_const = src_eqv_here, src_const_elt = src_eqv_elt;
> 
> Specifically, if src_folded is nonnull and src_eqv_here is a CONST_VECTOR. it
> ought to be the same constant as src_folded, since (unlike symbolic
> constants) each constant should have a unique representation.  Perhaps
> that's worth a comment:
> 
>    /* We don't need to compare costs with an existing (constant)
>       src_eqv_here, since any such src_eqv_here should already be
>       available in src_const.  */
> 
> Maybe obvious, but still.
> 
> OK with those changes (except the set::rtl one of course).
> 
> Thanks,
> Richard
> 
> > +                   break;
> > +                 }
> > +           }
> > +       }
> >
> >        if (src == src_folded)
> >         src_folded = 0;
> > diff --git a/gcc/rtl.h b/gcc/rtl.h
> > index
> >
> 5473cc9f2ddf1863191a3e2b5914ae89598e53b4..6a6de1cf11d0bf5dd38f5ee37
> 901
> > 310d653ce722 100644
> > --- a/gcc/rtl.h
> > +++ b/gcc/rtl.h
> > @@ -3425,6 +3425,7 @@ public:
> >                             rtx, rtx, rtx);
> >    rtx simplify_gen_relational (rtx_code, machine_mode, machine_mode,
> rtx, rtx);
> >    rtx simplify_gen_subreg (machine_mode, rtx, machine_mode,
> > poly_uint64);
> > +  rtx simplify_gen_vec_select (rtx, unsigned int);
> >
> >    /* Tracks the level of MEM nesting for the value being simplified:
> >       0 means the value is not in a MEM, >0 means it is.  This is
> > needed @@ -3526,6 +3527,12 @@ simplify_gen_subreg (machine_mode
> outermode, rtx op, machine_mode innermode,
> >                                                   innermode, byte);  }
> >
> > +inline rtx
> > +simplify_gen_vec_select (rtx op, unsigned int index) {
> > +  return simplify_context ().simplify_gen_vec_select (op, index); }
> > +
> >  inline rtx
> >  lowpart_subreg (machine_mode outermode, rtx op, machine_mode
> > innermode)  { diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
> > index
> >
> bbbd6b74942491e9861ce2928f01b9aa7eeab031..19d899e6321e169d9ef92949
> 9e05
> > 7be0b73c2502 100644
> > --- a/gcc/simplify-rtx.c
> > +++ b/gcc/simplify-rtx.c
> > @@ -7589,6 +7589,28 @@ simplify_context::lowpart_subreg
> (machine_mode outer_mode, rtx expr,
> >                               subreg_lowpart_offset (outer_mode,
> > inner_mode));  }
> >
> > +/* Generate RTX to select element at INDEX out of vector OP.  */
> > +
> > +rtx simplify_context::simplify_gen_vec_select (rtx op, unsigned int
> > +index) {
> > +
> > +  if (!VECTOR_MODE_P (GET_MODE (op)))
> > +    return NULL_RTX;
> > +
> > +  machine_mode imode = GET_MODE_INNER (GET_MODE (op));
> > +
> > +  if (index == 0)
> > +    {
> > +      rtx res = lowpart_subreg (imode, op, GET_MODE (op));
> > +      if (res)
> > +       return res;
> > +    }
> > +
> > +  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT
> > +(index)));
> > +  return gen_rtx_VEC_SELECT (imode, op, tmp); }
> > +
> > +
> >  /* Simplify X, an rtx expression.
> >
> >     Return the simplified expression or NULL if no simplifications


More information about the Gcc-patches mailing list