[PATCH 1/8 v9] middle-end slp: Support optimizing load distribution

Richard Biener rguenther@suse.de
Mon Jan 11 13:54:55 GMT 2021


On Mon, 11 Jan 2021, Tamar Christina wrote:

> Hi Richi,
> 
> Attached is the updated patch.
> 
> Note that testcases for all of these will be committed with the patch, but I'm
> finishing up the 32-bit Arm changes to mirror the changes the AArch64 maintainer
> wanted, and then have to do a bootstrap which will take the majority of the day,
> so I wanted to get these patches out first.
> 
> I also built SPEC with the matcher on and off and noticed no meaningful change in
> compile time, but replacements in several benchmarks.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* tree-vect-slp.c (optimize_load_redistribution_1): New.
> 	(optimize_load_redistribution, vect_is_slp_load_node): New.
> 	(vect_match_slp_patterns): Use it.
> 
> -- inline copy of patch --
> 
> diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
> index 2a58e54fe51471df5f55ce4a524d0022744054b0..89e226ca3a25a6c77b86d46ba234ce54bd3cb83b 100644
> --- a/gcc/tree-vect-slp.c
> +++ b/gcc/tree-vect-slp.c
> @@ -2228,6 +2228,114 @@ calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
>    return exact_div (common_multiple (nunits, group_size), group_size);
>  }
>  
> +/* Helper that checks to see if a node is a load node.  This is done based on
> +   two criteria:
> +   1) The node is internal
> +   2) The node has no children.  */
> +
> +static inline bool
> +vect_is_slp_load_node (slp_tree root)
> +{
> +  return (SLP_TREE_DEF_TYPE (root) == vect_internal_def
> +	  && !SLP_TREE_CHILDREN (root).exists ());

this would return true for induction defs as well (SLP_TREE_DEF_TYPE
only distinguishes between vect_internal_def and constant/external
defs...).  It would also not match masked loads.  A closer match
would be

  SLP_TREE_DEF_TYPE (root) == vect_internal_def
  && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
  && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)))

but I'm not sure whether you handle masked loads OK (so you could
do the !SLP_TREE_CHILDREN (root).exists () check in the caller if not).
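
For reference, a rough sketch of the helper with those checks folded
in (just a sketch; where the no-children test ends up depends on the
masked-load answer, and the NULL guard on the representative is my
addition):

  static inline bool
  vect_is_slp_load_node (slp_tree root)
  {
    stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (root);
    /* Internal def whose representative stmt is a grouped read
       from a data reference.  */
    return (SLP_TREE_DEF_TYPE (root) == vect_internal_def
            && rep
            && STMT_VINFO_GROUPED_ACCESS (rep)
            && DR_IS_READ (STMT_VINFO_DATA_REF (rep)));
  }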

> +}
> +
> +
> +/* Helper function of optimize_load_redistribution that performs the operation
> +   recursively.  */
> +
> +static slp_tree
> +optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
> +				vec_info *vinfo, unsigned int group_size,
> +				hash_set<slp_tree> *visited, slp_tree root)
> +{
> +  if (visited->add (root))
> +    return NULL;
> +
> +  slp_tree node;
> +  unsigned i;
> +
> +  /* For now, we don't know anything about externals so do not do anything.  */
> +  if (SLP_TREE_DEF_TYPE (root) != vect_internal_def)
> +    return NULL;
> +  else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
> +    {
> +      /* First convert this node into a load node and add it to the leaves
> +         list and flatten the permute from a lane to a load one.  If it's
> +         unneeded it will be elided later.  */
> +      vec<stmt_vec_info> stmts;
> +      stmts.create (SLP_TREE_LANES (root));
> +      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
> +      for (unsigned j = 0; j < lane_perm.length (); j++)
> +        {
> +          std::pair<unsigned, unsigned> perm = lane_perm[j];
> +          node = SLP_TREE_CHILDREN (root)[perm.first];
> +
> +	  if (!vect_is_slp_load_node (node))

stmts leaks here - I think you also want to still recurse into the
SLP children; there can be two_operator nodes consuming the complex
ops.  So maybe break out of the loop and guard the rest with
j == lane_perm.length ().
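
Roughly, as a sketch (with j hoisted out of the loop so it can be
tested afterwards):

  unsigned j;
  for (j = 0; j < lane_perm.length (); j++)
    {
      std::pair<unsigned, unsigned> perm = lane_perm[j];
      node = SLP_TREE_CHILDREN (root)[perm.first];
      /* break instead of returning so we can clean up and still
         recurse into the children below.  */
      if (!vect_is_slp_load_node (node))
        break;
      stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
    }

  if (j == lane_perm.length ())
    {
      /* All lanes come from loads: do the vect_build_slp_tree
         dance as below and return the new node.  */
    }
  else
    /* Release the scratch vector and fall through to the
       recursion over SLP_TREE_CHILDREN.  */
    stmts.release ();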

> +	    return NULL;
> +
> +	  stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
> +        }
> +
> +      if (dump_enabled_p ())
> +	dump_printf_loc (MSG_NOTE, vect_location,
> +			 "converting stmts on permute node %p\n", root);
> +
> +      bool *matches = XALLOCAVEC (bool, group_size);
> +      poly_uint64 max_nunits = 1;
> +      unsigned tree_size = 0, limit = 1;
> +      node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
> +				  matches, &limit, &tree_size, bst_map);
> +      if (!node)
> +	stmts.release ();
> +
> +      return node;
> +    }
> +
> +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
> +    {
> +      slp_tree value
> +	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, visited,
> +					  node);
> +      if (value)
> +	{
> +          SLP_TREE_CHILDREN (root)[i] = value;
> +          vect_free_slp_tree (node);
> +	}
> +    }
> +
> +  return NULL;
> +}
> +
> +/* Temporary workaround for loads not being CSEd during SLP build.  This
> +   function will traverse the SLP tree rooted in ROOT and find VEC_PERM nodes
> +   that blend vectors from multiple nodes that all read from the same DR such
> +   that the final operation is equal to a permuted load.  Such nodes are then
> +   directly converted into load nodes themselves.  The nodes are CSEd using
> +   BST_MAP.  */
> +
> +static void
> +optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
> +			      vec_info *vinfo, unsigned int group_size,
> +			      slp_tree root)
> +{
> +  slp_tree node;
> +  unsigned i;
> +  hash_set<slp_tree> visited;
> +
> +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
> +    {
> +      slp_tree value
> +	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, &visited,
> +					  node);
> +      if (value)
> +	{
> +          SLP_TREE_CHILDREN (root)[i] = value;
> +          vect_free_slp_tree (node);
> +	}
> +    }
> +}
> +
>  /* Helper function of vect_match_slp_patterns.
>  
>     Attempts to match patterns against the slp tree rooted in REF_NODE using
> @@ -2276,7 +2384,7 @@ static bool
>  vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
>  			 hash_set<slp_tree> *visited,
>  			 slp_tree_to_load_perm_map_t *perm_cache,
> -			 scalar_stmts_to_slp_tree_map_t * /* bst_map */)
> +			 scalar_stmts_to_slp_tree_map_t *bst_map)
>  {
>    DUMP_VECT_SCOPE ("vect_match_slp_patterns");
>    slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
> @@ -2291,6 +2399,9 @@ vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
>  
>    if (found_p)
>      {
> +      optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (*ref_node),
> +				    *ref_node);
> +

Can you move this up to the caller and share one visited set for all
SLP instances?  Because the way it is structured now you will run
into shared subtrees more than once.
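
I.e. something like this in the caller (a sketch; vinfo->slp_instances
stands in for however the instances are iterated there, with
optimize_load_redistribution taking the visited set as a parameter):

  hash_set<slp_tree> visited;
  slp_instance instance;
  unsigned i;
  FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      optimize_load_redistribution (bst_map, vinfo,
                                    SLP_TREE_LANES (root),
                                    &visited, root);
    }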

Otherwise looks ok.

Richard.

>        if (dump_enabled_p ())
>  	{
>  	  dump_printf_loc (MSG_NOTE, vect_location,
> 
> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Thursday, January 7, 2021 1:21 PM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; ook@ucw.cz
> > Subject: Re: [PATCH 1/8 v9] middle-end slp: Support optimizing load distribution
> > 
> > > From tamar.christina@arm.com Mon Dec 28 14:36:32 2020
> > > Date: Mon, 28 Dec 2020 13:35:56 +0000
> > > From: Tamar Christina <tamar.christina@arm.com>
> > > To: gcc-patches@gcc.gnu.org
> > > Cc: nd@arm.com, rguenther@suse.de, ook@ucw.cz
> > > Subject: [PATCH 1/8 v9] middle-end slp: Support optimizing load distribution
> > >
> > > Hi All,
> > >
> > > This introduces a post processing step for the pattern matcher to
> > > flatten permutes introduced by the complex multiplications patterns.
> > >
> > > This performs a blend early such that SLP is not cancelled by the
> > > LOAD_LANES permute.  This is a temporary workaround to the fact that
> > > loads are not CSEd during building and is required to produce efficient code.
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > > and no issues.
> > >
> > > Ok for master?
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > > 	* tree-vect-slp.c (optimize_load_redistribution_1): New.
> > > 	(optimize_load_redistribution): New.
> > > 	(vect_match_slp_patterns): Use it.
> > >
> > > --- inline copy of patch --
> > > diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
> > > index 2a58e54fe51471df5f55ce4a524d0022744054b0..8360a59098f517498f3155f325cf8406466ac25c 100644
> > > --- a/gcc/tree-vect-slp.c
> > > +++ b/gcc/tree-vect-slp.c
> > > @@ -2228,6 +2228,115 @@ calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
> > >    return exact_div (common_multiple (nunits, group_size), group_size);
> > >  }
> > >
> > > +/* Helper function of optimize_load_redistribution that performs the operation
> > > +   recursively.  */
> > > +
> > > +static slp_tree
> > > +optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
> > > +				hash_set<slp_tree> *visited, slp_tree root)
> > > +{
> > > +  if (visited->add (root))
> > > +    return NULL;
> > > +
> > > +  slp_tree node;
> > > +  unsigned i;
> > > +
> > > +  /* For now, we don't know anything about externals so do not do anything.  */
> > > +  if (SLP_TREE_DEF_TYPE (root) == vect_external_def
> > > +      || SLP_TREE_DEF_TYPE (root) == vect_constant_def)
> > 
> > use a single != vect_internal_def test please
> > 
> > > +    return NULL;
> > > +  else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR
> > > +      && SLP_TREE_LANE_PERMUTATION (root).exists ()
> > > +      && !SLP_TREE_SCALAR_STMTS (root).exists ())
> > 
> > I think both last tests are unnecessary
> > 
> > > +    {
> > > +      /* First convert this node into a load node and add it to the leaves
> > > +         list and flatten the permute from a lane to a load one.  If it's
> > > +         unneeded it will be elided later.  */
> > > +      auto_vec<stmt_vec_info> stmts;
> > > +      stmts.create (SLP_TREE_LANES (root));
> > > +      load_permutation_t load_perm;
> > > +      load_perm.create (SLP_TREE_LANES (root));
> > > +      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
> > 
> > load_perm leaks when any of the early-outs below is taken
> > 
> > > +      for (unsigned j = 0; j < lane_perm.length (); j++)
> > > +        {
> > > +          std::pair<unsigned, unsigned> perm = lane_perm[j];
> > > +	  /* This isn't strictly needed, but this function is a temporary
> > > +	     one for specifically pattern matching, so don't want it to
> > > +	     optimize things the remainder of the pipeline will.  */
> > > +	  if (perm.first != j)
> > > +	    goto next;
> > 
> > but please elide it nevertheless
> > 
> > > +          node = SLP_TREE_CHILDREN (root)[perm.first];
> > > +
> > > +	  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
> > > +	    return NULL;
> > 
> > so you want to check whether this is a load; I think more to the point would
> > be a vect_internal_def + zero SLP children check.  And a comment on what
> > we test (we do lack classification of SLP nodes, so a helper like
> > vect_is_slp_load_node or so would be OK as well)
> > 
> > > +
> > > +	  stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
> > > +          load_perm.safe_push (SLP_TREE_LOAD_PERMUTATION (node)[perm.second]);
> > 
> > What you're doing here lacks a check that we are actually loading from the same
> > DR group.  I think it might be easier to just collect scalar stmts and throw
> > them at vect_build_slp_tree?  That should perform the necessary
> > verification, build the appropriate lane permute and perform the CSE.  Which
> > leads to the question why the VEC_PERM node doesn't have scalar stmts set
> > while we are actually able to compute them here ... that is, the CSE
> > opportunity could have been noticed during pattern matching itself?
> > 
> 
> I thought about this myself, but given your previous comment about not touching the
> scalar statements during matching I thought it was a cleaner abstraction to separate
> the two.
> 
> Also this optimization function is temporary anyway, so I figured I'd leave the matchers
> "as they should be" and optimize it afterwards.
> 
> Regards,
> Tamar
> 
> > > +        }
> > > +
> > > +      if (dump_enabled_p ())
> > > +	dump_printf_loc (MSG_NOTE, vect_location,
> > > +			 "converting stmts on permute node %p\n", root);
> > > +
> > > +      slp_tree *value = bst_map->get (stmts);
> > > +      if (value)
> > > +	node = *value;
> > > +      else
> > > +	{
> > > +	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
> > > +	    SLP_TREE_REF_COUNT (node)++;
> > > +
> > > +	  vec<stmt_vec_info> stmts_cpy = stmts.copy ();
> > > +	  node = vect_create_new_slp_node (stmts_cpy.copy (), 0);
> > > +	  SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (root);
> > > +	  SLP_TREE_LOAD_PERMUTATION (node) = load_perm;
> > > +	  bst_map->put (stmts_cpy, node);
> > > +	}
> > > +      SLP_TREE_REF_COUNT (node)++;
> > 
> > Adjusting the refcount here but doing the replacement in the caller is a bit
> > awkward to follow - how about passing a reference so you can adjust the
> > edge here?
> > 
> > > +
> > > +      return node;
> > > +    }
> > > +
> > > +next:
> > > +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
> > > +    {
> > > +      slp_tree value = optimize_load_redistribution_1 (bst_map, visited, node);
> > > +      if (value)
> > > +	{
> > > +          SLP_TREE_CHILDREN (root)[i] = value;
> > > +          vect_free_slp_tree (node);
> > > +	}
> > > +    }
> > > +
> > > +  return NULL;
> > > +}
> > > +
> > > +/* Temporary workaround for loads not being CSEd during SLP build.  This
> > > +   function will traverse the SLP tree rooted in ROOT for INSTANCE and find
> > > +   VEC_PERM nodes that blend vectors from multiple nodes that all read from the
> > > +   same DR such that the final operation is equal to a permuted load.  Such
> > > +   NODES are then directly converted into LOADS themselves.  The nodes are
> > > +   CSEd using BST_MAP.  */
> > > +
> > > +static void
> > > +optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
> > > +			      slp_tree root)
> > > +{
> > > +  slp_tree node;
> > > +  unsigned i;
> > > +  hash_set<slp_tree> visited;
> > > +
> > > +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
> > > +    {
> > > +      slp_tree value = optimize_load_redistribution_1 (bst_map, &visited, node);
> > > +      if (value)
> > > +	{
> > > +          SLP_TREE_CHILDREN (root)[i] = value;
> > > +          vect_free_slp_tree (node);
> > > +	}
> > > +    }
> > > +}
> > > +
> > >  /* Helper function of vect_match_slp_patterns.
> > >
> > >     Attempts to match patterns against the slp tree rooted in REF_NODE using
> > > @@ -2276,7 +2385,7 @@ static bool
> > >  vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
> > >  			 hash_set<slp_tree> *visited,
> > >  			 slp_tree_to_load_perm_map_t *perm_cache,
> > > -			 scalar_stmts_to_slp_tree_map_t * /* bst_map */)
> > > +			 scalar_stmts_to_slp_tree_map_t *bst_map)
> > >  {
> > >    DUMP_VECT_SCOPE ("vect_match_slp_patterns");
> > >    slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
> > > @@ -2291,6 +2400,8 @@ vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
> > >
> > >    if (found_p)
> > >      {
> > > +      optimize_load_redistribution (bst_map, *ref_node);
> > > +
> > >        if (dump_enabled_p ())
> > >  	{
> > >  	  dump_printf_loc (MSG_NOTE, vect_location,
> > >
> > >
> > > --
> > >
> > >
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

