[PATCH, rs6000] Add store fusion support for Power10

Bill Schmidt wschmidt@linux.ibm.com
Wed Aug 4 14:23:13 GMT 2021


Hi Pat,

Good stuff!  Comments below.

On 8/2/21 3:19 PM, Pat Haugen via Gcc-patches wrote:
> Enable store fusion on Power10.
>
> Use the SCHED_REORDER hook to implement Power10 specific ready list reordering.
> As of now, pairing stores for store fusion is the only function being
> performed.
>
> Bootstrap/regtest on powerpc64le(Power10) with no new regressions. Ok for
> master?
>
> -Pat
>
>
> 2021-08-02  Pat Haugen  <pthaugen@linux.ibm.com>
>
> gcc/ChangeLog:
>
> 	* config/rs6000/rs6000-cpus.def (ISA_3_1_MASKS_SERVER): Add new flag.
> 	(POWERPC_MASKS): Likewise.
> 	* config/rs6000/rs6000.c (rs6000_option_override_internal): Enable
> 	store fusion for Power10.
> 	(is_load_insn1): Verify destination is a register.
> 	(is_store_insn1): Verify source is a register.
> 	(is_fusable_store): New.
> 	(power10_sched_reorder): Likewise.
> 	(rs6000_sched_reorder): Do Power10 specific reordering.
> 	(rs6000_sched_reorder2): Likewise.
> 	* config/rs6000/rs6000.opt: Add new option.
>
>
>
> diff --git a/gcc/config/rs6000/rs6000-cpus.def b/gcc/config/rs6000/rs6000-cpus.def
> index 6758296c0fd..f5812da0184 100644
> --- a/gcc/config/rs6000/rs6000-cpus.def
> +++ b/gcc/config/rs6000/rs6000-cpus.def
> @@ -90,7 +90,8 @@
>   				 | OPTION_MASK_P10_FUSION_2LOGICAL	\
>   				 | OPTION_MASK_P10_FUSION_LOGADD 	\
>   				 | OPTION_MASK_P10_FUSION_ADDLOG	\
> -				 | OPTION_MASK_P10_FUSION_2ADD)
> +				 | OPTION_MASK_P10_FUSION_2ADD		\
> +				 | OPTION_MASK_P10_FUSION_2STORE)


This is all fine for now; as we've discussed elsewhere, we probably want 
to eventually consolidate all these fusion flags into one.

>
>   /* Flags that need to be turned off if -mno-power9-vector.  */
>   #define OTHER_P9_VECTOR_MASKS	(OPTION_MASK_FLOAT128_HW		\
> @@ -143,6 +144,7 @@
>   				 | OPTION_MASK_P10_FUSION_LOGADD 	\
>   				 | OPTION_MASK_P10_FUSION_ADDLOG	\
>   				 | OPTION_MASK_P10_FUSION_2ADD    	\
> +				 | OPTION_MASK_P10_FUSION_2STORE	\
>   				 | OPTION_MASK_HTM			\
>   				 | OPTION_MASK_ISEL			\
>   				 | OPTION_MASK_MFCRF			\
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index 279f00cc648..1460a0d7c5c 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -4490,6 +4490,10 @@ rs6000_option_override_internal (bool global_init_p)
>         && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2ADD) == 0)
>       rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2ADD;
>
> +  if (TARGET_POWER10
> +      && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2STORE) == 0)
> +    rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2STORE;
> +
>     /* Turn off vector pair/mma options on non-power10 systems.  */
>     else if (!TARGET_POWER10 && TARGET_MMA)
>       {
> @@ -18357,7 +18361,7 @@ is_load_insn1 (rtx pat, rtx *load_mem)
>     if (!pat || pat == NULL_RTX)
>       return false;
>
> -  if (GET_CODE (pat) == SET)
> +  if (GET_CODE (pat) == SET && REG_P (SET_DEST (pat)))
>       return find_mem_ref (SET_SRC (pat), load_mem);
Looks like this is just an optimization to quickly discard stores, right?
>     if (GET_CODE (pat) == PARALLEL)
> @@ -18394,7 +18398,8 @@ is_store_insn1 (rtx pat, rtx *str_mem)
>     if (!pat || pat == NULL_RTX)
>       return false;
>
> -  if (GET_CODE (pat) == SET)
> +  if (GET_CODE (pat) == SET
> +      && (REG_P (SET_SRC (pat)) || SUBREG_P (SET_SRC (pat))))
>       return find_mem_ref (SET_DEST (pat), str_mem);


Similar question.

>
>     if (GET_CODE (pat) == PARALLEL)
> @@ -18859,6 +18864,96 @@ power9_sched_reorder2 (rtx_insn **ready, int lastpos)
>     return cached_can_issue_more;
>   }
>
> +/* Determine if INSN is a store to memory that can be fused with a similar
> +   adjacent store.  */
> +
> +static bool
> +is_fusable_store (rtx_insn *insn, rtx *str_mem)
> +{
> +  /* Exit early if not doing store fusion.  */
> +  if (!(TARGET_P10_FUSION && TARGET_P10_FUSION_2STORE))
> +    return false;
> +
> +  /* Insn must be a non-prefixed base+disp form store.  */
> +  if (is_store_insn (insn, str_mem)
> +      && get_attr_prefixed (insn) == PREFIXED_NO
> +      && get_attr_update (insn) == UPDATE_NO
> +      && get_attr_indexed (insn) == INDEXED_NO)
> +    {
> +      /* Further restictions by mode and size.  */
> +      machine_mode mode = GET_MODE (*str_mem);
> +      HOST_WIDE_INT size;
> +      if MEM_SIZE_KNOWN_P (*str_mem)
> +	size = MEM_SIZE (*str_mem);
> +      else
> +	return false;
> +
> +      if INTEGRAL_MODE_P (mode)
> +	{
> +	  /* Must be word or dword size.  */
> +	  return (size == 4 || size == 8);
> +	}
> +      else if FLOAT_MODE_P (mode)
> +	{
> +	  /* Must be dword size.  */
> +	  return (size == 8);
> +	}
> +    }
> +
> +  return false;
> +}
> +
> +/* Do Power10 specific reordering of the ready list.  */
> +
> +static int
> +power10_sched_reorder (rtx_insn **ready, int lastpos)
> +{
> +  int pos;
> +  rtx mem1, mem2;
> +
> +  /* Do store fusion during sched2 only.  */
> +  if (!reload_completed)
> +    return cached_can_issue_more;
> +
> +  /* If the prior insn finished off a store fusion pair then simply
> +     reset the counter and return, nothing more to do.  */


Good comments throughout, thanks!

> +  if (load_store_pendulum != 0)
> +    {
> +      load_store_pendulum = 0;
> +      return cached_can_issue_more;
> +    }
> +
> +  /* Try to pair certain store insns to adjacent memory locations
> +     so that the hardware will fuse them to a single operation.  */
> +  if (is_fusable_store (last_scheduled_insn, &mem1))
> +    {
> +      /* A fusable store was just scheduled.  Scan the ready list for another
> +	 store that it can fuse with.  */
> +      pos = lastpos;
> +      while (pos >= 0)
> +	{
> +	  /* GPR stores can be ascending or descending offsets, FPR/VSR stores
VSR?  I don't see how that applies here.
> +	     must be ascending only.  */
> +	  if (is_fusable_store (ready[pos], &mem2)
> +	      && ((INTEGRAL_MODE_P (GET_MODE (mem1))
> +		   && adjacent_mem_locations (mem1, mem2))
> +		  || (FLOAT_MODE_P (GET_MODE (mem1))
> +		   && (adjacent_mem_locations (mem1, mem2) == mem1))))
> +	    {
> +	      /* Found a fusable store.  Move it to the end of the ready list
> +		 so it is scheduled next.  */
> +	      move_to_end_of_ready (ready, pos, lastpos);
> +
> +	      load_store_pendulum = -1;
> +	      break;
> +	    }
> +	  pos--;
> +	}
> +    }
> +
> +  return cached_can_issue_more;
> +}
> +
>   /* We are about to begin issuing insns for this clock cycle. */
>
>   static int
> @@ -18885,6 +18980,10 @@ rs6000_sched_reorder (FILE *dump ATTRIBUTE_UNUSED, int sched_verbose,
>     if (rs6000_tune == PROCESSOR_POWER6)
>       load_store_pendulum = 0;
>
> +  /* Do Power10 dependent reordering.  */
> +  if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn)
> +    power10_sched_reorder (ready, *pn_ready - 1);
> +


I happened to notice that pn_ready is marked as ATTRIBUTE_UNUSED.  This 
predates your patch, but maybe you could clean that up too.

>     return rs6000_issue_rate ();
>   }
>
> @@ -18906,6 +19005,10 @@ rs6000_sched_reorder2 (FILE *dump, int sched_verbose, rtx_insn **ready,
>         && recog_memoized (last_scheduled_insn) >= 0)
>       return power9_sched_reorder2 (ready, *pn_ready - 1);
>
> +  /* Do Power10 dependent reordering.  */
> +  if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn)
> +    return power10_sched_reorder (ready, *pn_ready - 1);
> +
>     return cached_can_issue_more;
>   }
>
> diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
> index 0538db387dc..3753de19557 100644
> --- a/gcc/config/rs6000/rs6000.opt
> +++ b/gcc/config/rs6000/rs6000.opt
> @@ -514,6 +514,10 @@ mpower10-fusion-2add
>   Target Undocumented Mask(P10_FUSION_2ADD) Var(rs6000_isa_flags)
>   Fuse dependent pairs of add or vaddudm instructions for better performance on power10.
>
> +mpower10-fusion-2store
> +Target Undocumented Mask(P10_FUSION_2STORE) Var(rs6000_isa_flags)
> +Fuse certain store operations together for better performance on power10.
> +
>   mcrypto
>   Target Mask(CRYPTO) Var(rs6000_isa_flags)
>   Use ISA 2.07 Category:Vector.AES and Category:Vector.SHA2 instructions.

Can you think of any test cases we can use to demonstrate store fusion?

Otherwise looks good to me.  Thanks!

Bill



More information about the Gcc-patches mailing list