[PATCH, rs6000] Add store fusion support for Power10

Pat Haugen pthaugen@linux.ibm.com
Wed Aug 4 21:16:45 GMT 2021


On 8/4/21 9:23 AM, Bill Schmidt wrote:
> Hi Pat,
> 
> Good stuff!  Comments below.
> 
> On 8/2/21 3:19 PM, Pat Haugen via Gcc-patches wrote:
>> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
>> index 279f00cc648..1460a0d7c5c 100644
>> --- a/gcc/config/rs6000/rs6000.c
>> +++ b/gcc/config/rs6000/rs6000.c
>> @@ -4490,6 +4490,10 @@ rs6000_option_override_internal (bool global_init_p)
>>         && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2ADD) == 0)
>>       rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2ADD;
>>
>> +  if (TARGET_POWER10
>> +      && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2STORE) == 0)
>> +    rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2STORE;
>> +
>>     /* Turn off vector pair/mma options on non-power10 systems.  */
>>     else if (!TARGET_POWER10 && TARGET_MMA)
>>       {
>> @@ -18357,7 +18361,7 @@ is_load_insn1 (rtx pat, rtx *load_mem)
>>     if (!pat || pat == NULL_RTX)
>>       return false;
>>
>> -  if (GET_CODE (pat) == SET)
>> +  if (GET_CODE (pat) == SET && REG_P (SET_DEST (pat)))
>>       return find_mem_ref (SET_SRC (pat), load_mem);
> Looks like this is just an optimization to quickly discard stores, right?

Additional verification check to make sure destination is REG, yes. This will become a separate patch.

>>     if (GET_CODE (pat) == PARALLEL)
>> @@ -18394,7 +18398,8 @@ is_store_insn1 (rtx pat, rtx *str_mem)
>>     if (!pat || pat == NULL_RTX)
>>       return false;
>>
>> -  if (GET_CODE (pat) == SET)
>> +  if (GET_CODE (pat) == SET
>> +      && (REG_P (SET_SRC (pat)) || SUBREG_P (SET_SRC (pat))))
>>       return find_mem_ref (SET_DEST (pat), str_mem);
> 
> 
> Similar question.

Similar answer. :)
> 
>>
>>     if (GET_CODE (pat) == PARALLEL)
>> @@ -18859,6 +18864,96 @@ power9_sched_reorder2 (rtx_insn **ready, int lastpos)
>>     return cached_can_issue_more;
>>   }
>>
>> +/* Determine if INSN is a store to memory that can be fused with a similar
>> +   adjacent store.  */
>> +
>> +static bool
>> +is_fusable_store (rtx_insn *insn, rtx *str_mem)
>> +{
>> +  /* Exit early if not doing store fusion.  */
>> +  if (!(TARGET_P10_FUSION && TARGET_P10_FUSION_2STORE))
>> +    return false;
>> +
>> +  /* Insn must be a non-prefixed base+disp form store.  */
>> +  if (is_store_insn (insn, str_mem)
>> +      && get_attr_prefixed (insn) == PREFIXED_NO
>> +      && get_attr_update (insn) == UPDATE_NO
>> +      && get_attr_indexed (insn) == INDEXED_NO)
>> +    {
>> +      /* Further restictions by mode and size.  */
>> +      machine_mode mode = GET_MODE (*str_mem);
>> +      HOST_WIDE_INT size;
>> +      if MEM_SIZE_KNOWN_P (*str_mem)
>> +    size = MEM_SIZE (*str_mem);
>> +      else
>> +    return false;
>> +
>> +      if INTEGRAL_MODE_P (mode)
>> +    {
>> +      /* Must be word or dword size.  */
>> +      return (size == 4 || size == 8);
>> +    }
>> +      else if FLOAT_MODE_P (mode)
>> +    {
>> +      /* Must be dword size.  */
>> +      return (size == 8);
>> +    }
>> +    }
>> +
>> +  return false;
>> +}
>> +
>> +/* Do Power10 specific reordering of the ready list.  */
>> +
>> +static int
>> +power10_sched_reorder (rtx_insn **ready, int lastpos)
>> +{
>> +  int pos;
>> +  rtx mem1, mem2;
>> +
>> +  /* Do store fusion during sched2 only.  */
>> +  if (!reload_completed)
>> +    return cached_can_issue_more;
>> +
>> +  /* If the prior insn finished off a store fusion pair then simply
>> +     reset the counter and return, nothing more to do.  */
> 
> 
> Good comments throughout, thanks!
> 
>> +  if (load_store_pendulum != 0)
>> +    {
>> +      load_store_pendulum = 0;
>> +      return cached_can_issue_more;
>> +    }
>> +
>> +  /* Try to pair certain store insns to adjacent memory locations
>> +     so that the hardware will fuse them to a single operation.  */
>> +  if (is_fusable_store (last_scheduled_insn, &mem1))
>> +    {
>> +      /* A fusable store was just scheduled.  Scan the ready list for another
>> +     store that it can fuse with.  */
>> +      pos = lastpos;
>> +      while (pos >= 0)
>> +    {
>> +      /* GPR stores can be ascending or descending offsets, FPR/VSR stores
> VSR?  I don't see how that applies here.

Scalar floating point store from VSX reg (i.e. stxsd).

>> +         must be ascending only.  */
>> +      if (is_fusable_store (ready[pos], &mem2)
>> +          && ((INTEGRAL_MODE_P (GET_MODE (mem1))
>> +           && adjacent_mem_locations (mem1, mem2))
>> +          || (FLOAT_MODE_P (GET_MODE (mem1))
>> +           && (adjacent_mem_locations (mem1, mem2) == mem1))))
>> +        {
>> +          /* Found a fusable store.  Move it to the end of the ready list
>> +         so it is scheduled next.  */
>> +          move_to_end_of_ready (ready, pos, lastpos);
>> +
>> +          load_store_pendulum = -1;
>> +          break;
>> +        }
>> +      pos--;
>> +    }
>> +    }
>> +
>> +  return cached_can_issue_more;
>> +}
>> +
>>   /* We are about to begin issuing insns for this clock cycle. */
>>
>>   static int
>> @@ -18885,6 +18980,10 @@ rs6000_sched_reorder (FILE *dump ATTRIBUTE_UNUSED, int sched_verbose,
>>     if (rs6000_tune == PROCESSOR_POWER6)
>>       load_store_pendulum = 0;
>>
>> +  /* Do Power10 dependent reordering.  */
>> +  if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn)
>> +    power10_sched_reorder (ready, *pn_ready - 1);
>> +
> 
> 
> I happened to notice that pn_ready is marked as ATTRIBUTE_UNUSED.  This predates your patch, but maybe you could clean that up too.

Will do as a separate cleanup.

> 
>>     return rs6000_issue_rate ();
>>   }
>>
>> @@ -18906,6 +19005,10 @@ rs6000_sched_reorder2 (FILE *dump, int sched_verbose, rtx_insn **ready,
>>         && recog_memoized (last_scheduled_insn) >= 0)
>>       return power9_sched_reorder2 (ready, *pn_ready - 1);
>>
>> +  /* Do Power10 dependent reordering.  */
>> +  if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn)
>> +    return power10_sched_reorder (ready, *pn_ready - 1);
>> +
>>     return cached_can_issue_more;
>>   }
>>
>> diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
>> index 0538db387dc..3753de19557 100644
>> --- a/gcc/config/rs6000/rs6000.opt
>> +++ b/gcc/config/rs6000/rs6000.opt
>> @@ -514,6 +514,10 @@ mpower10-fusion-2add
>>   Target Undocumented Mask(P10_FUSION_2ADD) Var(rs6000_isa_flags)
>>   Fuse dependent pairs of add or vaddudm instructions for better performance on power10.
>>
>> +mpower10-fusion-2store
>> +Target Undocumented Mask(P10_FUSION_2STORE) Var(rs6000_isa_flags)
>> +Fuse certain store operations together for better performance on power10.
>> +
>>   mcrypto
>>   Target Mask(CRYPTO) Var(rs6000_isa_flags)
>>   Use ISA 2.07 Category:Vector.AES and Category:Vector.SHA2 instructions.
> 
> Can you think of any test cases we can use to demonstrate store fusion?

Yeah, should be able to come up with something to verify two adjacent stores.

Thanks for the review.
> 
> Otherwise looks good to me.  Thanks!
> 
> Bill
> 



More information about the Gcc-patches mailing list