[PATCH v2] dse: Remove partial load after full store for high part access [PR71309]

luoxhu <luoxhu@linux.ibm.com>
Wed Jul 22 09:18:12 GMT 2020


Hi,

On 2020/7/21 23:30, Richard Sandiford wrote:
> Xiong Hu Luo <luoxhu@linux.ibm.com> writes:
>> @@ -1872,9 +1872,27 @@ get_stored_val (store_info *store_info, machine_mode read_mode,
>>       {
>>         poly_int64 shift = gap * BITS_PER_UNIT;
>>         poly_int64 access_size = GET_MODE_SIZE (read_mode) + gap;
>> -      read_reg = find_shift_sequence (access_size, store_info, read_mode,
>> -				      shift, optimize_bb_for_speed_p (bb),
>> -				      require_cst);
>> +      rtx rhs_subreg = NULL;
>> +
>> +      if (known_eq (GET_MODE_BITSIZE (store_mode), shift * 2))
>> +	{
>> +	  scalar_int_mode inner_mode = smallest_int_mode_for_size (shift);
>> +	  poly_uint64 sub_off
>> +	    = ((!BYTES_BIG_ENDIAN)
>> +		 ? GET_MODE_SIZE (store_mode) - GET_MODE_SIZE (inner_mode)
>> +		 : 0);
>> +
>> +	  rhs_subreg = simplify_gen_subreg (inner_mode, store_info->rhs,
>> +					    store_mode, sub_off);
>> +	  if (rhs_subreg)
>> +	    read_reg
>> +	      = extract_low_bits (read_mode, inner_mode, copy_rtx (rhs_subreg));
>> +	}
>> +
>> +      if (read_reg == NULL)
>> +	read_reg
>> +	  = find_shift_sequence (access_size, store_info, read_mode, shift,
>> +				 optimize_bb_for_speed_p (bb), require_cst);
> 
> Did you consider doing this in find_shift_sequence instead?
> ISTM that this is really using subregs to optimise:
> 
>        /* In theory we could also check for an ashr.  Ian Taylor knows
> 	 of one dsp where the cost of these two was not the same.  But
> 	 this really is a rare case anyway.  */
>        target = expand_binop (new_mode, lshr_optab, new_reg,
> 			     gen_int_shift_amount (new_mode, shift),
> 			     new_reg, 1, OPTAB_DIRECT);
> 
> I think everything up to:
> 
>        /* Also try a wider mode if the necessary punning is either not
> 	 desirable or not possible.  */
>        if (!CONSTANT_P (store_info->rhs)
> 	  && !targetm.modes_tieable_p (new_mode, store_mode))
> 	continue;
> 
> is either neutral or helpful for the subreg case too, so maybe
> we could just add the optimisation after that.  (It probably isn't
> worth reusing any of the later loop body code though, since the
> subreg case is much simpler.)
> 
> I don't think we need to restrict this case to modes of size
> shift * 2.  We can just check whether the shift is a multiple of
> the new_mode calculated by find_shift_sequence (using multiple_p).
> 
> An easier way of converting the shift to a subreg byte offset
> is to use subreg_offset_from_lsb, which also handles
> BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN.
> 

Thanks, I've updated the patch by moving it into find_shift_sequence.
I'm not sure whether it meets your comments precisely, though it still works. :)
There is a comment there which says:

  /* Some machines like the x86 have shift insns for each size of
     operand.  Other machines like the ppc or the ia-64 may only have
     shift insns that shift values within 32 or 64 bit registers.
     This loop tries to find the smallest shift insn that will right
     justify the value we want to read but is available in one insn on
     the machine.  */

So without an additional check the loop breaks out early here, since new_mode
is TImode (128 bits, wider than BITS_PER_WORD) while the shift is only 64 bits:

      if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
	break;
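
To make that concrete, here is a small standalone sketch (illustrative only,
not GCC source; multiple_p, BITS_PER_WORD and the mode size are modeled as
plain integers) of how the old and the patched break conditions behave for
the PR71309 case, where new_mode is TImode (128 bits) and the shift is 64:

  #include <stdbool.h>
  #include <stdio.h>

  /* Toy stand-in for GCC's multiple_p: is A a multiple of B?  */
  static bool multiple_p (unsigned a, unsigned b) { return b && a % b == 0; }

  int main (void)
  {
    const unsigned bits_per_word = 64;   /* word size on Power9  */
    const unsigned shift = 64;           /* reading the high DImode half  */
    const unsigned new_mode_bits = 128;  /* new_mode is TImode here  */

    bool old_break = new_mode_bits > bits_per_word;
    bool patched_break = new_mode_bits > bits_per_word
                         && !multiple_p (new_mode_bits, shift);

    printf ("old condition breaks: %d\n", old_break);          /* prints 1 */
    printf ("patched condition breaks: %d\n", patched_break);  /* prints 0 */
    return 0;
  }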



[PATCH v2] dse: Remove partial load after full store for high part access [PR71309]


This patch optimizes the RTL below (it works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
18: r122:TI=r119:TI
16: r123:TI#0=r122:TI#8 0>>0
17: r123:TI#8=0
19: r124:DI=r123:TI#0
7: [r118:DI]=r119:TI
8: r121:DI=r124:DI

The final asm is as below, with no partial load following the full store
(the stxv + ld pair is gone):
  mr 9,3
  ld 3,24(3)
  ld 10,16(3)
  std 3,8(9)
  std 10,0(9)
  blr

It can achieve a ~25% performance improvement for typical cases on
Power9.  Bootstrapped and regression tested on Power9-LE.

For AArch64, one ldr is replaced by a mov:

ldp     x2, x3, [x0, 16]
stp     x2, x3, [x0]
ldr     x0, [x0, 8]

=>

mov     x1, x0
ldp     x2, x0, [x0, 16]
stp     x2, x0, [x1]
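
For reference, a minimal standalone C example of the pattern being optimized
(illustrative only; the struct and function names are made up and differ from
the pr71309.c testcase below): a full copy of a two-pointer struct followed by
a read of its upper half, which previously became a full-width store plus a
partial load from the just-stored address.

  struct pair
  {
    void *lo;
    void *hi;
  };

  void *
  copy_then_read_high (struct pair *dst, const struct pair *src)
  {
    *dst = *src;        /* full-width store of the struct  */
    return dst->hi;     /* partial read of what was just stored  */
  }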

gcc/ChangeLog:

2020-07-22  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* dse.c (find_shift_sequence): Use a subreg shifted from the high part
	of the stored register to avoid loading back from the stored address.

gcc/testsuite/ChangeLog:

2020-07-22  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c                                  | 15 +++++++++-
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++++++++++++++++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..e06a9fbb0cd 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1736,7 +1736,8 @@ find_shift_sequence (poly_int64 access_size,
       int cost;
 
       new_mode = new_mode_iter.require ();
-      if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
+      if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD
+	  && !multiple_p (GET_MODE_BITSIZE (new_mode), shift))
 	break;
 
       /* If a constant was stored into memory, try to simplify it here,
@@ -1793,6 +1794,18 @@ find_shift_sequence (poly_int64 access_size,
       shift_seq = get_insns ();
       end_sequence ();
 
+      /* Use subreg shifting from high part to avoid full store followed by
+	 partial load.  See PR71309.  */
+      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift) && shift_seq)
+	{
+	  new_lhs = extract_low_bits (new_mode, store_mode,
+				      copy_rtx (store_info->rhs));
+	  emit_move_insn (new_reg, new_lhs);
+	  emit_insn (shift_seq);
+	  read_reg = extract_low_bits (read_mode, new_mode, target);
+	  break;
+	}
+
       if (target != new_reg || shift_seq == NULL)
 	continue;
 
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 00000000000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define TYPE2 void*
+
+struct path {
+    TYPE2 mnt;
+    TYPE dentry;
+};
+
+struct nameidata {
+    struct path path;
+    struct path root;
+};
+
+__attribute__ ((noinline))
+TYPE foo(struct nameidata *nd)
+{
+  TYPE d;
+  TYPE2 d2;
+
+  nd->path = nd->root;
+  d = nd->path.dentry;
+  d2 = nd->path.mnt;
+  return d;
+}
+
+/* { dg-final { scan-assembler-not {\mlxv\M} } } */
+/* { dg-final { scan-assembler-not {\mstxv\M} } } */
+/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */
-- 
2.27.0.90.geebb51ba8c



