[PATCH] rs6000: Support doubleword swaps removal in rot64 load store [PR100085]

Xionghu Luo luoxhu@linux.ibm.com
Fri Jun 4 02:45:17 GMT 2021



On 2021/6/4 04:16, Segher Boessenkool wrote:
> Hi!
> 
> On Thu, Jun 03, 2021 at 08:46:46AM +0800, Xionghu Luo wrote:
>> On 2021/6/3 06:20, Segher Boessenkool wrote:
>>> On Wed, Jun 02, 2021 at 03:19:32AM -0500, Xionghu Luo wrote:
>>>> On P8LE, extra rot64+rot64 load or store instructions are generated
>>>> in float128 to vector __int128 conversion.
>>>>
>>> This patch teaches pass swaps to also handle such patterns to remove
>>>> extra swap instructions.
>>>
>>> Did you check if this is already handled by simplify-rtx if the mode had
>>> been TImode (not V1TImode)?  If not, why do you not handle it there?
>>
>> I tried to do it in combine or peephole, the later pass split2
>> or split3 will still split it to rotate + rotate again as we have split
>> after reload, and this pattern is quite P8LE specific, so put it in pass
>> swap.  The simplify-rtx could simplify
>> r124:KF#0=r123:KF#0<-<0x40<-<0x40 to r124:KF#0=r123:KF#0 for register
>> operations already.
> 
> What mode are those subregs?  Abbreviated RTL printouts are very lossy.
> Assuming those are TImode (please check), then yes, that is what I
> asked, thanks.

typedef union
{
  __float128 vf1;
  vector __int128 vi128;
  __int128 i128;
} VF_128;


VF_128 vu;

int foo3 ()
{
  __float128 f128 = {3.1415926535897932384626433832795028841971693993751058Q};
  vu.vf1 = f128;
  vector __int128 ret = vu.vi128;
  return ret[0];
}

This test case triggers such an optimization; the subregs are also V1TImode:

Trying 8 -> 9:
    8: r122:KF#0=r123:KF#0<-<0x40
      REG_DEAD r123:KF
    9: r124:KF#0=r122:KF#0<-<0x40
      REG_DEAD r122:KF
Successfully matched this instruction:
(set (subreg:V1TI (reg:KF 124) 0)
    (rotate:V1TI (rotate:V1TI (subreg:V1TI (reg:KF 123) 0)
            (const_int 64 [0x40]))
        (const_int 64 [0x40])))
allowing combination of insns 8 and 9
original costs 4 + 4 = 8
replacement cost 4
deferring deletion of insn with uid = 8.
modifying insn i3     9: r124:KF#0=r123:KF#0<-<0x40<-<0x40
      REG_DEAD r123:KF
deferring rescan insn with uid = 9.


After checking, it was actually optimized by this pattern from vsx.md
in the split1 pass instead of by simplify-rtx.


(define_insn_and_split "*vsx_le_undo_permute_<mode>"
  [(set (match_operand:VSX_TI 0 "vsx_register_operand" "=wa,wa")
	(rotate:VSX_TI
	 (rotate:VSX_TI
	  (match_operand:VSX_TI 1 "vsx_register_operand" "0,wa")
	  (const_int 64))
	 (const_int 64)))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX"
  "@
   #
   xxlor %x0,%x1"
  "&& 1"
  [(set (match_dup 0) (match_dup 1))]
{
  if (reload_completed && REGNO (operands[0]) == REGNO (operands[1]))
    {
      emit_note (NOTE_INSN_DELETED);
      DONE;
    }
}
  [(set_attr "length" "0,4")
   (set_attr "type" "veclogical")])


> 
>> ;; The post-reload split requires that we re-permute the source
>> ;; register in case it is still live.
>> (define_split
>>    [(set (match_operand:VSX_LE_128 0 "memory_operand")
>>          (match_operand:VSX_LE_128 1 "vsx_register_operand"))]
>>    "!BYTES_BIG_ENDIAN && TARGET_VSX && reload_completed && !TARGET_P9_VECTOR
>>     && !altivec_indexed_or_indirect_operand (operands[0], <MODE>mode)"
>>    [(const_int 0)]
>> {
>>    rs6000_emit_le_vsx_permute (operands[1], operands[1], <MODE>mode);
>>    rs6000_emit_le_vsx_permute (operands[0], operands[1], <MODE>mode);
>>    rs6000_emit_le_vsx_permute (operands[1], operands[1], <MODE>mode);
>>    DONE;
>> })
> 
> Yes, that needs improvement itself.
> 
> The thing to realise is that TImode is optimised by generic code just
> fine (as all scalar integer modes are), but V1TImode is not.  We have
> that mode because we really needed to not put TImode in vector registers
> so much on older cpus, but that balance may have changed by now.  Worth
> experimenting with, we now can do pretty much all normal operations in
> vector registers!
>
We have two issues stated in PR100085: one is __float128 to vector __int128 (V1TImode),
the other is __float128 to __int128 (TImode).  The first one could be
solved by this patch (by the pass swaps optimization), so to cover the TImode case, we should
also generate rotate permute instructions in gen_movti for P8LE, like gen_movkf
in vector.md (the change below is copied exactly from "mov<mode>"...)

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 84820d3b5cb..f35c235a39e 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -7385,7 +7385,22 @@ (define_expand "mov<mode>"
        (match_operand:INT 1 "any_operand"))]
   ""
 {
-  rs6000_emit_move (operands[0], operands[1], <MODE>mode);
+  /* When generating load/store instructions to/from VSX registers on
+     pre-power9 hardware in little endian mode, we need to emit register
+     permute instructions to byte swap the contents, since the VSX load/store
+     instructions do not include a byte swap as part of their operation.
+     Altivec loads and stores have no such problem, so we skip them below.  */
+  if (!BYTES_BIG_ENDIAN
+      && VECTOR_MEM_VSX_P (<MODE>mode)
+      && !TARGET_P9_VECTOR
+      && !gpr_or_gpr_p (operands[0], operands[1])
+      && ((memory_operand (operands[0], <MODE>mode)
+          && !altivec_indexed_or_indirect_operand(operands[0], <MODE>mode))
+         ^ (memory_operand (operands[1], <MODE>mode)
+            && !altivec_indexed_or_indirect_operand(operands[1], <MODE>mode))))
+      rs6000_emit_le_vsx_move (operands[0], operands[1], <MODE>mode);
+    else
+      rs6000_emit_move (operands[0], operands[1], <MODE>mode);
   DONE;
 })

test case:

__int128
foo4 (__float128 f128)
{
  VF_128 vunion;

  vunion.vf1 = f128;
  return vunion.i128;
}

trunk vs. this patch + the above change (sorry, still using slim RTL for
comparison; the instruction patterns were added manually):


diff pr100085.trunk.c.245r.expand pr100085.c.245r.expand
<     8: r121:TI=[r112:DI]        {*vsx_le_perm_load_ti}
<     9: r117:TI=r121:TI
<    13: %r3:TI=r117:TI
<    14: use %r3:TI
---
>     8: r122:V2DI=vec_select([r112:DI],parallel)    {vsx_ld_elemrev_v2di}
>     9: r121:TI#0=vec_select(r122:V2DI,parallel)    {*vsx_xxpermdi2_le_v2di}
>    10: r117:TI=r121:TI   
>    14: %r3:TI=r117:TI
>    15: use %r3:TI


diff pr100085.trunk.c.251r.swaps pr100085.c.251r.swaps:
<     6: r120:KF#0=r118:KF#0<-<0x40
<    16: r122:DI=0x20
<     7: [sfp:DI+r122:DI]=r120:KF#0<-<0x40
<    17: r123:DI=sfp:DI+0x20
<     8: r121:TI=[r123:DI]
<     9: r117:TI=r121:TI
<    13: %r3:TI=r117:TI
<    14: use %r3:TI
---
>    20: r120:KF#0=r118:KF#0
>    17: r123:DI=0x20
>    19: [sfp:DI+r123:DI&0xfffffffffffffff0]=r120:KF#0
>    18: r124:DI=0x20
>    21: r122:V2DI=[sfp:DI+r124:DI&0xfffffffffffffff0]
>    22: r121:TI#0=r122:V2DI
>    10: r117:TI=r121:TI
>    14: %r3:TI=r117:TI
>    15: use %r3:TI


diff pr100085.trunk.s pr100085.s
<       xxpermdi %vs34,%vs34,%vs34,2
<       li %r10,32
<       addi %r8,%r1,-48
<       addi %r9,%r1,-16
<       stxvd2x %vs34,%r8,%r10
<       ld %r3,0(%r9)
<       ld %r4,8(%r9)
---
>       xxpermdi %vs0,%vs34,%vs34,3
>       mfvsrd %r4,%vs34
>       mfvsrd %r3,%vs0


-- 
Thanks,
Xionghu


More information about the Gcc-patches mailing list