[gcc(refs/users/meissner/heads/work161-dmf)] Optimize __builtin_mma_assemble_acc for dense math registers

Michael Meissner meissner@gcc.gnu.org
Mon Mar 4 23:42:10 GMT 2024


https://gcc.gnu.org/g:7f98d8d5e0e3f93dbac49d3d6ddfe08a342b0ed2

commit 7f98d8d5e0e3f93dbac49d3d6ddfe08a342b0ed2
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Mon Mar 4 18:39:13 2024 -0500

    Optimize __builtin_mma_assemble_acc for dense math registers
    
    2024-03-04   Michael Meissner  <meissner@linux.ibm.com>
    
            * config/rs6000/mma.md (UNSPEC_DM_ASSEMBLE): New UNSPEC.
            (mma_assemble_acc): Optimize if we have dense math registers.
            (mma_assemble_acc_nodm): Rename from mma_assemble_acc, and restrict it
            to when we don't have dense math registers.
            (mma_assemble_acc_dm): New insn.

Diff:
---
 gcc/config/rs6000/mma.md | 66 +++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 57 insertions(+), 9 deletions(-)

diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md
index 9ac4086cf50..33c1baeda5e 100644
--- a/gcc/config/rs6000/mma.md
+++ b/gcc/config/rs6000/mma.md
@@ -92,6 +92,7 @@
    UNSPEC_MMA_XXMFACC
    UNSPEC_MMA_XXMTACC
    UNSPEC_MMA_DMSETDMRZ
+   UNSPEC_DM_ASSEMBLE
   ])
 
 (define_c_enum "unspecv"
@@ -454,25 +455,47 @@
 })
 
 (define_expand "mma_assemble_acc"
-  [(match_operand:XO 0 "fpr_reg_operand")
+  [(match_operand:XO 0 "register_operand")
    (match_operand:V16QI 1 "mma_assemble_input_operand")
    (match_operand:V16QI 2 "mma_assemble_input_operand")
    (match_operand:V16QI 3 "mma_assemble_input_operand")
    (match_operand:V16QI 4 "mma_assemble_input_operand")]
   "TARGET_MMA"
 {
-  rtx src = gen_rtx_UNSPEC_VOLATILE (XOmode,
-			    	     gen_rtvec (4, operands[1], operands[2],
-				       		operands[3], operands[4]),
-			    	     UNSPECV_MMA_ASSEMBLE);
-  emit_move_insn (operands[0], src);
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
+  rtx op4 = operands[4];
+
+  if (TARGET_DENSE_MATH)
+    {  /* Dense math: combine the inputs into two vector pairs first.  */
+      rtx vpair1 = gen_reg_rtx (OOmode);
+      rtx vpair2 = gen_reg_rtx (OOmode);
+      if (WORDS_BIG_ENDIAN)  /* Big endian: pair the inputs in order.  */
+	{
+	  emit_insn (gen_vsx_assemble_pair (vpair1, op1, op2));
+	  emit_insn (gen_vsx_assemble_pair (vpair2, op3, op4));
+	  emit_insn (gen_mma_assemble_acc_dm (op0, vpair1, vpair2));
+	}
+      else  /* Little endian: pair the inputs in reverse order.  */
+	{
+	  emit_insn (gen_vsx_assemble_pair (vpair1, op4, op3));
+	  emit_insn (gen_vsx_assemble_pair (vpair2, op2, op1));
+	  emit_insn (gen_mma_assemble_acc_dm (op0, vpair1, vpair2));
+	}
+    }
+
+  else  /* Otherwise hand all four vectors to the non-dense-math insn.  */
+    emit_insn (gen_mma_assemble_acc_nodm (op0, op1, op2, op3, op4));
+
   DONE;
 })
 
 ;; We cannot update the four output registers atomically, so mark the output
-;; as an early clobber so we don't accidentally clobber the input operands.  */
+;; as an early clobber so we don't accidentally clobber the input operands.
 
-(define_insn_and_split "*mma_assemble_acc"
+(define_insn_and_split "mma_assemble_acc_nodm"
   [(set (match_operand:XO 0 "fpr_reg_operand" "=&d")
 	(unspec_volatile:XO
 	  [(match_operand:V16QI 1 "mma_assemble_input_operand" "mwa")
@@ -480,7 +503,7 @@
 	   (match_operand:V16QI 3 "mma_assemble_input_operand" "mwa")
 	   (match_operand:V16QI 4 "mma_assemble_input_operand" "mwa")]
 	  UNSPECV_MMA_ASSEMBLE))]
-  "TARGET_MMA"
+  "TARGET_MMA_NO_DENSE_MATH"
   "#"
   "&& reload_completed"
   [(const_int 0)]
@@ -493,6 +516,31 @@
   DONE;
 })
 
+;; With dense math registers, assemble the accumulator from two vector pairs.
+;; If the output is in VSX registers, split after reload into two pair moves.
+
+(define_insn_and_split "mma_assemble_acc_dm"
+ [(set (match_operand:XO 0 "register_operand" "=wD,?wa")
+       (unspec:XO [(match_operand:OO 1 "vsx_register_operand" "wa,mwa")
+		   (match_operand:OO 2 "vsx_register_operand" "wa,mwa")]
+		  UNSPEC_DM_ASSEMBLE))]
+ "TARGET_MMA_DENSE_MATH"
+ "@
+  dmxxinstdmr512 %0,%1,%2,0
+  #"
+ "&& reload_completed && vsx_register_operand (operands[0], XOmode)"
+ [(set (match_dup 3) (match_dup 1))
+  (set (match_dup 4) (match_dup 2))]
+{  /* NOTE(review): the "?wa" output is not early-clobber; confirm operand 2 cannot overlap operands[3].  */
+  int r = reg_or_subregno (operands[0]);  /* Base register number of the output.  */
+  int hi = (!WORDS_BIG_ENDIAN);  /* 1 on little endian, 0 on big endian.  */
+  int lo = 1 - hi;  /* The opposite of hi.  */
+  operands[3] = gen_rtx_REG (OOmode, r + (hi * 2));  /* Receives operand 1.  */
+  operands[4] = gen_rtx_REG (OOmode, r + (lo * 2));  /* Receives operand 2.  */
+}
+ [(set_attr "type" "mma")
+  (set_attr "length" "*,16")])
+
 (define_expand "mma_disassemble_acc"
   [(match_operand:V16QI 0 "mma_disassemble_output_operand")
    (match_operand:XO 1 "fpr_reg_operand")


More information about the Gcc-cvs mailing list