[PATCH] IBM Z: Try to make use of load-and-test instructions

Fri Sep 18 11:10:13 GMT 2020

This patch enables a peephole2 optimization which replaces a load of
constant zero into a temporary register, followed by a comparison of a
floating-point register of interest against that temporary, by a single
load-and-test instruction.  However, the optimization is only applied if
both registers are dead afterwards and if we test for (in)equality only.
With fast math (-fno-trapping-math -fno-signaling-nans) these
restrictions are dropped and load-and-test is used directly.

This is a follow-up to PR88856.

Bootstrapped and regtested on IBM Z.

gcc/ChangeLog:

	* config/s390/s390.md ("*cmp<mode>_ccs_0", "*cmp<mode>_ccz_0",
	"*cmp<mode>_ccs_0_fastmath"): Basically change "*cmp<mode>_ccs_0" into
	"*cmp<mode>_ccz_0" and for fast math add "*cmp<mode>_ccs_0_fastmath".

gcc/testsuite/ChangeLog:

	* gcc.target/s390/load-and-test-fp-1.c: Change test to include all
	possible combinations of dead/live registers and comparisons (equality,
	relational).
	* gcc.target/s390/load-and-test-fp-2.c: Same as load-and-test-fp-1.c
	but for fast math.
	* gcc.target/s390/load-and-test-fp.h: New test included by
	load-and-test-fp-{1,2}.c.
---
 gcc/config/s390/s390.md                       | 54 +++++++++++++++----
 .../gcc.target/s390/load-and-test-fp-1.c      | 19 +++----
 .../gcc.target/s390/load-and-test-fp-2.c      | 17 ++----
 .../gcc.target/s390/load-and-test-fp.h        | 12 +++++
 4 files changed, 67 insertions(+), 35 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/load-and-test-fp.h

diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index 4c3e5400a2b..e591aa7c324 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -1391,23 +1391,55 @@
 ; (TF|DF|SF|TD|DD|SD) instructions
 
 
-; FIXME: load and test instructions turn SNaN into QNaN what is not
-; acceptable if the target will be used afterwards.  On the other hand
-; they are quite convenient for implementing comparisons with 0.0. So
-; try to enable them via splitter/peephole if the value isn't needed anymore.
-; See testcases: load-and-test-fp-1.c and load-and-test-fp-2.c
+; Load and test instructions turn a signaling NaN into a quiet NaN.  Thus they
+; may only be used if the target register is dead afterwards or if fast math
+; is enabled.  The former is done via a peephole optimization.  Note, load and
+; test instructions may only be used for (in)equality comparisons because
+; relational comparisons must signal an invalid-operation exception for a
+; quiet NaN operand, too, which load and test instructions do not.  For fast
+; math insn "*cmp<mode>_ccs_0_fastmath" applies.
+; See testcases load-and-test-fp-{1,2}.c
+
+(define_peephole2
+  [(set (match_operand:FP 0 "register_operand")
+	(match_operand:FP 1 "const0_operand"))
+   (set (reg:CCZ CC_REGNUM)
+	(compare:CCZ (match_operand:FP 2 "register_operand")
+		     (match_operand:FP 3 "register_operand")))]
+  "TARGET_HARD_FLOAT
+   && FP_REG_P (operands[2])
+   && REGNO (operands[0]) == REGNO (operands[3])
+   && peep2_reg_dead_p (2, operands[0])
+   && peep2_reg_dead_p (2, operands[2])"
+  [(parallel
+    [(set (reg:CCZ CC_REGNUM)
+	  (match_op_dup 4 [(match_dup 2) (match_dup 1)]))
+     (clobber (match_dup 2))])]
+  "operands[4] = gen_rtx_COMPARE (CCZmode, operands[2], operands[1]);")
 
 ; ltxbr, ltdbr, ltebr, ltxtr, ltdtr
-(define_insn "*cmp<mode>_ccs_0"
-  [(set (reg CC_REGNUM)
-	(compare (match_operand:FP 0 "register_operand"  "f")
-		 (match_operand:FP 1 "const0_operand"    "")))
-   (clobber (match_operand:FP      2 "register_operand" "=0"))]
-  "s390_match_ccmode(insn, CCSmode) && TARGET_HARD_FLOAT"
+(define_insn "*cmp<mode>_ccz_0"
+  [(set (reg:CCZ CC_REGNUM)
+	(compare:CCZ (match_operand:FP 0 "register_operand" "f")
+		     (match_operand:FP 1 "const0_operand")))
+   (clobber (match_operand:FP 2 "register_operand" "=0"))]
+  "TARGET_HARD_FLOAT"
   "lt<xde><bt>r\t%0,%0"
    [(set_attr "op_type" "RRE")
     (set_attr "type"  "fsimp<mode>")])
 
+(define_insn "*cmp<mode>_ccs_0_fastmath"
+  [(set (reg CC_REGNUM)
+	(compare (match_operand:FP 0 "register_operand" "f")
+		 (match_operand:FP 1 "const0_operand")))]
+  "s390_match_ccmode (insn, CCSmode)
+   && TARGET_HARD_FLOAT
+   && !flag_trapping_math
+   && !flag_signaling_nans"
+  "lt<xde><bt>r\t%0,%0"
+  [(set_attr "op_type" "RRE")
+   (set_attr "type" "fsimp<mode>")])
+
 ; VX: TFmode in FPR pairs: use cxbr instead of wfcxb
 ; cxtr, cdtr, cxbr, cdbr, cebr, cdb, ceb, wfcsb, wfcdb
 (define_insn "*cmp<mode>_ccs"
diff --git a/gcc/testsuite/gcc.target/s390/load-and-test-fp-1.c b/gcc/testsuite/gcc.target/s390/load-and-test-fp-1.c
index 2a7e88c0f1b..ebb8a88c574 100644
--- a/gcc/testsuite/gcc.target/s390/load-and-test-fp-1.c
+++ b/gcc/testsuite/gcc.target/s390/load-and-test-fp-1.c
@@ -1,17 +1,12 @@
 /* { dg-do compile } */
 /* { dg-options "-O3 -mzarch" } */
 
-/* a is used after the comparison.  We cannot use load and test here
-   since it would turn SNaNs into QNaNs.  */
+/* Use load-and-test instructions if compared for (in)equality and if variable
+   `a` is dead after the comparison.  For all other cases use compare (cdbr)
+   or compare-and-signal (kdbr) instructions.  */
 
-double gl;
+#include "load-and-test-fp.h"
 
-double
-foo (double dummy, double a)
-{
-  if (a == 0.0)
-    gl = 1;
-  return a;
-}
-
-/* { dg-final { scan-assembler {\tcdbr?\t} } } */
+/* { dg-final { scan-assembler-times "ltdbr\t" 2 } } */
+/* { dg-final { scan-assembler-times "cdbr\t" 2 } } */
+/* { dg-final { scan-assembler-times "kdbr\t" 8 } } */
diff --git a/gcc/testsuite/gcc.target/s390/load-and-test-fp-2.c b/gcc/testsuite/gcc.target/s390/load-and-test-fp-2.c
index 7646fdd5def..53dab3c4424 100644
--- a/gcc/testsuite/gcc.target/s390/load-and-test-fp-2.c
+++ b/gcc/testsuite/gcc.target/s390/load-and-test-fp-2.c
@@ -1,16 +1,9 @@
 /* { dg-do compile } */
-/* { dg-options "-O3" } */
+/* { dg-options "-O3 -mzarch -ffast-math" } */
 
-/* a is not used after the comparison.  So we should use load and test
-   here.  */
+/* Fast-math implies -fno-trapping-math -fno-signaling-nans, i.e., no user
+   visible trap will happen, so load-and-test is used for all comparisons.  */
 
-double gl;
+#include "load-and-test-fp.h"
 
-void
-bar (double a)
-{
-  if (a == 0.0)
-    gl = 1;
-}
-
-/* { dg-final { scan-assembler "ltdbr\t" } } */
+/* { dg-final { scan-assembler-times "ltdbr\t" 12 } } */
diff --git a/gcc/testsuite/gcc.target/s390/load-and-test-fp.h b/gcc/testsuite/gcc.target/s390/load-and-test-fp.h
new file mode 100644
index 00000000000..f153d96698d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/load-and-test-fp.h
@@ -0,0 +1,12 @@
+double gl;
+
+#define test(N, CMP) \
+  void   N ## _dead(double a) { if (a CMP 0.0) gl = 1; } \
+  double N ## _live(double a) { if (a CMP 0.0) gl = 1; return a; }
+
+test(eq, ==)
+test(ne, !=)
+test(ge, >=)
+test(gt, >)
+test(le, <=)
+test(lt, <)
-- 
2.25.3