[PATCH], Add 4 operand FMA support back into power7

Tue Jul 19 23:26:00 GMT 2011

(I had an emacs failure when sending out this message, and I may have sent out
a blank message by accident -- sorry if I did).

When I did the original power7 support, I switched all of the floating point
operations to use VSX instructions instead of the traditional instructions.
The traditional fused multiply and add instructions (FMA) have 4 operands so
that the destination can be a separate register from the inputs, while the VSX
encoding requires the output to overlap the addend or one of the multiplicands.
Occasionally you would get better code if the register allocator had the
freedom to use a different output register.  This patch adds 4 operand FMAs
back for scalar double precision.  It also adds the Altivec 4 operand FMAs for
vector single precision where the Altivec instruction set provides suitable
instructions.  It also adjusts the tests that were depending on using the VSX
form of the instructions.

I have bootstrapped the patches and rerun the test suite with no regressions.
In addition, I have built and run all of Spec 2006 with the patches.  Are these
patches ok to install in GCC 4.7?

[gcc]
2011-07-19  Michael Meissner  <meissner@linux.vnet.ibm.com>

	* config/rs6000/vsx.md (vsx_fma*): Use 4 argument fma instructions
	where we can use them from the standard and altivec instruction
	sets, instead of always using the 3 operand VSX forms that require
	the destination to overlap one of the inputs.
	(vsx_fms*): Ditto.
	(vsx_fnma*): Ditto.
	(vsx_fnms*): Ditto.

	* config/rs6000/rs6000.md (fmadf4_fpr): Set fp_type fp_maddsub_d
	for DF types.
	(fmsdf4_fpr): Ditto.
	(nfmadf4_fpr): Ditto.
	(nfmsdf4_fpr): Ditto.

[gcc/testsuite]
2011-07-12  Michael Meissner  <meissner@linux.vnet.ibm.com>

	* gcc.target/powerpc/ppc-fma-1.c: Adjust to allow non-VSX fmas to
	be generated.
	* gcc.target/powerpc/ppc-fma-2.c: Ditto.
	* gcc.target/powerpc/recip-3.c: Ditto.


-- 
Michael Meissner, IBM
5 Technology Place Drive, M/S 2757, Westford, MA 01886-3141, USA
meissner@linux.vnet.ibm.com	fax +1 (978) 399-6899
-------------- next part --------------
Index: gcc/config/rs6000/vsx.md
===================================================================

--- gcc/config/rs6000/vsx.md	(revision 176207)
+++ gcc/config/rs6000/vsx.md	(working copy)
@@ -524,46 +524,112 @@ (define_insn "*vsx_tsqrt<mode>2_internal
   [(set_attr "type" "<VStype_simple>")
    (set_attr "fp_type" "<VSfptype_simple>")])
 
-;; Fused vector multiply/add instructions
+;; Fused vector multiply/add instructions.  Support the classical DF versions of
+;; fma, which allows the target to be a separate register from the 3 inputs.
+;; Under VSX, the target must be either the addend or the first multiply.
+;; Where we can, also do the same for the Altivec V4SF fmas.
+
+(define_insn "*vsx_fmadf4"
+  [(set (match_operand:DF 0 "vsx_register_operand" "=ws,ws,?wa,?wa,d")
+	(fma:DF
+	  (match_operand:DF 1 "vsx_register_operand" "%ws,ws,wa,wa,d")
+	  (match_operand:DF 2 "vsx_register_operand" "ws,0,wa,0,d")
+	  (match_operand:DF 3 "vsx_register_operand" "0,ws,0,wa,d")))]
+  "VECTOR_UNIT_VSX_P (DFmode)"
+  "@
+   xsmaddadp %x0,%x1,%x2
+   xsmaddmdp %x0,%x1,%x3
+   xsmaddadp %x0,%x1,%x2
+   xsmaddmdp %x0,%x1,%x3
+   {fma|fmadd} %0,%1,%2,%3"
+  [(set_attr "type" "fp")
+   (set_attr "fp_type" "fp_maddsub_d")])
+
+(define_insn "*vsx_fmav4sf4"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=ws,ws,?wa,?wa,v")
+	(fma:V4SF
+	  (match_operand:V4SF 1 "vsx_register_operand" "%ws,ws,wa,wa,v")
+	  (match_operand:V4SF 2 "vsx_register_operand" "ws,0,wa,0,v")
+	  (match_operand:V4SF 3 "vsx_register_operand" "0,ws,0,wa,v")))]
+  "VECTOR_UNIT_VSX_P (V4SFmode)"
+  "@
+   xvmaddasp %x0,%x1,%x2
+   xvmaddmsp %x0,%x1,%x3
+   xvmaddasp %x0,%x1,%x2
+   xvmaddmsp %x0,%x1,%x3
+   vmaddfp %0,%1,%2,%3"
+  [(set_attr "type" "vecfloat")])
 
-(define_insn "*vsx_fma<mode>4"
-  [(set (match_operand:VSX_B 0 "vsx_register_operand" "=<VSr>,<VSr>,?wa,?wa")
-	(fma:VSX_B
-	  (match_operand:VSX_B 1 "vsx_register_operand" "%<VSr>,<VSr>,wa,wa")
-	  (match_operand:VSX_B 2 "vsx_register_operand" "<VSr>,0,wa,0")
-	  (match_operand:VSX_B 3 "vsx_register_operand" "0,<VSr>,0,wa")))]
-  "VECTOR_UNIT_VSX_P (<MODE>mode)"
+(define_insn "*vsx_fmav2df4"
+  [(set (match_operand:V2DF 0 "vsx_register_operand" "=ws,ws,?wa,?wa")
+	(fma:V2DF
+	  (match_operand:V2DF 1 "vsx_register_operand" "%ws,ws,wa,wa")
+	  (match_operand:V2DF 2 "vsx_register_operand" "ws,0,wa,0")
+	  (match_operand:V2DF 3 "vsx_register_operand" "0,ws,0,wa")))]
+  "VECTOR_UNIT_VSX_P (V2DFmode)"
   "@
-   x<VSv>madda<VSs> %x0,%x1,%x2
-   x<VSv>maddm<VSs> %x0,%x1,%x3
-   x<VSv>madda<VSs> %x0,%x1,%x2
-   x<VSv>maddm<VSs> %x0,%x1,%x3"
-  [(set_attr "type" "<VStype_mul>")
-   (set_attr "fp_type" "<VSfptype_mul>")])
+   xvmaddadp %x0,%x1,%x2
+   xvmaddmdp %x0,%x1,%x3
+   xvmaddadp %x0,%x1,%x2
+   xvmaddmdp %x0,%x1,%x3"
+  [(set_attr "type" "vecfloat")])
+
+(define_insn "*vsx_fmsdf4"
+  [(set (match_operand:DF 0 "vsx_register_operand" "=ws,ws,?wa,?wa,d")
+	(fma:DF
+	  (match_operand:DF 1 "vsx_register_operand" "%ws,ws,wa,wa,d")
+	  (match_operand:DF 2 "vsx_register_operand" "ws,0,wa,0,d")
+	  (neg:DF
+	    (match_operand:DF 3 "vsx_register_operand" "0,ws,0,wa,d"))))]
+  "VECTOR_UNIT_VSX_P (DFmode)"
+  "@
+   xsmsubadp %x0,%x1,%x2
+   xsmsubmdp %x0,%x1,%x3
+   xsmsubadp %x0,%x1,%x2
+   xsmsubmdp %x0,%x1,%x3
+   {fms|fmsub} %0,%1,%2,%3"
+  [(set_attr "type" "fp")
+   (set_attr "fp_type" "fp_maddsub_d")])
 
 (define_insn "*vsx_fms<mode>4"
-  [(set (match_operand:VSX_B 0 "vsx_register_operand" "=<VSr>,<VSr>,?wa,?wa")
-	(fma:VSX_B
-	  (match_operand:VSX_B 1 "vsx_register_operand" "%<VSr>,<VSr>,wa,wa")
-	  (match_operand:VSX_B 2 "vsx_register_operand" "<VSr>,0,wa,0")
-	  (neg:VSX_B
-	    (match_operand:VSX_B 3 "vsx_register_operand" "0,<VSr>,0,wa"))))]
+  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=<VSr>,<VSr>,?wa,?wa")
+	(fma:VSX_F
+	  (match_operand:VSX_F 1 "vsx_register_operand" "%<VSr>,<VSr>,wa,wa")
+	  (match_operand:VSX_F 2 "vsx_register_operand" "<VSr>,0,wa,0")
+	  (neg:VSX_F
+	    (match_operand:VSX_F 3 "vsx_register_operand" "0,<VSr>,0,wa"))))]
   "VECTOR_UNIT_VSX_P (<MODE>mode)"
   "@
    x<VSv>msuba<VSs> %x0,%x1,%x2
    x<VSv>msubm<VSs> %x0,%x1,%x3
    x<VSv>msuba<VSs> %x0,%x1,%x2
    x<VSv>msubm<VSs> %x0,%x1,%x3"
-  [(set_attr "type" "<VStype_mul>")
-   (set_attr "fp_type" "<VSfptype_mul>")])
+  [(set_attr "type" "vecfloat")])
+
+(define_insn "*vsx_nfmadf4"
+  [(set (match_operand:DF 0 "vsx_register_operand" "=ws,ws,?wa,?wa,d")
+	(neg:DF
+	 (fma:DF
+	  (match_operand:DF 1 "vsx_register_operand" "ws,ws,wa,wa,d")
+	  (match_operand:DF 2 "vsx_register_operand" "ws,0,wa,0,d")
+	  (match_operand:DF 3 "vsx_register_operand" "0,ws,0,wa,d"))))]
+  "VECTOR_UNIT_VSX_P (DFmode)"
+  "@
+   xsnmaddadp %x0,%x1,%x2
+   xsnmaddmdp %x0,%x1,%x3
+   xsnmaddadp %x0,%x1,%x2
+   xsnmaddmdp %x0,%x1,%x3
+   {fnma|fnmadd} %0,%1,%2,%3"
+  [(set_attr "type" "fp")
+   (set_attr "fp_type" "fp_maddsub_d")])
 
 (define_insn "*vsx_nfma<mode>4"
-  [(set (match_operand:VSX_B 0 "vsx_register_operand" "=<VSr>,<VSr>,?wa,?wa")
-	(neg:VSX_B
-	 (fma:VSX_B
-	  (match_operand:VSX_B 1 "vsx_register_operand" "<VSr>,<VSr>,wa,wa")
-	  (match_operand:VSX_B 2 "vsx_register_operand" "<VSr>,0,wa,0")
-	  (match_operand:VSX_B 3 "vsx_register_operand" "0,<VSr>,0,wa"))))]
+  [(set (match_operand:VSX_F 0 "vsx_register_operand" "=<VSr>,<VSr>,?wa,?wa")
+	(neg:VSX_F
+	 (fma:VSX_F
+	  (match_operand:VSX_F 1 "vsx_register_operand" "<VSr>,<VSr>,wa,wa")
+	  (match_operand:VSX_F 2 "vsx_register_operand" "<VSr>,0,wa,0")
+	  (match_operand:VSX_F 3 "vsx_register_operand" "0,<VSr>,0,wa"))))]
   "VECTOR_UNIT_VSX_P (<MODE>mode)"
   "@
    x<VSv>nmadda<VSs> %x0,%x1,%x2
@@ -573,22 +639,56 @@ (define_insn "*vsx_nfma<mode>4"
   [(set_attr "type" "<VStype_mul>")
    (set_attr "fp_type" "<VSfptype_mul>")])
 
-(define_insn "*vsx_nfms<mode>4"
-  [(set (match_operand:VSX_B 0 "vsx_register_operand" "=<VSr>,<VSr>,?wa,?wa")
-	(neg:VSX_B
-	 (fma:VSX_B
-	   (match_operand:VSX_B 1 "vsx_register_operand" "%<VSr>,<VSr>,wa,wa")
-	   (match_operand:VSX_B 2 "vsx_register_operand" "<VSr>,0,wa,0")
-	   (neg:VSX_B
-	     (match_operand:VSX_B 3 "vsx_register_operand" "0,<VSr>,0,wa")))))]
-  "VECTOR_UNIT_VSX_P (<MODE>mode)"
+(define_insn "*vsx_nfmsdf4"
+  [(set (match_operand:DF 0 "vsx_register_operand" "=ws,ws,?wa,?wa,d")
+	(neg:DF
+	 (fma:DF
+	   (match_operand:DF 1 "vsx_register_operand" "%ws,ws,wa,wa,d")
+	   (match_operand:DF 2 "vsx_register_operand" "ws,0,wa,0,d")
+	   (neg:DF
+	     (match_operand:DF 3 "vsx_register_operand" "0,ws,0,wa,d")))))]
+  "VECTOR_UNIT_VSX_P (DFmode)"
   "@
-   x<VSv>nmsuba<VSs> %x0,%x1,%x2
-   x<VSv>nmsubm<VSs> %x0,%x1,%x3
-   x<VSv>nmsuba<VSs> %x0,%x1,%x2
-   x<VSv>nmsubm<VSs> %x0,%x1,%x3"
-  [(set_attr "type" "<VStype_mul>")
-   (set_attr "fp_type" "<VSfptype_mul>")])
+   xsnmsubadp %x0,%x1,%x2
+   xsnmsubmdp %x0,%x1,%x3
+   xsnmsubadp %x0,%x1,%x2
+   xsnmsubmdp %x0,%x1,%x3
+   {fnms|fnmsub} %0,%1,%2,%3"
+  [(set_attr "type" "fp")
+   (set_attr "fp_type" "fp_maddsub_d")])
+
+(define_insn "*vsx_nfmsv4sf4"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wf,wf,?wa,?wa,v")
+	(neg:V4SF
+	 (fma:V4SF
+	   (match_operand:V4SF 1 "vsx_register_operand" "%wf,wf,wa,wa,v")
+	   (match_operand:V4SF 2 "vsx_register_operand" "wf,0,wa,0,v")
+	   (neg:V4SF
+	     (match_operand:V4SF 3 "vsx_register_operand" "0,wf,0,wa,v")))))]
+  "VECTOR_UNIT_VSX_P (V4SFmode)"
+  "@
+   xvnmsubasp %x0,%x1,%x2
+   xvnmsubmsp %x0,%x1,%x3
+   xvnmsubasp %x0,%x1,%x2
+   xvnmsubmsp %x0,%x1,%x3
+   vnmsubfp %0,%1,%2,%3"
+  [(set_attr "type" "vecfloat")])
+
+(define_insn "*vsx_nfmsv2df4"
+  [(set (match_operand:V2DF 0 "vsx_register_operand" "=wd,wd,?wa,?wa")
+	(neg:V2DF
+	 (fma:V2DF
+	   (match_operand:V2DF 1 "vsx_register_operand" "%wd,wd,wa,wa")
+	   (match_operand:V2DF 2 "vsx_register_operand" "wd,0,wa,0")
+	   (neg:V2DF
+	     (match_operand:V2DF 3 "vsx_register_operand" "0,wd,0,wa")))))]
+  "VECTOR_UNIT_VSX_P (V2DFmode)"
+  "@
+   xvnmsubadp %x0,%x1,%x2
+   xvnmsubmdp %x0,%x1,%x3
+   xvnmsubadp %x0,%x1,%x2
+   xvnmsubmdp %x0,%x1,%x3"
+  [(set_attr "type" "vecfloat")])
 
 ;; Vector conditional expressions (no scalar version for these instructions)
 (define_insn "vsx_eq<mode>"
Index: gcc/config/rs6000/rs6000.md
===================================================================
--- gcc/config/rs6000/rs6000.md	(revision 176207)
+++ gcc/config/rs6000/rs6000.md	(working copy)
@@ -6288,7 +6288,7 @@ (define_insn "*fmadf4_fpr"
    && VECTOR_UNIT_NONE_P (DFmode)"
   "{fma|fmadd} %0,%1,%2,%3"
   [(set_attr "type" "fp")
-   (set_attr "fp_type" "fp_maddsub_s")])
+   (set_attr "fp_type" "fp_maddsub_d")])
 
 (define_insn "*fmsdf4_fpr"
   [(set (match_operand:DF 0 "gpc_reg_operand" "=f")
@@ -6299,7 +6299,7 @@ (define_insn "*fmsdf4_fpr"
    && VECTOR_UNIT_NONE_P (DFmode)"
   "{fms|fmsub} %0,%1,%2,%3"
   [(set_attr "type" "fp")
-   (set_attr "fp_type" "fp_maddsub_s")])
+   (set_attr "fp_type" "fp_maddsub_d")])
 
 (define_insn "*nfmadf4_fpr"
   [(set (match_operand:DF 0 "gpc_reg_operand" "=f")
@@ -6310,7 +6310,7 @@ (define_insn "*nfmadf4_fpr"
    && VECTOR_UNIT_NONE_P (DFmode)"
   "{fnma|fnmadd} %0,%1,%2,%3"
   [(set_attr "type" "fp")
-   (set_attr "fp_type" "fp_maddsub_s")])
+   (set_attr "fp_type" "fp_maddsub_d")])
 
 (define_insn "*nfmsdf4_fpr"
   [(set (match_operand:DF 0 "gpc_reg_operand" "=f")
@@ -6321,7 +6321,7 @@ (define_insn "*nfmsdf4_fpr"
    && VECTOR_UNIT_NONE_P (DFmode)"
   "{fnms|fnmsub} %0,%1,%2,%3"
   [(set_attr "type" "fp")
-   (set_attr "fp_type" "fp_maddsub_s")])
+   (set_attr "fp_type" "fp_maddsub_d")])
 
 (define_expand "sqrtdf2"
   [(set (match_operand:DF 0 "gpc_reg_operand" "")
Index: gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c	(revision 176207)
+++ gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c	(working copy)
@@ -3,16 +3,16 @@
 /* { dg-require-effective-target powerpc_vsx_ok } */
 /* { dg-options "-O3 -ftree-vectorize -mcpu=power7 -ffast-math -ffp-contract=off" } */
 /* { dg-final { scan-assembler-times "xvmadd" 2 } } */
-/* { dg-final { scan-assembler-times "xsmadd" 1 } } */
+/* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 1 } } */
 /* { dg-final { scan-assembler-times "fmadds" 1 } } */
 /* { dg-final { scan-assembler-times "xvmsub" 2 } } */
-/* { dg-final { scan-assembler-times "xsmsub" 1 } } */
+/* { dg-final { scan-assembler-times "xsmsub\|fmsub\ " 1 } } */
 /* { dg-final { scan-assembler-times "fmsubs" 1 } } */
 /* { dg-final { scan-assembler-times "xvnmadd" 2 } } */
-/* { dg-final { scan-assembler-times "xsnmadd" 1 } } */
+/* { dg-final { scan-assembler-times "xsnmadd\|fnmadd\ " 1 } } */
 /* { dg-final { scan-assembler-times "fnmadds" 1 } } */
 /* { dg-final { scan-assembler-times "xvnmsub" 2 } } */
-/* { dg-final { scan-assembler-times "xsnmsub" 1 } } */
+/* { dg-final { scan-assembler-times "xsnmsub\|fnmsub\ " 1 } } */
 /* { dg-final { scan-assembler-times "fnmsubs" 1 } } */
 
 /* Only the functions calling the bulitin should generate an appropriate (a *
Index: gcc/testsuite/gcc.target/powerpc/recip-3.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/recip-3.c	(revision 176207)
+++ gcc/testsuite/gcc.target/powerpc/recip-3.c	(working copy)
@@ -1,9 +1,9 @@
 /* { dg-do compile { target { { powerpc*-*-* } && { ! powerpc*-apple-darwin* } } } } */
 /* { dg-options "-O2 -mrecip -ffast-math -mcpu=power7" } */
 /* { dg-final { scan-assembler-times "xsrsqrtedp" 1 } } */
-/* { dg-final { scan-assembler-times "xsmsub.dp" 1 } } */
+/* { dg-final { scan-assembler-times "xsmsub.dp\|fmsub\ " 1 } } */
 /* { dg-final { scan-assembler-times "xsmuldp" 4 } } */
-/* { dg-final { scan-assembler-times "xsnmsub.dp" 2 } } */
+/* { dg-final { scan-assembler-times "xsnmsub.dp\|fnmsub\ " 2 } } */
 /* { dg-final { scan-assembler-times "frsqrtes" 1 } } */
 /* { dg-final { scan-assembler-times "fmsubs" 1 } } */
 /* { dg-final { scan-assembler-times "fmuls" 4 } } */
Index: gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c	(revision 176207)
+++ gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c	(working copy)
@@ -3,16 +3,16 @@
 /* { dg-require-effective-target powerpc_vsx_ok } */
 /* { dg-options "-O3 -ftree-vectorize -mcpu=power7 -ffast-math" } */
 /* { dg-final { scan-assembler-times "xvmadd" 4 } } */
-/* { dg-final { scan-assembler-times "xsmadd" 2 } } */
+/* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 2 } } */
 /* { dg-final { scan-assembler-times "fmadds" 2 } } */
 /* { dg-final { scan-assembler-times "xvmsub" 2 } } */
-/* { dg-final { scan-assembler-times "xsmsub" 1 } } */
+/* { dg-final { scan-assembler-times "xsmsub\|fmsub\ " 1 } } */
 /* { dg-final { scan-assembler-times "fmsubs" 1 } } */
 /* { dg-final { scan-assembler-times "xvnmadd" 2 } } */
-/* { dg-final { scan-assembler-times "xsnmadd" 1 } } */
+/* { dg-final { scan-assembler-times "xsnmadd\|fnmadd " 1 } } */
 /* { dg-final { scan-assembler-times "fnmadds" 1 } } */
 /* { dg-final { scan-assembler-times "xvnmsub" 2 } } */
-/* { dg-final { scan-assembler-times "xsnmsub" 1 } } */
+/* { dg-final { scan-assembler-times "xsnmsub\|fnmsub " 1 } } */
 /* { dg-final { scan-assembler-times "fnmsubs" 1 } } */
 
 /* All functions should generate an appropriate (a * b) + c instruction