This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] do not lower a/b to a*(1/b)


David Edelsohn wrote:

Please test the patch with SPEC CPU2000 on multiple architectures
(including PowerPC) using -ffast-math.


Unfortunately I don't have access to SPEC.

Here is an updated patch including Andrew's suggestion and an additional test case using phi nodes.

Paolo
2005-05-13  Paolo Bonzini  <bonzini@gnu.org>

	* Makefile.in: Adjust dependencies.
	* expr.c (expand_expr_real_1) <case RDIV_EXPR>: Never emit as a*(1/b).
	* fold-const.c (distribute_real_division): New.
	(fold_binary) <case PLUS_EXPR, case MINUS_EXPR>: Use it.
	* tree-optimize.c (init_tree_optimization_passes): Run it.
	* tree-pass.h (pass_cse_reciprocals): New.
	* tree-ssa.c (gate_cse_reciprocals, execute_cse_reciprocals,
	execute_cse_reciprocals_1, pass_cse_reciprocals): New.

2005-05-13  Paolo Bonzini  <bonzini@gnu.org>

	* testsuite/gcc.dg/fold-div-1.c, testsuite/gcc.dg/recip-1.c,
	testsuite/gcc.dg/recip-2.c: New.

Index: Makefile.in
===================================================================
RCS file: /cvs/gcc/gcc/gcc/Makefile.in,v
retrieving revision 1.1484
diff -p -u -r1.1484 Makefile.in
--- Makefile.in	11 May 2005 16:25:24 -0000	1.1484
+++ Makefile.in	13 May 2005 16:24:33 -0000
@@ -1646,7 +1646,7 @@ tree-ssa.o : tree-ssa.c $(TREE_FLOW_H) $
    errors.h toplev.h function.h $(TIMEVAR_H) $(TM_H) coretypes.h \
    $(TREE_DUMP_H) langhooks.h tree-pass.h $(BASIC_BLOCK_H) bitmap.h \
    $(FLAGS_H) $(GGC_H) hard-reg-set.h $(HASHTAB_H) pointer-set.h \
-   $(TREE_GIMPLE_H) tree-inline.h $(VARRAY_H)
+   $(TREE_GIMPLE_H) tree-inline.h $(VARRAY_H) real.h
 tree-into-ssa.o : tree-into-ssa.c $(TREE_FLOW_H) $(CONFIG_H) $(SYSTEM_H) \
    $(RTL_H) $(TREE_H) $(TM_P_H) $(EXPR_H) output.h $(DIAGNOSTIC_H) \
    errors.h function.h $(TIMEVAR_H) $(TM_H) coretypes.h $(TREE_DUMP_H) \
Index: expr.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/expr.c,v
retrieving revision 1.789
diff -p -u -u -r1.789 expr.c
--- expr.c	3 May 2005 22:21:48 -0000	1.789
+++ expr.c	13 May 2005 16:13:18 -0000
@@ -7806,18 +7806,6 @@ expand_expr_real_1 (tree exp, rtx target
       return expand_divmod (0, code, mode, op0, op1, target, unsignedp);
 
     case RDIV_EXPR:
-      /* Emit a/b as a*(1/b).  Later we may manage CSE the reciprocal saving
-         expensive divide.  If not, combine will rebuild the original
-         computation.  */
-      if (flag_unsafe_math_optimizations && optimize && !optimize_size
-	  && TREE_CODE (type) == REAL_TYPE
-	  && !real_onep (TREE_OPERAND (exp, 0)))
-        return expand_expr (build2 (MULT_EXPR, type, TREE_OPERAND (exp, 0),
-				    build2 (RDIV_EXPR, type,
-					    build_real (type, dconst1),
-					    TREE_OPERAND (exp, 1))),
-			    target, tmode, modifier);
-
       goto binop;
 
     case TRUNC_MOD_EXPR:
Index: fold-const.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/fold-const.c,v
retrieving revision 1.578
diff -p -u -u -r1.578 fold-const.c
--- fold-const.c	11 May 2005 15:21:23 -0000	1.578
+++ fold-const.c	13 May 2005 16:13:17 -0000
@@ -3076,6 +3076,46 @@ distribute_bit_expr (enum tree_code code
   return fold_build2 (TREE_CODE (arg0), type, common,
 		      fold_build2 (code, type, left, right));
 }
+
+/* Knowing that ARG0 and ARG1 are both RDIV_EXPRs, simplify a binary operation
+   with code CODE.  This optimization is unsafe.  */
+static tree
+distribute_real_division (enum tree_code code, tree type, tree arg0, tree arg1)
+{
+  bool mul0 = TREE_CODE (arg0) == MULT_EXPR;
+  bool mul1 = TREE_CODE (arg1) == MULT_EXPR;
+
+  /* (A / C) +- (B / C) -> (A +- B) / C.  */
+  if (mul0 == mul1
+      && operand_equal_p (TREE_OPERAND (arg0, 1),
+		       TREE_OPERAND (arg1, 1), 0))
+    return fold_build2 (mul0 ? MULT_EXPR : RDIV_EXPR, type,
+			fold_build2 (code, type,
+				     TREE_OPERAND (arg0, 0),
+				     TREE_OPERAND (arg1, 0)),
+			TREE_OPERAND (arg0, 1));
+
+  /* (A / C1) +- (A / C2) -> A * (1 / C1 +- 1 / C2).  */
+  if (operand_equal_p (TREE_OPERAND (arg0, 0),
+		       TREE_OPERAND (arg1, 0), 0)
+      && TREE_CODE (TREE_OPERAND (arg0, 1)) == REAL_CST
+      && TREE_CODE (TREE_OPERAND (arg1, 1)) == REAL_CST)
+    {
+      REAL_VALUE_TYPE r0, r1;
+      r0 = TREE_REAL_CST (TREE_OPERAND (arg0, 1));
+      r1 = TREE_REAL_CST (TREE_OPERAND (arg1, 1));
+      if (!mul0)
+	real_arithmetic (&r0, RDIV_EXPR, &dconst1, &r0);
+      if (!mul1)
+        real_arithmetic (&r1, RDIV_EXPR, &dconst1, &r1);
+      real_arithmetic (&r0, code, &r0, &r1);
+      return fold_build2 (MULT_EXPR, type,
+			  TREE_OPERAND (arg0, 0),
+			  build_real (type, r0));
+    }
+
+  return NULL_TREE;
+}
 
 /* Return a BIT_FIELD_REF of type TYPE to refer to BITSIZE bits of INNER
    starting at BITPOS.  The field is unsigned if UNSIGNEDP is nonzero.  */
@@ -7502,6 +7544,12 @@ fold_binary (enum tree_code code, tree t
 				    fold_convert (type, tem));
 	    }
 
+          if (flag_unsafe_math_optimizations
+	      && (TREE_CODE (arg0) == RDIV_EXPR || TREE_CODE (arg0) == MULT_EXPR)
+	      && (TREE_CODE (arg1) == RDIV_EXPR || TREE_CODE (arg1) == MULT_EXPR)
+	      && (tem = distribute_real_division (code, type, arg0, arg1)))
+	    return tem;
+
 	  /* Convert x+x into x*2.0.  */
 	  if (operand_equal_p (arg0, arg1, 0)
 	      && SCALAR_FLOAT_TYPE_P (type))
@@ -7899,6 +7947,12 @@ fold_binary (enum tree_code code, tree t
 	    return fold_convert (type, fold (tem));
 	}
 
+      if (flag_unsafe_math_optimizations
+	  && (TREE_CODE (arg0) == RDIV_EXPR || TREE_CODE (arg0) == MULT_EXPR)
+	  && (TREE_CODE (arg1) == RDIV_EXPR || TREE_CODE (arg1) == MULT_EXPR)
+	  && (tem = distribute_real_division (code, type, arg0, arg1)))
+	return tem;
+
       if (TREE_CODE (arg0) == MULT_EXPR
 	  && TREE_CODE (arg1) == MULT_EXPR
 	  && (!FLOAT_TYPE_P (type) || flag_unsafe_math_optimizations))
Index: tree-optimize.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/tree-optimize.c,v
retrieving revision 2.91
diff -p -u -u -r2.91 tree-optimize.c
--- tree-optimize.c	12 May 2005 19:29:21 -0000	2.91
+++ tree-optimize.c	13 May 2005 16:13:17 -0000
@@ -402,6 +402,7 @@ init_tree_optimization_passes (void)
      we add may_alias right after fold builtins
      which can create arbitrary GIMPLE.  */
   NEXT_PASS (pass_may_alias);
+  NEXT_PASS (pass_cse_reciprocals);
   NEXT_PASS (pass_split_crit_edges);
   NEXT_PASS (pass_pre);
   NEXT_PASS (pass_sink_code);
Index: tree-pass.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/tree-pass.h,v
retrieving revision 2.33
diff -p -u -u -r2.33 tree-pass.h
--- tree-pass.h	21 Apr 2005 13:18:23 -0000	2.33
+++ tree-pass.h	13 May 2005 16:13:17 -0000
@@ -196,6 +196,7 @@ extern struct tree_opt_pass pass_fold_bu
 extern struct tree_opt_pass pass_stdarg;
 extern struct tree_opt_pass pass_early_warn_uninitialized;
 extern struct tree_opt_pass pass_late_warn_uninitialized;
+extern struct tree_opt_pass pass_cse_reciprocals;
 extern struct tree_opt_pass pass_warn_function_return;
 extern struct tree_opt_pass pass_phiopt;
 extern struct tree_opt_pass pass_forwprop;
Index: tree-ssa.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/tree-ssa.c,v
retrieving revision 2.94
diff -p -u -r2.94 tree-ssa.c
--- tree-ssa.c	8 May 2005 15:07:22 -0000	2.94
+++ tree-ssa.c	15 May 2005 08:43:12 -0000
@@ -28,6 +28,7 @@ Boston, MA 02111-1307, USA.  */
 #include "tm_p.h"
 #include "ggc.h"
 #include "langhooks.h"
+#include "real.h"
 #include "hard-reg-set.h"
 #include "basic-block.h"
 #include "output.h"
@@ -1222,4 +1223,137 @@ struct tree_opt_pass pass_late_warn_unin
   0,                                    /* todo_flags_finish */
   0				        /* letter */
 };
-	  
+
+
+/* A mini-pass that tries to CSE reciprocal operations.  These are
+   common in sequences such as this one:
+
+	modulus = sqrt(x*x + y*y + z*z);
+	x = x / modulus;
+	y = y / modulus;
+	z = z / modulus;
+
+   that can be optimized to
+
+	modulus = sqrt(x*x + y*y + z*z);
+        rmodulus = 1.0 / modulus;
+	x = x * rmodulus;
+	y = y * rmodulus;
+	z = z * rmodulus;
+
+   We do this for loop invariant divisors, and within a single basic
+   block in this function.  */
+static bool
+gate_cse_reciprocals (void)
+{
+  return optimize && !optimize_size && flag_unsafe_math_optimizations;
+}
+
+/* Check if DEF's uses include more than one floating-point division,
+   and if so replace them by multiplications with the reciprocal.  If
+   PHI is true, insert the reciprocal calculation before BSI, otherwise
+   insert it after and move BSI to the new statement.
+
+   Does not check the type of DEF, nor that DEF is a GIMPLE register.
+   This is done in the caller for speed, because otherwise this routine
+   would be called for every definition and phi node.  */
+static void
+execute_cse_reciprocals_1 (block_stmt_iterator *bsi, tree def, bool phi)
+{
+  use_operand_p use_p;
+  imm_use_iterator use_iter;
+  tree t, new_stmt, type;
+  int count = 0;
+
+  /* Find uses.  */
+  FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
+    {
+      tree use_stmt = USE_STMT (use_p);
+      if (TREE_CODE (use_stmt) == MODIFY_EXPR
+	  && TREE_CODE (TREE_OPERAND (use_stmt, 1)) == RDIV_EXPR
+	  && TREE_OPERAND (TREE_OPERAND (use_stmt, 1), 1) == def)
+	{
+	  if (++count == 2)
+	    break;
+	}
+    }
+
+  if (count < 2)
+    return;
+
+  /* Make a variable with the replacement and substitute it.  */
+  type = TREE_TYPE (def);
+  t = make_rename_temp (type, "reciptmp");
+  new_stmt = build2 (MODIFY_EXPR, void_type_node, t,
+		     fold_build2 (RDIV_EXPR, type, build_real (type, dconst1),
+				  def));
+
+  if (phi)
+    bsi_insert_before (bsi, new_stmt, BSI_SAME_STMT);
+  else
+    bsi_insert_after (bsi, new_stmt, BSI_NEW_STMT);
+
+  FOR_EACH_IMM_USE_SAFE (use_p, use_iter, def)
+    {
+      tree use_stmt = USE_STMT (use_p);
+      if (use_stmt != new_stmt
+	  && TREE_CODE (use_stmt) == MODIFY_EXPR
+	  && TREE_CODE (TREE_OPERAND (use_stmt, 1)) == RDIV_EXPR
+	  && TREE_OPERAND (TREE_OPERAND (use_stmt, 1), 1) == def)
+	{
+	  TREE_SET_CODE (TREE_OPERAND (use_stmt, 1), MULT_EXPR);
+	  SET_USE (use_p, t);
+	}
+    }
+}
+
+static void
+execute_cse_reciprocals (void)
+{
+  basic_block bb;
+  FOR_EACH_BB (bb)
+    {
+      block_stmt_iterator bsi;
+      tree phi, def;
+      for (bsi = bsi_start (bb);
+	   !bsi_end_p (bsi) && TREE_CODE (bsi_stmt (bsi)) == LABEL_EXPR;
+	   bsi_next (&bsi))
+        ;
+
+      for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
+	{
+	  def = PHI_RESULT (phi);
+	  if (FLOAT_TYPE_P (TREE_TYPE (def))
+	      && is_gimple_reg (def))
+	    execute_cse_reciprocals_1 (&bsi, def, true);
+	}
+
+      for (; !bsi_end_p (bsi); bsi_next (&bsi))
+        {
+	  tree stmt = bsi_stmt (bsi);
+	  if (TREE_CODE (stmt) == MODIFY_EXPR
+	      && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
+	      && FLOAT_TYPE_P (TREE_TYPE (def))
+	      && is_gimple_reg (def))
+	    execute_cse_reciprocals_1 (&bsi, def, false);
+	}
+    }
+}
+
+struct tree_opt_pass pass_cse_reciprocals =
+{
+  "recip",				/* name */
+  gate_cse_reciprocals,			/* gate */
+  execute_cse_reciprocals,		/* execute */
+  NULL,					/* sub */
+  NULL,					/* next */
+  0,					/* static_pass_number */
+  0,					/* tv_id */
+  PROP_ssa,				/* properties_required */
+  0,					/* properties_provided */
+  0,					/* properties_destroyed */
+  0,					/* todo_flags_start */
+  TODO_dump_func | TODO_update_ssa | TODO_verify_ssa
+    | TODO_verify_stmts,                /* todo_flags_finish */
+  0				        /* letter */
+};
Index: doc/passes.texi
===================================================================
RCS file: /cvs/gcc/gcc/gcc/doc/passes.texi,v
retrieving revision 1.53
diff -p -u -r1.53 passes.texi
--- doc/passes.texi	4 May 2005 17:15:30 -0000	1.53
+++ doc/passes.texi	13 May 2005 16:18:39 -0000
@@ -354,7 +354,7 @@ This pass transforms tail recursion into
 
 This pass sinks stores and assignments down the flowgraph closer to it's
 use point.  The pass is located in @file{tree-ssa-sink.c} and is
-described by @code{pass_sink_code}
+described by @code{pass_sink_code}.
 
 @item Partial redundancy elimination
 
@@ -362,6 +362,11 @@ This pass eliminates partially redundant
 performing load motion.  The pass is located in @file{tree-ssa-pre.c}
 and is described by @code{pass_pre}.
 
+Just before partial redundancy elimination, if
+@option{-funsafe-math-optimizations} is on, GCC tries to convert
+divisions to multiplications by the reciprocal.  The pass is located
+in @file{tree-ssa.c} and is described by @code{pass_cse_reciprocal}.
+
 @item Loop optimization
 
 The main driver of the pass is placed in @file{tree-ssa-loop.c}
Index: testsuite/gcc.dg/fold-div-1.c
===================================================================
RCS file: testsuite/gcc.dg/fold-div-1.c
diff -N testsuite/gcc.dg/fold-div-1.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ testsuite/gcc.dg/fold-div-1.c	13 May 2005 16:13:18 -0000
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-funsafe-math-optimizations -fdump-tree-generic" } */
+
+float f(float x)
+{
+  return x/2 + x/3;
+}
+
+float g(float x)
+{
+  return 2/x + 3/x;
+}
+
+float h(float x)
+{
+  return x/2 - x/3;
+}
+
+float i(float x)
+{
+  return 2/x - 3/x;
+}
+
+/* f and h should be turned into multiplications,
+   the divisions in g and i should be grouped together.  */
+
+/* { dg-final { scan-tree-dump-times " \\* " 2 "generic" } } */
+/* { dg-final { scan-tree-dump-times " / " 2 "generic" } } */
+/* { dg-final { cleanup-tree-dump "generic" } } */
+
Index: testsuite/gcc.dg/tree-ssa/recip-1.c
===================================================================
RCS file: testsuite/gcc.dg/tree-ssa/recip-1.c
diff -N testsuite/gcc.dg/tree-ssa/recip-1.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ testsuite/gcc.dg/tree-ssa/recip-1.c	13 May 2005 16:13:18 -0000
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -funsafe-math-optimizations -fdump-tree-recip" } */
+
+float e(float *x, float *y, float *z)
+{
+  float m = __builtin_sqrt (*x * *x + *y * *y + *z * *z);
+  *x /= m;
+  *y /= m;
+  *z /= m;
+}
+
+/* Look for only one division.  */
+/* { dg-final { scan-tree-dump-times "= .* /" 1 "recip" } } */
+/* { dg-final { cleanup-tree-dump "recip" } } */
Index: testsuite/gcc.dg/tree-ssa/recip-2.c
===================================================================
RCS file: testsuite/gcc.dg/tree-ssa/recip-2.c
diff -N testsuite/gcc.dg/tree-ssa/recip-2.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ testsuite/gcc.dg/tree-ssa/recip-2.c	13 May 2005 16:13:18 -0000
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -funsafe-math-optimizations -fdump-tree-recip" } */
+
+float e(float a, float b, float c, float d, float e, float f)
+{
+  if (a < b)
+    {
+      a = a + b;
+      c = c + d;
+    }
+
+  /* The PHI nodes for these divisions should be combined.  */
+  e = e / a;
+  f = f / a;
+  
+  a = a / c;
+  b = b / c;
+
+  return a + b + e + f;
+}
+
+/* { dg-final { scan-tree-dump-times " / " 2 "recip" } } */
+/* { dg-final { cleanup-tree-dump "recip" } } */

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]