This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[patch] Change swdiv evaluation order (x86), remove pass_convert_to_rsqrt


Hi,

two things came up while playing with -mrecip for spec2006 (which will 
give a tremendous speedup for gromacs, like 44%).  First, the evaluation 
order of our approximate division with Newton-Raphson is suboptimal in 
enlarging the error needlessly.  We evaluated in this order:

  a * (rcp(b) * (2.0 - (b * rcp(b))))

(that's of the order O(a*eps)), while it's better to evaluate as

  (a * rcp(b)) * (2.0 - (b * rcp(b)))

(order of O(eps)).  Clearly a*rcp(b) are in the range of the real result 
and 2-b*rcp(b) is a correction factor around 1.0.  So it's preferrable to 
actually compute those two things separately and multiply in the end, 
which also gives better ILP.  The new order is also the one given in the 
optimization manuals.

That fixes some SPEC2006 benchmarks when -mrecip is given (amongst them 
gromacs).

The second thing is pass_convert_to_rsqrt.  That is a pass which tries to 
transform sqrt(a/b) into rsqrt(b/a), on the theory that rsqrt can be 
evaluated quicker.  That theory is correct, but the transformation is 
bogus nevertheless.  Even under -funsafe-math-optimizations we aren't 
allowed to transform a/b into b/a; if a is very small, in particular if it 
might be zero, the results become bogus.  That's independed of if that 
(wrong) immediate result is itself wrapped with a sqrt/rsqrt call.  What 
the transformation will do is transform 0.0 == sqrt(0/x) -> rsqrt(x/0) == 
NaN.  Clearly something for -fnonsense-math-optimizations, not for 
-funsafe.

This bug doesn't hit us often, because it needs TARGET_BUILTIN_RECIPROCAL, 
which only does something on rs6000 and x86, and then only with -mrecip.  
The above case (0/something) wasn't catched by the testcases for this 
transformation because they used ==/!= to check their results, which under 
-ffast-math don't do the right thing for NaNs.  We need to use memcmp, 
like in the added testcase.  The brokeness does happen in 481.wrf with 
-mrecip.

We don't have any value range info during this pass, so the only case we 
could allow this transformation is known constants in the dividend.  I 
weighted this severe limitation and hence even smaller usefulness of the 
transformation (which is a bit doubtful to begin with, as with -mrecip the 
divison itself and the sqrt are already expressed in terms of rcp and 
rsqrt) with the gain of removing another pass and walk over all 
instructions, and chose the latter.

[Just for completeness, we still transform 1./sqrtf(x) into rsqrt(x) under 
-mrecip, this transformation is completely fine under -ffast-math 
actually, because we loose only 2 ulps; we might consider activating this 
not only with -mrecip]

Regstrapping on x86_64-linux in progress, the (few) testcases that use 
-mrecip at all pass and that's the only thing that should be affected.  
Okay for trunk?


Ciao,
Michael.
-- 
	* tree-ssa-math-opts.c (execute_convert_to_rsqrt): Remove.
	(gate_convert_to_rsqrt): Ditto.
	(pass_convert_to_rsqrt): Ditto.
	* tree-pass.h (pass_convert_to_rsqrt): Don't declare.
	* passes.c (init_optimization_passes): Don't add pass_convert_to_rsqrt
	to pass list.

	* config/i386/i386.c (ix86_emit_swdivsf): Change evaluation order.

testsuite/
	* gcc.target/i386/sse-recip.c: Use fabsf instead of != .
	* gcc.target/i386/sse-recip-vec.c: Ditto.
	* gcc.target/i386/brokensqrt.c: New test.

Index: tree-ssa-math-opts.c
===================================================================
--- tree-ssa-math-opts.c	(Revision 153683)
+++ tree-ssa-math-opts.c	(Arbeitskopie)
@@ -791,98 +791,6 @@ struct gimple_opt_pass pass_cse_sincos =
  }
 };
 
-/* Find all expressions in the form of sqrt(a/b) and
-   convert them to rsqrt(b/a).  */
-
-static unsigned int
-execute_convert_to_rsqrt (void)
-{
-  basic_block bb;
-
-  FOR_EACH_BB (bb)
-    {
-      gimple_stmt_iterator gsi;
-
-      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
-        {
-	  gimple stmt = gsi_stmt (gsi);
-	  tree fndecl;
-
-	  if (is_gimple_call (stmt)
-	      && gimple_call_lhs (stmt)
-	      && (fndecl = gimple_call_fndecl (stmt))
-	      && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
-		  || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
-	    {
-	      enum built_in_function code;
-	      bool md_code;
-	      tree arg1;
-	      gimple stmt1;
-
-	      code = DECL_FUNCTION_CODE (fndecl);
-	      md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
-
-	      fndecl = targetm.builtin_reciprocal (code, md_code, true);
-	      if (!fndecl)
-		continue;
-
-	      arg1 = gimple_call_arg (stmt, 0);
-
-	      if (TREE_CODE (arg1) != SSA_NAME)
-		continue;
-
-	      stmt1 = SSA_NAME_DEF_STMT (arg1);
-
-	      if (is_gimple_assign (stmt1)
-		  && gimple_assign_rhs_code (stmt1) == RDIV_EXPR)
-		{
-		  tree arg10, arg11;
-
-		  arg10 = gimple_assign_rhs1 (stmt1);
-		  arg11 = gimple_assign_rhs2 (stmt1);
-
-		  /* Swap operands of RDIV_EXPR.  */
-		  gimple_assign_set_rhs1 (stmt1, arg11);
-		  gimple_assign_set_rhs2 (stmt1, arg10);
-		  fold_stmt_inplace (stmt1);
-		  update_stmt (stmt1);
-
-		  gimple_call_set_fndecl (stmt, fndecl);
-		  update_stmt (stmt);
-		}
-	    }
-	}
-    }
-
-  return 0;
-}
-
-static bool
-gate_convert_to_rsqrt (void)
-{
-  return flag_unsafe_math_optimizations && optimize;
-}
-
-struct gimple_opt_pass pass_convert_to_rsqrt =
-{
- {
-  GIMPLE_PASS,
-  "rsqrt",				/* name */
-  gate_convert_to_rsqrt,		/* gate */
-  execute_convert_to_rsqrt,		/* execute */
-  NULL,					/* sub */
-  NULL,					/* next */
-  0,					/* static_pass_number */
-  TV_NONE,				/* tv_id */
-  PROP_ssa,				/* properties_required */
-  0,					/* properties_provided */
-  0,					/* properties_destroyed */
-  0,					/* todo_flags_start */
-  TODO_dump_func | TODO_update_ssa | TODO_verify_ssa
-    | TODO_verify_stmts                 /* todo_flags_finish */
- }
-};
-
 /* A symbolic number is used to detect byte permutation and selection
    patterns.  Therefore the field N contains an artificial number
    consisting of byte size markers:
Index: tree-pass.h
===================================================================
--- tree-pass.h	(Revision 153683)
+++ tree-pass.h	(Arbeitskopie)
@@ -401,7 +401,6 @@ extern struct gimple_opt_pass pass_early
 extern struct gimple_opt_pass pass_late_warn_uninitialized;
 extern struct gimple_opt_pass pass_cse_reciprocals;
 extern struct gimple_opt_pass pass_cse_sincos;
-extern struct gimple_opt_pass pass_convert_to_rsqrt;
 extern struct gimple_opt_pass pass_optimize_bswap;
 extern struct gimple_opt_pass pass_warn_function_return;
 extern struct gimple_opt_pass pass_warn_function_noreturn;
Index: passes.c
===================================================================
--- passes.c	(Revision 153683)
+++ passes.c	(Arbeitskopie)
@@ -880,7 +880,6 @@ init_optimization_passes (void)
 	  NEXT_PASS (pass_tree_loop_done);
 	}
       NEXT_PASS (pass_cse_reciprocals);
-      NEXT_PASS (pass_convert_to_rsqrt);
       NEXT_PASS (pass_reassoc);
       NEXT_PASS (pass_vrp);
       NEXT_PASS (pass_dominator);
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(Revision 153683)
+++ config/i386/i386.c	(Arbeitskopie)
@@ -28626,18 +28626,18 @@ void ix86_emit_swdivsf (rtx res, rtx a,
   emit_insn (gen_rtx_SET (VOIDmode, x0,
 			  gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
 					  UNSPEC_RCP)));
-  /* e0 = x0 * b */
+  /* e0 = x0 * a */
   emit_insn (gen_rtx_SET (VOIDmode, e0,
-			  gen_rtx_MULT (mode, x0, b)));
-  /* e1 = 2. - e0 */
+			  gen_rtx_MULT (mode, x0, a)));
+  /* e1 = x0 * b */
   emit_insn (gen_rtx_SET (VOIDmode, e1,
-			  gen_rtx_MINUS (mode, two, e0)));
-  /* x1 = x0 * e1 */
+			  gen_rtx_MULT (mode, x0, b)));
+  /* x1 = 2. - e1 */
   emit_insn (gen_rtx_SET (VOIDmode, x1,
-			  gen_rtx_MULT (mode, x0, e1)));
-  /* res = a * x1 */
+			  gen_rtx_MINUS (mode, two, e1)));
+  /* res = e0 * x1 */
   emit_insn (gen_rtx_SET (VOIDmode, res,
-			  gen_rtx_MULT (mode, a, x1)));
+			  gen_rtx_MULT (mode, e0, x1)));
 }
 
 /* Output code to perform a Newton-Rhapson approximation of a
Index: testsuite/gcc.target/i386/sse-recip-vec.c
===================================================================
--- testsuite/gcc.target/i386/sse-recip-vec.c	(Revision 153683)
+++ testsuite/gcc.target/i386/sse-recip-vec.c	(Arbeitskopie)
@@ -4,6 +4,7 @@
 #include "sse-check.h"
 
 extern float sqrtf (float);
+extern float fabsf (float);
 
 #define N 8
 
@@ -26,7 +27,7 @@ sse_test (void)
   /* check results:  */
   for (i = 0; i < N; i++)
     {
-      if (r[i] != rc[i])
+      if (fabsf (r[i] - rc[i]) > 0.0001)
 	abort();
     }   
 }
Index: testsuite/gcc.target/i386/sse-recip.c
===================================================================
--- testsuite/gcc.target/i386/sse-recip.c	(Revision 153683)
+++ testsuite/gcc.target/i386/sse-recip.c	(Arbeitskopie)
@@ -4,6 +4,7 @@
 #include "sse-check.h"
 
 extern float sqrtf (float);
+extern float fabsf (float);
 
 #define N 8
 
@@ -26,7 +27,7 @@ sse_test (void)
   /* check results:  */
   for (i = 0; i < N; i++)
     {
-      if (r[i] != rc[i])
+      if (fabsf (r[i] - rc[i]) > 0.0001)
 	abort();
     }   
 }
Index: testsuite/gcc.target/i386/brokensqrt.c
===================================================================
--- testsuite/gcc.target/i386/brokensqrt.c	(Revision 0)
+++ testsuite/gcc.target/i386/brokensqrt.c	(Revision 0)
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -msse -mfpmath=sse -mrecip" } */
+#include "sse-check.h"
+
+extern float sqrtf (float);
+float __attribute__((noinline)) broken (float a, float b)
+{
+  return sqrtf (a / b);
+}
+
+extern void abort (void);
+extern void *memcpy (void *, const void *, __SIZE_TYPE__);
+static void
+sse_test (void)
+{
+  int i;
+  float x;
+  char buf[sizeof (float)];
+  x = broken (0.0f, 10000.0f);
+  /* A convoluted way to check for the correct result (zero) for all
+     floating point formats.
+     We can't use ==, !=, or range checks, or isinf/isnan/isunordered,
+     because all of these will not do the right thing under -ffast-math,
+     as they can assume that neither nan nor inf are returned.  */
+  memcpy (&buf, &x, sizeof (float));
+  for (i = 0; i < sizeof (float); i++)
+    if (buf[i] != 0)
+      abort ();
+}


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]