[PATCH/RFC], PR 42694/middle-end, make pow faster for some constant exponents

Tue Jan 26 04:10:00 GMT 2010

I would like to get some comments on this patch that I plan to submit to GCC
4.6 when it is opened up.

In looking at the various spec 2006 runs, I noticed one bencmark (bwaves) was
spending a lot of time in the pow function.  I looked at the source, and saw
that all of the pow's came from the use of x**0.75 in the Fortran source (x**2
having already been optimized out by the compiler).

This patch changes the compiler so that under -ffast-math:

     pow (x, 0.25)	replaced by sqrt (sqrt (x))
     pow (x, 0.125)	replaced by sqrt (sqrt (sqrt (x)))
     pow (x, 0.75)	replaced by sqrt (x) * sqrt (sqrt (x))
     pow (x, 1./6.)	replaced by cbrt (sqrt (x))

And have the compiler stop replacing sqrt (sqrt (x)), sqrt (cbrt (x)), and
cbrt (sqrt (x)) with equivalent pow calls.

I added switches to control this, but I could just as easily go with --param or
target hooks.

I looked at the differences between what the compiler is doing now, and with
the optimization, and I feel they are not out of bounds (in the range of what
the current -ffast-math optimizations does already), but I would welcome
comments of how many sqrts/cbrts should be generated before the error gets out
of hand.

In general, on x86 and power systems, sqrt is an order of magntude faster than
pow, and cbrt is about 1.5 times faster.

The bug report is here:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42694

I'm also enclosing my current test program for the speedup.

-- 
Michael Meissner, IBM
4 Technology Place Drive, MS 2203A, Westford, MA, 01886, USA
meissner@linux.vnet.ibm.com
-------------- next part --------------
[gcc]
2010-01-22  Michael Meissner  <meissner@linux.vnet.ibm.com>

	PR 42694/middle-end
	* common.opt (-fpow-max-sqrt): New switches to control number of
	sqrts and cbrts are generated for pow expansions.
	(-fpow-max-sqrt-after-cbrt): Ditto.
	(-fpow-max-cbrt): Ditto.

	* builtins.c (expand_builtin_pow_root): New function to expand pow
	with a constant exponent into a series of sqrt and cbrt calls.
	(expand_builtin_pow): Call it.

[gcc/testsuite]
2010-01-22  Michael Meissner  <meissner@linux.vnet.ibm.com>

	* gcc.dg/builtins-58.c: Add -fno-ident to the options to prevent
	the string 'pow' in the pathname of the compiler from generating a
	test failure.

Index: gcc/common.opt
===================================================================

--- gcc/common.opt	(.../svn+ssh://meissner@gcc.gnu.org/svn/gcc/trunk/gcc/common.opt)	(revision 156025)
+++ gcc/common.opt	(.../gcc/common.opt)	(working copy)
@@ -862,6 +862,22 @@ fpost-ipa-mem-report
 Common Report Var(post_ipa_mem_report)
 Report on memory allocation before interprocedural optimization
 
+fpow-max-sqrt=
+Common RejectNegative Var(flag_pow_max_sqrt) Joined UInteger Init(3)
+Use at most <number> sqrt calls to replace the builtin pow function if the
+-ffast-math or -funsafe-math-optimizations switches are used.
+
+fpow-max-sqrt-after-cbrt=
+Common RejectNegative Var(flag_pow_max_sqrt_after_cbrt) Joined UInteger Init(1)
+Use at most <number> sqrt calls in conjunction with cbrt calls to replace the
+builtin pow function if the -ffast-math or -funsafe-math-optimizations switches
+are used.
+
+fpow-max-cbrt=
+Common RejectNegative Var(flag_pow_max_cbrt) Joined UInteger Init(1)
+Use at most <number> cbrt calls to replace the builtin pow function if the
+-ffast-math or -funsafe-math-optimizations switches are used.
+
 fpack-struct
 Common Report Var(flag_pack_struct) Optimization
 Pack structure members together without holes
Index: gcc/builtins.c
===================================================================
--- gcc/builtins.c	(.../svn+ssh://meissner@gcc.gnu.org/svn/gcc/trunk/gcc/builtins.c)	(revision 156025)
+++ gcc/builtins.c	(.../gcc/builtins.c)	(working copy)
@@ -2910,6 +2910,118 @@ expand_powi (rtx x, enum machine_mode mo
   return result;
 }
 
+/* Fold a builtin function call to pow, powf, or powl into a series of sqrts or
+   cbrts.  Return NULL_RTX if no simplification can be made or expand the tree
+   if we can simplify it.  */
+static rtx
+expand_builtin_pow_root (location_t loc, tree arg0, tree arg1, tree type,
+			 rtx subtarget)
+{
+  if (TREE_CODE (arg1) == REAL_CST
+      && !TREE_OVERFLOW (arg1)
+      && flag_unsafe_math_optimizations)
+    {
+      enum machine_mode mode = TYPE_MODE (type);
+      tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
+      tree cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
+      REAL_VALUE_TYPE c = TREE_REAL_CST (arg1);
+      REAL_VALUE_TYPE dconst34, dconstexp;
+      int num, num2, i;
+
+      if (sqrtfn)
+	{
+	  /* Check whether we can do a series of sqrt's for recprocal powers of
+	     two (0.5, 0.25, 0.125, etc.) until the limit of the number of
+	     sqrts is reached.  */
+	  dconstexp = dconsthalf;
+	  for (num = 1; num <= flag_pow_max_sqrt; num++)
+	    {
+	      if (REAL_VALUES_EQUAL (c, dconstexp))
+		{
+		  tree op = build_call_nofold_loc (loc, sqrtfn, 1, arg0);
+
+		  for (i = 1; i < num; i++)
+		    op = build_call_nofold_loc (loc, sqrtfn, 1, op);
+
+		  return expand_expr (op, subtarget, mode, EXPAND_NORMAL);
+		}
+
+	      SET_REAL_EXP (&dconstexp, REAL_EXP (&dconstexp) - 1);
+	    }
+
+	  /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)).  */
+	  if (flag_pow_max_sqrt >= 2 && !TREE_SIDE_EFFECTS (arg0))
+	    {
+	      real_from_integer (&dconst34, VOIDmode, 3, 0, 0);
+	      SET_REAL_EXP (&dconst34, REAL_EXP (&dconst34) - 2);
+
+	      if (REAL_VALUES_EQUAL (c, dconst34))
+		{
+		  tree sqrt1 = build_call_expr_loc (loc, sqrtfn, 1, arg0);
+		  tree sqrt2 = builtin_save_expr (sqrt1);
+		  tree sqrt3 = build_call_expr_loc (loc, sqrtfn, 1, sqrt1);
+		  tree op = fold_build2_loc (loc, MULT_EXPR, type, sqrt2,
+					     sqrt3);
+		  return expand_expr (op, subtarget, mode, EXPAND_NORMAL);
+		}
+	    }
+
+	  /* Check whether we can do a series of cbrt/sqrts instead of pow,
+	     until the limit of cbrts/sqrts is reached.  */
+	  if (cbrtfn
+	      && flag_pow_max_cbrt > 0
+	      && (tree_expr_nonnegative_p (arg0)
+		  || !HONOR_NANS (TYPE_MODE (type))))
+	    {
+	      dconstexp = dconst_third ();
+
+	      for (num = 1; num <= flag_pow_max_cbrt; num++)
+		{
+		  /* First try 1/(3**n).  */
+		  REAL_VALUE_TYPE dconstexp_trunc
+		    = real_value_truncate (TYPE_MODE (type), dconstexp);
+
+		  if (REAL_VALUES_EQUAL (c, dconstexp_trunc))
+		    {
+		      tree op = arg0;
+
+		      for (i = 0; i < num; i++)
+			op = build_call_nofold_loc (loc, cbrtfn, 1, op);
+
+		      return expand_expr (op, subtarget, mode, EXPAND_NORMAL);
+		    }
+
+		  /* Now try using sqrts as well to handle 1/6. etc.  */
+		  for (num2 = 1; num2 <= flag_pow_max_sqrt_after_cbrt; num2++)
+		    {
+		      SET_REAL_EXP (&dconstexp_trunc,
+				    REAL_EXP (&dconstexp_trunc) - 1);
+
+		      if (REAL_VALUES_EQUAL (c, dconstexp_trunc))
+			{
+			  tree op = arg0;
+
+			  for (i = 0; i < num2; i++)
+			    op = build_call_nofold_loc (loc, sqrtfn, 1, op);
+
+			  for (i = 0; i < num; i++)
+			    op = build_call_nofold_loc (loc, cbrtfn, 1, op);
+
+			  return expand_expr (op, subtarget, mode,
+					      EXPAND_NORMAL);
+			}
+		    }
+
+		  real_arithmetic (&dconstexp, MULT_EXPR, dconst_third_ptr (),
+				   &dconstexp);
+		}
+	    }
+	}
+    }
+
+  return NULL_RTX;
+}
+
 /* Expand a call to the pow built-in mathematical function.  Return NULL_RTX if
    a normal call should be emitted rather than expanding the function
    in-line.  EXP is the expression that is a call to the builtin
@@ -2995,6 +3107,13 @@ expand_builtin_pow (tree exp, rtx target
 	}
     }
 
+  /* Check whether we can do a series of sqrt or cbrt's instead of the pow
+     call.  */
+  op = expand_builtin_pow_root (EXPR_LOCATION (exp), arg0, arg1, type,
+				subtarget);
+  if (op)
+    return op;
+
   /* Try if the exponent is a third of an integer.  In this case
      we can expand to x**(n/3) * cbrt(x)**(n%3).  As cbrt (x) is
      different from pow (x, 1./3.) due to rounding and behavior
Index: gcc/testsuite/gcc.dg/builtins-58.c
===================================================================
--- gcc/testsuite/gcc.dg/builtins-58.c	(.../svn+ssh://meissner@gcc.gnu.org/svn/gcc/trunk/gcc/testsuite/gcc.dg/builtins-58.c)	(revision 156025)
+++ gcc/testsuite/gcc.dg/builtins-58.c	(.../gcc/testsuite/gcc.dg/builtins-58.c)	(working copy)
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O -ffast-math -std=c99" } */
+/* { dg-options "-O -ffast-math -std=c99 -fno-ident" } */
 
 #include "builtins-config.h"
 
-------------- next part --------------
A non-text attachment was scrubbed...
Name: test-pow.tar.gz
Type: application/x-gzip
Size: 7782 bytes
Desc: not available
URL: <http://gcc.gnu.org/pipermail/gcc-patches/attachments/20100126/5062e746/attachment.bin>