This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

Re: [PATCH] Optimise sqrt reciprocal multiplications

From: Kyrill Tkachov <kyrylo dot tkachov at foss dot arm dot com>
To: Richard Biener <rguenther at suse dot de>
Cc: "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>, richard dot sandiford at arm dot com
Date: Wed, 05 Sep 2018 12:38:35 +0100
Subject: Re: [PATCH] Optimise sqrt reciprocal multiplications
References: <5B7C371C.8040207@foss.arm.com> <87bm9ttmq5.fsf@arm.com> <5B7EEA31.6000908@foss.arm.com> <5B87B55F.3020707@foss.arm.com> <alpine.LSU.2.20.1808311238020.16707@zhemvz.fhfr.qr> <5B8E792A.6050205@foss.arm.com> <alpine.LSU.2.20.1809041622290.16707@zhemvz.fhfr.qr> <5B8EB85A.6000705@foss.arm.com>

On 04/09/18 17:52, Kyrill Tkachov wrote:
>
> On 04/09/18 15:31, Richard Biener wrote:
>> On Tue, 4 Sep 2018, Kyrill Tkachov wrote:
>>> Hi Richard,
>>>
>>> On 31/08/18 12:07, Richard Biener wrote:
>>>> On Thu, 30 Aug 2018, Kyrill Tkachov wrote:
>>>>> Ping.
>>>>>
>>>>> https://gcc.gnu.org/ml/gcc-patches/2018-08/msg01496.html
>>>>>
>>>>> Thanks,
>>>>> Kyrill
>>>>>
>>>>> On 23/08/18 18:09, Kyrill Tkachov wrote:
>>>>>> Hi Richard,
>>>>>>
>>>>>> On 23/08/18 11:13, Richard Sandiford wrote:
>>>>>>> Kyrill  Tkachov <kyrylo.tkachov@foss.arm.com> writes:
>>>>>>>> Hi all,
>>>>>>>>
>>>>>>>> This patch aims to optimise sequences involving uses of 1.0 / sqrt (a)
>>>>>>>> under -freciprocal-math and -funsafe-math-optimizations.
>>>>>>>> In particular consider:
>>>>>>>>
>>>>>>>> x = 1.0 / sqrt (a);
>>>>>>>> r1 = x * x;  // same as 1.0 / a
>>>>>>>> r2 = a * x; // same as sqrt (a)
>>>>>>>>
>>>>>>>> If x, r1 and r2 are all used further on in the code, this can be
>>>>>>>> transformed into:
>>>>>>>> tmp1 = 1.0 / a
>>>>>>>> tmp2 = sqrt (a)
>>>>>>>> tmp3 = tmp1 * tmp2
>>>>>>>> x = tmp3
>>>>>>>> r1 = tmp1
>>>>>>>> r2 = tmp2
>>>>>>> Nice optimisation :-)  Someone who knows the pass better should review,
>>>>>>> but:
>>>>>> Thanks for the review.
>>>>>>> There seems to be an implicit assumption that this is a win even
>>>>>>> when the r1 and r2 assignments are only conditionally executed.
>>>>>>> That's probably true, but it might be worth saying explicitly.
>>>>>> I'll admit I had not considered that case.
>>>>>> I think it won't make a difference in practice, as the really expensive
>>>>>> operations here
>>>>>> are the sqrt and the division and they are on the executed path in either
>>>>>> case and them
>>>>>> becoming independent should be a benefit of its own.
>>>>>>>> +/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
>>>>>>>> +
>>>>>>>> +static inline bool
>>>>>>>> +is_mult_by (gimple *use_stmt, tree def, tree a)
>>>>>>>> +{
>>>>>>>> +  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
>>>>>>>> +      && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
>>>>>>>> +    {
>>>>>>>> +      tree op0 = gimple_assign_rhs1 (use_stmt);
>>>>>>>> +      tree op1 = gimple_assign_rhs2 (use_stmt);
>>>>>>>> +
>>>>>>>> +      return (op0 == def && op1 == a)
>>>>>>>> +          || (op0 == a && op1 == def);
>>>>>>>> +    }
>>>>>>>> +  return 0;
>>>>>>>> +}
>>>>>>> Seems like is_square_of could now be a light-weight wrapper around this.
>>>>>> Indeed, I've done the wrapping now.
>>>>>>>> @@ -652,6 +669,180 @@ execute_cse_reciprocals_1 (gimple_stmt_iterator
>>>>>>>> *def_gsi, tree def)
>>>>>>>>      occ_head = NULL;
>>>>>>>>    }
>>>>>>>>    +/* Transform sequences like
>>>>>>>> +   x = 1.0 / sqrt (a);
>>>>>>>> +   r1 = x * x;
>>>>>>>> +   r2 = a * x;
>>>>>>>> +   into:
>>>>>>>> +   tmp1 = 1.0 / a;
>>>>>>>> +   tmp2 = sqrt (a);
>>>>>>>> +   tmp3 = tmp1 * tmp2;
>>>>>>>> +   x = tmp3;
>>>>>>>> +   r1 = tmp1;
>>>>>>>> +   r2 = tmp2;
>>>>>>>> +   depending on the uses of x, r1, r2.  This removes one multiplication
>>>>>>>> and
>>>>>>>> +   allows the sqrt and division operations to execute in parallel.
>>>>>>>> +   DEF_GSI is the gsi of the initial division by sqrt that defines
>>>>>>>> +   DEF (x in the example abovs). */
>>>>>>>> +
>>>>>>>> +static void
>>>>>>>> +optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
>>>>>>>> +{
>>>>>>>> +  use_operand_p use_p;
>>>>>>>> +  imm_use_iterator use_iter;
>>>>>>>> +  gimple *stmt = gsi_stmt (*def_gsi);
>>>>>>>> +  tree x = def;
>>>>>>>> +  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
>>>>>>>> +  tree div_rhs1 = gimple_assign_rhs1 (stmt);
>>>>>>>> +
>>>>>>>> +  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
>>>>>>>> +      || TREE_CODE (div_rhs1) != REAL_CST
>>>>>>>> +      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
>>>>>>>> +    return;
>>>>>>>> +
>>>>>>>> +  gimple *sqrt_stmt = SSA_NAME_DEF_STMT (orig_sqrt_ssa_name);
>>>>>>>> +  if (!is_gimple_call (sqrt_stmt)
>>>>>>>> +      || !gimple_call_lhs (sqrt_stmt))
>>>>>>>> +    return;
>>>>>>>> +
>>>>>>>> +  gcall *call = as_a <gcall *> (sqrt_stmt);
>>>>>>> Very minor, but:
>>>>>>>
>>>>>>>     gcall *sqrt_stmt
>>>>>>>       = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
>>>>>>>     if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
>>>>>>>       return;
>>>>>>>
>>>>>>> would avoid the need for the separate as_a<>, and would mean that
>>>>>>> we only call gimple_call_* on gcalls.
>>>>>> Ok.
>>>>>>>> +  if (has_other_use)
>>>>>>>> +    {
>>>>>>>> +      /* Using the two temporaries tmp1, tmp2 from above
>>>>>>>> +     the original x is now:
>>>>>>>> +     x = tmp1 * tmp2.  */
>>>>>>>> +      gcc_assert (mult_ssa_name);
>>>>>>>> +      gcc_assert (sqr_ssa_name);
>>>>>>>> +      gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
>>>>>>>> +
>>>>>>>> +      tree new_ssa_name
>>>>>>>> +    = make_temp_ssa_name (TREE_TYPE (a), NULL,
>>>>>>>> "recip_sqrt_transformed");
>>>>>>>> +      gimple *new_stmt
>>>>>>>> +    = gimple_build_assign (new_ssa_name, MULT_EXPR,
>>>>>>>> +                   mult_ssa_name, sqr_ssa_name);
>>>>>>>> +      gsi_insert_before (&gsi2, new_stmt, GSI_SAME_STMT);
>>>>>>>> +      gcc_assert (gsi_stmt (gsi2) == stmt);
>>>>>>>> + gimple_assign_set_rhs_from_tree (&gsi2, new_ssa_name);
>>>>>>>> +      fold_stmt (&gsi2);
>>>>>>>> +      update_stmt (stmt);
>>>>>>> In this case we're replacing the statement in its original position,
>>>>>>> so there's no real need to use a temporary.  It seems better to
>>>>>>> change the rhs_code, rhs1 and rhs2 of stmt in-place, with the same
>>>>>>> lhs as before.
>>>>>> Yes, that's cleaner.
>>>>>>>> @@ -762,6 +953,23 @@ pass_cse_reciprocals::execute (function *fun)
>>>>>>>>          if (optimize_bb_for_size_p (bb))
>>>>>>>>            continue;
>>>>>>> Seems unnecessary to skip the new optimisation when optimising for size.
>>>>>>> Like you say, it saves a multiplication overall. Also:
>>>>>> Indeed.
>>>>>>>> +      if (flag_unsafe_math_optimizations)
>>>>>>>> +    {
>>>>>>>> +      for (gimple_stmt_iterator gsi = gsi_after_labels (bb);
>>>>>>>> +           !gsi_end_p (gsi);
>>>>>>>> +           gsi_next (&gsi))
>>>>>>>> +        {
>>>>>>>> +          gimple *stmt = gsi_stmt (gsi);
>>>>>>>> +
>>>>>>>> +          if (gimple_has_lhs (stmt)
>>>>>>>> +          && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
>>>>>>>> +          && FLOAT_TYPE_P (TREE_TYPE (def))
>>>>>>>> +          && TREE_CODE (def) == SSA_NAME
>>>>>>>> +          && is_gimple_assign (stmt)
>>>>>>>> +          && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
>>>>>>>> +        optimize_recip_sqrt (&gsi, def);
>>>>>>>> +        }
>>>>>>>> +    }
>>>>>>> It looks like this could safely be done in one of the existing walks
>>>>>>> (e.g. the execute_cse_reciprocals_1 one, if we do this when optimising
>>>>>>> for size).
>>>>>> You're right. I've moved this into one of the walks above this.
>>>>>>
>>>>>> Bootstrapped and tested on aarch64-none-linux-gnu and
>>>>>> x86_64-unknown-linux-gnu.
>>>>>> CC'ing richi as he's reviewed these kinds of patches in the past.
>>>>>>
>>>>>> Is this ok for trunk?
>>>> I wonder how it interacts with execute_cse_reciprocals_1 given it
>>>> introduces a division 1.0 / a which will be not processed by
>>>> execute_cse_reciprocals_1 given that operates by walking uses of 'a'.
>>>> That may be just a missed optimization of course.
>>> Hmm, I believe right now it doesn't interact with execute_cse_reciprocals_1 as
>>> it's either one or the other.
>>> I've left it as it is for now, but would you like us to call
>>> execute_cse_reciprocals_1 on the potentially transformed division?
>> I think that wouldn't "fix" it given execute_cse_reciprocals_1 works by
>> seeing all reciprocal uses of 'a' so it may have alrady seen two and
>> you introduced the third.  But we can leave "solving" this for
>> future enhacements (I'd avoid doing two passes over the IL just because
>> of said possibility right now).
>>>> +  if (has_other_use)
>>>> +    {
>>>> +      /* Using the two temporaries tmp1, tmp2 from above
>>>> +        the original x is now:
>>>> +        x = tmp1 * tmp2.  */
>>>> +      gcc_assert (mult_ssa_name);
>>>> +      gcc_assert (sqr_ssa_name);
>>>> +
>>>> +      gimple_assign_set_rhs1 (stmt, mult_ssa_name);
>>>> +      gimple_assign_set_rhs2 (stmt, sqr_ssa_name);
>>>> +      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
>>>> +      fold_stmt_inplace (def_gsi);
>>>> +      update_stmt (stmt);
>>>> +    }
>>>>
>>>> so you are leaving the original stmt in place unchanged even if it is
>>>> not used?  Why?  Note that with -fno-call-exceptions this stmt may
>>>> throw, so you should arrange to code-generate 1./a in place of the
>>>> original division to preserve EH behavior.  Watch out because then
>>>> with other uses you have to find a place to insert its computation.
>>> Ok. These are oversights on my part. I've updated the patch to modify the
>>> original division in place. The multiplication is placed after it.
>> If the division throws internally you can't insert after it, you
>> have to insert on the single non-EH edge outgoing from the BB
>> with the division.  Just try a C++ testcase with -fnon-call-exceptions
>> and a try {} catch(...) {} around.
>>
>> Not sure if it's worth to handle so bailing out if
>> stmt_can_throw_internal (stmt) would be an option as well.
>>>> +      if (is_square_of (stmt2, x))
>>>> +       {
>>>> +         if (!sqr_stmts.contains (stmt2))
>>>> +           sqr_stmts.safe_push (stmt2);
>>>> +       }
>>>>
>>>> this is quadratic in the number of square stmts... please consider
>>>> making sqr_stmts a bitmap of SSA defs (so the stmt you have now
>>>> is then SSA_NAME_DEF_STMT (ssa_name (bitmap-element))).
>>> Done. In practice I didn't see there being more than one such use, I expect
>>> them
>>> to be CSE'd, but maybe if they're in different basic blocks...
>> Oh, it just occured to me you could use FOR_EACH_IMM_USE_STMT ()
>> which will only get you each stmt once... ;)  Sorry for the misleading
>> bitmap suggestion.
>>>> You do not seem to restrict placement of the use stmts but insert
>>>> before the 1/sqrt(a) stmt.  That possibly hoists the multiplications
>>>> where they are not needed.  Consider
>>>>
>>>>    x = 1./sqrt(a);
>>>>    if (l)
>>>>      l1 = x * 3.;
>>>>    else if (l2)
>>>>      l1 = x * x;
>>>>    else if (l3)
>>>>      l1 = a * x;
>>>>
>>>> or similar where on the path to x * 3. you now perform two extra
>>>> multiplications.
>>> Ok, I've restricted the optimisation somewhat.
>>> Now if there's an other use it won't perform the transformation unless there
>>> is already
>>> a multiplication present on the main path. That way it won't introduce
>>> multiplications
>>> on paths where there aren't any.
>> OK.
>>>> Your testcases do not cover the case of other uses at all.  Or of
>>>> EH.
>>> gcc.dg/recip_sqrt_mult_1.c should handle the other uses case as tmp is a
>>> global
>>> being written to. I've added more testcases involving uses in different basic
>>> blocks
>>> and a g++.dg testcase with -fnon-call-exceptions.
>>> I'm not sure what functionality to test though apart from that it doesn't ICE
>>> and does
>>> the transformations I expect.
>> That's good enough I guess.  I see you do have a testcase with
>> try/catch so I wonder why foo4 doesn't ICE when you insert after
>> the throwing stmt...  maybe it doesn't throw?  Ah - they probably
>> fail your new mult_on_main_path test?
>
> Yeah. I did manage to reproduce an ICE eventually by removing that check
> and enabling -ftrapping-math.
>
> I think I'll just not enter this at all if stmt_can_throw_internal (stmt) like you suggested.


And here is the version that does that. It also reverts to using a vector for the sqr_stmts for consistency
and uses FOR_EACH_IMM_USE_STMT to iterate over the use statements.

Bootstrapped and tested on aarch64-none-linux-gnu.

Ok?

Thanks,
Kyrill

2018-09-03  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

    * tree-ssa-math-opts.c (is_mult_by): New function.
    (is_square_of): Use the above.
    (optimize_recip_sqrt): New function.
    (pass_cse_reciprocals::execute): Use the above.

2018-09-03  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

    * gcc.dg/recip_sqrt_mult_1.c: New test.
    * gcc.dg/recip_sqrt_mult_2.c: Likewise.
    * gcc.dg/recip_sqrt_mult_3.c: Likewise.
    * gcc.dg/recip_sqrt_mult_4.c: Likewise.
    * gcc.dg/recip_sqrt_mult_5.c: Likewise.
    * g++.dg/recip_sqrt_mult_1.C: Likewise.
    * g++.dg/recip_sqrt_mult_2.C: Likewise.

diff --git a/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C b/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C
new file mode 100644
index 0000000000000000000000000000000000000000..11d9c6f758f1529d8ed4cadf85010f6ce379c195
--- /dev/null
+++ b/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C
@@ -0,0 +1,49 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fnon-call-exceptions -fdump-tree-recip" } */
+
+double res, res2, tmp;
+void
+foo1 (double a, double b)
+{
+  try {
+    tmp = 1.0 / __builtin_sqrt (a);
+    res = tmp * tmp;
+    res2 = a * tmp;
+  }
+  catch (...)
+    { ; }
+}
+
+void
+foo4 (double a, double b, int c, int d)
+{
+  try {
+    tmp = 1.0 / __builtin_sqrt (a);
+  }
+  catch (...)
+    {
+      if (c)
+	res = tmp * tmp;
+
+      if (d)
+	res2 = a * tmp;
+    }
+}
+
+void
+foo5 (double a, double b, int c, int d)
+{
+  try {
+    tmp = 1.0 / __builtin_sqrt (a);
+    res = tmp * tmp;
+
+    if (d)
+      res2 = a * tmp;
+  }
+  catch (...)
+    { ; }
+}
+
+/* { dg-final { scan-tree-dump-times "Optimizing reciprocal sqrt multiplications" 2 "recip" } } */
+/* { dg-final { scan-tree-dump-times "Replacing squaring multiplication" 2 "recip" } } */
+/* { dg-final { scan-tree-dump-times "Replacing original division" 2 "recip" } } */
diff --git a/gcc/testsuite/g++.dg/recip_sqrt_mult_2.C b/gcc/testsuite/g++.dg/recip_sqrt_mult_2.C
new file mode 100644
index 0000000000000000000000000000000000000000..cca12caf4d79bfa69933d9b8fce41f38bb5b7a19
--- /dev/null
+++ b/gcc/testsuite/g++.dg/recip_sqrt_mult_2.C
@@ -0,0 +1,49 @@
+/* { dg-do compile } */
+/* { dg-options "-w -Ofast -fnon-call-exceptions -ftrapping-math -fdump-tree-recip" } */
+
+/* Check that the recip_sqrt optimization does not trigger here, causing an
+   ICE due to EH info.  */
+
+
+double res, res2, tmp;
+void
+foo1 (double a, double b)
+{
+  try {
+    tmp = 1.0 / __builtin_sqrt (a);
+    res = tmp * tmp;
+    res2 = a * tmp;
+  }
+  catch (...)
+    { ; }
+}
+
+void
+foo4 (double a, double b, int c, int d)
+{
+  try {
+    tmp = 1.0 / __builtin_sqrt (a);
+  }
+  catch (...)
+    {
+      if (c)
+	res = tmp * tmp;
+
+      if (d)
+	res2 = a * tmp;
+    }
+}
+
+void
+foo5 (double a, double b, int c, int d)
+{
+  try {
+    tmp = 1.0 / __builtin_sqrt (a);
+    res = tmp * tmp;
+
+    if (d)
+      res2 = a * tmp;
+  }
+  catch (...)
+    { ; }
+}
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..188390a4ecffca1b21f05c4f19abecfb7ebd188f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-recip" } */
+
+double res, res2, tmp;
+void
+foo (double a, double b)
+{
+  tmp = 1.0 / __builtin_sqrt (a);
+  res = tmp * tmp;
+  res2 = a * tmp;
+}
+
+/* { dg-final { scan-tree-dump "Optimizing reciprocal sqrt multiplications" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing squaring multiplication" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing original division" "recip" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..c5fc3de7b657b1769e76254b4bc874e0595e43ef
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-optimized" } */
+
+float
+foo (float a)
+{
+  float tmp = 1.0f / __builtin_sqrtf (a);
+  return a * tmp;
+}
+
+/* { dg-final { scan-tree-dump-not " / " "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..e7d185ba7e22cfc7cca72296d5ccc544f24fdb14
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-optimized" } */
+
+double
+foo (double a)
+{
+  double tmp = 1.0f / __builtin_sqrt (a);
+  return tmp * tmp;
+}
+
+/* { dg-final { scan-tree-dump-not "__builtin_sqrt" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..e3005f2feb6f4bacbb6eafc0155e196cb866fcdf
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-recip" } */
+
+/* The main path doesn't have any multiplications.
+   Avoid introducing them in the recip pass.  */
+
+double res, res2, tmp;
+void
+foo (double a, double b, int c, int d)
+{
+  tmp = 1.0 / __builtin_sqrt (a);
+  if (c)
+    res = tmp * tmp;
+
+  if (d)
+    res2 = a * tmp;
+}
+
+/* { dg-final { scan-tree-dump-not "Optimizing reciprocal sqrt multiplications" "recip" } } */
+/* { dg-final { scan-tree-dump-not "Replacing squaring multiplication" "recip" } } */
+/* { dg-final { scan-tree-dump-not "Replacing original division" "recip" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c
new file mode 100644
index 0000000000000000000000000000000000000000..e871f0fcd4feb1687f9815e4babf4d0667a15ea8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-recip" } */
+
+/* We want to do the recip_sqrt transformations here there is already
+   a multiplication on the main path.  */
+
+double res, res2, tmp;
+void
+foo (double a, double b, int c, int d)
+{
+  tmp = 1.0 / __builtin_sqrt (a);
+  res = tmp * tmp;
+
+  if (d)
+    res2 = a * tmp;
+}
+
+/* { dg-final { scan-tree-dump "Optimizing reciprocal sqrt multiplications" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing squaring multiplication" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing original division" "recip" } } */
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index 25378da6f4ab27dbce51e10461efc18cc416a4d7..19bff5c3c3715f3e194e1c091ae01f6c4d6d2ce8 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -337,9 +337,9 @@ is_division_by (gimple *use_stmt, tree def)
 	 && gimple_assign_rhs1 (use_stmt) != def;
 }
 
-/* Return whether USE_STMT is DEF * DEF.  */
+/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
 static inline bool
-is_square_of (gimple *use_stmt, tree def)
+is_mult_by (gimple *use_stmt, tree def, tree a)
 {
   if (gimple_code (use_stmt) == GIMPLE_ASSIGN
       && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
@@ -347,11 +347,19 @@ is_square_of (gimple *use_stmt, tree def)
       tree op0 = gimple_assign_rhs1 (use_stmt);
       tree op1 = gimple_assign_rhs2 (use_stmt);
 
-      return op0 == op1 && op0 == def;
+      return (op0 == def && op1 == a)
+	      || (op0 == a && op1 == def);
     }
   return 0;
 }
 
+/* Return whether USE_STMT is DEF * DEF.  */
+static inline bool
+is_square_of (gimple *use_stmt, tree def)
+{
+  return is_mult_by (use_stmt, def, def);
+}
+
 /* Return whether USE_STMT is a floating-point division by
    DEF * DEF.  */
 static inline bool
@@ -526,6 +534,188 @@ free_bb (struct occurrence *occ)
     }
 }
 
+/* Transform sequences like
+   t = sqrt (a)
+   x = 1.0 / t;
+   r1 = x * x;
+   r2 = a * x;
+   into:
+   t = sqrt (a)
+   r1 = 1.0 / a;
+   r2 = t;
+   x = r1 * r2;
+   depending on the uses of x, r1, r2.  This removes one multiplication and
+   allows the sqrt and division operations to execute in parallel.
+   DEF_GSI is the gsi of the initial division by sqrt that defines
+   DEF (x in the example abovs).  */
+
+static void
+optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
+{
+  gimple *use_stmt;
+  imm_use_iterator use_iter;
+  gimple *stmt = gsi_stmt (*def_gsi);
+  tree x = def;
+  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
+  tree div_rhs1 = gimple_assign_rhs1 (stmt);
+
+  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
+      || TREE_CODE (div_rhs1) != REAL_CST
+      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
+    return;
+
+  gcall *sqrt_stmt
+    = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
+
+  if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
+    return;
+
+  switch (gimple_call_combined_fn (sqrt_stmt))
+    {
+    CASE_CFN_SQRT:
+    CASE_CFN_SQRT_FN:
+      break;
+
+    default:
+      return;
+    }
+  tree a = gimple_call_arg (sqrt_stmt, 0);
+
+  /* We have 'a' and 'x'.  Now analyze the uses of 'x'.  */
+
+  /* Statements that use x in x * x.  */
+  auto_vec<gimple *> sqr_stmts;
+  /* Statements that use x in a * x.  */
+  auto_vec<gimple *> mult_stmts;
+  bool has_other_use = false;
+  bool mult_on_main_path = false;
+
+  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, x)
+    {
+      if (is_gimple_debug (use_stmt))
+	continue;
+      if (is_square_of (use_stmt, x))
+	{
+	  sqr_stmts.safe_push (use_stmt);
+	  if (gimple_bb (use_stmt) == gimple_bb (stmt))
+	    mult_on_main_path = true;
+	}
+      else if (is_mult_by (use_stmt, x, a))
+	{
+	  mult_stmts.safe_push (use_stmt);
+	  if (gimple_bb (use_stmt) == gimple_bb (stmt))
+	    mult_on_main_path = true;
+	}
+      else
+	has_other_use = true;
+    }
+
+  /* In the x * x and a * x cases we just rewire stmt operands or
+     remove multiplications.  In the has_other_use case we introduce
+     a multiplication so make sure we don't introduce a multiplication
+     on a path where there was none.  */
+  if (has_other_use && !mult_on_main_path)
+    return;
+
+  if (sqr_stmts.is_empty () && mult_stmts.is_empty ())
+    return;
+
+  /* If x = 1.0 / sqrt (a) has uses other than those optimized here we want
+     to be able to compose it from the sqr and mult cases.  */
+  if (has_other_use && (sqr_stmts.is_empty () || mult_stmts.is_empty ()))
+    return;
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "Optimizing reciprocal sqrt multiplications of\n");
+      print_gimple_stmt (dump_file, sqrt_stmt, 0, TDF_NONE);
+      print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
+      fprintf (dump_file, "\n");
+    }
+
+  bool delete_div = !has_other_use;
+  tree sqr_ssa_name = NULL_TREE;
+  if (!sqr_stmts.is_empty ())
+    {
+      /* r1 = x * x.  Transform the original
+	 x = 1.0 / t
+	 into
+	 tmp1 = 1.0 / a
+	 r1 = tmp1.  */
+
+      sqr_ssa_name
+	= make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_sqr");
+
+      if (dump_file)
+	{
+	  fprintf (dump_file, "Replacing original division\n");
+	  print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
+	  fprintf (dump_file, "with new division\n");
+	}
+      gimple_assign_set_lhs (stmt, sqr_ssa_name);
+      gimple_assign_set_rhs2 (stmt, a);
+      fold_stmt_inplace (def_gsi);
+      update_stmt (stmt);
+
+      if (dump_file)
+	print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
+
+      delete_div = false;
+      gimple *sqr_stmt;
+      unsigned int i;
+      FOR_EACH_VEC_ELT (sqr_stmts, i, sqr_stmt)
+	{
+	  gimple_stmt_iterator gsi2 = gsi_for_stmt (sqr_stmt);
+	  gimple_assign_set_rhs_from_tree (&gsi2, sqr_ssa_name);
+	  update_stmt (sqr_stmt);
+	}
+    }
+  if (!mult_stmts.is_empty ())
+    {
+      /* r2 = a * x.  Transform this into:
+	 r2 = t (The original sqrt (a)).  */
+      unsigned int i;
+      gimple *mult_stmt = NULL;
+      FOR_EACH_VEC_ELT (mult_stmts, i, mult_stmt)
+	{
+	  gimple_stmt_iterator gsi2 = gsi_for_stmt (mult_stmt);
+
+	  if (dump_file)
+	    {
+	      fprintf (dump_file, "Replacing squaring multiplication\n");
+	      print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
+	      fprintf (dump_file, "with assignment\n");
+	    }
+	  gimple_assign_set_rhs_from_tree (&gsi2, orig_sqrt_ssa_name);
+	  fold_stmt_inplace (&gsi2);
+	  update_stmt (mult_stmt);
+	  if (dump_file)
+	    print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
+      }
+    }
+
+  if (has_other_use)
+    {
+      /* Using the two temporaries tmp1, tmp2 from above
+	 the original x is now:
+	 x = tmp1 * tmp2.  */
+      gcc_assert (orig_sqrt_ssa_name);
+      gcc_assert (sqr_ssa_name);
+
+      gimple *new_stmt
+	= gimple_build_assign (x, MULT_EXPR,
+				orig_sqrt_ssa_name, sqr_ssa_name);
+      gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
+      update_stmt (stmt);
+    }
+  else if (delete_div)
+    {
+      /* Remove the original division.  */
+      gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
+      gsi_remove (&gsi2, true);
+      release_defs (stmt);
+    }
+}
 
 /* Look for floating-point divisions among DEF's uses, and try to
    replace them by multiplications with the reciprocal.  Add
@@ -756,7 +946,15 @@ pass_cse_reciprocals::execute (function *fun)
 	      && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 	      && FLOAT_TYPE_P (TREE_TYPE (def))
 	      && TREE_CODE (def) == SSA_NAME)
-	    execute_cse_reciprocals_1 (&gsi, def);
+	    {
+	      if (flag_unsafe_math_optimizations
+		  && is_gimple_assign (stmt)
+		  && !stmt_can_throw_internal (stmt)
+		  && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
+		optimize_recip_sqrt (&gsi, def);
+	      else
+		execute_cse_reciprocals_1 (&gsi, def);
+	    }
 	}
 
       if (optimize_bb_for_size_p (bb))

Follow-Ups:
- Re: [PATCH] Optimise sqrt reciprocal multiplications
  - From: Richard Biener
- Re: [PATCH] Optimise sqrt reciprocal multiplications
  - From: H.J. Lu

References:
- Re: [PATCH] Optimise sqrt reciprocal multiplications
  - From: Kyrill Tkachov
- Re: [PATCH] Optimise sqrt reciprocal multiplications
  - From: Richard Biener
- Re: [PATCH] Optimise sqrt reciprocal multiplications
  - From: Kyrill Tkachov

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]