This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH] Optimise sqrt reciprocal multiplications
- From: Kyrill Tkachov <kyrylo dot tkachov at foss dot arm dot com>
- To: Richard Biener <rguenther at suse dot de>
- Cc: "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>, richard dot sandiford at arm dot com
- Date: Wed, 05 Sep 2018 12:38:35 +0100
- Subject: Re: [PATCH] Optimise sqrt reciprocal multiplications
- References: <5B7C371C.8040207@foss.arm.com> <87bm9ttmq5.fsf@arm.com> <5B7EEA31.6000908@foss.arm.com> <5B87B55F.3020707@foss.arm.com> <alpine.LSU.2.20.1808311238020.16707@zhemvz.fhfr.qr> <5B8E792A.6050205@foss.arm.com> <alpine.LSU.2.20.1809041622290.16707@zhemvz.fhfr.qr> <5B8EB85A.6000705@foss.arm.com>
On 04/09/18 17:52, Kyrill Tkachov wrote:
>
> On 04/09/18 15:31, Richard Biener wrote:
>> On Tue, 4 Sep 2018, Kyrill Tkachov wrote:
>>> Hi Richard,
>>>
>>> On 31/08/18 12:07, Richard Biener wrote:
>>>> On Thu, 30 Aug 2018, Kyrill Tkachov wrote:
>>>>> Ping.
>>>>>
>>>>> https://gcc.gnu.org/ml/gcc-patches/2018-08/msg01496.html
>>>>>
>>>>> Thanks,
>>>>> Kyrill
>>>>>
>>>>> On 23/08/18 18:09, Kyrill Tkachov wrote:
>>>>>> Hi Richard,
>>>>>>
>>>>>> On 23/08/18 11:13, Richard Sandiford wrote:
>>>>>>> Kyrill Tkachov <kyrylo.tkachov@foss.arm.com> writes:
>>>>>>>> Hi all,
>>>>>>>>
>>>>>>>> This patch aims to optimise sequences involving uses of 1.0 / sqrt (a)
>>>>>>>> under -freciprocal-math and -funsafe-math-optimizations.
>>>>>>>> In particular consider:
>>>>>>>>
>>>>>>>> x = 1.0 / sqrt (a);
>>>>>>>> r1 = x * x; // same as 1.0 / a
>>>>>>>> r2 = a * x; // same as sqrt (a)
>>>>>>>>
>>>>>>>> If x, r1 and r2 are all used further on in the code, this can be
>>>>>>>> transformed into:
>>>>>>>> tmp1 = 1.0 / a
>>>>>>>> tmp2 = sqrt (a)
>>>>>>>> tmp3 = tmp1 * tmp2
>>>>>>>> x = tmp3
>>>>>>>> r1 = tmp1
>>>>>>>> r2 = tmp2
>>>>>>> Nice optimisation :-) Someone who knows the pass better should review,
>>>>>>> but:
>>>>>> Thanks for the review.
>>>>>>> There seems to be an implicit assumption that this is a win even
>>>>>>> when the r1 and r2 assignments are only conditionally executed.
>>>>>>> That's probably true, but it might be worth saying explicitly.
>>>>>> I'll admit I had not considered that case.
>>>>>> I think it won't make a difference in practice, as the really expensive
>>>>>> operations here
>>>>>> are the sqrt and the division and they are on the executed path in either
>>>>>> case and them
>>>>>> becoming independent should be a benefit of its own.
>>>>>>>> +/* Return TRUE if USE_STMT is a multiplication of DEF by A. */
>>>>>>>> +
>>>>>>>> +static inline bool
>>>>>>>> +is_mult_by (gimple *use_stmt, tree def, tree a)
>>>>>>>> +{
>>>>>>>> + if (gimple_code (use_stmt) == GIMPLE_ASSIGN
>>>>>>>> + && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
>>>>>>>> + {
>>>>>>>> + tree op0 = gimple_assign_rhs1 (use_stmt);
>>>>>>>> + tree op1 = gimple_assign_rhs2 (use_stmt);
>>>>>>>> +
>>>>>>>> + return (op0 == def && op1 == a)
>>>>>>>> + || (op0 == a && op1 == def);
>>>>>>>> + }
>>>>>>>> + return 0;
>>>>>>>> +}
>>>>>>> Seems like is_square_of could now be a light-weight wrapper around this.
>>>>>> Indeed, I've done the wrapping now.
>>>>>>>> @@ -652,6 +669,180 @@ execute_cse_reciprocals_1 (gimple_stmt_iterator
>>>>>>>> *def_gsi, tree def)
>>>>>>>> occ_head = NULL;
>>>>>>>> }
>>>>>>>> +/* Transform sequences like
>>>>>>>> + x = 1.0 / sqrt (a);
>>>>>>>> + r1 = x * x;
>>>>>>>> + r2 = a * x;
>>>>>>>> + into:
>>>>>>>> + tmp1 = 1.0 / a;
>>>>>>>> + tmp2 = sqrt (a);
>>>>>>>> + tmp3 = tmp1 * tmp2;
>>>>>>>> + x = tmp3;
>>>>>>>> + r1 = tmp1;
>>>>>>>> + r2 = tmp2;
>>>>>>>> + depending on the uses of x, r1, r2. This removes one multiplication
>>>>>>>> and
>>>>>>>> + allows the sqrt and division operations to execute in parallel.
>>>>>>>> + DEF_GSI is the gsi of the initial division by sqrt that defines
>>>>>>>> + DEF (x in the example abovs). */
>>>>>>>> +
>>>>>>>> +static void
>>>>>>>> +optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
>>>>>>>> +{
>>>>>>>> + use_operand_p use_p;
>>>>>>>> + imm_use_iterator use_iter;
>>>>>>>> + gimple *stmt = gsi_stmt (*def_gsi);
>>>>>>>> + tree x = def;
>>>>>>>> + tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
>>>>>>>> + tree div_rhs1 = gimple_assign_rhs1 (stmt);
>>>>>>>> +
>>>>>>>> + if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
>>>>>>>> + || TREE_CODE (div_rhs1) != REAL_CST
>>>>>>>> + || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
>>>>>>>> + return;
>>>>>>>> +
>>>>>>>> + gimple *sqrt_stmt = SSA_NAME_DEF_STMT (orig_sqrt_ssa_name);
>>>>>>>> + if (!is_gimple_call (sqrt_stmt)
>>>>>>>> + || !gimple_call_lhs (sqrt_stmt))
>>>>>>>> + return;
>>>>>>>> +
>>>>>>>> + gcall *call = as_a <gcall *> (sqrt_stmt);
>>>>>>> Very minor, but:
>>>>>>>
>>>>>>> gcall *sqrt_stmt
>>>>>>> = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
>>>>>>> if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
>>>>>>> return;
>>>>>>>
>>>>>>> would avoid the need for the separate as_a<>, and would mean that
>>>>>>> we only call gimple_call_* on gcalls.
>>>>>> Ok.
>>>>>>>> + if (has_other_use)
>>>>>>>> + {
>>>>>>>> + /* Using the two temporaries tmp1, tmp2 from above
>>>>>>>> + the original x is now:
>>>>>>>> + x = tmp1 * tmp2. */
>>>>>>>> + gcc_assert (mult_ssa_name);
>>>>>>>> + gcc_assert (sqr_ssa_name);
>>>>>>>> + gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
>>>>>>>> +
>>>>>>>> + tree new_ssa_name
>>>>>>>> + = make_temp_ssa_name (TREE_TYPE (a), NULL,
>>>>>>>> "recip_sqrt_transformed");
>>>>>>>> + gimple *new_stmt
>>>>>>>> + = gimple_build_assign (new_ssa_name, MULT_EXPR,
>>>>>>>> + mult_ssa_name, sqr_ssa_name);
>>>>>>>> + gsi_insert_before (&gsi2, new_stmt, GSI_SAME_STMT);
>>>>>>>> + gcc_assert (gsi_stmt (gsi2) == stmt);
>>>>>>>> + gimple_assign_set_rhs_from_tree (&gsi2, new_ssa_name);
>>>>>>>> + fold_stmt (&gsi2);
>>>>>>>> + update_stmt (stmt);
>>>>>>> In this case we're replacing the statement in its original position,
>>>>>>> so there's no real need to use a temporary. It seems better to
>>>>>>> change the rhs_code, rhs1 and rhs2 of stmt in-place, with the same
>>>>>>> lhs as before.
>>>>>> Yes, that's cleaner.
>>>>>>>> @@ -762,6 +953,23 @@ pass_cse_reciprocals::execute (function *fun)
>>>>>>>> if (optimize_bb_for_size_p (bb))
>>>>>>>> continue;
>>>>>>> Seems unnecessary to skip the new optimisation when optimising for size.
>>>>>>> Like you say, it saves a multiplication overall. Also:
>>>>>> Indeed.
>>>>>>>> + if (flag_unsafe_math_optimizations)
>>>>>>>> + {
>>>>>>>> + for (gimple_stmt_iterator gsi = gsi_after_labels (bb);
>>>>>>>> + !gsi_end_p (gsi);
>>>>>>>> + gsi_next (&gsi))
>>>>>>>> + {
>>>>>>>> + gimple *stmt = gsi_stmt (gsi);
>>>>>>>> +
>>>>>>>> + if (gimple_has_lhs (stmt)
>>>>>>>> + && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
>>>>>>>> + && FLOAT_TYPE_P (TREE_TYPE (def))
>>>>>>>> + && TREE_CODE (def) == SSA_NAME
>>>>>>>> + && is_gimple_assign (stmt)
>>>>>>>> + && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
>>>>>>>> + optimize_recip_sqrt (&gsi, def);
>>>>>>>> + }
>>>>>>>> + }
>>>>>>> It looks like this could safely be done in one of the existing walks
>>>>>>> (e.g. the execute_cse_reciprocals_1 one, if we do this when optimising
>>>>>>> for size).
>>>>>> You're right. I've moved this into one of the walks above this.
>>>>>>
>>>>>> Bootstrapped and tested on aarch64-none-linux-gnu and
>>>>>> x86_64-unknown-linux-gnu.
>>>>>> CC'ing richi as he's reviewed these kinds of patches in the past.
>>>>>>
>>>>>> Is this ok for trunk?
>>>> I wonder how it interacts with execute_cse_reciprocals_1 given it
>>>> introduces a division 1.0 / a which will be not processed by
>>>> execute_cse_reciprocals_1 given that operates by walking uses of 'a'.
>>>> That may be just a missed optimization of course.
>>> Hmm, I believe right now it doesn't interact with execute_cse_reciprocals_1 as
>>> it's either one or the other.
>>> I've left it as it is for now, but would you like us to call
>>> execute_cse_reciprocals_1 on the potentially transformed division?
>> I think that wouldn't "fix" it given execute_cse_reciprocals_1 works by
>> seeing all reciprocal uses of 'a' so it may have alrady seen two and
>> you introduced the third. But we can leave "solving" this for
>> future enhacements (I'd avoid doing two passes over the IL just because
>> of said possibility right now).
>>>> + if (has_other_use)
>>>> + {
>>>> + /* Using the two temporaries tmp1, tmp2 from above
>>>> + the original x is now:
>>>> + x = tmp1 * tmp2. */
>>>> + gcc_assert (mult_ssa_name);
>>>> + gcc_assert (sqr_ssa_name);
>>>> +
>>>> + gimple_assign_set_rhs1 (stmt, mult_ssa_name);
>>>> + gimple_assign_set_rhs2 (stmt, sqr_ssa_name);
>>>> + gimple_assign_set_rhs_code (stmt, MULT_EXPR);
>>>> + fold_stmt_inplace (def_gsi);
>>>> + update_stmt (stmt);
>>>> + }
>>>>
>>>> so you are leaving the original stmt in place unchanged even if it is
>>>> not used? Why? Note that with -fno-call-exceptions this stmt may
>>>> throw, so you should arrange to code-generate 1./a in place of the
>>>> original division to preserve EH behavior. Watch out because then
>>>> with other uses you have to find a place to insert its computation.
>>> Ok. These are oversights on my part. I've updated the patch to modify the
>>> original division in place. The multiplication is placed after it.
>> If the division throws internally you can't insert after it, you
>> have to insert on the single non-EH edge outgoing from the BB
>> with the division. Just try a C++ testcase with -fnon-call-exceptions
>> and a try {} catch(...) {} around.
>>
>> Not sure if it's worth to handle so bailing out if
>> stmt_can_throw_internal (stmt) would be an option as well.
>>>> + if (is_square_of (stmt2, x))
>>>> + {
>>>> + if (!sqr_stmts.contains (stmt2))
>>>> + sqr_stmts.safe_push (stmt2);
>>>> + }
>>>>
>>>> this is quadratic in the number of square stmts... please consider
>>>> making sqr_stmts a bitmap of SSA defs (so the stmt you have now
>>>> is then SSA_NAME_DEF_STMT (ssa_name (bitmap-element))).
>>> Done. In practice I didn't see there being more than one such use, I expect
>>> them
>>> to be CSE'd, but maybe if they're in different basic blocks...
>> Oh, it just occured to me you could use FOR_EACH_IMM_USE_STMT ()
>> which will only get you each stmt once... ;) Sorry for the misleading
>> bitmap suggestion.
>>>> You do not seem to restrict placement of the use stmts but insert
>>>> before the 1/sqrt(a) stmt. That possibly hoists the multiplications
>>>> where they are not needed. Consider
>>>>
>>>> x = 1./sqrt(a);
>>>> if (l)
>>>> l1 = x * 3.;
>>>> else if (l2)
>>>> l1 = x * x;
>>>> else if (l3)
>>>> l1 = a * x;
>>>>
>>>> or similar where on the path to x * 3. you now perform two extra
>>>> multiplications.
>>> Ok, I've restricted the optimisation somewhat.
>>> Now if there's an other use it won't perform the transformation unless there
>>> is already
>>> a multiplication present on the main path. That way it won't introduce
>>> multiplications
>>> on paths where there aren't any.
>> OK.
>>>> Your testcases do not cover the case of other uses at all. Or of
>>>> EH.
>>> gcc.dg/recip_sqrt_mult_1.c should handle the other uses case as tmp is a
>>> global
>>> being written to. I've added more testcases involving uses in different basic
>>> blocks
>>> and a g++.dg testcase with -fnon-call-exceptions.
>>> I'm not sure what functionality to test though apart from that it doesn't ICE
>>> and does
>>> the transformations I expect.
>> That's good enough I guess. I see you do have a testcase with
>> try/catch so I wonder why foo4 doesn't ICE when you insert after
>> the throwing stmt... maybe it doesn't throw? Ah - they probably
>> fail your new mult_on_main_path test?
>
> Yeah. I did manage to reproduce an ICE eventually by removing that check
> and enabling -ftrapping-math.
>
> I think I'll just not enter this at all if stmt_can_throw_internal (stmt) like you suggested.
And here is the version that does that. It also reverts to using a vector for the sqr_stmts for consistency
and uses FOR_EACH_IMM_USE_STMT to iterate over the use statements.
Bootstrapped and tested on aarch64-none-linux-gnu.
Ok?
Thanks,
Kyrill
2018-09-03 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* tree-ssa-math-opts.c (is_mult_by): New function.
(is_square_of): Use the above.
(optimize_recip_sqrt): New function.
(pass_cse_reciprocals::execute): Use the above.
2018-09-03 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* gcc.dg/recip_sqrt_mult_1.c: New test.
* gcc.dg/recip_sqrt_mult_2.c: Likewise.
* gcc.dg/recip_sqrt_mult_3.c: Likewise.
* gcc.dg/recip_sqrt_mult_4.c: Likewise.
* gcc.dg/recip_sqrt_mult_5.c: Likewise.
* g++.dg/recip_sqrt_mult_1.C: Likewise.
* g++.dg/recip_sqrt_mult_2.C: Likewise.
diff --git a/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C b/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C
new file mode 100644
index 0000000000000000000000000000000000000000..11d9c6f758f1529d8ed4cadf85010f6ce379c195
--- /dev/null
+++ b/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C
@@ -0,0 +1,49 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fnon-call-exceptions -fdump-tree-recip" } */
+
+double res, res2, tmp;
+void
+foo1 (double a, double b)
+{
+ try {
+ tmp = 1.0 / __builtin_sqrt (a);
+ res = tmp * tmp;
+ res2 = a * tmp;
+ }
+ catch (...)
+ { ; }
+}
+
+void
+foo4 (double a, double b, int c, int d)
+{
+ try {
+ tmp = 1.0 / __builtin_sqrt (a);
+ }
+ catch (...)
+ {
+ if (c)
+ res = tmp * tmp;
+
+ if (d)
+ res2 = a * tmp;
+ }
+}
+
+void
+foo5 (double a, double b, int c, int d)
+{
+ try {
+ tmp = 1.0 / __builtin_sqrt (a);
+ res = tmp * tmp;
+
+ if (d)
+ res2 = a * tmp;
+ }
+ catch (...)
+ { ; }
+}
+
+/* { dg-final { scan-tree-dump-times "Optimizing reciprocal sqrt multiplications" 2 "recip" } } */
+/* { dg-final { scan-tree-dump-times "Replacing squaring multiplication" 2 "recip" } } */
+/* { dg-final { scan-tree-dump-times "Replacing original division" 2 "recip" } } */
diff --git a/gcc/testsuite/g++.dg/recip_sqrt_mult_2.C b/gcc/testsuite/g++.dg/recip_sqrt_mult_2.C
new file mode 100644
index 0000000000000000000000000000000000000000..cca12caf4d79bfa69933d9b8fce41f38bb5b7a19
--- /dev/null
+++ b/gcc/testsuite/g++.dg/recip_sqrt_mult_2.C
@@ -0,0 +1,49 @@
+/* { dg-do compile } */
+/* { dg-options "-w -Ofast -fnon-call-exceptions -ftrapping-math -fdump-tree-recip" } */
+
+/* Check that the recip_sqrt optimization does not trigger here, causing an
+ ICE due to EH info. */
+
+
+double res, res2, tmp;
+void
+foo1 (double a, double b)
+{
+ try {
+ tmp = 1.0 / __builtin_sqrt (a);
+ res = tmp * tmp;
+ res2 = a * tmp;
+ }
+ catch (...)
+ { ; }
+}
+
+void
+foo4 (double a, double b, int c, int d)
+{
+ try {
+ tmp = 1.0 / __builtin_sqrt (a);
+ }
+ catch (...)
+ {
+ if (c)
+ res = tmp * tmp;
+
+ if (d)
+ res2 = a * tmp;
+ }
+}
+
+void
+foo5 (double a, double b, int c, int d)
+{
+ try {
+ tmp = 1.0 / __builtin_sqrt (a);
+ res = tmp * tmp;
+
+ if (d)
+ res2 = a * tmp;
+ }
+ catch (...)
+ { ; }
+}
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..188390a4ecffca1b21f05c4f19abecfb7ebd188f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-recip" } */
+
+double res, res2, tmp;
+void
+foo (double a, double b)
+{
+ tmp = 1.0 / __builtin_sqrt (a);
+ res = tmp * tmp;
+ res2 = a * tmp;
+}
+
+/* { dg-final { scan-tree-dump "Optimizing reciprocal sqrt multiplications" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing squaring multiplication" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing original division" "recip" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..c5fc3de7b657b1769e76254b4bc874e0595e43ef
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-optimized" } */
+
+float
+foo (float a)
+{
+ float tmp = 1.0f / __builtin_sqrtf (a);
+ return a * tmp;
+}
+
+/* { dg-final { scan-tree-dump-not " / " "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..e7d185ba7e22cfc7cca72296d5ccc544f24fdb14
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-optimized" } */
+
+double
+foo (double a)
+{
+ double tmp = 1.0f / __builtin_sqrt (a);
+ return tmp * tmp;
+}
+
+/* { dg-final { scan-tree-dump-not "__builtin_sqrt" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..e3005f2feb6f4bacbb6eafc0155e196cb866fcdf
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-recip" } */
+
+/* The main path doesn't have any multiplications.
+ Avoid introducing them in the recip pass. */
+
+double res, res2, tmp;
+void
+foo (double a, double b, int c, int d)
+{
+ tmp = 1.0 / __builtin_sqrt (a);
+ if (c)
+ res = tmp * tmp;
+
+ if (d)
+ res2 = a * tmp;
+}
+
+/* { dg-final { scan-tree-dump-not "Optimizing reciprocal sqrt multiplications" "recip" } } */
+/* { dg-final { scan-tree-dump-not "Replacing squaring multiplication" "recip" } } */
+/* { dg-final { scan-tree-dump-not "Replacing original division" "recip" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c
new file mode 100644
index 0000000000000000000000000000000000000000..e871f0fcd4feb1687f9815e4babf4d0667a15ea8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-recip" } */
+
+/* We want to do the recip_sqrt transformations here there is already
+ a multiplication on the main path. */
+
+double res, res2, tmp;
+void
+foo (double a, double b, int c, int d)
+{
+ tmp = 1.0 / __builtin_sqrt (a);
+ res = tmp * tmp;
+
+ if (d)
+ res2 = a * tmp;
+}
+
+/* { dg-final { scan-tree-dump "Optimizing reciprocal sqrt multiplications" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing squaring multiplication" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing original division" "recip" } } */
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index 25378da6f4ab27dbce51e10461efc18cc416a4d7..19bff5c3c3715f3e194e1c091ae01f6c4d6d2ce8 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -337,9 +337,9 @@ is_division_by (gimple *use_stmt, tree def)
&& gimple_assign_rhs1 (use_stmt) != def;
}
-/* Return whether USE_STMT is DEF * DEF. */
+/* Return TRUE if USE_STMT is a multiplication of DEF by A. */
static inline bool
-is_square_of (gimple *use_stmt, tree def)
+is_mult_by (gimple *use_stmt, tree def, tree a)
{
if (gimple_code (use_stmt) == GIMPLE_ASSIGN
&& gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
@@ -347,11 +347,19 @@ is_square_of (gimple *use_stmt, tree def)
tree op0 = gimple_assign_rhs1 (use_stmt);
tree op1 = gimple_assign_rhs2 (use_stmt);
- return op0 == op1 && op0 == def;
+ return (op0 == def && op1 == a)
+ || (op0 == a && op1 == def);
}
return 0;
}
+/* Return whether USE_STMT is DEF * DEF. */
+static inline bool
+is_square_of (gimple *use_stmt, tree def)
+{
+ return is_mult_by (use_stmt, def, def);
+}
+
/* Return whether USE_STMT is a floating-point division by
DEF * DEF. */
static inline bool
@@ -526,6 +534,188 @@ free_bb (struct occurrence *occ)
}
}
+/* Transform sequences like
+ t = sqrt (a)
+ x = 1.0 / t;
+ r1 = x * x;
+ r2 = a * x;
+ into:
+ t = sqrt (a)
+ r1 = 1.0 / a;
+ r2 = t;
+ x = r1 * r2;
+ depending on the uses of x, r1, r2. This removes one multiplication and
+ allows the sqrt and division operations to execute in parallel.
+ DEF_GSI is the gsi of the initial division by sqrt that defines
+ DEF (x in the example abovs). */
+
+static void
+optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
+{
+ gimple *use_stmt;
+ imm_use_iterator use_iter;
+ gimple *stmt = gsi_stmt (*def_gsi);
+ tree x = def;
+ tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
+ tree div_rhs1 = gimple_assign_rhs1 (stmt);
+
+ if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
+ || TREE_CODE (div_rhs1) != REAL_CST
+ || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
+ return;
+
+ gcall *sqrt_stmt
+ = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
+
+ if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
+ return;
+
+ switch (gimple_call_combined_fn (sqrt_stmt))
+ {
+ CASE_CFN_SQRT:
+ CASE_CFN_SQRT_FN:
+ break;
+
+ default:
+ return;
+ }
+ tree a = gimple_call_arg (sqrt_stmt, 0);
+
+ /* We have 'a' and 'x'. Now analyze the uses of 'x'. */
+
+ /* Statements that use x in x * x. */
+ auto_vec<gimple *> sqr_stmts;
+ /* Statements that use x in a * x. */
+ auto_vec<gimple *> mult_stmts;
+ bool has_other_use = false;
+ bool mult_on_main_path = false;
+
+ FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, x)
+ {
+ if (is_gimple_debug (use_stmt))
+ continue;
+ if (is_square_of (use_stmt, x))
+ {
+ sqr_stmts.safe_push (use_stmt);
+ if (gimple_bb (use_stmt) == gimple_bb (stmt))
+ mult_on_main_path = true;
+ }
+ else if (is_mult_by (use_stmt, x, a))
+ {
+ mult_stmts.safe_push (use_stmt);
+ if (gimple_bb (use_stmt) == gimple_bb (stmt))
+ mult_on_main_path = true;
+ }
+ else
+ has_other_use = true;
+ }
+
+ /* In the x * x and a * x cases we just rewire stmt operands or
+ remove multiplications. In the has_other_use case we introduce
+ a multiplication so make sure we don't introduce a multiplication
+ on a path where there was none. */
+ if (has_other_use && !mult_on_main_path)
+ return;
+
+ if (sqr_stmts.is_empty () && mult_stmts.is_empty ())
+ return;
+
+ /* If x = 1.0 / sqrt (a) has uses other than those optimized here we want
+ to be able to compose it from the sqr and mult cases. */
+ if (has_other_use && (sqr_stmts.is_empty () || mult_stmts.is_empty ()))
+ return;
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "Optimizing reciprocal sqrt multiplications of\n");
+ print_gimple_stmt (dump_file, sqrt_stmt, 0, TDF_NONE);
+ print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
+ fprintf (dump_file, "\n");
+ }
+
+ bool delete_div = !has_other_use;
+ tree sqr_ssa_name = NULL_TREE;
+ if (!sqr_stmts.is_empty ())
+ {
+ /* r1 = x * x. Transform the original
+ x = 1.0 / t
+ into
+ tmp1 = 1.0 / a
+ r1 = tmp1. */
+
+ sqr_ssa_name
+ = make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_sqr");
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "Replacing original division\n");
+ print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
+ fprintf (dump_file, "with new division\n");
+ }
+ gimple_assign_set_lhs (stmt, sqr_ssa_name);
+ gimple_assign_set_rhs2 (stmt, a);
+ fold_stmt_inplace (def_gsi);
+ update_stmt (stmt);
+
+ if (dump_file)
+ print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
+
+ delete_div = false;
+ gimple *sqr_stmt;
+ unsigned int i;
+ FOR_EACH_VEC_ELT (sqr_stmts, i, sqr_stmt)
+ {
+ gimple_stmt_iterator gsi2 = gsi_for_stmt (sqr_stmt);
+ gimple_assign_set_rhs_from_tree (&gsi2, sqr_ssa_name);
+ update_stmt (sqr_stmt);
+ }
+ }
+ if (!mult_stmts.is_empty ())
+ {
+ /* r2 = a * x. Transform this into:
+ r2 = t (The original sqrt (a)). */
+ unsigned int i;
+ gimple *mult_stmt = NULL;
+ FOR_EACH_VEC_ELT (mult_stmts, i, mult_stmt)
+ {
+ gimple_stmt_iterator gsi2 = gsi_for_stmt (mult_stmt);
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "Replacing squaring multiplication\n");
+ print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
+ fprintf (dump_file, "with assignment\n");
+ }
+ gimple_assign_set_rhs_from_tree (&gsi2, orig_sqrt_ssa_name);
+ fold_stmt_inplace (&gsi2);
+ update_stmt (mult_stmt);
+ if (dump_file)
+ print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
+ }
+ }
+
+ if (has_other_use)
+ {
+ /* Using the two temporaries tmp1, tmp2 from above
+ the original x is now:
+ x = tmp1 * tmp2. */
+ gcc_assert (orig_sqrt_ssa_name);
+ gcc_assert (sqr_ssa_name);
+
+ gimple *new_stmt
+ = gimple_build_assign (x, MULT_EXPR,
+ orig_sqrt_ssa_name, sqr_ssa_name);
+ gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
+ update_stmt (stmt);
+ }
+ else if (delete_div)
+ {
+ /* Remove the original division. */
+ gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
+ gsi_remove (&gsi2, true);
+ release_defs (stmt);
+ }
+}
/* Look for floating-point divisions among DEF's uses, and try to
replace them by multiplications with the reciprocal. Add
@@ -756,7 +946,15 @@ pass_cse_reciprocals::execute (function *fun)
&& (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
&& FLOAT_TYPE_P (TREE_TYPE (def))
&& TREE_CODE (def) == SSA_NAME)
- execute_cse_reciprocals_1 (&gsi, def);
+ {
+ if (flag_unsafe_math_optimizations
+ && is_gimple_assign (stmt)
+ && !stmt_can_throw_internal (stmt)
+ && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
+ optimize_recip_sqrt (&gsi, def);
+ else
+ execute_cse_reciprocals_1 (&gsi, def);
+ }
}
if (optimize_bb_for_size_p (bb))