This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH] Optimise sqrt reciprocal multiplications
- From: Kyrill Tkachov <kyrylo dot tkachov at foss dot arm dot com>
- To: "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
- Date: Tue, 21 Aug 2018 17:00:28 +0100
- Subject: [PATCH] Optimise sqrt reciprocal multiplications
Hi all,
This patch aims to optimise sequences involving uses of 1.0 / sqrt (a) under -freciprocal-math and -funsafe-math-optimizations.
In particular consider:
x = 1.0 / sqrt (a);
r1 = x * x; // same as 1.0 / a
r2 = a * x; // same as sqrt (a)
If x, r1 and r2 are all used further on in the code, this can be transformed into:
tmp1 = 1.0 / a
tmp2 = sqrt (a)
tmp3 = tmp1 * tmp2
x = tmp3
r1 = tmp1
r2 = tmp2
A bit convoluted, but this saves us one multiplication and, more importantly, the sqrt and division are now independent.
This also allows optimisation of a subset of these expressions.
For example:
x = 1.0 / sqrt (a)
r1 = x * x
can be transformed to r1 = 1.0 / a, eliminating the sqrt if x is not used anywhere else.
And similarly:
x = 1.0 / sqrt (a)
r1 = a * x
can be transformed to sqrt (a) eliminating the division.
For the testcase:
double res, res2, tmp;
void
foo (double a, double b)
{
tmp = 1.0 / __builtin_sqrt (a);
res = tmp * tmp;
res2 = a * tmp;
}
We now generate for aarch64 with -Ofast:
foo:
fmov d2, 1.0e+0
adrp x2, res2
fsqrt d1, d0
adrp x1, res
fdiv d0, d2, d0
adrp x0, tmp
str d1, [x2, #:lo12:res2]
fmul d1, d1, d0
str d0, [x1, #:lo12:res]
str d1, [x0, #:lo12:tmp]
ret
where before it generated:
foo:
fsqrt d2, d0
fmov d1, 1.0e+0
adrp x1, res2
adrp x2, tmp
adrp x0, res
fdiv d1, d1, d2
fmul d0, d1, d0
fmul d2, d1, d1
str d1, [x2, #:lo12:tmp]
str d0, [x1, #:lo12:res2]
str d2, [x0, #:lo12:res]
ret
As you can see, the new sequence has one fewer multiply and the fsqrt and fdiv are independent.
With this patch I see a 14% improvement on 544.nab_r from SPEC2017 on Cortex-A72 at -Ofast.
Bootstrapped and tested on aarch64-none-linux-gnu and x86_64-unknown-linux-gnu.
Ok for trunk?
Thanks,
Kyrill
2018-08-21 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* tree-ssa-math-opts.c (is_mult_by): New function.
(optimize_recip_sqrt): Likewise.
(pass_cse_reciprocals::execute): Use the above.
2018-08-21 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* gcc.dg/recip_sqrt_mult_1.c: New test.
* gcc.dg/recip_sqrt_mult_2.c: Likewise.
* gcc.dg/recip_sqrt_mult_3.c: Likewise.
commit ea11c9eb018abf4e21c61b8a7305291b0d9a7ec8
Author: kyrtka01 <kyrylo.tkachov@arm.com>
Date: Tue Aug 21 13:54:07 2018 +0100
Optimise sqrt reciprocal multiplications
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c
new file mode 100644
index 0000000..86429e4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-recip" } */
+
+double res, res2, tmp;
+void
+foo (double a, double b)
+{
+ tmp = 1.0 / __builtin_sqrt (a);
+ res = tmp * tmp;
+ res2 = a * tmp;
+}
+
+/* { dg-final { scan-tree-dump "Optimizing reciprocal sqrt multiplications" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing squaring multiplication" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing multiplication" "recip" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c
new file mode 100644
index 0000000..c5fc3de
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-optimized" } */
+
+float
+foo (float a)
+{
+ float tmp = 1.0f / __builtin_sqrtf (a);
+ return a * tmp;
+}
+
+/* { dg-final { scan-tree-dump-not " / " "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c
new file mode 100644
index 0000000..e7d185b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-optimized" } */
+
+double
+foo (double a)
+{
+ double tmp = 1.0f / __builtin_sqrt (a);
+ return tmp * tmp;
+}
+
+/* { dg-final { scan-tree-dump-not "__builtin_sqrt" "optimized" } } */
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index a90d9d2..b25beaf 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -352,6 +352,23 @@ is_square_of (gimple *use_stmt, tree def)
return 0;
}
+/* Return TRUE if USE_STMT is a multiplication of DEF by A. */
+
+static inline bool
+is_mult_by (gimple *use_stmt, tree def, tree a)
+{
+ if (gimple_code (use_stmt) == GIMPLE_ASSIGN
+ && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
+ {
+ tree op0 = gimple_assign_rhs1 (use_stmt);
+ tree op1 = gimple_assign_rhs2 (use_stmt);
+
+ return (op0 == def && op1 == a)
+ || (op0 == a && op1 == def);
+ }
+ return 0;
+}
+
/* Return whether USE_STMT is a floating-point division by
DEF * DEF. */
static inline bool
@@ -652,6 +669,180 @@ execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
occ_head = NULL;
}
+/* Transform sequences like
+ x = 1.0 / sqrt (a);
+ r1 = x * x;
+ r2 = a * x;
+ into:
+ tmp1 = 1.0 / a;
+ tmp2 = sqrt (a);
+ tmp3 = tmp1 * tmp2;
+ x = tmp3;
+ r1 = tmp1;
+ r2 = tmp2;
+ depending on the uses of x, r1, r2. This removes one multiplication and
+ allows the sqrt and division operations to execute in parallel.
+ DEF_GSI is the gsi of the initial division by sqrt that defines
+ DEF (x in the example abovs). */
+
+static void
+optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
+{
+ use_operand_p use_p;
+ imm_use_iterator use_iter;
+ gimple *stmt = gsi_stmt (*def_gsi);
+ tree x = def;
+ tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
+ tree div_rhs1 = gimple_assign_rhs1 (stmt);
+
+ if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
+ || TREE_CODE (div_rhs1) != REAL_CST
+ || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
+ return;
+
+ gimple *sqrt_stmt = SSA_NAME_DEF_STMT (orig_sqrt_ssa_name);
+ if (!is_gimple_call (sqrt_stmt)
+ || !gimple_call_lhs (sqrt_stmt))
+ return;
+
+ gcall *call = as_a <gcall *> (sqrt_stmt);
+ switch (gimple_call_combined_fn (call))
+ {
+ CASE_CFN_SQRT:
+ CASE_CFN_SQRT_FN:
+ break;
+
+ default:
+ return;
+ }
+ tree a = gimple_call_arg (sqrt_stmt, 0);
+
+ /* We have 'a' and 'x'. Now analyze the uses of 'x'. */
+
+ /* Statements that use x in x * x. */
+ auto_vec<gimple *> sqr_stmts;
+ /* Statements that use x in a * x. */
+ auto_vec<gimple *> mult_stmts;
+ bool has_other_use = false;
+
+ FOR_EACH_IMM_USE_FAST (use_p, use_iter, x)
+ {
+ gimple *stmt2 = USE_STMT (use_p);
+ if (is_gimple_debug (stmt2))
+ continue;
+ if (is_square_of (stmt2, x))
+ {
+ if (!sqr_stmts.contains (stmt2))
+ sqr_stmts.safe_push (stmt2);
+ }
+ else if (is_mult_by (stmt2, x, a))
+ mult_stmts.safe_push (stmt2);
+ else
+ has_other_use = true;
+ }
+ if (sqr_stmts.is_empty () && mult_stmts.is_empty ())
+ return;
+
+ /* If x = 1.0 / sqrt (a) has uses other than those optimized here we want
+ to be able to compose it from the sqr and mult cases. */
+ if (has_other_use && (sqr_stmts.is_empty () || mult_stmts.is_empty ()))
+ return;
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "Optimizing reciprocal sqrt multiplications of\n");
+ print_gimple_stmt (dump_file, sqrt_stmt, 0, TDF_NONE);
+ print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
+ fprintf (dump_file, "\n");
+ }
+
+ tree mult_ssa_name = NULL_TREE;
+ tree sqr_ssa_name = NULL_TREE;
+ if (!sqr_stmts.is_empty ())
+ {
+ /* r1 = x * x. Transform this into:
+ tmp1 = 1.0 / a
+ r1 = tmp1. */
+
+ sqr_ssa_name
+ = make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_sqr");
+ gimple *new_stmt
+ = gimple_build_assign (sqr_ssa_name, RDIV_EXPR,
+ build_real (TREE_TYPE (a), dconst1), a);
+ gsi_insert_before (def_gsi, new_stmt, GSI_NEW_STMT);
+
+ unsigned int i;
+ gimple *sqr_stmt;
+ FOR_EACH_VEC_ELT (sqr_stmts, i, sqr_stmt)
+ {
+ gimple_stmt_iterator gsi2 = gsi_for_stmt (sqr_stmt);
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "Replacing squaring multiplication\n");
+ print_gimple_stmt (dump_file, sqr_stmt, 0, TDF_NONE);
+ fprintf (dump_file, "with\n");
+ print_gimple_stmt (dump_file, new_stmt, 0, TDF_NONE);
+ fprintf (dump_file, "\n");
+ }
+ gimple_assign_set_rhs_from_tree (&gsi2, sqr_ssa_name);
+ fold_stmt_inplace (&gsi2);
+ update_stmt (sqr_stmt);
+ }
+ }
+ if (!mult_stmts.is_empty ())
+ {
+ /* r2 = a * x. Transform this into:
+ tmp2 = sqrt (a)
+ r2 = tmp2. */
+ mult_ssa_name
+ = make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_mult");
+ gimple *new_stmt
+ = gimple_build_assign (mult_ssa_name, orig_sqrt_ssa_name);
+ gsi_insert_before (def_gsi, new_stmt, GSI_NEW_STMT);
+
+ unsigned int i;
+ gimple *mult_stmt = NULL;
+ FOR_EACH_VEC_ELT (mult_stmts, i, mult_stmt)
+ {
+ gimple_stmt_iterator gsi2 = gsi_for_stmt (mult_stmt);
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "Replacing multiplication\n");
+ print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
+ fprintf (dump_file, "with\n");
+ print_gimple_stmt (dump_file, new_stmt, 0, TDF_NONE);
+ fprintf (dump_file, "\n");
+ }
+ gimple_assign_set_rhs_from_tree (&gsi2, mult_ssa_name);
+ fold_stmt_inplace (&gsi2);
+ update_stmt (mult_stmt);
+ }
+ }
+
+ if (has_other_use)
+ {
+ /* Using the two temporaries tmp1, tmp2 from above
+ the original x is now:
+ x = tmp1 * tmp2. */
+ gcc_assert (mult_ssa_name);
+ gcc_assert (sqr_ssa_name);
+ gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
+
+ tree new_ssa_name
+ = make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_transformed");
+ gimple *new_stmt
+ = gimple_build_assign (new_ssa_name, MULT_EXPR,
+ mult_ssa_name, sqr_ssa_name);
+ gsi_insert_before (&gsi2, new_stmt, GSI_SAME_STMT);
+ gcc_assert (gsi_stmt (gsi2) == stmt);
+ gimple_assign_set_rhs_from_tree (&gsi2, new_ssa_name);
+ fold_stmt (&gsi2);
+ update_stmt (stmt);
+ }
+}
+
/* Return an internal function that implements the reciprocal of CALL,
or IFN_LAST if there is no such function that the target supports. */
@@ -762,6 +953,23 @@ pass_cse_reciprocals::execute (function *fun)
if (optimize_bb_for_size_p (bb))
continue;
+ if (flag_unsafe_math_optimizations)
+ {
+ for (gimple_stmt_iterator gsi = gsi_after_labels (bb);
+ !gsi_end_p (gsi);
+ gsi_next (&gsi))
+ {
+ gimple *stmt = gsi_stmt (gsi);
+
+ if (gimple_has_lhs (stmt)
+ && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
+ && FLOAT_TYPE_P (TREE_TYPE (def))
+ && TREE_CODE (def) == SSA_NAME
+ && is_gimple_assign (stmt)
+ && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
+ optimize_recip_sqrt (&gsi, def);
+ }
+ }
/* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
gsi_next (&gsi))