- From: Kyrill Tkachov <kyrylo.tkachov@foss.arm.com>
- To: gcc-patches@gcc.gnu.org
- Date: Mon, 30 Apr 2018 18:41:41 +0100
- Subject: [PATCH][tree-complex.c] PR tree-optimization/70291: Inline floating-point complex multiplication more aggressively
Hi all,
We can improve the performance of complex floating-point multiplications by inlining the expansion a bit more aggressively.
We can inline complex x = a * b as:
x = (ar*br - ai*bi) + i(ar*bi + br*ai);
if (isunordered (__real__ x, __imag__ x))
x = __muldc3 (a, b); //Or __mulsc3 for single-precision
That way, in the common case where no NaNs are produced, we avoid the libgcc call entirely, and we fall back to the
NaN-handling code in libgcc only if either component of the expansion is NaN.
The implementation is in expand_complex_multiplication in tree-complex.c. The above expansion
is performed at -O1 and higher when not optimising for size.
At -O0 and -Os a single call to libgcc is emitted instead.
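To illustrate the transformation at the source level, the generated code behaves roughly like the hand-written C sketch below. This is only an illustration: the patch performs the lowering on GIMPLE in tree-complex.c, and mul_dc is a hypothetical name used here for exposition, not something the patch introduces.

/* Source-level sketch of the lowering; illustrative only.  */
#include <math.h>

/* libgcc's C99-compliant fallback routine (double-precision variant).  */
extern double _Complex __muldc3 (double, double, double, double);

static double _Complex
mul_dc (double _Complex a, double _Complex b)
{
  double ar = __real__ a, ai = __imag__ a;
  double br = __real__ b, bi = __imag__ b;

  /* Fast path: the straightforward expansion.  */
  double xr = ar * br - ai * bi;
  double xi = ar * bi + br * ai;

  /* If either component is NaN, defer to libgcc, which recovers
     infinities correctly.  */
  if (__builtin_expect (isunordered (xr, xi), 0))
    return __muldc3 (ar, ai, br, bi);

  double _Complex x;
  __real__ x = xr;
  __imag__ x = xi;
  return x;
}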
For the code:
__complex double
foo (__complex double a, __complex double b)
{
return a * b;
}
We will now emit at -O2 for aarch64:
foo:
fmul d16, d1, d3
fmul d6, d1, d2
fnmsub d5, d0, d2, d16
fmadd d4, d0, d3, d6
fcmp d5, d4
bvs .L8
fmov d1, d4
fmov d0, d5
ret
.L8:
stp x29, x30, [sp, -16]!
mov x29, sp
bl __muldc3
ldp x29, x30, [sp], 16
ret
Instead of just a branch to __muldc3.
Bootstrapped and tested on aarch64-none-linux-gnu, arm-none-linux-gnueabihf, x86_64-unknown-linux-gnu.
Ok for trunk? (GCC 9)
Thanks,
Kyrill
2018-04-30 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
PR tree-optimization/70291
* tree-complex.c (insert_complex_mult_libcall): New function.
(expand_complex_multiplication_limited_range): Likewise.
(expand_complex_multiplication): Expand floating-point complex
multiplication using the above.
2018-04-30 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
PR tree-optimization/70291
* gcc.dg/complex-6.c: New test.
* gcc.dg/complex-7.c: Likewise.
diff --git a/gcc/testsuite/gcc.dg/complex-6.c b/gcc/testsuite/gcc.dg/complex-6.c
new file mode 100644
index 0000000000000000000000000000000000000000..123b2a8206f098e7140792375830ff5f01f30cf6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/complex-6.c
@@ -0,0 +1,13 @@
+/* PR tree-optimization/70291. */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-cplxlower" } */
+
+__complex float
+foo (__complex float a, __complex float b)
+{
+ return a * b;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_isunordered" 1 "cplxlower1" } } */
+/* { dg-final { scan-tree-dump-times "__mulsc3" 1 "cplxlower1" } } */
diff --git a/gcc/testsuite/gcc.dg/complex-7.c b/gcc/testsuite/gcc.dg/complex-7.c
new file mode 100644
index 0000000000000000000000000000000000000000..7d5ba3aefb3e007824b778d716ef7f21a48c58f8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/complex-7.c
@@ -0,0 +1,13 @@
+/* PR tree-optimization/70291. */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-cplxlower" } */
+
+__complex double
+foo (__complex double a, __complex double b)
+{
+ return a * b;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_isunordered" 1 "cplxlower1" } } */
+/* { dg-final { scan-tree-dump-times "__muldc3" 1 "cplxlower1" } } */
diff --git a/gcc/tree-complex.c b/gcc/tree-complex.c
index 622b8696399b9e9d8bddcc6340d2f8d8ca852637..319c302526483ca80ecafe7e55289c1850ad6a11 100644
--- a/gcc/tree-complex.c
+++ b/gcc/tree-complex.c
@@ -978,6 +978,43 @@ expand_complex_addition (gimple_stmt_iterator *gsi, tree inner_type,
}
/* Expand a complex multiplication or division to a libcall to the c99
+ compliant routines. Unlike expand_complex_libcall create and insert
+ the call, assign it to an output variable and return that rather than
+ modifying existing statements in place. */
+
+static tree
+insert_complex_mult_libcall (gimple_stmt_iterator *gsi, tree type, tree ar,
+ tree ai, tree br, tree bi)
+{
+ machine_mode mode;
+ built_in_function bcode;
+ tree fn, lhs;
+ gcall *stmt;
+
+
+ mode = TYPE_MODE (type);
+ gcc_assert (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT);
+
+ bcode = ((built_in_function)
+ (BUILT_IN_COMPLEX_MUL_MIN + mode - MIN_MODE_COMPLEX_FLOAT));
+
+ fn = builtin_decl_explicit (bcode);
+
+ stmt = gimple_build_call (fn, 4, ar, ai, br, bi);
+ lhs = create_tmp_var (type);
+ gimple_call_set_lhs (stmt, lhs);
+ if (gimple_in_ssa_p (cfun))
+ {
+ lhs = make_ssa_name (lhs, stmt);
+ gimple_call_set_lhs (stmt, lhs);
+ }
+ update_stmt (stmt);
+ gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
+
+ return lhs;
+}
+
+/* Expand a complex multiplication or division to a libcall to the c99
compliant routines. */
static void
@@ -1025,6 +1062,35 @@ expand_complex_libcall (gimple_stmt_iterator *gsi, tree ar, tree ai,
}
}
+/* Perform a complex multiplication assuming limited range on two
+ complex constants A, B represented by AR, AI, BR, BI of type TYPE.
+ The operation we want is: a * b = (ar*br - ai*bi) + i(ar*bi + br*ai).
+ Insert the GIMPLE statements into GSI. Store the real and imaginary
+ components of the result into RR and RI. */
+
+static void
+expand_complex_multiplication_limited_range (gimple_stmt_iterator *gsi,
+ tree type, tree ar, tree ai,
+ tree br, tree bi,
+ tree *rr, tree *ri)
+{
+ tree t1, t2, t3, t4;
+
+ t1 = gimplify_build2 (gsi, MULT_EXPR, type, ar, br);
+ t2 = gimplify_build2 (gsi, MULT_EXPR, type, ai, bi);
+ t3 = gimplify_build2 (gsi, MULT_EXPR, type, ar, bi);
+
+ /* Avoid expanding redundant multiplication for the common
+ case of squaring a complex number. */
+ if (ar == br && ai == bi)
+ t4 = t3;
+ else
+ t4 = gimplify_build2 (gsi, MULT_EXPR, type, ai, br);
+
+ *rr = gimplify_build2 (gsi, MINUS_EXPR, type, t1, t2);
+ *ri = gimplify_build2 (gsi, PLUS_EXPR, type, t3, t4);
+}
+
/* Expand complex multiplication to scalars:
a * b = (ar*br - ai*bi) + i(ar*bi + br*ai)
*/
@@ -1080,27 +1146,98 @@ expand_complex_multiplication (gimple_stmt_iterator *gsi, tree inner_type,
case PAIR (VARYING, VARYING):
if (flag_complex_method == 2 && SCALAR_FLOAT_TYPE_P (inner_type))
{
- expand_complex_libcall (gsi, ar, ai, br, bi, MULT_EXPR);
- return;
- }
- else
- {
- tree t1, t2, t3, t4;
-
- t1 = gimplify_build2 (gsi, MULT_EXPR, inner_type, ar, br);
- t2 = gimplify_build2 (gsi, MULT_EXPR, inner_type, ai, bi);
- t3 = gimplify_build2 (gsi, MULT_EXPR, inner_type, ar, bi);
+ /* If optimizing for size or not at all just do a libcall. */
+ if (optimize == 0 || optimize_function_for_size_p (cfun))
+ {
+ expand_complex_libcall (gsi, ar, ai, br, bi, MULT_EXPR);
+ return;
+ }
- /* Avoid expanding redundant multiplication for the common
- case of squaring a complex number. */
- if (ar == br && ai == bi)
- t4 = t3;
+ /* Else, expand x = a * b into
+ x = (ar*br - ai*bi) + i(ar*bi + br*ai);
+ if (isunordered (__real__ x, __imag__ x))
+ x = __muldc3 (a, b); */
+
+ /* Get the top-level complex type. We'll need it soon. */
+ tree top_type = TREE_TYPE (gimple_assign_lhs (gsi_stmt (*gsi)));
+
+ tree tmpr, tmpi;
+ expand_complex_multiplication_limited_range (gsi, inner_type, ar,
+ ai, br, bi, &tmpr,
+ &tmpi);
+
+ tree isunordered_decl = builtin_decl_explicit (BUILT_IN_ISUNORDERED);
+ tree isunordered_res = create_tmp_var (integer_type_node);
+ gimple *tmpr_unord_check
+ = gimple_build_call (isunordered_decl, 2, tmpr, tmpi);
+ gimple_call_set_lhs (tmpr_unord_check, isunordered_res);
+
+ gsi_insert_before (gsi, tmpr_unord_check, GSI_SAME_STMT);
+ gimple *check
+ = gimple_build_cond (NE_EXPR, isunordered_res, integer_zero_node,
+ NULL_TREE, NULL_TREE);
+
+ basic_block orig_bb = gsi_bb (*gsi);
+ /* We want to keep track of the original complex multiplication
+ statement as we're going to modify it later in
+ update_complex_assignment. Make sure that insert_cond_bb leaves
+ that statement in the join block. */
+ gsi_prev (gsi);
+ basic_block cond_bb
+ = insert_cond_bb (gsi_bb (*gsi), gsi_stmt (*gsi), check,
+ profile_probability::very_unlikely ());
+
+
+ gimple_stmt_iterator cond_bb_gsi = gsi_last_bb (cond_bb);
+ gsi_insert_after (&cond_bb_gsi, gimple_build_nop (), GSI_NEW_STMT);
+
+ tree libcall_res
+ = insert_complex_mult_libcall (&cond_bb_gsi, top_type, ar, ai, br,
+ bi);
+ tree cond_real = gimplify_build1 (&cond_bb_gsi, REALPART_EXPR,
+ inner_type, libcall_res);
+ tree cond_imag = gimplify_build1 (&cond_bb_gsi, IMAGPART_EXPR,
+ inner_type, libcall_res);
+
+ basic_block join_bb = single_succ_edge (cond_bb)->dest;
+ *gsi = gsi_start_nondebug_after_labels_bb (join_bb);
+
+ /* We have a conditional block with some assignments in cond_bb.
+ Wire up the PHIs to wrap up. */
+ if (gimple_in_ssa_p (cfun))
+ {
+ rr = create_tmp_var (inner_type);
+ rr = make_ssa_name (rr);
+ ri = create_tmp_var (inner_type);
+ ri = make_ssa_name (ri);
+ edge cond_to_join = single_succ_edge (cond_bb);
+ edge orig_to_join = find_edge (orig_bb, join_bb);
+
+ gphi *real_phi = create_phi_node (rr, gsi_bb (*gsi));
+ add_phi_arg (real_phi, cond_real, cond_to_join,
+ UNKNOWN_LOCATION);
+ add_phi_arg (real_phi, tmpr, orig_to_join, UNKNOWN_LOCATION);
+
+ gphi *imag_phi = create_phi_node (ri, gsi_bb (*gsi));
+ add_phi_arg (imag_phi, cond_imag, cond_to_join,
+ UNKNOWN_LOCATION);
+ add_phi_arg (imag_phi, tmpi, orig_to_join, UNKNOWN_LOCATION);
+ }
else
- t4 = gimplify_build2 (gsi, MULT_EXPR, inner_type, ai, br);
-
- rr = gimplify_build2 (gsi, MINUS_EXPR, inner_type, t1, t2);
- ri = gimplify_build2 (gsi, PLUS_EXPR, inner_type, t3, t4);
+ {
+ gimple *stmt = gimple_build_assign (tmpr, cond_real);
+ gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
+ stmt = gimple_build_assign (tmpi, cond_imag);
+ gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
+ rr = tmpr;
+ ri = tmpi;
+ }
}
+ else
+ /* If we are not worrying about NaNs expand to
+ (ar*br - ai*bi) + i(ar*bi + br*ai) directly. */
+ expand_complex_multiplication_limited_range (gsi, inner_type, ar, ai,
+ br, bi, &rr, &ri);
break;
default: