[PATCH] Fix PR18589
- From: "William J. Schmidt" <wschmidt at linux dot vnet dot ibm dot com>
- To: gcc-patches at gcc dot gnu dot org
- Cc: bergner at vnet dot ibm dot com
- Date: Tue, 06 Mar 2012 14:49:14 -0600
- Subject: [PATCH] Fix PR18589
Hi,
This is a re-post of the patch I posted for comments in January to
address http://gcc.gnu.org/bugzilla/show_bug.cgi?id=18589.  The patch
modifies reassociation to expose repeated factors hidden in
__builtin_pow* calls, reassociate those factors optimally, and, where
possible, reconstitute __builtin_powi calls from the results of
reassociation.
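As a rough illustration (mine, not part of the patch): with
-O3 -ffast-math, the intent is that a multiply chain such as

  double baz (double x, double y)
  {
    return x * x * x * x * y * y * y * y;
  }

is recognized during early reassociation as (x * y) raised to the
fourth power and rewritten along the lines of

  t = x * y;
  return __builtin_powi (t, 4);

which the existing powi expansion then lowers to three multiplies in
total (the multiply count the new pr18589-1.c test below checks for).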
Bootstrapped and regression-tested on powerpc64-linux-gnu.  I expect
some small changes may still be needed, but I am targeting this for
trunk approval.
Thanks very much for the review,
Bill
gcc:
2012-03-06 Bill Schmidt <wschmidt@linux.vnet.ibm.com>
* tree-pass.h: Replace pass_reassoc with pass_early_reassoc and
pass_late_reassoc.
* passes.c (init_optimization_passes): Change pass_reassoc calls to
pass_early_reassoc and pass_late_reassoc.
* tree-ssa-reassoc.c (reassociate_stats): Add two fields.
(early_reassoc): New static var.
(MAX_POW_EXPAND): New #define'd constant.
(linear_expand_pow_common): New function.
(linear_expand_powi): Likewise.
(linear_expand_pow): Likewise.
(break_up_subtract_bb): Attempt to expand __builtin_pow[i].
(repeat_factor_d): New struct and associated typedefs.
(repeat_factor_vec): New static vector variable.
(compare_repeat_factors): New function.
(get_reassoc_pow_ssa_name): Likewise.
(attempt_builtin_powi): Likewise.
(reassociate_bb): Attempt to reconstitute __builtin_powi calls, and
multiply their results by any leftover reassociated factors.
(fini_reassoc): Two new calls to statistics_counter_event.
(execute_early_reassoc): New function.
(execute_late_reassoc): Likewise.
(pass_early_reassoc): Replace pass_reassoc, renamed to reassoc1,
call execute_early_reassoc.
(pass_late_reassoc): New gimple_opt_pass named reassoc2 that calls
execute_late_reassoc.
gcc/testsuite:
2012-03-06 Bill Schmidt <wschmidt@linux.vnet.ibm.com>
* gcc.dg/pr46309.c: Change -fdump-tree-reassoc-details to
-fdump-tree-reassoc[12]-details.
* gcc.dg/tree-ssa/pr18589-1.c: New test.
* gcc.dg/tree-ssa/pr18589-2.c: Likewise.
* gcc.dg/tree-ssa/pr18589-3.c: Likewise.
* gcc.dg/tree-ssa/pr18589-4.c: Likewise.
* gcc.dg/tree-ssa/pr18589-5.c: Likewise.
* gcc.dg/tree-ssa/pr18589-6.c: Likewise.
* gcc.dg/tree-ssa/pr18589-7.c: Likewise.
* gcc.dg/tree-ssa/pr18589-8.c: Likewise.
Index: gcc/tree-pass.h
===================================================================
--- gcc/tree-pass.h (revision 184997)
+++ gcc/tree-pass.h (working copy)
@@ -440,7 +440,8 @@ extern struct gimple_opt_pass pass_copy_prop;
extern struct gimple_opt_pass pass_vrp;
extern struct gimple_opt_pass pass_uncprop;
extern struct gimple_opt_pass pass_return_slot;
-extern struct gimple_opt_pass pass_reassoc;
+extern struct gimple_opt_pass pass_early_reassoc;
+extern struct gimple_opt_pass pass_late_reassoc;
extern struct gimple_opt_pass pass_rebuild_cgraph_edges;
extern struct gimple_opt_pass pass_remove_cgraph_callee_edges;
extern struct gimple_opt_pass pass_build_cgraph_edges;
Index: gcc/testsuite/gcc.dg/pr46309.c
===================================================================
--- gcc/testsuite/gcc.dg/pr46309.c (revision 184997)
+++ gcc/testsuite/gcc.dg/pr46309.c (working copy)
@@ -1,6 +1,6 @@
/* PR tree-optimization/46309 */
/* { dg-do compile } */
-/* { dg-options "-O2 -fdump-tree-reassoc-details" } */
+/* { dg-options "-O2 -fdump-tree-reassoc1-details -fdump-tree-reassoc2-details" } */
/* The transformation depends on BRANCH_COST being greater than 1
(see the notes in the PR), so try to force that. */
/* { dg-additional-options "-mtune=octeon2" { target mips*-*-* } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-4.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-4.c (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-4.c (revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y, double z, double u)
+{
+ return x * x * y * y * y * z * z * z * z * u;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 7 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-5.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-5.c (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-5.c (revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y, double z, double u)
+{
+ return x * x * x * y * y * y * z * z * z * z * u * u * u * u;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 6 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-6.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-6.c (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-6.c (revision 0)
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y)
+{
+ return __builtin_pow (x, -4.0) * __builtin_pow (y, -4.0);
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
+/* { dg-final { scan-tree-dump-times " / " 1 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-7.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-7.c (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-7.c (revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+float baz (float x, float y)
+{
+ return x * x * x * x * y * y * y * y;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-8.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-8.c (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-8.c (revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+long double baz (long double x, long double y)
+{
+ return x * x * x * x * y * y * y * y;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-1.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-1.c (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-1.c (revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y)
+{
+ return x * x * x * x * y * y * y * y;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-2.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-2.c (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-2.c (revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y)
+{
+ return x * y * y * x * y * x * x * y;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-3.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-3.c (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-3.c (revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y, double z)
+{
+ return x * x * y * y * y * z * z * z * z;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 6 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/passes.c
===================================================================
--- gcc/passes.c (revision 184997)
+++ gcc/passes.c (working copy)
@@ -1325,7 +1325,7 @@ init_optimization_passes (void)
opportunities. */
NEXT_PASS (pass_phi_only_cprop);
NEXT_PASS (pass_dse);
- NEXT_PASS (pass_reassoc);
+ NEXT_PASS (pass_early_reassoc);
NEXT_PASS (pass_dce);
NEXT_PASS (pass_forwprop);
NEXT_PASS (pass_phiopt);
@@ -1377,7 +1377,7 @@ init_optimization_passes (void)
}
NEXT_PASS (pass_lower_vector_ssa);
NEXT_PASS (pass_cse_reciprocals);
- NEXT_PASS (pass_reassoc);
+ NEXT_PASS (pass_late_reassoc);
NEXT_PASS (pass_vrp);
NEXT_PASS (pass_dominator);
/* The only const/copy propagation opportunities left after
Index: gcc/tree-ssa-reassoc.c
===================================================================
--- gcc/tree-ssa-reassoc.c (revision 184997)
+++ gcc/tree-ssa-reassoc.c (working copy)
@@ -53,6 +53,9 @@ along with GCC; see the file COPYING3. If not see
1. Breaking up subtract operations into addition + negate, where
it would promote the reassociation of adds.
+ 1a. During the same pass, expanding __builtin_pow variants with
+ small integer exponents into linearized multiply trees.
+
2. Left linearization of the expression trees, so that (A+B)+(C+D)
becomes (((A+B)+C)+D), which is easier for us to rewrite later.
During linearization, we place the operands of the binary
@@ -61,6 +64,10 @@ along with GCC; see the file COPYING3. If not see
3. Optimization of the operand lists, eliminating things like a +
-a, a & a, etc.
+ 3a. Combine repeated factors with the same occurrence counts
+ into a __builtin_powi call that will later be optimized into
+ an optimal number of multiplies.
+
4. Rewrite the expression trees we linearized and optimized so
they are in proper rank order.
@@ -169,6 +176,8 @@ static struct
int constants_eliminated;
int ops_eliminated;
int rewritten;
+ int pows_expanded;
+ int pows_created;
} reassociate_stats;
/* Operator, rank pair. */
@@ -190,6 +199,12 @@ static int next_operand_entry_id;
depth. */
static long *bb_rank;
+/* Distinguish between early and late reassociation passes. Early
+ reassociation expands and rebuilds __builtin_pow* calls. This
+ is not done in late reassociation to preserve the builtin
+ optimizations performed in pass_cse_sincos. */
+static bool early_reassoc;
+
/* Operand->rank hashtable. */
static struct pointer_map_t *operand_rank;
@@ -2759,6 +2774,164 @@ can_reassociate_p (tree op)
return false;
}
+/* Define a limit on the exponent of a __builtin_pow or __builtin_powi
+ that will be converted into a linear chain of multiplies prior to
+ reassociation. */
+
+#define MAX_POW_EXPAND 32
+
+/* Given STMT at *GSIP that is a __builtin_pow or __builtin_powi
+ operation with base ARG0 and integer power EXPONENT, transform
+ it into a linear chain of multiplies, provided that EXPONENT is
+ of sufficiently small magnitude. */
+static void
+linear_expand_pow_common (gimple stmt, gimple_stmt_iterator *gsip,
+ tree arg0, HOST_WIDE_INT exponent)
+{
+ tree lhs = gimple_call_lhs (stmt);
+ HOST_WIDE_INT abs_exp = abs (exponent);
+ location_t loc = gimple_location (stmt);
+ gimple mul_stmt = NULL;
+ gimple div_stmt = NULL;
+ tree result, type, target;
+ gimple copy_stmt;
+
+ if (abs_exp > MAX_POW_EXPAND)
+ return;
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file, "Expanding builtin pow/powi: ");
+ print_gimple_stmt (dump_file, stmt, 0, 0);
+ }
+
+ reassociate_stats.pows_expanded++;
+
+ type = TREE_TYPE (arg0);
+
+ if (exponent == 0)
+ result = build_real (type, dconst1);
+ else if (exponent == 1)
+ result = arg0;
+ else
+ {
+ tree target_ssa;
+
+ target = create_tmp_reg (type, "reassocpow");
+ add_referenced_var (target);
+
+ if (exponent == -1)
+ result = arg0;
+ else
+ {
+ unsigned i;
+
+ target_ssa = make_ssa_name (target, NULL);
+ mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, target_ssa,
+ arg0, arg0);
+ gimple_set_location (mul_stmt, loc);
+ gsi_insert_before (gsip, mul_stmt, GSI_SAME_STMT);
+
+ for (i = abs_exp - 2; i > 0; i--)
+ {
+ tree prev_target_ssa = target_ssa;
+ target_ssa = make_ssa_name (target, NULL);
+ mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, target_ssa,
+ prev_target_ssa, arg0);
+ gimple_set_location (mul_stmt, loc);
+ gsi_insert_before (gsip, mul_stmt, GSI_SAME_STMT);
+ }
+
+ result = target_ssa;
+ }
+
+ /* Possible TODO: If we expand two __builtin_pow's with different
+ negative exponents, the introduction of the RDIVs for inversion
+ prevents combining their factors. (If they have the same negative
+ exponent, expression folding combines them as expected.) I'm
+ not worrying about this as it should be quite rare in practice. */
+ if (exponent < 0)
+ {
+ target_ssa = make_ssa_name (target, NULL);
+ div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target_ssa,
+ build_real (type, dconst1),
+ result);
+ gimple_set_location (div_stmt, loc);
+ gsi_insert_before (gsip, div_stmt, GSI_SAME_STMT);
+ result = target_ssa;
+ }
+ }
+
+ /* If we introduced multiplies but no inversion, avoid introducing a
+ copy so that the copy doesn't truncate a larger reassociation chain. */
+ if (mul_stmt && !div_stmt)
+ {
+ unlink_stmt_vdef (stmt);
+ gimple_set_lhs (mul_stmt, lhs);
+ gsi_remove (gsip, true);
+ update_stmt (mul_stmt);
+ /* If we're at the end of the block, leave the iterator in a
+ usable state. */
+ if (gsi_end_p (*gsip))
+ *gsip = gsi_for_stmt (mul_stmt);
+ }
+ else
+ {
+ copy_stmt = gimple_build_assign (lhs, result);
+ gimple_set_location (copy_stmt, loc);
+ unlink_stmt_vdef (stmt);
+ gsi_replace (gsip, copy_stmt, true);
+ }
+}
+
+/* Transform __builtin_powi into a linear chain of multiplies,
+ if the exponent is of sufficiently small magnitude. */
+
+static void
+linear_expand_powi (gimple stmt, gimple_stmt_iterator *gsip)
+{
+ tree arg0 = gimple_call_arg (stmt, 0);
+ tree arg1 = gimple_call_arg (stmt, 1);
+ HOST_WIDE_INT exponent;
+
+ if (TREE_CODE (arg1) != INTEGER_CST)
+ return;
+
+ if (host_integerp (arg1, 0))
+ {
+ exponent = TREE_INT_CST_LOW (arg1);
+ linear_expand_pow_common (stmt, gsip, arg0, exponent);
+ }
+}
+
+/* Transform __builtin_pow into a linear chain of multiplies,
+ if the exponent is constant and equivalent to an integer of
+ sufficiently small magnitude. */
+
+static void
+linear_expand_pow (gimple stmt, gimple_stmt_iterator *gsip)
+{
+ tree arg0 = gimple_call_arg (stmt, 0);
+ tree arg1 = gimple_call_arg (stmt, 1);
+ REAL_VALUE_TYPE c, cint;
+ HOST_WIDE_INT n;
+
+ /* The exponent must be a constant equivalent to an integer that
+ fits in a HOST_WIDE_INT. */
+ if (TREE_CODE (arg1) != REAL_CST)
+ return;
+
+ c = TREE_REAL_CST (arg1);
+ if (REAL_EXP (&c) > HOST_BITS_PER_WIDE_INT)
+ return;
+
+ n = real_to_integer (&c);
+ real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
+
+ if (real_identical (&c, &cint))
+ linear_expand_pow_common (stmt, gsip, arg0, n);
+}
+
/* Break up subtract operations in block BB.
We do this top down because we don't know whether the subtract is
@@ -2784,9 +2957,32 @@ break_up_subtract_bb (basic_block bb)
for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
{
+ tree fndecl;
gimple stmt = gsi_stmt (gsi);
gimple_set_visited (stmt, false);
+ /* Look for __builtin_pow and __builtin_powi calls,
+ and expand them to multiplies if possible. */
+ if (early_reassoc
+ && flag_unsafe_math_optimizations
+ && is_gimple_call (stmt)
+ && (fndecl = gimple_call_fndecl (stmt))
+ && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
+ {
+ switch (DECL_FUNCTION_CODE (fndecl))
+ {
+ CASE_FLT_FN (BUILT_IN_POW):
+ linear_expand_pow (stmt, &gsi);
+ break;
+ CASE_FLT_FN (BUILT_IN_POWI):
+ linear_expand_powi (stmt, &gsi);
+ break;
+ default:
+ ;
+ }
+ continue;
+ }
+
if (!is_gimple_assign (stmt)
|| !can_reassociate_p (gimple_assign_lhs (stmt)))
continue;
@@ -2815,6 +3011,342 @@ break_up_subtract_bb (basic_block bb)
break_up_subtract_bb (son);
}
+/* Used for repeated factor analysis. */
+struct repeat_factor_d
+{
+ /* An SSA name that occurs in a multiply chain. */
+ tree factor;
+
+ /* Cached rank of the factor. */
+ unsigned rank;
+
+ /* Number of occurrences of the factor in the chain. */
+ HOST_WIDE_INT count;
+
+ /* An SSA name representing the product of this factor and
+ all factors appearing later in the repeated factor vector. */
+ tree repr;
+};
+
+typedef struct repeat_factor_d repeat_factor, *repeat_factor_t;
+typedef const struct repeat_factor_d *const_repeat_factor_t;
+
+DEF_VEC_O (repeat_factor);
+DEF_VEC_ALLOC_O (repeat_factor, heap);
+
+static VEC (repeat_factor, heap) *repeat_factor_vec;
+
+/* Used for sorting the repeat factor vector. Sort primarily by
+ ascending occurrence count, secondarily by descending rank. */
+
+static int
+compare_repeat_factors (const void *x1, const void *x2)
+{
+ const_repeat_factor_t rf1 = (const_repeat_factor_t) x1;
+ const_repeat_factor_t rf2 = (const_repeat_factor_t) x2;
+
+ if (rf1->count != rf2->count)
+ return rf1->count - rf2->count;
+
+ return rf2->rank - rf1->rank;
+}
+
+/* Get a new SSA name for register variable *TARGET of type TYPE.
+ If *TARGET is null or incompatible with TYPE, create the variable
+ first. */
+
+static tree
+get_reassoc_pow_ssa_name (tree *target, tree type)
+{
+ if (!*target || !types_compatible_p (type, TREE_TYPE (*target)))
+ {
+ *target = create_tmp_reg (type, "reassocpow");
+ add_referenced_var (*target);
+ }
+
+ return make_ssa_name (*target, NULL);
+}
+
+/* Look for repeated operands in OPS in the multiply tree rooted at
+ STMT. Replace them with an optimal sequence of multiplies and powi
+ builtin calls, and remove the used operands from OPS. Return an
+ SSA name representing the value of the replacement sequence. */
+
+static tree
+attempt_builtin_powi (gimple stmt, VEC(operand_entry_t, heap) **ops,
+ tree *target)
+{
+ unsigned i, j, cached, vec_len;
+ int ii;
+ operand_entry_t oe;
+ repeat_factor_t rf1, rf2;
+ repeat_factor rfnew;
+ tree target_ssa;
+ tree result = NULL_TREE;
+ tree type = TREE_TYPE (gimple_get_lhs (stmt));
+ tree powi_fndecl = mathfn_built_in (type, BUILT_IN_POWI);
+ gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
+ gimple mul_stmt, pow_stmt;
+
+ /* Nothing to do if BUILT_IN_POWI doesn't exist for this type and
+ target. */
+ if (!powi_fndecl)
+ return NULL_TREE;
+
+ /* Allocate the repeated factor vector. */
+ repeat_factor_vec = VEC_alloc (repeat_factor, heap, 10);
+
+ /* Scan the OPS vector for all SSA names in the product and build
+ up a vector of occurrence counts for each factor. */
+ FOR_EACH_VEC_ELT (operand_entry_t, *ops, i, oe)
+ {
+ if (TREE_CODE (oe->op) == SSA_NAME)
+ {
+ FOR_EACH_VEC_ELT (repeat_factor, repeat_factor_vec, j, rf1)
+ {
+ if (rf1->factor == oe->op)
+ {
+ rf1->count++;
+ break;
+ }
+ }
+
+ if (j >= VEC_length (repeat_factor, repeat_factor_vec))
+ {
+ rfnew.factor = oe->op;
+ rfnew.rank = oe->rank;
+ rfnew.count = 1;
+ rfnew.repr = NULL_TREE;
+ VEC_safe_push (repeat_factor, heap, repeat_factor_vec, &rfnew);
+ }
+ }
+ }
+
+ vec_len = VEC_length (repeat_factor, repeat_factor_vec);
+
+ /* Sort the repeated factor vector by (a) increasing occurrence count,
+ and (b) decreasing rank. */
+ VEC_qsort (repeat_factor, repeat_factor_vec, compare_repeat_factors);
+
+ /* There are a variety of ways we could combine factors with different
+ occurrence counts. It seems best in practice to try to combine as
+ many base factors as possible into a product before applying
+ builtin_powi to the result. The sort order chosen for the repeated
+ factor vector allows us to cache partial results for the product
+ of the base factors for subsequent use.
+
+ As an example, consider x * x * y * y * y * y * z * z * z * z.
+ We want to first compose the product x * y * z, raise it to the
+ second power, and then multiply this by the product y * z raised
+ to the second power. This can be done in 5 multiplies provided
+ we cache y * z for use in both expressions:
+
+ t1 = y * z
+ t2 = t1 * x
+ t3 = t2 * t2
+ t4 = t1 * t1
+ result = t3 * t4
+
+ Alternatively we could also do this in 5 multiplies by first
+ calculating y * z, squaring it twice, and multiplying by x * x.
+ However, if the occurrence counts were not powers of 2 as in
+ this example, combining higher occurrence counts first would
+ require more multiplies than combining lower ones first. */
+
+ /* Repeatedly look for opportunities to create a builtin_powi call. */
+ while (true)
+ {
+ HOST_WIDE_INT power;
+
+ /* Find the first factor in the repeated factor vector whose
+ occurrence count is at least 2. If no such factor exists,
+ there are no builtin_powi opportunities remaining. */
+ FOR_EACH_VEC_ELT (repeat_factor, repeat_factor_vec, j, rf1)
+ {
+ if (rf1->count >= 2)
+ break;
+ }
+
+ if (j >= vec_len)
+ break;
+
+ power = rf1->count;
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ unsigned elt;
+ repeat_factor_t rf;
+ fputs ("Building __builtin_pow call for (", dump_file);
+ for (elt = j; elt < vec_len; elt++)
+ {
+ rf = VEC_index (repeat_factor, repeat_factor_vec, elt);
+ print_generic_expr (dump_file, rf->factor, 0);
+ if (elt < vec_len - 1)
+ fputs (" * ", dump_file);
+ }
+ fprintf (dump_file, ")^%ld\n", power);
+ }
+
+ reassociate_stats.pows_created++;
+
+ /* Visit each element of the vector in reverse order (so that
+ high-occurrence elements are visited first, and within the
+ same occurrence count, lower-ranked elements are visited
+ first). Form a linear product of all elements in this order
+	 whose occurrence count is at least that of element J.
+ Record the SSA name representing the product of each element
+ with all subsequent elements in the vector. */
+ if (j == vec_len - 1)
+ rf1->repr = rf1->factor;
+ else
+ {
+ for (ii = vec_len - 2; ii >= (int)j; ii--)
+ {
+ tree op1, op2;
+
+ rf1 = VEC_index (repeat_factor, repeat_factor_vec, ii);
+ rf2 = VEC_index (repeat_factor, repeat_factor_vec, ii + 1);
+
+ /* Init the last factor's representative to be itself. */
+ if (!rf2->repr)
+ rf2->repr = rf2->factor;
+
+ op1 = rf1->factor;
+ op2 = rf2->repr;
+
+ target_ssa = get_reassoc_pow_ssa_name (target, type);
+ mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, target_ssa,
+ op1, op2);
+ gimple_set_location (mul_stmt, gimple_location (stmt));
+ gsi_insert_before (&gsi, mul_stmt, GSI_SAME_STMT);
+ rf1->repr = target_ssa;
+ }
+ }
+
+ /* Form a call to __builtin_powi for the maximum product
+ just formed, raised to the power obtained earlier. */
+ rf1 = VEC_index (repeat_factor, repeat_factor_vec, j);
+ target_ssa = get_reassoc_pow_ssa_name (target, type);
+ pow_stmt = gimple_build_call (powi_fndecl, 2, rf1->repr,
+ build_int_cst (integer_type_node, power));
+ gimple_call_set_lhs (pow_stmt, target_ssa);
+ gimple_set_location (pow_stmt, gimple_location (stmt));
+ gsi_insert_before (&gsi, pow_stmt, GSI_SAME_STMT);
+
+ /* Decrement the occurrence count of each element in the product
+ by the count found above, and remove this many copies of each
+ factor from OPS. */
+ for (i = j; i < vec_len; i++)
+ {
+ unsigned k = power;
+ unsigned n;
+
+ rf1 = VEC_index (repeat_factor, repeat_factor_vec, i);
+ rf1->count -= power;
+
+ FOR_EACH_VEC_ELT_REVERSE (operand_entry_t, *ops, n, oe)
+ {
+ if (oe->op == rf1->factor)
+ {
+ VEC_ordered_remove (operand_entry_t, *ops, n);
+ if (--k == 0)
+ break;
+ }
+ }
+ }
+
+ /* If we previously formed at least one other builtin_powi call,
+ form the product of this one and those others. */
+ if (result)
+ {
+ tree new_target_ssa = get_reassoc_pow_ssa_name (target, type);
+ mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, new_target_ssa,
+ target_ssa, result);
+ gimple_set_location (mul_stmt, gimple_location (stmt));
+ gsi_insert_before (&gsi, mul_stmt, GSI_SAME_STMT);
+ result = new_target_ssa;
+ }
+ else
+ result = target_ssa;
+ }
+
+ /* At this point all elements in the repeated factor vector have a
+ remaining occurrence count of 0 or 1. Scanning the vector in
+ reverse order, find the last element with a 1 before a 0 is found.
+ If this element has an SSA representative and is not the last
+ element, then it represents a multiply we have already calculated.
+ Multiply the result so far by this SSA name. Remove one copy of
+ each factor in the product from OPS. */
+ cached = vec_len;
+
+ for (ii = vec_len - 1; ii >= 0; ii--)
+ {
+ rf1 = VEC_index (repeat_factor, repeat_factor_vec, ii);
+ if (rf1->count == 0)
+ {
+ cached = ii + 1;
+ break;
+ }
+ }
+
+ if (cached < vec_len)
+ {
+ rf1 = VEC_index (repeat_factor, repeat_factor_vec, cached);
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ unsigned elt;
+ repeat_factor_t rf;
+ fputs ("Multiplying by cached product ", dump_file);
+ for (elt = cached; elt < vec_len; elt++)
+ {
+ rf = VEC_index (repeat_factor, repeat_factor_vec, elt);
+ print_generic_expr (dump_file, rf->factor, 0);
+ if (elt < vec_len - 1)
+ fputs (" * ", dump_file);
+ }
+ fputs ("\n", dump_file);
+ }
+
+ if (!result)
+ result = rf1->repr;
+ else
+ {
+ tree new_target_ssa = get_reassoc_pow_ssa_name (target, type);
+ mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, new_target_ssa,
+ rf1->repr, result);
+ gimple_set_location (mul_stmt, gimple_location (stmt));
+ gsi_insert_before (&gsi, mul_stmt, GSI_SAME_STMT);
+ result = new_target_ssa;
+ }
+ }
+
+ for (i = cached; i < vec_len; i++)
+ {
+ unsigned n;
+
+ rf1 = VEC_index (repeat_factor, repeat_factor_vec, i);
+ rf1->count--;
+
+ FOR_EACH_VEC_ELT_REVERSE (operand_entry_t, *ops, n, oe)
+ {
+ if (oe->op == rf1->factor)
+ {
+ VEC_ordered_remove (operand_entry_t, *ops, n);
+ break;
+ }
+ }
+ }
+
+ /* Clean up. */
+ VEC_free (repeat_factor, heap, repeat_factor_vec);
+
+ /* Return the final product as computed herein. Note that there
+ may still be some elements with single occurrence count left
+ in OPS; those will be handled by the normal reassociation logic. */
+ return result;
+}
+
/* Reassociate expressions in basic block BB and its post-dominator as
children. */
@@ -2823,6 +3355,7 @@ reassociate_bb (basic_block bb)
{
gimple_stmt_iterator gsi;
basic_block son;
+ tree target = NULL_TREE;
for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
{
@@ -2883,6 +3416,8 @@ reassociate_bb (basic_block bb)
if (associative_tree_code (rhs_code))
{
+ tree powi_result = NULL_TREE;
+
VEC(operand_entry_t, heap) *ops = NULL;
/* There may be no immediate uses left by the time we
@@ -2904,7 +3439,14 @@ reassociate_bb (basic_block bb)
if (rhs_code == BIT_IOR_EXPR || rhs_code == BIT_AND_EXPR)
optimize_range_tests (rhs_code, &ops);
- if (VEC_length (operand_entry_t, ops) == 1)
+ if (early_reassoc
+ && rhs_code == MULT_EXPR
+ && flag_unsafe_math_optimizations)
+ powi_result = attempt_builtin_powi (stmt, &ops, &target);
+
+ /* If the operand vector is now empty, all operands were
+ consumed by the __builtin_pow optimization. */
+ if (VEC_length (operand_entry_t, ops) == 0)
{
if (dump_file && (dump_flags & TDF_DETAILS))
{
@@ -2913,9 +3455,7 @@ reassociate_bb (basic_block bb)
}
rhs1 = gimple_assign_rhs1 (stmt);
- gimple_assign_set_rhs_from_tree (&gsi,
- VEC_last (operand_entry_t,
- ops)->op);
+ gimple_assign_set_rhs_from_tree (&gsi, powi_result);
update_stmt (stmt);
remove_visited_stmt_chain (rhs1);
@@ -2925,6 +3465,39 @@ reassociate_bb (basic_block bb)
print_gimple_stmt (dump_file, stmt, 0, 0);
}
}
+
+ else if (VEC_length (operand_entry_t, ops) == 1)
+ {
+ tree last_op = VEC_last (operand_entry_t, ops)->op;
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file, "Transforming ");
+ print_gimple_stmt (dump_file, stmt, 0, 0);
+ }
+
+ if (powi_result)
+ {
+ rhs1 = gimple_assign_rhs1 (stmt);
+ gimple_assign_set_rhs_with_ops_1 (&gsi, MULT_EXPR,
+ powi_result, last_op,
+ NULL_TREE);
+ update_stmt (gsi_stmt (gsi));
+ remove_visited_stmt_chain (rhs1);
+ }
+ else
+ {
+ rhs1 = gimple_assign_rhs1 (stmt);
+ gimple_assign_set_rhs_from_tree (&gsi, last_op);
+ update_stmt (stmt);
+ remove_visited_stmt_chain (rhs1);
+ }
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file, " into ");
+ print_gimple_stmt (dump_file, stmt, 0, 0);
+ }
+ }
else
{
enum machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
@@ -2940,6 +3513,24 @@ reassociate_bb (basic_block bb)
rewrite_expr_tree_parallel (stmt, width, ops);
else
rewrite_expr_tree (stmt, 0, ops, false);
+
+ /* If we combined some repeated factors into a
+ __builtin_powi call, multiply that result by the
+ reassociated operands. */
+ if (powi_result)
+ {
+ gimple mul_stmt;
+ tree type = TREE_TYPE (gimple_get_lhs (stmt));
+ tree target_ssa = get_reassoc_pow_ssa_name (&target,
+ type);
+ gimple_set_lhs (stmt, target_ssa);
+ update_stmt (stmt);
+ mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, lhs,
+ powi_result,
+ target_ssa);
+ gimple_set_location (mul_stmt, gimple_location (stmt));
+ gsi_insert_after (&gsi, mul_stmt, GSI_NEW_STMT);
+ }
}
VEC_free (operand_entry_t, heap, ops);
@@ -3054,6 +3645,10 @@ fini_reassoc (void)
reassociate_stats.ops_eliminated);
statistics_counter_event (cfun, "Statements rewritten",
reassociate_stats.rewritten);
+ statistics_counter_event (cfun, "Built-in pow[i] calls expanded",
+ reassociate_stats.pows_expanded);
+ statistics_counter_event (cfun, "Built-in powi calls created",
+ reassociate_stats.pows_created);
pointer_map_destroy (operand_rank);
free_alloc_pool (operand_entry_pool);
@@ -3077,19 +3672,33 @@ execute_reassoc (void)
return 0;
}
+static unsigned int
+execute_early_reassoc (void)
+{
+ early_reassoc = true;
+ return execute_reassoc ();
+}
+
+static unsigned int
+execute_late_reassoc (void)
+{
+ early_reassoc = false;
+ return execute_reassoc ();
+}
+
static bool
gate_tree_ssa_reassoc (void)
{
return flag_tree_reassoc != 0;
}
-struct gimple_opt_pass pass_reassoc =
+struct gimple_opt_pass pass_early_reassoc =
{
{
GIMPLE_PASS,
- "reassoc", /* name */
+ "reassoc1", /* name */
gate_tree_ssa_reassoc, /* gate */
- execute_reassoc, /* execute */
+ execute_early_reassoc, /* execute */
NULL, /* sub */
NULL, /* next */
0, /* static_pass_number */
@@ -3103,3 +3712,24 @@ gate_tree_ssa_reassoc (void)
| TODO_ggc_collect /* todo_flags_finish */
}
};
+
+struct gimple_opt_pass pass_late_reassoc =
+{
+ {
+ GIMPLE_PASS,
+ "reassoc2", /* name */
+ gate_tree_ssa_reassoc, /* gate */
+ execute_late_reassoc, /* execute */
+ NULL, /* sub */
+ NULL, /* next */
+ 0, /* static_pass_number */
+ TV_TREE_REASSOC, /* tv_id */
+ PROP_cfg | PROP_ssa, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ TODO_verify_ssa
+ | TODO_verify_flow
+ | TODO_ggc_collect /* todo_flags_finish */
+ }
+};