This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH] Improve cost computations in IVOPTS
- From: Eric Botcazou <ebotcazou at adacore dot com>
- To: gcc-patches at gcc dot gnu dot org
- Date: Mon, 25 May 2009 12:55:53 +0200
- Subject: [PATCH] Improve cost computations in IVOPTS
Hi,
for loops with non-constant iteration origin, e.g.:
extern char arr1[];
int foo1(char v, int start, int end)
{
int i;
for (i = start; i < end; i++)
arr1[i] = v;
}
extern int arr2[];
int foo2(int v, int start, int end)
{
int i;
for (i = start; i < end; i++)
arr2[i] = v;
}
IVOPS generates code that uses one more register than needed on x86 at -O:
.L3:
movb %bl, (%edx)
addl $1, %eax
addl $1, %edx
cmpl %eax, %ecx
jg .L3
.L8:
movl %ebx, (%edx)
addl $1, %eax
addl $4, %edx
cmpl %eax, %ecx
jg .L8
i.e. 3 registers for the iteration instead of 2. That's problematic for very
nested inner loops.
The problem comes for the cost computed for the IV associated with the use:
use 0
address
in statement arr1[i_12] = v_5(D);
at position arr1[i_12]
type char *
base &arr1[start_2(D)]
step 1
base object (void *) &arr1
related candidates
Use 0:
cand cost compl. depends on
0 19 1 1
1 27 2 1
2 27 1 1
3 27 1 1
4 2 0
candidate 1 (important)
incremented before exit test
type unsigned int
base (unsigned int) (start_2(D) + 1)
step 1
candidate 2 (important)
incremented before exit test
type unsigned int
base (unsigned int) start_2(D)
step 1
candidate 3 (important)
original biv
type unsigned int
base (unsigned int) start_2(D)
step 1
The costs for candidates 1, 2 and 3 are greatly skewed because the compiler
doesn't correctly evaluate the cost of &arr1[start_2(D)] - start_2(D).
The attached patch is aimed at fixing this. It generates for the above loops:
.L3:
movb %cl, arr1(%eax)
addl $1, %eax
cmpl %eax, %edx
jg .L3
.L8:
movl %ecx, arr2(,%eax,4)
addl $1, %eax
cmpl %eax, %edx
jg .L8
Tested on i586-suse-linux, OK for mainline?
2009-05-25 Eric Botcazou <ebotcazou@adacore.com>
* tree-ssa-loop-ivopts.c (strip_offset_1) <MULT_EXPR>: New case.
(force_expr_to_var_cost) <NEGATE_EXPR>: Likewise.
(ptr_difference_cost): Use affine combinations to compute it.
(difference_cost): Likewise.
(get_computation_cost_at): Compute more accurate cost for addresses
if the ratio is a multiplier allowed in addresses.
For non-addresses, consider that an additional offset or symbol is
added only once.
--
Eric Botcazou
Index: tree-ssa-loop-ivopts.c
===================================================================
--- tree-ssa-loop-ivopts.c (revision 147814)
+++ tree-ssa-loop-ivopts.c (working copy)
@@ -1922,6 +1922,24 @@ strip_offset_1 (tree expr, bool inside_a
return fold_convert (orig_type, expr);
+ case MULT_EXPR:
+ op1 = TREE_OPERAND (expr, 1);
+ if (TREE_CODE (op1) != INTEGER_CST)
+ return orig_expr;
+
+ op0 = TREE_OPERAND (expr, 0);
+ op0 = strip_offset_1 (op0, false, false, &off0);
+ if (op0 == TREE_OPERAND (expr, 0))
+ return orig_expr;
+
+ *offset = off0 * TREE_INT_CST_LOW (op1);
+ if (integer_zerop (op0))
+ expr = op0;
+ else
+ expr = fold_build2 (MULT_EXPR, type, op0, op1);
+
+ return fold_convert (orig_type, expr);
+
case ARRAY_REF:
case ARRAY_RANGE_REF:
if (!inside_addr)
@@ -3302,6 +3320,19 @@ force_expr_to_var_cost (tree expr, bool
break;
+ case NEGATE_EXPR:
+ op0 = TREE_OPERAND (expr, 0);
+ STRIP_NOPS (op0);
+ op1 = NULL_TREE;
+
+ if (is_gimple_val (op0))
+ cost0 = zero_cost;
+ else
+ cost0 = force_expr_to_var_cost (op0, speed);
+
+ cost1 = zero_cost;
+ break;
+
default:
/* Just an arbitrary value, FIXME. */
return new_cost (target_spill_cost[speed], 0);
@@ -3313,6 +3344,7 @@ force_expr_to_var_cost (tree expr, bool
case POINTER_PLUS_EXPR:
case PLUS_EXPR:
case MINUS_EXPR:
+ case NEGATE_EXPR:
cost = new_cost (add_cost (mode, speed), 0);
break;
@@ -3415,8 +3447,8 @@ ptr_difference_cost (struct ivopts_data
unsigned HOST_WIDE_INT *offset, bitmap *depends_on)
{
HOST_WIDE_INT diff = 0;
- comp_cost cost;
- bool speed = optimize_loop_for_speed_p (data->current_loop);
+ aff_tree aff_e1, aff_e2;
+ tree type;
gcc_assert (TREE_CODE (e1) == ADDR_EXPR);
@@ -3434,12 +3466,14 @@ ptr_difference_cost (struct ivopts_data
*symbol_present = false;
*var_present = true;
-
- cost = force_var_cost (data, e1, depends_on);
- cost = add_costs (cost, force_var_cost (data, e2, depends_on));
- cost.cost += add_cost (Pmode, speed);
- return cost;
+ type = signed_type_for (TREE_TYPE (e1));
+ tree_to_aff_combination (e1, type, &aff_e1);
+ tree_to_aff_combination (e2, type, &aff_e2);
+ aff_combination_scale (&aff_e2, double_int_minus_one);
+ aff_combination_add (&aff_e1, &aff_e2);
+
+ return force_var_cost (data, aff_combination_to_tree (&aff_e1), depends_on);
}
/* Estimates cost of expressing difference E1 - E2 as
@@ -3453,9 +3487,10 @@ difference_cost (struct ivopts_data *dat
tree e1, tree e2, bool *symbol_present, bool *var_present,
unsigned HOST_WIDE_INT *offset, bitmap *depends_on)
{
- comp_cost cost;
enum machine_mode mode = TYPE_MODE (TREE_TYPE (e1));
unsigned HOST_WIDE_INT off1, off2;
+ aff_tree aff_e1, aff_e2;
+ tree type;
e1 = strip_offset (e1, &off1);
e2 = strip_offset (e2, &off2);
@@ -3465,8 +3500,8 @@ difference_cost (struct ivopts_data *dat
STRIP_NOPS (e2);
if (TREE_CODE (e1) == ADDR_EXPR)
- return ptr_difference_cost (data, e1, e2, symbol_present, var_present, offset,
- depends_on);
+ return ptr_difference_cost (data, e1, e2, symbol_present, var_present,
+ offset, depends_on);
*symbol_present = false;
if (operand_equal_p (e1, e2, 0))
@@ -3474,23 +3509,26 @@ difference_cost (struct ivopts_data *dat
*var_present = false;
return zero_cost;
}
+
*var_present = true;
+
if (integer_zerop (e2))
return force_var_cost (data, e1, depends_on);
if (integer_zerop (e1))
{
- cost = force_var_cost (data, e2, depends_on);
+ comp_cost cost = force_var_cost (data, e2, depends_on);
cost.cost += multiply_by_cost (-1, mode, data->speed);
-
return cost;
}
- cost = force_var_cost (data, e1, depends_on);
- cost = add_costs (cost, force_var_cost (data, e2, depends_on));
- cost.cost += add_cost (mode, data->speed);
+ type = signed_type_for (TREE_TYPE (e1));
+ tree_to_aff_combination (e1, type, &aff_e1);
+ tree_to_aff_combination (e2, type, &aff_e2);
+ aff_combination_scale (&aff_e2, double_int_minus_one);
+ aff_combination_add (&aff_e1, &aff_e2);
- return cost;
+ return force_var_cost (data, aff_combination_to_tree (&aff_e1), depends_on);
}
/* Determines the cost of the computation by that USE is expressed
@@ -3511,7 +3549,6 @@ get_computation_cost_at (struct ivopts_d
HOST_WIDE_INT ratio, aratio;
bool var_present, symbol_present;
comp_cost cost;
- unsigned n_sums;
double_int rat;
bool speed = optimize_bb_for_speed_p (gimple_bb (at));
@@ -3544,7 +3581,7 @@ get_computation_cost_at (struct ivopts_d
return infinite_cost;
}
- if (TYPE_PRECISION (utype) != TYPE_PRECISION (ctype))
+ if (TYPE_PRECISION (utype) < TYPE_PRECISION (ctype))
{
/* TODO -- add direct handling of this case. */
goto fallback;
@@ -3569,6 +3606,9 @@ get_computation_cost_at (struct ivopts_d
else
return infinite_cost;
+ STRIP_NOPS (cbase);
+ ctype = TREE_TYPE (cbase);
+
/* use = ubase + ratio * (var - cbase). If either cbase is a constant
or ratio == 1, it is better to handle this like
@@ -3591,6 +3631,18 @@ get_computation_cost_at (struct ivopts_d
&symbol_present, &var_present, &offset,
depends_on);
}
+ else if (address_p
+ && !POINTER_TYPE_P (ctype)
+ && multiplier_allowed_in_address_p (ratio,
+ TYPE_MODE (TREE_TYPE (utype))))
+ {
+ cbase
+ = fold_build2 (MULT_EXPR, ctype, cbase, build_int_cst (ctype, ratio));
+ cost = difference_cost (data,
+ ubase, cbase,
+ &symbol_present, &var_present, &offset,
+ depends_on);
+ }
else
{
cost = force_var_cost (data, cbase, depends_on);
@@ -3608,39 +3660,38 @@ get_computation_cost_at (struct ivopts_d
offset -= ratio * cstepi;
/* Now the computation is in shape symbol + var1 + const + ratio * var2.
- (symbol/var/const parts may be omitted). If we are looking for an address,
- find the cost of addressing this. */
+ (symbol/var1/const parts may be omitted). If we are looking for an
+ address, find the cost of addressing this. */
if (address_p)
- return add_costs (cost, get_address_cost (symbol_present, var_present,
- offset, ratio,
- TYPE_MODE (TREE_TYPE (*use->op_p)), speed));
+ return add_costs (cost,
+ get_address_cost (symbol_present, var_present,
+ offset, ratio,
+ TYPE_MODE (TREE_TYPE (utype)), speed));
/* Otherwise estimate the costs for computing the expression. */
- aratio = ratio > 0 ? ratio : -ratio;
if (!symbol_present && !var_present && !offset)
{
if (ratio != 1)
cost.cost += multiply_by_cost (ratio, TYPE_MODE (ctype), speed);
-
return cost;
}
- if (aratio != 1)
- cost.cost += multiply_by_cost (aratio, TYPE_MODE (ctype), speed);
-
- n_sums = 1;
- if (var_present
- /* Symbol + offset should be compile-time computable. */
- && (symbol_present || offset))
- n_sums++;
+ /* Symbol + offset should be compile-time computable so consider that they
+ are added once to the variable, if present. */
+ if (var_present && (symbol_present || offset))
+ cost.cost += add_cost (TYPE_MODE (ctype), speed)
+ / AVG_LOOP_NITER (data->current_loop);
/* Having offset does not affect runtime cost in case it is added to
symbol, but it increases complexity. */
if (offset)
cost.complexity++;
- cost.cost += n_sums * add_cost (TYPE_MODE (ctype), speed);
- return cost;
+ cost.cost += add_cost (TYPE_MODE (ctype), speed);
+
+ aratio = ratio > 0 ? ratio : -ratio;
+ if (aratio != 1)
+ cost.cost += multiply_by_cost (aratio, TYPE_MODE (ctype), speed);
fallback:
{