This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] widening_mul: Do cost check when propagating mult into plus/minus expressions


Hi,

the widening_mul pass might increase the number of multiplications in
the code by transforming

a = b * c
d = a + 2
e = a + 3

into:

d = b * c + 2
e = b * c + 3

under the assumption that an FMA instruction is not more expensive
than a simple add.  This certainly isn't always true.  While e.g. on
s390 an fma is indeed not slower than an add execution-wise it has
disadvantages regarding instruction grouping.  It doesn't group with
any other instruction what has a major impact on the instruction
dispatch bandwidth.

The following patch tries to figure out the costs for adds, mults and
fmas by building an RTX and asking the backends cost function in order
to estimate whether it is whorthwhile doing the transformation.

With that patch the 436.cactus hotloop contains 28 less
multiplications than before increasing performance slightly (~2%).

Bootstrapped and regtested on x86_64 and s390x.

Bye,

-Andreas-

2011-07-13  Andreas Krebbel  <Andreas.Krebbel@de.ibm.com>

	* tree-ssa-math-opts.c (compute_costs): New function.
	(convert_mult_to_fma): Take costs into account when propagating
	multiplications into several additions.
	* config/s390/s390.c (z196_costs): Adjust costs for madbr and
	maebr.


Index: gcc/tree-ssa-math-opts.c
===================================================================
*** gcc/tree-ssa-math-opts.c.orig
--- gcc/tree-ssa-math-opts.c
*************** convert_plusminus_to_widen (gimple_stmt_
*** 2185,2190 ****
--- 2185,2252 ----
    return true;
  }
  
+ /* Computing the costs for calculating RTX with CODE in MODE.  */
+ 
+ static unsigned
+ compute_costs (enum machine_mode mode, enum rtx_code code, bool speed)
+ {
+   rtx seq;
+   rtx set;
+   unsigned cost = 0;
+ 
+   start_sequence ();
+ 
+   switch (GET_RTX_LENGTH (code))
+     {
+     case 2:
+       force_operand (gen_rtx_fmt_ee (code, mode,
+ 		       gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 1),
+ 		       gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 2)),
+ 		     NULL_RTX);
+       break;
+     case 3:
+       /* FMA expressions are not handled by force_operand.  */
+       expand_ternary_op (mode, fma_optab,
+ 			 gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 1),
+ 			 gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 2),
+ 			 gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 3),
+ 			 NULL_RTX, false);
+       break;
+     default:
+       gcc_unreachable ();
+     }
+ 
+   seq = get_insns ();
+   end_sequence ();
+ 
+   if (dump_file && (dump_flags & TDF_DETAILS))
+     {
+       fprintf (dump_file, "Calculating costs of %s in %s mode.  Sequence is:\n",
+ 	       GET_RTX_NAME (code), GET_MODE_NAME (mode));
+       print_rtl (dump_file, seq);
+     }
+ 
+   for (; seq; seq = NEXT_INSN (seq))
+     {
+       set = single_set (seq);
+       if (set)
+ 	cost += rtx_cost (set, SET, speed);
+       else
+ 	cost++;
+     }
+ 
+   /* If the backend returns a cost of zero it is most certainly lying.
+      Set this to one in order to notice that we already calculated it
+      once.  */
+   cost = cost ? cost : 1;
+ 
+   if (dump_file && (dump_flags & TDF_DETAILS))
+     fprintf (dump_file, "%s in %s costs %d\n\n",
+ 	       GET_RTX_NAME (code), GET_MODE_NAME (mode), cost);
+ 
+   return cost;
+ }
+ 
  /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
     with uses in additions and subtractions to form fused multiply-add
     operations.  Returns true if successful and MUL_STMT should be removed.  */
*************** convert_mult_to_fma (gimple mul_stmt, tr
*** 2197,2202 ****
--- 2259,2270 ----
    gimple use_stmt, neguse_stmt, fma_stmt;
    use_operand_p use_p;
    imm_use_iterator imm_iter;
+   enum machine_mode mode;
+   int uses = 0;
+   bool speed = optimize_bb_for_speed_p (gimple_bb (mul_stmt));
+   static unsigned mul_cost[NUM_MACHINE_MODES];
+   static unsigned add_cost[NUM_MACHINE_MODES];
+   static unsigned fma_cost[NUM_MACHINE_MODES];
  
    if (FLOAT_TYPE_P (type)
        && flag_fp_contract_mode == FP_CONTRACT_OFF)
*************** convert_mult_to_fma (gimple mul_stmt, tr
*** 2213,2222 ****
    if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
      return false;
  
    /* Make sure that the multiplication statement becomes dead after
!      the transformation, thus that all uses are transformed to FMAs.
!      This means we assume that an FMA operation has the same cost
!      as an addition.  */
    FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
      {
        enum tree_code use_code;
--- 2281,2297 ----
    if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
      return false;
  
+   mode = TYPE_MODE (type);
+ 
+   if (!fma_cost[mode])
+     {
+       fma_cost[mode] = compute_costs (mode, FMA, speed);
+       add_cost[mode] = compute_costs (mode, PLUS, speed);
+       mul_cost[mode] = compute_costs (mode, MULT, speed);
+     }
+ 
    /* Make sure that the multiplication statement becomes dead after
!      the transformation, thus that all uses are transformed to FMAs.  */
    FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
      {
        enum tree_code use_code;
*************** convert_mult_to_fma (gimple mul_stmt, tr
*** 2292,2297 ****
--- 2367,2373 ----
        if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
  	return false;
  
+       uses++;
        /* While it is possible to validate whether or not the exact form
  	 that we've recognized is available in the backend, the assumption
  	 is that the transformation is never a loss.  For instance, suppose
*************** convert_mult_to_fma (gimple mul_stmt, tr
*** 2302,2307 ****
--- 2378,2389 ----
  	 independant and could be run in parallel.  */
      }
  
+   /* Calculate the costs of moving the multiplication into all the
+      minus/plus expressions.  */
+ 
+   if (uses * fma_cost[mode] > uses * add_cost[mode] + mul_cost[mode])
+     return false;
+ 
    FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
      {
        gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
Index: gcc/config/s390/s390.c
===================================================================
*** gcc/config/s390/s390.c.orig
--- gcc/config/s390/s390.c
*************** struct processor_costs z196_cost =
*** 242,249 ****
    COSTS_N_INSNS (100),   /* SQXBR B+100 */
    COSTS_N_INSNS (42),    /* SQDBR B+42 */
    COSTS_N_INSNS (28),    /* SQEBR B+28 */
!   COSTS_N_INSNS (1),     /* MADBR B */
!   COSTS_N_INSNS (1),     /* MAEBR B */
    COSTS_N_INSNS (101),   /* DXBR B+101 */
    COSTS_N_INSNS (29),    /* DDBR */
    COSTS_N_INSNS (22),    /* DEBR */
--- 242,250 ----
    COSTS_N_INSNS (100),   /* SQXBR B+100 */
    COSTS_N_INSNS (42),    /* SQDBR B+42 */
    COSTS_N_INSNS (28),    /* SQEBR B+28 */
!   /* Cheaper than a mul+add but more expensive then a single mul/add.  */
!   COSTS_N_INSNS (1) + COSTS_N_INSNS (1) / 2, /* MADBR B */
!   COSTS_N_INSNS (1) + COSTS_N_INSNS (1) / 2, /* MAEBR B */
    COSTS_N_INSNS (101),   /* DXBR B+101 */
    COSTS_N_INSNS (29),    /* DDBR */
    COSTS_N_INSNS (22),    /* DEBR */


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]