This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH] Add FMA_EXPR, un-cse multiplications before expansion
On Fri, 22 Oct 2010, Joseph S. Myers wrote:
> On Fri, 22 Oct 2010, Richard Guenther wrote:
>
> > Joseph, I'm waiting for your last option handling patch to go in
> > to adjust -ffast-math to include -ffp-contract. I'm also not
>
> Right now I don't have any uncommitted options patches.
>
> > sure how to do the tristate on/off/fast at the moment.
>
> The current way is code in the option handler that checks for different
> string arguments, as done for -fexcess-precision= (a two-state option
> where the option design allows for other versions such as "none" to be
> added if anyone wants them).
>
> (I do intend at some point to add a generic facility for such options in
> .opt files; there's a lot of duplication of similar code to handle such
> options, especially in all the targets looking up possible -march values.
> Given a generic facility, diagnostics for invalid values can all tell you
> what the valid values are, as can --help.)
--help= seems to be broken for me at the moment.
Here's my current patch in case anyone plans to do fma related work
elsewhere.
Richard.
2010-10-22 Richard Guenther <rguenther@suse.de>
* tree.def (FMA_EXPR): New tree code.
* expr.c (expand_expr_real_2): Add FMA_EXPR expansion code.
* gimple.c (gimple_rhs_class_table): FMA_EXPR is a GIMPLE_TERNARY_RHS.
* tree-cfg.c (verify_gimple_assign_ternary): Verify FMA_EXPR types.
* tree-inline.c (estimate_operator_cost): Handle FMA_EXPR.
* gimple-pretty-print.c (dump_ternary_rhs): Likewise.
* tree-ssa-math-opts.c (convert_mult_to_fma): New function.
(execute_optimize_widening_mul): Call it. Reorganize to allow
dead stmt removal. Move TODO flags ...
(pass_optimize_widening_mul): ... here.
* common.opt (-ffp-contract): New option.
* opts.c (common_handle_option): Handle it.
(set_unsafe_math_optimizations_flags): Enable -ffp-contract=fast.
* doc/invoke.texi (-ffp-contract): Document.
(-funsafe-math-optimizations): Adjust.
* gcc.target/i386/fma4-vector-2.c: New testcase.
Index: gcc/tree.def
===================================================================
*** gcc/tree.def.orig 2010-10-22 16:43:20.000000000 +0200
--- gcc/tree.def 2010-10-22 16:48:01.000000000 +0200
*************** DEFTREECODE (WIDEN_MULT_PLUS_EXPR, "wide
*** 1092,1097 ****
--- 1092,1103 ----
is subtracted from t3. */
DEFTREECODE (WIDEN_MULT_MINUS_EXPR, "widen_mult_minus_expr", tcc_expression, 3)
+ /* Fused multiply-add.
+ All operands and the result are of the same type. No intermediate
+ rounding is performed after multiplying operand one with operand two
+ before adding operand three. */
+ DEFTREECODE (FMA_EXPR, "fma_expr", tcc_expression, 3)
+
/* Whole vector left/right shift in bits.
Operand 0 is a vector to be shifted.
Operand 1 is an integer shift amount in bits. */
Index: gcc/expr.c
===================================================================
*** gcc/expr.c.orig 2010-10-22 16:43:20.000000000 +0200
--- gcc/expr.c 2010-10-22 16:48:01.000000000 +0200
*************** expand_expr_real_2 (sepops ops, rtx targ
*** 7254,7260 ****
int ignore;
bool reduce_bit_field;
location_t loc = ops->location;
! tree treeop0, treeop1;
#define REDUCE_BIT_FIELD(expr) (reduce_bit_field \
? reduce_to_bit_field_precision ((expr), \
target, \
--- 7254,7260 ----
int ignore;
bool reduce_bit_field;
location_t loc = ops->location;
! tree treeop0, treeop1, treeop2;
#define REDUCE_BIT_FIELD(expr) (reduce_bit_field \
? reduce_to_bit_field_precision ((expr), \
target, \
*************** expand_expr_real_2 (sepops ops, rtx targ
*** 7267,7272 ****
--- 7267,7273 ----
treeop0 = ops->op0;
treeop1 = ops->op1;
+ treeop2 = ops->op2;
/* We should be called only on simple (binary or unary) expressions,
exactly those that are valid in gimple expressions that aren't
*************** expand_expr_real_2 (sepops ops, rtx targ
*** 7624,7630 ****
case WIDEN_MULT_PLUS_EXPR:
case WIDEN_MULT_MINUS_EXPR:
expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
! op2 = expand_normal (ops->op2);
target = expand_widen_pattern_expr (ops, op0, op1, op2,
target, unsignedp);
return target;
--- 7625,7631 ----
case WIDEN_MULT_PLUS_EXPR:
case WIDEN_MULT_MINUS_EXPR:
expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
! op2 = expand_normal (treeop2);
target = expand_widen_pattern_expr (ops, op0, op1, op2,
target, unsignedp);
return target;
*************** expand_expr_real_2 (sepops ops, rtx targ
*** 7711,7716 ****
--- 7712,7745 ----
expand_operands (treeop0, treeop1, subtarget, &op0, &op1, EXPAND_NORMAL);
return REDUCE_BIT_FIELD (expand_mult (mode, op0, op1, target, unsignedp));
+ case FMA_EXPR:
+ {
+ gimple def;
+ def = get_def_for_expr (treeop0, NEGATE_EXPR);
+ if (def)
+ {
+ op0 = expand_normal (gimple_assign_rhs1 (def));
+ op0 = force_reg (mode, op0);
+ op0 = gen_rtx_NEG (mode, op0);
+ }
+ else
+ op0 = expand_expr (treeop0, subtarget, VOIDmode, EXPAND_NORMAL);
+ op1 = expand_normal (treeop1);
+ def = get_def_for_expr (treeop2, NEGATE_EXPR);
+ if (def)
+ {
+ op2 = expand_normal (gimple_assign_rhs1 (def));
+ op2 = force_reg (mode, op2);
+ op2 = gen_rtx_NEG (mode, op2);
+ }
+ else
+ op2 = expand_normal (treeop2);
+ /* ??? Building the explicit negs above doesn't help if the
+ fma<mode>4 expander doesn't accept it. */
+ return expand_ternary_op (TYPE_MODE (type), fma_optab,
+ op0, op1, op2, target, 0);
+ }
+
case MULT_EXPR:
/* If this is a fixed-point operation, then we cannot use the code
below because "expand_mult" doesn't support sat/no-sat fixed-point
Index: gcc/gimple.c
===================================================================
*** gcc/gimple.c.orig 2010-10-22 16:43:20.000000000 +0200
--- gcc/gimple.c 2010-10-22 16:48:01.000000000 +0200
*************** get_gimple_rhs_num_ops (enum tree_code c
*** 2528,2534 ****
|| (SYM) == TRUTH_XOR_EXPR) ? GIMPLE_BINARY_RHS \
: (SYM) == TRUTH_NOT_EXPR ? GIMPLE_UNARY_RHS \
: ((SYM) == WIDEN_MULT_PLUS_EXPR \
! || (SYM) == WIDEN_MULT_MINUS_EXPR) ? GIMPLE_TERNARY_RHS \
: ((SYM) == COND_EXPR \
|| (SYM) == CONSTRUCTOR \
|| (SYM) == OBJ_TYPE_REF \
--- 2528,2535 ----
|| (SYM) == TRUTH_XOR_EXPR) ? GIMPLE_BINARY_RHS \
: (SYM) == TRUTH_NOT_EXPR ? GIMPLE_UNARY_RHS \
: ((SYM) == WIDEN_MULT_PLUS_EXPR \
! || (SYM) == WIDEN_MULT_MINUS_EXPR \
! || (SYM) == FMA_EXPR) ? GIMPLE_TERNARY_RHS \
: ((SYM) == COND_EXPR \
|| (SYM) == CONSTRUCTOR \
|| (SYM) == OBJ_TYPE_REF \
Index: gcc/tree-cfg.c
===================================================================
*** gcc/tree-cfg.c.orig 2010-10-22 16:43:20.000000000 +0200
--- gcc/tree-cfg.c 2010-10-22 16:48:01.000000000 +0200
*************** verify_gimple_assign_ternary (gimple stm
*** 3748,3753 ****
--- 3748,3767 ----
}
break;
+ case FMA_EXPR:
+ if (!useless_type_conversion_p (lhs_type, rhs1_type)
+ || !useless_type_conversion_p (lhs_type, rhs2_type)
+ || !useless_type_conversion_p (lhs_type, rhs3_type))
+ {
+ error ("type mismatch in fused multiply-add expression");
+ debug_generic_expr (lhs_type);
+ debug_generic_expr (rhs1_type);
+ debug_generic_expr (rhs2_type);
+ debug_generic_expr (rhs3_type);
+ return true;
+ }
+ break;
+
default:
gcc_unreachable ();
}
Index: gcc/tree-inline.c
===================================================================
*** gcc/tree-inline.c.orig 2010-10-22 16:43:20.000000000 +0200
--- gcc/tree-inline.c 2010-10-22 16:48:01.000000000 +0200
*************** estimate_operator_cost (enum tree_code c
*** 3284,3289 ****
--- 3284,3290 ----
case POINTER_PLUS_EXPR:
case MINUS_EXPR:
case MULT_EXPR:
+ case FMA_EXPR:
case ADDR_SPACE_CONVERT_EXPR:
case FIXED_CONVERT_EXPR:
Index: gcc/gimple-pretty-print.c
===================================================================
*** gcc/gimple-pretty-print.c.orig 2010-10-22 16:43:20.000000000 +0200
--- gcc/gimple-pretty-print.c 2010-10-22 16:48:01.000000000 +0200
*************** dump_ternary_rhs (pretty_printer *buffer
*** 400,405 ****
--- 400,413 ----
pp_character (buffer, '>');
break;
+ case FMA_EXPR:
+ dump_generic_node (buffer, gimple_assign_rhs1 (gs), spc, flags, false);
+ pp_string (buffer, " * ");
+ dump_generic_node (buffer, gimple_assign_rhs2 (gs), spc, flags, false);
+ pp_string (buffer, " + ");
+ dump_generic_node (buffer, gimple_assign_rhs3 (gs), spc, flags, false);
+ break;
+
default:
gcc_unreachable ();
}
Index: gcc/tree-ssa-math-opts.c
===================================================================
*** gcc/tree-ssa-math-opts.c.orig 2010-10-22 16:43:20.000000000 +0200
--- gcc/tree-ssa-math-opts.c 2010-10-22 16:48:01.000000000 +0200
*************** convert_plusminus_to_widen (gimple_stmt_
*** 1494,1499 ****
--- 1494,1604 ----
return true;
}
+ /* Combine the multiplication at MUL_STMT with uses in additions and
+ subtractions to form fused multiply-add operations. Returns true
+ if successful and MUL_STMT should be removed. */
+
+ static bool
+ convert_mult_to_fma (gimple mul_stmt)
+ {
+ tree mul_result = gimple_assign_lhs (mul_stmt);
+ tree type = TREE_TYPE (mul_result);
+ gimple use_stmt, fma_stmt;
+ use_operand_p use_p;
+ imm_use_iterator imm_iter;
+
+ if (FLOAT_TYPE_P (type)
+ && !flag_fp_contract)
+ return false;
+
+ /* If the target doesn't support it, don't generate it.
+ ??? We have no way of querying support for the various variants
+ with negated operands, so for the following we simply assume
+ they are all available ((-a)*b+c, a*b-c and (-a)*b-c). */
+ if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
+ return false;
+
+ /* We don't want to do bitfield reduction ops. */
+ if (INTEGRAL_TYPE_P (type)
+ && (TYPE_PRECISION (type)
+ != GET_MODE_PRECISION (TYPE_MODE (type))))
+ return false;
+
+ /* Make sure that the multiplication statement becomes dead after
+ the transformation, that is, that all uses are transformed to FMAs.
+ This means we assume that an FMA operation has the same cost
+ as an addition. */
+ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
+ {
+ enum tree_code use_code;
+
+ use_stmt = USE_STMT (use_p);
+
+ if (!is_gimple_assign (use_stmt))
+ return false;
+ use_code = gimple_assign_rhs_code (use_stmt);
+ /* ??? Handle NEGATE_EXPR. */
+ if (use_code != PLUS_EXPR
+ && use_code != MINUS_EXPR)
+ return false;
+
+ /* We can't handle a * b + a * b. */
+ if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
+ return false;
+
+ /* For now restrict this operation to single basic blocks. In theory
+ we would want to support sinking the multiplication in
+ m = a*b;
+ if ()
+ ma = m + c;
+ else
+ d = m;
+ to form a fma in the then block and sink the multiplication to the
+ else block. */
+ if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
+ return false;
+ }
+
+ FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
+ {
+ tree addop, mulop1;
+ gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
+
+ mulop1 = gimple_assign_rhs1 (mul_stmt);
+ if (gimple_assign_rhs1 (use_stmt) == mul_result)
+ {
+ addop = gimple_assign_rhs2 (use_stmt);
+ /* a * b - c -> a * b + (-c) */
+ if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
+ addop = force_gimple_operand_gsi (&gsi,
+ build1 (NEGATE_EXPR,
+ type, addop),
+ true, NULL_TREE, true,
+ GSI_SAME_STMT);
+ }
+ else
+ {
+ addop = gimple_assign_rhs1 (use_stmt);
+ /* a - b * c -> (-b) * c + a */
+ if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
+ mulop1 = force_gimple_operand_gsi (&gsi,
+ build1 (NEGATE_EXPR,
+ type, mulop1),
+ true, NULL_TREE, true,
+ GSI_SAME_STMT);
+ }
+
+ fma_stmt = gimple_build_assign_with_ops3 (FMA_EXPR,
+ gimple_assign_lhs (use_stmt),
+ mulop1,
+ gimple_assign_rhs2 (mul_stmt),
+ addop);
+ gsi_replace (&gsi, fma_stmt, true);
+ }
+
+ return true;
+ }
+
/* Find integer multiplications where the operands are extended from
smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
where appropriate. */
*************** convert_plusminus_to_widen (gimple_stmt_
*** 1501,1531 ****
static unsigned int
execute_optimize_widening_mul (void)
{
- bool changed = false;
basic_block bb;
FOR_EACH_BB (bb)
{
gimple_stmt_iterator gsi;
! for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
{
gimple stmt = gsi_stmt (gsi);
enum tree_code code;
! if (!is_gimple_assign (stmt))
! continue;
!
! code = gimple_assign_rhs_code (stmt);
! if (code == MULT_EXPR)
! changed |= convert_mult_to_widen (stmt);
! else if (code == PLUS_EXPR || code == MINUS_EXPR)
! changed |= convert_plusminus_to_widen (&gsi, stmt, code);
}
}
! return (changed ? TODO_dump_func | TODO_update_ssa | TODO_verify_ssa
! | TODO_verify_stmts : 0);
}
static bool
--- 1606,1650 ----
static unsigned int
execute_optimize_widening_mul (void)
{
basic_block bb;
FOR_EACH_BB (bb)
{
gimple_stmt_iterator gsi;
! for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
{
gimple stmt = gsi_stmt (gsi);
enum tree_code code;
! if (is_gimple_assign (stmt))
! {
! code = gimple_assign_rhs_code (stmt);
! switch (code)
! {
! case MULT_EXPR:
! if (!convert_mult_to_widen (stmt)
! && convert_mult_to_fma (stmt))
! {
! gsi_remove (&gsi, true);
! release_defs (stmt);
! continue;
! }
! break;
!
! case PLUS_EXPR:
! case MINUS_EXPR:
! convert_plusminus_to_widen (&gsi, stmt, code);
! break;
!
! default:;
! }
! }
! gsi_next (&gsi);
}
}
! return 0;
}
static bool
*************** struct gimple_opt_pass pass_optimize_wid
*** 1549,1554 ****
0, /* properties_provided */
0, /* properties_destroyed */
0, /* todo_flags_start */
! 0 /* todo_flags_finish */
}
};
--- 1668,1676 ----
0, /* properties_provided */
0, /* properties_destroyed */
0, /* todo_flags_start */
! TODO_verify_ssa
! | TODO_verify_stmts
! | TODO_dump_func
! | TODO_update_ssa /* todo_flags_finish */
}
};
Index: gcc/testsuite/gcc.target/i386/fma4-vector-2.c
===================================================================
*** /dev/null 1970-01-01 00:00:00.000000000 +0000
--- gcc/testsuite/gcc.target/i386/fma4-vector-2.c 2010-10-22 16:48:01.000000000 +0200
***************
*** 0 ****
--- 1,21 ----
+ /* { dg-do compile } */
+ /* { dg-require-effective-target lp64 } */
+ /* { dg-options "-O2 -mfma4 -ftree-vectorize -mtune=generic" } */
+
+ float r[256], s[256];
+ float x[256];
+ float y[256];
+ float z[256];
+
+ void foo (void)
+ {
+ int i;
+ for (i = 0; i < 256; ++i)
+ {
+ r[i] = x[i] * y[i] - z[i];
+ s[i] = x[i] * y[i] + z[i];
+ }
+ }
+
+ /* { dg-final { scan-assembler "vfmaddps" } } */
+ /* { dg-final { scan-assembler "vfmsubps" } } */
Index: gcc/common.opt
===================================================================
*** gcc/common.opt.orig 2010-10-22 16:43:20.000000000 +0200
--- gcc/common.opt 2010-10-22 16:48:24.000000000 +0200
*************** fforward-propagate
*** 842,847 ****
--- 842,851 ----
Common Report Var(flag_forward_propagate) Optimization
Perform a forward propagation pass on RTL
+ ffp-contract=
+ Common Joined RejectNegative Var(flag_fp_contract)
+ -ffp-contract=[on|off|fast] Perform floating-point expression contraction.
+
; Nonzero means don't put addresses of constant functions in registers.
; Used for compiling the Unix kernel, where strange substitutions are
; done on the assembly output.
Index: gcc/doc/invoke.texi
===================================================================
*** gcc/doc/invoke.texi.orig 2010-10-22 16:43:20.000000000 +0200
--- gcc/doc/invoke.texi 2010-10-22 16:48:01.000000000 +0200
*************** Objective-C and Objective-C++ Dialects}.
*** 342,348 ****
-fdelayed-branch -fdelete-null-pointer-checks -fdse -fdse @gol
-fearly-inlining -fipa-sra -fexpensive-optimizations -ffast-math @gol
-ffinite-math-only -ffloat-store -fexcess-precision=@var{style} @gol
! -fforward-propagate -ffunction-sections @gol
-fgcse -fgcse-after-reload -fgcse-las -fgcse-lm -fgraphite-identity @gol
-fgcse-sm -fif-conversion -fif-conversion2 -findirect-inlining @gol
-finline-functions -finline-functions-called-once -finline-limit=@var{n} @gol
--- 342,348 ----
-fdelayed-branch -fdelete-null-pointer-checks -fdse -fdse @gol
-fearly-inlining -fipa-sra -fexpensive-optimizations -ffast-math @gol
-ffinite-math-only -ffloat-store -fexcess-precision=@var{style} @gol
! -fforward-propagate -ffp-contract=@var{style} -ffunction-sections @gol
-fgcse -fgcse-after-reload -fgcse-las -fgcse-lm -fgraphite-identity @gol
-fgcse-sm -fif-conversion -fif-conversion2 -findirect-inlining @gol
-finline-functions -finline-functions-called-once -finline-limit=@var{n} @gol
*************** loop unrolling.
*** 5980,5985 ****
--- 5980,5996 ----
This option is enabled by default at optimization levels @option{-O},
@option{-O2}, @option{-O3}, @option{-Os}.
+ @item -ffp-contract=@var{style}
+ @opindex ffp-contract
+ @option{-ffp-contract=off} disables floating-point expression contraction.
+ @option{-ffp-contract=fast} enables floating-point expression contraction
+ such as forming of fused multiply-add operations if the target has
+ native support for them.
+ @option{-ffp-contract=on} enables floating-point expression contraction
+ if allowed by the language standard.
+
+ The default is @option{-ffp-contract=off}.
+
@item -fomit-frame-pointer
@opindex fomit-frame-pointer
Don't keep the frame pointer in a register for functions that
*************** an exact implementation of IEEE or ISO r
*** 7816,7822 ****
math functions. It may, however, yield faster code for programs
that do not require the guarantees of these specifications.
Enables @option{-fno-signed-zeros}, @option{-fno-trapping-math},
! @option{-fassociative-math} and @option{-freciprocal-math}.
The default is @option{-fno-unsafe-math-optimizations}.
--- 7827,7834 ----
math functions. It may, however, yield faster code for programs
that do not require the guarantees of these specifications.
Enables @option{-fno-signed-zeros}, @option{-fno-trapping-math},
! @option{-fassociative-math}, @option{-freciprocal-math} and
! @option{-ffp-contract=fast}.
The default is @option{-fno-unsafe-math-optimizations}.
Index: gcc/opts.c
===================================================================
*** gcc/opts.c.orig 2010-10-22 16:43:20.000000000 +0200
--- gcc/opts.c 2010-10-22 16:54:23.000000000 +0200
*************** common_handle_option (struct gcc_options
*** 1901,1906 ****
--- 1901,1918 ----
return false;
break;
+ case OPT_ffp_contract_:
+ if (!strcmp (arg, "on"))
+ /* Not implemented. */
+ flag_fp_contract = 0;
+ else if (!strcmp (arg, "off"))
+ flag_fp_contract = 0;
+ else if (!strcmp (arg, "fast"))
+ flag_fp_contract = 1;
+ else
+ error ("unknown floating point contraction style \"%s\"", arg);
+ break;
+
case OPT_fexcess_precision_:
if (!strcmp (arg, "fast"))
flag_excess_precision_cmdline = EXCESS_PRECISION_FAST;
*************** set_unsafe_math_optimizations_flags (int
*** 2289,2294 ****
--- 2301,2307 ----
flag_signed_zeros = !set;
flag_associative_math = set;
flag_reciprocal_math = set;
+ flag_fp_contract = set;
}
/* Return true iff flags are set as if -ffast-math. */