This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [RFC PATCH] implement fma() as builtin x87 or SSE intrinsic
- From: Uros Bizjak <uros at kss-loka dot si>
- To: Roger Sayle <roger at eyesopen dot com>
- Cc: "Joseph S. Myers" <jsm at polyomino dot org dot uk>,Falk Hueffner <hueffner at informatik dot uni-tuebingen dot de>,gcc-patches at gcc dot gnu dot org
- Date: Mon, 17 May 2004 09:04:40 +0200
- Subject: Re: [RFC PATCH] implement fma() as builtin x87 or SSE intrinsic
- References: <Pine.LNX.4.44.0405130434530.347-100000@www.eyesopen.com>
Roger Sayle wrote:
Having said that, on x87 atleast, fma and hypot are safely expanded
even without -ffast-math, for singles and doubles as they return the
expected perfectly rounded results. [Which is why mathinline.h doesn't
have a FAST_MATH guard]. And certainly on other platforms the "fmadf4"
expander could be used without -ffast-math.
I hope this explains/resolves some of the controversy. Could you
repost your fma patch with XFmode intermediates, and without the test
for flag_unsafe_math_optimizations on the fmasf4 and fmadf4 expanders,
i.e. just "TARGET_80387" conditions, and remove the same test from
the expand_builtin so that the expanders are considered even without
-ffast-math?
Hello Roger!
I have changed "fmasf4" as you suggested (expansion to wider mode is
handled automatically in expand_ternop() function:
(define_expand "fmasf4"
[(set (match_dup 4)
(mult:XF (match_operand:XF 1 "register_operand" "")
(match_operand:XF 2 "register_operand" "")))
(set (match_dup 5)
(plus:XF (match_dup 4)
(match_operand:XF 3 "register_operand" "")))
(set (match_operand:SF 0 "register_operand" "")
(float_truncate:SF (match_dup 5)))]
"TARGET_80387"
{
operands[4] = gen_reg_rtx (XFmode);
operands[5] = gen_reg_rtx (XFmode);
})
Resulting RTL for fmaf(x,y,z) now results in:
(insn:TI 12 34 14 0 (set (reg:XF 8 st)
(float_extend:XF (mem:SF (plus:SI (reg/f:SI 6 bp)
(const_int 12 [0xc])) [2 y+0 S4 A32]))) 89
{*extendsfxf2_1} (insn_list 33 (nil))
(nil))
(insn:TI 14 12 15 0 (set (reg:XF 8 st)
(mult:XF (float_extend:XF (mem:SF (plus:SI (reg/f:SI 6 bp)
(const_int 8 [0x8])) [2 x+0 S4 A32]))
(reg:XF 8 st))) 411 {*fop_xf_4} (insn_list 12 (insn_list 33
(nil)))
(nil))
(insn:TI 15 14 35 0 (set (reg:XF 8 st)
(plus:XF (float_extend:XF (mem:SF (plus:SI (reg/f:SI 6 bp)
(const_int 16 [0x10])) [2 z+0 S4 A32]))
(reg:XF 8 st))) 411 {*fop_xf_4} (insn_list 14 (insn_list 33
(nil)))
(nil))
However, final asm code, [fmuls and fadds insn are produced by
output_387_binary_op()] is still:
test1f:
pushl %ebp
movl %esp, %ebp
flds 12(%ebp)
fmuls 8(%ebp)
fadds 16(%ebp)
popl %ebp
ret
However, I would expect something like following asm code from RTL code
above:
pushl %ebp
movl %esp, %ebp
flds 12(%ebp) <- SF load
flds 8(%ebp) <- SF load
fmulp %st, %st(1) <- XF multiply
flds 16(%ebp) <- SF load
popl %ebp
faddp %st, %st(1) <- XF add
ret <- [ dummy XF -> SF ]
Revised patch and testcase are attached to this message.
Uros.
Index: gcc/builtins.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/builtins.c,v
retrieving revision 1.323
diff -u -p -r1.323 builtins.c
--- gcc/builtins.c 15 May 2004 18:17:20 -0000 1.323
+++ gcc/builtins.c 17 May 2004 06:26:41 -0000
@@ -95,6 +95,7 @@ static void expand_errno_check (tree, rt
static rtx expand_builtin_mathfn (tree, rtx, rtx);
static rtx expand_builtin_mathfn_2 (tree, rtx, rtx);
static rtx expand_builtin_mathfn_3 (tree, rtx, rtx);
+static rtx expand_builtin_sin_cos (tree, rtx, rtx);
static rtx expand_builtin_args_info (tree);
static rtx expand_builtin_next_arg (tree);
static rtx expand_builtin_va_start (tree);
@@ -1950,7 +1951,7 @@ expand_builtin_mathfn_2 (tree exp, rtx t
return target;
}
-/* Expand a call to the builtin sin and cos math functions.
+/* Expand a call to the builtin ternary math functions (fma).
Return 0 if a normal call should be emitted rather than expanding the
function in-line. EXP is the expression that is a call to the builtin
function; if convenient, the result should be placed in TARGET.
@@ -1961,6 +1962,117 @@ static rtx
expand_builtin_mathfn_3 (tree exp, rtx target, rtx subtarget)
{
optab builtin_optab;
+ rtx op0, op1, op2, insns;
+ tree fndecl = get_callee_fndecl (exp);
+ tree arglist = TREE_OPERAND (exp, 1);
+ tree arg0, arg1, arg2, temp, narg;
+ enum machine_mode mode;
+ bool errno_set = true;
+ bool stable = true;
+
+ if (!validate_arglist (arglist, REAL_TYPE, REAL_TYPE,
+ REAL_TYPE, VOID_TYPE))
+ return 0;
+
+ arg0 = TREE_VALUE (arglist);
+ arg1 = TREE_VALUE (arglist = TREE_CHAIN (arglist));
+ arg2 = TREE_VALUE (TREE_CHAIN (arglist));
+
+ switch (DECL_FUNCTION_CODE (fndecl))
+ {
+ case BUILT_IN_FMA:
+ case BUILT_IN_FMAF:
+ case BUILT_IN_FMAL:
+ builtin_optab = fma_optab; break;
+ default:
+ abort ();
+ }
+
+ /* Make a suitable register to place result in. */
+ mode = TYPE_MODE (TREE_TYPE (exp));
+
+ /* Before working hard, check whether the instruction is available. */
+ if (builtin_optab->handlers[(int) mode].insn_code == CODE_FOR_nothing)
+ return 0;
+
+ target = gen_reg_rtx (mode);
+
+ if (! flag_errno_math || ! HONOR_NANS (mode))
+ errno_set = false;
+
+ /* Always stabilize the argument list. */
+ narg = builtin_save_expr (arg2);
+ if (narg != arg2)
+ {
+ temp = build_tree_list (NULL_TREE, narg);
+ stable = false;
+ }
+ else
+ temp = TREE_CHAIN (arglist);
+
+ narg = builtin_save_expr (arg1);
+ if (narg != arg1)
+ {
+ arglist = tree_cons (NULL_TREE, narg, temp);
+ stable = false;
+ }
+ else if (! stable)
+ arglist = tree_cons (NULL_TREE, arg1, temp);
+
+ narg = builtin_save_expr (arg0);
+ if (narg != arg0)
+ {
+ arglist = tree_cons (NULL_TREE, narg, temp);
+ stable = false;
+ }
+ else if (! stable)
+ arglist = tree_cons (NULL_TREE, arg0, temp);
+
+ if (! stable)
+ exp = build_function_call_expr (fndecl, arglist);
+
+ op0 = expand_expr (arg0, subtarget, VOIDmode, 0);
+ op1 = expand_expr (arg1, 0, VOIDmode, 0);
+ op2 = expand_expr (arg2, 0, VOIDmode, 0);
+
+ emit_queue ();
+ start_sequence ();
+
+ /* Compute into TARGET.
+ Set TARGET to wherever the result comes back. */
+ target = expand_ternop (mode, builtin_optab, op0, op1, op2, target, 0);
+
+ /* If we were unable to expand via the builtin, stop the sequence
+ (without outputting the insns) and call to the library function
+ with the stabilized argument list. */
+ if (target == 0)
+ {
+ end_sequence ();
+ return expand_call (exp, target, target == const0_rtx);
+ }
+
+ if (errno_set)
+ expand_errno_check (exp, target);
+
+ /* Output the entire sequence. */
+ insns = get_insns ();
+ end_sequence ();
+ emit_insn (insns);
+
+ return target;
+}
+
+/* Expand a call to the builtin sin and cos math functions.
+ Return 0 if a normal call should be emitted rather than expanding the
+ function in-line. EXP is the expression that is a call to the builtin
+ function; if convenient, the result should be placed in TARGET.
+ SUBTARGET may be used as the target for computing one of EXP's
+ operands. */
+
+static rtx
+expand_builtin_sin_cos (tree exp, rtx target, rtx subtarget)
+{
+ optab builtin_optab;
rtx op0, insns, before_call;
tree fndecl = get_callee_fndecl (exp);
tree arglist = TREE_OPERAND (exp, 1);
@@ -5580,6 +5692,14 @@ expand_builtin (tree exp, rtx target, rt
return target;
break;
+ case BUILT_IN_FMA:
+ case BUILT_IN_FMAF:
+ case BUILT_IN_FMAL:
+ target = expand_builtin_mathfn_3 (exp, target, subtarget);
+ if (target)
+ return target;
+ break;
+
case BUILT_IN_SIN:
case BUILT_IN_SINF:
case BUILT_IN_SINL:
@@ -5588,7 +5708,7 @@ expand_builtin (tree exp, rtx target, rt
case BUILT_IN_COSL:
if (! flag_unsafe_math_optimizations)
break;
- target = expand_builtin_mathfn_3 (exp, target, subtarget);
+ target = expand_builtin_sin_cos (exp, target, subtarget);
if (target)
return target;
break;
Index: gcc/genopinit.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/genopinit.c,v
retrieving revision 1.75
diff -u -p -r1.75 genopinit.c
--- gcc/genopinit.c 7 May 2004 05:38:14 -0000 1.75
+++ gcc/genopinit.c 17 May 2004 06:26:41 -0000
@@ -93,6 +93,7 @@ static const char * const optabs[] =
"umod_optab->handlers[$A].insn_code = CODE_FOR_$(umod$a3$)",
"fmod_optab->handlers[$A].insn_code = CODE_FOR_$(fmod$a3$)",
"drem_optab->handlers[$A].insn_code = CODE_FOR_$(drem$a3$)",
+ "fma_optab->handlers[$A].insn_code = CODE_FOR_$(fma$a4$)",
"ftrunc_optab->handlers[$A].insn_code = CODE_FOR_$(ftrunc$F$a2$)",
"and_optab->handlers[$A].insn_code = CODE_FOR_$(and$a3$)",
"ior_optab->handlers[$A].insn_code = CODE_FOR_$(ior$a3$)",
Index: gcc/optabs.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/optabs.c,v
retrieving revision 1.221
diff -u -p -r1.221 optabs.c
--- gcc/optabs.c 7 May 2004 05:38:14 -0000 1.221
+++ gcc/optabs.c 17 May 2004 06:26:42 -0000
@@ -2402,6 +2402,136 @@ expand_simple_unop (enum machine_mode mo
return expand_unop (mode, unop, op0, target, unsignedp);
}
+/* Generate code to perform an operation specified by TERNOPTAB
+ on operands OP0, OP1 and OP2, with results having machine-mode MODE.
+
+ UNSIGNEDP is for the case where we have to widen the operands
+ to perform the operation. It says to use zero-extension.
+
+ If TARGET is nonzero, the value is generated there,
+ if it is convenient to do so. */
+
+rtx
+expand_ternop (enum machine_mode mode, optab ternoptab,
+ rtx op0, rtx op1, rtx op2, rtx target,
+ int unsignedp)
+{
+ enum mode_class class;
+ enum machine_mode wider_mode;
+ rtx entry_last = get_last_insn ();
+ rtx last;
+
+ class = GET_MODE_CLASS (mode);
+
+ op0 = protect_from_queue (op0, 0);
+ op1 = protect_from_queue (op1, 0);
+ op2 = protect_from_queue (op2, 0);
+
+ if (flag_force_mem)
+ {
+ op0 = force_not_mem (op0);
+ op1 = force_not_mem (op1);
+ op2 = force_not_mem (op2);
+ }
+
+ if (target)
+ target = protect_from_queue (target, 1);
+ else
+ target = gen_reg_rtx (mode);
+
+ /* Record where to go back to if we fail. */
+ last = get_last_insn ();
+
+ if (ternoptab->handlers[(int) mode].insn_code != CODE_FOR_nothing)
+ {
+ int icode = (int) ternoptab->handlers[(int) mode].insn_code;
+ enum machine_mode mode0 = insn_data[icode].operand[1].mode;
+ enum machine_mode mode1 = insn_data[icode].operand[2].mode;
+ enum machine_mode mode2 = insn_data[icode].operand[3].mode;
+ rtx pat;
+ rtx xop0 = op0, xop1 = op1, xop2 = op2;
+
+ /* In case the insn wants input operands in modes different from
+ those of the actual operands, convert the operands. It would
+ seem that we don't need to convert CONST_INTs, but we do, so
+ that they're properly zero-extended, sign-extended or truncated
+ for their mode. */
+
+ if (GET_MODE (op0) != mode0 && mode0 != VOIDmode)
+ xop0 = convert_modes (mode0,
+ GET_MODE (op0) != VOIDmode
+ ? GET_MODE (op0)
+ : mode,
+ xop0, unsignedp);
+
+ if (GET_MODE (op1) != mode1 && mode1 != VOIDmode)
+ xop1 = convert_modes (mode1,
+ GET_MODE (op1) != VOIDmode
+ ? GET_MODE (op1)
+ : mode,
+ xop1, unsignedp);
+
+ if (GET_MODE (op2) != mode2 && mode2 != VOIDmode)
+ xop2 = convert_modes (mode2,
+ GET_MODE (op2) != VOIDmode
+ ? GET_MODE (op2)
+ : mode,
+ xop2, unsignedp);
+
+ /* Now, if insn doesn't accept these operands, put them into pseudos. */
+ if (! (*insn_data[icode].operand[1].predicate) (xop0, mode0))
+ xop0 = copy_to_mode_reg (mode0, xop0);
+
+ if (! (*insn_data[icode].operand[2].predicate) (xop1, mode1))
+ xop1 = copy_to_mode_reg (mode1, xop1);
+
+ if (! (*insn_data[icode].operand[3].predicate) (xop2, mode2))
+ xop2 = copy_to_mode_reg (mode2, xop2);
+
+ if (! (*insn_data[icode].operand[0].predicate) (target, mode))
+ abort ();
+
+ pat = GEN_FCN (icode) (target, xop0, xop1, xop2);
+ if (pat)
+ {
+ emit_insn (pat);
+ return target;
+ }
+ else
+ delete_insns_since (last);
+ }
+
+ /* It can't be done in this mode. Can we do it in a wider mode? */
+
+ if (class == MODE_INT || class == MODE_FLOAT || class == MODE_COMPLEX_FLOAT)
+ {
+ for (wider_mode = GET_MODE_WIDER_MODE (mode); wider_mode != VOIDmode;
+ wider_mode = GET_MODE_WIDER_MODE (wider_mode))
+ {
+ if (ternoptab->handlers[(int) wider_mode].insn_code
+ != CODE_FOR_nothing)
+ {
+ rtx t0 = gen_reg_rtx (wider_mode);
+ rtx cop0 = convert_modes (wider_mode, mode, op0, unsignedp);
+ rtx cop1 = convert_modes (wider_mode, mode, op1, unsignedp);
+ rtx cop2 = convert_modes (wider_mode, mode, op2, unsignedp);
+
+ if (expand_ternop (wider_mode, ternoptab, cop0, cop1, cop2,
+ t0, unsignedp))
+ {
+ convert_move (target, t0, unsignedp);
+ return target;
+ }
+ else
+ delete_insns_since (last);
+ }
+ }
+ }
+
+ delete_insns_since (entry_last);
+ return 0;
+}
+
/* Try calculating
(clz:narrow x)
as
@@ -5330,6 +5460,7 @@ init_optabs (void)
umod_optab = init_optab (UMOD);
fmod_optab = init_optab (UNKNOWN);
drem_optab = init_optab (UNKNOWN);
+ fma_optab = init_optab (UNKNOWN);
ftrunc_optab = init_optab (UNKNOWN);
and_optab = init_optab (AND);
ior_optab = init_optab (IOR);
Index: gcc/optabs.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/optabs.h,v
retrieving revision 1.29
diff -u -p -r1.29 optabs.h
--- gcc/optabs.h 7 May 2004 05:38:14 -0000 1.29
+++ gcc/optabs.h 17 May 2004 06:26:42 -0000
@@ -96,6 +96,8 @@ enum optab_index
/* Floating point remainder functions */
OTI_fmod,
OTI_drem,
+ /* Floating point multiply-add */
+ OTI_fma,
/* Convert float to integer in float fmt */
OTI_ftrunc,
@@ -252,6 +254,7 @@ extern GTY(()) optab optab_table[OTI_MAX
#define umod_optab (optab_table[OTI_umod])
#define fmod_optab (optab_table[OTI_fmod])
#define drem_optab (optab_table[OTI_drem])
+#define fma_optab (optab_table[OTI_fma])
#define ftrunc_optab (optab_table[OTI_ftrunc])
#define and_optab (optab_table[OTI_and])
#define ior_optab (optab_table[OTI_ior])
@@ -419,6 +422,9 @@ extern int expand_twoval_binop (optab, r
/* Expand a unary arithmetic operation given optab rtx operand. */
extern rtx expand_unop (enum machine_mode, optab, rtx, rtx, int);
+
+/* Expand a ternary arithmetic operation given optab rtx operand. */
+extern rtx expand_ternop (enum machine_mode, optab, rtx, rtx, rtx, rtx, int);
/* Expand the absolute value operation. */
extern rtx expand_abs_nojump (enum machine_mode, rtx, rtx, int);
Index: gcc/config/i386/i386.md
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.md,v
retrieving revision 1.537
diff -u -p -r1.537 i386.md
--- gcc/config/i386/i386.md 7 May 2004 05:38:19 -0000 1.537
+++ gcc/config/i386/i386.md 17 May 2004 06:26:46 -0000
@@ -16314,6 +16314,50 @@
emit_move_insn (operands[2], temp);
emit_move_insn (operands[9], CONST1_RTX (XFmode)); /* fld1 */
})
+
+;; TESTING
+
+(define_expand "fmasf4"
+ [(set (match_dup 4)
+ (mult:XF (match_operand:XF 1 "register_operand" "")
+ (match_operand:XF 2 "register_operand" "")))
+ (set (match_dup 5)
+ (plus:XF (match_dup 4)
+ (match_operand:XF 3 "register_operand" "")))
+ (set (match_operand:SF 0 "register_operand" "")
+ (float_truncate:SF (match_dup 5)))]
+ "TARGET_80387"
+{
+ operands[4] = gen_reg_rtx (XFmode);
+ operands[5] = gen_reg_rtx (XFmode);
+})
+
+(define_expand "fmadf4"
+ [(set (match_dup 4)
+ (mult:XF (match_operand:XF 1 "register_operand" "")
+ (match_operand:XF 2 "register_operand" "")))
+ (set (match_dup 5)
+ (plus:XF (match_dup 4)
+ (match_operand:XF 3 "nonimmediate_operand" "")))
+ (set (match_operand:DF 0 "register_operand" "")
+ (float_truncate:DF (match_dup 5)))]
+ "TARGET_80387"
+{
+ operands[4] = gen_reg_rtx (XFmode);
+ operands[5] = gen_reg_rtx (XFmode);
+})
+
+(define_expand "fmaxf4"
+ [(set (match_dup 4)
+ (mult:XF (match_operand:XF 1 "register_operand" "")
+ (match_operand:XF 2 "register_operand" "")))
+ (set (match_operand:XF 0 "register_operand" "")
+ (plus:XF (match_dup 4) (match_operand:XF 3 "register_operand" "")))]
+ "TARGET_80387
+ && flag_unsafe_math_optimizations"
+{
+ operands[4] = gen_reg_rtx (XFmode);
+})
;; Block operation instructions
#if 1
float test1f(float x, float y, float z)
{
return fmaf(x, y, z);
}
#endif
#if 1
double test1d(double x, double y, double z)
{
return fma(x, y, z);
}
#endif
#if 1
long double test1l(long double x, long double y, long double z)
{
return fmal(x, y, z);
}
#endif