This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [RFC PATCH] implement fma() as builtin x87 or SSE intrinsic


Roger Sayle wrote:

Having said that, on x87 atleast, fma and hypot are safely expanded
even without -ffast-math, for singles and doubles as they return the
expected perfectly rounded results.  [Which is why mathinline.h doesn't
have a FAST_MATH guard].  And certainly on other platforms the "fmadf4"
expander could be used without -ffast-math.

I hope this explains/resolves some of the controversy. Could you
repost your fma patch with XFmode intermediates, and without the test
for flag_unsafe_math_optimizations on the fmasf4 and fmadf4 expanders,
i.e. just "TARGET_80387" conditions, and remove the same test from
the expand_builtin so that the expanders are considered even without
-ffast-math?


Hello Roger!

I have changed "fmasf4" as you suggested (expansion to wider mode is handled automatically in expand_ternop() function:

(define_expand "fmasf4"
 [(set (match_dup 4)
   (mult:XF (match_operand:XF 1 "register_operand" "")
        (match_operand:XF 2 "register_operand" "")))
  (set (match_dup 5)
   (plus:XF (match_dup 4)
        (match_operand:XF 3 "register_operand" "")))
  (set (match_operand:SF 0 "register_operand" "")
   (float_truncate:SF (match_dup 5)))]
 "TARGET_80387"
{
 operands[4] = gen_reg_rtx (XFmode);
 operands[5] = gen_reg_rtx (XFmode);
})

Resulting RTL for fmaf(x,y,z) now results in:

(insn:TI 12 34 14 0 (set (reg:XF 8 st)
(float_extend:XF (mem:SF (plus:SI (reg/f:SI 6 bp)
(const_int 12 [0xc])) [2 y+0 S4 A32]))) 89 {*extendsfxf2_1} (insn_list 33 (nil))
(nil))


(insn:TI 14 12 15 0 (set (reg:XF 8 st)
(mult:XF (float_extend:XF (mem:SF (plus:SI (reg/f:SI 6 bp)
(const_int 8 [0x8])) [2 x+0 S4 A32]))
(reg:XF 8 st))) 411 {*fop_xf_4} (insn_list 12 (insn_list 33 (nil)))
(nil))


(insn:TI 15 14 35 0 (set (reg:XF 8 st)
(plus:XF (float_extend:XF (mem:SF (plus:SI (reg/f:SI 6 bp)
(const_int 16 [0x10])) [2 z+0 S4 A32]))
(reg:XF 8 st))) 411 {*fop_xf_4} (insn_list 14 (insn_list 33 (nil)))
(nil))


However, final asm code, [fmuls and fadds insn are produced by output_387_binary_op()] is still:

test1f:
       pushl   %ebp
       movl    %esp, %ebp
       flds    12(%ebp)
       fmuls   8(%ebp)
       fadds   16(%ebp)
       popl    %ebp
       ret

However, I would expect something like following asm code from RTL code above:

       pushl   %ebp
       movl    %esp, %ebp
       flds    12(%ebp)       <- SF load
       flds    8(%ebp)         <- SF load
       fmulp   %st, %st(1) <- XF multiply
       flds    16(%ebp)       <- SF load
       popl    %ebp
       faddp   %st, %st(1)  <- XF add
       ret                             <- [ dummy XF -> SF ]

Revised patch and testcase are attached to this message.

Uros.
Index: gcc/builtins.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/builtins.c,v
retrieving revision 1.323
diff -u -p -r1.323 builtins.c
--- gcc/builtins.c	15 May 2004 18:17:20 -0000	1.323
+++ gcc/builtins.c	17 May 2004 06:26:41 -0000
@@ -95,6 +95,7 @@ static void expand_errno_check (tree, rt
 static rtx expand_builtin_mathfn (tree, rtx, rtx);
 static rtx expand_builtin_mathfn_2 (tree, rtx, rtx);
 static rtx expand_builtin_mathfn_3 (tree, rtx, rtx);
+static rtx expand_builtin_sin_cos (tree, rtx, rtx);
 static rtx expand_builtin_args_info (tree);
 static rtx expand_builtin_next_arg (tree);
 static rtx expand_builtin_va_start (tree);
@@ -1950,7 +1951,7 @@ expand_builtin_mathfn_2 (tree exp, rtx t
   return target;
 }
 
-/* Expand a call to the builtin sin and cos math functions.
+/* Expand a call to the builtin ternary math functions (fma).
    Return 0 if a normal call should be emitted rather than expanding the
    function in-line.  EXP is the expression that is a call to the builtin
    function; if convenient, the result should be placed in TARGET.
@@ -1961,6 +1962,117 @@ static rtx
 expand_builtin_mathfn_3 (tree exp, rtx target, rtx subtarget)
 {
   optab builtin_optab;
+  rtx op0, op1, op2, insns;
+  tree fndecl = get_callee_fndecl (exp);
+  tree arglist = TREE_OPERAND (exp, 1);
+  tree arg0, arg1, arg2, temp, narg;
+  enum machine_mode mode;
+  bool errno_set = true;
+  bool stable = true;
+
+  if (!validate_arglist (arglist, REAL_TYPE, REAL_TYPE,
+			 REAL_TYPE, VOID_TYPE))
+    return 0;
+
+  arg0 = TREE_VALUE (arglist);
+  arg1 = TREE_VALUE (arglist = TREE_CHAIN (arglist));
+  arg2 = TREE_VALUE (TREE_CHAIN (arglist));
+
+  switch (DECL_FUNCTION_CODE (fndecl))
+    {
+    case BUILT_IN_FMA:
+    case BUILT_IN_FMAF:
+    case BUILT_IN_FMAL:
+      builtin_optab = fma_optab; break;
+    default:
+      abort ();
+    }
+
+  /* Make a suitable register to place result in.  */
+  mode = TYPE_MODE (TREE_TYPE (exp));
+
+  /* Before working hard, check whether the instruction is available.  */
+  if (builtin_optab->handlers[(int) mode].insn_code == CODE_FOR_nothing)
+    return 0;
+
+  target = gen_reg_rtx (mode);
+
+  if (! flag_errno_math || ! HONOR_NANS (mode))
+    errno_set = false;
+
+  /* Always stabilize the argument list.  */
+  narg = builtin_save_expr (arg2);
+  if (narg != arg2)
+    {
+      temp = build_tree_list (NULL_TREE, narg);
+      stable = false;
+    }
+  else
+    temp = TREE_CHAIN (arglist);
+
+  narg = builtin_save_expr (arg1);
+  if (narg != arg1)
+    {
+      arglist = tree_cons (NULL_TREE, narg, temp);
+      stable = false;
+    }
+  else if (! stable)
+    arglist = tree_cons (NULL_TREE, arg1, temp);
+
+  narg = builtin_save_expr (arg0);
+  if (narg != arg0)
+    {
+      arglist = tree_cons (NULL_TREE, narg, temp);
+      stable = false;
+    }
+  else if (! stable)
+    arglist = tree_cons (NULL_TREE, arg0, temp);
+
+  if (! stable)
+    exp = build_function_call_expr (fndecl, arglist);
+
+  op0 = expand_expr (arg0, subtarget, VOIDmode, 0);
+  op1 = expand_expr (arg1, 0, VOIDmode, 0);
+  op2 = expand_expr (arg2, 0, VOIDmode, 0);
+
+  emit_queue ();
+  start_sequence ();
+
+  /* Compute into TARGET.
+     Set TARGET to wherever the result comes back.  */
+  target = expand_ternop (mode, builtin_optab, op0, op1, op2, target, 0);
+
+  /* If we were unable to expand via the builtin, stop the sequence
+     (without outputting the insns) and call to the library function
+     with the stabilized argument list.  */
+  if (target == 0)
+    {
+      end_sequence ();
+      return expand_call (exp, target, target == const0_rtx);
+    }
+
+  if (errno_set)
+    expand_errno_check (exp, target);
+
+  /* Output the entire sequence.  */
+  insns = get_insns ();
+  end_sequence ();
+  emit_insn (insns);
+
+  return target;
+}
+
+/* Expand a call to the builtin sin and cos math functions.
+   Return 0 if a normal call should be emitted rather than expanding the
+   function in-line.  EXP is the expression that is a call to the builtin
+   function; if convenient, the result should be placed in TARGET.
+   SUBTARGET may be used as the target for computing one of EXP's
+   operands.  */
+
+static rtx
+expand_builtin_sin_cos (tree exp, rtx target, rtx subtarget)
+{
+  optab builtin_optab;
   rtx op0, insns, before_call;
   tree fndecl = get_callee_fndecl (exp);
   tree arglist = TREE_OPERAND (exp, 1);
@@ -5580,6 +5692,14 @@ expand_builtin (tree exp, rtx target, rt
 	return target;
       break;
 
+    case BUILT_IN_FMA:
+    case BUILT_IN_FMAF:
+    case BUILT_IN_FMAL:
+      target = expand_builtin_mathfn_3 (exp, target, subtarget);
+      if (target)
+	return target;
+      break;
+
     case BUILT_IN_SIN:
     case BUILT_IN_SINF:
     case BUILT_IN_SINL:
@@ -5588,7 +5708,7 @@ expand_builtin (tree exp, rtx target, rt
     case BUILT_IN_COSL:
       if (! flag_unsafe_math_optimizations)
 	break;
-      target = expand_builtin_mathfn_3 (exp, target, subtarget);
+      target = expand_builtin_sin_cos (exp, target, subtarget);
       if (target)
 	return target;
       break;
Index: gcc/genopinit.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/genopinit.c,v
retrieving revision 1.75
diff -u -p -r1.75 genopinit.c
--- gcc/genopinit.c	7 May 2004 05:38:14 -0000	1.75
+++ gcc/genopinit.c	17 May 2004 06:26:41 -0000
@@ -93,6 +93,7 @@ static const char * const optabs[] =
   "umod_optab->handlers[$A].insn_code = CODE_FOR_$(umod$a3$)",
   "fmod_optab->handlers[$A].insn_code = CODE_FOR_$(fmod$a3$)",
   "drem_optab->handlers[$A].insn_code = CODE_FOR_$(drem$a3$)",
+  "fma_optab->handlers[$A].insn_code = CODE_FOR_$(fma$a4$)",
   "ftrunc_optab->handlers[$A].insn_code = CODE_FOR_$(ftrunc$F$a2$)",
   "and_optab->handlers[$A].insn_code = CODE_FOR_$(and$a3$)",
   "ior_optab->handlers[$A].insn_code = CODE_FOR_$(ior$a3$)",
Index: gcc/optabs.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/optabs.c,v
retrieving revision 1.221
diff -u -p -r1.221 optabs.c
--- gcc/optabs.c	7 May 2004 05:38:14 -0000	1.221
+++ gcc/optabs.c	17 May 2004 06:26:42 -0000
@@ -2402,6 +2402,136 @@ expand_simple_unop (enum machine_mode mo
   return expand_unop (mode, unop, op0, target, unsignedp);
 }
 
+/* Generate code to perform an operation specified by TERNOPTAB
+   on operands OP0, OP1 and OP2, with results having machine-mode MODE.
+
+   UNSIGNEDP is for the case where we have to widen the operands
+   to perform the operation.  It says to use zero-extension.
+
+   If TARGET is nonzero, the value is generated there,
+   if it is convenient to do so.  */
+
+rtx
+expand_ternop (enum machine_mode mode, optab ternoptab,
+	       rtx op0, rtx op1, rtx op2, rtx target,
+	       int unsignedp)
+{
+  enum mode_class class;
+  enum machine_mode wider_mode;
+  rtx entry_last = get_last_insn ();
+  rtx last;
+
+  class = GET_MODE_CLASS (mode);
+
+  op0 = protect_from_queue (op0, 0);
+  op1 = protect_from_queue (op1, 0);
+  op2 = protect_from_queue (op2, 0);
+
+  if (flag_force_mem)
+    {
+      op0 = force_not_mem (op0);
+      op1 = force_not_mem (op1);
+      op2 = force_not_mem (op2);
+    }
+
+  if (target)
+    target = protect_from_queue (target, 1);
+  else
+    target = gen_reg_rtx (mode);
+
+  /* Record where to go back to if we fail.  */
+  last = get_last_insn ();
+
+  if (ternoptab->handlers[(int) mode].insn_code != CODE_FOR_nothing)
+    {
+      int icode = (int) ternoptab->handlers[(int) mode].insn_code;
+      enum machine_mode mode0 = insn_data[icode].operand[1].mode;
+      enum machine_mode mode1 = insn_data[icode].operand[2].mode;
+      enum machine_mode mode2 = insn_data[icode].operand[3].mode;
+      rtx pat;
+      rtx xop0 = op0, xop1 = op1, xop2 = op2;
+
+      /* In case the insn wants input operands in modes different from
+	 those of the actual operands, convert the operands.  It would
+	 seem that we don't need to convert CONST_INTs, but we do, so
+	 that they're properly zero-extended, sign-extended or truncated
+	 for their mode.  */
+
+      if (GET_MODE (op0) != mode0 && mode0 != VOIDmode)
+	xop0 = convert_modes (mode0,
+			      GET_MODE (op0) != VOIDmode
+			      ? GET_MODE (op0)
+			      : mode,
+			      xop0, unsignedp);
+
+      if (GET_MODE (op1) != mode1 && mode1 != VOIDmode)
+	xop1 = convert_modes (mode1,
+			      GET_MODE (op1) != VOIDmode
+			      ? GET_MODE (op1)
+			      : mode,
+			      xop1, unsignedp);
+
+      if (GET_MODE (op2) != mode2 && mode2 != VOIDmode)
+	xop2 = convert_modes (mode2,
+			      GET_MODE (op2) != VOIDmode
+			      ? GET_MODE (op2)
+			      : mode,
+			      xop2, unsignedp);
+
+      /* Now, if insn doesn't accept these operands, put them into pseudos.  */
+      if (! (*insn_data[icode].operand[1].predicate) (xop0, mode0))
+	xop0 = copy_to_mode_reg (mode0, xop0);
+
+      if (! (*insn_data[icode].operand[2].predicate) (xop1, mode1))
+	xop1 = copy_to_mode_reg (mode1, xop1);
+
+      if (! (*insn_data[icode].operand[3].predicate) (xop2, mode2))
+	xop2 = copy_to_mode_reg (mode2, xop2);
+
+      if (! (*insn_data[icode].operand[0].predicate) (target, mode))
+	abort ();
+
+      pat = GEN_FCN (icode) (target, xop0, xop1, xop2);
+      if (pat)
+	{
+	  emit_insn (pat);
+	  return target;
+	}
+      else
+	delete_insns_since (last);
+    }
+
+  /* It can't be done in this mode.  Can we do it in a wider mode?  */
+
+  if (class == MODE_INT || class == MODE_FLOAT || class == MODE_COMPLEX_FLOAT)
+    {
+      for (wider_mode = GET_MODE_WIDER_MODE (mode); wider_mode != VOIDmode;
+	   wider_mode = GET_MODE_WIDER_MODE (wider_mode))
+	{
+	  if (ternoptab->handlers[(int) wider_mode].insn_code
+	      != CODE_FOR_nothing)
+	    {
+	      rtx t0 = gen_reg_rtx (wider_mode);
+	      rtx cop0 = convert_modes (wider_mode, mode, op0, unsignedp);
+	      rtx cop1 = convert_modes (wider_mode, mode, op1, unsignedp);
+	      rtx cop2 = convert_modes (wider_mode, mode, op2, unsignedp);
+
+	      if (expand_ternop (wider_mode, ternoptab, cop0, cop1, cop2,
+				 t0, unsignedp))
+		{
+		  convert_move (target, t0, unsignedp);
+		  return target;
+		}
+	      else
+		delete_insns_since (last);
+	    }
+	}
+    }
+
+  delete_insns_since (entry_last);
+  return 0;
+}
+
 /* Try calculating
 	(clz:narrow x)
    as
@@ -5330,6 +5460,7 @@ init_optabs (void)
   umod_optab = init_optab (UMOD);
   fmod_optab = init_optab (UNKNOWN);
   drem_optab = init_optab (UNKNOWN);
+  fma_optab = init_optab (UNKNOWN);
   ftrunc_optab = init_optab (UNKNOWN);
   and_optab = init_optab (AND);
   ior_optab = init_optab (IOR);
Index: gcc/optabs.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/optabs.h,v
retrieving revision 1.29
diff -u -p -r1.29 optabs.h
--- gcc/optabs.h	7 May 2004 05:38:14 -0000	1.29
+++ gcc/optabs.h	17 May 2004 06:26:42 -0000
@@ -96,6 +96,8 @@ enum optab_index
   /* Floating point remainder functions */
   OTI_fmod,
   OTI_drem,
+  /* Floating point multiply-add */
+  OTI_fma,
   /* Convert float to integer in float fmt */
   OTI_ftrunc,
 
@@ -252,6 +254,7 @@ extern GTY(()) optab optab_table[OTI_MAX
 #define umod_optab (optab_table[OTI_umod])
 #define fmod_optab (optab_table[OTI_fmod])
 #define drem_optab (optab_table[OTI_drem])
+#define fma_optab (optab_table[OTI_fma])
 #define ftrunc_optab (optab_table[OTI_ftrunc])
 #define and_optab (optab_table[OTI_and])
 #define ior_optab (optab_table[OTI_ior])
@@ -419,6 +422,9 @@ extern int expand_twoval_binop (optab, r
 
 /* Expand a unary arithmetic operation given optab rtx operand.  */
 extern rtx expand_unop (enum machine_mode, optab, rtx, rtx, int);
+
+/* Expand a ternary arithmetic operation given optab rtx operand.  */
+extern rtx expand_ternop (enum machine_mode, optab, rtx, rtx, rtx, rtx, int);
 
 /* Expand the absolute value operation.  */
 extern rtx expand_abs_nojump (enum machine_mode, rtx, rtx, int);
Index: gcc/config/i386/i386.md
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.md,v
retrieving revision 1.537
diff -u -p -r1.537 i386.md
--- gcc/config/i386/i386.md	7 May 2004 05:38:19 -0000	1.537
+++ gcc/config/i386/i386.md	17 May 2004 06:26:46 -0000
@@ -16314,6 +16314,50 @@
   emit_move_insn (operands[2], temp);
   emit_move_insn (operands[9], CONST1_RTX (XFmode));  /* fld1 */
 })
+
+;; TESTING
+
+(define_expand "fmasf4"
+  [(set (match_dup 4)
+	(mult:XF (match_operand:XF 1 "register_operand" "")
+		 (match_operand:XF 2 "register_operand" "")))
+   (set (match_dup 5)
+	(plus:XF (match_dup 4)
+		 (match_operand:XF 3 "register_operand" "")))
+   (set (match_operand:SF 0 "register_operand" "")
+	(float_truncate:SF (match_dup 5)))]
+  "TARGET_80387"
+{
+  operands[4] = gen_reg_rtx (XFmode);
+  operands[5] = gen_reg_rtx (XFmode);
+})
+
+(define_expand "fmadf4"
+  [(set (match_dup 4)
+	(mult:XF (match_operand:XF 1 "register_operand" "")
+		 (match_operand:XF 2 "register_operand" "")))
+   (set (match_dup 5)
+	(plus:XF (match_dup 4)
+	(match_operand:XF 3 "nonimmediate_operand" "")))
+   (set (match_operand:DF 0 "register_operand" "")
+	(float_truncate:DF (match_dup 5)))]
+  "TARGET_80387"
+{
+  operands[4] = gen_reg_rtx (XFmode);
+  operands[5] = gen_reg_rtx (XFmode);
+})
+
+(define_expand "fmaxf4"
+  [(set (match_dup 4)
+	(mult:XF (match_operand:XF 1 "register_operand" "")
+		 (match_operand:XF 2 "register_operand" "")))
+   (set (match_operand:XF 0 "register_operand" "")
+	(plus:XF (match_dup 4) (match_operand:XF 3 "register_operand" "")))]
+  "TARGET_80387
+   && flag_unsafe_math_optimizations"
+{
+  operands[4] = gen_reg_rtx (XFmode);
+})
 
 ;; Block operation instructions
 
#if 1
float test1f(float x, float y, float z)
{
  return fmaf(x, y, z);
}
#endif

#if 1
double test1d(double x, double y, double z)
{
  return fma(x, y, z);
}
#endif

#if 1
long double test1l(long double x, long double y, long double z)
{
  return fmal(x, y, z);
}
#endif

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]