This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [AArch64] Emit square root using the Newton series


On 03/17/16 17:46, Evandro Menezes wrote:
This patch refactors the function to emit the reciprocal square root approximation to also emit the square root approximation.

   2016-03-23  Evandro Menezes <e.menezes@samsung.com>
                Wilco Dijkstra  <wilco.dijkstra@arm.com>

   gcc/
            * config/aarch64/aarch64-tuning-flags.def
            (AARCH64_EXTRA_TUNE_APPROX_SQRT_{SF,DF}): New tuning macros.
            * config/aarch64/aarch64-protos.h
            (aarch64_emit_approx_rsqrt): Replace with
   "aarch64_emit_approx_sqrt".
            (AARCH64_EXTRA_TUNE_APPROX_SQRT): New macro.
            * config/aarch64/aarch64.c
            (exynosm1_tunings): Use the new macro.
            (aarch64_emit_approx_sqrt): Define new function.
            (aarch64_override_options_after_change_1): Handle new option.
            * config/aarch64/aarch64.md
            (rsqrt<mode>2): Use new function instead.
            (sqrt<mode>2): New expansion and insn definitions.
            * config/aarch64/aarch64-simd.md: Likewise.
            * config/aarch64/aarch64.opt
            (mlow-precision-sqrt): Add new option description.
            * doc/invoke.texi (mlow-precision-sqrt): Likewise.

This version of the patch cleans up the changes to the MD files and fixes some bugs introduced in it since the first proposal.

Thanks for your feedback,

--
Evandro Menezes

>From 712e330bf651393bb788e85ebe7b3d9a37f54ae7 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Thu, 17 Mar 2016 17:39:55 -0500
Subject: [PATCH] [AArch64] Emit square root using the Newton series

2016-03-23  Evandro Menezes  <e.menezes@samsung.com>
            Wilco Dijkstra  <wilco.dijkstra@arm.com>

gcc/
	* config/aarch64/aarch64-tuning-flags.def
	(AARCH64_EXTRA_TUNE_APPROX_SQRT_{SF,DF}): New tuning macros.
	* config/aarch64/aarch64-protos.h
	(aarch64_emit_approx_rsqrt): Replace with "aarch64_emit_approx_sqrt".
	(AARCH64_EXTRA_TUNE_APPROX_SQRT): New macro.
	* config/aarch64/aarch64.c
	(exynosm1_tunings): Use the new macro.
	(aarch64_emit_approx_sqrt): Define new function.
	(aarch64_override_options_after_change_1): Handle new option.
	* config/aarch64/aarch64.md
	(rsqrt<mode>2): Use new function instead.
	(sqrt<mode>2): New expansion and insn definitions.
	* config/aarch64/aarch64-simd.md: Likewise.
	* config/aarch64/aarch64.opt
	(mlow-precision-sqrt): Add new option description.
	* doc/invoke.texi (mlow-precision-sqrt): Likewise.
---
 gcc/config/aarch64/aarch64-protos.h         |   5 +-
 gcc/config/aarch64/aarch64-simd.md          |  13 ++-
 gcc/config/aarch64/aarch64-tuning-flags.def |   3 +-
 gcc/config/aarch64/aarch64.c                | 129 ++++++++++++++++++++++------
 gcc/config/aarch64/aarch64.md               |  11 ++-
 gcc/config/aarch64/aarch64.opt              |   9 +-
 gcc/doc/invoke.texi                         |  10 +++
 7 files changed, 147 insertions(+), 33 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index dced209..24c2125 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -263,6 +263,9 @@ enum aarch64_extra_tuning_flags
 };
 #undef AARCH64_EXTRA_TUNING_OPTION
 
+#define AARCH64_EXTRA_TUNE_APPROX_SQRT \
+  (AARCH64_EXTRA_TUNE_APPROX_SQRT_DF | AARCH64_EXTRA_TUNE_APPROX_SQRT_SF)
+
 extern struct tune_params aarch64_tune_params;
 
 HOST_WIDE_INT aarch64_initial_elimination_offset (unsigned, unsigned);
@@ -361,7 +364,7 @@ void aarch64_register_pragmas (void);
 void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 void aarch64_save_restore_target_globals (tree);
-void aarch64_emit_approx_rsqrt (rtx, rtx);
+bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index bd73bce..47ccb18 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -405,7 +405,7 @@
 		     UNSPEC_RSQRT))]
   "TARGET_SIMD"
 {
-  aarch64_emit_approx_rsqrt (operands[0], operands[1]);
+  aarch64_emit_approx_sqrt (operands[0], operands[1], true);
   DONE;
 })
 
@@ -4307,7 +4307,16 @@
 
 ;; sqrt
 
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:VDQF 0 "register_operand")
+	(sqrt:VDQF (match_operand:VDQF 1 "register_operand")))]
+  "TARGET_SIMD"
+{
+  if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
+    DONE;
+})
+
+(define_insn "*sqrt<mode>2"
   [(set (match_operand:VDQF 0 "register_operand" "=w")
         (sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
   "TARGET_SIMD"
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 7e45a0c..725a79c 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -30,4 +30,5 @@
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
 AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT)
-
+AARCH64_EXTRA_TUNING_OPTION ("approx_sqrt", APPROX_SQRT_DF)
+AARCH64_EXTRA_TUNING_OPTION ("approx_sqrtf", APPROX_SQRT_SF)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ed0daa5..b155b74 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -38,6 +38,7 @@
 #include "recog.h"
 #include "diagnostic.h"
 #include "insn-attr.h"
+#include "insn-flags.h"
 #include "alias.h"
 #include "fold-const.h"
 #include "stor-layout.h"
@@ -538,7 +539,8 @@ static const struct tune_params exynosm1_tunings =
   48,	/* max_case_values.  */
   64,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_APPROX_RSQRT
+   | AARCH64_EXTRA_TUNE_APPROX_SQRT) /* tune_flags.  */
 };
 
 static const struct tune_params thunderx_tunings =
@@ -7498,46 +7500,115 @@ get_rsqrts_type (machine_mode mode)
   }
 }
 
-/* Emit instruction sequence to compute the reciprocal square root using the
-   Newton-Raphson series.  Iterate over the series twice for SF
-   and thrice for DF.  */
+/* Emit instruction sequence to compute either the approximate square root
+   or its approximate reciprocal.  */
 
-void
-aarch64_emit_approx_rsqrt (rtx dst, rtx src)
+bool
+aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
 {
-  machine_mode mode = GET_MODE (src);
-  gcc_assert (
-    mode == SFmode || mode == V2SFmode || mode == V4SFmode
-	|| mode == DFmode || mode == V2DFmode);
+  machine_mode mode = GET_MODE (dst);
+  machine_mode mmsk = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
+				       GET_MODE_NUNITS (mode));
+  bool scalar = !VECTOR_MODE_P (mode);
+
+  if (!flag_finite_math_only
+      || flag_trapping_math
+      || !flag_unsafe_math_optimizations
+      || optimize_function_for_size_p (cfun)
+      || !((recp && (flag_mrecip_low_precision_sqrt
+		     || (aarch64_tune_params.extra_tuning_flags
+			 & AARCH64_EXTRA_TUNE_APPROX_RSQRT)))
+	   || (!recp && (flag_mlow_precision_sqrt
+			 || (GET_MODE_INNER (mode) == SFmode
+			     && (aarch64_tune_params.extra_tuning_flags
+				 & AARCH64_EXTRA_TUNE_APPROX_SQRT_SF))
+			 || (GET_MODE_INNER (mode) == DFmode
+			     && (aarch64_tune_params.extra_tuning_flags
+				 & AARCH64_EXTRA_TUNE_APPROX_SQRT_DF))))))
+    return false;
 
-  rtx xsrc = gen_reg_rtx (mode);
-  emit_move_insn (xsrc, src);
-  rtx x0 = gen_reg_rtx (mode);
+  rtx xne, xmsk;
+  if (!recp)
+    {
+      /* When calculating the approximate square root...  */
+      if (scalar)
+	{
+	  /* Compare argument with 0.0 and set the CC.  */
+	  rtx xcc = aarch64_gen_compare_reg (NE, src, CONST0_RTX (mode));
+	  xne = gen_rtx_NE (VOIDmode, xcc, const0_rtx);
+	}
+      else
+	{
+	  /* Compare the argument with 0.0 and set a vector mask.  */
+	  xmsk = gen_reg_rtx (mmsk);
+	  switch (mode)
+	  {
+	    case V2SFmode:
+	      emit_insn (gen_aarch64_cmeqv2sf (xmsk, src, CONST0_RTX (mode)));
+	      break;
+
+	    case V4SFmode:
+	      emit_insn (gen_aarch64_cmeqv4sf (xmsk, src, CONST0_RTX (mode)));
+	      break;
+
+	    case V2DFmode:
+	      emit_insn (gen_aarch64_cmeqv2df (xmsk, src, CONST0_RTX (mode)));
+	      break;
 
-  emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
+	    default:
+	      gcc_unreachable ();
+	  }
+	}
+    }
 
-  bool double_mode = (mode == DFmode || mode == V2DFmode);
+  /* Estimate the approximate reciprocal square root.  */
+  rtx xdst = gen_reg_rtx (mode);
+  emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
 
-  int iterations = double_mode ? 3 : 2;
+  /* Iterate over the series twice for SF and thrice for DF.  */
+  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
 
-  /* Optionally iterate over the series one less time than otherwise.  */
-  if (flag_mrecip_low_precision_sqrt)
+  /* Optionally iterate over the series once less for faster performance
+     while sacrificing the accuracy.  */
+  if ((recp && flag_mrecip_low_precision_sqrt)
+      || (!recp && flag_mlow_precision_sqrt))
     iterations--;
 
-  for (int i = 0; i < iterations; ++i)
+  /* Iterate over the series to calculate the approximate reciprocal square root.  */
+  while (iterations--)
     {
-      rtx x1 = gen_reg_rtx (mode);
       rtx x2 = gen_reg_rtx (mode);
-      rtx x3 = gen_reg_rtx (mode);
-      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+      emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
 
-      emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
+      rtx x1 = gen_reg_rtx (mode);
+      emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
 
-      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
-      x0 = x1;
+      emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
     }
 
-  emit_move_insn (dst, x0);
+  if (!recp)
+    {
+      /* Qualify the approximate reciprocal square root when the argument is 0.0.  */
+      if (scalar)
+	/* Conditionally set the result to 0.0.  */
+	emit_set_insn (xdst, gen_rtx_IF_THEN_ELSE (mode, xne, xdst, src));
+      else
+	{
+	  /* Mask off any resulting vector element to 0.0.  */
+	  rtx xtmp = gen_reg_rtx (mmsk);
+	  emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
+					    gen_rtx_SUBREG (mmsk, xdst, 0)));
+	  emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
+	}
+
+      /* Calculate the approximate square root.  */
+      emit_set_insn (dst, gen_rtx_MULT (mode, xdst, src));
+    }
+  else
+    /* Return the calculated approximate reciprocal square root.  */
+    emit_move_insn (dst, xdst);
+
+  return true;
 }
 
 /* Return the number of instructions that can be issued per cycle.  */
@@ -8144,6 +8215,12 @@ aarch64_override_options_after_change_1 (struct gcc_options *opts)
       && (aarch64_cmodel == AARCH64_CMODEL_TINY
 	  || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC))
     aarch64_nopcrelative_literal_loads = false;
+
+  /* When enabling the lower precision Newton series for the square root, also
+     enable it for the reciprocal square root, since the later is an
+     intermediary step for the latter.  */
+  if (flag_mlow_precision_sqrt)
+    flag_mrecip_low_precision_sqrt = true;
 }
 
 /* 'Unpack' up the internal tuning structs and update the options
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 68676c9..43fa318 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4665,7 +4665,16 @@
   [(set_attr "type" "ffarith<s>")]
 )
 
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:GPF 0 "register_operand")
+        (sqrt:GPF (match_operand:GPF 1 "register_operand")))]
+  "TARGET_SIMD"
+{
+  if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
+    DONE;
+})
+
+(define_insn "*sqrt<mode>2"
   [(set (match_operand:GPF 0 "register_operand" "=w")
         (sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))]
   "TARGET_FLOAT"
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index c637ff4..ffd5540 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -151,5 +151,10 @@ PC relative literal loads.
 
 mlow-precision-recip-sqrt
 Common Var(flag_mrecip_low_precision_sqrt) Optimization
-When calculating the reciprocal square root approximation,
-uses one less step than otherwise, thus reducing latency and precision.
+When calculating the approximate reciprocal square root,
+use one less step than otherwise, thus reducing latency and precision.
+
+mlow-precision-sqrt
+Common Var(flag_mlow_precision_sqrt) Optimization
+When calculating the approximate square root,
+use one less step than otherwise, thus reducing latency and precision.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 99ac11b..433a9f2 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -573,6 +573,7 @@ Objective-C and Objective-C++ Dialects}.
 -mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
 -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
 -mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
+-mlow-precision-sqrt -mno-low-precision-sqrt@gol
 -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
 
 @emph{Adapteva Epiphany Options}
@@ -12908,6 +12909,15 @@ uses one less step than otherwise, thus reducing latency and precision.
 This is only relevant if @option{-ffast-math} enables the reciprocal square root
 approximation, which in turn depends on the target processor.
 
+@item -mlow-precision-sqrt
+@item -mno-low-precision-sqrt
+@opindex -mlow-precision-sqrt
+@opindex -mno-low-precision-sqrt
+When calculating the square root approximation,
+uses one less step than otherwise, thus reducing latency and precision.
+This is only relevant if @option{-ffast-math} enables the square root
+approximation, which in turn depends on the target processor.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture and, optionally, one or
-- 
1.9.1


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]