This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] 2015-07-29 Benedikt Huber <benedikt dot huber at theobroma-systems dot com> Philipp Tomsich <philipp dot tomsich at theobroma-systems dot com>


Hi Benedikt,

On 29/07/15 18:47, Benedikt Huber wrote:
         * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and
         rsqrtf.
         * config/aarch64/aarch64-opts.h: -mrecip has a default value
         depending on the core.
         * config/aarch64/aarch64-protos.h: Declare.
         * config/aarch64/aarch64-simd.md: Matching expressions for
         frsqrte and frsqrts.
         * config/aarch64/aarch64.c: New functions. Emit rsqrt
         estimation code in fast math mode.
         * config/aarch64/aarch64.md: Added enum entries.
         * config/aarch64/aarch64.opt: Added options -mrecip and
         -mlow-precision-recip-sqrt.
         * testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans
         for frsqrte and frsqrts.
         * testsuite/gcc.target/aarch64/rsqrt.c: Functional tests for rsqrt.

<snip>

+
  /* Expand an expression EXP that calls a built-in function,
     with result going to TARGET if that's convenient.  */
  rtx
@@ -1139,6 +1213,13 @@ aarch64_expand_builtin (tree exp,
    else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX)
      return aarch64_crc32_expand_builtin (fcode, exp, target);

+  if (fcode == AARCH64_BUILTIN_RSQRT_DF   ||
+      fcode == AARCH64_BUILTIN_RSQRT_SF   ||
+      fcode == AARCH64_BUILTIN_RSQRT_V2DF ||
+      fcode == AARCH64_BUILTIN_RSQRT_V2SF ||
+      fcode == AARCH64_BUILTIN_RSQRT_V4SF)
+    return aarch64_expand_builtin_rsqrt (fcode, exp, target);
+

The style guidelines are to put the '||' on a new line like so:

 (fcode == AARCH64_BUILTIN_RSQRT_DF
  || fcode == AARCH64_BUILTIN_RSQRT_SF
  || ...


    gcc_unreachable ();
  }

@@ -1296,6 +1377,28 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
    return NULL_TREE;
  }

+tree
+aarch64_builtin_rsqrt (unsigned int fn, bool md_fn)
+{
+  if (md_fn)
+    {
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df)
+        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF];
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf)
+        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF];
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf)
+        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF];
+    }
+  else
+    {
+      if (fn == BUILT_IN_SQRT)
+        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF];
+      if (fn == BUILT_IN_SQRTF)
+        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF];
+    }
+  return NULL_TREE;
+}
+
  #undef VAR1
  #define VAR1(T, N, MAP, A) \
    case AARCH64_SIMD_BUILTIN_##T##_##N##A:
diff --git a/gcc/config/aarch64/aarch64-opts.h b/gcc/config/aarch64/aarch64-opts.h
index 24bfd9f..75e9c67 100644
--- a/gcc/config/aarch64/aarch64-opts.h
+++ b/gcc/config/aarch64/aarch64-opts.h
@@ -64,4 +64,11 @@ enum aarch64_code_model {
    AARCH64_CMODEL_LARGE
  };

+/* Each core can have -mrecip enabled or disabled by default. */
+enum aarch64_mrecip {
+  AARCH64_MRECIP_OFF = 0,
+  AARCH64_MRECIP_ON,
+  AARCH64_MRECIP_DEFAULT,
+};
+
  #endif
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 4062c27..dabe345 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -197,6 +197,7 @@ struct tune_params
    int min_div_recip_mul_sf;
    int min_div_recip_mul_df;
    unsigned int extra_tuning_flags;
+  bool mrecip_default_enabled;
  };

Please create a new field to put into the extra_tuning_flags.
This involves adding an entry aarch64-tuning-flags.def.
This way we avoid overpopulating the tuning structure with a multitude of
on/off boolean options.


  #define AARCH64_FUSION_PAIR(x, name, index) \
@@ -321,6 +322,8 @@ void aarch64_print_operand (FILE *, rtx, char);
  void aarch64_print_operand_address (FILE *, rtx);
  void aarch64_emit_call_insn (rtx);

+void aarch64_emit_swrsqrt (rtx, rtx);
+
  /* Initialize builtins for SIMD intrinsics.  */
  void init_aarch64_simd_builtins (void);

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index b90f938..ac8e12d 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -353,6 +353,33 @@
    [(set_attr "type" "neon_fp_mul_d_scalar_q")]
  )

+(define_insn "*rsqrte_<mode>"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+                    UNSPEC_RSQRTE))]
+  "TARGET_SIMD"
+  "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
+
+(define_insn "*rsqrts_<mode>"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
+               (match_operand:VALLF 2 "register_operand" "w")]
+                    UNSPEC_RSQRTS))]
+  "TARGET_SIMD"
+  "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
+
+(define_expand "rsqrt_<mode>"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+                    UNSPEC_RSQRT))]
+  "TARGET_SIMD"
+{
+  aarch64_emit_swrsqrt (operands[0], operands[1]);
+  DONE;
+})

Please add the number of operands to the pattern names, for example:
"rsqrte_<mode>2"


+
  (define_insn "*aarch64_mul3_elt_to_64v2df"
    [(set (match_operand:DF 0 "register_operand" "=w")
       (mult:DF
<snip>


  /* Support for fine-grained override of the tuning structures.  */
@@ -6961,6 +6967,66 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
    return aarch64_tune_params.memmov_cost;
  }

+extern tree aarch64_builtin_rsqrt (unsigned int fn, bool md_fn);
+
+static tree
+aarch64_builtin_reciprocal (unsigned int fn,
+                            bool md_fn,
+                            bool)
+{
+  if (!flag_finite_math_only ||
+      flag_trapping_math ||
+      !flag_unsafe_math_optimizations ||
+      optimize_size ||
+      flag_mrecip == AARCH64_MRECIP_OFF ||
+      (flag_mrecip == AARCH64_MRECIP_DEFAULT &&
+       !aarch64_tune_params.mrecip_default_enabled))

Similar comment on the placement of the '||'.

+  {
+    return NULL_TREE;
+  }
+
+  return aarch64_builtin_rsqrt (fn, md_fn);
+}
+
+void
+aarch64_emit_swrsqrt (rtx dst, rtx src)
+{
+  enum machine_mode mode = GET_MODE (src);
+  gcc_assert (
+    mode == SFmode || mode == V2SFmode || mode == V4SFmode ||
+    mode == DFmode || mode == V2DFmode);
+
+  rtx xsrc = gen_reg_rtx (mode);
+  emit_move_insn (xsrc, src);
+  rtx x0 = gen_reg_rtx (mode);
+  emit_set_insn (x0,
+    gen_rtx_UNSPEC (mode, gen_rtvec (1, xsrc), UNSPEC_RSQRTE));
+
+  bool double_mode = (mode == DFmode || mode == V2DFmode);
+
+  int iterations = 2;
+  if (double_mode)
+    iterations = 3;
+
+  if (flag_mrecip_low_precision_sqrt)
+    iterations--;
+
+  for (int i = 0; i < iterations; ++i)
+    {
+      rtx x1 = gen_reg_rtx (mode);
+      rtx x2 = gen_reg_rtx (mode);
+      rtx x3 = gen_reg_rtx (mode);
+      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+      emit_set_insn (x3, gen_rtx_UNSPEC (
+        mode, gen_rtvec (2, xsrc, x2), UNSPEC_RSQRTS));
+      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
+      x0 = x1;

You can probably avoid manually creating these RTXes and risking
mis-constructing them (or someone modifying them by mistake in the future)
by just calling the gen_* functions for the patterns you want to generate.
For example, instead of doing:

+      emit_set_insn (x3, gen_rtx_UNSPEC (
+        mode, gen_rtvec (2, xsrc, x2), UNSPEC_RSQRTS));


You can probably just do:
emit_insn (gen_rsqrts_{sf,df}3 (x3, xsrc, x2));
This will require you to rename the patterns in aarch64.md to not have a '*'
in their name.



+
+  emit_move_insn (dst, x0);
+  return;
+}
+
  /* Return the number of instructions that can be issued per cycle.  */
  static int
  aarch64_sched_issue_rate (void)
@@ -12099,6 +12165,9 @@ aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
  #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
  #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

+#undef TARGET_BUILTIN_RECIPROCAL
+#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
+
  #undef TARGET_VECTOR_MODE_SUPPORTED_P
  #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 1e343fa..d7944b2 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -122,6 +122,9 @@
      UNSPEC_VSTRUCTDUMMY
      UNSPEC_SP_SET
      UNSPEC_SP_TEST
+    UNSPEC_RSQRT
+    UNSPEC_RSQRTE
+    UNSPEC_RSQRTS
  ])

  (define_c_enum "unspecv" [
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index 98ef9f6..581dc21 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -124,3 +124,11 @@ Enum(aarch64_abi) String(ilp32) Value(AARCH64_ABI_ILP32)

  EnumValue
  Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64)
+
+mrecip
+Common Report Var(flag_mrecip) Optimization Init(AARCH64_MRECIP_DEFAULT)
+Generate software reciprocal square root for better throughput. Default with fast-math.

s/fast-math/-ffast-math/.
Also, this is not the default with -ffast-math. The default now
depends on the tuning core, right?

Thanks,
Kyrill


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]