[PATCH] 2015-06-15 Benedikt Huber <benedikt.huber@theobroma-systems.com>

Sat Jun 27 08:12:00 GMT 2015

On Thu, Jun 18, 2015 at 5:04 AM, Benedikt Huber
<benedikt.huber@theobroma-systems.com> wrote:
A few comments about the patch itself besides the cost model issue and
iteration issue mentioned in the other email thread.

>        * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
>        * config/aarch64/aarch64-protos.h: Declare.
>        * config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and frsqrts.
>        * config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code in fast math mode.
>        * config/aarch64/aarch64.md: Added enum entry.
>        * testsuite/gcc.target/aarch64/rsqrt.c: Tests for single and double.
> ---
>  gcc/ChangeLog                            |   9 +++
>  gcc/config/aarch64/aarch64-builtins.c    |  60 ++++++++++++++++
>  gcc/config/aarch64/aarch64-protos.h      |   2 +
>  gcc/config/aarch64/aarch64-simd.md       |  27 ++++++++
>  gcc/config/aarch64/aarch64.c             |  63 +++++++++++++++++
>  gcc/config/aarch64/aarch64.md            |   3 +
>  gcc/testsuite/gcc.target/aarch64/rsqrt.c | 113 +++++++++++++++++++++++++++++++
>  7 files changed, 277 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt.c
>
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index c9b156f..690ebba 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,12 @@
> +2015-06-15  Benedikt Huber  <benedikt.huber@theobroma-systems.com>
> +
> +       * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
> +       * config/aarch64/aarch64-protos.h: Declare.
> +       * config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and frsqrts.
> +       * config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code in fast math mode.
> +       * config/aarch64/aarch64.md: Added enum entry.
> +       * testsuite/gcc.target/aarch64/rsqrt.c: Tests for single and double.
> +
>  2015-06-14  Richard Sandiford  <richard.sandiford@arm.com>
>
>         * rtl.h (classify_insn): Declare.
> diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
> index f7a39ec..484bb84 100644
> --- a/gcc/config/aarch64/aarch64-builtins.c
> +++ b/gcc/config/aarch64/aarch64-builtins.c
> @@ -342,6 +342,8 @@ enum aarch64_builtins
>    AARCH64_BUILTIN_GET_FPSR,
>    AARCH64_BUILTIN_SET_FPSR,
>
> +  AARCH64_BUILTIN_RSQRT,
> +  AARCH64_BUILTIN_RSQRTF,
>    AARCH64_SIMD_BUILTIN_BASE,
>    AARCH64_SIMD_BUILTIN_LANE_CHECK,
>  #include "aarch64-simd-builtins.def"
> @@ -831,6 +833,32 @@ aarch64_init_crc32_builtins ()
>  }
>
>  void
> +aarch64_add_builtin_rsqrt (void)
> +{
> +  tree fndecl = NULL;
> +  tree ftype = NULL;
> +  ftype = build_function_type_list (double_type_node, double_type_node, NULL_TREE);
> +
> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt",
> +                                 ftype,
> +                                 AARCH64_BUILTIN_RSQRT,
> +                                 BUILT_IN_MD,
> +                                 NULL,
> +                                 NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT] = fndecl;
> +
> +  tree ftypef = NULL;
> +  ftypef = build_function_type_list (float_type_node, float_type_node, NULL_TREE);
> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrtf",
> +                                 ftypef,
> +                                 AARCH64_BUILTIN_RSQRTF,
> +                                 BUILT_IN_MD,
> +                                 NULL,
> +                                 NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRTF] = fndecl;
> +}
> +
> +void
>  aarch64_init_builtins (void)
>  {
>    tree ftype_set_fpr
> @@ -855,6 +883,7 @@ aarch64_init_builtins (void)
>      aarch64_init_simd_builtins ();
>    if (TARGET_CRC32)
>      aarch64_init_crc32_builtins ();
> +  aarch64_add_builtin_rsqrt ();
>  }
>
>  tree
> @@ -1099,6 +1128,23 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, rtx target)
>    return target;
>  }
>
> +static rtx
> +aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
> +{
> +  rtx pat;
> +  tree arg0 = CALL_EXPR_ARG (exp, 0);
> +  rtx op0 = expand_normal (arg0);
> +
> +  enum insn_code c = CODE_FOR_rsqrtdf;
> +  if (fcode == AARCH64_BUILTIN_RSQRTF)
> +    c = CODE_FOR_rsqrtsf;
> +
> +  pat = GEN_FCN (c) (target, op0);
> +  emit_insn (pat);
> +
> +  return target;
> +}
> +
>  /* Expand an expression EXP that calls a built-in function,
>     with result going to TARGET if that's convenient.  */
>  rtx
> @@ -1146,6 +1192,11 @@ aarch64_expand_builtin (tree exp,
>    else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX)
>      return aarch64_crc32_expand_builtin (fcode, exp, target);
>
> +  if (fcode == AARCH64_BUILTIN_RSQRT ||
> +      fcode == AARCH64_BUILTIN_RSQRTF)
> +    return aarch64_expand_builtin_rsqrt (fcode, exp, target);
> +
> +  return NULL_RTX;
>    gcc_unreachable ();
>  }
>
> @@ -1303,6 +1354,15 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
>    return NULL_TREE;
>  }
>
> +tree
> +aarch64_builtin_rsqrt (bool is_float)
> +{
> +  if (is_float)
> +    return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRTF];
> +  else
> +    return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT];
> +}
> +
>  #undef VAR1
>  #define VAR1(T, N, MAP, A) \
>    case AARCH64_SIMD_BUILTIN_##T##_##N##A:
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 965a11b..4f1c8ce 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -270,6 +270,8 @@ void aarch64_print_operand (FILE *, rtx, char);
>  void aarch64_print_operand_address (FILE *, rtx);
>  void aarch64_emit_call_insn (rtx);
>
> +void aarch64_emit_swrsqrt (rtx, rtx);
> +
>  /* Initialize builtins for SIMD intrinsics.  */
>  void init_aarch64_simd_builtins (void);
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index b90f938..266800a 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -353,6 +353,33 @@
>    [(set_attr "type" "neon_fp_mul_d_scalar_q")]
>  )
>
> +(define_insn "*rsqrte_simd"
> +  [(set (match_operand:VALLF 0 "register_operand" "=w")
> +       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
> +                    UNSPEC_RSQRTE))]
> +  "TARGET_SIMD"
> +  "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
> +  [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])

I think it might be useful to have the mode on the pattern name.

> +
> +(define_insn "*rsqrts_simd"

Likewise.

> +  [(set (match_operand:VALLF 0 "register_operand" "=w")
> +       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
> +               (match_operand:VALLF 2 "register_operand" "w")]
> +                    UNSPEC_RSQRTS))]
> +  "TARGET_SIMD"
> +  "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
> +  [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
> +
> +(define_expand "rsqrt<mode>"
> +  [(set (match_operand:GPF 0 "register_operand" "=w")
> +       (unspec:GPF [(match_operand:GPF 1 "register_operand" "w")]
> +                    UNSPEC_RSQRT))]
> +  "TARGET_FLOAT"
> +{
> +  aarch64_emit_swrsqrt (operands[0], operands[1]);
> +  DONE;
> +})
> +
>  (define_insn "*aarch64_mul3_elt_to_64v2df"
>    [(set (match_operand:DF 0 "register_operand" "=w")
>       (mult:DF
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index c3c2795..281f0b2 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -6816,6 +6816,66 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
>    return aarch64_tune_params->memmov_cost;
>  }
>
> +extern tree aarch64_builtin_rsqrt (bool is_float);
> +
> +static tree
> +aarch64_builtin_reciprocal (unsigned int fn,
> +                            bool md_fn ATTRIBUTE_UNUSED,
> +                            bool sqrt ATTRIBUTE_UNUSED)

You don't need ATTRIBUTE_UNUSED any more if you are not using the
variable at all.  Just remove the variable name, this is C++ code
after all :).

> +{
> +  if (!fast_math_flags_set_p (&global_options))
> +    return NULL_TREE;

Is this correct check here?  I suspect you want
!(flag_finite_math_only && !flag_trapping_math &&
flag_unsafe_math_optimizations) instead so someone can do -ffast-math
-fno-finite-math-only.

is there a reason why you don't check md_fn here at all because if
md_fn is true, and you get the same value for fn as
BUILT_IN_SQRTF/BUILT_IN_SQRT, we would get incorrect code.
Also I think you should check md_fn and support the vectorized functions here.

> +
> +  if (fn == BUILT_IN_SQRTF)
> +    return aarch64_builtin_rsqrt (true);
> +  else if (fn == BUILT_IN_SQRT)
> +    return aarch64_builtin_rsqrt (false);
> +  else
> +    return NULL_TREE;
> +}
> +
> +void
> +aarch64_emit_swrsqrt (rtx dst, rtx src)
> +{
> +  enum machine_mode mode = GET_MODE (src);
> +  rtx xsrc = gen_reg_rtx (mode);
> +  emit_set_insn (xsrc, src);

Why do you use emit_set_insn here instead of emit_move_insn?

> +
> +  rtx x0 = gen_reg_rtx (mode);
> +  emit_insn (gen_rtx_SET (x0,
> +                          gen_rtx_UNSPEC (mode, gen_rtvec (1, xsrc),
> +                                          UNSPEC_RSQRTE)));

And not emit_set_insn here?

> +
> +  bool double_mode = (DFmode   == mode ||
> +                      V1DFmode == mode ||
> +                      V2DFmode == mode ||
> +                      V4DFmode == mode ||
> +                      V6DFmode == mode ||
> +                      V8DFmode == mode);

Why not use:
mode == DFmode || (VECTOR_MODE_P (mode) && GET_MODE_INNER (mode) == DFmode);
At least it becomes more obvious what is done here.
Or just:
mode == DFmode || mode == V2DFmode || mode == V4DFmode

Also it might make sense to have an assert on the modes we accept at
the beginning of the function.

> +
> +  int iterations = 2;
> +  if (double_mode)
> +    iterations = 3;
> +
> +  for (int i = 0; i < iterations; ++i)
> +    {
> +      rtx x1 = gen_reg_rtx (mode);
> +      rtx x2 = gen_reg_rtx (mode);
> +      rtx x3 = gen_reg_rtx (mode);
> +      emit_insn (gen_rtx_SET (x2,
> +                              gen_rtx_MULT (mode, x0, x0)));
> +      emit_insn (gen_rtx_SET (x3,
> +                              gen_rtx_UNSPEC (mode, gen_rtvec (2, xsrc, x2),
> +                                              UNSPEC_RSQRTS)));
> +      emit_insn (gen_rtx_SET (x1,
> +                              gen_rtx_MULT (mode, x0, x3)));

More of emit_set_insn usage.

> +      x0 = x1;
> +    }
> +
> +  emit_move_insn (dst, x0);
> +  return;
> +}
> +
>  /* Return the number of instructions that can be issued per cycle.  */
>  static int
>  aarch64_sched_issue_rate (void)
> @@ -11747,6 +11807,9 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
>  #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
>  #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
>
> +#undef TARGET_BUILTIN_RECIPROCAL
> +#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
> +
>  #undef TARGET_VECTOR_MODE_SUPPORTED_P
>  #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 11123d6..7272d05 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -120,6 +120,9 @@
>      UNSPEC_VSTRUCTDUMMY
>      UNSPEC_SP_SET
>      UNSPEC_SP_TEST
> +    UNSPEC_RSQRT
> +    UNSPEC_RSQRTE
> +    UNSPEC_RSQRTS
>  ])
>
>  (define_c_enum "unspecv" [
> diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt.c b/gcc/testsuite/gcc.target/aarch64/rsqrt.c
> new file mode 100644
> index 0000000..6607483
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/rsqrt.c
> @@ -0,0 +1,113 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -fno-inline --save-temps -ffast-math" } */

Maybe instead of -fno-inline, you can add some volatile to some
variables.  We really should have separate tests for the assembly scan
vs the runtime tests too.

> +
> +#include <math.h>
> +#include <stdio.h>
> +
> +#include <values.h>

Is values.h a standard header file.  I suspect you should be using
float.h instead which is the standard header file.
> +
> +#define PI    3.141592653589793
> +#define SQRT2 1.4142135623730951
> +
> +#define PI_4 0.7853981633974483
> +#define SQRT1_2 0.7071067811865475
> +
> +/* 2^25+1, float has 24 significand bits
> + *       according to Single-precision floating-point format.  */
> +#define TESTA8_FLT 33554433
> +/* 2^54+1, double has 53 significand bits
> + *       according to Double-precision floating-point format.  */
> +#define TESTA8_DBL 18014398509481985
> +
> +#define SD(a, b) t_double ((#a), (a), (b));
> +#define SF(a, b) t_float ((#a), (a), (b));
> +
> +#define EPSILON_double __DBL_EPSILON__
> +#define EPSILON_float __FLT_EPSILON__
> +#define ABS_double fabs
> +#define ABS_float fabsf
> +#define SQRT_double sqrt
> +#define SQRT_float sqrtf

I suspect you should have those using the builtin versions of the
above functions instead of the named ones.

Thanks,
Andrew Pinski

> +
> +extern void abort (void);
> +
> +#define TESTTYPE(TYPE)                                                     \
> +TYPE rsqrt_##TYPE (TYPE a)                                                 \
> +{                                                                          \
> +  return 1.0/SQRT_##TYPE(a);                                               \
> +}                                                                          \
> +                                                                           \
> +int equals_##TYPE (TYPE a, TYPE b)                                         \
> +{                                                                          \
> +  return (a == b ||                                                        \
> +          (isnan (a) && isnan (b)) ||                                      \
> +          (ABS_##TYPE (a - b) < EPSILON_##TYPE));                          \
> +}                                                                          \
> +                                                                           \
> +void t_##TYPE (const char *s, TYPE a, TYPE result)                         \
> +{                                                                          \
> +  TYPE r = rsqrt_##TYPE (a);                                               \
> +  if (!equals_##TYPE (r, result))                                          \
> +  {                                                                        \
> +    abort ();                                                              \
> +  }                                                                        \
> +}                                                                          \
> +
> +//  printf ("Problem in %20s: %30.18A should be %30.18A\n", s, r, result); \
> +
> +TESTTYPE(double)
> +TESTTYPE(float)
> +
> +/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, d\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 3 } } */
> +
> +/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, s\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, s\[0-9\]+, s\[0-9\]+" 2 } } */
> +
> +int main ()
> +{
> +  SD(    1.0/256, 0X1.00000000000000P+4  );
> +  SD(        1.0, 0X1.00000000000000P+0  );
> +  SD(       -1.0,                     NAN);
> +  SD(       11.0, 0X1.34BF63D1568260P-2  );
> +  SD(        0.0,                INFINITY);
> +  SD(   INFINITY, 0X0.00000000000000P+0  );
> +  SD(        NAN,                     NAN);
> +  SD(       -NAN,                    -NAN);
> +  SD(    DBL_MAX, 0X1.00000000000010P-512);
> +  SD(    DBL_MIN, 0X1.00000000000000P+511);
> +  SD(         PI, 0X1.20DD750429B6D0P-1  );
> +  SD(       PI_4, 0X1.20DD750429B6D0P+0  );
> +  SD(      SQRT2, 0X1.AE89F995AD3AE0P-1  );
> +  SD(    SQRT1_2, 0X1.306FE0A31B7150P+0  );
> +  SD(        -PI,                     NAN);
> +  SD(     -SQRT2,                     NAN);
> +  SD( TESTA8_DBL, 0X1.00000000000000P-27 );
> +
> +  SF(    1.0/256, 0X1.00000000000000P+4  );
> +  SF(        1.0, 0X1.00000000000000P+0  );
> +  SF(       -1.0,                     NAN);
> +  SF(       11.0, 0X1.34BF6400000000P-2  );
> +  SF(        0.0,                INFINITY);
> +  SF(   INFINITY, 0X0.00000000000000P+0  );
> +  SF(        NAN,                     NAN);
> +  SF(       -NAN,                    -NAN);
> +  SF(    FLT_MAX, 0X1.00000200000000P-64 );
> +  SF(    FLT_MIN, 0X1.00000000000000P+63 );
> +  SF(         PI, 0X1.20DD7400000000P-1  );
> +  SF(       PI_4, 0X1.20DD7400000000P+0  );
> +  SF(      SQRT2, 0X1.AE89FA00000000P-1  );
> +  SF(    SQRT1_2, 0X1.306FE000000000P+0  );
> +  SF(        -PI,                     NAN);
> +  SF(     -SQRT2,                     NAN);
> +  SF( TESTA8_FLT, 0X1.6A09E600000000P-13 );
> +
> +//   With -ffast-math these return positive INF.
> +//   SD(       -0.0,               -INFINITY);
> +//   SF(       -0.0,               -INFINITY);
> +//   The reason here is that -ffast-math flushes to zero.
> +//   SD(DBL_MIN/256, 0X1.00000000000000P+515);
> +//   SF(FLT_MIN/256, 0X1.00000000000000P+67 );
> +
> +  return 0;
> +}
> --
> 1.9.1
>