[PATCH] 2015-06-15 Benedikt Huber <benedikt.huber@theobroma-systems.com>
Andrew Pinski
pinskia@gmail.com
Sat Jun 27 08:12:00 GMT 2015
On Thu, Jun 18, 2015 at 5:04 AM, Benedikt Huber
<benedikt.huber@theobroma-systems.com> wrote:
A few comments about the patch itself besides the cost model issue and
iteration issue mentioned in the other email thread.
> * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
> * config/aarch64/aarch64-protos.h: Declare.
> * config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and frsqrts.
> * config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code in fast math mode.
> * config/aarch64/aarch64.md: Added enum entry.
> * testsuite/gcc.target/aarch64/rsqrt.c: Tests for single and double.
> ---
> gcc/ChangeLog | 9 +++
> gcc/config/aarch64/aarch64-builtins.c | 60 ++++++++++++++++
> gcc/config/aarch64/aarch64-protos.h | 2 +
> gcc/config/aarch64/aarch64-simd.md | 27 ++++++++
> gcc/config/aarch64/aarch64.c | 63 +++++++++++++++++
> gcc/config/aarch64/aarch64.md | 3 +
> gcc/testsuite/gcc.target/aarch64/rsqrt.c | 113 +++++++++++++++++++++++++++++++
> 7 files changed, 277 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt.c
>
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index c9b156f..690ebba 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,12 @@
> +2015-06-15 Benedikt Huber <benedikt.huber@theobroma-systems.com>
> +
> + * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
> + * config/aarch64/aarch64-protos.h: Declare.
> + * config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and frsqrts.
> + * config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code in fast math mode.
> + * config/aarch64/aarch64.md: Added enum entry.
> + * testsuite/gcc.target/aarch64/rsqrt.c: Tests for single and double.
> +
> 2015-06-14 Richard Sandiford <richard.sandiford@arm.com>
>
> * rtl.h (classify_insn): Declare.
> diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
> index f7a39ec..484bb84 100644
> --- a/gcc/config/aarch64/aarch64-builtins.c
> +++ b/gcc/config/aarch64/aarch64-builtins.c
> @@ -342,6 +342,8 @@ enum aarch64_builtins
> AARCH64_BUILTIN_GET_FPSR,
> AARCH64_BUILTIN_SET_FPSR,
>
> + AARCH64_BUILTIN_RSQRT,
> + AARCH64_BUILTIN_RSQRTF,
> AARCH64_SIMD_BUILTIN_BASE,
> AARCH64_SIMD_BUILTIN_LANE_CHECK,
> #include "aarch64-simd-builtins.def"
> @@ -831,6 +833,32 @@ aarch64_init_crc32_builtins ()
> }
>
> void
> +aarch64_add_builtin_rsqrt (void)
> +{
> + tree fndecl = NULL;
> + tree ftype = NULL;
> + ftype = build_function_type_list (double_type_node, double_type_node, NULL_TREE);
> +
> + fndecl = add_builtin_function ("__builtin_aarch64_rsqrt",
> + ftype,
> + AARCH64_BUILTIN_RSQRT,
> + BUILT_IN_MD,
> + NULL,
> + NULL_TREE);
> + aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT] = fndecl;
> +
> + tree ftypef = NULL;
> + ftypef = build_function_type_list (float_type_node, float_type_node, NULL_TREE);
> + fndecl = add_builtin_function ("__builtin_aarch64_rsqrtf",
> + ftypef,
> + AARCH64_BUILTIN_RSQRTF,
> + BUILT_IN_MD,
> + NULL,
> + NULL_TREE);
> + aarch64_builtin_decls[AARCH64_BUILTIN_RSQRTF] = fndecl;
> +}
> +
> +void
> aarch64_init_builtins (void)
> {
> tree ftype_set_fpr
> @@ -855,6 +883,7 @@ aarch64_init_builtins (void)
> aarch64_init_simd_builtins ();
> if (TARGET_CRC32)
> aarch64_init_crc32_builtins ();
> + aarch64_add_builtin_rsqrt ();
> }
>
> tree
> @@ -1099,6 +1128,23 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, rtx target)
> return target;
> }
>
> +static rtx
> +aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
> +{
> + rtx pat;
> + tree arg0 = CALL_EXPR_ARG (exp, 0);
> + rtx op0 = expand_normal (arg0);
> +
> + enum insn_code c = CODE_FOR_rsqrtdf;
> + if (fcode == AARCH64_BUILTIN_RSQRTF)
> + c = CODE_FOR_rsqrtsf;
> +
> + pat = GEN_FCN (c) (target, op0);
> + emit_insn (pat);
> +
> + return target;
> +}
> +
> /* Expand an expression EXP that calls a built-in function,
> with result going to TARGET if that's convenient. */
> rtx
> @@ -1146,6 +1192,11 @@ aarch64_expand_builtin (tree exp,
> else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX)
> return aarch64_crc32_expand_builtin (fcode, exp, target);
>
> + if (fcode == AARCH64_BUILTIN_RSQRT ||
> + fcode == AARCH64_BUILTIN_RSQRTF)
> + return aarch64_expand_builtin_rsqrt (fcode, exp, target);
> +
> + return NULL_RTX;
> gcc_unreachable ();
> }
>
> @@ -1303,6 +1354,15 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
> return NULL_TREE;
> }
>
> +tree
> +aarch64_builtin_rsqrt (bool is_float)
> +{
> + if (is_float)
> + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRTF];
> + else
> + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT];
> +}
> +
> #undef VAR1
> #define VAR1(T, N, MAP, A) \
> case AARCH64_SIMD_BUILTIN_##T##_##N##A:
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 965a11b..4f1c8ce 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -270,6 +270,8 @@ void aarch64_print_operand (FILE *, rtx, char);
> void aarch64_print_operand_address (FILE *, rtx);
> void aarch64_emit_call_insn (rtx);
>
> +void aarch64_emit_swrsqrt (rtx, rtx);
> +
> /* Initialize builtins for SIMD intrinsics. */
> void init_aarch64_simd_builtins (void);
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index b90f938..266800a 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -353,6 +353,33 @@
> [(set_attr "type" "neon_fp_mul_d_scalar_q")]
> )
>
> +(define_insn "*rsqrte_simd"
> + [(set (match_operand:VALLF 0 "register_operand" "=w")
> + (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
> + UNSPEC_RSQRTE))]
> + "TARGET_SIMD"
> + "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
> + [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
I think it might be useful to have the mode on the pattern name.
> +
> +(define_insn "*rsqrts_simd"
Likewise.
> + [(set (match_operand:VALLF 0 "register_operand" "=w")
> + (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
> + (match_operand:VALLF 2 "register_operand" "w")]
> + UNSPEC_RSQRTS))]
> + "TARGET_SIMD"
> + "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
> + [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
> +
> +(define_expand "rsqrt<mode>"
> + [(set (match_operand:GPF 0 "register_operand" "=w")
> + (unspec:GPF [(match_operand:GPF 1 "register_operand" "w")]
> + UNSPEC_RSQRT))]
> + "TARGET_FLOAT"
> +{
> + aarch64_emit_swrsqrt (operands[0], operands[1]);
> + DONE;
> +})
> +
> (define_insn "*aarch64_mul3_elt_to_64v2df"
> [(set (match_operand:DF 0 "register_operand" "=w")
> (mult:DF
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index c3c2795..281f0b2 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -6816,6 +6816,66 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
> return aarch64_tune_params->memmov_cost;
> }
>
> +extern tree aarch64_builtin_rsqrt (bool is_float);
> +
> +static tree
> +aarch64_builtin_reciprocal (unsigned int fn,
> + bool md_fn ATTRIBUTE_UNUSED,
> + bool sqrt ATTRIBUTE_UNUSED)
You don't need ATTRIBUTE_UNUSED any more if you are not using the
variable at all. Just remove the variable name, this is C++ code
after all :).
> +{
> + if (!fast_math_flags_set_p (&global_options))
> + return NULL_TREE;
Is this correct check here? I suspect you want
!(flag_finite_math_only && !flag_trapping_math &&
flag_unsafe_math_optimizations) instead so someone can do -ffast-math
-fno-finite-math-only.
is there a reason why you don't check md_fn here at all because if
md_fn is true, and you get the same value for fn as
BUILT_IN_SQRTF/BUILT_IN_SQRT, we would get incorrect code.
Also I think you should check md_fn and support the vectorized functions here.
> +
> + if (fn == BUILT_IN_SQRTF)
> + return aarch64_builtin_rsqrt (true);
> + else if (fn == BUILT_IN_SQRT)
> + return aarch64_builtin_rsqrt (false);
> + else
> + return NULL_TREE;
> +}
> +
> +void
> +aarch64_emit_swrsqrt (rtx dst, rtx src)
> +{
> + enum machine_mode mode = GET_MODE (src);
> + rtx xsrc = gen_reg_rtx (mode);
> + emit_set_insn (xsrc, src);
Why do you use emit_set_insn here instead of emit_move_insn?
> +
> + rtx x0 = gen_reg_rtx (mode);
> + emit_insn (gen_rtx_SET (x0,
> + gen_rtx_UNSPEC (mode, gen_rtvec (1, xsrc),
> + UNSPEC_RSQRTE)));
And not emit_set_insn here?
> +
> + bool double_mode = (DFmode == mode ||
> + V1DFmode == mode ||
> + V2DFmode == mode ||
> + V4DFmode == mode ||
> + V6DFmode == mode ||
> + V8DFmode == mode);
Why not use:
mode == DFmode || (VECTOR_MODE_P (mode) && GET_MODE_INNER (mode) == DFmode);
At least it becomes more obvious what is done here.
Or just:
mode == DFmode || mode == V2DFmode || mode == V4DFmode
Also it might make sense to have an assert on the modes we accept at
the beginning of the function.
> +
> + int iterations = 2;
> + if (double_mode)
> + iterations = 3;
> +
> + for (int i = 0; i < iterations; ++i)
> + {
> + rtx x1 = gen_reg_rtx (mode);
> + rtx x2 = gen_reg_rtx (mode);
> + rtx x3 = gen_reg_rtx (mode);
> + emit_insn (gen_rtx_SET (x2,
> + gen_rtx_MULT (mode, x0, x0)));
> + emit_insn (gen_rtx_SET (x3,
> + gen_rtx_UNSPEC (mode, gen_rtvec (2, xsrc, x2),
> + UNSPEC_RSQRTS)));
> + emit_insn (gen_rtx_SET (x1,
> + gen_rtx_MULT (mode, x0, x3)));
More of emit_set_insn usage.
> + x0 = x1;
> + }
> +
> + emit_move_insn (dst, x0);
> + return;
> +}
> +
> /* Return the number of instructions that can be issued per cycle. */
> static int
> aarch64_sched_issue_rate (void)
> @@ -11747,6 +11807,9 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
> #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
> #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
>
> +#undef TARGET_BUILTIN_RECIPROCAL
> +#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
> +
> #undef TARGET_VECTOR_MODE_SUPPORTED_P
> #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 11123d6..7272d05 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -120,6 +120,9 @@
> UNSPEC_VSTRUCTDUMMY
> UNSPEC_SP_SET
> UNSPEC_SP_TEST
> + UNSPEC_RSQRT
> + UNSPEC_RSQRTE
> + UNSPEC_RSQRTS
> ])
>
> (define_c_enum "unspecv" [
> diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt.c b/gcc/testsuite/gcc.target/aarch64/rsqrt.c
> new file mode 100644
> index 0000000..6607483
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/rsqrt.c
> @@ -0,0 +1,113 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -fno-inline --save-temps -ffast-math" } */
Maybe instead of -fno-inline, you can add some volatile to some
variables. We really should have separate tests for the assembly scan
vs the runtime tests too.
> +
> +#include <math.h>
> +#include <stdio.h>
> +
> +#include <values.h>
Is values.h a standard header file. I suspect you should be using
float.h instead which is the standard header file.
> +
> +#define PI 3.141592653589793
> +#define SQRT2 1.4142135623730951
> +
> +#define PI_4 0.7853981633974483
> +#define SQRT1_2 0.7071067811865475
> +
> +/* 2^25+1, float has 24 significand bits
> + * according to Single-precision floating-point format. */
> +#define TESTA8_FLT 33554433
> +/* 2^54+1, double has 53 significand bits
> + * according to Double-precision floating-point format. */
> +#define TESTA8_DBL 18014398509481985
> +
> +#define SD(a, b) t_double ((#a), (a), (b));
> +#define SF(a, b) t_float ((#a), (a), (b));
> +
> +#define EPSILON_double __DBL_EPSILON__
> +#define EPSILON_float __FLT_EPSILON__
> +#define ABS_double fabs
> +#define ABS_float fabsf
> +#define SQRT_double sqrt
> +#define SQRT_float sqrtf
I suspect you should have those using the builtin versions of the
above functions instead of the named ones.
Thanks,
Andrew Pinski
> +
> +extern void abort (void);
> +
> +#define TESTTYPE(TYPE) \
> +TYPE rsqrt_##TYPE (TYPE a) \
> +{ \
> + return 1.0/SQRT_##TYPE(a); \
> +} \
> + \
> +int equals_##TYPE (TYPE a, TYPE b) \
> +{ \
> + return (a == b || \
> + (isnan (a) && isnan (b)) || \
> + (ABS_##TYPE (a - b) < EPSILON_##TYPE)); \
> +} \
> + \
> +void t_##TYPE (const char *s, TYPE a, TYPE result) \
> +{ \
> + TYPE r = rsqrt_##TYPE (a); \
> + if (!equals_##TYPE (r, result)) \
> + { \
> + abort (); \
> + } \
> +} \
> +
> +// printf ("Problem in %20s: %30.18A should be %30.18A\n", s, r, result); \
> +
> +TESTTYPE(double)
> +TESTTYPE(float)
> +
> +/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, d\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 3 } } */
> +
> +/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, s\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, s\[0-9\]+, s\[0-9\]+" 2 } } */
> +
> +int main ()
> +{
> + SD( 1.0/256, 0X1.00000000000000P+4 );
> + SD( 1.0, 0X1.00000000000000P+0 );
> + SD( -1.0, NAN);
> + SD( 11.0, 0X1.34BF63D1568260P-2 );
> + SD( 0.0, INFINITY);
> + SD( INFINITY, 0X0.00000000000000P+0 );
> + SD( NAN, NAN);
> + SD( -NAN, -NAN);
> + SD( DBL_MAX, 0X1.00000000000010P-512);
> + SD( DBL_MIN, 0X1.00000000000000P+511);
> + SD( PI, 0X1.20DD750429B6D0P-1 );
> + SD( PI_4, 0X1.20DD750429B6D0P+0 );
> + SD( SQRT2, 0X1.AE89F995AD3AE0P-1 );
> + SD( SQRT1_2, 0X1.306FE0A31B7150P+0 );
> + SD( -PI, NAN);
> + SD( -SQRT2, NAN);
> + SD( TESTA8_DBL, 0X1.00000000000000P-27 );
> +
> + SF( 1.0/256, 0X1.00000000000000P+4 );
> + SF( 1.0, 0X1.00000000000000P+0 );
> + SF( -1.0, NAN);
> + SF( 11.0, 0X1.34BF6400000000P-2 );
> + SF( 0.0, INFINITY);
> + SF( INFINITY, 0X0.00000000000000P+0 );
> + SF( NAN, NAN);
> + SF( -NAN, -NAN);
> + SF( FLT_MAX, 0X1.00000200000000P-64 );
> + SF( FLT_MIN, 0X1.00000000000000P+63 );
> + SF( PI, 0X1.20DD7400000000P-1 );
> + SF( PI_4, 0X1.20DD7400000000P+0 );
> + SF( SQRT2, 0X1.AE89FA00000000P-1 );
> + SF( SQRT1_2, 0X1.306FE000000000P+0 );
> + SF( -PI, NAN);
> + SF( -SQRT2, NAN);
> + SF( TESTA8_FLT, 0X1.6A09E600000000P-13 );
> +
> +// With -ffast-math these return positive INF.
> +// SD( -0.0, -INFINITY);
> +// SF( -0.0, -INFINITY);
> +// The reason here is that -ffast-math flushes to zero.
> +// SD(DBL_MIN/256, 0X1.00000000000000P+515);
> +// SF(FLT_MIN/256, 0X1.00000000000000P+67 );
> +
> + return 0;
> +}
> --
> 1.9.1
>
More information about the Gcc-patches
mailing list