From: Richard Sandiford Date: Wed, 19 Feb 2020 17:22:14 +0000 (+0000) Subject: aarch64: Add SVE support for -mlow-precision-sqrt X-Git-Tag: basepoints/gcc-11~1302 X-Git-Url: https://gcc.gnu.org/git/?a=commitdiff_plain;h=a0ee8352df6f4cd98830c8dbaa969e1cda39cc40;p=gcc.git aarch64: Add SVE support for -mlow-precision-sqrt SVE was missing support for -mlow-precision-sqrt, which meant that -march=armv8.2-a+sve -mlow-precision-sqrt could cause a performance regression compared to -march=armv8.2-a -mlow-precision-sqrt. 2020-02-21 Richard Sandiford gcc/ * config/aarch64/aarch64.c (aarch64_emit_approx_sqrt): Add SVE support. Use aarch64_emit_mult instead of emitting multiplication instructions directly. * config/aarch64/aarch64-sve.md (sqrt<mode>2, rsqrt<mode>2) (@aarch64_rsqrte<mode>, @aarch64_rsqrts<mode>): New expanders. gcc/testsuite/ * gcc.target/aarch64/sve/rsqrt_1.c: New test. * gcc.target/aarch64/sve/rsqrt_1_run.c: Likewise. * gcc.target/aarch64/sve/sqrt_1.c: Likewise. * gcc.target/aarch64/sve/sqrt_1_run.c: Likewise. --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 4d161ca2cd1d..6e7de5fa729f 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,11 @@ +2020-02-21 Richard Sandiford + + * config/aarch64/aarch64.c (aarch64_emit_approx_sqrt): Add SVE + support. Use aarch64_emit_mult instead of emitting multiplication + instructions directly. + * config/aarch64/aarch64-sve.md (sqrt<mode>2, rsqrt<mode>2) + (@aarch64_rsqrte<mode>, @aarch64_rsqrts<mode>): New expanders. + 2020-02-21 Richard Sandiford * config/aarch64/aarch64.c (aarch64_emit_mult): New function. 
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index e3b1da89c1ae..a661b257109c 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -76,6 +76,8 @@ ;; ---- [INT] Logical inverse ;; ---- [FP<-INT] General unary arithmetic that maps to unspecs ;; ---- [FP] General unary arithmetic corresponding to unspecs +;; ---- [FP] Square root +;; ---- [FP] Reciprocal square root ;; ---- [PRED] Inverse ;; == Binary arithmetic @@ -3246,7 +3248,7 @@ ;; - FRINTP ;; - FRINTX ;; - FRINTZ -;; - FRSQRT +;; - FRSQRTE ;; - FSQRT ;; ------------------------------------------------------------------------- @@ -3267,7 +3269,7 @@ [(match_dup 2) (const_int SVE_RELAXED_GP) (match_operand:SVE_FULL_F 1 "register_operand")] - SVE_COND_FP_UNARY))] + SVE_COND_FP_UNARY_OPTAB))] "TARGET_SVE" { operands[2] = aarch64_ptrue_reg (mode); @@ -3357,6 +3359,56 @@ [(set_attr "movprfx" "*,yes,yes")] ) +;; ------------------------------------------------------------------------- +;; ---- [FP] Square root +;; ------------------------------------------------------------------------- + +(define_expand "sqrt2" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_dup 2) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_FULL_F 1 "register_operand")] + UNSPEC_COND_FSQRT))] + "TARGET_SVE" +{ + if (aarch64_emit_approx_sqrt (operands[0], operands[1], false)) + DONE; + operands[2] = aarch64_ptrue_reg (mode); +}) + +;; ------------------------------------------------------------------------- +;; ---- [FP] Reciprocal square root +;; ------------------------------------------------------------------------- + +(define_expand "rsqrt2" + [(set (match_operand:SVE_FULL_SDF 0 "register_operand") + (unspec:SVE_FULL_SDF + [(match_operand:SVE_FULL_SDF 1 "register_operand")] + UNSPEC_RSQRT))] + "TARGET_SVE" +{ + aarch64_emit_approx_sqrt (operands[0], operands[1], true); + DONE; +}) + +(define_expand "@aarch64_rsqrte" + [(set 
(match_operand:SVE_FULL_SDF 0 "register_operand") + (unspec:SVE_FULL_SDF + [(match_operand:SVE_FULL_SDF 1 "register_operand")] + UNSPEC_RSQRTE))] + "TARGET_SVE" +) + +(define_expand "@aarch64_rsqrts" + [(set (match_operand:SVE_FULL_SDF 0 "register_operand") + (unspec:SVE_FULL_SDF + [(match_operand:SVE_FULL_SDF 1 "register_operand") + (match_operand:SVE_FULL_SDF 2 "register_operand")] + UNSPEC_RSQRTS))] + "TARGET_SVE" +) + ;; ------------------------------------------------------------------------- ;; ---- [PRED] Inverse ;; ------------------------------------------------------------------------- diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index c1bbc4917c74..703f69a8b427 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -12790,6 +12790,9 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp) /* Caller assumes we cannot fail. */ gcc_assert (use_rsqrt_p (mode)); + rtx pg = NULL_RTX; + if (aarch64_sve_mode_p (mode)) + pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode)); machine_mode mmsk = (VECTOR_MODE_P (mode) ? related_int_vector_mode (mode).require () : int_mode_for_mode (mode).require ()); @@ -12798,11 +12801,21 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp) { /* When calculating the approximate square root, compare the argument with 0.0 and create a mask. */ - xmsk = gen_reg_rtx (mmsk); - emit_insn (gen_rtx_SET (xmsk, - gen_rtx_NEG (mmsk, - gen_rtx_EQ (mmsk, src, - CONST0_RTX (mode))))); + rtx zero = CONST0_RTX (mode); + if (pg) + { + xmsk = gen_reg_rtx (GET_MODE (pg)); + rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode); + emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode, + xmsk, pg, hint, src, zero)); + } + else + { + xmsk = gen_reg_rtx (mmsk); + emit_insn (gen_rtx_SET (xmsk, + gen_rtx_NEG (mmsk, + gen_rtx_EQ (mmsk, src, zero)))); + } } /* Estimate the approximate reciprocal square root. 
*/ @@ -12824,29 +12837,40 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp) while (iterations--) { rtx x2 = gen_reg_rtx (mode); - emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst)); + aarch64_emit_mult (x2, pg, xdst, xdst); emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2)); if (iterations > 0) - emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1)); + aarch64_emit_mult (xdst, pg, xdst, x1); } if (!recp) { - /* Qualify the approximate reciprocal square root when the argument is - 0.0 by squashing the intermediary result to 0.0. */ - rtx xtmp = gen_reg_rtx (mmsk); - emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk), - gen_rtx_SUBREG (mmsk, xdst, 0))); - emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0)); - - /* Calculate the approximate square root. */ - emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src)); + if (pg) + /* Multiply nonzero source values by the corresponding intermediate + result elements, so that the final calculation is the approximate + square root rather than its reciprocal. Select a zero result for + zero source values, to avoid the Inf * 0 -> NaN that we'd get + otherwise. */ + emit_insn (gen_cond (UNSPEC_COND_FMUL, mode, + xdst, xmsk, xdst, src, CONST0_RTX (mode))); + else + { + /* Qualify the approximate reciprocal square root when the + argument is 0.0 by squashing the intermediary result to 0.0. */ + rtx xtmp = gen_reg_rtx (mmsk); + emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk), + gen_rtx_SUBREG (mmsk, xdst, 0))); + emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0)); + + /* Calculate the approximate square root. */ + aarch64_emit_mult (xdst, pg, xdst, src); + } } /* Finalize the approximation. 
*/ - emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1)); + aarch64_emit_mult (dst, pg, xdst, x1); return true; } diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 548ee0f51e87..b106957f537b 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -2277,6 +2277,19 @@ UNSPEC_COND_FRINTZ UNSPEC_COND_FSQRT]) +;; Same as SVE_COND_FP_UNARY, but without codes that have a dedicated +;; <optab><mode>2 expander. +(define_int_iterator SVE_COND_FP_UNARY_OPTAB [UNSPEC_COND_FABS + UNSPEC_COND_FNEG + UNSPEC_COND_FRECPX + UNSPEC_COND_FRINTA + UNSPEC_COND_FRINTI + UNSPEC_COND_FRINTM + UNSPEC_COND_FRINTN + UNSPEC_COND_FRINTP + UNSPEC_COND_FRINTX + UNSPEC_COND_FRINTZ]) + (define_int_iterator SVE_COND_FCVT [UNSPEC_COND_FCVT]) (define_int_iterator SVE_COND_FCVTI [UNSPEC_COND_FCVTZS UNSPEC_COND_FCVTZU]) (define_int_iterator SVE_COND_ICVTF [UNSPEC_COND_SCVTF UNSPEC_COND_UCVTF]) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 936260e4ae79..8518061aa284 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,10 @@ +2020-02-21 Richard Sandiford + + * gcc.target/aarch64/sve/rsqrt_1.c: New test. + * gcc.target/aarch64/sve/rsqrt_1_run.c: Likewise. + * gcc.target/aarch64/sve/sqrt_1.c: Likewise. + * gcc.target/aarch64/sve/sqrt_1_run.c: Likewise. + 2020-02-21 Richard Sandiford * gcc.target/aarch64/sve/recip_1.c: New test. 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/rsqrt_1.c b/gcc/testsuite/gcc.target/aarch64/sve/rsqrt_1.c new file mode 100644 index 000000000000..2dabfd3e67c5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/rsqrt_1.c @@ -0,0 +1,27 @@ +/* { dg-options "-Ofast -mlow-precision-sqrt" } */ + +#define DEF_LOOP(TYPE, FN) \ + void \ + test_##TYPE (TYPE *x, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = (TYPE) 1 / FN (x[i]); \ + } + +#define TEST_ALL(T) \ + T (_Float16, __builtin_sqrtf16) \ + T (float, __builtin_sqrtf) \ + T (double, __builtin_sqrt) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-not {\tfrsqrte\tz[0-9]+\.h} } } */ +/* { dg-final { scan-assembler-not {\tfrsqrts\tz[0-9]+\.h} } } */ + +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s} 2 } } */ +/* { dg-final { scan-assembler-times {\tfrsqrte\tz[0-9]+\.s} 1 } } */ +/* { dg-final { scan-assembler-times {\tfrsqrts\tz[0-9]+\.s} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d} 4 } } */ +/* { dg-final { scan-assembler-times {\tfrsqrte\tz[0-9]+\.d} 1 } } */ +/* { dg-final { scan-assembler-times {\tfrsqrts\tz[0-9]+\.d} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/rsqrt_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/rsqrt_1_run.c new file mode 100644 index 000000000000..73d309a58649 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/rsqrt_1_run.c @@ -0,0 +1,27 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-Ofast -mlow-precision-sqrt" } */ + +#include "rsqrt_1.c" + +#define N 77 + +#define TEST_LOOP(TYPE, FN) \ + { \ + TYPE a[N]; \ + for (int i = 0; i < N; ++i) \ + a[i] = i + 1; \ + test_##TYPE (a, N); \ + for (int i = 0; i < N; ++i) \ + { \ + double diff = a[i] - 1.0 / __builtin_sqrt (i + 1); \ + if (__builtin_fabs (diff) > 0x1.0p-8) \ + __builtin_abort (); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sqrt_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/sqrt_1.c new file mode 100644 index 000000000000..aba2bf6e481e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/sqrt_1.c @@ -0,0 +1,30 @@ +/* { dg-options "-Ofast -mlow-precision-sqrt" } */ + +#define DEF_LOOP(TYPE, FN) \ + void \ + test_##TYPE (TYPE *x, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = FN (x[i]); \ + } + +#define TEST_ALL(T) \ + T (_Float16, __builtin_sqrtf16) \ + T (float, __builtin_sqrtf) \ + T (double, __builtin_sqrt) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler {\tfsqrt\tz[0-9]+\.h} } } */ +/* { dg-final { scan-assembler-not {\tfrsqrte\tz[0-9]+\.h} } } */ +/* { dg-final { scan-assembler-not {\tfrsqrts\tz[0-9]+\.h} } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s} 3 } } */ +/* { dg-final { scan-assembler-times {\tfrsqrte\tz[0-9]+\.s} 1 } } */ +/* { dg-final { scan-assembler-times {\tfrsqrts\tz[0-9]+\.s} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d} 5 } } */ +/* { dg-final { scan-assembler-times {\tfrsqrte\tz[0-9]+\.d} 1 } } */ +/* { dg-final { scan-assembler-times {\tfrsqrts\tz[0-9]+\.d} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sqrt_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/sqrt_1_run.c new file mode 100644 index 000000000000..30906ceb1e27 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/sqrt_1_run.c @@ -0,0 +1,27 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-Ofast -mlow-precision-sqrt" } */ + +#include "sqrt_1.c" + +#define N 77 + +#define TEST_LOOP(TYPE, FN) \ + { \ + TYPE a[N]; \ + for (int i = 0; i < N; ++i) \ + a[i] = i; \ + test_##TYPE (a, N); \ + for (int i = 0; i < N; ++i) \ + { \ + double diff = a[i] - __builtin_sqrt (i); \ + if (__builtin_fabs (diff) > 0x1.0p-8) \ + __builtin_abort (); \ + } \ + } + +int +main (void) 
+{ + TEST_ALL (TEST_LOOP); + return 0; +}