This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH 3/3][AArch64] Emit division using the Newton series
- From: Evandro Menezes <e dot menezes at samsung dot com>
- To: gcc-patches at gcc dot gnu dot org, Wilco Dijkstra <Wilco dot Dijkstra at arm dot com>, Andrew Pinski <pinskia at gmail dot com>, James Greenhalgh <james dot greenhalgh at arm dot com>, "philipp dot tomsich at theobroma-systems dot com" <philipp dot tomsich at theobroma-systems dot com>, Benedikt Huber <benedikt dot huber at theobroma-systems dot com>, nd <nd at arm dot com>
- Date: Fri, 03 Jun 2016 16:50:24 -0500
- Subject: Re: [PATCH 3/3][AArch64] Emit division using the Newton series
- Authentication-results: sourceware.org; auth=none
- References: <57212C09 dot 1000501 at samsung dot com> <20160525161633 dot GA18183 at arm dot com> <5748D0DA dot 4070301 at samsung dot com> <20160531092754 dot GA17601 at arm dot com> <574DDED1 dot 7040707 at samsung dot com>
Rebasing the patch...
--
Evandro Menezes
>From d791090aae6a29fa94d8fc10894ee1053b05bcc2 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Mon, 4 Apr 2016 14:02:24 -0500
Subject: [PATCH 3/3] [AArch64] Emit division using the Newton series
2016-04-04 Evandro Menezes <e.menezes@samsung.com>
Wilco Dijkstra <Wilco.Dijkstra@arm.com>
gcc/
* config/aarch64/aarch64-protos.h
(cpu_approx_modes): Add new member "division".
(aarch64_emit_approx_div): Declare new function.
* config/aarch64/aarch64.c
(generic_approx_modes): New member "division".
(exynosm1_approx_modes): Likewise.
(xgene1_approx_modes): Likewise.
(aarch64_emit_approx_div): Define new function.
* config/aarch64/aarch64.md ("div<mode>3"): New expansion.
* config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise.
* config/aarch64/aarch64.opt (-mlow-precision-div): Add new option.
* doc/invoke.texi (-mlow-precision-div): Describe new option.
---
gcc/config/aarch64/aarch64-protos.h | 2 +
gcc/config/aarch64/aarch64-simd.md | 14 +++++-
gcc/config/aarch64/aarch64.c | 92 +++++++++++++++++++++++++++++++++++++
gcc/config/aarch64/aarch64.md | 19 ++++++--
gcc/config/aarch64/aarch64.opt | 6 +++
gcc/doc/invoke.texi | 10 ++++
6 files changed, 138 insertions(+), 5 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index eb33118..3e0a0a3 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -192,6 +192,7 @@ struct cpu_branch_cost
/* Allowed modes for approximations. */
struct cpu_approx_modes
{
+ const unsigned int division; /* Division. */
const unsigned int sqrt; /* Square root. */
const unsigned int recip_sqrt; /* Reciprocal square root. */
};
@@ -303,6 +304,7 @@ int aarch64_branch_cost (bool, bool);
enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx);
bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
bool aarch64_constant_address_p (rtx);
+bool aarch64_emit_approx_div (rtx, rtx, rtx);
bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
bool aarch64_expand_movmem (rtx *);
bool aarch64_float_const_zero_rtx_p (rtx);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 2a5c665..a244a27 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1509,7 +1509,19 @@
[(set_attr "type" "neon_fp_mul_<Vetype><q>")]
)
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+ [(set (match_operand:VDQF 0 "register_operand")
+ (div:VDQF (match_operand:VDQF 1 "general_operand")
+ (match_operand:VDQF 2 "register_operand")))]
+ "TARGET_SIMD"
+{
+ if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+ DONE;
+
+ operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
[(set (match_operand:VDQF 0 "register_operand" "=w")
(div:VDQF (match_operand:VDQF 1 "register_operand" "w")
(match_operand:VDQF 2 "register_operand" "w")))]
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ca6035d..7b85a85 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -396,6 +396,7 @@ static const struct cpu_branch_cost cortexa57_branch_cost =
/* Generic approximation modes. */
static const cpu_approx_modes generic_approx_modes =
{
+ AARCH64_APPROX_NONE, /* division */
AARCH64_APPROX_NONE, /* sqrt */
AARCH64_APPROX_NONE /* recip_sqrt */
};
@@ -403,6 +404,7 @@ static const cpu_approx_modes generic_approx_modes =
/* Approximation modes for Exynos M1. */
static const cpu_approx_modes exynosm1_approx_modes =
{
+ AARCH64_APPROX_NONE, /* division */
AARCH64_APPROX_ALL, /* sqrt */
AARCH64_APPROX_ALL /* recip_sqrt */
};
@@ -410,6 +412,7 @@ static const cpu_approx_modes exynosm1_approx_modes =
/* Approximation modes for X-Gene 1. */
static const cpu_approx_modes xgene1_approx_modes =
{
+ AARCH64_APPROX_NONE, /* division */
AARCH64_APPROX_NONE, /* sqrt */
AARCH64_APPROX_ALL /* recip_sqrt */
};
@@ -7487,6 +7490,95 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
return true;
}
+typedef rtx (*recpe_type) (rtx, rtx); /* Two-operand insn generator. */
+
+/* Return the generator of the reciprocal estimate (FRECPE) insn for MODE. */
+
+static recpe_type
+get_recpe_type (machine_mode mode)
+{
+ switch (mode)
+ {
+ case SFmode: return (gen_aarch64_frecpesf);
+ case V2SFmode: return (gen_aarch64_frecpev2sf);
+ case V4SFmode: return (gen_aarch64_frecpev4sf);
+ case DFmode: return (gen_aarch64_frecpedf);
+ case V2DFmode: return (gen_aarch64_frecpev2df);
+ default: gcc_unreachable (); /* No generator for any other mode. */
+ }
+}
+
+typedef rtx (*recps_type) (rtx, rtx, rtx); /* Three-operand insn generator. */
+
+/* Return the generator of the reciprocal step (FRECPS) insn for MODE. */
+
+static recps_type
+get_recps_type (machine_mode mode)
+{
+ switch (mode)
+ {
+ case SFmode: return (gen_aarch64_frecpssf);
+ case V2SFmode: return (gen_aarch64_frecpsv2sf);
+ case V4SFmode: return (gen_aarch64_frecpsv4sf);
+ case DFmode: return (gen_aarch64_frecpsdf);
+ case V2DFmode: return (gen_aarch64_frecpsv2df);
+ default: gcc_unreachable (); /* No generator for any other mode. */
+ }
+}
+
+/* Emit the insn sequence approximating the division of NUM by DEN into QUO.
+ Return true if emitted, or false if the approximation does not apply. */
+
+bool
+aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
+{
+ machine_mode mode = GET_MODE (quo);
+ bool use_approx_division_p = (flag_mlow_precision_div
+ || (aarch64_tune_params.approx_modes->division
+ & AARCH64_APPROX_MODE (mode)));
+
+ if (!flag_finite_math_only
+ || flag_trapping_math
+ || !flag_unsafe_math_optimizations
+ || optimize_function_for_size_p (cfun)
+ || !use_approx_division_p)
+ return false;
+
+ /* Seed XRCP with the initial estimate of the reciprocal of DEN. */
+ rtx xrcp = gen_reg_rtx (mode);
+ emit_insn ((*get_recpe_type (mode)) (xrcp, den));
+
+ /* Iterate over the series twice for SF and thrice for DF. */
+ int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
+
+ /* Optionally iterate over the series once less for faster performance,
+ while sacrificing the accuracy. */
+ if (flag_mlow_precision_div)
+ iterations--;
+
+ /* Each step yields a factor XTMP; fold all but the last one into XRCP. */
+ rtx xtmp = gen_reg_rtx (mode);
+ while (iterations--)
+ {
+ emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
+
+ if (iterations > 0) /* Not the last step. */
+ emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
+ }
+
+ if (num != CONST1_RTX (mode))
+ {
+ /* As the approximate reciprocal of DEN is already calculated, only
+ calculate the approximate division when NUM is not 1.0. */
+ rtx xnum = force_reg (mode, num);
+ emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
+ }
+
+ /* Apply the last step's XTMP, deferred from the loop, to produce QUO. */
+ emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
+ return true;
+}
+
/* Return the number of instructions that can be issued per cycle. */
static int
aarch64_sched_issue_rate (void)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index ba7d606..fbc6225 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4674,11 +4674,22 @@
[(set_attr "type" "fmul<s>")]
)
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+ [(set (match_operand:GPF 0 "register_operand")
+ (div:GPF (match_operand:GPF 1 "general_operand")
+ (match_operand:GPF 2 "register_operand")))]
+ "TARGET_SIMD"
+{
+ if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+ DONE;
+
+ operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
[(set (match_operand:GPF 0 "register_operand" "=w")
- (div:GPF
- (match_operand:GPF 1 "register_operand" "w")
- (match_operand:GPF 2 "register_operand" "w")))]
+ (div:GPF (match_operand:GPF 1 "register_operand" "w")
+ (match_operand:GPF 2 "register_operand" "w")))]
"TARGET_FLOAT"
"fdiv\\t%<s>0, %<s>1, %<s>2"
[(set_attr "type" "fdiv<s>")]
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index 3c4e7ae..bf6b475 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -161,3 +161,9 @@ Enable the square root approximation. Enabling this reduces
precision of square root results to about 16 bits for
single precision and to 32 bits for double precision.
If enabled, it implies -mlow-precision-recip-sqrt.
+
+mlow-precision-div
+Common Var(flag_mlow_precision_div) Optimization
+Enable the division approximation. Enabling this reduces
+precision of division results to about 16 bits for
+single precision and to 32 bits for double precision.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 73a3fb8..4d7bcb7 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -577,6 +577,7 @@ Objective-C and Objective-C++ Dialects}.
-mfix-cortex-a53-843419 -mno-fix-cortex-a53-843419 @gol
-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
-mlow-precision-sqrt -mno-low-precision-sqrt@gol
+-mlow-precision-div -mno-low-precision-div @gol
-march=@var{name} -mcpu=@var{name} -mtune=@var{name}}
@emph{Adapteva Epiphany Options}
@@ -13032,6 +13033,15 @@ precision of square root results to about 16 bits for
single precision and to 32 bits for double precision.
If enabled, it implies @option{-mlow-precision-recip-sqrt}.
+@item -mlow-precision-div
+@itemx -mno-low-precision-div
+@opindex mlow-precision-div
+@opindex mno-low-precision-div
+When calculating the division approximation,
+use one fewer step than otherwise, thus reducing latency and precision.
+This is only relevant if @option{-ffast-math} enables the division
+approximation.
+
@item -march=@var{name}
@opindex march
Specify the name of the target architecture and, optionally, one or
--
2.6.3