This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH][AArch64] Improve Cortex-A53 FP scheduler
- From: Wilco Dijkstra <Wilco dot Dijkstra at arm dot com>
- To: GCC Patches <gcc-patches at gcc dot gnu dot org>
- Cc: nd <nd at arm dot com>, James Greenhalgh <James dot Greenhalgh at arm dot com>
- Date: Mon, 12 Jun 2017 13:16:44 +0000
- Subject: [PATCH][AArch64] Improve Cortex-A53 FP scheduler
- Authentication-results: sourceware.org; auth=none
- Authentication-results: arm.com; dkim=none (message not signed) header.d=none;arm.com; dmarc=none action=none header.from=arm.com;
- Nodisclaimer: True
- Spamdiagnosticmetadata: NSPM
- Spamdiagnosticoutput: 1:99
The Cortex-A53 scheduler model of FMAC bypass is not quite right
for FMAC to FMAC forwarding. Experiments also show the latencies of
FP operations are too high as well. Rather than adding more bypasses,
adjust the latencies of FP instructions to get a better schedule on
average. As a result SPECFP2006 is 1.1% faster.
Passes AArch64 and ARM bootstrap and regress.
ChangeLog:
2017-05-30 Wilco Dijkstra <wdijkstr@arm.com>
* config/arm/cortex-a53.md (cortex_a53_fpalu) Adjust latency.
(cortex_a53_fconst): Likewise.
(cortex_a53_fpmul): Likewise.
(cortex_a53_f_load_64): Likewise.
(cortex_a53_f_load_many): Likewise.
(cortex_a53_advsimd_alu): Likewise.
(cortex_a53_advsimd_alu_q): Likewise.
(cortex_a53_advsimd_mul): Likewise.
(cortex_a53_advsimd_mul_q): Likewise.
(fpmac bypass): Add new bypass for fpmac-fpmac case.
Add missing fmul, r2f_cvt and fconst cases.
--
diff --git a/gcc/config/arm/cortex-a53.md b/gcc/config/arm/cortex-a53.md
index 403e84b5d4c3993c06ee879d147f9c5eb6dd3b9a..49f80d3130f97777260e9d3c6d4f37ef3db20c94 100644
--- a/gcc/config/arm/cortex-a53.md
+++ b/gcc/config/arm/cortex-a53.md
@@ -503,19 +503,19 @@ (define_cpu_unit "cortex_a53_crypto"
;; Floating-point arithmetic.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-(define_insn_reservation "cortex_a53_fpalu" 5
+(define_insn_reservation "cortex_a53_fpalu" 4
(and (eq_attr "tune" "cortexa53")
(eq_attr "type" "ffariths, fadds, ffarithd, faddd, fmov,
f_cvt, fcmps, fcmpd, fccmps, fccmpd, fcsel,
f_rints, f_rintd, f_minmaxs, f_minmaxd"))
"cortex_a53_slot_any,cortex_a53_fp_alu")
-(define_insn_reservation "cortex_a53_fconst" 3
+(define_insn_reservation "cortex_a53_fconst" 2
(and (eq_attr "tune" "cortexa53")
(eq_attr "type" "fconsts,fconstd"))
"cortex_a53_slot_any,cortex_a53_fp_alu")
-(define_insn_reservation "cortex_a53_fpmul" 5
+(define_insn_reservation "cortex_a53_fpmul" 4
(and (eq_attr "tune" "cortexa53")
(eq_attr "type" "fmuls,fmuld"))
"cortex_a53_slot_any,cortex_a53_fp_mul")
@@ -566,14 +566,14 @@ (define_insn_reservation "cortex_a53_f_flags" 5
;; Floating-point load/store.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-(define_insn_reservation "cortex_a53_f_load_64" 4
+(define_insn_reservation "cortex_a53_f_load_64" 3
(and (eq_attr "tune" "cortexa53")
(ior (eq_attr "type" "f_loads,f_loadd")
(eq_attr "cortex_a53_advsimd_type"
"advsimd_load_64")))
"cortex_a53_slot_any+cortex_a53_ls_agen")
-(define_insn_reservation "cortex_a53_f_load_many" 5
+(define_insn_reservation "cortex_a53_f_load_many" 4
(and (eq_attr "tune" "cortexa53")
(eq_attr "cortex_a53_advsimd_type"
"advsimd_load_128,advsimd_load_lots"))
@@ -604,22 +604,22 @@ (define_insn_reservation "cortex_a53_f_store_many" 0
;; or a 128-bit operation in which case we require in our model that we
;; issue from slot 0.
-(define_insn_reservation "cortex_a53_advsimd_alu" 5
+(define_insn_reservation "cortex_a53_advsimd_alu" 4
(and (eq_attr "tune" "cortexa53")
(eq_attr "cortex_a53_advsimd_type" "advsimd_alu"))
"cortex_a53_slot_any,cortex_a53_fp_alu")
-(define_insn_reservation "cortex_a53_advsimd_alu_q" 5
+(define_insn_reservation "cortex_a53_advsimd_alu_q" 4
(and (eq_attr "tune" "cortexa53")
(eq_attr "cortex_a53_advsimd_type" "advsimd_alu_q"))
"cortex_a53_slot0,cortex_a53_fp_alu_q")
-(define_insn_reservation "cortex_a53_advsimd_mul" 5
+(define_insn_reservation "cortex_a53_advsimd_mul" 4
(and (eq_attr "tune" "cortexa53")
(eq_attr "cortex_a53_advsimd_type" "advsimd_mul"))
"cortex_a53_slot_any,cortex_a53_fp_mul")
-(define_insn_reservation "cortex_a53_advsimd_mul_q" 5
+(define_insn_reservation "cortex_a53_advsimd_mul_q" 4
(and (eq_attr "tune" "cortexa53")
(eq_attr "cortex_a53_advsimd_type" "advsimd_mul_q"))
"cortex_a53_slot0,cortex_a53_fp_mul_q")
@@ -698,20 +698,18 @@ (define_insn_reservation "cortex_a53_crypto_sha_slow" 5
;; multiply-accumulate operations as a bypass reducing the latency
;; of producing instructions to near zero.
-(define_bypass 1 "cortex_a53_fp*,
+(define_bypass 1 "cortex_a53_fpalu,
+ cortex_a53_fpmul,
cortex_a53_r2f,
+ cortex_a53_r2f_cvt,
+ cortex_a53_fconst,
cortex_a53_f_load*"
"cortex_a53_fpmac"
"aarch_accumulator_forwarding")
-;; Model a bypass from the result of an FP operation to a use.
-
-(define_bypass 4 "cortex_a53_fpalu,
- cortex_a53_fpmul"
- "cortex_a53_fpalu,
- cortex_a53_fpmul,
- cortex_a53_fpmac,
- cortex_a53_advsimd_div*")
+(define_bypass 4 "cortex_a53_fpmac"
+ "cortex_a53_fpmac"
+ "aarch_accumulator_forwarding")
;; We want AESE and AESMC to end up consecutive to one another.