This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH, AArch64] Disable reg offset in quad-word store for Falkor.


On Falkor, because of an idiosyncracy of how the pipelines are designed, a
quad-word store using a reg+reg addressing mode is almost twice as slow as an
add followed by a quad-word store with a single reg addressing mode.  So we
get better performance if we disallow addressing modes using register offsets
with quad-word stores.

Using lmbench compiled with -O2 -ftree-vectorize as my benchmark, I see a 13%
performance increase on stream copy using this patch, and a 16% performance
increase on stream scale using this patch.  I also see a small performance
increase on SPEC CPU2006 of around 0.2% for int and 0.4% for FP at -O3.

	gcc/
	* config/aarch64/aarch64-protos.h (aarch64_movti_target_operand_p):
	New.
	* config/aarch64/aarch64-simd.md (aarch64_simd_mov<mode>): Use Utf.
	* config/aarch64/aarch64-tuning-flags.def
	(SLOW_REGOFFSET_QUADWORD_STORE): New.
	* config/aarch64/aarch64.c (qdf24xx_tunings): Add
	SLOW_REGOFFSET_QUADWORD_STORE to tuning flags.
	(aarch64_movti_target_operand_p): New.
	* config/aarch64/aarch64.md (movti_aarch64): Use Utf.
	(movtf_aarch64): Likewise.
	* config/aarch64/constraints.md (Utf): New.
---
 gcc/config/aarch64/aarch64-protos.h         |  1 +
 gcc/config/aarch64/aarch64-simd.md          |  4 ++--
 gcc/config/aarch64/aarch64-tuning-flags.def |  4 ++++
 gcc/config/aarch64/aarch64.c                | 14 +++++++++++++-
 gcc/config/aarch64/aarch64.md               |  6 +++---
 gcc/config/aarch64/constraints.md           |  6 ++++++
 6 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index e67c2ed..2dfd057 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -379,6 +379,7 @@ const char *aarch64_output_move_struct (rtx *operands);
 rtx aarch64_return_addr (int, rtx);
 rtx aarch64_simd_gen_const_vector_dup (machine_mode, HOST_WIDE_INT);
 bool aarch64_simd_mem_operand_p (rtx);
+bool aarch64_movti_target_operand_p (rtx);
 rtx aarch64_simd_vect_par_cnst_half (machine_mode, bool);
 rtx aarch64_tls_get_addr (void);
 tree aarch64_fold_builtin (tree, int, tree *, bool);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 70e9339..88bf210 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -133,9 +133,9 @@
 
 (define_insn "*aarch64_simd_mov<mode>"
   [(set (match_operand:VQ 0 "nonimmediate_operand"
-		"=w, Umq,  m,  w, ?r, ?w, ?r, w")
+		"=w, Umq,  Utf,  w, ?r, ?w, ?r, w")
 	(match_operand:VQ 1 "general_operand"
-		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
+		"m,  Dz,   w,    w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
        || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))"
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index f48642c..7d0b104 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -41,4 +41,8 @@ AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW)
    are not considered cheap.  */
 AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND)
 
+/* Don't use a register offset in a memory address for a quad-word store.  */
+AARCH64_EXTRA_TUNING_OPTION ("slow_regoffset_quadword_store",
+			     SLOW_REGOFFSET_QUADWORD_STORE)
+
 #undef AARCH64_EXTRA_TUNING_OPTION
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5e26cb7..d6f1133 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -818,7 +818,7 @@ static const struct tune_params qdf24xx_tunings =
   2,	/* min_div_recip_mul_df.  */
   0,	/* max_case_values.  */
   tune_params::AUTOPREFETCHER_STRONG,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE),	/* tune_flags.  */
   &qdf24xx_prefetch_tune
 };
 
@@ -11821,6 +11821,18 @@ aarch64_simd_mem_operand_p (rtx op)
 			|| REG_P (XEXP (op, 0)));
 }
 
+/* Return TRUE if OP uses an efficient memory address for quad-word target.  */
+bool
+aarch64_movti_target_operand_p (rtx op)
+{
+  if (! optimize_size
+      && (aarch64_tune_params.extra_tuning_flags
+	  & AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE))
+    return MEM_P (op) && ! (GET_CODE (XEXP (op, 0)) == PLUS
+			    && ! CONST_INT_P (XEXP (XEXP (op, 0), 1)));
+  return MEM_P (op);
+}
+
 /* Emit a register copy from operand to operand, taking care not to
    early-clobber source registers in the process.
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index f8cdb06..9c7e356 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1023,7 +1023,7 @@
 
 (define_insn "*movti_aarch64"
   [(set (match_operand:TI 0
-	 "nonimmediate_operand"  "=r, w,r,w,r,m,m,w,m")
+	 "nonimmediate_operand"  "=r, w,r,w,r,m,m,w,Utf")
 	(match_operand:TI 1
 	 "aarch64_movti_operand" " rn,r,w,w,m,r,Z,m,w"))]
   "(register_operand (operands[0], TImode)
@@ -1170,9 +1170,9 @@
 
 (define_insn "*movtf_aarch64"
   [(set (match_operand:TF 0
-	 "nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,m,?r,m ,m")
+	 "nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,Utf,?r,m ,m")
 	(match_operand:TF 1
-	 "general_operand"      " w,?r, ?r,w ,Y,Y ,m,w,m ,?r,Y"))]
+	 "general_operand"      " w,?r, ?r,w ,Y,Y ,m,w  ,m ,?r,Y"))]
   "TARGET_FLOAT && (register_operand (operands[0], TFmode)
     || aarch64_reg_or_fp_zero (operands[1], TFmode))"
   "@
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 3649fb4..b1defb6 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -171,6 +171,12 @@
        (match_test "aarch64_legitimate_address_p (GET_MODE (op), XEXP (op, 0),
 						  PARALLEL, 1)")))
 
+(define_memory_constraint "Utf"
+  "@iternal
+   An efficient memory address for a quad-word target operand."
+  (and (match_code "mem")
+       (match_test "aarch64_movti_target_operand_p (op)")))
+
 (define_memory_constraint "Utv"
   "@internal
    An address valid for loading/storing opaque structure
-- 
2.7.4


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]