This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[aarch64][PATCH v2] Disable reg offset in quad-word store for Falkor


Hi,

Here's v2 of the patch to disable register offset addressing mode for
stores of 128-bit values on Falkor because they're very costly.
Differences from the last version:

 - Incorporated changes Jim made to his patch earlier that I missed,
   i.e. adding an extra tuning parameter called
   SLOW_REGOFFSET_QUADWORD_STORE instead of making it conditional on
   TUNE_FALKOR.

 - Added a new query type ADDR_QUERY_STR to indicate the queried
   address is used for a store.  This way I can use it for other
   scenarios where stores are significantly more expensive than loads,
   such as pre/post modify addressing modes.

 - Incorporated the constraint functionality into
   aarch64_legitimate_address_p and aarch64_classify_address.

I evaluated the suggestion of using costs to do this but it's not
possible with the current costs as they do not differentiate between
loads and stores.  If modifying all passes that use these costs to
identify loads vs stores is considered OK (ivopts seems to be the
biggest user) then I can volunteer to do that work for gcc9 and
evetually replace this.

----

On Falkor, because of an idiosyncracy of how the pipelines are designed, a
quad-word store using a reg+reg addressing mode is almost twice as slow as an
add followed by a quad-word store with a single reg addressing mode.  So we
get better performance if we disallow addressing modes using register offsets
with quad-word stores.

This patch improves fpspeed by 0.3% and intspeed by 0.22% in CPU2017,
with omnetpp_s (4.3%) and pop2_s (2.6%) being the biggest winners.

2018-xx-xx  Jim Wilson  <jim.wilson@linaro.org>
	    Kugan Vivenakandarajah  <kugan.vivekanandarajah@linaro.org>
	    Siddhesh Poyarekar  <siddhesh@sourceware.org>

	gcc/
	* gcc/config/aarch64/aarch64-protos.h (aarch64_addr_query_type):
	New member ADDR_QUERY_STR.
	* gcc/config/aarch64/aarch64-tuning-flags.def
	(SLOW_REGOFFSET_QUADWORD_STORE): New.
	* gcc/config/aarch64/aarch64.c (qdf24xx_tunings): Add
	SLOW_REGOFFSET_QUADWORD_STORE to tuning flags.
	(aarch64_classify_address): Avoid register indexing for quad
	mode stores when SLOW_REGOFFSET_QUADWORD_STORE is set.
	* gcc/config/aarch64/constraints.md (Uts): New constraint.
	* gcc/config/aarch64/aarch64.md (movti_aarch64, movtf_aarch64):
	Use it.
	* gcc/config/aarch64/aarch64-simd.md (aarch64_simd_mov<mode>):
	Likewise.

	gcc/testsuite/
	* gcc/testsuite/gcc.target/aarch64/pr82533.c: New test case.
---
 gcc/config/aarch64/aarch64-protos.h         |  4 ++++
 gcc/config/aarch64/aarch64-simd.md          |  4 ++--
 gcc/config/aarch64/aarch64-tuning-flags.def |  4 ++++
 gcc/config/aarch64/aarch64.c                | 12 +++++++++++-
 gcc/config/aarch64/aarch64.md               |  6 +++---
 gcc/config/aarch64/constraints.md           |  7 +++++++
 gcc/testsuite/gcc.target/aarch64/pr82533.c  | 11 +++++++++++
 7 files changed, 42 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr82533.c

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index ef1b0bc8e28..5fedc85f283 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -120,6 +120,9 @@ enum aarch64_symbol_type
    ADDR_QUERY_LDP_STP
       Query what is valid for a load/store pair.
 
+   ADDR_QUERY_STR
+      Query what is valid for a store.
+
    ADDR_QUERY_ANY
       Query what is valid for at least one memory constraint, which may
       allow things that "m" doesn't.  For example, the SVE LDR and STR
@@ -128,6 +131,7 @@ enum aarch64_symbol_type
 enum aarch64_addr_query_type {
   ADDR_QUERY_M,
   ADDR_QUERY_LDP_STP,
+  ADDR_QUERY_STR,
   ADDR_QUERY_ANY
 };
 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 3d1f6a01cb7..48d92702723 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -131,9 +131,9 @@
 
 (define_insn "*aarch64_simd_mov<VQ:mode>"
   [(set (match_operand:VQ 0 "nonimmediate_operand"
-		"=w, Umq,  m,  w, ?r, ?w, ?r, w")
+		"=w, Umq, Uts,  w, ?r, ?w, ?r, w")
 	(match_operand:VQ 1 "general_operand"
-		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
+		"m,  Dz,    w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
        || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))"
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index ea9ead234cb..04baf5b6de6 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -41,4 +41,8 @@ AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW)
    are not considered cheap.  */
 AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND)
 
+/* Don't use a register offset in a memory address for a quad-word store.  */
+AARCH64_EXTRA_TUNING_OPTION ("slow_regoffset_quadword_store",
+			     SLOW_REGOFFSET_QUADWORD_STORE)
+
 #undef AARCH64_EXTRA_TUNING_OPTION
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 174310c9f1b..6aeeca8673c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -875,7 +875,7 @@ static const struct tune_params qdf24xx_tunings =
   2,	/* min_div_recip_mul_df.  */
   0,	/* max_case_values.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE),	/* tune_flags.  */
   &qdf24xx_prefetch_tune
 };
 
@@ -5530,6 +5530,16 @@ aarch64_classify_address (struct aarch64_address_info *info,
 				|| vec_flags == VEC_ADVSIMD
 				|| vec_flags == VEC_SVE_DATA));
 
+  /* Avoid register indexing for 128-bit stores when the
+     AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE option is set.  */
+  if (!optimize_size
+      && type == ADDR_QUERY_STR
+      && (aarch64_tune_params.extra_tuning_flags
+	  & AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE)
+      && (mode == TImode || mode == TFmode
+	  || aarch64_vector_data_mode_p (mode)))
+    allow_reg_index_p = false;
+
   /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
      [Rn, #offset, MUL VL].  */
   if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index a6ecb391309..0c29e707676 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1079,7 +1079,7 @@
 
 (define_insn "*movti_aarch64"
   [(set (match_operand:TI 0
-	 "nonimmediate_operand"  "=   r,w, r,w,r,m,m,w,m")
+	 "nonimmediate_operand"  "=   r,w, r,w,r,m,m,w,Uts")
 	(match_operand:TI 1
 	 "aarch64_movti_operand" " rUti,r, w,w,m,r,Z,m,w"))]
   "(register_operand (operands[0], TImode)
@@ -1226,9 +1226,9 @@
 
 (define_insn "*movtf_aarch64"
   [(set (match_operand:TF 0
-	 "nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,m,?r,m ,m")
+	 "nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,Uts,?r,m ,m")
 	(match_operand:TF 1
-	 "general_operand"      " w,?r, ?r,w ,Y,Y ,m,w,m ,?r,Y"))]
+	 "general_operand"      " w,?r, ?r,w ,Y,Y ,m,w  ,m ,?r,Y"))]
   "TARGET_FLOAT && (register_operand (operands[0], TFmode)
     || aarch64_reg_or_fp_zero (operands[1], TFmode))"
   "@
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 3eb07f11894..2ecab8f1286 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -249,6 +249,13 @@
        (match_test "aarch64_legitimate_address_p (V2DImode,
 						  XEXP (op, 0), 1)")))
 
+(define_memory_constraint "Uts"
+  "@internal
+   An address valid for storing a 128-it AdvSIMD register"
+  (and (match_code "mem")
+       (match_test "aarch64_legitimate_address_p (GET_MODE (op), XEXP (op, 0),
+						  1, ADDR_QUERY_STR)")))
+
 (define_memory_constraint "Uty"
   "@internal
    An address valid for SVE LD1Rs."
diff --git a/gcc/testsuite/gcc.target/aarch64/pr82533.c b/gcc/testsuite/gcc.target/aarch64/pr82533.c
new file mode 100644
index 00000000000..fa28ffac03a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr82533.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-mcpu=falkor -O2 -ftree-vectorize" } */
+
+void
+copy (int N, double *c, double *a)
+{
+  for (int i = 0; i < N; ++i)
+    c[i] = a[i];
+}
+
+/* { dg-final { scan-assembler-not "str\tq\[0-9\]+, \\\[x\[0-9\]+, x\[0-9\]+\\\]" } } */
-- 
2.14.3


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]