[Patch AArch64 4/4] Wire up new target hooks


Hi,

This patch wires up our new target hooks for AArch64. This also means
we can bring back the two failing SRA tests.
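
For the record, the new limits work out as follows: AARCH64_MOVE_RATIO
(speed) expands to ((speed) ? 15 : AARCH64_CALL_RATIO) / 2, i.e. 7
instructions when optimizing for speed and 4 when optimizing for size,
so aarch64_max_scalarization_size permits structures of up to
7 * 16 = 112 bytes (speed) or 4 * 16 = 64 bytes (size) to be totally
scalarized.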

Bootstrapped on AArch64 with no issues.

OK for trunk?

Thanks,
James

---
gcc/

2014-09-25  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/aarch64.c
	(aarch64_estimate_movmem_ninsns): New function.
	(aarch64_expand_movmem): Use it and AARCH64_MOVE_RATIO.
	(aarch64_move_by_pieces_profitable_p): New function.
	(aarch64_estimate_block_copy_ninsns): Likewise.
	(aarch64_max_scalarization_size): Likewise.
	(TARGET_ESTIMATE_BLOCK_COPY_NINSNS): Define.
	(TARGET_MAX_SCALARIZATION_SIZE): Likewise.
	(TARGET_MOVE_BY_PIECES_PROFITABLE_P): Likewise.
	* config/aarch64/aarch64.h (AARCH64_MOVE_RATIO): New macro.
	(MOVE_RATIO): Delete.

gcc/testsuite/

2014-09-25  James Greenhalgh  <james.greenhalgh@arm.com>

	* gcc.dg/tree-ssa/pr42585.c: Bring back for AArch64.
	* gcc.dg/tree-ssa/sra-12.c: Likewise.
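
As a standalone sanity check (not part of the patch; estimate_ninsns
and the driver below are illustrative only), this sketch mirrors the
chunking logic of aarch64_estimate_movmem_ninsns so the estimates can
be tried outside the compiler:

#include <stdio.h>

/* Mirror of the chunking logic in aarch64_estimate_movmem_ninsns.  */
static unsigned int
estimate_ninsns (long n)
{
  unsigned int chunks = 0;

  /* 3 bytes is a 2-byte then a 1-byte copy.  */
  if (n == 3)
    return 2;

  /* 5, 6, 7 bytes need an extra copy.  */
  if (n > 4 && n < 8)
    chunks++;

  chunks += n / 16;	/* 16-byte chunks first...  */
  n %= 16;
  chunks += n / 8;	/* ...then an 8-byte chunk...  */
  n %= 8;
  if (n != 0)		/* ...and one move for any remainder.  */
    chunks++;

  return chunks;
}

int
main (void)
{
  static const long sizes[] = { 3, 7, 15, 16, 24, 33, 64 };
  for (unsigned int i = 0; i < sizeof sizes / sizeof sizes[0]; i++)
    printf ("%ld bytes -> %u\n", sizes[i], estimate_ninsns (sizes[i]));
  return 0;
}

For example, this prints 2 for a 24-byte copy (one 16-byte chunk plus
one 8-byte chunk) and 3 for a 33-byte copy (two 16-byte chunks plus a
single trailing byte).
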
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 3483081..d8b5a4a 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -9616,6 +9616,34 @@ aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
   return false;
 }
 
+static unsigned int
+aarch64_estimate_movmem_ninsns (HOST_WIDE_INT size)
+{
+  unsigned int chunks = 0;
+  HOST_WIDE_INT n = size;
+
+  /* 3 bytes is a 2-byte then a 1-byte copy.  */
+  if (n == 3)
+    return 2;
+
+  /* 5, 6, 7 bytes need an extra copy.  */
+  if (n > 4 && n < 8)
+    chunks++;
+
+  /* If n is greater than 8, it is dealt with in 16-byte, then 8-byte
+     chunks first.  */
+  chunks += n / 16;
+  n %= 16;
+  chunks += n / 8;
+  n %= 8;
+
+  /* Anything left is dealt with in one instruction.  */
+  if (n != 0)
+    chunks++;
+
+  return chunks;
+}
+
 /* Return a new RTX holding the result of moving POINTER forward by
    AMOUNT bytes.  */
 
@@ -9673,7 +9701,7 @@ aarch64_expand_movmem (rtx *operands)
 
   /* When optimizing for size, give a better estimate of the length of a
      memcpy call, but use the default otherwise.  */
-  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
+  unsigned int max_instructions = AARCH64_MOVE_RATIO (speed_p);
 
   /* We can't do anything smart if the amount to copy is not constant.  */
   if (!CONST_INT_P (operands[2]))
@@ -9681,10 +9709,9 @@ aarch64_expand_movmem (rtx *operands)
 
   n = UINTVAL (operands[2]);
 
-  /* Try to keep the number of instructions low.  For cases below 16 bytes we
-     need to make at most two moves.  For cases above 16 bytes it will be one
-     move for each 16 byte chunk, then at most two additional moves.  */
-  if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
+  /* Try to keep the number of instructions we emit low; if we cannot,
+     fail the expansion and leave the copy to memcpy.  */
+  if (aarch64_estimate_movmem_ninsns (n) > max_instructions)
     return false;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
@@ -9774,6 +9801,57 @@ aarch64_expand_movmem (rtx *operands)
   return true;
 }
 
+/* Implement TARGET_MOVE_BY_PIECES_PROFITABLE_P.  */
+
+static bool
+aarch64_move_by_pieces_profitable_p (unsigned int size,
+				     unsigned int align,
+				     bool speed_p)
+{
+  /* For strict alignment we don't want to use our unaligned
+     movmem implementation.  */
+  if (STRICT_ALIGNMENT)
+    return (move_by_pieces_ninsns (size, align, MOVE_MAX_PIECES + 1)
+	    < AARCH64_MOVE_RATIO (speed_p));
+
+  /* If we have an overhang of 3, 6 or 7 bytes, we would emit an unaligned
+     load to cover it.  If that is likely to be slow, we do better going
+     through move_by_pieces.  */
+  if (size % 8 > 5)
+    return SLOW_UNALIGNED_ACCESS (DImode, 1);
+  else if (size % 4 == 3)
+    return SLOW_UNALIGNED_ACCESS (SImode, 1);
+
+  /* We can likely do a better job than the move_by_pieces infrastructure
+     can.  */
+  return false;
+}
+
+/* Implement TARGET_ESTIMATE_BLOCK_COPY_NINSNS.  */
+
+static unsigned int
+aarch64_estimate_block_copy_ninsns (HOST_WIDE_INT size, bool speed_p)
+{
+  if (aarch64_move_by_pieces_profitable_p (size, 8, speed_p))
+    return move_by_pieces_ninsns (size, 8, MOVE_MAX_PIECES + 1);
+  else if (aarch64_estimate_movmem_ninsns (size)
+	   < AARCH64_MOVE_RATIO (speed_p))
+    return aarch64_estimate_movmem_ninsns (size);
+  else
+    /* memcpy.  Set up 3 arguments and make a call.  */
+    return 4;
+}
+
+/* Implement TARGET_MAX_SCALARIZATION_SIZE.  */
+
+static unsigned int
+aarch64_max_scalarization_size (bool speed_p)
+{
+  /* The maximum number of instructions we are willing to use, multiplied
+     by the maximum size we can move in one instruction (an LDP/STP).  */
+  return AARCH64_MOVE_RATIO (speed_p) * 16;
+}
+
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST aarch64_address_cost
 
@@ -9843,6 +9921,10 @@ aarch64_expand_movmem (rtx *operands)
 #undef TARGET_BUILTIN_DECL
 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
 
+#undef TARGET_ESTIMATE_BLOCK_COPY_NINSNS
+#define TARGET_ESTIMATE_BLOCK_COPY_NINSNS \
+  aarch64_estimate_block_copy_ninsns
+
 #undef  TARGET_EXPAND_BUILTIN
 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
 
@@ -9897,9 +9979,17 @@ aarch64_expand_movmem (rtx *operands)
 #undef TARGET_MANGLE_TYPE
 #define TARGET_MANGLE_TYPE aarch64_mangle_type
 
+#undef TARGET_MAX_SCALARIZATION_SIZE
+#define TARGET_MAX_SCALARIZATION_SIZE \
+  aarch64_max_scalarization_size
+
 #undef TARGET_MEMORY_MOVE_COST
 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
 
+#undef TARGET_MOVE_BY_PIECES_PROFITABLE_P
+#define TARGET_MOVE_BY_PIECES_PROFITABLE_P \
+  aarch64_move_by_pieces_profitable_p
+
 #undef TARGET_MUST_PASS_IN_STACK
 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
 
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index db950da..5c8d37d 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -678,17 +678,10 @@ do {									     \
 /* Maximum bytes moved by a single instruction (load/store pair).  */
 #define MOVE_MAX (UNITS_PER_WORD * 2)
 
-/* The base cost overhead of a memcpy call, for MOVE_RATIO and friends.  */
+/* The base cost overhead of a memcpy call, for CLEAR_RATIO and friends.  */
 #define AARCH64_CALL_RATIO 8
 
-/* MOVE_RATIO dictates when we will use the move_by_pieces infrastructure.
-   move_by_pieces will continually copy the largest safe chunks.  So a
-   7-byte copy is a 4-byte + 2-byte + byte copy.  This proves inefficient
-   for both size and speed of copy, so we will instead use the "movmem"
-   standard name to implement the copy.  This logic does not apply when
-   targeting -mstrict-align, so keep a sensible default in that case.  */
-#define MOVE_RATIO(speed) \
-  (!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
+#define AARCH64_MOVE_RATIO(speed) (((speed) ? 15 : AARCH64_CALL_RATIO) / 2)
 
 /* For CLEAR_RATIO, when optimizing for size, give a better estimate
    of the length of a memset call, but use the default otherwise.  */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
index 07f575d..a970c85 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
@@ -35,6 +35,6 @@ Cyc_string_ungetc (int ignore, struct _fat_ptr *sptr)
 /* Whether the structs are totally scalarized or not depends on the
    MOVE_RATIO macro definition in the back end.  The scalarization will
    not take place when using small values for MOVE_RATIO.  */
-/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
-/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
+/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
+/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
 /* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
index 45aa963..59e5e6a 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
@@ -21,5 +21,5 @@ int foo (struct S *p)
   *p = l;
 }
 
-/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "aarch64*-*-* avr*-*-* nds32*-*-*" } } } } */
+/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "avr*-*-* nds32*-*-*" } } } } */
 /* { dg-final { cleanup-tree-dump "release_ssa" } } */
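
(For illustration, not part of the patch: under the new profitability
test a 14-byte copy has a 6-byte overhang (14 % 8 == 6), so
move_by_pieces is chosen only when SLOW_UNALIGNED_ACCESS reports that
the unaligned DImode access covering the overhang would be slow; an
11-byte copy (11 % 4 == 3) likewise depends on the SImode check; a
24-byte copy falls through both tests and is left to the movmem
expansion.)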
