[Patch AArch64 4/4] Wire up New target hooks
James Greenhalgh
james.greenhalgh@arm.com
Fri Sep 26 13:31:00 GMT 2014
On Thu, Sep 25, 2014 at 03:57:36PM +0100, James Greenhalgh wrote:
>
> Hi,
>
> This patch wires up our new target hooks for AArch64. This also means
> we can bring back the two failing SRA tests.
>
> Bootstrapped on AArch64 with no issues.
>
> OK for trunk?
No way! This patch is nonsense as it stands!
I'd like to withdraw this for now while I have a think about what
has gone wrong!
Thanks,
James
>
> Thanks,
> James
>
> ---
> gcc/
>
> 2014-09-25 James Greenhalgh <james.greenhalgh@arm.com>
>
> * config/aarch64/aarch64.c
> (aarch64_estimate_movmem_ninsns): New.
> (aarch64_expand_movmem): Refactor old move costs.
> (aarch64_move_by_pieces_profitable_p): New.
> (aarch64_estimate_block_copy_ninsns): Likewise.
> (aarch64_max_scalarization_size): Likewise.
> (TARGET_MAX_SCALARIZATION_SIZE): Likewise.
> (TARGET_MOVE_BY_PIECES_PROFITABLE_P): Likewise.
> * config/aarch64/aarch64.h (AARCH64_MOVE_RATIO): New.
> (MOVE_RATIO): Delete.
>
> gcc/testsuite/
>
> 2014-09-25 James Greenhalgh <james.greenhalgh@arm.com>
>
> * gcc.dg/tree-ssa/pr42585.c: Bring back for AArch64.
> * gcc.dg/tree-ssa/sra-12.c: Likewise.
>
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 3483081..d8b5a4a 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -9616,6 +9616,34 @@ aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
> return false;
> }
>
> +static unsigned int
> +aarch64_estimate_movmem_ninsns (HOST_WIDE_INT size)
> +{
> + HOST_WIDE_INT chunks = 0;
> + int n = size;
> +
> + /* 3 bytes is a 2-byte then a 1-byte copy. */
> + if (n == 3)
> + return 2;
> +
> + /* 5, 6, 7 bytes need an extra copy. */
> + if (n > 4 && n < 8)
> + chunks++;
> +
> + /* If n was greater than 8, it is dealt with in 8/16-byte chunks
> + first. */
> + chunks += n / 16;
> + n %= 16;
> + chunks += n / 8;
> + n %= 8;
> +
> + /* Anything left is dealt with in one instruction. */
> + if (n != 0)
> + chunks++;
> +
> + return chunks;
> +}
> +
> /* Return a new RTX holding the result of moving POINTER forward by
> AMOUNT bytes. */
>
> @@ -9673,7 +9701,7 @@ aarch64_expand_movmem (rtx *operands)
>
> /* When optimizing for size, give a better estimate of the length of a
> memcpy call, but use the default otherwise. */
> - unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
> + unsigned int max_instructions = AARCH64_MOVE_RATIO (speed_p);
>
> /* We can't do anything smart if the amount to copy is not constant. */
> if (!CONST_INT_P (operands[2]))
> @@ -9681,10 +9709,9 @@ aarch64_expand_movmem (rtx *operands)
>
> n = UINTVAL (operands[2]);
>
> - /* Try to keep the number of instructions low. For cases below 16 bytes we
> - need to make at most two moves. For cases above 16 bytes it will be one
> - move for each 16 byte chunk, then at most two additional moves. */
> - if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
> + /* Try to keep the number of instructions we emit low; if we cannot,
> + fail the expansion and leave the copy to memcpy. */
> + if (aarch64_estimate_movmem_ninsns (n) > max_instructions)
> return false;
>
> base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
> @@ -9774,6 +9801,57 @@ aarch64_expand_movmem (rtx *operands)
> return true;
> }
>
> +/* Implement TARGET_MOVE_BY_PIECES_PROFITABLE_P. */
> +
> +bool
> +aarch64_move_by_pieces_profitable_p (unsigned int size,
> + unsigned int align,
> + bool speed_p)
> +{
> + /* For strict alignment we don't want to use our unaligned
> + movmem implementation. */
> + if (STRICT_ALIGNMENT)
> + return (AARCH64_MOVE_RATIO (speed_p)
> + < move_by_pieces_ninsns (size, align, speed_p));
> +
> + /* If we have an overhang of 3, 6 or 7 bytes, we would emit an unaligned
> + load to cover it; if that is likely to be slow, we would do better
> + going through move_by_pieces. */
> + if (size % 8 > 5)
> + return SLOW_UNALIGNED_ACCESS (DImode, 1);
> + else if (size % 4 == 3)
> + return SLOW_UNALIGNED_ACCESS (SImode, 1);
> +
> + /* We can likely do a better job than the move_by_pieces infrastructure
> + can. */
> + return false;
> +}
> +
> +/* Implement TARGET_ESTIMATE_BLOCK_COPY_NINSNS. */
> +
> +unsigned int
> +aarch64_estimate_block_copy_ninsns (HOST_WIDE_INT size, bool speed_p)
> +{
> + if (aarch64_move_by_pieces_profitable_p (size, 8, speed_p))
> + return move_by_pieces_ninsns (size, 8, MOVE_MAX_PIECES);
> + else if (aarch64_estimate_movmem_ninsns (size)
> + < AARCH64_MOVE_RATIO (speed_p))
> + return aarch64_estimate_movmem_ninsns (size);
> + else
> + /* memcpy. Set up 3 arguments and make a call. */
> + return 4;
> +}
> +
> +/* Implement TARGET_MAX_SCALARIZATION_SIZE. */
> +
> +unsigned int
> +aarch64_max_scalarization_size (bool speed_p)
> +{
> + /* The maximum number of instructions we are willing to use * the
> + maximum size we can move in one instruction (LDP/STP). */
> + return AARCH64_MOVE_RATIO (speed_p) * 16;
> +}
> +
> #undef TARGET_ADDRESS_COST
> #define TARGET_ADDRESS_COST aarch64_address_cost
>
> @@ -9843,6 +9921,10 @@ aarch64_expand_movmem (rtx *operands)
> #undef TARGET_BUILTIN_DECL
> #define TARGET_BUILTIN_DECL aarch64_builtin_decl
>
> +#undef TARGET_ESTIMATE_BLOCK_COPY_NINSNS
> +#define TARGET_ESTIMATE_BLOCK_COPY_NINSNS \
> + aarch64_estimate_block_copy_ninsns
> +
> #undef TARGET_EXPAND_BUILTIN
> #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
>
> @@ -9897,9 +9979,17 @@ aarch64_expand_movmem (rtx *operands)
> #undef TARGET_MANGLE_TYPE
> #define TARGET_MANGLE_TYPE aarch64_mangle_type
>
> +#undef TARGET_MAX_SCALARIZATION_SIZE
> +#define TARGET_MAX_SCALARIZATION_SIZE \
> + aarch64_max_scalarization_size
> +
> #undef TARGET_MEMORY_MOVE_COST
> #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
>
> +#undef TARGET_MOVE_BY_PIECES_PROFITABLE_P
> +#define TARGET_MOVE_BY_PIECES_PROFITABLE_P \
> + aarch64_move_by_pieces_profitable_p
> +
> #undef TARGET_MUST_PASS_IN_STACK
> #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
>
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index db950da..5c8d37d 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -678,17 +678,10 @@ do { \
> /* Maximum bytes moved by a single instruction (load/store pair). */
> #define MOVE_MAX (UNITS_PER_WORD * 2)
>
> -/* The base cost overhead of a memcpy call, for MOVE_RATIO and friends. */
> +/* The base cost overhead of a memcpy call, for CLEAR_RATIO and friends. */
> #define AARCH64_CALL_RATIO 8
>
> -/* MOVE_RATIO dictates when we will use the move_by_pieces infrastructure.
> - move_by_pieces will continually copy the largest safe chunks. So a
> - 7-byte copy is a 4-byte + 2-byte + byte copy. This proves inefficient
> - for both size and speed of copy, so we will instead use the "movmem"
> - standard name to implement the copy. This logic does not apply when
> - targeting -mstrict-align, so keep a sensible default in that case. */
> -#define MOVE_RATIO(speed) \
> - (!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
> +#define AARCH64_MOVE_RATIO(speed) (((speed) ? 15 : AARCH64_CALL_RATIO) / 2)
>
> /* For CLEAR_RATIO, when optimizing for size, give a better estimate
> of the length of a memset call, but use the default otherwise. */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
> index 07f575d..a970c85 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
> @@ -35,6 +35,6 @@ Cyc_string_ungetc (int ignore, struct _fat_ptr *sptr)
> /* Whether the structs are totally scalarized or not depends on the
> MOVE_RATIO macro definition in the back end. The scalarization will
> not take place when using small values for MOVE_RATIO. */
> -/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> -/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> +/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> +/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> /* { dg-final { cleanup-tree-dump "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
> index 45aa963..59e5e6a 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
> @@ -21,5 +21,5 @@ int foo (struct S *p)
> *p = l;
> }
>
> -/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "aarch64*-*-* avr*-*-* nds32*-*-*" } } } } */
> +/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "avr*-*-* nds32*-*-*" } } } } */
> /* { dg-final { cleanup-tree-dump "release_ssa" } } */
More information about the Gcc-patches
mailing list