[Patch AArch64 4/4] Wire up new target hooks

James Greenhalgh james.greenhalgh@arm.com
Fri Sep 26 13:31:00 GMT 2014


On Thu, Sep 25, 2014 at 03:57:36PM +0100, James Greenhalgh wrote:
> 
> Hi,
> 
> This patch wires up our new target hooks for AArch64. This also means
> we can bring back the two failing SRA tests.
> 
> Bootstrapped on AArch64 with no issues.
> 
> OK for trunk?

No way! This patch is nonsense as it stands!

I'd like to withdraw this for now while I have a think about what
has gone wrong!

Thanks,
James

> 
> Thanks,
> James
> 
> ---
> gcc/
> 
> 2014-09-25  James Greenhalgh  <james.greenhalgh@arm.com>
> 
> 	* config/aarch64/aarch64.c
> 	(aarch64_estimate_movmem_ninsns): New.
> 	(aarch64_expand_movmem): Refactor old move costs.
> 	(aarch64_move_by_pieces_profitable_p): New.
> 	(aarch64_estimate_block_copy_ninsns): Likewise.
> 	(aarch64_max_scalarization_size): Likewise.
> 	(TARGET_MAX_SCALARIZATION_SIZE): Likewise.
> 	(TARGET_MOVE_BY_PIECES_PROFITABLE_P): Likewise.
> 	* config/aarch64/aarch64.h (AARCH64_MOVE_RATIO): New.
> 	(MOVE_RATIO): Delete.
> 
> gcc/testsuite/
> 
> 2014-09-25  James Greenhalgh  <james.greenhalgh@arm.com>
> 
> 	* gcc.dg/tree-ssa/pr42585.c: Bring back for AArch64.
> 	* gcc.dg/tree-ssa/sra-12.c: Likewise.
> 

> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 3483081..d8b5a4a 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -9616,6 +9616,34 @@ aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
>    return false;
>  }
>  
> +static unsigned int
> +aarch64_estimate_movmem_ninsns (HOST_WIDE_INT size)
> +{
> +  HOST_WIDE_INT chunks = 0;
> +  int n = size;
> +
> +  /* 3 bytes is a 2-byte then a 1-byte copy.  */
> +  if (n == 3)
> +    return 2;
> +
> +  /* 5, 6, 7 bytes need an extra copy.  */
> +  if (n > 4 && n < 8)
> +    chunks++;
> +
> +  /* Sizes of 8 bytes or more are dealt with in 16-byte and then
> +     8-byte chunks first.  */
> +  chunks += n / 16;
> +  n %= 16;
> +  chunks += n / 8;
> +  n %= 8;
> +
> +  /* Anything left is dealt with in one instruction.  */
> +  if (n != 0)
> +    chunks++;
> +
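> +  /* For example, a 35-byte copy counts as two 16-byte chunks plus one
> +     more instruction for the 3-byte overhang: three in total.  */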
> +  return chunks;
> +}
> +
>  /* Return a new RTX holding the result of moving POINTER forward by
>     AMOUNT bytes.  */
>  
> @@ -9673,7 +9701,7 @@ aarch64_expand_movmem (rtx *operands)
>  
>    /* When optimizing for size, give a better estimate of the length of a
>       memcpy call, but use the default otherwise.  */
> -  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
> +  unsigned int max_instructions = AARCH64_MOVE_RATIO (speed_p);
>  
>    /* We can't do anything smart if the amount to copy is not constant.  */
>    if (!CONST_INT_P (operands[2]))
> @@ -9681,10 +9709,9 @@ aarch64_expand_movmem (rtx *operands)
>  
>    n = UINTVAL (operands[2]);
>  
> -  /* Try to keep the number of instructions low.  For cases below 16 bytes we
> -     need to make at most two moves.  For cases above 16 bytes it will be one
> -     move for each 16 byte chunk, then at most two additional moves.  */
> -  if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
> +  /* Try to keep the number of instructions we emit low; if we cannot,
> +     fail the expansion and leave the copy to memcpy.  */
> +  if (aarch64_estimate_movmem_ninsns (n) > max_instructions)
>      return false;
>  
>    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
> @@ -9774,6 +9801,57 @@ aarch64_expand_movmem (rtx *operands)
>    return true;
>  }
>  
> +/* Implement TARGET_MOVE_BY_PIECES_PROFITABLE_P.  */
> +
> +bool
> +aarch64_move_by_pieces_profitable_p (unsigned int size,
> +				     unsigned int align,
> +				     bool speed_p)
> +{
> +  /* For strict alignment we don't want to use our unaligned
> +     movmem implementation.  */
> +  if (STRICT_ALIGNMENT)
> +    return (AARCH64_MOVE_RATIO (speed_p)
> +	    < move_by_pieces_ninsns (size, align, speed_p));
> +
> +  /* If we have an overhang of 3, 6 or 7 bytes, we would emit an unaligned
> +     load to cover it; if that is likely to be slow, we do better going
> +     through move_by_pieces.  */
> +  if (size % 8 > 5)
> +    return SLOW_UNALIGNED_ACCESS (DImode, 1);
> +  else if (size % 4 == 3)
> +    return SLOW_UNALIGNED_ACCESS (SImode, 1);
> +
> +  /* We can likely do a better job than the move_by_pieces infrastructure
> +     can.  */
> +  return false;
> +}
> +
> +/* Implement TARGET_ESTIMATE_BLOCK_COPY_NINSNS.  */
> +
> +unsigned int
> +aarch64_estimate_block_copy_ninsns (HOST_WIDE_INT size, bool speed_p)
> +{
> +  if (aarch64_move_by_pieces_profitable_p (size, 8, speed_p))
> +    return move_by_pieces_ninsns (size, 8, MOVE_MAX_PIECES);
> +  else if (aarch64_estimate_movmem_ninsns (size)
> +	   < AARCH64_MOVE_RATIO (speed_p))
> +    return aarch64_estimate_movmem_ninsns (size);
> +  else
> +    /* memcpy.  Set up 3 arguments and make a call.  */
> +    return 4;
> +}
> +
> +/* Implement TARGET_MAX_SCALARIZATION_SIZE.  */
> +
> +unsigned int
> +aarch64_max_scalarization_size (bool speed_p)
> +{
> +  /* The maximum number of instructions we are willing to use, multiplied
> +     by the maximum size we can move in one instruction (LDP/STP).  */
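> +  /* For example, when optimizing for speed this gives 7 * 16 = 112 bytes;
> +     when optimizing for size, 4 * 16 = 64 bytes.  */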
> +  return AARCH64_MOVE_RATIO (speed_p) * 16;
> +}
> +
>  #undef TARGET_ADDRESS_COST
>  #define TARGET_ADDRESS_COST aarch64_address_cost
>  
> @@ -9843,6 +9921,10 @@ aarch64_expand_movmem (rtx *operands)
>  #undef TARGET_BUILTIN_DECL
>  #define TARGET_BUILTIN_DECL aarch64_builtin_decl
>  
> +#undef TARGET_ESTIMATE_BLOCK_COPY_NINSNS
> +#define TARGET_ESTIMATE_BLOCK_COPY_NINSNS \
> +  aarch64_estimate_block_copy_ninsns
> +
>  #undef  TARGET_EXPAND_BUILTIN
>  #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
>  
> @@ -9897,9 +9979,17 @@ aarch64_expand_movmem (rtx *operands)
>  #undef TARGET_MANGLE_TYPE
>  #define TARGET_MANGLE_TYPE aarch64_mangle_type
>  
> +#undef TARGET_MAX_SCALARIZATION_SIZE
> +#define TARGET_MAX_SCALARIZATION_SIZE \
> +  aarch64_max_scalarization_size
> +
>  #undef TARGET_MEMORY_MOVE_COST
>  #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
>  
> +#undef TARGET_MOVE_BY_PIECES_PROFITABLE_P
> +#define TARGET_MOVE_BY_PIECES_PROFITABLE_P \
> +  aarch64_move_by_pieces_profitable_p
> +
>  #undef TARGET_MUST_PASS_IN_STACK
>  #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
>  
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index db950da..5c8d37d 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -678,17 +678,10 @@ do {									     \
>  /* Maximum bytes moved by a single instruction (load/store pair).  */
>  #define MOVE_MAX (UNITS_PER_WORD * 2)
>  
> -/* The base cost overhead of a memcpy call, for MOVE_RATIO and friends.  */
> +/* The base cost overhead of a memcpy call, for CLEAR_RATIO and friends.  */
>  #define AARCH64_CALL_RATIO 8
>  
> -/* MOVE_RATIO dictates when we will use the move_by_pieces infrastructure.
> -   move_by_pieces will continually copy the largest safe chunks.  So a
> -   7-byte copy is a 4-byte + 2-byte + byte copy.  This proves inefficient
> -   for both size and speed of copy, so we will instead use the "movmem"
> -   standard name to implement the copy.  This logic does not apply when
> -   targeting -mstrict-align, so keep a sensible default in that case.  */
> -#define MOVE_RATIO(speed) \
> -  (!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
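> +/* The maximum number of instructions we are willing to spend on an inline
> +   block copy before falling back to a call to memcpy.  */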
> +#define AARCH64_MOVE_RATIO(speed) (((speed) ? 15 : AARCH64_CALL_RATIO) / 2)
>  
>  /* For CLEAR_RATIO, when optimizing for size, give a better estimate
>     of the length of a memset call, but use the default otherwise.  */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
> index 07f575d..a970c85 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
> @@ -35,6 +35,6 @@ Cyc_string_ungetc (int ignore, struct _fat_ptr *sptr)
>  /* Whether the structs are totally scalarized or not depends on the
>     MOVE_RATIO macro definition in the back end.  The scalarization will
>     not take place when using small values for MOVE_RATIO.  */
> -/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> -/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> +/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> +/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
>  /* { dg-final { cleanup-tree-dump "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
> index 45aa963..59e5e6a 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
> @@ -21,5 +21,5 @@ int foo (struct S *p)
>    *p = l;
>  }
>  
> -/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "aarch64*-*-* avr*-*-* nds32*-*-*" } } } } */
> +/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "avr*-*-* nds32*-*-*" } } } } */
>  /* { dg-final { cleanup-tree-dump "release_ssa" } } */


