This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
Re: [PATCH/VECT/AARCH64] Improve cost model for ThunderX2 CN99xx

From: "Richard Earnshaw (lists)" <Richard dot Earnshaw at arm dot com>
To: Andrew Pinski <apinski at cavium dot com>, GCC Patches <gcc-patches at gcc dot gnu dot org>
Date: Mon, 30 Jan 2017 16:55:44 +0000
Subject: Re: [PATCH/VECT/AARCH64] Improve cost model for ThunderX2 CN99xx
Authentication-results: sourceware.org; auth=none
References: <CA+=Sn1kCWp9o1oe=0cgWUaSVD8qUgfhbhvkvOTCC0ZMq2eHh8Q@mail.gmail.com>
On 28/01/17 20:34, Andrew Pinski wrote:
> Hi,
>   On some (most) AARCH64 cores, it is not always profitable to
> vectorize some integer loops.  This patch does two things (I can split
> it into different patches if needed).
> 1) It splits the aarch64 back-end's vector cost model's vector and
> scalar costs into int and fp fields
> 1a) For thunderx2t99, correctly models the integer vector/scalar costs.
> 2) Fixes/Improves a few calls to record_stmt_cost in tree-vect-loop.c
> where stmt_info was not being passed.
> 
> OK?  Bootstrapped and tested on aarch64-linux-gnu; it provides a 20%
> improvement on libquantum and ~1% overall on SPEC CPU 2006 int.
> 
> Thanks,
> Andrew Pinski
> 
> ChangeLog:
> * tree-vect-loop.c (vect_compute_single_scalar_iteration_cost): Pass
> stmt_info to record_stmt_cost.
> (vect_get_known_peeling_cost): Pass stmt_info if known to record_stmt_cost.
> 
> * config/aarch64/aarch64-protos.h (cpu_vector_cost): Split
> scalar_stmt_cost field into
> scalar_int_stmt_cost and scalar_fp_stmt_cost.  Split vec_stmt_cost
> field into vec_int_stmt_cost and vec_fp_stmt_cost.
> * config/aarch64/aarch64.c (generic_vector_cost): Update for the
> splitting of scalar_stmt_cost and vec_stmt_cost.
> (thunderx_vector_cost): Likewise.
> (cortexa57_vector_cost): Likewise.
> (exynosm1_vector_cost): Likewise.
> (xgene1_vector_cost): Likewise.
> (thunderx2t99_vector_cost): Improve after the splitting of the two fields.
> (aarch64_builtin_vectorization_cost): Update for the splitting of
> scalar_stmt_cost and vec_stmt_cost.
> 
> 
> improve-vect-cost.diff.txt
> 
> 
> Index: config/aarch64/aarch64-protos.h
> ===================================================================
> --- config/aarch64/aarch64-protos.h	(revision 245002)
> +++ config/aarch64/aarch64-protos.h	(working copy)
> @@ -151,11 +151,17 @@ struct cpu_regmove_cost
>  /* Cost for vector insn classes.  */
>  struct cpu_vector_cost
>  {
> -  const int scalar_stmt_cost;		 /* Cost of any scalar operation,
> +  const int scalar_int_stmt_cost;	 /* Cost of any int scalar operation,
> +					    excluding load and store.  */
> +  const int scalar_fp_stmt_cost;	 /* Cost of any fp scalar operation,
>  					    excluding load and store.  */
>    const int scalar_load_cost;		 /* Cost of scalar load.  */
>    const int scalar_store_cost;		 /* Cost of scalar store.  */
> -  const int vec_stmt_cost;		 /* Cost of any vector operation,
> +  const int vec_int_stmt_cost;		 /* Cost of any int vector operation,
> +					    excluding load, store, permute,
> +					    vector-to-scalar and
> +					    scalar-to-vector operation.  */
> +  const int vec_fp_stmt_cost;		 /* Cost of any fp vector operation,
>  					    excluding load, store, permute,
>  					    vector-to-scalar and
>  					    scalar-to-vector operation.  */
> Index: config/aarch64/aarch64.c
> ===================================================================
> --- config/aarch64/aarch64.c	(revision 245002)
> +++ config/aarch64/aarch64.c	(working copy)
> @@ -365,10 +365,12 @@ static const struct cpu_regmove_cost thu
>  /* Generic costs for vector insn classes.  */
>  static const struct cpu_vector_cost generic_vector_cost =
>  {
> -  1, /* scalar_stmt_cost  */
> +  1, /* scalar_int_stmt_cost  */
> +  1, /* scalar_fp_stmt_cost  */
>    1, /* scalar_load_cost  */
>    1, /* scalar_store_cost  */
> -  1, /* vec_stmt_cost  */
> +  1, /* vec_int_stmt_cost  */
> +  1, /* vec_fp_stmt_cost  */
>    2, /* vec_permute_cost  */
>    1, /* vec_to_scalar_cost  */
>    1, /* scalar_to_vec_cost  */
> @@ -383,10 +385,12 @@ static const struct cpu_vector_cost gene
>  /* ThunderX costs for vector insn classes.  */
>  static const struct cpu_vector_cost thunderx_vector_cost =
>  {
> -  1, /* scalar_stmt_cost  */
> +  1, /* scalar_int_stmt_cost  */
> +  1, /* scalar_fp_stmt_cost  */
>    3, /* scalar_load_cost  */
>    1, /* scalar_store_cost  */
> -  4, /* vec_stmt_cost  */
> +  4, /* vec_int_stmt_cost  */
> +  4, /* vec_fp_stmt_cost  */
>    4, /* vec_permute_cost  */
>    2, /* vec_to_scalar_cost  */
>    2, /* scalar_to_vec_cost  */
> @@ -401,10 +405,12 @@ static const struct cpu_vector_cost thun
>  /* Generic costs for vector insn classes.  */
>  static const struct cpu_vector_cost cortexa57_vector_cost =
>  {
> -  1, /* scalar_stmt_cost  */
> +  1, /* scalar_int_stmt_cost  */
> +  1, /* scalar_fp_stmt_cost  */
>    4, /* scalar_load_cost  */
>    1, /* scalar_store_cost  */
> -  2, /* vec_stmt_cost  */
> +  2, /* vec_int_stmt_cost  */
> +  2, /* vec_fp_stmt_cost  */
>    3, /* vec_permute_cost  */
>    8, /* vec_to_scalar_cost  */
>    8, /* scalar_to_vec_cost  */
> @@ -418,10 +424,12 @@ static const struct cpu_vector_cost cort
>  
>  static const struct cpu_vector_cost exynosm1_vector_cost =
>  {
> -  1, /* scalar_stmt_cost  */
> +  1, /* scalar_int_stmt_cost  */
> +  1, /* scalar_fp_stmt_cost  */
>    5, /* scalar_load_cost  */
>    1, /* scalar_store_cost  */
> -  3, /* vec_stmt_cost  */
> +  3, /* vec_int_stmt_cost  */
> +  3, /* vec_fp_stmt_cost  */
>    3, /* vec_permute_cost  */
>    3, /* vec_to_scalar_cost  */
>    3, /* scalar_to_vec_cost  */
> @@ -436,10 +444,12 @@ static const struct cpu_vector_cost exyn
>  /* Generic costs for vector insn classes.  */
>  static const struct cpu_vector_cost xgene1_vector_cost =
>  {
> -  1, /* scalar_stmt_cost  */
> +  1, /* scalar_int_stmt_cost  */
> +  1, /* scalar_fp_stmt_cost  */
>    5, /* scalar_load_cost  */
>    1, /* scalar_store_cost  */
> -  2, /* vec_stmt_cost  */
> +  2, /* vec_int_stmt_cost  */
> +  2, /* vec_fp_stmt_cost  */
>    2, /* vec_permute_cost  */
>    4, /* vec_to_scalar_cost  */
>    4, /* scalar_to_vec_cost  */
> @@ -454,10 +464,12 @@ static const struct cpu_vector_cost xgen
>  /* Costs for vector insn classes for Vulcan.  */
>  static const struct cpu_vector_cost thunderx2t99_vector_cost =
>  {
> -  6, /* scalar_stmt_cost  */
> +  1, /* scalar_int_stmt_cost  */
> +  6, /* scalar_fp_stmt_cost  */
>    4, /* scalar_load_cost  */
>    1, /* scalar_store_cost  */
> -  6, /* vec_stmt_cost  */
> +  5, /* vec_int_stmt_cost  */
> +  6, /* vec_fp_stmt_cost  */
>    3, /* vec_permute_cost  */
>    6, /* vec_to_scalar_cost  */
>    5, /* scalar_to_vec_cost  */
> @@ -8119,50 +8131,55 @@ aarch64_builtin_vectorization_cost (enum
>  				    int misalign ATTRIBUTE_UNUSED)
>  {
>    unsigned elements;
> +  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
> +  bool fp = true;


Why have you defaulted to fp = true?  Seems to me that the default if
the type isn't known should be false.

R.

> +
> +  if (vectype != NULL)
> +    fp = FLOAT_TYPE_P (vectype);
>  
>    switch (type_of_cost)
>      {
>        case scalar_stmt:
> -	return aarch64_tune_params.vec_costs->scalar_stmt_cost;
> +	return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
>  
>        case scalar_load:
> -	return aarch64_tune_params.vec_costs->scalar_load_cost;
> +	return costs->scalar_load_cost;
>  
>        case scalar_store:
> -	return aarch64_tune_params.vec_costs->scalar_store_cost;
> +	return costs->scalar_store_cost;
>  
>        case vector_stmt:
> -	return aarch64_tune_params.vec_costs->vec_stmt_cost;
> +	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
>  
>        case vector_load:
> -	return aarch64_tune_params.vec_costs->vec_align_load_cost;
> +	return costs->vec_align_load_cost;
>  
>        case vector_store:
> -	return aarch64_tune_params.vec_costs->vec_store_cost;
> +	return costs->vec_store_cost;
>  
>        case vec_to_scalar:
> -	return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
> +	return costs->vec_to_scalar_cost;
>  
>        case scalar_to_vec:
> -	return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
> +	return costs->scalar_to_vec_cost;
>  
>        case unaligned_load:
> -	return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
> +	return costs->vec_unalign_load_cost;
>  
>        case unaligned_store:
> -	return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
> +	return costs->vec_unalign_store_cost;
>  
>        case cond_branch_taken:
> -	return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
> +	return costs->cond_taken_branch_cost;
>  
>        case cond_branch_not_taken:
> -	return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
> +	return costs->cond_not_taken_branch_cost;
>  
>        case vec_perm:
> -	return aarch64_tune_params.vec_costs->vec_permute_cost;
> +	return costs->vec_permute_cost;
>  
>        case vec_promote_demote:
> -	return aarch64_tune_params.vec_costs->vec_stmt_cost;
> +	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
>  
>        case vec_construct:
>          elements = TYPE_VECTOR_SUBPARTS (vectype);
> Index: tree-vect-loop.c
> ===================================================================
> --- tree-vect-loop.c	(revision 245002)
> +++ tree-vect-loop.c	(working copy)
> @@ -1329,9 +1329,9 @@ vect_compute_single_scalar_iteration_cos
>              continue;
>  
>  	  vect_cost_for_stmt kind;
> -          if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
> +          if (STMT_VINFO_DATA_REF (stmt_info))
>              {
> -              if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
> +              if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
>                 kind = scalar_load;
>               else
>                 kind = scalar_store;
> @@ -1341,7 +1341,7 @@ vect_compute_single_scalar_iteration_cos
>  
>  	  scalar_single_iter_cost
>  	    += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
> -				 factor, kind, NULL, 0, vect_prologue);
> +				 factor, kind, stmt_info, 0, vect_prologue);
>          }
>      }
>    LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
> @@ -3178,16 +3178,24 @@ vect_get_known_peeling_cost (loop_vec_in
>    int j;
>    if (peel_iters_prologue)
>      FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
> -      retval += record_stmt_cost (prologue_cost_vec,
> -				  si->count * peel_iters_prologue,
> -				  si->kind, NULL, si->misalign,
> -				  vect_prologue);
> +	{
> +	  struct _stmt_vec_info *stmt_info
> +	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
> +	  retval += record_stmt_cost (prologue_cost_vec,
> +				      si->count * peel_iters_prologue,
> +				      si->kind, stmt_info, si->misalign,
> +				      vect_prologue);
> +	}
>    if (*peel_iters_epilogue)
>      FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
> -      retval += record_stmt_cost (epilogue_cost_vec,
> -				  si->count * *peel_iters_epilogue,
> -				  si->kind, NULL, si->misalign,
> -				  vect_epilogue);
> +	{
> +	  struct _stmt_vec_info *stmt_info
> +	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
> +	  retval += record_stmt_cost (epilogue_cost_vec,
> +				      si->count * *peel_iters_epilogue,
> +				      si->kind, stmt_info, si->misalign,
> +				      vect_epilogue);
> +	}
>  
>    return retval;
>  }
>
Follow-Ups:
- Re: [PATCH/VECT/AARCH64] Improve cost model for ThunderX2 CN99xx
  - From: Andrew Pinski
References:
- [PATCH/VECT/AARCH64] Improve cost model for ThunderX2 CN99xx
  - From: Andrew Pinski
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]