[aarch64] Add Neoverse N2 tuning structs
Andre Vieira (lists)
andre.simoesdiasvieira@arm.com
Wed Mar 16 14:46:14 GMT 2022
Hi,
This patch adds tuning structures for Neoverse N2.
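
The SVE reduction entries in the new cost table follow the pattern spelled
out in the comments in the patch: cost the equivalent chain of scalar ops,
then add (or subtract) the difference between the latency of the SVE
reduction instruction and that of the scalar chain.  A minimal sketch of
that arithmetic, only restating the numbers quoted in the comments (the
helper name is purely illustrative and is not part of the patch):

  /* Illustration: how the reduc_* entries in neoversen2_sve_vector_cost
     are derived from the latencies quoted in the comments below.  */
  static int
  sve_reduc_cost (int scalar_chain_cost, int scalar_cycles, int sve_cycles)
  {
    /* Scalar chain cost plus the extra (or saved) cycles of the single
       SVE reduction instruction.  */
    return scalar_chain_cost + (sve_cycles - scalar_cycles);
  }

  /* reduc_i8_cost:  15 ADDs (~5 cycles) vs. [SU]ADDV in 11: 15 + 6 = 21
     reduc_i16_cost:  7 ADDs (~3 cycles) vs. 9:               7 + 6 = 13
     reduc_f16_cost:  7 FADDs, cost 14 (~8 cycles) vs. 6:    14 - 2 = 12  */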
gcc/ChangeLog:

2022-03-16  Tamar Christina  <tamar.christina@arm.com>
	    Andre Vieira  <andre.simoesdiasvieira@arm.com>

	* config/aarch64/aarch64.cc (neoversen2_addrcost_table,
	neoversen2_regmove_cost, neoversen2_advsimd_vector_cost,
	neoversen2_sve_vector_cost, neoversen2_scalar_issue_info,
	neoversen2_advsimd_issue_info, neoversen2_sve_issue_info,
	neoversen2_vec_issue_info, neoversen2_vector_cost): New structs.
	(neoversen2_tunings): Use new structs and update tuning flags.
	(aarch64_vec_op_count::rename_cycles_per_iter): Enable for
	neoversen2 tuning.
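
The gather/scatter costs likewise just restate the comparison against a
strided Advanced SIMD load made in the comments: the x64 form is costed as
two scalar loads plus a vec_construct plus one full vector operation (so the
difference is not lost in rounding), and the 32-bit gather is one vector
operation more than the 64-bit one.  Roughly (illustration only, using the
per-operation costs from the tables below):

  /* gather_load_x64_cost: 2 scalar loads (2 * 4) + vec_construct (2)
			   + 1 vector op (2)       = 12
     gather_load_x32_cost: x64 cost (12) + 1 vector op (2) = 14  */
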
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index d504fe7607b66a9c9ed9b183a2d3c03d34fb0f80..e0bb447beb9eae74551d863505eb265737d36334 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -519,6 +519,24 @@ static const struct cpu_addrcost_table neoversev1_addrcost_table =
0 /* imm_offset */
};
+static const struct cpu_addrcost_table neoversen2_addrcost_table =
+{
+ {
+ 1, /* hi */
+ 0, /* si */
+ 0, /* di */
+ 1, /* ti */
+ },
+ 0, /* pre_modify */
+ 0, /* post_modify */
+ 2, /* post_modify_ld3_st3 */
+ 2, /* post_modify_ld4_st4 */
+ 0, /* register_offset */
+ 0, /* register_sextend */
+ 0, /* register_zextend */
+ 0 /* imm_offset */
+};
+
static const struct cpu_regmove_cost generic_regmove_cost =
{
1, /* GP2GP */
@@ -624,6 +642,16 @@ static const struct cpu_regmove_cost a64fx_regmove_cost =
2 /* FP2FP */
};
+static const struct cpu_regmove_cost neoversen2_regmove_cost =
+{
+ 1, /* GP2GP */
+ /* Spilling to int<->fp instead of memory is recommended so set
+ realistic costs compared to memmov_cost. */
+ 3, /* GP2FP */
+ 2, /* FP2GP */
+ 2 /* FP2FP */
+};
+
/* Generic costs for Advanced SIMD vector operations. */
static const advsimd_vec_cost generic_advsimd_vector_cost =
{
@@ -2174,12 +2202,166 @@ static const struct tune_params neoverse512tvb_tunings =
&generic_prefetch_tune
};
+static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
+{
+ 2, /* int_stmt_cost */
+ 2, /* fp_stmt_cost */
+ 2, /* ld2_st2_permute_cost */
+ 2, /* ld3_st3_permute_cost */
+ 3, /* ld4_st4_permute_cost */
+ 3, /* permute_cost */
+ 4, /* reduc_i8_cost */
+ 4, /* reduc_i16_cost */
+ 2, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 6, /* reduc_f16_cost */
+ 4, /* reduc_f32_cost */
+ 2, /* reduc_f64_cost */
+ 2, /* store_elt_extra_cost */
+ /* This value is just inherited from the Cortex-A57 table. */
+ 8, /* vec_to_scalar_cost */
+ /* This depends very much on what the scalar value is and
+ where it comes from. E.g. some constants take two dependent
+ instructions or a load, while others might be moved from a GPR.
+ 4 seems to be a reasonable compromise in practice. */
+ 4, /* scalar_to_vec_cost */
+ 4, /* align_load_cost */
+ 4, /* unalign_load_cost */
+ /* Although stores have a latency of 2 and compete for the
+ vector pipes, in practice it's better not to model that. */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+};
+
+static const sve_vec_cost neoversen2_sve_vector_cost =
+{
+ {
+ 2, /* int_stmt_cost */
+ 2, /* fp_stmt_cost */
+ 3, /* ld2_st2_permute_cost */
+ 4, /* ld3_st3_permute_cost */
+ 4, /* ld4_st4_permute_cost */
+ 3, /* permute_cost */
+ /* Theoretically, a reduction involving 15 scalar ADDs could
+ complete in ~5 cycles and would have a cost of 15. [SU]ADDV
+ completes in 11 cycles, so give it a cost of 15 + 6. */
+ 21, /* reduc_i8_cost */
+ /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
+ 13, /* reduc_i16_cost */
+ /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
+ 9, /* reduc_i32_cost */
+ /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1. */
+ 2, /* reduc_i64_cost */
+ /* Theoretically, a reduction involving 7 scalar FADDs could
+ complete in ~8 cycles and would have a cost of 14. FADDV
+ completes in 6 cycles, so give it a cost of 14 - 2. */
+ 12, /* reduc_f16_cost */
+ /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0. */
+ 6, /* reduc_f32_cost */
+ /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0. */
+ 2, /* reduc_f64_cost */
+ 2, /* store_elt_extra_cost */
+ /* This value is just inherited from the Cortex-A57 table. */
+ 8, /* vec_to_scalar_cost */
+ /* See the comment above the Advanced SIMD versions. */
+ 4, /* scalar_to_vec_cost */
+ 4, /* align_load_cost */
+ 4, /* unalign_load_cost */
+ /* Although stores have a latency of 2 and compete for the
+ vector pipes, in practice it's better not to model that. */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+ },
+ 3, /* clast_cost */
+ 10, /* fadda_f16_cost */
+ 6, /* fadda_f32_cost */
+ 4, /* fadda_f64_cost */
+ /* A strided Advanced SIMD x64 load would take two parallel FP loads
+ (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
+ is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
+ (cost 8) and a vec_construct (cost 2). Add a full vector operation
+ (cost 2) to that, to avoid the difference being lost in rounding.
+
+ There is no easy comparison between a strided Advanced SIMD x32 load
+ and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
+ operation more than a 64-bit gather. */
+ 14, /* gather_load_x32_cost */
+ 12, /* gather_load_x64_cost */
+ 3 /* scatter_store_elt_cost */
+};
+
+static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
+{
+ 3, /* loads_stores_per_cycle */
+ 2, /* stores_per_cycle */
+ 4, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+};
+
+static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
+{
+ {
+ 3, /* loads_stores_per_cycle */
+ 2, /* stores_per_cycle */
+ 2, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+ },
+ 2, /* ld2_st2_general_ops */
+ 2, /* ld3_st3_general_ops */
+ 3 /* ld4_st4_general_ops */
+};
+
+static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
+{
+ {
+ {
+ 3, /* loads_per_cycle */
+ 2, /* stores_per_cycle */
+ 2, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+ },
+ 2, /* ld2_st2_general_ops */
+ 3, /* ld3_st3_general_ops */
+ 3 /* ld4_st4_general_ops */
+ },
+ 2, /* pred_ops_per_cycle */
+ 2, /* while_pred_ops */
+ 2, /* int_cmp_pred_ops */
+ 1, /* fp_cmp_pred_ops */
+ 1, /* gather_scatter_pair_general_ops */
+ 1 /* gather_scatter_pair_pred_ops */
+};
+
+static const aarch64_vec_issue_info neoversen2_vec_issue_info =
+{
+ &neoversen2_scalar_issue_info,
+ &neoversen2_advsimd_issue_info,
+ &neoversen2_sve_issue_info
+};
+
+/* Neoverse N2 costs for vector insn classes. */
+static const struct cpu_vector_cost neoversen2_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 2, /* scalar_fp_stmt_cost */
+ 4, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 1, /* cond_taken_branch_cost */
+ 1, /* cond_not_taken_branch_cost */
+ &neoversen2_advsimd_vector_cost, /* advsimd */
+ &neoversen2_sve_vector_cost, /* sve */
+ &neoversen2_vec_issue_info /* issue_info */
+};
+
static const struct tune_params neoversen2_tunings =
{
&cortexa76_extra_costs,
- &generic_addrcost_table,
- &generic_regmove_cost,
- &cortexa57_vector_cost,
+ &neoversen2_addrcost_table,
+ &neoversen2_regmove_cost,
+ &neoversen2_vector_cost,
&generic_branch_cost,
&generic_approx_modes,
SVE_128, /* sve_width */
@@ -2202,7 +2384,10 @@ static const struct tune_params neoversen2_tunings =
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
+ | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
+ | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
+ | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
&generic_prefetch_tune
};
@@ -15131,7 +15316,8 @@ aarch64_vec_op_count::sve_issue_info () const
fractional_cost
aarch64_vec_op_count::rename_cycles_per_iter () const
{
- if (sve_issue_info () == &neoverse512tvb_sve_issue_info)
+ if (sve_issue_info () == &neoverse512tvb_sve_issue_info
+ || sve_issue_info () == &neoversen2_sve_issue_info)
/* + 1 for an addition. We've already counted a general op for each
store, so we don't need to account for stores separately. The branch
reads no registers and so does not need to be counted either.