This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH/VECT/AARCH64] Improve cost model for ThunderX2 CN99xx
- From: "Richard Earnshaw (lists)" <Richard dot Earnshaw at arm dot com>
- To: Andrew Pinski <apinski at cavium dot com>, GCC Patches <gcc-patches at gcc dot gnu dot org>
- Date: Wed, 1 Feb 2017 10:02:10 +0000
- Subject: Re: [PATCH/VECT/AARCH64] Improve cost model for ThunderX2 CN99xx
- Authentication-results: sourceware.org; auth=none
- References: <CA+=Sn1kCWp9o1oe=0cgWUaSVD8qUgfhbhvkvOTCC0ZMq2eHh8Q@mail.gmail.com> <CA+=Sn1nDJF0d9kBbRCsm9aqh7g=N9RZC7LJQ6nE_yLKLvWwe7Q@mail.gmail.com>
On 31/01/17 22:34, Andrew Pinski wrote:
> On Sat, Jan 28, 2017 at 12:34 PM, Andrew Pinski <apinski@cavium.com> wrote:
>> Hi,
>> On some (most) AARCH64 cores, it is not always profitable to
>> vectorize some integer loops. This patch does two things (I can split
>> it into different patches if needed).
>> 1) It splits the aarch64 back-end's vector cost model's vector and
>> scalar costs into int and fp fields
>> 1a) For thunderx2t99, models correctly the integer vector/scalar costs.
>> 2) Fixes/Improves a few calls to record_stmt_cost in tree-vect-loop.c
>> where stmt_info was not being passed.
>>
>> OK? Bootstrapped and tested on aarch64-linux-gnu and provides 20% on
>> libquantum and ~1% overall on SPEC CPU 2006 int.
>
> Here is the updated patch with the fixes requested by both Richards.
> Still the same performance as above.
>
> OK?
>
> Thanks,
> Andrew
>
> ChangeLog:
> * tree-vect-loop.c (vect_compute_single_scalar_iteration_cost): Pass
> stmt_info to record_stmt_cost.
> (vect_get_known_peeling_cost): Pass stmt_info if known to record_stmt_cost.
>
> * config/aarch64/aarch64-protos.h (cpu_vector_cost): Split
> cpu_vector_cost field into
> scalar_int_stmt_cost and scalar_fp_stmt_cost. Split vec_stmt_cost
> field into vec_int_stmt_cost and vec_fp_stmt_cost.
> * config/aarch64/aarch64.c (generic_vector_cost): Update for the
> splitting of scalar_stmt_cost and vec_stmt_cost.
> (thunderx_vector_cost): Likewise.
> (cortexa57_vector_cost): Likewise.
> (exynosm1_vector_cost): Likewise.
> (xgene1_vector_cost): Likewise.
> (thunderx2t99_vector_cost): Improve after the splitting of the two fields.
> (aarch64_builtin_vectorization_cost): Update for the splitting of
> scalar_stmt_cost and vec_stmt_cost.
>
>>
>> Thanks,
>> Andrew Pinski
>>
>> ChangeLog:
>> * tree-vect-loop.c (vect_compute_single_scalar_iteration_cost): Pass
>> stmt_info to record_stmt_cost.
>> (vect_get_known_peeling_cost): Pass stmt_info if known to record_stmt_cost.
>>
>> * config/aarch64/aarch64-protos.h (cpu_vector_cost): Split
>> cpu_vector_cost field into
>> scalar_int_stmt_cost and scalar_fp_stmt_cost. Split vec_stmt_cost
>> field into vec_int_stmt_cost and vec_fp_stmt_cost.
>> * config/aarch64/aarch64.c (generic_vector_cost): Update for the
>> splitting of scalar_stmt_cost and vec_stmt_cost.
>> (thunderx_vector_cost): Likewise.
>> (cortexa57_vector_cost): Likewise.
>> (exynosm1_vector_cost): Likewise.
>> (xgene1_vector_cost): Likewise.
>> (thunderx2t99_vector_cost): Improve after the splitting of the two fields.
>> (aarch64_builtin_vectorization_cost): Update for the splitting of
>> scalar_stmt_cost and vec_stmt_cost.
>>
OK.
R.
>> updatedvectcost.diff.txt
>>
>>
>> Index: config/aarch64/aarch64-protos.h
>> ===================================================================
>> --- config/aarch64/aarch64-protos.h (revision 245070)
>> +++ config/aarch64/aarch64-protos.h (working copy)
>> @@ -151,11 +151,17 @@ struct cpu_regmove_cost
>> /* Cost for vector insn classes. */
>> struct cpu_vector_cost
>> {
>> - const int scalar_stmt_cost; /* Cost of any scalar operation,
>> + const int scalar_int_stmt_cost; /* Cost of any int scalar operation,
>> + excluding load and store. */
>> + const int scalar_fp_stmt_cost; /* Cost of any fp scalar operation,
>> excluding load and store. */
>> const int scalar_load_cost; /* Cost of scalar load. */
>> const int scalar_store_cost; /* Cost of scalar store. */
>> - const int vec_stmt_cost; /* Cost of any vector operation,
>> + const int vec_int_stmt_cost; /* Cost of any int vector operation,
>> + excluding load, store, permute,
>> + vector-to-scalar and
>> + scalar-to-vector operation. */
>> + const int vec_fp_stmt_cost; /* Cost of any fp vector operation,
>> excluding load, store, permute,
>> vector-to-scalar and
>> scalar-to-vector operation. */
>> Index: config/aarch64/aarch64.c
>> ===================================================================
>> --- config/aarch64/aarch64.c (revision 245070)
>> +++ config/aarch64/aarch64.c (working copy)
>> @@ -365,10 +365,12 @@ static const struct cpu_regmove_cost thu
>> /* Generic costs for vector insn classes. */
>> static const struct cpu_vector_cost generic_vector_cost =
>> {
>> - 1, /* scalar_stmt_cost */
>> + 1, /* scalar_int_stmt_cost */
>> + 1, /* scalar_fp_stmt_cost */
>> 1, /* scalar_load_cost */
>> 1, /* scalar_store_cost */
>> - 1, /* vec_stmt_cost */
>> + 1, /* vec_int_stmt_cost */
>> + 1, /* vec_fp_stmt_cost */
>> 2, /* vec_permute_cost */
>> 1, /* vec_to_scalar_cost */
>> 1, /* scalar_to_vec_cost */
>> @@ -383,10 +385,12 @@ static const struct cpu_vector_cost gene
>> /* ThunderX costs for vector insn classes. */
>> static const struct cpu_vector_cost thunderx_vector_cost =
>> {
>> - 1, /* scalar_stmt_cost */
>> + 1, /* scalar_int_stmt_cost */
>> + 1, /* scalar_fp_stmt_cost */
>> 3, /* scalar_load_cost */
>> 1, /* scalar_store_cost */
>> - 4, /* vec_stmt_cost */
>> + 4, /* vec_int_stmt_cost */
>> + 4, /* vec_fp_stmt_cost */
>> 4, /* vec_permute_cost */
>> 2, /* vec_to_scalar_cost */
>> 2, /* scalar_to_vec_cost */
>> @@ -401,10 +405,12 @@ static const struct cpu_vector_cost thun
>> /* Generic costs for vector insn classes. */
>> static const struct cpu_vector_cost cortexa57_vector_cost =
>> {
>> - 1, /* scalar_stmt_cost */
>> + 1, /* scalar_int_stmt_cost */
>> + 1, /* scalar_fp_stmt_cost */
>> 4, /* scalar_load_cost */
>> 1, /* scalar_store_cost */
>> - 2, /* vec_stmt_cost */
>> + 2, /* vec_int_stmt_cost */
>> + 2, /* vec_fp_stmt_cost */
>> 3, /* vec_permute_cost */
>> 8, /* vec_to_scalar_cost */
>> 8, /* scalar_to_vec_cost */
>> @@ -418,10 +424,12 @@ static const struct cpu_vector_cost cort
>>
>> static const struct cpu_vector_cost exynosm1_vector_cost =
>> {
>> - 1, /* scalar_stmt_cost */
>> + 1, /* scalar_int_stmt_cost */
>> + 1, /* scalar_fp_stmt_cost */
>> 5, /* scalar_load_cost */
>> 1, /* scalar_store_cost */
>> - 3, /* vec_stmt_cost */
>> + 3, /* vec_int_stmt_cost */
>> + 3, /* vec_fp_stmt_cost */
>> 3, /* vec_permute_cost */
>> 3, /* vec_to_scalar_cost */
>> 3, /* scalar_to_vec_cost */
>> @@ -436,10 +444,12 @@ static const struct cpu_vector_cost exyn
>> /* Generic costs for vector insn classes. */
>> static const struct cpu_vector_cost xgene1_vector_cost =
>> {
>> - 1, /* scalar_stmt_cost */
>> + 1, /* scalar_int_stmt_cost */
>> + 1, /* scalar_fp_stmt_cost */
>> 5, /* scalar_load_cost */
>> 1, /* scalar_store_cost */
>> - 2, /* vec_stmt_cost */
>> + 2, /* vec_int_stmt_cost */
>> + 2, /* vec_fp_stmt_cost */
>> 2, /* vec_permute_cost */
>> 4, /* vec_to_scalar_cost */
>> 4, /* scalar_to_vec_cost */
>> @@ -454,10 +464,12 @@ static const struct cpu_vector_cost xgen
>> /* Costs for vector insn classes for Vulcan. */
>> static const struct cpu_vector_cost thunderx2t99_vector_cost =
>> {
>> - 6, /* scalar_stmt_cost */
>> + 1, /* scalar_int_stmt_cost */
>> + 6, /* scalar_fp_stmt_cost */
>> 4, /* scalar_load_cost */
>> 1, /* scalar_store_cost */
>> - 6, /* vec_stmt_cost */
>> + 5, /* vec_int_stmt_cost */
>> + 6, /* vec_fp_stmt_cost */
>> 3, /* vec_permute_cost */
>> 6, /* vec_to_scalar_cost */
>> 5, /* scalar_to_vec_cost */
>> @@ -8119,50 +8131,55 @@ aarch64_builtin_vectorization_cost (enum
>> int misalign ATTRIBUTE_UNUSED)
>> {
>> unsigned elements;
>> + const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
>> + bool fp = false;
>> +
>> + if (vectype != NULL)
>> + fp = FLOAT_TYPE_P (vectype);
>>
>> switch (type_of_cost)
>> {
>> case scalar_stmt:
>> - return aarch64_tune_params.vec_costs->scalar_stmt_cost;
>> + return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
>>
>> case scalar_load:
>> - return aarch64_tune_params.vec_costs->scalar_load_cost;
>> + return costs->scalar_load_cost;
>>
>> case scalar_store:
>> - return aarch64_tune_params.vec_costs->scalar_store_cost;
>> + return costs->scalar_store_cost;
>>
>> case vector_stmt:
>> - return aarch64_tune_params.vec_costs->vec_stmt_cost;
>> + return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
>>
>> case vector_load:
>> - return aarch64_tune_params.vec_costs->vec_align_load_cost;
>> + return costs->vec_align_load_cost;
>>
>> case vector_store:
>> - return aarch64_tune_params.vec_costs->vec_store_cost;
>> + return costs->vec_store_cost;
>>
>> case vec_to_scalar:
>> - return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
>> + return costs->vec_to_scalar_cost;
>>
>> case scalar_to_vec:
>> - return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
>> + return costs->scalar_to_vec_cost;
>>
>> case unaligned_load:
>> - return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
>> + return costs->vec_unalign_load_cost;
>>
>> case unaligned_store:
>> - return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
>> + return costs->vec_unalign_store_cost;
>>
>> case cond_branch_taken:
>> - return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
>> + return costs->cond_taken_branch_cost;
>>
>> case cond_branch_not_taken:
>> - return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
>> + return costs->cond_not_taken_branch_cost;
>>
>> case vec_perm:
>> - return aarch64_tune_params.vec_costs->vec_permute_cost;
>> + return costs->vec_permute_cost;
>>
>> case vec_promote_demote:
>> - return aarch64_tune_params.vec_costs->vec_stmt_cost;
>> + return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
>>
>> case vec_construct:
>> elements = TYPE_VECTOR_SUBPARTS (vectype);
>> Index: tree-vect-loop.c
>> ===================================================================
>> --- tree-vect-loop.c (revision 245070)
>> +++ tree-vect-loop.c (working copy)
>> @@ -1329,9 +1329,9 @@ vect_compute_single_scalar_iteration_cos
>> continue;
>>
>> vect_cost_for_stmt kind;
>> - if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
>> + if (STMT_VINFO_DATA_REF (stmt_info))
>> {
>> - if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
>> + if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
>> kind = scalar_load;
>> else
>> kind = scalar_store;
>> @@ -1341,7 +1341,7 @@ vect_compute_single_scalar_iteration_cos
>>
>> scalar_single_iter_cost
>> += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
>> - factor, kind, NULL, 0, vect_prologue);
>> + factor, kind, stmt_info, 0, vect_prologue);
>> }
>> }
>> LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
>> @@ -3178,16 +3178,24 @@ vect_get_known_peeling_cost (loop_vec_in
>> int j;
>> if (peel_iters_prologue)
>> FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
>> - retval += record_stmt_cost (prologue_cost_vec,
>> - si->count * peel_iters_prologue,
>> - si->kind, NULL, si->misalign,
>> - vect_prologue);
>> + {
>> + stmt_vec_info stmt_info
>> + = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
>> + retval += record_stmt_cost (prologue_cost_vec,
>> + si->count * peel_iters_prologue,
>> + si->kind, stmt_info, si->misalign,
>> + vect_prologue);
>> + }
>> if (*peel_iters_epilogue)
>> FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
>> - retval += record_stmt_cost (epilogue_cost_vec,
>> - si->count * *peel_iters_epilogue,
>> - si->kind, NULL, si->misalign,
>> - vect_epilogue);
>> + {
>> + stmt_vec_info stmt_info
>> + = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
>> + retval += record_stmt_cost (epilogue_cost_vec,
>> + si->count * *peel_iters_epilogue,
>> + si->kind, stmt_info, si->misalign,
>> + vect_epilogue);
>> + }
>>
>> return retval;
>> }