This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
RE: Add scatter/gather costs
- From: "Kumar, Venkataramanan" <Venkataramanan dot Kumar at amd dot com>
- To: Jan Hubicka <hubicka at ucw dot cz>, "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
- Date: Thu, 26 Oct 2017 06:48:23 +0000
- Subject: RE: Add scatter/gather costs
- Authentication-results: sourceware.org; auth=none
- Authentication-results: spf=none (sender IP is ) smtp.mailfrom=Venkataramanan dot Kumar at amd dot com;
- References: <20171025191909.GA89979@kam.mff.cuni.cz>
- Spamdiagnosticmetadata: NSPM
- Spamdiagnosticoutput: 1:99
Hi Honza,
> -----Original Message-----
> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
> owner@gcc.gnu.org] On Behalf Of Jan Hubicka
> Sent: Thursday, October 26, 2017 12:49 AM
> To: gcc-patches@gcc.gnu.org
> Subject: Add scatter/gather costs
>
> Hi,
> this patch adds computation of scatter/gather to i386 cost metric.
> The costs for core are set for haswell, skylake has better implementation so I
> will have to split the cost tables for cores older and younger than skylake. I
> will do that as a followup.
>
> Bootstrapped/regtested x86_64-linux, committed.
>
> Honza
>
> * i386.c (ix86_builtin_vectorization_cost): Compute scatter/gather
> cost correctly.
> * i386.h (processor_costs): Add gather_static, gather_per_elt,
> scatter_static, scatter_per_elt.
> * x86-tune-costs.h: Add new cost entries.
> Index: config/i386/i386.c
> ==========================================================
> =========
> --- config/i386/i386.c (revision 254073)
> +++ config/i386/i386.c (working copy)
> @@ -44490,7 +44490,6 @@ ix86_builtin_vectorization_cost (enum ve
> /* We should have separate costs for unaligned loads and gather/scatter.
> Do that incrementally. */
> case unaligned_load:
> - case vector_gather_load:
> index = sse_store_index (mode);
> return ix86_vec_cost (mode,
> COSTS_N_INSNS
> @@ -44498,13 +44497,28 @@ ix86_builtin_vectorization_cost (enum ve
> true);
>
> case unaligned_store:
> - case vector_scatter_store:
> index = sse_store_index (mode);
> return ix86_vec_cost (mode,
> COSTS_N_INSNS
> (ix86_cost->sse_unaligned_store[index]) / 2,
> true);
>
> + case vector_gather_load:
> + return ix86_vec_cost (mode,
> + COSTS_N_INSNS
> + (ix86_cost->gather_static
> + + ix86_cost->gather_per_elt
> + * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
> + true);
> +
> + case vector_scatter_store:
> + return ix86_vec_cost (mode,
> + COSTS_N_INSNS
> + (ix86_cost->scatter_static
> + + ix86_cost->scatter_per_elt
> + * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
> + true);
> +
> case cond_branch_taken:
> return ix86_cost->cond_taken_branch_cost;
>
> Index: config/i386/i386.h
> ==========================================================
> =========
> --- config/i386/i386.h (revision 254073)
> +++ config/i386/i386.h (working copy)
> @@ -253,6 +253,10 @@ struct processor_costs {
> const int mmxsse_to_integer; /* cost of moving mmxsse register to
> integer. */
> const int ssemmx_to_integer; /* cost of moving integer to mmxsse
> register. */
> + const int gather_static, gather_per_elt; /* Cost of gather load is computed
> + as static + per_item * nelts. */
> + const int scatter_static, scatter_per_elt; /* Cost of scatter store is
> + computed as static + per_item * nelts. */
> const int l1_cache_size; /* size of l1 cache, in kilobytes. */
> const int l2_cache_size; /* size of l2 cache, in kilobytes. */
> const int prefetch_block; /* bytes moved to cache for prefetch. */
> Index: config/i386/x86-tune-costs.h
> ==========================================================
> =========
> --- config/i386/x86-tune-costs.h (revision 254073)
> +++ config/i386/x86-tune-costs.h (working copy)
> @@ -82,6 +82,8 @@ struct processor_costs ix86_size_cost =
> {3, 3, 3, 3, 3}, /* cost of unaligned SSE store
> in 128bit, 256bit and 512bit */
> 3, 3, /* SSE->integer and integer->SSE
> moves */
> + 5, 0, /* Gather load static, per_elt. */
> + 5, 0, /* Gather store static, per_elt. */
> 0, /* size of l1 cache */
> 0, /* size of l2 cache */
> 0, /* size of prefetch block */
> @@ -166,6 +168,8 @@ struct processor_costs i386_cost = { /*
> in 32,64,128,256 and 512-bit */
> {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> 3, 3, /* SSE->integer and integer->SSE
> moves */
> + 4, 4, /* Gather load static, per_elt. */
> + 4, 4, /* Gather store static, per_elt. */
> 0, /* size of l1 cache */
> 0, /* size of l2 cache */
> 0, /* size of prefetch block */
> @@ -249,6 +253,8 @@ struct processor_costs i486_cost = { /*
> in 32,64,128,256 and 512-bit */
> {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> 3, 3, /* SSE->integer and integer->SSE
> moves */
> + 4, 4, /* Gather load static, per_elt. */
> + 4, 4, /* Gather store static, per_elt. */
> 4, /* size of l1 cache. 486 has 8kB cache
> shared for code and data, so 4kB is
> not really precise. */
> @@ -334,6 +340,8 @@ struct processor_costs pentium_cost = {
> in 32,64,128,256 and 512-bit */
> {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> 3, 3, /* SSE->integer and integer->SSE
> moves */
> + 4, 4, /* Gather load static, per_elt. */
> + 4, 4, /* Gather store static, per_elt. */
> 8, /* size of l1 cache. */
> 8, /* size of l2 cache */
> 0, /* size of prefetch block */
> @@ -410,6 +418,8 @@ struct processor_costs lakemont_cost = {
> in 32,64,128,256 and 512-bit */
> {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> 3, 3, /* SSE->integer and integer->SSE
> moves */
> + 4, 4, /* Gather load static, per_elt. */
> + 4, 4, /* Gather store static, per_elt. */
> 8, /* size of l1 cache. */
> 8, /* size of l2 cache */
> 0, /* size of prefetch block */
> @@ -501,6 +511,8 @@ struct processor_costs pentiumpro_cost =
> in 32,64,128,256 and 512-bit */
> {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> 3, 3, /* SSE->integer and integer->SSE
> moves */
> + 4, 4, /* Gather load static, per_elt. */
> + 4, 4, /* Gather store static, per_elt. */
> 8, /* size of l1 cache. */
> 256, /* size of l2 cache */
> 32, /* size of prefetch block */
> @@ -584,6 +596,8 @@ struct processor_costs geode_cost = {
> in 32,64,128,256 and 512-bit */
> {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
> 6, 6, /* SSE->integer and integer->SSE
> moves */
> + 2, 2, /* Gather load static, per_elt. */
> + 2, 2, /* Gather store static, per_elt. */
> 64, /* size of l1 cache. */
> 128, /* size of l2 cache. */
> 32, /* size of prefetch block */
> @@ -666,6 +680,8 @@ struct processor_costs k6_cost = {
> in 32,64,128,256 and 512-bit */
> {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
> 6, 6, /* SSE->integer and integer->SSE
> moves */
> + 2, 2, /* Gather load static, per_elt. */
> + 2, 2, /* Gather store static, per_elt. */
> 32, /* size of l1 cache. */
> 32, /* size of l2 cache. Some models
> have integrated l2 cache, but
> @@ -754,6 +770,8 @@ struct processor_costs athlon_cost = {
> in 32,64,128,256 and 512-bit */
> {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
> 5, 5, /* SSE->integer and integer->SSE
> moves */
> + 4, 4, /* Gather load static, per_elt. */
> + 4, 4, /* Gather store static, per_elt. */
> 64, /* size of l1 cache. */
> 256, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -844,6 +862,8 @@ struct processor_costs k8_cost = {
> in 32,64,128,256 and 512-bit */
> {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
> 5, 5, /* SSE->integer and integer->SSE
> moves */
> + 4, 4, /* Gather load static, per_elt. */
> + 4, 4, /* Gather store static, per_elt. */
> 64, /* size of l1 cache. */
> 512, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -946,6 +966,8 @@ struct processor_costs amdfam10_cost = {
> 1/1 1/1
> MOVD reg32, xmmreg Double
> FADD 3
> 1/1 1/1 */
> + 4, 4, /* Gather load static, per_elt. */
> + 4, 4, /* Gather store static, per_elt. */
> 64, /* size of l1 cache. */
> 512, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -1041,6 +1063,8 @@ const struct processor_costs bdver1_cost
> in 32,64,128,256 and 512-bit */
> {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
> 16, 20, /* SSE->integer and integer->SSE
> moves */
> + 12, 12, /* Gather load static, per_elt. */
> + 10, 10, /* Gather store static, per_elt. */
> 16, /* size of l1 cache. */
> 2048, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -1138,6 +1162,8 @@ const struct processor_costs bdver2_cost
> in 32,64,128,256 and 512-bit */
> {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
> 16, 20, /* SSE->integer and integer->SSE
> moves */
> + 12, 12, /* Gather load static, per_elt. */
> + 10, 10, /* Gather store static, per_elt. */
> 16, /* size of l1 cache. */
> 2048, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -1234,6 +1260,8 @@ struct processor_costs bdver3_cost = {
> in 32,64,128,256 and 512-bit */
> {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
> 16, 20, /* SSE->integer and integer->SSE
> moves */
> + 12, 12, /* Gather load static, per_elt. */
> + 10, 10, /* Gather store static, per_elt. */
> 16, /* size of l1 cache. */
> 2048, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -1329,6 +1357,8 @@ struct processor_costs bdver4_cost = {
> in 32,64,128,256 and 512-bit */
> {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
> 16, 20, /* SSE->integer and integer->SSE
> moves */
> + 12, 12, /* Gather load static, per_elt. */
> + 10, 10, /* Gather store static, per_elt. */
> 16, /* size of l1 cache. */
> 2048, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -1435,6 +1465,11 @@ struct processor_costs znver1_cost = {
> in 32,64,128,256 and 512-bit. */
> {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
> 6, 6, /* SSE->integer and integer->SSE
> moves. */
> + /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS is 35 uops,
> + throughput 12. Approx 9 uops do not depend on vector size and every
> load
> + is 7 uops. */
> + 18, 8, /* Gather load static, per_elt. */
> + 18, 10, /* Gather store static, per_elt. */
Can you please help on how you arrived at 18 for the load/store static cost (based on throughput)?
Per_elt is 8 i.e. (latency of load ) 4 * 2 (reg-reg move ) ?
> 32, /* size of l1 cache. */
> 512, /* size of l2 cache. */
> 64, /* size of prefetch block. */
> @@ -1539,6 +1574,8 @@ const struct processor_costs btver1_cost
> in 32,64,128,256 and 512-bit */
> {10, 10, 12, 24, 48}, /* cost of unaligned stores. */
> 14, 14, /* SSE->integer and integer->SSE
> moves */
> + 10, 10, /* Gather load static, per_elt. */
> + 10, 10, /* Gather store static, per_elt. */
> 32, /* size of l1 cache. */
> 512, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -1624,6 +1661,8 @@ const struct processor_costs btver2_cost
> in 32,64,128,256 and 512-bit */
> {10, 10, 12, 24, 48}, /* cost of unaligned stores. */
> 14, 14, /* SSE->integer and integer->SSE
> moves */
> + 10, 10, /* Gather load static, per_elt. */
> + 10, 10, /* Gather store static, per_elt. */
> 32, /* size of l1 cache. */
> 2048, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -1708,6 +1747,8 @@ struct processor_costs pentium4_cost = {
> in 32,64,128,256 and 512-bit */
> {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
> 20, 12, /* SSE->integer and integer->SSE
> moves */
> + 16, 16, /* Gather load static, per_elt. */
> + 16, 16, /* Gather store static, per_elt. */
> 8, /* size of l1 cache. */
> 256, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -1795,6 +1836,8 @@ struct processor_costs nocona_cost = {
> in 32,64,128,256 and 512-bit */
> {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
> 20, 12, /* SSE->integer and integer->SSE
> moves */
> + 12, 12, /* Gather load static, per_elt. */
> + 12, 12, /* Gather store static, per_elt. */
> 8, /* size of l1 cache. */
> 1024, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -1880,6 +1923,8 @@ struct processor_costs atom_cost = {
> in 32,64,128,256 and 512-bit */
> {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
> 8, 6, /* SSE->integer and integer->SSE
> moves */
> + 8, 8, /* Gather load static, per_elt. */
> + 8, 8, /* Gather store static, per_elt. */
> 32, /* size of l1 cache. */
> 256, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -1965,6 +2010,8 @@ struct processor_costs slm_cost = {
> in 32,64,128,256 and 512-bit */
> {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
> 8, 6, /* SSE->integer and integer->SSE
> moves */
> + 8, 8, /* Gather load static, per_elt. */
> + 8, 8, /* Gather store static, per_elt. */
> 32, /* size of l1 cache. */
> 256, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -2050,6 +2097,8 @@ struct processor_costs intel_cost = {
> in 32,64,128,256 and 512-bit */
> {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
> 4, 4, /* SSE->integer and integer->SSE
> moves */
> + 6, 6, /* Gather load static, per_elt. */
> + 6, 6, /* Gather store static, per_elt. */
> 32, /* size of l1 cache. */
> 256, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -2142,6 +2191,8 @@ struct processor_costs generic_cost = {
> in 32,64,128,256 and 512-bit */
> {10, 10, 10, 15, 20}, /* cost of unaligned stores. */
> 20, 20, /* SSE->integer and integer->SSE
> moves */
> + 6, 6, /* Gather load static, per_elt. */
> + 6, 6, /* Gather store static, per_elt. */
> 32, /* size of l1 cache. */
> 512, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -2239,6 +2290,11 @@ struct processor_costs core_cost = {
> in 32,64,128,256 and 512-bit */
> {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
> 2, 2, /* SSE->integer and integer->SSE
> moves */
> + /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPS is 9 uops,
> + rec. throughput 6.
> + So 5 uops statically and one uops per load. */
> + 10, 6, /* Gather load static, per_elt. */
> + 10, 6, /* Gather store static, per_elt. */
> 64, /* size of l1 cache. */
> 512, /* size of l2 cache. */
> 64, /* size of prefetch block */
Regards,
Venkat.