This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
RE: Add scatter/gather costs
- From: "Kumar, Venkataramanan" <Venkataramanan dot Kumar at amd dot com>
- To: Jan Hubicka <hubicka at ucw dot cz>, "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
- Date: Thu, 26 Oct 2017 06:48:23 +0000
- Subject: RE: Add scatter/gather costs
- Authentication-results: sourceware.org; auth=none
- Authentication-results: spf=none (sender IP is ) smtp.mailfrom=Venkataramanan dot Kumar at amd dot com;
- References: <20171025191909.GA89979@kam.mff.cuni.cz>
- Spamdiagnosticmetadata: NSPM
- Spamdiagnosticoutput: 1:99
Hi Honza,
> -----Original Message-----
> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
> owner@gcc.gnu.org] On Behalf Of Jan Hubicka
> Sent: Thursday, October 26, 2017 12:49 AM
> To: gcc-patches@gcc.gnu.org
> Subject: Add scatter/gather costs
>
> Hi,
> this patch adds computation of scatter/gather to i386 cost metric.
> The costs for core are set for haswell, skylake has better implementation so I
> will have to split the cost tables for cores older and younger than skylake. I
> will do that as a followup.
>
> Bootstrapped/regtested x86_64-linux, committed.
>
> Honza
>
> * i386.c (ix86_builtin_vectorization_cost): Compute scatter/gather
> cost correctly.
> * i386.h (processor_costs): Add gather_static, gather_per_elt,
> scatter_static, scatter_per_elt.
> * x86-tune-costs.h: Add new cost entries.
> Index: config/i386/i386.c
> ==========================================================
> =========
> --- config/i386/i386.c (revision 254073)
> +++ config/i386/i386.c (working copy)
> @@ -44490,7 +44490,6 @@ ix86_builtin_vectorization_cost (enum ve
> /* We should have separate costs for unaligned loads and gather/scatter.
> Do that incrementally. */
> case unaligned_load:
> - case vector_gather_load:
> index = sse_store_index (mode);
> return ix86_vec_cost (mode,
> COSTS_N_INSNS
> @@ -44498,13 +44497,28 @@ ix86_builtin_vectorization_cost (enum ve
> true);
>
> case unaligned_store:
> - case vector_scatter_store:
> index = sse_store_index (mode);
> return ix86_vec_cost (mode,
> COSTS_N_INSNS
> (ix86_cost->sse_unaligned_store[index]) / 2,
> true);
>
> + case vector_gather_load:
> + return ix86_vec_cost (mode,
> + COSTS_N_INSNS
> + (ix86_cost->gather_static
> + + ix86_cost->gather_per_elt
> + * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
> + true);
> +
> + case vector_scatter_store:
> + return ix86_vec_cost (mode,
> + COSTS_N_INSNS
> + (ix86_cost->scatter_static
> + + ix86_cost->scatter_per_elt
> + * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
> + true);
> +
> case cond_branch_taken:
> return ix86_cost->cond_taken_branch_cost;
>
> Index: config/i386/i386.h
> ==========================================================
> =========
> --- config/i386/i386.h (revision 254073)
> +++ config/i386/i386.h (working copy)
> @@ -253,6 +253,10 @@ struct processor_costs {
> const int mmxsse_to_integer; /* cost of moving mmxsse register to
> integer. */
> const int ssemmx_to_integer; /* cost of moving integer to mmxsse
> register. */
> + const int gather_static, gather_per_elt; /* Cost of gather load is computed
> + as static + per_item * nelts. */
> + const int scatter_static, scatter_per_elt; /* Cost of scatter store is
> + computed as static + per_item * nelts. */
> const int l1_cache_size; /* size of l1 cache, in kilobytes. */
> const int l2_cache_size; /* size of l2 cache, in kilobytes. */
> const int prefetch_block; /* bytes moved to cache for prefetch. */
> Index: config/i386/x86-tune-costs.h
> ==========================================================
> =========
> --- config/i386/x86-tune-costs.h (revision 254073)
> +++ config/i386/x86-tune-costs.h (working copy)
> @@ -82,6 +82,8 @@ struct processor_costs ix86_size_cost =
> {3, 3, 3, 3, 3}, /* cost of unaligned SSE store
> in 128bit, 256bit and 512bit */
> 3, 3, /* SSE->integer and integer->SSE
> moves */
> + 5, 0, /* Gather load static, per_elt. */
> + 5, 0, /* Gather store static, per_elt. */
> 0, /* size of l1 cache */
> 0, /* size of l2 cache */
> 0, /* size of prefetch block */
> @@ -166,6 +168,8 @@ struct processor_costs i386_cost = { /*
> in 32,64,128,256 and 512-bit */
> {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> 3, 3, /* SSE->integer and integer->SSE
> moves */
> + 4, 4, /* Gather load static, per_elt. */
> + 4, 4, /* Gather store static, per_elt. */
> 0, /* size of l1 cache */
> 0, /* size of l2 cache */
> 0, /* size of prefetch block */
> @@ -249,6 +253,8 @@ struct processor_costs i486_cost = { /*
> in 32,64,128,256 and 512-bit */
> {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> 3, 3, /* SSE->integer and integer->SSE
> moves */
> + 4, 4, /* Gather load static, per_elt. */
> + 4, 4, /* Gather store static, per_elt. */
> 4, /* size of l1 cache. 486 has 8kB cache
> shared for code and data, so 4kB is
> not really precise. */
> @@ -334,6 +340,8 @@ struct processor_costs pentium_cost = {
> in 32,64,128,256 and 512-bit */
> {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> 3, 3, /* SSE->integer and integer->SSE
> moves */
> + 4, 4, /* Gather load static, per_elt. */
> + 4, 4, /* Gather store static, per_elt. */
> 8, /* size of l1 cache. */
> 8, /* size of l2 cache */
> 0, /* size of prefetch block */
> @@ -410,6 +418,8 @@ struct processor_costs lakemont_cost = {
> in 32,64,128,256 and 512-bit */
> {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> 3, 3, /* SSE->integer and integer->SSE
> moves */
> + 4, 4, /* Gather load static, per_elt. */
> + 4, 4, /* Gather store static, per_elt. */
> 8, /* size of l1 cache. */
> 8, /* size of l2 cache */
> 0, /* size of prefetch block */
> @@ -501,6 +511,8 @@ struct processor_costs pentiumpro_cost =
> in 32,64,128,256 and 512-bit */
> {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
> 3, 3, /* SSE->integer and integer->SSE
> moves */
> + 4, 4, /* Gather load static, per_elt. */
> + 4, 4, /* Gather store static, per_elt. */
> 8, /* size of l1 cache. */
> 256, /* size of l2 cache */
> 32, /* size of prefetch block */
> @@ -584,6 +596,8 @@ struct processor_costs geode_cost = {
> in 32,64,128,256 and 512-bit */
> {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
> 6, 6, /* SSE->integer and integer->SSE
> moves */
> + 2, 2, /* Gather load static, per_elt. */
> + 2, 2, /* Gather store static, per_elt. */
> 64, /* size of l1 cache. */
> 128, /* size of l2 cache. */
> 32, /* size of prefetch block */
> @@ -666,6 +680,8 @@ struct processor_costs k6_cost = {
> in 32,64,128,256 and 512-bit */
> {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
> 6, 6, /* SSE->integer and integer->SSE
> moves */
> + 2, 2, /* Gather load static, per_elt. */
> + 2, 2, /* Gather store static, per_elt. */
> 32, /* size of l1 cache. */
> 32, /* size of l2 cache. Some models
> have integrated l2 cache, but
> @@ -754,6 +770,8 @@ struct processor_costs athlon_cost = {
> in 32,64,128,256 and 512-bit */
> {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
> 5, 5, /* SSE->integer and integer->SSE
> moves */
> + 4, 4, /* Gather load static, per_elt. */
> + 4, 4, /* Gather store static, per_elt. */
> 64, /* size of l1 cache. */
> 256, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -844,6 +862,8 @@ struct processor_costs k8_cost = {
> in 32,64,128,256 and 512-bit */
> {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
> 5, 5, /* SSE->integer and integer->SSE
> moves */
> + 4, 4, /* Gather load static, per_elt. */
> + 4, 4, /* Gather store static, per_elt. */
> 64, /* size of l1 cache. */
> 512, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -946,6 +966,8 @@ struct processor_costs amdfam10_cost = {
> 1/1 1/1
> MOVD reg32, xmmreg Double
> FADD 3
> 1/1 1/1 */
> + 4, 4, /* Gather load static, per_elt. */
> + 4, 4, /* Gather store static, per_elt. */
> 64, /* size of l1 cache. */
> 512, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -1041,6 +1063,8 @@ const struct processor_costs bdver1_cost
> in 32,64,128,256 and 512-bit */
> {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
> 16, 20, /* SSE->integer and integer->SSE
> moves */
> + 12, 12, /* Gather load static, per_elt. */
> + 10, 10, /* Gather store static, per_elt. */
> 16, /* size of l1 cache. */
> 2048, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -1138,6 +1162,8 @@ const struct processor_costs bdver2_cost
> in 32,64,128,256 and 512-bit */
> {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
> 16, 20, /* SSE->integer and integer->SSE
> moves */
> + 12, 12, /* Gather load static, per_elt. */
> + 10, 10, /* Gather store static, per_elt. */
> 16, /* size of l1 cache. */
> 2048, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -1234,6 +1260,8 @@ struct processor_costs bdver3_cost = {
> in 32,64,128,256 and 512-bit */
> {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
> 16, 20, /* SSE->integer and integer->SSE
> moves */
> + 12, 12, /* Gather load static, per_elt. */
> + 10, 10, /* Gather store static, per_elt. */
> 16, /* size of l1 cache. */
> 2048, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -1329,6 +1357,8 @@ struct processor_costs bdver4_cost = {
> in 32,64,128,256 and 512-bit */
> {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
> 16, 20, /* SSE->integer and integer->SSE
> moves */
> + 12, 12, /* Gather load static, per_elt. */
> + 10, 10, /* Gather store static, per_elt. */
> 16, /* size of l1 cache. */
> 2048, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -1435,6 +1465,11 @@ struct processor_costs znver1_cost = {
> in 32,64,128,256 and 512-bit. */
> {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
> 6, 6, /* SSE->integer and integer->SSE
> moves. */
> + /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS is 35 uops,
> + throughput 12. Approx 9 uops do not depend on vector size and every
> load
> + is 7 uops. */
> + 18, 8, /* Gather load static, per_elt. */
> + 18, 10, /* Gather store static, per_elt. */
Can you please help on how you arrived at 18 for the load/store static cost (based on throughput)?
Per_elt is 8 i.e. (latency of load ) 4 * 2 (reg-reg move ) ?
> 32, /* size of l1 cache. */
> 512, /* size of l2 cache. */
> 64, /* size of prefetch block. */
> @@ -1539,6 +1574,8 @@ const struct processor_costs btver1_cost
> in 32,64,128,256 and 512-bit */
> {10, 10, 12, 24, 48}, /* cost of unaligned stores. */
> 14, 14, /* SSE->integer and integer->SSE
> moves */
> + 10, 10, /* Gather load static, per_elt. */
> + 10, 10, /* Gather store static, per_elt. */
> 32, /* size of l1 cache. */
> 512, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -1624,6 +1661,8 @@ const struct processor_costs btver2_cost
> in 32,64,128,256 and 512-bit */
> {10, 10, 12, 24, 48}, /* cost of unaligned stores. */
> 14, 14, /* SSE->integer and integer->SSE
> moves */
> + 10, 10, /* Gather load static, per_elt. */
> + 10, 10, /* Gather store static, per_elt. */
> 32, /* size of l1 cache. */
> 2048, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -1708,6 +1747,8 @@ struct processor_costs pentium4_cost = {
> in 32,64,128,256 and 512-bit */
> {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
> 20, 12, /* SSE->integer and integer->SSE
> moves */
> + 16, 16, /* Gather load static, per_elt. */
> + 16, 16, /* Gather store static, per_elt. */
> 8, /* size of l1 cache. */
> 256, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -1795,6 +1836,8 @@ struct processor_costs nocona_cost = {
> in 32,64,128,256 and 512-bit */
> {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
> 20, 12, /* SSE->integer and integer->SSE
> moves */
> + 12, 12, /* Gather load static, per_elt. */
> + 12, 12, /* Gather store static, per_elt. */
> 8, /* size of l1 cache. */
> 1024, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -1880,6 +1923,8 @@ struct processor_costs atom_cost = {
> in 32,64,128,256 and 512-bit */
> {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
> 8, 6, /* SSE->integer and integer->SSE
> moves */
> + 8, 8, /* Gather load static, per_elt. */
> + 8, 8, /* Gather store static, per_elt. */
> 32, /* size of l1 cache. */
> 256, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -1965,6 +2010,8 @@ struct processor_costs slm_cost = {
> in 32,64,128,256 and 512-bit */
> {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
> 8, 6, /* SSE->integer and integer->SSE
> moves */
> + 8, 8, /* Gather load static, per_elt. */
> + 8, 8, /* Gather store static, per_elt. */
> 32, /* size of l1 cache. */
> 256, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -2050,6 +2097,8 @@ struct processor_costs intel_cost = {
> in 32,64,128,256 and 512-bit */
> {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
> 4, 4, /* SSE->integer and integer->SSE
> moves */
> + 6, 6, /* Gather load static, per_elt. */
> + 6, 6, /* Gather store static, per_elt. */
> 32, /* size of l1 cache. */
> 256, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -2142,6 +2191,8 @@ struct processor_costs generic_cost = {
> in 32,64,128,256 and 512-bit */
> {10, 10, 10, 15, 20}, /* cost of unaligned stores. */
> 20, 20, /* SSE->integer and integer->SSE
> moves */
> + 6, 6, /* Gather load static, per_elt. */
> + 6, 6, /* Gather store static, per_elt. */
> 32, /* size of l1 cache. */
> 512, /* size of l2 cache. */
> 64, /* size of prefetch block */
> @@ -2239,6 +2290,11 @@ struct processor_costs core_cost = {
> in 32,64,128,256 and 512-bit */
> {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
> 2, 2, /* SSE->integer and integer->SSE
> moves */
> + /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPS is 9 uops,
> + rec. throughput 6.
> + So 5 uops statically and one uops per load. */
> + 10, 6, /* Gather load static, per_elt. */
> + 10, 6, /* Gather store static, per_elt. */
> 64, /* size of l1 cache. */
> 512, /* size of l2 cache. */
> 64, /* size of prefetch block */
Regards,
Venkat.