


Re: Add scatter/gather costs


> Hi Honza, 
> 
> > +  /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS is 35 uops,
> > +     throughput 12.  Approx 9 uops do not depend on vector size and every load
> > +     is 7 uops.  */
> > +  18, 8,				/* Gather load static, per_elt.  */
> > +  18, 10,				/* Gather store static, per_elt.  */
> 
> Could you please explain how you arrived at 18 for the load/store static cost (based on throughput)?
> Per_elt is 8, i.e. 4 (latency of load) * 2 (reg-reg move)?

From the number of uops it seemed that gather is roughly 9+7*n, where n is the
number of entries.  A reg-reg move is 2, so 18 is 9*2.  I think we need to account
for the fact that the CPU is still doing n independent load operations (so gather
saves nothing compared to scalar code) and a bit more.  The load cost is set to 6
(perhaps it should be 8 for integer and more for FP?), so I went for 8 to make the
per-element cost a bit more expensive.
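To make the arithmetic concrete, here is a minimal sketch (made-up names, not the
actual cost hook in the patch) of how the static and per-element numbers are meant
to combine.  Costs are in units where a reg-reg move is 2:

/* Hypothetical helper, for illustration only.  */
static int
gather_cost (int static_cost, int per_elt_cost, int n_elts)
{
  /* znver1: ~9 fixed uops scale to a static cost of 18, and each of the
     n element loads is charged 8, slightly above the plain load cost of 6.  */
  return static_cost + per_elt_cost * n_elts;
}

For example, a 4-element gather on znver1 comes out as 18 + 4*8 = 50, against
roughly 4*6 = 24 for four independent scalar loads, which matches the intent of
making gathers a bit more expensive than the equivalent scalar loads.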

I plan to experiment with the values incrementally, so any suggestions are welcome.
Honza
>  
> 
> >    32,					/* size of l1 cache.  */
> >    512,					/* size of l2 cache.  */
> >    64,					/* size of prefetch block.  */
> > @@ -1539,6 +1574,8 @@ const struct processor_costs btver1_cost
> >  					   in 32,64,128,256 and 512-bit */
> >    {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
> >    14, 14,				/* SSE->integer and integer->SSE moves */
> > +  10, 10,				/* Gather load static, per_elt.  */
> > +  10, 10,				/* Gather store static, per_elt.  */
> >    32,					/* size of l1 cache.  */
> >    512,					/* size of l2 cache.  */
> >    64,					/* size of prefetch block */
> > @@ -1624,6 +1661,8 @@ const struct processor_costs btver2_cost
> >  					   in 32,64,128,256 and 512-bit */
> >    {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
> >    14, 14,				/* SSE->integer and integer->SSE moves */
> > +  10, 10,				/* Gather load static, per_elt.  */
> > +  10, 10,				/* Gather store static, per_elt.  */
> >    32,					/* size of l1 cache.  */
> >    2048,					/* size of l2 cache.  */
> >    64,					/* size of prefetch block */
> > @@ -1708,6 +1747,8 @@ struct processor_costs pentium4_cost = {
> >  					   in 32,64,128,256 and 512-bit */
> >    {32, 32, 32, 64, 128},		/* cost of unaligned stores.  */
> >    20, 12,				/* SSE->integer and integer->SSE moves */
> > +  16, 16,				/* Gather load static, per_elt.  */
> > +  16, 16,				/* Gather store static, per_elt.  */
> >    8,					/* size of l1 cache.  */
> >    256,					/* size of l2 cache.  */
> >    64,					/* size of prefetch block */
> > @@ -1795,6 +1836,8 @@ struct processor_costs nocona_cost = {
> >  					   in 32,64,128,256 and 512-bit */
> >    {24, 24, 24, 48, 96},			/* cost of unaligned stores.  */
> >    20, 12,				/* SSE->integer and integer->SSE moves */
> > +  12, 12,				/* Gather load static, per_elt.  */
> > +  12, 12,				/* Gather store static, per_elt.  */
> >    8,					/* size of l1 cache.  */
> >    1024,					/* size of l2 cache.  */
> >    64,					/* size of prefetch block */
> > @@ -1880,6 +1923,8 @@ struct processor_costs atom_cost = {
> >  					   in 32,64,128,256 and 512-bit */
> >    {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
> >    8, 6,					/* SSE->integer and integer->SSE moves */
> > +  8, 8,					/* Gather load static, per_elt.  */
> > +  8, 8,					/* Gather store static, per_elt.  */
> >    32,					/* size of l1 cache.  */
> >    256,					/* size of l2 cache.  */
> >    64,					/* size of prefetch block */
> > @@ -1965,6 +2010,8 @@ struct processor_costs slm_cost = {
> >  					   in 32,64,128,256 and 512-bit */
> >    {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
> >    8, 6,					/* SSE->integer and integer->SSE moves */
> > +  8, 8,					/* Gather load static, per_elt.  */
> > +  8, 8,					/* Gather store static, per_elt.  */
> >    32,					/* size of l1 cache.  */
> >    256,					/* size of l2 cache.  */
> >    64,					/* size of prefetch block */
> > @@ -2050,6 +2097,8 @@ struct processor_costs intel_cost = {
> >  					   in 32,64,128,256 and 512-bit */
> >    {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
> >    4, 4,					/* SSE->integer and integer->SSE moves */
> > +  6, 6,					/* Gather load static, per_elt.  */
> > +  6, 6,					/* Gather store static, per_elt.  */
> >    32,					/* size of l1 cache.  */
> >    256,					/* size of l2 cache.  */
> >    64,					/* size of prefetch block */
> > @@ -2142,6 +2191,8 @@ struct processor_costs generic_cost = {
> >  					   in 32,64,128,256 and 512-bit */
> >    {10, 10, 10, 15, 20},			/* cost of unaligned stores.  */
> >    20, 20,				/* SSE->integer and integer->SSE moves */
> > +  6, 6,					/* Gather load static, per_elt.  */
> > +  6, 6,					/* Gather store static, per_elt.  */
> >    32,					/* size of l1 cache.  */
> >    512,					/* size of l2 cache.  */
> >    64,					/* size of prefetch block */
> > @@ -2239,6 +2290,11 @@ struct processor_costs core_cost = {
> >  					   in 32,64,128,256 and 512-bit */
> >    {6, 6, 6, 6, 12},			/* cost of unaligned stores.  */
> >    2, 2,					/* SSE->integer and integer->SSE moves */
> > +  /* VGATHERDPD is 7 uops, rec. throughput 5, while VGATHERDPS is 9 uops,
> > +     rec. throughput 6.
> > +     So 5 uops statically and one uop per load.  */
> > +  10, 6,				/* Gather load static, per_elt.  */
> > +  10, 6,				/* Gather store static, per_elt.  */
> >    64,					/* size of l1 cache.  */
> >    512,					/* size of l2 cache.  */
> >    64,					/* size of prefetch block */
> 
> Regards,
> Venkat.

