[PATCH][i386] Fix vec_construct cost, remove unused ix86_vec_cost arg

Thu Oct 11 10:11:00 GMT 2018

On Thu, 11 Oct 2018, Richard Biener wrote:

> 
> The following fixes vec_construct cost calculation to properly consider
> that the inserts will happen to SSE regs thus forgo the multiplication
> done in ix86_vec_cost which is passed the wrong mode.  This gets rid of
> the only call passing false to ix86_vec_cost (so consider the patch
> amended to remove the arg if approved).
> 
> Bootstrapped and tested on x86_64-unknown-linux-gnu.
> 
> OK for trunk?
> 
> I am considering to make the factor we apply in ix86_vec_cost
> which currently depends on X86_TUNE_AVX128_OPTIMAL and
> X86_TUNE_SSE_SPLIT_REGS part of the actual cost tables since
> the reason we apply them are underlying CPU architecture details.
> Was the original reason of doing the multiplication based on
> those tunings to be able to "share" the same basic cost table
> across architectures that differ in this important detail?
> I see X86_TUNE_SSE_SPLIT_REGS is only used for m_ATHLON_K8
> and X86_TUNE_AVX128_OPTIMAL is used for m_BDVER, m_BTVER2
> and m_ZNVER1.  Those all have (multiple) exclusive processor_cost_table
> entries.
> 
> As a first step I'd like to remove the use of ix86_vec_cost for
> the entries that already have entries for multiple modes
> (loads and stores) and apply the factor there.  For example
> Zen can do two 128bit loads per cycle but only one 128bit store.
> With multiplying AVX256 costs by two we seem to cost sth like
> # instructions to dispatch * instruction latency which is an
> odd thing.  I'd have expected # instructions to dispatch / instruction 
> throughput * instruction latency - so a AVX256 add would cost
> the same as a AVX128 add, likewise for loads but stores would be
> more expensive because of the throughput issue.  This all
> ignores resource utilization across multiple insns but that's
> how the cost model works ...

So like the following which removes the use of ix86_vec_cost
for SSE loads and stores since we have per-mode costs already.
I've applied the relevant factor to the individual cost tables
(noting that for X86_TUNE_SSE_SPLIT_REGS we only apply the
multiplication for size == 128, not size >= 128 ...)

There's a ??? hunk in inline_memory_move_cost where we
failed to apply the scaling thus in that place we'd now have
a behavior change.  Alternatively I could leave the cost
tables unaltered if that costing part is more critical than
the vectorizer one.

I've also spotted, when reviewing ix86_vec_cost uses, a bug
in ix86_rtx_cost which keys on SFmode which doesn't work
for SSE modes, thus use GET_MODE_INNER.

Also I've changed X86_TUNE_AVX128_OPTIMAL to also apply
to BTVER1 - everywhere else we glob BTVER1 and BTVER2 so
this must surely be a omission.

Honza - is a patch like this OK?

Should I split out individual fixes to make bisection possible?

Should I update the cost tables or instead change the vectorizer
costing when considering the inline_memory_move_cost "issue"?

Thanks,
Richard.

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 0cf4152acb2..f5392232f61 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -39432,6 +39432,7 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
       int index = sse_store_index (mode);
       if (index == -1)
 	return 100;
+      /* ??? */
       if (in == 2)
         return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
       return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
@@ -40183,7 +40181,8 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
         gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
 
         *total = ix86_vec_cost (mode,
-				mode == SFmode ? cost->fmass : cost->fmasd,
+				GET_MODE_INNER (mode) == SFmode
+				? cost->fmass : cost->fmasd,
 				true);
 	*total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
 
@@ -45122,18 +45121,14 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
 	/* See PR82713 - we may end up being called on non-vector type.  */
 	if (index < 0)
 	  index = 2;
-        return ix86_vec_cost (mode,
-			      COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2,
-			      true);
+        return COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2;
 
       case vector_store:
 	index = sse_store_index (mode);
 	/* See PR82713 - we may end up being called on non-vector type.  */
 	if (index < 0)
 	  index = 2;
-        return ix86_vec_cost (mode,
-			      COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2,
-			      true);
+        return COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2;
 
       case vec_to_scalar:
       case scalar_to_vec:
@@ -45146,20 +45141,14 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
 	/* See PR82713 - we may end up being called on non-vector type.  */
 	if (index < 0)
 	  index = 2;
-        return ix86_vec_cost (mode,
-			      COSTS_N_INSNS
-				 (ix86_cost->sse_unaligned_load[index]) / 2,
-			      true);
+        return COSTS_N_INSNS (ix86_cost->sse_unaligned_load[index]) / 2;
 
       case unaligned_store:
 	index = sse_store_index (mode);
 	/* See PR82713 - we may end up being called on non-vector type.  */
 	if (index < 0)
 	  index = 2;
-        return ix86_vec_cost (mode,
-			      COSTS_N_INSNS
-				 (ix86_cost->sse_unaligned_store[index]) / 2,
-			      true);
+        return COSTS_N_INSNS (ix86_cost->sse_unaligned_store[index]) / 2;
 
       case vector_gather_load:
         return ix86_vec_cost (mode,
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 9c8ae0a7841..59d0a8b17d0 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -795,12 +795,12 @@ struct processor_costs athlon_cost = {
   {4, 4},				/* cost of storing MMX registers
 					   in SImode and DImode */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {4, 4, 6, 12, 24},			/* cost of loading SSE registers
+  {4, 4, 12, 12, 24},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {4, 4, 6, 12, 24},			/* cost of unaligned loads.  */
-  {4, 4, 5, 10, 20},			/* cost of storing SSE registers
+  {4, 4, 12, 12, 24},			/* cost of unaligned loads.  */
+  {4, 4, 10, 10, 20},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
+  {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
   5, 5,					/* SSE->integer and integer->SSE moves */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
@@ -891,12 +891,12 @@ struct processor_costs k8_cost = {
   {4, 4},				/* cost of storing MMX registers
 					   in SImode and DImode */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {4, 3, 6, 12, 24},			/* cost of loading SSE registers
+  {4, 3, 12, 12, 24},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {4, 3, 6, 12, 24},			/* cost of unaligned loads.  */
-  {4, 4, 5, 10, 20},			/* cost of storing SSE registers
+  {4, 3, 12, 12, 24},			/* cost of unaligned loads.  */
+  {4, 4, 10, 10, 20},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
+  {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
   5, 5,					/* SSE->integer and integer->SSE moves */
   4, 4,					/* Gather load static, per_elt.  */
   4, 4,					/* Gather store static, per_elt.  */
@@ -1100,12 +1100,12 @@ const struct processor_costs bdver1_cost = {
   {10, 10},				/* cost of storing MMX registers
 					   in SImode and DImode */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 10, 20, 30},			/* cost of loading SSE registers
+  {12, 12, 10, 40, 60},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
-  {10, 10, 10, 20, 30},			/* cost of storing SSE registers
+  {12, 12, 10, 40, 60},			/* cost of unaligned loads.  */
+  {10, 10, 10, 40, 60},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
+  {10, 10, 10, 40, 60},			/* cost of unaligned stores.  */
   16, 20,				/* SSE->integer and integer->SSE moves */
   12, 12,				/* Gather load static, per_elt.  */
   10, 10,				/* Gather store static, per_elt.  */
@@ -1203,12 +1203,12 @@ const struct processor_costs bdver2_cost = {
   {10, 10},				/* cost of storing MMX registers
 					   in SImode and DImode */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 10, 20, 30},			/* cost of loading SSE registers
+  {12, 12, 10, 40, 60},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
-  {10, 10, 10, 20, 30},			/* cost of storing SSE registers
+  {12, 12, 10, 40, 60},			/* cost of unaligned loads.  */
+  {10, 10, 10, 40, 60},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
+  {10, 10, 10, 40, 60},			/* cost of unaligned stores.  */
   16, 20,				/* SSE->integer and integer->SSE moves */
   12, 12,				/* Gather load static, per_elt.  */
   10, 10,				/* Gather store static, per_elt.  */
@@ -1305,12 +1305,12 @@ struct processor_costs bdver3_cost = {
   {10, 10},				/* cost of storing MMX registers
 					   in SImode and DImode */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 10, 20, 30},			/* cost of loading SSE registers
+  {12, 12, 10, 40, 60},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
-  {10, 10, 10, 20, 30},			/* cost of storing SSE registers
+  {12, 12, 10, 40, 60},			/* cost of unaligned loads.  */
+  {10, 10, 10, 40, 60},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
+  {10, 10, 10, 40, 60},			/* cost of unaligned stores.  */
   16, 20,				/* SSE->integer and integer->SSE moves */
   12, 12,				/* Gather load static, per_elt.  */
   10, 10,				/* Gather store static, per_elt.  */
@@ -1406,12 +1406,12 @@ struct processor_costs bdver4_cost = {
   {10, 10},				/* cost of storing MMX registers
 					   in SImode and DImode */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 10, 20, 30},			/* cost of loading SSE registers
+  {12, 12, 10, 40, 60},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
-  {10, 10, 10, 20, 30},			/* cost of storing SSE registers
+  {12, 12, 10, 40, 60},			/* cost of unaligned loads.  */
+  {10, 10, 10, 40, 60},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
+  {10, 10, 10, 40, 60},			/* cost of unaligned stores.  */
   16, 20,				/* SSE->integer and integer->SSE moves */
   12, 12,				/* Gather load static, per_elt.  */
   10, 10,				/* Gather store static, per_elt.  */
@@ -1518,12 +1518,12 @@ struct processor_costs znver1_cost = {
   {8, 8},				/* cost of storing MMX registers
 					   in SImode and DImode.  */
   2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
-  {6, 6, 6, 6, 12},			/* cost of loading SSE registers
+  {6, 6, 6, 12, 24},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit.  */
-  {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
-  {8, 8, 8, 8, 16},			/* cost of storing SSE registers
+  {6, 6, 6, 12, 24},			/* cost of unaligned loads.  */
+  {8, 8, 8, 16, 32},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit.  */
-  {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
+  {8, 8, 8, 16, 32},			/* cost of unaligned stores.  */
   6, 6,					/* SSE->integer and integer->SSE moves.  */
   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
      throughput 12.  Approx 9 uops do not depend on vector size and every load
@@ -1726,12 +1726,12 @@ const struct processor_costs btver1_cost = {
   {12, 12},				/* cost of storing MMX registers
 					   in SImode and DImode */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {10, 10, 12, 24, 48},			/* cost of loading SSE registers
+  {10, 10, 12, 48, 96},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {10, 10, 12, 24, 48},			/* cost of unaligned loads.  */
-  {10, 10, 12, 24, 48},			/* cost of storing SSE registers
+  {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
+  {10, 10, 12, 48, 96},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
+  {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
   14, 14,				/* SSE->integer and integer->SSE moves */
   10, 10,				/* Gather load static, per_elt.  */
   10, 10,				/* Gather store static, per_elt.  */
@@ -1817,12 +1817,12 @@ const struct processor_costs btver2_cost = {
   {12, 12},				/* cost of storing MMX registers
 					   in SImode and DImode */
   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
-  {10, 10, 12, 24, 48},			/* cost of loading SSE registers
+  {10, 10, 12, 48, 96},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {10, 10, 12, 24, 48},			/* cost of unaligned loads.  */
-  {10, 10, 12, 24, 48},			/* cost of storing SSE registers
+  {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
+  {10, 10, 12, 48, 96},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit */
-  {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
+  {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
   14, 14,				/* SSE->integer and integer->SSE moves */
   10, 10,				/* Gather load static, per_elt.  */
   10, 10,				/* Gather store static, per_elt.  */
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index a46450ad99d..ad8661075aa 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -441,7 +441,7 @@ DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal"
 
 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
    the auto-vectorizer.  */
-DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
+DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER
 	  | m_ZNVER1)
 
 /* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX