This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH][i386] Fix vec_construct cost, remove unused ix86_vec_cost arg
>
> So like the following which removes the use of ix86_vec_cost
> for SSE loads and stores since we have per-mode costs already.
> I've applied the relevant factor to the individual cost tables
> (noting that for X86_TUNE_SSE_SPLIT_REGS we only apply the
> multiplication for size == 128, not size >= 128 ...)
>
> There's a ??? hunk in inline_memory_move_cost where we
> failed to apply the scaling thus in that place we'd now have
> a behavior change. Alternatively I could leave the cost
> tables unaltered if that costing part is more critical than
> the vectorizer one.
Changing the behaviour (applying the scale there) seems like the
right way to go to me...
>
> I've also spotted, when reviewing ix86_vec_cost uses, a bug
> in ix86_rtx_cost which keys on SFmode which doesn't work
> for SSE modes, thus use GET_MODE_INNER.
>
> Also I've changed X86_TUNE_AVX128_OPTIMAL to also apply
> to BTVER1 - everywhere else we glob BTVER1 and BTVER2 so
> this must surely be an omission.
BTVER1 did not have AVX :)
>
> Honza - is a patch like this OK?
Looks OK to me. Splitting up individual changes is up to you.
I think it is not that dramatic a change so hopefully regressions
won't be that hard to analyze.
>
> Should I split out individual fixes to make bisection possible?
>
> Should I update the cost tables or instead change the vectorizer
> costing when considering the inline_memory_move_cost "issue"?
Looks like memory move cost should do the right thing now after your patch?
Having larger loads/stores more expensive seems correct to me.
Patch is OK, without the ??? comment ;)
Honza
>
> Thanks,
> Richard.
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 0cf4152acb2..f5392232f61 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -39432,6 +39432,7 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
> int index = sse_store_index (mode);
> if (index == -1)
> return 100;
> + /* ??? */
> if (in == 2)
> return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
> return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
> @@ -40183,7 +40181,8 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
> gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
>
> *total = ix86_vec_cost (mode,
> - mode == SFmode ? cost->fmass : cost->fmasd,
> + GET_MODE_INNER (mode) == SFmode
> + ? cost->fmass : cost->fmasd,
> true);
> *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
>
> @@ -45122,18 +45121,14 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> /* See PR82713 - we may end up being called on non-vector type. */
> if (index < 0)
> index = 2;
> - return ix86_vec_cost (mode,
> - COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2,
> - true);
> + return COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2;
>
> case vector_store:
> index = sse_store_index (mode);
> /* See PR82713 - we may end up being called on non-vector type. */
> if (index < 0)
> index = 2;
> - return ix86_vec_cost (mode,
> - COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2,
> - true);
> + return COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2;
>
> case vec_to_scalar:
> case scalar_to_vec:
> @@ -45146,20 +45141,14 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> /* See PR82713 - we may end up being called on non-vector type. */
> if (index < 0)
> index = 2;
> - return ix86_vec_cost (mode,
> - COSTS_N_INSNS
> - (ix86_cost->sse_unaligned_load[index]) / 2,
> - true);
> + return COSTS_N_INSNS (ix86_cost->sse_unaligned_load[index]) / 2;
>
> case unaligned_store:
> index = sse_store_index (mode);
> /* See PR82713 - we may end up being called on non-vector type. */
> if (index < 0)
> index = 2;
> - return ix86_vec_cost (mode,
> - COSTS_N_INSNS
> - (ix86_cost->sse_unaligned_store[index]) / 2,
> - true);
> + return COSTS_N_INSNS (ix86_cost->sse_unaligned_store[index]) / 2;
>
> case vector_gather_load:
> return ix86_vec_cost (mode,
> diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> index 9c8ae0a7841..59d0a8b17d0 100644
> --- a/gcc/config/i386/x86-tune-costs.h
> +++ b/gcc/config/i386/x86-tune-costs.h
> @@ -795,12 +795,12 @@ struct processor_costs athlon_cost = {
> {4, 4}, /* cost of storing MMX registers
> in SImode and DImode */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> - {4, 4, 6, 12, 24}, /* cost of loading SSE registers
> + {4, 4, 12, 12, 24}, /* cost of loading SSE registers
> in 32,64,128,256 and 512-bit */
> - {4, 4, 6, 12, 24}, /* cost of unaligned loads. */
> - {4, 4, 5, 10, 20}, /* cost of storing SSE registers
> + {4, 4, 12, 12, 24}, /* cost of unaligned loads. */
> + {4, 4, 10, 10, 20}, /* cost of storing SSE registers
> in 32,64,128,256 and 512-bit */
> - {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
> + {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
> 5, 5, /* SSE->integer and integer->SSE moves */
> 4, 4, /* Gather load static, per_elt. */
> 4, 4, /* Gather store static, per_elt. */
> @@ -891,12 +891,12 @@ struct processor_costs k8_cost = {
> {4, 4}, /* cost of storing MMX registers
> in SImode and DImode */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> - {4, 3, 6, 12, 24}, /* cost of loading SSE registers
> + {4, 3, 12, 12, 24}, /* cost of loading SSE registers
> in 32,64,128,256 and 512-bit */
> - {4, 3, 6, 12, 24}, /* cost of unaligned loads. */
> - {4, 4, 5, 10, 20}, /* cost of storing SSE registers
> + {4, 3, 12, 12, 24}, /* cost of unaligned loads. */
> + {4, 4, 10, 10, 20}, /* cost of storing SSE registers
> in 32,64,128,256 and 512-bit */
> - {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
> + {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
> 5, 5, /* SSE->integer and integer->SSE moves */
> 4, 4, /* Gather load static, per_elt. */
> 4, 4, /* Gather store static, per_elt. */
> @@ -1100,12 +1100,12 @@ const struct processor_costs bdver1_cost = {
> {10, 10}, /* cost of storing MMX registers
> in SImode and DImode */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> - {12, 12, 10, 20, 30}, /* cost of loading SSE registers
> + {12, 12, 10, 40, 60}, /* cost of loading SSE registers
> in 32,64,128,256 and 512-bit */
> - {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
> - {10, 10, 10, 20, 30}, /* cost of storing SSE registers
> + {12, 12, 10, 40, 60}, /* cost of unaligned loads. */
> + {10, 10, 10, 40, 60}, /* cost of storing SSE registers
> in 32,64,128,256 and 512-bit */
> - {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
> + {10, 10, 10, 40, 60}, /* cost of unaligned stores. */
> 16, 20, /* SSE->integer and integer->SSE moves */
> 12, 12, /* Gather load static, per_elt. */
> 10, 10, /* Gather store static, per_elt. */
> @@ -1203,12 +1203,12 @@ const struct processor_costs bdver2_cost = {
> {10, 10}, /* cost of storing MMX registers
> in SImode and DImode */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> - {12, 12, 10, 20, 30}, /* cost of loading SSE registers
> + {12, 12, 10, 40, 60}, /* cost of loading SSE registers
> in 32,64,128,256 and 512-bit */
> - {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
> - {10, 10, 10, 20, 30}, /* cost of storing SSE registers
> + {12, 12, 10, 40, 60}, /* cost of unaligned loads. */
> + {10, 10, 10, 40, 60}, /* cost of storing SSE registers
> in 32,64,128,256 and 512-bit */
> - {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
> + {10, 10, 10, 40, 60}, /* cost of unaligned stores. */
> 16, 20, /* SSE->integer and integer->SSE moves */
> 12, 12, /* Gather load static, per_elt. */
> 10, 10, /* Gather store static, per_elt. */
> @@ -1305,12 +1305,12 @@ struct processor_costs bdver3_cost = {
> {10, 10}, /* cost of storing MMX registers
> in SImode and DImode */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> - {12, 12, 10, 20, 30}, /* cost of loading SSE registers
> + {12, 12, 10, 40, 60}, /* cost of loading SSE registers
> in 32,64,128,256 and 512-bit */
> - {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
> - {10, 10, 10, 20, 30}, /* cost of storing SSE registers
> + {12, 12, 10, 40, 60}, /* cost of unaligned loads. */
> + {10, 10, 10, 40, 60}, /* cost of storing SSE registers
> in 32,64,128,256 and 512-bit */
> - {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
> + {10, 10, 10, 40, 60}, /* cost of unaligned stores. */
> 16, 20, /* SSE->integer and integer->SSE moves */
> 12, 12, /* Gather load static, per_elt. */
> 10, 10, /* Gather store static, per_elt. */
> @@ -1406,12 +1406,12 @@ struct processor_costs bdver4_cost = {
> {10, 10}, /* cost of storing MMX registers
> in SImode and DImode */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> - {12, 12, 10, 20, 30}, /* cost of loading SSE registers
> + {12, 12, 10, 40, 60}, /* cost of loading SSE registers
> in 32,64,128,256 and 512-bit */
> - {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
> - {10, 10, 10, 20, 30}, /* cost of storing SSE registers
> + {12, 12, 10, 40, 60}, /* cost of unaligned loads. */
> + {10, 10, 10, 40, 60}, /* cost of storing SSE registers
> in 32,64,128,256 and 512-bit */
> - {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
> + {10, 10, 10, 40, 60}, /* cost of unaligned stores. */
> 16, 20, /* SSE->integer and integer->SSE moves */
> 12, 12, /* Gather load static, per_elt. */
> 10, 10, /* Gather store static, per_elt. */
> @@ -1518,12 +1518,12 @@ struct processor_costs znver1_cost = {
> {8, 8}, /* cost of storing MMX registers
> in SImode and DImode. */
> 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
> - {6, 6, 6, 6, 12}, /* cost of loading SSE registers
> + {6, 6, 6, 12, 24}, /* cost of loading SSE registers
> in 32,64,128,256 and 512-bit. */
> - {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
> - {8, 8, 8, 8, 16}, /* cost of storing SSE registers
> + {6, 6, 6, 12, 24}, /* cost of unaligned loads. */
> + {8, 8, 8, 16, 32}, /* cost of storing SSE registers
> in 32,64,128,256 and 512-bit. */
> - {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
> + {8, 8, 8, 16, 32}, /* cost of unaligned stores. */
> 6, 6, /* SSE->integer and integer->SSE moves. */
> /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> throughput 12. Approx 9 uops do not depend on vector size and every load
> @@ -1726,12 +1726,12 @@ const struct processor_costs btver1_cost = {
> {12, 12}, /* cost of storing MMX registers
> in SImode and DImode */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> - {10, 10, 12, 24, 48}, /* cost of loading SSE registers
> + {10, 10, 12, 48, 96}, /* cost of loading SSE registers
> in 32,64,128,256 and 512-bit */
> - {10, 10, 12, 24, 48}, /* cost of unaligned loads. */
> - {10, 10, 12, 24, 48}, /* cost of storing SSE registers
> + {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
> + {10, 10, 12, 48, 96}, /* cost of storing SSE registers
> in 32,64,128,256 and 512-bit */
> - {10, 10, 12, 24, 48}, /* cost of unaligned stores. */
> + {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
> 14, 14, /* SSE->integer and integer->SSE moves */
> 10, 10, /* Gather load static, per_elt. */
> 10, 10, /* Gather store static, per_elt. */
> @@ -1817,12 +1817,12 @@ const struct processor_costs btver2_cost = {
> {12, 12}, /* cost of storing MMX registers
> in SImode and DImode */
> 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> - {10, 10, 12, 24, 48}, /* cost of loading SSE registers
> + {10, 10, 12, 48, 96}, /* cost of loading SSE registers
> in 32,64,128,256 and 512-bit */
> - {10, 10, 12, 24, 48}, /* cost of unaligned loads. */
> - {10, 10, 12, 24, 48}, /* cost of storing SSE registers
> + {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
> + {10, 10, 12, 48, 96}, /* cost of storing SSE registers
> in 32,64,128,256 and 512-bit */
> - {10, 10, 12, 24, 48}, /* cost of unaligned stores. */
> + {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
> 14, 14, /* SSE->integer and integer->SSE moves */
> 10, 10, /* Gather load static, per_elt. */
> 10, 10, /* Gather store static, per_elt. */
> diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> index a46450ad99d..ad8661075aa 100644
> --- a/gcc/config/i386/x86-tune.def
> +++ b/gcc/config/i386/x86-tune.def
> @@ -441,7 +441,7 @@ DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal"
>
> /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
> the auto-vectorizer. */
> -DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
> +DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER
> | m_ZNVER1)
>
> /* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX
>