This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH] x86: Tune Skylake, Cannonlake and Icelake as Haswell
On Fri, Jul 13, 2018 at 9:07 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
>> > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
>> > > > index 9e46b7b136f..762ab89fc9e 100644
>> > > > --- a/gcc/config/i386/i386.c
>> > > > +++ b/gcc/config/i386/i386.c
>> > > > @@ -137,17 +137,22 @@ const struct processor_costs *ix86_cost = NULL;
>> > > > #define m_CORE2 (HOST_WIDE_INT_1U<<PROCESSOR_CORE2)
>> > > > #define m_NEHALEM (HOST_WIDE_INT_1U<<PROCESSOR_NEHALEM)
>> > > > #define m_SANDYBRIDGE (HOST_WIDE_INT_1U<<PROCESSOR_SANDYBRIDGE)
>> > > > -#define m_HASWELL (HOST_WIDE_INT_1U<<PROCESSOR_HASWELL)
>> > > > +#define m_HASWELL ((HOST_WIDE_INT_1U<<PROCESSOR_HASWELL) \
>> > > > + | (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE) \
>> > > > + | (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE_AVX512) \
>> > > > + | (HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE) \
>> > > > + | (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT) \
>> > > > + | (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER))
>> > > >
>> > >
>> > > Please introduce a new per-family define and group processors in this
>> > > define. Something like m_BDVER, m_BTVER and m_AMD_MULTIPLE for AMD
>> > targets.
>> > > We should not redefine m_HASWELL to include unrelated families.
>> > >
>> >
>> > Here is the updated patch. OK for trunk if all tests pass?
>> >
>> >
>> OK.
>
> We have also noticed that benchmarks on skylake are not good compared to
> haswell, this nicely explains it. I think this is -march=native regression
> compared to GCC versions that did not suppored better CPUs than Haswell. So it
> would be nice to backport it.
Yes, we should. Here is the patch to backport to GCC 8. OK for GCC 8 after
it has been checked into trunk?
Thanks.
--
H.J.
From 40a1050b330b421a1f445cb2a40b5a002da2e6d6 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 4 Jun 2018 19:16:06 -0700
Subject: [PATCH] x86: Tune Skylake, Cannonlake and Icelake as Haswell
r259399, which added PROCESSOR_SKYLAKE, disabled many x86 optimizations
which are enabled by PROCESSOR_HASWELL. As the result, -mtune=skylake
generates slower codes on Skylake than before. The same also applies
to Cannonlake and Icelak tuning.
This patch changes -mtune={skylake|cannonlake|icelake} to tune like
-mtune=haswell for until their tuning is properly adjusted. It also
enables -mprefer-vector-width=256 for -mtune=haswell, which has no
impact on codegen when AVX512 isn't enabled.
Performance impacts on SPEC CPU 2017 rate with 1 copy using
-march=native -mfpmath=sse -O2 -m64
are
1. On Broadwell server:
500.perlbench_r -0.56%
502.gcc_r -0.18%
505.mcf_r 0.24%
520.omnetpp_r 0.00%
523.xalancbmk_r -0.32%
525.x264_r -0.17%
531.deepsjeng_r 0.00%
541.leela_r 0.00%
548.exchange2_r 0.12%
557.xz_r 0.00%
Geomean 0.00%
503.bwaves_r 0.00%
507.cactuBSSN_r 0.21%
508.namd_r 0.00%
510.parest_r 0.19%
511.povray_r -0.48%
519.lbm_r 0.00%
521.wrf_r 0.28%
526.blender_r 0.19%
527.cam4_r 0.39%
538.imagick_r 0.00%
544.nab_r -0.36%
549.fotonik3d_r 0.51%
554.roms_r 0.00%
Geomean 0.17%
On Skylake client:
500.perlbench_r 0.96%
502.gcc_r 0.13%
505.mcf_r -1.03%
520.omnetpp_r -1.11%
523.xalancbmk_r 1.02%
525.x264_r 0.50%
531.deepsjeng_r 2.97%
541.leela_r 0.50%
548.exchange2_r -0.95%
557.xz_r 2.41%
Geomean 0.56%
503.bwaves_r 0.49%
507.cactuBSSN_r 3.17%
508.namd_r 4.05%
510.parest_r 0.15%
511.povray_r 0.80%
519.lbm_r 3.15%
521.wrf_r 10.56%
526.blender_r 2.97%
527.cam4_r 2.36%
538.imagick_r 46.40%
544.nab_r 2.04%
549.fotonik3d_r 0.00%
554.roms_r 1.27%
Geomean 5.49%
On Skylake server:
500.perlbench_r 0.71%
502.gcc_r -0.51%
505.mcf_r -1.06%
520.omnetpp_r -0.33%
523.xalancbmk_r -0.22%
525.x264_r 1.72%
531.deepsjeng_r -0.26%
541.leela_r 0.57%
548.exchange2_r -0.75%
557.xz_r -1.28%
Geomean -0.21%
503.bwaves_r 0.00%
507.cactuBSSN_r 2.66%
508.namd_r 3.67%
510.parest_r 1.25%
511.povray_r 2.26%
519.lbm_r 1.69%
521.wrf_r 11.03%
526.blender_r 3.39%
527.cam4_r 1.69%
538.imagick_r 64.59%
544.nab_r -0.54%
549.fotonik3d_r 2.68%
554.roms_r 0.00%
Geomean 6.19%
This patch improves -march=native performance on Skylake up to 60% and
leaves -march=native performance unchanged on Haswell.
gcc/
Backport from mainline
2018-07-12 H.J. Lu <hongjiu.lu@intel.com>
Sunil K Pandey <sunil.k.pandey@intel.com>
PR target/84413
* config/i386/i386.c (m_CORE_AVX512): New.
(m_CORE_AVX2): Likewise.
(m_CORE_ALL): Add m_CORE_AVX2.
* config/i386/x86-tune.def: Replace m_HASWELL with m_CORE_AVX2.
Replace m_SKYLAKE_AVX512 with m_CORE_AVX512 on avx256_optimal
and remove the rest of m_SKYLAKE_AVX512.
gcc/testsuite/
Backport from mainline
2018-07-12 H.J. Lu <hongjiu.lu@intel.com>
Sunil K Pandey <sunil.k.pandey@intel.com>
PR target/84413
* gcc.target/i386/pr84413-1.c: New test.
* gcc.target/i386/pr84413-2.c: Likewise.
* gcc.target/i386/pr84413-3.c: Likewise.
* gcc.target/i386/pr84413-4.c: Likewise.
---
gcc/config/i386/i386.c | 5 ++++-
gcc/config/i386/x86-tune.def | 26 +++++++++++------------
gcc/testsuite/gcc.target/i386/pr84413-1.c | 17 +++++++++++++++
gcc/testsuite/gcc.target/i386/pr84413-2.c | 17 +++++++++++++++
gcc/testsuite/gcc.target/i386/pr84413-3.c | 17 +++++++++++++++
gcc/testsuite/gcc.target/i386/pr84413-4.c | 17 +++++++++++++++
6 files changed, 85 insertions(+), 14 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-3.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-4.c
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index d7dad81786a..8a032371e7f 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -139,7 +139,6 @@ const struct processor_costs *ix86_cost = NULL;
#define m_NEHALEM (HOST_WIDE_INT_1U<<PROCESSOR_NEHALEM)
#define m_SANDYBRIDGE (HOST_WIDE_INT_1U<<PROCESSOR_SANDYBRIDGE)
#define m_HASWELL (HOST_WIDE_INT_1U<<PROCESSOR_HASWELL)
-#define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
#define m_BONNELL (HOST_WIDE_INT_1U<<PROCESSOR_BONNELL)
#define m_SILVERMONT (HOST_WIDE_INT_1U<<PROCESSOR_SILVERMONT)
#define m_KNL (HOST_WIDE_INT_1U<<PROCESSOR_KNL)
@@ -149,6 +148,10 @@ const struct processor_costs *ix86_cost = NULL;
#define m_CANNONLAKE (HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE)
#define m_ICELAKE_CLIENT (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT)
#define m_ICELAKE_SERVER (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER)
+#define m_CORE_AVX512 (m_SKYLAKE_AVX512 | m_CANNONLAKE \
+ | m_ICELAKE_CLIENT | m_ICELAKE_SERVER)
+#define m_CORE_AVX2 (m_HASWELL | m_SKYLAKE | m_CORE_AVX512)
+#define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2)
#define m_INTEL (HOST_WIDE_INT_1U<<PROCESSOR_INTEL)
#define m_GEODE (HOST_WIDE_INT_1U<<PROCESSOR_GEODE)
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 60625668236..c99e45cba58 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -48,9 +48,9 @@ DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
over partial stores. For example preffer MOVZBL or MOVQ to load 8bit
value over movb. */
DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
- m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL
+ m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2
| m_BONNELL | m_SILVERMONT | m_INTEL
- | m_KNL | m_KNM | m_AMD_MULTIPLE | m_SKYLAKE_AVX512 | m_GENERIC)
+ | m_KNL | m_KNM | m_AMD_MULTIPLE | m_GENERIC)
/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store
destinations to be 128bit to allow register renaming on 128bit SSE units,
@@ -84,8 +84,8 @@ DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
partial dependencies. */
DEF_TUNE (X86_TUNE_MOVX, "movx",
m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE
- | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_HASWELL
- | m_GEODE | m_AMD_MULTIPLE | m_SKYLAKE_AVX512 | m_GENERIC)
+ | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_CORE_AVX2
+ | m_GEODE | m_AMD_MULTIPLE | m_GENERIC)
/* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
full sized loads. */
@@ -101,19 +101,19 @@ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32",
/* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent
conditional jump instruction for TARGET_64BIT. */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64",
- m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_ZNVER1 | m_GENERIC)
+ m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER | m_ZNVER1 | m_GENERIC)
/* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a
subsequent conditional jump instruction when the condition jump
check sign flag (SF) or overflow flag (OF). */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
- m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_ZNVER1 | m_GENERIC)
+ m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER | m_ZNVER1 | m_GENERIC)
/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
jump instruction when the alu instruction produces the CCFLAG consumed by
the conditional jump instruction. */
DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
- m_SANDYBRIDGE | m_HASWELL | m_GENERIC)
+ m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC)
/*****************************************************************************/
@@ -286,7 +286,7 @@ DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
/* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency
for bit-manipulation instructions. */
DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
- m_SANDYBRIDGE | m_HASWELL | m_GENERIC)
+ m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC)
/* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
on hardware capabilities. Bdver3 hardware has a loop buffer which makes
@@ -335,15 +335,15 @@ DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
/* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead
of a sequence loading registers by parts. */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
- m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_SILVERMONT | m_KNL | m_KNM
- | m_INTEL | m_SKYLAKE_AVX512 | m_AMDFAM10 | m_BDVER | m_BTVER
+ m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM
+ | m_INTEL | m_AMDFAM10 | m_BDVER | m_BTVER
| m_ZNVER1 | m_GENERIC)
/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores instead
of a sequence loading registers by parts. */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
- m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_SILVERMONT | m_KNL | m_KNM
- | m_INTEL | m_SKYLAKE_AVX512 | m_BDVER | m_ZNVER1 | m_GENERIC)
+ m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM
+ | m_INTEL | m_BDVER | m_ZNVER1 | m_GENERIC)
/* Use packed single precision instructions where posisble. I.e. movups instead
of movupd. */
@@ -429,7 +429,7 @@ DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
/* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX
instructions in the auto-vectorizer. */
-DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_SKYLAKE_AVX512)
+DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)
/*****************************************************************************/
/* Historical relics: tuning flags that helps a specific old CPU designs */
diff --git a/gcc/testsuite/gcc.target/i386/pr84413-1.c b/gcc/testsuite/gcc.target/i386/pr84413-1.c
new file mode 100644
index 00000000000..1c94d7715cf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr84413-1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=skylake-avx512" } */
+/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
+/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */
+
+#define N 1024
+
+double a[N], b[N], c[N];
+
+void
+avx512f_test (void)
+{
+ int i;
+
+ for (i = 0; i < N; i++)
+ c[i] = a[i] * b[i];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr84413-2.c b/gcc/testsuite/gcc.target/i386/pr84413-2.c
new file mode 100644
index 00000000000..adf9b527cd6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr84413-2.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=cannonlake" } */
+/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
+/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */
+
+#define N 1024
+
+double a[N], b[N], c[N];
+
+void
+avx512f_test (void)
+{
+ int i;
+
+ for (i = 0; i < N; i++)
+ c[i] = a[i] * b[i];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr84413-3.c b/gcc/testsuite/gcc.target/i386/pr84413-3.c
new file mode 100644
index 00000000000..76bf25fc56b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr84413-3.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=icelake-server" } */
+/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
+/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */
+
+#define N 1024
+
+double a[N], b[N], c[N];
+
+void
+avx512f_test (void)
+{
+ int i;
+
+ for (i = 0; i < N; i++)
+ c[i] = a[i] * b[i];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr84413-4.c b/gcc/testsuite/gcc.target/i386/pr84413-4.c
new file mode 100644
index 00000000000..031ef0c8916
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr84413-4.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=haswell -mavx512f" } */
+/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
+/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */
+
+#define N 1024
+
+double a[N], b[N], c[N];
+
+void
+avx512f_test (void)
+{
+ int i;
+
+ for (i = 0; i < N; i++)
+ c[i] = a[i] * b[i];
+}
--
2.17.1