This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH][i386] Fix PR83358 - increase divide/mod latencies a bit
- From: Markus Trippelsdorf <markus at trippelsdorf dot de>
- To: gcc-patches at gcc dot gnu dot org
- Cc: Julia Koval <julia dot koval at intel dot com>, Uros Bizjak <ubizjak at gmail dot com>, Jan Hubicka <hubicka at ucw dot cz>
- Date: Tue, 12 Dec 2017 09:28:48 +0100
- Subject: [PATCH][i386] Fix PR83358 - increase divide/mod latencies a bit
- Authentication-results: sourceware.org; auth=none
As the testcase shows, trunk currently generates horrible code for
divisions used in tight loops. This happens because the algorithm
expanding div/mod doesn't take parallelism into account and this makes
the cost model unrealistic.
Fix the issue by increasing the estimated latencies a bit.
Tested on X86_64.
OK for trunk?
(I will keep an eye on the periodic SPEC testers.)
PR target/83358
* x86-tune-costs.h (skylake_cost, core_cost): Increase div/mod
latencies a bit.
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 312467d9788f..648219338308 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -1541,9 +1541,11 @@ struct processor_costs skylake_cost = {
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (4)}, /* other */
0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (8), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (8), /* HI */
- COSTS_N_INSNS (11), /* SI */
+ /* Expanding div/mod currently doesn't consider parallelism. So the cost
+ model is not realistic. We compensate by increasing the latencies a bit. */
+ {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (11), /* HI */
+ COSTS_N_INSNS (14), /* SI */
COSTS_N_INSNS (76), /* DI */
COSTS_N_INSNS (76)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
@@ -2342,11 +2344,11 @@ struct processor_costs core_cost = {
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (4)}, /* other */
0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (8), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (8), /* HI */
- /* 8-11 */
- COSTS_N_INSNS (11), /* SI */
- /* 24-81 */
+ /* Expanding div/mod currently doesn't consider parallelism. So the cost
+ model is not realistic. We compensate by increasing the latencies a bit. */
+ {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (11), /* HI */
+ COSTS_N_INSNS (14), /* SI */
COSTS_N_INSNS (81), /* DI */
COSTS_N_INSNS (81)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
diff --git a/gcc/testsuite/gcc.target/i386/pr83358-1.c b/gcc/testsuite/gcc.target/i386/pr83358-1.c
new file mode 100644
index 000000000000..96427b2f56dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr83358-1.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=core2" } */
+
+#include <stdint.h>
+
+void bin2ascii(uint64_t val, char *dst) {
+ const int64_t POW10_10 = ((int64_t)10) * 1000 * 1000 * 1000;
+ int64_t hix = val / POW10_10;
+ int64_t lox = val % POW10_10;
+ int32_t v0 = hix / 100000;
+ int32_t v1 = hix % 100000;
+ int32_t v2 = lox / 100000;
+ int32_t v3 = lox % 100000;
+ for (int i = 4; i != 0; --i) {
+ dst[i + 0 * 5] = v0 % 10 + '0';
+ v0 /= 10;
+ dst[i + 1 * 5] = v1 % 10 + '0';
+ v1 /= 10;
+ dst[i + 2 * 5] = v2 % 10 + '0';
+ v2 /= 10;
+ dst[i + 3 * 5] = v3 % 10 + '0';
+ v3 /= 10;
+ }
+ dst[0 * 5] = v0 + '0';
+ dst[1 * 5] = v1 + '0';
+ dst[2 * 5] = v2 + '0';
+ dst[3 * 5] = v3 + '0';
+ dst[4 * 5] = 0;
+}
+
+/* { dg-final { scan-assembler-not "idiv" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr83358-2.c b/gcc/testsuite/gcc.target/i386/pr83358-2.c
new file mode 100644
index 000000000000..f6039bf72feb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr83358-2.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=skylake-avx512" } */
+
+#include <stdint.h>
+
+void bin2ascii(uint64_t val, char *dst) {
+ const int64_t POW10_10 = ((int64_t)10) * 1000 * 1000 * 1000;
+ int64_t hix = val / POW10_10;
+ int64_t lox = val % POW10_10;
+ int32_t v0 = hix / 100000;
+ int32_t v1 = hix % 100000;
+ int32_t v2 = lox / 100000;
+ int32_t v3 = lox % 100000;
+ for (int i = 4; i != 0; --i) {
+ dst[i + 0 * 5] = v0 % 10 + '0';
+ v0 /= 10;
+ dst[i + 1 * 5] = v1 % 10 + '0';
+ v1 /= 10;
+ dst[i + 2 * 5] = v2 % 10 + '0';
+ v2 /= 10;
+ dst[i + 3 * 5] = v3 % 10 + '0';
+ v3 /= 10;
+ }
+ dst[0 * 5] = v0 + '0';
+ dst[1 * 5] = v1 + '0';
+ dst[2 * 5] = v2 + '0';
+ dst[3 * 5] = v3 + '0';
+ dst[4 * 5] = 0;
+}
+
+/* { dg-final { scan-assembler-not "idiv" } } */
--
Markus