This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH] Improve i?86/x86_64 prologue_and_epilogue for leaf functions (PR target/59501)
- From: Jakub Jelinek <jakub at redhat dot com>
- To: Jan Hubicka <hubicka at ucw dot cz>, Richard Henderson <rth at redhat dot com>, "H.J. Lu" <hjl dot tools at gmail dot com>
- Cc: gcc-patches at gcc dot gnu dot org
- Date: Fri, 20 Dec 2013 17:06:56 +0100
- Subject: [PATCH] Improve i?86/x86_64 prologue_and_epilogue for leaf functions (PR target/59501)
- Authentication-results: sourceware.org; auth=none
- Reply-to: Jakub Jelinek <jakub at redhat dot com>
Hi!
Honza recently changed the i?86 backend, so that it often doesn't
do -maccumulate-outgoing-args by default on x86_64.
Unfortunately, on some of the testcases included here this regressed
the generated code quite a bit. As AVX vectors are used, the dynamic
realignment code needs to assume e.g. that some of them will need to be
spilled, and for -mno-accumulate-outgoing-args the code needs to set
need_drap early as well. But when emitting the prologue/epilogue,
if need_drap is set, we don't perform the optimization for leaf functions
which have a zero size stack frame, thus we end up uselessly doing
dynamic stack realignment, setting up DRAP that nothing uses and later on
restoring everything back.
This patch improves that: if the DRAP register isn't live at the start of
the entry bb's successor and we aren't going to realign the stack, we don't
need DRAP at all. And even if we do need the DRAP register, that can't be the
sole reason for doing stack realignment, since the prologue code is able to
set up DRAP even without dynamic stack realignment.
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
2013-12-20 Jakub Jelinek <jakub@redhat.com>
PR target/59501
* config/i386/i386.c (ix86_save_reg): Don't return true for drap_reg
if !crtl->stack_realign_needed.
(ix86_finalize_stack_realign_flags): If drap_reg isn't live on entry
and stack_realign_needed will be false, clear drap_reg and need_drap.
Optimize leaf functions that don't need stack frame even if
crtl->need_drap.
* gcc.target/i386/pr59501-1.c: New test.
* gcc.target/i386/pr59501-1a.c: New test.
* gcc.target/i386/pr59501-2.c: New test.
* gcc.target/i386/pr59501-2a.c: New test.
* gcc.target/i386/pr59501-3.c: New test.
* gcc.target/i386/pr59501-3a.c: New test.
* gcc.target/i386/pr59501-4.c: New test.
* gcc.target/i386/pr59501-4a.c: New test.
* gcc.target/i386/pr59501-5.c: New test.
* gcc.target/i386/pr59501-6.c: New test.
--- gcc/config/i386/i386.c.jj 2013-12-19 13:35:23.000000000 +0100
+++ gcc/config/i386/i386.c 2013-12-20 11:44:14.389310804 +0100
@@ -9235,7 +9235,9 @@ ix86_save_reg (unsigned int regno, bool
}
}
- if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
+ if (crtl->drap_reg
+ && regno == REGNO (crtl->drap_reg)
+ && crtl->stack_realign_needed)
return true;
return (df_regs_ever_live_p (regno)
@@ -10473,12 +10475,23 @@ ix86_finalize_stack_realign_flags (void)
return;
}
+ /* If drap has been set, but it actually isn't live at the start
+ of the function and !stack_realign, there is no reason to set it up. */
+ if (crtl->drap_reg && !stack_realign)
+ {
+ basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
+ if (! REGNO_REG_SET_P (DF_LR_IN (bb), REGNO (crtl->drap_reg)))
+ {
+ crtl->drap_reg = NULL_RTX;
+ crtl->need_drap = false;
+ }
+ }
+
/* If the only reason for frame_pointer_needed is that we conservatively
assumed stack realignment might be needed, but in the end nothing that
needed the stack alignment had been spilled, clear frame_pointer_needed
and say we don't need stack realignment. */
if (stack_realign
- && !crtl->need_drap
&& frame_pointer_needed
&& crtl->is_leaf
&& flag_omit_frame_pointer
@@ -10516,6 +10529,18 @@ ix86_finalize_stack_realign_flags (void)
}
}
+ /* If drap has been set, but it actually isn't live at the start
+ of the function, there is no reason to set it up. */
+ if (crtl->drap_reg)
+ {
+ basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
+ if (! REGNO_REG_SET_P (DF_LR_IN (bb), REGNO (crtl->drap_reg)))
+ {
+ crtl->drap_reg = NULL_RTX;
+ crtl->need_drap = false;
+ }
+ }
+
frame_pointer_needed = false;
stack_realign = false;
crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
--- gcc/testsuite/gcc.target/i386/pr59501-2.c.jj 2013-12-20 12:02:08.754662741 +0100
+++ gcc/testsuite/gcc.target/i386/pr59501-2.c 2013-12-20 12:02:04.665668734 +0100
@@ -0,0 +1,5 @@
+/* PR target/59501 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -maccumulate-outgoing-args" } */
+
+#include "pr59501-1.c"
--- gcc/testsuite/gcc.target/i386/pr59501-1.c.jj 2013-12-20 12:01:44.253781613 +0100
+++ gcc/testsuite/gcc.target/i386/pr59501-1.c 2013-12-20 12:12:26.715391613 +0100
@@ -0,0 +1,30 @@
+/* PR target/59501 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -mno-accumulate-outgoing-args" } */
+
+#define CHECK_H "avx-check.h"
+#define TEST avx_test
+
+#include CHECK_H
+
+typedef double V __attribute__ ((vector_size (32)));
+
+__attribute__((noinline, noclone)) V
+foo (double *x, unsigned *y)
+{
+ V r = { x[y[0]], x[y[1]], x[y[2]], x[y[3]] };
+ return r;
+}
+
+static void
+TEST (void)
+{
+ double a[16];
+ unsigned b[4] = { 5, 0, 15, 7 };
+ int i;
+ for (i = 0; i < 16; i++)
+ a[i] = 0.5 + i;
+ V v = foo (a, b);
+ if (v[0] != 5.5 || v[1] != 0.5 || v[2] != 15.5 || v[3] != 7.5)
+ __builtin_abort ();
+}
--- gcc/testsuite/gcc.target/i386/pr59501-4a.c.jj 2013-12-20 12:19:20.603212859 +0100
+++ gcc/testsuite/gcc.target/i386/pr59501-4a.c 2013-12-20 12:23:33.647881672 +0100
@@ -0,0 +1,8 @@
+/* PR target/59501 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mavx -maccumulate-outgoing-args" } */
+
+#include "pr59501-3a.c"
+
+/* Verify no dynamic realignment is performed. */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*sp" { xfail *-*-* } } } */
--- gcc/testsuite/gcc.target/i386/pr59501-3.c.jj 2013-12-20 12:02:44.644462041 +0100
+++ gcc/testsuite/gcc.target/i386/pr59501-3.c 2013-12-20 12:13:06.834181801 +0100
@@ -0,0 +1,30 @@
+/* PR target/59501 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -mno-accumulate-outgoing-args" } */
+
+#define CHECK_H "avx-check.h"
+#define TEST avx_test
+
+#include CHECK_H
+
+typedef double V __attribute__ ((vector_size (32)));
+
+__attribute__((noinline, noclone)) V
+foo (double *x, int a, int b, int c, int d, int e, int f, unsigned *y)
+{
+ V r = { x[y[0]], x[y[1]], x[y[2]], x[y[3]] };
+ return r;
+}
+
+static void
+TEST (void)
+{
+ double a[16];
+ unsigned b[4] = { 5, 0, 15, 7 };
+ int i;
+ for (i = 0; i < 16; i++)
+ a[i] = 0.5 + i;
+ V v = foo (a, 0, 0, 0, 0, 0, 0, b);
+ if (v[0] != 5.5 || v[1] != 0.5 || v[2] != 15.5 || v[3] != 7.5)
+ __builtin_abort ();
+}
--- gcc/testsuite/gcc.target/i386/pr59501-3a.c.jj 2013-12-20 12:18:41.313420496 +0100
+++ gcc/testsuite/gcc.target/i386/pr59501-3a.c 2013-12-20 12:22:15.257292900 +0100
@@ -0,0 +1,15 @@
+/* PR target/59501 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mavx -mno-accumulate-outgoing-args" } */
+
+typedef double V __attribute__ ((vector_size (32)));
+
+V
+foo (double *x, int a, int b, int c, int d, int e, int f, unsigned *y)
+{
+ V r = { x[y[0]], x[y[1]], x[y[2]], x[y[3]] };
+ return r;
+}
+
+/* Verify no dynamic realignment is performed. */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*sp" } } */
--- gcc/testsuite/gcc.target/i386/pr59501-1a.c.jj 2013-12-20 12:15:16.890495826 +0100
+++ gcc/testsuite/gcc.target/i386/pr59501-1a.c 2013-12-20 12:22:02.820358481 +0100
@@ -0,0 +1,17 @@
+/* PR target/59501 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mavx -mno-accumulate-outgoing-args" } */
+
+typedef double V __attribute__ ((vector_size (32)));
+
+V
+foo (double *x, unsigned *y)
+{
+ V r = { x[y[0]], x[y[1]], x[y[2]], x[y[3]] };
+ return r;
+}
+
+/* Verify no dynamic realignment is performed. */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*sp" } } */
+/* And DRAP isn't needed either. */
+/* { dg-final { scan-assembler-not "r10" } } */
--- gcc/testsuite/gcc.target/i386/pr59501-6.c.jj 2013-12-20 12:08:21.574682265 +0100
+++ gcc/testsuite/gcc.target/i386/pr59501-6.c 2013-12-20 12:08:32.966622139 +0100
@@ -0,0 +1,5 @@
+/* PR target/59501 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -maccumulate-outgoing-args" } */
+
+#include "pr59501-5.c"
--- gcc/testsuite/gcc.target/i386/pr59501-5.c.jj 2013-12-20 12:06:53.276148649 +0100
+++ gcc/testsuite/gcc.target/i386/pr59501-5.c 2013-12-20 12:13:38.012014216 +0100
@@ -0,0 +1,39 @@
+/* PR target/59501 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -mno-accumulate-outgoing-args" } */
+
+#define CHECK_H "avx-check.h"
+#define TEST avx_test
+
+#include CHECK_H
+
+typedef double V __attribute__ ((vector_size (32)));
+
+__attribute__((noinline, noclone)) void
+bar (char *p)
+{
+ p[0] = 1;
+ p[37] = 2;
+ asm volatile ("" : : "r" (p) : "memory");
+}
+
+__attribute__((noinline, noclone)) V
+foo (double *x, int a, int b, int c, int d, int e, int f, unsigned *y)
+{
+ bar (__builtin_alloca (a + b + c + d + e + f));
+ V r = { x[y[0]], x[y[1]], x[y[2]], x[y[3]] };
+ return r;
+}
+
+static void
+TEST (void)
+{
+ double a[16];
+ unsigned b[4] = { 5, 0, 15, 7 };
+ int i;
+ for (i = 0; i < 16; i++)
+ a[i] = 0.5 + i;
+ V v = foo (a, 0, 30, 0, 0, 8, 0, b);
+ if (v[0] != 5.5 || v[1] != 0.5 || v[2] != 15.5 || v[3] != 7.5)
+ __builtin_abort ();
+}
--- gcc/testsuite/gcc.target/i386/pr59501-4.c.jj 2013-12-20 12:03:16.159292616 +0100
+++ gcc/testsuite/gcc.target/i386/pr59501-4.c 2013-12-20 12:06:24.651298808 +0100
@@ -0,0 +1,5 @@
+/* PR target/59501 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -maccumulate-outgoing-args" } */
+
+#include "pr59501-3.c"
--- gcc/testsuite/gcc.target/i386/pr59501-2a.c.jj 2013-12-20 12:18:11.371578107 +0100
+++ gcc/testsuite/gcc.target/i386/pr59501-2a.c 2013-12-20 12:22:07.988329149 +0100
@@ -0,0 +1,10 @@
+/* PR target/59501 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mavx -maccumulate-outgoing-args" } */
+
+#include "pr59501-1a.c"
+
+/* Verify no dynamic realignment is performed. */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*sp" } } */
+/* And DRAP isn't needed either. */
+/* { dg-final { scan-assembler-not "r10" } } */
Jakub