This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH] Fix AMD64 handling of functions with huge stack frames
- From: Jakub Jelinek <jakub at redhat dot com>
- To: Jan Hubicka <hubicka at ucw dot cz>
- Cc: Richard Henderson <rth at redhat dot com>, jh at suse dot cz, gcc-patches at gcc dot gnu dot org
- Date: Thu, 23 Oct 2003 17:25:22 +0200
- Subject: Re: [PATCH] Fix AMD64 handling of functions with huge stack frames
- References: <20031023103239.GK12344@sunsite.ms.mff.cuni.cz> <20031023132104.GC4648@atrey.karlin.mff.cuni.cz>
- Reply-to: Jakub Jelinek <jakub at redhat dot com>
On Thu, Oct 23, 2003 at 03:21:04PM +0200, Jan Hubicka wrote:
> > 2003-10-23 Jakub Jelinek <jakub@redhat.com>
> >
> > * config/i386/i386.c (ix86_expand_prologue): Handle stack
> > adjustement bigger than 2GB.
> > (epilogue_adjust_stack): New function.
> > (ix86_expand_epilogue): Use it.
>
> Actually hammer branch has similar patch. I realize now that it appears
> to be missing on mainline for some reason. Sorry for that, it would
> probably save some of your time.
Oops, didn't know that.
> The hammer branch version embeddes the register load in
> pro_epilogue_adjust_stack_expander, this seem to avoid new for
> duplicating the code..
But on hammer-3_3-branch register %r11 is not used by indirect sibcall.
Relying on somebody not forgetting to tweak i386 backend when sibcall.c
stops limiting sibcalls to !frame_size sounds too error-prone to me,
plus FRAME_RELATED_P is not set on the first instruction that way.
> I was trying to do the same on hammer branch and it did hit problems
> with unwind code not being able to realize that (plus (rsp) (reg2))
> is (plus (rsp) (big_constant)), so I had to create new pattern
> pro_epilogue_adjust_stack_rex64_2. DOes this problem still remain in
> mainline or it has been fixed?
What was the difference in the unwind info?
Here is an updated patch which uses pro_epilogue_adjust_stack_rex64_2
from hammer-3_3-branch (fixed up a little bit and simplified),
but it generates identical assembly as when it was using just _rex64.
I get similar results on a backport to gcc-3_2-rhl8-branch, using
rex64_2 or just rex64 changes nothing.
This patch has the expander in i386.c instead of i386.md, so that epilogue
with indirect sibcall can be potentionally handled (as well as
FRAME_RELATED_P).
I have also noticed that stack_pointer_offset was computed wrongly for huge
stack usages (int offset variable doing += size where size is HOST_WIDE_INT
and can be > 4GB) and that we need to avoid using mov for pushes.
2003-10-23 Jakub Jelinek <jakub@redhat.com>
Jan Hubicka <jh@suse.cz>
* config/i386/i386.c (ix86_compute_frame_layout): Change offset type
to HOST_WIDE_INT. Don't save regs using mov for huge frame sizes
if TARGET_64BIT.
(pro_epilogue_adjust_stack): New function.
(ix86_expand_prologue, ix86_expand_epilogue): Use it.
* config/i386/i386.md (pro_epilogue_adjust_stack): Remove.
(pro_epilogue_adjust_stack_1): Remove * in front of name.
(pro_epilogue_adjust_stack_rex64_2): New insn.
* config/i386/i386.c (ix86_expand_epilogue): Fix comment typo.
* config/i386/i386.c (ix86_expand_call): Replace 40 with
FIRST_REX_INT_REG + 3 /* R11 */.
* gcc.c-torture/compile/20031023-1.c: New test.
* gcc.c-torture/compile/20031023-2.c: New test.
* gcc.c-torture/compile/20031023-3.c: New test.
* gcc.c-torture/compile/20031023-4.c: New test.
--- gcc/config/i386/i386.c.jj 2003-10-23 09:55:03.000000000 +0200
+++ gcc/config/i386/i386.c 2003-10-23 19:07:27.000000000 +0200
@@ -4841,7 +4841,7 @@ ix86_compute_frame_layout (struct ix86_f
{
HOST_WIDE_INT total_size;
int stack_alignment_needed = cfun->stack_alignment_needed / BITS_PER_UNIT;
- int offset;
+ HOST_WIDE_INT offset;
int preferred_alignment = cfun->preferred_stack_boundary / BITS_PER_UNIT;
HOST_WIDE_INT size = get_frame_size ();
@@ -4957,7 +4957,8 @@ ix86_compute_frame_layout (struct ix86_f
(size + frame->padding1 + frame->padding2
+ frame->outgoing_arguments_size + frame->va_arg_size);
- if (!frame->to_allocate && frame->nregs <= 1)
+ if ((!frame->to_allocate && frame->nregs <= 1)
+ || (TARGET_64BIT && frame->to_allocate >= (HOST_WIDE_INT) 0x80000000))
frame->save_regs_using_mov = false;
if (TARGET_RED_ZONE && current_function_sp_is_unchanging
@@ -5024,6 +5025,41 @@ ix86_emit_save_regs_using_mov (rtx point
}
}
+/* Expand prologue or epilogue stack adjustement.
+ The pattern exist to put a dependency on all ebp-based memory accesses.
+ STYLE should be negative if instructions should be marked as frame related,
+ zero if %r11 register is live and cannot be freely used and positive
+ otherwise. */
+
+static void
+pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset, int style)
+{
+ rtx insn;
+
+ if (! TARGET_64BIT)
+ insn = emit_insn (gen_pro_epilogue_adjust_stack_1 (dest, src, offset));
+ else if (x86_64_immediate_operand (offset, DImode))
+ insn = emit_insn (gen_pro_epilogue_adjust_stack_rex64 (dest, src, offset));
+ else
+ {
+ rtx r11;
+ /* r11 is used by indirect sibcall return as well, set before the
+ epilogue and used after the epilogue. ATM indirect sibcall
+ shouldn't be used together with huge frame sizes in one
+ function because of the frame_size check in sibcall.c. */
+ if (style == 0)
+ abort ();
+ r11 = gen_rtx_REG (DImode, FIRST_REX_INT_REG + 3 /* R11 */);
+ insn = emit_insn (gen_rtx_SET (DImode, r11, offset));
+ if (style < 0)
+ RTX_FRAME_RELATED_P (insn) = 1;
+ insn = emit_insn (gen_pro_epilogue_adjust_stack_rex64_2 (dest, src, r11,
+ offset));
+ }
+ if (style < 0)
+ RTX_FRAME_RELATED_P (insn) = 1;
+}
+
/* Expand the prologue into a bunch of separate insns. */
void
@@ -5065,12 +5101,8 @@ ix86_expand_prologue (void)
if (allocate == 0)
;
else if (! TARGET_STACK_PROBE || allocate < CHECK_STACK_LIMIT)
- {
- insn = emit_insn (gen_pro_epilogue_adjust_stack
- (stack_pointer_rtx, stack_pointer_rtx,
- GEN_INT (-allocate)));
- RTX_FRAME_RELATED_P (insn) = 1;
- }
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (-allocate), -1);
else
{
/* Only valid for Win32 */
@@ -5213,8 +5245,8 @@ ix86_expand_epilogue (int style)
tmp = gen_rtx_MEM (Pmode, hard_frame_pointer_rtx);
emit_move_insn (hard_frame_pointer_rtx, tmp);
- emit_insn (gen_pro_epilogue_adjust_stack
- (stack_pointer_rtx, sa, const0_rtx));
+ pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
+ const0_rtx, style);
}
else
{
@@ -5225,19 +5257,19 @@ ix86_expand_epilogue (int style)
}
}
else if (!frame_pointer_needed)
- emit_insn (gen_pro_epilogue_adjust_stack
- (stack_pointer_rtx, stack_pointer_rtx,
- GEN_INT (frame.to_allocate
- + frame.nregs * UNITS_PER_WORD)));
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (frame.to_allocate
+ + frame.nregs * UNITS_PER_WORD),
+ style);
/* If not an i386, mov & pop is faster than "leave". */
else if (TARGET_USE_LEAVE || optimize_size
|| !cfun->machine->use_fast_prologue_epilogue)
emit_insn (TARGET_64BIT ? gen_leave_rex64 () : gen_leave ());
else
{
- emit_insn (gen_pro_epilogue_adjust_stack (stack_pointer_rtx,
- hard_frame_pointer_rtx,
- const0_rtx));
+ pro_epilogue_adjust_stack (stack_pointer_rtx,
+ hard_frame_pointer_rtx,
+ const0_rtx, style);
if (TARGET_64BIT)
emit_insn (gen_popdi1 (hard_frame_pointer_rtx));
else
@@ -5252,14 +5284,13 @@ ix86_expand_epilogue (int style)
{
if (!frame_pointer_needed)
abort ();
- emit_insn (gen_pro_epilogue_adjust_stack (stack_pointer_rtx,
- hard_frame_pointer_rtx,
- GEN_INT (offset)));
+ pro_epilogue_adjust_stack (stack_pointer_rtx,
+ hard_frame_pointer_rtx,
+ GEN_INT (offset), style);
}
else if (frame.to_allocate)
- emit_insn (gen_pro_epilogue_adjust_stack
- (stack_pointer_rtx, stack_pointer_rtx,
- GEN_INT (frame.to_allocate)));
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (frame.to_allocate), style);
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
if (ix86_save_reg (regno, false))
@@ -5298,7 +5329,7 @@ ix86_expand_epilogue (int style)
{
rtx ecx = gen_rtx_REG (SImode, 2);
- /* There are is no "pascal" calling convention in 64bit ABI. */
+ /* There is no "pascal" calling convention in 64bit ABI. */
if (TARGET_64BIT)
abort ();
@@ -11541,7 +11572,7 @@ ix86_expand_call (rtx retval, rtx fnaddr
{
rtx addr;
addr = copy_to_mode_reg (Pmode, XEXP (fnaddr, 0));
- fnaddr = gen_rtx_REG (Pmode, 40);
+ fnaddr = gen_rtx_REG (Pmode, FIRST_REX_INT_REG + 3 /* R11 */);
emit_move_insn (fnaddr, addr);
fnaddr = gen_rtx_MEM (QImode, fnaddr);
}
--- gcc/config/i386/i386.md.jj 2003-10-23 09:55:07.000000000 +0200
+++ gcc/config/i386/i386.md 2003-10-23 16:14:47.000000000 +0200
@@ -17772,23 +17772,7 @@
;; [(set (mem (plus (reg ebp) (const_int -160000))) (const_int 0))]
;;
;; in proper program order.
-(define_expand "pro_epilogue_adjust_stack"
- [(parallel [(set (match_operand:SI 0 "register_operand" "=r,r")
- (plus:SI (match_operand:SI 1 "register_operand" "0,r")
- (match_operand:SI 2 "immediate_operand" "i,i")))
- (clobber (reg:CC 17))
- (clobber (mem:BLK (scratch)))])]
- ""
-{
- if (TARGET_64BIT)
- {
- emit_insn (gen_pro_epilogue_adjust_stack_rex64
- (operands[0], operands[1], operands[2]));
- DONE;
- }
-})
-
-(define_insn "*pro_epilogue_adjust_stack_1"
+(define_insn "pro_epilogue_adjust_stack_1"
[(set (match_operand:SI 0 "register_operand" "=r,r")
(plus:SI (match_operand:SI 1 "register_operand" "0,r")
(match_operand:SI 2 "immediate_operand" "i,i")))
@@ -17870,6 +17854,33 @@
(const_string "lea")))
(set_attr "mode" "DI")])
+(define_insn "pro_epilogue_adjust_stack_rex64_2"
+ [(set (match_operand:DI 0 "register_operand" "=r,r")
+ (plus:DI (match_operand:DI 1 "register_operand" "0,r")
+ (match_operand:DI 3 "immediate_operand" "i,i")))
+ (use (match_operand:DI 2 "register_operand" "r,r"))
+ (clobber (reg:CC 17))
+ (clobber (mem:BLK (scratch)))]
+ "TARGET_64BIT"
+{
+ switch (get_attr_type (insn))
+ {
+ case TYPE_ALU:
+ return "add{q}\t{%2, %0|%0, %2}";
+
+ case TYPE_LEA:
+ operands[2] = gen_rtx_PLUS (DImode, operands[1], operands[2]);
+ return "lea{q}\t{%a2, %0|%0, %a2}";
+
+ default:
+ abort ();
+ }
+}
+ [(set (attr "type")
+ (if_then_else (eq_attr "alternative" "0")
+ (const_string "alu")
+ (const_string "lea")))
+ (set_attr "mode" "DI")])
;; Placeholder for the conditional moves. This one is split either to SSE
;; based moves emulation or to usual cmove sequence. Little bit unfortunate
--- gcc/testsuite/gcc.c-torture/compile/20031023-1.c.jj 2003-10-23 14:19:10.000000000 +0200
+++ gcc/testsuite/gcc.c-torture/compile/20031023-1.c 2003-10-23 15:50:14.000000000 +0200
@@ -0,0 +1,66 @@
+#ifndef ASIZE
+# define ASIZE 0x10000000000UL
+#endif
+
+#include <limits.h>
+
+#if LONG_MAX < 8 * ASIZE
+# undef ASIZE
+# define ASIZE 4096
+#endif
+
+extern void abort (void);
+
+int __attribute__((noinline))
+foo (const char *s)
+{
+ if (!s)
+ return 1;
+ if (s[0] != 'a')
+ abort ();
+ s += ASIZE - 1;
+ if (s[0] != 'b')
+ abort ();
+ return 0;
+}
+
+int (*fn) (const char *) = foo;
+
+int __attribute__((noinline))
+bar (void)
+{
+ char s[ASIZE];
+ s[0] = 'a';
+ s[ASIZE - 1] = 'b';
+ foo (s);
+ foo (s);
+ return 0;
+}
+
+int __attribute__((noinline))
+baz (long i)
+{
+ if (i)
+ return fn (0);
+ else
+ {
+ char s[ASIZE];
+ s[0] = 'a';
+ s[ASIZE - 1] = 'b';
+ foo (s);
+ foo (s);
+ return fn (0);
+ }
+}
+
+int
+main (void)
+{
+ if (bar ())
+ abort ();
+ if (baz (0) != 1)
+ abort ();
+ if (baz (1) != 1)
+ abort ();
+ return 0;
+}
--- gcc/testsuite/gcc.c-torture/compile/20031023-2.c.jj 2003-10-23 14:19:25.000000000 +0200
+++ gcc/testsuite/gcc.c-torture/compile/20031023-2.c 2003-10-23 14:19:49.000000000 +0200
@@ -0,0 +1,2 @@
+#define ASIZE 0x1000000000UL
+#include "20031023-1.c"
--- gcc/testsuite/gcc.c-torture/compile/20031023-3.c.jj 2003-10-23 14:19:25.000000000 +0200
+++ gcc/testsuite/gcc.c-torture/compile/20031023-3.c 2003-10-23 14:20:28.000000000 +0200
@@ -0,0 +1,2 @@
+#define ASIZE 0x100000000UL
+#include "20031023-1.c"
--- gcc/testsuite/gcc.c-torture/compile/20031023-4.c.jj 2003-10-23 14:19:25.000000000 +0200
+++ gcc/testsuite/gcc.c-torture/compile/20031023-4.c 2003-10-23 14:20:51.000000000 +0200
@@ -0,0 +1,2 @@
+#define ASIZE 0x80000000UL
+#include "20031023-1.c"
Jakub