This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Fix AMD64 handling of functions with huge stack frames (take 2)


On Thu, Oct 23, 2003 at 11:35:51AM -0700, Richard Henderson wrote:
> > +  [(set (attr "type")
> > +	(if_then_else (eq_attr "alternative" "0")
> > +		      (const_string "alu")
> > +		      (const_string "lea")))
> 
>   (set_attr "type" "alu,lea")
> ?

Changed, thanks.

Further testing revealed two further problems:
1) pro_epilogue_adjust_stack_rex64 happily emitted subq $2147483648, %rsp
   instead of addq $-2147483648, %rsp
2) many places in reload1.c used int instead of HOST_WIDE_INT

This patch passed testing on AMD64, IA64 and IA32.
20031023-4.c test fails on PPC64 and 20031023-{1,2,3,4}.c fail on S390x,
but I assume those are architecture dependent problems which I'll try
to tackle next.
Ok to commit in this shape?

With stack frames > 2GB, at least on AMD64, generated code is quite
inefficient (at -O2):
bar:
        pushq   %rbx
        movabsq $-4294967296, %r11
        movabsq $4294967296, %rax
        movabsq $-4294967296, %rdx
        movabsq $4294967296, %rcx
        addq    %r11, %rsp
        addq    %rsp, %rax
        addq    %rsp, %rcx
        movb    $97, (%rax,%rdx)
        movl    $4294967295, %eax
        movq    %rcx, %rbx
        leaq    (%rcx,%rax), %rax
        addq    %rdx, %rbx
        movq    %rbx, %rdi
        movb    $98, (%rax,%rdx)
...
Looks some at least minimal CSE pass when stack frames are big wouldn't
hurt.  reload1.c doesn't see that %rax + %rdx is actually
(%rsp + 4294967296) + (-4294967296) == %rsp etc.

2003-10-26  Jakub Jelinek  <jakub@redhat.com>
	    Jan Hubicka  <jh@suse.cz>

	* reload1.c (struct elim_table): Change offset, initial_offset and
	previous_offset fields to HOST_WIDE_INT.
	(offsets_at): Change from int to HOST_WIDE_INT.
	(reload): Adjust offsets_at initialization.
	(eliminate_regs_in_insn): Change type of offset to HOST_WIDE_INT.
	(verify_initial_elim_offsets): Change type of t to HOST_WIDE_INT.
	* config/i386/i386.c (ix86_compute_frame_layout): Change offset type
	to HOST_WIDE_INT.  Don't save regs using mov for huge frame sizes
	if TARGET_64BIT.
	(pro_epilogue_adjust_stack): New function.
	(ix86_expand_prologue, ix86_expand_epilogue): Use it.
	* config/i386/i386.md (pro_epilogue_adjust_stack): Remove.
	(pro_epilogue_adjust_stack_1): Remove * in front of name.
	(pro_epilogue_adjust_stack_rex64): Handle -2147483648 properly.
	(pro_epilogue_adjust_stack_rex64_2): New insn.

	* config/i386/i386.c (ix86_expand_epilogue): Fix comment typo.

	* config/i386/i386.c (ix86_expand_call): Replace 40 with
	FIRST_REX_INT_REG + 3 /* R11 */.

	* gcc.c-torture/compile/20031023-1.c: New test.
	* gcc.c-torture/compile/20031023-2.c: New test.
	* gcc.c-torture/compile/20031023-3.c: New test.
	* gcc.c-torture/compile/20031023-4.c: New test.

--- gcc/reload1.c.jj	2003-10-23 09:54:43.000000000 +0200
+++ gcc/reload1.c	2003-10-26 09:39:17.000000000 +0100
@@ -293,12 +293,12 @@ struct elim_table
 {
   int from;			/* Register number to be eliminated.  */
   int to;			/* Register number used as replacement.  */
-  int initial_offset;		/* Initial difference between values.  */
+  HOST_WIDE_INT initial_offset;	/* Initial difference between values.  */
   int can_eliminate;		/* Nonzero if this elimination can be done.  */
   int can_eliminate_previous;	/* Value of CAN_ELIMINATE in previous scan over
 				   insns made by reload.  */
-  int offset;			/* Current offset between the two regs.  */
-  int previous_offset;		/* Offset at end of previous insn.  */
+  HOST_WIDE_INT offset;		/* Current offset between the two regs.  */
+  HOST_WIDE_INT previous_offset;/* Offset at end of previous insn.  */
   int ref_outside_mem;		/* "to" has been referenced outside a MEM.  */
   rtx from_rtx;			/* REG rtx for the register to be eliminated.
 				   We cannot simply compare the number since
@@ -352,7 +352,7 @@ static int num_eliminable_invariants;
 
 static int first_label_num;
 static char *offsets_known_at;
-static int (*offsets_at)[NUM_ELIMINABLE_REGS];
+static HOST_WIDE_INT (*offsets_at)[NUM_ELIMINABLE_REGS];
 
 /* Number of labels in the current function.  */
 
@@ -816,7 +816,7 @@ reload (rtx first, int global)
      allocate would occasionally cause it to exceed the stack limit and
      cause a core dump.  */
   offsets_known_at = xmalloc (num_labels);
-  offsets_at = xmalloc (num_labels * NUM_ELIMINABLE_REGS * sizeof (int));
+  offsets_at = xmalloc (num_labels * NUM_ELIMINABLE_REGS * sizeof (HOST_WIDE_INT));
 
   /* Alter each pseudo-reg rtx to contain its hard reg number.
      Assign stack slots to the pseudos that lack hard regs or equivalents.
@@ -2897,7 +2897,7 @@ eliminate_regs_in_insn (rtx insn, int re
 	      {
 		rtx base = SET_SRC (old_set);
 		rtx base_insn = insn;
-		int offset = 0;
+		HOST_WIDE_INT offset = 0;
 
 		while (base != ep->to_rtx)
 		  {
@@ -2980,7 +2980,7 @@ eliminate_regs_in_insn (rtx insn, int re
       && REGNO (XEXP (SET_SRC (old_set), 0)) < FIRST_PSEUDO_REGISTER)
     {
       rtx reg = XEXP (SET_SRC (old_set), 0);
-      int offset = INTVAL (XEXP (SET_SRC (old_set), 1));
+      HOST_WIDE_INT offset = INTVAL (XEXP (SET_SRC (old_set), 1));
 
       for (ep = reg_eliminate; ep < &reg_eliminate[NUM_ELIMINABLE_REGS]; ep++)
 	if (ep->from_rtx == reg && ep->can_eliminate)
@@ -3263,7 +3263,7 @@ mark_not_eliminable (rtx dest, rtx x, vo
 static void
 verify_initial_elim_offsets (void)
 {
-  int t;
+  HOST_WIDE_INT t;
 
 #ifdef ELIMINABLE_REGS
   struct elim_table *ep;
--- gcc/config/i386/i386.c.jj	2003-10-23 09:55:03.000000000 +0200
+++ gcc/config/i386/i386.c	2003-10-26 11:12:20.000000000 +0100
@@ -4841,7 +4841,7 @@ ix86_compute_frame_layout (struct ix86_f
 {
   HOST_WIDE_INT total_size;
   int stack_alignment_needed = cfun->stack_alignment_needed / BITS_PER_UNIT;
-  int offset;
+  HOST_WIDE_INT offset;
   int preferred_alignment = cfun->preferred_stack_boundary / BITS_PER_UNIT;
   HOST_WIDE_INT size = get_frame_size ();
 
@@ -4957,7 +4957,8 @@ ix86_compute_frame_layout (struct ix86_f
     (size + frame->padding1 + frame->padding2
      + frame->outgoing_arguments_size + frame->va_arg_size);
 
-  if (!frame->to_allocate && frame->nregs <= 1)
+  if ((!frame->to_allocate && frame->nregs <= 1)
+      || (TARGET_64BIT && frame->to_allocate >= (HOST_WIDE_INT) 0x80000000))
     frame->save_regs_using_mov = false;
 
   if (TARGET_RED_ZONE && current_function_sp_is_unchanging
@@ -5024,6 +5025,41 @@ ix86_emit_save_regs_using_mov (rtx point
       }
 }
 
+/* Expand prologue or epilogue stack adjustement.
+   The pattern exist to put a dependency on all ebp-based memory accesses.
+   STYLE should be negative if instructions should be marked as frame related,
+   zero if %r11 register is live and cannot be freely used and positive
+   otherwise.  */
+
+static void
+pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset, int style)
+{
+  rtx insn;
+
+  if (! TARGET_64BIT)
+    insn = emit_insn (gen_pro_epilogue_adjust_stack_1 (dest, src, offset));
+  else if (x86_64_immediate_operand (offset, DImode))
+    insn = emit_insn (gen_pro_epilogue_adjust_stack_rex64 (dest, src, offset));
+  else
+    {
+      rtx r11;
+      /* r11 is used by indirect sibcall return as well, set before the
+	 epilogue and used after the epilogue.  ATM indirect sibcall
+	 shouldn't be used together with huge frame sizes in one
+	 function because of the frame_size check in sibcall.c.  */
+      if (style == 0)
+	abort ();
+      r11 = gen_rtx_REG (DImode, FIRST_REX_INT_REG + 3 /* R11 */);
+      insn = emit_insn (gen_rtx_SET (DImode, r11, offset));
+      if (style < 0)
+	RTX_FRAME_RELATED_P (insn) = 1;
+      insn = emit_insn (gen_pro_epilogue_adjust_stack_rex64_2 (dest, src, r11,
+							       offset));
+    }
+  if (style < 0)
+    RTX_FRAME_RELATED_P (insn) = 1;
+}
+
 /* Expand the prologue into a bunch of separate insns.  */
 
 void
@@ -5065,12 +5101,8 @@ ix86_expand_prologue (void)
   if (allocate == 0)
     ;
   else if (! TARGET_STACK_PROBE || allocate < CHECK_STACK_LIMIT)
-    {
-      insn = emit_insn (gen_pro_epilogue_adjust_stack
-			(stack_pointer_rtx, stack_pointer_rtx,
-			 GEN_INT (-allocate)));
-      RTX_FRAME_RELATED_P (insn) = 1;
-    }
+    pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+			       GEN_INT (-allocate), -1);
   else
     {
       /* Only valid for Win32 */
@@ -5213,8 +5245,8 @@ ix86_expand_epilogue (int style)
 	      tmp = gen_rtx_MEM (Pmode, hard_frame_pointer_rtx);
 	      emit_move_insn (hard_frame_pointer_rtx, tmp);
 
-	      emit_insn (gen_pro_epilogue_adjust_stack
-			 (stack_pointer_rtx, sa, const0_rtx));
+	      pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
+					 const0_rtx, style);
 	    }
 	  else
 	    {
@@ -5225,19 +5257,19 @@ ix86_expand_epilogue (int style)
 	    }
 	}
       else if (!frame_pointer_needed)
-	emit_insn (gen_pro_epilogue_adjust_stack
-		   (stack_pointer_rtx, stack_pointer_rtx,
-		    GEN_INT (frame.to_allocate
-			     + frame.nregs * UNITS_PER_WORD)));
+	pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+				   GEN_INT (frame.to_allocate
+					    + frame.nregs * UNITS_PER_WORD),
+				   style);
       /* If not an i386, mov & pop is faster than "leave".  */
       else if (TARGET_USE_LEAVE || optimize_size
 	       || !cfun->machine->use_fast_prologue_epilogue)
 	emit_insn (TARGET_64BIT ? gen_leave_rex64 () : gen_leave ());
       else
 	{
-	  emit_insn (gen_pro_epilogue_adjust_stack (stack_pointer_rtx,
-						    hard_frame_pointer_rtx,
-						    const0_rtx));
+	  pro_epilogue_adjust_stack (stack_pointer_rtx,
+				     hard_frame_pointer_rtx,
+				     const0_rtx, style);
 	  if (TARGET_64BIT)
 	    emit_insn (gen_popdi1 (hard_frame_pointer_rtx));
 	  else
@@ -5252,14 +5284,13 @@ ix86_expand_epilogue (int style)
 	{
 	  if (!frame_pointer_needed)
 	    abort ();
-          emit_insn (gen_pro_epilogue_adjust_stack (stack_pointer_rtx,
-						    hard_frame_pointer_rtx,
-						    GEN_INT (offset)));
+	  pro_epilogue_adjust_stack (stack_pointer_rtx,
+				     hard_frame_pointer_rtx,
+				     GEN_INT (offset), style);
 	}
       else if (frame.to_allocate)
-	emit_insn (gen_pro_epilogue_adjust_stack
-		   (stack_pointer_rtx, stack_pointer_rtx,
-		    GEN_INT (frame.to_allocate)));
+	pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+				   GEN_INT (frame.to_allocate), style);
 
       for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
 	if (ix86_save_reg (regno, false))
@@ -5298,7 +5329,7 @@ ix86_expand_epilogue (int style)
 	{
 	  rtx ecx = gen_rtx_REG (SImode, 2);
 
-	  /* There are is no "pascal" calling convention in 64bit ABI.  */
+	  /* There is no "pascal" calling convention in 64bit ABI.  */
 	  if (TARGET_64BIT)
 	    abort ();
 
@@ -11541,7 +11572,7 @@ ix86_expand_call (rtx retval, rtx fnaddr
     {
       rtx addr;
       addr = copy_to_mode_reg (Pmode, XEXP (fnaddr, 0));
-      fnaddr = gen_rtx_REG (Pmode, 40);
+      fnaddr = gen_rtx_REG (Pmode, FIRST_REX_INT_REG + 3 /* R11 */);
       emit_move_insn (fnaddr, addr);
       fnaddr = gen_rtx_MEM (QImode, fnaddr);
     }
--- gcc/config/i386/i386.md.jj	2003-10-23 09:55:07.000000000 +0200
+++ gcc/config/i386/i386.md	2003-10-26 09:19:44.000000000 +0100
@@ -17772,23 +17772,7 @@
 ;; [(set (mem (plus (reg ebp) (const_int -160000))) (const_int 0))]
 ;;
 ;; in proper program order.
-(define_expand "pro_epilogue_adjust_stack"
-  [(parallel [(set (match_operand:SI 0 "register_operand" "=r,r")
-		   (plus:SI (match_operand:SI 1 "register_operand" "0,r")
-			    (match_operand:SI 2 "immediate_operand" "i,i")))
-	      (clobber (reg:CC 17))
-	      (clobber (mem:BLK (scratch)))])]
- ""
-{
-  if (TARGET_64BIT)
-    {
-      emit_insn (gen_pro_epilogue_adjust_stack_rex64
-		 (operands[0], operands[1], operands[2]));
-      DONE;
-    }
-})
-
-(define_insn "*pro_epilogue_adjust_stack_1"
+(define_insn "pro_epilogue_adjust_stack_1"
   [(set (match_operand:SI 0 "register_operand" "=r,r")
 	(plus:SI (match_operand:SI 1 "register_operand" "0,r")
 	         (match_operand:SI 2 "immediate_operand" "i,i")))
@@ -17844,6 +17828,8 @@
 
     case TYPE_ALU:
       if (GET_CODE (operands[2]) == CONST_INT
+	  /* Avoid overflows.  */
+	  && ((INTVAL (operands[2]) & ((((unsigned int) 1) << 31) - 1)))
           && (INTVAL (operands[2]) == 128
 	      || (INTVAL (operands[2]) < 0
 	          && INTVAL (operands[2]) != -128)))
@@ -17870,6 +17856,30 @@
 	      (const_string "lea")))
    (set_attr "mode" "DI")])
 
+(define_insn "pro_epilogue_adjust_stack_rex64_2"
+  [(set (match_operand:DI 0 "register_operand" "=r,r")
+	(plus:DI (match_operand:DI 1 "register_operand" "0,r")
+		 (match_operand:DI 3 "immediate_operand" "i,i")))
+   (use (match_operand:DI 2 "register_operand" "r,r"))
+   (clobber (reg:CC 17))
+   (clobber (mem:BLK (scratch)))]
+  "TARGET_64BIT"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_ALU:
+      return "add{q}\t{%2, %0|%0, %2}";
+
+    case TYPE_LEA:
+      operands[2] = gen_rtx_PLUS (DImode, operands[1], operands[2]);
+      return "lea{q}\t{%a2, %0|%0, %a2}";
+
+    default:
+      abort ();
+    }
+}
+  [(set_attr "type" "alu,lea")
+   (set_attr "mode" "DI")])
 
 ;; Placeholder for the conditional moves.  This one is split either to SSE
 ;; based moves emulation or to usual cmove sequence.  Little bit unfortunate
--- gcc/testsuite/gcc.c-torture/compile/20031023-1.c.jj	2003-10-23 14:19:10.000000000 +0200
+++ gcc/testsuite/gcc.c-torture/compile/20031023-1.c	2003-10-23 15:50:14.000000000 +0200
@@ -0,0 +1,66 @@
+#ifndef ASIZE
+# define ASIZE 0x10000000000UL
+#endif
+
+#include <limits.h>
+
+#if LONG_MAX < 8 * ASIZE
+# undef ASIZE
+# define ASIZE 4096
+#endif
+
+extern void abort (void);
+
+int __attribute__((noinline))
+foo (const char *s)
+{
+  if (!s)
+    return 1;
+  if (s[0] != 'a')
+    abort ();
+  s += ASIZE - 1;
+  if (s[0] != 'b')
+    abort ();
+  return 0;
+}
+
+int (*fn) (const char *) = foo;
+
+int __attribute__((noinline))
+bar (void)
+{
+  char s[ASIZE];
+  s[0] = 'a';
+  s[ASIZE - 1] = 'b';
+  foo (s);
+  foo (s);
+  return 0;
+}
+
+int __attribute__((noinline))
+baz (long i)
+{
+  if (i)
+    return fn (0);
+  else
+    {
+      char s[ASIZE];
+      s[0] = 'a';
+      s[ASIZE - 1] = 'b';
+      foo (s);
+      foo (s);
+      return fn (0);
+    }
+}
+
+int
+main (void)
+{
+  if (bar ())
+    abort ();
+  if (baz (0) != 1)
+    abort ();
+  if (baz (1) != 1)
+    abort ();
+  return 0;
+}
--- gcc/testsuite/gcc.c-torture/compile/20031023-2.c.jj	2003-10-23 14:19:25.000000000 +0200
+++ gcc/testsuite/gcc.c-torture/compile/20031023-2.c	2003-10-23 14:19:49.000000000 +0200
@@ -0,0 +1,2 @@
+#define ASIZE 0x1000000000UL
+#include "20031023-1.c"
--- gcc/testsuite/gcc.c-torture/compile/20031023-3.c.jj	2003-10-23 14:19:25.000000000 +0200
+++ gcc/testsuite/gcc.c-torture/compile/20031023-3.c	2003-10-23 14:20:28.000000000 +0200
@@ -0,0 +1,2 @@
+#define ASIZE 0x100000000UL
+#include "20031023-1.c"
--- gcc/testsuite/gcc.c-torture/compile/20031023-4.c.jj	2003-10-23 14:19:25.000000000 +0200
+++ gcc/testsuite/gcc.c-torture/compile/20031023-4.c	2003-10-23 14:20:51.000000000 +0200
@@ -0,0 +1,2 @@
+#define ASIZE 0x80000000UL
+#include "20031023-1.c"


	Jakub


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]