PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned

H.J. Lu hongjiu.lu@intel.com
Thu Oct 15 15:58:00 GMT 2009


On Thu, Aug 06, 2009 at 02:42:16PM -0700, H.J. Lu wrote:
> Hi,
> 
> In 32bit, the incoming stack may not be 16 byte aligned.  This patch
> assumes the incoming stack is 4 byte aligned and realigns stack if any
> SSE variable is put on stack. Any comments?
> 

For i386, gcc, up to gcc 3.4, had been generating 4byte stack alignment.
4byte stack alignment is a problem unless SSE registers are used, which
may be generated by vectorizer or intrinsics. This patch accommodates
the binaries generated by previous versions of gcc if SSE registers are
generated by vectorizer or intrinsics. The performance impact on SPEC
CPU 2006 is minimum:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=41156#c7

OK for trunk?

Thanks.


H.J.
---
gcc/

2009-09-26  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40838
	* function.c (allocate_struct_function): Initialize
	forced_stack_alignment and forced_stack_mode.

	* function.h (function): Add forced_stack_alignment and
	forced_stack_mode.

	* tree-vect-data-refs.c	(vect_verify_datarefs_alignment):
	Update forced_stack_alignment and forced_stack_mode.

	* config/i386/i386.c (VALID_SSE_VECTOR_MODE): New.
	(ix86_update_stack_boundary): In 32bit, use STACK_BOUNDARY if
	any SSE variables are forced on stack.
	(ix86_minimum_alignment): In 32bit, update
	forced_stack_alignment and forced_stack_mode if any SSE
	variables are put on stack.

gcc/testsuite/

2009-09-26  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40838
	* gcc.target/i386/incoming-6.c: New.
	* gcc.target/i386/incoming-7.c: Likewise.
	* gcc.target/i386/incoming-8.c: Likewise.
	* gcc.target/i386/incoming-9.c: Likewise.
	* gcc.target/i386/incoming-10.c: Likewise.
	* gcc.target/i386/incoming-11.c: Likewise.

Index: gcc/testsuite/gcc.target/i386/incoming-11.c
===================================================================
--- gcc/testsuite/gcc.target/i386/incoming-11.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/incoming-11.c	(revision 0)
@@ -0,0 +1,16 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */
+
+void g();
+
+int p[100];
+int q[100];
+
+void f()
+{
+	int i;
+	for (i = 0; i < 100; i++) p[i] = 0;
+	g();
+	for (i = 0; i < 100; i++) q[i] = 0;
+}
Index: gcc/testsuite/gcc.target/i386/incoming-7.c
===================================================================
--- gcc/testsuite/gcc.target/i386/incoming-7.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/incoming-7.c	(revision 0)
@@ -0,0 +1,16 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -O2 -msse2 -mpreferred-stack-boundary=4" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+extern v4si y(v4si, v4si, v4si, v4si, v4si);
+
+extern v4si s1, s2;
+
+v4si x(void)
+{
+  return y(s1, s2, s1, s2, s2);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
Index: gcc/testsuite/gcc.target/i386/incoming-9.c
===================================================================
--- gcc/testsuite/gcc.target/i386/incoming-9.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/incoming-9.c	(revision 0)
@@ -0,0 +1,18 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -O3 -mno-sse -mpreferred-stack-boundary=4" } */
+
+float
+foo (float f)
+{
+  float array[128];
+  float x;
+  int i;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    array[i] = f;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    x += array[i];
+  return x;
+}
+
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
Index: gcc/testsuite/gcc.target/i386/incoming-10.c
===================================================================
--- gcc/testsuite/gcc.target/i386/incoming-10.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/incoming-10.c	(revision 0)
@@ -0,0 +1,19 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */
+
+struct s {
+	int x[8];
+};
+
+void g(struct s *);
+
+void f()
+{
+	int i;
+	struct s s;
+	for (i = 0; i < sizeof(s.x) / sizeof(*s.x); i++) s.x[i] = 0;
+	g(&s);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
Index: gcc/testsuite/gcc.target/i386/incoming-6.c
===================================================================
--- gcc/testsuite/gcc.target/i386/incoming-6.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/incoming-6.c	(revision 0)
@@ -0,0 +1,17 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -O2 -msse2 -mpreferred-stack-boundary=4" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+extern v4si y(v4si *s3);
+
+extern v4si s1, s2;
+
+v4si x(void)
+{
+  v4si s3 = s1 + s2;
+  return y(&s3);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
Index: gcc/testsuite/gcc.target/i386/incoming-8.c
===================================================================
--- gcc/testsuite/gcc.target/i386/incoming-8.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/incoming-8.c	(revision 0)
@@ -0,0 +1,18 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -O3 -msse2 -mpreferred-stack-boundary=4" } */
+
+float
+foo (float f)
+{
+  float array[128];
+  float x;
+  int i;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    array[i] = f;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    x += array[i];
+  return x;
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
Index: gcc/function.c
===================================================================
--- gcc/function.c	(revision 152202)
+++ gcc/function.c	(working copy)
@@ -4139,6 +4139,9 @@ allocate_struct_function (tree fndecl, b
       /* Assume all registers in stdarg functions need to be saved.  */
       cfun->va_list_gpr_size = VA_LIST_MAX_GPR_SIZE;
       cfun->va_list_fpr_size = VA_LIST_MAX_FPR_SIZE;
+
+      cfun->forced_stack_alignment = 0;
+      cfun->forced_stack_mode = VOIDmode;
     }
 }
 
Index: gcc/function.h
===================================================================
--- gcc/function.h	(revision 152202)
+++ gcc/function.h	(working copy)
@@ -532,6 +532,12 @@ struct GTY(()) function {
      a string describing the reason for failure.  */
   const char * GTY((skip)) cannot_be_copied_reason;
 
+  /* The largest alignment forced on the stack.  */
+  unsigned int forced_stack_alignment;
+
+  /* The mode of the largest alignment forced on the stack.  */
+  enum machine_mode forced_stack_mode;
+
   /* Collected bit flags.  */
 
   /* Number of units of general registers that need saving in stdarg
Index: gcc/tree-vect-data-refs.c
===================================================================
--- gcc/tree-vect-data-refs.c	(revision 152202)
+++ gcc/tree-vect-data-refs.c	(working copy)
@@ -927,6 +927,8 @@ vect_verify_datarefs_alignment (loop_vec
     {
       gimple stmt = DR_STMT (dr);
       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+      enum machine_mode mode;
+      unsigned int align;
 
       /* For interleaving, only the alignment of the first access matters.  */
       if (STMT_VINFO_STRIDED_ACCESS (stmt_info)
@@ -947,6 +949,15 @@ vect_verify_datarefs_alignment (loop_vec
             }
           return false;
         }
+
+      mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
+      align = GET_MODE_ALIGNMENT (mode);
+      if (cfun->forced_stack_alignment < align)
+	{
+	  cfun->forced_stack_alignment = align;
+	  cfun->forced_stack_mode = mode;
+	}
+
       if (supportable_dr_alignment != dr_aligned
           && vect_print_dump_info (REPORT_ALIGNMENT))
         fprintf (vect_dump, "Vectorizing an unaligned access.");
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c	(revision 152202)
+++ gcc/config/i386/i386.c	(working copy)
@@ -8151,16 +8151,31 @@ find_drap_reg (void)
     }
 }
 
+#define VALID_SSE_VECTOR_MODE(MODE) \
+  ((MODE) == V4SFmode || (MODE) == V4SImode || (MODE) == V2DFmode \
+   || (MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DImode)
+
 /* Update incoming stack boundary and estimated stack alignment.  */
 
 static void
 ix86_update_stack_boundary (void)
 {
+  /* Should we use STACK_BOUNDARY for incoming stack boundary?  */
+  unsigned int incoming_stack_boundary;
+
+  /* In 32bit, use STACK_BOUNDARY for incoming stack boundary if any
+     SSE variables are put on stack. */
+  if (!TARGET_64BIT
+      && VALID_SSE_VECTOR_MODE (cfun->forced_stack_mode))
+    incoming_stack_boundary = STACK_BOUNDARY;
+  else
+    incoming_stack_boundary = ix86_default_incoming_stack_boundary;
+
   /* Prefer the one specified at command line. */
   ix86_incoming_stack_boundary 
     = (ix86_user_incoming_stack_boundary
        ? ix86_user_incoming_stack_boundary
-       : ix86_default_incoming_stack_boundary);
+       : incoming_stack_boundary);
 
   /* Incoming stack alignment can be changed on individual functions
      via force_align_arg_pointer attribute.  We use the smallest
@@ -19773,7 +19788,7 @@ ix86_minimum_alignment (tree exp, enum m
 {
   tree type, decl;
 
-  if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
+  if (TARGET_64BIT)
     return align;
 
   if (exp && DECL_P (exp))
@@ -19787,6 +19802,25 @@ ix86_minimum_alignment (tree exp, enum m
       decl = NULL;
     }
 
+  /* In 32bit, update forced_stack_mode if any SSE variables are put
+     on stack.  */
+  if (cfun->forced_stack_alignment < 128)
+    {
+      if (VALID_SSE_VECTOR_MODE (mode))
+	{
+	  cfun->forced_stack_alignment = 128;
+	  cfun->forced_stack_mode = mode;
+	}
+      else if ((type && VALID_SSE_VECTOR_MODE (TYPE_MODE (type))))
+	{
+	  cfun->forced_stack_alignment = 128;
+	  cfun->forced_stack_mode = TYPE_MODE (type);
+	}
+    }
+
+  if (align != 64 || ix86_preferred_stack_boundary >= 64)
+    return align;
+
   /* Don't do dynamic stack realignment for long long objects with
      -mpreferred-stack-boundary=2.  */
   if ((mode == DImode || (type && TYPE_MODE (type) == DImode))



More information about the Gcc-patches mailing list