This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned


On 10/15/2009 05:48 PM, H.J. Lu wrote:
On Thu, Aug 06, 2009 at 02:42:16PM -0700, H.J. Lu wrote:
Hi,

In 32bit, the incoming stack may not be 16 byte aligned.  This patch
assumes the incoming stack is 4 byte aligned and realigns stack if any
SSE variable is put on stack. Any comments?

For i386, gcc, up to gcc 3.4, had been generating 4byte stack alignment.
4byte stack alignment is a problem unless SSE registers are used, which
may be generated by vectorizer or intrinsics. This patch accommodates
the binaries generated by previous versions of gcc if SSE registers are
generated by vectorizer or intrinsics. The performance impact on SPEC
CPU 2006 is minimum:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=41156#c7

OK for trunk?

Comments bellow.


---
gcc/

2009-09-26 H.J. Lu<hongjiu.lu@intel.com>

	PR target/40838
	* function.c (allocate_struct_function): Initialize
	forced_stack_alignment and forced_stack_mode.

	* function.h (function): Add forced_stack_alignment and
	forced_stack_mode.

	* tree-vect-data-refs.c	(vect_verify_datarefs_alignment):
	Update forced_stack_alignment and forced_stack_mode.

	* config/i386/i386.c (VALID_SSE_VECTOR_MODE): New.
	(ix86_update_stack_boundary): In 32bit, use STACK_BOUNDARY if
	any SSE variables are forced on stack.
	(ix86_minimum_alignment): In 32bit, update
	forced_stack_alignment and forced_stack_mode if any SSE
	variables are put on stack.

gcc/testsuite/


...


Index: gcc/function.h =================================================================== --- gcc/function.h (revision 152202) +++ gcc/function.h (working copy) @@ -532,6 +532,12 @@ struct GTY(()) function { a string describing the reason for failure. */ const char * GTY((skip)) cannot_be_copied_reason;

+  /* The largest alignment forced on the stack.  */
+  unsigned int forced_stack_alignment;
+
+  /* The mode of the largest alignment forced on the stack.  */
+  enum machine_mode forced_stack_mode;
+
    /* Collected bit flags.  */

    /* Number of units of general registers that need saving in stdarg
Index: gcc/tree-vect-data-refs.c
===================================================================
--- gcc/tree-vect-data-refs.c	(revision 152202)
+++ gcc/tree-vect-data-refs.c	(working copy)
@@ -927,6 +927,8 @@ vect_verify_datarefs_alignment (loop_vec
      {
        gimple stmt = DR_STMT (dr);
        stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+      enum machine_mode mode;
+      unsigned int align;

        /* For interleaving, only the alignment of the first access matters.  */
        if (STMT_VINFO_STRIDED_ACCESS (stmt_info)
@@ -947,6 +949,15 @@ vect_verify_datarefs_alignment (loop_vec
              }
            return false;
          }
+
+      mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
+      align = GET_MODE_ALIGNMENT (mode);
+      if (cfun->forced_stack_alignment<  align)
+	{
+	  cfun->forced_stack_alignment = align;
+	  cfun->forced_stack_mode = mode;
+	}
+
        if (supportable_dr_alignment != dr_aligned
            &&  vect_print_dump_info (REPORT_ALIGNMENT))
          fprintf (vect_dump, "Vectorizing an unaligned access.");
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c	(revision 152202)
+++ gcc/config/i386/i386.c	(working copy)
@@ -8151,16 +8151,31 @@ find_drap_reg (void)
      }
  }

+#define VALID_SSE_VECTOR_MODE(MODE) \
+  ((MODE) == V4SFmode || (MODE) == V4SImode || (MODE) == V2DFmode \
+   || (MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DImode)
+
  /* Update incoming stack boundary and estimated stack alignment.  */

  static void
  ix86_update_stack_boundary (void)
  {
+  /* Should we use STACK_BOUNDARY for incoming stack boundary?  */
+  unsigned int incoming_stack_boundary;
+
+  /* In 32bit, use STACK_BOUNDARY for incoming stack boundary if any
+     SSE variables are put on stack. */
+  if (!TARGET_64BIT
+&&  VALID_SSE_VECTOR_MODE (cfun->forced_stack_mode))
+    incoming_stack_boundary = STACK_BOUNDARY;
+  else
+    incoming_stack_boundary = ix86_default_incoming_stack_boundary;
+
    /* Prefer the one specified at command line. */
    ix86_incoming_stack_boundary
      = (ix86_user_incoming_stack_boundary
         ? ix86_user_incoming_stack_boundary
-       : ix86_default_incoming_stack_boundary);
+       : incoming_stack_boundary);

    /* Incoming stack alignment can be changed on individual functions
       via force_align_arg_pointer attribute.  We use the smallest
@@ -19773,7 +19788,7 @@ ix86_minimum_alignment (tree exp, enum m
  {
    tree type, decl;

-  if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary>= 64)
+  if (TARGET_64BIT)
      return align;

    if (exp&&  DECL_P (exp))
@@ -19787,6 +19802,25 @@ ix86_minimum_alignment (tree exp, enum m
        decl = NULL;
      }

+ /* In 32bit, update forced_stack_mode if any SSE variables are put
+ on stack. */
+ if (cfun->forced_stack_alignment< 128)
+ {
+ if (VALID_SSE_VECTOR_MODE (mode))
+ {
+ cfun->forced_stack_alignment = 128;
+ cfun->forced_stack_mode = mode;
+ }
+ else if ((type&& VALID_SSE_VECTOR_MODE (TYPE_MODE (type))))
+ {
+ cfun->forced_stack_alignment = 128;
+ cfun->forced_stack_mode = TYPE_MODE (type);
+ }
+ }
+
+ if (align != 64 || ix86_preferred_stack_boundary>= 64)
+ return align;
+

We should use PREFERRED_STACK_BOUNDARY_DEFAULT instead of hardcoded 128 here. At least the comment for P_S_B_D says:


/* It should be MIN_STACK_BOUNDARY.  But we set it to 128 bits for
   both 32bit and 64bit, to support codes that need 128 bit stack
   alignment for SSE instructions, but can't realign the stack.  */
#define PREFERRED_STACK_BOUNDARY_DEFAULT 128

OTOH, I think that this whole stuff should depend on -mstackrealing somehow, according to the comment:

/* 1 if -mstackrealign should be turned on by default.  It will
   generate an alternate prologue and epilogue that realigns the
   runtime stack if nessary.  This supports mixing codes that keep a
   4-byte aligned stack, as specified by i386 psABI, with codes that
   need a 16-byte aligned stack, as required by SSE instructions.  If
   STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is
   128, stacks for all functions may be realigned.  */
#define STACK_REALIGN_DEFAULT 0

As far as I understand from many comments from the PR40838 trail (especially comment #51), -mstackrealing is not effective in some cases involving automatic SSE variables on the stack. I think that -mstackrealign should be fixed in the way your patch outlines, so old/broken sources that assume 4byte alignment can be compiled using this option without penalizing new/fixed code that assumes 16byte alignment.

Also, Jakub has its opinion here, I have also CC'd him.
Uros.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]