This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

Double alignment patches for x86


Here's the first version of a patch that implements proper alignment of
doubles on the stack on ix86 targets.  64 bit alignment is enabled with
the option "-malign-stack-double", 128 bit alignment (for long doubles)
with "-malign-stack-ldouble".

This patch is not meant for inclusion yet; however there are a couple of
problems I'd like feedback on, and I need some testers.  I ran a few
benchmarks and didn't really get anything resembling convincing results,
which could mean that a) there's a bug somewhere, or b) aligning doubles
doesn't win all that much in real-life situations.  If anyone has some
floating point intensive code, please test with this patch and post
results.

I needed to tweak some system-independent files.  The problem is that
GET_MODE_ALIGNMENT currently simply returns the size of the mode, which
is incorrect for long doubles on ix86 (XFmode == 12 bytes; wanted alignment
16 bytes).  In the patch below I added some code to round all alignments
up to the next power of two, but I'm not sure whether this could break
some other ports which use non-power-of-2 mode sizes.

One problem remains: function arguments are copied from the argument
list to stack slots because the compiler wants to guarantee alignment for
them.  This happens even if they are already aligned correctly, because
the compiler can't determine this yet.

Bernd

	* machmode.h (mode_alignment): Declare new variable.
	(GET_MODE_ALIGNMENT): Use it.
	* rtl.c (mode_alignment): New variable.
	* emit-rtl.c (init_emit_once): Initialize mode_alignment.
	* function.c (assign_stack_local): Always round size to alignment.
	(assign_outer_stack_local): Likewise.

	* i386.h (MASK_ALIGN_STACK_DOUBLE): New macro.
	(MASK_ALIGN_STACK_LDBL): Likewise.
	(TARGET_ALIGN_STACK_DOUBLE): Likewise.
	(TARGET_ALIGN_STACK_LDBL): Likewise.
	(TARGET_SWITCHES): Add -malign-stack-double, -malign-stack-ldouble
	and their negations.
	(PREFERRED_STACK_BOUNDARY): New macro.
	(BIGGEST_ALIGNMENT): 128 with -malign-stack-ldouble, 64 with
	-malign-stack-double, 32 otherwise.
	(BIGGEST_FIELD_ALIGNMENT): New macro.
	(STARTING_FRAME_OFFSET): Use ix86_starting_frame_offset.
	(INITIAL_ELIMINATION_OFFSET): Use ix86_sp_offset.
	(ix86_sp_offset): Declare.
	(ix86_starting_frame_offset): Declare.

	* i386.c (compute_stack_layout): New static function.
	(ix86_sp_offset): New function.
	(ix86_frame_size): New static function.
	(ix86_starting_frame_offset): New function.
	(ix86_prologue): Use ix86_frame_size instead of get_frame_size.
	(ix86_epilogue): Likewise.

Index: emit-rtl.c
===================================================================
RCS file: /usr/local/cvs/gcs/gcc/emit-rtl.c,v
retrieving revision 1.1.1.33
diff -u -p -u -r1.1.1.33 emit-rtl.c
--- emit-rtl.c	1998/11/28 10:28:01	1.1.1.33
+++ emit-rtl.c	1998/11/28 10:40:10
@@ -3407,6 +3407,25 @@ init_emit_once (line_numbers)
 
   sequence_stack = NULL;
 
+  /* Compute mode alignments.  */
+  
+  for (mode = VOIDmode; (int) mode < (int) MAX_MACHINE_MODE;
+       mode = (enum machine_mode) ((int) mode + 1))
+    {
+      unsigned int align;
+      align = MIN (BIGGEST_ALIGNMENT,
+		   MAX (1, (GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT)));
+      /* Alignments must be powers of 2, so round up.  */
+      if ((align & (align - 1)) != 0)
+	{
+	  while ((align & (align - 1)) != 0)
+	    align &= align - 1;
+	  align *= 2;
+	}
+
+      mode_alignment[(int) mode] = align;
+    }
+
   /* Compute the word and byte modes.  */
 
   byte_mode = VOIDmode;
Index: function.c
===================================================================
RCS file: /usr/local/cvs/gcs/gcc/function.c,v
retrieving revision 1.1.1.46
diff -u -p -u -r1.1.1.46 function.c
--- function.c	1998/11/21 12:08:17	1.1.1.46
+++ function.c	1998/11/23 15:47:06
@@ -713,6 +713,7 @@ assign_stack_local (mode, size, align)
       alignment = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
       if (mode == BLKmode)
 	alignment = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
+      size = CEIL_ROUND (size, alignment);
     }
   else if (align == -1)
     {
@@ -788,6 +789,7 @@ assign_outer_stack_local (mode, size, al
       alignment = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
       if (mode == BLKmode)
 	alignment = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
+      size = CEIL_ROUND (size, alignment);
     }
   else if (align == -1)
     {
Index: machmode.h
===================================================================
RCS file: /usr/local/cvs/gcs/gcc/machmode.h,v
retrieving revision 1.1.1.7
diff -u -p -u -r1.1.1.7 machmode.h
--- machmode.h	1998/11/08 15:15:04	1.1.1.7
+++ machmode.h	1998/11/23 15:53:47
@@ -220,9 +220,8 @@ extern enum machine_mode get_best_mode P
 
 /* Determine alignment, 1<=result<=BIGGEST_ALIGNMENT.  */
 
-#define GET_MODE_ALIGNMENT(MODE)   \
-  MIN (BIGGEST_ALIGNMENT, 	   \
-       MAX (1, (GET_MODE_UNIT_SIZE (MODE) * BITS_PER_UNIT)))
+extern unsigned int mode_alignment[];
+#define GET_MODE_ALIGNMENT(MODE) (mode_alignment[(int) (MODE)])
 
 /* For each class, get the narrowest mode in that class.  */
 
Index: rtl.c
===================================================================
RCS file: /usr/local/cvs/gcs/gcc/rtl.c,v
retrieving revision 1.1.1.18
diff -u -p -u -r1.1.1.18 rtl.c
--- rtl.c	1998/10/27 16:52:30	1.1.1.18
+++ rtl.c	1998/11/23 16:03:08
@@ -130,6 +130,10 @@ unsigned HOST_WIDE_INT mode_mask_array[(
 
 enum machine_mode class_narrowest_mode[(int) MAX_MODE_CLASS];
 
+/* Indexed by machine mode, gives the alignment for the mode.  */
+
+unsigned int mode_alignment[(int) MAX_MACHINE_MODE];
+
 /* Indexed by rtx code, gives a sequence of operand-types for
    rtx's of that code.  The sequence is a C string in which
    each character describes one operand.  */
Index: config/i386/i386.c
===================================================================
RCS file: /usr/local/cvs/gcs/gcc/config/i386/i386.c,v
retrieving revision 1.1.1.29
diff -u -p -u -r1.1.1.29 i386.c
--- config/i386/i386.c	1998/11/05 18:46:08	1.1.1.29
+++ config/i386/i386.c	1998/11/23 16:51:37
@@ -1935,6 +1935,88 @@ ix86_unary_operator_ok (code, mode, oper
   return TRUE;
 }
 
+/* Figure out the stack layout for this function.  The frame may need to
+   be larger than what get_frame_size reports to ensure alignment of doubles.
+
+   For any function, the stack consists of these parts:
+     4 bytes        return address
+     (4 bytes        old frame pointer - if not -fomit-frame-pointer)
+     frame size     frame
+     4*n bytes      callee-saved regs
+
+   We must guarantee that:
+     - the total amount of bytes is divisible by the alignment we want
+     - the frame pointer is aligned properly
+   Both goals can be achieved by increasing the frame size, and fitting
+   the actually allocated frame size into the larger block so that it is
+   aligned.  */
+static void
+compute_stack_layout (pframesz, pframeoff, pargoff)
+     int *pframesz, *pframeoff, *pargoff;
+{
+  unsigned int wanted_alignment = PREFERRED_STACK_BOUNDARY / BITS_PER_UNIT;
+  unsigned int align_mask = ~(wanted_alignment - 1);
+  unsigned int regsave_total;
+  unsigned int actual_frame_size = get_frame_size () - STARTING_FRAME_OFFSET;
+  unsigned int frame_total;
+  unsigned int entry_total;
+  unsigned int offset = 0;
+  int limit;
+  int regno;
+  int pic_reg_used = flag_pic && (current_function_uses_pic_offset_table
+				  || current_function_uses_const_pool);
+
+  limit = frame_pointer_needed ? FRAME_POINTER_REGNUM : STACK_POINTER_REGNUM;
+  for (regno = 0; regno < limit; regno++)
+    if ((regs_ever_live[regno] && ! call_used_regs[regno])
+	|| (regno == PIC_OFFSET_TABLE_REGNUM && pic_reg_used))
+      offset += 4;
+
+  /* The frame is separated from the stack by the register save area.  The
+     stack is aligned; so the register save area has to be a multiple of the
+     wanted alignment to make the frame properly aligned.  */
+  regsave_total = (offset + wanted_alignment - 1) & align_mask;
+
+  /* The block containing the frame, the return address and possibly the
+     saved frame pointer also has to be a multiple of the alignment.  */
+  entry_total = 4 + (frame_pointer_needed ? 4 : 0);
+  frame_total = actual_frame_size + entry_total;
+  frame_total = (frame_total + wanted_alignment - 1) & align_mask;
+  
+  *pframesz = frame_total + regsave_total - offset - entry_total;
+  *pframeoff = regsave_total + actual_frame_size;
+  *pargoff = regsave_total + frame_total;
+}
+
+/* Calculate initial elimination offsets for fp/ap pointer elimination.  
+   FOR_ARG is nonzero if we need the offset of the arg pointer, zero if
+   we need the offset of the frame pointer.  */
+int
+ix86_sp_offset (for_arg)
+     int for_arg;
+{
+  int framesz, frameoff, argoff;
+  compute_stack_layout (&framesz, &frameoff, &argoff);
+  return for_arg ? argoff : frameoff;
+}
+
+/* Compute the amount of memory that needs to be allocated for the stack
+   frame.  This will be equal to get_frame_size () if we are not trying
+   to align doubles, larger otherwise.  */
+static int
+ix86_frame_size ()
+{
+  int framesz, frameoff, argoff;
+  compute_stack_layout (&framesz, &frameoff, &argoff);
+  return framesz;
+}
+
+int
+ix86_starting_frame_offset ()
+{
+  return PREFERRED_STACK_BOUNDARY == 128 ? -8 : 0;
+}
+
 static rtx pic_label_rtx;
 static char pic_label_name [256];
 static int pic_label_no = 0;
@@ -2086,7 +2168,7 @@ ix86_prologue (do_rtl)
   rtx xops[4];
   int pic_reg_used = flag_pic && (current_function_uses_pic_offset_table
 				  || current_function_uses_const_pool);
-  long tsize = get_frame_size ();
+  long tsize = ix86_frame_size ();
   rtx insn;
   int cfa_offset = INCOMING_FRAME_SP_OFFSET, cfa_store_offset = cfa_offset;
 
@@ -2305,7 +2387,7 @@ ix86_epilogue (do_rtl)
   int pic_reg_used = flag_pic && (current_function_uses_pic_offset_table
 				  || current_function_uses_const_pool);
   int sp_valid = !frame_pointer_needed || current_function_sp_is_unchanging;
-  long tsize = get_frame_size ();
+  long tsize = ix86_frame_size ();
 
   /* Compute the number of registers to pop */
 
Index: config/i386/i386.h
===================================================================
RCS file: /usr/local/cvs/gcs/gcc/config/i386/i386.h,v
retrieving revision 1.1.1.37
diff -u -p -u -r1.1.1.37 i386.h
--- config/i386/i386.h	1998/11/28 10:29:50	1.1.1.37
+++ config/i386/i386.h	1998/11/28 10:40:41
@@ -96,6 +96,8 @@ extern int target_flags;
 #define MASK_DEBUG_ARG		000020000000	/* Debug function_arg */   
 #define MASK_SCHEDULE_PROLOGUE  000040000000    /* Emit prologue as rtl */
 #define MASK_STACK_PROBE	000100000000	/* Enable stack probing */
+#define MASK_ALIGN_STACK_DOUBLE	000200000000	/* Align doubles on the stack */
+#define MASK_ALIGN_STACK_LDBL	000400000000	/* Align long doubles on the stack */
 
 /* Use the floating point instructions */
 #define TARGET_80387 (target_flags & MASK_80387)
@@ -110,6 +112,16 @@ extern int target_flags;
    faster code on the pentium.  */
 #define TARGET_ALIGN_DOUBLE (target_flags & MASK_ALIGN_DOUBLE)
 
+/* Align doubles to a two word boundary on the stack.  This option by itself
+   does not break binary compatibility.  It is implied by
+   TARGET_ALIGN_DOUBLE.  */
+#define TARGET_ALIGN_STACK_DOUBLE ((target_flags & MASK_ALIGN_STACK_DOUBLE) \
+				   || TARGET_ALIGN_DOUBLE)
+
+/* Align long doubles to a four word boundary on the stack.  This option by
+   itself does not break binary compatibility.  */
+#define TARGET_ALIGN_STACK_LDBL (target_flags & MASK_ALIGN_STACK_LDBL)
+
 /* Put uninitialized locals into bss, not data.
    Meaningful only on svr3.  */
 #define TARGET_SVR3_SHLIB (target_flags & MASK_SVR3_SHLIB)
@@ -192,6 +204,10 @@ extern const int x86_double_with_add;
   { "no-rtd",			-MASK_RTD, "Use normal calling convention" },\
   { "align-double",		 MASK_ALIGN_DOUBLE, "Align some doubles on dword boundary" },\
   { "no-align-double",		-MASK_ALIGN_DOUBLE, "Align doubles on word boundary" },		\
+  { "align-stack-double",	 MASK_ALIGN_STACK_DOUBLE, "Align doubles on the stack on dword boundary" },\
+  { "no-align-stack-double",	-MASK_ALIGN_STACK_DOUBLE, "Align doubles on the stack on word boundary" },		\
+  { "align-stack-ldouble",	 MASK_ALIGN_STACK_LDBL, "Align long doubles on the stack on qword boundary" },\
+  { "no-align-stack-ldouble",	-MASK_ALIGN_STACK_LDBL, "Align long doubles on the stack on word boundary" },		\
   { "svr3-shlib",		 MASK_SVR3_SHLIB, "Uninitialized locals in .bss"  },			\
   { "no-svr3-shlib",		-MASK_SVR3_SHLIB, "Uninitialized locals in .data" },			\
   { "ieee-fp",			 MASK_IEEE_FP, "Use IEEE math for fp comparisons" },	\
@@ -410,6 +426,11 @@ extern int ix86_arch;
 /* Boundary (in *bits*) on which stack pointer should be aligned.  */
 #define STACK_BOUNDARY 32
 
+/* Like STACK_BOUNDARY, but used only for laying out the stack, not for
+   optimizations based on knowledge about the alignment of the stack.  */
+#define PREFERRED_STACK_BOUNDARY (TARGET_ALIGN_STACK_LDBL ? 128 \
+				  : TARGET_ALIGN_STACK_DOUBLE ? 64 : 32)
+
 /* Allocation boundary (in *bits*) for the code of a function.
    For i486, we get better performance by aligning to a cache
    line (i.e. 16 byte) boundary.  */
@@ -427,8 +448,13 @@ extern int ix86_arch;
    The published ABIs say that doubles should be aligned on word
    boundaries, but the Pentium gets better performance with them
    aligned on 64 bit boundaries. */
-#define BIGGEST_ALIGNMENT (TARGET_ALIGN_DOUBLE ? 64 : 32)
+#define BIGGEST_ALIGNMENT (TARGET_ALIGN_STACK_LDBL ? 128		\
+			   : (TARGET_ALIGN_DOUBLE			\
+			      || TARGET_ALIGN_STACK_DOUBLE ? 64 : 32))
 
+/* Biggest alignment any structure field can require in bits.  */
+#define BIGGEST_FIELD_ALIGNMENT (TARGET_ALIGN_DOUBLE ? 64 : 32)
+
 /* If defined, a C expression to compute the alignment given to a
    constant that is being placed in memory.  CONSTANT is the constant
    and ALIGN is the alignment that the object would ordinarily have.
@@ -984,7 +1010,7 @@ enum reg_class
    If FRAME_GROWS_DOWNWARD, this is the offset to the END of the
    first local allocated.  Otherwise, it is the offset to the BEGINNING
    of the first local allocated.  */
-#define STARTING_FRAME_OFFSET 0
+#define STARTING_FRAME_OFFSET (ix86_starting_frame_offset ())
 
 /* If we generate an insn to push BYTES bytes,
    this says how many the stack pointer really advances by.
@@ -1585,23 +1611,10 @@ do {						\
 {									\
   if ((FROM) == ARG_POINTER_REGNUM && (TO) == FRAME_POINTER_REGNUM)	\
     (OFFSET) = 8;	/* Skip saved PC and previous frame pointer */	\
+  else if ((FROM) == ARG_POINTER_REGNUM && (TO) == STACK_POINTER_REGNUM)\
+    (OFFSET) = ix86_sp_offset (1);					\
   else									\
-    {									\
-      int regno;							\
-      int offset = 0;							\
-									\
-      for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)		\
-	if ((regs_ever_live[regno] && ! call_used_regs[regno])		\
-	    || ((current_function_uses_pic_offset_table			\
-		 || current_function_uses_const_pool)			\
-		&& flag_pic && regno == PIC_OFFSET_TABLE_REGNUM))	\
-	  offset += 4;							\
-									\
-      (OFFSET) = offset + get_frame_size ();				\
-									\
-      if ((FROM) == ARG_POINTER_REGNUM && (TO) == STACK_POINTER_REGNUM)	\
-	(OFFSET) += 4;	/* Skip saved PC */				\
-    }									\
+    (OFFSET) = ix86_sp_offset (0);					\
 }
 
 /* Addressing modes, and classification of registers for them.  */
@@ -2718,6 +2731,8 @@ extern int reg_mentioned_in_mem ();
 extern char *output_int_conditional_move ();
 extern char *output_fp_conditional_move ();
 extern int ix86_can_use_return_insn_p ();
+extern int ix86_sp_offset ();
+extern int ix86_starting_frame_offset ();
 
 #ifdef NOTYET
 extern struct rtx_def *copy_all_rtx ();



Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]