PATCH: Pad short functions for Atom

H.J. Lu hongjiu.lu@intel.com
Mon Sep 13 16:01:00 GMT 2010


Hi,

On Atom, the function return address isn't ready until 4 cycles after
the function is entered.  This patch assumes that most instructions
in short functions take 1 cycle and pads short functions to 4 cycles,
from function entrance to where the return address is consumed, with
NOPs, each of which takes half a cycle.  OK for trunk?

Thanks.


H.J.
---
gcc/

2010-09-13  H.J. Lu  <hongjiu.lu@intel.com>

	* config/i386/i386.c (block_info): New.
	(BLOCK_INFO): Likewise.
	(ix86_count_insn): Likewise.
	(initial_ix86_tune_features): Add X86_TUNE_PAD_SHORT_FUNCTION.
	(ix86_code_end): Pad with 8 NOPs for TARGET_PAD_SHORT_FUNCTION.
	(ix86_pad_short_function): New.
	(ix86_reorg): Support TARGET_PAD_SHORT_FUNCTION.

	* config/i386/i386.h (ix86_tune_indices): Add
	X86_TUNE_PAD_SHORT_FUNCTION.
	(TARGET_PAD_SHORT_FUNCTION): New.

	* config/i386/i386.md (UNSPEC_RETURN_NOPS): New.
	(return_nops): Likewise.

gcc/testsuite/

2010-09-13  H.J. Lu  <hongjiu.lu@intel.com>

	* gcc.target/i386/pad-1.c: New.
	* gcc.target/i386/pad-2.c: Likewise.
	* gcc.target/i386/pad-3.c: Likewise.
	* gcc.target/i386/pad-4.c: Likewise.
	* gcc.target/i386/pad-5a.c: Likewise.
	* gcc.target/i386/pad-5b.c: Likewise.
	* gcc.target/i386/pad-6a.c: Likewise.
	* gcc.target/i386/pad-6b.c: Likewise.
	* gcc.target/i386/pad-7.c: Likewise.
	* gcc.target/i386/pad-8.c: Likewise.
	* gcc.target/i386/pad-9.c: Likewise.

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 1d79a18..14e05cc 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -56,6 +56,17 @@ along with GCC; see the file COPYING3.  If not see
 #include "debug.h"
 #include "dwarf2out.h"
 #include "sched-int.h"
+
+typedef struct block_info_def
+{
+  /* Instruction count cached for this block by ix86_count_insn.  */
+  int value;
+  /* TRUE once ix86_count_insn has started processing this block.  */
+  bool done;
+} *block_info;
+
+#define BLOCK_INFO(B)   ((block_info) (B)->aux)
+
 static rtx legitimize_dllimport_symbol (rtx, bool);
 
 #ifndef CHECK_STACK_LIMIT
@@ -1576,6 +1587,9 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
   /* X86_TUNE_PAD_RETURNS */
   m_AMD_MULTIPLE | m_CORE2 | m_GENERIC,
 
+  /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short function.  */
+  m_ATOM,
+
   /* X86_TUNE_EXT_80387_CONSTANTS */
   m_K6_GEODE | m_ATHLON_K8 | m_ATOM | m_PENT4 | m_NOCONA | m_PPRO
   | m_CORE2 | m_GENERIC,
@@ -8013,6 +8027,11 @@ ix86_code_end (void)
 
       xops[0] = gen_rtx_REG (Pmode, regno);
       xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
+      /* Pad stack IP move with 4 instructions.  2 NOPs count as 1
+         instruction.  */
+      if (TARGET_PAD_SHORT_FUNCTION)
+	output_asm_insn (".byte 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, "
+			 "0x90, 0x90", xops);
       output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
       output_asm_insn ("ret", xops);
       final_end_function ();
@@ -27768,6 +27787,95 @@ ix86_pad_returns (void)
     }
 }
 
+/* Return the minimum number of instructions on any code path ending in
+   BB.  Results >= 4 all mean "long enough"; callers only test < 4.  */
+
+static int
+ix86_count_insn (basic_block bb)
+{
+  edge e;
+  edge_iterator ei;
+  rtx insn;
+  int insn_count;
+  int pred_insn_count;
+
+  /* Reuse the cached count if this block was already visited.  DONE is
+     set before VALUE is final, so a cycle reads the in-progress VALUE.  */
+  if (BLOCK_INFO (bb)->done)
+    return BLOCK_INFO (bb)->value;
+
+  BLOCK_INFO (bb)->done = true;
+
+  /* Count the instructions in this block, ignoring USE and CLOBBER
+     patterns.  Stop as soon as 4 are found.  */
+  insn_count = 0;
+  FOR_BB_INSNS (bb, insn)
+    {
+      /* A return ends the path; do not count it.  */
+      if (JUMP_P (insn)
+	  && GET_CODE (PATTERN (insn)) == RETURN)
+	break;
+
+      if (NONDEBUG_INSN_P (insn)
+	  && GET_CODE (PATTERN (insn)) != USE
+	  && GET_CODE (PATTERN (insn)) != CLOBBER)
+	{
+	  insn_count++;
+	  if (insn_count >= 4)
+	    {
+	      BLOCK_INFO (bb)->value = insn_count;
+	      return insn_count;
+	    }
+	}
+    }
+
+  /* Fewer than 4 instructions here: add the minimum over all predecessors.
+     -1 means "no predecessor seen yet", so a real count of 0 is kept.  */
+  pred_insn_count = -1;
+  FOR_EACH_EDGE (e, ei, bb->preds)
+    {
+      int count = ix86_count_insn (e->src);
+      if (pred_insn_count < 0 || count < pred_insn_count)
+	pred_insn_count = count;
+    }
+  if (pred_insn_count < 0)
+    pred_insn_count = 0;
+
+  BLOCK_INFO (bb)->value = pred_insn_count + insn_count;
+  return pred_insn_count + insn_count;
+}
+
+/* Pad every short return path by replacing its return with return_nops.  */
+
+static void
+ix86_pad_short_function (void)
+{
+  edge e;
+  edge_iterator ei;
+
+  /* Set up block info for each basic block.  */
+  alloc_aux_for_blocks (sizeof (struct block_info_def));
+
+  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
+    {
+      rtx ret = BB_END (e->src);
+      if (JUMP_P (ret) && GET_CODE (PATTERN (ret)) == RETURN)
+	{
+	  int insn_count = ix86_count_insn (e->src);
+
+	  /* Fewer than 4 instructions reach this return: pad it.  */
+	  if (insn_count < 4)
+	    {
+	      emit_jump_insn_before (gen_return_nops (GEN_INT (insn_count)),
+				     ret);
+	      delete_insn (ret);
+	    }
+	}
+    }
+
+  free_aux_for_blocks ();
+}
+
 /* Implement machine specific optimizations.  We implement padding of returns
    for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
 static void
@@ -27775,7 +27883,9 @@ ix86_reorg (void)
 {
   if (optimize && optimize_function_for_speed_p (cfun))
     {
-      if (TARGET_PAD_RETURNS)
+      if (TARGET_PAD_SHORT_FUNCTION)
+	ix86_pad_short_function ();
+      else if (TARGET_PAD_RETURNS)
 	ix86_pad_returns ();
 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
       if (TARGET_FOUR_JUMP_LIMIT)
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 91238d5..2738c5d 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -299,6 +299,7 @@ enum ix86_tune_indices {
   X86_TUNE_USE_BT,
   X86_TUNE_USE_INCDEC,
   X86_TUNE_PAD_RETURNS,
+  X86_TUNE_PAD_SHORT_FUNCTION,
   X86_TUNE_EXT_80387_CONSTANTS,
   X86_TUNE_SHORTEN_X87_SSE,
   X86_TUNE_AVOID_VECTOR_DECODE,
@@ -384,6 +385,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 #define TARGET_USE_BT		ix86_tune_features[X86_TUNE_USE_BT]
 #define TARGET_USE_INCDEC	ix86_tune_features[X86_TUNE_USE_INCDEC]
 #define TARGET_PAD_RETURNS	ix86_tune_features[X86_TUNE_PAD_RETURNS]
+#define TARGET_PAD_SHORT_FUNCTION \
+	ix86_tune_features[X86_TUNE_PAD_SHORT_FUNCTION]
 #define TARGET_EXT_80387_CONSTANTS \
 	ix86_tune_features[X86_TUNE_EXT_80387_CONSTANTS]
 #define TARGET_SHORTEN_X87_SSE	ix86_tune_features[X86_TUNE_SHORTEN_X87_SSE]
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 9780eef..c235f56 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -101,6 +101,7 @@
   UNSPEC_ADD_CARRY
   UNSPEC_FLDCW
   UNSPEC_REP
+  UNSPEC_RETURN_NOPS
   UNSPEC_LD_MPIC	; load_macho_picbase
   UNSPEC_TRUNC_NOOP
 
@@ -11370,6 +11371,34 @@
    (set_attr "prefix_rep" "1")
    (set_attr "modrm" "0")])
 
+;; Used by ix86_pad_short_function to avoid the Atom penalty on short
+;; functions.  Operand 0 is the number of instructions before return.
+;; Pad up to 4 instructions.  Two NOPs count as one instruction.
+
+(define_insn "return_nops"
+  [(return)
+   (unspec [(match_operand 0 "const_int_operand" "")]
+	   UNSPEC_RETURN_NOPS)]
+  "reload_completed"
+{
+  switch (INTVAL (operands[0]))
+    {
+    case 3:
+      return ".byte 0x90, 0x90\;ret";
+    case 2:
+      return ".byte 0x90, 0x90, 0x90, 0x90\;ret";
+    case 1:
+      return ".byte 0x90, 0x90, 0x90, 0x90, 0x90, 0x90\;ret";
+    case 0:
+      return ".byte 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90\;ret";
+    default:
+      gcc_unreachable ();
+      break;
+  }
+}
+  [(set_attr "length" "8")
+   (set_attr "atom_unit" "jeu")])
+
 (define_insn "return_pop_internal"
   [(return)
    (use (match_operand:SI 0 "const_int_operand" ""))]
diff --git a/gcc/testsuite/gcc.target/i386/pad-1.c b/gcc/testsuite/gcc.target/i386/pad-1.c
new file mode 100644
index 0000000..d5831b7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-1.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fomit-frame-pointer -mtune=generic -S" } */
+/* { dg-final { scan-assembler "rep" } } */
+/* { dg-final { scan-assembler-not ".byte 0x90" } } */
+
+void
+foo ()
+{
+}
diff --git a/gcc/testsuite/gcc.target/i386/pad-2.c b/gcc/testsuite/gcc.target/i386/pad-2.c
new file mode 100644
index 0000000..6ff6103
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-2.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */
+/* { dg-final { scan-assembler ".byte 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90" } } */
+/* { dg-final { scan-assembler-not "rep" } } */
+
+void
+foo ()
+{
+}
diff --git a/gcc/testsuite/gcc.target/i386/pad-3.c b/gcc/testsuite/gcc.target/i386/pad-3.c
new file mode 100644
index 0000000..9fa3766
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-3.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */
+/* { dg-final { scan-assembler-not ".byte 0x90" } } */
+/* { dg-final { scan-assembler-not "rep" } } */
+
+int s[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+int d[8] = {11, 22, 33, 44, 55, 66, 77, 88};
+
+void
+foo ()
+{
+  int i;
+  for (i = 0; i < 8; i++)
+    d[i] = s[i] + 0x1000;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pad-4.c b/gcc/testsuite/gcc.target/i386/pad-4.c
new file mode 100644
index 0000000..15d50fa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-4.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target ilp32 } */
+/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S -fPIC" } */
+/* { dg-final { scan-assembler-times ".byte 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90" 1 } } */
+/* { dg-final { scan-assembler-not "rep" } } */
+
+extern int bar;
+
+int
+foo ()
+{
+  return bar;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pad-5a.c b/gcc/testsuite/gcc.target/i386/pad-5a.c
new file mode 100644
index 0000000..f997ef8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-5a.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target ilp32 } */
+/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */
+/* { dg-final { scan-assembler ".byte 0x90, 0x90" } } */
+/* { dg-final { scan-assembler-not ".byte 0x90, 0x90, 0x90" } } */
+/* { dg-final { scan-assembler-not "rep" } } */
+
+int
+foo (int x, int y, int z)
+{
+   return x + y + z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pad-5b.c b/gcc/testsuite/gcc.target/i386/pad-5b.c
new file mode 100644
index 0000000..e167319
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-5b.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */
+/* { dg-final { scan-assembler ".byte 0x90, 0x90, 0x90, 0x90" } } */
+/* { dg-final { scan-assembler-not ".byte 0x90, 0x90, 0x90, 0x90, 0x90" } } */
+/* { dg-final { scan-assembler-not "rep" } } */
+
+int
+foo (int x, int y, int z)
+{
+   return x + y + z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pad-6a.c b/gcc/testsuite/gcc.target/i386/pad-6a.c
new file mode 100644
index 0000000..bfcb41e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-6a.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target ilp32 } */
+/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */
+/* { dg-final { scan-assembler ".byte 0x90, 0x90, 0x90, 0x90" } } */
+/* { dg-final { scan-assembler-not ".byte 0x90, 0x90, 0x90, 0x90, 0x90" } } */
+/* { dg-final { scan-assembler-not "rep" } } */
+
+int
+foo (int x, int y)
+{
+   return x + y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pad-6b.c b/gcc/testsuite/gcc.target/i386/pad-6b.c
new file mode 100644
index 0000000..c04d0e4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-6b.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */
+/* { dg-final { scan-assembler ".byte 0x90, 0x90, 0x90, 0x90, 0x90, 0x90" } } */
+/* { dg-final { scan-assembler-not ".byte 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90" } } */
+/* { dg-final { scan-assembler-not "rep" } } */
+
+int
+foo (int x, int y)
+{
+   return x + y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pad-7.c b/gcc/testsuite/gcc.target/i386/pad-7.c
new file mode 100644
index 0000000..02e3c3b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-7.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target ilp32 } */
+/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */
+/* { dg-final { scan-assembler-not ".byte 0x90" } } */
+/* { dg-final { scan-assembler-not "rep" } } */
+
+int
+foo (int x, int y, int z)
+{
+   return x + y + z + y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pad-8.c b/gcc/testsuite/gcc.target/i386/pad-8.c
new file mode 100644
index 0000000..645995a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-8.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */
+/* { dg-final { scan-assembler ".byte 0x90, 0x90, 0x90, 0x90, 0x90, 0x90" } } */
+/* { dg-final { scan-assembler-not ".byte 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90" } } */
+/* { dg-final { scan-assembler-not "rep" } } */
+
+int
+foo (int x, int y)
+{
+   return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pad-9.c b/gcc/testsuite/gcc.target/i386/pad-9.c
new file mode 100644
index 0000000..1547260
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pad-9.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */
+/* { dg-final { scan-assembler ".byte 0x90, 0x90, 0x90, 0x90" } } */
+/* { dg-final { scan-assembler-not ".byte 0x90, 0x90, 0x90, 0x90, 0x90" } } */
+/* { dg-final { scan-assembler-not "rep" } } */
+
+extern void bar (void);
+
+void
+foo (int x)
+{
+  if (x)
+    bar ();
+}



More information about the Gcc-patches mailing list