K8 Tweak 3

Jan Hubicka jh@suse.cz
Wed Jun 4 08:09:00 GMT 2003


Hi,
K8 and Athlon mispredict jumps when there are more than 3 jumps in a 16 byte
window.  This brings occasional large regressions that are difficult to
analyze.  The attached patch solves it by inserting an alignment in front of
any jump that may otherwise end up as the fourth jump in such a window.
It causes about 0.1% code size growth and fixes 2 such occurrences in hot
spots of SPEC2000 tests on the hammer branch (I didn't re-check whether the
same scenario matches on mainline).
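
To make the sliding window easier to follow outside of RTL, here is a small
standalone model of the logic (just an illustration, not part of the patch;
the instruction stream and sizes are invented):

#include <stdio.h>

/* Toy instruction stream: SIZE is the estimated size in bytes, JUMP is
   nonzero for jumps and calls.  */
struct insn { int size; int jump; };

int
main (void)
{
  struct insn s[] = { {2, 1}, {1, 0}, {2, 1}, {5, 1}, {1, 0}, {2, 1}, {3, 0} };
  int n = sizeof (s) / sizeof (s[0]);
  int start = -1;   /* The window is s[start + 1] .. s[i].  */
  int nbytes = 0, njumps = 0, isjump = 0;
  int i;

  for (i = 0; i < n; i++)
    {
      nbytes += s[i].size;
      if (!s[i].jump)
        continue;
      njumps++;

      /* Shrink the window from the left until it holds at most 3 jumps,
         remembering whether the last insn pushed out was a jump.  */
      while (njumps > 3)
        {
          start++;
          nbytes -= s[start].size;
          isjump = s[start].jump;
          if (isjump)
            njumps--;
        }

      /* The three jumps still in the window plus the jump just pushed out
         could all land in one 16 byte window; request padding in front of
         s[i], the jump that would be fourth.  */
      if (njumps == 3 && isjump && nbytes < 16)
        printf ("pad insn %d by %d bytes\n", i, 15 - nbytes + s[i].size);
    }
  return 0;
}

It prints "pad insn 5 by 6 bytes": insns 1 through 5 span 11 bytes, so
together with the jump just pushed out of the window the four jumps could
share one 16 byte window, and 15 - 11 + 2 = 6 bytes of padding push the
last jump past the boundary.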

Bootstrapped/regtested athlon-linux.  OK?

Honza

Wed Jun  4 10:06:44 CEST 2003  Jan Hubicka  <jh@suse.cz>
	* i386.c (min_insn_size, k8_avoid_jump_misspredicts): New functions.
	(ix86_reorg): Use k8_avoid_jump_misspredicts.
	* i386.md (UNSPECV_ALIGN): New constant.
	(align): New insn.
Index: i386.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.c,v
retrieving revision 1.567
diff -c -3 -p -r1.567 i386.c
*** i386.c	4 Jun 2003 07:50:27 -0000	1.567
--- i386.c	4 Jun 2003 08:06:26 -0000
*************** static bool ix86_ms_bitfield_layout_p PA
*** 878,883 ****
--- 878,885 ----
  static tree ix86_handle_struct_attribute PARAMS ((tree *, tree, tree, int, bool *));
  static int extended_reg_mentioned_1 PARAMS ((rtx *, void *));
  static bool ix86_rtx_costs PARAMS ((rtx, int, int, int *));
+ static int min_insn_size PARAMS ((rtx));
+ static void k8_avoid_jump_misspredicts PARAMS ((void));
  
  #if defined (DO_GLOBAL_CTORS_BODY) && defined (HAS_INIT_SECTION)
  static void ix86_svr3_asm_out_constructor PARAMS ((rtx, int));
*************** x86_function_profiler (file, labelno)
*** 15526,15531 ****
--- 15528,15652 ----
      }
  }
  
+ /* We don't have exact information about the insn sizes, but we may assume
+    quite safely that we are informed about all 1 byte insns and memory
+    address sizes.  This is enough to eliminate unnecessary padding in
+    99% of cases.  */
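+ 
+ /* For example (estimates only, not exact encodings): a one byte insn
+    such as "ret" counts as 1, a direct call as 5, and an insn with a
+    symbolic memory operand as at least 5 (1 opcode + 4 address bytes).  */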
+ 
+ static int
+ min_insn_size (insn)
+      rtx insn;
+ {
+   int l = 0;
+ 
+   if (!INSN_P (insn) || !active_insn_p (insn))
+     return 0;
+ 
+   /* Discard alignments we've emitted and jump tables.  */
+   if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
+       && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
+     return 0;
+   if (GET_CODE (insn) == JUMP_INSN
+       && (GET_CODE (PATTERN (insn)) == ADDR_VEC
+ 	  || GET_CODE (PATTERN (insn)) == ADDR_DIFF_VEC))
+     return 0;
+ 
+   /* Important case - direct calls are always 5 bytes (1 opcode byte plus
+      a 4 byte relative offset).  It is common to have many calls in a row.  */
+   if (GET_CODE (insn) == CALL_INSN
+       && symbolic_reference_mentioned_p (PATTERN (insn))
+       && !SIBLING_CALL_P (insn))
+     return 5;
+   if (get_attr_length (insn) <= 1)
+     return 1;
+ 
+   /* For normal instructions we may rely on the sizes of addresses and on
+      the presence of a symbol to require 4 bytes of encoding.  This is not
+      the case for jumps, where references are PC relative.  */
+   if (GET_CODE (insn) != JUMP_INSN)
+     {
+       l = get_attr_length_address (insn);
+       if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
+ 	l = 4;
+     }
+   if (l)
+     return 1+l;
+   else
+     return 2;
+ }
+ 
+ /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in
+    a 16 byte window.  */
+ 
+ static void
+ k8_avoid_jump_misspredicts ()
+ {
+   rtx insn, start = get_insns ();
+   int nbytes = 0, njumps = 0;
+   int isjump = 0;
+ 
+   /* Look for all minimal intervals of instructions containing 4 jumps.
+      The intervals are bounded by START and INSN.  NBYTES is the total
+      size of instructions in the interval including INSN and not including
+      START.  When NBYTES is smaller than 16, it is possible that the
+      ends of START and INSN land in the same 16 byte window.
+ 
+      The smallest offset in the window at which INSN can start occurs
+      when START ends at offset 0.  The offset of INSN is then
+      NBYTES - sizeof (INSN), so we emit a p2align to a 16 byte boundary
+      with a maxskip of 15 - NBYTES + sizeof (INSN).  */
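+ 
+   /* For instance (sizes hypothetical): if the interval spans NBYTES == 11
+      bytes and INSN is a 2 byte jump, we request 15 - 11 + 2 = 6 bytes of
+      padding in front of INSN.  */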
+   for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
+     {
+ 
+       nbytes += min_insn_size (insn);
+       if (rtl_dump_file)
+         fprintf(stderr,"Insn %i estimated to %i bytes\n",
+ 		INSN_UID (insn), min_insn_size (insn));
+       if ((GET_CODE (insn) == JUMP_INSN
+ 	   && GET_CODE (PATTERN (insn)) != ADDR_VEC
+ 	   && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
+ 	  || GET_CODE (insn) == CALL_INSN)
+ 	njumps++;
+       else
+ 	continue;
+ 
+       while (njumps > 3)
+ 	{
+ 	  start = NEXT_INSN (start);
+ 	  if ((GET_CODE (start) == JUMP_INSN
+ 	       && GET_CODE (PATTERN (start)) != ADDR_VEC
+ 	       && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
+ 	      || GET_CODE (start) == CALL_INSN)
+ 	    njumps--, isjump = 1;
+ 	  else
+ 	    isjump = 0;
+ 	  nbytes -= min_insn_size (start);
+ 	}
+       if (njumps < 0)
+ 	abort ();
+       if (rtl_dump_file)
+         fprintf(stderr,"Interval %i to %i has %i bytes\n",
+ 		INSN_UID (start), INSN_UID (insn), nbytes);
+ 
+       if (njumps == 3 && isjump && nbytes < 16)
+ 	{
+ 	  int padsize = 15 - nbytes + min_insn_size (insn);
+ 
+ 	  if (rtl_dump_file)
+ 	    fprintf (rtl_dump_file, "Padding insn %i by %i bytes!\n", INSN_UID (insn), padsize);
+           emit_insn_before (gen_align (GEN_INT (padsize)), insn);
+ 	}
+     }
+ }
+ 
  /* Implement machine specific optimizations.  
     At the moment we implement single transformation: AMD Athlon works faster
     when RET is not destination of conditional jump or directly preceded
*************** ix86_reorg ()
*** 15577,15582 ****
--- 15698,15704 ----
  	delete_insn (ret);
        }
    }
+   k8_avoid_jump_misspredicts ();
  }
  
  /* Return nonzero when QImode register that must be represented via REX prefix
Index: i386.md
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.md,v
retrieving revision 1.462
diff -c -3 -p -r1.462 i386.md
*** i386.md	4 Jun 2003 07:50:27 -0000	1.462
--- i386.md	4 Jun 2003 08:06:34 -0000
***************
*** 126,131 ****
--- 126,132 ----
     (UNSPECV_STMXCSR		40)
     (UNSPECV_FEMMS		46)
     (UNSPECV_CLFLUSH		57)
+    (UNSPECV_ALIGN		68)
    ])
  
  ;; Insns whose names begin with "x86_" are emitted by gen_FOO calls
***************
*** 14277,14282 ****
--- 14278,14303 ----
     (set_attr "length_immediate" "0")
     (set_attr "modrm" "0")
     (set_attr "ppro_uops" "one")])
+ 
+ ;; UNSPEC_VOLATILE is considered to use and clobber all hard registers and
+ ;; all of memory.  This blocks insns from being moved across this point.
+ 
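+ ;; The operand becomes the max-skip argument of .p2align below: align
+ ;; to a 16 byte boundary only when at most that many bytes of padding
+ ;; are needed; otherwise the assembler emits no padding at all.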
+ (define_insn "align"
+   [(unspec_volatile [(match_operand 0 "" "")] UNSPECV_ALIGN)]
+   ""
+ {
+ #ifdef HAVE_GAS_MAX_SKIP_P2ALIGN
+   return ".p2align\t4,,%c0";
+ #else
+   /* Without the max-skip argument the full 15 bytes of padding could
+      cost more than the mispredict it avoids, so emit nothing.  */
+   return "";
+ #endif
+ }
+   [(set_attr "length" "16")])
  
  (define_expand "prologue"
    [(const_int 1)]


