This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[Committed] S/390: Fix jump prediction for long loops on z10


Hi,

on z10 the dynamic branch prediction is only able to deal with a
limited loop length of 384 bytes.  Everything beyond that falls back
to static prediction for some reason.  For most of the loops we
currently use a compare and a normal conditional jump. Unfortunately,
conditional jumps are statically predicted as "not taken", which makes
the backward jump in long loops constantly mispredicted.

The attached patch addresses cases which haven't been handled by using
a branch on index or branch on count instruction.  If such a loop is
found the conditional jump is inverted and used to jump over an
unconditional jump which jumps backwards.

In order to be able to easily invert a jump condition I had to add
two new patterns for the compare and branch instructions introduced with
z10.  All the existing conditional jump patterns already came with an
inverted variant.

Bootstrapped and regtested on s390 and s390x.

Committed to mainline and GCC 4.4 branch since this fixes a major
performance problem on z10.

Bye,

-Andreas-


2009-08-20  Andreas Krebbel  <krebbel1@de.ibm.com>

	* config/s390/s390.c (Z10_PREDICT_DISTANCE): New macro.
	(s390_z10_fix_long_loop_prediction): New function.
	(s390_z10_optimize_cmp): INSN walk moved to callee - s390_reorg.
	(s390_reorg): Walk over the INSNs and invoke
	s390_z10_fix_long_loop_prediction and s390_z10_optimize_cmp.


Index: gcc/config/s390/s390.c
===================================================================
*** gcc/config/s390/s390.c.orig	2009-08-17 09:29:17.000000000 +0200
--- gcc/config/s390/s390.c	2009-08-19 18:32:32.000000000 +0200
*************** struct GTY(()) machine_function
*** 345,350 ****
--- 345,354 ----
  #define REGNO_PAIR_OK(REGNO, MODE)                               \
    (HARD_REGNO_NREGS ((REGNO), (MODE)) == 1 || !((REGNO) & 1))
  
+ /* That's the read ahead of the dynamic branch prediction unit in
+    bytes on a z10 CPU.  */
+ #define Z10_PREDICT_DISTANCE 384
+ 
  static enum machine_mode
  s390_libgcc_cmp_return_mode (void)
  {
*************** s390_optimize_prologue (void)
*** 9651,9656 ****
--- 9655,9720 ----
      }
  }
  
+ /* On z10 the dynamic branch prediction must see the backward jump in
+    a window of 384 bytes. If not it falls back to the static
+    prediction.  This function rearranges the loop backward branch in a
+    way which makes the static prediction always correct.  The function
+    returns true if it added an instruction.  */
+ static bool
+ s390_z10_fix_long_loop_prediction (rtx insn)
+ {
+   rtx set = single_set (insn);
+   rtx code_label, label_ref, new_label;
+   rtx uncond_jump;
+   rtx cur_insn;
+   rtx tmp;
+   int distance;
+ 
+   /* This will exclude branch on count and branch on index patterns
+      since these are correctly statically predicted.  */
+   if (!set
+       || SET_DEST (set) != pc_rtx
+       || GET_CODE (SET_SRC(set)) != IF_THEN_ELSE)
+     return false;
+ 
+   label_ref = (GET_CODE (XEXP (SET_SRC (set), 1)) == LABEL_REF ?
+ 	       XEXP (SET_SRC (set), 1) : XEXP (SET_SRC (set), 2));
+ 
+   gcc_assert (GET_CODE (label_ref) == LABEL_REF);
+ 
+   code_label = XEXP (label_ref, 0);
+ 
+   if (INSN_ADDRESSES (INSN_UID (code_label)) == -1
+       || INSN_ADDRESSES (INSN_UID (insn)) == -1
+       || (INSN_ADDRESSES (INSN_UID (insn))
+ 	  - INSN_ADDRESSES (INSN_UID (code_label)) < Z10_PREDICT_DISTANCE))
+     return false;
+ 
+   for (distance = 0, cur_insn = PREV_INSN (insn);
+        distance < Z10_PREDICT_DISTANCE - 6;
+        distance += get_attr_length (cur_insn), cur_insn = PREV_INSN (cur_insn))
+     if (!cur_insn || JUMP_P (cur_insn) || LABEL_P (cur_insn))
+       return false;
+ 
+   new_label = gen_label_rtx ();
+   uncond_jump = emit_jump_insn_after (
+ 		  gen_rtx_SET (VOIDmode, pc_rtx,
+ 			       gen_rtx_LABEL_REF (VOIDmode, code_label)),
+ 		  insn);
+   emit_label_after (new_label, uncond_jump);
+ 
+   tmp = XEXP (SET_SRC (set), 1);
+   XEXP (SET_SRC (set), 1) = XEXP (SET_SRC (set), 2);
+   XEXP (SET_SRC (set), 2) = tmp;
+   INSN_CODE (insn) = -1;
+ 
+   XEXP (label_ref, 0) = new_label;
+   JUMP_LABEL (insn) = new_label;
+   JUMP_LABEL (uncond_jump) = code_label;
+ 
+   return true;
+ }
+ 
  /* Returns 1 if INSN reads the value of REG for purposes not related
     to addressing of memory, and 0 otherwise.  */
  static int
*************** s390_swap_cmp (rtx cond, rtx *op0, rtx *
*** 9733,9829 ****
     if that register's value is delivered via a bypass, then the
     pipeline recycles, thereby causing significant performance decline.
     This function locates such situations and exchanges the two
!    operands of the compare.  */
! static void
! s390_z10_optimize_cmp (void)
  {
!   rtx insn, prev_insn, next_insn;
!   int added_NOPs = 0;
  
!   for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
      {
!       rtx cond, *op0, *op1;
! 
!       if (!INSN_P (insn) || INSN_CODE (insn) <= 0)
! 	continue;
! 
!       if (GET_CODE (PATTERN (insn)) == PARALLEL)
! 	{
! 	  /* Handle compare and branch and branch on count
! 	     instructions.  */
! 	  rtx pattern = single_set (insn);
! 
! 	  if (!pattern
! 	      || SET_DEST (pattern) != pc_rtx
! 	      || GET_CODE (SET_SRC (pattern)) != IF_THEN_ELSE)
! 	    continue;
  
! 	  cond = XEXP (SET_SRC (pattern), 0);
! 	  op0 = &XEXP (cond, 0);
! 	  op1 = &XEXP (cond, 1);
! 	}
!       else if (GET_CODE (PATTERN (insn)) == SET)
! 	{
! 	  rtx src, dest;
  
! 	  /* Handle normal compare instructions.  */
! 	  src = SET_SRC (PATTERN (insn));
! 	  dest = SET_DEST (PATTERN (insn));
  
! 	  if (!REG_P (dest)
! 	      || !CC_REGNO_P (REGNO (dest))
! 	      || GET_CODE (src) != COMPARE)
! 	    continue;
  
! 	  /* s390_swap_cmp will try to find the conditional
! 	     jump when passing NULL_RTX as condition.  */
! 	  cond = NULL_RTX;
! 	  op0 = &XEXP (src, 0);
! 	  op1 = &XEXP (src, 1);
! 	}
!       else
! 	continue;
  
!       if (!REG_P (*op0) || !REG_P (*op1))
! 	continue;
  
!       /* Swap the COMPARE arguments and its mask if there is a
! 	 conflicting access in the previous insn.  */
!       prev_insn = PREV_INSN (insn);
        if (prev_insn != NULL_RTX && INSN_P (prev_insn)
! 	  && reg_referenced_p (*op1, PATTERN (prev_insn)))
! 	s390_swap_cmp (cond, op0, op1, insn);
! 
!       /* Check if there is a conflict with the next insn. If there
! 	 was no conflict with the previous insn, then swap the
! 	 COMPARE arguments and its mask.  If we already swapped
! 	 the operands, or if swapping them would cause a conflict
! 	 with the previous insn, issue a NOP after the COMPARE in
! 	 order to separate the two instuctions.  */
!       next_insn = NEXT_INSN (insn);
!       if (next_insn != NULL_RTX && INSN_P (next_insn)
! 	  && s390_non_addr_reg_read_p (*op1, next_insn))
  	{
! 	  if (prev_insn != NULL_RTX && INSN_P (prev_insn)
! 	      && s390_non_addr_reg_read_p (*op0, prev_insn))
! 	    {
! 	      if (REGNO (*op1) == 0)
! 		emit_insn_after (gen_nop1 (), insn);
! 	      else
! 		emit_insn_after (gen_nop (), insn);
! 	      added_NOPs = 1;
! 	    }
  	  else
! 	    s390_swap_cmp (cond, op0, op1, insn);
  	}
      }
! 
!   /* Adjust branches if we added new instructions.  */
!   if (added_NOPs)
!     shorten_branches (get_insns ());
  }
  
- 
  /* Perform machine-dependent processing.  */
  
  static void
--- 9797,9883 ----
     if that register's value is delivered via a bypass, then the
     pipeline recycles, thereby causing significant performance decline.
     This function locates such situations and exchanges the two
!    operands of the compare.  The function return true whenever it
!    added an insn.  */
! static bool
! s390_z10_optimize_cmp (rtx insn)
  {
!   rtx prev_insn, next_insn;
!   bool insn_added_p = false;
!   rtx cond, *op0, *op1;
  
!   if (GET_CODE (PATTERN (insn)) == PARALLEL)
      {
!       /* Handle compare and branch and branch on count
! 	 instructions.  */
!       rtx pattern = single_set (insn);
! 
!       if (!pattern
! 	  || SET_DEST (pattern) != pc_rtx
! 	  || GET_CODE (SET_SRC (pattern)) != IF_THEN_ELSE)
! 	return false;
  
!       cond = XEXP (SET_SRC (pattern), 0);
!       op0 = &XEXP (cond, 0);
!       op1 = &XEXP (cond, 1);
!     }
!   else if (GET_CODE (PATTERN (insn)) == SET)
!     {
!       rtx src, dest;
  
!       /* Handle normal compare instructions.  */
!       src = SET_SRC (PATTERN (insn));
!       dest = SET_DEST (PATTERN (insn));
  
!       if (!REG_P (dest)
! 	  || !CC_REGNO_P (REGNO (dest))
! 	  || GET_CODE (src) != COMPARE)
! 	return false;
  
!       /* s390_swap_cmp will try to find the conditional
! 	 jump when passing NULL_RTX as condition.  */
!       cond = NULL_RTX;
!       op0 = &XEXP (src, 0);
!       op1 = &XEXP (src, 1);
!     }
!   else
!     return false;
  
!   if (!REG_P (*op0) || !REG_P (*op1))
!     return false;
  
!   /* Swap the COMPARE arguments and its mask if there is a
!      conflicting access in the previous insn.  */
!   prev_insn = PREV_INSN (insn);
!   if (prev_insn != NULL_RTX && INSN_P (prev_insn)
!       && reg_referenced_p (*op1, PATTERN (prev_insn)))
!     s390_swap_cmp (cond, op0, op1, insn);
! 
!   /* Check if there is a conflict with the next insn. If there
!      was no conflict with the previous insn, then swap the
!      COMPARE arguments and its mask.  If we already swapped
!      the operands, or if swapping them would cause a conflict
!      with the previous insn, issue a NOP after the COMPARE in
!      order to separate the two instuctions.  */
!   next_insn = NEXT_INSN (insn);
!   if (next_insn != NULL_RTX && INSN_P (next_insn)
!       && s390_non_addr_reg_read_p (*op1, next_insn))
!     {
        if (prev_insn != NULL_RTX && INSN_P (prev_insn)
! 	  && s390_non_addr_reg_read_p (*op0, prev_insn))
  	{
! 	  if (REGNO (*op1) == 0)
! 	    emit_insn_after (gen_nop1 (), insn);
  	  else
! 	    emit_insn_after (gen_nop (), insn);
! 	  insn_added_p = true;
  	}
+       else
+ 	s390_swap_cmp (cond, op0, op1, insn);
      }
!   return insn_added_p;
  }
  
  /* Perform machine-dependent processing.  */
  
  static void
*************** s390_reorg (void)
*** 9934,9943 ****
    /* Try to optimize prologue and epilogue further.  */
    s390_optimize_prologue ();
  
!   /* Eliminate z10-specific pipeline recycles related to some compare
!      instructions.  */
    if (s390_tune == PROCESSOR_2097_Z10)
!     s390_z10_optimize_cmp ();
  }
  
  
--- 9988,10020 ----
    /* Try to optimize prologue and epilogue further.  */
    s390_optimize_prologue ();
  
!   /* Walk over the insns and do some z10 specific changes.  */
    if (s390_tune == PROCESSOR_2097_Z10)
!     {
!       rtx insn;
!       bool insn_added_p = false;
! 
!       /* The insn lengths and addresses have to be up to date for the
! 	 following manipulations.  */
!       shorten_branches (get_insns ());
! 
!       for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
! 	{
! 	  if (!INSN_P (insn) || INSN_CODE (insn) <= 0)
! 	    continue;
! 
! 	  if (JUMP_P (insn))
! 	    insn_added_p |= s390_z10_fix_long_loop_prediction (insn);
! 
! 	  if (GET_CODE (PATTERN (insn)) == PARALLEL
! 	      || GET_CODE (PATTERN (insn)) == SET)
! 	    insn_added_p |= s390_z10_optimize_cmp (insn);
! 	}
! 
!       /* Adjust branches if we added new instructions.  */
!       if (insn_added_p)
! 	shorten_branches (get_insns ());
!     }
  }
  
  
Index: gcc/config/s390/s390.md
===================================================================
*** gcc/config/s390/s390.md.orig	2009-08-17 09:29:17.000000000 +0200
--- gcc/config/s390/s390.md	2009-08-19 18:32:32.000000000 +0200
***************
*** 1046,1051 ****
--- 1046,1109 ----
                        (const_int 6) (const_int 12)))]) ; 8 byte for clr/jg
                                                         ; 10 byte for clgr/jg
  
+ ; And now the same two patterns as above but with a negated CC mask.
+ 
+ ; cij, cgij, crj, cgrj, cfi, cgfi, cr, cgr
+ ; The following instructions do a complementary access of their second
+ ; operand (z01 only): crj_c, cgrjc, cr, cgr
+ (define_insn "*icmp_and_br_signed_<mode>"
+   [(set (pc)
+ 	(if_then_else (match_operator 0 "s390_signed_integer_comparison"
+ 			[(match_operand:GPR 1 "register_operand"  "d,d")
+ 			 (match_operand:GPR 2 "nonmemory_operand" "d,C")])
+ 		      (pc)
+ 		      (label_ref (match_operand 3 "" ""))))
+    (clobber (reg:CC CC_REGNUM))]
+   "TARGET_Z10"
+ {
+   if (get_attr_length (insn) == 6)
+     return which_alternative ?
+       "c<g>ij%D0\t%1,%c2,%l3" : "c<g>rj%D0\t%1,%2,%l3";
+   else
+     return which_alternative ?
+       "c<g>fi\t%1,%c2\;jg%D0\t%l3" : "c<g>r\t%1,%2\;jg%D0\t%l3";
+ }
+   [(set_attr "op_type" "RIE")
+    (set_attr "type"    "branch")
+    (set_attr "z10prop" "z10_super_c,z10_super")
+    (set (attr "length")
+         (if_then_else (lt (abs (minus (pc) (match_dup 3))) (const_int 60000))
+                       (const_int 6) (const_int 12)))]) ; 8 byte for cr/jg
+                                                        ; 10 byte for cgr/jg
+ 
+ ; clij, clgij, clrj, clgrj, clfi, clgfi, clr, clgr
+ ; The following instructions do a complementary access of their second
+ ; operand (z10 only): clrj, clgrj, clr, clgr
+ (define_insn "*icmp_and_br_unsigned_<mode>"
+   [(set (pc)
+ 	(if_then_else (match_operator 0 "s390_unsigned_integer_comparison"
+ 			[(match_operand:GPR 1 "register_operand"  "d,d")
+ 			 (match_operand:GPR 2 "nonmemory_operand" "d,I")])
+ 		      (pc)
+ 		      (label_ref (match_operand 3 "" ""))))
+    (clobber (reg:CC CC_REGNUM))]
+   "TARGET_Z10"
+ {
+   if (get_attr_length (insn) == 6)
+     return which_alternative ?
+       "cl<g>ij%D0\t%1,%b2,%l3" : "cl<g>rj%D0\t%1,%2,%l3";
+   else
+     return which_alternative ?
+       "cl<g>fi\t%1,%b2\;jg%D0\t%l3" : "cl<g>r\t%1,%2\;jg%D0\t%l3";
+ }
+   [(set_attr "op_type" "RIE")
+    (set_attr "type"    "branch")
+    (set_attr "z10prop" "z10_super_c,z10_super")
+    (set (attr "length")
+         (if_then_else (lt (abs (minus (pc) (match_dup 3))) (const_int 60000))
+                       (const_int 6) (const_int 12)))]) ; 8 byte for clr/jg
+                                                        ; 10 byte for clgr/jg
+ 
  ;;
  ;;- Move instructions.
  ;;


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]