This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: optimizing predictable branches on x86


> Hi,
> > Core2 follows a similar pattern, although it's not seeing any
> > slowdown in the "no deps, predictable, jmp" case like K8 does.
> > 
> > Any comments? (please cc me) Should gcc be using conditional jumps
> > more often eg. in the case of __builtin_expect())?
> 
> The problem is that in general GCC's branch prediction algorithms are
> very poor on predicting predictability of branch: they are pretty good
> on guessing outcome but that is.
> 
> Only cases we do so quite reliably IMO are:
>   1) loop branches that are not interesting for cmov conversion
>   2) branches leading to noreturn calls, also not interesting
>   3) builtin_expect mentioned.
>   4) when profile feedback is around to some degree (ie we know when the
>   branch is very likely or very unlikely. We don't simulate what
>   hardware will do on it).
> 
> I guess we can implement the machinery for 3 and 4 (in fact once
> I played adding EDGE_PREDICTABLE_P predicate that basically tested if
> the esimated probability of branch is <5% or >95%) but never got really
> noticeable improvements out of it and gave up.

Just for those who might be interested, I found the old patch.
I will try to find time to update it to mainline, but if someone beats
me, I definitly won't complain.

Index: expr.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/expr.h,v
retrieving revision 1.171
diff -c -3 -p -r1.171 expr.h
*** expr.h	8 Sep 2004 18:44:56 -0000	1.171
--- expr.h	25 Sep 2004 13:22:22 -0000
*************** Software Foundation, 59 Temple Place - S
*** 38,43 ****
--- 38,49 ----
  #ifndef BRANCH_COST
  #define BRANCH_COST 1
  #endif
+ #ifndef PREDICTABLE_BRANCH_COST
+ #define PREDICTABLE_BRANCH_COST BRANCH_COST
+ #endif
+ #ifndef COLD_BRANCH_COST
+ #define COLD_BRANCH_COST BRANCH_COST
+ #endif
  
  /* This is the 4th arg to `expand_expr'.
     EXPAND_STACK_PARM means we are possibly expanding a call param onto
Index: ifcvt.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/ifcvt.c,v
retrieving revision 1.165
diff -c -3 -p -r1.165 ifcvt.c
*** ifcvt.c	17 Sep 2004 05:32:36 -0000	1.165
--- ifcvt.c	25 Sep 2004 13:22:22 -0000
*************** cond_exec_process_if_block (ce_if_block_
*** 608,613 ****
--- 608,614 ----
  struct noce_if_info
  {
    basic_block test_bb;
+   int branch_cost;
    rtx insn_a, insn_b;
    rtx x, a, b;
    rtx jump, cond, cond_earliest;
*************** noce_try_store_flag_constants (struct no
*** 869,888 ****
  	normalize = 0;
        else if (ifalse == 0 && exact_log2 (itrue) >= 0
  	       && (STORE_FLAG_VALUE == 1
! 		   || BRANCH_COST >= 2))
  	normalize = 1;
        else if (itrue == 0 && exact_log2 (ifalse) >= 0 && can_reverse
! 	       && (STORE_FLAG_VALUE == 1 || BRANCH_COST >= 2))
  	normalize = 1, reversep = 1;
        else if (itrue == -1
  	       && (STORE_FLAG_VALUE == -1
! 		   || BRANCH_COST >= 2))
  	normalize = -1;
        else if (ifalse == -1 && can_reverse
! 	       && (STORE_FLAG_VALUE == -1 || BRANCH_COST >= 2))
  	normalize = -1, reversep = 1;
!       else if ((BRANCH_COST >= 2 && STORE_FLAG_VALUE == -1)
! 	       || BRANCH_COST >= 3)
  	normalize = -1;
        else
  	return FALSE;
--- 870,889 ----
  	normalize = 0;
        else if (ifalse == 0 && exact_log2 (itrue) >= 0
  	       && (STORE_FLAG_VALUE == 1
! 		   || if_info->branch_cost >= 2))
  	normalize = 1;
        else if (itrue == 0 && exact_log2 (ifalse) >= 0 && can_reverse
! 	       && (STORE_FLAG_VALUE == 1 || if_info->branch_cost >= 2))
  	normalize = 1, reversep = 1;
        else if (itrue == -1
  	       && (STORE_FLAG_VALUE == -1
! 		   || if_info->branch_cost >= 2))
  	normalize = -1;
        else if (ifalse == -1 && can_reverse
! 	       && (STORE_FLAG_VALUE == -1 || if_info->branch_cost >= 2))
  	normalize = -1, reversep = 1;
!       else if ((if_info->branch_cost >= 2 && STORE_FLAG_VALUE == -1)
! 	       || if_info->branch_cost >= 3)
  	normalize = -1;
        else
  	return FALSE;
*************** noce_try_addcc (struct noce_if_info *if_
*** 1014,1020 ****
  
        /* If that fails, construct conditional increment or decrement using
  	 setcc.  */
!       if (BRANCH_COST >= 2
  	  && (XEXP (if_info->a, 1) == const1_rtx
  	      || XEXP (if_info->a, 1) == constm1_rtx))
          {
--- 1015,1021 ----
  
        /* If that fails, construct conditional increment or decrement using
  	 setcc.  */
!       if (if_info->branch_cost >= 2
  	  && (XEXP (if_info->a, 1) == const1_rtx
  	      || XEXP (if_info->a, 1) == constm1_rtx))
          {
*************** noce_try_store_flag_mask (struct noce_if
*** 1066,1072 ****
  
    reversep = 0;
    if (! no_new_pseudos
!       && (BRANCH_COST >= 2
  	  || STORE_FLAG_VALUE == -1)
        && ((if_info->a == const0_rtx
  	   && rtx_equal_p (if_info->b, if_info->x))
--- 1067,1073 ----
  
    reversep = 0;
    if (! no_new_pseudos
!       && (if_info->branch_cost >= 2
  	  || STORE_FLAG_VALUE == -1)
        && ((if_info->a == const0_rtx
  	   && rtx_equal_p (if_info->b, if_info->x))
*************** noce_try_cmove_arith (struct noce_if_inf
*** 1223,1229 ****
       already checked for no side effects.  */
    if (! no_new_pseudos && cse_not_expected
        && MEM_P (a) && MEM_P (b)
!       && BRANCH_COST >= 5)
      {
        a = XEXP (a, 0);
        b = XEXP (b, 0);
--- 1224,1230 ----
       already checked for no side effects.  */
    if (! no_new_pseudos && cse_not_expected
        && MEM_P (a) && MEM_P (b)
!       && if_info->branch_cost >= 5)
      {
        a = XEXP (a, 0);
        b = XEXP (b, 0);
*************** noce_try_cmove_arith (struct noce_if_inf
*** 1253,1259 ****
    if (insn_a)
      {
        insn_cost = insn_rtx_cost (PATTERN (insn_a));
!       if (insn_cost == 0 || insn_cost > COSTS_N_INSNS (BRANCH_COST))
  	return FALSE;
      }
    else
--- 1254,1260 ----
    if (insn_a)
      {
        insn_cost = insn_rtx_cost (PATTERN (insn_a));
!       if (insn_cost == 0 || insn_cost > COSTS_N_INSNS (if_info->branch_cost))
  	return FALSE;
      }
    else
*************** noce_try_cmove_arith (struct noce_if_inf
*** 1263,1269 ****
  
    if (insn_b) {
      insn_cost += insn_rtx_cost (PATTERN (insn_b));
!     if (insn_cost == 0 || insn_cost > COSTS_N_INSNS (BRANCH_COST))
        return FALSE;
    }
  
--- 1264,1270 ----
  
    if (insn_b) {
      insn_cost += insn_rtx_cost (PATTERN (insn_b));
!     if (insn_cost == 0 || insn_cost > COSTS_N_INSNS (if_info->branch_cost))
        return FALSE;
    }
  
*************** noce_process_if_block (struct ce_if_bloc
*** 2010,2015 ****
--- 2011,2024 ----
    if_info.a = a;
    if_info.b = b;
    if_info.b_unconditional = else_bb == 0;
+   if (!maybe_hot_bb (test_bb))
+     if_info.branch_cost = COLD_BRANCH_COST
+   if (profile_status != PROFILE_ABSENT
+       && (test_bb->succ < 0.03 * REG_BR_PROB_BASE
+ 	  || test_bb->succ > 0.97 * REG_BR_PROB_BASE))
+     if_info.branch_cost = PREDICTABLE_BRANCH_COST;
+   else
+     if_info.branch_cost = BRANCH_COST;
  
    /* Try optimizations in some approximation of a useful order.  */
    /* ??? Should first look to see if X is live incoming at all.  If it
Index: config/i386/i386.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.c,v
retrieving revision 1.730
diff -c -3 -p -r1.730 i386.c
*** config/i386/i386.c	23 Sep 2004 14:34:24 -0000	1.730
--- config/i386/i386.c	25 Sep 2004 13:22:24 -0000
*************** struct processor_costs size_cost = {	/* 
*** 98,103 ****
--- 98,104 ----
    0,					/* size of prefetch block */
    0,					/* number of parallel prefetches */
    1,					/* Branch cost */
+   1,					/* Predictable branch cost */
    2,					/* cost of FADD and FSUB insns.  */
    2,					/* cost of FMUL instruction.  */
    2,					/* cost of FDIV instruction.  */
*************** struct processor_costs i386_cost = {	/* 
*** 143,148 ****
--- 144,150 ----
    0,					/* size of prefetch block */
    0,					/* number of parallel prefetches */
    1,					/* Branch cost */
+   1,					/* Predictable branch cost */
    23,					/* cost of FADD and FSUB insns.  */
    27,					/* cost of FMUL instruction.  */
    88,					/* cost of FDIV instruction.  */
*************** struct processor_costs i486_cost = {	/* 
*** 187,192 ****
--- 189,195 ----
    0,					/* size of prefetch block */
    0,					/* number of parallel prefetches */
    1,					/* Branch cost */
+   1,					/* Predictable branch cost */
    8,					/* cost of FADD and FSUB insns.  */
    16,					/* cost of FMUL instruction.  */
    73,					/* cost of FDIV instruction.  */
*************** struct processor_costs pentium_cost = {
*** 231,236 ****
--- 234,240 ----
    0,					/* size of prefetch block */
    0,					/* number of parallel prefetches */
    2,					/* Branch cost */
+   1,					/* Predictable branch cost */
    3,					/* cost of FADD and FSUB insns.  */
    3,					/* cost of FMUL instruction.  */
    39,					/* cost of FDIV instruction.  */
*************** struct processor_costs pentiumpro_cost =
*** 275,280 ****
--- 279,285 ----
    32,					/* size of prefetch block */
    6,					/* number of parallel prefetches */
    2,					/* Branch cost */
+   1,					/* Predictable branch cost */
    3,					/* cost of FADD and FSUB insns.  */
    5,					/* cost of FMUL instruction.  */
    56,					/* cost of FDIV instruction.  */
*************** struct processor_costs k6_cost = {
*** 319,324 ****
--- 324,330 ----
    32,					/* size of prefetch block */
    1,					/* number of parallel prefetches */
    1,					/* Branch cost */
+   1,					/* Predictable branch cost */
    2,					/* cost of FADD and FSUB insns.  */
    2,					/* cost of FMUL instruction.  */
    56,					/* cost of FDIV instruction.  */
*************** struct processor_costs athlon_cost = {
*** 362,368 ****
    5,					/* MMX or SSE register to integer */
    64,					/* size of prefetch block */
    6,					/* number of parallel prefetches */
!   2,					/* Branch cost */
    4,					/* cost of FADD and FSUB insns.  */
    4,					/* cost of FMUL instruction.  */
    24,					/* cost of FDIV instruction.  */
--- 368,375 ----
    5,					/* MMX or SSE register to integer */
    64,					/* size of prefetch block */
    6,					/* number of parallel prefetches */
!   3,					/* Branch cost */
!   1,					/* Predictable branch cost */
    4,					/* cost of FADD and FSUB insns.  */
    4,					/* cost of FMUL instruction.  */
    24,					/* cost of FDIV instruction.  */
*************** struct processor_costs k8_cost = {
*** 406,412 ****
    5,					/* MMX or SSE register to integer */
    64,					/* size of prefetch block */
    6,					/* number of parallel prefetches */
!   2,					/* Branch cost */
    4,					/* cost of FADD and FSUB insns.  */
    4,					/* cost of FMUL instruction.  */
    19,					/* cost of FDIV instruction.  */
--- 413,420 ----
    5,					/* MMX or SSE register to integer */
    64,					/* size of prefetch block */
    6,					/* number of parallel prefetches */
!   3,					/* Branch cost */
!   1,					/* Predictable branch cost */
    4,					/* cost of FADD and FSUB insns.  */
    4,					/* cost of FMUL instruction.  */
    19,					/* cost of FDIV instruction.  */
*************** struct processor_costs pentium4_cost = {
*** 450,456 ****
    10,					/* MMX or SSE register to integer */
    64,					/* size of prefetch block */
    6,					/* number of parallel prefetches */
!   2,					/* Branch cost */
    5,					/* cost of FADD and FSUB insns.  */
    7,					/* cost of FMUL instruction.  */
    43,					/* cost of FDIV instruction.  */
--- 458,465 ----
    10,					/* MMX or SSE register to integer */
    64,					/* size of prefetch block */
    6,					/* number of parallel prefetches */
!   3,					/* Branch cost */
!   1,					/* Predictable branch cost */
    5,					/* cost of FADD and FSUB insns.  */
    7,					/* cost of FMUL instruction.  */
    43,					/* cost of FDIV instruction.  */
*************** struct processor_costs nocona_cost = {
*** 494,500 ****
    8,					/* MMX or SSE register to integer */
    128,					/* size of prefetch block */
    8,					/* number of parallel prefetches */
!   1,					/* Branch cost */
    6,					/* cost of FADD and FSUB insns.  */
    8,					/* cost of FMUL instruction.  */
    40,					/* cost of FDIV instruction.  */
--- 503,510 ----
    8,					/* MMX or SSE register to integer */
    128,					/* size of prefetch block */
    8,					/* number of parallel prefetches */
!   2,					/* Branch cost */
!   1,					/* Predictable branch cost */
    6,					/* cost of FADD and FSUB insns.  */
    8,					/* cost of FMUL instruction.  */
    40,					/* cost of FDIV instruction.  */
Index: config/i386/i386.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.h,v
retrieving revision 1.402
diff -c -3 -p -r1.402 i386.h
*** config/i386/i386.h	12 Sep 2004 23:31:31 -0000	1.402
--- config/i386/i386.h	25 Sep 2004 13:22:24 -0000
*************** struct processor_costs {
*** 78,83 ****
--- 78,84 ----
    const int simultaneous_prefetches; /* number of parallel prefetch
  				   operations.  */
    const int branch_cost;	/* Default value for BRANCH_COST.  */
+   const int predictable_branch_cost;	/* Default value for PREDICTABLE_BRANCH_COST.  */
    const int fadd;		/* cost of FADD and FSUB instructions.  */
    const int fmul;		/* cost of FMUL instruction.  */
    const int fdiv;		/* cost of FDIV instruction.  */
*************** do {							\
*** 2594,2599 ****
--- 2595,2602 ----
     is the default; other values are interpreted relative to that.  */
  
  #define BRANCH_COST ix86_branch_cost
+ #define COLD_BRANCH_COST 1
+ #define PREDICTABLE_BRANCH_COST ix86_predictable_branch_cost
  
  /* Define this macro as a C expression which is nonzero if accessing
     less than a word of memory (i.e. a `char' or a `short') is no
*************** extern unsigned int ix86_preferred_stack
*** 2910,2915 ****
--- 2913,2919 ----
  extern const char *ix86_preferred_stack_boundary_string;
  
  extern int ix86_branch_cost;
+ extern int ix86_predictable_branch_cost;
  extern const char *ix86_branch_cost_string;
  
  extern const char *ix86_debug_arg_string;


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]