This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] Hoist adjacent pointer loads


On Mon, 2012-05-21 at 14:17 +0200, Richard Guenther wrote:
> On Thu, May 3, 2012 at 4:33 PM, William J. Schmidt
> <wschmidt@linux.vnet.ibm.com> wrote:
> > This patch was posted for comment back in February during stage 4.  It
> > addresses a performance issue noted in the EEMBC routelookup benchmark
> > on a common idiom:
> >
> >  if (...)
> >    x = y->left;
> >  else
> >    x = y->right;
> >
> > If the two loads can be hoisted out of the if/else, the if/else can be
> > replaced by a conditional move instruction on architectures that support
> > one.  Because this speculates one of the loads, the patch constrains the
> > optimization to avoid introducing page faults.
> >
> > Bootstrapped and regression tested on powerpc-unknown-linux-gnu with no
> > new failures.  The patch provides significant improvement to the
> > routelookup benchmark, and is neutral on SPEC cpu2000/cpu2006.
> >
> > One question is what optimization level should be required for this.
> > Because of the speculation, -O3 might be in order.  I don't believe
> > -Ofast is required as there is no potential correctness issue involved.
> > Right now the patch doesn't check the optimization level (like the rest
> > of the phi-opt transforms), which is likely a poor choice.
> >
> > Ok for trunk?
> >
> > Thanks,
> > Bill
> >
> >
> > 2012-05-03  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
> >
> >        * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
> >        declaration.
> >        (hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
> >        (tree_ssa_phiopt): Call gate_hoist_loads.
> >        (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
> >        (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
> >        hoist_adjacent_loads.
> >        (local_reg_dependence): New function.
> >        (local_mem_dependence): Likewise.
> >        (hoist_adjacent_loads): Likewise.
> >        (gate_hoist_loads): Likewise.
> >        * common.opt (fhoist-adjacent-loads): New switch.
> >        * Makefile.in (tree-ssa-phiopt.o): Added dependencies.
> >        * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.
> >
> >
> > Index: gcc/tree-ssa-phiopt.c
> > ===================================================================
> > --- gcc/tree-ssa-phiopt.c       (revision 187057)
> > +++ gcc/tree-ssa-phiopt.c       (working copy)
> > @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "cfgloop.h"
> >  #include "tree-data-ref.h"
> >  #include "tree-pretty-print.h"
> > +#include "gimple-pretty-print.h"
> > +#include "insn-config.h"
> > +#include "expr.h"
> > +#include "optabs.h"
> >
> > +#ifndef HAVE_conditional_move
> > +#define HAVE_conditional_move (0)
> > +#endif
> > +
> >  static unsigned int tree_ssa_phiopt (void);
> > -static unsigned int tree_ssa_phiopt_worker (bool);
> > +static unsigned int tree_ssa_phiopt_worker (bool, bool);
> >  static bool conditional_replacement (basic_block, basic_block,
> >                                     edge, edge, gimple, tree, tree);
> >  static int value_replacement (basic_block, basic_block,
> > @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
> >  static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
> >  static struct pointer_set_t * get_non_trapping (void);
> >  static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree);
> > +static void hoist_adjacent_loads (basic_block, basic_block,
> > +                                 basic_block, basic_block);
> > +static bool gate_hoist_loads (void);
> >
> >  /* This pass tries to replaces an if-then-else block with an
> >    assignment.  We have four kinds of transformations.  Some of these
> > @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
> >      bb2:
> >        x = PHI <x' (bb0), ...>;
> >
> > -   A similar transformation is done for MAX_EXPR.  */
> > +   A similar transformation is done for MAX_EXPR.
> >
> > +
> > +   This pass also performs a fifth transformation of a slightly different
> > +   flavor.
> > +
> > +   Adjacent Load Hoisting
> > +   ----------------------
> > +
> > +   This transformation replaces
> > +
> > +     bb0:
> > +       if (...) goto bb2; else goto bb1;
> > +     bb1:
> > +       x1 = (<expr>).field1;
> > +       goto bb3;
> > +     bb2:
> > +       x2 = (<expr>).field2;
> > +     bb3:
> > +       # x = PHI <x1, x2>;
> > +
> > +   with
> > +
> > +     bb0:
> > +       x1 = (<expr>).field1;
> > +       x2 = (<expr>).field2;
> > +       if (...) goto bb2; else goto bb1;
> > +     bb1:
> > +       goto bb3;
> > +     bb2:
> > +     bb3:
> > +       # x = PHI <x1, x2>;
> > +
> > +   The purpose of this transformation is to enable generation of conditional
> > +   move instructions such as Intel CMOVE or PowerPC ISEL.  Because one of
> > +   the loads is speculative, the transformation is restricted to very
> > +   specific cases to avoid introducing a page fault.  We are looking for
> > +   the common idiom:
> > +
> > +     if (...)
> > +       x = y->left;
> > +     else
> > +       x = y->right;
> > +
> > +   where left and right are typically adjacent pointers in a tree structure.  */
> > +
> >  static unsigned int
> >  tree_ssa_phiopt (void)
> >  {
> > -  return tree_ssa_phiopt_worker (false);
> > +  return tree_ssa_phiopt_worker (false, gate_hoist_loads ());
> >  }
> >
> >  /* This pass tries to transform conditional stores into unconditional
> > @@ -190,7 +245,7 @@ tree_ssa_phiopt (void)
> >  static unsigned int
> >  tree_ssa_cs_elim (void)
> >  {
> > -  return tree_ssa_phiopt_worker (true);
> > +  return tree_ssa_phiopt_worker (true, false);
> >  }
> >
> >  /* Return the singleton PHI in the SEQ of PHIs for edges E0 and E1. */
> > @@ -227,9 +282,11 @@ static tree condstoretemp;
> >  /* The core routine of conditional store replacement and normal
> >    phi optimizations.  Both share much of the infrastructure in how
> >    to match applicable basic block patterns.  DO_STORE_ELIM is true
> > -   when we want to do conditional store replacement, false otherwise.  */
> > +   when we want to do conditional store replacement, false otherwise.
> > +   DO_HOIST_LOADS is true when we want to hoist adjacent loads out
> > +   of diamond control flow patterns, false otherwise.  */
> >  static unsigned int
> > -tree_ssa_phiopt_worker (bool do_store_elim)
> > +tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
> >  {
> >   basic_block bb;
> >   basic_block *bb_order;
> > @@ -312,6 +369,20 @@ static unsigned int
> >            cfgchanged = true;
> >          continue;
> >        }
> > +      else if (do_hoist_loads
> > +                && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest)
> > +       {
> > +         basic_block bb3 = EDGE_SUCC (bb1, 0)->dest;
> > +
> > +         if (single_succ_p (bb1)
> > +             && single_succ_p (bb2)
> > +             && single_pred_p (bb1)
> > +             && single_pred_p (bb2)
> 
> and same pred?

bb1 and bb2 are known to be successors of bb, so this is handled.

> 
> > +             && EDGE_COUNT (bb->succs) == 2
> > +             && EDGE_COUNT (bb3->preds) == 2)
> > +           hoist_adjacent_loads (bb, bb1, bb2, bb3);
> > +         continue;
> > +       }
> >       else
> >        continue;
> >
> > @@ -1707,6 +1778,220 @@ cond_if_else_store_replacement (basic_block then_b
> >   return ok;
> >  }
> >
> > +/* Return TRUE iff EXPR contains any SSA names that are defined in BB.  */
> > +
> > +static bool
> > +local_reg_dependence (tree expr, basic_block bb)
> > +{
> > +  int i;
> > +
> > +  if (TREE_CODE (expr) == SSA_NAME)
> > +    {
> > +      if (SSA_NAME_IS_DEFAULT_DEF (expr))
> > +       return false;
> > +      else
> > +       return (gimple_bb (SSA_NAME_DEF_STMT (expr)) == bb);
> > +    }
> > +
> > +  for (i = 0; i < TREE_OPERAND_LENGTH (expr); i++)
> > +    if (TREE_OPERAND (expr, i)
> > +       && local_reg_dependence (TREE_OPERAND (expr, i), bb))
> > +      return true;
> 
> This looks odd - what are you trying to catch here?

My initial thought was, we can't hoist an expr relying on an SSA name
defined in the block it's being hoisted from.  Given all the other
requirements on the refs, though, I suppose that's impossible.  (If the
refs are identical except for the FIELD_DECL, they can't be locally
dependent on any SSA names.)  Should be able to remove this.  I was
adding code for the memory dependence and got carried away.

> 
> > +  return false;
> > +}
> > +
> > +/* Return TRUE if STMT has a VUSE whose corresponding VDEF is in BB.  */
> > +
> > +static bool
> > +local_mem_dependence (gimple stmt, basic_block bb)
> > +{
> > +  tree vuse = gimple_vuse (stmt);
> > +  gimple def;
> > +
> > +  if (!vuse)
> > +    return false;
> > +
> > +  def = SSA_NAME_DEF_STMT (vuse);
> > +  return (def && gimple_bb (def) == bb);
> > +}
> > +
> > +/* Given a "diamond" control-flow pattern where BB0 tests a condition,
> > +   BB1 and BB2 are "then" and "else" blocks dependent on this test,
> > +   and BB3 rejoins control flow following BB1 and BB2, look for
> > +   opportunities to hoist loads as follows.  If BB3 contains a PHI of
> > +   two loads, one each occurring in BB1 and BB2, and the loads are
> > +   provably of adjacent fields in the same structure, then move both
> > +   loads into BB0.  Of course this can only be done if there are no
> > +   dependencies preventing such motion.
> > +
> > +   One of the hoisted loads will always be speculative, so the
> > +   transformation is currently conservative:
> > +
> > +    - The fields must be strictly adjacent.
> > +    - Hoisting is only done on aligned pointer fields.
> > +    - The two fields must occupy a single memory block that is
> > +      guaranteed to not cross a page boundary.
> > +
> > +    The last is difficult to prove, as such memory blocks should be
> > +    aligned on the minimum of the stack alignment boundary and the
> > +    alignment guaranteed by heap allocation interfaces.  Thus we rely
> > +    on a parameter for the alignment value.
> > +
> > +    Provided a good value is used for the last case, the first two
> > +    restrictions can probably be relaxed.  */
> > +
> > +static void
> > +hoist_adjacent_loads (basic_block bb0, basic_block bb1,
> > +                     basic_block bb2, basic_block bb3)
> > +{
> > +  int param_align = PARAM_VALUE (PARAM_MIN_CMOVE_STRUCT_ALIGN);
> > +  gimple_stmt_iterator gsi;
> > +
> > +  /* Walk the phis in bb3 looking for an opportunity.  We are looking
> > +     for phis of two SSA names, one each of which is defined in bb1 and
> > +     bb2.  */
> > +  for (gsi = gsi_start_phis (bb3); !gsi_end_p (gsi); gsi_next (&gsi))
> > +    {
> > +      gimple phi_stmt = gsi_stmt (gsi);
> > +      gimple def1, def2;
> > +      tree arg1, arg2, ref1, ref2, field1, field2;
> > +      tree tree_offset1, tree_offset2, tree_size1, tree_size2;
> > +      int offset1, offset2, size1, size2, block_offset;
> > +      unsigned align1, align2;
> > +      gimple_stmt_iterator gsi2;
> > +
> > +      if (gimple_phi_num_args (phi_stmt) != 2)
> > +       continue;
> > +
> > +      arg1 = gimple_phi_arg_def (phi_stmt, 0);
> > +      arg2 = gimple_phi_arg_def (phi_stmt, 1);
> > +
> > +      if (TREE_CODE (arg1) != SSA_NAME
> > +         || TREE_CODE (arg2) != SSA_NAME
> > +         || SSA_NAME_IS_DEFAULT_DEF (arg1)
> > +         || SSA_NAME_IS_DEFAULT_DEF (arg2)
> > +         /* FORNOW: Only do this optimization for pointer types.  */
> > +         || !POINTER_TYPE_P (TREE_TYPE (arg1)))
> 
> I'd like to see it enabled generally, not only for pointer types.

OK.  I'll revise the patch and check some performance numbers.  Just
trying to be conservative given that I can't test all the various kinds
of hardware out there...

> 
> > +       continue;
> > +
> > +      def1 = SSA_NAME_DEF_STMT (arg1);
> > +      def2 = SSA_NAME_DEF_STMT (arg2);
> > +
> > +      if ((gimple_bb (def1) != bb1 || gimple_bb (def2) != bb2)
> > +         && (gimple_bb (def2) != bb1 || gimple_bb (def1) != bb2))
> > +       continue;
> > +
> > +      /* Unless -fhoist-adjacent-loads was specified, check the mode
> > +        of the arguments to be sure a conditional move can be generated
> > +        for it.  */
> > +      if (flag_hoist_adjacent_loads != 1
> > +         && !optab_handler (cmov_optab, TYPE_MODE (TREE_TYPE (arg1))))
> > +       continue;
> > +
> > +      /* Both statements must be assignments whose RHS is a COMPONENT_REF.  */
> > +      if (!gimple_assign_single_p (def1)
> > +         || !gimple_assign_single_p (def2))
> > +       continue;
> > +
> > +      ref1 = gimple_assign_rhs1 (def1);
> > +      ref2 = gimple_assign_rhs1 (def2);
> > +
> > +      if (TREE_CODE (ref1) != COMPONENT_REF
> > +         || TREE_CODE (ref2) != COMPONENT_REF)
> > +       continue;
> > +
> > +      /* The zeroth operand of the two component references must be
> > +        identical.  It is not sufficient to compare get_base_address of
> > +        the two references, because this could allow for different
> > +        elements of the same array in the two trees.  It is not safe to
> > +        assume that the existence of one array element implies the
> > +        existence of a different one.  */
> > +      if (!operand_equal_p (TREE_OPERAND (ref1, 0), TREE_OPERAND (ref2, 0), 0))
> > +       continue;
> > +
> > +      field1 = TREE_OPERAND (ref1, 1);
> > +      field2 = TREE_OPERAND (ref2, 1);
> > +
> > +      tree_offset1 = DECL_FIELD_BIT_OFFSET (field1);
> > +      tree_offset2 = DECL_FIELD_BIT_OFFSET (field2);
> > +      tree_size1 = DECL_SIZE_UNIT (field1);
> > +      tree_size2 = DECL_SIZE_UNIT (field2);
> > +
> > +      if (TREE_CODE (tree_offset1) != INTEGER_CST
> > +         || TREE_CODE (tree_offset2) != INTEGER_CST
> 
> DECL_FIELD_BIT_OFFSETs are always INTEGER_CSTs.  You fail to
> verify that DECL_FIELD_OFFSET of the two fields matches.

OK, will change.

> 
> > +         || TREE_CODE (tree_size1) != INTEGER_CST
> > +         || TREE_CODE (tree_size2) != INTEGER_CST)
> > +       continue;
> > +
> > +      /* FORNOW: The two field references must be byte-aligned, naturally
> > +        aligned within the record, and adjacent.  */
> > +      offset1 = TREE_INT_CST_LOW (tree_offset1);
> > +      offset2 = TREE_INT_CST_LOW (tree_offset2);
> > +      size1 = TREE_INT_CST_LOW (tree_size1);
> > +      size2 = TREE_INT_CST_LOW (tree_size2);
> 
> You should have checked host_integerp above if you just handle TREE_INT_CST_LOW.

Whoops.  Yes.

> 
> > +      align1 = TYPE_ALIGN (TREE_TYPE (arg1));
> > +      align2 = TYPE_ALIGN (TREE_TYPE (arg2));
> 
> You should be using DECL_ALIGN of the field-decls here.

OK.

> 
> > +      if (offset1 % BITS_PER_UNIT != 0
> > +         || offset2 % BITS_PER_UNIT != 0
> > +         || offset1 % align1 != 0
> > +         || offset2 % align2 != 0)
> > +       continue;
> > +
> > +      offset1 /= BITS_PER_UNIT;
> > +      offset2 /= BITS_PER_UNIT;
> > +
> > +      if (offset1 + size1 != offset2
> > +         && offset2 + size2 != offset1)
> > +       continue;
> 
> I think it's more natural to re-write all of the above to verify that
> field1 follows
> field2 by means of DECL_CHAIN (ignoring intermediate non-FIELD_DECLS)
> (or the other way around) and to only verify alignment of field1 and the extent
> of both accesses.  Note that field alignment can be a tricky business,
> but DECL_ALIGN of the first FIELD_DECL should be a reasonable hint
> (larger alignment cannot be easily detected because all information is relative
> to a possibly parent struct FIELD_DECL you may not even see).

OK, sounds good.  I'll rewrite it in this fashion.

> 
> > +      /* The two field references must fit within a block of memory that
> > +        is guaranteed to be on the same page.  */
> > +      block_offset = MIN (offset1, offset2) % param_align;
> > +
> > +      if (block_offset + size1 + size2 > param_align)
> > +       continue;
> > +
> > +      /* The two expressions cannot be dependent upon SSA names
> > +        or vdefs defined in bb1/bb2.  */
> > +      if (local_reg_dependence (TREE_OPERAND (ref1, 0), bb1)
> > +         || local_reg_dependence (TREE_OPERAND (ref2, 0), bb2)
> > +         || local_mem_dependence (def1, bb1)
> > +         || local_mem_dependence (def2, bb2))
> > +       continue;
> 
> You should be using FOR_EACH_SSA_USE_OPERAND here on the two
> definition statements (and accordingly adjust those helpers, or even better,
> trivially inline them).  I think you want a can_hoist_to_bb_end (stmt, bb)
> helper, which is really what you check here.  Maybe that already exists
> somewhere even.

If I remove local_reg_dependence and just leave the local_mem_dependence
checks in place, I believe that should be sufficient, right?  Given the
constraints, checking virtual dependences ought to be enough.

> 
> > +      /* The conditions are satisfied; hoist the loads from bb1 and bb2 into
> > +        bb0.  */
> > +      gsi2 = gsi_for_stmt (def1);
> > +      gsi_move_to_bb_end (&gsi2, bb0);
> > +      gsi2 = gsi_for_stmt (def2);
> > +      gsi_move_to_bb_end (&gsi2, bb0);
> 
> Should you be paying attention to the order of the loads here, thus load the
> first memory part first?

Perhaps so, in case of a cache miss.  I know of some existing hardware
that will begin a cache fill at the point of the miss and wrap it
around, so such hardware would benefit.

> 
> Are we sure we always transform the diamond to a conditional move after
> this transform, in the same pass?  I am concerned on how this conflicts
> with the patch floating around that enables sinking of loads - what do we do
> to avoid undoing this transform?

Hm, I was somewhat relying on the existing rule of no load-sinking.
However, I think we're still OK.  There is a late pass of phiopt long
after the sink pass which will re-do the optimization even if sinking
undoes it.  Right now we rely on RTL if-conversion to generate the
conditional move.  I'm open to suggestions if that doesn't seem like
enough of a guarantee.

Thanks,
Bill

> 
> Thanks,
> Richard.
> 
> > +      if (dump_file && (dump_flags & TDF_DETAILS))
> > +       {
> > +         fprintf (dump_file,
> > +                  "\nHoisting adjacent loads from %d and %d into %d: \n",
> > +                  bb1->index, bb2->index, bb0->index);
> > +         print_gimple_stmt (dump_file, def1, 0, TDF_VOPS|TDF_MEMSYMS);
> > +         print_gimple_stmt (dump_file, def2, 0, TDF_VOPS|TDF_MEMSYMS);
> > +       }
> > +    }
> > +}
> > +
> > +/* Determine whether we should attempt to hoist adjacent loads out of
> > +   diamond patterns in pass_phiopt.  Always hoist loads if
> > +   -fhoist-adjacent-loads is specified, or if the target machine has
> > +   a conditional move instruction and -fno-hoist-adjacent-loads is
> > +   not specified.  */
> > +
> > +static bool
> > +gate_hoist_loads (void)
> > +{
> > +  return (flag_hoist_adjacent_loads == 1
> > +         || (HAVE_conditional_move && flag_hoist_adjacent_loads != 0));
> > +}
> > +
> >  /* Always do these optimizations if we have SSA
> >    trees to work on.  */
> >  static bool
> > Index: gcc/common.opt
> > ===================================================================
> > --- gcc/common.opt      (revision 187057)
> > +++ gcc/common.opt      (working copy)
> > @@ -1186,6 +1186,11 @@ fgraphite-identity
> >  Common Report Var(flag_graphite_identity) Optimization
> >  Enable Graphite Identity transformation
> >
> > +fhoist-adjacent-loads
> > +Common Report Var(flag_hoist_adjacent_loads) Init(-1) Optimization
> > +Enable hoisting adjacent loads to encourage generating conditional move
> > +instructions
> > +
> >  floop-parallelize-all
> >  Common Report Var(flag_loop_parallelize_all) Optimization
> >  Mark all loops as parallel
> > Index: gcc/Makefile.in
> > ===================================================================
> > --- gcc/Makefile.in     (revision 187057)
> > +++ gcc/Makefile.in     (working copy)
> > @@ -2351,7 +2351,8 @@ tree-ssa-phiopt.o : tree-ssa-phiopt.c $(CONFIG_H)
> >    $(TM_H) $(GGC_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
> >    $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) langhooks.h $(FLAGS_H) \
> >    $(DIAGNOSTIC_H) $(TIMEVAR_H) pointer-set.h domwalk.h $(CFGLOOP_H) \
> > -   $(TREE_DATA_REF_H) tree-pretty-print.h
> > +   $(TREE_DATA_REF_H) tree-pretty-print.h gimple-pretty-print.h \
> > +   insn-config.h $(EXPR_H) $(OPTABS_H)
> >  tree-nrv.o : tree-nrv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
> >    $(TM_H) $(TREE_H) $(FUNCTION_H) $(BASIC_BLOCK_H) $(FLAGS_H) \
> >    $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TIMEVAR_H) $(TREE_DUMP_H) $(TREE_PASS_H) \
> > Index: gcc/params.def
> > ===================================================================
> > --- gcc/params.def      (revision 187057)
> > +++ gcc/params.def      (working copy)
> > @@ -979,6 +979,13 @@ DEFPARAM (PARAM_MAX_TRACKED_STRLENS,
> >          "track string lengths",
> >          1000, 0, 0)
> >
> > +/* For adjacent load hoisting transformation in tree-ssa-phiopt.c.  */
> > +DEFPARAM (PARAM_MIN_CMOVE_STRUCT_ALIGN,
> > +         "min-cmove-struct-align",
> > +         "Minimum byte alignment to assume for structures in the stack "
> > +         "or heap when considering load hoisting for conditional moves",
> > +         16, 8, 256)
> > +
> >  /*
> >  Local variables:
> >  mode:c
> >
> >
> 


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]