This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] Hoist adjacent pointer loads


On Mon, 2012-06-11 at 13:28 +0200, Richard Guenther wrote:
> On Mon, Jun 4, 2012 at 3:45 PM, William J. Schmidt
> <wschmidt@linux.vnet.ibm.com> wrote:
> > Hi Richard,
> >
> > Here's a revision of the hoist-adjacent-loads patch.  I'm sorry for the
> > delay since the last revision, but my performance testing has been
> > blocked waiting for a fix to PR53487.  I ended up applying a test
> > version of the patch to 4.7 and ran performance numbers with that
> > instead, with no degradations.
> >
> > In addition to addressing your comments, this patch contains one bug fix
> > where local_mem_dependence was called on the wrong blocks after swapping
> > def1 and def2.
> >
> > Bootstrapped with no regressions on powerpc64-unknown-linux-gnu.  Is
> > this version ok for trunk?  I won't commit it until I can do final
> > testing on trunk in conjunction with a fix for PR53487.
> >
> > Thanks,
> > Bill
> >
> >
> > 2012-06-04  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
> >
> >        * opts.c: Add -fhoist_adjacent_loads to -O2 and above.
> >        * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
> >        declaration.
> >        (hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
> >        (tree_ssa_phiopt): Call gate_hoist_loads.
> >        (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
> >        (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
> >        hoist_adjacent_loads.
> >        (local_mem_dependence): New function.
> >        (hoist_adjacent_loads): Likewise.
> >        (gate_hoist_loads): Likewise.
> >        * common.opt (fhoist-adjacent-loads): New switch.
> >        * Makefile.in (tree-ssa-phiopt.o): Added dependencies.
> >        * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.
> >
> >
> > Index: gcc/opts.c
> > ===================================================================
> > --- gcc/opts.c  (revision 187805)
> > +++ gcc/opts.c  (working copy)
> > @@ -489,6 +489,7 @@ static const struct default_options default_option
> >     { OPT_LEVELS_2_PLUS, OPT_falign_functions, NULL, 1 },
> >     { OPT_LEVELS_2_PLUS, OPT_ftree_tail_merge, NULL, 1 },
> >     { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 },
> > +    { OPT_LEVELS_2_PLUS, OPT_fhoist_adjacent_loads, NULL, 1 },
> >
> >     /* -O3 optimizations.  */
> >     { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
> > Index: gcc/tree-ssa-phiopt.c
> > ===================================================================
> > --- gcc/tree-ssa-phiopt.c       (revision 187805)
> > +++ gcc/tree-ssa-phiopt.c       (working copy)
> > @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "cfgloop.h"
> >  #include "tree-data-ref.h"
> >  #include "tree-pretty-print.h"
> > +#include "gimple-pretty-print.h"
> > +#include "insn-config.h"
> > +#include "expr.h"
> > +#include "optabs.h"
> >
> > +#ifndef HAVE_conditional_move
> > +#define HAVE_conditional_move (0)
> > +#endif
> > +
> >  static unsigned int tree_ssa_phiopt (void);
> > -static unsigned int tree_ssa_phiopt_worker (bool);
> > +static unsigned int tree_ssa_phiopt_worker (bool, bool);
> >  static bool conditional_replacement (basic_block, basic_block,
> >                                     edge, edge, gimple, tree, tree);
> >  static int value_replacement (basic_block, basic_block,
> > @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
> >  static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
> >  static struct pointer_set_t * get_non_trapping (void);
> >  static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree);
> > +static void hoist_adjacent_loads (basic_block, basic_block,
> > +                                 basic_block, basic_block);
> > +static bool gate_hoist_loads (void);
> >
> >  /* This pass tries to replaces an if-then-else block with an
> >    assignment.  We have four kinds of transformations.  Some of these
> > @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
> >      bb2:
> >        x = PHI <x' (bb0), ...>;
> >
> > -   A similar transformation is done for MAX_EXPR.  */
> > +   A similar transformation is done for MAX_EXPR.
> >
> > +
> > +   This pass also performs a fifth transformation of a slightly different
> > +   flavor.
> > +
> > +   Adjacent Load Hoisting
> > +   ----------------------
> > +
> > +   This transformation replaces
> > +
> > +     bb0:
> > +       if (...) goto bb2; else goto bb1;
> > +     bb1:
> > +       x1 = (<expr>).field1;
> > +       goto bb3;
> > +     bb2:
> > +       x2 = (<expr>).field2;
> > +     bb3:
> > +       # x = PHI <x1, x2>;
> > +
> > +   with
> > +
> > +     bb0:
> > +       x1 = (<expr>).field1;
> > +       x2 = (<expr>).field2;
> > +       if (...) goto bb2; else goto bb1;
> > +     bb1:
> > +       goto bb3;
> > +     bb2:
> > +     bb3:
> > +       # x = PHI <x1, x2>;
> > +
> > +   The purpose of this transformation is to enable generation of conditional
> > +   move instructions such as Intel CMOVE or PowerPC ISEL.  Because one of
> > +   the loads is speculative, the transformation is restricted to very
> > +   specific cases to avoid introducing a page fault.  We are looking for
> > +   the common idiom:
> > +
> > +     if (...)
> > +       x = y->left;
> > +     else
> > +       x = y->right;
> > +
> > +   where left and right are typically adjacent pointers in a tree structure.  */
> > +
> >  static unsigned int
> >  tree_ssa_phiopt (void)
> >  {
> > -  return tree_ssa_phiopt_worker (false);
> > +  return tree_ssa_phiopt_worker (false, gate_hoist_loads ());
> >  }
> >
> >  /* This pass tries to transform conditional stores into unconditional
> > @@ -190,7 +245,7 @@ tree_ssa_phiopt (void)
> >  static unsigned int
> >  tree_ssa_cs_elim (void)
> >  {
> > -  return tree_ssa_phiopt_worker (true);
> > +  return tree_ssa_phiopt_worker (true, false);
> >  }
> >
> >  /* Return the singleton PHI in the SEQ of PHIs for edges E0 and E1. */
> > @@ -227,9 +282,11 @@ static tree condstoretemp;
> >  /* The core routine of conditional store replacement and normal
> >    phi optimizations.  Both share much of the infrastructure in how
> >    to match applicable basic block patterns.  DO_STORE_ELIM is true
> > -   when we want to do conditional store replacement, false otherwise.  */
> > +   when we want to do conditional store replacement, false otherwise.
> > +   DO_HOIST_LOADS is true when we want to hoist adjacent loads out
> > +   of diamond control flow patterns, false otherwise.  */
> >  static unsigned int
> > -tree_ssa_phiopt_worker (bool do_store_elim)
> > +tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
> >  {
> >   basic_block bb;
> >   basic_block *bb_order;
> > @@ -312,6 +369,21 @@ static unsigned int
> >            cfgchanged = true;
> >          continue;
> >        }
> > +      else if (do_hoist_loads
> > +                && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest)
> > +       {
> > +         basic_block bb3 = EDGE_SUCC (bb1, 0)->dest;
> > +
> > +         if (!FLOAT_TYPE_P (TREE_TYPE (gimple_cond_lhs (cond_stmt)))
> > +             && single_succ_p (bb1)
> > +             && single_succ_p (bb2)
> > +             && single_pred_p (bb1)
> > +             && single_pred_p (bb2)
> > +             && EDGE_COUNT (bb->succs) == 2
> > +             && EDGE_COUNT (bb3->preds) == 2)
> > +           hoist_adjacent_loads (bb, bb1, bb2, bb3);
> > +         continue;
> > +       }
> >       else
> >        continue;
> >
> > @@ -1707,6 +1779,207 @@ cond_if_else_store_replacement (basic_block then_b
> >   return ok;
> >  }
> >
> > +/* Return TRUE if STMT has a VUSE whose corresponding VDEF is in BB.  */
> > +
> > +static bool
> > +local_mem_dependence (gimple stmt, basic_block bb)
> > +{
> > +  tree vuse = gimple_vuse (stmt);
> > +  gimple def;
> > +
> > +  if (!vuse)
> > +    return false;
> > +
> > +  def = SSA_NAME_DEF_STMT (vuse);
> > +  return (def && gimple_bb (def) == bb);
> > +}
> > +
> > +/* Given a "diamond" control-flow pattern where BB0 tests a condition,
> > +   BB1 and BB2 are "then" and "else" blocks dependent on this test,
> > +   and BB3 rejoins control flow following BB1 and BB2, look for
> > +   opportunities to hoist loads as follows.  If BB3 contains a PHI of
> > +   two loads, one each occurring in BB1 and BB2, and the loads are
> > +   provably of adjacent fields in the same structure, then move both
> > +   loads into BB0.  Of course this can only be done if there are no
> > +   dependencies preventing such motion.
> > +
> > +   One of the hoisted loads will always be speculative, so the
> > +   transformation is currently conservative:
> > +
> > +    - The fields must be strictly adjacent.
> > +    - The two fields must occupy a single memory block that is
> > +      guaranteed to not cross a page boundary.
> > +
> > +    The last is difficult to prove, as such memory blocks should be
> > +    aligned on the minimum of the stack alignment boundary and the
> > +    alignment guaranteed by heap allocation interfaces.  Thus we rely
> > +    on a parameter for the alignment value.
> > +
> > +    Provided a good value is used for the last case, the first
> > +    restriction could possibly be relaxed.  */
> > +
> > +static void
> > +hoist_adjacent_loads (basic_block bb0, basic_block bb1,
> > +                     basic_block bb2, basic_block bb3)
> > +{
> > +  int param_align = PARAM_VALUE (PARAM_MIN_CMOVE_STRUCT_ALIGN);
> > +  unsigned param_align_bits = (unsigned) (param_align * BITS_PER_UNIT);
> > +  gimple_stmt_iterator gsi;
> > +
> > +  /* Walk the phis in bb3 looking for an opportunity.  We are looking
> > +     for phis of two SSA names, one each of which is defined in bb1 and
> > +     bb2.  */
> > +  for (gsi = gsi_start_phis (bb3); !gsi_end_p (gsi); gsi_next (&gsi))
> > +    {
> > +      gimple phi_stmt = gsi_stmt (gsi);
> > +      gimple def1, def2, defswap;
> > +      tree arg1, arg2, ref1, ref2, field1, field2, fieldswap;
> > +      tree tree_offset1, tree_offset2, tree_size2, next;
> > +      int offset1, offset2, size2;
> > +      unsigned align1;
> > +      gimple_stmt_iterator gsi2;
> > +      basic_block bb_for_def1, bb_for_def2;
> > +
> > +      if (gimple_phi_num_args (phi_stmt) != 2)
> > +       continue;
> > +
> > +      arg1 = gimple_phi_arg_def (phi_stmt, 0);
> > +      arg2 = gimple_phi_arg_def (phi_stmt, 1);
> > +
> > +      if (TREE_CODE (arg1) != SSA_NAME
> > +         || TREE_CODE (arg2) != SSA_NAME
> > +         || SSA_NAME_IS_DEFAULT_DEF (arg1)
> > +         || SSA_NAME_IS_DEFAULT_DEF (arg2))
> > +       continue;
> > +
> > +      def1 = SSA_NAME_DEF_STMT (arg1);
> > +      def2 = SSA_NAME_DEF_STMT (arg2);
> > +
> > +      if ((gimple_bb (def1) != bb1 || gimple_bb (def2) != bb2)
> > +         && (gimple_bb (def2) != bb1 || gimple_bb (def1) != bb2))
> > +       continue;
> > +
> > +      /* Check the mode of the arguments to be sure a conditional move
> > +        can be generated for it.  */
> > +      if (!optab_handler (cmov_optab, TYPE_MODE (TREE_TYPE (arg1))))
> > +       continue;
> > +
> > +      /* Both statements must be assignments whose RHS is a COMPONENT_REF.  */
> > +      if (!gimple_assign_single_p (def1)
> > +         || !gimple_assign_single_p (def2))
> > +       continue;
> > +
> > +      ref1 = gimple_assign_rhs1 (def1);
> > +      ref2 = gimple_assign_rhs1 (def2);
> > +
> > +      if (TREE_CODE (ref1) != COMPONENT_REF
> > +         || TREE_CODE (ref2) != COMPONENT_REF)
> > +       continue;
> > +
> > +      /* The zeroth operand of the two component references must be
> > +        identical.  It is not sufficient to compare get_base_address of
> > +        the two references, because this could allow for different
> > +        elements of the same array in the two trees.  It is not safe to
> > +        assume that the existence of one array element implies the
> > +        existence of a different one.  */
> > +      if (!operand_equal_p (TREE_OPERAND (ref1, 0), TREE_OPERAND (ref2, 0), 0))
> > +       continue;
> > +
> > +      field1 = TREE_OPERAND (ref1, 1);
> > +      field2 = TREE_OPERAND (ref2, 1);
> > +
> > +      /* Check for field adjacency, and ensure field1 comes first.  */
> > +      for (next = DECL_CHAIN (field1);
> > +          next && TREE_CODE (next) != FIELD_DECL;
> > +          next = DECL_CHAIN (next))
> > +       ;
> > +
> > +      if (next != field2)
> > +       {
> > +         for (next = DECL_CHAIN (field2);
> > +              next && TREE_CODE (next) != FIELD_DECL;
> > +              next = DECL_CHAIN (next))
> > +           ;
> > +
> > +         if (next != field1)
> > +           continue;
> > +
> > +         fieldswap = field1;
> > +         field1 = field2;
> > +         field2 = fieldswap;
> > +         defswap = def1;
> > +         def1 = def2;
> > +         def2 = defswap;
> > +         /* Don't swap bb1 and bb2 as we may have more than one
> > +            phi to process successfully.  */
> > +         bb_for_def1 = bb2;
> > +         bb_for_def2 = bb1;
> > +       }
> > +      else
> > +       {
> > +         bb_for_def1 = bb1;
> > +         bb_for_def2 = bb2;
> > +       }
> > +
> > +      /* Check for proper alignment of the first field.  */
> > +      tree_offset1 = bit_position (field1);
> > +      tree_offset2 = bit_position (field2);
> > +      tree_size2 = DECL_SIZE (field2);
> > +
> > +      if (!host_integerp (tree_offset1, 1)
> > +         || !host_integerp (tree_offset2, 1)
> > +         || !host_integerp (tree_size2, 1))
> > +       continue;
> > +
> > +      offset1 = TREE_INT_CST_LOW (tree_offset1);
> > +      offset2 = TREE_INT_CST_LOW (tree_offset2);
> > +      size2 = TREE_INT_CST_LOW (tree_size2);
> > +      align1 = DECL_ALIGN (field1) % param_align_bits;
> > +
> > +      if (offset1 % BITS_PER_UNIT != 0)
> > +       continue;
> > +
> > +      /* The two field references must fit within a block of memory that
> > +        is guaranteed to be on the same page.  */
> > +      if (align1 + offset2 - offset1 + size2 > param_align_bits)
> > +       continue;
> 
> If param_align_bits were the cache-line size then this would be appropriate
> for a check if the transform is profitable.  But the comment suggests it
> is a check for correctness?  We do already have a l1-cache-line-size
> param btw.
> 
> Did you check packed nested structs?  Like
> 
> struct A {
>   int i;
>   int j;
> };
> struct {
>   char c;
>   struct A a;
> } __attribute__((packed)) x;
> 
> and loads of x.a.i and x.a.j?  I _think_ that DECL_ALIGN of the i and j
> FIELD_DECL may still be 4 in this case.  Which means that if you
> are performing the check for correctness then it has to be a lot more
> conservative.
> 
> I _think_ we have concluded that if you see loads of p->a and p->b
> then either both trap or both do not trap, so the check for correctness
> is that you see two COMPONENT_REFs of the same base for
> adjacent fields.

The above is largely a commentary problem.  This isn't a test for
correctness, but a heuristic for profitability.  We don't want to cross
page boundaries with this optimization, at least not often.  As you
point out, for correctness we avoid extra traps by checking for a common
base.  What we're trying to avoid here is two adjacent fields on two
different memory pages so we don't introduce a page fault.

Maybe this is overkill and should just be removed.  It is difficult to
be accurate and effective with such a test.  The idea was to
parameterize on a size that corresponds to the alignment guarantee of
the stack and heap, but if the parameter is too small, it disables the
optimization too frequently when it is useful.

So I could go with fixing the comment, removing the test, or updating it
to something that tries to avoid page faults in a better way, if you
have ideas for that.  Please let me know which seems best.

> 
> Btw, you do not restrict the hoisting to the case where the the/else
> block will be empty afterwards?  It seems to be the transform is
> not profitable if we end up doing the if() anyways - it might even
> be not profitable (similar to the case where either path is very likely
> and the other not - conditional moves are mostly profitable when
> the possibility of the branches is equal).
> 

In general it is difficult to make such a decision accurately.  I have
seen numerous cases where the then/else blocks contain two or more
opportunities for collapsing into a conditional move.  Looking at just
one of them, the opportunity would look bad if we restricted it to only
cases where the blocks became empty.  So that introduces more complexity
to check that all non-control stmts will be removed by some instance of
the transformation.

Also, even without removing the if, we sometimes see profitable
situations like:

if (...)
  {
    a = p.x;
    b = p.y;
    ...
  }
else
  {
    a = p.y;
    b = p.x;
    ...
  }

Here the loads are not speculative at all, so hoisting them is always
good.  (But perhaps PRE and later processing would pick up the cmove
possibilities, not sure.)

In any case I haven't been able to find benchmarks where the heuristic
as written is bad on PowerPC, and our conditional move instruction isn't
always the most efficient in the universe. ;-/  In general, the extra
load is "free" (from a dcache point of view) and the potential extra
fetch cost, etc., tends to wash out due to fetch-ahead inefficiencies
and so on.

Thanks,
Bill

> Thanks,
> Richard.
> 
> > +
> > +      /* The two expressions cannot be dependent upon vdefs defined
> > +        in bb1/bb2.  */
> > +      if (local_mem_dependence (def1, bb_for_def1)
> > +         || local_mem_dependence (def2, bb_for_def2))
> > +       continue;
> > +
> > +      /* The conditions are satisfied; hoist the loads from bb1 and bb2 into
> > +        bb0.  We hoist the first one first so that a cache miss is handled
> > +         efficiently regardless of hardware cache-fill policy.  */
> > +      gsi2 = gsi_for_stmt (def1);
> > +      gsi_move_to_bb_end (&gsi2, bb0);
> > +      gsi2 = gsi_for_stmt (def2);
> > +      gsi_move_to_bb_end (&gsi2, bb0);
> > +
> > +      if (dump_file && (dump_flags & TDF_DETAILS))
> > +       {
> > +         fprintf (dump_file,
> > +                  "\nHoisting adjacent loads from %d and %d into %d: \n",
> > +                  bb_for_def1->index, bb_for_def2->index, bb0->index);
> > +         print_gimple_stmt (dump_file, def1, 0, TDF_VOPS|TDF_MEMSYMS);
> > +         print_gimple_stmt (dump_file, def2, 0, TDF_VOPS|TDF_MEMSYMS);
> > +       }
> > +    }
> > +}
> > +
> > +/* Determine whether we should attempt to hoist adjacent loads out of
> > +   diamond patterns in pass_phiopt.  Always hoist loads if
> > +   -fhoist-adjacent-loads is specified and the target machine has
> > +   a conditional move instruction.  */
> > +
> > +static bool
> > +gate_hoist_loads (void)
> > +{
> > +  return (flag_hoist_adjacent_loads == 1 && HAVE_conditional_move);
> > +}
> > +
> >  /* Always do these optimizations if we have SSA
> >    trees to work on.  */
> >  static bool
> > Index: gcc/common.opt
> > ===================================================================
> > --- gcc/common.opt      (revision 187805)
> > +++ gcc/common.opt      (working copy)
> > @@ -1186,6 +1186,11 @@ fgraphite-identity
> >  Common Report Var(flag_graphite_identity) Optimization
> >  Enable Graphite Identity transformation
> >
> > +fhoist-adjacent-loads
> > +Common Report Var(flag_hoist_adjacent_loads) Optimization
> > +Enable hoisting adjacent loads to encourage generating conditional move
> > +instructions
> > +
> >  floop-parallelize-all
> >  Common Report Var(flag_loop_parallelize_all) Optimization
> >  Mark all loops as parallel
> > Index: gcc/Makefile.in
> > ===================================================================
> > --- gcc/Makefile.in     (revision 187805)
> > +++ gcc/Makefile.in     (working copy)
> > @@ -2355,7 +2355,8 @@ tree-ssa-phiopt.o : tree-ssa-phiopt.c $(CONFIG_H)
> >    $(TM_H) $(GGC_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
> >    $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) langhooks.h $(FLAGS_H) \
> >    $(DIAGNOSTIC_H) $(TIMEVAR_H) pointer-set.h domwalk.h $(CFGLOOP_H) \
> > -   $(TREE_DATA_REF_H) tree-pretty-print.h
> > +   $(TREE_DATA_REF_H) tree-pretty-print.h gimple-pretty-print.h \
> > +   insn-config.h $(EXPR_H) $(OPTABS_H)
> >  tree-nrv.o : tree-nrv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
> >    $(TM_H) $(TREE_H) $(FUNCTION_H) $(BASIC_BLOCK_H) $(FLAGS_H) \
> >    $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TIMEVAR_H) $(TREE_DUMP_H) $(TREE_PASS_H) \
> > Index: gcc/params.def
> > ===================================================================
> > --- gcc/params.def      (revision 187805)
> > +++ gcc/params.def      (working copy)
> > @@ -979,6 +979,13 @@ DEFPARAM (PARAM_MAX_TRACKED_STRLENS,
> >          "track string lengths",
> >          1000, 0, 0)
> >
> > +/* For adjacent load hoisting transformation in tree-ssa-phiopt.c.  */
> > +DEFPARAM (PARAM_MIN_CMOVE_STRUCT_ALIGN,
> > +         "min-cmove-struct-align",
> > +         "Minimum byte alignment to assume for structures in the stack "
> > +         "or heap when considering load hoisting for conditional moves",
> > +         32, 8, 256)
> > +
> >  /*
> >  Local variables:
> >  mode:c
> >
> >
> 


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]