This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] Hoist adjacent pointer loads


On Mon, Jun 4, 2012 at 3:45 PM, William J. Schmidt
<wschmidt@linux.vnet.ibm.com> wrote:
> Hi Richard,
>
> Here's a revision of the hoist-adjacent-loads patch. ?I'm sorry for the
> delay since the last revision, but my performance testing has been
> blocked waiting for a fix to PR53487. ?I ended up applying a test
> version of the patch to 4.7 and ran performance numbers with that
> instead, with no degradations.
>
> In addition to addressing your comments, this patch contains one bug fix
> where local_mem_dependence was called on the wrong blocks after swapping
> def1 and def2.
>
> Bootstrapped with no regressions on powerpc64-unknown-linux-gnu. ?Is
> this version ok for trunk? ?I won't commit it until I can do final
> testing on trunk in conjunction with a fix for PR53487.
>
> Thanks,
> Bill
>
>
> 2012-06-04 ?Bill Schmidt ?<wschmidt@linux.vnet.ibm.com>
>
> ? ? ? ?* opts.c: Add -fhoist_adjacent_loads to -O2 and above.
> ? ? ? ?* tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
> ? ? ? ?declaration.
> ? ? ? ?(hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
> ? ? ? ?(tree_ssa_phiopt): Call gate_hoist_loads.
> ? ? ? ?(tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
> ? ? ? ?(tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
> ? ? ? ?hoist_adjacent_loads.
> ? ? ? ?(local_mem_dependence): New function.
> ? ? ? ?(hoist_adjacent_loads): Likewise.
> ? ? ? ?(gate_hoist_loads): Likewise.
> ? ? ? ?* common.opt (fhoist-adjacent-loads): New switch.
> ? ? ? ?* Makefile.in (tree-ssa-phiopt.o): Added dependencies.
> ? ? ? ?* params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.
>
>
> Index: gcc/opts.c
> ===================================================================
> --- gcc/opts.c ?(revision 187805)
> +++ gcc/opts.c ?(working copy)
> @@ -489,6 +489,7 @@ static const struct default_options default_option
> ? ? { OPT_LEVELS_2_PLUS, OPT_falign_functions, NULL, 1 },
> ? ? { OPT_LEVELS_2_PLUS, OPT_ftree_tail_merge, NULL, 1 },
> ? ? { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 },
> + ? ?{ OPT_LEVELS_2_PLUS, OPT_fhoist_adjacent_loads, NULL, 1 },
>
> ? ? /* -O3 optimizations. ?*/
> ? ? { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
> Index: gcc/tree-ssa-phiopt.c
> ===================================================================
> --- gcc/tree-ssa-phiopt.c ? ? ? (revision 187805)
> +++ gcc/tree-ssa-phiopt.c ? ? ? (working copy)
> @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3. ?If not see
> ?#include "cfgloop.h"
> ?#include "tree-data-ref.h"
> ?#include "tree-pretty-print.h"
> +#include "gimple-pretty-print.h"
> +#include "insn-config.h"
> +#include "expr.h"
> +#include "optabs.h"
>
> +#ifndef HAVE_conditional_move
> +#define HAVE_conditional_move (0)
> +#endif
> +
> ?static unsigned int tree_ssa_phiopt (void);
> -static unsigned int tree_ssa_phiopt_worker (bool);
> +static unsigned int tree_ssa_phiopt_worker (bool, bool);
> ?static bool conditional_replacement (basic_block, basic_block,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? edge, edge, gimple, tree, tree);
> ?static int value_replacement (basic_block, basic_block,
> @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
> ?static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
> ?static struct pointer_set_t * get_non_trapping (void);
> ?static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree);
> +static void hoist_adjacent_loads (basic_block, basic_block,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? basic_block, basic_block);
> +static bool gate_hoist_loads (void);
>
> ?/* This pass tries to replaces an if-then-else block with an
> ? ?assignment. ?We have four kinds of transformations. ?Some of these
> @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
> ? ? ?bb2:
> ? ? ? ?x = PHI <x' (bb0), ...>;
>
> - ? A similar transformation is done for MAX_EXPR. ?*/
> + ? A similar transformation is done for MAX_EXPR.
>
> +
> + ? This pass also performs a fifth transformation of a slightly different
> + ? flavor.
> +
> + ? Adjacent Load Hoisting
> + ? ----------------------
> +
> + ? This transformation replaces
> +
> + ? ? bb0:
> + ? ? ? if (...) goto bb2; else goto bb1;
> + ? ? bb1:
> + ? ? ? x1 = (<expr>).field1;
> + ? ? ? goto bb3;
> + ? ? bb2:
> + ? ? ? x2 = (<expr>).field2;
> + ? ? bb3:
> + ? ? ? # x = PHI <x1, x2>;
> +
> + ? with
> +
> + ? ? bb0:
> + ? ? ? x1 = (<expr>).field1;
> + ? ? ? x2 = (<expr>).field2;
> + ? ? ? if (...) goto bb2; else goto bb1;
> + ? ? bb1:
> + ? ? ? goto bb3;
> + ? ? bb2:
> + ? ? bb3:
> + ? ? ? # x = PHI <x1, x2>;
> +
> + ? The purpose of this transformation is to enable generation of conditional
> + ? move instructions such as Intel CMOVE or PowerPC ISEL. ?Because one of
> + ? the loads is speculative, the transformation is restricted to very
> + ? specific cases to avoid introducing a page fault. ?We are looking for
> + ? the common idiom:
> +
> + ? ? if (...)
> + ? ? ? x = y->left;
> + ? ? else
> + ? ? ? x = y->right;
> +
> + ? where left and right are typically adjacent pointers in a tree structure. ?*/
> +
> ?static unsigned int
> ?tree_ssa_phiopt (void)
> ?{
> - ?return tree_ssa_phiopt_worker (false);
> + ?return tree_ssa_phiopt_worker (false, gate_hoist_loads ());
> ?}
>
> ?/* This pass tries to transform conditional stores into unconditional
> @@ -190,7 +245,7 @@ tree_ssa_phiopt (void)
> ?static unsigned int
> ?tree_ssa_cs_elim (void)
> ?{
> - ?return tree_ssa_phiopt_worker (true);
> + ?return tree_ssa_phiopt_worker (true, false);
> ?}
>
> ?/* Return the singleton PHI in the SEQ of PHIs for edges E0 and E1. */
> @@ -227,9 +282,11 @@ static tree condstoretemp;
> ?/* The core routine of conditional store replacement and normal
> ? ?phi optimizations. ?Both share much of the infrastructure in how
> ? ?to match applicable basic block patterns. ?DO_STORE_ELIM is true
> - ? when we want to do conditional store replacement, false otherwise. ?*/
> + ? when we want to do conditional store replacement, false otherwise.
> + ? DO_HOIST_LOADS is true when we want to hoist adjacent loads out
> + ? of diamond control flow patterns, false otherwise. ?*/
> ?static unsigned int
> -tree_ssa_phiopt_worker (bool do_store_elim)
> +tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
> ?{
> ? basic_block bb;
> ? basic_block *bb_order;
> @@ -312,6 +369,21 @@ static unsigned int
> ? ? ? ? ? ?cfgchanged = true;
> ? ? ? ? ?continue;
> ? ? ? ?}
> + ? ? ?else if (do_hoist_loads
> + ? ? ? ? ? ? ? ?&& EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest)
> + ? ? ? {
> + ? ? ? ? basic_block bb3 = EDGE_SUCC (bb1, 0)->dest;
> +
> + ? ? ? ? if (!FLOAT_TYPE_P (TREE_TYPE (gimple_cond_lhs (cond_stmt)))
> + ? ? ? ? ? ? && single_succ_p (bb1)
> + ? ? ? ? ? ? && single_succ_p (bb2)
> + ? ? ? ? ? ? && single_pred_p (bb1)
> + ? ? ? ? ? ? && single_pred_p (bb2)
> + ? ? ? ? ? ? && EDGE_COUNT (bb->succs) == 2
> + ? ? ? ? ? ? && EDGE_COUNT (bb3->preds) == 2)
> + ? ? ? ? ? hoist_adjacent_loads (bb, bb1, bb2, bb3);
> + ? ? ? ? continue;
> + ? ? ? }
> ? ? ? else
> ? ? ? ?continue;
>
> @@ -1707,6 +1779,207 @@ cond_if_else_store_replacement (basic_block then_b
> ? return ok;
> ?}
>
> +/* Return TRUE if STMT has a VUSE whose corresponding VDEF is in BB. ?*/
> +
> +static bool
> +local_mem_dependence (gimple stmt, basic_block bb)
> +{
> + ?tree vuse = gimple_vuse (stmt);
> + ?gimple def;
> +
> + ?if (!vuse)
> + ? ?return false;
> +
> + ?def = SSA_NAME_DEF_STMT (vuse);
> + ?return (def && gimple_bb (def) == bb);
> +}
> +
> +/* Given a "diamond" control-flow pattern where BB0 tests a condition,
> + ? BB1 and BB2 are "then" and "else" blocks dependent on this test,
> + ? and BB3 rejoins control flow following BB1 and BB2, look for
> + ? opportunities to hoist loads as follows. ?If BB3 contains a PHI of
> + ? two loads, one each occurring in BB1 and BB2, and the loads are
> + ? provably of adjacent fields in the same structure, then move both
> + ? loads into BB0. ?Of course this can only be done if there are no
> + ? dependencies preventing such motion.
> +
> + ? One of the hoisted loads will always be speculative, so the
> + ? transformation is currently conservative:
> +
> + ? ?- The fields must be strictly adjacent.
> + ? ?- The two fields must occupy a single memory block that is
> + ? ? ?guaranteed to not cross a page boundary.
> +
> + ? ?The last is difficult to prove, as such memory blocks should be
> + ? ?aligned on the minimum of the stack alignment boundary and the
> + ? ?alignment guaranteed by heap allocation interfaces. ?Thus we rely
> + ? ?on a parameter for the alignment value.
> +
> + ? ?Provided a good value is used for the last case, the first
> + ? ?restriction could possibly be relaxed. ?*/
> +
> +static void
> +hoist_adjacent_loads (basic_block bb0, basic_block bb1,
> + ? ? ? ? ? ? ? ? ? ? basic_block bb2, basic_block bb3)
> +{
> + ?int param_align = PARAM_VALUE (PARAM_MIN_CMOVE_STRUCT_ALIGN);
> + ?unsigned param_align_bits = (unsigned) (param_align * BITS_PER_UNIT);
> + ?gimple_stmt_iterator gsi;
> +
> + ?/* Walk the phis in bb3 looking for an opportunity. ?We are looking
> + ? ? for phis of two SSA names, one each of which is defined in bb1 and
> + ? ? bb2. ?*/
> + ?for (gsi = gsi_start_phis (bb3); !gsi_end_p (gsi); gsi_next (&gsi))
> + ? ?{
> + ? ? ?gimple phi_stmt = gsi_stmt (gsi);
> + ? ? ?gimple def1, def2, defswap;
> + ? ? ?tree arg1, arg2, ref1, ref2, field1, field2, fieldswap;
> + ? ? ?tree tree_offset1, tree_offset2, tree_size2, next;
> + ? ? ?int offset1, offset2, size2;
> + ? ? ?unsigned align1;
> + ? ? ?gimple_stmt_iterator gsi2;
> + ? ? ?basic_block bb_for_def1, bb_for_def2;
> +
> + ? ? ?if (gimple_phi_num_args (phi_stmt) != 2)
> + ? ? ? continue;
> +
> + ? ? ?arg1 = gimple_phi_arg_def (phi_stmt, 0);
> + ? ? ?arg2 = gimple_phi_arg_def (phi_stmt, 1);
> +
> + ? ? ?if (TREE_CODE (arg1) != SSA_NAME
> + ? ? ? ? || TREE_CODE (arg2) != SSA_NAME
> + ? ? ? ? || SSA_NAME_IS_DEFAULT_DEF (arg1)
> + ? ? ? ? || SSA_NAME_IS_DEFAULT_DEF (arg2))
> + ? ? ? continue;
> +
> + ? ? ?def1 = SSA_NAME_DEF_STMT (arg1);
> + ? ? ?def2 = SSA_NAME_DEF_STMT (arg2);
> +
> + ? ? ?if ((gimple_bb (def1) != bb1 || gimple_bb (def2) != bb2)
> + ? ? ? ? && (gimple_bb (def2) != bb1 || gimple_bb (def1) != bb2))
> + ? ? ? continue;
> +
> + ? ? ?/* Check the mode of the arguments to be sure a conditional move
> + ? ? ? ?can be generated for it. ?*/
> + ? ? ?if (!optab_handler (cmov_optab, TYPE_MODE (TREE_TYPE (arg1))))
> + ? ? ? continue;
> +
> + ? ? ?/* Both statements must be assignments whose RHS is a COMPONENT_REF. ?*/
> + ? ? ?if (!gimple_assign_single_p (def1)
> + ? ? ? ? || !gimple_assign_single_p (def2))
> + ? ? ? continue;
> +
> + ? ? ?ref1 = gimple_assign_rhs1 (def1);
> + ? ? ?ref2 = gimple_assign_rhs1 (def2);
> +
> + ? ? ?if (TREE_CODE (ref1) != COMPONENT_REF
> + ? ? ? ? || TREE_CODE (ref2) != COMPONENT_REF)
> + ? ? ? continue;
> +
> + ? ? ?/* The zeroth operand of the two component references must be
> + ? ? ? ?identical. ?It is not sufficient to compare get_base_address of
> + ? ? ? ?the two references, because this could allow for different
> + ? ? ? ?elements of the same array in the two trees. ?It is not safe to
> + ? ? ? ?assume that the existence of one array element implies the
> + ? ? ? ?existence of a different one. ?*/
> + ? ? ?if (!operand_equal_p (TREE_OPERAND (ref1, 0), TREE_OPERAND (ref2, 0), 0))
> + ? ? ? continue;
> +
> + ? ? ?field1 = TREE_OPERAND (ref1, 1);
> + ? ? ?field2 = TREE_OPERAND (ref2, 1);
> +
> + ? ? ?/* Check for field adjacency, and ensure field1 comes first. ?*/
> + ? ? ?for (next = DECL_CHAIN (field1);
> + ? ? ? ? ?next && TREE_CODE (next) != FIELD_DECL;
> + ? ? ? ? ?next = DECL_CHAIN (next))
> + ? ? ? ;
> +
> + ? ? ?if (next != field2)
> + ? ? ? {
> + ? ? ? ? for (next = DECL_CHAIN (field2);
> + ? ? ? ? ? ? ?next && TREE_CODE (next) != FIELD_DECL;
> + ? ? ? ? ? ? ?next = DECL_CHAIN (next))
> + ? ? ? ? ? ;
> +
> + ? ? ? ? if (next != field1)
> + ? ? ? ? ? continue;
> +
> + ? ? ? ? fieldswap = field1;
> + ? ? ? ? field1 = field2;
> + ? ? ? ? field2 = fieldswap;
> + ? ? ? ? defswap = def1;
> + ? ? ? ? def1 = def2;
> + ? ? ? ? def2 = defswap;
> + ? ? ? ? /* Don't swap bb1 and bb2 as we may have more than one
> + ? ? ? ? ? ?phi to process successfully. ?*/
> + ? ? ? ? bb_for_def1 = bb2;
> + ? ? ? ? bb_for_def2 = bb1;
> + ? ? ? }
> + ? ? ?else
> + ? ? ? {
> + ? ? ? ? bb_for_def1 = bb1;
> + ? ? ? ? bb_for_def2 = bb2;
> + ? ? ? }
> +
> + ? ? ?/* Check for proper alignment of the first field. ?*/
> + ? ? ?tree_offset1 = bit_position (field1);
> + ? ? ?tree_offset2 = bit_position (field2);
> + ? ? ?tree_size2 = DECL_SIZE (field2);
> +
> + ? ? ?if (!host_integerp (tree_offset1, 1)
> + ? ? ? ? || !host_integerp (tree_offset2, 1)
> + ? ? ? ? || !host_integerp (tree_size2, 1))
> + ? ? ? continue;
> +
> + ? ? ?offset1 = TREE_INT_CST_LOW (tree_offset1);
> + ? ? ?offset2 = TREE_INT_CST_LOW (tree_offset2);
> + ? ? ?size2 = TREE_INT_CST_LOW (tree_size2);
> + ? ? ?align1 = DECL_ALIGN (field1) % param_align_bits;
> +
> + ? ? ?if (offset1 % BITS_PER_UNIT != 0)
> + ? ? ? continue;
> +
> + ? ? ?/* The two field references must fit within a block of memory that
> + ? ? ? ?is guaranteed to be on the same page. ?*/
> + ? ? ?if (align1 + offset2 - offset1 + size2 > param_align_bits)
> + ? ? ? continue;

If param_align_bits were the cache-line size then this would be appropriate
for a check if the transform is profitable.  But the comment suggests it
is a check for correctness?  We do already have a l1-cache-line-size
param btw.

Did you check packed nested structs?  Like

struct A {
  int i;
  int j;
};
struct {
  char c;
  struct A a;
} __attribute__((packed)) x;

and loads of x.a.i and x.a.j?  I _think_ that DECL_ALIGN of the i and j
FIELD_DECL may still be 4 in this case.  Which means that if you
are performing the check for correctness then it has to be a lot more
conservative.

I _think_ we have concluded that if you see loads of p->a and p->b
then either both trap or both do not trap, so the check for correctness
is that you see two COMPONENT_REFs of the same base for
adjacent fields.

Btw, you do not restrict the hoisting to the case where the the/else
block will be empty afterwards?  It seems to be the transform is
not profitable if we end up doing the if() anyways - it might even
be not profitable (similar to the case where either path is very likely
and the other not - conditional moves are mostly profitable when
the possibility of the branches is equal).

Thanks,
Richard.

> +
> + ? ? ?/* The two expressions cannot be dependent upon vdefs defined
> + ? ? ? ?in bb1/bb2. ?*/
> + ? ? ?if (local_mem_dependence (def1, bb_for_def1)
> + ? ? ? ? || local_mem_dependence (def2, bb_for_def2))
> + ? ? ? continue;
> +
> + ? ? ?/* The conditions are satisfied; hoist the loads from bb1 and bb2 into
> + ? ? ? ?bb0. ?We hoist the first one first so that a cache miss is handled
> + ? ? ? ? efficiently regardless of hardware cache-fill policy. ?*/
> + ? ? ?gsi2 = gsi_for_stmt (def1);
> + ? ? ?gsi_move_to_bb_end (&gsi2, bb0);
> + ? ? ?gsi2 = gsi_for_stmt (def2);
> + ? ? ?gsi_move_to_bb_end (&gsi2, bb0);
> +
> + ? ? ?if (dump_file && (dump_flags & TDF_DETAILS))
> + ? ? ? {
> + ? ? ? ? fprintf (dump_file,
> + ? ? ? ? ? ? ? ? ?"\nHoisting adjacent loads from %d and %d into %d: \n",
> + ? ? ? ? ? ? ? ? ?bb_for_def1->index, bb_for_def2->index, bb0->index);
> + ? ? ? ? print_gimple_stmt (dump_file, def1, 0, TDF_VOPS|TDF_MEMSYMS);
> + ? ? ? ? print_gimple_stmt (dump_file, def2, 0, TDF_VOPS|TDF_MEMSYMS);
> + ? ? ? }
> + ? ?}
> +}
> +
> +/* Determine whether we should attempt to hoist adjacent loads out of
> + ? diamond patterns in pass_phiopt. ?Always hoist loads if
> + ? -fhoist-adjacent-loads is specified and the target machine has
> + ? a conditional move instruction. ?*/
> +
> +static bool
> +gate_hoist_loads (void)
> +{
> + ?return (flag_hoist_adjacent_loads == 1 && HAVE_conditional_move);
> +}
> +
> ?/* Always do these optimizations if we have SSA
> ? ?trees to work on. ?*/
> ?static bool
> Index: gcc/common.opt
> ===================================================================
> --- gcc/common.opt ? ? ?(revision 187805)
> +++ gcc/common.opt ? ? ?(working copy)
> @@ -1186,6 +1186,11 @@ fgraphite-identity
> ?Common Report Var(flag_graphite_identity) Optimization
> ?Enable Graphite Identity transformation
>
> +fhoist-adjacent-loads
> +Common Report Var(flag_hoist_adjacent_loads) Optimization
> +Enable hoisting adjacent loads to encourage generating conditional move
> +instructions
> +
> ?floop-parallelize-all
> ?Common Report Var(flag_loop_parallelize_all) Optimization
> ?Mark all loops as parallel
> Index: gcc/Makefile.in
> ===================================================================
> --- gcc/Makefile.in ? ? (revision 187805)
> +++ gcc/Makefile.in ? ? (working copy)
> @@ -2355,7 +2355,8 @@ tree-ssa-phiopt.o : tree-ssa-phiopt.c $(CONFIG_H)
> ? ?$(TM_H) $(GGC_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
> ? ?$(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) langhooks.h $(FLAGS_H) \
> ? ?$(DIAGNOSTIC_H) $(TIMEVAR_H) pointer-set.h domwalk.h $(CFGLOOP_H) \
> - ? $(TREE_DATA_REF_H) tree-pretty-print.h
> + ? $(TREE_DATA_REF_H) tree-pretty-print.h gimple-pretty-print.h \
> + ? insn-config.h $(EXPR_H) $(OPTABS_H)
> ?tree-nrv.o : tree-nrv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
> ? ?$(TM_H) $(TREE_H) $(FUNCTION_H) $(BASIC_BLOCK_H) $(FLAGS_H) \
> ? ?$(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TIMEVAR_H) $(TREE_DUMP_H) $(TREE_PASS_H) \
> Index: gcc/params.def
> ===================================================================
> --- gcc/params.def ? ? ?(revision 187805)
> +++ gcc/params.def ? ? ?(working copy)
> @@ -979,6 +979,13 @@ DEFPARAM (PARAM_MAX_TRACKED_STRLENS,
> ? ? ? ? ?"track string lengths",
> ? ? ? ? ?1000, 0, 0)
>
> +/* For adjacent load hoisting transformation in tree-ssa-phiopt.c. ?*/
> +DEFPARAM (PARAM_MIN_CMOVE_STRUCT_ALIGN,
> + ? ? ? ? "min-cmove-struct-align",
> + ? ? ? ? "Minimum byte alignment to assume for structures in the stack "
> + ? ? ? ? "or heap when considering load hoisting for conditional moves",
> + ? ? ? ? 32, 8, 256)
> +
> ?/*
> ?Local variables:
> ?mode:c
>
>


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]