[patch] enabling vectorization by default at -O3

Daniel Berlin dberlin@dberlin.org
Thu Sep 6 21:43:00 GMT 2007


I can't approve it; you need a middle-end maintainer.
(Yo guys who can approve, see patch below!)


On 9/6/07, H.J. Lu <hjl@lucon.org> wrote:
> On Thu, Sep 06, 2007 at 10:48:43AM -0400, Daniel Berlin wrote:
> > On 9/6/07, H.J. Lu <hjl@lucon.org> wrote:
> > > On Thu, Sep 06, 2007 at 09:23:31AM -0400, Daniel Berlin wrote:
> > > > On 9/6/07, H.J. Lu <hjl@lucon.org> wrote:
> > > > > On Thu, Sep 06, 2007 at 01:49:52PM +0200, Uros Bizjak wrote:
> > > > > > >
> > > > > > > > * Hmm, why is -ffast-math slower? And with vectorization that much
> > > > >
> > > > > Also see
> > > > >
> > > > > http://gcc.gnu.org/bugzilla/show_bug.cgi?id=32183
> > > > >
> > > > > > With -O2 -ffast-math, we turn a faster loop:
> > > > >
> > > > >       float sf;
> > > > >       ...
> > > > >       sf = 500 * sf;
> > > > >       for (i = 0; i < ceplen; i++)
> > > > >         sum[i] *= sf;
> > > > >
> > > > > into a slower loop:
> > > > >
> > > > >       for (i = 0; i < ceplen; i++)
> > > > >         sum[i] = (sum[i]* 500)*sf;
> > > > >
> > > > > > > slower? I recheck induct (V.F, NV.F) and I could reproduce the timings.
> > > > > > >
> > > > > >
> > > > > > > that is indeed interesting (I'd be happy to look at a testcase)
> > > > > >
> > > > > > This is PR 32084, http://gcc.gnu.org/bugzilla/show_bug.cgi?id=32084
> > > > > >
> > > >
> > > > I still don't remember why we have reassoc2.  I'm in favor of removing
> > > > it unless someone can show it's producing performance improvements :)
> > >
> > > I got
> > >
> > > Here are SPEC CPU 2006 -O2 -ffast-math differences between revision
> > > 125281 without the second reassoc and revision 125281 on Intel64:
> >
> > Okay, then I guess we should fix it.  I think we should just use
> > zdenek's patch for now, and if anyone complains about lack of
> > reassociation across loop boundaries, we fix that then.
>
> I have been using this patch for several months on Linux/x86-64,
> Linux/ia64 and Linux/ia32 without any regressions. OK to install?
>
> Thanks.
>
>
> H.J.
> ----
> 2007-06-04  Zdenek Dvorak  <ook@ucw.cz>
>
>         PR tree-optimization/32183
>         * Makefile.in (tree-ssa-reassoc.o): Also depend on $(CFGLOOP_H).
>
>         * tree-ssa-reassoc.c: Include cfgloop.h.
>         (is_reassociable_op): Add a loop argument and return true only
>         for statements inside the loop.
>         (linearize_expr): Updated.
>         (should_break_up_subtract): Likewise.
>         (linearize_expr_tree): Likewise.
>         (init_reassoc): Call loop_optimizer_init with
>         AVOID_CFG_MODIFICATIONS.  Remove calculate_dominance_info call
>         with CDI_DOMINATORS.
>         (fini_reassoc): Call loop_optimizer_finalize.
>
> --- gcc/Makefile.in.reassoc     2007-09-02 05:27:10.000000000 -0700
> +++ gcc/Makefile.in     2007-09-02 05:27:10.000000000 -0700
> @@ -2202,7 +2202,7 @@ tree-ssa-reassoc.o : tree-ssa-reassoc.c
>     $(SYSTEM_H) $(TREE_H) $(GGC_H) $(DIAGNOSTIC_H) errors.h $(TIMEVAR_H) \
>     $(TM_H) coretypes.h $(TREE_DUMP_H) tree-pass.h $(FLAGS_H) tree-iterator.h\
>     $(BASIC_BLOCK_H) $(TREE_GIMPLE_H) $(TREE_INLINE_H) vec.h \
> -   alloc-pool.h pointer-set.h
> +   alloc-pool.h pointer-set.h $(CFGLOOP_H)
>  tree-optimize.o : tree-optimize.c $(TREE_FLOW_H) $(CONFIG_H) $(SYSTEM_H) \
>     $(RTL_H) $(TREE_H) $(TM_P_H) $(EXPR_H) $(GGC_H) output.h $(DIAGNOSTIC_H) \
>     $(FLAGS_H) $(TIMEVAR_H) $(TM_H) coretypes.h $(TREE_DUMP_H) toplev.h \
> --- gcc/tree-ssa-reassoc.c.reassoc      2007-08-09 07:12:26.000000000 -0700
> +++ gcc/tree-ssa-reassoc.c      2007-09-02 05:27:10.000000000 -0700
> @@ -38,6 +38,7 @@ along with GCC; see the file COPYING3.
>  #include "vec.h"
>  #include "langhooks.h"
>  #include "pointer-set.h"
> +#include "cfgloop.h"
>
>  /*  This is a simple global reassociation pass.  It is, in part, based
>      on the LLVM pass of the same name (They do some things more/less
> @@ -344,13 +345,21 @@ add_to_ops_vec (VEC(operand_entry_t, hea
>  }
>
>  /* Return true if STMT is reassociable operation containing a binary
> -   operation with tree code CODE.  */
> +   operation with tree code CODE, and is inside LOOP.  */
>
>  static bool
> -is_reassociable_op (tree stmt, enum tree_code code)
> +is_reassociable_op (tree stmt, enum tree_code code, struct loop *loop)
>  {
> -  if (!IS_EMPTY_STMT (stmt)
> -      && TREE_CODE (stmt) == GIMPLE_MODIFY_STMT
> +  basic_block bb;
> +
> +  if (IS_EMPTY_STMT (stmt))
> +    return false;
> +
> +  bb = bb_for_stmt (stmt);
> +  if (!flow_bb_inside_loop_p (loop, bb))
> +    return false;
> +
> +  if (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT
>        && TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1)) == code
>        && has_single_use (GIMPLE_STMT_OPERAND (stmt, 0)))
>      return true;
> @@ -929,9 +938,10 @@ linearize_expr (tree stmt)
>    tree binrhs = SSA_NAME_DEF_STMT (TREE_OPERAND (rhs, 1));
>    tree binlhs = SSA_NAME_DEF_STMT (TREE_OPERAND (rhs, 0));
>    tree newbinrhs = NULL_TREE;
> +  struct loop *loop = loop_containing_stmt (stmt);
>
> -  gcc_assert (is_reassociable_op (binlhs, TREE_CODE (rhs))
> -             && is_reassociable_op (binrhs, TREE_CODE (rhs)));
> +  gcc_assert (is_reassociable_op (binlhs, TREE_CODE (rhs), loop)
> +             && is_reassociable_op (binrhs, TREE_CODE (rhs), loop));
>
>    bsinow = bsi_for_stmt (stmt);
>    bsirhs = bsi_for_stmt (binrhs);
> @@ -959,9 +969,8 @@ linearize_expr (tree stmt)
>    TREE_VISITED (stmt) = 1;
>
>    /* Tail recurse on the new rhs if it still needs reassociation.  */
> -  if (newbinrhs && is_reassociable_op (newbinrhs, rhscode))
> +  if (newbinrhs && is_reassociable_op (newbinrhs, rhscode, loop))
>      linearize_expr (stmt);
> -
>  }
>
>  /* If LHS has a single immediate use that is a GIMPLE_MODIFY_STMT, return
> @@ -1046,13 +1055,14 @@ should_break_up_subtract (tree stmt)
>    tree binlhs = TREE_OPERAND (rhs, 0);
>    tree binrhs = TREE_OPERAND (rhs, 1);
>    tree immusestmt;
> +  struct loop *loop = loop_containing_stmt (stmt);
>
>    if (TREE_CODE (binlhs) == SSA_NAME
> -      && is_reassociable_op (SSA_NAME_DEF_STMT (binlhs), PLUS_EXPR))
> +      && is_reassociable_op (SSA_NAME_DEF_STMT (binlhs), PLUS_EXPR, loop))
>      return true;
>
>    if (TREE_CODE (binrhs) == SSA_NAME
> -      && is_reassociable_op (SSA_NAME_DEF_STMT (binrhs), PLUS_EXPR))
> +      && is_reassociable_op (SSA_NAME_DEF_STMT (binrhs), PLUS_EXPR, loop))
>      return true;
>
>    if (TREE_CODE (lhs) == SSA_NAME
> @@ -1096,19 +1106,20 @@ linearize_expr_tree (VEC(operand_entry_t
>    bool binlhsisreassoc = false;
>    bool binrhsisreassoc = false;
>    enum tree_code rhscode = TREE_CODE (rhs);
> +  struct loop *loop = loop_containing_stmt (stmt);
>
>    TREE_VISITED (stmt) = 1;
>
>    if (TREE_CODE (binlhs) == SSA_NAME)
>      {
>        binlhsdef = SSA_NAME_DEF_STMT (binlhs);
> -      binlhsisreassoc = is_reassociable_op (binlhsdef, rhscode);
> +      binlhsisreassoc = is_reassociable_op (binlhsdef, rhscode, loop);
>      }
>
>    if (TREE_CODE (binrhs) == SSA_NAME)
>      {
>        binrhsdef = SSA_NAME_DEF_STMT (binrhs);
> -      binrhsisreassoc = is_reassociable_op (binrhsdef, rhscode);
> +      binrhsisreassoc = is_reassociable_op (binrhsdef, rhscode, loop);
>      }
>
>    /* If the LHS is not reassociable, but the RHS is, we need to swap
> @@ -1159,7 +1170,8 @@ linearize_expr_tree (VEC(operand_entry_t
>      }
>
>    gcc_assert (TREE_CODE (binrhs) != SSA_NAME
> -             || !is_reassociable_op (SSA_NAME_DEF_STMT (binrhs), rhscode));
> +             || !is_reassociable_op (SSA_NAME_DEF_STMT (binrhs),
> +                                     rhscode, loop));
>    bsinow = bsi_for_stmt (stmt);
>    bsilhs = bsi_for_stmt (SSA_NAME_DEF_STMT (binlhs));
>    bsi_move_before (&bsilhs, &bsinow);
> @@ -1399,6 +1411,10 @@ init_reassoc (void)
>    tree param;
>    int *bbs = XNEWVEC (int, last_basic_block + 1);
>
> +  /* Find the loops, so that we can prevent moving calculations in
> +     them.  */
> +  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
> +
>    memset (&reassociate_stats, 0, sizeof (reassociate_stats));
>
>    operand_entry_pool = create_alloc_pool ("operand entry pool",
> @@ -1435,7 +1451,6 @@ init_reassoc (void)
>      bb_rank[bbs[i]] = ++rank  << 16;
>
>    free (bbs);
> -  calculate_dominance_info (CDI_DOMINATORS);
>    calculate_dominance_info (CDI_POST_DOMINATORS);
>    broken_up_subtracts = NULL;
>  }
> @@ -1446,7 +1461,6 @@ init_reassoc (void)
>  static void
>  fini_reassoc (void)
>  {
> -
>    if (dump_file && (dump_flags & TDF_STATS))
>      {
>        fprintf (dump_file, "Reassociation stats:\n");
> @@ -1465,6 +1479,7 @@ fini_reassoc (void)
>    free (bb_rank);
>    VEC_free (tree, heap, broken_up_subtracts);
>    free_dominance_info (CDI_POST_DOMINATORS);
> +  loop_optimizer_finalize ();
>  }
>
>  /* Gate and execute functions for Reassociation.  */
>



More information about the Gcc-patches mailing list