Fix PR41783 (large mgrid regression)

Michael Matz matz@suse.de
Tue Jan 19 14:35:00 GMT 2010


Hi,

On Tue, 19 Jan 2010, Richard Guenther wrote:

> We settled on allowing conversions from signed or pointer types if that 
> conversion is widening or doesn't change precision. Unfortunately that 
> isn't enough to fix mgrid - Micha is still trying to fix that.

Yeah, the problem is that we can't look through widening conversions, but 
we would need to do so to see the equality of some data reference bases.  So, we 
need to emit slightly different code in the vectorizer, in particular the 
widening of the number of iterations of the prolog loop needs to happen 
earlier, so that we aren't left with a late conversion, hence we don't 
have to look through it anymore and all becomes well and dandy :)

I've also added a testcase for one of the two important mgrid loops.  I 
guess it will break on some targets, due to them not vectorizing the loops 
to start with, although it should always be possible to make the store 
aligned with peeling and to load the vectors pairwise.

Regstrapped on x86-64-linux, all langs+Ada, no regressions; the patch still 
fixes most of the mgrid performance regression.  Okay for trunk?


Ciao,
Michael.
-- 
	PR tree-optimization/41783
	* tree-data-ref.c (toplevel): Include flags.h.
	(dump_data_dependence_relation):  Also dump the inputs if the
	result will be unknown.
	(split_constant_offset_1): Look through some conversions.
	* tree-predcom.c (determine_roots_comp): Restart a new chain if
	the offset from last element is too large.
	(ref_at_iteration): Deal also with MISALIGNED_INDIRECT_REF.
	(reassociate_to_the_same_stmt): Handle vector registers.
	* tree-vect-data-refs.c (vect_equal_offsets): Handle unary operations
	(e.g. conversions).
	* tree-vect-loop-manip.c (vect_gen_niters_for_prolog_loop): Add 
	wide_prolog_niters argument, emit widening instructions.
	(vect_do_peeling_for_alignment): Adjust caller, use widened
	variant of the iteration count.
	* Makefile.in (tree-data-ref.o): Add $(FLAGS_H).

testsuite/
	* gfortran.dg/vect/fast-math-mgrid-resid.f: New.

Index: tree-data-ref.c
===================================================================
*** tree-data-ref.c	(revision 155937)
--- tree-data-ref.c	(working copy)
*************** along with GCC; see the file COPYING3.
*** 79,84 ****
--- 79,85 ----
  #include "coretypes.h"
  #include "tm.h"
  #include "ggc.h"
+ #include "flags.h"
  #include "tree.h"
  
  /* These RTL headers are needed for basic-block.h.  */
*************** dump_data_dependence_relation (FILE *out
*** 380,385 ****
--- 381,399 ----
  
    if (!ddr || DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
      {
+       if (ddr)
+ 	{
+ 	  dra = DDR_A (ddr);
+ 	  drb = DDR_B (ddr);
+ 	  if (dra)
+ 	    dump_data_reference (outf, dra);
+ 	  else
+ 	    fprintf (outf, "    (nil)\n");
+ 	  if (drb)
+ 	    dump_data_reference (outf, drb);
+ 	  else
+ 	    fprintf (outf, "    (nil)\n");
+ 	}
        fprintf (outf, "    (don't know)\n)\n");
        return;
      }
*************** split_constant_offset_1 (tree type, tree
*** 631,636 ****
--- 645,668 ----
  
  	return split_constant_offset_1 (type, var0, subcode, var1, var, off);
        }
+     CASE_CONVERT:
+       {
+ 	/* We must not introduce undefined overflow, and we must not change the value.
+ 	   Hence we're okay if the inner type doesn't overflow to start with
+ 	   (pointer or signed), the outer type also is an integer or pointer
+ 	   and the outer precision is at least as large as the inner.  */
+ 	tree itype = TREE_TYPE (op0);
+ 	if ((POINTER_TYPE_P (itype)
+ 	     || (INTEGRAL_TYPE_P (itype) && TYPE_OVERFLOW_UNDEFINED (itype)))
+ 	    && TYPE_PRECISION (type) >= TYPE_PRECISION (itype)
+ 	    && (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type)))
+ 	  {
+ 	    split_constant_offset (op0, &var0, off);
+ 	    *var = fold_convert (type, var0);
+ 	    return true;
+ 	  }
+ 	return false;
+       }
  
      default:
        return false;
Index: tree-predcom.c
===================================================================
*** tree-predcom.c	(revision 155937)
--- tree-predcom.c	(working copy)
*************** determine_roots_comp (struct loop *loop,
*** 1180,1185 ****
--- 1180,1186 ----
    unsigned i;
    dref a;
    chain_p chain = NULL;
+   double_int last_ofs = double_int_zero;
  
    /* Invariants are handled specially.  */
    if (comp->comp_step == RS_INVARIANT)
*************** determine_roots_comp (struct loop *loop,
*** 1194,1206 ****
  
    for (i = 0; VEC_iterate (dref, comp->refs, i, a); i++)
      {
!       if (!chain || !DR_IS_READ (a->ref))
  	{
  	  if (nontrivial_chain_p (chain))
! 	    VEC_safe_push (chain_p, heap, *chains, chain);
  	  else
  	    release_chain (chain);
  	  chain = make_rooted_chain (a);
  	  continue;
  	}
  
--- 1195,1214 ----
  
    for (i = 0; VEC_iterate (dref, comp->refs, i, a); i++)
      {
!       if (!chain || !DR_IS_READ (a->ref)
! 	  || double_int_ucmp (uhwi_to_double_int (MAX_DISTANCE),
! 			      double_int_add (a->offset,
! 					      double_int_neg (last_ofs))) <= 0)
  	{
  	  if (nontrivial_chain_p (chain))
! 	    {
! 	      add_looparound_copies (loop, chain);
! 	      VEC_safe_push (chain_p, heap, *chains, chain);
! 	    }
  	  else
  	    release_chain (chain);
  	  chain = make_rooted_chain (a);
+ 	  last_ofs = a->offset;
  	  continue;
  	}
  
*************** ref_at_iteration (struct loop *loop, tre
*** 1338,1346 ****
    else if (!INDIRECT_REF_P (ref))
      return unshare_expr (ref);
  
!   if (TREE_CODE (ref) == INDIRECT_REF)
      {
!       ret = build1 (INDIRECT_REF, TREE_TYPE (ref), NULL_TREE);
        idx = TREE_OPERAND (ref, 0);
        idx_p = &TREE_OPERAND (ret, 0);
      }
--- 1346,1356 ----
    else if (!INDIRECT_REF_P (ref))
      return unshare_expr (ref);
  
!   if (INDIRECT_REF_P (ref))
      {
!       /* Take care for INDIRECT_REF and MISALIGNED_INDIRECT_REF at
!          the same time.  */
!       ret = copy_node (ref);
        idx = TREE_OPERAND (ref, 0);
        idx_p = &TREE_OPERAND (ret, 0);
      }
*************** reassociate_to_the_same_stmt (tree name1
*** 2205,2215 ****
--- 2215,2231 ----
    /* Insert the new statement combining NAME1 and NAME2 before S1, and
       combine it with the rhs of S1.  */
    var = create_tmp_var (type, "predreastmp");
+   if (TREE_CODE (type) == COMPLEX_TYPE
+       || TREE_CODE (type) == VECTOR_TYPE)
+     DECL_GIMPLE_REG_P (var) = 1;
    add_referenced_var (var);
    new_name = make_ssa_name (var, NULL);
    new_stmt = gimple_build_assign_with_ops (code, new_name, name1, name2);
  
    var = create_tmp_var (type, "predreastmp");
+   if (TREE_CODE (type) == COMPLEX_TYPE
+       || TREE_CODE (type) == VECTOR_TYPE)
+     DECL_GIMPLE_REG_P (var) = 1;
    add_referenced_var (var);
    tmp_name = make_ssa_name (var, NULL);
  
Index: tree-vect-data-refs.c
===================================================================
*** tree-vect-data-refs.c	(revision 155937)
--- tree-vect-data-refs.c	(working copy)
*************** vect_update_interleaving_chain (struct d
*** 294,300 ****
  static bool
  vect_equal_offsets (tree offset1, tree offset2)
  {
!   bool res0, res1;
  
    STRIP_NOPS (offset1);
    STRIP_NOPS (offset2);
--- 294,300 ----
  static bool
  vect_equal_offsets (tree offset1, tree offset2)
  {
!   bool res;
  
    STRIP_NOPS (offset1);
    STRIP_NOPS (offset2);
*************** vect_equal_offsets (tree offset1, tree o
*** 303,318 ****
      return true;
  
    if (TREE_CODE (offset1) != TREE_CODE (offset2)
!       || !BINARY_CLASS_P (offset1)
!       || !BINARY_CLASS_P (offset2))
      return false;
  
!   res0 = vect_equal_offsets (TREE_OPERAND (offset1, 0),
! 			     TREE_OPERAND (offset2, 0));
!   res1 = vect_equal_offsets (TREE_OPERAND (offset1, 1),
! 			     TREE_OPERAND (offset2, 1));
  
!   return (res0 && res1);
  }
  
  
--- 303,321 ----
      return true;
  
    if (TREE_CODE (offset1) != TREE_CODE (offset2)
!       || (!BINARY_CLASS_P (offset1) && !UNARY_CLASS_P (offset1)))
      return false;
  
!   res = vect_equal_offsets (TREE_OPERAND (offset1, 0),
! 			    TREE_OPERAND (offset2, 0));
  
!   if (!res || !BINARY_CLASS_P (offset1))
!     return res;
! 
!   res = vect_equal_offsets (TREE_OPERAND (offset1, 1),
! 			    TREE_OPERAND (offset2, 1));
! 
!   return res;
  }
  
  
Index: tree-vect-loop-manip.c
===================================================================
*** tree-vect-loop-manip.c	(revision 155937)
--- tree-vect-loop-manip.c	(working copy)
*************** vect_do_peeling_for_loop_bound (loop_vec
*** 1961,1967 ****
     use TYPE_VECTOR_SUBPARTS.  */
  
  static tree
! vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
  {
    struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
    struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
--- 1961,1968 ----
     use TYPE_VECTOR_SUBPARTS.  */
  
  static tree
! vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters,
! 				 tree *wide_prolog_niters)
  {
    struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
    struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
*************** vect_gen_niters_for_prolog_loop (loop_ve
*** 2045,2050 ****
--- 2046,2064 ----
    add_referenced_var (var);
    stmts = NULL;
    iters_name = force_gimple_operand (iters, &stmts, false, var);
+   if (types_compatible_p (sizetype, niters_type))
+     *wide_prolog_niters = iters_name;
+   else
+     {
+       gimple_seq seq = NULL;
+       tree wide_iters = fold_convert (sizetype, iters);
+       var = create_tmp_var (sizetype, "prolog_loop_niters");
+       add_referenced_var (var);
+       *wide_prolog_niters = force_gimple_operand (wide_iters, &seq, false,
+ 						  var);
+       if (seq)
+ 	gimple_seq_add_seq (&stmts, seq);
+     }
  
    /* Insert stmt on loop preheader edge.  */
    if (stmts)
*************** vect_do_peeling_for_alignment (loop_vec_
*** 2115,2120 ****
--- 2129,2135 ----
    struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    tree niters_of_prolog_loop, ni_name;
    tree n_iters;
+   tree wide_prolog_niters;
    struct loop *new_loop;
    unsigned int th = 0;
    int min_profitable_iters;
*************** vect_do_peeling_for_alignment (loop_vec_
*** 2125,2131 ****
    initialize_original_copy_tables ();
  
    ni_name = vect_build_loop_niters (loop_vinfo, NULL);
!   niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
  
  
    /* Get profitability threshold for vectorized loop.  */
--- 2140,2147 ----
    initialize_original_copy_tables ();
  
    ni_name = vect_build_loop_niters (loop_vinfo, NULL);
!   niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name,
! 							   &wide_prolog_niters);
  
  
    /* Get profitability threshold for vectorized loop.  */
*************** vect_do_peeling_for_alignment (loop_vec_
*** 2150,2156 ****
  		TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
  
    /* Update the init conditions of the access functions of all data refs.  */
!   vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
  
    /* After peeling we have to reset scalar evolution analyzer.  */
    scev_reset ();
--- 2166,2172 ----
  		TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
  
    /* Update the init conditions of the access functions of all data refs.  */
!   vect_update_inits_of_drs (loop_vinfo, wide_prolog_niters);
  
    /* After peeling we have to reset scalar evolution analyzer.  */
    scev_reset ();
Index: Makefile.in
===================================================================
*** Makefile.in	(revision 155937)
--- Makefile.in	(working copy)
*************** tree-scalar-evolution.o: tree-scalar-evo
*** 2548,2554 ****
     $(TIMEVAR_H) $(CFGLOOP_H) $(SCEV_H) $(TREE_PASS_H) $(FLAGS_H) \
     gt-tree-scalar-evolution.h
  tree-data-ref.o: tree-data-ref.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
!    $(GGC_H) $(TREE_H) $(RTL_H) $(BASIC_BLOCK_H) $(DIAGNOSTIC_H) \
     $(TREE_FLOW_H) $(TREE_DUMP_H) $(TIMEVAR_H) $(CFGLOOP_H) \
     $(TREE_DATA_REF_H) $(TREE_PASS_H) langhooks.h
  sese.o: sese.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
--- 2548,2554 ----
     $(TIMEVAR_H) $(CFGLOOP_H) $(SCEV_H) $(TREE_PASS_H) $(FLAGS_H) \
     gt-tree-scalar-evolution.h
  tree-data-ref.o: tree-data-ref.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
!    $(GGC_H) $(FLAGS_H) $(TREE_H) $(RTL_H) $(BASIC_BLOCK_H) $(DIAGNOSTIC_H) \
     $(TREE_FLOW_H) $(TREE_DUMP_H) $(TIMEVAR_H) $(CFGLOOP_H) \
     $(TREE_DATA_REF_H) $(TREE_PASS_H) langhooks.h
  sese.o: sese.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
Index: testsuite/gfortran.dg/vect/fast-math-mgrid-resid.f
===================================================================
--- testsuite/gfortran.dg/vect/fast-math-mgrid-resid.f	(revision 0)
+++ testsuite/gfortran.dg/vect/fast-math-mgrid-resid.f	(revision 0)
@@ -0,0 +1,44 @@
+! { dg-do compile }
+! { dg-require-effective-target vect_double }
+! { dg-options "-O3 -ffast-math -fpredictive-commoning -ftree-vectorize -fdump-tree-optimized" }
+
+******* RESID COMPUTES THE RESIDUAL:  R = V - AU
+*
+*      THIS SIMPLE IMPLEMENTATION COSTS  27A + 4M PER RESULT, WHERE
+*      A AND M DENOTE THE COSTS OF ADDITION (OR SUBTRACTION) AND 
+*      MULTIPLICATION, RESPECTIVELY.  BY USING SEVERAL TWO-DIMENSIONAL 
+*      BUFFERS ONE CAN REDUCE THIS COST TO  13A + 4M IN THE GENERAL 
+*      CASE, OR  10A + 3M WHEN THE COEFFICIENT A(1) IS ZERO.
+*
+      SUBROUTINE RESID(U,V,R,N,A)
+      INTEGER N
+      REAL*8 U(N,N,N),V(N,N,N),R(N,N,N),A(0:3)
+      INTEGER I3, I2, I1
+C
+      DO 600 I3=2,N-1
+      DO 600 I2=2,N-1
+      DO 600 I1=2,N-1
+ 600  R(I1,I2,I3)=V(I1,I2,I3)
+     >      -A(0)*( U(I1,  I2,  I3  ) )
+     >      -A(1)*( U(I1-1,I2,  I3  ) + U(I1+1,I2,  I3  )
+     >                 +  U(I1,  I2-1,I3  ) + U(I1,  I2+1,I3  )
+     >                 +  U(I1,  I2,  I3-1) + U(I1,  I2,  I3+1) )
+     >      -A(2)*( U(I1-1,I2-1,I3  ) + U(I1+1,I2-1,I3  )
+     >                 +  U(I1-1,I2+1,I3  ) + U(I1+1,I2+1,I3  )
+     >                 +  U(I1,  I2-1,I3-1) + U(I1,  I2+1,I3-1)
+     >                 +  U(I1,  I2-1,I3+1) + U(I1,  I2+1,I3+1)
+     >                 +  U(I1-1,I2,  I3-1) + U(I1-1,I2,  I3+1)
+     >                 +  U(I1+1,I2,  I3-1) + U(I1+1,I2,  I3+1) )
+     >      -A(3)*( U(I1-1,I2-1,I3-1) + U(I1+1,I2-1,I3-1)
+     >                 +  U(I1-1,I2+1,I3-1) + U(I1+1,I2+1,I3-1)
+     >                 +  U(I1-1,I2-1,I3+1) + U(I1+1,I2-1,I3+1)
+     >                 +  U(I1-1,I2+1,I3+1) + U(I1+1,I2+1,I3+1) )
+C
+      RETURN
+      END
+! we want to check that predictive commoning did something on the
+! vectorized loop, which means we have to have exactly 13 vector
+! additions.
+! { dg-final { scan-tree-dump-times "vect_var\[^\\n\]*\\+ " 13 "optimized" } }
+! { dg-final { cleanup-tree-dump "vect" } }
+! { dg-final { cleanup-tree-dump "optimized" } }



More information about the Gcc-patches mailing list