[PATCH] Fix PR50031

William J. Schmidt wschmidt@linux.vnet.ibm.com
Wed Feb 8 14:48:00 GMT 2012


This is a vectorizer patch I inherited from Ira.  As with the fix for
PR50969, it handles problems with excessive permutes by adjusting the
cost model.  In this case the permutes comes from type
promotion/demotion, which is now modeled with a new vec_cvt cost.

This restores the lost performance for sphinx3, and gives wupwise an
additional boost.  This is a performance workaround for 4.7; for 4.8 we
want to try to do a better job of modeling the real issue, which is that
VSX permutes are constrained to a single processor pipe.  Thus isolated
permutes are less expensive per instruction than denser collections of
permutes.

There also remains an opportunity for versioning loops for alignment.

Bootstrapped and tested on powerpc64-linux with no regressions.  OK for
trunk?

Thanks,
Bill


2012-02-08  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
	    Ira Rosen  <irar@il.ibm.com>

	PR tree-optimization/50031
	* targhooks.c (default_builtin_vectorization_cost): Handle vec_cvt.
	* target.h (enum vect_cost_for_stmt): Add vec_cvt.
	* tree-vect-loop.c (vect_get_single_scalar_iteraion_cost): Handle
	all types of reduction and pattern statements.
	(vect_estimate_min_profitable_iters): Likewise.
	* tree-vect-stmts.c (vect_model_simple_cost): Use vec_cvt cost for
	type promotions and demotions.
	(vect_model_conversion_cost): New function.
	(vect_get_load_cost): Use vec_perm for permutations; add dump logic
	for explicit realigns.
	(vectorizable_conversion): Call vect_model_conversion_cost.
	* config/spu/spu.c (spu_builtin_vectorization_cost): Handle vec_cvt.
	* config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
	* config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): Update
	vec_perm for VSX and handle vec_cvt.


Index: gcc/targhooks.c
===================================================================
--- gcc/targhooks.c	(revision 183944)
+++ gcc/targhooks.c	(working copy)
@@ -514,6 +514,7 @@ default_builtin_vectorization_cost (enum vect_cost
       case scalar_to_vec:
       case cond_branch_not_taken:
       case vec_perm:
+      case vec_cvt:
         return 1;
 
       case unaligned_load:
Index: gcc/target.h
===================================================================
--- gcc/target.h	(revision 183944)
+++ gcc/target.h	(working copy)
@@ -145,7 +145,8 @@ enum vect_cost_for_stmt
   scalar_to_vec,
   cond_branch_not_taken,
   cond_branch_taken,
-  vec_perm
+  vec_perm,
+  vec_cvt
 };
 
 /* The target structure.  This holds all the backend hooks.  */
Index: gcc/tree-vect-loop.c
===================================================================
--- gcc/tree-vect-loop.c	(revision 183944)
+++ gcc/tree-vect-loop.c	(working copy)
@@ -2417,7 +2417,8 @@ vect_get_single_scalar_iteraion_cost (loop_vec_inf
           if (stmt_info
               && !STMT_VINFO_RELEVANT_P (stmt_info)
               && (!STMT_VINFO_LIVE_P (stmt_info)
-                  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
+                  || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
+	      && !STMT_VINFO_IN_PATTERN_P (stmt_info))
             continue;
 
           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
@@ -2562,17 +2563,51 @@ vect_estimate_min_profitable_iters (loop_vec_info
 
       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 	{
-	  gimple stmt = gsi_stmt (si);
+	  gimple stmt = gsi_stmt (si), pattern_stmt;
 	  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 	  /* Skip stmts that are not vectorized inside the loop.  */
 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
 	      && (!STMT_VINFO_LIVE_P (stmt_info)
-		  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
-	    continue;
+                 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))))
+	    {
+	      if (STMT_VINFO_IN_PATTERN_P (stmt_info)
+		  && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
+		  && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
+		      || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
+                {
+                  stmt = pattern_stmt;
+                  stmt_info = vinfo_for_stmt (stmt);
+		}
+	      else
+		continue;
+	    }
 	  vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
 	  /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
 	     some of the "outside" costs are generated inside the outer-loop.  */
 	  vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
+          if (is_pattern_stmt_p (stmt_info)
+	      && STMT_VINFO_PATTERN_DEF_SEQ (stmt_info))
+            {
+	      gimple_stmt_iterator gsi;
+	      
+	      for (gsi = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
+		   !gsi_end_p (gsi); gsi_next (&gsi))
+                {
+                  gimple pattern_def_stmt = gsi_stmt (gsi);
+                  stmt_vec_info pattern_def_stmt_info
+		    = vinfo_for_stmt (pattern_def_stmt);
+                  if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
+                      || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
+		    {
+                      vec_inside_cost
+			+= STMT_VINFO_INSIDE_OF_LOOP_COST
+			   (pattern_def_stmt_info) * factor;
+                      vec_outside_cost
+			+= STMT_VINFO_OUTSIDE_OF_LOOP_COST
+			   (pattern_def_stmt_info);
+                    }
+		}
+	    }
 	}
     }
 
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c	(revision 183944)
+++ gcc/tree-vect-stmts.c	(working copy)
@@ -787,13 +787,18 @@ vect_model_simple_cost (stmt_vec_info stmt_info, i
 {
   int i;
   int inside_cost = 0, outside_cost = 0;
+  enum vect_cost_for_stmt cost = vector_stmt;
 
   /* The SLP costs were already calculated during SLP tree build.  */
   if (PURE_SLP_STMT (stmt_info))
     return;
 
-  inside_cost = ncopies * vect_get_stmt_cost (vector_stmt); 
+  if (STMT_VINFO_TYPE (stmt_info) == type_promotion_vec_info_type
+      || STMT_VINFO_TYPE (stmt_info) == type_demotion_vec_info_type)
+    cost = vec_cvt;
 
+  inside_cost = ncopies * vect_get_stmt_cost (cost); 
+
   /* FORNOW: Assuming maximum 2 args per stmts.  */
   for (i = 0; i < 2; i++)
     {
@@ -811,6 +816,43 @@ vect_model_simple_cost (stmt_vec_info stmt_info, i
 }
 
 
+/* Model cost for type demotion and promotion operations.  */
+
+static void
+vect_model_conversion_cost (stmt_vec_info stmt_info,
+			    enum vect_def_type *dt, int pwr)
+{
+  int i, tmp;
+  int inside_cost = 0, outside_cost = 0, single_stmt_cost;
+
+  /* The SLP costs were already calculated during SLP tree build.  */
+  if (PURE_SLP_STMT (stmt_info))
+    return;
+
+  single_stmt_cost = vect_get_stmt_cost (vec_cvt);
+  for (i = 0; i < pwr + 1; i++)
+    {
+      tmp = (STMT_VINFO_TYPE (stmt_info) == type_promotion_vec_info_type) ?
+	(i + 1) : i;
+      inside_cost += vect_pow2 (tmp) * single_stmt_cost;
+    }
+
+  /* FORNOW: Assuming maximum 2 args per stmts.  */
+  for (i = 0; i < 2; i++)
+    {
+      if (dt[i] == vect_constant_def || dt[i] == vect_external_def)
+        outside_cost += vect_get_stmt_cost (vector_stmt);
+    }
+
+  if (vect_print_dump_info (REPORT_COST))
+    fprintf (vect_dump, "vect_model_conversion_cost: inside_cost = %d, "
+             "outside_cost = %d .", inside_cost, outside_cost);
+
+  /* Set the costs in STMT_INFO.  */
+  stmt_vinfo_set_inside_of_loop_cost (stmt_info, NULL, inside_cost);
+  stmt_vinfo_set_outside_of_loop_cost (stmt_info, NULL, outside_cost);
+}
+
 /* Function vect_cost_strided_group_size
 
    For strided load or store, return the group_size only if it is the first
@@ -887,7 +929,6 @@ vect_model_store_cost (stmt_vec_info stmt_info, in
       if (vect_print_dump_info (REPORT_COST))
         fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
                  group_size);
-
     }
 
   /* Costs of the stores.  */
@@ -1049,7 +1090,7 @@ vect_get_load_cost (struct data_reference *dr, int
     case dr_explicit_realign:
       {
         *inside_cost += ncopies * (2 * vect_get_stmt_cost (vector_load)
-           + vect_get_stmt_cost (vector_stmt));
+				   + vect_get_stmt_cost (vec_perm));
 
         /* FIXME: If the misalignment remains fixed across the iterations of
            the containing loop, the following cost should be added to the
@@ -1057,6 +1098,9 @@ vect_get_load_cost (struct data_reference *dr, int
         if (targetm.vectorize.builtin_mask_for_load)
           *inside_cost += vect_get_stmt_cost (vector_stmt);
 
+        if (vect_print_dump_info (REPORT_COST))
+          fprintf (vect_dump, "vect_model_load_cost: explicit realign");
+
         break;
       }
     case dr_explicit_realign_optimized:
@@ -1080,7 +1124,7 @@ vect_get_load_cost (struct data_reference *dr, int
           }
 
         *inside_cost += ncopies * (vect_get_stmt_cost (vector_load)
-          + vect_get_stmt_cost (vector_stmt));
+				   + vect_get_stmt_cost (vec_perm));
         break;
       }
 
@@ -2392,16 +2436,19 @@ vectorizable_conversion (gimple stmt, gimple_stmt_
       if (vect_print_dump_info (REPORT_DETAILS))
 	fprintf (vect_dump, "=== vectorizable_conversion ===");
       if (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR)
-	STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
+        {
+	  STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
+	  vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
+	}
       else if (modifier == NARROW)
 	{
 	  STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
-	  vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
+	  vect_model_conversion_cost (stmt_info, dt, multi_step_cvt);
 	}
       else
 	{
 	  STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
-	  vect_model_simple_cost (stmt_info, 2 * ncopies, dt, NULL);
+	  vect_model_conversion_cost (stmt_info, dt, multi_step_cvt);
 	}
       VEC_free (tree, heap, interm_types);
       return true;
Index: gcc/config/spu/spu.c
===================================================================
--- gcc/config/spu/spu.c	(revision 183944)
+++ gcc/config/spu/spu.c	(working copy)
@@ -6920,6 +6920,7 @@ spu_builtin_vectorization_cost (enum vect_cost_for
       case scalar_to_vec:
       case cond_branch_not_taken:
       case vec_perm:
+      case vec_cvt:
         return 1;
 
       case scalar_store:
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c	(revision 183944)
+++ gcc/config/i386/i386.c	(working copy)
@@ -35336,6 +35336,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo
         return ix86_cost->cond_not_taken_branch_cost;
 
       case vec_perm:
+      case vec_cvt:
         return ix86_cost->vec_stmt_cost;
 
       default:
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c	(revision 183944)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -3543,10 +3543,17 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
         return 1;
 
       case vec_perm:
-	if (!TARGET_VSX)
+	if (TARGET_VSX)
+	  return 4;
+	else
 	  return 1;
-	return 2;
 
+      case vec_cvt:
+        if (TARGET_VSX)
+          return 5;
+        else
+          return 1;
+
       case cond_branch_taken:
         return 3;
 




More information about the Gcc-patches mailing list