This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [RFC PATCH] Gather vectorization (PR tree-optimization/50789)


On Sat, Oct 29, 2011 at 03:53:37PM +0200, Toon Moene wrote:
> I wonder whether it will work with the attached Fortran routine - it
> sure would mean a boost to the 18%+ heaviest CPU user in our code.

It didn't do anything, but only because I used a bad approach in
vect_check_gather.  I have been using DR_BASE_ADDRESS/DR_OFFSET/DR_INIT
into which dr_analyze_innermost splits the reference, but that split is
into something hopefully usable for alias analysis, variable and constant
offset using split_constant_offset that would create largish expressions in
your testcase, which add many values together.  But for gather vectorization
we'd either have to gimplify such expressions back before the load and
allow them to be vectorized, or, as done in this incremental patch,
I instead do something similar to what dr_analyze_innermost does,
but with different goal - to split stuff off into a loop invariant that
can be computed before the loop (and will be put into the scalar part of
gather), and a SSA_NAME defined in the loop which contains the rest (plus
optionally sign/zero extending that into a wider type and/or scaling by
2/4/8.  With this incremental patch I get 4 loops in this testcase
with -O3 -mavx2 vectorized (compared to 0 before), with 262 vgather* insns.

On x86_64 unfortunately this doesn't figure out that it could do all the
additions for the variable index in 32-bit type and then sign extend:
idx_202 = *kp_201(D)[D.1941_200];
idy_209 = *kq_208(D)[D.1941_200];
ilev_216 = *kr_215(D)[D.1941_200];
D.1955_229 = *pgama_228(D)[D.1941_200];
D.1960_237 = *pbeta_236(D)[D.1941_200];
D.1965_245 = *palfa_244(D)[D.1941_200];
D.1966_246 = ilev_216 + -1;
D.1967_247 = (integer(kind=8)) D.1966_246;
D.1968_248 = D.1967_247 * stride.32_141;
D.1969_249 = D.1968_248 + offset.33_155;
D.1970_250 = idy_209 + -1;
D.1971_251 = (integer(kind=8)) D.1970_250;
D.1972_252 = D.1971_251 * stride.30_129;
D.1973_253 = D.1969_249 + D.1972_252;
D.1974_254 = idx_202 + -1;
D.1975_255 = (integer(kind=8)) D.1974_254;
D.1976_256 = D.1973_253 + D.1975_255;
D.1977_258 = *parg_257(D)[D.1976_256];
so for -m64 it emits vgatherqps instructions (V4DImode indexes, loads
V4SFmode values) and then merges those, while for -m32 it emits
just 131 vgatherdps instructions (V8SImode indexes, V8SFmode values).

Would be nice to cut down slightly this testcase into just one or two loops
that are vectorized and turn it into a runtime testcase which verifies
the vectorization was correct.

2011-10-31  Jakub Jelinek  <jakub@redhat.com>

	* tree-vect-stmts.c (vectorizable_load): Don't add
	DR_INIT (dr) to ptr.
	* tree-vect-data-refs.c (vect_check_gather): Rewritten not to use
	DR_BASE_ADDRESS or DR_OFFSET, instead call get_inner_reference
	and try to separate in between base and off.

--- gcc/tree-vect-stmts.c.jj	2011-10-31 12:13:45.000000000 +0100
+++ gcc/tree-vect-stmts.c	2011-10-31 13:21:13.000000000 +0100
@@ -4452,8 +4452,6 @@ vectorizable_load (gimple stmt, gimple_s
       vec_dest = vect_create_destination_var (scalar_dest, vectype);
 
       ptr = fold_convert (ptrtype, gather_base);
-      ptr = fold_build2 (POINTER_PLUS_EXPR, ptrtype, ptr,
-			 fold_convert (sizetype, DR_INIT (dr)));
       if (!is_gimple_min_invariant (ptr))
 	{
 	  ptr = force_gimple_operand (ptr, &seq, true, NULL_TREE);
--- gcc/tree-vect-data-refs.c.jj	2011-10-31 12:13:45.000000000 +0100
+++ gcc/tree-vect-data-refs.c	2011-10-31 14:53:18.000000000 +0100
@@ -2504,109 +2504,156 @@ tree
 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
 		   tree *offp, int *scalep)
 {
-  HOST_WIDE_INT scale = 1;
+  HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
   tree offtype = NULL_TREE;
-  tree base = DR_BASE_ADDRESS (dr);
-  tree off = DR_OFFSET (dr);
-  tree decl;
-
-  if (TREE_CODE (base) == POINTER_PLUS_EXPR
-      && integer_zerop (off)
-      && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
-      && !chrec_contains_symbols_defined_in_loop (TREE_OPERAND (base, 0),
-						  loop->num))
+  tree decl, base, off;
+  enum machine_mode pmode;
+  int punsignedp, pvolatilep;
+
+  base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
+			      &pmode, &punsignedp, &pvolatilep, false);
+  gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
+
+  if (TREE_CODE (base) == MEM_REF)
     {
-      off = TREE_OPERAND (base, 1);
+      if (!integer_zerop (TREE_OPERAND (base, 1)))
+	{
+	  if (off == NULL_TREE)
+	    {
+	      double_int moff = mem_ref_offset (base);
+	      off = double_int_to_tree (sizetype, moff);
+	    }
+	  else
+	    off = size_binop (PLUS_EXPR, off, TREE_OPERAND (base, 1));
+	}
       base = TREE_OPERAND (base, 0);
     }
-  if (is_gimple_min_invariant (base)
-      || (TREE_CODE (base) == SSA_NAME
-	  && !chrec_contains_symbols_defined_in_loop (base, loop->num)))
-    {
-      /* DR_BASE_ADDRESS + DR_INIT would go into the constant
-	 pointer, DR_OFFSET into vector.  */
-      STRIP_NOPS (off);
-      if (TREE_CODE (off) == MULT_EXPR
-	  && host_integerp (TREE_OPERAND (off, 1), 0))
+  else
+    base = build_fold_addr_expr (base);
+
+  if (off == NULL_TREE)
+    off = size_zero_node;
+
+  if (!is_gimple_min_invariant (base)
+      && (TREE_CODE (base) != SSA_NAME
+	  || chrec_contains_symbols_defined_in_loop (base, loop->num)))
+    {
+      if (!integer_zerop (off))
+	return NULL_TREE;
+      off = base;
+      base = size_int (pbitpos / BITS_PER_UNIT);
+    }
+  else
+    {
+      base = fold_convert (sizetype, base);
+      base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
+    }
+
+  STRIP_NOPS (off);
+  while (offtype == NULL_TREE)
+    {
+      enum tree_code code;
+      tree op0, op1, add = NULL_TREE;
+
+      if (TREE_CODE (off) == SSA_NAME)
 	{
-	  scale = tree_low_cst (TREE_OPERAND (off, 1), 0);
-	  if (scale != (int) scale)
+	  gimple def_stmt = SSA_NAME_DEF_STMT (off);
+
+	  if (!chrec_contains_symbols_defined_in_loop (off, loop->num))
 	    return NULL_TREE;
-	  off = TREE_OPERAND (off, 0);
+
+	  if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
+	    break;
+
+	  op0 = gimple_assign_rhs1 (def_stmt);
+	  code = gimple_assign_rhs_code (def_stmt);
+	  op1 = gimple_assign_rhs2 (def_stmt);
 	}
-      if (CONVERT_EXPR_P (off))
+      else
 	{
-	  tree op = TREE_OPERAND (off, 0);
-	  if (TREE_CODE (TREE_TYPE (op)) == INTEGER_TYPE
-	      && TYPE_PRECISION (TREE_TYPE (op))
-		 == GET_MODE_PRECISION (TYPE_MODE (TREE_TYPE (op)))
-	      && TYPE_PRECISION (TREE_TYPE (op))
-		 < TYPE_PRECISION (TREE_TYPE (off)))
-	    {
-	      off = op;
-	      offtype = TREE_TYPE (off);
-	    }
+	  if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
+	    return NULL_TREE;
+	  code = TREE_CODE (off);
+	  extract_ops_from_tree (off, &code, &op0, &op1);
 	}
-      STRIP_NOPS (off);
-      switch (TREE_CODE (off))
+      switch (code)
 	{
-	case SSA_NAME:
-	  break;
+	case POINTER_PLUS_EXPR:
 	case PLUS_EXPR:
+	  if (is_gimple_min_invariant (op0)
+	      || (TREE_CODE (op0) == SSA_NAME
+		  && !chrec_contains_symbols_defined_in_loop (op0, loop->num)))
+	    {
+	      add = op0;
+	      off = op1;
+	    do_add:
+	      add = fold_convert (sizetype, add);
+	      if (scale != 1)
+		add = size_binop (MULT_EXPR, add, size_int (scale));
+	      base = size_binop (PLUS_EXPR, base, add);
+	      continue;
+	    }
+	  if (is_gimple_min_invariant (op1)
+	      || (TREE_CODE (op1) == SSA_NAME
+		  && !chrec_contains_symbols_defined_in_loop (op1, loop->num)))
+	    {
+	      add = op1;
+	      off = op0;
+	      goto do_add;
+	    }
+	  break;
 	case MINUS_EXPR:
-	  if (TREE_CODE (TREE_OPERAND (off, 0)) == SSA_NAME
-	      && TREE_CODE (TREE_OPERAND (off, 1)) == SSA_NAME)
+	  if (is_gimple_min_invariant (op1)
+	      || (TREE_CODE (op1) == SSA_NAME
+		  && !chrec_contains_symbols_defined_in_loop (op1, loop->num)))
 	    {
-	      bool c0
-		= chrec_contains_symbols_defined_in_loop (TREE_OPERAND (off,
-									0),
-							  loop->num);
-	      bool c1
-		= chrec_contains_symbols_defined_in_loop (TREE_OPERAND (off,
-									1),
-							  loop->num);
-	      if (c0 && !c1)
-		{
-		  tree bo = fold_convert (sizetype, TREE_OPERAND (off, 1));
-		  if (scale != 1)
-		    bo = size_binop (MULT_EXPR, bo, size_int (scale));
-		  if (TREE_CODE (off) == MINUS_EXPR)
-		    bo = fold_build1 (NEGATE_EXPR, sizetype, bo);
-		  base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (base),
-				      base, bo);
-		  off = TREE_OPERAND (off, 0);
-		  break;
-		}
-	      if (!c0 && c1 && TREE_CODE (off) == PLUS_EXPR)
-		{
-		  tree bo = fold_convert (sizetype, TREE_OPERAND (off, 0));
-		  if (scale != 1)
-		    bo = size_binop (MULT_EXPR, bo, size_int (scale));
-		  base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (base),
-				      base, bo);
-		  off = TREE_OPERAND (off, 1);
-		  break;
-		}
+	      add = fold_convert (sizetype, op1);
+	      add = size_binop (MINUS_EXPR, size_zero_node, add);
+	      off = op0;
+	      goto do_add;
+	    }
+	  break;
+	case MULT_EXPR:
+	  if (scale == 1 && host_integerp (op1, 0))
+	    {
+	      scale = tree_low_cst (op1, 0);
+	      off = op0;
+	      continue;
+	    }
+	  break;
+	case SSA_NAME:
+	  off = op0;
+	  continue;
+	CASE_CONVERT:
+	  if (!POINTER_TYPE_P (TREE_TYPE (op0))
+	      && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
+	    break;
+	  if (TYPE_PRECISION (TREE_TYPE (op0))
+	      == TYPE_PRECISION (TREE_TYPE (off)))
+	    {
+	      off = op0;
+	      continue;
+	    }
+	  if (TYPE_PRECISION (TREE_TYPE (op0))
+	      < TYPE_PRECISION (TREE_TYPE (off)))
+	    {
+	      off = op0;
+	      offtype = TREE_TYPE (off);
+	      STRIP_NOPS (off);
+	      continue;
 	    }
-	  return NULL_TREE;
+	  break;
 	default:
-	  return NULL_TREE;
+	  break;
 	}
-    }
-  else
-    {
-      if (!integer_zerop (off))
-	return NULL_TREE;
-      else if (TREE_CODE (base) != SSA_NAME)
-	return NULL_TREE;
-      off = base;
-      base = build_int_cst (TREE_TYPE (DR_BASE_ADDRESS (dr)), 0);
+      break;
     }
 
-  if (!chrec_contains_symbols_defined_in_loop (off, loop->num))
+  if (TREE_CODE (off) != SSA_NAME
+      || !chrec_contains_symbols_defined_in_loop (off, loop->num))
     return NULL_TREE;
 
   if (offtype == NULL_TREE)


	Jakub


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]