This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Use get_nonzero_bits to improve vectorization


Hi!

The following patch makes use of the computed nonzero_bits preserved
in the SSA_NAME_RANGE_INFO.
I chose to write a new routine instead of improving current
highest_pow2_factor, because that routine didn't care about overflows etc.
and by working on ctz numbers instead of powers of two in UHWI we can handle
even larger constants etc.  highest_pow2_factor could very well overflow to
zero etc.  So, the patch introduces a new tree_ctz routine and reimplements
highest_pow2_factor on top of that, plus uses tree_ctz also in
get_object_alignment_2 and in the vectorizer to determine if it can avoid
scalar loop for bound (and indirectly also in the analysis whether peeling
for alignment is needed).

With this patch, e.g.
int a[1024];

void
foo (int x, int y)
{
  int i;
  x &= -32;
  y &= -32;
  for (i = x + 32; i < y; i++)
    a[i]++;
}
can be vectorized without any peeling for alignment or scalar loop
afterwards.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2013-10-25  Jakub Jelinek  <jakub@redhat.com>

	* tree.c (tree_ctz): New function.
	* tree.h (tree_ctz): New prototype.
	* tree-ssanames.h (get_range_info, get_nonzero_bits): Change
	first argument from tree to const_tree.
	* tree-ssanames.c (get_range_info, get_nonzero_bits): Likewise.
	* tree-vectorizer.h (vect_generate_tmps_on_preheader): New prototype.
	* tree-vect-loop-manip.c (vect_generate_tmps_on_preheader): No longer
	static.
	* expr.c (highest_pow2_factor): Reimplemented using tree_ctz.
	* tree-vect-loop.c (vect_analyze_loop_operations,
	vect_transform_loop): Don't force scalar loop for bound just because
	number of iterations is unknown, only do it if it is not known to be
	a multiple of vectorization_factor.
	* builtins.c (get_object_alignment_2): Use tree_ctz on offset.

--- gcc/tree.c.jj	2013-10-23 14:43:12.000000000 +0200
+++ gcc/tree.c	2013-10-25 15:00:55.296178794 +0200
@@ -2213,6 +2213,110 @@ tree_floor_log2 (const_tree expr)
 	  : floor_log2 (low));
 }
 
+/* Return number of known trailing zero bits in EXPR, or, if the value of
+   EXPR is known to be zero, the precision of its type.  */
+
+int
+tree_ctz (const_tree expr)
+{
+  if (!INTEGRAL_TYPE_P (TREE_TYPE (expr))
+      && !POINTER_TYPE_P (TREE_TYPE (expr)))
+    return 0;
+
+  int ret1, ret2, prec = TYPE_PRECISION (TREE_TYPE (expr));
+  switch (TREE_CODE (expr))
+    {
+    case INTEGER_CST:
+      ret1 = tree_to_double_int (expr).trailing_zeros ();
+      return MIN (ret1, prec);
+    case SSA_NAME:
+      ret1 = get_nonzero_bits (expr).trailing_zeros ();
+      return MIN (ret1, prec);
+    case PLUS_EXPR:
+    case MINUS_EXPR:
+    case BIT_IOR_EXPR:
+    case BIT_XOR_EXPR:
+    case MIN_EXPR:
+    case MAX_EXPR:
+      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
+      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
+      return MIN (ret1, ret2);
+    case POINTER_PLUS_EXPR:
+      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
+      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
+      ret2 = MIN (ret2, prec);
+      return MIN (ret1, ret2);
+    case BIT_AND_EXPR:
+      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
+      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
+      return MAX (ret1, ret2);
+    case MULT_EXPR:
+      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
+      ret2 = tree_ctz (TREE_OPERAND (expr, 1));
+      return MIN (ret1 + ret2, prec);
+    case LSHIFT_EXPR:
+      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
+      if (host_integerp (TREE_OPERAND (expr, 1), 1)
+	  && ((unsigned HOST_WIDE_INT) tree_low_cst (TREE_OPERAND (expr, 1), 1)
+	      < (unsigned HOST_WIDE_INT) prec))
+	{
+	  ret2 = tree_low_cst (TREE_OPERAND (expr, 1), 1);
+	  return MIN (ret1 + ret2, prec);
+	}
+      return ret1;
+    case RSHIFT_EXPR:
+      if (host_integerp (TREE_OPERAND (expr, 1), 1)
+	  && ((unsigned HOST_WIDE_INT) tree_low_cst (TREE_OPERAND (expr, 1), 1)
+	      < (unsigned HOST_WIDE_INT) prec))
+	{
+	  ret1 = tree_ctz (TREE_OPERAND (expr, 0));
+	  ret2 = tree_low_cst (TREE_OPERAND (expr, 1), 1);
+	  if (ret1 > ret2)
+	    return ret1 - ret2;
+	}
+      return 0;
+    case TRUNC_DIV_EXPR:
+    case CEIL_DIV_EXPR:
+    case FLOOR_DIV_EXPR:
+    case ROUND_DIV_EXPR:
+    case EXACT_DIV_EXPR:
+      if (TREE_CODE (TREE_OPERAND (expr, 1)) == INTEGER_CST)
+	{
+	  ret2 = tree_log2 (TREE_OPERAND (expr, 1));
+	  if (ret2 >= 0 && tree_int_cst_sgn (TREE_OPERAND (expr, 1)) == 1)
+	    {
+	      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
+	      if (ret1 > ret2)
+		return ret1 - ret2;
+	    }
+	}
+      return 0;
+    CASE_CONVERT:
+      ret1 = tree_ctz (TREE_OPERAND (expr, 0));
+      if (ret1 && ret1 == TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (expr, 0))))
+	ret1 = prec;
+      return MIN (ret1, prec);
+    case SAVE_EXPR:
+      return tree_ctz (TREE_OPERAND (expr, 0));
+    case COND_EXPR:
+      ret1 = tree_ctz (TREE_OPERAND (expr, 1));
+      ret2 = tree_ctz (TREE_OPERAND (expr, 2));
+      return MIN (ret1, ret2);
+    case COMPOUND_EXPR:
+      return tree_ctz (TREE_OPERAND (expr, 1));
+    case ADDR_EXPR:
+      ret1 = get_object_alignment (TREE_OPERAND (expr, 0));
+      if (ret1 > BITS_PER_UNIT)
+	{
+	  ret1 = ctz_hwi (ret1 / BITS_PER_UNIT);
+	  return MIN (ret1, prec);
+	}
+      return 0;
+    default:
+      return 0;
+    }
+}
+
 /* Return 1 if EXPR is the real constant zero.  Trailing zeroes matter for
    decimal float constants, so don't return 1 for them.  */
 
--- gcc/tree.h.jj	2013-10-17 22:30:45.000000000 +0200
+++ gcc/tree.h	2013-10-25 12:20:05.473673186 +0200
@@ -4546,6 +4546,7 @@ extern void get_type_static_bounds (cons
 extern bool variably_modified_type_p (tree, tree);
 extern int tree_log2 (const_tree);
 extern int tree_floor_log2 (const_tree);
+extern int tree_ctz (const_tree);
 extern int simple_cst_equal (const_tree, const_tree);
 extern hashval_t iterative_hash_expr (const_tree, hashval_t);
 extern hashval_t iterative_hash_exprs_commutative (const_tree,
--- gcc/tree-ssanames.h.jj	2013-10-24 15:52:53.000000000 +0200
+++ gcc/tree-ssanames.h	2013-10-25 14:09:21.227015919 +0200
@@ -72,9 +72,10 @@ enum value_range_type { VR_UNDEFINED, VR
 /* Sets the value range to SSA.  */
 extern void set_range_info (tree, double_int, double_int);
 /* Gets the value range from SSA.  */
-extern enum value_range_type get_range_info (tree, double_int *, double_int *);
+extern enum value_range_type get_range_info (const_tree, double_int *,
+					     double_int *);
 extern void set_nonzero_bits (tree, double_int);
-extern double_int get_nonzero_bits (tree);
+extern double_int get_nonzero_bits (const_tree);
 extern void init_ssanames (struct function *, int);
 extern void fini_ssanames (void);
 extern void ssanames_print_statistics (void);
--- gcc/tree-ssanames.c.jj	2013-10-24 17:32:22.000000000 +0200
+++ gcc/tree-ssanames.c	2013-10-25 14:08:46.218187581 +0200
@@ -221,7 +221,7 @@ set_range_info (tree name, double_int mi
    is used to determine if MIN and MAX are valid values.  */
 
 enum value_range_type
-get_range_info (tree name, double_int *min, double_int *max)
+get_range_info (const_tree name, double_int *min, double_int *max)
 {
   enum value_range_type range_type;
   gcc_assert (!POINTER_TYPE_P (TREE_TYPE (name)));
@@ -271,7 +271,7 @@ set_nonzero_bits (tree name, double_int
    NAME, or double_int_minus_one if unknown.  */
 
 double_int
-get_nonzero_bits (tree name)
+get_nonzero_bits (const_tree name)
 {
   if (POINTER_TYPE_P (TREE_TYPE (name)))
     {
--- gcc/tree-vectorizer.h.jj	2013-10-24 10:19:20.000000000 +0200
+++ gcc/tree-vectorizer.h	2013-10-25 14:02:58.198999063 +0200
@@ -901,6 +901,8 @@ extern void slpeel_make_loop_iterate_nti
 extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge);
 struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, edge);
 extern void vect_loop_versioning (loop_vec_info, unsigned int, bool);
+extern void vect_generate_tmps_on_preheader (loop_vec_info, tree *, tree *,
+					     tree *, gimple_seq);
 extern void vect_do_peeling_for_loop_bound (loop_vec_info, tree *,
 					    unsigned int, bool);
 extern void vect_do_peeling_for_alignment (loop_vec_info, unsigned int, bool);
--- gcc/tree-vect-loop-manip.c.jj	2013-10-24 10:19:22.000000000 +0200
+++ gcc/tree-vect-loop-manip.c	2013-10-25 14:02:00.544284058 +0200
@@ -1437,7 +1437,7 @@ vect_build_loop_niters (loop_vec_info lo
  and places them at the loop preheader edge or in COND_EXPR_STMT_LIST
  if that is non-NULL.  */
 
-static void
+void
 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
 				 tree *ni_name_ptr,
 				 tree *ratio_mult_vf_name_ptr,
--- gcc/expr.c.jj	2013-10-23 14:43:15.000000000 +0200
+++ gcc/expr.c	2013-10-25 15:05:23.893781676 +0200
@@ -7282,74 +7282,14 @@ safe_from_p (const_rtx x, tree exp, int
 unsigned HOST_WIDE_INT
 highest_pow2_factor (const_tree exp)
 {
-  unsigned HOST_WIDE_INT c0, c1;
-
-  switch (TREE_CODE (exp))
-    {
-    case INTEGER_CST:
-      /* We can find the lowest bit that's a one.  If the low
-	 HOST_BITS_PER_WIDE_INT bits are zero, return BIGGEST_ALIGNMENT.
-	 We need to handle this case since we can find it in a COND_EXPR,
-	 a MIN_EXPR, or a MAX_EXPR.  If the constant overflows, we have an
-	 erroneous program, so return BIGGEST_ALIGNMENT to avoid any
-	 later ICE.  */
-      if (TREE_OVERFLOW (exp))
-	return BIGGEST_ALIGNMENT;
-      else
-	{
-	  /* Note: tree_low_cst is intentionally not used here,
-	     we don't care about the upper bits.  */
-	  c0 = TREE_INT_CST_LOW (exp);
-	  c0 &= -c0;
-	  return c0 ? c0 : BIGGEST_ALIGNMENT;
-	}
-      break;
-
-    case PLUS_EXPR:  case MINUS_EXPR:  case MIN_EXPR:  case MAX_EXPR:
-      c0 = highest_pow2_factor (TREE_OPERAND (exp, 0));
-      c1 = highest_pow2_factor (TREE_OPERAND (exp, 1));
-      return MIN (c0, c1);
-
-    case MULT_EXPR:
-      c0 = highest_pow2_factor (TREE_OPERAND (exp, 0));
-      c1 = highest_pow2_factor (TREE_OPERAND (exp, 1));
-      return c0 * c1;
-
-    case ROUND_DIV_EXPR:  case TRUNC_DIV_EXPR:  case FLOOR_DIV_EXPR:
-    case CEIL_DIV_EXPR:
-      if (integer_pow2p (TREE_OPERAND (exp, 1))
-	  && host_integerp (TREE_OPERAND (exp, 1), 1))
-	{
-	  c0 = highest_pow2_factor (TREE_OPERAND (exp, 0));
-	  c1 = tree_low_cst (TREE_OPERAND (exp, 1), 1);
-	  return MAX (1, c0 / c1);
-	}
-      break;
-
-    case BIT_AND_EXPR:
-      /* The highest power of two of a bit-and expression is the maximum of
-	 that of its operands.  We typically get here for a complex LHS and
-	 a constant negative power of two on the RHS to force an explicit
-	 alignment, so don't bother looking at the LHS.  */
-      return highest_pow2_factor (TREE_OPERAND (exp, 1));
-
-    CASE_CONVERT:
-    case SAVE_EXPR:
-      return highest_pow2_factor (TREE_OPERAND (exp, 0));
-
-    case COMPOUND_EXPR:
-      return highest_pow2_factor (TREE_OPERAND (exp, 1));
-
-    case COND_EXPR:
-      c0 = highest_pow2_factor (TREE_OPERAND (exp, 1));
-      c1 = highest_pow2_factor (TREE_OPERAND (exp, 2));
-      return MIN (c0, c1);
-
-    default:
-      break;
-    }
-
-  return 1;
+  unsigned HOST_WIDE_INT ret;
+  int trailing_zeros = tree_ctz (exp);
+  if (trailing_zeros >= HOST_BITS_PER_WIDE_INT)
+    return BIGGEST_ALIGNMENT;
+  ret = (unsigned HOST_WIDE_INT) 1 << trailing_zeros;
+  if (ret > BIGGEST_ALIGNMENT)
+    return BIGGEST_ALIGNMENT;
+  return ret;
 }
 
 /* Similar, except that the alignment requirements of TARGET are
--- gcc/tree-vect-loop.c.jj	2013-10-24 10:19:23.000000000 +0200
+++ gcc/tree-vect-loop.c	2013-10-25 14:01:35.968407222 +0200
@@ -1586,9 +1586,9 @@ vect_analyze_loop_operations (loop_vec_i
       return false;
     }
 
-  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
-      || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
+  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo)
+      || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
+	  < exact_log2 (vectorization_factor)))
     {
       if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.\n");
@@ -5656,15 +5656,20 @@ vect_transform_loop (loop_vec_info loop_
      will remain scalar and will compute the remaining (n%VF) iterations.
      (VF is the vectorization factor).  */
 
-  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-       || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-	   && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
-       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+  if (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
+      < exact_log2 (vectorization_factor)
+      || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
 				    th, check_profitability);
-  else
+  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
 		LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
+  else
+    {
+      tree ni_name, ratio_mult_vf;
+      vect_generate_tmps_on_preheader (loop_vinfo, &ni_name, &ratio_mult_vf,
+				       &ratio, NULL);
+    }
 
   /* 1) Make sure the loop header has exactly two entries
      2) Make sure we have a preheader basic block.  */
--- gcc/builtins.c.jj	2013-10-23 14:43:12.000000000 +0200
+++ gcc/builtins.c	2013-10-25 12:31:49.426022284 +0200
@@ -309,7 +309,7 @@ get_object_alignment_2 (tree exp, unsign
   tree offset;
   enum machine_mode mode;
   int unsignedp, volatilep;
-  unsigned int inner, align = BITS_PER_UNIT;
+  unsigned int align = BITS_PER_UNIT;
   bool known_alignment = false;
 
   /* Get the innermost object and the constant (bitpos) and possibly
@@ -418,50 +418,16 @@ get_object_alignment_2 (tree exp, unsign
 
   /* If there is a non-constant offset part extract the maximum
      alignment that can prevail.  */
-  inner = ~0U;
-  while (offset)
+  if (offset)
     {
-      tree next_offset;
-
-      if (TREE_CODE (offset) == PLUS_EXPR)
-	{
-	  next_offset = TREE_OPERAND (offset, 0);
-	  offset = TREE_OPERAND (offset, 1);
-	}
-      else
-	next_offset = NULL;
-      if (host_integerp (offset, 1))
-	{
-	  /* Any overflow in calculating offset_bits won't change
-	     the alignment.  */
-	  unsigned offset_bits
-	    = ((unsigned) tree_low_cst (offset, 1) * BITS_PER_UNIT);
-
-	  if (offset_bits)
-	    inner = MIN (inner, (offset_bits & -offset_bits));
-	}
-      else if (TREE_CODE (offset) == MULT_EXPR
-	       && host_integerp (TREE_OPERAND (offset, 1), 1))
-	{
-	  /* Any overflow in calculating offset_factor won't change
-	     the alignment.  */
-	  unsigned offset_factor
-	    = ((unsigned) tree_low_cst (TREE_OPERAND (offset, 1), 1)
-	       * BITS_PER_UNIT);
-
-	  if (offset_factor)
-	    inner = MIN (inner, (offset_factor & -offset_factor));
-	}
-      else
+      int trailing_zeros = tree_ctz (offset);
+      if (trailing_zeros < HOST_BITS_PER_INT)
 	{
-	  inner = MIN (inner, BITS_PER_UNIT);
-	  break;
+	  unsigned int inner = (1U << trailing_zeros) * BITS_PER_UNIT;
+	  if (inner)
+	    align = MIN (align, inner);
 	}
-      offset = next_offset;
     }
-  /* Alignment is innermost object alignment adjusted by the constant
-     and non-constant offset parts.  */
-  align = MIN (align, inner);
 
   *alignp = align;
   *bitposp = bitpos & (*alignp - 1);


	Jakub


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]