


[PATCH] Re: Vectorizer question: DIV to RSHIFT conversion


On Tue, Dec 13, 2011 at 05:57:40PM +0400, Kirill Yukhin wrote:
> > Let me hack up a quick pattern recognizer for this...

Here it is, untested so far.
On a testcase doing 2000000 f1+f2+f3+f4 calls in a loop, built with -O3 -mavx
on Sandy Bridge (so, vectorized just with 16-byte vectors), the timings are:
vanilla					0m34.571s
the tree-vect* parts of this patch only	0m9.013s
the whole patch				0m8.824s
The i386 parts are just a small optimization.  I guess it could be
done in the vectorizer too (but then we'd have to check whether the
arithmetic/logical right shifts are supported and compare costs), or
perhaps in the generic vcond expander (again, we'd need to check some
costs).
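
For clarity, here is the scalar form of the identities the i386 change
relies on; this is just a sketch of the idea (the patch does the
equivalent per vector element in ix86_expand_int_vcond, and the
arithmetic >> of a negative int relies on GCC defining it as
sign-extending):

/* Scalar sketch of the identities used by the new ix86_expand_int_vcond
   code; the patch applies them per vector element with vector shifts.  */
#include <assert.h>
#include <limits.h>

#define SIGN_SHIFT ((int) (sizeof (int) * CHAR_BIT - 1))

static int
sel_m1_0 (int x)
{
  /* x < 0 ? -1 : 0 == arithmetic right shift of the sign bit
     (GCC defines >> of a negative int as sign-extending).  */
  return x >> SIGN_SHIFT;
}

static int
sel_1_0 (int x)
{
  /* x < 0 ? 1 : 0 == logical right shift of the sign bit.  */
  return (int) ((unsigned int) x >> SIGN_SHIFT);
}

int
main (void)
{
  static const int t[] = { -5, -1, 0, 1, 7 };
  int i;
  for (i = 0; i < 5; i++)
    {
      assert (sel_m1_0 (t[i]) == (t[i] < 0 ? -1 : 0));
      assert (sel_1_0 (t[i]) == (t[i] < 0 ? 1 : 0));
    }
  return 0;
}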

Can you see if it gives a measurable speedup on SPEC bzip2?
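
For reference, here is a minimal scalar sketch of the rewrite the new
pattern recognizer emits, with N and LOGN picked arbitrarily; the exact
GIMPLE sequence (S1-S9) is documented in the comment in the
tree-vect-patterns.c hunk below, and the arithmetic >> of a negative
value again assumes GCC's sign-extending shift:

/* Scalar sketch of the S1-S9 rewrite for signed division/modulo by a
   positive power of two; N and LOGN are arbitrary example values.  */
#include <assert.h>
#include <limits.h>

#define N 16
#define LOGN 4

static int
sdiv_pow2 (int b)
{
  int y = b < 0 ? N - 1 : 0;	/* S3: bias negative dividends.  */
  int x = b + y;		/* S2 */
  return x >> LOGN;		/* S1': arithmetic shift (sign-extending in GCC).  */
}

static int
smod_pow2 (int b)
{
  unsigned int y = b < 0 ? -1U : 0U;				/* S9 */
  unsigned int z = y >> (sizeof (int) * CHAR_BIT - LOGN);	/* S8 */
  int zs = (int) z;						/* S7 */
  int w = b + zs;						/* S6 */
  int x = w & (N - 1);						/* S5 */
  return x - zs;						/* S4' */
}

int
main (void)
{
  int b;
  for (b = -100; b <= 100; b++)
    {
      assert (sdiv_pow2 (b) == b / N);
      assert (smod_pow2 (b) == b % N);
    }
  return 0;
}

Running this over a range of values checks both rewrites against plain
/ and % with truncation towards zero.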

2011-12-14  Jakub Jelinek  <jakub@redhat.com>

	* tree-vectorizer.h (NUM_PATTERNS): Bump to 10.
	* tree-vect-patterns.c (vect_recog_sdivmod_pow2_pattern): New
	function.
	(vect_vect_recog_func_ptrs): Add it.

	* config/i386/sse.md (vcond<V_256:mode><VI_256:mode>,
	vcond<V_128:mode><VI124_128:mode>, vcond<VI8F_128:mode>v2di):
	Use general_operand instead of nonimmediate_operand for
	operand 5 and no predicate for operands 1 and 2.
	* config/i386/i386.c (ix86_expand_int_vcond): Optimize
	x < 0 ? -1 : 0 and x < 0 ? 1 : 0 into vector arithmetic
	resp. logical shift.

	* gcc.dg/vect/vect-sdivmod-1.c: New test.

--- gcc/tree-vectorizer.h.jj	2011-12-14 08:11:03.592010101 +0100
+++ gcc/tree-vectorizer.h	2011-12-14 08:28:57.006693371 +0100
@@ -929,7 +929,7 @@ extern void vect_slp_transform_bb (basic
    Additional pattern recognition functions can (and will) be added
    in the future.  */
 typedef gimple (* vect_recog_func_ptr) (VEC (gimple, heap) **, tree *, tree *);
-#define NUM_PATTERNS 9
+#define NUM_PATTERNS 10
 void vect_pattern_recog (loop_vec_info);
 
 /* In tree-vectorizer.c.  */
--- gcc/tree-vect-patterns.c.jj	2011-12-02 01:52:27.918883940 +0100
+++ gcc/tree-vect-patterns.c	2011-12-14 11:50:10.439763740 +0100
@@ -53,6 +53,8 @@ static gimple vect_recog_widen_shift_pat
 	                                tree *, tree *);
 static gimple vect_recog_vector_vector_shift_pattern (VEC (gimple, heap) **,
 						      tree *, tree *);
+static gimple vect_recog_sdivmod_pow2_pattern (VEC (gimple, heap) **,
+					       tree *, tree *);
 static gimple vect_recog_mixed_size_cond_pattern (VEC (gimple, heap) **,
 						  tree *, tree *);
 static gimple vect_recog_bool_pattern (VEC (gimple, heap) **, tree *, tree *);
@@ -64,6 +66,7 @@ static vect_recog_func_ptr vect_vect_rec
 	vect_recog_over_widening_pattern,
 	vect_recog_widen_shift_pattern,
 	vect_recog_vector_vector_shift_pattern,
+	vect_recog_sdivmod_pow2_pattern,
 	vect_recog_mixed_size_cond_pattern,
 	vect_recog_bool_pattern};
 
@@ -1573,6 +1576,211 @@ vect_recog_vector_vector_shift_pattern (
   return pattern_stmt;
 }
 
+/* Detect a signed division by power of two constant that wouldn't be
+   otherwise vectorized:
+
+   type a_t, b_t;
+
+   S1 a_t = b_t / N;
+
+  where type 'type' is a signed integral type and N is a constant positive
+  power of two.
+
+  Similarly handle signed modulo by power of two constant:
+
+   S4 a_t = b_t % N;
+
+  Input/Output:
+
+  * STMTS: Contains a stmt from which the pattern search begins,
+    i.e. the division stmt.  S1 is replaced by:
+  S3  y_t = b_t < 0 ? N - 1 : 0;
+  S2  x_t = b_t + y_t;
+  S1' a_t = x_t >> log2 (N);
+
+    S4 is replaced by (where *_T temporaries have unsigned type):
+  S9  y_T = b_t < 0 ? -1U : 0U;
+  S8  z_T = y_T >> (sizeof (type_t) * CHAR_BIT - log2 (N));
+  S7  z_t = (type) z_T;
+  S6  w_t = b_t + z_t;
+  S5  x_t = w_t & (N - 1);
+  S4' a_t = x_t - z_t;
+
+  Output:
+
+  * TYPE_IN: The type of the input arguments to the pattern.
+
+  * TYPE_OUT: The type of the output of this pattern.
+
+  * Return value: A new stmt that will be used to replace the division
+    S1 or modulo S4 stmt.  */
+
+static gimple
+vect_recog_sdivmod_pow2_pattern (VEC (gimple, heap) **stmts,
+				 tree *type_in, tree *type_out)
+{
+  gimple last_stmt = VEC_pop (gimple, *stmts);
+  gimple_stmt_iterator gsi;
+  tree oprnd0, oprnd1, vectype, itype, cond;
+  gimple pattern_stmt, def_stmt;
+  enum tree_code rhs_code;
+  stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt);
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
+  optab optab;
+
+  if (!is_gimple_assign (last_stmt))
+    return NULL;
+
+  rhs_code = gimple_assign_rhs_code (last_stmt);
+  switch (rhs_code)
+    {
+    case TRUNC_DIV_EXPR:
+    case TRUNC_MOD_EXPR:
+      break;
+    default:
+      return NULL;
+    }
+
+  if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
+    return NULL;
+
+  oprnd0 = gimple_assign_rhs1 (last_stmt);
+  oprnd1 = gimple_assign_rhs2 (last_stmt);
+  itype = TREE_TYPE (oprnd0);
+  if (TREE_CODE (oprnd0) != SSA_NAME
+      || TREE_CODE (oprnd1) != INTEGER_CST
+      || TREE_CODE (itype) != INTEGER_TYPE
+      || TYPE_UNSIGNED (itype)
+      || TYPE_PRECISION (itype) != GET_MODE_PRECISION (TYPE_MODE (itype))
+      || !integer_pow2p (oprnd1)
+      || tree_int_cst_sgn (oprnd1) != 1)
+    return NULL;
+
+  vectype = get_vectype_for_scalar_type (itype);
+  if (vectype == NULL_TREE)
+    return NULL;
+
+  /* If the target can handle vectorized division or modulo natively,
+     don't attempt to optimize this.  */
+  optab = optab_for_tree_code (rhs_code, vectype, optab_default);
+  if (optab != NULL)
+    {
+      enum machine_mode vec_mode = TYPE_MODE (vectype);
+      int icode = (int) optab_handler (optab, vec_mode);
+      if (icode != CODE_FOR_nothing
+	  || GET_MODE_SIZE (vec_mode) == UNITS_PER_WORD)
+	return NULL;
+    }
+
+  /* Pattern detected.  */
+  if (vect_print_dump_info (REPORT_DETAILS))
+    fprintf (vect_dump, "vect_recog_sdivmod_pow2_pattern: detected: ");
+
+  cond = build2 (LT_EXPR, boolean_type_node, oprnd0, build_int_cst (itype, 0));
+  gsi = gsi_for_stmt (last_stmt);
+  if (rhs_code == TRUNC_DIV_EXPR)
+    {
+      tree var = vect_recog_temp_ssa_var (itype, NULL);
+      def_stmt
+	= gimple_build_assign_with_ops3 (COND_EXPR, var, cond,
+					 fold_build2 (MINUS_EXPR, itype,
+						      oprnd1,
+						      build_int_cst (itype,
+								     1)),
+					 build_int_cst (itype, 0));
+      gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
+      set_vinfo_for_stmt (def_stmt, new_stmt_vec_info (def_stmt, loop_vinfo,
+						       NULL));
+      var = vect_recog_temp_ssa_var (itype, NULL);
+      def_stmt
+	= gimple_build_assign_with_ops (PLUS_EXPR, var, oprnd0,
+					gimple_assign_lhs (def_stmt));
+      STMT_VINFO_PATTERN_DEF_STMT (stmt_vinfo) = def_stmt;
+
+      pattern_stmt
+	= gimple_build_assign_with_ops (RSHIFT_EXPR,
+					vect_recog_temp_ssa_var (itype, NULL),
+					var,
+					build_int_cst (itype,
+						       tree_log2 (oprnd1)));
+    }
+  else
+    {
+      tree signmask;
+      tree utype = build_nonstandard_integer_type (TYPE_PRECISION (itype), 1);
+      tree shift = build_int_cst (utype, GET_MODE_BITSIZE (TYPE_MODE (itype))
+					 - tree_log2 (oprnd1));
+      if (compare_tree_int (oprnd1, 2) == 0)
+	{
+	  signmask = vect_recog_temp_ssa_var (itype, NULL);
+	  def_stmt
+	    = gimple_build_assign_with_ops3 (COND_EXPR, signmask, cond,
+					     build_int_cst (itype, 1),
+					     build_int_cst (itype, 0));
+	  gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
+	  set_vinfo_for_stmt (def_stmt,
+			      new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
+	}
+      else
+	{
+	  tree var = vect_recog_temp_ssa_var (utype, NULL);
+	  def_stmt
+	    = gimple_build_assign_with_ops3 (COND_EXPR, var, cond,
+					     build_int_cst (utype, -1),
+					     build_int_cst (utype, 0));
+	  gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
+	  set_vinfo_for_stmt (def_stmt,
+			      new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
+	  var = vect_recog_temp_ssa_var (utype, NULL);
+	  def_stmt
+	    = gimple_build_assign_with_ops (RSHIFT_EXPR, var,
+					    gimple_assign_lhs (def_stmt),
+					    shift);
+	  gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
+	  set_vinfo_for_stmt (def_stmt,
+			      new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
+	  signmask = vect_recog_temp_ssa_var (itype, NULL);
+	  def_stmt
+	    = gimple_build_assign_with_ops (NOP_EXPR, signmask, var,
+					    NULL_TREE);
+	  gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
+	  set_vinfo_for_stmt (def_stmt,
+			      new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
+	}
+      def_stmt
+	= gimple_build_assign_with_ops (PLUS_EXPR,
+					vect_recog_temp_ssa_var (itype, NULL),
+					oprnd0, signmask);
+      gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
+      set_vinfo_for_stmt (def_stmt, new_stmt_vec_info (def_stmt, loop_vinfo,
+						       NULL));
+      def_stmt
+	= gimple_build_assign_with_ops (BIT_AND_EXPR,
+					vect_recog_temp_ssa_var (itype, NULL),
+					gimple_assign_lhs (def_stmt),
+					fold_build2 (MINUS_EXPR, itype,
+						     oprnd1,
+						     build_int_cst (itype,
+								    1)));
+      STMT_VINFO_PATTERN_DEF_STMT (stmt_vinfo) = def_stmt;
+
+      pattern_stmt
+	= gimple_build_assign_with_ops (MINUS_EXPR,
+					vect_recog_temp_ssa_var (itype, NULL),
+					gimple_assign_lhs (def_stmt),
+					signmask);
+    }
+
+  if (vect_print_dump_info (REPORT_DETAILS))
+    print_gimple_stmt (vect_dump, pattern_stmt, 0, TDF_SLIM);
+
+  VEC_safe_push (gimple, heap, *stmts, last_stmt);
+
+  *type_in = vectype;
+  *type_out = vectype;
+  return pattern_stmt;
+}
+
 /* Function vect_recog_mixed_size_cond_pattern
 
    Try to find the following pattern:
--- gcc/config/i386/sse.md.jj	2011-12-03 12:22:33.000000000 +0100
+++ gcc/config/i386/sse.md	2011-12-14 13:00:47.695788256 +0100
@@ -6340,9 +6340,9 @@ (define_expand "vcond<V_256:mode><VI_256
 	(if_then_else:V_256
 	  (match_operator 3 ""
 	    [(match_operand:VI_256 4 "nonimmediate_operand" "")
-	     (match_operand:VI_256 5 "nonimmediate_operand" "")])
-	  (match_operand:V_256 1 "general_operand" "")
-	  (match_operand:V_256 2 "general_operand" "")))]
+	     (match_operand:VI_256 5 "general_operand" "")])
+	  (match_operand:V_256 1 "" "")
+	  (match_operand:V_256 2 "" "")))]
   "TARGET_AVX2
    && (GET_MODE_NUNITS (<V_256:MODE>mode)
        == GET_MODE_NUNITS (<VI_256:MODE>mode))"
@@ -6357,9 +6357,9 @@ (define_expand "vcond<V_128:mode><VI124_
 	(if_then_else:V_128
 	  (match_operator 3 ""
 	    [(match_operand:VI124_128 4 "nonimmediate_operand" "")
-	     (match_operand:VI124_128 5 "nonimmediate_operand" "")])
-	  (match_operand:V_128 1 "general_operand" "")
-	  (match_operand:V_128 2 "general_operand" "")))]
+	     (match_operand:VI124_128 5 "general_operand" "")])
+	  (match_operand:V_128 1 "" "")
+	  (match_operand:V_128 2 "" "")))]
   "TARGET_SSE2
    && (GET_MODE_NUNITS (<V_128:MODE>mode)
        == GET_MODE_NUNITS (<VI124_128:MODE>mode))"
@@ -6374,9 +6374,9 @@ (define_expand "vcond<VI8F_128:mode>v2di
 	(if_then_else:VI8F_128
 	  (match_operator 3 ""
 	    [(match_operand:V2DI 4 "nonimmediate_operand" "")
-	     (match_operand:V2DI 5 "nonimmediate_operand" "")])
-	  (match_operand:VI8F_128 1 "general_operand" "")
-	  (match_operand:VI8F_128 2 "general_operand" "")))]
+	     (match_operand:V2DI 5 "general_operand" "")])
+	  (match_operand:VI8F_128 1 "" "")
+	  (match_operand:VI8F_128 2 "" "")))]
   "TARGET_SSE4_2"
 {
   bool ok = ix86_expand_int_vcond (operands);
--- gcc/config/i386/i386.c.jj	2011-12-14 08:11:01.000000000 +0100
+++ gcc/config/i386/i386.c	2011-12-14 13:01:30.207538465 +0100
@@ -19434,6 +19434,45 @@ ix86_expand_int_vcond (rtx operands[])
   cop0 = operands[4];
   cop1 = operands[5];
 
+  /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
+     and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
+  if ((code == LT || code == GE)
+      && data_mode == mode
+      && cop1 == CONST0_RTX (mode)
+      && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
+      && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
+      && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
+      && (GET_MODE_SIZE (data_mode) == 16
+	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
+    {
+      rtx negop = operands[2 - (code == LT)];
+      int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
+      if (negop == CONST1_RTX (data_mode))
+	{
+	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
+					 operands[0], 1, OPTAB_DIRECT);
+	  if (res != operands[0])
+	    emit_move_insn (operands[0], res);
+	  return true;
+	}
+      else if (GET_MODE_INNER (data_mode) != DImode
+	       && vector_all_ones_operand (negop, data_mode))
+	{
+	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
+					 operands[0], 0, OPTAB_DIRECT);
+	  if (res != operands[0])
+	    emit_move_insn (operands[0], res);
+	  return true;
+	}
+    }
+
+  if (!nonimmediate_operand (cop1, mode))
+    cop1 = force_reg (mode, cop1);
+  if (!general_operand (operands[1], data_mode))
+    operands[1] = force_reg (data_mode, operands[1]);
+  if (!general_operand (operands[2], data_mode))
+    operands[2] = force_reg (data_mode, operands[2]);
+
   /* XOP supports all of the comparisons on all 128-bit vector int types.  */
   if (TARGET_XOP
       && (mode == V16QImode || mode == V8HImode
--- gcc/testsuite/gcc.dg/vect/vect-sdivmod-1.c.jj	2011-12-14 13:05:36.240133846 +0100
+++ gcc/testsuite/gcc.dg/vect/vect-sdivmod-1.c	2011-12-14 13:08:10.288228745 +0100
@@ -0,0 +1,98 @@
+#include "tree-vect.h"
+
+extern void abort (void);
+int a[4096];
+
+__attribute__((noinline, noclone)) void
+f1 (int x)
+{
+  int i, j;
+  for (i = 1; i <= x; i++)
+    {
+      j = a[i] >> 8;
+      j = 1 + (j / 2);
+      a[i] = j << 8;
+    }
+}
+
+__attribute__((noinline, noclone)) void
+f2 (int x)
+{
+  int i, j;
+  for (i = 1; i <= x; i++)
+    {
+      j = a[i] >> 8;
+      j = 1 + (j / 16);
+      a[i] = j << 8;
+    }
+}
+
+__attribute__((noinline, noclone)) void
+f3 (int x)
+{
+  int i, j;
+  for (i = 1; i <= x; i++)
+    {
+      j = a[i] >> 8;
+      j = 1 + (j % 2);
+      a[i] = j << 8;
+    }
+}
+
+__attribute__((noinline, noclone)) void
+f4 (int x)
+{
+  int i, j;
+  for (i = 1; i <= x; i++)
+    {
+      j = a[i] >> 8;
+      j = 1 + (j % 16);
+      a[i] = j << 8;
+    }
+}
+
+int
+main ()
+{
+  int i;
+  check_vect ();
+  for (i = 0; i < 4096; i++)
+    {
+      asm ("");
+      a[i] = (i - 2048) << 8;
+    }
+  f1 (4095);
+  if (a[0] != (-2048 << 8))
+    abort ();
+  for (i = 1; i < 4096; i++)
+    if (a[i] != ((1 + ((i - 2048) / 2)) << 8))
+      abort ();
+    else
+      a[i] = (i - 2048) << 8;
+  f2 (4095);
+  if (a[0] != (-2048 << 8))
+    abort ();
+  for (i = 1; i < 4096; i++)
+    if (a[i] != ((1 + ((i - 2048) / 16)) << 8))
+      abort ();
+    else
+      a[i] = (i - 2048) << 8;
+  f3 (4095);
+  if (a[0] != (-2048 << 8))
+    abort ();
+  for (i = 1; i < 4096; i++)
+    if (a[i] != ((1 + ((i - 2048) % 2)) << 8))
+      abort ();
+    else
+      a[i] = (i - 2048) << 8;
+  f4 (4095);
+  if (a[0] != (-2048 << 8))
+    abort ();
+  for (i = 1; i < 4096; i++)
+    if (a[i] != ((1 + ((i - 2048) % 16)) << 8))
+      abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" { target vect_condition } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */

	Jakub

