[PATCH, vect, i386]: Vectorize lrint() and generate cvtpd2dq insn

Uros Bizjak <ubizjak@gmail.com>
Fri Jun 29 11:11:00 GMT 2007


Hello!

This patch introduces into the vectorizable_call() function the same
NARROW and WIDEN modifier approach already implemented in
vectorizable_conversion(). Using the NARROW modifier, gcc can
vectorize calls where (nunits_in == nunits_out / 2).
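
To make the NARROW case concrete: for lrint on double, vectype_in is
V2DF (nunits_in == 2) and vectype_out is V4SI (nunits_out == 4), so
each vectorized call consumes two input vectors per output vector.
Roughly (a pseudo-GIMPLE sketch with illustrative offsets, not the
actual statements vectorizable_call() builds):

--cut here--
  /* Scalar statement:  b[i] = lrint (a[i]);

     NARROW-vectorized form, one V4SI result per two V2DF inputs:  */
  vect_a0 = MEM[&a + 32*j];        /* V2DF: a[2j],   a[2j+1] */
  vect_a1 = MEM[&a + 32*j + 16];   /* V2DF: a[2j+2], a[2j+3] */
  vect_b  = __builtin_ia32_vec_pack_sfix (vect_a0, vect_a1); /* V4SI */
  MEM[&b + 16*j] = vect_b;
--cut here--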

The attached patch uses this infrastructure to vectorize
BUILT_IN_LRINT using the cvtpd2dq SSE2 insn. It also redefines all
two-argument i386 builtins as const builtins (each builtin was
checked to verify that it does not clobber global memory).
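
Marking the builtins const lets the tree optimizers treat the calls
as pure value computations. A minimal sketch of the effect, assuming
SSE2 and the usual intrinsic header (where _mm_add_epi16 expands to
__builtin_ia32_paddw128):

--cut here--
#include <emmintrin.h>

__m128i
f (__m128i x, __m128i y, __m128i z)
{
  __m128i s1 = _mm_add_epi16 (x, y);
  __m128i s2 = _mm_add_epi16 (x, y);  /* CSEd with s1 now that the
					 builtin is const */
  return _mm_add_epi16 (_mm_add_epi16 (s1, z), s2);
}
--cut here--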

The following testcase:

--cut here--
double a[256];
int b[256];

extern long lrint (double);

void foo(void)
{
  int i;

  for (i=0; i<256; ++i)
    b[i] = lrint (a[i]);
}
--cut here--

generates (-O2 -msse3 -ffast-math -ftree-vectorize):

.L7:
        cvtpd2dq       a(%eax,%eax), %xmm0
        cvtpd2dq       a+16(%eax,%eax), %xmm1
        punpcklqdq     %xmm1, %xmm0
        movdqa  %xmm0, b(%eax)
        addl    $16, %eax
        cmpl    $1024, %eax
        jne     .L7
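
The loop body matches the new vec_pack_sfix_v2df expander below: each
cvtpd2dq converts a V2DF vector to two ints in the low half of an xmm
register, and punpcklqdq merges the two low quadwords. In intrinsic
form (a sketch assuming SSE2; the helper name is just for
illustration):

--cut here--
#include <emmintrin.h>

/* What the vec_pack_sfix_v2df expander emits, in intrinsic form.  */
static __m128i
pack_sfix (__m128d x, __m128d y)
{
  __m128i lo = _mm_cvtpd_epi32 (x);    /* cvtpd2dq */
  __m128i hi = _mm_cvtpd_epi32 (y);    /* cvtpd2dq */
  return _mm_unpacklo_epi64 (lo, hi);  /* punpcklqdq */
}
--cut here--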

The patch was bootstrapped on i686-pc-linux-gnu and regression-tested
for all default languages. It finally closes PR
tree-optimization/24659, as all conversions are now vectorized (on
SSEx targets).

OK for mainline (the patch needs approval for the vectorizer part)?

2007-06-29  Uros Bizjak  <ubizjak@gmail.com>

	PR tree-optimization/24659
	* tree-vect-transform.c (vectorizable_call): Handle
	(nunits_in == nunits_out / 2) and (nunits_out == nunits_in / 2) cases.

	* config/i386/sse.md (vec_pack_sfix_v2df): New expander.
	* config/i386/i386.c (enum ix86_builtins) [IX86_BUILTIN_VEC_PACK_SFIX]:
	New constant.
	(struct bdesc_2arg) [__builtin_ia32_vec_pack_sfix]: New builtin
	description.
	(ix86_init_mmx_sse_builtins): Define all builtins with 2 arguments as
	const using def_builtin_const.
	(ix86_expand_binop_builtin): Remove bogus assert() that insn wants
	input operands in the same modes as the result.
	(ix86_builtin_vectorized_function): Handle BUILT_IN_LRINT.

testsuite/ChangeLog:

2007-06-29  Uros Bizjak  <ubizjak@gmail.com>

	PR tree-optimization/24659
	* gcc.target/i386/vectorize2.c: New test.
	* gcc.target/i386/sse2-lrint-vec.c: New runtime test.
	* gcc.target/i386/sse2-lrintf-vec.c: Ditto.

Uros.
Index: tree-vect-transform.c
===================================================================
--- tree-vect-transform.c	(revision 126107)
+++ tree-vect-transform.c	(working copy)
@@ -2253,13 +2253,19 @@ vectorizable_call (tree stmt, block_stmt
   tree scalar_dest;
   tree operation;
   tree op, type;
+  tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
   tree vectype_out, vectype_in;
+  int nunits_in;
+  int nunits_out;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
   enum vect_def_type dt[2];
+  tree new_stmt;
   int ncopies, j, nargs;
   call_expr_arg_iterator iter;
+  tree vargs;
+  enum { NARROW, NONE, WIDEN } modifier;
 
   if (!STMT_VINFO_RELEVANT_P (stmt_info))
     return false;
@@ -2291,12 +2297,10 @@ vectorizable_call (tree stmt, block_stmt
   nargs = 0;
   FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
     {
-      ++nargs;
-
       /* Bail out if the function has more than two arguments, we
 	 do not have interesting builtin functions to vectorize with
 	 more than two arguments.  */
-      if (nargs > 2)
+      if (nargs >= 2)
 	return false;
 
       /* We can only handle calls with arguments of the same type.  */
@@ -2309,12 +2313,14 @@ vectorizable_call (tree stmt, block_stmt
 	}
       rhs_type = TREE_TYPE (op);
 
-      if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs-1]))
+      if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
 	{
 	  if (vect_print_dump_info (REPORT_DETAILS))
 	    fprintf (vect_dump, "use not simple.");
 	  return false;
 	}
+
+      ++nargs;
     }
 
   /* No arguments is also not good.  */
@@ -2322,15 +2328,20 @@ vectorizable_call (tree stmt, block_stmt
     return false;
 
   vectype_in = get_vectype_for_scalar_type (rhs_type);
+  nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
 
   lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
   vectype_out = get_vectype_for_scalar_type (lhs_type);
+  nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
 
-  /* Only handle the case of vectors with the same number of elements.
-     FIXME: We need a way to handle for example the SSE2 cvtpd2dq
-	    instruction which converts V2DFmode to V4SImode but only
-	    using the lower half of the V4SImode result.  */
-  if (TYPE_VECTOR_SUBPARTS (vectype_in) != TYPE_VECTOR_SUBPARTS (vectype_out))
+  /* FORNOW */
+  if (nunits_in == nunits_out / 2)
+    modifier = NARROW;
+  else if (nunits_out == nunits_in)
+    modifier = NONE;
+  else if (nunits_out == nunits_in / 2)
+    modifier = WIDEN;
+  else
     return false;
 
   /* For now, we only vectorize functions if a target specific builtin
@@ -2348,8 +2359,14 @@ vectorizable_call (tree stmt, block_stmt
 
   gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
 
-  ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
-	     / TYPE_VECTOR_SUBPARTS (vectype_out));
+  if (modifier == NARROW)
+    ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
+  else
+    ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
+
+  /* Sanity check: make sure that at least one copy of the vectorized stmt
+     needs to be generated.  */
+  gcc_assert (ncopies >= 1);
 
   if (!vec_stmt) /* transformation not required.  */
     {
@@ -2365,55 +2382,113 @@ vectorizable_call (tree stmt, block_stmt
   if (vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump, "transform operation.");
 
-  gcc_assert (ncopies >= 1);
-
   /* Handle def.  */
   scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
   prev_stmt_info = NULL;
-  for (j = 0; j < ncopies; ++j)
+  switch (modifier)
     {
-      tree new_stmt, vargs;
-      tree vec_oprnd[2];
-      int n;
-
-      /* Build argument list for the vectorized call.  */
-      /* FIXME: Rewrite this so that it doesn't construct a temporary
-	  list.  */
-      vargs = NULL_TREE;
-      n = -1;
-      FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
+    case NONE:
+      for (j = 0; j < ncopies; ++j)
 	{
-	  ++n;
+	  /* Build argument list for the vectorized call.  */
+	  /* FIXME: Rewrite this so that it doesn't
+	     construct a temporary list.  */
+	  vargs = NULL_TREE;
+	  nargs = 0;
+	  FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
+	    {
+	      if (j == 0)
+		vec_oprnd0
+		  = vect_get_vec_def_for_operand (op, stmt, NULL);
+	      else
+		vec_oprnd0
+		  = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
+
+	      vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
+
+	      ++nargs;
+	    }
+	  vargs = nreverse (vargs);
+
+	  rhs = build_function_call_expr (fndecl, vargs);
+	  new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
+	  new_temp = make_ssa_name (vec_dest, new_stmt);
+	  GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+
+	  vect_finish_stmt_generation (stmt, new_stmt, bsi);
 
 	  if (j == 0)
-	    vec_oprnd[n] = vect_get_vec_def_for_operand (op, stmt, NULL);
+	    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
 	  else
-	    vec_oprnd[n] = vect_get_vec_def_for_stmt_copy (dt[n], vec_oprnd[n]);
+	    STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
 
-	  vargs = tree_cons (NULL_TREE, vec_oprnd[n], vargs);
+	  prev_stmt_info = vinfo_for_stmt (new_stmt);
 	}
-      vargs = nreverse (vargs);
 
-      rhs = build_function_call_expr (fndecl, vargs);
-      new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
-      new_temp = make_ssa_name (vec_dest, new_stmt);
-      GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+      break;
 
-      vect_finish_stmt_generation (stmt, new_stmt, bsi);
+    case NARROW:
+      for (j = 0; j < ncopies; ++j)
+	{
+	  /* Build argument list for the vectorized call.  */
+	  /* FIXME: Rewrite this so that it doesn't
+	     construct a temporary list.  */
+	  vargs = NULL_TREE;
+	  nargs = 0;
+	  FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
+	    {
+	      if (j == 0)
+		{
+		  vec_oprnd0
+		    = vect_get_vec_def_for_operand (op, stmt, NULL);
+		  vec_oprnd1
+		    = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
+		}
+	      else
+		{
+		  vec_oprnd0
+		    = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
+		  vec_oprnd1
+		    = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
+		}
 
-      if (j == 0)
-	STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
-      else
-	STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
-      prev_stmt_info = vinfo_for_stmt (new_stmt);
+	      vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
+	      vargs = tree_cons (NULL_TREE, vec_oprnd1, vargs);
+
+	      ++nargs;
+	    }
+	  vargs = nreverse (vargs);
+
+	  rhs = build_function_call_expr (fndecl, vargs);
+	  new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
+	  new_temp = make_ssa_name (vec_dest, new_stmt);
+	  GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+
+	  vect_finish_stmt_generation (stmt, new_stmt, bsi);
+
+	  if (j == 0)
+	    STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
+	  else
+	    STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+
+	  prev_stmt_info = vinfo_for_stmt (new_stmt);
+	}
+
+      *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
+
+      break;
+
+    case WIDEN:
+      /* No current target implements this case.  */
+      return false;
     }
 
-  /* The call in STMT might prevent it from being removed in dce.  We however
-     cannot remove it here, due to the way the ssa name it defines is mapped
-     to the new definition.  So just replace rhs of the statement with something
-     harmless.  */
+  /* The call in STMT might prevent it from being removed in dce.
+     We however cannot remove it here, due to the way the ssa name
+     it defines is mapped to the new definition.  So just replace
+     rhs of the statement with something harmless.  */
   type = TREE_TYPE (scalar_dest);
   GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
   update_stmt (stmt);
Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md	(revision 126107)
+++ config/i386/sse.md	(working copy)
@@ -2421,6 +2421,26 @@
   DONE;
 })
 
+(define_expand "vec_pack_sfix_v2df"
+  [(match_operand:V4SI 0 "register_operand" "")
+   (match_operand:V2DF 1 "nonimmediate_operand" "")
+   (match_operand:V2DF 2 "nonimmediate_operand" "")]
+  "TARGET_SSE2"
+{
+  rtx r1, r2;
+
+  r1 = gen_reg_rtx (V4SImode);
+  r2 = gen_reg_rtx (V4SImode);
+
+  emit_insn (gen_sse2_cvtpd2dq (r1, operands[1]));
+  emit_insn (gen_sse2_cvtpd2dq (r2, operands[2]));
+  emit_insn (gen_sse2_punpcklqdq (gen_lowpart (V2DImode, operands[0]),
+				  gen_lowpart (V2DImode, r1),
+				  gen_lowpart (V2DImode, r2)));
+  DONE;
+})
+
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel double-precision floating point element swizzling
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 126107)
+++ config/i386/i386.c	(working copy)
@@ -16820,6 +16820,8 @@ enum ix86_builtins
   IX86_BUILTIN_VEC_SET_V4HI,
   IX86_BUILTIN_VEC_SET_V16QI,
 
+  IX86_BUILTIN_VEC_PACK_SFIX,
+
   /* SSE4.2.  */
   IX86_BUILTIN_CRC32QI,
   IX86_BUILTIN_CRC32HI,
@@ -17167,6 +17169,8 @@ static const struct builtin_description 
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_unpckhpd, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, 0 },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_unpcklpd, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, 0 },
 
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, 0 },
+
   /* SSE2 MMX */
   { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, 0 },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, 0 },
@@ -17563,6 +17567,9 @@ ix86_init_mmx_sse_builtins (void)
     = build_function_type_list (V2DF_type_node, V4SI_type_node, NULL_TREE);
   tree v4si_ftype_v2df
     = build_function_type_list (V4SI_type_node, V2DF_type_node, NULL_TREE);
+  tree v4si_ftype_v2df_v2df
+    = build_function_type_list (V4SI_type_node,
+				V2DF_type_node, V2DF_type_node, NULL_TREE);
   tree v2si_ftype_v2df
     = build_function_type_list (V2SI_type_node, V2DF_type_node, NULL_TREE);
   tree v4sf_ftype_v2df
@@ -17906,7 +17913,10 @@ ix86_init_mmx_sse_builtins (void)
 	  || d->icode == CODE_FOR_sse2_vmmaskcmpv2df3)
 	type = v2di_ftype_v2df_v2df;
 
-      def_builtin (d->mask, d->name, type, d->code);
+      if (d->icode == CODE_FOR_vec_pack_sfix_v2df)
+	type = v4si_ftype_v2df_v2df;
+
+      def_builtin_const (d->mask, d->name, type, d->code);
     }
 
   /* Add all builtins that are more or less simple operations on 1 operand.  */
@@ -18457,11 +18467,6 @@ ix86_expand_binop_builtin (enum insn_cod
       op1 = gen_lowpart (TImode, x);
     }
 
-  /* The insn must want input operands in the same modes as the
-     result.  */
-  gcc_assert ((GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
-	      && (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode));
-
   if (!(*insn_data[icode].operand[1].predicate) (op0, mode0))
     op0 = copy_to_mode_reg (mode0, op0);
   if (!(*insn_data[icode].operand[2].predicate) (op1, mode1))
@@ -19863,6 +19868,12 @@ ix86_builtin_vectorized_function (unsign
 	return ix86_builtins[IX86_BUILTIN_SQRTPS];
       return NULL_TREE;
 
+    case BUILT_IN_LRINT:
+      if (out_mode == SImode && out_n == 4
+	  && in_mode == DFmode && in_n == 2)
+	return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
+      return NULL_TREE;
+
     case BUILT_IN_LRINTF:
       if (out_mode == SImode && out_n == 4
 	  && in_mode == SFmode && in_n == 4)
Index: testsuite/gcc.target/i386/sse2-lrint-vec.c
===================================================================
--- testsuite/gcc.target/i386/sse2-lrint-vec.c	(revision 0)
+++ testsuite/gcc.target/i386/sse2-lrint-vec.c	(revision 0)
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2" } */
+
+#include "../../gcc.dg/i386-cpuid.h"
+
+extern long lrint (double);
+extern void abort (void);
+
+#define N 32
+
+int __attribute__((noinline))
+main1 ()
+{
+  double a[N] = {0.4,3.5,6.6,9.4,12.5,15.6,18.4,21.5,24.6,27.4,30.5,33.6,36.4,39.5,42.6,45.4,0.5,3.6,6.4,9.5,12.6,15.4,18.5,21.6,24.4,27.5,30.6,33.4,36.5,39.6,42.4,45.5};
+  long r[N];
+
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      r[i] = lrint (a[i]);
+    }
+
+  /* check results:  */
+  for (i = 0; i < N; i++)
+    {
+      if (r[i] != lrint (a[i]))
+	abort();
+    }   
+
+  return 0;
+}
+
+int
+main ()
+{
+  unsigned long cpu_facilities;
+
+  cpu_facilities = i386_cpuid ();
+
+  if ((cpu_facilities & (bit_MMX | bit_SSE | bit_SSE2 | bit_CMOV))
+      != (bit_MMX | bit_SSE | bit_SSE2 | bit_CMOV))
+    /* If host has no vector support, pass.  */
+    return 0;
+
+  main1 ();
+  return 0;
+}
Index: testsuite/gcc.target/i386/vectorize2.c
===================================================================
--- testsuite/gcc.target/i386/vectorize2.c	(revision 0)
+++ testsuite/gcc.target/i386/vectorize2.c	(revision 0)
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target ilp32 } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 -mfpmath=sse" } */
+
+double a[256];
+int b[256];
+unsigned short c[256];
+
+extern long lrint (double);
+
+void foo(void)
+{
+  int i;
+
+  for (i=0; i<256; ++i)
+    b[i] = lrint (a[i]);
+}
+
+void bar(void)
+{
+  int i;
+
+  for (i=0; i<256; ++i)
+    {
+      b[i] = lrint (a[i]);
+      c[i] += c[i];
+    }
+}
+
+/* { dg-final { scan-assembler "cvtpd2dq" } } */
Index: testsuite/gcc.target/i386/sse2-lrintf-vec.c
===================================================================
--- testsuite/gcc.target/i386/sse2-lrintf-vec.c	(revision 0)
+++ testsuite/gcc.target/i386/sse2-lrintf-vec.c	(revision 0)
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2" } */
+
+#include "../../gcc.dg/i386-cpuid.h"
+
+extern long lrintf (float);
+extern void abort (void);
+
+#define N 32
+
+int __attribute__((noinline))
+main1 ()
+{
+  float a[N] = {0.4,3.5,6.6,9.4,12.5,15.6,18.4,21.5,24.6,27.4,30.5,33.6,36.4,39.5,42.6,45.4,0.5,3.6,6.4,9.5,12.6,15.4,18.5,21.6,24.4,27.5,30.6,33.4,36.5,39.6,42.4,45.5};
+  long r[N];
+
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      r[i] = lrintf (a[i]);
+    }
+
+  /* check results:  */
+  for (i = 0; i < N; i++)
+    {
+      if (r[i] != lrintf (a[i]))
+	abort();
+    }   
+
+  return 0;
+}
+
+int
+main ()
+{
+  unsigned long cpu_facilities;
+
+  cpu_facilities = i386_cpuid ();
+
+  if ((cpu_facilities & (bit_MMX | bit_SSE | bit_SSE2 | bit_CMOV))
+      != (bit_MMX | bit_SSE | bit_SSE2 | bit_CMOV))
+    /* If host has no vector support, pass.  */
+    return 0;
+
+  main1 ();
+  return 0;
+}

