This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.
Re: [PATCH] Fix up AVX512F masked gather vectorization, add support for AVX512F 512-bit masked scatter vectorization (PR tree-optimization/88464)
- From: Richard Biener <rguenther@suse.de>
- To: Jakub Jelinek <jakub@redhat.com>, Uros Bizjak <ubizjak@gmail.com>
- Cc: gcc-patches@gcc.gnu.org
- Date: Sat, 15 Dec 2018 11:25:12 +0100
- Subject: Re: [PATCH] Fix up AVX512F masked gather vectorization, add support for AVX512F 512-bit masked scatter vectorization (PR tree-optimization/88464)
- References: <20181214194708.GR12380@tucnak>
On December 14, 2018 8:47:08 PM GMT+01:00, Jakub Jelinek <jakub@redhat.com> wrote:
>Hi!
>
>In the previous patch I've unfortunately left one important case out of
>the testcase, and apparently it wasn't covered by anything else in the
>testsuite. The 3 functions covered float and double gathers with
>indexes of the same bitsize plus a WIDENING gather (double gather with
>int index), but didn't cover the NARROWING case (float gather with long
>index with -m64). That was the only case that tried to permute the
>mask; unfortunately that isn't really supported and ICEs. What works is
>VEC_UNPACK_{LO,HI}_EXPR on the VECTOR_BOOLEAN_TYPE_P mask, which is
>what other spots in the vectorizer emit for those.
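>
>For illustration, the NARROWING shape is a conditional float load with
>a long index, exactly what the new f4 in the testcase below exercises
>(the function name here is just illustrative):
>
>  void
>  narrow_gather (float *__restrict a, const float *__restrict b,
>                 const long *__restrict c, int n)
>  {
>    for (int i = 0; i < n; ++i)
>      if (a[i] > 10.0f)   /* conditional load -> masked gather  */
>        a[i] = b[c[i]];   /* long index, float data: NARROWING  */
>  }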
>
>I also had to fix up the x86 backend: the expansion of these NARROWING
>gather builtins had code cut&pasted from the 256-bit builtin, but it
>wasn't adjusted for the fact that the 512-bit builtin uses an integral
>mask argument while the 256-bit one doesn't. And even the 256-bit one
>had a bug: it relied on the mask and src arguments always being in the
>same register (which is actually what the vectorizer generates for
>those right now, but it could do something else).
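>
>In other words, the 256-bit expander did roughly this (simplified
>sketch; identifiers as in i386.c):
>
>  half = gen_reg_rtx (mode0);
>  emit_insn (gen (half, op0));      /* half := low half of src  */
>  op0 = half;
>  if (GET_MODE (op3) != VOIDmode)
>    {
>      emit_insn (gen (half, op3));  /* overwrites the src half  */
>      op3 = half;                   /* now op0 and op3 alias  */
>    }
>
>so both operands ended up pointing at the mask's low half, which is
>only correct when src and mask were the same register to begin with.
>The fix allocates a fresh pseudo (half = gen_reg_rtx (mode0);) before
>extracting the mask half.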
>
>This patch fixes that and also enables masked x86 AVX512F 512-bit
>scatter support.
>
>Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
OK.
Richard.
>
>What is still unhandled (doesn't vectorize) is 128-bit and 256-bit
>scatters; I bet the mask operand is vectorized using normal non-bool
>vectors, while the AVX512VL instructions actually need a mask register.
>There are instructions that can handle that, but let's defer that for
>later.
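>
>For example, a scatter kernel like the following (hypothetical name;
>assuming something like -mavx512vl -mprefer-vector-width=256) stays
>scalar for now:
>
>  void
>  scatter_vl256 (double *__restrict a, const double *__restrict b,
>                 const int *__restrict c, int n)
>  {
>    for (int i = 0; i < n; ++i)
>      if (b[i] > -2.0)    /* conditional store -> masked scatter  */
>        a[c[i]] = b[i];   /* mask would need a %k register  */
>  }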
>
>2018-12-14  Jakub Jelinek  <jakub@redhat.com>
>
> PR tree-optimization/88464
> * tree-vect-stmts.c (vect_build_gather_load_calls): For NARROWING
> and mask with integral masktype, don't try to permute mask vectors,
> instead emit VEC_UNPACK_{LO,HI}_EXPR. Fix up NOP_EXPR operand.
> (vectorizable_store): Handle masked scatters with decl and integral
> mask type.
> (permute_vec_elements): Allow scalar_dest to be NULL.
> * config/i386/i386.c (ix86_expand_builtin)
> <case IX86_BUILTIN_GATHER3ALTDIV16SF>: Use lowpart_subreg for masks.
> <case IX86_BUILTIN_GATHER3ALTDIV8SF>: Don't assume mask and src have
> to be the same.
>
> * gcc.target/i386/avx512f-pr88462-1.c: Rename to ...
> * gcc.target/i386/avx512f-pr88464-1.c: ... this. Fix up PR number.
> Expect 4 vectorized loops instead of 3.
> (f4): New function.
> * gcc.target/i386/avx512f-pr88462-2.c: Rename to ...
> * gcc.target/i386/avx512f-pr88464-2.c: ... this. Fix up PR number
> and #include.
> (avx512f_test): Prepare arguments for f4 and check the results.
> * gcc.target/i386/avx512f-pr88464-3.c: New test.
> * gcc.target/i386/avx512f-pr88464-4.c: New test.
>
>--- gcc/tree-vect-stmts.c.jj 2018-12-13 18:01:13.000000000 +0100
>+++ gcc/tree-vect-stmts.c 2018-12-14 17:10:42.079054458 +0100
>@@ -2655,6 +2655,7 @@ vect_build_gather_load_calls (stmt_vec_i
> if (mask && TREE_CODE (masktype) == INTEGER_TYPE)
> masktype = build_same_sized_truth_vector_type (srctype);
>
>+ tree mask_halftype = masktype;
> tree perm_mask = NULL_TREE;
> tree mask_perm_mask = NULL_TREE;
> if (known_eq (nunits, gather_off_nunits))
>@@ -2690,13 +2691,16 @@ vect_build_gather_load_calls (stmt_vec_i
>
> ncopies *= 2;
>
>- if (mask)
>+ if (mask && masktype == real_masktype)
> {
> for (int i = 0; i < count; ++i)
> sel[i] = i | (count / 2);
> indices.new_vector (sel, 2, count);
> mask_perm_mask = vect_gen_perm_mask_checked (masktype, indices);
> }
>+ else if (mask)
>+ mask_halftype
>+ = build_same_sized_truth_vector_type (gs_info->offset_vectype);
> }
> else
> gcc_unreachable ();
>@@ -2761,7 +2765,7 @@ vect_build_gather_load_calls (stmt_vec_i
> {
> if (j == 0)
> vec_mask = vect_get_vec_def_for_operand (mask, stmt_info);
>- else
>+ else if (modifier != NARROW || (j & 1) == 0)
> vec_mask = vect_get_vec_def_for_stmt_copy (loop_vinfo,
> vec_mask);
>
>@@ -2779,17 +2783,27 @@ vect_build_gather_load_calls (stmt_vec_i
> mask_op = var;
> }
> }
>+ if (modifier == NARROW && masktype != real_masktype)
>+ {
>+ var = vect_get_new_ssa_name (mask_halftype, vect_simple_var);
>+ gassign *new_stmt
>+ = gimple_build_assign (var, (j & 1) ? VEC_UNPACK_HI_EXPR
>+ : VEC_UNPACK_LO_EXPR,
>+ mask_op);
>+ vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
>+ mask_op = var;
>+ }
> src_op = mask_op;
> }
>
> tree mask_arg = mask_op;
> if (masktype != real_masktype)
> {
>- tree utype;
>- if (TYPE_MODE (real_masktype) == TYPE_MODE (masktype))
>+ tree utype, optype = TREE_TYPE (mask_op);
>+ if (TYPE_MODE (real_masktype) == TYPE_MODE (optype))
> utype = real_masktype;
> else
>- utype = lang_hooks.types.type_for_mode (TYPE_MODE (masktype), 1);
>+ utype = lang_hooks.types.type_for_mode (TYPE_MODE (optype), 1);
> var = vect_get_new_ssa_name (utype, vect_scalar_var);
> mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_op);
> gassign *new_stmt
>@@ -2801,7 +2815,7 @@ vect_build_gather_load_calls (stmt_vec_i
> gcc_assert (TYPE_PRECISION (utype)
> <= TYPE_PRECISION (real_masktype));
> var = vect_get_new_ssa_name (real_masktype, vect_scalar_var);
>- new_stmt = gimple_build_assign (var, NOP_EXPR, utype);
>+ new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg);
> vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
> mask_arg = var;
> }
>@@ -6361,7 +6375,8 @@ vectorizable_store (stmt_vec_info stmt_i
> return false;
> }
> else if (memory_access_type != VMAT_LOAD_STORE_LANES
>- && (memory_access_type != VMAT_GATHER_SCATTER || gs_info.decl))
>+ && (memory_access_type != VMAT_GATHER_SCATTER
>+ || (gs_info.decl && !VECTOR_BOOLEAN_TYPE_P (mask_vectype))))
> {
> if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>@@ -6419,7 +6434,9 @@ vectorizable_store (stmt_vec_info stmt_i
> tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE, src;
> tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
> tree rettype, srctype, ptrtype, idxtype, masktype, scaletype;
>- tree ptr, mask, var, scale, perm_mask = NULL_TREE;
>+ tree ptr, var, scale, vec_mask;
>+ tree mask_arg = NULL_TREE, mask_op = NULL_TREE, perm_mask = NULL_TREE;
>+ tree mask_halfvectype = mask_vectype;
> edge pe = loop_preheader_edge (loop);
> gimple_seq seq;
> basic_block new_bb;
>@@ -6460,6 +6477,10 @@ vectorizable_store (stmt_vec_info stmt_i
> perm_mask = vect_gen_perm_mask_checked (vectype, indices);
> gcc_assert (perm_mask != NULL_TREE);
> ncopies *= 2;
>+
>+ if (mask)
>+ mask_halfvectype
>+ = build_same_sized_truth_vector_type (gs_info.offset_vectype);
> }
> else
> gcc_unreachable ();
>@@ -6482,10 +6503,11 @@ vectorizable_store (stmt_vec_info stmt_i
> gcc_assert (!new_bb);
> }
>
>- /* Currently we support only unconditional scatter stores,
>- so mask should be all ones. */
>- mask = build_int_cst (masktype, -1);
>- mask = vect_init_vector (stmt_info, mask, masktype, NULL);
>+ if (mask == NULL_TREE)
>+ {
>+ mask_arg = build_int_cst (masktype, -1);
>+ mask_arg = vect_init_vector (stmt_info, mask_arg, masktype, NULL);
>+ }
>
> scale = build_int_cst (scaletype, gs_info.scale);
>
>@@ -6494,36 +6516,46 @@ vectorizable_store (stmt_vec_info stmt_i
> {
> if (j == 0)
> {
>- src = vec_oprnd1
>- = vect_get_vec_def_for_operand (op, stmt_info);
>- op = vec_oprnd0
>- = vect_get_vec_def_for_operand (gs_info.offset, stmt_info);
>+ src = vec_oprnd1 = vect_get_vec_def_for_operand (op, stmt_info);
>+ op = vec_oprnd0 = vect_get_vec_def_for_operand (gs_info.offset,
>+ stmt_info);
>+ if (mask)
>+ mask_op = vec_mask = vect_get_vec_def_for_operand (mask,
>+ stmt_info);
> }
> else if (modifier != NONE && (j & 1))
> {
> if (modifier == WIDEN)
> {
>- src = vec_oprnd1
>- = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd1);
>+ src
>+ = vec_oprnd1 = vect_get_vec_def_for_stmt_copy (vinfo,
>+ vec_oprnd1);
> op = permute_vec_elements (vec_oprnd0, vec_oprnd0, perm_mask,
> stmt_info, gsi);
>+ if (mask)
>+ mask_op
>+ = vec_mask = vect_get_vec_def_for_stmt_copy (vinfo,
>+ vec_mask);
> }
> else if (modifier == NARROW)
> {
> src = permute_vec_elements (vec_oprnd1, vec_oprnd1, perm_mask,
> stmt_info, gsi);
>- op = vec_oprnd0
>- = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd0);
>+ op = vec_oprnd0 = vect_get_vec_def_for_stmt_copy (vinfo,
>+ vec_oprnd0);
> }
> else
> gcc_unreachable ();
> }
> else
> {
>- src = vec_oprnd1
>- = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd1);
>- op = vec_oprnd0
>- = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd0);
>+ src = vec_oprnd1 = vect_get_vec_def_for_stmt_copy (vinfo,
>+ vec_oprnd1);
>+ op = vec_oprnd0 = vect_get_vec_def_for_stmt_copy (vinfo,
>+ vec_oprnd0);
>+ if (mask)
>+ mask_op = vec_mask = vect_get_vec_def_for_stmt_copy (vinfo,
>+ vec_mask);
> }
>
> if (!useless_type_conversion_p (srctype, TREE_TYPE (src)))
>@@ -6550,8 +6582,45 @@ vectorizable_store (stmt_vec_info stmt_i
> op = var;
> }
>
>+ if (mask)
>+ {
>+ tree utype;
>+ mask_arg = mask_op;
>+ if (modifier == NARROW)
>+ {
>+ var = vect_get_new_ssa_name (mask_halfvectype,
>+ vect_simple_var);
>+ gassign *new_stmt
>+ = gimple_build_assign (var, (j & 1) ? VEC_UNPACK_HI_EXPR
>+ : VEC_UNPACK_LO_EXPR,
>+ mask_op);
>+ vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
>+ mask_arg = var;
>+ }
>+ tree optype = TREE_TYPE (mask_arg);
>+ if (TYPE_MODE (masktype) == TYPE_MODE (optype))
>+ utype = masktype;
>+ else
>+ utype = lang_hooks.types.type_for_mode (TYPE_MODE (optype), 1);
>+ var = vect_get_new_ssa_name (utype, vect_scalar_var);
>+ mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_arg);
>+ gassign *new_stmt
>+ = gimple_build_assign (var, VIEW_CONVERT_EXPR, mask_arg);
>+ vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
>+ mask_arg = var;
>+ if (!useless_type_conversion_p (masktype, utype))
>+ {
>+ gcc_assert (TYPE_PRECISION (utype)
>+ <= TYPE_PRECISION (masktype));
>+ var = vect_get_new_ssa_name (masktype, vect_scalar_var);
>+ new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg);
>+ vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
>+ mask_arg = var;
>+ }
>+ }
>+
> gcall *new_stmt
>- = gimple_build_call (gs_info.decl, 5, ptr, mask, op, src, scale);
>+ = gimple_build_call (gs_info.decl, 5, ptr, mask_arg, op, src, scale);
> stmt_vec_info new_stmt_info
> = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
>
>@@ -7284,7 +7353,7 @@ permute_vec_elements (tree x, tree y, tr
> gimple *perm_stmt;
>
> tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
>- if (TREE_CODE (scalar_dest) == SSA_NAME)
>+ if (scalar_dest && TREE_CODE (scalar_dest) == SSA_NAME)
> perm_dest = vect_create_destination_var (scalar_dest, vectype);
> else
> perm_dest = vect_get_new_vect_var (vectype, vect_simple_var, NULL);
>--- gcc/config/i386/i386.c.jj 2018-12-13 13:45:11.000000000 +0100
>+++ gcc/config/i386/i386.c 2018-12-14 17:34:11.131135056 +0100
>@@ -37605,13 +37605,7 @@ rdseed_step:
> op0 = copy_to_mode_reg (GET_MODE (op0), op0);
> emit_insn (gen (half, op0));
> op0 = half;
>- if (GET_MODE (op3) != VOIDmode)
>- {
>- if (!nonimmediate_operand (op3, GET_MODE (op3)))
>- op3 = copy_to_mode_reg (GET_MODE (op3), op3);
>- emit_insn (gen (half, op3));
>- op3 = half;
>- }
>+ op3 = lowpart_subreg (QImode, op3, HImode);
> break;
> case IX86_BUILTIN_GATHER3ALTDIV8SF:
> case IX86_BUILTIN_GATHER3ALTDIV8SI:
>@@ -37628,6 +37622,7 @@ rdseed_step:
> op0 = half;
> if (GET_MODE (op3) != VOIDmode)
> {
>+ half = gen_reg_rtx (mode0);
> if (!nonimmediate_operand (op3, GET_MODE (op3)))
> op3 = copy_to_mode_reg (GET_MODE (op3), op3);
> emit_insn (gen (half, op3));
>--- gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c.jj 2018-12-14 16:34:55.361955571 +0100
>+++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c 2018-12-14 18:07:25.694686784 +0100
>@@ -0,0 +1,45 @@
>+/* PR tree-optimization/88464 */
>+/* { dg-do compile } */
>+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
>+/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 "vect" } } */
>+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
>+
>+__attribute__((noipa)) void
>+f1 (double * __restrict__ a, const double * __restrict__ b, const int * __restrict__ c, int n)
>+{
>+ int i;
>+#pragma GCC ivdep
>+ for (i = 0; i < n; ++i)
>+ if (a[i] > 10.0)
>+ a[i] = b[c[i]];
>+}
>+
>+__attribute__((noipa)) void
>+f2 (double * __restrict__ a, const double * __restrict__ b, const long * __restrict__ c, int n)
>+{
>+ int i;
>+#pragma GCC ivdep
>+ for (i = 0; i < n; ++i)
>+ if (a[i] > 10.0)
>+ a[i] = b[c[i]];
>+}
>+
>+__attribute__((noipa)) void
>+f3 (float * __restrict__ a, const float * __restrict__ b, const int * __restrict__ c, int n)
>+{
>+ int i;
>+#pragma GCC ivdep
>+ for (i = 0; i < n; ++i)
>+ if (a[i] > 10.0f)
>+ a[i] = b[c[i]];
>+}
>+
>+__attribute__((noipa)) void
>+f4 (float * __restrict__ a, const float * __restrict__ b, const long * __restrict__ c, int n)
>+{
>+ int i;
>+#pragma GCC ivdep
>+ for (i = 0; i < n; ++i)
>+ if (a[i] > 10.0f)
>+ a[i] = b[c[i]];
>+}
>--- gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c.jj 2018-12-14 16:35:00.681869029 +0100
>+++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c 2018-12-14 17:43:40.294876267 +0100
>@@ -0,0 +1,61 @@
>+/* PR tree-optimization/88464 */
>+/* { dg-do run { target { avx512f } } } */
>+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512" } */
>+
>+#include "avx512f-check.h"
>+
>+#include "avx512f-pr88464-1.c"
>+
>+static void
>+avx512f_test (void)
>+{
>+ double a[1024], b[1024];
>+ float c[1024], f[1024];
>+ int d[1024];
>+ long e[1024];
>+ int i;
>+ for (i = 0; i < 1024; i++)
>+ {
>+ asm volatile ("" : "+g" (i));
>+ a[i] = (i % 3) != 0 ? 15.0 : -5.0;
>+ b[i] = 2 * i;
>+ d[i] = (i % 3) ? 1023 - i : __INT_MAX__;
>+ }
>+ f1 (a, b, d, 1024);
>+ for (i = 0; i < 1024; i++)
>+ {
>+ asm volatile ("" : "+g" (i));
>+ if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0))
>+ abort ();
>+ a[i] = (i % 3) != 1 ? 15.0 : -5.0;
>+ b[i] = 3 * i;
>+ e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__;
>+ }
>+ f2 (a, b, e, 1024);
>+ for (i = 0; i < 1024; i++)
>+ {
>+ asm volatile ("" : "+g" (i));
>+ if (a[i] != ((i % 3) != 1 ? (1023 - i) * 3.0 : -5.0))
>+ abort ();
>+ c[i] = (i % 3) != 2 ? 15.0f : -5.0f;
>+ d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__;
>+ f[i] = 4 * i;
>+ }
>+ f3 (c, f, d, 1024);
>+ for (i = 0; i < 1024; i++)
>+ {
>+ asm volatile ("" : "+g" (i));
>+ if (c[i] != ((i % 3) != 2 ? (1023 - i) * 4.0f : -5.0f))
>+ abort ();
>+ c[i] = (i % 3) != 0 ? 15.0f : -5.0f;
>+ e[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__;
>+ f[i] = 5 * i;
>+ }
>+ f4 (c, f, e, 1024);
>+ for (i = 0; i < 1024; i++)
>+ {
>+ asm volatile ("" : "+g" (i));
>+ if (c[i] != ((i % 3) != 0 ? (1023 - i) * 5.0f : -5.0f))
>+ abort ();
>+ }
>+}
>--- gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c.jj 2018-12-14 18:01:19.297647800 +0100
>+++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c 2018-12-14 18:07:14.906862302 +0100
>@@ -0,0 +1,45 @@
>+/* PR tree-optimization/88464 */
>+/* { dg-do compile } */
>+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
>+/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 "vect" } } */
>+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
>+
>+__attribute__((noipa)) void
>+f1 (double * __restrict__ a, const double * __restrict__ b, const int * __restrict__ c, int n)
>+{
>+ int i;
>+#pragma GCC ivdep
>+ for (i = 0; i < n; ++i)
>+ if (b[i] > -2.0)
>+ a[c[i]] = b[i];
>+}
>+
>+__attribute__((noipa)) void
>+f2 (double * __restrict__ a, const double * __restrict__ b, const long * __restrict__ c, int n)
>+{
>+ int i;
>+#pragma GCC ivdep
>+ for (i = 0; i < n; ++i)
>+ if (b[i] > -2.0)
>+ a[c[i]] = b[i];
>+}
>+
>+__attribute__((noipa)) void
>+f3 (float * __restrict__ a, const float * __restrict__ b, const int * __restrict__ c, int n)
>+{
>+ int i;
>+#pragma GCC ivdep
>+ for (i = 0; i < n; ++i)
>+ if (b[i] > -2.0f)
>+ a[c[i]] = b[i];
>+}
>+
>+__attribute__((noipa)) void
>+f4 (float * __restrict__ a, const float * __restrict__ b, const long * __restrict__ c, int n)
>+{
>+ int i;
>+#pragma GCC ivdep
>+ for (i = 0; i < n; ++i)
>+ if (b[i] > -2.0f)
>+ a[c[i]] = b[i];
>+}
>--- gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c.jj 2018-12-14 18:03:03.100958998 +0100
>+++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c 2018-12-14 18:12:32.209699741 +0100
>@@ -0,0 +1,61 @@
>+/* PR tree-optimization/88464 */
>+/* { dg-do run { target { avx512f } } } */
>+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512" } */
>+
>+#include "avx512f-check.h"
>+
>+#include "avx512f-pr88464-3.c"
>+
>+static void
>+avx512f_test (void)
>+{
>+ double a[1024], b[1024];
>+ float c[1024], f[1024];
>+ int d[1024];
>+ long e[1024];
>+ int i;
>+ for (i = 0; i < 1024; i++)
>+ {
>+ asm volatile ("" : "+g" (i));
>+ a[i] = -5.0;
>+ b[i] = (i % 3) != 0 ? 2.0 * i : -5.0;
>+ d[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__;
>+ }
>+ f1 (a, b, d, 1024);
>+ for (i = 0; i < 1024; i++)
>+ {
>+ asm volatile ("" : "+g" (i));
>+ if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0))
>+ abort ();
>+ a[i] = -5.0;
>+ b[i] = (i % 3) != 1 ? 3.0 * i : -5.0;
>+ e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__;
>+ }
>+ f2 (a, b, e, 1024);
>+ for (i = 0; i < 1024; i++)
>+ {
>+ asm volatile ("" : "+g" (i));
>+ if (a[i] != ((i % 3) != 2 ? (1023 - i) * 3.0 : -5.0))
>+ abort ();
>+ c[i] = -5.0f;
>+ d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__;
>+ f[i] = (i % 3) != 2 ? 4.0f * i : -5.0f;
>+ }
>+ f3 (c, f, d, 1024);
>+ for (i = 0; i < 1024; i++)
>+ {
>+ asm volatile ("" : "+g" (i));
>+ if (c[i] != ((i % 3) != 1 ? (1023 - i) * 4.0f : -5.0f))
>+ abort ();
>+ c[i] = -5.0f;
>+ e[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__;
>+ f[i] = (i % 3) != 0 ? 5.0f * i : -5.0f;
>+ }
>+ f4 (c, f, e, 1024);
>+ for (i = 0; i < 1024; i++)
>+ {
>+ asm volatile ("" : "+g" (i));
>+ if (c[i] != ((i % 3) != 0 ? (1023 - i) * 5.0f : -5.0f))
>+ abort ();
>+ }
>+}
>--- gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c.jj 2018-12-13 18:01:13.913271190 +0100
>+++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c 2018-11-06 14:56:08.851174491 +0100
>@@ -1,35 +0,0 @@
>-/* PR tree-optimization/88462 */
>-/* { dg-do compile } */
>-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
>-/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 3 "vect" } } */
>-/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 3 "vect" } } */
>-
>-__attribute__((noipa)) void
>-f1 (double * __restrict__ a, const double * __restrict__ b, const int * __restrict__ c, int n)
>-{
>- int i;
>-#pragma GCC ivdep
>- for (i = 0; i < n; ++i)
>- if (a[i] > 10.0)
>- a[i] = b[c[i]];
>-}
>-
>-__attribute__((noipa)) void
>-f2 (double * __restrict__ a, const double * __restrict__ b, const long * __restrict__ c, int n)
>-{
>- int i;
>-#pragma GCC ivdep
>- for (i = 0; i < n; ++i)
>- if (a[i] > 10.0)
>- a[i] = b[c[i]];
>-}
>-
>-__attribute__((noipa)) void
>-f3 (float * __restrict__ a, const float * __restrict__ b, const int * __restrict__ c, int n)
>-{
>- int i;
>-#pragma GCC ivdep
>- for (i = 0; i < n; ++i)
>- if (a[i] > 10.0f)
>- a[i] = b[c[i]];
>-}
>--- gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c.jj 2018-12-13 18:01:13.914271174 +0100
>+++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c 2018-11-06 14:56:08.851174491 +0100
>@@ -1,51 +0,0 @@
>-/* PR tree-optimization/88462 */
>-/* { dg-do run { target { avx512f } } } */
>-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512" } */
>-
>-#include "avx512f-check.h"
>-
>-#include "avx512f-pr88462-1.c"
>-
>-static void
>-avx512f_test (void)
>-{
>- double a[1024], b[1024];
>- float c[1024], f[1024];
>- int d[1024];
>- long e[1024];
>- int i;
>- for (i = 0; i < 1024; i++)
>- {
>- asm volatile ("" : "+g" (i));
>- a[i] = (i % 3) != 0 ? 15.0 : -5.0;
>- b[i] = 2 * i;
>- d[i] = (i % 3) ? 1023 - i : __INT_MAX__;
>- }
>- f1 (a, b, d, 1024);
>- for (i = 0; i < 1024; i++)
>- {
>- asm volatile ("" : "+g" (i));
>- if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0))
>- abort ();
>- a[i] = (i % 3) != 1 ? 15.0 : -5.0;
>- b[i] = 3 * i;
>- e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__;
>- }
>- f2 (a, b, e, 1024);
>- for (i = 0; i < 1024; i++)
>- {
>- asm volatile ("" : "+g" (i));
>- if (a[i] != ((i % 3) != 1 ? (1023 - i) * 3.0 : -5.0))
>- abort ();
>- c[i] = (i % 3) != 2 ? 15.0f : -5.0f;
>- d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__;
>- f[i] = 4 * i;
>- }
>- f3 (c, f, d, 1024);
>- for (i = 0; i < 1024; i++)
>- {
>- asm volatile ("" : "+g" (i));
>- if (c[i] != ((i % 3) != 2 ? (1023 - i) * 4.0f : -5.0f))
>- abort ();
>- }
>-}
>
> Jakub