This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Fix up AVX512F masked gather vectorization, add support for AVX512F 512-bit masked scatter vectorization (PR tree-optimization/88464)


Hi!

In the previous patch I've unfortunately left out one important case from the
testcase and apparently it wasn't covered by anything else in the testsuite.
The 3 functions covered float and double gathers with indexes with the same
bitsize and WIDENING gather (double gather with int index), but didn't cover
NARROWING case (float gather with long index with -m64).  That was the only
case that tried to permute the mask, unfortunately that isn't really
supported and ICEs.  What works is VEC_UNPACK_{LO,HI}_EXPR on the
VECTOR_BOOLEAN_TYPE_P, that is what other spots in the vectorizer emit for
those.

I had to also fix up the x86 backend, which had in expansion of these
NARROWING gather builtins code cut&pasted from the 256-bit builtin,
unfortunately it wasn't adjusted for the fact that the 512-bit builtin uses
integral mask argument while the 256-bit one doesn't.  And even in the
256-bit one there was a bug, it relied on the mask and src arguments to be
always in the same register (which is actually what the vectorizer generates
for those right now, but it could do something else).

This patch fixes that and enables also masked x86 AVX512F 512-bit
scatter support.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

What is still unhandled (doesn't vectorize) is 128-bit or 256-bit scatters,
I bet the mask operand is vectorized using normal non-bool vectors, but the
instructions with AVX512VL actually need a mask register.  There are
instructions that can handle that, but let's defer that for later.

2018-12-14  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/88464
	* tree-vect-stmts.c (vect_build_gather_load_calls): For NARROWING
	and mask with integral masktype, don't try to permute mask vectors,
	instead emit VEC_UNPACK_{LO,HI}_EXPR.  Fix up NOP_EXPR operand.
	(vectorizable_store): Handle masked scatters with decl and integral
	mask type.
	(permute_vec_elements): Allow scalar_dest to be NULL.
	* config/i386/i386.c (ix86_expand_builtin)
	<case IX86_BUILTIN_GATHER3ALTDIV16SF>: Use lowpart_subreg for masks.
	<case IX86_BUILTIN_GATHER3ALTDIV8SF>: Don't assume mask and src have
	to be the same.

	* gcc.target/i386/avx512f-pr88462-1.c: Rename to ...
	* gcc.target/i386/avx512f-pr88464-1.c: ... this.  Fix up PR number.
	Expect 4 vectorized loops instead of 3.
	(f4): New function.
	* gcc.target/i386/avx512f-pr88462-2.c: Rename to ...
	* gcc.target/i386/avx512f-pr88464-2.c: ... this.  Fix up PR number
	and #include.
	(avx512f_test): Prepare arguments for f4 and check the results.
	* gcc.target/i386/avx512f-pr88464-3.c: New test.
	* gcc.target/i386/avx512f-pr88464-4.c: New test.

--- gcc/tree-vect-stmts.c.jj	2018-12-13 18:01:13.000000000 +0100
+++ gcc/tree-vect-stmts.c	2018-12-14 17:10:42.079054458 +0100
@@ -2655,6 +2655,7 @@ vect_build_gather_load_calls (stmt_vec_i
   if (mask && TREE_CODE (masktype) == INTEGER_TYPE)
     masktype = build_same_sized_truth_vector_type (srctype);
 
+  tree mask_halftype = masktype;
   tree perm_mask = NULL_TREE;
   tree mask_perm_mask = NULL_TREE;
   if (known_eq (nunits, gather_off_nunits))
@@ -2690,13 +2691,16 @@ vect_build_gather_load_calls (stmt_vec_i
 
       ncopies *= 2;
 
-      if (mask)
+      if (mask && masktype == real_masktype)
 	{
 	  for (int i = 0; i < count; ++i)
 	    sel[i] = i | (count / 2);
 	  indices.new_vector (sel, 2, count);
 	  mask_perm_mask = vect_gen_perm_mask_checked (masktype, indices);
 	}
+      else if (mask)
+	mask_halftype
+	  = build_same_sized_truth_vector_type (gs_info->offset_vectype);
     }
   else
     gcc_unreachable ();
@@ -2761,7 +2765,7 @@ vect_build_gather_load_calls (stmt_vec_i
 	    {
 	      if (j == 0)
 		vec_mask = vect_get_vec_def_for_operand (mask, stmt_info);
-	      else
+	      else if (modifier != NARROW || (j & 1) == 0)
 		vec_mask = vect_get_vec_def_for_stmt_copy (loop_vinfo,
 							   vec_mask);
 
@@ -2779,17 +2783,27 @@ vect_build_gather_load_calls (stmt_vec_i
 		  mask_op = var;
 		}
 	    }
+	  if (modifier == NARROW && masktype != real_masktype)
+	    {
+	      var = vect_get_new_ssa_name (mask_halftype, vect_simple_var);
+	      gassign *new_stmt
+		= gimple_build_assign (var, (j & 1) ? VEC_UNPACK_HI_EXPR
+						    : VEC_UNPACK_LO_EXPR,
+				       mask_op);
+	      vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
+	      mask_op = var;
+	    }
 	  src_op = mask_op;
 	}
 
       tree mask_arg = mask_op;
       if (masktype != real_masktype)
 	{
-	  tree utype;
-	  if (TYPE_MODE (real_masktype) == TYPE_MODE (masktype))
+	  tree utype, optype = TREE_TYPE (mask_op);
+	  if (TYPE_MODE (real_masktype) == TYPE_MODE (optype))
 	    utype = real_masktype;
 	  else
-	    utype = lang_hooks.types.type_for_mode (TYPE_MODE (masktype), 1);
+	    utype = lang_hooks.types.type_for_mode (TYPE_MODE (optype), 1);
 	  var = vect_get_new_ssa_name (utype, vect_scalar_var);
 	  mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_op);
 	  gassign *new_stmt
@@ -2801,7 +2815,7 @@ vect_build_gather_load_calls (stmt_vec_i
 	      gcc_assert (TYPE_PRECISION (utype)
 			  <= TYPE_PRECISION (real_masktype));
 	      var = vect_get_new_ssa_name (real_masktype, vect_scalar_var);
-	      new_stmt = gimple_build_assign (var, NOP_EXPR, utype);
+	      new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg);
 	      vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
 	      mask_arg = var;
 	    }
@@ -6361,7 +6375,8 @@ vectorizable_store (stmt_vec_info stmt_i
 	    return false;
 	}
       else if (memory_access_type != VMAT_LOAD_STORE_LANES
-	       && (memory_access_type != VMAT_GATHER_SCATTER || gs_info.decl))
+	       && (memory_access_type != VMAT_GATHER_SCATTER
+		   || (gs_info.decl && !VECTOR_BOOLEAN_TYPE_P (mask_vectype))))
 	{
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -6419,7 +6434,9 @@ vectorizable_store (stmt_vec_info stmt_i
       tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE, src;
       tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
       tree rettype, srctype, ptrtype, idxtype, masktype, scaletype;
-      tree ptr, mask, var, scale, perm_mask = NULL_TREE;
+      tree ptr, var, scale, vec_mask;
+      tree mask_arg = NULL_TREE, mask_op = NULL_TREE, perm_mask = NULL_TREE;
+      tree mask_halfvectype = mask_vectype;
       edge pe = loop_preheader_edge (loop);
       gimple_seq seq;
       basic_block new_bb;
@@ -6460,6 +6477,10 @@ vectorizable_store (stmt_vec_info stmt_i
 	  perm_mask = vect_gen_perm_mask_checked (vectype, indices);
 	  gcc_assert (perm_mask != NULL_TREE);
 	  ncopies *= 2;
+
+	  if (mask)
+	    mask_halfvectype
+	      = build_same_sized_truth_vector_type (gs_info.offset_vectype);
 	}
       else
 	gcc_unreachable ();
@@ -6482,10 +6503,11 @@ vectorizable_store (stmt_vec_info stmt_i
 	  gcc_assert (!new_bb);
 	}
 
-      /* Currently we support only unconditional scatter stores,
-	 so mask should be all ones.  */
-      mask = build_int_cst (masktype, -1);
-      mask = vect_init_vector (stmt_info, mask, masktype, NULL);
+      if (mask == NULL_TREE)
+	{
+	  mask_arg = build_int_cst (masktype, -1);
+	  mask_arg = vect_init_vector (stmt_info, mask_arg, masktype, NULL);
+	}
 
       scale = build_int_cst (scaletype, gs_info.scale);
 
@@ -6494,36 +6516,46 @@ vectorizable_store (stmt_vec_info stmt_i
 	{
 	  if (j == 0)
 	    {
-	      src = vec_oprnd1
-		= vect_get_vec_def_for_operand (op, stmt_info);
-	      op = vec_oprnd0
-		= vect_get_vec_def_for_operand (gs_info.offset, stmt_info);
+	      src = vec_oprnd1 = vect_get_vec_def_for_operand (op, stmt_info);
+	      op = vec_oprnd0 = vect_get_vec_def_for_operand (gs_info.offset,
+							      stmt_info);
+	      if (mask)
+		mask_op = vec_mask = vect_get_vec_def_for_operand (mask,
+								   stmt_info);
 	    }
 	  else if (modifier != NONE && (j & 1))
 	    {
 	      if (modifier == WIDEN)
 		{
-		  src = vec_oprnd1
-		    = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd1);
+		  src
+		    = vec_oprnd1 = vect_get_vec_def_for_stmt_copy (vinfo,
+								   vec_oprnd1);
 		  op = permute_vec_elements (vec_oprnd0, vec_oprnd0, perm_mask,
 					     stmt_info, gsi);
+		  if (mask)
+		    mask_op
+		      = vec_mask = vect_get_vec_def_for_stmt_copy (vinfo,
+								   vec_mask);
 		}
 	      else if (modifier == NARROW)
 		{
 		  src = permute_vec_elements (vec_oprnd1, vec_oprnd1, perm_mask,
 					      stmt_info, gsi);
-		  op = vec_oprnd0
-		    = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd0);
+		  op = vec_oprnd0 = vect_get_vec_def_for_stmt_copy (vinfo,
+								    vec_oprnd0);
 		}
 	      else
 		gcc_unreachable ();
 	    }
 	  else
 	    {
-	      src = vec_oprnd1
-		= vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd1);
-	      op = vec_oprnd0
-		= vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd0);
+	      src = vec_oprnd1 = vect_get_vec_def_for_stmt_copy (vinfo,
+								 vec_oprnd1);
+	      op = vec_oprnd0 = vect_get_vec_def_for_stmt_copy (vinfo,
+								vec_oprnd0);
+	      if (mask)
+		mask_op = vec_mask = vect_get_vec_def_for_stmt_copy (vinfo,
+								     vec_mask);
 	    }
 
 	  if (!useless_type_conversion_p (srctype, TREE_TYPE (src)))
@@ -6550,8 +6582,45 @@ vectorizable_store (stmt_vec_info stmt_i
 	      op = var;
 	    }
 
+	  if (mask)
+	    {
+	      tree utype;
+	      mask_arg = mask_op;
+	      if (modifier == NARROW)
+		{
+		  var = vect_get_new_ssa_name (mask_halfvectype,
+					       vect_simple_var);
+		  gassign *new_stmt
+		    = gimple_build_assign (var, (j & 1) ? VEC_UNPACK_HI_EXPR
+							: VEC_UNPACK_LO_EXPR,
+					   mask_op);
+		  vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
+		  mask_arg = var;
+		}
+	      tree optype = TREE_TYPE (mask_arg);
+	      if (TYPE_MODE (masktype) == TYPE_MODE (optype))
+		utype = masktype;
+	      else
+		utype = lang_hooks.types.type_for_mode (TYPE_MODE (optype), 1);
+	      var = vect_get_new_ssa_name (utype, vect_scalar_var);
+	      mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_arg);
+	      gassign *new_stmt
+		= gimple_build_assign (var, VIEW_CONVERT_EXPR, mask_arg);
+	      vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
+	      mask_arg = var;
+	      if (!useless_type_conversion_p (masktype, utype))
+		{
+		  gcc_assert (TYPE_PRECISION (utype)
+			      <= TYPE_PRECISION (masktype));
+		  var = vect_get_new_ssa_name (masktype, vect_scalar_var);
+		  new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg);
+		  vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
+		  mask_arg = var;
+		}
+	    }
+
 	  gcall *new_stmt
-	    = gimple_build_call (gs_info.decl, 5, ptr, mask, op, src, scale);
+	    = gimple_build_call (gs_info.decl, 5, ptr, mask_arg, op, src, scale);
 	  stmt_vec_info new_stmt_info
 	    = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
 
@@ -7284,7 +7353,7 @@ permute_vec_elements (tree x, tree y, tr
   gimple *perm_stmt;
 
   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
-  if (TREE_CODE (scalar_dest) == SSA_NAME)
+  if (scalar_dest && TREE_CODE (scalar_dest) == SSA_NAME)
     perm_dest = vect_create_destination_var (scalar_dest, vectype);
   else
     perm_dest = vect_get_new_vect_var (vectype, vect_simple_var, NULL);
--- gcc/config/i386/i386.c.jj	2018-12-13 13:45:11.000000000 +0100
+++ gcc/config/i386/i386.c	2018-12-14 17:34:11.131135056 +0100
@@ -37605,13 +37605,7 @@ rdseed_step:
 	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
 	  emit_insn (gen (half, op0));
 	  op0 = half;
-	  if (GET_MODE (op3) != VOIDmode)
-	    {
-	      if (!nonimmediate_operand (op3, GET_MODE (op3)))
-		op3 = copy_to_mode_reg (GET_MODE (op3), op3);
-	      emit_insn (gen (half, op3));
-	      op3 = half;
-	    }
+	  op3 = lowpart_subreg (QImode, op3, HImode);
 	  break;
 	case IX86_BUILTIN_GATHER3ALTDIV8SF:
 	case IX86_BUILTIN_GATHER3ALTDIV8SI:
@@ -37628,6 +37622,7 @@ rdseed_step:
 	  op0 = half;
 	  if (GET_MODE (op3) != VOIDmode)
 	    {
+	      half = gen_reg_rtx (mode0);
 	      if (!nonimmediate_operand (op3, GET_MODE (op3)))
 		op3 = copy_to_mode_reg (GET_MODE (op3), op3);
 	      emit_insn (gen (half, op3));
--- gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c.jj	2018-12-14 16:34:55.361955571 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c	2018-12-14 18:07:25.694686784 +0100
@@ -0,0 +1,45 @@
+/* PR tree-optimization/88464 */
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+
+__attribute__((noipa)) void
+f1 (double * __restrict__ a, const double * __restrict__ b, const int * __restrict__ c, int n)
+{
+  int i;
+#pragma GCC ivdep
+  for (i = 0; i < n; ++i)
+    if (a[i] > 10.0)
+      a[i] = b[c[i]];
+}
+
+__attribute__((noipa)) void
+f2 (double * __restrict__ a, const double * __restrict__ b, const long * __restrict__ c, int n)
+{
+  int i;
+#pragma GCC ivdep
+  for (i = 0; i < n; ++i)
+    if (a[i] > 10.0)
+      a[i] = b[c[i]];
+}
+
+__attribute__((noipa)) void
+f3 (float * __restrict__ a, const float * __restrict__ b, const int * __restrict__ c, int n)
+{
+  int i;
+#pragma GCC ivdep
+  for (i = 0; i < n; ++i)
+    if (a[i] > 10.0f)
+      a[i] = b[c[i]];
+}
+
+__attribute__((noipa)) void
+f4 (float * __restrict__ a, const float * __restrict__ b, const long * __restrict__ c, int n)
+{
+  int i;
+#pragma GCC ivdep
+  for (i = 0; i < n; ++i)
+    if (a[i] > 10.0f)
+      a[i] = b[c[i]];
+}
--- gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c.jj	2018-12-14 16:35:00.681869029 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c	2018-12-14 17:43:40.294876267 +0100
@@ -0,0 +1,61 @@
+/* PR tree-optimization/88464 */
+/* { dg-do run { target { avx512f } } } */
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512" } */
+
+#include "avx512f-check.h"
+
+#include "avx512f-pr88464-1.c"
+
+static void
+avx512f_test (void)
+{
+  double a[1024], b[1024];
+  float c[1024], f[1024];
+  int d[1024];
+  long e[1024];
+  int i;
+  for (i = 0; i < 1024; i++)
+    {
+      asm volatile ("" : "+g" (i));
+      a[i] = (i % 3) != 0 ? 15.0 : -5.0;
+      b[i] = 2 * i;
+      d[i] = (i % 3) ? 1023 - i : __INT_MAX__;
+    }
+  f1 (a, b, d, 1024);
+  for (i = 0; i < 1024; i++)
+    {
+      asm volatile ("" : "+g" (i));
+      if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0))
+	abort ();
+      a[i] = (i % 3) != 1 ? 15.0 : -5.0;
+      b[i] = 3 * i;
+      e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__;
+    }
+  f2 (a, b, e, 1024);
+  for (i = 0; i < 1024; i++)
+    {
+      asm volatile ("" : "+g" (i));
+      if (a[i] != ((i % 3) != 1 ? (1023 - i) * 3.0 : -5.0))
+	abort ();
+      c[i] = (i % 3) != 2 ? 15.0f : -5.0f;
+      d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__;
+      f[i] = 4 * i;
+    }
+  f3 (c, f, d, 1024);
+  for (i = 0; i < 1024; i++)
+    {
+      asm volatile ("" : "+g" (i));
+      if (c[i] != ((i % 3) != 2 ? (1023 - i) * 4.0f : -5.0f))
+	abort ();
+      c[i] = (i % 3) != 0 ? 15.0f : -5.0f;
+      e[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__;
+      f[i] = 5 * i;
+    }
+  f4 (c, f, e, 1024);
+  for (i = 0; i < 1024; i++)
+    {
+      asm volatile ("" : "+g" (i));
+      if (c[i] != ((i % 3) != 0 ? (1023 - i) * 5.0f : -5.0f))
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c.jj	2018-12-14 18:01:19.297647800 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c	2018-12-14 18:07:14.906862302 +0100
@@ -0,0 +1,45 @@
+/* PR tree-optimization/88464 */
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+
+__attribute__((noipa)) void
+f1 (double * __restrict__ a, const double * __restrict__ b, const int * __restrict__ c, int n)
+{
+  int i;
+#pragma GCC ivdep
+  for (i = 0; i < n; ++i)
+    if (b[i] > -2.0)
+      a[c[i]] = b[i];
+}
+
+__attribute__((noipa)) void
+f2 (double * __restrict__ a, const double * __restrict__ b, const long * __restrict__ c, int n)
+{
+  int i;
+#pragma GCC ivdep
+  for (i = 0; i < n; ++i)
+    if (b[i] > -2.0)
+      a[c[i]] = b[i];
+}
+
+__attribute__((noipa)) void
+f3 (float * __restrict__ a, const float * __restrict__ b, const int * __restrict__ c, int n)
+{
+  int i;
+#pragma GCC ivdep
+  for (i = 0; i < n; ++i)
+    if (b[i] > -2.0f)
+      a[c[i]] = b[i];
+}
+
+__attribute__((noipa)) void
+f4 (float * __restrict__ a, const float * __restrict__ b, const long * __restrict__ c, int n)
+{
+  int i;
+#pragma GCC ivdep
+  for (i = 0; i < n; ++i)
+    if (b[i] > -2.0f)
+      a[c[i]] = b[i];
+}
--- gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c.jj	2018-12-14 18:03:03.100958998 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c	2018-12-14 18:12:32.209699741 +0100
@@ -0,0 +1,61 @@
+/* PR tree-optimization/88464 */
+/* { dg-do run { target { avx512f } } } */
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512" } */
+
+#include "avx512f-check.h"
+
+#include "avx512f-pr88464-3.c"
+
+static void
+avx512f_test (void)
+{
+  double a[1024], b[1024];
+  float c[1024], f[1024];
+  int d[1024];
+  long e[1024];
+  int i;
+  for (i = 0; i < 1024; i++)
+    {
+      asm volatile ("" : "+g" (i));
+      a[i] = -5.0;
+      b[i] = (i % 3) != 0 ? 2.0 * i : -5.0;
+      d[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__;
+    }
+  f1 (a, b, d, 1024);
+  for (i = 0; i < 1024; i++)
+    {
+      asm volatile ("" : "+g" (i));
+      if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0))
+	abort ();
+      a[i] = -5.0;
+      b[i] = (i % 3) != 1 ? 3.0 * i : -5.0;
+      e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__;
+    }
+  f2 (a, b, e, 1024);
+  for (i = 0; i < 1024; i++)
+    {
+      asm volatile ("" : "+g" (i));
+      if (a[i] != ((i % 3) != 2 ? (1023 - i) * 3.0 : -5.0))
+	abort ();
+      c[i] = -5.0f;
+      d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__;
+      f[i] = (i % 3) != 2 ? 4.0f * i : -5.0f;
+    }
+  f3 (c, f, d, 1024);
+  for (i = 0; i < 1024; i++)
+    {
+      asm volatile ("" : "+g" (i));
+      if (c[i] != ((i % 3) != 1 ? (1023 - i) * 4.0f : -5.0f))
+	abort ();
+      c[i] = -5.0f;
+      e[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__;
+      f[i] = (i % 3) != 0 ? 5.0f * i : -5.0f;
+    }
+  f4 (c, f, e, 1024);
+  for (i = 0; i < 1024; i++)
+    {
+      asm volatile ("" : "+g" (i));
+      if (c[i] != ((i % 3) != 0 ? (1023 - i) * 5.0f : -5.0f))
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c.jj	2018-12-13 18:01:13.913271190 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c	2018-11-06 14:56:08.851174491 +0100
@@ -1,35 +0,0 @@
-/* PR tree-optimization/88462 */
-/* { dg-do compile } */
-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
-/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 3 "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 3 "vect" } } */
-
-__attribute__((noipa)) void
-f1 (double * __restrict__ a, const double * __restrict__ b, const int * __restrict__ c, int n)
-{
-  int i;
-#pragma GCC ivdep
-  for (i = 0; i < n; ++i)
-    if (a[i] > 10.0)
-      a[i] = b[c[i]];
-}
-
-__attribute__((noipa)) void
-f2 (double * __restrict__ a, const double * __restrict__ b, const long * __restrict__ c, int n)
-{
-  int i;
-#pragma GCC ivdep
-  for (i = 0; i < n; ++i)
-    if (a[i] > 10.0)
-      a[i] = b[c[i]];
-}
-
-__attribute__((noipa)) void
-f3 (float * __restrict__ a, const float * __restrict__ b, const int * __restrict__ c, int n)
-{
-  int i;
-#pragma GCC ivdep
-  for (i = 0; i < n; ++i)
-    if (a[i] > 10.0f)
-      a[i] = b[c[i]];
-}
--- gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c.jj	2018-12-13 18:01:13.914271174 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c	2018-11-06 14:56:08.851174491 +0100
@@ -1,51 +0,0 @@
-/* PR tree-optimization/88462 */
-/* { dg-do run { target { avx512f } } } */
-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512" } */
-
-#include "avx512f-check.h"
-
-#include "avx512f-pr88462-1.c"
-
-static void
-avx512f_test (void)
-{
-  double a[1024], b[1024];
-  float c[1024], f[1024];
-  int d[1024];
-  long e[1024];
-  int i;
-  for (i = 0; i < 1024; i++)
-    {
-      asm volatile ("" : "+g" (i));
-      a[i] = (i % 3) != 0 ? 15.0 : -5.0;
-      b[i] = 2 * i;
-      d[i] = (i % 3) ? 1023 - i : __INT_MAX__;
-    }
-  f1 (a, b, d, 1024);
-  for (i = 0; i < 1024; i++)
-    {
-      asm volatile ("" : "+g" (i));
-      if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0))
-	abort ();
-      a[i] = (i % 3) != 1 ? 15.0 : -5.0;
-      b[i] = 3 * i;
-      e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__;
-    }
-  f2 (a, b, e, 1024);
-  for (i = 0; i < 1024; i++)
-    {
-      asm volatile ("" : "+g" (i));
-      if (a[i] != ((i % 3) != 1 ? (1023 - i) * 3.0 : -5.0))
-	abort ();
-      c[i] = (i % 3) != 2 ? 15.0f : -5.0f;
-      d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__;
-      f[i] = 4 * i;
-    }
-  f3 (c, f, d, 1024);
-  for (i = 0; i < 1024; i++)
-    {
-      asm volatile ("" : "+g" (i));
-      if (c[i] != ((i % 3) != 2 ? (1023 - i) * 4.0f : -5.0f))
-	abort ();
-    }
-}

	Jakub


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]