[PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

Kirill Yukhin kirill.yukhin@gmail.com
Mon Dec 30 11:00:00 GMT 2013


Hello Uroš, Jakub,
On 22 Dec 11:47, Uros Bizjak wrote:
> The x86 part is OK for mainline. You will also need approval from the
> middle-end reviewer for tree-* parts.

Thanks, I'm testing the patch at the bottom (in the agreed volume;
bootstrap has passed so far).

If there are no further comments, I'll check it in to main trunk tomorrow
(Moscow time) once testing is finished.

Jakub, I've filed: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=59617
But I'm not sure the fix (once one is found) can go to main trunk, since
it is a performance issue.
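
For reference, the kind of loop affected is roughly the following (an
illustrative sketch, not from the patch; names are made up).  With
-mavx512f the vectorizer currently won't use the new gather builtins
here, because the gather's mask type differs from the source type:

  /* Sketch: an indirect-load loop of the sort PR59617 covers.  Once the
     vectorizer handles (mask_type != src_type), this should become a
     512-bit vgatherdpd.  */
  void
  do_gather (double *restrict dst, const double *src, const int *idx, int n)
  {
    int i;
    for (i = 0; i < n; i++)
      dst[i] = src[idx[i]];
  }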

gcc/
2013-12-30  Alexander Ivchenko  <alexander.ivchenko@intel.com>
	    Maxim Kuznetsov  <maxim.kuznetsov@intel.com>
	    Sergey Lega  <sergey.s.lega@intel.com>
	    Anna Tikhonova  <anna.tikhonova@intel.com>
	    Ilya Tocar  <ilya.tocar@intel.com>
	    Andrey Turetskiy  <andrey.turetskiy@intel.com>
	    Ilya Verbin  <ilya.verbin@intel.com>
	    Kirill Yukhin  <kirill.yukhin@intel.com>
	    Michael Zolotukhin  <michael.v.zolotukhin@intel.com>

	* config/i386/i386.c (MAX_CLASSES): Increase number of classes.
	(classify_argument): Extend for 512-bit vectors.
	(construct_container): Ditto.
	(function_arg_advance_32): Ditto.
	(function_arg_advance_64): Ditto.
	(function_arg_32): Ditto.
	(function_arg_64): Ditto.
	(function_value_32): Ditto.
	(return_in_memory_32): Ditto.
	(ix86_gimplify_va_arg): Ditto.
	(standard_sse_constant_p): Ditto.
	(standard_sse_constant_opcode): Ditto.
	(ix86_expand_vector_convert_uns_vsivsf): Ditto.
	(ix86_build_const_vector): Ditto.
	(ix86_build_signbit_mask): Ditto.
	(ix86_expand_sse_cmp): Extend for AVX-512.
	(ix86_expand_sse_movcc): Ditto.
	(ix86_expand_int_vcond): Ditto.
	(ix86_expand_vec_perm): Ditto.
	(ix86_expand_sse_unpack): Ditto.
	(ix86_constant_alignment): Ditto.
	(ix86_builtin_vectorized_function): Ditto.
	(ix86_vectorize_builtin_gather): Ditto.
	(avx_vpermilp_parallel): Ditto.
	(ix86_rtx_costs): Ditto.
	(ix86_expand_vector_init_duplicate): Ditto.
	(ix86_expand_vector_init_concat): Ditto.
	(ix86_expand_vector_init_general): Ditto.
	(ix86_expand_vector_extract): Ditto.
	(emit_reduc_half): Ditto.
	(ix86_vector_mode_supported_p): Ditto.
	(ix86_emit_swdivsf): Ditto.
	(ix86_emit_swsqrtsf): Ditto.
	(expand_vec_perm_1): Ditto.
	(ix86_vectorize_vec_perm_const_ok): Ditto.
	(ix86_expand_mul_widen_evenodd): Ditto.
	(ix86_expand_sse2_mulvxdi3): Ditto.
	(ix86_preferred_simd_mode): Ditto.
	(ix86_autovectorize_vector_sizes): Ditto.
	(ix86_expand_vec_perm_vpermi2): New.
	(ix86_vector_duplicate_value): Ditto.
	(IX86_BUILTIN_SQRTPD512, IX86_BUILTIN_EXP2PS, IX86_BUILTIN_SQRTPS_NR512,
	IX86_BUILTIN_GATHER3ALTDIV16SF, IX86_BUILTIN_GATHER3ALTDIV16SI,
	IX86_BUILTIN_GATHER3ALTSIV8DF, IX86_BUILTIN_GATHER3ALTSIV8DI,
	IX86_BUILTIN_GATHER3DIV16SF, IX86_BUILTIN_GATHER3DIV16SI,
	IX86_BUILTIN_GATHER3DIV8DF, IX86_BUILTIN_GATHER3DIV8DI,
	IX86_BUILTIN_GATHER3SIV16SF, IX86_BUILTIN_GATHER3SIV16SI,
	IX86_BUILTIN_GATHER3SIV8DF, IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512,
	IX86_BUILTIN_CPYSGNPS512, IX86_BUILTIN_CPYSGNPD512,
	IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512,
	IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512): Ditto.
	* config/i386/sse.md (*mov<mode>_internal): Disable SSE typeless
	stores for vectors wider than 128 bits (AVX*).
	(<sse>_storeu<ssemodesuffix><avxsizesuffix>): Ditto.
	(<sse2_avx_avx512f>_storedqu<mode>): Extend for AVX-512; disable
	SSE typeless stores for vectors wider than 128 bits (AVX*).
	(fixuns_trunc<mode><sseintvecmodelower>2): Extend for AVX-512.
	(vec_pack_ufix_trunc_<mode>): Ditto.
	(vec_unpacku_float_hi_v16si): New.
	* tree-vect-stmts.c (vectorizable_load): Support AVX-512 gathers.
	* tree-vectorizer.h (MAX_VECTORIZATION_FACTOR): Extend for 512-bit
	vectors.

testsuite/
2013-12-30  Alexander Ivchenko  <alexander.ivchenko@intel.com>
	Maxim Kuznetsov  <maxim.kuznetsov@intel.com>
	Sergey Lega  <sergey.s.lega@intel.com>
	Anna Tikhonova  <anna.tikhonova@intel.com>
	Ilya Tocar  <ilya.tocar@intel.com>
	Andrey Turetskiy  <andrey.turetskiy@intel.com>
	Ilya Verbin  <ilya.verbin@intel.com>
	Kirill Yukhin  <kirill.yukhin@intel.com>
	Michael Zolotukhin  <michael.v.zolotukhin@intel.com>

	* gcc.target/i386/pr49002-2.c: Allow vmovapd generation.
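
For illustration (a sketch, not part of the patch): with the hooks above
extended, ix86_preferred_simd_mode now returns V16SImode for SImode
elements under AVX-512F, so a simple loop like this should be vectorized
with zmm registers at -O2 -mavx512f:

  /* Sketch: expected to use 512-bit vectors (V16SImode) with this patch
     applied and -mavx512f enabled.  */
  void
  vadd (int *restrict a, const int *b, const int *c, int n)
  {
    int i;
    for (i = 0; i < n; i++)
      a[i] = b[i] + c[i];
  }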

--
Thanks, K

---
 gcc/config/i386/i386.c                    | 673 ++++++++++++++++++++++++++----
 gcc/config/i386/sse.md                    | 115 +++--
 gcc/testsuite/gcc.target/i386/pr49002-2.c |   2 +-
 gcc/tree-vect-stmts.c                     |  34 +-
 gcc/tree-vectorizer.h                     |   4 +-
 5 files changed, 717 insertions(+), 111 deletions(-)
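
A note on the ix86_expand_sse_cmp change below: under AVX-512F a 64-byte
vector comparison produces a scalar mask with one bit per element, so the
result mode is an integer mode rather than the operand vector mode.  The
mode selection boils down to this (a sketch; cmp_result_mode is a
hypothetical helper, the patch does it inline):

  /* Sketch: for 64-byte operands the comparison result is an integer
     mask, e.g. V16SImode (16 elements) -> HImode (16 bits).  */
  static enum machine_mode
  cmp_result_mode (enum machine_mode ops_mode)
  {
    if (GET_MODE_SIZE (ops_mode) == 64)
      return mode_for_size (GET_MODE_NUNITS (ops_mode), MODE_INT, 0);
    return ops_mode;
  }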

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 2fc9b80..b0002ff 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2308,7 +2308,7 @@ enum x86_64_reg_class
     X86_64_MEMORY_CLASS
   };
 
-#define MAX_CLASSES 4
+#define MAX_CLASSES 8
 
 /* Table of constants used by fldpi, fldln2, etc....  */
 static REAL_VALUE_TYPE ext_80387_constants_table [5];
@@ -6242,7 +6242,7 @@ merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
    sized containers, classes[0] will be NO_CLASS and 1 is returned.
 
    BIT_OFFSET is used internally for handling records and specifies offset
-   of the offset in bits modulo 256 to avoid overflow cases.
+   of the offset in bits modulo 512 to avoid overflow cases.
 
    See the x86-64 PS ABI for details.
 */
@@ -6342,7 +6342,7 @@ classify_argument (enum machine_mode mode, const_tree type,
 		      num = classify_argument (TYPE_MODE (type), type,
 					       subclasses,
 					       (int_bit_position (field)
-						+ bit_offset) % 256);
+						+ bit_offset) % 512);
 		      if (!num)
 			return 0;
 		      pos = (int_bit_position (field)
@@ -6592,6 +6592,21 @@ classify_argument (enum machine_mode mode, const_tree type,
       classes[2] = X86_64_SSEUP_CLASS;
       classes[3] = X86_64_SSEUP_CLASS;
       return 4;
+    case V8DFmode:
+    case V16SFmode:
+    case V8DImode:
+    case V16SImode:
+    case V32HImode:
+    case V64QImode:
+      classes[0] = X86_64_SSE_CLASS;
+      classes[1] = X86_64_SSEUP_CLASS;
+      classes[2] = X86_64_SSEUP_CLASS;
+      classes[3] = X86_64_SSEUP_CLASS;
+      classes[4] = X86_64_SSEUP_CLASS;
+      classes[5] = X86_64_SSEUP_CLASS;
+      classes[6] = X86_64_SSEUP_CLASS;
+      classes[7] = X86_64_SSEUP_CLASS;
+      return 8;
     case V4SFmode:
     case V4SImode:
     case V16QImode:
@@ -6777,6 +6792,18 @@ construct_container (enum machine_mode mode, enum machine_mode orig_mode,
       && mode != BLKmode)
     return gen_reg_or_parallel (mode, orig_mode,
 				SSE_REGNO (sse_regno));
+  if (n == 8
+      && regclass[0] == X86_64_SSE_CLASS
+      && regclass[1] == X86_64_SSEUP_CLASS
+      && regclass[2] == X86_64_SSEUP_CLASS
+      && regclass[3] == X86_64_SSEUP_CLASS
+      && regclass[4] == X86_64_SSEUP_CLASS
+      && regclass[5] == X86_64_SSEUP_CLASS
+      && regclass[6] == X86_64_SSEUP_CLASS
+      && regclass[7] == X86_64_SSEUP_CLASS
+      && mode != BLKmode)
+    return gen_reg_or_parallel (mode, orig_mode,
+				SSE_REGNO (sse_regno));
   if (n == 2
       && regclass[0] == X86_64_X87_CLASS
       && regclass[1] == X86_64_X87UP_CLASS)
@@ -6858,6 +6885,18 @@ construct_container (enum machine_mode mode, enum machine_mode orig_mode,
 		tmpmode = OImode;
 		i += 3;
 		break;
+	      case 8:
+		gcc_assert (i == 0
+			    && regclass[1] == X86_64_SSEUP_CLASS
+			    && regclass[2] == X86_64_SSEUP_CLASS
+			    && regclass[3] == X86_64_SSEUP_CLASS
+			    && regclass[4] == X86_64_SSEUP_CLASS
+			    && regclass[5] == X86_64_SSEUP_CLASS
+			    && regclass[6] == X86_64_SSEUP_CLASS
+			    && regclass[7] == X86_64_SSEUP_CLASS);
+		tmpmode = XImode;
+		i += 7;
+		break;
 	      default:
 		gcc_unreachable ();
 	      }
@@ -6931,6 +6970,12 @@ function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
 
     case V8SFmode:
     case V8SImode:
+    case V64QImode:
+    case V32HImode:
+    case V16SImode:
+    case V8DImode:
+    case V16SFmode:
+    case V8DFmode:
     case V32QImode:
     case V16HImode:
     case V4DFmode:
@@ -6982,8 +7027,9 @@ function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
 {
   int int_nregs, sse_nregs;
 
-  /* Unnamed 256bit vector mode parameters are passed on stack.  */
-  if (!named && VALID_AVX256_REG_MODE (mode))
+  /* Unnamed 512- and 256-bit vector mode parameters are passed on stack.  */
+  if (!named && (VALID_AVX512F_REG_MODE (mode)
+		 || VALID_AVX256_REG_MODE (mode)))
     return;
 
   if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
@@ -7134,9 +7180,16 @@ function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
       break;
 
     case OImode:
-      /* OImode shouldn't be used directly.  */
+    case XImode:
+      /* OImode and XImode shouldn't be used directly.  */
       gcc_unreachable ();
 
+    case V64QImode:
+    case V32HImode:
+    case V16SImode:
+    case V8DImode:
+    case V16SFmode:
+    case V8DFmode:
     case V8SFmode:
     case V8SImode:
     case V32QImode:
@@ -7199,7 +7252,13 @@ function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
     case V16HImode:
     case V4DFmode:
     case V4DImode:
-      /* Unnamed 256bit vector mode parameters are passed on stack.  */
+    case V16SFmode:
+    case V16SImode:
+    case V64QImode:
+    case V32HImode:
+    case V8DFmode:
+    case V8DImode:
+      /* Unnamed 256- and 512-bit vector mode parameters are passed on stack.  */
       if (!named)
 	return NULL;
       break;
@@ -7602,6 +7661,10 @@ function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
   else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
     regno = FIRST_SSE_REG;
 
+  /* 64-byte vector modes in %zmm0.  */
+  else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
+    regno = FIRST_SSE_REG;
+
   /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387).  */
   else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
     regno = FIRST_FLOAT_REG;
@@ -7809,6 +7872,10 @@ return_in_memory_32 (const_tree type, enum machine_mode mode)
       /* AVX values are returned in YMM0, except when it doesn't exist.  */
       if (size == 32)
 	return !TARGET_AVX;
+
+      /* AVX512F values are returned in ZMM0, except when it doesn't exist.  */
+      if (size == 64)
+	return !TARGET_AVX512F;
     }
 
   if (mode == XFmode)
@@ -8345,7 +8412,13 @@ ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
     case V16HImode:
     case V4DFmode:
     case V4DImode:
-      /* Unnamed 256bit vector mode parameters are passed on stack.  */
+    case V16SFmode:
+    case V16SImode:
+    case V64QImode:
+    case V32HImode:
+    case V8DFmode:
+    case V8DImode:
+      /* Unnamed 256- and 512-bit vector mode parameters are passed on stack.  */
       if (!TARGET_64BIT_MS_ABI)
 	{
 	  container = NULL;
@@ -8760,6 +8833,12 @@ standard_sse_constant_p (rtx x)
       case V4DImode:
 	if (TARGET_AVX2)
 	  return 2;
+      case V64QImode:
+      case V32HImode:
+      case V16SImode:
+      case V8DImode:
+	if (TARGET_AVX512F)
+	  return 2;
       default:
 	break;
       }
@@ -8778,6 +8857,11 @@ standard_sse_constant_opcode (rtx insn, rtx x)
     case 1:
       switch (get_attr_mode (insn))
 	{
+	case MODE_XI:
+	case MODE_V16SF:
+	  return "vpxord\t%g0, %g0, %g0";
+	case MODE_V8DF:
+	  return "vpxorq\t%g0, %g0, %g0";
 	case MODE_TI:
 	  return "%vpxor\t%0, %d0";
 	case MODE_V2DF:
@@ -18668,17 +18752,23 @@ ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
 
   switch (mode)
     {
+    case V64QImode:
     case V32QImode:
     case V16QImode:
+    case V32HImode:
     case V16HImode:
     case V8HImode:
+    case V16SImode:
     case V8SImode:
     case V4SImode:
+    case V8DImode:
     case V4DImode:
     case V2DImode:
       gcc_assert (vect);
+    case V16SFmode:
     case V8SFmode:
     case V4SFmode:
+    case V8DFmode:
     case V4DFmode:
     case V2DFmode:
       n_elt = GET_MODE_NUNITS (mode);
@@ -18715,6 +18805,8 @@ ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
   /* Find the sign bit, sign extended to 2*HWI.  */
   switch (mode)
     {
+    case V16SImode:
+    case V16SFmode:
     case V8SImode:
     case V4SImode:
     case V8SFmode:
@@ -18725,8 +18817,10 @@ ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
       lo = 0x80000000, hi = lo < 0;
       break;
 
+    case V8DImode:
     case V4DImode:
     case V2DImode:
+    case V8DFmode:
     case V4DFmode:
     case V2DFmode:
       vec_mode = mode;
@@ -20583,22 +20677,63 @@ ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
 		     rtx op_true, rtx op_false)
 {
   enum machine_mode mode = GET_MODE (dest);
-  enum machine_mode cmp_mode = GET_MODE (cmp_op0);
+  enum machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
+
+  /* In general, the result of a comparison can differ from the operands' type.  */
+  enum machine_mode cmp_mode;
+
+  /* In AVX512F the result of comparison is an integer mask.  */
+  bool maskcmp = false;
   rtx x;
 
-  cmp_op0 = force_reg (cmp_mode, cmp_op0);
-  if (!nonimmediate_operand (cmp_op1, cmp_mode))
-    cmp_op1 = force_reg (cmp_mode, cmp_op1);
+  if (GET_MODE_SIZE (cmp_ops_mode) == 64)
+    {
+      cmp_mode = mode_for_size (GET_MODE_NUNITS (cmp_ops_mode), MODE_INT, 0);
+      gcc_assert (cmp_mode != BLKmode);
+
+      maskcmp = true;
+    }
+  else
+    cmp_mode = cmp_ops_mode;
+
+
+  cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
+  if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
+    cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
 
   if (optimize
       || reg_overlap_mentioned_p (dest, op_true)
       || reg_overlap_mentioned_p (dest, op_false))
-    dest = gen_reg_rtx (mode);
+    dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
+
+  /* Compare patterns for int modes are unspec in AVX512F only.  */
+  if (maskcmp && (code == GT || code == EQ))
+    {
+      rtx (*gen)(rtx, rtx, rtx);
 
+      switch (cmp_ops_mode)
+	{
+	case V16SImode:
+	  gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
+	  break;
+	case V8DImode:
+	  gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
+	  break;
+	default:
+	  gen = NULL;
+	}
+
+      if (gen)
+	{
+	  emit_insn (gen (dest, cmp_op0, cmp_op1));
+	  return dest;
+	}
+    }
   x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
-  if (cmp_mode != mode)
+
+  if (cmp_mode != mode && !maskcmp)
     {
-      x = force_reg (cmp_mode, x);
+      x = force_reg (cmp_ops_mode, x);
       convert_move (dest, x, false);
     }
   else
@@ -20614,33 +20749,43 @@ static void
 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
 {
   enum machine_mode mode = GET_MODE (dest);
+  enum machine_mode cmpmode = GET_MODE (cmp);
+
+  /* In AVX512F the result of comparison is an integer mask.  */
+  bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
+
   rtx t2, t3, x;
 
   if (vector_all_ones_operand (op_true, mode)
-      && rtx_equal_p (op_false, CONST0_RTX (mode)))
+      && rtx_equal_p (op_false, CONST0_RTX (mode))
+      && !maskcmp)
     {
       emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
     }
-  else if (op_false == CONST0_RTX (mode))
+  else if (op_false == CONST0_RTX (mode)
+      && !maskcmp)
     {
       op_true = force_reg (mode, op_true);
       x = gen_rtx_AND (mode, cmp, op_true);
       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
     }
-  else if (op_true == CONST0_RTX (mode))
+  else if (op_true == CONST0_RTX (mode)
+      && !maskcmp)
     {
       op_false = force_reg (mode, op_false);
       x = gen_rtx_NOT (mode, cmp);
       x = gen_rtx_AND (mode, x, op_false);
       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
     }
-  else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
+  else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
+      && !maskcmp)
     {
       op_false = force_reg (mode, op_false);
       x = gen_rtx_IOR (mode, cmp, op_false);
       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
     }
-  else if (TARGET_XOP)
+  else if (TARGET_XOP
+      && !maskcmp)
     {
       op_true = force_reg (mode, op_true);
 
@@ -20708,6 +20853,20 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
 	      cmp = gen_lowpart (V32QImode, cmp);
 	    }
 	  break;
+
+	case V16SImode:
+	  gen = gen_avx512f_blendmv16si;
+	  break;
+	case V8DImode:
+	  gen = gen_avx512f_blendmv8di;
+	  break;
+	case V8DFmode:
+	  gen = gen_avx512f_blendmv8df;
+	  break;
+	case V16SFmode:
+	  gen = gen_avx512f_blendmv16sf;
+	  break;
+
 	default:
 	  break;
 	}
@@ -20975,6 +21134,8 @@ ix86_expand_int_vcond (rtx operands[])
 
 	  switch (mode)
 	    {
+	    case V16SImode:
+	    case V8DImode:
 	    case V8SImode:
 	    case V4DImode:
 	    case V4SImode:
@@ -20985,6 +21146,8 @@ ix86_expand_int_vcond (rtx operands[])
 
 		  switch (mode)
 		    {
+		    case V16SImode: gen_sub3 = gen_subv16si3; break;
+		    case V8DImode: gen_sub3 = gen_subv8di3; break;
 		    case V8SImode: gen_sub3 = gen_subv8si3; break;
 		    case V4DImode: gen_sub3 = gen_subv4di3; break;
 		    case V4SImode: gen_sub3 = gen_subv4si3; break;
@@ -21040,7 +21203,8 @@ ix86_expand_int_vcond (rtx operands[])
       gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
       x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
 			       operands[1+negate], operands[2-negate]);
-      x = gen_lowpart (data_mode, x);
+      if (GET_MODE (x) == mode)
+	x = gen_lowpart (data_mode, x);
     }
 
   ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
@@ -21048,6 +21212,35 @@ ix86_expand_int_vcond (rtx operands[])
   return true;
 }
 
+static bool
+ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
+{
+  enum machine_mode mode = GET_MODE (op0);
+  switch (mode)
+    {
+    case V16SImode:
+      emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0,
+					      force_reg (V16SImode, mask),
+					      op1));
+      return true;
+    case V16SFmode:
+      emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0,
+					      force_reg (V16SImode, mask),
+					      op1));
+      return true;
+    case V8DImode:
+      emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0,
+					     force_reg (V8DImode, mask), op1));
+      return true;
+    case V8DFmode:
+      emit_insn (gen_avx512f_vpermi2varv8df3 (target, op0,
+					     force_reg (V8DImode, mask), op1));
+      return true;
+    default:
+      return false;
+    }
+}
+
 /* Expand a variable vector permutation.  */
 
 void
@@ -21066,7 +21259,10 @@ ix86_expand_vec_perm (rtx operands[])
   /* Number of elements in the vector.  */
   w = GET_MODE_NUNITS (mode);
   e = GET_MODE_UNIT_SIZE (mode);
-  gcc_assert (w <= 32);
+  gcc_assert (w <= 64);
+
+  if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1))
+    return;
 
   if (TARGET_AVX2)
     {
@@ -21446,6 +21642,15 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
 	  extract
 	    = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
 	  break;
+	case V32HImode:
+	  if (unsigned_p)
+	    unpack = gen_avx512f_zero_extendv16hiv16si2;
+	  else
+	    unpack = gen_avx512f_sign_extendv16hiv16si2;
+	  halfmode = V16HImode;
+	  extract
+	    = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
+	  break;
 	case V16HImode:
 	  if (unsigned_p)
 	    unpack = gen_avx2_zero_extendv8hiv8si2;
@@ -21455,6 +21660,15 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
 	  extract
 	    = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
 	  break;
+	case V16SImode:
+	  if (unsigned_p)
+	    unpack = gen_avx512f_zero_extendv8siv8di2;
+	  else
+	    unpack = gen_avx512f_sign_extendv8siv8di2;
+	  halfmode = V8SImode;
+	  extract
+	    = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
+	  break;
 	case V8SImode:
 	  if (unsigned_p)
 	    unpack = gen_avx2_zero_extendv4siv4di2;
@@ -21486,7 +21700,7 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
 	  gcc_unreachable ();
 	}
 
-      if (GET_MODE_SIZE (imode) == 32)
+      if (GET_MODE_SIZE (imode) >= 32)
 	{
 	  tmp = gen_reg_rtx (halfmode);
 	  emit_insn (extract (tmp, src));
@@ -26219,7 +26433,8 @@ ix86_constant_alignment (tree exp, int align)
 int
 ix86_data_alignment (tree type, int align, bool opt)
 {
-  int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
+  int max_align = optimize_size ? BITS_PER_WORD
+				: MIN (512, MAX_OFILE_ALIGNMENT);
 
   if (opt
       && AGGREGATE_TYPE_P (type)
@@ -27681,12 +27896,27 @@ enum ix86_builtins
   IX86_BUILTIN_GATHERDIV4SI,
   IX86_BUILTIN_GATHERDIV8SI,
 
+  IX86_BUILTIN_SQRTPD512,
+  IX86_BUILTIN_EXP2PS,
+  IX86_BUILTIN_SQRTPS_NR512,
+
   /* Alternate 4 element gather for the vectorizer where
      all operands are 32-byte wide.  */
   IX86_BUILTIN_GATHERALTSIV4DF,
   IX86_BUILTIN_GATHERALTDIV8SF,
   IX86_BUILTIN_GATHERALTSIV4DI,
   IX86_BUILTIN_GATHERALTDIV8SI,
+  IX86_BUILTIN_GATHER3ALTDIV16SF,
+  IX86_BUILTIN_GATHER3ALTDIV16SI,
+  IX86_BUILTIN_GATHER3ALTSIV8DF,
+  IX86_BUILTIN_GATHER3ALTSIV8DI,
+  IX86_BUILTIN_GATHER3DIV16SF,
+  IX86_BUILTIN_GATHER3DIV16SI,
+  IX86_BUILTIN_GATHER3DIV8DF,
+  IX86_BUILTIN_GATHER3DIV8DI,
+  IX86_BUILTIN_GATHER3SIV16SF,
+  IX86_BUILTIN_GATHER3SIV16SI,
+  IX86_BUILTIN_GATHER3SIV8DF,
 
   /* TFmode support builtins.  */
   IX86_BUILTIN_INFQ,
@@ -27695,10 +27925,16 @@ enum ix86_builtins
   IX86_BUILTIN_COPYSIGNQ,
 
   /* Vectorizer support builtins.  */
+  IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512,
   IX86_BUILTIN_CPYSGNPS,
   IX86_BUILTIN_CPYSGNPD,
   IX86_BUILTIN_CPYSGNPS256,
+  IX86_BUILTIN_CPYSGNPS512,
   IX86_BUILTIN_CPYSGNPD256,
+  IX86_BUILTIN_CPYSGNPD512,
+  IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512,
+  IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512,
+
 
   /* FMA4 instructions.  */
   IX86_BUILTIN_VFMADDSS,
@@ -33876,6 +34112,16 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out,
 	    return ix86_get_builtin (IX86_BUILTIN_SQRTPD);
 	  else if (out_n == 4 && in_n == 4)
 	    return ix86_get_builtin (IX86_BUILTIN_SQRTPD256);
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_SQRTPD512);
+	}
+      break;
+
+    case BUILT_IN_EXP2F:
+      if (out_mode == SFmode && in_mode == SFmode)
+	{
+	  if (out_n == 16 && in_n == 16)
+	    return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
 	}
       break;
 
@@ -33886,6 +34132,8 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out,
 	    return ix86_get_builtin (IX86_BUILTIN_SQRTPS_NR);
 	  else if (out_n == 8 && in_n == 8)
 	    return ix86_get_builtin (IX86_BUILTIN_SQRTPS_NR256);
+	  else if (out_n == 16 && in_n == 16)
+	    return ix86_get_builtin (IX86_BUILTIN_SQRTPS_NR512);
 	}
       break;
 
@@ -33902,6 +34150,8 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out,
 	    return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
 	  else if (out_n == 8 && in_n == 4)
 	    return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
+	  else if (out_n == 16 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
 	}
       break;
 
@@ -33934,6 +34184,8 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out,
 	    return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
 	  else if (out_n == 8 && in_n == 4)
 	    return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
+	  else if (out_n == 16 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
 	}
       break;
 
@@ -33990,6 +34242,8 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out,
 	    return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
 	  else if (out_n == 8 && in_n == 4)
 	    return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
+	  else if (out_n == 16 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
 	}
       break;
 
@@ -34016,6 +34270,8 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out,
 	    return ix86_get_builtin (IX86_BUILTIN_CPYSGNPD);
 	  else if (out_n == 4 && in_n == 4)
 	    return ix86_get_builtin (IX86_BUILTIN_CPYSGNPD256);
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_CPYSGNPD512);
 	}
       break;
 
@@ -34026,6 +34282,8 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out,
 	    return ix86_get_builtin (IX86_BUILTIN_CPYSGNPS);
 	  else if (out_n == 8 && in_n == 8)
 	    return ix86_get_builtin (IX86_BUILTIN_CPYSGNPS256);
+	  else if (out_n == 16 && in_n == 16)
+	    return ix86_get_builtin (IX86_BUILTIN_CPYSGNPS512);
 	}
       break;
 
@@ -34461,6 +34719,34 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype,
     case V8SImode:
       code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
       break;
+#if 0
+    /* FIXME: Commented out until the vectorizer can work with
+       (mask_type != src_type).  See PR59617.  */
+    case V8DFmode:
+      if (TARGET_AVX512F)
+	code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
+      else
+	return NULL_TREE;
+      break;
+    case V8DImode:
+      if (TARGET_AVX512F)
+	code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
+      else
+	return NULL_TREE;
+      break;
+    case V16SFmode:
+      if (TARGET_AVX512F)
+	code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
+      else
+	return NULL_TREE;
+      break;
+    case V16SImode:
+      if (TARGET_AVX512F)
+	code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
+      else
+	return NULL_TREE;
+      break;
+#endif
     default:
       return NULL_TREE;
     }
@@ -34516,7 +34802,7 @@ avx_vpermilp_parallel (rtx par, enum machine_mode mode)
 {
   unsigned i, nelt = GET_MODE_NUNITS (mode);
   unsigned mask = 0;
-  unsigned char ipar[8] = {};  /* Silence -Wuninitialized warning.  */
+  unsigned char ipar[16] = {};  /* Silence -Wuninitialized warning.  */
 
   if (XVECLEN (par, 0) != (int) nelt)
     return 0;
@@ -34539,6 +34825,24 @@ avx_vpermilp_parallel (rtx par, enum machine_mode mode)
 
   switch (mode)
     {
+    case V8DFmode:
+      /* In the 512-bit DFmode case, we can only move elements within
+         a 128-bit lane.  First fill the second part of the mask,
+	 then fallthru.  */
+      for (i = 4; i < 6; ++i)
+	{
+	  if (ipar[i] < 4 || ipar[i] >= 6)
+	    return 0;
+	  mask |= (ipar[i] - 4) << i;
+	}
+      for (i = 6; i < 8; ++i)
+	{
+	  if (ipar[i] < 6)
+	    return 0;
+	  mask |= (ipar[i] - 6) << i;
+	}
+      /* FALLTHRU */
+
     case V4DFmode:
       /* In the 256-bit DFmode case, we can only move elements within
          a 128-bit lane.  */
@@ -34556,10 +34860,18 @@ avx_vpermilp_parallel (rtx par, enum machine_mode mode)
 	}
       break;
 
+    case V16SFmode:
+      /* In the 512-bit SFmode case, the permutation in the upper 256 bits
+	 must mirror the permutation in the lower 256 bits.  */
+      for (i = 0; i < 8; ++i)
+	if (ipar[i] + 8 != ipar[i + 8])
+	  return 0;
+      /* FALLTHRU */
+
     case V8SFmode:
-      /* In the 256-bit SFmode case, we have full freedom of movement
-	 within the low 128-bit lane, but the high 128-bit lane must
-	 mirror the exact same pattern.  */
+      /* In the 256-bit SFmode case, we have full freedom of
+         movement within the low 128-bit lane, but the high 128-bit
+         lane must mirror the exact same pattern.  */
       for (i = 0; i < 4; ++i)
 	if (ipar[i] + 4 != ipar[i + 4])
 	  return 0;
@@ -35510,6 +35822,7 @@ static bool
 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
 		bool speed)
 {
+  rtx mask;
   enum rtx_code code = (enum rtx_code) code_i;
   enum rtx_code outer_code = (enum rtx_code) outer_code_i;
   enum machine_mode mode = GET_MODE (x);
@@ -35986,13 +36299,21 @@ ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
 
     case VEC_SELECT:
     case VEC_CONCAT:
-    case VEC_MERGE:
     case VEC_DUPLICATE:
       /* ??? Assume all of these vector manipulation patterns are
 	 recognizable.  In which case they all pretty much have the
 	 same cost.  */
      *total = cost->fabs;
      return true;
+    case VEC_MERGE:
+      mask = XEXP (x, 2);
+      /* This is a masked instruction; assume the same cost
+	 as the non-masked variant.  */
+      if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
+	*total = rtx_cost (XEXP (x, 0), outer_code, opno, speed);
+      else
+	*total = cost->fabs;
+      return true;
 
     default:
       return false;
@@ -37158,6 +37479,36 @@ get_mode_wider_vector (enum machine_mode o)
   return n;
 }
 
+/* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
+   fill target with val via vec_duplicate.  */
+
+static bool
+ix86_vector_duplicate_value (enum machine_mode mode, rtx target, rtx val)
+{
+  bool ok;
+  rtx insn, dup;
+
+  /* First attempt to recognize VAL as-is.  */
+  dup = gen_rtx_VEC_DUPLICATE (mode, val);
+  insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
+  if (recog_memoized (insn) < 0)
+    {
+      rtx seq;
+      /* If that fails, force VAL into a register.  */
+
+      start_sequence ();
+      XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
+      seq = get_insns ();
+      end_sequence ();
+      if (seq)
+	emit_insn_before (seq, insn);
+
+      ok = recog_memoized (insn) >= 0;
+      gcc_assert (ok);
+    }
+  return true;
+}
+
 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
    with all elements equal to VAR.  Return true if successful.  */
 
@@ -37183,29 +37534,11 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
     case V2DImode:
     case V4SFmode:
     case V4SImode:
-      {
-	rtx insn, dup;
-
-	/* First attempt to recognize VAL as-is.  */
-	dup = gen_rtx_VEC_DUPLICATE (mode, val);
-	insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
-	if (recog_memoized (insn) < 0)
-	  {
-	    rtx seq;
-	    /* If that fails, force VAL into a register.  */
-
-	    start_sequence ();
-	    XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
-	    seq = get_insns ();
-	    end_sequence ();
-	    if (seq)
-	      emit_insn_before (seq, insn);
-
-	    ok = recog_memoized (insn) >= 0;
-	    gcc_assert (ok);
-	  }
-      }
-      return true;
+    case V16SImode:
+    case V8DImode:
+    case V16SFmode:
+    case V8DFmode:
+      return ix86_vector_duplicate_value (mode, target, val);
 
     case V4HImode:
       if (!mmx_ok)
@@ -37555,8 +37888,8 @@ static void
 ix86_expand_vector_init_concat (enum machine_mode mode,
 				rtx target, rtx *ops, int n)
 {
-  enum machine_mode cmode, hmode = VOIDmode;
-  rtx first[8], second[4];
+  enum machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
+  rtx first[16], second[8], third[4];
   rtvec v;
   int i, j;
 
@@ -37565,6 +37898,18 @@ ix86_expand_vector_init_concat (enum machine_mode mode,
     case 2:
       switch (mode)
 	{
+	case V16SImode:
+	  cmode = V8SImode;
+	  break;
+	case V16SFmode:
+	  cmode = V8SFmode;
+	  break;
+	case V8DImode:
+	  cmode = V4DImode;
+	  break;
+	case V8DFmode:
+	  cmode = V4DFmode;
+	  break;
 	case V8SImode:
 	  cmode = V4SImode;
 	  break;
@@ -37631,6 +37976,14 @@ ix86_expand_vector_init_concat (enum machine_mode mode,
     case 8:
       switch (mode)
 	{
+	case V8DImode:
+	  cmode = V2DImode;
+	  hmode = V4DImode;
+	  break;
+	case V8DFmode:
+	  cmode = V2DFmode;
+	  hmode = V4DFmode;
+	  break;
 	case V8SImode:
 	  cmode = V2SImode;
 	  hmode = V4SImode;
@@ -37644,6 +37997,24 @@ ix86_expand_vector_init_concat (enum machine_mode mode,
 	}
       goto half;
 
+    case 16:
+      switch (mode)
+	{
+	case V16SImode:
+	  cmode = V2SImode;
+	  hmode = V4SImode;
+	  gmode = V8SImode;
+	  break;
+	case V16SFmode:
+	  cmode = V2SFmode;
+	  hmode = V4SFmode;
+	  gmode = V8SFmode;
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+      goto half;
+
 half:
       /* FIXME: We process inputs backward to help RA.  PR 36222.  */
       i = n - 1;
@@ -37657,7 +38028,27 @@ half:
 	}
 
       n >>= 1;
-      if (n > 2)
+      if (n > 4)
+	{
+	  gcc_assert (hmode != VOIDmode);
+	  gcc_assert (gmode != VOIDmode);
+	  for (i = j = 0; i < n; i += 2, j++)
+	    {
+	      second[j] = gen_reg_rtx (hmode);
+	      ix86_expand_vector_init_concat (hmode, second [j],
+					      &first [i], 2);
+	    }
+	  n >>= 1;
+	  for (i = j = 0; i < n; i += 2, j++)
+	    {
+	      third[j] = gen_reg_rtx (gmode);
+	      ix86_expand_vector_init_concat (gmode, third[j],
+					      &second[i], 2);
+	    }
+	  n >>= 1;
+	  ix86_expand_vector_init_concat (mode, target, third, n);
+	}
+      else if (n > 2)
 	{
 	  gcc_assert (hmode != VOIDmode);
 	  for (i = j = 0; i < n; i += 2, j++)
@@ -37800,7 +38191,7 @@ static void
 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
 				 rtx target, rtx vals)
 {
-  rtx ops[32], op0, op1;
+  rtx ops[64], op0, op1;
   enum machine_mode half_mode = VOIDmode;
   int n, i;
 
@@ -37812,6 +38203,10 @@ ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
 	break;
       /* FALLTHRU */
 
+    case V16SImode:
+    case V16SFmode:
+    case V8DFmode:
+    case V8DImode:
     case V8SFmode:
     case V8SImode:
     case V4DFmode:
@@ -38437,6 +38832,42 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
 	}
       break;
 
+    case V16SFmode:
+      tmp = gen_reg_rtx (V8SFmode);
+      if (elt < 8)
+	emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
+      else
+	emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
+      ix86_expand_vector_extract (false, target, tmp, elt & 7);
+      return;
+
+    case V8DFmode:
+      tmp = gen_reg_rtx (V4DFmode);
+      if (elt < 4)
+	emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
+      else
+	emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
+      ix86_expand_vector_extract (false, target, tmp, elt & 3);
+      return;
+
+    case V16SImode:
+      tmp = gen_reg_rtx (V8SImode);
+      if (elt < 8)
+	emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
+      else
+	emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
+      ix86_expand_vector_extract (false, target, tmp, elt & 7);
+      return;
+
+    case V8DImode:
+      tmp = gen_reg_rtx (V4DImode);
+      if (elt < 4)
+	emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
+      else
+	emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
+      ix86_expand_vector_extract (false, target, tmp, elt & 3);
+      return;
+
     case V8QImode:
       /* ??? Could extract the appropriate HImode element and shift.  */
     default:
@@ -38529,6 +38960,44 @@ emit_reduc_half (rtx dest, rtx src, int i)
 				    GEN_INT (i / 2));
 	}
       break;
+    case V16SImode:
+    case V16SFmode:
+    case V8DImode:
+    case V8DFmode:
+      if (i > 128)
+	tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
+				      gen_lowpart (V16SImode, src),
+				      gen_lowpart (V16SImode, src),
+				      GEN_INT (0x4 + (i == 512 ? 4 : 0)),
+				      GEN_INT (0x5 + (i == 512 ? 4 : 0)),
+				      GEN_INT (0x6 + (i == 512 ? 4 : 0)),
+				      GEN_INT (0x7 + (i == 512 ? 4 : 0)),
+				      GEN_INT (0xC), GEN_INT (0xD),
+				      GEN_INT (0xE), GEN_INT (0xF),
+				      GEN_INT (0x10), GEN_INT (0x11),
+				      GEN_INT (0x12), GEN_INT (0x13),
+				      GEN_INT (0x14), GEN_INT (0x15),
+				      GEN_INT (0x16), GEN_INT (0x17));
+      else
+	tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
+				   gen_lowpart (V16SImode, src),
+				   GEN_INT (i == 128 ? 0x2 : 0x1),
+				   GEN_INT (0x3),
+				   GEN_INT (0x3),
+				   GEN_INT (0x3),
+				   GEN_INT (i == 128 ? 0x6 : 0x5),
+				   GEN_INT (0x7),
+				   GEN_INT (0x7),
+				   GEN_INT (0x7),
+				   GEN_INT (i == 128 ? 0xA : 0x9),
+				   GEN_INT (0xB),
+				   GEN_INT (0xB),
+				   GEN_INT (0xB),
+				   GEN_INT (i == 128 ? 0xE : 0xD),
+				   GEN_INT (0xF),
+				   GEN_INT (0xF),
+				   GEN_INT (0xF));
+      break;
     default:
       gcc_unreachable ();
     }
@@ -38593,6 +39062,8 @@ ix86_vector_mode_supported_p (enum machine_mode mode)
     return true;
   if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
     return true;
+  if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
+    return true;
   if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
     return true;
   if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
@@ -38906,9 +39377,15 @@ void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
   b = force_reg (mode, b);
 
   /* x0 = rcp(b) estimate */
-  emit_insn (gen_rtx_SET (VOIDmode, x0,
-			  gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
-					  UNSPEC_RCP)));
+  if (mode == V16SFmode || mode == V8DFmode)
+    emit_insn (gen_rtx_SET (VOIDmode, x0,
+			    gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
+					    UNSPEC_RCP14)));
+  else
+    emit_insn (gen_rtx_SET (VOIDmode, x0,
+			    gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
+					    UNSPEC_RCP)));
+
   /* e0 = x0 * b */
   emit_insn (gen_rtx_SET (VOIDmode, e0,
 			  gen_rtx_MULT (mode, x0, b)));
@@ -38938,6 +39415,7 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
 {
   rtx x0, e0, e1, e2, e3, mthree, mhalf;
   REAL_VALUE_TYPE r;
+  int unspec;
 
   x0 = gen_reg_rtx (mode);
   e0 = gen_reg_rtx (mode);
@@ -38950,11 +39428,15 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
 
   real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
   mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
+  unspec = UNSPEC_RSQRT;
 
   if (VECTOR_MODE_P (mode))
     {
       mthree = ix86_build_const_vector (mode, true, mthree);
       mhalf = ix86_build_const_vector (mode, true, mhalf);
+      /* There is no 512-bit rsqrt.  There is, however, rsqrt14.  */
+      if (GET_MODE_SIZE (mode) == 64)
+	unspec = UNSPEC_RSQRT14;
     }
 
   /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
@@ -38965,7 +39447,7 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
   /* x0 = rsqrt(a) estimate */
   emit_insn (gen_rtx_SET (VOIDmode, x0,
 			  gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
-					  UNSPEC_RSQRT)));
+					  unspec)));
 
   /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0).  */
   if (!recip)
@@ -38976,11 +39458,23 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
       mask = gen_reg_rtx (mode);
 
       zero = force_reg (mode, CONST0_RTX(mode));
-      emit_insn (gen_rtx_SET (VOIDmode, mask,
-			      gen_rtx_NE (mode, zero, a)));
 
-      emit_insn (gen_rtx_SET (VOIDmode, x0,
-			      gen_rtx_AND (mode, x0, mask)));
+      /* Handle masked compare.  */
+      if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
+	{
+	  mask = gen_reg_rtx (HImode);
+	  /* Imm value 0x4 corresponds to not-equal comparison.  */
+	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
+	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
+	}
+      else
+	{
+	  emit_insn (gen_rtx_SET (VOIDmode, mask,
+				  gen_rtx_NE (mode, zero, a)));
+
+	  emit_insn (gen_rtx_SET (VOIDmode, x0,
+				  gen_rtx_AND (mode, x0, mask)));
+	}
     }
 
   /* e0 = x0 * a */
@@ -40502,6 +40996,19 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_pshufb (d))
     return true;
 
+  /* Try the AVX512F vpermi2 instructions.  */
+  rtx vec[64];
+  enum machine_mode mode = d->vmode;
+  if (mode == V8DFmode)
+    mode = V8DImode;
+  else if (mode == V16SFmode)
+    mode = V16SImode;
+  for (i = 0; i < nelt; ++i)
+    vec[i] = GEN_INT (d->perm[i]);
+  rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec));
+  if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, mask, d->op1))
+    return true;
+
   return false;
 }
 
@@ -42109,6 +42616,10 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
 
   /* Given sufficient ISA support we can just return true here
      for selected vector modes.  */
+  if (d.vmode == V16SImode || d.vmode == V16SFmode
+      || d.vmode == V8DFmode || d.vmode == V8DImode)
+    /* All implementable with a single vpermi2 insn.  */
+    return true;
   if (GET_MODE_SIZE (d.vmode) == 16)
     {
       /* All implementable with a single vpperm insn.  */
@@ -42351,7 +42862,7 @@ ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
     op2 = force_reg (mode, op2);
 
   /* We only play even/odd games with vectors of SImode.  */
-  gcc_assert (mode == V4SImode || mode == V8SImode);
+  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
 
   /* If we're looking for the odd results, shift those members down to
      the even slots.  For some cpus this is faster than a PSHUFD.  */
@@ -42377,7 +42888,14 @@ ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
       op2 = gen_lowpart (mode, op2);
     }
 
-  if (mode == V8SImode)
+  if (mode == V16SImode)
+    {
+      if (uns_p)
+	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
+      else
+	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
+    }
+  else if (mode == V8SImode)
     {
       if (uns_p)
 	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
@@ -42597,6 +43115,11 @@ ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
 	  umul = gen_vec_widen_umult_even_v8si;
 	  nmode = V8SImode;
 	}
+      else if (mode == V8DImode)
+	{
+	  umul = gen_vec_widen_umult_even_v16si;
+	  nmode = V16SImode;
+	}
       else
 	gcc_unreachable ();
 
@@ -43743,12 +44266,16 @@ ix86_preferred_simd_mode (enum machine_mode mode)
     case HImode:
       return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
     case SImode:
-      return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
+      return TARGET_AVX512F ? V16SImode :
+	(TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
     case DImode:
-      return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
+      return TARGET_AVX512F ? V8DImode :
+	(TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
 
     case SFmode:
-      if (TARGET_AVX && !TARGET_PREFER_AVX128)
+      if (TARGET_AVX512F)
+	return V16SFmode;
+      else if (TARGET_AVX && !TARGET_PREFER_AVX128)
 	return V8SFmode;
       else
 	return V4SFmode;
@@ -43756,6 +44283,8 @@ ix86_preferred_simd_mode (enum machine_mode mode)
     case DFmode:
       if (!TARGET_VECTORIZE_DOUBLE)
 	return word_mode;
+      else if (TARGET_AVX512F)
+	return V8DFmode;
       else if (TARGET_AVX && !TARGET_PREFER_AVX128)
 	return V4DFmode;
       else if (TARGET_SSE2)
@@ -43768,12 +44297,14 @@ ix86_preferred_simd_mode (enum machine_mode mode)
 }
 
 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
-   vectors.  */
+   vectors.  If AVX512F is enabled then try vectorizing with 512bit,
+   256bit and 128bit vectors.  */
 
 static unsigned int
 ix86_autovectorize_vector_sizes (void)
 {
-  return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
+  return TARGET_AVX512F ? 64 | 32 | 16 :
+    (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
 }
 
 

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 7beb245..a3c0e0c 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -748,8 +748,9 @@
    (set (attr "mode")
 	(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
 		 (const_string "<ssePSmode>")
-	       (and (eq_attr "alternative" "2")
-		    (match_test "TARGET_SSE_TYPELESS_STORES"))
+	       (and (match_test "GET_MODE_SIZE (<MODE>mode) == 16")
+		    (and (eq_attr "alternative" "2")
+			 (match_test "TARGET_SSE_TYPELESS_STORES")))
 		 (const_string "<ssePSmode>")
 	       (match_test "TARGET_AVX")
 		 (const_string "<sseinsnmode>")
@@ -986,8 +987,9 @@
    (set_attr "ssememalign" "8")
    (set_attr "prefix" "maybe_vex")
    (set (attr "mode")
-	(cond [(ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
-		    (match_test "TARGET_SSE_TYPELESS_STORES"))
+        (cond [(and (match_test "GET_MODE_SIZE (<MODE>mode) == 16")
+                    (ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+                         (match_test "TARGET_SSE_TYPELESS_STORES")))
 		 (const_string "<ssePSmode>")
 	       (match_test "TARGET_AVX")
 		 (const_string "<MODE>")
@@ -1091,6 +1093,7 @@
 {
   switch (get_attr_mode (insn))
     {
+    case MODE_V16SF:
     case MODE_V8SF:
     case MODE_V4SF:
       return "%vmovups\t{%1, %0|%0, %1}";
@@ -1113,8 +1116,9 @@
      (const_string "1")))
    (set_attr "prefix" "maybe_vex")
    (set (attr "mode")
-	(cond [(ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
-		    (match_test "TARGET_SSE_TYPELESS_STORES"))
+	(cond [(and (match_test "GET_MODE_SIZE (<MODE>mode) == 16")
+		    (ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+			 (match_test "TARGET_SSE_TYPELESS_STORES")))
 		 (const_string "<ssePSmode>")
 	       (match_test "TARGET_AVX")
 		 (const_string "<sseinsnmode>")
@@ -3492,7 +3496,11 @@
    (match_operand:<sseintvecmode> 1 "register_operand")]
   "TARGET_SSE2 && (<MODE>mode == V4SFmode || TARGET_AVX2)"
 {
-  ix86_expand_vector_convert_uns_vsivsf (operands[0], operands[1]);
+  if (<MODE>mode == V16SFmode)
+    emit_insn (gen_ufloatv16siv16sf2 (operands[0], operands[1]));
+  else
+    ix86_expand_vector_convert_uns_vsivsf (operands[0], operands[1]);
+
   DONE;
 })
 
@@ -3583,11 +3591,17 @@
    (match_operand:VF1 1 "register_operand")]
   "TARGET_SSE2"
 {
-  rtx tmp[3];
-  tmp[0] = ix86_expand_adjust_ufix_to_sfix_si (operands[1], &tmp[2]);
-  tmp[1] = gen_reg_rtx (<sseintvecmode>mode);
-  emit_insn (gen_fix_trunc<mode><sseintvecmodelower>2 (tmp[1], tmp[0]));
-  emit_insn (gen_xor<sseintvecmodelower>3 (operands[0], tmp[1], tmp[2]));
+  if (<MODE>mode == V16SFmode)
+    emit_insn (gen_ufix_truncv16sfv16si2 (operands[0],
+					  operands[1]));
+  else
+    {
+      rtx tmp[3];
+      tmp[0] = ix86_expand_adjust_ufix_to_sfix_si (operands[1], &tmp[2]);
+      tmp[1] = gen_reg_rtx (<sseintvecmode>mode);
+      emit_insn (gen_fix_trunc<mode><sseintvecmodelower>2 (tmp[1], tmp[0]));
+      emit_insn (gen_xor<sseintvecmodelower>3 (operands[0], tmp[1], tmp[2]));
+    }
   DONE;
 })
 
@@ -4514,6 +4528,32 @@
   DONE;
 })
 
+(define_expand "vec_unpacku_float_hi_v16si"
+  [(match_operand:V8DF 0 "register_operand")
+   (match_operand:V16SI 1 "register_operand")]
+  "TARGET_AVX512F"
+{
+  REAL_VALUE_TYPE TWO32r;
+  rtx k, x, tmp[4];
+
+  real_ldexp (&TWO32r, &dconst1, 32);
+  x = const_double_from_real_value (TWO32r, DFmode);
+
+  tmp[0] = force_reg (V8DFmode, CONST0_RTX (V8DFmode));
+  tmp[1] = force_reg (V8DFmode, ix86_build_const_vector (V8DFmode, 1, x));
+  tmp[2] = gen_reg_rtx (V8DFmode);
+  tmp[3] = gen_reg_rtx (V8SImode);
+  k = gen_reg_rtx (QImode);
+
+  emit_insn (gen_vec_extract_hi_v16si (tmp[3], operands[1]));
+  emit_insn (gen_floatv8siv8df2 (tmp[2], tmp[3]));
+  emit_insn (gen_rtx_SET (VOIDmode, k,
+			  gen_rtx_LT (QImode, tmp[2], tmp[0])));
+  emit_insn (gen_addv8df3_mask (tmp[2], tmp[2], tmp[1], tmp[2], k));
+  emit_move_insn (operands[0], tmp[2]);
+  DONE;
+})
+
 (define_expand "vec_unpacku_float_lo_v8si"
   [(match_operand:V4DF 0 "register_operand")
    (match_operand:V8SI 1 "nonimmediate_operand")]
@@ -4679,31 +4719,46 @@
 
 (define_expand "vec_pack_ufix_trunc_<mode>"
   [(match_operand:<ssepackfltmode> 0 "register_operand")
-   (match_operand:VF2_128_256 1 "register_operand")
-   (match_operand:VF2_128_256 2 "register_operand")]
+   (match_operand:VF2 1 "register_operand")
+   (match_operand:VF2 2 "register_operand")]
   "TARGET_SSE2"
 {
-  rtx tmp[7];
-  tmp[0] = ix86_expand_adjust_ufix_to_sfix_si (operands[1], &tmp[2]);
-  tmp[1] = ix86_expand_adjust_ufix_to_sfix_si (operands[2], &tmp[3]);
-  tmp[4] = gen_reg_rtx (<ssepackfltmode>mode);
-  emit_insn (gen_vec_pack_sfix_trunc_<mode> (tmp[4], tmp[0], tmp[1]));
-  if (<ssepackfltmode>mode == V4SImode || TARGET_AVX2)
+  if (<MODE>mode == V8DFmode)
     {
-      tmp[5] = gen_reg_rtx (<ssepackfltmode>mode);
-      ix86_expand_vec_extract_even_odd (tmp[5], tmp[2], tmp[3], 0);
+      rtx r1, r2;
+
+      r1 = gen_reg_rtx (V8SImode);
+      r2 = gen_reg_rtx (V8SImode);
+
+      emit_insn (gen_ufix_truncv8dfv8si2 (r1, operands[1]));
+      emit_insn (gen_ufix_truncv8dfv8si2 (r2, operands[2]));
+      emit_insn (gen_avx_vec_concatv16si (operands[0], r1, r2));
     }
   else
     {
-      tmp[5] = gen_reg_rtx (V8SFmode);
-      ix86_expand_vec_extract_even_odd (tmp[5], gen_lowpart (V8SFmode, tmp[2]),
-					gen_lowpart (V8SFmode, tmp[3]), 0);
-      tmp[5] = gen_lowpart (V8SImode, tmp[5]);
+      rtx tmp[7];
+      tmp[0] = ix86_expand_adjust_ufix_to_sfix_si (operands[1], &tmp[2]);
+      tmp[1] = ix86_expand_adjust_ufix_to_sfix_si (operands[2], &tmp[3]);
+      tmp[4] = gen_reg_rtx (<ssepackfltmode>mode);
+      emit_insn (gen_vec_pack_sfix_trunc_<mode> (tmp[4], tmp[0], tmp[1]));
+      if (<ssepackfltmode>mode == V4SImode || TARGET_AVX2)
+	{
+	  tmp[5] = gen_reg_rtx (<ssepackfltmode>mode);
+	  ix86_expand_vec_extract_even_odd (tmp[5], tmp[2], tmp[3], 0);
+	}
+      else
+	{
+	  tmp[5] = gen_reg_rtx (V8SFmode);
+	  ix86_expand_vec_extract_even_odd (tmp[5], gen_lowpart (V8SFmode, tmp[2]),
+					    gen_lowpart (V8SFmode, tmp[3]), 0);
+	  tmp[5] = gen_lowpart (V8SImode, tmp[5]);
+	}
+      tmp[6] = expand_simple_binop (<ssepackfltmode>mode, XOR, tmp[4], tmp[5],
+				    operands[0], 0, OPTAB_DIRECT);
+      if (tmp[6] != operands[0])
+	emit_move_insn (operands[0], tmp[6]);
     }
-  tmp[6] = expand_simple_binop (<ssepackfltmode>mode, XOR, tmp[4], tmp[5],
-				operands[0], 0, OPTAB_DIRECT);
-  if (tmp[6] != operands[0])
-    emit_move_insn (operands[0], tmp[6]);
+
   DONE;
 })
 
diff --git a/gcc/testsuite/gcc.target/i386/pr49002-2.c b/gcc/testsuite/gcc.target/i386/pr49002-2.c
index 9f21a2d..dfb83b4 100644
--- a/gcc/testsuite/gcc.target/i386/pr49002-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr49002-2.c
@@ -12,4 +12,4 @@ void foo(const __m128d from, __m256d *to)
 /* Ensure we store ymm, not xmm.  */
 /* { dg-final { scan-assembler-not "vmovapd\[\t \]*%xmm\[0-9\]\+,\[^,\]*" } } */
 /* { dg-final { scan-assembler-not "vmovaps\[\t \]*%xmm\[0-9\]\+,\[^,\]*" } } */
-/* { dg-final { scan-assembler "vmovaps\[\t \]*%ymm\[0-9\]\+,\[^,\]*" } } */
+/* { dg-final { scan-assembler "vmovap\[sd\]\[\t \]*%ymm\[0-9\]\+,\[^,\]*" } } */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index e3009d9..a3aaa6e 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -5687,7 +5687,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
       tree vec_oprnd0 = NULL_TREE, op;
       tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gather_decl));
       tree rettype, srctype, ptrtype, idxtype, masktype, scaletype;
-      tree ptr, mask, var, scale, perm_mask = NULL_TREE, prev_res = NULL_TREE;
+      tree ptr, mask, var, scale, merge, perm_mask = NULL_TREE, prev_res = NULL_TREE;
       edge pe = loop_preheader_edge (loop);
       gimple_seq seq;
       basic_block new_bb;
@@ -5729,8 +5729,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
       idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
       masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
       scaletype = TREE_VALUE (arglist);
-      gcc_checking_assert (types_compatible_p (srctype, rettype)
-			   && types_compatible_p (srctype, masktype));
+      gcc_checking_assert (types_compatible_p (srctype, rettype));
 
       vec_dest = vect_create_destination_var (scalar_dest, vectype);
 
@@ -5744,8 +5743,13 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
 
       /* Currently we support only unconditional gather loads,
 	 so mask should be all ones.  */
-      if (TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
-	mask = build_int_cst (TREE_TYPE (masktype), -1);
+      if (TREE_CODE (masktype) == INTEGER_TYPE)
+	mask = build_int_cst (masktype, -1);
+      else if (TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
+	{
+	  mask = build_int_cst (TREE_TYPE (masktype), -1);
+	  mask = build_vector_from_val (masktype, mask);
+	}
       else if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (masktype)))
 	{
 	  REAL_VALUE_TYPE r;
@@ -5754,14 +5758,30 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
 	    tmp[j] = -1;
 	  real_from_target (&r, tmp, TYPE_MODE (TREE_TYPE (masktype)));
 	  mask = build_real (TREE_TYPE (masktype), r);
+	  mask = build_vector_from_val (masktype, mask);
 	}
       else
 	gcc_unreachable ();
-      mask = build_vector_from_val (masktype, mask);
       mask = vect_init_vector (stmt, mask, masktype, NULL);
 
       scale = build_int_cst (scaletype, gather_scale);
 
+      if (TREE_CODE (TREE_TYPE (rettype)) == INTEGER_TYPE)
+	merge = build_int_cst (TREE_TYPE (rettype), 0);
+      else if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (rettype)))
+	{
+	  REAL_VALUE_TYPE r;
+	  long tmp[6];
+	  for (j = 0; j < 6; ++j)
+	    tmp[j] = 0;
+	  real_from_target (&r, tmp, TYPE_MODE (TREE_TYPE (rettype)));
+	  merge = build_real (TREE_TYPE (rettype), r);
+	}
+      else
+	gcc_unreachable ();
+      merge = build_vector_from_val (rettype, merge);
+      merge = vect_init_vector (stmt, merge, rettype, NULL);
+
       prev_stmt_info = NULL;
       for (j = 0; j < ncopies; ++j)
 	{
@@ -5790,7 +5810,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
 	    }
 
 	  new_stmt
-	    = gimple_build_call (gather_decl, 5, mask, ptr, op, mask, scale);
+	    = gimple_build_call (gather_decl, 5, merge, ptr, op, mask, scale);
 
 	  if (!useless_type_conversion_p (vectype, rettype))
 	    {
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 54e73c8..00e56dc 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -683,8 +683,8 @@ struct dataref_aux {
    conversion.  */
 #define MAX_INTERM_CVT_STEPS         3
 
-/* The maximum vectorization factor supported by any target (V32QI).  */
-#define MAX_VECTORIZATION_FACTOR 32
+/* The maximum vectorization factor supported by any target (V64QI).  */
+#define MAX_VECTORIZATION_FACTOR 64
 
 /* Avoid GTY(()) on stmt_vec_info.  */
 typedef void *vec_void_p;


