[PATCH i386 AVX512] [63/n] Add vpshufb, perm autogen.

Kirill Yukhin kirill.yukhin@gmail.com
Mon Oct 6 12:55:00 GMT 2014


Hello,
This patch extends permutations for AVX-512*.
Comments are welcome!

Bootstrapped.
All AVX-512* tests on top of the patch set pass
under the simulator.

Is it ok for trunk?

gcc/
	* config/i386/i386.c
	(ix86_expand_vec_perm_vpermi2): Handle V64QImode, V8HImode, V16HImode,
	V32HImode, V4SImode, V8SImode, V4SFmode, V8SFmode, V2DImode, V4DImode,
	V2DFmode, V4DFmode.
	(ix86_expand_sse_unpack): Handle V64QImode.
	(expand_vec_perm_blend): Update conditions for TARGET, handle
	V8DFmode, V16SFmode, V32HImode, V64QImode, V16SImode, V8DImode.
	(expand_vec_perm_pshufb): Handle V64QImode.
	(expand_vec_perm_1): Handle V64QImode, V32HImode, V16SImode, V16SFmode,
	V8DFmode, V8DImode, V4DFmode, V2DFmode, V8SFmode, V4SFmode.
	(ix86_expand_vec_perm_const_1): Call ix86_expand_vec_perm_vpermi2.
	(ix86_vectorize_vec_perm_const_ok): Handle V32HImode, V64QImode.
	(ix86_expand_vecop_qihi): Handle V64QImode.
	* config/i386/sse.md
	(define_mode_iterator VI1_AVX2): Add V64QI mode.
	(define_mode_iterator VEC_PERM_AVX2): Add V32HI and V64QI modes.
	(define_mode_iterator VEC_PERM_CONST): Add V32HI mode.
	(define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>"): Add masking.

--
Thanks, K

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 352ab81..d759a45 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -21364,20 +21364,113 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
   enum machine_mode mode = GET_MODE (op0);
   switch (mode)
     {
+      /* There is no byte version of vpermi2.  So we use vpermi2w.  */
+    case V64QImode:
+      if (!TARGET_AVX512BW)
+	return false;
+      rtx mask_lowpart, op0_lowpart, op1_lowpart;
+      rtx perm_lo, perm_hi, tmp, res_lo, tmp2, res_hi;
+
+      mask_lowpart = gen_lowpart (V32HImode, force_reg (V64QImode, mask));
+      op0_lowpart = gen_lowpart (V32HImode, op0);
+      op1_lowpart = gen_lowpart (V32HImode, op1);
+      tmp = gen_reg_rtx (V32HImode);
+      tmp2 = gen_reg_rtx (V32HImode);
+      perm_lo = gen_reg_rtx (V32HImode);
+      perm_hi = gen_reg_rtx (V32HImode);
+      res_lo = gen_reg_rtx (V32HImode);
+      res_hi = gen_reg_rtx (V32HImode);
+
+      emit_insn (gen_ashlv32hi3 (tmp, mask_lowpart, GEN_INT (8)));
+      emit_insn (gen_ashrv32hi3 (perm_lo, tmp, GEN_INT (9)));
+      emit_insn (gen_ashrv32hi3 (perm_hi, mask_lowpart, GEN_INT (9)));
+      emit_insn (gen_avx512bw_vpermi2varv32hi3 (res_lo, op0_lowpart,
+						perm_lo, op1_lowpart));
+      emit_insn (gen_avx512bw_vpermi2varv32hi3 (tmp2, op0_lowpart,
+						perm_hi, op1_lowpart));
+      emit_insn (gen_ashlv32hi3 (res_hi, tmp2, GEN_INT (8)));
+      emit_insn (gen_avx512bw_blendmv64qi (target, gen_lowpart (V64QImode, res_lo),
+					   gen_lowpart (V64QImode, res_hi),
+					   force_reg (DImode, GEN_INT (0xAAAAAAAAAAAAAAAALL))));
+      return true;
+    case V8HImode:
+      if (!TARGET_AVX512VL)
+	return false;
+      emit_insn (gen_avx512vl_vpermi2varv8hi3 (target, op0,
+					       force_reg (V8HImode, mask), op1));
+      return true;
+    case V16HImode:
+      if (!TARGET_AVX512VL)
+	return false;
+      emit_insn (gen_avx512vl_vpermi2varv16hi3 (target, op0,
+					     force_reg (V16HImode, mask), op1));
+      return true;
+    case V32HImode:
+      emit_insn (gen_avx512bw_vpermi2varv32hi3 (target, op0,
+					     force_reg (V32HImode, mask), op1));
+      return true;
+    case V4SImode:
+      if (!TARGET_AVX512VL)
+	return false;
+      emit_insn (gen_avx512vl_vpermi2varv4si3 (target, op0,
+					    force_reg (V4SImode, mask), op1));
+      return true;
+    case V8SImode:
+      if (!TARGET_AVX512VL)
+	return false;
+      emit_insn (gen_avx512vl_vpermi2varv8si3 (target, op0,
+					    force_reg (V8SImode, mask), op1));
+      return true;
     case V16SImode:
       emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0,
 					      force_reg (V16SImode, mask),
 					      op1));
       return true;
+    case V4SFmode:
+      if (!TARGET_AVX512VL)
+	return false;
+      emit_insn (gen_avx512vl_vpermi2varv4sf3 (target, op0,
+					       force_reg (V4SImode, mask), op1));
+      return true;
+    case V8SFmode:
+      if (!TARGET_AVX512VL)
+	return false;
+      emit_insn (gen_avx512vl_vpermi2varv8sf3 (target, op0,
+					       force_reg (V8SImode, mask), op1));
+      return true;
     case V16SFmode:
       emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0,
 					      force_reg (V16SImode, mask),
 					      op1));
       return true;
+    case V2DImode:
+      if (!TARGET_AVX512VL)
+	return false;
+      emit_insn (gen_avx512vl_vpermi2varv2di3 (target, op0,
+					       force_reg (V2DImode, mask), op1));
+      return true;
+    case V4DImode:
+      if (!TARGET_AVX512VL)
+	return false;
+      emit_insn (gen_avx512vl_vpermi2varv4di3 (target, op0,
+					       force_reg (V4DImode, mask), op1));
+      return true;
     case V8DImode:
       emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0,
 					     force_reg (V8DImode, mask), op1));
       return true;
+    case V2DFmode:
+      if (!TARGET_AVX512VL)
+	return false;
+      emit_insn (gen_avx512vl_vpermi2varv2df3 (target, op0,
+					       force_reg (V2DImode, mask), op1));
+      return true;
+    case V4DFmode:
+      if (!TARGET_AVX512VL)
+	return false;
+      emit_insn (gen_avx512vl_vpermi2varv4df3 (target, op0,
+					       force_reg (V4DImode, mask), op1));
+      return true;
     case V8DFmode:
       emit_insn (gen_avx512f_vpermi2varv8df3 (target, op0,
 					     force_reg (V8DImode, mask), op1));
@@ -21779,6 +21872,15 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
 
       switch (imode)
 	{
+	case V64QImode:
+	  if (unsigned_p)
+	    unpack = gen_avx512bw_zero_extendv32qiv32hi2;
+	  else
+	    unpack = gen_avx512bw_sign_extendv32qiv32hi2;
+	  halfmode = V32QImode;
+	  extract
+	    = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
+	  break;
 	case V32QImode:
 	  if (unsigned_p)
 	    unpack = gen_avx2_zero_extendv16qiv16hi2;
@@ -42662,7 +42764,12 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
 
   if (d->one_operand_p)
     return false;
-  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+  if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 &&
+      GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4)
+    ;
+  else if (TARGET_AVX512VL)
+    ;
+  else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
     ;
   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
     ;
@@ -42693,12 +42800,18 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
 
   switch (vmode)
     {
+    case V8DFmode:
+    case V16SFmode:
     case V4DFmode:
     case V8SFmode:
     case V2DFmode:
     case V4SFmode:
     case V8HImode:
     case V8SImode:
+    case V32HImode:
+    case V64QImode:
+    case V16SImode:
+    case V8DImode:
       for (i = 0; i < nelt; ++i)
 	mask |= (d->perm[i] >= nelt) << i;
       break;
@@ -42921,9 +43034,9 @@ static bool
 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
 {
   unsigned i, nelt, eltsz, mask;
-  unsigned char perm[32];
+  unsigned char perm[64];
   enum machine_mode vmode = V16QImode;
-  rtx rperm[32], vperm, target, op0, op1;
+  rtx rperm[64], vperm, target, op0, op1;
 
   nelt = d->nelt;
 
@@ -43012,6 +43125,17 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
 		  return false;
 	    }
 	}
+      else if (GET_MODE_SIZE (d->vmode) == 64)
+	{
+	  if (!TARGET_AVX512BW)
+	    return false;
+	  if (vmode == V64QImode)
+	    {
+	      for (i = 0; i < nelt; ++i)
+		if ((d->perm[i] ^ i) & (nelt / 4))
+		  return false;
+	    }
+	}
       else
 	return false;
     }
@@ -43029,6 +43153,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
 	mask = 2 * nelt - 1;
       else if (vmode == V16QImode)
 	mask = nelt - 1;
+      else if (vmode == V64QImode)
+	mask = nelt / 4 - 1;
       else
 	mask = nelt / 2 - 1;
 
@@ -43054,6 +43180,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
 	emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
       else if (vmode == V32QImode)
 	emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
+      else if (vmode == V64QImode)
+	emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
       else if (vmode == V8SFmode)
 	emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
       else
@@ -43109,12 +43237,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
 	  rtx (*gen) (rtx, rtx) = NULL;
 	  switch (d->vmode)
 	    {
+	    case V64QImode:
+	      if (TARGET_AVX512VL)
+		gen = gen_avx512bw_vec_dupv64qi;
+	      break;
 	    case V32QImode:
 	      gen = gen_avx2_pbroadcastv32qi_1;
 	      break;
+	    case V32HImode:
+	      if (TARGET_AVX512VL)
+		gen = gen_avx512bw_vec_dupv32hi;
+	      break;
 	    case V16HImode:
 	      gen = gen_avx2_pbroadcastv16hi_1;
 	      break;
+	    case V16SImode:
+	      if (TARGET_AVX512F)
+		gen = gen_avx512f_vec_dupv16si;
+	      break;
 	    case V8SImode:
 	      gen = gen_avx2_pbroadcastv8si_1;
 	      break;
@@ -43124,9 +43264,21 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
 	    case V8HImode:
 	      gen = gen_avx2_pbroadcastv8hi;
 	      break;
+	    case V16SFmode:
+	      if (TARGET_AVX512F)
+		gen = gen_avx512f_vec_dupv16sf;
+	      break;
 	    case V8SFmode:
 	      gen = gen_avx2_vec_dupv8sf_1;
 	      break;
+	    case V8DFmode:
+	      if (TARGET_AVX512F)
+		gen = gen_avx512f_vec_dupv8df;
+	      break;
+	    case V8DImode:
+	      if (TARGET_AVX512F)
+		gen = gen_avx512f_vec_dupv8di;
+	      break;
 	    /* For other modes prefer other shuffles this function creates.  */
 	    default: break;
 	    }
@@ -43216,6 +43368,14 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
     mode = V8DImode;
   else if (mode == V16SFmode)
     mode = V16SImode;
+  else if (mode == V4DFmode)
+    mode = V4DImode;
+  else if (mode == V2DFmode)
+    mode = V2DImode;
+  else if (mode == V8SFmode)
+    mode = V8SImode;
+  else if (mode == V4SFmode)
+    mode = V4SImode;
   for (i = 0; i < nelt; ++i)
     vec[i] = GEN_INT (d->perm[i]);
   rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec));
@@ -44759,6 +44919,16 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
     return true;
 
   /* Try sequences of two instructions.  */
+    /* ix86_expand_vec_perm_vpermi2 is also called from
+     * ix86_expand_vec_perm.  So it doesn't take d as parameter.
+     * Construct needed params.  */
+    rtx vec[64];
+    int i;
+    for (i = 0; i < d->nelt; ++i)
+      vec[i] = GEN_INT (d->perm[i]);
+    rtx sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, vec));
+    if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, sel, d->op1))
+      return true;
 
   if (expand_vec_perm_pshuflw_pshufhw (d))
     return true;
@@ -44933,7 +45103,8 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
   /* Given sufficient ISA support we can just return true here
      for selected vector modes.  */
   if (d.vmode == V16SImode || d.vmode == V16SFmode
-      || d.vmode == V8DFmode || d.vmode == V8DImode)
+      || d.vmode == V8DFmode || d.vmode == V8DImode
+      || d.vmode == V32HImode || d.vmode == V64QImode)
     /* All implementable with a single vpermi2 insn.  */
     return true;
   if (GET_MODE_SIZE (d.vmode) == 16)
@@ -45066,6 +45237,11 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
       gen_il = gen_avx2_interleave_lowv32qi;
       gen_ih = gen_avx2_interleave_highv32qi;
       break;
+    case V64QImode:
+      himode = V32HImode;
+      gen_il = gen_avx512bw_interleave_lowv64qi;
+      gen_ih = gen_avx512bw_interleave_highv64qi;
+      break;
     default:
       gcc_unreachable ();
     }
@@ -45126,7 +45302,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
     {
       /* For SSE2, we used an full interleave, so the desired
 	 results are in the even elements.  */
-      for (i = 0; i < 32; ++i)
+      for (i = 0; i < 64; ++i)
 	d.perm[i] = i * 2;
     }
   else
@@ -45134,7 +45310,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
       /* For AVX, the interleave used above was not cross-lane.  So the
 	 extraction is evens but with the second and third quarter swapped.
 	 Happily, that is even one insn shorter than even extraction.  */
-      for (i = 0; i < 32; ++i)
+      for (i = 0; i < 64; ++i)
 	d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
     }
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index bb6372a..d3e9635 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -298,7 +298,7 @@
   [V8DI (V4DI "TARGET_AVX512VL")])
 
 (define_mode_iterator VI1_AVX2
-  [(V32QI "TARGET_AVX2") V16QI])
+  [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI])
 
 (define_mode_iterator VI2_AVX2
   [(V16HI "TARGET_AVX2") V8HI])
@@ -10621,7 +10621,8 @@
    (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
    (V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")
    (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
-   (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")])
+   (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
+   (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512BW")])
 
 (define_expand "vec_perm<mode>"
   [(match_operand:VEC_PERM_AVX2 0 "register_operand")
@@ -10642,7 +10643,8 @@
    (V8SI "TARGET_AVX") (V4DI "TARGET_AVX")
    (V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
    (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
-   (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")])
+   (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
+   (V32HI "TARGET_AVX512BW")])
 
 (define_expand "vec_perm_const<mode>"
   [(match_operand:VEC_PERM_CONST 0 "register_operand")
@@ -13559,21 +13561,21 @@
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
    (set_attr "mode" "DI")])
 
-(define_insn "<ssse3_avx2>_pshufb<mode>3"
-  [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
+(define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>"
+  [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,v")
 	(unspec:VI1_AVX2
-	  [(match_operand:VI1_AVX2 1 "register_operand" "0,x")
-	   (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,xm")]
+	  [(match_operand:VI1_AVX2 1 "register_operand" "0,v")
+	   (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,vm")]
 	  UNSPEC_PSHUFB))]
-  "TARGET_SSSE3"
+  "TARGET_SSSE3 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
   "@
    pshufb\t{%2, %0|%0, %2}
-   vpshufb\t{%2, %1, %0|%0, %1, %2}"
+   vpshufb\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
   [(set_attr "isa" "noavx,avx")
    (set_attr "type" "sselog1")
    (set_attr "prefix_data16" "1,*")
    (set_attr "prefix_extra" "1")
-   (set_attr "prefix" "orig,vex")
+   (set_attr "prefix" "orig,maybe_evex")
    (set_attr "btver2_decode" "vector,vector")
    (set_attr "mode" "<sseinsnmode>")])
 



More information about the Gcc-patches mailing list