This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH i386 AVX512] [63/n] Add vpshufb, perm autogen.


On Mon, Oct 06, 2014 at 04:55:28PM +0400, Kirill Yukhin wrote:
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -21364,20 +21364,113 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
>    enum machine_mode mode = GET_MODE (op0);
>    switch (mode)
>      {
> +      /* There is no byte version of vpermi2.  So we use vpermi2w.  */
> +    case V64QImode:
> +      if (!TARGET_AVX512BW)
> +	return false;
> +      rtx mask_lowpart, op0_lowpart, op1_lowpart;
> +      rtx perm_lo, perm_hi, tmp, res_lo, tmp2, res_hi;
> +
> +      mask_lowpart = gen_lowpart (V32HImode, force_reg (V64QImode, mask));
> +      op0_lowpart = gen_lowpart (V32HImode, op0);
> +      op1_lowpart = gen_lowpart (V32HImode, op1);
> +      tmp = gen_reg_rtx (V32HImode);
> +      tmp2 = gen_reg_rtx (V32HImode);
> +      perm_lo = gen_reg_rtx (V32HImode);
> +      perm_hi = gen_reg_rtx (V32HImode);
> +      res_lo = gen_reg_rtx (V32HImode);
> +      res_hi = gen_reg_rtx (V32HImode);
> +
> +      emit_insn (gen_ashlv32hi3 (tmp, mask_lowpart, GEN_INT (8)));
> +      emit_insn (gen_ashrv32hi3 (perm_lo, tmp, GEN_INT (9)));
> +      emit_insn (gen_ashrv32hi3 (perm_hi, mask_lowpart, GEN_INT (9)));
> +      emit_insn (gen_avx512bw_vpermi2varv32hi3 (res_lo, op0_lowpart,
> +						perm_lo, op1_lowpart));
> +      emit_insn (gen_avx512bw_vpermi2varv32hi3 (tmp2, op0_lowpart,
> +						perm_hi, op1_lowpart));
> +      emit_insn (gen_ashlv32hi3 (res_hi, tmp2, GEN_INT (8)));
> +      emit_insn (gen_avx512bw_blendmv64qi (target, gen_lowpart (V64QImode, res_lo),
> +					   gen_lowpart (V64QImode, res_hi),
> +					   force_reg (DImode, GEN_INT (0xAAAAAAAAAAAAAAAALL))));
> +      return true;

I believe this case doesn't belong to this function, other than this
case ix86_expand_vec_perm_vpermi2 emits always just a single insn, and
so it should always do that, and there should be a separate function
that expands the worst case of V64QImode full 2 operand permutation.
See my previous mail, IMHO it is doable with 5 instructions rather than 7.
And IMHO we should have a separate function which emits that, supposedly
one for the constant permutations, one for the variable case (perhaps
then your 7 insn sequence is best?).

Also, IMHO rather than building a CONST_VECTOR ahead in each of the callers,
supposedly ix86_expand_vec_perm_vpermi2 could take the arguments it takes
right now plus D, either D would be NULL (then it would behave as now), or
SEL would be NULL, then it would create a CONST_VECTOR on the fly if needed.
I.e. the function would start with a switch that would just contain the
if (...)
return false; 
hunks plus break; for the success case, then code to generate CONST_VECTOR
if sel is NULL_RTX from d, and finally another switch with just the emit
cases.  Or, the first switch could just set a function pointer before
break, and just use one common
emit_insn (gen (target, op0, force_reg (vmode, mask), op1));

> +    case V8HImode:
> +      if (!TARGET_AVX512VL)
> +	return false;
> +      emit_insn (gen_avx512vl_vpermi2varv8hi3 (target, op0,
> +					       force_reg (V8HImode, mask), op1));
> +      return true;
> +    case V16HImode:
> +      if (!TARGET_AVX512VL)
> +	return false;
> +      emit_insn (gen_avx512vl_vpermi2varv16hi3 (target, op0,
> +					     force_reg (V16HImode, mask), op1));
> +      return true;

Aren't these two insns there only if both TARGET_AVX512VL && TARGET_AVX512BW?
I mean, the ISA pdf mentions both of the CPUID flags simultaneously, and I
think neither of these depends on the other one in GCC.  That's unlike insns
where CPUID AVX512VL and AVX512F are mentioned together, because in GCC
AVX512VL depends on AVX512F.

> @@ -42662,7 +42764,12 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
>  
>    if (d->one_operand_p)
>      return false;
> -  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
> +  if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 &&
> +      GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4)

Formatting, && belongs on the second line.

> +    ;
> +  else if (TARGET_AVX512VL)

I'd add && GET_MODE_SIZE (GET_MODE_INNER (vmode) == 64 here.
AVX512VL is not going to handle 64-bit vectors, or 1024-bit ones,
and the == 32 and == 16 cases are handled because AVX512VL implies
TARGET_AVX2 and TARGET_SSE4_1, doesn't it?

> @@ -43012,6 +43125,17 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
>  		  return false;
>  	    }
>  	}
> +      else if (GET_MODE_SIZE (d->vmode) == 64)
> +	{
> +	  if (!TARGET_AVX512BW)
> +	    return false;
> +	  if (vmode == V64QImode)
> +	    {
> +	      for (i = 0; i < nelt; ++i)
> +		if ((d->perm[i] ^ i) & (nelt / 4))
> +		  return false;

Missing comment, I'd duplicate the
              /* vpshufb only works intra lanes, it is not
                 possible to shuffle bytes in between the lanes.  */
comment there.

> @@ -43109,12 +43237,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
>  	  rtx (*gen) (rtx, rtx) = NULL;
>  	  switch (d->vmode)
>  	    {
> +	    case V64QImode:
> +	      if (TARGET_AVX512VL)

VL?  Isn't that BW?

> +		gen = gen_avx512bw_vec_dupv64qi;
> +	      break;
>  	    case V32QImode:
>  	      gen = gen_avx2_pbroadcastv32qi_1;
>  	      break;
> +	    case V32HImode:
> +	      if (TARGET_AVX512VL)

Ditto.

> @@ -43216,6 +43368,14 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
>      mode = V8DImode;
>    else if (mode == V16SFmode)
>      mode = V16SImode;
> +  else if (mode == V4DFmode)
> +    mode = V4DImode;
> +  else if (mode == V2DFmode)
> +    mode = V2DImode;
> +  else if (mode == V8SFmode)
> +    mode = V8SImode;
> +  else if (mode == V4SFmode)
> +    mode = V4SImode;
>    for (i = 0; i < nelt; ++i)
>      vec[i] = GEN_INT (d->perm[i]);
>    rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec));

See above comment about CONST_VECTOR.

> @@ -44759,6 +44919,16 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>      return true;
>  
>    /* Try sequences of two instructions.  */
> +    /* ix86_expand_vec_perm_vpermi2 is also called from
> +     * ix86_expand_vec_perm.  So it doesn't take d as parameter.
> +     * Construct needed params.  */
> +    rtx vec[64];
> +    int i;
> +    for (i = 0; i < d->nelt; ++i)
> +      vec[i] = GEN_INT (d->perm[i]);
> +    rtx sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, vec));
> +    if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, sel, d->op1))
> +      return true;
>  
>    if (expand_vec_perm_pshuflw_pshufhw (d))
>      return true;

I don't understand this.  Doesn't ix86_expand_vec_perm_vpermi2 generate
(except for the V64QI case discussed above) a single insn?  Then
expand_vec_perm_1 should have handled that already, so this is just a waste
of resources here.

> @@ -44933,7 +45103,8 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
>    /* Given sufficient ISA support we can just return true here
>       for selected vector modes.  */
>    if (d.vmode == V16SImode || d.vmode == V16SFmode
> -      || d.vmode == V8DFmode || d.vmode == V8DImode)
> +      || d.vmode == V8DFmode || d.vmode == V8DImode
> +      || d.vmode == V32HImode || d.vmode == V64QImode)
>      /* All implementable with a single vpermi2 insn.  */
>      return true;

1) Shouldn't this be guarded with TARGET_AVX512F &&
   and in the V32HImode/V64QImode also with TARGET_AVX512BW?
   The comment is not correct for V64QImode.

2) For TARGET_AVX512VL, vpermi2 can handle also smaller mode sizes.
   Perhaps it would be best to turn this into
  switch (d.vmode)
    {
    case V16SImode:
    case V16SFmode:
    case V8DFmode:
    case V8DImode:
      if (TARGET_AVX512F)
	/* All implementable with a single vpermi2 insn.  */	
	return true;
      break;
    case V32HImode:
      if (TARGET_AVX512BW)
	/* Implementable with a single vpermi2 insn.  */
	return true;
      break;
    case V64HImode:
      if (TARGET_AVX512BW)
	/* Implementable with 2 vpermi2w, 2 vpshufb and one vpor insns.  */
	return true;
      break;
    case V8SImode:
    case V8SFmode:
    case V4DFmode:
    case V4DImode:
      if (TARGET_AVX512VL)
	/* Implementable with a single vpermi2 insn.  */
	return true;
      break;
    case V16HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	/* Implementable with a single vpermi2 insn.  */
	return true;
      if (TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case V32QImode:
      if (TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case V4SImode:
    case V4SFmode:
    case V8HImode:
    case V16QImode:
      /* All implementable with a single vpperm insn.  */
      if (TARGET_XOP)
        return true;
      /* All implementable with 2 pshufb + 1 ior.  */
      if (TARGET_SSSE3)
        return true;
      break;
    case V2DImode:
    case V2DFmode:
      /* All implementable with shufpd or unpck[lh]pd.  */
      return true;
    }

Now, for V8SI/V8SF/V4DI/V4DF, I wonder if we have (for either AVX or AVX2)
any expanders that guarantee we generate some sequence for all possible
2 operand constant permutations.  I think ix86_expand_vec_perm is able
to emit the non-constant permutations for all of these, so in theory
we should have an upper bound for all these.

	Jakub


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]