[PATCH] i386, expand: Optimize also 256-bit and 512-bit permutations as vpmovzx if possible [PR95905]

Uros Bizjak <ubizjak@gmail.com>
Wed Jan 13 09:04:40 GMT 2021


On Wed, Jan 13, 2021 at 8:13 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> The following patch implements what I've talked about, i.e. it no longer
> forces the operands of vec_perm_const into registers in the generic code,
> but lets each of the (currently 8) targets force them into registers
> individually, giving the targets better control over whether and when to
> do that and allowing them to do something special with particular operands.
> It then adds define_insn_and_split patterns that turn the 256-bit and
> 512-bit permutations into vpmovzx* (only the bw, wd and dq cases; in theory
> we could also add define_insn_and_split patterns for the bd, bq and wq
> cases).
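
For reference, a minimal example of the 256-bit shuffles the new patterns are
meant to catch (it mirrors f3 in the new pr95905-3.c test further down):

    typedef unsigned int V3 __attribute__((vector_size (32)));

    V3
    f3 (V3 x)
    {
      /* Interleave the low four elements of x with zeros, i.e. zero-extend
         the low V4SI half of x into 64-bit lanes; with -O2 -mavx2 this is
         expected to become a single vpmovzxdq.  */
      return __builtin_shuffle (x, (V3) {}, (V3) { 0, 8, 1, 9, 2, 10, 3, 11 });
    }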
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2021-01-13  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/95905
>         * optabs.c (expand_vec_perm_const): Don't force v0 and v1 into
>         registers before calling targetm.vectorize.vec_perm_const, only after
>         that.
>         * config/i386/i386-expand.c (ix86_vectorize_vec_perm_const): Handle
>         a two-argument permutation when one operand is a zero vector, and
>         only force operands into registers after that.
>         * config/i386/sse.md (*avx2_zero_extendv16qiv16hi2_1,
>         *avx512bw_zero_extendv32qiv32hi2_1, *avx512f_zero_extendv16hiv16si2_1,
>         *avx2_zero_extendv8hiv8si2_1, *avx512f_zero_extendv8siv8di2_1,
>         *avx2_zero_extendv4siv4di2_1): New define_insn_and_split patterns.
>         * config/mips/mips.c (mips_vectorize_vec_perm_const): Force operands
>         into registers.
>         * config/arm/arm.c (arm_vectorize_vec_perm_const): Likewise.
>         * config/sparc/sparc.c (sparc_vectorize_vec_perm_const): Likewise.
>         * config/ia64/ia64.c (ia64_vectorize_vec_perm_const): Likewise.
>         * config/aarch64/aarch64.c (aarch64_vectorize_vec_perm_const): Likewise.
>         * config/rs6000/rs6000.c (rs6000_vectorize_vec_perm_const): Likewise.
>         * config/gcn/gcn.c (gcn_vectorize_vec_perm_const): Likewise.  Use std::swap.
>
>         * gcc.target/i386/pr95905-2.c: Use scan-assembler-times instead of
>         scan-assembler.  Add tests with zero vector as first __builtin_shuffle
>         operand.
>         * gcc.target/i386/pr95905-3.c: New test.
>         * gcc.target/i386/pr95905-4.c: New test.

LGTM for the x86 part.

Thanks,
Uros.

>
> --- gcc/optabs.c.jj     2021-01-04 10:25:38.632236100 +0100
> +++ gcc/optabs.c        2021-01-12 14:46:44.719557815 +0100
> @@ -6070,11 +6070,8 @@ expand_vec_perm_const (machine_mode mode
>
>    if (targetm.vectorize.vec_perm_const != NULL)
>      {
> -      v0 = force_reg (mode, v0);
>        if (single_arg_p)
>         v1 = v0;
> -      else
> -       v1 = force_reg (mode, v1);
>
>        if (targetm.vectorize.vec_perm_const (mode, target, v0, v1, indices))
>         return target;
> @@ -6095,6 +6092,11 @@ expand_vec_perm_const (machine_mode mode
>         return gen_lowpart (mode, target_qi);
>      }
>
> +  v0 = force_reg (mode, v0);
> +  if (single_arg_p)
> +    v1 = v0;
> +  v1 = force_reg (mode, v1);
> +
>    /* Otherwise expand as a fully variable permuation.  */
>
>    /* The optabs are only defined for selectors with the same width
> --- gcc/config/i386/i386-expand.c.jj    2021-01-12 11:01:51.189386077 +0100
> +++ gcc/config/i386/i386-expand.c       2021-01-12 15:43:55.673095807 +0100
> @@ -19929,6 +19929,33 @@ ix86_vectorize_vec_perm_const (machine_m
>
>    two_args = canonicalize_perm (&d);
>
> +  /* If one of the operands is a zero vector, try to match pmovzx.  */
> +  if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
> +    {
> +      struct expand_vec_perm_d dzero = d;
> +      if (d.op0 == CONST0_RTX (vmode))
> +       {
> +         d.op1 = dzero.op1 = force_reg (vmode, d.op1);
> +         std::swap (dzero.op0, dzero.op1);
> +         for (i = 0; i < nelt; ++i)
> +           dzero.perm[i] ^= nelt;
> +       }
> +      else
> +       d.op0 = dzero.op0 = force_reg (vmode, d.op0);
> +
> +      if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
> +                                 dzero.perm, nelt, dzero.testing_p))
> +       return true;
> +    }
> +
> +  /* Force operands into registers.  */
> +  rtx nop0 = force_reg (vmode, d.op0);
> +  if (d.op0 == d.op1)
> +    d.op1 = nop0;
> +  d.op0 = nop0;
> +  if (d.op0 != d.op1)
> +    d.op1 = force_reg (vmode, d.op1);
> +
>    if (ix86_expand_vec_perm_const_1 (&d))
>      return true;
>
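
To illustrate the index remapping above: when the zero vector is the first
shuffle operand, the operands are swapped and each selector element gets its
operand-select bit flipped by XOR-ing it with nelt.  A standalone sketch (not
part of the patch), using the f9 selector from pr95905-2.c below, where
nelt == 4:

    #include <stdio.h>

    int
    main (void)
    {
      unsigned int nelt = 4;
      /* Selector of __builtin_shuffle ((V3) {}, x, (V3) { 4, 0, 5, 1 }).  */
      unsigned int perm[4] = { 4, 0, 5, 1 };
      /* After swapping the operands, flip the operand bit in each index.  */
      for (unsigned int i = 0; i < nelt; ++i)
        printf ("%u ", perm[i] ^ nelt);
      /* Prints "0 4 1 5": x interleaved with zero, i.e. the v?pmovzxdq form
         that expand_vselect_vconcat can then match.  */
      printf ("\n");
      return 0;
    }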
> --- gcc/config/i386/sse.md.jj   2021-01-12 14:30:32.688546846 +0100
> +++ gcc/config/i386/sse.md      2021-01-12 15:40:29.018402527 +0100
> @@ -17611,6 +17611,23 @@ (define_insn "avx2_<code>v16qiv16hi2<mas
>     (set_attr "prefix" "maybe_evex")
>     (set_attr "mode" "OI")])
>
> +(define_insn_and_split "*avx2_zero_extendv16qiv16hi2_1"
> +  [(set (match_operand:V32QI 0 "register_operand" "=v")
> +       (vec_select:V32QI
> +         (vec_concat:V64QI
> +           (match_operand:V32QI 1 "nonimmediate_operand" "vm")
> +           (match_operand:V32QI 2 "const0_operand" "C"))
> +         (match_parallel 3 "pmovzx_parallel"
> +           [(match_operand 4 "const_int_operand" "n")])))]
> +  "TARGET_AVX2"
> +  "#"
> +  "&& reload_completed"
> +  [(set (match_dup 0) (zero_extend:V16HI (match_dup 1)))]
> +{
> +  operands[0] = lowpart_subreg (V16HImode, operands[0], V32QImode);
> +  operands[1] = lowpart_subreg (V16QImode, operands[1], V32QImode);
> +})
> +
>  (define_expand "<insn>v16qiv16hi2"
>    [(set (match_operand:V16HI 0 "register_operand")
>         (any_extend:V16HI
> @@ -17628,6 +17645,23 @@ (define_insn "avx512bw_<code>v32qiv32hi2
>     (set_attr "prefix" "evex")
>     (set_attr "mode" "XI")])
>
> +(define_insn_and_split "*avx512bw_zero_extendv32qiv32hi2_1"
> +  [(set (match_operand:V64QI 0 "register_operand" "=v")
> +       (vec_select:V64QI
> +         (vec_concat:V128QI
> +           (match_operand:V64QI 1 "nonimmediate_operand" "vm")
> +           (match_operand:V64QI 2 "const0_operand" "C"))
> +         (match_parallel 3 "pmovzx_parallel"
> +           [(match_operand 4 "const_int_operand" "n")])))]
> +  "TARGET_AVX512BW"
> +  "#"
> +  "&& reload_completed"
> +  [(set (match_dup 0) (zero_extend:V32HI (match_dup 1)))]
> +{
> +  operands[0] = lowpart_subreg (V32HImode, operands[0], V64QImode);
> +  operands[1] = lowpart_subreg (V32QImode, operands[1], V64QImode);
> +})
> +
>  (define_expand "<insn>v32qiv32hi2"
>    [(set (match_operand:V32HI 0 "register_operand")
>         (any_extend:V32HI
> @@ -17883,6 +17917,23 @@ (define_expand "<insn>v16hiv16si2"
>           (match_operand:V16HI 1 "nonimmediate_operand")))]
>    "TARGET_AVX512F")
>
> +(define_insn_and_split "avx512f_zero_extendv16hiv16si2_1"
> +  [(set (match_operand:V32HI 0 "register_operand" "=v")
> +       (vec_select:V32HI
> +         (vec_concat:V64HI
> +           (match_operand:V32HI 1 "nonimmediate_operand" "vm")
> +           (match_operand:V32HI 2 "const0_operand" "C"))
> +         (match_parallel 3 "pmovzx_parallel"
> +           [(match_operand 4 "const_int_operand" "n")])))]
> +  "TARGET_AVX512F"
> +  "#"
> +  "&& reload_completed"
> +  [(set (match_dup 0) (zero_extend:V16SI (match_dup 1)))]
> +{
> +  operands[0] = lowpart_subreg (V16SImode, operands[0], V32HImode);
> +  operands[1] = lowpart_subreg (V16HImode, operands[1], V32HImode);
> +})
> +
>  (define_insn "avx2_<code>v8hiv8si2<mask_name>"
>    [(set (match_operand:V8SI 0 "register_operand" "=v")
>         (any_extend:V8SI
> @@ -17900,6 +17951,23 @@ (define_expand "<insn>v8hiv8si2"
>           (match_operand:V8HI 1 "nonimmediate_operand")))]
>    "TARGET_AVX2")
>
> +(define_insn_and_split "avx2_zero_extendv8hiv8si2_1"
> +  [(set (match_operand:V16HI 0 "register_operand" "=v")
> +       (vec_select:V16HI
> +         (vec_concat:V32HI
> +           (match_operand:V16HI 1 "nonimmediate_operand" "vm")
> +           (match_operand:V16HI 2 "const0_operand" "C"))
> +         (match_parallel 3 "pmovzx_parallel"
> +           [(match_operand 4 "const_int_operand" "n")])))]
> +  "TARGET_AVX2"
> +  "#"
> +  "&& reload_completed"
> +  [(set (match_dup 0) (zero_extend:V8SI (match_dup 1)))]
> +{
> +  operands[0] = lowpart_subreg (V8SImode, operands[0], V16HImode);
> +  operands[1] = lowpart_subreg (V8HImode, operands[1], V16HImode);
> +})
> +
>  (define_insn "sse4_1_<code>v4hiv4si2<mask_name>"
>    [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v")
>         (any_extend:V4SI
> @@ -18275,6 +18343,23 @@ (define_insn "avx512f_<code>v8siv8di2<ma
>     (set_attr "prefix" "evex")
>     (set_attr "mode" "XI")])
>
> +(define_insn_and_split "*avx512f_zero_extendv8siv8di2_1"
> +  [(set (match_operand:V16SI 0 "register_operand" "=v")
> +       (vec_select:V16SI
> +         (vec_concat:V32SI
> +           (match_operand:V16SI 1 "nonimmediate_operand" "vm")
> +           (match_operand:V16SI 2 "const0_operand" "C"))
> +         (match_parallel 3 "pmovzx_parallel"
> +           [(match_operand 4 "const_int_operand" "n")])))]
> +  "TARGET_AVX512F"
> +  "#"
> +  "&& reload_completed"
> +  [(set (match_dup 0) (zero_extend:V8DI (match_dup 1)))]
> +{
> +  operands[0] = lowpart_subreg (V8DImode, operands[0], V16SImode);
> +  operands[1] = lowpart_subreg (V8SImode, operands[1], V16SImode);
> +})
> +
>  (define_expand "<insn>v8siv8di2"
>    [(set (match_operand:V8DI 0 "register_operand" "=v")
>         (any_extend:V8DI
> @@ -18292,6 +18377,23 @@ (define_insn "avx2_<code>v4siv4di2<mask_
>     (set_attr "prefix_extra" "1")
>     (set_attr "mode" "OI")])
>
> +(define_insn_and_split "*avx2_zero_extendv4siv4di2_1"
> +  [(set (match_operand:V8SI 0 "register_operand" "=v")
> +       (vec_select:V8SI
> +         (vec_concat:V16SI
> +           (match_operand:V8SI 1 "nonimmediate_operand" "vm")
> +           (match_operand:V8SI 2 "const0_operand" "C"))
> +         (match_parallel 3 "pmovzx_parallel"
> +           [(match_operand 4 "const_int_operand" "n")])))]
> +  "TARGET_AVX2"
> +  "#"
> +  "&& reload_completed"
> +  [(set (match_dup 0) (zero_extend:V4DI (match_dup 1)))]
> +{
> +  operands[0] = lowpart_subreg (V4DImode, operands[0], V8SImode);
> +  operands[1] = lowpart_subreg (V4SImode, operands[1], V8SImode);
> +})
> +
>  (define_expand "<insn>v4siv4di2"
>    [(set (match_operand:V4DI 0 "register_operand" "=v")
>         (any_extend:V4DI
> --- gcc/config/mips/mips.c.jj   2021-01-04 10:25:41.592202583 +0100
> +++ gcc/config/mips/mips.c      2021-01-12 15:06:07.608535692 +0100
> @@ -21624,6 +21624,15 @@ mips_vectorize_vec_perm_const (machine_m
>    bool ok;
>
>    d.target = target;
> +  if (op0)
> +    {
> +      rtx nop0 = force_reg (vmode, op0);
> +      if (op0 == op1)
> +        op1 = nop0;
> +      op0 = nop0;
> +    }
> +  if (op1 && op0 != op1)
> +    op1 = force_reg (vmode, op1);
>    d.op0 = op0;
>    d.op1 = op1;
>
> --- gcc/config/arm/arm.c.jj     2021-01-04 10:25:44.469170006 +0100
> +++ gcc/config/arm/arm.c        2021-01-12 15:02:24.333038536 +0100
> @@ -31482,6 +31482,15 @@ arm_vectorize_vec_perm_const (machine_mo
>      return false;
>
>    d.target = target;
> +  if (op0)
> +    {
> +      rtx nop0 = force_reg (vmode, op0);
> +      if (op0 == op1)
> +        op1 = nop0;
> +      op0 = nop0;
> +    }
> +  if (op1 && op0 != op1)
> +    op1 = force_reg (vmode, op1);
>    d.op0 = op0;
>    d.op1 = op1;
>
> --- gcc/config/sparc/sparc.c.jj 2021-01-04 10:25:45.662156497 +0100
> +++ gcc/config/sparc/sparc.c    2021-01-12 15:10:43.491443165 +0100
> @@ -12942,6 +12942,13 @@ sparc_vectorize_vec_perm_const (machine_
>    if (vmode != V8QImode)
>      return false;
>
> +  rtx nop0 = force_reg (vmode, op0);
> +  if (op0 == op1)
> +    op1 = nop0;
> +  op0 = nop0;
> +  if (op0 != op1)
> +    op1 = force_reg (vmode, op1);
> +
>    unsigned int i, mask;
>    for (i = mask = 0; i < 8; ++i)
>      mask |= (sel[i] & 0xf) << (28 - i*4);
> --- gcc/config/ia64/ia64.c.jj   2021-01-04 10:25:45.808154844 +0100
> +++ gcc/config/ia64/ia64.c      2021-01-12 15:03:26.704339360 +0100
> @@ -11759,6 +11759,15 @@ ia64_vectorize_vec_perm_const (machine_m
>    unsigned int i, nelt, which;
>
>    d.target = target;
> +  if (op0)
> +    {
> +      rtx nop0 = force_reg (vmode, op0);
> +      if (op0 == op1)
> +        op1 = nop0;
> +      op0 = nop0;
> +    }
> +  if (op1 && op0 != op1)
> +    op1 = force_reg (vmode, op1);
>    d.op0 = op0;
>    d.op1 = op1;
>
> --- gcc/config/aarch64/aarch64.c.jj     2021-01-05 13:53:53.291683826 +0100
> +++ gcc/config/aarch64/aarch64.c        2021-01-12 14:51:26.645401653 +0100
> @@ -21020,8 +21020,11 @@ aarch64_vectorize_vec_perm_const (machin
>    d.vmode = vmode;
>    d.vec_flags = aarch64_classify_vector_mode (d.vmode);
>    d.target = target;
> -  d.op0 = op0;
> -  d.op1 = op1;
> +  d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
> +  if (op0 == op1)
> +    d.op1 = d.op0;
> +  else
> +    d.op1 = op1 ? force_reg (vmode, op1) : NULL_RTX;
>    d.testing_p = !target;
>
>    if (!d.testing_p)
> --- gcc/config/rs6000/rs6000.c.jj       2021-01-04 10:25:47.037140928 +0100
> +++ gcc/config/rs6000/rs6000.c  2021-01-12 15:09:32.866234841 +0100
> @@ -22946,6 +22946,16 @@ rs6000_vectorize_vec_perm_const (machine
>    if (TARGET_ALTIVEC && testing_p)
>      return true;
>
> +  if (op0)
> +    {
> +      rtx nop0 = force_reg (vmode, op0);
> +      if (op0 == op1)
> +        op1 = nop0;
> +      op0 = nop0;
> +    }
> +  if (op1 && op0 != op1)
> +    op1 = force_reg (vmode, op1);
> +
>    /* Check for ps_merge* or xxpermdi insns.  */
>    if ((vmode == V2DFmode || vmode == V2DImode) && VECTOR_MEM_VSX_P (vmode))
>      {
> --- gcc/config/gcn/gcn.c.jj     2021-01-04 10:25:45.939153361 +0100
> +++ gcc/config/gcn/gcn.c        2021-01-12 14:56:17.394146737 +0100
> @@ -3986,13 +3986,14 @@ gcn_vectorize_vec_perm_const (machine_mo
>    for (unsigned int i = 0; i < nelt; ++i)
>      perm[i] = sel[i] & (2 * nelt - 1);
>
> +  src0 = force_reg (vmode, src0);
> +  src1 = force_reg (vmode, src1);
> +
>    /* Make life a bit easier by swapping operands if necessary so that
>       the first element always comes from src0.  */
>    if (perm[0] >= nelt)
>      {
> -      rtx temp = src0;
> -      src0 = src1;
> -      src1 = temp;
> +      std::swap (src0, src1);
>
>        for (unsigned int i = 0; i < nelt; ++i)
>         if (perm[i] < nelt)
> --- gcc/testsuite/gcc.target/i386/pr95905-2.c.jj        2021-01-12 13:58:39.820222075 +0100
> +++ gcc/testsuite/gcc.target/i386/pr95905-2.c   2021-01-12 15:50:05.796964412 +0100
> @@ -1,9 +1,9 @@
>  /* PR target/95905 */
>  /* { dg-do compile } */
>  /* { dg-options "-O2 -msse4.1" } */
> -/* { dg-final { scan-assembler "\tv?pmovzxbw\t" } } */
> -/* { dg-final { scan-assembler "\tv?pmovzxwd\t" } } */
> -/* { dg-final { scan-assembler "\tv?pmovzxdq\t" } } */
> +/* { dg-final { scan-assembler-times "\tv?pmovzxbw\t" 4 } } */
> +/* { dg-final { scan-assembler-times "\tv?pmovzxwd\t" 4 } } */
> +/* { dg-final { scan-assembler-times "\tv?pmovzxdq\t" 4 } } */
>
>  typedef unsigned char V1 __attribute__((vector_size (16)));
>  typedef unsigned short V2 __attribute__((vector_size (16)));
> @@ -44,3 +44,39 @@ f6 (V3 *x)
>  {
>    return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 4, 1, 5 });
>  }
> +
> +V1
> +f7 (V1 x)
> +{
> +  return __builtin_shuffle ((V1) {}, x, (V1) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
> +}
> +
> +V2
> +f8 (V2 x)
> +{
> +  return __builtin_shuffle ((V2) {}, x, (V2) { 8, 0, 9, 1, 10, 2, 11, 3 });
> +}
> +
> +V3
> +f9 (V3 x)
> +{
> +  return __builtin_shuffle ((V3) {}, x, (V3) { 4, 0, 5, 1 });
> +}
> +
> +V1
> +f10 (V1 *x)
> +{
> +  return __builtin_shuffle ((V1) {}, *x, (V1) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
> +}
> +
> +V2
> +f11 (V2 *x)
> +{
> +  return __builtin_shuffle ((V2) {}, *x, (V2) { 8, 0, 9, 1, 10, 2, 11, 3 });
> +}
> +
> +V3
> +f12 (V3 *x)
> +{
> +  return __builtin_shuffle ((V3) {}, *x, (V3) { 4, 0, 5, 1 });
> +}
> --- gcc/testsuite/gcc.target/i386/pr95905-3.c.jj        2021-01-12 15:53:05.627957108 +0100
> +++ gcc/testsuite/gcc.target/i386/pr95905-3.c   2021-01-12 15:52:32.393328070 +0100
> @@ -0,0 +1,82 @@
> +/* PR target/95905 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2" } */
> +/* { dg-final { scan-assembler-times "\tvpmovzxbw\t" 4 } } */
> +/* { dg-final { scan-assembler-times "\tvpmovzxwd\t" 4 } } */
> +/* { dg-final { scan-assembler-times "\tvpmovzxdq\t" 4 } } */
> +
> +typedef unsigned char V1 __attribute__((vector_size (32)));
> +typedef unsigned short V2 __attribute__((vector_size (32)));
> +typedef unsigned int V3 __attribute__((vector_size (32)));
> +
> +V1
> +f1 (V1 x)
> +{
> +  return __builtin_shuffle (x, (V1) {}, (V1) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
> +}
> +
> +V2
> +f2 (V2 x)
> +{
> +  return __builtin_shuffle (x, (V2) {}, (V2) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
> +}
> +
> +V3
> +f3 (V3 x)
> +{
> +  return __builtin_shuffle (x, (V3) {}, (V3) { 0, 8, 1, 9, 2, 10, 3, 11 });
> +}
> +
> +V1
> +f4 (V1 *x)
> +{
> +  return __builtin_shuffle (*x, (V1) {}, (V1) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
> +}
> +
> +V2
> +f5 (V2 *x)
> +{
> +  return __builtin_shuffle (*x, (V2) {}, (V2) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
> +}
> +
> +V3
> +f6 (V3 *x)
> +{
> +  return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 8, 1, 9, 2, 10, 3, 11 });
> +}
> +
> +V1
> +f7 (V1 x)
> +{
> +  return __builtin_shuffle ((V1) {}, x, (V1) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
> +}
> +
> +V2
> +f8 (V2 x)
> +{
> +  return __builtin_shuffle ((V2) {}, x, (V2) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
> +}
> +
> +V3
> +f9 (V3 x)
> +{
> +  return __builtin_shuffle ((V3) {}, x, (V3) { 8, 0, 9, 1, 10, 2, 11, 3 });
> +}
> +
> +V1
> +f10 (V1 *x)
> +{
> +  return __builtin_shuffle ((V1) {}, *x, (V1) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
> +}
> +
> +V2
> +f11 (V2 *x)
> +{
> +  return __builtin_shuffle ((V2) {}, *x, (V2) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
> +}
> +
> +V3
> +f12 (V3 *x)
> +{
> +  return __builtin_shuffle ((V3) {}, *x, (V3) { 8, 0, 9, 1, 10, 2, 11, 3 });
> +}
> --- gcc/testsuite/gcc.target/i386/pr95905-4.c.jj        2021-01-12 15:55:30.065343628 +0100
> +++ gcc/testsuite/gcc.target/i386/pr95905-4.c   2021-01-12 15:55:01.957657667 +0100
> @@ -0,0 +1,82 @@
> +/* PR target/95905 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512bw" } */
> +/* { dg-final { scan-assembler-times "\tvpmovzxbw\t" 4 } } */
> +/* { dg-final { scan-assembler-times "\tvpmovzxwd\t" 4 } } */
> +/* { dg-final { scan-assembler-times "\tvpmovzxdq\t" 4 } } */
> +
> +typedef unsigned char V1 __attribute__((vector_size (64)));
> +typedef unsigned short V2 __attribute__((vector_size (64)));
> +typedef unsigned int V3 __attribute__((vector_size (64)));
> +
> +V1
> +f1 (V1 x)
> +{
> +  return __builtin_shuffle (x, (V1) {}, (V1) { 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 });
> +}
> +
> +V2
> +f2 (V2 x)
> +{
> +  return __builtin_shuffle (x, (V2) {}, (V2) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
> +}
> +
> +V3
> +f3 (V3 x)
> +{
> +  return __builtin_shuffle (x, (V3) {}, (V3) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
> +}
> +
> +V1
> +f4 (V1 *x)
> +{
> +  return __builtin_shuffle (*x, (V1) {}, (V1) { 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 });
> +}
> +
> +V2
> +f5 (V2 *x)
> +{
> +  return __builtin_shuffle (*x, (V2) {}, (V2) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
> +}
> +
> +V3
> +f6 (V3 *x)
> +{
> +  return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
> +}
> +
> +V1
> +f7 (V1 x)
> +{
> +  return __builtin_shuffle ((V1) {}, x, (V1) { 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 });
> +}
> +
> +V2
> +f8 (V2 x)
> +{
> +  return __builtin_shuffle ((V2) {}, x, (V2) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
> +}
> +
> +V3
> +f9 (V3 x)
> +{
> +  return __builtin_shuffle ((V3) {}, x, (V3) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
> +}
> +
> +V1
> +f10 (V1 *x)
> +{
> +  return __builtin_shuffle ((V1) {}, *x, (V1) { 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 });
> +}
> +
> +V2
> +f11 (V2 *x)
> +{
> +  return __builtin_shuffle ((V2) {}, *x, (V2) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
> +}
> +
> +V3
> +f12 (V3 *x)
> +{
> +  return __builtin_shuffle ((V3) {}, *x, (V3) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
> +}
>
>         Jakub
>

