This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH] Improve -mavx -mno-avx2 32-byte vector permutations (PR target/91560)
- From: Uros Bizjak <ubizjak at gmail dot com>
- To: Jakub Jelinek <jakub at redhat dot com>
- Cc: "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
- Date: Thu, 29 Aug 2019 11:13:34 +0200
- Subject: Re: [PATCH] Improve -mavx -mno-avx2 32-byte vector permutations (PR target/91560)
- References: <20190829084146.GG20160@tucnak>
On Thu, Aug 29, 2019 at 10:41 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> The following patch improves permutations, especially V8SFmode ones, for
> the AVX (non-AVX2) ISA, where we punted way too often, even when we could
> handle them.
> On the
> typedef float __v8sf __attribute__((vector_size (32)));
> typedef double __v4df __attribute__((vector_size (32)));
> typedef int __v8si __attribute__((vector_size (32)));
> typedef long long __v4di __attribute__((vector_size (32)));
> #ifdef __clang__
> #define S(x, y, t, ...) __builtin_shufflevector (x, y, __VA_ARGS__)
> #else
> #define S(x, y, t, ...) __builtin_shuffle (x, y, (t) { __VA_ARGS__ })
> #endif
>
> __v8sf f1 (__v8sf x, __v8sf y) { return S (x, y, __v8si, 0, 8, 9, 10, 11, 12, 13, 14 ); }
> __v8sf f2 (__v8sf x, __v8sf y) { return S (x, y, __v8si, 0, 1, 8, 9, 10, 11, 12, 13 ); }
> testcase we used to emit terrible code (8 BIT_FIELD_REFs + composition
> back), while LLVM emits:
> vpermilps $144, %xmm1, %xmm2 # xmm2 = xmm1[0,0,1,2]
> vextractf128 $1, %ymm1, %xmm3
> vblendps $8, %xmm1, %xmm3, %xmm1 # xmm1 = xmm3[0,1,2],xmm1[3]
> vpermilps $147, %xmm1, %xmm1 # xmm1 = xmm1[3,0,1,2]
> vinsertf128 $1, %xmm1, %ymm2, %ymm1
> vblendps $1, %ymm0, %ymm1, %ymm0 # ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
> and
> vextractf128 $1, %ymm1, %xmm2
> vshufpd $1, %xmm2, %xmm1, %xmm2 # xmm2 = xmm1[1],xmm2[0]
> vmovddup %xmm1, %xmm1 # xmm1 = xmm1[0,0]
> vinsertf128 $1, %xmm2, %ymm1, %ymm1
> vblendps $3, %ymm0, %ymm1, %ymm0 # ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
> With the patch we emit:
> vpermilps $144, %ymm1, %ymm2
> vpermilps .LC0(%rip), %ymm1, %ymm1
> vblendps $238, %ymm2, %ymm0, %ymm0
> vperm2f128 $1, %ymm1, %ymm1, %ymm1
> vblendps $16, %ymm1, %ymm0, %ymm0
> and
> vshufps $68, %ymm1, %ymm0, %ymm0
> vpermilps .LC1(%rip), %ymm1, %ymm1
> vperm2f128 $1, %ymm1, %ymm1, %ymm1
> vblendps $48, %ymm1, %ymm0, %ymm0
> so one insn each shorter than what LLVM emits.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2019-08-29 Jakub Jelinek <jakub@redhat.com>
>
> PR target/91560
> * config/i386/i386-expand.c (expand_vec_perm_movs,
> expand_vec_perm_blend, expand_vec_perm_vpermil,
> expand_vec_perm_pshufb, expand_vec_perm_1,
> expand_vec_perm_pshuflw_pshufhw, expand_vec_perm_palignr,
> expand_vec_perm_interleave2, expand_vec_perm_vpermq_perm_1,
> expand_vec_perm_vperm2f128, expand_vec_perm_interleave3,
> expand_vec_perm_vperm2f128_vblend, expand_vec_perm_2vperm2f128_vshuf,
> expand_vec_perm_even_odd, expand_vec_perm_broadcast): Adjust function
> comments - replace ix86_expand_vec_perm_builtin_1 with
> ix86_expand_vec_perm_const_1.
> (expand_vec_perm2_vperm2f128_vblend): New function.
> (ix86_expand_vec_perm_const_1): New forward declaration. Call
> expand_vec_perm2_vperm2f128_vblend as last resort.
> (canonicalize_perm): Formatting fix.
>
> * gcc.dg/torture/vshuf-8.inc: Add two further permutations.
LGTM, but this is actually your area ;)
Thanks,
Uros.
> --- gcc/config/i386/i386-expand.c.jj 2019-08-27 12:26:25.383089132 +0200
> +++ gcc/config/i386/i386-expand.c 2019-08-28 15:22:43.911004586 +0200
> @@ -16372,7 +16372,7 @@ expand_vselect_vconcat (rtx target, rtx
> return ok;
> }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
> +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
> using movss or movsd. */
> static bool
> expand_vec_perm_movs (struct expand_vec_perm_d *d)
> @@ -16408,7 +16408,7 @@ expand_vec_perm_movs (struct expand_vec_
> return true;
> }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
> +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
> in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
>
> static bool
> @@ -16633,7 +16633,7 @@ expand_vec_perm_blend (struct expand_vec
> return true;
> }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
> +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
> in terms of the variable form of vpermilps.
>
> Note that we will have already failed the immediate input vpermilps,
> @@ -16709,7 +16709,7 @@ valid_perm_using_mode_p (machine_mode vm
> return true;
> }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
> +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
> in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
>
> static bool
> @@ -17026,7 +17026,7 @@ ix86_expand_vec_one_operand_perm_avx512
>
> static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
> +/* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
> in a single instruction. */
>
> static bool
> @@ -17216,7 +17216,7 @@ expand_vec_perm_1 (struct expand_vec_per
> return false;
> }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
> +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
> in terms of a pair of pshuflw + pshufhw instructions. */
>
> static bool
> @@ -17257,7 +17257,7 @@ expand_vec_perm_pshuflw_pshufhw (struct
> return true;
> }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
> +/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
> the permutation using the SSSE3 palignr instruction. This succeeds
> when all of the elements in PERM fit within one vector and we merely
> need to shift them down so that a single vector permutation has a
> @@ -17474,7 +17474,7 @@ expand_vec_perm_pblendv (struct expand_v
>
> static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
> +/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
> a two vector permutation into a single vector permutation by using
> an interleave operation to merge the vectors. */
>
> @@ -17752,7 +17752,7 @@ expand_vec_perm_interleave2 (struct expa
> return true;
> }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
> +/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
> a single vector cross-lane permutation into vpermq followed
> by any of the single insn permutations. */
>
> @@ -17833,7 +17833,7 @@ expand_vec_perm_vpermq_perm_1 (struct ex
>
> static bool canonicalize_perm (struct expand_vec_perm_d *d);
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
> +/* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
> a vector permutation using two instructions, vperm2f128 resp.
> vperm2i128 followed by any single in-lane permutation. */
>
> @@ -17950,7 +17950,7 @@ expand_vec_perm_vperm2f128 (struct expan
> return false;
> }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
> +/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
> a two vector permutation using 2 intra-lane interleave insns
> and cross-lane shuffle for 32-byte vectors. */
>
> @@ -18026,7 +18026,7 @@ expand_vec_perm_interleave3 (struct expa
> return true;
> }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
> +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
> a single vector permutation using a single intra-lane vector
> permutation, vperm2f128 swapping the lanes and vblend* insn blending
> the non-swapped and swapped vectors together. */
> @@ -18094,7 +18094,7 @@ expand_vec_perm_vperm2f128_vblend (struc
> return true;
> }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
> +/* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
> permutation using two vperm2f128, followed by a vshufpd insn blending
> the two vectors together. */
>
> @@ -18145,6 +18145,106 @@ expand_vec_perm_2vperm2f128_vshuf (struc
> return true;
> }
>
> +static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
> +
> +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
> + a two vector permutation using two intra-lane vector
> + permutations, vperm2f128 swapping the lanes and vblend* insn blending
> + the non-swapped and swapped vectors together. */
> +
> +static bool
> +expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
> +{
> + struct expand_vec_perm_d dfirst, dsecond, dthird;
> + unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
> + rtx_insn *seq1, *seq2;
> + bool ok;
> + rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
> +
> + if (!TARGET_AVX
> + || TARGET_AVX2
> + || (d->vmode != V8SFmode && d->vmode != V4DFmode)
> + || d->one_operand_p)
> + return false;
> +
> + dfirst = *d;
> + dsecond = *d;
> + for (i = 0; i < nelt; i++)
> + {
> + dfirst.perm[i] = 0xff;
> + dsecond.perm[i] = 0xff;
> + }
> + for (i = 0, msk = 0; i < nelt; i++)
> + {
> + j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
> + if (j == i)
> + {
> + dfirst.perm[j] = d->perm[i];
> + which1 |= (d->perm[i] < nelt ? 1 : 2);
> + }
> + else
> + {
> + dsecond.perm[j] = d->perm[i];
> + which2 |= (d->perm[i] < nelt ? 1 : 2);
> + msk |= (1U << i);
> + }
> + }
> + if (msk == 0 || msk == (1U << nelt) - 1)
> + return false;
> +
> + if (!d->testing_p)
> + {
> + dfirst.target = gen_reg_rtx (dfirst.vmode);
> + dsecond.target = gen_reg_rtx (dsecond.vmode);
> + }
> +
> + for (i = 0; i < nelt; i++)
> + {
> + if (dfirst.perm[i] == 0xff)
> + dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
> + if (dsecond.perm[i] == 0xff)
> + dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
> + }
> + canonicalize_perm (&dfirst);
> + start_sequence ();
> + ok = ix86_expand_vec_perm_const_1 (&dfirst);
> + seq1 = get_insns ();
> + end_sequence ();
> +
> + if (!ok)
> + return false;
> +
> + canonicalize_perm (&dsecond);
> + start_sequence ();
> + ok = ix86_expand_vec_perm_const_1 (&dsecond);
> + seq2 = get_insns ();
> + end_sequence ();
> +
> + if (!ok)
> + return false;
> +
> + if (d->testing_p)
> + return true;
> +
> + emit_insn (seq1);
> + emit_insn (seq2);
> +
> + dthird = *d;
> + dthird.op0 = dsecond.target;
> + dthird.op1 = dsecond.target;
> + dthird.one_operand_p = true;
> + dthird.target = gen_reg_rtx (dthird.vmode);
> + for (i = 0; i < nelt; i++)
> + dthird.perm[i] = i ^ nelt2;
> +
> + ok = expand_vec_perm_1 (&dthird);
> + gcc_assert (ok);
> +
> + blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
> + emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
> + return true;
> +}
> +
> /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
> permutation with two pshufb insns and an ior. We should have already
> failed all two instruction sequences. */
> @@ -18534,7 +18634,7 @@ expand_vec_perm_even_odd_trunc (struct e
> return true;
> }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
> +/* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
> and extract-odd permutations. */
>
> static bool
> @@ -18743,7 +18843,7 @@ expand_vec_perm_even_odd_1 (struct expan
> return true;
> }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
> +/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
> extract-even and extract-odd permutations. */
>
> static bool
> @@ -18762,7 +18862,7 @@ expand_vec_perm_even_odd (struct expand_
> return expand_vec_perm_even_odd_1 (d, odd);
> }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
> +/* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
> permutations. We assume that expand_vec_perm_1 has already failed. */
>
> static bool
> @@ -18841,7 +18941,7 @@ expand_vec_perm_broadcast_1 (struct expa
> }
> }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
> +/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
> broadcast permutations. */
>
> static bool
> @@ -19137,6 +19237,10 @@ ix86_expand_vec_perm_const_1 (struct exp
> return true;
> }
>
> + /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
> + if (expand_vec_perm2_vperm2f128_vblend (d))
> + return true;
> +
> return false;
> }
>
> @@ -19149,7 +19253,7 @@ canonicalize_perm (struct expand_vec_per
> int i, which, nelt = d->nelt;
>
> for (i = which = 0; i < nelt; ++i)
> - which |= (d->perm[i] < nelt ? 1 : 2);
> + which |= (d->perm[i] < nelt ? 1 : 2);
>
> d->one_operand_p = true;
> switch (which)
> --- gcc/testsuite/gcc.dg/torture/vshuf-8.inc.jj 2015-12-04 09:24:31.234396066 +0100
> +++ gcc/testsuite/gcc.dg/torture/vshuf-8.inc 2019-08-28 15:11:35.778754247 +0200
> @@ -25,7 +25,9 @@ T (21, 4, 12, 5, 13, 6, 14, 7, 15) \
> T (22, 1, 2, 3, 4, 5, 6, 7, 0) \
> T (23, 6, 5, 4, 3, 2, 1, 0, 7) \
> T (24, 0, 1, 2, 3, 8, 9, 10, 11) \
> -T (25, 0, 1, 2, 3, 12, 13, 14, 15)
> +T (25, 0, 1, 2, 3, 12, 13, 14, 15) \
> +T (26, 0, 1, 8, 9, 10, 11, 12, 13) \
> +T (27, 0, 8, 9, 10, 11, 12, 13, 14)
> #define EXPTESTS \
> T (116, 9, 3, 9, 4, 7, 0, 0, 6) \
> T (117, 4, 14, 12, 8, 9, 6, 0, 10) \
>
> Jakub