[PATCH x86, PR60451] Expand even/odd permutation using pack insn.
Evgeny Stupachenko
evstupac@gmail.com
Thu Nov 20 16:28:00 GMT 2014
Bootstrap and make check passed with the updated patch.
Is it still OK?
It looks like we no longer need "expand_vec_perm_vpshufb2_vpermq_even_odd"
with this patch.  However, the clean-up will be done in a separate patch
after appropriate testing.
Modified ChangeLog:

2014-11-20  Evgeny Stupachenko  <evstupac@gmail.com>

gcc/testsuite/
	PR target/60451
	* gcc.target/i386/pr60451.c: New test.

gcc/
	PR target/60451
	* config/i386/i386.c (expand_vec_perm_even_odd_pack): New.
	(expand_vec_perm_even_odd_1): Use the new expansion for V8HImode;
	replace the expansions for V16QImode, V16HImode and V32QImode
	with it.
	(ix86_expand_vec_perm_const_1): Try the new expansion.
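
For context, here is a minimal SSE2 intrinsics sketch (my illustration, not
part of the patch) of the even/odd extraction idea the new expander uses for
V16QImode: even bytes are isolated with an "and" mask and packed, odd bytes
are shifted down and packed.  The patch emits the equivalent RTL
(andv8hi3/lshrv8hi3 followed by sse2_packuswb) directly, so the intrinsic
spelling below is illustrative only.

#include <emmintrin.h>  /* SSE2 */

/* Even bytes of the 32-byte concatenation {a, b}: mask each 16-bit lane
   down to its low byte, then pack with unsigned saturation (the values
   are already in 0..255, so the pack is exact).  */
static __m128i
extract_even_bytes (__m128i a, __m128i b)
{
  const __m128i mask = _mm_set1_epi16 (0x00ff);
  return _mm_packus_epi16 (_mm_and_si128 (a, mask),
                           _mm_and_si128 (b, mask));
}

/* Odd bytes: shift each 16-bit lane right by 8, then pack.  */
static __m128i
extract_odd_bytes (__m128i a, __m128i b)
{
  return _mm_packus_epi16 (_mm_srli_epi16 (a, 8),
                           _mm_srli_epi16 (b, 8));
}

For the 256-bit V16HImode/V32QImode cases the pack instructions operate
within 128-bit lanes, which is why the patch follows the pack with a vpermq
(the gen_avx2_permv4di_1 call) to restore element order.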
On Thu, Nov 20, 2014 at 6:03 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> Good point!  "gen_shift" also requires only SSE2.
> That way we can optimize out the interleave sequence for V16QImode in
> expand_vec_perm_even_odd_1.
> Thanks!
>
> Evgeny
>
> Updated patch:
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 085eb54..054089b 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -48322,6 +48322,127 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
> return true;
> }
>
> +/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
> + and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
> + with two "and" and "pack" or two "shift" and "pack" insns.  We should
> + have already failed all two-instruction sequences.  */
> +
> +static bool
> +expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
> +{
> + rtx op, dop0, dop1, t, rperm[16];
> + unsigned i, odd, c, s, nelt = d->nelt;
> + bool end_perm = false;
> + machine_mode half_mode;
> + rtx (*gen_and) (rtx, rtx, rtx);
> + rtx (*gen_pack) (rtx, rtx, rtx);
> + rtx (*gen_shift) (rtx, rtx, rtx);
> +
> + if (d->one_operand_p)
> + return false;
> +
> + switch (d->vmode)
> + {
> + case V8HImode:
> + /* Required for "pack". */
> + if (!TARGET_SSE4_1)
> + return false;
> + c = 0xffff;
> + s = 16;
> + half_mode = V4SImode;
> + gen_and = gen_andv4si3;
> + gen_pack = gen_sse4_1_packusdw;
> + gen_shift = gen_lshrv4si3;
> + break;
> + case V16QImode:
> + /* No check as all instructions are SSE2. */
> + c = 0xff;
> + s = 8;
> + half_mode = V8HImode;
> + gen_and = gen_andv8hi3;
> + gen_pack = gen_sse2_packuswb;
> + gen_shift = gen_lshrv8hi3;
> + break;
> + case V16HImode:
> + if (!TARGET_AVX2)
> + return false;
> + c = 0xffff;
> + s = 16;
> + half_mode = V8SImode;
> + gen_and = gen_andv8si3;
> + gen_pack = gen_avx2_packusdw;
> + gen_shift = gen_lshrv8si3;
> + end_perm = true;
> + break;
> + case V32QImode:
> + if (!TARGET_AVX2)
> + return false;
> + c = 0xff;
> + s = 8;
> + half_mode = V16HImode;
> + gen_and = gen_andv16hi3;
> + gen_pack = gen_avx2_packuswb;
> + gen_shift = gen_lshrv16hi3;
> + end_perm = true;
> + break;
> + default:
> + /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
> + general shuffles. */
> + return false;
> + }
> +
> + /* Check that permutation is even or odd. */
> + odd = d->perm[0];
> + if (odd > 1)
> + return false;
> +
> + for (i = 1; i < nelt; ++i)
> + if (d->perm[i] != 2 * i + odd)
> + return false;
> +
> + if (d->testing_p)
> + return true;
> +
> + dop0 = gen_reg_rtx (half_mode);
> + dop1 = gen_reg_rtx (half_mode);
> + if (odd == 0)
> + {
> + for (i = 0; i < nelt / 2; i++)
> + rperm[i] = GEN_INT (c);
> + t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
> + t = force_reg (half_mode, t);
> + emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
> + emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
> + }
> + else
> + {
> + emit_insn (gen_shift (dop0,
> + gen_lowpart (half_mode, d->op0),
> + GEN_INT (s)));
> + emit_insn (gen_shift (dop1,
> + gen_lowpart (half_mode, d->op1),
> + GEN_INT (s)));
> + }
> + /* In the AVX2 256-bit case we need to permute the pack result.  */
> + if (TARGET_AVX2 && end_perm)
> + {
> + op = gen_reg_rtx (d->vmode);
> + t = gen_reg_rtx (V4DImode);
> + emit_insn (gen_pack (op, dop0, dop1));
> + emit_insn (gen_avx2_permv4di_1 (t,
> + gen_lowpart (V4DImode, op),
> + const0_rtx,
> + const2_rtx,
> + const1_rtx,
> + GEN_INT (3)));
> + emit_move_insn (d->target, gen_lowpart (d->vmode, t));
> + }
> + else
> + emit_insn (gen_pack (d->target, dop0, dop1));
> +
> + return true;
> +}
> +
> /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
> and extract-odd permutations. */
>
> @@ -48393,7 +48514,9 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
> gcc_unreachable ();
>
> case V8HImode:
> - if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
> + if (TARGET_SSE4_1)
> + return expand_vec_perm_even_odd_pack (d);
> + else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
> return expand_vec_perm_pshufb2 (d);
> else
> {
> @@ -48416,32 +48539,11 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
> break;
>
> case V16QImode:
> - if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
> - return expand_vec_perm_pshufb2 (d);
> - else
> - {
> - if (d->testing_p)
> - break;
> - t1 = gen_reg_rtx (V16QImode);
> - t2 = gen_reg_rtx (V16QImode);
> - t3 = gen_reg_rtx (V16QImode);
> - emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
> - emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
> - emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
> - emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
> - emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
> - emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
> - if (odd)
> - t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
> - else
> - t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
> - emit_insn (t3);
> - }
> - break;
> + return expand_vec_perm_even_odd_pack (d);
>
> case V16HImode:
> case V32QImode:
> - return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
> + return expand_vec_perm_even_odd_pack (d);
>
> case V4DImode:
> if (!TARGET_AVX2)
> @@ -48814,6 +48916,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>
> /* Try sequences of three instructions. */
>
> + if (expand_vec_perm_even_odd_pack (d))
> + return true;
> +
> if (expand_vec_perm_2vperm2f128_vshuf (d))
> return true;
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr60451.c b/gcc/testsuite/gcc.target/i386/pr60451.c
> new file mode 100644
> index 0000000..c600f4a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr60451.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target sse2 } */
> +/* { dg-options "-O2 -ftree-vectorize -msse2" } */
> +
> +void
> +foo (unsigned char *a, unsigned char *b, unsigned char *c, int size)
> +{
> + int i;
> +
> + for (i = 0; i < size; i++)
> + a[i] = (unsigned char) ((unsigned int)1 + b[i] * c[i] * 117);
> +}
> +
> +/* { dg-final { scan-assembler "packuswb|vpunpck" } } */
>
> On Thu, Nov 20, 2014 at 5:30 PM, Richard Henderson <rth@redhat.com> wrote:
>> On 11/20/2014 12:36 PM, Evgeny Stupachenko wrote:
>>> + /* Required for "pack". */
>>> + if (!TARGET_SSE4_2 || d->one_operand_p)
>>> + return false;
>>
>> Why the SSE4_2 check here when...
>>
>>> +
>>> + /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than general
>>> + shuffles. */
>>> + if (d->vmode == V8HImode)
>>> + {
>>> + c = 0xffff;
>>> + s = 16;
>>> + half_mode = V4SImode;
>>> + gen_and = gen_andv4si3;
>>> + gen_pack = gen_sse4_1_packusdw;
>>
>> ... it's SSE4_1 here,
>>
>>> + gen_shift = gen_lshrv4si3;
>>> + }
>>> + else if (d->vmode == V16QImode)
>>> + {
>>> + c = 0xff;
>>> + s = 8;
>>> + half_mode = V8HImode;
>>> + gen_and = gen_andv8hi3;
>>> + gen_pack = gen_sse2_packuswb;
>>
>> ... and SSE2 here?
>>
>>
>>
>> r~