This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH 1/2, x86] Add palignr support for AVX2.
- From: Uros Bizjak <ubizjak at gmail dot com>
- To: Jakub Jelinek <jakub at redhat dot com>
- Cc: Evgeny Stupachenko <evstupac at gmail dot com>, "H.J. Lu" <hjl dot tools at gmail dot com>, Richard Henderson <rth at redhat dot com>, GCC Patches <gcc-patches at gcc dot gnu dot org>, Richard Biener <rguenther at suse dot de>
- Date: Wed, 1 Oct 2014 15:09:59 +0200
- Subject: Re: [PATCH 1/2, x86] Add palignr support for AVX2.
- Authentication-results: sourceware.org; auth=none
- References: <CAOvf_xz4W7dn3F-VnWowSG211s8WcU2Qo_8+c1rcNAYwh-k7+g at mail dot gmail dot com> <CAMe9rOoaQ90P9wb4m5ch5W-bPh5-1xvmCMQnd9Sc9meoJ0unNQ at mail dot gmail dot com> <CAOvf_xxiLsTCZSEHJ8DLdD7kRHRTHHSjZXWyNPu3H-6xnSfCsA at mail dot gmail dot com> <CAOvf_xyNC1mRGNrM1kU_nNz_tO6_M4T8wox75D+zndhY5=TVAQ at mail dot gmail dot com> <CAFULd4bfOLW2kOmSndwK=LdNbUwHR1Ogds+5_AZ7j=tH=zu12w at mail dot gmail dot com> <20141001103514 dot GO1986 at tucnak dot redhat dot com> <20141001113815 dot GQ1986 at tucnak dot redhat dot com> <CAFULd4b_T0XByAhGew-wL6D-udF6oPwuw=v6NPYdupAn9JtzXA at mail dot gmail dot com> <20141001121715 dot GR1986 at tucnak dot redhat dot com> <CAFULd4Yo0VVJ_Z6dkc1VMpFmO6BNkbaddieXgOX7uTQTrUL11A at mail dot gmail dot com> <20141001125618 dot GT1986 at tucnak dot redhat dot com>
On Wed, Oct 1, 2014 at 2:56 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> And now the expand_vec_perm_palignr improvement, tested
> with GCC_TEST_RUN_EXPENSIVE=1 make check-gcc \
> RUNTESTFLAGS='--target_board=unix/-mavx2 dg-torture.exp=vshuf*.c'
> E.g.
> typedef unsigned long long V __attribute__ ((vector_size (32)));
> extern void abort (void);
> V a, b, c, d;
> void test_14 (void)
> {
> V mask = { 6, 1, 3, 4 };
> int i;
> c = __builtin_shuffle (a, mask);
> d = __builtin_shuffle (a, b, mask);
> }
> (distilled from test 15 in vshuf-v4di.c) results in:
> - vmovdqa a(%rip), %ymm0
> - vpermq $54, %ymm0, %ymm1
> - vpshufb .LC1(%rip), %ymm0, %ymm0
> - vmovdqa %ymm1, c(%rip)
> - vmovdqa b(%rip), %ymm1
> - vpshufb .LC0(%rip), %ymm1, %ymm1
> - vpermq $78, %ymm1, %ymm1
> - vpor %ymm1, %ymm0, %ymm0
> + vmovdqa a(%rip), %ymm1
> + vpermq $54, %ymm1, %ymm0
> + vmovdqa %ymm0, c(%rip)
> + vmovdqa b(%rip), %ymm0
> + vpalignr $8, %ymm1, %ymm0, %ymm0
> + vpermq $99, %ymm0, %ymm0
> vmovdqa %ymm0, d(%rip)
> vzeroupper
> ret
> change (and two fewer .rodata constants).
>
> Ok for trunk?
>
> 2014-10-01 Jakub Jelinek <jakub@redhat.com>
>
> * config/i386/i386.c (expand_vec_perm_palignr): Handle
> 256-bit vectors for TARGET_AVX2.
Please mention PR 62128 and include the testcase from the PR. Also,
please add a version of gcc.target/i386/pr52252-atom.c, compiled with
-mavx2 (perhaps named pr52252-avx2.c).
OK with a small adjustment below.
Thanks,
Uros.
> --- gcc/config/i386/i386.c.jj 2014-10-01 14:24:16.483138899 +0200
> +++ gcc/config/i386/i386.c 2014-10-01 14:27:53.577222011 +0200
> @@ -43297,44 +43297,75 @@ expand_vec_perm_palignr (struct expand_v
> rtx shift, target;
> struct expand_vec_perm_d dcopy;
>
> - /* Even with AVX, palignr only operates on 128-bit vectors. */
> - if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
> + /* Even with AVX, palignr only operates on 128-bit vectors,
> + in AVX2 palignr operates on both 128-bit lanes. */
> + if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
> + && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
Please simplify the above condition ...
> return false;
>
> - min = nelt, max = 0;
> + min = 2 * nelt, max = 0;
> for (i = 0; i < nelt; ++i)
> {
> unsigned e = d->perm[i];
> + if (GET_MODE_SIZE (d->vmode) == 32)
> + e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
> if (e < min)
> min = e;
> if (e > max)
> max = e;
> }
> - if (min == 0 || max - min >= nelt)
> + if (min == 0
> + || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
> return false;
>
> /* Given that we have SSSE3, we know we'll be able to implement the
> - single operand permutation after the palignr with pshufb. */
> - if (d->testing_p)
> + single operand permutation after the palignr with pshufb for
> + 128-bit vectors. */
> + if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16)
> return true;
>
> dcopy = *d;
> - shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
> - target = gen_reg_rtx (TImode);
> - emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
> - gen_lowpart (TImode, d->op0), shift));
> -
> - dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
> - dcopy.one_operand_p = true;
>
> in_order = true;
> for (i = 0; i < nelt; ++i)
> {
> - unsigned e = dcopy.perm[i] - min;
> + unsigned e = dcopy.perm[i];
> + if (GET_MODE_SIZE (d->vmode) == 32
> + && e >= nelt
> + && (e & (nelt / 2 - 1)) < min)
> + e = e - min - (nelt / 2);
> + else
> + e = e - min;
> if (e != i)
> in_order = false;
> dcopy.perm[i] = e;
> }
> + dcopy.one_operand_p = true;
> +
> + /* For AVX2, test whether we can permute the result in one instruction. */
> + if (d->testing_p)
> + {
> + if (in_order)
> + return true;
> + dcopy.op1 = dcopy.op0;
> + return expand_vec_perm_1 (&dcopy);
> + }
> +
> + shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
> + if (GET_MODE_SIZE (d->vmode) == 16)
> + {
> + target = gen_reg_rtx (TImode);
> + emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
> + gen_lowpart (TImode, d->op0), shift));
> + }
> + else
> + {
> + target = gen_reg_rtx (V2TImode);
> + emit_insn (gen_avx2_palignrv2ti (target, gen_lowpart (V2TImode, d->op1),
> + gen_lowpart (V2TImode, d->op0), shift));
> + }
> +
> + dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
>
> /* Test for the degenerate case where the alignment by itself
> produces the desired permutation. */
> @@ -43345,7 +43376,7 @@ expand_vec_perm_palignr (struct expand_v
> }
>
> ok = expand_vec_perm_1 (&dcopy);
> - gcc_assert (ok);
> + gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
>
> return ok;
> }
>
>
> Jakub