[PATCH 2/2, x86] Add palignr support for AVX2.
Evgeny Stupachenko
evstupac@gmail.com
Tue Apr 29 17:23:00 GMT 2014
Thanks. The path is fixed according to your comments:
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 002d295..aa6372a 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -42807,6 +42807,79 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
return true;
}
+/* A subroutine of ix86_expand_vec_perm_1. Try to use just palignr
+ instruction for one operand permutation. This is better than pshufb
+ as does not require to pass big constant and faster on some x86
+ architectures. */
+
+static bool
+expand_vec_perm_palignr_one_operand (struct expand_vec_perm_d *d)
+{
+ unsigned i, nelt = d->nelt;
+ unsigned min;
+ rtx shift, shift1, target, tmp;
+
+ /* PALIGNR of 2 128-bits registers takes only 1 instrucion.
+ Requires SSSE3. */
+ if (GET_MODE_SIZE (d->vmode) == 16)
+ {
+ if (!TARGET_SSSE3)
+ return false;
+ }
+ /* PALIGNR of 2 256-bits registers on AVX2 costs only 2 instructions:
+ PERM and PALIGNR. It is more profitable than 2 PSHUFB and PERM. */
+ else if (GET_MODE_SIZE (d->vmode) == 32)
+ {
+ if (!TARGET_AVX2)
+ return false;
+ }
+ else
+ return false;
+
+ if (!d->one_operand_p)
+ return false;
+
+ /* For a rotaion permutation with one operand like: {5 6 7 0 1 2 3 4}
+ PALIGNR is better than PSHUFB. Check for a rotation in permutation. */
+ for (i = 0; i < nelt; ++i)
+ if ((((d->perm[(i + 1) & (nelt - 1)] - d->perm[i])) & (nelt - 1)) != 1)
+ return false;
+
+ min = d->perm[0];
+ shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
+ shift1 = GEN_INT ((min - nelt / 2) *
+ GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
+
+ if (GET_MODE_SIZE (d->vmode) == 16)
+ {
+ target = gen_reg_rtx (TImode);
+ emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
+ gen_lowpart (TImode, d->op0), shift));
+ }
+ else
+ {
+ target = gen_reg_rtx (V2TImode);
+ tmp = gen_reg_rtx (V4DImode);
+ emit_insn (gen_avx2_permv2ti (tmp,
+ gen_lowpart (V4DImode, d->op0),
+ gen_lowpart (V4DImode, d->op1),
+ GEN_INT (33)));
+ if (min < nelt / 2)
+ emit_insn (gen_avx2_palignrv2ti (target,
+ gen_lowpart (V2TImode, tmp),
+ gen_lowpart (V2TImode, d->op0),
+ shift));
+ else
+ emit_insn (gen_avx2_palignrv2ti (target,
+ gen_lowpart (V2TImode, d->op1),
+ gen_lowpart (V2TImode, tmp),
+ shift1));
+ }
+ emit_move_insn (d->target, gen_lowpart (d->vmode, target));
+
+ return true;
+}
+
static bool expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d);
/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
@@ -42943,6 +43016,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_vpermil (d))
return true;
+ /* Try palignr on one operand. */
+ if (expand_vec_perm_palignr_one_operand (d))
+ return true;
+
/* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
vpshufb, vpermd, vpermps or vpermq variable permutation. */
if (expand_vec_perm_pshufb (d))
On Tue, Apr 29, 2014 at 6:50 PM, Richard Henderson <rth@redhat.com> wrote:
> On 04/29/2014 06:50 AM, Evgeny Stupachenko wrote:
>> + if (d->one_operand_p != true)
>> + return false;
>
> This looks odd. Better as !d->one_operand_p.
>
>> +
>> + /* For an in order permutation with one operand like: {5 6 7 0 1 2 3 4}
>> + PALIGNR is better than PSHUFB. Check for an order in permutation. */
>
> FWIW, "in order permutation" sounds like a contradiction in terms.
> Better to describe this as a rotate.
>
>> + in_order_length = 0;
>> + in_order_length_max = 0;
>> + if (d->one_operand_p == true)
>
> You've just tested one_operand above.
>
>> + for (i = 0; i < 2 * nelt; ++i)
>
> Which means that 2*nelt is doing twice as much work as needed.
>
>> + {
>> + if ((d->perm[(i + 1) & (nelt - 1)] -
>> + d->perm[i & (nelt - 1)]) != 1)
>
> Surely we can avoid re-reading the comparison value...
>
> next = (d->perm[0] + 1) & (nelt - 1);
> for (i = 1; i < nelt; ++i)
> {
> if (d->perm[i] != next)
> return false;
> next = (next + 1) & (nelt - 1);
> }
>
>> + {
>> + if (in_order_length > in_order_length_max)
>> + in_order_length_max = in_order_length;
>> + in_order_length = 0;
>> + }
>> + else
>> + in_order_length++;
>> + }
>> +
>> + /* If not an ordered permutation then try something else. */
>> + if (in_order_length_max != nelt - 1)
>> + return false;
>
> I don't understand what this length and max stuff is trying to accomplish.
>
>> +
>> + min = d->perm[0];
>> +
>> + shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
>> + shift1 = GEN_INT ((min - nelt / 2) *
>> + GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
>> +
>> + if (GET_MODE_SIZE (d->vmode) != 32)
>
> Positive tests are almost always clearer: == 16.
>
>
> r~
More information about the Gcc-patches
mailing list