This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH 2/2, x86] Add palignr support for AVX2.
- From: Evgeny Stupachenko <evstupac at gmail dot com>
- To: GCC Patches <gcc-patches at gcc dot gnu dot org>, Richard Biener <rguenther at suse dot de>, Uros Bizjak <ubizjak at gmail dot com>
- Cc: "H.J. Lu" <hjl dot tools at gmail dot com>, Richard Henderson <rth at redhat dot com>
- Date: Tue, 29 Apr 2014 17:50:03 +0400
- Subject: [PATCH 2/2, x86] Add palignr support for AVX2.
- Authentication-results: sourceware.org; auth=none
Hi,
The patch adds use of the palignr instruction when we have a one-operand
permutation that is a rotation of the vector, like:
{5 6 7 0 1 2 3 4}
This is treated as {5 6 7 8 9 a b c} on 2 operands (the same vector used
twice), and is therefore a palignr with a shift of 5 elements.
Bootstrap and make check passed.
Is it ok?
Evgeny
2014-04-29 Evgeny Stupachenko <evstupac@gmail.com>
* config/i386/i386.c (expand_vec_perm_palignr_one_operand): New.
Enables PALIGNR on one operand permutation.
* config/i386/i386.c (expand_vec_perm_1): Try PALIGNR on one operand.
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 002d295..8950cf7 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -42807,6 +42807,97 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
return true;
}
+/* A subroutine of expand_vec_perm_1.  Try to implement a one-operand
+   permutation that is a rotation of the vector, such as
+   {5 6 7 0 1 2 3 4}, with PALIGNR.  This is better than PSHUFB because
+   it does not need a large constant control vector and is faster on
+   some x86 architectures.
+
+   Return true if the permutation described by D can be handled this
+   way; unless D is only being tested, the instructions are emitted and
+   the result is stored in D->target.  Return false to let the caller
+   try another strategy.  */
+
+static bool
+expand_vec_perm_palignr_one_operand (struct expand_vec_perm_d *d)
+{
+  unsigned i, nelt = d->nelt;
+  unsigned min, elt_bits;
+  unsigned in_order_length, in_order_length_max;
+  rtx target, tmp;
+
+  /* Only single-input permutations are handled here.  */
+  if (!d->one_operand_p)
+    return false;
+
+  /* PALIGNR of 2 128-bit registers takes only 1 instruction.
+     Requires SSSE3.  */
+  if (GET_MODE_SIZE (d->vmode) == 16)
+    {
+      if (!TARGET_SSSE3)
+        return false;
+    }
+  /* PALIGNR of 2 256-bit registers on AVX2 costs only 2 instructions:
+     PERM and PALIGNR.  It is more profitable than 2 PSHUFB and PERM.  */
+  else if (GET_MODE_SIZE (d->vmode) == 32)
+    {
+      if (!TARGET_AVX2)
+        return false;
+    }
+  else
+    return false;
+
+  /* Look for a rotation: walking the selector cyclically (twice around,
+     so that a run crossing the end of the array is seen whole), the
+     longest run of consecutive "next index is previous + 1" steps must
+     cover all but one step.  The index arithmetic relies on NELT being
+     a power of two.  */
+  in_order_length = 0;
+  in_order_length_max = 0;
+  for (i = 0; i < 2 * nelt; ++i)
+    {
+      if ((d->perm[(i + 1) & (nelt - 1)]
+           - d->perm[i & (nelt - 1)]) != 1)
+        {
+          if (in_order_length > in_order_length_max)
+            in_order_length_max = in_order_length;
+          in_order_length = 0;
+        }
+      else
+        in_order_length++;
+    }
+
+  /* If not an ordered permutation then try something else.  */
+  if (in_order_length_max != nelt - 1)
+    return false;
+
+  /* The permutation is known to be supported; do not emit any
+     instructions when the caller is only asking whether it is.  */
+  if (d->testing_p)
+    return true;
+
+  /* The first selected element is the rotation amount in elements;
+     the PALIGNR patterns take the shift count in bits.  */
+  min = d->perm[0];
+  elt_bits = GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode));
+
+  if (GET_MODE_SIZE (d->vmode) == 16)
+    {
+      target = gen_reg_rtx (TImode);
+      emit_insn (gen_ssse3_palignrti (target,
+                                      gen_lowpart (TImode, d->op1),
+                                      gen_lowpart (TImode, d->op0),
+                                      GEN_INT (min * elt_bits)));
+    }
+  else
+    {
+      target = gen_reg_rtx (V2TImode);
+      tmp = gen_reg_rtx (V4DImode);
+      /* 256-bit PALIGNR shifts within each 128-bit lane, so first build
+         a helper vector with the lanes crossed.  Selector 33 (0x21) is
+         expected to yield { high lane of op0, low lane of op1 } --
+         see the VPERM2I128 immediate encoding.  */
+      emit_insn (gen_avx2_permv2ti (tmp,
+                                    gen_lowpart (V4DImode, d->op0),
+                                    gen_lowpart (V4DImode, d->op1),
+                                    GEN_INT (33)));
+      if (min < nelt / 2)
+        /* Rotation starts in the low lane: shift TMP:OP0 by MIN.  */
+        emit_insn (gen_avx2_palignrv2ti (target,
+                                         gen_lowpart (V2TImode, tmp),
+                                         gen_lowpart (V2TImode, d->op0),
+                                         GEN_INT (min * elt_bits)));
+      else
+        /* Rotation starts in the high lane: shift OP1:TMP by the
+           offset of MIN within that lane.  */
+        emit_insn (gen_avx2_palignrv2ti (target,
+                                         gen_lowpart (V2TImode, d->op1),
+                                         gen_lowpart (V2TImode, tmp),
+                                         GEN_INT ((min - nelt / 2)
+                                                  * elt_bits)));
+    }
+  emit_move_insn (d->target, gen_lowpart (d->vmode, target));
+
+  return true;
+}
+
static bool expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d);
/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
@@ -42943,6 +43034,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_vpermil (d))
return true;
+ /* Try palignr on one operand. */
+ if (expand_vec_perm_palignr_one_operand (d))
+ return true;
+
/* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
vpshufb, vpermd, vpermps or vpermq variable permutation. */
if (expand_vec_perm_pshufb (d))