[PATCH 1/2, x86] Add palignr support for AVX2.
Jakub Jelinek
jakub@redhat.com
Wed Oct 1 12:56:00 GMT 2014
On Wed, Oct 01, 2014 at 02:25:01PM +0200, Uros Bizjak wrote:
> OK.
And now the expand_vec_perm_palignr improvement, tested
with GCC_TEST_RUN_EXPENSIVE=1 make check-gcc \
RUNTESTFLAGS='--target_board=unix/-mavx2 dg-torture.exp=vshuf*.c'
For example, this testcase
/* V is a 32-byte GCC vector type whose elements are unsigned long long.  */
typedef unsigned long long V __attribute__ ((vector_size (32)));
/* Declared but not called in this reduced example; presumably carried over
   from the full vshuf-v4di.c test.  */
extern void abort (void);
/* a and b are the shuffle inputs; c and d receive the shuffle results.  */
V a, b, c, d;
/* Reduced testcase: applies the same constant selector to a one-operand
   and a two-operand __builtin_shuffle and stores the results in the
   globals c and d.  */
void test_14 (void)
{
/* Constant shuffle selector shared by both shuffles below.  */
V mask = { 6, 1, 3, 4 };
/* Unused in this reduced example.  */
int i;
/* One-operand shuffle: permutes the elements of a.  */
c = __builtin_shuffle (a, mask);
/* Two-operand shuffle: selects elements from a and b; this is the form
   whose generated code the patch changes.  */
d = __builtin_shuffle (a, b, mask);
}
(distilled from test 15 in vshuf-v4di.c) results in:
- vmovdqa a(%rip), %ymm0
- vpermq $54, %ymm0, %ymm1
- vpshufb .LC1(%rip), %ymm0, %ymm0
- vmovdqa %ymm1, c(%rip)
- vmovdqa b(%rip), %ymm1
- vpshufb .LC0(%rip), %ymm1, %ymm1
- vpermq $78, %ymm1, %ymm1
- vpor %ymm1, %ymm0, %ymm0
+ vmovdqa a(%rip), %ymm1
+ vpermq $54, %ymm1, %ymm0
+ vmovdqa %ymm0, c(%rip)
+ vmovdqa b(%rip), %ymm0
+ vpalignr $8, %ymm1, %ymm0, %ymm0
+ vpermq $99, %ymm0, %ymm0
vmovdqa %ymm0, d(%rip)
vzeroupper
ret
change in the generated code (and two fewer .rodata constants).
Ok for trunk?
2014-10-01 Jakub Jelinek <jakub@redhat.com>
* config/i386/i386.c (expand_vec_perm_palignr): Handle
256-bit vectors for TARGET_AVX2.
--- gcc/config/i386/i386.c.jj 2014-10-01 14:24:16.483138899 +0200
+++ gcc/config/i386/i386.c 2014-10-01 14:27:53.577222011 +0200
@@ -43297,44 +43297,75 @@ expand_vec_perm_palignr (struct expand_v
rtx shift, target;
struct expand_vec_perm_d dcopy;
- /* Even with AVX, palignr only operates on 128-bit vectors. */
- if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
+ /* Even with AVX, palignr only operates on 128-bit vectors,
+ in AVX2 palignr operates on both 128-bit lanes. */
+ if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
+ && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
return false;
- min = nelt, max = 0;
+ min = 2 * nelt, max = 0;
for (i = 0; i < nelt; ++i)
{
unsigned e = d->perm[i];
+ if (GET_MODE_SIZE (d->vmode) == 32)
+ e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
if (e < min)
min = e;
if (e > max)
max = e;
}
- if (min == 0 || max - min >= nelt)
+ if (min == 0
+ || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
return false;
/* Given that we have SSSE3, we know we'll be able to implement the
- single operand permutation after the palignr with pshufb. */
- if (d->testing_p)
+ single operand permutation after the palignr with pshufb for
+ 128-bit vectors. */
+ if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16)
return true;
dcopy = *d;
- shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
- target = gen_reg_rtx (TImode);
- emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
- gen_lowpart (TImode, d->op0), shift));
-
- dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
- dcopy.one_operand_p = true;
in_order = true;
for (i = 0; i < nelt; ++i)
{
- unsigned e = dcopy.perm[i] - min;
+ unsigned e = dcopy.perm[i];
+ if (GET_MODE_SIZE (d->vmode) == 32
+ && e >= nelt
+ && (e & (nelt / 2 - 1)) < min)
+ e = e - min - (nelt / 2);
+ else
+ e = e - min;
if (e != i)
in_order = false;
dcopy.perm[i] = e;
}
+ dcopy.one_operand_p = true;
+
+ /* For AVX2, test whether we can permute the result in one instruction. */
+ if (d->testing_p)
+ {
+ if (in_order)
+ return true;
+ dcopy.op1 = dcopy.op0;
+ return expand_vec_perm_1 (&dcopy);
+ }
+
+ shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
+ if (GET_MODE_SIZE (d->vmode) == 16)
+ {
+ target = gen_reg_rtx (TImode);
+ emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
+ gen_lowpart (TImode, d->op0), shift));
+ }
+ else
+ {
+ target = gen_reg_rtx (V2TImode);
+ emit_insn (gen_avx2_palignrv2ti (target, gen_lowpart (V2TImode, d->op1),
+ gen_lowpart (V2TImode, d->op0), shift));
+ }
+
+ dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
/* Test for the degenerate case where the alignment by itself
produces the desired permutation. */
@@ -43345,7 +43376,7 @@ expand_vec_perm_palignr (struct expand_v
}
ok = expand_vec_perm_1 (&dcopy);
- gcc_assert (ok);
+ gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
return ok;
}
Jakub
More information about the Gcc-patches
mailing list