This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH] palignr improvement (PR target/62128)
- From: Jakub Jelinek <jakub at redhat dot com>
- To: Uros Bizjak <ubizjak at gmail dot com>, Evgeny <evstupac at gmail dot com>
- Cc: GCC Patches <gcc-patches at gcc dot gnu dot org>
- Date: Thu, 2 Oct 2014 11:48:27 +0200
- Subject: [PATCH] palignr improvement (PR target/62128)
- Authentication-results: sourceware.org; auth=none
- References: <uky3h7i9cc0xquh4vqdlf05c dot 1412234110405 at email dot android dot com> <20141002075045 dot GB1986 at tucnak dot redhat dot com>
- Reply-to: Jakub Jelinek <jakub at redhat dot com>
On Thu, Oct 02, 2014 at 09:50:45AM +0200, Jakub Jelinek wrote:
> On Thu, Oct 02, 2014 at 11:15:10AM +0400, Evgeny wrote:
> > Hold on. The patch has a conflict with previously approved here.
>
> Which?
>
> In any case, I've already committed the patch.
>
> That said, if what you care now about is pr52252-atom.c with -O2
> -ftree-vectorize -mavx2, let's look at all the permutations it needs
> below. The patch I've committed improved f3, from
> 3x vpshufb + 2x vpermq + 2x vpor into vperm2i128 $33 + vpalignr $11,
> left other permutations as is.
So, looking at that testcase, I see a very easy improvement, implemented below.
For palignr, we were only trying one order of its arguments, but the
order of those arguments is significant, and if we try the other order too,
the f5 and f6 functions can now also be emitted as vperm2i128 + vpalignr.
It even handles the
22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5
V16QImode permutation using a single palignr insn; previously we only handled
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21.
Tested with
GCC_TEST_RUN_EXPENSIVE=1 make -k check-gcc \
RUNTESTFLAGS='--target_board=unix/-mavx2 dg-torture.exp=vshuf*.c'
on x86_64-linux, ok for trunk if it passes bootstrap?
As for the previous testcase with distilled pr52252-atom.c permutations,
f1/f4 are now vpunpcklbw/vpunpckhbw/vperm2i128, f2 is 2x vpshufb/vpermq/vpor, and
f3/f5/f6 are vperm2i128/vpalignr. Any suggestions on how to improve that?
2014-10-02 Jakub Jelinek <jakub@redhat.com>
PR target/62128
* config/i386/i386.c (expand_vec_perm_palignr): If op1, op0 order
of palignr arguments can't be used due to min 0 or max - min
too high, try also op0, op1 order of palignr arguments.
* gcc.dg/torture/vshuf-16.inc (TESTS): Add 2 new permutations.
* gcc.dg/torture/vshuf-32.inc (TESTS): Add 5 new permutations.
--- gcc/config/i386/i386.c.jj 2014-10-02 09:18:50.000000000 +0200
+++ gcc/config/i386/i386.c 2014-10-02 11:17:29.792953321 +0200
@@ -43298,8 +43298,8 @@ static bool
expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
{
unsigned i, nelt = d->nelt;
- unsigned min, max;
- bool in_order, ok;
+ unsigned min, max, minswap, maxswap;
+ bool in_order, ok, swap = false;
rtx shift, target;
struct expand_vec_perm_d dcopy;
@@ -43309,20 +43309,40 @@ expand_vec_perm_palignr (struct expand_v
&& (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
return false;
- min = 2 * nelt, max = 0;
+ min = 2 * nelt;
+ max = 0;
+ minswap = 2 * nelt;
+ maxswap = 0;
for (i = 0; i < nelt; ++i)
{
unsigned e = d->perm[i];
+ unsigned eswap = d->perm[i] ^ nelt;
if (GET_MODE_SIZE (d->vmode) == 32)
- e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
+ {
+ e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
+ eswap = e ^ (nelt / 2);
+ }
if (e < min)
min = e;
if (e > max)
max = e;
+ if (eswap < minswap)
+ minswap = eswap;
+ if (eswap > maxswap)
+ maxswap = eswap;
}
if (min == 0
|| max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
- return false;
+ {
+ if (d->one_operand_p
+ || minswap == 0
+ || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
+ ? nelt / 2 : nelt))
+ return false;
+ swap = true;
+ min = minswap;
+ max = maxswap;
+ }
/* Given that we have SSSE3, we know we'll be able to implement the
single operand permutation after the palignr with pshufb for
@@ -43332,6 +43352,13 @@ expand_vec_perm_palignr (struct expand_v
return true;
dcopy = *d;
+ if (swap)
+ {
+ dcopy.op0 = d->op1;
+ dcopy.op1 = d->op0;
+ for (i = 0; i < nelt; ++i)
+ dcopy.perm[i] ^= nelt;
+ }
in_order = true;
for (i = 0; i < nelt; ++i)
@@ -43365,14 +43392,16 @@ expand_vec_perm_palignr (struct expand_v
if (GET_MODE_SIZE (d->vmode) == 16)
{
target = gen_reg_rtx (TImode);
- emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
- gen_lowpart (TImode, d->op0), shift));
+ emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
+ gen_lowpart (TImode, dcopy.op0), shift));
}
else
{
target = gen_reg_rtx (V2TImode);
- emit_insn (gen_avx2_palignrv2ti (target, gen_lowpart (V2TImode, d->op1),
- gen_lowpart (V2TImode, d->op0), shift));
+ emit_insn (gen_avx2_palignrv2ti (target,
+ gen_lowpart (V2TImode, dcopy.op1),
+ gen_lowpart (V2TImode, dcopy.op0),
+ shift));
}
dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
--- gcc/testsuite/gcc.dg/torture/vshuf-16.inc.jj 2012-03-20 08:51:25.000000000 +0100
+++ gcc/testsuite/gcc.dg/torture/vshuf-16.inc 2014-10-02 11:29:13.906821725 +0200
@@ -23,7 +23,9 @@ T (19, 15, 14, 13, 12, 11, 10, 9, 8, 7,
T (20, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23) \
T (21, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31) \
T (22, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0) \
-T (23, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15)
+T (23, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15) \
+T (24, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5) \
+T (25, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21)
#define EXPTESTS \
T (116, 28, 13, 27, 11, 21, 1, 5, 22, 29, 14, 15, 6, 3, 10, 16, 30) \
T (117, 22, 26, 1, 13, 29, 3, 18, 18, 11, 21, 12, 28, 19, 5, 7, 4) \
--- gcc/testsuite/gcc.dg/torture/vshuf-32.inc.jj 2012-03-20 08:51:25.000000000 +0100
+++ gcc/testsuite/gcc.dg/torture/vshuf-32.inc 2014-10-02 11:28:05.753097953 +0200
@@ -23,7 +23,12 @@ T (19, 31, 30, 29, 28, 27, 26, 25, 24, 2
T (20, 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47) \
T (21, 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63) \
T (22, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0) \
-T (23, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31)
+T (23, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31) \
+T (24, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29) \
+T (25, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42) \
+T (26, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52) \
+T (27, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53) \
+T (28, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 0, 1, 2, 3, 4, 5, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 16, 17, 18, 19, 20, 21)
#define EXPTESTS \
T (116, 13, 38, 47, 3, 17, 8, 38, 20, 59, 61, 39, 26, 7, 49, 63, 43, 57, 16, 40, 19, 4, 32, 27, 7, 52, 19, 46, 55, 36, 41, 48, 6) \
T (117, 39, 35, 59, 20, 56, 18, 58, 63, 57, 14, 2, 16, 5, 61, 35, 4, 53, 9, 52, 51, 27, 33, 61, 12, 3, 35, 36, 40, 37, 7, 45, 42) \
Jakub