This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] palignr improvement (PR target/62128)


On Thu, Oct 02, 2014 at 09:50:45AM +0200, Jakub Jelinek wrote:
> On Thu, Oct 02, 2014 at 11:15:10AM +0400, Evgeny wrote:
> > Hold on. The patch has a conflict with previously approved here.
> 
> Which?
> 
> In any case, I've already committed the patch.
> 
> That said, if what you care about now is pr52252-atom.c with -O2
> -ftree-vectorize -mavx2, let's look at all the permutations it needs
> below.  The patch I've committed improved f3, from
> 3x vpshufb + 2x vpermq + 2x vpor into vperm2i128 $33 + vpalignr $11,
> and left the other permutations as is.

So, looking at that testcase, I see a very easy improvement, implemented below.
For palignr, we were only trying one order of its arguments, but the
order of those arguments is significant; if we try the other order too,
the f5 and f6 functions can now also be emitted as vperm2i128 + vpalignr.
It even handles the
22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5
V16QImode permutation using a single palignr insn; previously we only handled
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21.

Tested with
GCC_TEST_RUN_EXPENSIVE=1 make -k check-gcc \
RUNTESTFLAGS='--target_board=unix/-mavx2 dg-torture.exp=vshuf*.c'
on x86_64-linux.  OK for trunk if it passes bootstrap?

As for the previous testcase with distilled pr52252-atom.c permutations,
f1/f4 are now vpunpcklbw/vpunpckhbw/vperm2i128, f2 is 2x vpshufb/vpermq/vpor,
and f3/f5/f6 are vperm2i128/vpalignr.  Any suggestions on how to improve that?

2014-10-02  Jakub Jelinek  <jakub@redhat.com>

	PR target/62128
	* config/i386/i386.c (expand_vec_perm_palignr): If op1, op0 order
	of palignr arguments can't be used due to min 0 or max - min
	too high, try also op0, op1 order of palignr arguments.

	* gcc.dg/torture/vshuf-16.inc (TESTS): Add 2 new permutations.
	* gcc.dg/torture/vshuf-32.inc (TESTS): Add 5 new permutations.

--- gcc/config/i386/i386.c.jj	2014-10-02 09:18:50.000000000 +0200
+++ gcc/config/i386/i386.c	2014-10-02 11:17:29.792953321 +0200
@@ -43298,8 +43298,8 @@ static bool
 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
 {
   unsigned i, nelt = d->nelt;
-  unsigned min, max;
-  bool in_order, ok;
+  unsigned min, max, minswap, maxswap;
+  bool in_order, ok, swap = false;
   rtx shift, target;
   struct expand_vec_perm_d dcopy;
 
@@ -43309,20 +43309,40 @@ expand_vec_perm_palignr (struct expand_v
       && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
     return false;
 
-  min = 2 * nelt, max = 0;
+  min = 2 * nelt;
+  max = 0;
+  minswap = 2 * nelt;
+  maxswap = 0;
   for (i = 0; i < nelt; ++i)
     {
       unsigned e = d->perm[i];
+      unsigned eswap = d->perm[i] ^ nelt;
       if (GET_MODE_SIZE (d->vmode) == 32)
-	e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
+	{
+	  e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
+	  eswap = e ^ (nelt / 2);
+	}
       if (e < min)
 	min = e;
       if (e > max)
 	max = e;
+      if (eswap < minswap)
+	minswap = eswap;
+      if (eswap > maxswap)
+	maxswap = eswap;
     }
   if (min == 0
       || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
-    return false;
+    {
+      if (d->one_operand_p
+	  || minswap == 0
+	  || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
+				   ? nelt / 2 : nelt))
+	return false;
+      swap = true;
+      min = minswap;
+      max = maxswap;
+    }
 
   /* Given that we have SSSE3, we know we'll be able to implement the
      single operand permutation after the palignr with pshufb for
@@ -43332,6 +43352,13 @@ expand_vec_perm_palignr (struct expand_v
     return true;
 
   dcopy = *d;
+  if (swap)
+    {
+      dcopy.op0 = d->op1;
+      dcopy.op1 = d->op0;
+      for (i = 0; i < nelt; ++i)
+	dcopy.perm[i] ^= nelt;
+    }
 
   in_order = true;
   for (i = 0; i < nelt; ++i)
@@ -43365,14 +43392,16 @@ expand_vec_perm_palignr (struct expand_v
   if (GET_MODE_SIZE (d->vmode) == 16)
     {
       target = gen_reg_rtx (TImode);
-      emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
-				      gen_lowpart (TImode, d->op0), shift));
+      emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
+				      gen_lowpart (TImode, dcopy.op0), shift));
     }
   else
     {
       target = gen_reg_rtx (V2TImode);
-      emit_insn (gen_avx2_palignrv2ti (target, gen_lowpart (V2TImode, d->op1),
-				       gen_lowpart (V2TImode, d->op0), shift));
+      emit_insn (gen_avx2_palignrv2ti (target,
+				       gen_lowpart (V2TImode, dcopy.op1),
+				       gen_lowpart (V2TImode, dcopy.op0),
+				       shift));
     }
 
   dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
--- gcc/testsuite/gcc.dg/torture/vshuf-16.inc.jj	2012-03-20 08:51:25.000000000 +0100
+++ gcc/testsuite/gcc.dg/torture/vshuf-16.inc	2014-10-02 11:29:13.906821725 +0200
@@ -23,7 +23,9 @@ T (19,	15, 14, 13, 12, 11, 10, 9, 8, 7,
 T (20,	0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23) \
 T (21,	8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31) \
 T (22,	1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0) \
-T (23,	14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15)
+T (23,	14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15) \
+T (24,	22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5) \
+T (25,	6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21)
 #define EXPTESTS \
 T (116,	28, 13, 27, 11, 21, 1, 5, 22, 29, 14, 15, 6, 3, 10, 16, 30) \
 T (117,	22, 26, 1, 13, 29, 3, 18, 18, 11, 21, 12, 28, 19, 5, 7, 4) \
--- gcc/testsuite/gcc.dg/torture/vshuf-32.inc.jj	2012-03-20 08:51:25.000000000 +0100
+++ gcc/testsuite/gcc.dg/torture/vshuf-32.inc	2014-10-02 11:28:05.753097953 +0200
@@ -23,7 +23,12 @@ T (19,	31, 30, 29, 28, 27, 26, 25, 24, 2
 T (20,	0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47) \
 T (21,	16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63) \
 T (22,	1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0) \
-T (23,	30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31)
+T (23,	30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31) \
+T (24,	0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29) \
+T (25,	11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42) \
+T (26,	21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52) \
+T (27,	22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53) \
+T (28,	38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 0, 1, 2, 3, 4, 5, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 16, 17, 18, 19, 20, 21)
 #define EXPTESTS \
 T (116,	13, 38, 47, 3, 17, 8, 38, 20, 59, 61, 39, 26, 7, 49, 63, 43, 57, 16, 40, 19, 4, 32, 27, 7, 52, 19, 46, 55, 36, 41, 48, 6) \
 T (117,	39, 35, 59, 20, 56, 18, 58, 63, 57, 14, 2, 16, 5, 61, 35, 4, 53, 9, 52, 51, 27, 33, 61, 12, 3, 35, 36, 40, 37, 7, 45, 42) \


	Jakub


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]