This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH 1/2, x86] Add palignr support for AVX2.


Is the following patch ok? It passes bootstrap and make check.

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 88142a8..91f6f21 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -42807,6 +42807,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
   return true;
 }

+static bool expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d);
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to instantiate D
    in a single instruction.  */

@@ -42946,6 +42948,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_pshufb (d))
     return true;

+  /* Try the AVX2 vpshufb.  */
+  if (expand_vec_perm_vpshufb2_vpermq (d))
+    return true;
+
   /* Try the AVX512F vpermi2 instructions.  */
   rtx vec[64];
   enum machine_mode mode = d->vmode;
@@ -43004,7 +43010,7 @@ expand_vec_perm_pshuflw_pshufhw (struct
expand_vec_perm_d *d)
 }

 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
-   the permutation using the SSSE3 palignr instruction.  This succeeds
+   the permutation using the SSSE3/AVX2 palignr instruction.  This succeeds
    when all of the elements in PERM fit within one vector and we merely
    need to shift them down so that a single vector permutation has a
    chance to succeed.  */
@@ -43015,14 +43021,26 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d)
   unsigned i, nelt = d->nelt;
   unsigned min, max;
   bool in_order, ok;
-  rtx shift, target;
+  rtx shift, shift1, target, tmp;
   struct expand_vec_perm_d dcopy;

-  /* Even with AVX, palignr only operates on 128-bit vectors.  */
-  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
+  /* SSSE3 is required to apply PALIGNR on 16 bytes operands.  */
+  if (GET_MODE_SIZE (d->vmode) == 16)
+    {
+      if (!TARGET_SSSE3)
+       return false;
+    }
+  /* AVX2 is required to apply PALIGNR on 32 bytes operands.  */
+  else if (GET_MODE_SIZE (d->vmode) == 32)
+    {
+      if (!TARGET_AVX2)
+       return false;
+    }
+  /* Other sizes are not supported.  */
+  else
     return false;

-  min = nelt, max = 0;
+  min = 2 * nelt, max = 0;
   for (i = 0; i < nelt; ++i)
     {
       unsigned e = d->perm[i];
@@ -43041,9 +43059,35 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d)

   dcopy = *d;
   shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
-  target = gen_reg_rtx (TImode);
-  emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
-                                 gen_lowpart (TImode, d->op0), shift));
+  shift1 = GEN_INT ((min - nelt / 2) *
+          GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
+
+  if (GET_MODE_SIZE (d->vmode) != 32)
+    {
+      target = gen_reg_rtx (TImode);
+      emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
+                                     gen_lowpart (TImode, d->op0), shift));
+    }
+  else
+    {
+      target = gen_reg_rtx (V2TImode);
+      tmp = gen_reg_rtx (V4DImode);
+      emit_insn (gen_avx2_permv2ti (tmp,
+                                   gen_lowpart (V4DImode, d->op0),
+                                   gen_lowpart (V4DImode, d->op1),
+                                   GEN_INT (33)));
+      if (min < nelt / 2)
+        emit_insn (gen_avx2_palignrv2ti (target,
+                                        gen_lowpart (V2TImode, tmp),
+                                        gen_lowpart (V2TImode, d->op0),
+                                        shift));
+      else
+       emit_insn (gen_avx2_palignrv2ti (target,
+                                        gen_lowpart (V2TImode, d->op1),
+                                        gen_lowpart (V2TImode, tmp),
+                                        shift1));
+    }
+

   dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
   dcopy.one_operand_p = true;

On Tue, Apr 29, 2014 at 1:03 AM, Richard Henderson <rth@redhat.com> wrote:
> On 04/28/2014 01:43 PM, Evgeny Stupachenko wrote:
>> Agree on checks:
>>
>>   /* PALIGNR of 2 128-bits registers takes only 1 instrucion.
>>      Requires SSSE3.  */
>>   if (GET_MODE_SIZE (d->vmode) == 16)
>>     {
>>       if(!TARGET_SSSE3)
>>         return false;
>>     }
>>   /* PALIGNR of 2 256-bits registers on AVX2 costs only 2 instructions:
>>      PERM and PALIGNR.  It is more profitable than 2 PSHUFB and PERM.  */
>>   else if (GET_MODE_SIZE (d->vmode) == 32)
>>     {
>>       if(!TARGET_AVX2)
>>         return false;
>>     }
>>   else
>>     return false;
>
> Thanks, much better.
>
>
> r~


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]