Bug 93613 - Missed optimization with _mm256_permute2x128_si256 intrinsic
Summary: Missed optimization with _mm256_permute2x128_si256 intrinsic
Status: UNCONFIRMED
Alias: None
Product: gcc
Classification: Unclassified
Component: target (show other bugs)
Version: 9.2.1
: P3 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on: 93594
Blocks:
  Show dependency treegraph
 
Reported: 2020-02-06 14:39 UTC by Jakub Jelinek
Modified: 2020-02-07 07:40 UTC (History)
0 users

See Also:
Host:
Target: x86_64-*-*, i?86-*-*
Build:
Known to work:
Known to fail:
Last reconfirmed:


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Jakub Jelinek 2020-02-06 14:39:29 UTC
+++ This bug was initially created as a clone of Bug #93594 +++

#include <x86intrin.h>

/* Test case: both sources are X cast to 256 bits.  With imm8 0x80, bits 1:0
   are 0, so the low half of the result is the low 128 bits of the first
   source (X), and bit 7 is set, so the high half of the result is zeroed.  */
__m256i
foo (__m128i x)
{
  return _mm256_permute2x128_si256 (_mm256_castsi128_si256 (x), _mm256_castsi128_si256 (x), 0x80);
}

/* Test case: first source is all zeros, second source is X cast to 256 bits.
   With imm8 0x02, bits 1:0 are 2, so the low half of the result is the low
   128 bits of the second source (X); bits 5:4 are 0, so the high half is the
   low 128 bits of the first source (zero).  */
__m256i
bar (__m128i x)
{
  return _mm256_permute2x128_si256 (_mm256_setzero_si256 (), _mm256_castsi128_si256 (x), 0x02);
}

/* Test case: first source is X cast to 256 bits, second source is all zeros.
   With imm8 0x20, bits 1:0 are 0, so the low half of the result is the low
   128 bits of the first source (X); bits 5:4 are 2, so the high half is the
   low 128 bits of the second source (zero).  */
__m256i
baz (__m128i x)
{
  return _mm256_permute2x128_si256 (_mm256_castsi128_si256 (x), _mm256_setzero_si256 (), 0x20);
}

/* Test case: same imm8 (0x80) as foo, but both sources are constant vectors,
   so the whole permutation has a compile-time constant result.  The
   parameter X is not used.  */
__m256i
qux (__m128i x)
{
  return _mm256_permute2x128_si256 (_mm256_set_epi64x (1, 2, 3, 4), _mm256_set_epi64x (5, 6, 7, 8), 0x80);
}

/* Test case: same imm8 (0x02) as bar, but both sources are constant vectors,
   so the whole permutation has a compile-time constant result.  The
   parameter X is not used.  */
__m256i
corge (__m128i x)
{
  return _mm256_permute2x128_si256 (_mm256_set_epi64x (1, 2, 3, 4), _mm256_set_epi64x (5, 6, 7, 8), 0x02);
}

/* Test case: same imm8 (0x20) as baz, but both sources are constant vectors,
   so the whole permutation has a compile-time constant result.  The
   parameter X is not used.  */
__m256i
quux (__m128i x)
{
  return _mm256_permute2x128_si256 (_mm256_set_epi64x (1, 2, 3, 4), _mm256_set_epi64x (5, 6, 7, 8), 0x20);
}

The _mm256_permute2x128_si256 issues are similar, but really unrelated and IMHO should be tracked in a separate PR.  The problem there is that the pattern we use doesn't really describe what the instruction does: it uses an UNSPEC_VPERMTI, which obviously can't be simplified by the generic code.  The reason is mainly that the instruction isn't just a two-source permutation, but essentially a three-source permutation, with the third source being zero.
Comment 1 Jakub Jelinek 2020-02-06 14:42:21 UTC
I've tried:
--- gcc/config/i386/sse.md.jj	2020-02-06 13:40:27.485007762 +0100
+++ gcc/config/i386/sse.md	2020-02-06 15:24:35.097743017 +0100
@@ -81,7 +81,6 @@ (define_c_enum "unspec" [
 
   ;; For AVX2 support
   UNSPEC_VPERMVAR
-  UNSPEC_VPERMTI
   UNSPEC_GATHER
   UNSPEC_VSIBADDR
 
@@ -20224,15 +20223,55 @@ (define_insn "avx512f_perm<mode>_1<mask_
    (set_attr "prefix" "<mask_prefix2>")
    (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn "avx2_permv2ti"
-  [(set (match_operand:V4DI 0 "register_operand" "=x")
-	(unspec:V4DI
-	  [(match_operand:V4DI 1 "register_operand" "x")
-	   (match_operand:V4DI 2 "nonimmediate_operand" "xm")
-	   (match_operand:SI 3 "const_0_to_255_operand" "n")]
-	  UNSPEC_VPERMTI))]
+(define_expand "avx2_permv2ti"	; decodes the imm8 into explicit element indices for avx2_permv2ti_1
+  [(match_operand:V4DI 0 "register_operand")
+   (match_operand:V4DI 1 "register_operand")
+   (match_operand:V4DI 2 "nonimmediate_operand")
+   (match_operand:SI 3 "const_0_to_255_operand")]
   "TARGET_AVX2"
-  "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+{
+  int mask = INTVAL (operands[3]);
+  int first = (mask & 0x08) ? 8 : (mask & 0x03) * 2;  /* Bit 3: take zeros (index 8); else bits 1:0 pick a 2-element lane of {op1, op2}.  */
+  int second = (mask & 0x80) ? 8 : (mask & 0x30) / 8;  /* Bit 7: take zeros; else bits 5:4 give the lane, scaled to an even index 0/2/4/6.  */
+  emit_insn (gen_avx2_permv2ti_1 (operands[0], operands[1],
+				  operands[2], CONST0_RTX (V8DImode),  /* Zero vector passed as the third source.  */
+				  GEN_INT (first),
+				  GEN_INT (first + 1),
+				  GEN_INT (second),
+				  GEN_INT (second + 1)));
+  DONE;
+})
+
+(define_insn "avx2_permv2ti_1"	; vperm2i128 as a vec_select over {op1, op2, zero}
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+	(vec_select:V4DI
+	  (vec_concat:V16DI
+	    (vec_concat:V8DI
+	      (match_operand:V4DI 1 "register_operand" "x")
+	      (match_operand:V4DI 2 "nonimmediate_operand" "xm"))
+	    (match_operand:V8DI 3 "const0_operand" "C"))
+	  (parallel [(match_operand 4 "const_0_to_15_operand")
+		     (match_operand 5 "const_0_to_15_operand")
+		     (match_operand 6 "const_0_to_15_operand")
+		     (match_operand 7 "const_0_to_15_operand")])))]	; ops 4 and 6 must be even: the expander passes 0, 2, 4, 6 or 8
+  "TARGET_AVX2
+   && (INTVAL (operands[4]) & 1) == 0
+   && INTVAL (operands[5]) == INTVAL (operands[4]) + 1
+   && (INTVAL (operands[6]) & 1) == 0
+   && INTVAL (operands[7]) == INTVAL (operands[6]) + 1"
+{
+  int mask = 0;  /* Rebuild the vperm2i128 immediate from the selected indices.  */
+  if (INTVAL (operands[4]) >= 8)
+    mask |= 0x08;  /* First pair comes from the zero operand.  */
+  else
+    mask |= INTVAL (operands[4]) / 2;  /* Even index 0/2/4/6 -> lane number in bits 1:0.  */
+  if (INTVAL (operands[6]) >= 8)
+    mask |= 0x80;  /* Second pair comes from the zero operand.  */
+  else
+    mask |= INTVAL (operands[6]) * 8;  /* Even index 0/2/4/6 -> lane number in bits 5:4.  */
+  operands[4] = GEN_INT (mask);
+  return "vperm2i128\t{%4, %2, %1, %0|%0, %1, %2, %4}";
+}
   [(set_attr "type" "sselog")
    (set_attr "prefix" "vex")
    (set_attr "mode" "OI")])

but unfortunately it doesn't help; I guess we'll need to improve simplify-rtx.c to deal with that.  For the last 3 functions it even makes things worse: combine then simplifies those patterns to vector constants, but we don't have an instruction that combine could match, that would force the const_vector into memory, and that could be split before reload.  For those I guess we want gimple folding of the builtin.  Of course, people really should use __builtin_shuffle instead of this mess... ;)