This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

[PATCH] Improve avx_vperm_broadcast_

From: Jakub Jelinek <jakub at redhat dot com>
To: Uros Bizjak <ubizjak at gmail dot com>, Kirill Yukhin <kirill dot yukhin at gmail dot com>, \
Cc: gcc-patches at gcc dot gnu dot org
Date: Mon, 23 May 2016 19:15:07 +0200
Subject: [PATCH] Improve *avx_vperm_broadcast_*
Authentication-results: sourceware.org; auth=none
Reply-to: Jakub Jelinek <jakub at redhat dot com>

Hi!

The vbroadcastss and vpermilps insns are already in AVX512F & AVX512VL,
so can be used with v instead of x, the splitter case where we for AVX
emit vpermilps plus vpermf128 is more problematic, because the latter
insn isn't available in EVEX.  But, we can get the same effect with
vshuff32x4 when both source operands are the same.
Alternatively, we could replace the vpermilps and vshuff32x4 insns
with the AVX512VL arbitrary permutations I think, the question is
what is faster, because we'd need to load the mask from memory.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2016-05-23  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/sse.md
	(<mask_codefor>avx512vl_shuf_<shuffletype>32x4_1<mask_name>): Rename
	to ...
	(avx512vl_shuf_<shuffletype>32x4_1<mask_name>): ... this.
	(*avx_vperm_broadcast_v4sf): Use v constraint instead of x.  Use
	maybe_evex prefix instead of vex.
	(*avx_vperm_broadcast_<mode>): Use v constraint instead of x.  Handle
	EXT_REX_SSE_REG_P (op0) case in the splitter.

	* gcc.target/i386/avx512vl-vbroadcast-3.c: New test.

--- gcc/config/i386/sse.md.jj	2016-05-22 12:27:34.000000000 +0200
+++ gcc/config/i386/sse.md	2016-05-23 13:54:22.211998751 +0200
@@ -12380,7 +12380,7 @@
   DONE;
 })
 
-(define_insn "<mask_codefor>avx512vl_shuf_<shuffletype>32x4_1<mask_name>"
+(define_insn "avx512vl_shuf_<shuffletype>32x4_1<mask_name>"
   [(set (match_operand:VI4F_256 0 "register_operand" "=v")
 	(vec_select:VI4F_256
 	  (vec_concat:<ssedoublemode>
@@ -17247,9 +17247,9 @@
 ;; If it so happens that the input is in memory, use vbroadcast.
 ;; Otherwise use vpermilp (and in the case of 256-bit modes, vperm2f128).
 (define_insn "*avx_vperm_broadcast_v4sf"
-  [(set (match_operand:V4SF 0 "register_operand" "=x,x,x")
+  [(set (match_operand:V4SF 0 "register_operand" "=v,v,v")
 	(vec_select:V4SF
-	  (match_operand:V4SF 1 "nonimmediate_operand" "m,o,x")
+	  (match_operand:V4SF 1 "nonimmediate_operand" "m,o,v")
 	  (match_parallel 2 "avx_vbroadcast_operand"
 	    [(match_operand 3 "const_int_operand" "C,n,n")])))]
   "TARGET_AVX"
@@ -17271,13 +17271,13 @@
   [(set_attr "type" "ssemov,ssemov,sselog1")
    (set_attr "prefix_extra" "1")
    (set_attr "length_immediate" "0,0,1")
-   (set_attr "prefix" "vex")
+   (set_attr "prefix" "maybe_evex")
    (set_attr "mode" "SF,SF,V4SF")])
 
 (define_insn_and_split "*avx_vperm_broadcast_<mode>"
-  [(set (match_operand:VF_256 0 "register_operand" "=x,x,x")
+  [(set (match_operand:VF_256 0 "register_operand" "=v,v,v")
 	(vec_select:VF_256
-	  (match_operand:VF_256 1 "nonimmediate_operand" "m,o,?x")
+	  (match_operand:VF_256 1 "nonimmediate_operand" "m,o,?v")
 	  (match_parallel 2 "avx_vbroadcast_operand"
 	    [(match_operand 3 "const_int_operand" "C,n,n")])))]
   "TARGET_AVX"
@@ -17309,6 +17309,23 @@
 
       /* Shuffle the lane we care about into both lanes of the dest.  */
       mask = (elt / (<ssescalarnum> / 2)) * 0x11;
+      if (EXT_REX_SSE_REG_P (op0))
+	{
+	  /* There is no EVEX VPERM2F128, but we can use either VBROADCASTSS
+	     or VSHUFF128.  */
+	  gcc_assert (<MODE>mode == V8SFmode);
+	  if ((mask & 1) == 0)
+	    emit_insn (gen_avx2_vec_dupv8sf (op0,
+					     gen_lowpart (V4SFmode, op0)));
+	  else
+	    emit_insn (gen_avx512vl_shuf_f32x4_1 (op0, op0, op0,
+						  GEN_INT (4), GEN_INT (5),
+						  GEN_INT (6), GEN_INT (7),
+						  GEN_INT (12), GEN_INT (13),
+						  GEN_INT (14), GEN_INT (15)));
+	  DONE;
+	}
+
       emit_insn (gen_avx_vperm2f128<mode>3 (op0, op0, op0, GEN_INT (mask)));
       DONE;
     }
--- gcc/testsuite/gcc.target/i386/avx512vl-vbroadcast-3.c.jj	2016-05-23 14:07:36.266695992 +0200
+++ gcc/testsuite/gcc.target/i386/avx512vl-vbroadcast-3.c	2016-05-23 14:14:49.495012459 +0200
@@ -0,0 +1,162 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mavx512vl -masm=att" } */
+
+typedef float V1 __attribute__((vector_size (16)));
+typedef float V2 __attribute__((vector_size (32)));
+typedef int V4 __attribute__((vector_size (16)));
+typedef int V5 __attribute__((vector_size (32)));
+
+void
+f1 (V1 x)
+{
+  register V1 a __asm ("xmm16");
+  a = x;
+  asm volatile ("" : "+v" (a));
+  a = __builtin_shuffle (a, (V4) { 0, 0, 0, 0 });
+  asm volatile ("" : "+v" (a));
+}
+
+void
+f2 (V1 x)
+{
+  register V1 a __asm ("xmm16");
+  a = x;
+  asm volatile ("" : "+v" (a));
+  a = __builtin_shuffle (a, (V4) { 1, 1, 1, 1 });
+  asm volatile ("" : "+v" (a));
+}
+
+void
+f3 (V1 x)
+{
+  register V1 a __asm ("xmm16");
+  a = x;
+  asm volatile ("" : "+v" (a));
+  a = __builtin_shuffle (a, (V4) { 2, 2, 2, 2 });
+  asm volatile ("" : "+v" (a));
+}
+
+void
+f4 (V1 x)
+{
+  register V1 a __asm ("xmm16");
+  a = x;
+  asm volatile ("" : "+v" (a));
+  a = __builtin_shuffle (a, (V4) { 3, 3, 3, 3 });
+  asm volatile ("" : "+v" (a));
+}
+
+void
+f5 (V1 *x)
+{
+  register V1 a __asm ("xmm16");
+  a = __builtin_shuffle (*x, (V4) { 0, 0, 0, 0 });
+  asm volatile ("" : "+v" (a));
+}
+
+void
+f6 (V1 *x)
+{
+  register V1 a __asm ("xmm16");
+  a = __builtin_shuffle (*x, (V4) { 1, 1, 1, 1 });
+  asm volatile ("" : "+v" (a));
+}
+
+void
+f7 (V1 *x)
+{
+  register V1 a __asm ("xmm16");
+  a = __builtin_shuffle (*x, (V4) { 2, 2, 2, 2 });
+  asm volatile ("" : "+v" (a));
+}
+
+void
+f8 (V1 *x)
+{
+  register V1 a __asm ("xmm16");
+  a = __builtin_shuffle (*x, (V4) { 3, 3, 3, 3 });
+  asm volatile ("" : "+v" (a));
+}
+
+void
+f9 (V2 x)
+{
+  register V2 a __asm ("xmm16");
+  a = x;
+  asm volatile ("" : "+v" (a));
+  a = __builtin_shuffle (a, (V5) { 0, 0, 0, 0, 0, 0, 0, 0 });
+  asm volatile ("" : "+v" (a));
+}
+
+void
+f10 (V2 x)
+{
+  register V2 a __asm ("xmm16");
+  a = x;
+  asm volatile ("" : "+v" (a));
+  a = __builtin_shuffle (a, (V5) { 1, 1, 1, 1, 1, 1, 1, 1 });
+  asm volatile ("" : "+v" (a));
+}
+
+void
+f11 (V2 x)
+{
+  register V2 a __asm ("xmm16");
+  a = x;
+  asm volatile ("" : "+v" (a));
+  a = __builtin_shuffle (a, (V5) { 4, 4, 4, 4, 4, 4, 4, 4 });
+  asm volatile ("" : "+v" (a));
+}
+
+void
+f12 (V2 x)
+{
+  register V2 a __asm ("xmm16");
+  a = x;
+  asm volatile ("" : "+v" (a));
+  a = __builtin_shuffle (a, (V5) { 5, 5, 5, 5, 5, 5, 5, 5 });
+  asm volatile ("" : "+v" (a));
+}
+
+void
+f13 (V2 *x)
+{
+  register V2 a __asm ("xmm16");
+  a = __builtin_shuffle (*x, (V5) { 0, 0, 0, 0, 0, 0, 0, 0 });
+  asm volatile ("" : "+v" (a));
+}
+
+void
+f14 (V2 *x)
+{
+  register V2 a __asm ("xmm16");
+  a = __builtin_shuffle (*x, (V5) { 1, 1, 1, 1, 1, 1, 1, 1 });
+  asm volatile ("" : "+v" (a));
+}
+
+void
+f15 (V2 *x)
+{
+  register V2 a __asm ("xmm16");
+  a = __builtin_shuffle (*x, (V5) { 4, 4, 4, 4, 4, 4, 4, 4 });
+  asm volatile ("" : "+v" (a));
+}
+
+void
+f16 (V2 *x)
+{
+  register V2 a __asm ("xmm16");
+  a = __builtin_shuffle (*x, (V5) { 5, 5, 5, 5, 5, 5, 5, 5 });
+  asm volatile ("" : "+v" (a));
+}
+
+/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\r]*%rdi\[^\n\r]*%xmm16" 4 } } */
+/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\r]*%xmm16\[^\n\r]*%ymm16" 3 } } */
+/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\r]*%rdi\[^\n\r]*%ymm16" 3 } } */
+/* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$0\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */
+/* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$85\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */
+/* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$170\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */
+/* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$255\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */
+/* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$0\[^\n\r]*%ymm16\[^\n\r]*%ymm16" 1 } } */
+/* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$85\[^\n\r]*%ymm16\[^\n\r]*%ymm16" 2 } } */
+/* { dg-final { scan-assembler-times "vshuff32x4\[^\n\r]*\\\$3\[^\n\r]*%ymm16\[^\n\r]*%ymm16\[^\n\r]*%ymm16" 2 } } */

	Jakub

Follow-Ups:
- Re: [PATCH] Improve *avx_vperm_broadcast_*
  - From: Kirill Yukhin
- Re: [PATCH] Improve *avx_vperm_broadcast_*
  - From: H.J. Lu

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]

[PATCH] Improve *avx_vperm_broadcast_*

[PATCH] Improve avx_vperm_broadcast_