[PATCH] [i386] Combine avx_vec_concatv16si and avx512f_zero_extendv16hiv16si2_1 to avx512f_zero_extendv16hiv16si2_2.

liuhongt hongtao.liu@intel.com
Wed Aug 11 06:43:06 GMT 2021


Hi:
  Add define_insn_and_split patterns to combine avx_vec_concatv16si/2
with avx512f_zero_extendv16hiv16si2_1, since the latter already
zero-extends the upper bits.  Do the same for the other patterns
related to pmovzx{bw,wd,dq}.

This enables optimizations such as the following, where the redundant
vmovdqa (the line marked with "-") is no longer emitted:

-       vmovdqa %ymm0, %ymm0    # 7     [c=4 l=6]  avx_vec_concatv16si/2
        vpmovzxwd       %ymm0, %zmm0    # 22    [c=4 l=6]  avx512f_zero_extendv16hiv16si2
        ret             # 25    [c=0 l=1]  simple_return_internal

  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
  Ok for trunk?

gcc/ChangeLog:

	PR target/101846
	* config/i386/sse.md (*avx2_zero_extendv16qiv16hi2_2): New
	post_reload define_insn_and_split.
	(*avx512bw_zero_extendv32qiv32hi2_2): Ditto.
	(*sse4_1_zero_extendv8qiv8hi2_4): Ditto.
	(*avx512f_zero_extendv16hiv16si2_2): Ditto.
	(*avx2_zero_extendv8hiv8si2_2): Ditto.
	(*sse4_1_zero_extendv4hiv4si2_4): Ditto.
	(*avx512f_zero_extendv8siv8di2_2): Ditto.
	(*avx2_zero_extendv4siv4di2_2): Ditto.
	(*sse4_1_zero_extendv2siv2di2_4): Ditto.

gcc/testsuite/ChangeLog:

	PR target/101846
	* gcc.target/i386/pr101846-1.c: New test.
---
 gcc/config/i386/sse.md                     | 220 +++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr101846-1.c |  95 +++++++++
 2 files changed, 315 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-1.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a46a2373547..6450c058458 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -673,8 +673,14 @@ (define_mode_iterator VI12_128 [V16QI V8HI])
 (define_mode_iterator VI14_128 [V16QI V4SI])
 (define_mode_iterator VI124_128 [V16QI V8HI V4SI])
 (define_mode_iterator VI24_128 [V8HI V4SI])
+(define_mode_iterator VI128_128 [V16QI V8HI V2DI])
 (define_mode_iterator VI248_128 [V8HI V4SI V2DI])
+(define_mode_iterator VI248_256 [V16HI V8SI V4DI])
+(define_mode_iterator VI248_512 [V32HI V16SI V8DI])
 (define_mode_iterator VI48_128 [V4SI V2DI])
+(define_mode_iterator VI148_512 [V64QI V16SI V8DI])
+(define_mode_iterator VI148_256 [V32QI V8SI V4DI])
+(define_mode_iterator VI148_128 [V16QI V4SI V2DI])
 
 ;; Various 256bit and 512 vector integer mode combinations
 (define_mode_iterator VI124_256 [V32QI V16HI V8SI])
@@ -18499,6 +18505,26 @@ (define_insn_and_split "*avx2_zero_extendv16qiv16hi2_1"
   operands[1] = lowpart_subreg (V16QImode, operands[1], V32QImode);
 })
 
+(define_insn_and_split "*avx2_zero_extendv16qiv16hi2_2"
+  [(set (match_operand:V32QI 0 "register_operand" "=v")
+	(vec_select:V32QI
+	  (vec_concat:V64QI
+	    (subreg:V32QI
+	      (vec_concat:VI248_256
+		(match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm")
+		(match_operand:<ssehalfvecmode> 2 "const0_operand" "C")) 0)
+	    (match_operand:V32QI 3 "const0_operand" "C"))
+	  (match_parallel 4 "pmovzx_parallel"
+	    [(match_operand 5 "const_int_operand" "n")])))]
+  "TARGET_AVX2"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V16HI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V16HImode, operands[0], V32QImode);
+  operands[1] = lowpart_subreg (V16QImode, operands[1], <ssehalfvecmode>mode);
+})
+
 (define_expand "<insn>v16qiv16hi2"
   [(set (match_operand:V16HI 0 "register_operand")
 	(any_extend:V16HI
@@ -18533,6 +18559,26 @@ (define_insn_and_split "*avx512bw_zero_extendv32qiv32hi2_1"
   operands[1] = lowpart_subreg (V32QImode, operands[1], V64QImode);
 })
 
+(define_insn_and_split "*avx512bw_zero_extendv32qiv32hi2_2"
+  [(set (match_operand:V64QI 0 "register_operand" "=v")
+	(vec_select:V64QI
+	  (vec_concat:V128QI
+	    (subreg:V64QI
+	      (vec_concat:VI248_512
+		(match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm")
+		(match_operand:<ssehalfvecmode> 2 "const0_operand" "C")) 0)
+	    (match_operand:V64QI 3 "const0_operand" "C"))
+	  (match_parallel 4 "pmovzx_parallel"
+	    [(match_operand 5 "const_int_operand" "n")])))]
+  "TARGET_AVX512BW"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V32HI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V32HImode, operands[0], V64QImode);
+  operands[1] = lowpart_subreg (V32QImode, operands[1], <ssehalfvecmode>mode);
+})
+
 (define_expand "<insn>v32qiv32hi2"
   [(set (match_operand:V32HI 0 "register_operand")
 	(any_extend:V32HI
@@ -18619,6 +18665,41 @@ (define_insn_and_split "*sse4_1_zero_extendv8qiv8hi2_3"
 }
   [(set_attr "isa" "noavx,noavx,avx")])
 
+(define_insn_and_split "*sse4_1_zero_extendv8qiv8hi2_4"
+  [(set (match_operand:V16QI 0 "register_operand" "=Yr,*x,Yw")
+	(vec_select:V16QI
+	  (vec_concat:V32QI
+	    (subreg:V16QI
+	      (vec_concat:VI248_128
+		(match_operand:<ssehalfvecmode> 1 "vector_operand" "YrBm,*xBm,Ywm")
+		(match_operand:<ssehalfvecmode> 2 "const0_operand" "C,C,C")) 0)
+	    (match_operand:V16QI 3 "const0_operand" "C,C,C"))
+	  (match_parallel 4 "pmovzx_parallel"
+	    [(match_operand 5 "const_int_operand" "n,n,n")])))]
+  "TARGET_SSE4_1"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(zero_extend:V8HI
+	  (vec_select:V8QI
+	    (match_dup 1)
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)
+		       (const_int 4) (const_int 5)
+		       (const_int 6) (const_int 7)]))))]
+{
+  operands[0] = lowpart_subreg (V8HImode, operands[0], V16QImode);
+  if (MEM_P (operands[1]))
+    {
+      operands[1] = lowpart_subreg (V8QImode, operands[1], <ssehalfvecmode>mode);
+      operands[1] = gen_rtx_ZERO_EXTEND (V8HImode, operands[1]);
+      emit_insn (gen_rtx_SET (operands[0], operands[1]));
+      DONE;
+    }
+  operands[1] = lowpart_subreg (V16QImode, operands[1], <ssehalfvecmode>mode);
+}
+  [(set_attr "isa" "noavx,noavx,avx")])
+
 (define_expand "<insn>v8qiv8hi2"
   [(set (match_operand:V8HI 0 "register_operand")
 	(any_extend:V8HI
@@ -18809,6 +18890,26 @@ (define_insn_and_split "avx512f_zero_extendv16hiv16si2_1"
   operands[1] = lowpart_subreg (V16HImode, operands[1], V32HImode);
 })
 
+(define_insn_and_split "*avx512f_zero_extendv16hiv16si2_2"
+  [(set (match_operand:V32HI 0 "register_operand" "=v")
+	(vec_select:V32HI
+	  (vec_concat:V64HI
+	    (subreg:V32HI
+	      (vec_concat:VI148_512
+	        (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm")
+		(match_operand:<ssehalfvecmode> 2 "const0_operand" "C")) 0)
+	    (match_operand:V32HI 3 "const0_operand" "C"))
+	  (match_parallel 4 "pmovzx_parallel"
+	    [(match_operand 5 "const_int_operand" "n")])))]
+  "TARGET_AVX512F"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V16SI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V16SImode, operands[0], V32HImode);
+  operands[1] = lowpart_subreg (V16HImode, operands[1], <ssehalfvecmode>mode);
+})
+
 (define_insn "avx2_<code>v8hiv8si2<mask_name>"
   [(set (match_operand:V8SI 0 "register_operand" "=v")
 	(any_extend:V8SI
@@ -18843,6 +18944,27 @@ (define_insn_and_split "avx2_zero_extendv8hiv8si2_1"
   operands[1] = lowpart_subreg (V8HImode, operands[1], V16HImode);
 })
 
+(define_insn_and_split "*avx2_zero_extendv8hiv8si2_2"
+  [(set (match_operand:V16HI 0 "register_operand" "=v")
+	(vec_select:V16HI
+	  (vec_concat:V32HI
+	    (subreg:V16HI
+	      (vec_concat:VI148_256
+		(match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm")
+		(match_operand:<ssehalfvecmode> 2 "const0_operand" "C")) 0)
+	    (match_operand:V16HI 3 "const0_operand" "C"))
+	  (match_parallel 4 "pmovzx_parallel"
+	    [(match_operand 5 "const_int_operand" "n")])))]
+  "TARGET_AVX2"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V8SI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V8SImode, operands[0], V16HImode);
+  operands[1] = lowpart_subreg (V8HImode, operands[1], <ssehalfvecmode>mode);
+})
+
+
 (define_insn "sse4_1_<code>v4hiv4si2<mask_name>"
   [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v")
 	(any_extend:V4SI
@@ -18932,6 +19054,39 @@ (define_insn_and_split "*sse4_1_zero_extendv4hiv4si2_3"
 }
   [(set_attr "isa" "noavx,noavx,avx")])
 
+(define_insn_and_split "*sse4_1_zero_extendv4hiv4si2_4"
+  [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,v")
+	(vec_select:V8HI
+	  (vec_concat:V16HI
+	    (subreg:V8HI
+	      (vec_concat:VI148_128
+		(match_operand:<ssehalfvecmode> 1 "vector_operand" "YrBm,*xBm,vm")
+		(match_operand:<ssehalfvecmode> 2 "const0_operand" "C,C,C")) 0)
+	    (match_operand:V8HI 3 "const0_operand" "C,C,C"))
+	  (match_parallel 4 "pmovzx_parallel"
+	    [(match_operand 5 "const_int_operand" "n,n,n")])))]
+  "TARGET_SSE4_1"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(zero_extend:V4SI
+	  (vec_select:V4HI
+	    (match_dup 1)
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)]))))]
+{
+  operands[0] = lowpart_subreg (V4SImode, operands[0], V8HImode);
+  if (MEM_P (operands[1]))
+    {
+      operands[1] = lowpart_subreg (V4HImode, operands[1], <ssehalfvecmode>mode);
+      operands[1] = gen_rtx_ZERO_EXTEND (V4SImode, operands[1]);
+      emit_insn (gen_rtx_SET (operands[0], operands[1]));
+      DONE;
+    }
+  operands[1] = lowpart_subreg (V8HImode, operands[1], <ssehalfvecmode>mode);
+}
+  [(set_attr "isa" "noavx,noavx,avx")])
+
 (define_insn "avx512f_<code>v8qiv8di2<mask_name>"
   [(set (match_operand:V8DI 0 "register_operand" "=v")
 	(any_extend:V8DI
@@ -19242,6 +19397,24 @@ (define_insn_and_split "*avx512f_zero_extendv8siv8di2_1"
   operands[1] = lowpart_subreg (V8SImode, operands[1], V16SImode);
 })
 
+(define_insn_and_split "*avx512f_zero_extendv8siv8di2_2"
+  [(set (match_operand:V16SI 0 "register_operand" "=v")
+	(vec_select:V16SI
+	  (vec_concat:V32SI
+	    (vec_concat:V16SI
+	      (match_operand:V8SI 1 "nonimmediate_operand" "vm")
+	      (match_operand:V8SI 2 "const0_operand" "C"))
+	    (match_operand:V16SI 3 "const0_operand" "C"))
+	  (match_parallel 4 "pmovzx_parallel"
+	    [(match_operand 5 "const_int_operand" "n")])))]
+  "TARGET_AVX512F"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V8DI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V8DImode, operands[0], V16SImode);
+})
+
 (define_expand "<insn>v8siv8di2"
   [(set (match_operand:V8DI 0 "register_operand" "=v")
 	(any_extend:V8DI
@@ -19276,6 +19449,24 @@ (define_insn_and_split "*avx2_zero_extendv4siv4di2_1"
   operands[1] = lowpart_subreg (V4SImode, operands[1], V8SImode);
 })
 
+(define_insn_and_split "*avx2_zero_extendv4siv4di2_2"
+  [(set (match_operand:V8SI 0 "register_operand" "=v")
+	(vec_select:V8SI
+	  (vec_concat:V16SI
+	    (vec_concat:V8SI
+	      (match_operand:V4SI 1 "nonimmediate_operand" "vm")
+	      (match_operand:V4SI 2 "const0_operand" "C"))
+	    (match_operand:V8SI 3 "const0_operand" "C"))
+	  (match_parallel 4 "pmovzx_parallel"
+	    [(match_operand 5 "const_int_operand" "n")])))]
+  "TARGET_AVX2"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V4DI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V4DImode, operands[0], V8SImode);
+})
+
 (define_expand "<insn>v4siv4di2"
   [(set (match_operand:V4DI 0 "register_operand")
 	(any_extend:V4DI
@@ -19352,6 +19543,35 @@ (define_insn_and_split "*sse4_1_zero_extendv2siv2di2_3"
 }
   [(set_attr "isa" "noavx,noavx,avx")])
 
+(define_insn_and_split "*sse4_1_zero_extendv2siv2di2_4"
+  [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v")
+	(vec_select:V4SI
+	  (vec_concat:V8SI
+	    (vec_concat:V4SI
+	      (match_operand:V2SI 1 "vector_operand" "YrBm, *xBm, vm")
+	      (match_operand:V2SI 2 "const0_operand" "C,C,C"))
+	    (match_operand:V4SI 3 "const0_operand" "C,C,C"))
+	  (match_parallel 4 "pmovzx_parallel"
+	    [(match_operand 5 "const_int_operand" "n,n,n")])))]
+  "TARGET_SSE4_1"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(zero_extend:V2DI
+	  (vec_select:V2SI (match_dup 1)
+			   (parallel [(const_int 0) (const_int 1)]))))]
+{
+  operands[0] = lowpart_subreg (V2DImode, operands[0], V4SImode);
+  if (MEM_P (operands[1]))
+    {
+      operands[1] = gen_rtx_ZERO_EXTEND (V2DImode, operands[1]);
+      emit_insn (gen_rtx_SET (operands[0], operands[1]));
+      DONE;
+    }
+  operands[1] = lowpart_subreg (V4SImode, operands[1], V2SImode);
+}
+  [(set_attr "isa" "noavx,noavx,avx")])
+
 (define_expand "<insn>v2siv2di2"
   [(set (match_operand:V2DI 0 "register_operand")
 	(any_extend:V2DI
diff --git a/gcc/testsuite/gcc.target/i386/pr101846-1.c b/gcc/testsuite/gcc.target/i386/pr101846-1.c
new file mode 100644
index 00000000000..40d95bde6fd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101846-1.c
@@ -0,0 +1,95 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512vl -mavx512dq -O2" } */
+/* { dg-final { scan-assembler-not "vmov" } } */
+/* { dg-final { scan-assembler-times "vpmovzxbw" "3" } } */
+/* { dg-final { scan-assembler-times "vpmovzxwd" "3" } } */
+/* { dg-final { scan-assembler-times "vpmovzxdq" "3" } } */
+
+typedef short v4hi __attribute__((vector_size (8)));
+typedef short v8hi __attribute__((vector_size (16)));
+typedef short v16hi __attribute__((vector_size (32)));
+typedef short v32hi __attribute__((vector_size (64)));
+typedef char v8qi __attribute__((vector_size (8)));
+typedef char v16qi __attribute__((vector_size (16)));
+typedef char v32qi __attribute__((vector_size (32)));
+typedef char v64qi __attribute__((vector_size (64)));
+typedef int v2si __attribute__((vector_size (8)));
+typedef int v4si __attribute__((vector_size (16)));
+typedef int v8si __attribute__((vector_size (32)));
+typedef int v16si __attribute__((vector_size (64)));
+
+v32hi
+foo_zxwd_512 (v16hi x)
+{
+  return __builtin_shufflevector (x, (v16hi) {},
+				  0, 16, 1, 17, 2, 18, 3, 19,
+				  4, 20, 5, 21, 6, 22, 7, 23,
+				  8, 24, 9, 25, 10, 26, 11, 27,
+				  12, 28, 13, 29, 14, 30, 15, 31);
+}
+
+v16hi
+foo_zxwd_256 (v8hi x)
+{
+  return __builtin_shufflevector (x, (v8hi) {},
+				  0, 8, 1, 9, 2, 10, 3, 11,
+				  4, 12, 5, 13, 6, 14, 7, 15);
+}
+
+v8hi
+foo_zxwd_128 (v4hi x)
+{
+  return __builtin_shufflevector (x, (v4hi) {}, 0, 4, 1, 5, 2, 6, 3, 7);
+}
+
+v16si
+foo_zxdq_512 (v8si x)
+{
+  return __builtin_shufflevector (x, (v8si) {},
+				  0, 8, 1, 9, 2, 10, 3, 11,
+				  4, 12, 5, 13, 6, 14, 7, 15);
+}
+
+v8si
+foo_zxdq_256 (v4si x)
+{
+  return __builtin_shufflevector (x, (v4si) {}, 0, 4, 1, 5, 2, 6, 3, 7);
+}
+
+v4si
+foo_zxdq_128 (v2si x)
+{
+  return __builtin_shufflevector (x, (v2si) {}, 0, 2, 1, 3);
+}
+
+v64qi
+foo_zxbw_512 (v32qi x)
+{
+  return __builtin_shufflevector (x, (v32qi) {},
+				  0, 32, 1, 33, 2, 34, 3, 35,
+				  4, 36, 5, 37, 6, 38, 7, 39,
+				  8, 40, 9, 41, 10, 42, 11, 43,
+				  12, 44, 13, 45, 14, 46, 15, 47,
+				  16, 48, 17, 49, 18, 50, 19, 51,
+				  20, 52, 21, 53, 22, 54, 23, 55,
+				  24, 56, 25, 57, 26, 58, 27, 59,
+				  28, 60, 29, 61, 30, 62, 31, 63);
+}
+
+v32qi
+foo_zxbw_256 (v16qi x)
+{
+  return __builtin_shufflevector (x, (v16qi) {},
+				  0, 16, 1, 17, 2, 18, 3, 19,
+				  4, 20, 5, 21, 6, 22, 7, 23,
+				  8, 24, 9, 25, 10, 26, 11, 27,
+				  12, 28, 13, 29, 14, 30, 15, 31);
+}
+
+v16qi
+foo_zxbw_128 (v8qi x)
+{
+  return __builtin_shufflevector (x, (v8qi) {},
+				  0, 8, 1, 9, 2, 10, 3, 11,
+				  4, 12, 5, 13, 6, 14, 7, 15);
+}
-- 
2.27.0



More information about the Gcc-patches mailing list