This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Implement x86 reduc_plus_scal_v{16,32,64}qi (PR tree-optimization/91201)


Hi!

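As mentioned in the PR, we can use psadbw to shorten the final reduction to
a scalar for vectors of 8-bit elements.  psadbw against a zero vector sums
the eight bytes in each 64-bit half, so once the upper half has been added
to the lower one, a single psadbw replaces the remaining shift/add steps.
E.g. for -mavx2 the difference in the generated code is: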
-	vmovdqa	%xmm1, %xmm0
-	vextracti128	$0x1, %ymm1, %xmm1
-	vpaddb	%xmm1, %xmm0, %xmm0
-	vpsrldq	$8, %xmm0, %xmm1
-	vpaddb	%xmm1, %xmm0, %xmm0
-	vpsrldq	$4, %xmm0, %xmm1
-	vpaddb	%xmm1, %xmm0, %xmm0
-	vpsrldq	$2, %xmm0, %xmm1
-	vpaddb	%xmm1, %xmm0, %xmm0
-	vpsrldq	$1, %xmm0, %xmm1
-	vpaddb	%xmm1, %xmm0, %xmm0
+	vextracti128	$0x1, %ymm1, %xmm0
+	vpaddb	%xmm1, %xmm0, %xmm1
+	vpsrldq	$8, %xmm1, %xmm0
+	vpaddb	%xmm0, %xmm1, %xmm1
+	vpxor	%xmm0, %xmm0, %xmm0
+	vpsadbw	%xmm0, %xmm1, %xmm0
 	vpextrb	$0, %xmm0, %eax
Bootstrapped and regtested on x86_64-linux and i686-linux; OK for trunk?

2019-07-31  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/91201
	* config/i386/sse.md (reduc_plus_scal_v16qi): New expander.
	(REDUC_PLUS_MODE): Add V32QImode for TARGET_AVX and V64QImode for
	TARGET_AVX512F.
	(reduc_plus_scal_<mode>): Improve formatting by introducing
	a temporary.

	* gcc.target/i386/sse2-pr91201.c: New test.
	* gcc.target/i386/avx2-pr91201.c: New test.
	* gcc.target/i386/avx512bw-pr91201.c: New test.

--- gcc/config/i386/sse.md.jj	2019-07-30 12:19:45.999490854 +0200
+++ gcc/config/i386/sse.md	2019-07-30 12:19:55.379352735 +0200
@@ -2728,9 +2728,30 @@ (define_expand "reduc_plus_scal_<mode>"
   DONE;
 })
 
+(define_expand "reduc_plus_scal_v16qi"
+ [(plus:V16QI
+    (match_operand:QI 0 "register_operand")
+    (match_operand:V16QI 1 "register_operand"))]
+ "TARGET_SSE2"
+{
+  rtx tmp = gen_reg_rtx (V1TImode);
+  emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, operands[1]),
+				 GEN_INT (64)));
+  rtx tmp2 = gen_reg_rtx (V16QImode);
+  emit_insn (gen_addv16qi3 (tmp2, operands[1], gen_lowpart (V16QImode, tmp)));
+  rtx tmp3 = gen_reg_rtx (V16QImode);
+  emit_move_insn (tmp3, CONST0_RTX (V16QImode));
+  rtx tmp4 = gen_reg_rtx (V2DImode);
+  emit_insn (gen_sse2_psadbw (tmp4, tmp2, tmp3));
+  tmp4 = gen_lowpart (V16QImode, tmp4);
+  emit_insn (gen_vec_extractv16qiqi (operands[0], tmp4, const0_rtx));
+  DONE;
+})
+
 (define_mode_iterator REDUC_PLUS_MODE
  [(V4DF "TARGET_AVX") (V8SF "TARGET_AVX")
-  (V8DF "TARGET_AVX512F") (V16SF "TARGET_AVX512F")])
+  (V8DF "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
+  (V32QI "TARGET_AVX") (V64QI "TARGET_AVX512F")])
 
 (define_expand "reduc_plus_scal_<mode>"
  [(plus:REDUC_PLUS_MODE
@@ -2741,8 +2762,8 @@ (define_expand "reduc_plus_scal_<mode>"
   rtx tmp = gen_reg_rtx (<ssehalfvecmode>mode);
   emit_insn (gen_vec_extract_hi_<mode> (tmp, operands[1]));
   rtx tmp2 = gen_reg_rtx (<ssehalfvecmode>mode);
-  emit_insn (gen_add<ssehalfvecmodelower>3
-    (tmp2, tmp, gen_lowpart (<ssehalfvecmode>mode, operands[1])));
+  rtx tmp3 = gen_lowpart (<ssehalfvecmode>mode, operands[1]);
+  emit_insn (gen_add<ssehalfvecmodelower>3 (tmp2, tmp, tmp3));
   emit_insn (gen_reduc_plus_scal_<ssehalfvecmodelower> (operands[0], tmp2));
   DONE;
 })
--- gcc/testsuite/gcc.target/i386/sse2-pr91201.c.jj	2019-07-30 12:23:48.930913778 +0200
+++ gcc/testsuite/gcc.target/i386/sse2-pr91201.c	2019-07-30 12:23:45.518964018 +0200
@@ -0,0 +1,18 @@
+/* PR tree-optimization/91201 */
+/* { dg-do compile } */
+/* { dg-options "-O3 -msse2 -mno-sse3" } */
+/* { dg-final { scan-assembler "\tpsadbw\t" } } */
+
+unsigned char bytes[1024];
+
+unsigned char
+sum (void)
+{
+  unsigned char r = 0;
+  unsigned char *p = (unsigned char *) bytes;
+  int n;
+
+  for (n = 0; n < sizeof (bytes); ++n)
+    r += p[n];
+  return r;
+}
--- gcc/testsuite/gcc.target/i386/avx2-pr91201.c.jj	2019-07-30 12:24:05.199674228 +0200
+++ gcc/testsuite/gcc.target/i386/avx2-pr91201.c	2019-07-30 12:24:34.544242142 +0200
@@ -0,0 +1,6 @@
+/* PR tree-optimization/91201 */
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -mno-avx512f" } */
+/* { dg-final { scan-assembler "\tvpsadbw\t" } } */
+
+#include "sse2-pr91201.c"
--- gcc/testsuite/gcc.target/i386/avx512bw-pr91201.c.jj	2019-07-30 12:24:50.079013395 +0200
+++ gcc/testsuite/gcc.target/i386/avx512bw-pr91201.c	2019-07-30 12:25:10.685709971 +0200
@@ -0,0 +1,6 @@
+/* PR tree-optimization/91201 */
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512bw -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler "\tvpsadbw\t" } } */
+
+#include "sse2-pr91201.c"

	Jakub


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]