This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH] Implement x86 reduc_plus_scal_v8qi (PR tree-optimization/91201)
- From: Jakub Jelinek <jakub at redhat dot com>
- To: Uros Bizjak <ubizjak at gmail dot com>
- Cc: "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
- Date: Wed, 31 Jul 2019 11:30:13 +0200
- Subject: [PATCH] Implement x86 reduc_plus_scal_v8qi (PR tree-optimization/91201)
- References: <20190731071046.GT15878@tucnak> <CAFULd4YwD+Z6DLPE_D6mqydn0ax6ELQEV1txoFbRFJVTxqjLxw@mail.gmail.com>
- Reply-to: Jakub Jelinek <jakub at redhat dot com>
Hi!
On Wed, Jul 31, 2019 at 10:51:22AM +0200, Uros Bizjak wrote:
> OK.
Thanks. This follow-up implements the same optimization for MMX with SSE (TARGET_MMX_WITH_SSE) in V8QImode;
the testcase shows that it is useful too. The difference is quite large:
- movq $0, -72(%rsp)
- movl $bytes, %eax
movq bytes(%rip), %xmm0
+ movl $bytes, %eax
+ pxor %xmm2, %xmm2
.p2align 4,,10
.p2align 3
.L2:
movdqa %xmm0, %xmm1
movq 8(%rax), %xmm0
- movq -72(%rsp), %xmm2
addq $8, %rax
paddb %xmm0, %xmm1
paddb %xmm0, %xmm2
movq %xmm1, -8(%rax)
- movq %xmm2, -72(%rsp)
cmpq $bytes+1016, %rax
jne .L2
- movq -72(%rsp), %rcx
- movzbl -72(%rsp), %eax
- movzbl %ch, %edx
- addl %edx, %eax
- movq %rcx, %rdx
- shrq $16, %rdx
- addl %edx, %eax
- movq %rcx, %rdx
- shrq $24, %rdx
- addl %edx, %eax
- movq %rcx, %rdx
- shrq $32, %rdx
- addl %edx, %eax
- movq %rcx, %rdx
- shrq $40, %rdx
- addl %edx, %eax
- movq %rcx, %rdx
- shrq $48, %rdx
- addl %eax, %edx
- movq %rcx, %rax
- shrq $56, %rax
- addl %edx, %eax
+ pxor %xmm0, %xmm0
+ movdqa %xmm2, %xmm3
+ psadbw %xmm0, %xmm3
+ movq %xmm3, %rax
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
2019-07-31 Jakub Jelinek <jakub@redhat.com>
PR tree-optimization/91201
* config/i386/mmx.md (reduc_plus_scal_v8qi): New expander.
* gcc.target/i386/sse2-pr91201-2.c: New test.
--- gcc/config/i386/mmx.md.jj 2019-07-20 08:35:05.720255567 +0200
+++ gcc/config/i386/mmx.md 2019-07-31 08:43:23.054776025 +0200
@@ -1897,6 +1897,21 @@ (define_insn "mmx_psadbw"
(set_attr "type" "mmxshft,sseiadd,sseiadd")
(set_attr "mode" "DI,TI,TI")])
+(define_expand "reduc_plus_scal_v8qi"
+ [(plus:V8QI
+ (match_operand:QI 0 "register_operand")
+ (match_operand:V8QI 1 "register_operand"))]
+ "TARGET_MMX_WITH_SSE"
+{
+ rtx tmp = gen_reg_rtx (V8QImode);
+ emit_move_insn (tmp, CONST0_RTX (V8QImode));
+ rtx tmp2 = gen_reg_rtx (V1DImode);
+ emit_insn (gen_mmx_psadbw (tmp2, operands[1], tmp));
+ tmp2 = gen_lowpart (V8QImode, tmp2);
+ emit_insn (gen_vec_extractv8qiqi (operands[0], tmp2, const0_rtx));
+ DONE;
+})
+
(define_insn_and_split "mmx_pmovmskb"
[(set (match_operand:SI 0 "register_operand" "=r,r")
(unspec:SI [(match_operand:V8QI 1 "register_operand" "y,x")]
--- gcc/testsuite/gcc.target/i386/sse2-pr91201-2.c.jj 2019-07-31 08:45:19.553086849 +0200
+++ gcc/testsuite/gcc.target/i386/sse2-pr91201-2.c 2019-07-31 08:46:52.556738334 +0200
@@ -0,0 +1,21 @@
+/* PR tree-optimization/91201 */
+/* { dg-do compile { target lp64 } } */
+/* { dg-options "-O3 -msse2 -mno-sse3" } */
+/* { dg-final { scan-assembler "\tpsadbw\t" } } */
+
+unsigned char bytes[1024];
+
+unsigned char
+sum (void)
+{
+ unsigned char r = 0;
+ unsigned char *p = (unsigned char *) bytes;
+ int n;
+
+ for (n = 8; n < sizeof (bytes); ++n)
+ {
+ p[n - 8] += p[n];
+ r += p[n];
+ }
+ return r;
+}
Jakub