This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH] Add *vec_concatv4sf_0 and *vec_concatv4si_0 insns (PR target/88278)
- From: Uros Bizjak <ubizjak at gmail dot com>
- To: Jakub Jelinek <jakub at redhat dot com>
- Cc: "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
- Date: Sun, 2 Dec 2018 20:23:21 +0100
- Subject: Re: [PATCH] Add *vec_concatv4sf_0 and *vec_concatv4si_0 insns (PR target/88278)
- References: <20181130211441.GJ12380@tucnak>
On Fri, Nov 30, 2018 at 10:14 PM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> As mentioned in the PR, while in vec_concatv2df or vec_concatv2di we have
> alternatives where the lower half is low 64-bit part of a xmm reg or 64-bit
> memory and upper half is zero using movq/vmovq (or movq2dq), for
> vec_concatv4sf and vec_concatv4si we don't have anything similar, although
> the operations do pretty much the same thing. I'm not advocating to put
> in alternatives with GPR operands as having V2SF or V2SI in a GPR is too
> weird, but for the cases where a simple movq or vmovq instruction can move
> 64-bits and clear upper 64-bits these patterns look useful to me.
>
> I had to tweak the pr53759.c testcase because it started FAILing, but only
> because it changed:
> - vxorps %xmm0, %xmm0, %xmm0
> - vmovlps (%rsi), %xmm0, %xmm0
> + vmovq (%rsi), %xmm0
> vaddps %xmm0, %xmm0, %xmm0
> vmovaps %xmm0, (%rdi)
> ret
> I've added a variant of that testcase that tests its original purpose by
> using a register there not known to be all zeros.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2018-11-30 Jakub Jelinek <jakub@redhat.com>
>
> PR target/88278
> * config/i386/sse.md (*vec_concatv4sf_0, *vec_concatv4si_0): New insns.
>
> * gcc.target/i386/pr88278.c: New test.
> * gcc.target/i386/pr53759.c: Don't expect vmovlps insn, expect vmovq
> instead.
> * gcc.target/i386/pr53759-2.c: New test.
OK with a constraint adjustment below.
Thanks,
Uros.
>
> --- gcc/config/i386/sse.md.jj 2018-11-30 21:36:22.277762263 +0100
> +++ gcc/config/i386/sse.md 2018-11-30 21:38:02.261120768 +0100
> @@ -7248,6 +7248,17 @@ (define_insn "*vec_concatv4sf"
> (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex")
> (set_attr "mode" "V4SF,V4SF,V2SF,V2SF")])
>
> +(define_insn "*vec_concatv4sf_0"
> + [(set (match_operand:V4SF 0 "register_operand" "=v")
> + (vec_concat:V4SF
> + (match_operand:V2SF 1 "nonimmediate_operand" "xm")
The constraint here can be "vm". There is no limitation on register
set for vmovq insn.
> + (match_operand:V2SF 2 "const0_operand" " C")))]
> + "TARGET_SSE2"
> + "%vmovq\t{%1, %0|%0, %1}"
> + [(set_attr "type" "ssemov")
> + (set_attr "prefix" "maybe_vex")
> + (set_attr "mode" "DF")])
> +
> ;; Avoid combining registers from different units in a single alternative,
> ;; see comment above inline_secondary_memory_needed function in i386.c
> (define_insn "vec_set<mode>_0"
> @@ -14409,6 +14420,23 @@ (define_insn "*vec_concatv4si"
> (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex")
> (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")])
>
> +(define_insn "*vec_concatv4si_0"
> + [(set (match_operand:V4SI 0 "register_operand" "=v,x")
> + (vec_concat:V4SI
> + (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y")
> + (match_operand:V2SI 2 "const0_operand" " C,C")))]
> + "TARGET_SSE2"
> + "@
> + %vmovq\t{%1, %0|%0, %1}
> + movq2dq\t{%1, %0|%0, %1}"
> + [(set_attr "type" "ssemov")
> + (set_attr "prefix" "maybe_vex,orig")
> + (set_attr "mode" "TI")
> + (set (attr "preferred_for_speed")
> + (if_then_else (eq_attr "alternative" "1")
> + (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC")
> + (symbol_ref "true")))])
> +
> ;; movd instead of movq is required to handle broken assemblers.
> (define_insn "vec_concatv2di"
> [(set (match_operand:V2DI 0 "register_operand"
> --- gcc/testsuite/gcc.target/i386/pr88278.c.jj 2018-11-30 21:38:02.261120768 +0100
> +++ gcc/testsuite/gcc.target/i386/pr88278.c 2018-11-30 21:38:02.261120768 +0100
> @@ -0,0 +1,34 @@
> +/* PR target/88278 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-sse3 -fgimple -masm=att" } */
> +/* { dg-final { scan-assembler-times "movq\[ \t]+\\(" 2 } } */
> +/* { dg-final { scan-assembler-not "punpcklqdq\[ \t]+" } } */
> +/* { dg-final { scan-assembler-not "pxor\[ \t]+" } } */
> +
> +typedef unsigned char v16qi __attribute__((vector_size(16)));
> +typedef unsigned char v8qi __attribute__((vector_size(8)));
> +typedef unsigned int v4si __attribute__((vector_size(16)));
> +typedef unsigned int v2si __attribute__((vector_size(8)));
> +
> +v16qi __GIMPLE foo (unsigned char *p)
> +{
> + v8qi _2;
> + v16qi _3;
> +
> +bb_2:
> + _2 = __MEM <v8qi, 8> (p_1(D));
> + _3 = _Literal (v16qi) { _2, _Literal (v8qi) { _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0 } };
> + return _3;
> +}
> +
> +
> +v4si __GIMPLE bar (unsigned int *p)
> +{
> + v2si _2;
> + v4si _3;
> +
> +bb_2:
> + _2 = __MEM <v2si, 32> (p_1(D));
> + _3 = _Literal (v4si) { _2, _Literal (v2si) { 0u, 0u } };
> + return _3;
> +}
> --- gcc/testsuite/gcc.target/i386/pr53759.c.jj 2016-05-22 12:20:04.184034591 +0200
> +++ gcc/testsuite/gcc.target/i386/pr53759.c 2018-11-30 22:04:56.432806470 +0100
> @@ -12,5 +12,6 @@ foo (__m128 *x, __m64 *y)
> *x = _mm_add_ps (b, b);
> }
>
> -/* { dg-final { scan-assembler "vmovlps\[ \\t\]" } } */
> +/* { dg-final { scan-assembler "vmovq\[ \\t\]" } } */
> +/* { dg-final { scan-assembler-not "vmovlps\[ \\t\]" } } */
> /* { dg-final { scan-assembler-not "vshufps\[ \\t\]" } } */
> --- gcc/testsuite/gcc.target/i386/pr53759-2.c.jj 2018-11-30 22:05:06.932657374 +0100
> +++ gcc/testsuite/gcc.target/i386/pr53759-2.c 2018-11-30 22:05:42.568151361 +0100
> @@ -0,0 +1,16 @@
> +/* PR target/53759 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +
> +#include <xmmintrin.h>
> +
> +void
> +foo (__m128 *x, __m64 *y)
> +{
> + __m128 a = _mm_add_ps (x[1], x[2]);
> + __m128 b = _mm_loadl_pi (a, y);
> + *x = _mm_add_ps (b, b);
> +}
> +
> +/* { dg-final { scan-assembler "vmovlps\[ \\t\]" } } */
> +/* { dg-final { scan-assembler-not "vshufps\[ \\t\]" } } */
>
> Jakub