Bug 87317

Summary: Missed optimisation: merging VMOVQ with operations that only use the low 8 bytes
Product: gcc Reporter: Thiago Macieira <thiago>
Component: targetAssignee: Not yet assigned to anyone <unassigned>
Status: RESOLVED FIXED    
Severity: normal CC: hjl.tools
Priority: P3 Keywords: missed-optimization
Version: 8.2.1   
Target Milestone: 9.0   
Host: Target: x86_64-*-*
Build: Known to work:
Known to fail: Last reconfirmed: 2018-09-16 00:00:00
Attachments: A patch

Description Thiago Macieira 2018-09-15 05:26:43 UTC
Test:

#include <immintrin.h>

int f(void *ptr)
{
    __m128i data = _mm_loadl_epi64((__m128i *)ptr);
    data = _mm_cvtepu8_epi16(data);
    return _mm_cvtsi128_si32(data);
}

GCC generates (-march=haswell or -march=skylake):

        vmovq   (%rdi), %xmm0
        vpmovzxbw       %xmm0, %xmm0
        vmovd   %xmm0, %eax
        ret

Note that the VPMOVZXBW instruction only reads the low 8 bytes from the source, including if it is a memory reference. Both Clang and ICC generate:

        vpmovzxbw       (%rdi), %xmm0
        vmovd   %xmm0, %eax
        retq

Similarly for:

void f(void *dst, void *ptr)
{
    __m128i data = _mm_cvtsi32_si128(*(int*)ptr);
    data = _mm_cvtepu8_epi32(data);
    _mm_storeu_si128((__m128i*)dst, data);
}

GCC:

        vmovd   (%rsi), %xmm0
        vpmovzxbd       %xmm0, %xmm0
        vmovups %xmm0, (%rdi)
        ret

Clang and ICC:

        vpmovzxbd       (%rsi), %xmm0
        vmovdqu %xmm0, (%rdi)
        retq

There are other instructions that might benefit from this.

AVX-512 memory instructions where the OpMask is a constant might be candidates too.
Comment 1 Marc Glisse 2018-09-15 06:22:17 UTC
What I see:

- we could implement _mm_cvtsi128_si32 using gcc extensions instead of a builtin

- I think we have code to simplify (vec_select (vec_concat ...) ...) when we select everything from the same half, but we do not handle an intermediate subreg: (vec_select:V8QI (subreg:V16QI (vec_concat:V2DI ...

- sse4_1_zero_extendv8qiv8hi2 is described as taking a v16qi as input, not a v8qi
Comment 2 H.J. Lu 2018-09-16 03:58:56 UTC
Try this:

https://github.com/hjl-tools/gcc/commit/a5e6fd272554f58136ba45bbdf9fd48553a72324

I got

[hjl@gnu-tools-1 gcc]$ cat x.c
#include <immintrin.h>

int f(void *ptr)
{
    __m128i data = _mm_loadl_epi64((__m128i *)ptr);
    data = _mm_cvtepu8_epi16(data);
    return _mm_cvtsi128_si32(data);
}
[hjl@gnu-tools-1 gcc]$ ./xgcc -B./ -O2 -march=haswell x.c -S
cat [hjl@gnu-tools-1 gcc]$ cat x.s
	.file	"x.c"
	.text
	.p2align 4
	.globl	f
	.type	f, @function
f:
.LFB5178:
	.cfi_startproc
	vpmovzxbw	(%rdi), %xmm0
	vmovd	%xmm0, %eax
	ret
	.cfi_endproc
.LFE5178:
	.size	f, .-f
	.ident	"GCC: (GNU) 9.0.0 20180916 (experimental)"
	.section	.note.GNU-stack,"",@progbits
[hjl@gnu-tools-1 gcc]$ cat y.c
#include <immintrin.h>

void f(void *dst, void *ptr)
{
    __m128i data = _mm_cvtsi32_si128(*(int*)ptr);
    data = _mm_cvtepu8_epi32(data);
    _mm_storeu_si128((__m128i*)dst, data);
}
[hjl@gnu-tools-1 gcc]$ ./xgcc -B./ -O2 -march=haswell y.c -S
c[hjl@gnu-tools-1 gcc]$ cat y.s
	.file	"y.c"
	.text
	.p2align 4
	.globl	f
	.type	f, @function
f:
.LFB5178:
	.cfi_startproc
	vpmovzxbd	(%rsi), %xmm0
	vmovups	%xmm0, (%rdi)
	ret
	.cfi_endproc
.LFE5178:
	.size	f, .-f
	.ident	"GCC: (GNU) 9.0.0 20180916 (experimental)"
	.section	.note.GNU-stack,"",@progbits
[hjl@gnu-tools-1 gcc]$
Comment 3 H.J. Lu 2018-09-16 04:05:40 UTC
We can add patterns with memory operand for all vpmov<extsuffix> instructions.
Comment 4 Marc Glisse 2018-09-16 05:56:25 UTC
> (match_operand:DI 1 "nonimmediate_operand" "m,*m,m")

Does it have to come from memory, can't it also come from a (sub)register?

int f(__m64 x){
  __m128i y = _mm_movpi64_epi64(x); // or harder _mm_set1_epi64(x)
  __m128i z = _mm_cvtepu8_epi16(y);
  return _mm_cvtsi128_si32(z);
}
Comment 5 H.J. Lu 2018-09-16 21:32:05 UTC
Created attachment 44702 [details]
A patch

Please try this.
Comment 6 hjl@gcc.gnu.org 2018-11-21 13:19:25 UTC
Author: hjl
Date: Wed Nov 21 13:18:54 2018
New Revision: 266342

URL: https://gcc.gnu.org/viewcvs?rev=266342&root=gcc&view=rev
Log:
x86: Add pmovzx/pmovsx patterns with memory operands

Many x86 pmovzx/pmovsx instructions with memory operands are modeled in
a wrong way.  For example:

(define_insn "sse4_1_<code>v8qiv8hi2<mask_name>"
  [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,v")
    (any_extend:V8HI
      (vec_select:V8QI
        (match_operand:V16QI 1 "nonimmediate_operand" "Yrm,*xm,vm")
        (parallel [(const_int 0) (const_int 1)
               (const_int 2) (const_int 3)
               (const_int 4) (const_int 5)
               (const_int 6) (const_int 7)]))))]

should be defind for memory operands as:

(define_insn "sse4_1_<code>v8qiv8hi2<mask_name>"
  [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,v")
    (any_extend:V8HI
      (match_operand:V8QI "memory_operand" "m,m,m")))]

This patch updates them to

(define_insn "sse4_1_<code>v8qiv8hi2<mask_name>"
  [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,v")
	(any_extend:V8HI
	  (vec_select:V8QI
	    (match_operand:V16QI 1 "register_operand" "Yr,*x,v")
	    (parallel [(const_int 0) (const_int 1)
		       (const_int 2) (const_int 3)
		       (const_int 4) (const_int 5)
		       (const_int 6) (const_int 7)]))))]

(define_insn "*sse4_1_<code>v8qiv8hi2<mask_name>_1"
  [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,v")
	(any_extend:V8HI
	  (match_operand:V8QI "subreg_memory_operand" "m,m,m")))]

with a splitter:

(define_insn_and_split "*sse4_1_<code>v8qiv8hi2<mask_name>_2"
  [(set (match_operand:V8HI 0 "register_operand")
        (any_extend:V8HI
          (vec_select:V8QI
            (subreg:V16QI
              (vec_concat:V2DI
                (match_operand:DI 1 "memory_operand")
                (const_int 0)) 0)
            (parallel [(const_int 0) (const_int 1)
                       (const_int 2) (const_int 3)
                       (const_int 4) (const_int 5)
                       (const_int 6) (const_int 7)]))))]
  "TARGET_SSE4_1
   && <mask_avx512bw_condition>
   && <mask_avx512vl_condition>
  "&& can_create_pseudo_p ()"
  "#"
  "&& 1"
  [(set (match_dup 0)
        (any_extend:V8HI (match_dup 1)))]
  "operands[1] = adjust_address_nv (operands[1], V8QImode, 0);")

This patch requires updating apply_subst_iterator to handle
define_insn_and_split.

gcc/

	PR target/87317
	* config/i386/sse.md (sse4_1_<code>v8qiv8hi2<mask_name>): Replace
	nonimmediate_operand with register_operand.
	(avx2_<code>v8qiv8si2<mask_name>): Likewise.
	(sse4_1_<code>v4qiv4si2<mask_name>): Likewise.
	(sse4_1_<code>v4hiv4si2<mask_name>): Likewise.
	(sse4_1_<code>v2qiv2di2<mask_name>): Likewise.
	(avx512f_<code>v8qiv8di2<mask_name>): Likewise.
	(avx2_<code>v4qiv4di2<mask_name>): Likewise.
	(avx2_<code>v4hiv4di2<mask_name>): Likewise.
	(sse4_1_<code>v2hiv2di2<mask_name>): Likewise.
	(sse4_1_<code>v2siv2di2<mask_name>): Likewise.
	(*sse4_1_<code>v8qiv8hi2<mask_name>_1): New pattern.
	(*sse4_1_<code>v8qiv8hi2<mask_name>_2): Likewise.
	(*avx2_<code>v8qiv8si2<mask_name>_1): Likewise.
	(*avx2_<code>v8qiv8si2<mask_name>_2): Likewise.
	(*sse4_1_<code>v4qiv4si2<mask_name>_1): Likewise.
	(*sse4_1_<code>v4qiv4si2<mask_name>_2): Likewise.
	(*sse4_1_<code>v4hiv4si2<mask_name>_1): Likewise.
	(*sse4_1_<code>v4hiv4si2<mask_name>_2): Likewise.
	(*avx512f_<code>v8qiv8di2<mask_name>_1): Likewise.
	(*avx512f_<code>v8qiv8di2<mask_name>_2): Likewise.
	(*avx2_<code>v4qiv4di2<mask_name>_1): Likewise.
	(*avx2_<code>v4qiv4di2<mask_name>_2): Likewise.
	(*avx2_<code>v4hiv4di2<mask_name>_1): Likewise.
	(*avx2_<code>v4hiv4di2<mask_name>_2): Likewise.
	(*sse4_1_<code>v2hiv2di2<mask_name>_1): Likewise.
	(*sse4_1_<code>v2hiv2di2<mask_name>_2): Likewise.
	(*sse4_1_<code>v2siv2di2<mask_name>_1): Likewise.
	(*sse4_1_<code>v2siv2di2<mask_name>_2): Likewise.

gcc/testsuite/

	PR target/87317
	* gcc.target/i386/pr87317-1.c: New file.
	* gcc.target/i386/pr87317-2.c: Likewise.
	* gcc.target/i386/pr87317-3.c: Likewise.
	* gcc.target/i386/pr87317-4.c: Likewise.
	* gcc.target/i386/pr87317-5.c: Likewise.
	* gcc.target/i386/pr87317-6.c: Likewise.
	* gcc.target/i386/pr87317-7.c: Likewise.
	* gcc.target/i386/pr87317-8.c: Likewise.
	* gcc.target/i386/pr87317-9.c: Likewise.
	* gcc.target/i386/pr87317-10.c: Likewise.
	* gcc.target/i386/pr87317-11.c: Likewise.
	* gcc.target/i386/pr87317-12.c: Likewise.
	* gcc.target/i386/pr87317-13.c: Likewise.

Added:
    trunk/gcc/testsuite/gcc.target/i386/pr87317-1.c
    trunk/gcc/testsuite/gcc.target/i386/pr87317-10.c
    trunk/gcc/testsuite/gcc.target/i386/pr87317-11.c
    trunk/gcc/testsuite/gcc.target/i386/pr87317-12.c
    trunk/gcc/testsuite/gcc.target/i386/pr87317-13.c
    trunk/gcc/testsuite/gcc.target/i386/pr87317-2.c
    trunk/gcc/testsuite/gcc.target/i386/pr87317-3.c
    trunk/gcc/testsuite/gcc.target/i386/pr87317-4.c
    trunk/gcc/testsuite/gcc.target/i386/pr87317-5.c
    trunk/gcc/testsuite/gcc.target/i386/pr87317-6.c
    trunk/gcc/testsuite/gcc.target/i386/pr87317-7.c
    trunk/gcc/testsuite/gcc.target/i386/pr87317-8.c
    trunk/gcc/testsuite/gcc.target/i386/pr87317-9.c
Modified:
    trunk/gcc/ChangeLog
    trunk/gcc/config/i386/sse.md
    trunk/gcc/testsuite/ChangeLog
Comment 7 H.J. Lu 2018-11-21 13:21:28 UTC
Fixed for GCC 9.