This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH] Use TImode for piecewise move in 64-bit mode
- From: Uros Bizjak <ubizjak at gmail dot com>
- To: "H.J. Lu" <hjl dot tools at gmail dot com>
- Cc: "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
- Date: Wed, 10 Aug 2016 17:55:09 +0200
- Subject: Re: [PATCH] Use TImode for piecewise move in 64-bit mode
- Authentication-results: sourceware.org; auth=none
- References: <20160810153247.GA14280@intel.com>
On Wed, Aug 10, 2016 at 5:32 PM, H.J. Lu <hongjiu.lu@intel.com> wrote:
> Use TImode for piecewise move in 64-bit mode. When a vector register
> is used for piecewise move, we don't increase stack_alignment_needed,
> since a vector register spill isn't required for piecewise move. Since
> stack_realign_needed is set to true by checking stack_alignment_estimated,
> which is set by pseudo vector register usage, we also need to check
> stack_realign_needed to eliminate the frame pointer.
Why only in 64-bit mode? We can also use SSE moves in 32-bit mode.
I don't think we can safely handle crtl->stack_realign_needed in this
way, because if there are other insns with 32-byte vector registers in
use in the same (large) function as the converted __builtin_memcpy, we
will *still* need a realigned stack.
Uros.
> Tested on x86-64. OK for trunk?
>
> H.J.
> ---
> gcc/
>
> * config/i386/i386.c (ix86_finalize_stack_realign_flags): Also
> check stack_realign_needed for stack realignment.
> * config/i386/i386.h (MOVE_MAX_PIECES): Set to 16 in 64-bit mode
> if unaligned SSE load and store are optimal.
>
> gcc/testsuite/
>
> * gcc.target/i386/pieces-memcpy-1.c: New test.
> * gcc.target/i386/pieces-memcpy-2.c: Likewise.
> * gcc.target/i386/pieces-memcpy-3.c: Likewise.
> * gcc.target/i386/pieces-memcpy-4.c: Likewise.
> * gcc.target/i386/pieces-memcpy-5.c: Likewise.
> * gcc.target/i386/pieces-memcpy-6.c: Likewise.
> ---
> gcc/config/i386/i386.c | 11 +++++++++--
> gcc/config/i386/i386.h | 6 +++++-
> gcc/testsuite/gcc.target/i386/pieces-memcpy-1.c | 17 +++++++++++++++++
> gcc/testsuite/gcc.target/i386/pieces-memcpy-2.c | 17 +++++++++++++++++
> gcc/testsuite/gcc.target/i386/pieces-memcpy-3.c | 17 +++++++++++++++++
> gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c | 17 +++++++++++++++++
> gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c | 17 +++++++++++++++++
> gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c | 17 +++++++++++++++++
> 8 files changed, 116 insertions(+), 3 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-2.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-3.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 93eaab1..60dc160 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -13286,8 +13286,15 @@ ix86_finalize_stack_realign_flags (void)
> /* If the only reason for frame_pointer_needed is that we conservatively
> assumed stack realignment might be needed, but in the end nothing that
> needed the stack alignment had been spilled, clear frame_pointer_needed
> - and say we don't need stack realignment. */
> - if (stack_realign
> + and say we don't need stack realignment.
> +
> + When vector register is used for piecewise move and store, we don't
> + increase stack_alignment_needed as there is no register spill for
> + piecewise move and store. Since stack_realign_needed is set to true
> + by checking stack_alignment_estimated which is updated by pseudo
> + vector register usage, we also need to check stack_realign_needed to
> + eliminate frame pointer. */
> + if ((stack_realign || crtl->stack_realign_needed)
> && frame_pointer_needed
> && crtl->is_leaf
> && flag_omit_frame_pointer
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 9b66264..24db855 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -1951,7 +1951,11 @@ typedef struct ix86_args {
> /* MOVE_MAX_PIECES is the number of bytes at a time which we can
> move efficiently, as opposed to MOVE_MAX which is the maximum
> number of bytes we can move with a single instruction. */
> -#define MOVE_MAX_PIECES UNITS_PER_WORD
> +#define MOVE_MAX_PIECES \
> + ((TARGET_64BIT \
> + && TARGET_SSE2 \
> + && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
> + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) ? 16 : UNITS_PER_WORD)
>
> /* If a memory-to-memory move would take MOVE_RATIO or more simple
> move-instruction pairs, we will do a movmem or libcall instead.
> diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-1.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-1.c
> new file mode 100644
> index 0000000..adc0aa8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-1.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
> +
> +extern char *dst, *src;
> +
> +void
> +foo (void)
> +{
> + __builtin_memcpy (dst, src, 64);
> +}
> +
> +/* { dg-final { scan-assembler-times "movdqu\[ \\t\]+\[^\n\]*%xmm" 4 } } */
> +/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 4 } } */
> +/* No need to dynamically realign the stack here. */
> +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
> +/* Nor use a frame pointer. */
> +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-2.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-2.c
> new file mode 100644
> index 0000000..c52c1d9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-2.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
> +
> +extern char *dst, *src;
> +
> +void
> +foo (void)
> +{
> + __builtin_memcpy (dst, src, 33);
> +}
> +
> +/* { dg-final { scan-assembler-times "movdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */
> +/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 2 } } */
> +/* No need to dynamically realign the stack here. */
> +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
> +/* Nor use a frame pointer. */
> +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-3.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-3.c
> new file mode 100644
> index 0000000..c532bbd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-3.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
> +
> +extern char *dst, *src;
> +
> +void
> +foo (void)
> +{
> + __builtin_memcpy (dst, src, 17);
> +}
> +
> +/* { dg-final { scan-assembler-times "movdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */
> +/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 1 } } */
> +/* No need to dynamically realign the stack here. */
> +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
> +/* Nor use a frame pointer. */
> +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c
> new file mode 100644
> index 0000000..4ef763d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */
> +
> +extern char *dst, *src;
> +
> +void
> +foo (void)
> +{
> + __builtin_memcpy (dst, src, 18);
> +}
> +
> +/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovups\[ \\t\]+\[^\n\]*%xmm" 1 } } */
> +/* No need to dynamically realign the stack here. */
> +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
> +/* Nor use a frame pointer. */
> +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c
> new file mode 100644
> index 0000000..2687560
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mavx512f -mtune=generic" } */
> +
> +extern char *dst, *src;
> +
> +void
> +foo (void)
> +{
> + __builtin_memcpy (dst, src, 19);
> +}
> +
> +/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovups\[ \\t\]+\[^\n\]*%xmm" 1 } } */
> +/* No need to dynamically realign the stack here. */
> +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
> +/* Nor use a frame pointer. */
> +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c
> new file mode 100644
> index 0000000..a205f83
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */
> +
> +extern char *dst, *src;
> +
> +void
> +foo (void)
> +{
> + __builtin_memcpy (dst, src, 33);
> +}
> +
> +/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */
> +/* { dg-final { scan-assembler-times "vmovups\[ \\t\]+\[^\n\]*%xmm" 2 } } */
> +/* No need to dynamically realign the stack here. */
> +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
> +/* Nor use a frame pointer. */
> +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
> --
> 2.7.4
>