[PATCH v6 03/10] x86: Update piecewise move and store

Uros Bizjak ubizjak@gmail.com
Mon Aug 2 11:20:20 GMT 2021


On Fri, Jul 30, 2021 at 11:32 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> We can use TImode/OImode/XImode integers for piecewise move and store.
>
> 1. Define MAX_MOVE_MAX to 64, which is the constant maximum number of
> bytes that a single instruction can move quickly between memory and
> registers or between two memory locations.
> 2. Define MOVE_MAX to MOVE_MAX_PIECES, which is the maximum number of
> bytes we can move from memory to memory in one reasonably fast instruction.
> The difference between MAX_MOVE_MAX and MOVE_MAX is that MAX_MOVE_MAX
> must be a constant, independent of compiler options, since it is used in
> reload.h to define struct target_reload and MOVE_MAX can vary, depending
> on compiler options.
> 3. When a vector register is used for piecewise move and store, we don't
> increase stack_alignment_needed since a vector register spill isn't
> required for piecewise move and store.  Since stack_realign_needed is
> set to true by checking stack_alignment_estimated, which is set by pseudo
> vector register usage, we also need to check stack_realign_needed to
> eliminate the frame pointer.
>
> gcc/
>
>         * config/i386/i386.c (ix86_finalize_stack_frame_flags): Also
>         check stack_realign_needed for stack realignment.
>         (ix86_legitimate_constant_p): Always allow CONST_WIDE_INT smaller
>         than the largest integer supported by vector register.
>         * config/i386/i386.h (MAX_MOVE_MAX): New.  Set to 64.
>         (MOVE_MAX_PIECES): Set to bytes of the largest integer supported
>         by vector register.
>         (MOVE_MAX): Defined to MOVE_MAX_PIECES.
>         (STORE_MAX_PIECES): New.
>
> gcc/testsuite/
>
>         * gcc.target/i386/pr90773-1.c: Adjust to expect movq for 32-bit.
>         * gcc.target/i386/pr90773-4.c: Also run for 32-bit.
>         * gcc.target/i386/pr90773-15.c: Likewise.
>         * gcc.target/i386/pr90773-16.c: Likewise.
>         * gcc.target/i386/pr90773-17.c: Likewise.
>         * gcc.target/i386/pr90773-24.c: Likewise.
>         * gcc.target/i386/pr90773-25.c: Likewise.
>         * gcc.target/i386/pr100865-1.c: Likewise.
>         * gcc.target/i386/pr100865-2.c: Likewise.
>         * gcc.target/i386/pr100865-3.c: Likewise.
>         * gcc.target/i386/pr90773-14.c: Also run for 32-bit and expect
>         XMM movd to store 4 bytes.
>         * gcc.target/i386/pr100865-4a.c: Also run for 32-bit and expect
>         YMM registers.
>         * gcc.target/i386/pr100865-4b.c: Likewise.
>         * gcc.target/i386/pr100865-10a.c: Expect YMM registers.
>         * gcc.target/i386/pr100865-10b.c: Likewise.
> ---
>  gcc/config/i386/i386.c                       | 21 ++++++++--
>  gcc/config/i386/i386.h                       | 40 ++++++++++++++++----
>  gcc/testsuite/gcc.target/i386/pr100865-1.c   |  2 +-
>  gcc/testsuite/gcc.target/i386/pr100865-10a.c |  4 +-
>  gcc/testsuite/gcc.target/i386/pr100865-10b.c |  4 +-
>  gcc/testsuite/gcc.target/i386/pr100865-2.c   |  2 +-
>  gcc/testsuite/gcc.target/i386/pr100865-3.c   |  2 +-
>  gcc/testsuite/gcc.target/i386/pr100865-4a.c  |  6 +--
>  gcc/testsuite/gcc.target/i386/pr100865-4b.c  |  8 ++--
>  gcc/testsuite/gcc.target/i386/pr90773-1.c    | 10 ++---
>  gcc/testsuite/gcc.target/i386/pr90773-14.c   |  2 +-
>  gcc/testsuite/gcc.target/i386/pr90773-15.c   |  6 +--
>  gcc/testsuite/gcc.target/i386/pr90773-16.c   |  2 +-
>  gcc/testsuite/gcc.target/i386/pr90773-17.c   |  2 +-
>  gcc/testsuite/gcc.target/i386/pr90773-24.c   |  2 +-
>  gcc/testsuite/gcc.target/i386/pr90773-25.c   |  2 +-
>  gcc/testsuite/gcc.target/i386/pr90773-4.c    |  2 +-
>  17 files changed, 76 insertions(+), 41 deletions(-)
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 5d20ca2067f..842eb0e6786 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -7953,8 +7953,17 @@ ix86_finalize_stack_frame_flags (void)
>       assumed stack realignment might be needed or -fno-omit-frame-pointer
>       is used, but in the end nothing that needed the stack alignment had
>       been spilled nor stack access, clear frame_pointer_needed and say we
> -     don't need stack realignment.  */
> -  if ((stack_realign || (!flag_omit_frame_pointer && optimize))
> +     don't need stack realignment.
> +
> +     When vector register is used for piecewise move and store, we don't
> +     increase stack_alignment_needed as there is no register spill for
> +     piecewise move and store.  Since stack_realign_needed is set to true
> +     by checking stack_alignment_estimated which is updated by pseudo
> +     vector register usage, we also need to check stack_realign_needed to
> +     eliminate frame pointer.  */
> +  if ((stack_realign
> +       || (!flag_omit_frame_pointer && optimize)
> +       || crtl->stack_realign_needed)
>        && frame_pointer_needed
>        && crtl->is_leaf
>        && crtl->sp_is_unchanging
> @@ -10418,7 +10427,13 @@ ix86_legitimate_constant_p (machine_mode mode, rtx x)
>           /* FALLTHRU */
>         case E_OImode:
>         case E_XImode:
> -         if (!standard_sse_constant_p (x, mode))
> +         if (!standard_sse_constant_p (x, mode)
> +             && GET_MODE_SIZE (TARGET_AVX512F
> +                               ? XImode
> +                               : (TARGET_AVX
> +                                  ? OImode
> +                                  : (TARGET_SSE2
> +                                     ? TImode : DImode))) < GET_MODE_SIZE (mode))
>             return false;
>         default:
>           break;
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index d1e1c225990..50418a0cc9b 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -1757,9 +1757,10 @@ typedef struct ix86_args {
>  /* Define this as 1 if `char' should by default be signed; else as 0.  */
>  #define DEFAULT_SIGNED_CHAR 1
>
> -/* Max number of bytes we can move from memory to memory
> -   in one reasonably fast instruction.  */
> -#define MOVE_MAX 16
> +/* The constant maximum number of bytes that a single instruction can
> +   move quickly between memory and registers or between two memory
> +   locations.  */
> +#define MAX_MOVE_MAX 64
>
>  /* MOVE_MAX_PIECES is the number of bytes at a time which we can
>     move efficiently, as opposed to  MOVE_MAX which is the maximum

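The comment here is now totally wrong: it still contrasts MOVE_MAX_PIECES
with MOVE_MAX and says we can only use TImode in 64-bit mode, but with this
patch MOVE_MAX is defined to MOVE_MAX_PIECES, and the new MOVE_MAX_PIECES
no longer checks TARGET_64BIT and can be 32 or 64 bytes.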

> @@ -1770,11 +1771,34 @@ typedef struct ix86_args {
>     widest mode with MAX_FIXED_MODE_SIZE, we can only use TImode in
>     64-bit mode.  */
>  #define MOVE_MAX_PIECES \
> -  ((TARGET_64BIT \
> -    && TARGET_SSE2 \
> -    && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
> -    && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> -   ? GET_MODE_SIZE (TImode) : UNITS_PER_WORD)
> +  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> +   ? 64 \
> +   : ((TARGET_AVX \
> +       && !TARGET_PREFER_AVX128 \
> +       && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD \
> +       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> +      ? 32 \
> +      : ((TARGET_SSE2 \
> +         && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
> +         && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> +        ? 16 : UNITS_PER_WORD)))
> +
> +/* Max number of bytes we can move from memory to memory in one
> +   reasonably fast instruction.  */
> +#define MOVE_MAX MOVE_MAX_PIECES

Isn't this a bit backward now? Rather than defining MOVE_MAX in terms of
MOVE_MAX_PIECES, we should put the definition in MOVE_MAX and leave
MOVE_MAX_PIECES undefined, since defaults.h already has:

defaults.h:#ifndef MOVE_MAX_PIECES
defaults.h:#define MOVE_MAX_PIECES   MOVE_MAX

Uros.

> +
> +/* STORE_MAX_PIECES is the number of bytes at a time that we can
> +   store efficiently.  */
> +#define STORE_MAX_PIECES \
> +  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> +   ? 64 \
> +   : ((TARGET_AVX \
> +       && !TARGET_PREFER_AVX128 \
> +       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> +      ? 32 \
> +      : ((TARGET_SSE2 \
> +         && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> +        ? 16 : UNITS_PER_WORD)))
>
>  /* If a memory-to-memory move would take MOVE_RATIO or more simple
>     move-instruction pairs, we will do a cpymem or libcall instead.
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-1.c b/gcc/testsuite/gcc.target/i386/pr100865-1.c
> index 6c3097fb2a6..949dd5c337a 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-1.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-1.c
> @@ -1,4 +1,4 @@
> -/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-do compile } */
>  /* { dg-options "-O2 -march=x86-64" } */
>
>  extern char *dst;
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10a.c b/gcc/testsuite/gcc.target/i386/pr100865-10a.c
> index 7ffc19e56a8..98b6dfb16f3 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-10a.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-10a.c
> @@ -29,5 +29,5 @@ foo (void)
>      array[i] = MK_CONST128_BROADCAST (0x1f);
>  }
>
> -/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
> -/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
> +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10b.c b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> index edf52765c60..e5616d8d258 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> @@ -3,5 +3,5 @@
>
>  #include "pr100865-10a.c"
>
> -/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
> -/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
> +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 8 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-2.c b/gcc/testsuite/gcc.target/i386/pr100865-2.c
> index 17efe2d72a3..f3ea7753abe 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-2.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-2.c
> @@ -1,4 +1,4 @@
> -/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-do compile } */
>  /* { dg-options "-O2 -march=skylake" } */
>
>  extern char *dst;
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-3.c b/gcc/testsuite/gcc.target/i386/pr100865-3.c
> index 007e79f91b0..714c43e12c9 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-3.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-3.c
> @@ -1,4 +1,4 @@
> -/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-do compile } */
>  /* { dg-options "-O2 -march=skylake-avx512" } */
>
>  extern char *dst;
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4a.c b/gcc/testsuite/gcc.target/i386/pr100865-4a.c
> index f55883598f9..365487337ae 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-4a.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-4a.c
> @@ -1,4 +1,4 @@
> -/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-do compile } */
>  /* { dg-options "-O2 -march=skylake" } */
>
>  extern char array[64];
> @@ -11,6 +11,6 @@ foo (void)
>      array[i] = -45;
>  }
>
> -/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" 1 } } */
> -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */
> +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 2 } } */
>  /* { dg-final { scan-assembler-not "vmovdqa" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> index 1e50dc842bc..8e8a7eaaaff 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> @@ -1,9 +1,9 @@
> -/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-do compile } */
>  /* { dg-options "-O2 -march=skylake-avx512" } */
>
>  #include "pr100865-4a.c"
>
> -/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
> -/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, " 4 } } */
> -/* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */
> +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 2 } } */
> +/* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */
>  /* { dg-final { scan-assembler-not "vmovdqa" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr90773-1.c b/gcc/testsuite/gcc.target/i386/pr90773-1.c
> index 1d9f282dc0d..4fd5a40d99d 100644
> --- a/gcc/testsuite/gcc.target/i386/pr90773-1.c
> +++ b/gcc/testsuite/gcc.target/i386/pr90773-1.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -mtune=generic" } */
> +/* { dg-options "-O2 -msse2 -mtune=generic" } */
>
>  extern char *dst, *src;
>
> @@ -9,9 +9,5 @@ foo (void)
>    __builtin_memcpy (dst, src, 15);
>  }
>
> -/* { dg-final { scan-assembler-times "movq\[\\t \]+\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
> -/* { dg-final { scan-assembler-times "movq\[\\t \]+7\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
> -/* { dg-final { scan-assembler-times "movl\[\\t \]+\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
> -/* { dg-final { scan-assembler-times "movl\[\\t \]+4\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
> -/* { dg-final { scan-assembler-times "movl\[\\t \]+8\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
> -/* { dg-final { scan-assembler-times "movl\[\\t \]+11\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
> +/* { dg-final { scan-assembler-times "movq\[\\t \]+\\(%\[\^,\]+\\)," 1 } } */
> +/* { dg-final { scan-assembler-times "movq\[\\t \]+7\\(%\[\^,\]+\\)," 1 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr90773-14.c b/gcc/testsuite/gcc.target/i386/pr90773-14.c
> index e5c19f49cf5..96ee5cb08c1 100644
> --- a/gcc/testsuite/gcc.target/i386/pr90773-14.c
> +++ b/gcc/testsuite/gcc.target/i386/pr90773-14.c
> @@ -1,4 +1,4 @@
> -/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-do compile } */
>  /* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
>
>  extern char *dst;
> diff --git a/gcc/testsuite/gcc.target/i386/pr90773-15.c b/gcc/testsuite/gcc.target/i386/pr90773-15.c
> index 185ea60e1d2..403cdb248a2 100644
> --- a/gcc/testsuite/gcc.target/i386/pr90773-15.c
> +++ b/gcc/testsuite/gcc.target/i386/pr90773-15.c
> @@ -1,4 +1,4 @@
> -/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-do compile } */
>  /* { dg-options "-O2 -march=skylake-avx512" } */
>
>  extern char *dst;
> @@ -9,6 +9,6 @@ foo (int c)
>    __builtin_memset (dst, c, 17);
>  }
>
> -/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%edi, %xmm\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%.*, %xmm\[0-9\]+" 1 } } */
>  /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
> -/* { dg-final { scan-assembler-times "movb\[\\t \]+%dil, 16\\(%\[\^,\]+\\)" 1 } } */
> +/* { dg-final { scan-assembler-times "movb\[\\t \]+%.*, 16\\(%\[\^,\]+\\)" 1 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr90773-16.c b/gcc/testsuite/gcc.target/i386/pr90773-16.c
> index d820cc318c3..bb0aadbc77e 100644
> --- a/gcc/testsuite/gcc.target/i386/pr90773-16.c
> +++ b/gcc/testsuite/gcc.target/i386/pr90773-16.c
> @@ -1,4 +1,4 @@
> -/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-do compile } */
>  /* { dg-options "-O2 -march=skylake-avx512" } */
>
>  extern char *dst;
> diff --git a/gcc/testsuite/gcc.target/i386/pr90773-17.c b/gcc/testsuite/gcc.target/i386/pr90773-17.c
> index f6f179e9b5b..73d5d5abaee 100644
> --- a/gcc/testsuite/gcc.target/i386/pr90773-17.c
> +++ b/gcc/testsuite/gcc.target/i386/pr90773-17.c
> @@ -1,4 +1,4 @@
> -/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-do compile } */
>  /* { dg-options "-O2 -march=skylake-avx512" } */
>
>  extern char *dst;
> diff --git a/gcc/testsuite/gcc.target/i386/pr90773-24.c b/gcc/testsuite/gcc.target/i386/pr90773-24.c
> index 7b2ea66dcfc..71f1fd8c4df 100644
> --- a/gcc/testsuite/gcc.target/i386/pr90773-24.c
> +++ b/gcc/testsuite/gcc.target/i386/pr90773-24.c
> @@ -1,4 +1,4 @@
> -/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-do compile } */
>  /* { dg-options "-O2 -march=x86-64" } */
>
>  struct S
> diff --git a/gcc/testsuite/gcc.target/i386/pr90773-25.c b/gcc/testsuite/gcc.target/i386/pr90773-25.c
> index 57642ea8d2d..ad19a88c883 100644
> --- a/gcc/testsuite/gcc.target/i386/pr90773-25.c
> +++ b/gcc/testsuite/gcc.target/i386/pr90773-25.c
> @@ -1,4 +1,4 @@
> -/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-do compile } */
>  /* { dg-options "-O2 -march=x86-64" } */
>
>  struct S
> diff --git a/gcc/testsuite/gcc.target/i386/pr90773-4.c b/gcc/testsuite/gcc.target/i386/pr90773-4.c
> index ec0bc0100ae..ee4c04678d1 100644
> --- a/gcc/testsuite/gcc.target/i386/pr90773-4.c
> +++ b/gcc/testsuite/gcc.target/i386/pr90773-4.c
> @@ -1,4 +1,4 @@
> -/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-do compile } */
>  /* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
>
>  extern char *dst;
> --
> 2.31.1
>


More information about the Gcc-patches mailing list