[PATCH] i386: Fix up _mm256_vzeroupper() handling [PR99563]
Uros Bizjak
ubizjak@gmail.com
Tue Mar 16 10:04:51 GMT 2021
On Tue, Mar 16, 2021 at 10:51 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> My r10-6451-gb7b3378f91c0641f2ef4d88db22af62a571c9359 fix for
> vzeroupper vs. ms ABI apparently broke the explicit vzeroupper handling
> when the implicit vzeroupper handling is disabled.
> The epilogue_completed splitter for vzeroupper now adds clobbers for all
> registers which don't have explicit sets in the pattern and the sets are
> added during vzeroupper pass. Before my changes, for explicit user
> vzeroupper, we just weren't modelling its effects at all, it was just
> unspec that didn't indicate that it clobbers the upper parts of all XMM
> registers below %xmm16. But now the splitter will add clobbers even for those, and as
> it has no sets, it will add clobbers for all registers, which means
> we optimize away anything that lived across that vzeroupper.
>
> The vzeroupper pass has two parts, one is the mode switching that computes
> where to put the implicit vzeroupper calls and puts them there, and then
> another that uses df to figure out what sets to add to all the vzeroupper instructions.
> The former part should be done only under the conditions we have in the
> gate, but the latter as this PR shows needs to happen either if we perform
> the implicit vzeroupper additions, or if there are (or could be) any
> explicit vzeroupper instructions. As that function does df_analyze and
> walks the whole IL, I think it would be too expensive to run it always
> whenever TARGET_AVX, so this patch remembers if we've expanded at least
> one __builtin_ia32_vzeroupper in the function and runs that part of the
> vzeroupper pass both when the old condition is true or when this new
> flag is set.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2021-03-16 Jakub Jelinek <jakub@redhat.com>
>
> PR target/99563
> * config/i386/i386.h (struct machine_function): Add
> has_explicit_vzeroupper bitfield.
> * config/i386/i386-expand.c (ix86_expand_builtin): Set
> cfun->machine->has_explicit_vzeroupper when expanding
> IX86_BUILTIN_VZEROUPPER.
> * config/i386/i386-features.c (rest_of_handle_insert_vzeroupper):
> Do the mode switching only when TARGET_VZEROUPPER, expensive
> optimizations turned on and not optimizing for size.
> (pass_insert_vzeroupper::gate): Enable even when
> cfun->machine->has_explicit_vzeroupper is set.
>
> * gcc.target/i386/avx-pr99563.c: New test.
OK.
Thanks,
Uros.
>
> --- gcc/config/i386/i386.h.jj 2021-02-22 17:54:05.617799002 +0100
> +++ gcc/config/i386/i386.h 2021-03-15 12:30:00.814841624 +0100
> @@ -2941,6 +2941,10 @@ struct GTY(()) machine_function {
> /* True if the function needs a stack frame. */
> BOOL_BITFIELD stack_frame_required : 1;
>
> + /* True if __builtin_ia32_vzeroupper () has been expanded in current
> + function. */
> + BOOL_BITFIELD has_explicit_vzeroupper : 1;
> +
> /* The largest alignment, in bytes, of stack slot actually used. */
> unsigned int max_used_stack_alignment;
>
> --- gcc/config/i386/i386-expand.c.jj 2021-02-09 12:28:14.069323264 +0100
> +++ gcc/config/i386/i386-expand.c 2021-03-15 12:34:26.549901726 +0100
> @@ -13210,6 +13210,10 @@ rdseed_step:
>
> return 0;
>
> + case IX86_BUILTIN_VZEROUPPER:
> + cfun->machine->has_explicit_vzeroupper = true;
> + break;
> +
> default:
> break;
> }
> --- gcc/config/i386/i386-features.c.jj 2021-02-01 09:55:45.953519272 +0100
> +++ gcc/config/i386/i386-features.c 2021-03-15 12:37:07.886116827 +0100
> @@ -1837,19 +1837,22 @@ ix86_add_reg_usage_to_vzerouppers (void)
> static unsigned int
> rest_of_handle_insert_vzeroupper (void)
> {
> - int i;
> -
> - /* vzeroupper instructions are inserted immediately after reload to
> - account for possible spills from 256bit or 512bit registers. The pass
> - reuses mode switching infrastructure by re-running mode insertion
> - pass, so disable entities that have already been processed. */
> - for (i = 0; i < MAX_386_ENTITIES; i++)
> - ix86_optimize_mode_switching[i] = 0;
> + if (TARGET_VZEROUPPER
> + && flag_expensive_optimizations
> + && !optimize_size)
> + {
> + /* vzeroupper instructions are inserted immediately after reload to
> + account for possible spills from 256bit or 512bit registers. The pass
> + reuses mode switching infrastructure by re-running mode insertion
> + pass, so disable entities that have already been processed. */
> + for (int i = 0; i < MAX_386_ENTITIES; i++)
> + ix86_optimize_mode_switching[i] = 0;
>
> - ix86_optimize_mode_switching[AVX_U128] = 1;
> + ix86_optimize_mode_switching[AVX_U128] = 1;
>
> - /* Call optimize_mode_switching. */
> - g->get_passes ()->execute_pass_mode_switching ();
> + /* Call optimize_mode_switching. */
> + g->get_passes ()->execute_pass_mode_switching ();
> + }
> ix86_add_reg_usage_to_vzerouppers ();
> return 0;
> }
> @@ -1880,8 +1883,10 @@ public:
> virtual bool gate (function *)
> {
> return TARGET_AVX
> - && TARGET_VZEROUPPER && flag_expensive_optimizations
> - && !optimize_size;
> + && ((TARGET_VZEROUPPER
> + && flag_expensive_optimizations
> + && !optimize_size)
> + || cfun->machine->has_explicit_vzeroupper);
> }
>
> virtual unsigned int execute (function *)
> --- gcc/testsuite/gcc.target/i386/avx-pr99563.c.jj 2021-03-15 13:18:08.896950279 +0100
> +++ gcc/testsuite/gcc.target/i386/avx-pr99563.c 2021-03-15 13:17:28.881392012 +0100
> @@ -0,0 +1,38 @@
> +/* PR target/99563 */
> +/* { dg-do run { target avx } } */
> +/* { dg-options "-O2 -mavx -mno-vzeroupper" } */
> +
> +#include "avx-check.h"
> +#include <immintrin.h>
> +
> +
> +__attribute__((noipa)) float
> +compute_generic (void)
> +{
> + return 0.0f;
> +}
> +
> +static inline __attribute__((always_inline))
> +float compute_avx (unsigned long block_count)
> +{
> + __m128d mm_res = _mm_set1_pd (256.0);
> + float res = (float) (_mm_cvtsd_f64 (mm_res) / (double) block_count);
> + _mm256_zeroupper ();
> + return res;
> +}
> +
> +__attribute__((noipa)) float
> +compute (unsigned long block_count)
> +{
> + if (block_count >= 64)
> + return compute_avx (block_count);
> + else
> + return compute_generic ();
> +}
> +
> +static void
> +avx_test (void)
> +{
> + if (compute (128) != 2.0f || compute (32) != 0.0f)
> + abort ();
> +}
>
> Jakub
>
More information about the Gcc-patches
mailing list