[PATCH] i386: Fix up _mm256_vzeroupper() handling [PR99563]
Uros Bizjak
ubizjak@gmail.com
Tue Mar 16 10:04:51 GMT 2021
On Tue, Mar 16, 2021 at 10:51 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> My r10-6451-gb7b3378f91c0641f2ef4d88db22af62a571c9359 fix for
> vzeroupper vs. ms ABI apparently broke the explicit vzeroupper handling
> when the implicit vzeroupper handling is disabled.
> The epilogue_completed splitter for vzeroupper now adds clobbers for all
> registers which don't have explicit sets in the pattern and the sets are
> added during vzeroupper pass. Before my changes, for explicit user
> vzeroupper, we just weren't modelling its effects at all, it was just
> unspec that didn't indicate that it clobbers the upper parts of all XMM
> registers below %xmm16. But now the splitter will add clobbers even for those, and as
> it has no sets, it will add clobbers for all registers, which means
> we optimize away anything that lived across that vzeroupper.
>
> The vzeroupper pass has two parts, one is the mode switching that computes
> where to put the implicit vzeroupper calls and puts them there, and then
> another that uses df to figure out what sets to add to all the vzeroupper instructions.
> The former part should be done only under the conditions we have in the
> gate, but the latter as this PR shows needs to happen either if we perform
> the implicit vzeroupper additions, or if there are (or could be) any
> explicit vzeroupper instructions. As that function does df_analyze and
> walks the whole IL, I think it would be too expensive to run it always
> whenever TARGET_AVX, so this patch remembers if we've expanded at least
> one __builtin_ia32_vzeroupper in the function and runs that part of the
> vzeroupper pass both when the old condition is true or when this new
> flag is set.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2021-03-16 Jakub Jelinek <jakub@redhat.com>
>
> PR target/99563
> * config/i386/i386.h (struct machine_function): Add
> has_explicit_vzeroupper bitfield.
> * config/i386/i386-expand.c (ix86_expand_builtin): Set
> cfun->machine->has_explicit_vzeroupper when expanding
> IX86_BUILTIN_VZEROUPPER.
> * config/i386/i386-features.c (rest_of_handle_insert_vzeroupper):
> Do the mode switching only when TARGET_VZEROUPPER, expensive
> optimizations turned on and not optimizing for size.
> (pass_insert_vzeroupper::gate): Enable even when
> cfun->machine->has_explicit_vzeroupper is set.
>
> * gcc.target/i386/avx-pr99563.c: New test.
OK.
Thanks,
Uros.
>
> --- gcc/config/i386/i386.h.jj 2021-02-22 17:54:05.617799002 +0100
> +++ gcc/config/i386/i386.h 2021-03-15 12:30:00.814841624 +0100
> @@ -2941,6 +2941,10 @@ struct GTY(()) machine_function {
> /* True if the function needs a stack frame. */
> BOOL_BITFIELD stack_frame_required : 1;
>
> + /* True if __builtin_ia32_vzeroupper () has been expanded in current
> + function. */
> + BOOL_BITFIELD has_explicit_vzeroupper : 1;
> +
> /* The largest alignment, in bytes, of stack slot actually used. */
> unsigned int max_used_stack_alignment;
>
> --- gcc/config/i386/i386-expand.c.jj 2021-02-09 12:28:14.069323264 +0100
> +++ gcc/config/i386/i386-expand.c 2021-03-15 12:34:26.549901726 +0100
> @@ -13210,6 +13210,10 @@ rdseed_step:
>
> return 0;
>
> + case IX86_BUILTIN_VZEROUPPER:
> + cfun->machine->has_explicit_vzeroupper = true;
> + break;
> +
> default:
> break;
> }
> --- gcc/config/i386/i386-features.c.jj 2021-02-01 09:55:45.953519272 +0100
> +++ gcc/config/i386/i386-features.c 2021-03-15 12:37:07.886116827 +0100
> @@ -1837,19 +1837,22 @@ ix86_add_reg_usage_to_vzerouppers (void)
> static unsigned int
> rest_of_handle_insert_vzeroupper (void)
> {
> - int i;
> -
> - /* vzeroupper instructions are inserted immediately after reload to
> - account for possible spills from 256bit or 512bit registers. The pass
> - reuses mode switching infrastructure by re-running mode insertion
> - pass, so disable entities that have already been processed. */
> - for (i = 0; i < MAX_386_ENTITIES; i++)
> - ix86_optimize_mode_switching[i] = 0;
> + if (TARGET_VZEROUPPER
> + && flag_expensive_optimizations
> + && !optimize_size)
> + {
> + /* vzeroupper instructions are inserted immediately after reload to
> + account for possible spills from 256bit or 512bit registers. The pass
> + reuses mode switching infrastructure by re-running mode insertion
> + pass, so disable entities that have already been processed. */
> + for (int i = 0; i < MAX_386_ENTITIES; i++)
> + ix86_optimize_mode_switching[i] = 0;
>
> - ix86_optimize_mode_switching[AVX_U128] = 1;
> + ix86_optimize_mode_switching[AVX_U128] = 1;
>
> - /* Call optimize_mode_switching. */
> - g->get_passes ()->execute_pass_mode_switching ();
> + /* Call optimize_mode_switching. */
> + g->get_passes ()->execute_pass_mode_switching ();
> + }
> ix86_add_reg_usage_to_vzerouppers ();
> return 0;
> }
> @@ -1880,8 +1883,10 @@ public:
> virtual bool gate (function *)
> {
> return TARGET_AVX
> - && TARGET_VZEROUPPER && flag_expensive_optimizations
> - && !optimize_size;
> + && ((TARGET_VZEROUPPER
> + && flag_expensive_optimizations
> + && !optimize_size)
> + || cfun->machine->has_explicit_vzeroupper);
> }
>
> virtual unsigned int execute (function *)
> --- gcc/testsuite/gcc.target/i386/avx-pr99563.c.jj 2021-03-15 13:18:08.896950279 +0100
> +++ gcc/testsuite/gcc.target/i386/avx-pr99563.c 2021-03-15 13:17:28.881392012 +0100
> @@ -0,0 +1,38 @@
> +/* PR target/99563 */
> +/* { dg-do run { target avx } } */
> +/* { dg-options "-O2 -mavx -mno-vzeroupper" } */
> +
> +#include "avx-check.h"
> +#include <immintrin.h>
> +
> +
> +__attribute__((noipa)) float
> +compute_generic (void)
> +{
> + return 0.0f;
> +}
> +
> +static inline __attribute__((always_inline))
> +float compute_avx (unsigned long block_count)
> +{
> + __m128d mm_res = _mm_set1_pd (256.0);
> + float res = (float) (_mm_cvtsd_f64 (mm_res) / (double) block_count);
> + _mm256_zeroupper ();
> + return res;
> +}
> +
> +__attribute__((noipa)) float
> +compute (unsigned long block_count)
> +{
> + if (block_count >= 64)
> + return compute_avx (block_count);
> + else
> + return compute_generic ();
> +}
> +
> +static void
> +avx_test (void)
> +{
> + if (compute (128) != 2.0f || compute (32) != 0.0f)
> + abort ();
> +}
>
> Jakub
>
More information about the Gcc-patches
mailing list