[PATCH] i386: Fix up vec_extract_lo* patterns [PR93670]

Uros Bizjak ubizjak@gmail.com
Wed Feb 12 10:02:00 GMT 2020


On Wed, Feb 12, 2020 at 10:27 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> The VEXTRACT* insns have way too many different CPUID feature flags (ATT
> syntax)
> vextractf128 $imm, %ymm, %xmm/mem               AVX
> vextracti128 $imm, %ymm, %xmm/mem               AVX2
> vextract{f,i}32x4 $imm, %ymm, %xmm/mem {k}{z}   AVX512VL+AVX512F
> vextract{f,i}32x4 $imm, %zmm, %xmm/mem {k}{z}   AVX512F
> vextract{f,i}64x2 $imm, %ymm, %xmm/mem {k}{z}   AVX512VL+AVX512DQ
> vextract{f,i}64x2 $imm, %zmm, %xmm/mem {k}{z}   AVX512DQ
> vextract{f,i}32x8 $imm, %zmm, %ymm/mem {k}{z}   AVX512DQ
> vextract{f,i}64x4 $imm, %zmm, %ymm/mem {k}{z}   AVX512F
>
> As the testcase shows and the patch too, we didn't get it right in all
> cases.
>
> The first hunk is about avx512vl_vextractf128v8s[if] incorrectly
> requiring TARGET_AVX512DQ.  The corresponding insn is the first
> vextract{f,i}32x4 above, so it requires VL+F, and the builtins have it
> correct (TARGET_AVX512VL implies TARGET_AVX512F):
> BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_vextractf128v8sf, "__builtin_ia32_extractf32x4_256_mask", IX86_BUILTIN_EXTRACTF32X4_256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT_V4SF_UQI)
> BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_vextractf128v8si, "__builtin_ia32_extracti32x4_256_mask", IX86_BUILTIN_EXTRACTI32X4_256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT_V4SI_UQI)
> We only need TARGET_AVX512DQ for avx512vl_vextractf128v4d[if].
>
> The second hunk is about vec_extract_lo_v16s[if]{,_mask}.  These are using
> the vextract{f,i}32x8 insns (AVX512DQ above), but we weren't requiring that,
> but instead incorrectly used && 1 for non-masked and && (64 == 64 && TARGET_AVX512VL)
> for masked insns.  This is extraction from ZMM, so it doesn't need VL for
> anything.  The hunk actually only requires TARGET_AVX512DQ when the insn
> is masked, if it is not masked, when TARGET_AVX512DQ isn't available we can
> use vextract{f,i}64x4 instead which is available already in TARGET_AVX512F
> and does the same thing, extracts the low 256 bits from 512 bits vector
> (often we split it into just nothing, but there are some special cases, such
> as when using xmm16+, where we can't do that without AVX512VL).
>
> The last hunk is about vec_extract_lo_v8s[if]{,_mask}.  The non-_mask
> suffixed ones are ok already and just split into nothing (lowpart subreg).
> The masked ones were incorrectly requiring TARGET_AVX512VL and
> TARGET_AVX512DQ, when we only need TARGET_AVX512VL.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2020-02-12  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/93670
>         * config/i386/sse.md (VI48F_256_DQ): New mode iterator.
>         (avx512vl_vextractf128<mode>): Use it instead of VI48F_256.  Remove
>         TARGET_AVX512DQ from condition.
>         (vec_extract_lo_<mode><mask_name>): Use <mask_avx512dq_condition>
>         instead of <mask_mode512bit_condition> in condition.  If
>         TARGET_AVX512DQ is false, emit vextract*64x4 instead of
>         vextract*32x8.
>         (vec_extract_lo_<mode><mask_name>): Drop <mask_avx512dq_condition>
>         from condition.
>
>         * gcc.target/i386/avx512vl-pr93670.c: New test.

OK.

Thanks,
Uros.

> --- gcc/config/i386/sse.md.jj   2020-02-11 14:54:38.017593464 +0100
> +++ gcc/config/i386/sse.md      2020-02-11 15:50:59.629130828 +0100
> @@ -8719,13 +8719,16 @@ (define_insn "vec_extract_hi_<mode><mask
>     (set_attr "prefix" "evex")
>     (set_attr "mode" "<sseinsnmode>")])
>
> +(define_mode_iterator VI48F_256_DQ
> +  [V8SI V8SF (V4DI "TARGET_AVX512DQ") (V4DF "TARGET_AVX512DQ")])
> +
>  (define_expand "avx512vl_vextractf128<mode>"
>    [(match_operand:<ssehalfvecmode> 0 "nonimmediate_operand")
> -   (match_operand:VI48F_256 1 "register_operand")
> +   (match_operand:VI48F_256_DQ 1 "register_operand")
>     (match_operand:SI 2 "const_0_to_1_operand")
>     (match_operand:<ssehalfvecmode> 3 "nonimm_or_0_operand")
>     (match_operand:QI 4 "register_operand")]
> -  "TARGET_AVX512DQ && TARGET_AVX512VL"
> +  "TARGET_AVX512VL"
>  {
>    rtx (*insn)(rtx, rtx, rtx, rtx);
>    rtx dest = operands[0];
> @@ -8793,14 +8796,19 @@ (define_insn "vec_extract_lo_<mode><mask
>                       (const_int 4) (const_int 5)
>                       (const_int 6) (const_int 7)])))]
>    "TARGET_AVX512F
> -   && <mask_mode512bit_condition>
> +   && <mask_avx512dq_condition>
>     && (<mask_applied> || !(MEM_P (operands[0]) && MEM_P (operands[1])))"
>  {
>    if (<mask_applied>
>        || (!TARGET_AVX512VL
>           && !REG_P (operands[0])
>           && EXT_REX_SSE_REG_P (operands[1])))
> -    return "vextract<shuffletype>32x8\t{$0x0, %1, %0<mask_operand2>|%0<mask_operand2>, %1, 0x0}";
> +    {
> +      if (TARGET_AVX512DQ)
> +       return "vextract<shuffletype>32x8\t{$0x0, %1, %0<mask_operand2>|%0<mask_operand2>, %1, 0x0}";
> +      else
> +       return "vextract<shuffletype>64x4\t{$0x0, %1, %0|%0, %1, 0x0}";
> +    }
>    else
>      return "#";
>  }
> @@ -8910,7 +8918,7 @@ (define_insn "vec_extract_lo_<mode><mask
>           (parallel [(const_int 0) (const_int 1)
>                      (const_int 2) (const_int 3)])))]
>    "TARGET_AVX
> -   && <mask_avx512vl_condition> && <mask_avx512dq_condition>
> +   && <mask_avx512vl_condition>
>     && (<mask_applied> || !(MEM_P (operands[0]) && MEM_P (operands[1])))"
>  {
>    if (<mask_applied>)
> --- gcc/testsuite/gcc.target/i386/avx512vl-pr93670.c.jj 2020-02-11 16:00:14.874930873 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512vl-pr93670.c    2020-02-11 15:59:01.252019025 +0100
> @@ -0,0 +1,77 @@
> +/* PR target/93670 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512vl -mno-avx512dq" } */
> +
> +#include <x86intrin.h>
> +
> +__m128i
> +f1 (__m256i x)
> +{
> +  return _mm256_extracti32x4_epi32 (x, 0);
> +}
> +
> +__m128i
> +f2 (__m256i x, __m128i w, __mmask8 m)
> +{
> +  return _mm256_mask_extracti32x4_epi32 (w, m, x, 0);
> +}
> +
> +__m128i
> +f3 (__m256i x, __mmask8 m)
> +{
> +  return _mm256_maskz_extracti32x4_epi32 (m, x, 0);
> +}
> +
> +__m128
> +f4 (__m256 x)
> +{
> +  return _mm256_extractf32x4_ps (x, 0);
> +}
> +
> +__m128
> +f5 (__m256 x, __m128 w, __mmask8 m)
> +{
> +  return _mm256_mask_extractf32x4_ps (w, m, x, 0);
> +}
> +
> +__m128
> +f6 (__m256 x, __mmask8 m)
> +{
> +  return _mm256_maskz_extractf32x4_ps (m, x, 0);
> +}
> +
> +__m128i
> +f7 (__m256i x)
> +{
> +  return _mm256_extracti32x4_epi32 (x, 1);
> +}
> +
> +__m128i
> +f8 (__m256i x, __m128i w, __mmask8 m)
> +{
> +  return _mm256_mask_extracti32x4_epi32 (w, m, x, 1);
> +}
> +
> +__m128i
> +f9 (__m256i x, __mmask8 m)
> +{
> +  return _mm256_maskz_extracti32x4_epi32 (m, x, 1);
> +}
> +
> +__m128
> +f10 (__m256 x)
> +{
> +  return _mm256_extractf32x4_ps (x, 1);
> +}
> +
> +__m128
> +f11 (__m256 x, __m128 w, __mmask8 m)
> +{
> +  return _mm256_mask_extractf32x4_ps (w, m, x, 1);
> +}
> +
> +__m128
> +f12 (__m256 x, __mmask8 m)
> +{
> +  return _mm256_maskz_extractf32x4_ps (m, x, 1);
> +}
>
>         Jakub
>



More information about the Gcc-patches mailing list