[PATCH] Optimize integer vector comparison followed by movmsk (PR target/88152)

Thu Nov 29 16:27:00 GMT 2018

On Thu, Nov 29, 2018 at 3:36 PM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> Like blend, movmsk also only cares about the most significant bit,
> so prior < 0 comparisons or (happens also on the testcase below in some
> cases) arithmetic shift right (by any value) isn't needed before the movmsk.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

Same comment as with your lt+blend -> blend patch. I think that
pre-reload define_insn_and_split that splits the combination to movmsk
would be better here. We already implement similar approach to remove
useless maskings of shift operands (c.f. various "..._mask" insns in
i386.md).

Uros.

> 2018-11-29  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/88152
>         * config/i386/sse.md (*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_lt,
>         *<sse>_movmsk<ssemodesuffix><avxsizesuffix>_zext_lt,
>         *<sse>_movmsk<ssemodesuffix><avxsizesuffix>_shift,
>         *<sse>_movmsk<ssemodesuffix><avxsizesuffix>_zext_shift,
>         *<sse2_avx2>_pmovmskb_lt, *<sse2_avx2>_pmovmskb_zext_lt): New
>         patterns.
>
>         * g++.target/i386/pr88152.C: New test.
>
> --- gcc/config/i386/sse.md.jj   2018-11-29 12:30:45.257028189 +0100
> +++ gcc/config/i386/sse.md      2018-11-29 13:16:34.111969513 +0100
> @@ -14653,6 +14653,62 @@ (define_insn "*<sse>_movmsk<ssemodesuffi
>     (set_attr "prefix" "maybe_vex")
>     (set_attr "mode" "<MODE>")])
>
> +(define_insn "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_lt"
> +  [(set (match_operand:SI 0 "register_operand" "=r")
> +       (unspec:SI
> +         [(lt:VF_128_256
> +            (match_operand:<sseintvecmode> 1 "register_operand" "x")
> +            (match_operand:<sseintvecmode> 2 "const0_operand" "C"))]
> +         UNSPEC_MOVMSK))]
> +  "TARGET_SSE"
> +  "%vmovmsk<ssemodesuffix>\t{%1, %0|%0, %1}"
> +  [(set_attr "type" "ssemov")
> +   (set_attr "prefix" "maybe_vex")
> +   (set_attr "mode" "<MODE>")])
> +
> +(define_insn "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_zext_lt"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +       (zero_extend:DI
> +         (unspec:SI
> +           [(lt:VF_128_256
> +              (match_operand:<sseintvecmode> 1 "register_operand" "x")
> +              (match_operand:<sseintvecmode> 2 "const0_operand" "C"))]
> +           UNSPEC_MOVMSK)))]
> +  "TARGET_64BIT && TARGET_SSE"
> +  "%vmovmsk<ssemodesuffix>\t{%1, %k0|%k0, %1}"
> +  [(set_attr "type" "ssemov")
> +   (set_attr "prefix" "maybe_vex")
> +   (set_attr "mode" "<MODE>")])
> +
> +(define_insn "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_shift"
> +  [(set (match_operand:SI 0 "register_operand" "=r")
> +       (unspec:SI
> +         [(subreg:VF_128_256
> +            (ashiftrt:<sseintvecmode>
> +              (match_operand:<sseintvecmode> 1 "register_operand" "x")
> +              (match_operand:QI 2 "const_int_operand" "n")) 0)]
> +         UNSPEC_MOVMSK))]
> +  "TARGET_SSE"
> +  "%vmovmsk<ssemodesuffix>\t{%1, %0|%0, %1}"
> +  [(set_attr "type" "ssemov")
> +   (set_attr "prefix" "maybe_vex")
> +   (set_attr "mode" "<MODE>")])
> +
> +(define_insn "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_zext_shift"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +       (zero_extend:DI
> +         (unspec:SI
> +           [(subreg:VF_128_256
> +              (ashiftrt:<sseintvecmode>
> +                (match_operand:<sseintvecmode> 1 "register_operand" "x")
> +              (match_operand:QI 2 "const_int_operand" "n")) 0)]
> +           UNSPEC_MOVMSK)))]
> +  "TARGET_64BIT && TARGET_SSE"
> +  "%vmovmsk<ssemodesuffix>\t{%1, %k0|%k0, %1}"
> +  [(set_attr "type" "ssemov")
> +   (set_attr "prefix" "maybe_vex")
> +   (set_attr "mode" "<MODE>")])
> +
>  (define_insn "<sse2_avx2>_pmovmskb"
>    [(set (match_operand:SI 0 "register_operand" "=r")
>         (unspec:SI
> @@ -14677,6 +14733,41 @@ (define_insn "*<sse2_avx2>_pmovmskb_zext
>             UNSPEC_MOVMSK)))]
>    "TARGET_64BIT && TARGET_SSE2"
>    "%vpmovmskb\t{%1, %k0|%k0, %1}"
> +  [(set_attr "type" "ssemov")
> +   (set (attr "prefix_data16")
> +     (if_then_else
> +       (match_test "TARGET_AVX")
> +     (const_string "*")
> +     (const_string "1")))
> +   (set_attr "prefix" "maybe_vex")
> +   (set_attr "mode" "SI")])
> +
> +(define_insn "*<sse2_avx2>_pmovmskb_lt"
> +  [(set (match_operand:SI 0 "register_operand" "=r")
> +       (unspec:SI
> +         [(lt:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand" "x")
> +                       (match_operand:VI1_AVX2 2 "const0_operand" "C"))]
> +         UNSPEC_MOVMSK))]
> +  "TARGET_SSE2"
> +  "%vpmovmskb\t{%1, %0|%0, %1}"
> +  [(set_attr "type" "ssemov")
> +   (set (attr "prefix_data16")
> +     (if_then_else
> +       (match_test "TARGET_AVX")
> +     (const_string "*")
> +     (const_string "1")))
> +   (set_attr "prefix" "maybe_vex")
> +   (set_attr "mode" "SI")])
> +
> +(define_insn "*<sse2_avx2>_pmovmskb_zext_lt"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +       (zero_extend:DI
> +         (unspec:SI
> +           [(lt:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand" "x")
> +                         (match_operand:VI1_AVX2 2 "const0_operand" "C"))]
> +           UNSPEC_MOVMSK)))]
> +  "TARGET_64BIT && TARGET_SSE2"
> +  "%vpmovmskb\t{%1, %k0|%k0, %1}"
>    [(set_attr "type" "ssemov")
>     (set (attr "prefix_data16")
>       (if_then_else
> --- gcc/testsuite/g++.target/i386/pr88152.C.jj  2018-11-29 13:25:23.809113651 +0100
> +++ gcc/testsuite/g++.target/i386/pr88152.C     2018-11-29 13:26:20.362168048 +0100
> @@ -0,0 +1,44 @@
> +// PR target/88152
> +// { dg-do compile }
> +// { dg-options "-O2 -mavx2 -std=c++11" }
> +// { dg-final { scan-assembler-times "vpmovmskb\[^\n\r]*xmm" 6 } }
> +// { dg-final { scan-assembler-times "vpmovmskb\[^\n\r]*ymm" 6 } }
> +// { dg-final { scan-assembler-times "vmovmskps\[^\n\r]*xmm" 4 } }
> +// { dg-final { scan-assembler-times "vmovmskps\[^\n\r]*ymm" 4 } }
> +// { dg-final { scan-assembler-times "vmovmskpd\[^\n\r]*xmm" 4 } }
> +// { dg-final { scan-assembler-times "vmovmskpd\[^\n\r]*ymm" 4 } }
> +// { dg-final { scan-assembler-not "vpcmpgt|vpcmpeq|vpsra" } }
> +
> +#include <x86intrin.h>
> +
> +template <typename T, size_t N>
> +using V [[gnu::vector_size(N)]] = T;
> +
> +int f0 (V<unsigned char, 16> a) { return _mm_movemask_epi8 (reinterpret_cast<__m128i> (a > 0x7f)); }
> +long int f1 (V<unsigned char, 16> a) { return (unsigned) _mm_movemask_epi8 (reinterpret_cast<__m128i> (a >= 0x80)); }
> +long int f2 (V<signed char, 16> a) { return (unsigned) _mm_movemask_epi8 (reinterpret_cast<__m128i> (a < 0)); }
> +int f3 (V<signed char, 16> a) { return _mm_movemask_epi8 (reinterpret_cast<__m128i> (a <= -1)); }
> +int f4 (V<char, 16> a) { return _mm_movemask_epi8 (reinterpret_cast<__m128i> (a < 0)); }
> +long int f5 (V<char, 16> a) { return (unsigned) _mm_movemask_epi8 (reinterpret_cast<__m128i> (a <= -1)); }
> +int f6 (V<unsigned int, 16> a) { return _mm_movemask_ps (reinterpret_cast<__m128> (a > __INT_MAX__)); }
> +int f7 (V<unsigned int, 16> a) { return _mm_movemask_ps (reinterpret_cast<__m128> (a >= 1U + __INT_MAX__)); }
> +int f8 (V<int, 16> a) { return _mm_movemask_ps (reinterpret_cast<__m128> (a < 0)); }
> +int f9 (V<int, 16> a) { return _mm_movemask_ps (reinterpret_cast<__m128> (a <= -1)); }
> +int f10 (V<unsigned long long, 16> a) { return _mm_movemask_pd (reinterpret_cast<__m128d> (a > __LONG_LONG_MAX__)); }
> +int f11 (V<unsigned long long, 16> a) { return _mm_movemask_pd (reinterpret_cast<__m128d> (a >= 1ULL + __LONG_LONG_MAX__)); }
> +long int f12 (V<long long, 16> a) { return (unsigned) _mm_movemask_pd (reinterpret_cast<__m128d> (a < 0)); }
> +int f13 (V<long long, 16> a) { return _mm_movemask_pd (reinterpret_cast<__m128d> (a <= -1)); }
> +int f14 (V<unsigned char, 32> a) { return _mm256_movemask_epi8 (reinterpret_cast<__m256i> (a > 0x7f)); }
> +int f15 (V<unsigned char, 32> a) { return _mm256_movemask_epi8 (reinterpret_cast<__m256i> (a >= 0x80)); }
> +long int f16 (V<signed char, 32> a) { return (unsigned) _mm256_movemask_epi8 (reinterpret_cast<__m256i> (a < 0)); }
> +int f17 (V<signed char, 32> a) { return _mm256_movemask_epi8 (reinterpret_cast<__m256i> (a <= -1)); }
> +int f18 (V<char, 32> a) { return _mm256_movemask_epi8 (reinterpret_cast<__m256i> (a < 0)); }
> +int f19 (V<char, 32> a) { return _mm256_movemask_epi8 (reinterpret_cast<__m256i> (a <= -1)); }
> +long int f20 (V<unsigned int, 32> a) { return (unsigned) _mm256_movemask_ps (reinterpret_cast<__m256> (a > __INT_MAX__)); }
> +int f21 (V<unsigned int, 32> a) { return _mm256_movemask_ps (reinterpret_cast<__m256> (a >= 1U + __INT_MAX__)); }
> +int f22 (V<int, 32> a) { return _mm256_movemask_ps (reinterpret_cast<__m256> (a < 0)); }
> +int f23 (V<int, 32> a) { return _mm256_movemask_ps (reinterpret_cast<__m256> (a <= -1)); }
> +int f24 (V<unsigned long long, 32> a) { return _mm256_movemask_pd (reinterpret_cast<__m256d> (a > __LONG_LONG_MAX__)); }
> +int f25 (V<unsigned long long, 32> a) { return _mm256_movemask_pd (reinterpret_cast<__m256d> (a >= 1ULL + __LONG_LONG_MAX__)); }
> +int f26 (V<long long, 32> a) { return _mm256_movemask_pd (reinterpret_cast<__m256d> (a < 0)); }
> +long int f27 (V<long long, 32> a) { return (unsigned) _mm256_movemask_pd (reinterpret_cast<__m256d> (a <= -1)); }
>
>         Jakub