[PATCH] [i386] Optimize (a & b) | (c & ~b) to vpternlog instruction.

Hongtao Liu crazylht@gmail.com
Wed Aug 25 01:23:53 GMT 2021


On Tue, Aug 24, 2021 at 9:11 PM Bernhard Reutner-Fischer
<rep.dot.nop@gmail.com> wrote:
>
> On Tue, 24 Aug 2021 17:53:27 +0800
> Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
>
> > On Tue, Aug 24, 2021 at 9:36 AM liuhongt <hongtao.liu@intel.com> wrote:
> > >
> > > Also optimize below 3 forms to vpternlog, op1, op2, op3 are
> > > register_operand or unary_p as (not reg)
>
> > > gcc/ChangeLog:
> > >
> > >         PR target/101989
> > >         * config/i386/i386-protos.h
> > >         (ix86_strip_reg_or_notreg_operand): New declare.
>
> "New declaration."
>
> > >         * config/i386/i386.c (ix86_rtx_costs): Define cost for
> > >         UNSPEC_VTERNLOG.
>
> I do not see a considerable amount of VTERNLOG in the docs I have here.
> Is there a P missing in vPternlog?
The output assembly is vpternlog, but the internal pattern name was
originally vternlog (it is not clear why it is not called vpternlog;
perhaps it is an abbreviation of "vector ternary logic"). I added the new
define_insn_and_split patterns just to keep in line with the original name.
>
> > >         (ix86_strip_reg_or_notreg_operand): New function.
> > > Pushed to trunk, with ix86_strip_reg_or_notreg_operand changed to a macro;
> > > a function call seems too inefficient for simply stripping a unary operator.
> > >         * config/i386/predicates.md (reg_or_notreg_operand): New
> > >         predicate.
> > >         * config/i386/sse.md (*<avx512>_vternlog<mode>_all): New define_insn.
> > >         (*<avx512>_vternlog<mode>_1): New pre_reload
> > >         define_insn_and_split.
> > >         (*<avx512>_vternlog<mode>_2): Ditto.
> > >         (*<avx512>_vternlog<mode>_3): Ditto.
>
> at least the above 3 insn_and_split do have a 'p' in the md.
> thanks,
> > >         (any_logic1,any_logic2): New code iterator.
> > >         (logic_op): New code attribute.
> > >         (ternlogsuffix): Extend to VNxDF and VNxSF.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         PR target/101989
> > >         * gcc.target/i386/pr101989-1.c: New test.
> > >         * gcc.target/i386/pr101989-2.c: New test.
> > >         * gcc.target/i386/avx512bw-shiftqihi-constant-1.c: Adjust testcase.
> > > ---
> > >  gcc/config/i386/i386-protos.h                 |   1 +
> > >  gcc/config/i386/i386.c                        |  13 +
> > >  gcc/config/i386/predicates.md                 |   7 +
> > >  gcc/config/i386/sse.md                        | 234 ++++++++++++++++++
> > >  .../i386/avx512bw-shiftqihi-constant-1.c      |   4 +-
> > >  gcc/testsuite/gcc.target/i386/pr101989-1.c    |  51 ++++
> > >  gcc/testsuite/gcc.target/i386/pr101989-2.c    | 102 ++++++++
> > >  7 files changed, 410 insertions(+), 2 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101989-1.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101989-2.c
> > >
> > > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> > > index 2fd13074c81..2bdaadcf4f3 100644
> > > --- a/gcc/config/i386/i386-protos.h
> > > +++ b/gcc/config/i386/i386-protos.h
> > > @@ -60,6 +60,7 @@ extern rtx standard_80387_constant_rtx (int);
> > >  extern int standard_sse_constant_p (rtx, machine_mode);
> > >  extern const char *standard_sse_constant_opcode (rtx_insn *, rtx *);
> > >  extern bool ix86_standard_x87sse_constant_load_p (const rtx_insn *, rtx);
> > > +extern rtx ix86_strip_reg_or_notreg_operand (rtx);
> > >  extern bool ix86_pre_reload_split (void);
> > >  extern bool symbolic_reference_mentioned_p (rtx);
> > >  extern bool extended_reg_mentioned_p (rtx);
> > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > index 46844fab08f..a69225ccc81 100644
> > > --- a/gcc/config/i386/i386.c
> > > +++ b/gcc/config/i386/i386.c
> > > @@ -5236,6 +5236,14 @@ ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
> > >    return true;
> > >  }
> > >
> > > > +/* Returns the operand of OP if OP is a unary rtx (e.g. (not reg)),
> > > > +   otherwise returns OP itself unchanged.  */
> > > +rtx
> > > +ix86_strip_reg_or_notreg_operand (rtx op)
> > > +{
> > > +  return UNARY_P (op) ? XEXP (op, 0) : op;
> > > +}
> > > +
> > >  /* Predicate for pre-reload splitters with associated instructions,
> > >     which can match any time before the split1 pass (usually combine),
> > >     then are unconditionally split in that pass and should not be
> > > @@ -20544,6 +20552,11 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
> > >      case UNSPEC:
> > >        if (XINT (x, 1) == UNSPEC_TP)
> > >         *total = 0;
> > > +      else if (XINT(x, 1) == UNSPEC_VTERNLOG)
> > > +       {
> > > +         *total = cost->sse_op;
> > > +         return true;
> > > +       }
> > >        return false;
> > >
> > >      case VEC_SELECT:
> > > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> > > index 9321f332ef9..df5acb425d4 100644
> > > --- a/gcc/config/i386/predicates.md
> > > +++ b/gcc/config/i386/predicates.md
> > > @@ -1044,6 +1044,13 @@ (define_predicate "reg_or_pm1_operand"
> > >             (ior (match_test "op == const1_rtx")
> > >                  (match_test "op == constm1_rtx")))))
> > >
> > > +;; True for registers, or (not: registers).  Used to optimize 3-operand
> > > +;; bitwise operation.
> > > +(define_predicate "reg_or_notreg_operand"
> > > +  (ior (match_operand 0 "register_operand")
> > > +       (and (match_code "not")
> > > +           (match_test "register_operand (XEXP (op, 0), mode)"))))
> > > +
> > >  ;; True if OP is acceptable as operand of DImode shift expander.
> > >  (define_predicate "shiftdi_operand"
> > >    (if_then_else (match_test "TARGET_64BIT")
> > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > index 13889687793..0acd749d21c 100644
> > > --- a/gcc/config/i386/sse.md
> > > +++ b/gcc/config/i386/sse.md
> > > @@ -933,7 +933,9 @@ (define_mode_attr iptr
> > >  ;; Mapping of vector modes to VPTERNLOG suffix
> > >  (define_mode_attr ternlogsuffix
> > >    [(V8DI "q") (V4DI "q") (V2DI "q")
> > > +   (V8DF "q") (V4DF "q") (V2DF "q")
> > >     (V16SI "d") (V8SI "d") (V4SI "d")
> > > +   (V16SF "d") (V8SF "d") (V4SF "d")
> > >     (V32HI "d") (V16HI "d") (V8HI "d")
> > >     (V64QI "d") (V32QI "d") (V16QI "d")])
> > >
> > > @@ -10041,6 +10043,238 @@ (define_insn "<avx512>_vternlog<mode><sd_maskz_name>"
> > >     (set_attr "prefix" "evex")
> > >     (set_attr "mode" "<sseinsnmode>")])
> > >
> > > +(define_insn "*<avx512>_vternlog<mode>_all"
> > > +  [(set (match_operand:V 0 "register_operand" "=v")
> > > +       (unspec:V
> > > +         [(match_operand:V 1 "register_operand" "0")
> > > +          (match_operand:V 2 "register_operand" "v")
> > > +          (match_operand:V 3 "nonimmediate_operand" "vm")
> > > +          (match_operand:SI 4 "const_0_to_255_operand")]
> > > +         UNSPEC_VTERNLOG))]
> > > +  "TARGET_AVX512F"
> > > +  "vpternlog<ternlogsuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}"
> > > +  [(set_attr "type" "sselog")
> > > +   (set_attr "prefix" "evex")
> > > +   (set_attr "mode" "<sseinsnmode>")])
> > > +
> > > +;; There must be lots of other combinations like
> > > +;;
> > > +;; (any_logic:V
> > > +;;   (any_logic:V op1 op2)
> > > +;;   (any_logic:V op1 op3))
> > > +;;
> > > +;; (any_logic:V
> > > +;;   (any_logic:V
> > > +;;     (any_logic:V op1, op2)
> > > +;;     op3)
> > > +;;   op1)
> > > +;;
> > > +;; and so on.
> > > +
> > > +(define_code_iterator any_logic1 [and ior xor])
> > > +(define_code_iterator any_logic2 [and ior xor])
> > > +(define_code_attr logic_op [(and "&") (ior "|") (xor "^")])
> > > +
> > > +(define_insn_and_split "*<avx512>_vpternlog<mode>_1"
> > > +  [(set (match_operand:V 0 "register_operand")
> > > +       (any_logic:V
> > > +         (any_logic1:V
> > > +           (match_operand:V 1 "reg_or_notreg_operand")
> > > +           (match_operand:V 2 "reg_or_notreg_operand"))
> > > +         (any_logic2:V
> > > +           (match_operand:V 3 "reg_or_notreg_operand")
> > > +           (match_operand:V 4 "reg_or_notreg_operand"))))]
> > > +  "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
> > > +   && ix86_pre_reload_split ()
> > > +   && (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > > +                   ix86_strip_reg_or_notreg_operand (operands[4]))
> > > +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> > > +                      ix86_strip_reg_or_notreg_operand (operands[4]))
> > > +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > > +                      ix86_strip_reg_or_notreg_operand (operands[3]))
> > > +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> > > +                      ix86_strip_reg_or_notreg_operand (operands[3])))"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(set (match_dup 0)
> > > +       (unspec:V
> > > +         [(match_dup 6)
> > > +          (match_dup 2)
> > > +          (match_dup 1)
> > > +          (match_dup 5)]
> > > +         UNSPEC_VTERNLOG))]
> > > +{
> > > +  /* VPTERNLOGD reg6, reg2, reg1, imm8.  */
> > > +  int reg6 = 0xF0;
> > > +  int reg2 = 0xCC;
> > > +  int reg1 = 0xAA;
> > > +  int reg3 = 0;
> > > +  int reg4 = 0;
> > > +  int reg_mask, tmp1, tmp2;
> > > +  if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > > +                  ix86_strip_reg_or_notreg_operand (operands[4])))
> > > +    {
> > > +      reg4 = reg1;
> > > +      reg3 = reg6;
> > > +      operands[6] = operands[3];
> > > +    }
> > > +  else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> > > +                      ix86_strip_reg_or_notreg_operand (operands[4])))
> > > +    {
> > > +      reg4 = reg2;
> > > +      reg3 = reg6;
> > > +      operands[6] = operands[3];
> > > +    }
> > > +  else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > > +                       ix86_strip_reg_or_notreg_operand (operands[3])))
> > > +    {
> > > +      reg4 = reg6;
> > > +      reg3 = reg1;
> > > +      operands[6] = operands[4];
> > > +    }
> > > +  else
> > > +    {
> > > +      reg4 = reg6;
> > > +      reg3 = reg2;
> > > +      operands[6] = operands[4];
> > > +    }
> > > +
> > > +  reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1;
> > > +  reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2;
> > > +  reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3;
> > > +  reg4 = UNARY_P (operands[4]) ? ~reg4 : reg4;
> > > +
> > > +  tmp1 = reg1 <any_logic1:logic_op> reg2;
> > > +  tmp2 = reg3 <any_logic2:logic_op> reg4;
> > > +  reg_mask = tmp1  <any_logic:logic_op> tmp2;
> > > +  reg_mask &= 0xFF;
> > > +
> > > +  operands[1] = ix86_strip_reg_or_notreg_operand (operands[1]);
> > > +  operands[2] = ix86_strip_reg_or_notreg_operand (operands[2]);
> > > +  operands[6] = ix86_strip_reg_or_notreg_operand (operands[6]);
> > > +  operands[5] = GEN_INT (reg_mask);
> > > +})
> > > +
> > > +(define_insn_and_split "*<avx512>_vpternlog<mode>_2"
> > > +  [(set (match_operand:V 0 "register_operand")
> > > +       (any_logic:V
> > > +         (any_logic1:V
> > > +           (any_logic2:V
> > > +             (match_operand:V 1 "reg_or_notreg_operand")
> > > +             (match_operand:V 2 "reg_or_notreg_operand"))
> > > +           (match_operand:V 3 "reg_or_notreg_operand"))
> > > +         (match_operand:V 4 "reg_or_notreg_operand")))]
> > > +  "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
> > > +   && ix86_pre_reload_split ()
> > > +   && (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > > +                   ix86_strip_reg_or_notreg_operand (operands[4]))
> > > +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> > > +                      ix86_strip_reg_or_notreg_operand (operands[4]))
> > > +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > > +                      ix86_strip_reg_or_notreg_operand (operands[3]))
> > > +       || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> > > +                      ix86_strip_reg_or_notreg_operand (operands[3])))"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(set (match_dup 0)
> > > +       (unspec:V
> > > +         [(match_dup 6)
> > > +          (match_dup 2)
> > > +          (match_dup 1)
> > > +          (match_dup 5)]
> > > +         UNSPEC_VTERNLOG))]
> > > +{
> > > +  /* VPTERNLOGD reg6, reg2, reg1, imm8.  */
> > > +  int reg6 = 0xF0;
> > > +  int reg2 = 0xCC;
> > > +  int reg1 = 0xAA;
> > > +  int reg3 = 0;
> > > +  int reg4 = 0;
> > > +  int reg_mask, tmp1, tmp2;
> > > +  if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > > +                  ix86_strip_reg_or_notreg_operand (operands[4])))
> > > +    {
> > > +      reg4 = reg1;
> > > +      reg3 = reg6;
> > > +      operands[6] = operands[3];
> > > +    }
> > > +  else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]),
> > > +                      ix86_strip_reg_or_notreg_operand (operands[4])))
> > > +    {
> > > +      reg4 = reg2;
> > > +      reg3 = reg6;
> > > +      operands[6] = operands[3];
> > > +    }
> > > +  else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]),
> > > +                       ix86_strip_reg_or_notreg_operand (operands[3])))
> > > +    {
> > > +      reg4 = reg6;
> > > +      reg3 = reg1;
> > > +      operands[6] = operands[4];
> > > +    }
> > > +  else
> > > +    {
> > > +      reg4 = reg6;
> > > +      reg3 = reg2;
> > > +      operands[6] = operands[4];
> > > +    }
> > > +
> > > +  reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1;
> > > +  reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2;
> > > +  reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3;
> > > +  reg4 = UNARY_P (operands[4]) ? ~reg4 : reg4;
> > > +
> > > +  tmp1 = reg1 <any_logic2:logic_op> reg2;
> > > +  tmp2 = tmp1 <any_logic1:logic_op> reg3;
> > > +  reg_mask = tmp2 <any_logic:logic_op> reg4;
> > > +  reg_mask &= 0xFF;
> > > +
> > > +  operands[1] = ix86_strip_reg_or_notreg_operand (operands[1]);
> > > +  operands[2] = ix86_strip_reg_or_notreg_operand (operands[2]);
> > > +  operands[6] = ix86_strip_reg_or_notreg_operand (operands[6]);
> > > +  operands[5] = GEN_INT (reg_mask);
> > > +})
> > > +
> > > +(define_insn_and_split "*<avx512>_vpternlog<mode>_3"
> > > +  [(set (match_operand:V 0 "register_operand")
> > > +       (any_logic:V
> > > +         (any_logic1:V
> > > +           (match_operand:V 1 "reg_or_notreg_operand")
> > > +           (match_operand:V 2 "reg_or_notreg_operand"))
> > > +         (match_operand:V 3 "reg_or_notreg_operand")))]
> > > +  "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
> > > +   && ix86_pre_reload_split ()"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(set (match_dup 0)
> > > +       (unspec:V
> > > +         [(match_dup 3)
> > > +          (match_dup 2)
> > > +          (match_dup 1)
> > > +          (match_dup 4)]
> > > +         UNSPEC_VTERNLOG))]
> > > +{
> > > +  /* VPTERNLOGD reg3, reg2, reg1, imm8.  */
> > > +  int reg3 = 0xF0;
> > > +  int reg2 = 0xCC;
> > > +  int reg1 = 0xAA;
> > > +  int reg_mask, tmp1;
> > > +
> > > +  reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1;
> > > +  reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2;
> > > +  reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3;
> > > +
> > > +  tmp1 = reg1 <any_logic1:logic_op> reg2;
> > > +  reg_mask = tmp1 <any_logic:logic_op> reg3;
> > > +  reg_mask &= 0xFF;
> > > +
> > > +  operands[1] = ix86_strip_reg_or_notreg_operand (operands[1]);
> > > +  operands[2] = ix86_strip_reg_or_notreg_operand (operands[2]);
> > > +  operands[3] = ix86_strip_reg_or_notreg_operand (operands[3]);
> > > +  operands[4] = GEN_INT (reg_mask);
> > > +})
> > > +
> > > +
> > >  (define_insn "<avx512>_vternlog<mode>_mask"
> > >    [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
> > >         (vec_merge:VI48_AVX512VL
> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
> > > index 78bf5d33689..fbc3de08119 100644
> > > --- a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
> > > +++ b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
> > > @@ -1,7 +1,8 @@
> > >  /* PR target/95524 */
> > >  /* { dg-do compile } */
> > >  /* { dg-options "-O2 -mavx512bw" } */
> > > -/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 3 } }  */
> > > +/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 2 } }  */
> > > +/* { dg-final { scan-assembler-times "vpternlogd\[^\n\]*%zmm" 1 } }  */
> > >  typedef char v64qi  __attribute__ ((vector_size (64)));
> > >  typedef unsigned char v64uqi  __attribute__ ((vector_size (64)));
> > >
> > > @@ -11,7 +12,6 @@ foo_ashiftrt_512 (v64qi a)
> > >    return a >> 2;
> > >  }
> > >  /* { dg-final { scan-assembler-times "vpsraw\[^\n\]*%zmm" 1 } } */
> > > -/* { dg-final { scan-assembler-times "vpxor\[^\n\]*%zmm" 1 } } */
> > >  /* { dg-final { scan-assembler-times "vpsubb\[^\n\]*%zmm" 1 } } */
> > >
> > >  __attribute__((noipa)) v64qi
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr101989-1.c b/gcc/testsuite/gcc.target/i386/pr101989-1.c
> > > new file mode 100644
> > > index 00000000000..594093ecdde
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr101989-1.c
> > > @@ -0,0 +1,51 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2" } */
> > > +/* { dg-final { scan-assembler-times "vpternlog" 6 } } */
> > > +/* { dg-final { scan-assembler-not "vpxor" } } */
> > > +/* { dg-final { scan-assembler-not "vpor" } } */
> > > +/* { dg-final { scan-assembler-not "vpand" } } */
> > > +
> > > +#include<immintrin.h>
> > > +__m256d
> > > +__attribute__((noipa, target("avx512vl")))
> > > +copysign2_pd(__m256d from, __m256d to) {
> > > +  __m256i a = _mm256_castpd_si256(from);
> > > +  __m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63));
> > > +  /* (avx_signbit & from) | (~avx_signbit & to)  */
> > > +  return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to));
> > > +}
> > > +
> > > +__m256i
> > > +__attribute__((noipa, target("avx512vl")))
> > > +foo (__m256i src1, __m256i src2, __m256i src3)
> > > +{
> > > +  return (src2 & ~src1) | (src3 & src1);
> > > +}
> > > +
> > > +__m256i
> > > +__attribute__ ((noipa, target("avx512vl")))
> > > +foo1 (__m256i src1, __m256i src2, __m256i src3)
> > > +{
> > > +  return (src2 & src1) | (src3 & ~src1);
> > > +}
> > > +
> > > +__m256i
> > > +__attribute__ ((noipa, target("avx512vl")))
> > > +foo2 (__m256i src1, __m256i src2, __m256i src3)
> > > +{
> > > +  return (src2 & src1) | (~src3 & src1);
> > > +}
> > > +
> > > +__m256i
> > > +__attribute__ ((noipa, target("avx512vl")))
> > > +foo3 (__m256i src1, __m256i src2, __m256i src3)
> > > +{
> > > +  return (~src2 & src1) | (src3 & src1);
> > > +}
> > > +
> > > +__m256i
> > > +__attribute__ ((noipa, target("avx512vl")))
> > > +foo4 (__m256i src1, __m256i src2, __m256i src3)
> > > +{
> > > +  return src3 & src2 ^ src1;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr101989-2.c b/gcc/testsuite/gcc.target/i386/pr101989-2.c
> > > new file mode 100644
> > > index 00000000000..9d9759a8e1d
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr101989-2.c
> > > @@ -0,0 +1,102 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O2 -mavx2 -mno-avx512f" } */
> > > +/* { dg-require-effective-target avx512vl } */
> > > +
> > > +#define AVX512VL
> > > +
> > > +#include "avx512f-helper.h"
> > > +
> > > +#include "pr101989-1.c"
> > > +__m256d
> > > +avx2_copysign2_pd (__m256d from, __m256d to) {
> > > +  __m256i a = _mm256_castpd_si256(from);
> > > +  __m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63));
> > > +  /* (avx_signbit & from) | (~avx_signbit & to)  */
> > > +  return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to));
> > > +}
> > > +
> > > +__m256i
> > > +avx2_foo (__m256i src1, __m256i src2, __m256i src3)
> > > +{
> > > +  return (src2 & ~src1) | (src3 & src1);
> > > +}
> > > +
> > > +__m256i
> > > +avx2_foo1 (__m256i src1, __m256i src2, __m256i src3)
> > > +{
> > > +  return (src2 & src1) | (src3 & ~src1);
> > > +}
> > > +
> > > +__m256i
> > > +avx2_foo2 (__m256i src1, __m256i src2, __m256i src3)
> > > +{
> > > +  return (src2 & src1) | (~src3 & src1);
> > > +}
> > > +
> > > +__m256i
> > > +avx2_foo3 (__m256i src1, __m256i src2, __m256i src3)
> > > +{
> > > +  return (~src2 & src1) | (src3 & src1);
> > > +}
> > > +
> > > +__m256i
> > > +avx2_foo4 (__m256i src1, __m256i src2, __m256i src3)
> > > +{
> > > +  return src3 & src2 ^ src1;
> > > +}
> > > +
> > > +
> > > +void
> > > +test_256 (void)
> > > +{
> > > +  union256i_q q1, q2, q3, res2, exp2;
> > > +  union256d d1, d2, res1, exp1;
> > > +  int i, sign = 1;
> > > +
> > > +  for (i = 0; i < 4; i++)
> > > +    {
> > > +      d1.a[i] = 12.34 * (i + 2000) * sign;
> > > +      d2.a[i] = 56.78 * (i - 30) * sign;
> > > +      q1.a[i] = 12 * (i + 2000) * sign;
> > > +      q2.a[i] = 56 * (i - 30) * sign;
> > > +      q3.a[i] = 90 * (i + 40) * sign;
> > > +      res1.a[i] = DEFAULT_VALUE;
> > > +      exp1.a[i] = DEFAULT_VALUE;
> > > +      res2.a[i] = exp2.a[i] = -1;
> > > +      sign = -sign;
> > > +    }
> > > +
> > > +  exp1.x = avx2_copysign2_pd (d1.x, d2.x);
> > > +  res1.x = copysign2_pd (d1.x, d2.x);
> > > +  if (UNION_CHECK (256, d) (res1, exp1.a))
> > > +    abort ();
> > > +
> > > +  exp2.x = avx2_foo1 (q1.x, q2.x, q3.x);
> > > +  res2.x = foo1 (q1.x, q2.x, q3.x);
> > > +  if (UNION_CHECK (256, i_q) (res2, exp2.a))
> > > +    abort ();
> > > +
> > > +  exp2.x = avx2_foo2 (q1.x, q2.x, q3.x);
> > > +  res2.x = foo2 (q1.x, q2.x, q3.x);
> > > +  if (UNION_CHECK (256, i_q) (res2, exp2.a))
> > > +    abort ();
> > > +
> > > +  exp2.x = avx2_foo3 (q1.x, q2.x, q3.x);
> > > +  res2.x = foo3 (q1.x, q2.x, q3.x);
> > > +  if (UNION_CHECK (256, i_q) (res2, exp2.a))
> > > +    abort ();
> > > +
> > > +  exp2.x = avx2_foo4 (q1.x, q2.x, q3.x);
> > > +  res2.x = foo4 (q1.x, q2.x, q3.x);
> > > +  if (UNION_CHECK (256, i_q) (res2, exp2.a))
> > > +    abort ();
> > > +
> > > +  exp2.x = avx2_foo (q1.x, q2.x, q3.x);
> > > +  res2.x = foo (q1.x, q2.x, q3.x);
> > > +  if (UNION_CHECK (256, i_q) (res2, exp2.a))
> > > +    abort ();
> > > +}
> > > +
> > > +static void
> > > +test_128 ()
> > > +{}
> > > --
> > > 2.18.1
> > >
> >
> >
>


-- 
BR,
Hongtao


More information about the Gcc-patches mailing list