[PATCH] i386: Improve SImode constant - __builtin_clzll for -mno-lzcnt

Uros Bizjak ubizjak@gmail.com
Sun Aug 1 19:20:49 GMT 2021


On Sun, Aug 1, 2021 at 7:12 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sat, Jul 31, 2021 at 12:53:44PM -0700, H.J. Lu wrote:
> > On Fri, Jul 30, 2021 at 6:27 AM Jakub Jelinek via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > On Fri, Jul 30, 2021 at 12:27:39PM +0200, Uros Bizjak wrote:
> > > > Please put some space here, e.g.:
> > > ...
> > > > Can you just name the relevant insn pattern and use
> > > >
> > > > emit_insn (gen_bsr_1)?
> > >
> > > Here is the updated patch.  I'll bootstrap/regtest it tonight.
> > >
> > > 2021-07-30  Jakub Jelinek  <jakub@redhat.com>
> > >
> > >         PR target/78103
> > >         * config/i386/i386.md (bsr_rex64_1, bsr_1, bsr_zext_1): New
> > >         define_insn patterns.
> > >         (*bsr_rex64_2, *bsr_2): New define_insn_and_split patterns.
> > >         Add combine splitters for constant - clz.
> > >         (clz<mode>2): Use a temporary pseudo for bsr result.
> > >
> > >         * gcc.target/i386/pr78103-1.c: New test.
> > >         * gcc.target/i386/pr78103-2.c: New test.
> > >         * gcc.target/i386/pr78103-3.c: New test.
> > >
> > > --- gcc/config/i386/i386.md.jj  2021-07-28 12:05:56.857977764 +0200
> > > +++ gcc/config/i386/i386.md     2021-07-30 15:13:49.994946550 +0200
> > > @@ -14761,6 +14761,18 @@ (define_insn "bsr_rex64"
> > >     (set_attr "znver1_decode" "vector")
> > >     (set_attr "mode" "DI")])
> > >
> > > +(define_insn "bsr_rex64_1"
> > > +  [(set (match_operand:DI 0 "register_operand" "=r")
> > > +       (minus:DI (const_int 63)
> > > +                 (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))))
> > > +   (clobber (reg:CC FLAGS_REG))]
> > > +  "!TARGET_LZCNT && TARGET_64BIT"
> > > +  "bsr{q}\t{%1, %0|%0, %1}"
> > > +  [(set_attr "type" "alu1")
> > > +   (set_attr "prefix_0f" "1")
> > > +   (set_attr "znver1_decode" "vector")
> > > +   (set_attr "mode" "DI")])
> > > +
> > >  (define_insn "bsr"
> > >    [(set (reg:CCZ FLAGS_REG)
> > >         (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm")
> > > @@ -14775,17 +14787,204 @@ (define_insn "bsr"
> > >     (set_attr "znver1_decode" "vector")
> > >     (set_attr "mode" "SI")])
> > >
> > > +(define_insn "bsr_1"
> > > +  [(set (match_operand:SI 0 "register_operand" "=r")
> > > +       (minus:SI (const_int 31)
> > > +                 (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))))
> > > +   (clobber (reg:CC FLAGS_REG))]
> > > +  "!TARGET_LZCNT"
> > > +  "bsr{l}\t{%1, %0|%0, %1}"
> > > +  [(set_attr "type" "alu1")
> > > +   (set_attr "prefix_0f" "1")
> > > +   (set_attr "znver1_decode" "vector")
> > > +   (set_attr "mode" "SI")])
> > > +
> > > +(define_insn "bsr_zext_1"
> > > +  [(set (match_operand:DI 0 "register_operand" "=r")
> > > +       (zero_extend:DI
> > > +         (minus:SI
> > > +           (const_int 31)
> > > +           (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))))
> > > +   (clobber (reg:CC FLAGS_REG))]
> > > +  "!TARGET_LZCNT && TARGET_64BIT"
> > > +  "bsr{l}\t{%1, %k0|%k0, %1}"
> > > +  [(set_attr "type" "alu1")
> > > +   (set_attr "prefix_0f" "1")
> > > +   (set_attr "znver1_decode" "vector")
> > > +   (set_attr "mode" "SI")])
> > > +
> > > +; As bsr is undefined behavior on zero and for other input
> > > +; values it is in range 0 to 63, we can optimize away sign-extends.
> > > +(define_insn_and_split "*bsr_rex64_2"
> > > +  [(set (match_operand:DI 0 "register_operand")
> > > +       (xor:DI
> > > +         (sign_extend:DI
> > > +           (minus:SI
> > > +             (const_int 63)
> > > +             (subreg:SI (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> > > +                        0)))
> > > +         (const_int 63)))
> > > +    (clobber (reg:CC FLAGS_REG))]
> > > +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(parallel [(set (reg:CCZ FLAGS_REG)
> > > +                  (compare:CCZ (match_dup 1) (const_int 0)))
> > > +             (set (match_dup 2)
> > > +                  (minus:DI (const_int 63) (clz:DI (match_dup 1))))])
> > > +   (parallel [(set (match_dup 0)
> > > +                  (zero_extend:DI (xor:SI (match_dup 3) (const_int 63))))
> > > +             (clobber (reg:CC FLAGS_REG))])]
> > > +{
> > > +  operands[2] = gen_reg_rtx (DImode);
> > > +  operands[3] = lowpart_subreg (SImode, operands[2], DImode);
> > > +})
> > > +
> > > +(define_insn_and_split "*bsr_2"
> > > +  [(set (match_operand:DI 0 "register_operand")
> > > +       (sign_extend:DI
> > > +         (xor:SI
> > > +           (minus:SI
> > > +             (const_int 31)
> > > +             (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> > > +           (const_int 31))))
> > > +   (clobber (reg:CC FLAGS_REG))]
> > > +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(parallel [(set (reg:CCZ FLAGS_REG)
> > > +                  (compare:CCZ (match_dup 1) (const_int 0)))
> > > +             (set (match_dup 2)
> > > +                  (minus:SI (const_int 31) (clz:SI (match_dup 1))))])
> > > +   (parallel [(set (match_dup 0)
> > > +                  (zero_extend:DI (xor:SI (match_dup 2) (const_int 31))))
> > > +             (clobber (reg:CC FLAGS_REG))])]
> > > +  "operands[2] = gen_reg_rtx (SImode);")
> > > +
> > > +; Splitters to optimize 64 - __builtin_clzl (x) or 32 - __builtin_clz (x).
> > > +; Again, as for !TARGET_LZCNT CLZ is UB at zero, CLZ is guaranteed to be
> > > +; in [0, 63] or [0, 31] range.
> > > +(define_split
> > > +  [(set (match_operand:SI 0 "register_operand")
> > > +       (minus:SI
> > > +         (match_operand:SI 2 "const_int_operand")
> > > +         (xor:SI
> > > +           (minus:SI (const_int 63)
> > > +                     (subreg:SI
> > > +                       (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> > > +                       0))
> > > +           (const_int 63))))]
> > > +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> > > +  [(set (match_dup 3)
> > > +       (minus:DI (const_int 63) (clz:DI (match_dup 1))))
> > > +   (set (match_dup 0)
> > > +       (plus:SI (match_dup 5) (match_dup 4)))]
> > > +{
> > > +  operands[3] = gen_reg_rtx (DImode);
> > > +  operands[5] = lowpart_subreg (SImode, operands[3], DImode);
> > > +  if (INTVAL (operands[2]) == 63)
> > > +    {
> > > +      emit_insn (gen_bsr_rex64_1 (operands[3], operands[1]));
> > > +      emit_move_insn (operands[0], operands[5]);
> > > +      DONE;
> > > +    }
> > > +  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 63, SImode);
> > > +})
> > > +
> > > +(define_split
> > > +  [(set (match_operand:SI 0 "register_operand")
> > > +       (minus:SI
> > > +         (match_operand:SI 2 "const_int_operand")
> > > +         (xor:SI
> > > +           (minus:SI (const_int 31)
> > > +                     (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> > > +           (const_int 31))))]
> > > +  "!TARGET_LZCNT && ix86_pre_reload_split ()"
> > > +  [(set (match_dup 3)
> > > +       (minus:SI (const_int 31) (clz:SI (match_dup 1))))
> > > +   (set (match_dup 0)
> > > +       (plus:SI (match_dup 3) (match_dup 4)))]
> > > +{
> > > +  if (INTVAL (operands[2]) == 31)
> > > +    {
> > > +      emit_insn (gen_bsr_1 (operands[0], operands[1]));
> > > +      DONE;
> > > +    }
> > > +  operands[3] = gen_reg_rtx (SImode);
> > > +  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 31, SImode);
> > > +})
> > > +
> > > +(define_split
> > > +  [(set (match_operand:DI 0 "register_operand")
> > > +       (minus:DI
> > > +         (match_operand:DI 2 "const_int_operand")
> > > +         (xor:DI
> > > +           (sign_extend:DI
> > > +             (minus:SI (const_int 63)
> > > +                       (subreg:SI
> > > +                         (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> > > +                         0)))
> > > +           (const_int 63))))]
> > > +  "!TARGET_LZCNT
> > > +   && TARGET_64BIT
> > > +   && ix86_pre_reload_split ()
> > > +   && ((unsigned HOST_WIDE_INT)
> > > +       trunc_int_for_mode (UINTVAL (operands[2]) - 63, SImode)
> > > +       == UINTVAL (operands[2]) - 63)"
> > > +  [(set (match_dup 3)
> > > +       (minus:DI (const_int 63) (clz:DI (match_dup 1))))
> > > +   (set (match_dup 0)
> > > +       (plus:DI (match_dup 3) (match_dup 4)))]
> > > +{
> > > +  if (INTVAL (operands[2]) == 63)
> > > +    {
> > > +      emit_insn (gen_bsr_rex64_1 (operands[0], operands[1]));
> > > +      DONE;
> > > +    }
> > > +  operands[3] = gen_reg_rtx (DImode);
> > > +  operands[4] = GEN_INT (UINTVAL (operands[2]) - 63);
> > > +})
> > > +
> > > +(define_split
> > > +  [(set (match_operand:DI 0 "register_operand")
> > > +       (minus:DI
> > > +         (match_operand:DI 2 "const_int_operand")
> > > +         (sign_extend:DI
> > > +           (xor:SI
> > > +             (minus:SI (const_int 31)
> > > +                       (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> > > +             (const_int 31)))))]
> > > +  "!TARGET_LZCNT
> > > +   && TARGET_64BIT
> > > +   && ix86_pre_reload_split ()
> > > +   && ((unsigned HOST_WIDE_INT)
> > > +       trunc_int_for_mode (UINTVAL (operands[2]) - 31, SImode)
> > > +       == UINTVAL (operands[2]) - 31)"
> > > +  [(set (match_dup 3)
> > > +       (zero_extend:DI (minus:SI (const_int 31) (clz:SI (match_dup 1)))))
> > > +   (set (match_dup 0)
> > > +       (plus:DI (match_dup 3) (match_dup 4)))]
> > > +{
> > > +  if (INTVAL (operands[2]) == 31)
> > > +    {
> > > +      emit_insn (gen_bsr_zext_1 (operands[0], operands[1]));
> > > +      DONE;
> > > +    }
> > > +  operands[3] = gen_reg_rtx (DImode);
> > > +  operands[4] = GEN_INT (UINTVAL (operands[2]) - 31);
> > > +})
> > > +
> > >  (define_expand "clz<mode>2"
> > >    [(parallel
> > >       [(set (reg:CCZ FLAGS_REG)
> > >         (compare:CCZ (match_operand:SWI48 1 "nonimmediate_operand" "rm")
> > >                      (const_int 0)))
> > > -      (set (match_operand:SWI48 0 "register_operand")
> > > -          (minus:SWI48
> > > -            (match_dup 2)
> > > -            (clz:SWI48 (match_dup 1))))])
> > > +      (set (match_dup 3) (minus:SWI48
> > > +                          (match_dup 2)
> > > +                          (clz:SWI48 (match_dup 1))))])
> > >     (parallel
> > > -     [(set (match_dup 0) (xor:SWI48 (match_dup 0) (match_dup 2)))
> > > +     [(set (match_operand:SWI48 0 "register_operand")
> > > +          (xor:SWI48 (match_dup 3) (match_dup 2)))
> > >        (clobber (reg:CC FLAGS_REG))])]
> > >    ""
> > >  {
> > > @@ -14795,6 +14994,7 @@ (define_expand "clz<mode>2"
> > >        DONE;
> > >      }
> > >    operands[2] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1);
> > > +  operands[3] = gen_reg_rtx (<MODE>mode);
> > >  })
> > >
> > >  (define_insn_and_split "clz<mode>2_lzcnt"
> > > --- gcc/testsuite/gcc.target/i386/pr78103-1.c.jj        2021-07-30 15:07:26.104139537 +0200
> > > +++ gcc/testsuite/gcc.target/i386/pr78103-1.c   2021-07-30 15:07:26.104139537 +0200
> > > @@ -0,0 +1,28 @@
> > > +/* PR target/78103 */
> > > +/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-options "-O2 -mno-lzcnt" } */
> > > +/* { dg-final { scan-assembler-not {\mcltq\M} } } */
> > > +
> > > +long long
> > > +foo (long long x)
> > > +{
> > > +  return __builtin_clzll (x);
> > > +}
> > > +
> > > +long long
> > > +bar (long long x)
> > > +{
> > > +  return (unsigned int) __builtin_clzll (x);
> > > +}
> > > +
> > > +long long
> > > +baz (int x)
> > > +{
> > > +  return __builtin_clz (x);
> > > +}
> > > +
> > > +long long
> > > +qux (int x)
> > > +{
> > > +  return (unsigned int) __builtin_clz (x);
> > > +}
> > > --- gcc/testsuite/gcc.target/i386/pr78103-2.c.jj        2021-07-30 15:07:26.104139537 +0200
> > > +++ gcc/testsuite/gcc.target/i386/pr78103-2.c   2021-07-30 15:07:26.104139537 +0200
> > > @@ -0,0 +1,33 @@
> > > +/* PR target/78103 */
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -mno-lzcnt" } */
> > > +/* { dg-final { scan-assembler-not {\mmovl\M} } } */
> > > +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
> > > +/* { dg-final { scan-assembler-not {\msubl\M} } } */
> > > +/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */
> > > +
> > > +unsigned int
> > > +foo (unsigned int x)
> > > +{
> > > +  return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
> > > +}
> > > +
> > > +unsigned int
> > > +bar (unsigned int x)
> > > +{
> > > +  return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
> > > +}
> > > +
> > > +#ifdef __x86_64__
> > > +unsigned int
> > > +baz (unsigned long long x)
> > > +{
> > > +  return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
> > > +}
> > > +
> > > +unsigned int
> > > +qux (unsigned long long x)
> > > +{
> > > +  return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
> > > +}
> > > +#endif
> > > --- gcc/testsuite/gcc.target/i386/pr78103-3.c.jj        2021-07-30 15:07:26.104139537 +0200
> > > +++ gcc/testsuite/gcc.target/i386/pr78103-3.c   2021-07-30 15:07:26.104139537 +0200
> > > @@ -0,0 +1,32 @@
> > > +/* PR target/78103 */
> > > +/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-options "-O2 -mno-lzcnt" } */
> > > +/* { dg-final { scan-assembler-not {\mmovl\M} } } */
> > > +/* { dg-final { scan-assembler-not {\mmovslq\M} } } */
> > > +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
> > > +/* { dg-final { scan-assembler-not {\msubq\M} } } */
> > > +/* { dg-final { scan-assembler {\m(leaq|addq)\M} } } */
> >
> > It fails for -mx32:
> >
> > @@ -7,7 +7,7 @@ foo:
> >  .LFB0:
> >   .cfi_startproc
> >   bsrl %edi, %edi
> > - leaq 1(%rdi), %rax
> > + leal 1(%rdi), %eax  << This should also work for -m64.  Why isn't it
> > used for -m64?
> >   ret
> >   .cfi_endproc
> >  .LFE0:
> > @@ -30,7 +30,7 @@ baz:
> >  .LFB2:
> >   .cfi_startproc
> >   bsrq %rdi, %rdi
> > - leaq 1(%rdi), %rax
> > + leal 1(%rdi), %eax   << This should also work for -m64.  Why isn't
> > it used for -m64?
> >   ret
> >   .cfi_endproc
> >  .LFE2:
> > @@ -42,6 +42,7 @@ qux:
> >  .LFB3:
> >   .cfi_startproc
> >   bsrq %rdi, %rax
> > + movl %eax, %eax  << Why is it generated for -mx32?
> >   ret
> >   .cfi_endproc
> >  .LFE3:
> >
>
> Add a zero_extend patten for bsr_rex64_1 and use it to split SImode
> constant - __builtin_clzll to avoid unncessary zero_extend.
>
> OK for master?
>
> H.J.
> ---
> gcc/
>
>         PR target/78103
>         * config/i386/i386.md (bsr_rex64_1_zext): New.
>         (combine splitter for SImode constant - clzll): Replace
>         gen_bsr_rex64_1 with gen_bsr_rex64_1_zext.
>
> gcc/testsuite/
>
>         PR target/78103
>         * gcc.target/i386/pr78103-2.c: Also scan incl.
>         * gcc.target/i386/pr78103-3.c: Scan leal|addl|incl for x32.  Also
>         scan incq.

OK.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386.md                   | 17 ++++++++++++++++-
>  gcc/testsuite/gcc.target/i386/pr78103-2.c |  2 +-
>  gcc/testsuite/gcc.target/i386/pr78103-3.c |  3 ++-
>  3 files changed, 19 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index c9787d73262..0c23ddb8d1f 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -14796,6 +14796,21 @@ (define_insn "bsr_rex64_1"
>     (set_attr "znver1_decode" "vector")
>     (set_attr "mode" "DI")])
>
> +(define_insn "bsr_rex64_1_zext"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +       (zero_extend:DI
> +         (minus:SI (const_int 63)
> +                   (subreg:SI
> +                     (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))
> +                     0))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT && TARGET_64BIT"
> +  "bsr{q}\t{%1, %0|%0, %1}"
> +  [(set_attr "type" "alu1")
> +   (set_attr "prefix_0f" "1")
> +   (set_attr "znver1_decode" "vector")
> +   (set_attr "mode" "DI")])
> +
>  (define_insn "bsr"
>    [(set (reg:CCZ FLAGS_REG)
>         (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm")
> @@ -14907,7 +14922,7 @@ (define_split
>    operands[5] = lowpart_subreg (SImode, operands[3], DImode);
>    if (INTVAL (operands[2]) == 63)
>      {
> -      emit_insn (gen_bsr_rex64_1 (operands[3], operands[1]));
> +      emit_insn (gen_bsr_rex64_1_zext (operands[3], operands[1]));
>        emit_move_insn (operands[0], operands[5]);
>        DONE;
>      }
> diff --git a/gcc/testsuite/gcc.target/i386/pr78103-2.c b/gcc/testsuite/gcc.target/i386/pr78103-2.c
> index b3523382926..30f7f98f60a 100644
> --- a/gcc/testsuite/gcc.target/i386/pr78103-2.c
> +++ b/gcc/testsuite/gcc.target/i386/pr78103-2.c
> @@ -4,7 +4,7 @@
>  /* { dg-final { scan-assembler-not {\mmovl\M} } } */
>  /* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
>  /* { dg-final { scan-assembler-not {\msubl\M} } } */
> -/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */
> +/* { dg-final { scan-assembler {\m(leal|addl|incl)\M} } } */
>
>  unsigned int
>  foo (unsigned int x)
> diff --git a/gcc/testsuite/gcc.target/i386/pr78103-3.c b/gcc/testsuite/gcc.target/i386/pr78103-3.c
> index 49a36eccf4d..b8d82312a0e 100644
> --- a/gcc/testsuite/gcc.target/i386/pr78103-3.c
> +++ b/gcc/testsuite/gcc.target/i386/pr78103-3.c
> @@ -5,7 +5,8 @@
>  /* { dg-final { scan-assembler-not {\mmovslq\M} } } */
>  /* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
>  /* { dg-final { scan-assembler-not {\msubq\M} } } */
> -/* { dg-final { scan-assembler {\m(leaq|addq)\M} } } */
> +/* { dg-final { scan-assembler {\m(leaq|addq|incq)\M} { target { ! x32 } } } } */
> +/* { dg-final { scan-assembler {\m(leal|addl|incl)\M} { target x32 } } } */
>
>  unsigned long long
>  foo (unsigned int x)
> --
> 2.31.1
>


More information about the Gcc-patches mailing list