[PATCH] i386: Improve SImode constant - __builtin_clzll for -mno-lzcnt

H.J. Lu hjl.tools@gmail.com
Sun Aug 1 17:12:21 GMT 2021


On Sat, Jul 31, 2021 at 12:53:44PM -0700, H.J. Lu wrote:
> On Fri, Jul 30, 2021 at 6:27 AM Jakub Jelinek via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > On Fri, Jul 30, 2021 at 12:27:39PM +0200, Uros Bizjak wrote:
> > > Please put some space here, e.g.:
> > ...
> > > Can you just name the relevant insn pattern and use
> > >
> > > emit_insn (gen_bsr_1)?
> >
> > Here is the updated patch.  I'll bootstrap/regtest it tonight.
> >
> > 2021-07-30  Jakub Jelinek  <jakub@redhat.com>
> >
> >         PR target/78103
> >         * config/i386/i386.md (bsr_rex64_1, bsr_1, bsr_zext_1): New
> >         define_insn patterns.
> >         (*bsr_rex64_2, *bsr_2): New define_insn_and_split patterns.
> >         Add combine splitters for constant - clz.
> >         (clz<mode>2): Use a temporary pseudo for bsr result.
> >
> >         * gcc.target/i386/pr78103-1.c: New test.
> >         * gcc.target/i386/pr78103-2.c: New test.
> >         * gcc.target/i386/pr78103-3.c: New test.
> >
> > --- gcc/config/i386/i386.md.jj  2021-07-28 12:05:56.857977764 +0200
> > +++ gcc/config/i386/i386.md     2021-07-30 15:13:49.994946550 +0200
> > @@ -14761,6 +14761,18 @@ (define_insn "bsr_rex64"
> >     (set_attr "znver1_decode" "vector")
> >     (set_attr "mode" "DI")])
> >
> > +(define_insn "bsr_rex64_1"
> > +  [(set (match_operand:DI 0 "register_operand" "=r")
> > +       (minus:DI (const_int 63)
> > +                 (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  "!TARGET_LZCNT && TARGET_64BIT"
> > +  "bsr{q}\t{%1, %0|%0, %1}"
> > +  [(set_attr "type" "alu1")
> > +   (set_attr "prefix_0f" "1")
> > +   (set_attr "znver1_decode" "vector")
> > +   (set_attr "mode" "DI")])
> > +
> >  (define_insn "bsr"
> >    [(set (reg:CCZ FLAGS_REG)
> >         (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm")
> > @@ -14775,17 +14787,204 @@ (define_insn "bsr"
> >     (set_attr "znver1_decode" "vector")
> >     (set_attr "mode" "SI")])
> >
> > +(define_insn "bsr_1"
> > +  [(set (match_operand:SI 0 "register_operand" "=r")
> > +       (minus:SI (const_int 31)
> > +                 (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  "!TARGET_LZCNT"
> > +  "bsr{l}\t{%1, %0|%0, %1}"
> > +  [(set_attr "type" "alu1")
> > +   (set_attr "prefix_0f" "1")
> > +   (set_attr "znver1_decode" "vector")
> > +   (set_attr "mode" "SI")])
> > +
> > +(define_insn "bsr_zext_1"
> > +  [(set (match_operand:DI 0 "register_operand" "=r")
> > +       (zero_extend:DI
> > +         (minus:SI
> > +           (const_int 31)
> > +           (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  "!TARGET_LZCNT && TARGET_64BIT"
> > +  "bsr{l}\t{%1, %k0|%k0, %1}"
> > +  [(set_attr "type" "alu1")
> > +   (set_attr "prefix_0f" "1")
> > +   (set_attr "znver1_decode" "vector")
> > +   (set_attr "mode" "SI")])
> > +
> > +; As bsr is undefined behavior on zero and for other input
> > +; values it is in range 0 to 63, we can optimize away sign-extends.
> > +(define_insn_and_split "*bsr_rex64_2"
> > +  [(set (match_operand:DI 0 "register_operand")
> > +       (xor:DI
> > +         (sign_extend:DI
> > +           (minus:SI
> > +             (const_int 63)
> > +             (subreg:SI (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> > +                        0)))
> > +         (const_int 63)))
> > +    (clobber (reg:CC FLAGS_REG))]
> > +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> > +  "#"
> > +  "&& 1"
> > +  [(parallel [(set (reg:CCZ FLAGS_REG)
> > +                  (compare:CCZ (match_dup 1) (const_int 0)))
> > +             (set (match_dup 2)
> > +                  (minus:DI (const_int 63) (clz:DI (match_dup 1))))])
> > +   (parallel [(set (match_dup 0)
> > +                  (zero_extend:DI (xor:SI (match_dup 3) (const_int 63))))
> > +             (clobber (reg:CC FLAGS_REG))])]
> > +{
> > +  operands[2] = gen_reg_rtx (DImode);
> > +  operands[3] = lowpart_subreg (SImode, operands[2], DImode);
> > +})
> > +
> > +(define_insn_and_split "*bsr_2"
> > +  [(set (match_operand:DI 0 "register_operand")
> > +       (sign_extend:DI
> > +         (xor:SI
> > +           (minus:SI
> > +             (const_int 31)
> > +             (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> > +           (const_int 31))))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> > +  "#"
> > +  "&& 1"
> > +  [(parallel [(set (reg:CCZ FLAGS_REG)
> > +                  (compare:CCZ (match_dup 1) (const_int 0)))
> > +             (set (match_dup 2)
> > +                  (minus:SI (const_int 31) (clz:SI (match_dup 1))))])
> > +   (parallel [(set (match_dup 0)
> > +                  (zero_extend:DI (xor:SI (match_dup 2) (const_int 31))))
> > +             (clobber (reg:CC FLAGS_REG))])]
> > +  "operands[2] = gen_reg_rtx (SImode);")
> > +
> > +; Splitters to optimize 64 - __builtin_clzl (x) or 32 - __builtin_clz (x).
> > +; Again, as for !TARGET_LZCNT CLZ is UB at zero, CLZ is guaranteed to be
> > +; in [0, 63] or [0, 31] range.
> > +(define_split
> > +  [(set (match_operand:SI 0 "register_operand")
> > +       (minus:SI
> > +         (match_operand:SI 2 "const_int_operand")
> > +         (xor:SI
> > +           (minus:SI (const_int 63)
> > +                     (subreg:SI
> > +                       (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> > +                       0))
> > +           (const_int 63))))]
> > +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> > +  [(set (match_dup 3)
> > +       (minus:DI (const_int 63) (clz:DI (match_dup 1))))
> > +   (set (match_dup 0)
> > +       (plus:SI (match_dup 5) (match_dup 4)))]
> > +{
> > +  operands[3] = gen_reg_rtx (DImode);
> > +  operands[5] = lowpart_subreg (SImode, operands[3], DImode);
> > +  if (INTVAL (operands[2]) == 63)
> > +    {
> > +      emit_insn (gen_bsr_rex64_1 (operands[3], operands[1]));
> > +      emit_move_insn (operands[0], operands[5]);
> > +      DONE;
> > +    }
> > +  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 63, SImode);
> > +})
> > +
> > +(define_split
> > +  [(set (match_operand:SI 0 "register_operand")
> > +       (minus:SI
> > +         (match_operand:SI 2 "const_int_operand")
> > +         (xor:SI
> > +           (minus:SI (const_int 31)
> > +                     (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> > +           (const_int 31))))]
> > +  "!TARGET_LZCNT && ix86_pre_reload_split ()"
> > +  [(set (match_dup 3)
> > +       (minus:SI (const_int 31) (clz:SI (match_dup 1))))
> > +   (set (match_dup 0)
> > +       (plus:SI (match_dup 3) (match_dup 4)))]
> > +{
> > +  if (INTVAL (operands[2]) == 31)
> > +    {
> > +      emit_insn (gen_bsr_1 (operands[0], operands[1]));
> > +      DONE;
> > +    }
> > +  operands[3] = gen_reg_rtx (SImode);
> > +  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 31, SImode);
> > +})
> > +
> > +(define_split
> > +  [(set (match_operand:DI 0 "register_operand")
> > +       (minus:DI
> > +         (match_operand:DI 2 "const_int_operand")
> > +         (xor:DI
> > +           (sign_extend:DI
> > +             (minus:SI (const_int 63)
> > +                       (subreg:SI
> > +                         (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> > +                         0)))
> > +           (const_int 63))))]
> > +  "!TARGET_LZCNT
> > +   && TARGET_64BIT
> > +   && ix86_pre_reload_split ()
> > +   && ((unsigned HOST_WIDE_INT)
> > +       trunc_int_for_mode (UINTVAL (operands[2]) - 63, SImode)
> > +       == UINTVAL (operands[2]) - 63)"
> > +  [(set (match_dup 3)
> > +       (minus:DI (const_int 63) (clz:DI (match_dup 1))))
> > +   (set (match_dup 0)
> > +       (plus:DI (match_dup 3) (match_dup 4)))]
> > +{
> > +  if (INTVAL (operands[2]) == 63)
> > +    {
> > +      emit_insn (gen_bsr_rex64_1 (operands[0], operands[1]));
> > +      DONE;
> > +    }
> > +  operands[3] = gen_reg_rtx (DImode);
> > +  operands[4] = GEN_INT (UINTVAL (operands[2]) - 63);
> > +})
> > +
> > +(define_split
> > +  [(set (match_operand:DI 0 "register_operand")
> > +       (minus:DI
> > +         (match_operand:DI 2 "const_int_operand")
> > +         (sign_extend:DI
> > +           (xor:SI
> > +             (minus:SI (const_int 31)
> > +                       (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> > +             (const_int 31)))))]
> > +  "!TARGET_LZCNT
> > +   && TARGET_64BIT
> > +   && ix86_pre_reload_split ()
> > +   && ((unsigned HOST_WIDE_INT)
> > +       trunc_int_for_mode (UINTVAL (operands[2]) - 31, SImode)
> > +       == UINTVAL (operands[2]) - 31)"
> > +  [(set (match_dup 3)
> > +       (zero_extend:DI (minus:SI (const_int 31) (clz:SI (match_dup 1)))))
> > +   (set (match_dup 0)
> > +       (plus:DI (match_dup 3) (match_dup 4)))]
> > +{
> > +  if (INTVAL (operands[2]) == 31)
> > +    {
> > +      emit_insn (gen_bsr_zext_1 (operands[0], operands[1]));
> > +      DONE;
> > +    }
> > +  operands[3] = gen_reg_rtx (DImode);
> > +  operands[4] = GEN_INT (UINTVAL (operands[2]) - 31);
> > +})
> > +
> >  (define_expand "clz<mode>2"
> >    [(parallel
> >       [(set (reg:CCZ FLAGS_REG)
> >         (compare:CCZ (match_operand:SWI48 1 "nonimmediate_operand" "rm")
> >                      (const_int 0)))
> > -      (set (match_operand:SWI48 0 "register_operand")
> > -          (minus:SWI48
> > -            (match_dup 2)
> > -            (clz:SWI48 (match_dup 1))))])
> > +      (set (match_dup 3) (minus:SWI48
> > +                          (match_dup 2)
> > +                          (clz:SWI48 (match_dup 1))))])
> >     (parallel
> > -     [(set (match_dup 0) (xor:SWI48 (match_dup 0) (match_dup 2)))
> > +     [(set (match_operand:SWI48 0 "register_operand")
> > +          (xor:SWI48 (match_dup 3) (match_dup 2)))
> >        (clobber (reg:CC FLAGS_REG))])]
> >    ""
> >  {
> > @@ -14795,6 +14994,7 @@ (define_expand "clz<mode>2"
> >        DONE;
> >      }
> >    operands[2] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1);
> > +  operands[3] = gen_reg_rtx (<MODE>mode);
> >  })
> >
> >  (define_insn_and_split "clz<mode>2_lzcnt"
> > --- gcc/testsuite/gcc.target/i386/pr78103-1.c.jj        2021-07-30 15:07:26.104139537 +0200
> > +++ gcc/testsuite/gcc.target/i386/pr78103-1.c   2021-07-30 15:07:26.104139537 +0200
> > @@ -0,0 +1,28 @@
> > +/* PR target/78103 */
> > +/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-options "-O2 -mno-lzcnt" } */
> > +/* { dg-final { scan-assembler-not {\mcltq\M} } } */
> > +
> > +long long
> > +foo (long long x)
> > +{
> > +  return __builtin_clzll (x);
> > +}
> > +
> > +long long
> > +bar (long long x)
> > +{
> > +  return (unsigned int) __builtin_clzll (x);
> > +}
> > +
> > +long long
> > +baz (int x)
> > +{
> > +  return __builtin_clz (x);
> > +}
> > +
> > +long long
> > +qux (int x)
> > +{
> > +  return (unsigned int) __builtin_clz (x);
> > +}
> > --- gcc/testsuite/gcc.target/i386/pr78103-2.c.jj        2021-07-30 15:07:26.104139537 +0200
> > +++ gcc/testsuite/gcc.target/i386/pr78103-2.c   2021-07-30 15:07:26.104139537 +0200
> > @@ -0,0 +1,33 @@
> > +/* PR target/78103 */
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mno-lzcnt" } */
> > +/* { dg-final { scan-assembler-not {\mmovl\M} } } */
> > +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
> > +/* { dg-final { scan-assembler-not {\msubl\M} } } */
> > +/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */
> > +
> > +unsigned int
> > +foo (unsigned int x)
> > +{
> > +  return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
> > +}
> > +
> > +unsigned int
> > +bar (unsigned int x)
> > +{
> > +  return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
> > +}
> > +
> > +#ifdef __x86_64__
> > +unsigned int
> > +baz (unsigned long long x)
> > +{
> > +  return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
> > +}
> > +
> > +unsigned int
> > +qux (unsigned long long x)
> > +{
> > +  return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
> > +}
> > +#endif
> > --- gcc/testsuite/gcc.target/i386/pr78103-3.c.jj        2021-07-30 15:07:26.104139537 +0200
> > +++ gcc/testsuite/gcc.target/i386/pr78103-3.c   2021-07-30 15:07:26.104139537 +0200
> > @@ -0,0 +1,32 @@
> > +/* PR target/78103 */
> > +/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-options "-O2 -mno-lzcnt" } */
> > +/* { dg-final { scan-assembler-not {\mmovl\M} } } */
> > +/* { dg-final { scan-assembler-not {\mmovslq\M} } } */
> > +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
> > +/* { dg-final { scan-assembler-not {\msubq\M} } } */
> > +/* { dg-final { scan-assembler {\m(leaq|addq)\M} } } */
> 
> It fails for -mx32:
> 
> @@ -7,7 +7,7 @@ foo:
>  .LFB0:
>   .cfi_startproc
>   bsrl %edi, %edi
> - leaq 1(%rdi), %rax
> + leal 1(%rdi), %eax  << This should also work for -m64.  Why isn't it
> used for -m64?
>   ret
>   .cfi_endproc
>  .LFE0:
> @@ -30,7 +30,7 @@ baz:
>  .LFB2:
>   .cfi_startproc
>   bsrq %rdi, %rdi
> - leaq 1(%rdi), %rax
> + leal 1(%rdi), %eax   << This should also work for -m64.  Why isn't
> it used for -m64?
>   ret
>   .cfi_endproc
>  .LFE2:
> @@ -42,6 +42,7 @@ qux:
>  .LFB3:
>   .cfi_startproc
>   bsrq %rdi, %rax
> + movl %eax, %eax  << Why is it generated for -mx32?
>   ret
>   .cfi_endproc
>  .LFE3:
> 

Add a zero_extend patten for bsr_rex64_1 and use it to split SImode
constant - __builtin_clzll to avoid unncessary zero_extend.

OK for master?

H.J.
---
gcc/

	PR target/78103
	* config/i386/i386.md (bsr_rex64_1_zext): New.
	(combine splitter for SImode constant - clzll): Replace
	gen_bsr_rex64_1 with gen_bsr_rex64_1_zext.

gcc/testsuite/

	PR target/78103
	* gcc.target/i386/pr78103-2.c: Also scan incl.
	* gcc.target/i386/pr78103-3.c: Scan leal|addl|incl for x32.  Also
	scan incq.
---
 gcc/config/i386/i386.md                   | 17 ++++++++++++++++-
 gcc/testsuite/gcc.target/i386/pr78103-2.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr78103-3.c |  3 ++-
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index c9787d73262..0c23ddb8d1f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -14796,6 +14796,21 @@ (define_insn "bsr_rex64_1"
    (set_attr "znver1_decode" "vector")
    (set_attr "mode" "DI")])
 
+(define_insn "bsr_rex64_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (minus:SI (const_int 63)
+		    (subreg:SI
+		      (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))
+		      0))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT && TARGET_64BIT"
+  "bsr{q}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "znver1_decode" "vector")
+   (set_attr "mode" "DI")])
+
 (define_insn "bsr"
   [(set (reg:CCZ FLAGS_REG)
 	(compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm")
@@ -14907,7 +14922,7 @@ (define_split
   operands[5] = lowpart_subreg (SImode, operands[3], DImode);
   if (INTVAL (operands[2]) == 63)
     {
-      emit_insn (gen_bsr_rex64_1 (operands[3], operands[1]));
+      emit_insn (gen_bsr_rex64_1_zext (operands[3], operands[1]));
       emit_move_insn (operands[0], operands[5]);
       DONE;
     }
diff --git a/gcc/testsuite/gcc.target/i386/pr78103-2.c b/gcc/testsuite/gcc.target/i386/pr78103-2.c
index b3523382926..30f7f98f60a 100644
--- a/gcc/testsuite/gcc.target/i386/pr78103-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr78103-2.c
@@ -4,7 +4,7 @@
 /* { dg-final { scan-assembler-not {\mmovl\M} } } */
 /* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
 /* { dg-final { scan-assembler-not {\msubl\M} } } */
-/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */
+/* { dg-final { scan-assembler {\m(leal|addl|incl)\M} } } */
 
 unsigned int
 foo (unsigned int x)
diff --git a/gcc/testsuite/gcc.target/i386/pr78103-3.c b/gcc/testsuite/gcc.target/i386/pr78103-3.c
index 49a36eccf4d..b8d82312a0e 100644
--- a/gcc/testsuite/gcc.target/i386/pr78103-3.c
+++ b/gcc/testsuite/gcc.target/i386/pr78103-3.c
@@ -5,7 +5,8 @@
 /* { dg-final { scan-assembler-not {\mmovslq\M} } } */
 /* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
 /* { dg-final { scan-assembler-not {\msubq\M} } } */
-/* { dg-final { scan-assembler {\m(leaq|addq)\M} } } */
+/* { dg-final { scan-assembler {\m(leaq|addq|incq)\M} { target { ! x32 } } } } */
+/* { dg-final { scan-assembler {\m(leal|addl|incl)\M} { target x32 } } } */
 
 unsigned long long
 foo (unsigned int x)
-- 
2.31.1



More information about the Gcc-patches mailing list