[PATCH] i386: Improve extensions of __builtin_clz and constant - __builtin_clz for -mno-lzcnt [PR78103]

H.J. Lu hjl.tools@gmail.com
Sat Jul 31 19:53:44 GMT 2021


On Fri, Jul 30, 2021 at 6:27 AM Jakub Jelinek via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Fri, Jul 30, 2021 at 12:27:39PM +0200, Uros Bizjak wrote:
> > Please put some space here, e.g.:
> ...
> > Can you just name the relevant insn pattern and use
> >
> > emit_insn (gen_bsr_1)?
>
> Here is the updated patch.  I'll bootstrap/regtest it tonight.
>
> 2021-07-30  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/78103
>         * config/i386/i386.md (bsr_rex64_1, bsr_1, bsr_zext_1): New
>         define_insn patterns.
>         (*bsr_rex64_2, *bsr_2): New define_insn_and_split patterns.
>         Add combine splitters for constant - clz.
>         (clz<mode>2): Use a temporary pseudo for bsr result.
>
>         * gcc.target/i386/pr78103-1.c: New test.
>         * gcc.target/i386/pr78103-2.c: New test.
>         * gcc.target/i386/pr78103-3.c: New test.
>
> --- gcc/config/i386/i386.md.jj  2021-07-28 12:05:56.857977764 +0200
> +++ gcc/config/i386/i386.md     2021-07-30 15:13:49.994946550 +0200
> @@ -14761,6 +14761,18 @@ (define_insn "bsr_rex64"
>     (set_attr "znver1_decode" "vector")
>     (set_attr "mode" "DI")])
>
> +(define_insn "bsr_rex64_1"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +       (minus:DI (const_int 63)
> +                 (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT && TARGET_64BIT"
> +  "bsr{q}\t{%1, %0|%0, %1}"
> +  [(set_attr "type" "alu1")
> +   (set_attr "prefix_0f" "1")
> +   (set_attr "znver1_decode" "vector")
> +   (set_attr "mode" "DI")])
> +
>  (define_insn "bsr"
>    [(set (reg:CCZ FLAGS_REG)
>         (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm")
> @@ -14775,17 +14787,204 @@ (define_insn "bsr"
>     (set_attr "znver1_decode" "vector")
>     (set_attr "mode" "SI")])
>
> +(define_insn "bsr_1"
> +  [(set (match_operand:SI 0 "register_operand" "=r")
> +       (minus:SI (const_int 31)
> +                 (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT"
> +  "bsr{l}\t{%1, %0|%0, %1}"
> +  [(set_attr "type" "alu1")
> +   (set_attr "prefix_0f" "1")
> +   (set_attr "znver1_decode" "vector")
> +   (set_attr "mode" "SI")])
> +
> +(define_insn "bsr_zext_1"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +       (zero_extend:DI
> +         (minus:SI
> +           (const_int 31)
> +           (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT && TARGET_64BIT"
> +  "bsr{l}\t{%1, %k0|%k0, %1}"
> +  [(set_attr "type" "alu1")
> +   (set_attr "prefix_0f" "1")
> +   (set_attr "znver1_decode" "vector")
> +   (set_attr "mode" "SI")])
> +
> +; As bsr is undefined behavior on zero and for other input
> +; values it is in range 0 to 63, we can optimize away sign-extends.
> +(define_insn_and_split "*bsr_rex64_2"
> +  [(set (match_operand:DI 0 "register_operand")
> +       (xor:DI
> +         (sign_extend:DI
> +           (minus:SI
> +             (const_int 63)
> +             (subreg:SI (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> +                        0)))
> +         (const_int 63)))
> +    (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(parallel [(set (reg:CCZ FLAGS_REG)
> +                  (compare:CCZ (match_dup 1) (const_int 0)))
> +             (set (match_dup 2)
> +                  (minus:DI (const_int 63) (clz:DI (match_dup 1))))])
> +   (parallel [(set (match_dup 0)
> +                  (zero_extend:DI (xor:SI (match_dup 3) (const_int 63))))
> +             (clobber (reg:CC FLAGS_REG))])]
> +{
> +  operands[2] = gen_reg_rtx (DImode);
> +  operands[3] = lowpart_subreg (SImode, operands[2], DImode);
> +})
> +
> +(define_insn_and_split "*bsr_2"
> +  [(set (match_operand:DI 0 "register_operand")
> +       (sign_extend:DI
> +         (xor:SI
> +           (minus:SI
> +             (const_int 31)
> +             (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> +           (const_int 31))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(parallel [(set (reg:CCZ FLAGS_REG)
> +                  (compare:CCZ (match_dup 1) (const_int 0)))
> +             (set (match_dup 2)
> +                  (minus:SI (const_int 31) (clz:SI (match_dup 1))))])
> +   (parallel [(set (match_dup 0)
> +                  (zero_extend:DI (xor:SI (match_dup 2) (const_int 31))))
> +             (clobber (reg:CC FLAGS_REG))])]
> +  "operands[2] = gen_reg_rtx (SImode);")
> +
> +; Splitters to optimize 64 - __builtin_clzl (x) or 32 - __builtin_clz (x).
> +; Again, as for !TARGET_LZCNT CLZ is UB at zero, CLZ is guaranteed to be
> +; in [0, 63] or [0, 31] range.
> +(define_split
> +  [(set (match_operand:SI 0 "register_operand")
> +       (minus:SI
> +         (match_operand:SI 2 "const_int_operand")
> +         (xor:SI
> +           (minus:SI (const_int 63)
> +                     (subreg:SI
> +                       (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> +                       0))
> +           (const_int 63))))]
> +  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> +  [(set (match_dup 3)
> +       (minus:DI (const_int 63) (clz:DI (match_dup 1))))
> +   (set (match_dup 0)
> +       (plus:SI (match_dup 5) (match_dup 4)))]
> +{
> +  operands[3] = gen_reg_rtx (DImode);
> +  operands[5] = lowpart_subreg (SImode, operands[3], DImode);
> +  if (INTVAL (operands[2]) == 63)
> +    {
> +      emit_insn (gen_bsr_rex64_1 (operands[3], operands[1]));
> +      emit_move_insn (operands[0], operands[5]);
> +      DONE;
> +    }
> +  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 63, SImode);
> +})
> +
> +(define_split
> +  [(set (match_operand:SI 0 "register_operand")
> +       (minus:SI
> +         (match_operand:SI 2 "const_int_operand")
> +         (xor:SI
> +           (minus:SI (const_int 31)
> +                     (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> +           (const_int 31))))]
> +  "!TARGET_LZCNT && ix86_pre_reload_split ()"
> +  [(set (match_dup 3)
> +       (minus:SI (const_int 31) (clz:SI (match_dup 1))))
> +   (set (match_dup 0)
> +       (plus:SI (match_dup 3) (match_dup 4)))]
> +{
> +  if (INTVAL (operands[2]) == 31)
> +    {
> +      emit_insn (gen_bsr_1 (operands[0], operands[1]));
> +      DONE;
> +    }
> +  operands[3] = gen_reg_rtx (SImode);
> +  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 31, SImode);
> +})
> +
> +(define_split
> +  [(set (match_operand:DI 0 "register_operand")
> +       (minus:DI
> +         (match_operand:DI 2 "const_int_operand")
> +         (xor:DI
> +           (sign_extend:DI
> +             (minus:SI (const_int 63)
> +                       (subreg:SI
> +                         (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> +                         0)))
> +           (const_int 63))))]
> +  "!TARGET_LZCNT
> +   && TARGET_64BIT
> +   && ix86_pre_reload_split ()
> +   && ((unsigned HOST_WIDE_INT)
> +       trunc_int_for_mode (UINTVAL (operands[2]) - 63, SImode)
> +       == UINTVAL (operands[2]) - 63)"
> +  [(set (match_dup 3)
> +       (minus:DI (const_int 63) (clz:DI (match_dup 1))))
> +   (set (match_dup 0)
> +       (plus:DI (match_dup 3) (match_dup 4)))]
> +{
> +  if (INTVAL (operands[2]) == 63)
> +    {
> +      emit_insn (gen_bsr_rex64_1 (operands[0], operands[1]));
> +      DONE;
> +    }
> +  operands[3] = gen_reg_rtx (DImode);
> +  operands[4] = GEN_INT (UINTVAL (operands[2]) - 63);
> +})
> +
> +(define_split
> +  [(set (match_operand:DI 0 "register_operand")
> +       (minus:DI
> +         (match_operand:DI 2 "const_int_operand")
> +         (sign_extend:DI
> +           (xor:SI
> +             (minus:SI (const_int 31)
> +                       (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> +             (const_int 31)))))]
> +  "!TARGET_LZCNT
> +   && TARGET_64BIT
> +   && ix86_pre_reload_split ()
> +   && ((unsigned HOST_WIDE_INT)
> +       trunc_int_for_mode (UINTVAL (operands[2]) - 31, SImode)
> +       == UINTVAL (operands[2]) - 31)"
> +  [(set (match_dup 3)
> +       (zero_extend:DI (minus:SI (const_int 31) (clz:SI (match_dup 1)))))
> +   (set (match_dup 0)
> +       (plus:DI (match_dup 3) (match_dup 4)))]
> +{
> +  if (INTVAL (operands[2]) == 31)
> +    {
> +      emit_insn (gen_bsr_zext_1 (operands[0], operands[1]));
> +      DONE;
> +    }
> +  operands[3] = gen_reg_rtx (DImode);
> +  operands[4] = GEN_INT (UINTVAL (operands[2]) - 31);
> +})
> +
>  (define_expand "clz<mode>2"
>    [(parallel
>       [(set (reg:CCZ FLAGS_REG)
>         (compare:CCZ (match_operand:SWI48 1 "nonimmediate_operand" "rm")
>                      (const_int 0)))
> -      (set (match_operand:SWI48 0 "register_operand")
> -          (minus:SWI48
> -            (match_dup 2)
> -            (clz:SWI48 (match_dup 1))))])
> +      (set (match_dup 3) (minus:SWI48
> +                          (match_dup 2)
> +                          (clz:SWI48 (match_dup 1))))])
>     (parallel
> -     [(set (match_dup 0) (xor:SWI48 (match_dup 0) (match_dup 2)))
> +     [(set (match_operand:SWI48 0 "register_operand")
> +          (xor:SWI48 (match_dup 3) (match_dup 2)))
>        (clobber (reg:CC FLAGS_REG))])]
>    ""
>  {
> @@ -14795,6 +14994,7 @@ (define_expand "clz<mode>2"
>        DONE;
>      }
>    operands[2] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1);
> +  operands[3] = gen_reg_rtx (<MODE>mode);
>  })
>
>  (define_insn_and_split "clz<mode>2_lzcnt"
> --- gcc/testsuite/gcc.target/i386/pr78103-1.c.jj        2021-07-30 15:07:26.104139537 +0200
> +++ gcc/testsuite/gcc.target/i386/pr78103-1.c   2021-07-30 15:07:26.104139537 +0200
> @@ -0,0 +1,28 @@
> +/* PR target/78103 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mno-lzcnt" } */
> +/* { dg-final { scan-assembler-not {\mcltq\M} } } */
> +
> +long long
> +foo (long long x)
> +{
> +  return __builtin_clzll (x);
> +}
> +
> +long long
> +bar (long long x)
> +{
> +  return (unsigned int) __builtin_clzll (x);
> +}
> +
> +long long
> +baz (int x)
> +{
> +  return __builtin_clz (x);
> +}
> +
> +long long
> +qux (int x)
> +{
> +  return (unsigned int) __builtin_clz (x);
> +}
> --- gcc/testsuite/gcc.target/i386/pr78103-2.c.jj        2021-07-30 15:07:26.104139537 +0200
> +++ gcc/testsuite/gcc.target/i386/pr78103-2.c   2021-07-30 15:07:26.104139537 +0200
> @@ -0,0 +1,33 @@
> +/* PR target/78103 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mno-lzcnt" } */
> +/* { dg-final { scan-assembler-not {\mmovl\M} } } */
> +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
> +/* { dg-final { scan-assembler-not {\msubl\M} } } */
> +/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */
> +
> +unsigned int
> +foo (unsigned int x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
> +}
> +
> +unsigned int
> +bar (unsigned int x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
> +}
> +
> +#ifdef __x86_64__
> +unsigned int
> +baz (unsigned long long x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
> +}
> +
> +unsigned int
> +qux (unsigned long long x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
> +}
> +#endif
> --- gcc/testsuite/gcc.target/i386/pr78103-3.c.jj        2021-07-30 15:07:26.104139537 +0200
> +++ gcc/testsuite/gcc.target/i386/pr78103-3.c   2021-07-30 15:07:26.104139537 +0200
> @@ -0,0 +1,32 @@
> +/* PR target/78103 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mno-lzcnt" } */
> +/* { dg-final { scan-assembler-not {\mmovl\M} } } */
> +/* { dg-final { scan-assembler-not {\mmovslq\M} } } */
> +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
> +/* { dg-final { scan-assembler-not {\msubq\M} } } */
> +/* { dg-final { scan-assembler {\m(leaq|addq)\M} } } */

It fails for -mx32:

@@ -7,7 +7,7 @@ foo:
 .LFB0:
  .cfi_startproc
  bsrl %edi, %edi
- leaq 1(%rdi), %rax
+ leal 1(%rdi), %eax  << This should also work for -m64.  Why isn't it
used for -m64?
  ret
  .cfi_endproc
 .LFE0:
@@ -30,7 +30,7 @@ baz:
 .LFB2:
  .cfi_startproc
  bsrq %rdi, %rdi
- leaq 1(%rdi), %rax
+ leal 1(%rdi), %eax   << This should also work for -m64.  Why isn't
it used for -m64?
  ret
  .cfi_endproc
 .LFE2:
@@ -42,6 +42,7 @@ qux:
 .LFB3:
  .cfi_startproc
  bsrq %rdi, %rax
+ movl %eax, %eax  << Why is it generated for -mx32?
  ret
  .cfi_endproc
 .LFE3:

> +unsigned long long
> +foo (unsigned int x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
> +}
> +
> +unsigned long long
> +bar (unsigned int x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
> +}
> +
> +unsigned long long
> +baz (unsigned long long x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
> +}
> +
> +unsigned long long
> +qux (unsigned long long x)
> +{
> +  return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
> +}
>
>
>         Jakub
>


-- 
H.J.


More information about the Gcc-patches mailing list