[PATCH] i386: Improve extensions of __builtin_clz and constant - __builtin_clz for -mno-lzcnt [PR78103]
H.J. Lu
hjl.tools@gmail.com
Sat Jul 31 19:53:44 GMT 2021
On Fri, Jul 30, 2021 at 6:27 AM Jakub Jelinek via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Fri, Jul 30, 2021 at 12:27:39PM +0200, Uros Bizjak wrote:
> > Please put some space here, e.g.:
> ...
> > Can you just name the relevant insn pattern and use
> >
> > emit_insn (gen_bsr_1)?
>
> Here is the updated patch. I'll bootstrap/regtest it tonight.
>
> 2021-07-30 Jakub Jelinek <jakub@redhat.com>
>
> PR target/78103
> * config/i386/i386.md (bsr_rex64_1, bsr_1, bsr_zext_1): New
> define_insn patterns.
> (*bsr_rex64_2, *bsr_2): New define_insn_and_split patterns.
> Add combine splitters for constant - clz.
> (clz<mode>2): Use a temporary pseudo for bsr result.
>
> * gcc.target/i386/pr78103-1.c: New test.
> * gcc.target/i386/pr78103-2.c: New test.
> * gcc.target/i386/pr78103-3.c: New test.
>
> --- gcc/config/i386/i386.md.jj 2021-07-28 12:05:56.857977764 +0200
> +++ gcc/config/i386/i386.md 2021-07-30 15:13:49.994946550 +0200
> @@ -14761,6 +14761,18 @@ (define_insn "bsr_rex64"
> (set_attr "znver1_decode" "vector")
> (set_attr "mode" "DI")])
>
> +(define_insn "bsr_rex64_1"
> + [(set (match_operand:DI 0 "register_operand" "=r")
> + (minus:DI (const_int 63)
> + (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))))
> + (clobber (reg:CC FLAGS_REG))]
> + "!TARGET_LZCNT && TARGET_64BIT"
> + "bsr{q}\t{%1, %0|%0, %1}"
> + [(set_attr "type" "alu1")
> + (set_attr "prefix_0f" "1")
> + (set_attr "znver1_decode" "vector")
> + (set_attr "mode" "DI")])
> +
> (define_insn "bsr"
> [(set (reg:CCZ FLAGS_REG)
> (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm")
> @@ -14775,17 +14787,204 @@ (define_insn "bsr"
> (set_attr "znver1_decode" "vector")
> (set_attr "mode" "SI")])
>
> +(define_insn "bsr_1"
> + [(set (match_operand:SI 0 "register_operand" "=r")
> + (minus:SI (const_int 31)
> + (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))))
> + (clobber (reg:CC FLAGS_REG))]
> + "!TARGET_LZCNT"
> + "bsr{l}\t{%1, %0|%0, %1}"
> + [(set_attr "type" "alu1")
> + (set_attr "prefix_0f" "1")
> + (set_attr "znver1_decode" "vector")
> + (set_attr "mode" "SI")])
> +
> +(define_insn "bsr_zext_1"
> + [(set (match_operand:DI 0 "register_operand" "=r")
> + (zero_extend:DI
> + (minus:SI
> + (const_int 31)
> + (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))))
> + (clobber (reg:CC FLAGS_REG))]
> + "!TARGET_LZCNT && TARGET_64BIT"
> + "bsr{l}\t{%1, %k0|%k0, %1}"
> + [(set_attr "type" "alu1")
> + (set_attr "prefix_0f" "1")
> + (set_attr "znver1_decode" "vector")
> + (set_attr "mode" "SI")])
> +
> +; As bsr is undefined behavior on zero and for other input
> +; values it is in range 0 to 63, we can optimize away sign-extends.
> +(define_insn_and_split "*bsr_rex64_2"
> + [(set (match_operand:DI 0 "register_operand")
> + (xor:DI
> + (sign_extend:DI
> + (minus:SI
> + (const_int 63)
> + (subreg:SI (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> + 0)))
> + (const_int 63)))
> + (clobber (reg:CC FLAGS_REG))]
> + "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> + "#"
> + "&& 1"
> + [(parallel [(set (reg:CCZ FLAGS_REG)
> + (compare:CCZ (match_dup 1) (const_int 0)))
> + (set (match_dup 2)
> + (minus:DI (const_int 63) (clz:DI (match_dup 1))))])
> + (parallel [(set (match_dup 0)
> + (zero_extend:DI (xor:SI (match_dup 3) (const_int 63))))
> + (clobber (reg:CC FLAGS_REG))])]
> +{
> + operands[2] = gen_reg_rtx (DImode);
> + operands[3] = lowpart_subreg (SImode, operands[2], DImode);
> +})
> +
> +(define_insn_and_split "*bsr_2"
> + [(set (match_operand:DI 0 "register_operand")
> + (sign_extend:DI
> + (xor:SI
> + (minus:SI
> + (const_int 31)
> + (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> + (const_int 31))))
> + (clobber (reg:CC FLAGS_REG))]
> + "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> + "#"
> + "&& 1"
> + [(parallel [(set (reg:CCZ FLAGS_REG)
> + (compare:CCZ (match_dup 1) (const_int 0)))
> + (set (match_dup 2)
> + (minus:SI (const_int 31) (clz:SI (match_dup 1))))])
> + (parallel [(set (match_dup 0)
> + (zero_extend:DI (xor:SI (match_dup 2) (const_int 31))))
> + (clobber (reg:CC FLAGS_REG))])]
> + "operands[2] = gen_reg_rtx (SImode);")
> +
> +; Splitters to optimize 64 - __builtin_clzl (x) or 32 - __builtin_clz (x).
> +; Again, as for !TARGET_LZCNT CLZ is UB at zero, CLZ is guaranteed to be
> +; in [0, 63] or [0, 31] range.
> +(define_split
> + [(set (match_operand:SI 0 "register_operand")
> + (minus:SI
> + (match_operand:SI 2 "const_int_operand")
> + (xor:SI
> + (minus:SI (const_int 63)
> + (subreg:SI
> + (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> + 0))
> + (const_int 63))))]
> + "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
> + [(set (match_dup 3)
> + (minus:DI (const_int 63) (clz:DI (match_dup 1))))
> + (set (match_dup 0)
> + (plus:SI (match_dup 5) (match_dup 4)))]
> +{
> + operands[3] = gen_reg_rtx (DImode);
> + operands[5] = lowpart_subreg (SImode, operands[3], DImode);
> + if (INTVAL (operands[2]) == 63)
> + {
> + emit_insn (gen_bsr_rex64_1 (operands[3], operands[1]));
> + emit_move_insn (operands[0], operands[5]);
> + DONE;
> + }
> + operands[4] = gen_int_mode (UINTVAL (operands[2]) - 63, SImode);
> +})
> +
> +(define_split
> + [(set (match_operand:SI 0 "register_operand")
> + (minus:SI
> + (match_operand:SI 2 "const_int_operand")
> + (xor:SI
> + (minus:SI (const_int 31)
> + (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> + (const_int 31))))]
> + "!TARGET_LZCNT && ix86_pre_reload_split ()"
> + [(set (match_dup 3)
> + (minus:SI (const_int 31) (clz:SI (match_dup 1))))
> + (set (match_dup 0)
> + (plus:SI (match_dup 3) (match_dup 4)))]
> +{
> + if (INTVAL (operands[2]) == 31)
> + {
> + emit_insn (gen_bsr_1 (operands[0], operands[1]));
> + DONE;
> + }
> + operands[3] = gen_reg_rtx (SImode);
> + operands[4] = gen_int_mode (UINTVAL (operands[2]) - 31, SImode);
> +})
> +
> +(define_split
> + [(set (match_operand:DI 0 "register_operand")
> + (minus:DI
> + (match_operand:DI 2 "const_int_operand")
> + (xor:DI
> + (sign_extend:DI
> + (minus:SI (const_int 63)
> + (subreg:SI
> + (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
> + 0)))
> + (const_int 63))))]
> + "!TARGET_LZCNT
> + && TARGET_64BIT
> + && ix86_pre_reload_split ()
> + && ((unsigned HOST_WIDE_INT)
> + trunc_int_for_mode (UINTVAL (operands[2]) - 63, SImode)
> + == UINTVAL (operands[2]) - 63)"
> + [(set (match_dup 3)
> + (minus:DI (const_int 63) (clz:DI (match_dup 1))))
> + (set (match_dup 0)
> + (plus:DI (match_dup 3) (match_dup 4)))]
> +{
> + if (INTVAL (operands[2]) == 63)
> + {
> + emit_insn (gen_bsr_rex64_1 (operands[0], operands[1]));
> + DONE;
> + }
> + operands[3] = gen_reg_rtx (DImode);
> + operands[4] = GEN_INT (UINTVAL (operands[2]) - 63);
> +})
> +
> +(define_split
> + [(set (match_operand:DI 0 "register_operand")
> + (minus:DI
> + (match_operand:DI 2 "const_int_operand")
> + (sign_extend:DI
> + (xor:SI
> + (minus:SI (const_int 31)
> + (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
> + (const_int 31)))))]
> + "!TARGET_LZCNT
> + && TARGET_64BIT
> + && ix86_pre_reload_split ()
> + && ((unsigned HOST_WIDE_INT)
> + trunc_int_for_mode (UINTVAL (operands[2]) - 31, SImode)
> + == UINTVAL (operands[2]) - 31)"
> + [(set (match_dup 3)
> + (zero_extend:DI (minus:SI (const_int 31) (clz:SI (match_dup 1)))))
> + (set (match_dup 0)
> + (plus:DI (match_dup 3) (match_dup 4)))]
> +{
> + if (INTVAL (operands[2]) == 31)
> + {
> + emit_insn (gen_bsr_zext_1 (operands[0], operands[1]));
> + DONE;
> + }
> + operands[3] = gen_reg_rtx (DImode);
> + operands[4] = GEN_INT (UINTVAL (operands[2]) - 31);
> +})
> +
> (define_expand "clz<mode>2"
> [(parallel
> [(set (reg:CCZ FLAGS_REG)
> (compare:CCZ (match_operand:SWI48 1 "nonimmediate_operand" "rm")
> (const_int 0)))
> - (set (match_operand:SWI48 0 "register_operand")
> - (minus:SWI48
> - (match_dup 2)
> - (clz:SWI48 (match_dup 1))))])
> + (set (match_dup 3) (minus:SWI48
> + (match_dup 2)
> + (clz:SWI48 (match_dup 1))))])
> (parallel
> - [(set (match_dup 0) (xor:SWI48 (match_dup 0) (match_dup 2)))
> + [(set (match_operand:SWI48 0 "register_operand")
> + (xor:SWI48 (match_dup 3) (match_dup 2)))
> (clobber (reg:CC FLAGS_REG))])]
> ""
> {
> @@ -14795,6 +14994,7 @@ (define_expand "clz<mode>2"
> DONE;
> }
> operands[2] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1);
> + operands[3] = gen_reg_rtx (<MODE>mode);
> })
>
> (define_insn_and_split "clz<mode>2_lzcnt"
> --- gcc/testsuite/gcc.target/i386/pr78103-1.c.jj 2021-07-30 15:07:26.104139537 +0200
> +++ gcc/testsuite/gcc.target/i386/pr78103-1.c 2021-07-30 15:07:26.104139537 +0200
> @@ -0,0 +1,28 @@
> +/* PR target/78103 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mno-lzcnt" } */
> +/* { dg-final { scan-assembler-not {\mcltq\M} } } */
> +
> +long long
> +foo (long long x)
> +{
> + return __builtin_clzll (x);
> +}
> +
> +long long
> +bar (long long x)
> +{
> + return (unsigned int) __builtin_clzll (x);
> +}
> +
> +long long
> +baz (int x)
> +{
> + return __builtin_clz (x);
> +}
> +
> +long long
> +qux (int x)
> +{
> + return (unsigned int) __builtin_clz (x);
> +}
> --- gcc/testsuite/gcc.target/i386/pr78103-2.c.jj 2021-07-30 15:07:26.104139537 +0200
> +++ gcc/testsuite/gcc.target/i386/pr78103-2.c 2021-07-30 15:07:26.104139537 +0200
> @@ -0,0 +1,33 @@
> +/* PR target/78103 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mno-lzcnt" } */
> +/* { dg-final { scan-assembler-not {\mmovl\M} } } */
> +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
> +/* { dg-final { scan-assembler-not {\msubl\M} } } */
> +/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */
> +
> +unsigned int
> +foo (unsigned int x)
> +{
> + return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
> +}
> +
> +unsigned int
> +bar (unsigned int x)
> +{
> + return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
> +}
> +
> +#ifdef __x86_64__
> +unsigned int
> +baz (unsigned long long x)
> +{
> + return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
> +}
> +
> +unsigned int
> +qux (unsigned long long x)
> +{
> + return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
> +}
> +#endif
> --- gcc/testsuite/gcc.target/i386/pr78103-3.c.jj 2021-07-30 15:07:26.104139537 +0200
> +++ gcc/testsuite/gcc.target/i386/pr78103-3.c 2021-07-30 15:07:26.104139537 +0200
> @@ -0,0 +1,32 @@
> +/* PR target/78103 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mno-lzcnt" } */
> +/* { dg-final { scan-assembler-not {\mmovl\M} } } */
> +/* { dg-final { scan-assembler-not {\mmovslq\M} } } */
> +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
> +/* { dg-final { scan-assembler-not {\msubq\M} } } */
> +/* { dg-final { scan-assembler {\m(leaq|addq)\M} } } */
It fails for -mx32:
@@ -7,7 +7,7 @@ foo:
.LFB0:
.cfi_startproc
bsrl %edi, %edi
- leaq 1(%rdi), %rax
+ leal 1(%rdi), %eax << This should also work for -m64. Why isn't it
used for -m64?
ret
.cfi_endproc
.LFE0:
@@ -30,7 +30,7 @@ baz:
.LFB2:
.cfi_startproc
bsrq %rdi, %rdi
- leaq 1(%rdi), %rax
+ leal 1(%rdi), %eax << This should also work for -m64. Why isn't
it used for -m64?
ret
.cfi_endproc
.LFE2:
@@ -42,6 +42,7 @@ qux:
.LFB3:
.cfi_startproc
bsrq %rdi, %rax
+ movl %eax, %eax << Why is it generated for -mx32?
ret
.cfi_endproc
.LFE3:
> +unsigned long long
> +foo (unsigned int x)
> +{
> + return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
> +}
> +
> +unsigned long long
> +bar (unsigned int x)
> +{
> + return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
> +}
> +
> +unsigned long long
> +baz (unsigned long long x)
> +{
> + return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
> +}
> +
> +unsigned long long
> +qux (unsigned long long x)
> +{
> + return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
> +}
>
>
> Jakub
>
--
H.J.
More information about the Gcc-patches
mailing list