[PATCH] Improve integer bit test on atomic builtin return

Richard Biener <rguenther@suse.de>
Tue Oct 5 10:07:30 GMT 2021


On Mon, 4 Oct 2021, H.J. Lu wrote:

> commit adedd5c173388ae505470df152b9cb3947339566
> Author: Jakub Jelinek <jakub@redhat.com>
> Date:   Tue May 3 13:37:25 2016 +0200
> 
>     re PR target/49244 (__sync or __atomic builtins will not emit 'lock bts/btr/btc')
> 
> optimized the bit test on an atomic builtin's return value into lock
> bts/btr/btc.  But it only works for unsigned integers, since the atomic
> builtins operate on the 'uintptr_t' type.  It fails on bool:
> 
>   _1 = atomic builtin;
>   _4 = (_Bool) _1;
> 
> and signed integers:
> 
>   _1 = atomic builtin;
>   _2 = (int) _1;
>   _5 = _2 & (1 << N);
> 
> Improve the bit test on an atomic builtin's return value by converting:
> 
>   _1 = atomic builtin;
>   _4 = (_Bool) _1;
> 
> to
> 
>   _1 = atomic builtin;
>   _5 = _1 & (1 << 0);
>   _4 = (_Bool) _5;
> 
> and converting:
> 
>   _1 = atomic builtin;
>   _2 = (int) _1;
>   _5 = _2 & (1 << N);
> 
> to
> 
>   _1 = atomic builtin;
>   _6 = _1 & (1 << N);
>   _5 = (int) _6;
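> 
> For reference, the kind of source that produces these two GIMPLE shapes
> (illustrative sketch only, function names are placeholders; the bool
> case is exactly what the new pr102566-2.c test below exercises, the
> signed case is merely one plausible way to end up with the (int) cast):
> 
>   #include <stdatomic.h>
>   #include <stdbool.h>
> 
>   /* bool case: as described above, the `& 1' ends up folded into the
>      (_Bool) conversion, leaving _4 = (_Bool) _1 on the atomic result.  */
>   bool
>   bit0_set (_Atomic int *v)
>   {
>     return atomic_fetch_or_explicit (v, 1, memory_order_relaxed) & 1;
>   }
> 
>   /* signed case (assumed shape): the unsigned atomic result is cast
>      back to int before the bit test, giving
>        _2 = (int) _1;
>        _5 = _2 & mask;  */
>   int
>   bitN_set (unsigned int *v, int bit)
>   {
>     int mask = 1 << bit;
>     return ((int) __atomic_fetch_or (v, mask, __ATOMIC_RELAXED)) & mask;
>   }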

Why not do this last bit with match.pd patterns (and independently of
whether _1 is defined by an atomic builtin)?  For the first suggested
transform, that's likely going to be undone by folding, no?

Richard.

> gcc/
> 
> 	PR middle-end/102566
> 	* tree-ssa-ccp.c (optimize_atomic_bit_test_and): Handle cast
> 	between atomic builtin and bit test.
> 
> gcc/testsuite/
> 
> 	PR middle-end/102566
> 	* g++.target/i386/pr102566-1.C: New test.
> 	* gcc.target/i386/pr102566-1a.c: Likewise.
> 	* gcc.target/i386/pr102566-1b.c: Likewise.
> 	* gcc.target/i386/pr102566-2.c: Likewise.
> ---
>  gcc/testsuite/g++.target/i386/pr102566-1.C  |  12 ++
>  gcc/testsuite/gcc.target/i386/pr102566-1a.c | 188 ++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr102566-1b.c | 107 +++++++++++
>  gcc/testsuite/gcc.target/i386/pr102566-2.c  |  14 ++
>  gcc/tree-ssa-ccp.c                          | 136 +++++++++++++-
>  5 files changed, 452 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/g++.target/i386/pr102566-1.C
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-1a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-1b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-2.c
> 
> diff --git a/gcc/testsuite/g++.target/i386/pr102566-1.C b/gcc/testsuite/g++.target/i386/pr102566-1.C
> new file mode 100644
> index 00000000000..6e33298d8bf
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/i386/pr102566-1.C
> @@ -0,0 +1,12 @@
> +/* { dg-do compile { target c++11 } } */
> +/* { dg-options "-O2" } */
> +
> +#include <atomic>
> +
> +bool tbit(std::atomic<int> &i)
> +{
> +  return i.fetch_or(1, std::memory_order_relaxed) & 1;
> +}
> +
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btsl" 1 } } */
> +/* { dg-final { scan-assembler-not "cmpxchg" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr102566-1a.c b/gcc/testsuite/gcc.target/i386/pr102566-1a.c
> new file mode 100644
> index 00000000000..a915de354e5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102566-1a.c
> @@ -0,0 +1,188 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +void bar (void);
> +
> +__attribute__((noinline, noclone)) int
> +f1 (int *a, int bit)
> +{
> +  int mask = 1 << bit;
> +  return (__sync_fetch_and_or (a, mask) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f2 (int *a, int bit)
> +{
> +  int mask = 1 << bit;
> +  int t1 = __atomic_fetch_or (a, mask, __ATOMIC_RELAXED);
> +  int t2 = t1 & mask;
> +  return t2 != 0;
> +}
> +
> +__attribute__((noinline, noclone)) long int
> +f3 (long int *a, int bit)
> +{
> +  long int mask = 1l << bit;
> +  return (__atomic_fetch_or (a, mask, __ATOMIC_SEQ_CST) & mask) == 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f4 (int *a)
> +{
> +  int mask = 1 << 7;
> +  return (__sync_fetch_and_or (a, mask) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f5 (int *a)
> +{
> +  int mask = 1 << 13;
> +  return (__atomic_fetch_or (a, mask, __ATOMIC_RELAXED) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f6 (int *a)
> +{
> +  int mask = 1 << 0;
> +  return (__atomic_fetch_or (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f7 (int *a, int bit)
> +{
> +  int mask = 1 << bit;
> +  if ((__sync_fetch_and_xor (a, mask) & mask) != 0)
> +    bar ();
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f8 (int *a, int bit)
> +{
> +  int mask = 1 << bit;
> +  if ((__atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask) == 0)
> +    bar ();
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f9 (int *a, int bit)
> +{
> +  int mask = 1 << bit;
> +  return (__atomic_fetch_xor (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f10 (int *a)
> +{
> +  int mask = 1 << 7;
> +  return (__sync_fetch_and_xor (a, mask) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f11 (int *a)
> +{
> +  int mask = 1 << 13;
> +  return (__atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f12 (int *a)
> +{
> +  int mask = 1 << 0;
> +  return (__atomic_fetch_xor (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f13 (int *a, int bit)
> +{
> +  int mask = 1 << bit;
> +  return (__sync_fetch_and_and (a, ~mask) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f14 (int *a, int bit)
> +{
> +  int mask = 1 << bit;
> +  return (__atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f15 (int *a, int bit)
> +{
> +  int mask = 1 << bit;
> +  return (__atomic_fetch_and (a, ~mask, __ATOMIC_SEQ_CST) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f16 (int *a)
> +{
> +  int mask = 1 << 7;
> +  return (__sync_fetch_and_and (a, ~mask) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f17 (int *a)
> +{
> +  int mask = 1 << 13;
> +  return (__atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f18 (int *a)
> +{
> +  int mask = 1 << 0;
> +  return (__atomic_fetch_and (a, ~mask, __ATOMIC_SEQ_CST) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) long int
> +f19 (long int *a, int bit)
> +{
> +  long int mask = 1l << bit;
> +  return (__atomic_xor_fetch (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) long int
> +f20 (long int *a)
> +{
> +  long int mask = 1l << 7;
> +  return (__atomic_xor_fetch (a, mask, __ATOMIC_SEQ_CST) & mask) == 0;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f21 (int *a, int bit)
> +{
> +  int mask = 1 << bit;
> +  return (__sync_fetch_and_or (a, mask) & mask);
> +}
> +
> +__attribute__((noinline, noclone)) long int
> +f22 (long int *a)
> +{
> +  long int mask = 1l << 7;
> +  return (__atomic_xor_fetch (a, mask, __ATOMIC_SEQ_CST) & mask);
> +}
> +
> +__attribute__((noinline, noclone)) long int
> +f23 (long int *a)
> +{
> +  long int mask = 1l << 7;
> +  return (__atomic_fetch_xor (a, mask, __ATOMIC_SEQ_CST) & mask);
> +}
> +
> +__attribute__((noinline, noclone)) short int
> +f24 (short int *a)
> +{
> +  short int mask = 1 << 7;
> +  return (__sync_fetch_and_or (a, mask) & mask) != 0;
> +}
> +
> +__attribute__((noinline, noclone)) short int
> +f25 (short int *a)
> +{
> +  short int mask = 1 << 7;
> +  return (__atomic_fetch_or (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
> +}
> +
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*bts" 9 } } */
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btc" 10 } } */
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btr" 6 } } */
> +/* { dg-final { scan-assembler-not "cmpxchg" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr102566-1b.c b/gcc/testsuite/gcc.target/i386/pr102566-1b.c
> new file mode 100644
> index 00000000000..c4dab8135c7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102566-1b.c
> @@ -0,0 +1,107 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -g" } */
> +
> +int cnt;
> +
> +__attribute__((noinline, noclone)) void
> +bar (void)
> +{
> +  cnt++;
> +}
> +
> +#include "pr102566-1a.c"
> +
> +int a;
> +long int b;
> +unsigned long int c;
> +unsigned short int d;
> +
> +int
> +main ()
> +{
> +  __atomic_store_n (&a, 15, __ATOMIC_RELAXED);
> +  if (f1 (&a, 2) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 15
> +      || f1 (&a, 4) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 31)
> +    __builtin_abort ();
> +  if (f2 (&a, 1) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 31
> +      || f2 (&a, 5) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 63)
> +    __builtin_abort ();
> +  __atomic_store_n (&b, 24, __ATOMIC_RELAXED);
> +  if (f3 (&b, 2) != 1 || __atomic_load_n (&b, __ATOMIC_RELAXED) != 28
> +      || f3 (&b, 3) != 0 || __atomic_load_n (&b, __ATOMIC_RELAXED) != 28)
> +    __builtin_abort ();
> +  __atomic_store_n (&a, 0, __ATOMIC_RELAXED);
> +  if (f4 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 128
> +      || f4 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 128)
> +    __builtin_abort ();
> +  if (f5 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8320
> +      || f5 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8320)
> +    __builtin_abort ();
> +  if (f6 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321
> +      || f6 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
> +    __builtin_abort ();
> +  if (cnt != 0
> +      || (f7 (&a, 7), cnt) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
> +      || (f7 (&a, 7), cnt) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
> +    __builtin_abort ();
> +  if ((f8 (&a, 7), cnt) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
> +      || (f8 (&a, 7), cnt) != 2 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
> +    __builtin_abort ();
> +  if (f9 (&a, 13) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 129
> +      || f9 (&a, 13) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
> +    __builtin_abort ();
> +  if (f10 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
> +      || f10 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
> +    __builtin_abort ();
> +  if (f11 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 129
> +      || f11 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
> +    __builtin_abort ();
> +  if (f12 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8320
> +      || f12 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
> +    __builtin_abort ();
> +  if (f13 (&a, 7) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
> +      || f13 (&a, 7) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193)
> +    __builtin_abort ();
> +  if (f14 (&a, 13) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1
> +      || f14 (&a, 13) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1)
> +    __builtin_abort ();
> +  if (f15 (&a, 0) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0
> +      || f15 (&a, 0) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0)
> +    __builtin_abort ();
> +  __atomic_store_n (&a, 8321, __ATOMIC_RELAXED);
> +  if (f16 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
> +      || f16 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193)
> +    __builtin_abort ();
> +  if (f17 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1
> +      || f17 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1)
> +    __builtin_abort ();
> +  if (f18 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0
> +      || f18 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0)
> +    __builtin_abort ();
> +  if (f19 (&c, 7) != 1 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 128
> +      || f19 (&c, 7) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 0)
> +    __builtin_abort ();
> +  if (f20 (&c) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 128
> +      || f20 (&c) != 1 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 0)
> +    __builtin_abort ();
> +  __atomic_store_n (&a, 128, __ATOMIC_RELAXED);
> +  if (f21 (&a, 4) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 144
> +      || f21 (&a, 4) != 16 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 144)
> +    __builtin_abort ();
> +  __atomic_store_n (&c, 1, __ATOMIC_RELAXED);
> +  if (f22 (&c) != 128 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 129
> +      || f22 (&c) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 1)
> +    __builtin_abort ();
> +  if (f23 (&c) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 129
> +      || f23 (&c) != 128 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 1)
> +    __builtin_abort ();
> +  if (f24 (&d) != 0 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 128
> +      || f24 (&d) != 1 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 128)
> +    __builtin_abort ();
> +  __atomic_store_n (&d, 1, __ATOMIC_RELAXED);
> +  if (f25 (&d) != 0 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 129
> +      || f25 (&d) != 1 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 129
> +      || cnt != 2)
> +    __builtin_abort ();
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr102566-2.c b/gcc/testsuite/gcc.target/i386/pr102566-2.c
> new file mode 100644
> index 00000000000..d1c30315353
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102566-2.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +#include <stdatomic.h>
> +#include <stdbool.h>
> +
> +bool
> +foo (_Atomic int *v)
> +{
> +  return atomic_fetch_or_explicit (v, 1, memory_order_relaxed) & 1;
> +}
> +
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btsl" 1 } } */
> +/* { dg-final { scan-assembler-not "cmpxchg" } } */
> diff --git a/gcc/tree-ssa-ccp.c b/gcc/tree-ssa-ccp.c
> index 70ce6a4d5b8..a3f7b7f233e 100644
> --- a/gcc/tree-ssa-ccp.c
> +++ b/gcc/tree-ssa-ccp.c
> @@ -3279,10 +3279,115 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
>        || SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs)
>        || !single_imm_use (lhs, &use_p, &use_stmt)
>        || !is_gimple_assign (use_stmt)
> -      || gimple_assign_rhs_code (use_stmt) != BIT_AND_EXPR
>        || !gimple_vdef (call))
>      return;
>  
> +  mask = gimple_call_arg (call, 1);
> +  tree_code rhs_code = gimple_assign_rhs_code (use_stmt);
> +  if (rhs_code != BIT_AND_EXPR)
> +    {
> +      if (rhs_code != NOP_EXPR)
> +	return;
> +
> +      tree nop_lhs = gimple_assign_lhs (use_stmt);
> +      if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (nop_lhs))
> +	return;
> +
> +      tree nop_rhs = gimple_assign_rhs1 (use_stmt);
> +
> +      gimple *g;
> +      gimple_stmt_iterator gsi;
> +      tree var;
> +
> +      if (TREE_CODE (TREE_TYPE (nop_lhs)) == BOOLEAN_TYPE)
> +	{
> +	  /* Convert
> +	     _1 = atomic bit op;
> +	     _4 = (_Bool) _1;
> +	     to
> +	     _1 = atomic bit op;
> +	     _5 = _1 & 1;
> +	     _4 = (_Bool) _5;
> +	   */
> +	  var = make_ssa_name (TREE_TYPE (nop_rhs));
> +	  replace_uses_by (nop_rhs, var);
> +	  g = gimple_build_assign (var, BIT_AND_EXPR, nop_rhs,
> +				   build_int_cst (TREE_TYPE (lhs), 1));
> +	  gsi = gsi_for_stmt (use_stmt);
> +	  gsi_insert_before (&gsi, g, GSI_NEW_STMT);
> +	  use_stmt = g;
> +	}
> +      else if (TYPE_PRECISION (TREE_TYPE (nop_lhs))
> +	       == TYPE_PRECISION (TREE_TYPE (nop_rhs)))
> +	{
> +	  gimple *use_nop_stmt;
> +	  if (!single_imm_use (nop_lhs, &use_p, &use_nop_stmt)
> +	      || !is_gimple_assign (use_nop_stmt)
> +	      || gimple_assign_rhs_code (use_nop_stmt) != BIT_AND_EXPR)
> +	    return;
> +
> +	  tree op_mask = mask;
> +	  if (TREE_CODE (op_mask) == SSA_NAME)
> +	    {
> +	      g = SSA_NAME_DEF_STMT (op_mask);
> +	      if (gimple_assign_rhs_code (g) == NOP_EXPR)
> +		{
> +		  tree mask_nop_lhs = gimple_assign_lhs (g);
> +
> +		  if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (mask_nop_lhs))
> +		    return;
> +
> +		  tree mask_nop_rhs = gimple_assign_rhs1 (g);
> +		  if (TYPE_PRECISION (TREE_TYPE (mask_nop_lhs))
> +		      != TYPE_PRECISION (TREE_TYPE (mask_nop_rhs)))
> +		    return;
> +		  op_mask = mask_nop_rhs;
> +		  g = SSA_NAME_DEF_STMT (op_mask);
> +		}
> +
> +	      if (fn == IFN_ATOMIC_BIT_TEST_AND_RESET)
> +		{
> +		  if (!is_gimple_assign (g)
> +		      || gimple_assign_rhs_code (g) != BIT_NOT_EXPR)
> +		    return;
> +		  tree reset_mask = gimple_assign_rhs1 (g);
> +		  if (TREE_CODE (op_mask) != SSA_NAME)
> +		    return;
> +		  g = SSA_NAME_DEF_STMT (reset_mask);
> +		}
> +
> +	      if (!is_gimple_assign (g)
> +		  || gimple_assign_rhs_code (g) != LSHIFT_EXPR
> +		  || !integer_onep (gimple_assign_rhs1 (g)))
> +		return;
> +	    }
> +
> +	  /* Convert
> +	     _1 = atomic bit op;
> +	     _2 = (int) _1;
> +	     _5 = _2 & N;
> +	     to
> +	     _1 = atomic bit op;
> +	     _6 = _1 & N;
> +	     _5 = (int) _6;
> +	   */
> +	  replace_uses_by (nop_lhs, lhs);
> +	  tree use_nop_lhs = gimple_assign_lhs (use_nop_stmt);
> +	  var = make_ssa_name (TREE_TYPE (use_nop_lhs));
> +	  gimple_assign_set_lhs (use_nop_stmt, var);
> +	  gsi = gsi_for_stmt (use_stmt);
> +	  gsi_remove (&gsi, true);
> +	  release_defs (use_stmt);
> +	  gsi_remove (gsip, true);
> +	  var = build1 (NOP_EXPR, TREE_TYPE (use_nop_lhs), var);
> +	  gsi = gsi_for_stmt (use_nop_stmt);
> +	  g = gimple_build_assign (use_nop_lhs, var);
> +	  gsi_insert_after (&gsi, g, GSI_NEW_STMT);
> +	  use_stmt = use_nop_stmt;
> +	  mask = op_mask;
> +	}
> +    }
> +
>    switch (fn)
>      {
>      case IFN_ATOMIC_BIT_TEST_AND_SET:
> @@ -3301,7 +3406,6 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
>    if (optab_handler (optab, TYPE_MODE (TREE_TYPE (lhs))) == CODE_FOR_nothing)
>      return;
>  
> -  mask = gimple_call_arg (call, 1);
>    tree use_lhs = gimple_assign_lhs (use_stmt);
>    if (!use_lhs)
>      return;
> @@ -3434,18 +3538,40 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
>  	 of the specified bit after the atomic operation (makes only sense
>  	 for xor, otherwise the bit content is compile time known),
>  	 we need to invert the bit.  */
> +      tree mask_convert = mask;
> +      gimple *g_convert = nullptr;
> +      if (!use_bool && TREE_TYPE (lhs) != TREE_TYPE (mask))
> +	{
> +	  mask_convert = make_ssa_name (TREE_TYPE (lhs));
> +	  tree var = build1 (NOP_EXPR, TREE_TYPE (lhs), mask);
> +	  g_convert = gimple_build_assign (mask_convert, var);
> +	}
>        g = gimple_build_assign (make_ssa_name (TREE_TYPE (lhs)),
>  			       BIT_XOR_EXPR, new_lhs,
>  			       use_bool ? build_int_cst (TREE_TYPE (lhs), 1)
> -					: mask);
> +					: mask_convert);
>        new_lhs = gimple_assign_lhs (g);
>        if (throws)
>  	{
> -	  gsi_insert_on_edge_immediate (e, g);
> +	  if (g_convert)
> +	    {
> +	      gsi_insert_on_edge_immediate (e, g_convert);
> +	      gsi = gsi_for_stmt (g_convert);
> +	      gsi_insert_after (&gsi, g, GSI_NEW_STMT);
> +	    }
> +	  else
> +	    gsi_insert_on_edge_immediate (e, g);
>  	  gsi = gsi_for_stmt (g);
>  	}
>        else
> -	gsi_insert_after (&gsi, g, GSI_NEW_STMT);
> +	{
> +	  if (g_convert)
> +	    {
> +	      gsi_insert_after (&gsi, g_convert, GSI_NEW_STMT);
> +	      gsi = gsi_for_stmt (g_convert);
> +	    }
> +	  gsi_insert_after (&gsi, g, GSI_NEW_STMT);
> +	}
>      }
>    if (use_bool && has_debug_uses)
>      {
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

