Bug 66867 - Suboptimal code generation for atomic_compare_exchange
Summary: Suboptimal code generation for atomic_compare_exchange
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: middle-end
Version: 6.0
Importance: P3 normal
Target Milestone: ---
Assignee: Jakub Jelinek
URL:
Keywords: missed-optimization
Duplicates: 66713 70825 71599
Depends on:
Blocks:
 
Reported: 2015-07-14 11:17 UTC by Sebastian Huber
Modified: 2016-07-18 11:36 UTC
CC: 6 users

See Also:
Host:
Target: powerpc64*-*-*, x86_64-*-*
Build:
Known to work:
Known to fail:
Last reconfirmed: 2015-09-21 00:00:00


Attachments
gcc7-pr66867-wip.patch (2.48 KB, patch)
2016-06-21 16:38 UTC, Jakub Jelinek
gcc7-pr66867.patch (5.21 KB, patch)
2016-06-22 13:48 UTC, Jakub Jelinek

Description Sebastian Huber 2015-07-14 11:17:33 UTC
At least on ARM and PowerPC, for the following test case

#include <stdatomic.h>

void f(atomic_uint *a)
{
  unsigned int e = 0;
  atomic_compare_exchange_strong_explicit(a, &e, 1, memory_order_relaxed, memory_order_relaxed);
}

a superfluous stack frame and store are generated:

        .file   "test-cas.c"
        .machine ppc
        .section        ".text"
        .align 2
        .globl f
        .type   f, @function
f:
        stwu 1,-24(1) <- Superfluous
        li 9,0
        li 10,1
        stw 9,8(1) <- Superfluous
.L2:
        lwarx 9,0,3
        cmpwi 0,9,0
        bne- 0,.L3
        stwcx. 10,0,3
        bne- 0,.L2
.L3:
        addi 1,1,24 <- Superfluous
        blr
        .size   f, .-f
        .ident  "GCC: (GNU) 6.0.0 20150714 (experimental)"
Comment 1 Sebastian Huber 2015-07-17 20:42:57 UTC
This problem is also present on x86.  The deprecated __sync_bool_compare_and_swap() produces better code.

#include <stdatomic.h>

void f(atomic_uint *a)
{
  unsigned int e = 0;
  atomic_compare_exchange_strong_explicit(a, &e, 1, memory_order_relaxed, memory_order_relaxed);
}

void g(unsigned int *a)
{
  __sync_bool_compare_and_swap(a, 0, 1);
}

        .file   "cas.c"
        .section        .text.unlikely,"ax",@progbits
.LCOLDB0:
        .text
.LHOTB0:
        .p2align 4,,15
        .globl  f
        .type   f, @function
f:
.LFB0:
        .cfi_startproc
        movl    $1, %edx
        xorl    %eax, %eax
        movl    $0, -4(%rsp) <- Superfluous
        lock cmpxchgl   %edx, (%rdi)
        ret
        .cfi_endproc
.LFE0:
        .size   f, .-f
        .section        .text.unlikely
.LCOLDE0:
        .text
.LHOTE0:
        .section        .text.unlikely
.LCOLDB1:
        .text
.LHOTB1:
        .p2align 4,,15
        .globl  g
        .type   g, @function
g:
.LFB1:
        .cfi_startproc
        movl    $1, %edx
        xorl    %eax, %eax
        lock cmpxchgl   %edx, (%rdi)
        ret
        .cfi_endproc
.LFE1:
        .size   g, .-g
        .section        .text.unlikely
.LCOLDE1:
        .text
.LHOTE1:
        .ident  "GCC: (GNU) 6.0.0 20150717 (experimental)"
        .section        .note.GNU-stack,"",@progbits
Comment 2 Alan Modra 2015-09-21 07:17:28 UTC
Confirmed.  Here's another related testcase showing unnecessary stack memory writes and reads on both powerpc64le and x86_64.

int test2 (int *ptr, int value, int comparand)
{
  __atomic_compare_exchange_n (ptr, &comparand, value,
			       false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  return comparand;
}
Comment 3 Sebastian Huber 2015-12-15 13:57:37 UTC
clang 3.7 generates optimal code on x86 in both cases:

        .text
        .file   "test.c"
        .globl  f
        .align  16, 0x90
        .type   f,@function
f:                                      # @f
        .cfi_startproc
# BB#0:
        movl    $1, %ecx
        xorl    %eax, %eax
        lock            cmpxchgl        %ecx, (%rdi)
        retq
.Lfunc_end0:
        .size   f, .Lfunc_end0-f
        .cfi_endproc

        .globl  g
        .align  16, 0x90
        .type   g,@function
g:                                      # @g
        .cfi_startproc
# BB#0:
        movl    $1, %ecx
        xorl    %eax, %eax
        lock            cmpxchgl        %ecx, (%rdi)
        retq
.Lfunc_end1:
        .size   g, .Lfunc_end1-g
        .cfi_endproc


        .ident  "clang version 3.7.0 (tags/RELEASE_370/final)"
        .section        ".note.GNU-stack","",@progbits
Comment 4 Andrew Pinski 2016-06-20 19:24:31 UTC
*** Bug 70825 has been marked as a duplicate of this bug. ***
Comment 5 Andrew Pinski 2016-06-20 19:24:38 UTC
*** Bug 66713 has been marked as a duplicate of this bug. ***
Comment 6 Andrew Pinski 2016-06-20 19:24:49 UTC
*** Bug 71599 has been marked as a duplicate of this bug. ***
Comment 7 Jakub Jelinek 2016-06-21 16:38:21 UTC
Created attachment 38740
gcc7-pr66867-wip.patch

WIP untested patch using the same technique as we use for the __builtin_*_overflow*
builtins.  I still need to finish writing the expansion for the case where we should expand it to external calls; once that is tested, disable the ifn folding for !flag_inline_atomics, fix up -fsanitize=threads, and use it in the other spots that look at atomic builtins.
And we need some optimization that would handle the case where the REALPART_EXPR is conditionally stored into a non-addressable var - we can store the oldval there unconditionally then, because that can't create a race.
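
A hand-written C model of the intended shape (illustrative only, not GCC internals; struct cas_result and cas_pair are hypothetical names): the internal function hands back the old value and the success flag as a pair - in GIMPLE a complex value whose REALPART_EXPR is the oldval and whose IMAGPART_EXPR is the success bool - so the expected value no longer needs an addressable stack slot.

#include <stdatomic.h>
#include <stdbool.h>

/* Hypothetical model of the IFN_ATOMIC_COMPARE_EXCHANGE result:
   'old' plays the role of the REALPART_EXPR (the value observed in
   *a), 'success' that of the IMAGPART_EXPR (whether the exchange
   happened).  */
struct cas_result { unsigned old; bool success; };

struct cas_result
cas_pair (atomic_uint *a, unsigned expected, unsigned desired)
{
  /* 'expected' is a by-value copy, so the caller's variable never
     has its address taken and can stay in a register.  */
  bool ok = atomic_compare_exchange_strong_explicit (a, &expected, desired,
                                                     memory_order_relaxed,
                                                     memory_order_relaxed);
  return (struct cas_result) { expected, ok };
}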
Comment 8 Jakub Jelinek 2016-06-22 13:48:23 UTC
Created attachment 38749
gcc7-pr66867.patch

Patch I'm going to bootstrap/regtest.
Comment 9 Jakub Jelinek 2016-06-28 08:27:51 UTC
Author: jakub
Date: Tue Jun 28 08:27:18 2016
New Revision: 237814

URL: https://gcc.gnu.org/viewcvs?rev=237814&root=gcc&view=rev
Log:
	PR middle-end/66867
	* builtins.c (expand_ifn_atomic_compare_exchange_into_call,
	expand_ifn_atomic_compare_exchange): New functions.
	* internal-fn.c (expand_ATOMIC_COMPARE_EXCHANGE): New function.
	* tree.h (build_call_expr_internal_loc): Rename to ...
	(build_call_expr_internal_loc_array): ... this.  Fix up type of
	last argument.
	* internal-fn.def (ATOMIC_COMPARE_EXCHANGE): New internal fn.
	* predict.c (expr_expected_value_1): Handle IMAGPART_EXPR of
	ATOMIC_COMPARE_EXCHANGE result.
	* builtins.h (expand_ifn_atomic_compare_exchange): New prototype.
	* gimple-fold.h (optimize_atomic_compare_exchange_p,
	fold_builtin_atomic_compare_exchange): New prototypes.
	* gimple-fold.c (optimize_atomic_compare_exchange_p,
	fold_builtin_atomic_compare_exchange): New functions.
	* tree-ssa.c (execute_update_addresses_taken): If
	optimize_atomic_compare_exchange_p, ignore &var in 2nd argument
	of call when finding addressable vars, and if such var becomes
	non-addressable, call fold_builtin_atomic_compare_exchange.

Modified:
    trunk/gcc/ChangeLog
    trunk/gcc/builtins.c
    trunk/gcc/builtins.h
    trunk/gcc/gimple-fold.c
    trunk/gcc/gimple-fold.h
    trunk/gcc/internal-fn.c
    trunk/gcc/internal-fn.def
    trunk/gcc/predict.c
    trunk/gcc/tree-ssa.c
    trunk/gcc/tree.h
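
A sketch of the source-level pattern the tree-ssa.c change targets (hand-written illustration, not part of the testsuite): 'e' only has its address taken by the compare-exchange call, so the pass can now treat it as non-addressable and fold the builtin into the internal fn.

#include <stdatomic.h>

int h (atomic_uint *a)
{
  unsigned int e = 0;  /* address taken only by the call below */
  if (atomic_compare_exchange_strong_explicit (a, &e, 1,
                                               memory_order_relaxed,
                                               memory_order_relaxed))
    return -1;
  return e;  /* the oldval, now available without a stack slot */
}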
Comment 10 Jakub Jelinek 2016-06-28 09:34:59 UTC
Hopefully fixed for 7+.
Comment 11 dhowells@redhat.com 2016-07-18 11:36:25 UTC
I applied the patch to the Fedora cross-gcc-6.1.1 rpm with one minor fixup.  Using the example code I added in bug 70825 I now see:

0000000000000000 <test_atomic_cmpxchg>:
   0:   ba 2a 00 00 00          mov    $0x2a,%edx
   5:   b8 17 00 00 00          mov    $0x17,%eax
   a:   f0 0f b1 17             lock cmpxchg %edx,(%rdi)
   e:   c3                      retq   

there's now no extraneous store before the locked instruction.  And:

000000000000000f <test_atomic_cmpxchg_2>:
   f:   ba 2a 00 00 00          mov    $0x2a,%edx
  14:   b8 17 00 00 00          mov    $0x17,%eax
  19:   f0 0f b1 17             lock cmpxchg %edx,(%rdi)
  1d:   c3                      retq   

it now just passes the return value of cmpxchg back directly, without potentially putting it on and off the stack and possibly jumping around that bit.  And:

0000000000000043 <test_atomic_cmpxchg_B>:
  43:   ba 2a 00 00 00          mov    $0x2a,%edx
  48:   b8 17 00 00 00          mov    $0x17,%eax
  4d:   f0 0f b1 17             lock cmpxchg %edx,(%rdi)
  51:   c3                      retq   

where it makes no difference how the return statements are constructed in C.
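
For reference, a hypothetical reconstruction of the kind of test behind the disassembly above (the actual code is attached to bug 70825; only the constants 0x17 and 0x2a are taken from the instruction operands):

#include <stdatomic.h>

/* Hypothetical stand-in for the bug 70825 test, not the real code.  */
unsigned int test_atomic_cmpxchg (atomic_uint *v)
{
  unsigned int old = 0x17;
  atomic_compare_exchange_strong_explicit (v, &old, 0x2a,
                                           memory_order_relaxed,
                                           memory_order_relaxed);
  return old;  /* comes straight back in %eax after lock cmpxchg */
}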

I've also booted a kernel built with the patched compiler.