Consider the following test case: #include <stdatomic.h> atomic_int foo; void plusPlus(void) { foo++; } void fetchAdd(void) { atomic_fetch_add(&foo, 1); } The assembly for plusPlus and fetchAdd should be the same, since the two perform the same operation. However, GCC 5.3 with -O2 -c yields: 0000000000000000 <plusPlus>: 0: 8b 05 00 00 00 00 mov 0x0(%rip),%eax # 6 <plusPlus+0x6> 6: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1) d: 00 00 00 10: 89 44 24 fc mov %eax,-0x4(%rsp) 14: 8d 50 01 lea 0x1(%rax),%edx 17: 8b 44 24 fc mov -0x4(%rsp),%eax 1b: f0 0f b1 15 00 00 00 lock cmpxchg %edx,0x0(%rip) # 23 <plusPlus+0x23> 22: 00 23: 75 eb jne 10 <plusPlus+0x10> 25: f3 c3 repz retq 27: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1) 2e: 00 00 0000000000000030 <fetchAdd>: 30: f0 83 05 00 00 00 00 lock addl $0x1,0x0(%rip) # 38 <fetchAdd+0x8> 37: 01 38: c3 retq It seems like instead of a "simple" atomic increment, foo++ does the following: 1. Atomically load the value (with full sequential consistency). 2. Increment 3. Compare and swap the original value (with full sequential consistency). 4. If the CAS didn't take, go to step 1. While this produces the desired effect (atomically adding one), there's obviously much more going on than there should be, and the extra memory barriers around each step are probably quite unfriendly to performance (especially if there are several threads accessing the value). The problem exists on ARM as well (where I originally discovered this issue), so it doesn't seem to be target dependent: 00000000 <plusPlus>: 0: b510 push {r4, lr} 2: 4a0f ldr r2, [pc, #60] ; (40 <plusPlus+0x40>) 4: f3bf 8f5f dmb sy 8: b082 sub sp, #8 a: ac02 add r4, sp, #8 c: 6813 ldr r3, [r2, #0] e: f3bf 8f5f dmb sy 12: f844 3d04 str.w r3, [r4, #-4]! 16: 1c58 adds r0, r3, #1 18: 6821 ldr r1, [r4, #0] 1a: f3bf 8f5f dmb sy 1e: e852 3f00 ldrex r3, [r2] 22: 428b cmp r3, r1 24: d104 bne.n 30 <plusPlus+0x30> 26: e842 0e00 strex lr, r0, [r2] 2a: f1be 0f00 cmp.w lr, #0 2e: d1f6 bne.n 1e <plusPlus+0x1e> 30: f3bf 8f5f dmb sy 34: d101 bne.n 3a <plusPlus+0x3a> 36: b002 add sp, #8 38: bd10 pop {r4, pc} 3a: 6023 str r3, [r4, #0] 3c: e7eb b.n 16 <plusPlus+0x16> 3e: bf00 nop 40: 00000000 .word 0x00000000 00000044 <fetchAdd>: 44: 4b06 ldr r3, [pc, #24] ; (60 <fetchAdd+0x1c>) 46: f3bf 8f5f dmb sy 4a: e853 2f00 ldrex r2, [r3] 4e: 3201 adds r2, #1 50: e843 2100 strex r1, r2, [r3] 54: 2900 cmp r1, #0 56: d1f8 bne.n 4a <fetchAdd+0x6> 58: f3bf 8f5f dmb sy 5c: 4770 bx lr 5e: bf00 nop 60: 00000000 .word 0x00000000 (compiled using arm-none-eabi-gcc version 5.3 with -O2 -mthumb -mcpu=cortex-m4) What's curious is that the C++ frontend doesn't seem to have this problem. With g++ -std=c++11 -O2 -c and the equivalent C++: #include <atomic> std::atomic_int foo; void plusPlus() { foo++; } void fetchAdd() { foo.fetch_add(1); } the compiler recognizes it's the same and implements one as a jump to the other: 0000000000000000 <fetchAdd()>: 0: f0 83 05 00 00 00 00 lock addl $0x1,0x0(%rip) # 8 <fetchAdd()+0x8> 7: 01 8: c3 retq 9: 0f 1f 80 00 00 00 00 nopl 0x0(%rax) 0000000000000010 <plusPlus()>: 10: e9 00 00 00 00 jmpq 15 <plusPlus()+0x5> Clang also generates the same code for both (using clang and clang++): 0000000000000000 <plusPlus>: 0: f0 ff 05 00 00 00 00 lock incl 0x0(%rip) # 7 <plusPlus+0x7> 7: c3 retq 8: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1) f: 00 0000000000000010 <fetchAdd>: 10: f0 ff 05 00 00 00 00 lock incl 0x0(%rip) # 17 <fetchAdd+0x7> 17: c3 retq Obligatory information: $ gcc -v Using built-in specs. COLLECT_GCC=gcc COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-unknown-linux-gnu/5.3.0/lto-wrapper Target: x86_64-unknown-linux-gnu Configured with: /build/gcc-multilib/src/gcc-5-20160209/configure --prefix=/usr --libdir=/usr/lib --libexecdir=/usr/lib --mandir=/usr/share/man --infodir=/usr/share/info --with-bugurl=https://bugs.archlinux.org/ --enable-languages=c,c++,ada,fortran,go,lto,objc,obj-c++ --enable-shared --enable-threads=posix --enable-libmpx --with-system-zlib --with-isl --enable-__cxa_atexit --disable-libunwind-exceptions --enable-clocale=gnu --disable-libstdcxx-pch --disable-libssp --enable-gnu-unique-object --enable-linker-build-id --enable-lto --enable-plugin --enable-install-libiberty --with-linker-hash-style=gnu --enable-gnu-indirect-function --enable-multilib --disable-werror --enable-checking=release Thread model: posix gcc version 5.3.0 (GCC) $ arm-none-eabi-gcc -v Using built-in specs. COLLECT_GCC=arm-none-eabi-gcc COLLECT_LTO_WRAPPER=/usr/lib/gcc/arm-none-eabi/5.3.0/lto-wrapper Target: arm-none-eabi Configured with: /build/arm-none-eabi-gcc/src/gcc-5.3.0/configure --target=arm-none-eabi --prefix=/usr --with-sysroot=/usr/arm-none-eabi --with-native-system-header-dir=/include --libexecdir=/usr/lib --enable-languages=c,c++ --enable-plugins --disable-decimal-float --disable-libffi --disable-libgomp --disable-libmudflap --disable-libquadmath --disable-libssp --disable-libstdcxx-pch --disable-nls --disable-shared --disable-threads --disable-tls --with-gnu-as --with-gnu-ld --with-system-zlib --with-newlib --with-headers=/usr/arm-none-eabi/include --with-python-dir=share/gcc-arm-none-eabi --with-gmp --with-mpfr --with-mpc --with-isl --with-libelf --enable-gnu-indirect-function --with-host-libstdcxx='-static-libgcc -Wl,-Bstatic,-lstdc++,-Bdynamic -lm' --with-pkgversion='Arch Repository' --with-bugurl=https://bugs.archlinux.org/ --with-multilib-list=armv6-m,armv7-m,armv7e-m,armv7-r Thread model: single gcc version 5.3.0 (Arch Repository)
Dup of PR68908. *** This bug has been marked as a duplicate of bug 68908 ***