This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug target/81501] New: Unnecessary calls to __tls_get_addr() in simple thread-singleton pattern
- From: "jak at jak-linux dot org" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Thu, 20 Jul 2017 21:10:26 +0000
- Subject: [Bug target/81501] New: Unnecessary calls to __tls_get_addr() in simple thread-singleton pattern
- Auto-submitted: auto-generated
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81501
Bug ID: 81501
Summary: Unnecessary calls to __tls_get_addr() in simple
thread-singleton pattern
Product: gcc
Version: 7.1.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: jak@jak-linux.org
Target Milestone: ---
I only tested this on amd64, but see for yourself:
+ cat t.cc
struct foo {
foo();
~foo();
};
foo *test() {
static thread_local foo foo_tls;
return &foo_tls;
}
+ g++-7 -std=c++14 -v -pthread -fPIC -shared -O2 -o gcc.so t.cc
Using built-in specs.
COLLECT_GCC=/usr/bin/g++-7
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/7/lto-wrapper
OFFLOAD_TARGET_NAMES=nvptx-none
OFFLOAD_TARGET_DEFAULT=1
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Debian 7.1.0-9'
--with-bugurl=file:///usr/share/doc/gcc-7/README.Bugs
--enable-languages=c,ada,c++,go,brig,d,fortran,objc,obj-c++ --prefix=/usr
--with-gcc-major-version-only --program-suffix=-7
--program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id
--libexecdir=/usr/lib --without-included-gettext --enable-threads=posix
--libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu
--enable-libstdcxx-debug --enable-libstdcxx-time=yes
--with-default-libstdcxx-abi=new --enable-gnu-unique-object
--disable-vtable-verify --enable-libmpx --enable-plugin --enable-default-pie
--with-system-zlib --with-target-system-zlib --enable-objc-gc=auto
--enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64
--with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic
--enable-offload-targets=nvptx-none --without-cuda-driver
--enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu
--target=x86_64-linux-gnu
Thread model: posix
gcc version 7.1.0 (Debian 7.1.0-9)
COLLECT_GCC_OPTIONS='-std=c++14' '-v' '-pthread' '-fPIC' '-shared' '-O2' '-o'
'gcc.so' '-shared-libgcc' '-mtune=generic' '-march=x86-64'
/usr/lib/gcc/x86_64-linux-gnu/7/cc1plus -quiet -v -imultiarch x86_64-linux-gnu
-D_GNU_SOURCE -D_REENTRANT t.cc -quiet -dumpbase t.cc -mtune=generic
-march=x86-64 -auxbase t -O2 -std=c++14 -version -fPIC -o /tmp/ccdUrCDS.s
GNU C++14 (Debian 7.1.0-9) version 7.1.0 (x86_64-linux-gnu)
compiled by GNU C version 7.1.0, GMP version 6.1.2, MPFR version 3.1.5,
MPC version 1.0.3, isl version isl-0.18-GMP
GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
ignoring duplicate directory "/usr/include/x86_64-linux-gnu/c++/7"
ignoring nonexistent directory "/usr/local/include/x86_64-linux-gnu"
ignoring nonexistent directory
"/usr/lib/gcc/x86_64-linux-gnu/7/../../../../x86_64-linux-gnu/include"
#include "..." search starts here:
#include <...> search starts here:
/usr/include/c++/7
/usr/include/x86_64-linux-gnu/c++/7
/usr/include/c++/7/backward
/usr/lib/gcc/x86_64-linux-gnu/7/include
/usr/local/include
/usr/lib/gcc/x86_64-linux-gnu/7/include-fixed
/usr/include/x86_64-linux-gnu
/usr/include
End of search list.
GNU C++14 (Debian 7.1.0-9) version 7.1.0 (x86_64-linux-gnu)
compiled by GNU C version 7.1.0, GMP version 6.1.2, MPFR version 3.1.5,
MPC version 1.0.3, isl version isl-0.18-GMP
GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
Compiler executable checksum: 3681302eda59faba4e53a905eca4bf72
COLLECT_GCC_OPTIONS='-std=c++14' '-v' '-pthread' '-fPIC' '-shared' '-O2' '-o'
'gcc.so' '-shared-libgcc' '-mtune=generic' '-march=x86-64'
as -v --64 -o /tmp/ccI2B3TO.o /tmp/ccdUrCDS.s
GNU assembler version 2.28 (x86_64-linux-gnu) using BFD version (GNU Binutils
for Debian) 2.28
COMPILER_PATH=/usr/lib/gcc/x86_64-linux-gnu/7/:/usr/lib/gcc/x86_64-linux-gnu/7/:/usr/lib/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/7/:/usr/lib/gcc/x86_64-linux-gnu/
LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/7/:/usr/lib/gcc/x86_64-linux-gnu/7/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/7/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/7/../../../:/lib/:/usr/lib/
COLLECT_GCC_OPTIONS='-std=c++14' '-v' '-pthread' '-fPIC' '-shared' '-O2' '-o'
'gcc.so' '-shared-libgcc' '-mtune=generic' '-march=x86-64'
/usr/lib/gcc/x86_64-linux-gnu/7/collect2 -plugin
/usr/lib/gcc/x86_64-linux-gnu/7/liblto_plugin.so
-plugin-opt=/usr/lib/gcc/x86_64-linux-gnu/7/lto-wrapper
-plugin-opt=-fresolution=/tmp/cc9S0zbL.res -plugin-opt=-pass-through=-lgcc_s
-plugin-opt=-pass-through=-lpthread -plugin-opt=-pass-through=-lc
-plugin-opt=-pass-through=-lgcc_s --sysroot=/ --build-id --eh-frame-hdr -m
elf_x86_64 --hash-style=gnu -shared -o gcc.so
/usr/lib/gcc/x86_64-linux-gnu/7/../../../x86_64-linux-gnu/crti.o
/usr/lib/gcc/x86_64-linux-gnu/7/crtbeginS.o -L/usr/lib/gcc/x86_64-linux-gnu/7
-L/usr/lib/gcc/x86_64-linux-gnu/7/../../../x86_64-linux-gnu
-L/usr/lib/gcc/x86_64-linux-gnu/7/../../../../lib -L/lib/x86_64-linux-gnu
-L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib
-L/usr/lib/gcc/x86_64-linux-gnu/7/../../.. /tmp/ccI2B3TO.o -lstdc++ -lm -lgcc_s
-lpthread -lc -lgcc_s /usr/lib/gcc/x86_64-linux-gnu/7/crtendS.o
/usr/lib/gcc/x86_64-linux-gnu/7/../../../x86_64-linux-gnu/crtn.o
COLLECT_GCC_OPTIONS='-std=c++14' '-v' '-pthread' '-fPIC' '-shared' '-O2' '-o'
'gcc.so' '-shared-libgcc' '-mtune=generic' '-march=x86-64'
+ gdb -q -ex disassemble test -ex quit gcc.so
Reading symbols from gcc.so...(no debugging symbols found)...done.
Dump of assembler code for function _Z4testv:
0x00000000000007f0 <+0>: push %rbx
0x00000000000007f1 <+1>: sub $0x10,%rsp
0x00000000000007f5 <+5>: lea 0x2007cc(%rip),%rdi # 0x200fc8
0x00000000000007fc <+12>: callq 0x6e0 <__tls_get_addr@plt>
0x0000000000000801 <+17>: cmpb $0x0,0x0(%rax)
0x0000000000000808 <+24>: jne 0x840 <_Z4testv+80>
0x000000000000080a <+26>: lea 0x8(%rax),%rbx
0x0000000000000811 <+33>: mov %rax,0x8(%rsp)
0x0000000000000816 <+38>: mov %rbx,%rdi
0x0000000000000819 <+41>: callq 0x6d0 <_ZN3fooC1Ev@plt>
0x000000000000081e <+46>: mov 0x8(%rsp),%rax
0x0000000000000823 <+51>: mov 0x2007b6(%rip),%rdi # 0x200fe0
0x000000000000082a <+58>: lea 0x2007ff(%rip),%rdx # 0x201030
0x0000000000000831 <+65>: mov %rbx,%rsi
0x0000000000000834 <+68>: movb $0x1,0x0(%rax)
0x000000000000083b <+75>: callq 0x6f0 <__cxa_thread_atexit@plt>
0x0000000000000840 <+80>: lea 0x200781(%rip),%rdi # 0x200fc8
0x0000000000000847 <+87>: callq 0x6e0 <__tls_get_addr@plt>
0x000000000000084c <+92>: add $0x10,%rsp
0x0000000000000850 <+96>: add $0x8,%rax
0x0000000000000856 <+102>: pop %rbx
0x0000000000000857 <+103>: retq
End of assembler dump.
As you can see, after the first call to __tls_get_addr(), the jne jumps to a
second call to __tls_get_addr(). It should really only need to get the address
once here, like clang does:
+ clang++ -std=c++14 -pthread -fPIC -shared -O2 -o clang.so t.cc
+ gdb -q -ex disassemble test -ex quit clang.so
Reading symbols from clang.so...(no debugging symbols found)...done.
Dump of assembler code for function _Z4testv:
0x00000000000007a0 <+0>: push %r14
0x00000000000007a2 <+2>: push %rbx
0x00000000000007a3 <+3>: push %rax
0x00000000000007a4 <+4>: lea 0x20081d(%rip),%rdi # 0x200fc8
0x00000000000007ab <+11>: callq 0x690 <__tls_get_addr@plt>
0x00000000000007b0 <+16>: mov %rax,%rbx
0x00000000000007b3 <+19>: mov 0x1(%rax),%al
0x00000000000007b9 <+25>: and $0x1,%al
0x00000000000007bb <+27>: jne 0x7ef <_Z4testv+79>
0x00000000000007bd <+29>: mov %rbx,%rax
0x00000000000007c0 <+32>: lea 0x0(%rax),%r14
0x00000000000007c7 <+39>: mov %r14,%rdi
0x00000000000007ca <+42>: callq 0x680 <_ZN3fooC1Ev@plt>
0x00000000000007cf <+47>: mov 0x20080a(%rip),%rdi # 0x200fe0
0x00000000000007d6 <+54>: lea 0x200853(%rip),%rdx # 0x201030
0x00000000000007dd <+61>: mov %r14,%rsi
0x00000000000007e0 <+64>: callq 0x6a0 <__cxa_thread_atexit@plt>
0x00000000000007e5 <+69>: mov %rbx,%rax
0x00000000000007e8 <+72>: movb $0x1,0x1(%rax)
0x00000000000007ef <+79>: mov %rbx,%rax
0x00000000000007f2 <+82>: lea 0x0(%rax),%rax
0x00000000000007f9 <+89>: add $0x8,%rsp
0x00000000000007fd <+93>: pop %rbx
0x00000000000007fe <+94>: pop %r14
0x0000000000000800 <+96>: retq
End of assembler dump.
This has some performance overhead which I'd like to avoid.