This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [RFC PATCH] Optimized __cxa_guard_{acquire,release,abort} for Linux
- From: "Richard Guenther" <richard dot guenther at gmail dot com>
- To: "Jakub Jelinek" <jakub at redhat dot com>
- Cc: "Benjamin Kosnik" <bkoz at redhat dot com>, "Jason Merrill" <jason at redhat dot com>, gcc-patches at gcc dot gnu dot org, libstdc++ at gcc dot gnu dot org
- Date: Mon, 31 Dec 2007 17:00:12 +0100
- Subject: Re: [RFC PATCH] Optimized __cxa_guard_{acquire,release,abort} for Linux
- References: <20071231133948.GQ20451@devserv.devel.redhat.com>
On Dec 31, 2007 2:39 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> Hi!
>
> While backporting the guard deadlock fix to 4.1, I have noticed that
> on Linux it IMHO uses global mutex (and conditional variable) completely
> unnecessarily, which hurts parallelism if multiple threads initialize
> different static local vars which need guards.
> As shown below, all we IMHO need is futex syscall (just
> FUTEX_WAIT/FUTEX_WAKE, so any Linux kernel with futex syscall will do)
> and supported __sync_val_compare_and_swap.
>
> __guard is 64-bit var, where we only use first byte (0 uninitialized,
> 1 initialized) and second byte (0 init not pending, 1 pending) if __GTHREAD
> (well, on arm eabi first byte is actually 4th byte if big endian).
> With atomic builtins we can update both these flags together atomically. The patch
> below adds (as pure optimization) another bit to the second byte, to avoid
> any syscalls if no other thread calls __cxa_guard_acquire before the first
> thread that called it for a particular guard var calls __cxa_guard_release.
> We then have 4 valid states. __cxa_guard_acquire can atomically transition
> from 0 to pending (in this case it won the race and the calling thread
> will do the initialization) or from pending to pending|waiting (in this
> case it will FUTEX_WAIT until guard is set, or it is aborted back to 0).
> __cxa_guard_release will atomically transition from pending to guard
> or pending|waiting to guard (in the latter case FUTEX_WAIT is used to wake all
> waiters). __cxa_guard_abort will transition pending to 0 or pending|waiting
> to guard (in the latter case FUTEX_WAIT is used to wake all waiters).
>
> On x86_64-linux, if _GLIBCXX_HAVE_FUTEX is defined .text section decreases
> by 1168 bytes and .bss by 160 bytes, so it also decreases libstdc++.so or
> libsupc++.a footprint.
>
> The patch below doesn't include the configury bits yet (they might be very
> similar to enable_linux_futex stuff in libgomp configury).
> Would this be acceptable with the configury bits in even for 4.3 (could be
> e.g. only enabled if explicitly requested with --enable-linux-futex there),
> or just 4.4+?
This would also "fix" PR33960, right? So at least for me it would be
worthwhile to
do for 4.3 ;)
Thanks,
Richard.
> 2007-12-31 Jakub Jelinek <jakub@redhat.com>
>
> * config/cpu/generic/cxxabi_tweaks.h (_GLIBCXX_GUARD_BIT,
> _GLIBCXX_GUARD_PENDING_BIT, _GLIBCXX_GUARD_WAITING_BIT): Define.
> * config/cpu/arm/cxxabi_tweaks.h (_GLIBCXX_GUARD_BIT,
> _GLIBCXX_GUARD_PENDING_BIT, _GLIBCXX_GUARD_WAITING_BIT): Define.
> * libsupc++/guard.cc: Include climits and syscall.h.
> (_GLIBCXX_USE_FUTEX): Define if futex syscall and atomic builtins
> are supported.
> (_GLIBCXX_FUTEX_WAIT, _GLIBCXX_FUTEX_WAKE): Likewise.
> (__guard_test_bit): New static inline.
> (__cxa_guard_acquire, __cxa_guard_release, __cxa_guard_abort): Use
> atomic builtins and futex syscall if _GLIBCXX_USE_FUTEX.
>
> --- libstdc++-v3/config/cpu/arm/cxxabi_tweaks.h.jj 2006-12-08 15:58:16.000000000 +0100
> +++ libstdc++-v3/config/cpu/arm/cxxabi_tweaks.h 2007-12-31 11:45:37.000000000 +0100
> @@ -1,6 +1,6 @@
> // Control various target specific ABI tweaks. ARM version.
>
> -// Copyright (C) 2004, 2006 Free Software Foundation, Inc.
> +// Copyright (C) 2004, 2006, 2007 Free Software Foundation, Inc.
> //
> // This file is part of the GNU ISO C++ Library. This library is free
> // software; you can redistribute it and/or modify it under the
> @@ -46,6 +46,9 @@ namespace __cxxabiv1
> // guard variable. */
> #define _GLIBCXX_GUARD_TEST(x) ((*(x) & 1) != 0)
> #define _GLIBCXX_GUARD_SET(x) *(x) = 1
> +#define _GLIBCXX_GUARD_BIT 1
> +#define _GLIBCXX_GUARD_PENDING_BIT __guard_test_bit (1, 1)
> +#define _GLIBCXX_GUARD_WAITING_BIT __guard_test_bit (1, 2)
> typedef int __guard;
>
> // We also want the element size in array cookies.
> @@ -62,6 +65,9 @@ namespace __cxxabiv1
> // The generic ABI uses the first byte of a 64-bit guard variable.
> #define _GLIBCXX_GUARD_TEST(x) (*(char *) (x) != 0)
> #define _GLIBCXX_GUARD_SET(x) *(char *) (x) = 1
> +#define _GLIBCXX_GUARD_BIT __guard_test_bit (0, 1)
> +#define _GLIBCXX_GUARD_PENDING_BIT __guard_test_bit (1, 1)
> +#define _GLIBCXX_GUARD_WAITING_BIT __guard_test_bit (1, 2)
> __extension__ typedef int __guard __attribute__((mode (__DI__)));
>
> // __cxa_vec_ctor has void return type.
> --- libstdc++-v3/config/cpu/generic/cxxabi_tweaks.h.jj 2006-12-08 15:58:16.000000000 +0100
> +++ libstdc++-v3/config/cpu/generic/cxxabi_tweaks.h 2007-12-31 11:45:44.000000000 +0100
> @@ -1,6 +1,6 @@
> // Control various target specific ABI tweaks. Generic version.
>
> -// Copyright (C) 2004, 2006 Free Software Foundation, Inc.
> +// Copyright (C) 2004, 2006, 2007 Free Software Foundation, Inc.
> //
> // This file is part of the GNU ISO C++ Library. This library is free
> // software; you can redistribute it and/or modify it under the
> @@ -44,6 +44,9 @@ namespace __cxxabiv1
> // The generic ABI uses the first byte of a 64-bit guard variable.
> #define _GLIBCXX_GUARD_TEST(x) (*(char *) (x) != 0)
> #define _GLIBCXX_GUARD_SET(x) *(char *) (x) = 1
> +#define _GLIBCXX_GUARD_BIT __guard_test_bit (0, 1)
> +#define _GLIBCXX_GUARD_PENDING_BIT __guard_test_bit (1, 1)
> +#define _GLIBCXX_GUARD_WAITING_BIT __guard_test_bit (1, 2)
> __extension__ typedef int __guard __attribute__((mode (__DI__)));
>
> // __cxa_vec_ctor has void return type.
> --- libstdc++-v3/libsupc++/guard.cc.jj 2007-10-11 10:54:25.000000000 +0200
> +++ libstdc++-v3/libsupc++/guard.cc 2007-12-31 12:54:24.000000000 +0100
> @@ -35,12 +35,21 @@
> #include <new>
> #include <ext/atomicity.h>
> #include <ext/concurrence.h>
> +#if defined(__GTHREADS) && defined(__GTHREAD_HAS_COND) \
> + && defined(_GLIBCXX_ATOMIC_BUILTINS) && defined(_GLIBCXX_HAVE_FUTEX)
> +# include <climits>
> +# include <syscall.h>
> +# define _GLIBCXX_USE_FUTEX
> +# define _GLIBCXX_FUTEX_WAIT 0
> +# define _GLIBCXX_FUTEX_WAKE 1
> +#endif
>
> // The IA64/generic ABI uses the first byte of the guard variable.
> // The ARM EABI uses the least significant bit.
>
> // Thread-safe static local initialization support.
> #ifdef __GTHREADS
> +# ifndef _GLIBCXX_USE_FUTEX
> namespace
> {
> // A single mutex controlling all static initializations.
> @@ -75,8 +84,9 @@ namespace
> }
> };
> }
> +# endif
>
> -#ifdef __GTHREAD_HAS_COND
> +# if defined(__GTHREAD_HAS_COND) && !defined(_GLIBCXX_USE_FUTEX)
> namespace
> {
> // A single conditional variable controlling all static initializations.
> @@ -98,9 +108,9 @@ namespace
> return *static_cond;
> }
> }
> -#endif
> +# endif
>
> -#ifndef _GLIBCXX_GUARD_TEST_AND_ACQUIRE
> +# ifndef _GLIBCXX_GUARD_TEST_AND_ACQUIRE
> inline bool
> __test_and_acquire (__cxxabiv1::__guard *g)
> {
> @@ -108,24 +118,24 @@ __test_and_acquire (__cxxabiv1::__guard
> _GLIBCXX_READ_MEM_BARRIER;
> return b;
> }
> -#define _GLIBCXX_GUARD_TEST_AND_ACQUIRE(G) __test_and_acquire (G)
> -#endif
> +# define _GLIBCXX_GUARD_TEST_AND_ACQUIRE(G) __test_and_acquire (G)
> +# endif
>
> -#ifndef _GLIBCXX_GUARD_SET_AND_RELEASE
> +# ifndef _GLIBCXX_GUARD_SET_AND_RELEASE
> inline void
> __set_and_release (__cxxabiv1::__guard *g)
> {
> _GLIBCXX_WRITE_MEM_BARRIER;
> _GLIBCXX_GUARD_SET (g);
> }
> -#define _GLIBCXX_GUARD_SET_AND_RELEASE(G) __set_and_release (G)
> -#endif
> +# define _GLIBCXX_GUARD_SET_AND_RELEASE(G) __set_and_release (G)
> +# endif
>
> #else /* !__GTHREADS */
>
> -#undef _GLIBCXX_GUARD_TEST_AND_ACQUIRE
> -#undef _GLIBCXX_GUARD_SET_AND_RELEASE
> -#define _GLIBCXX_GUARD_SET_AND_RELEASE(G) _GLIBCXX_GUARD_SET (G)
> +# undef _GLIBCXX_GUARD_TEST_AND_ACQUIRE
> +# undef _GLIBCXX_GUARD_SET_AND_RELEASE
> +# define _GLIBCXX_GUARD_SET_AND_RELEASE(G) _GLIBCXX_GUARD_SET (G)
>
> #endif /* __GTHREADS */
>
> @@ -176,8 +186,35 @@ namespace __gnu_cxx
> // headers define a symbol __GTHREAD_HAS_COND for platforms that support POSIX
> // like conditional variables. For platforms that do not support conditional
> // variables, we need to fall back to the old code.
> +
> +// If _GLIBCXX_USE_FUTEX, no global mutex or conditional variable is used,
> +// only atomic operations are used together with futex syscall.
> +// Valid values of the first integer in guard are:
> +// 0 No thread encountered the guarded init
> +// yet or it has been aborted.
> +// _GLIBCXX_GUARD_BIT The guarded static var has been successfully
> +// initialized.
> +// _GLIBCXX_GUARD_PENDING_BIT The guarded static var is being initialized
> +// and no other thread is waiting for its
> +// initialization.
> +// (_GLIBCXX_GUARD_PENDING_BIT The guarded static var is being initialized
> +// | _GLIBCXX_GUARD_WAITING_BIT) and some other threads are waiting until
> +// it is initialized.
> +
> namespace __cxxabiv1
> {
> +#ifdef _GLIBCXX_USE_FUTEX
> + namespace
> + {
> + static inline int __guard_test_bit (const int __byte, const int __val)
> + {
> + union { int __i; char __c[sizeof (int)]; } __u = { 0 };
> + __u.__c[__byte] = __val;
> + return __u.__i;
> + }
> + }
> +#endif
> +
> static inline int
> init_in_progress_flag(__guard* g)
> { return ((char *)g)[1]; }
> @@ -207,7 +244,7 @@ namespace __cxxabiv1
> return 0;
>
> if (init_in_progress_flag(g))
> - throw_recursive_init_exception();
> + throw_recursive_init_exception();
>
> set_init_in_progress_flag(g, 1);
> return 1;
> @@ -223,14 +260,46 @@ namespace __cxxabiv1
> if (_GLIBCXX_GUARD_TEST_AND_ACQUIRE (g))
> return 0;
>
> +# ifdef _GLIBCXX_USE_FUTEX
> + // If __sync_* and futex syscall are supported, don't use any global
> + // mutex.
> + if (__gthread_active_p ())
> + {
> + int *gi = (int *) (void *) g;
> + const int guard_bit = _GLIBCXX_GUARD_BIT;
> + const int pending_bit = _GLIBCXX_GUARD_PENDING_BIT;
> + const int waiting_bit = _GLIBCXX_GUARD_WAITING_BIT;
> +
> + while (1)
> + {
> + int old = __sync_val_compare_and_swap (gi, 0, pending_bit);
> + if (old == 0)
> + return 1; // This thread should do the initialization.
> +
> + if (old == guard_bit)
> + return 0; // Already initialized.
> +
> + if (old == pending_bit)
> + {
> + int newv = old | waiting_bit;
> + if (__sync_val_compare_and_swap (gi, old, newv) != old)
> + continue;
> +
> + old = newv;
> + }
> +
> + syscall (SYS_futex, gi, _GLIBCXX_FUTEX_WAIT, old, 0);
> + }
> + }
> +# else
> if (__gthread_active_p ())
> {
> mutex_wrapper mw;
>
> while (1) // When this loop is executing, mutex is locked.
> {
> -#ifdef __GTHREAD_HAS_COND
> - // The static is allready initialized.
> +# ifdef __GTHREAD_HAS_COND
> + // The static is already initialized.
> if (_GLIBCXX_GUARD_TEST(g))
> return 0; // The mutex will be unlocked via wrapper
>
> @@ -247,7 +316,7 @@ namespace __cxxabiv1
> set_init_in_progress_flag(g, 1);
> return 1; // The mutex will be unlocked via wrapper.
> }
> -#else
> +# else
> // This provides compatibility with older systems not supporting
> // POSIX like conditional variables.
> if (acquire(g))
> @@ -256,9 +325,10 @@ namespace __cxxabiv1
> return 1; // The mutex still locked.
> }
> return 0; // The mutex will be unlocked via wrapper.
> -#endif
> +# endif
> }
> }
> +# endif
> #endif
>
> return acquire (g);
> @@ -267,7 +337,35 @@ namespace __cxxabiv1
> extern "C"
> void __cxa_guard_abort (__guard *g)
> {
> -#ifdef __GTHREAD_HAS_COND
> +#ifdef _GLIBCXX_USE_FUTEX
> + // If __sync_* and futex syscall are supported, don't use any global
> + // mutex.
> + if (__gthread_active_p ())
> + {
> + int *gi = (int *) (void *) g;
> + const int pending_bit = _GLIBCXX_GUARD_PENDING_BIT;
> + const int waiting_bit = _GLIBCXX_GUARD_WAITING_BIT;
> + int old;
> +
> + old = __sync_val_compare_and_swap (gi, pending_bit, 0);
> + if (old == (pending_bit | waiting_bit))
> + {
> + while (1)
> + {
> + int newv = old & ~(pending_bit | waiting_bit);
> + int oldv = __sync_val_compare_and_swap (gi, old, newv);
> +
> + if (oldv == old)
> + break;
> + if ((oldv & pending_bit) == 0)
> + break;
> + old = oldv;
> + }
> + syscall (SYS_futex, gi, _GLIBCXX_FUTEX_WAKE, INT_MAX);
> + }
> + return;
> + }
> +#elif defined(__GTHREAD_HAS_COND)
> if (__gthread_active_p())
> {
> mutex_wrapper mw;
> @@ -293,7 +391,33 @@ namespace __cxxabiv1
> extern "C"
> void __cxa_guard_release (__guard *g)
> {
> -#ifdef __GTHREAD_HAS_COND
> +#ifdef _GLIBCXX_USE_FUTEX
> + // If __sync_* and futex syscall are supported, don't use any global
> + // mutex.
> + if (__gthread_active_p ())
> + {
> + int *gi = (int *) (void *) g;
> + const int guard_bit = _GLIBCXX_GUARD_BIT;
> + const int pending_bit = _GLIBCXX_GUARD_PENDING_BIT;
> + const int waiting_bit = _GLIBCXX_GUARD_WAITING_BIT;
> + int old;
> +
> + old = __sync_val_compare_and_swap (gi, pending_bit, guard_bit);
> + if (old == (pending_bit | waiting_bit))
> + {
> + while (1)
> + {
> + int oldv = __sync_val_compare_and_swap (gi, old, guard_bit);
> +
> + if (oldv == old || oldv == guard_bit)
> + break;
> + old = oldv;
> + }
> + syscall (SYS_futex, gi, _GLIBCXX_FUTEX_WAKE, INT_MAX);
> + }
> + return;
> + }
> +#elif defined(__GTHREAD_HAS_COND)
> if (__gthread_active_p())
> {
> mutex_wrapper mw;
>
> Jakub
>