[RFC PATCH] Optimized __cxa_guard_{acquire,release,abort} for Linux

Jakub Jelinek jakub@redhat.com
Mon Dec 31 16:00:00 GMT 2007


Hi!

While backporting the guard deadlock fix to 4.1, I have noticed that
on Linux it IMHO uses global mutex (and conditional variable) completely
unnecessarily, which hurts parallelism if multiple threads initialize
different static local vars which need guards.
As shown below, all we IMHO need is futex syscall (just
FUTEX_WAIT/FUTEX_WAKE, so any Linux kernel with futex syscall will do)
and supported __sync_val_compare_and_swap.

__guard is 64-bit var, where we only use first byte (0 uninitialized,
1 initialized) and second byte (0 init not pending, 1 pending) if __GTHREAD
(well, on arm eabi first byte is actually 4th byte if big endian).
With atomic builtins we can update both these flags together atomically.  The patch
below adds (as pure optimization) another bit to the second byte, to avoid
any syscalls if no other thread calls __cxa_guard_acquire before the first
thread that called it for a particular guard var calls __cxa_guard_release.
We then have 4 valid states.  __cxa_guard_acquire can atomically transition
from 0 to pending (in this case it won the race and the calling thread
will do the initialization) or from pending to pending|waiting (in this
case it will FUTEX_WAIT until guard is set, or it is aborted back to 0).
__cxa_guard_release will atomically transition from pending to guard
or pending|waiting to guard (in the latter case FUTEX_WAIT is used to wake all
waiters).  __cxa_guard_abort will transition pending to 0 or pending|waiting
to guard (in the latter case FUTEX_WAIT is used to wake all waiters).

On x86_64-linux, if _GLIBCXX_HAVE_FUTEX is defined .text section decreases
by 1168 bytes and .bss by 160 bytes, so it also decreases libstdc++.so or
libsupc++.a footprint.

The patch below doesn't include the configury bits yet (they might be very
similar to enable_linux_futex stuff in libgomp configury).
Would this be acceptable with the configury bits in even for 4.3 (could be
e.g. only enabled if explicitly requested with --enable-linux-futex there),
or just 4.4+?

2007-12-31  Jakub Jelinek  <jakub@redhat.com>

	* config/cpu/generic/cxxabi_tweaks.h (_GLIBCXX_GUARD_BIT,
	_GLIBCXX_GUARD_PENDING_BIT, _GLIBCXX_GUARD_WAITING_BIT): Define.
	* config/cpu/arm/cxxabi_tweaks.h (_GLIBCXX_GUARD_BIT,
	_GLIBCXX_GUARD_PENDING_BIT, _GLIBCXX_GUARD_WAITING_BIT): Define.
	* libsupc++/guard.cc: Include climits and syscall.h.
	(_GLIBCXX_USE_FUTEX): Define if futex syscall and atomic builtins
	are supported.
	(_GLIBCXX_FUTEX_WAIT, _GLIBCXX_FUTEX_WAKE): Likewise.
	(__guard_test_bit): New static inline.
	(__cxa_guard_acquire, __cxa_guard_release, __cxa_guard_abort): Use
	atomic builtins and futex syscall if _GLIBCXX_USE_FUTEX.

--- libstdc++-v3/config/cpu/arm/cxxabi_tweaks.h.jj	2006-12-08 15:58:16.000000000 +0100
+++ libstdc++-v3/config/cpu/arm/cxxabi_tweaks.h	2007-12-31 11:45:37.000000000 +0100
@@ -1,6 +1,6 @@
 // Control various target specific ABI tweaks.  ARM version.
 
-// Copyright (C) 2004, 2006 Free Software Foundation, Inc.
+// Copyright (C) 2004, 2006, 2007 Free Software Foundation, Inc.
 //
 // This file is part of the GNU ISO C++ Library.  This library is free
 // software; you can redistribute it and/or modify it under the
@@ -46,6 +46,9 @@ namespace __cxxabiv1
   // guard variable.  */
 #define _GLIBCXX_GUARD_TEST(x) ((*(x) & 1) != 0)
 #define _GLIBCXX_GUARD_SET(x) *(x) = 1
+#define _GLIBCXX_GUARD_BIT 1
+#define _GLIBCXX_GUARD_PENDING_BIT __guard_test_bit (1, 1)
+#define _GLIBCXX_GUARD_WAITING_BIT __guard_test_bit (1, 2)
   typedef int __guard;
 
   // We also want the element size in array cookies.
@@ -62,6 +65,9 @@ namespace __cxxabiv1
   // The generic ABI uses the first byte of a 64-bit guard variable.
 #define _GLIBCXX_GUARD_TEST(x) (*(char *) (x) != 0)
 #define _GLIBCXX_GUARD_SET(x) *(char *) (x) = 1
+#define _GLIBCXX_GUARD_BIT __guard_test_bit (0, 1)
+#define _GLIBCXX_GUARD_PENDING_BIT __guard_test_bit (1, 1)
+#define _GLIBCXX_GUARD_WAITING_BIT __guard_test_bit (1, 2)
   __extension__ typedef int __guard __attribute__((mode (__DI__)));
 
   // __cxa_vec_ctor has void return type.
--- libstdc++-v3/config/cpu/generic/cxxabi_tweaks.h.jj	2006-12-08 15:58:16.000000000 +0100
+++ libstdc++-v3/config/cpu/generic/cxxabi_tweaks.h	2007-12-31 11:45:44.000000000 +0100
@@ -1,6 +1,6 @@
 // Control various target specific ABI tweaks.  Generic version.
 
-// Copyright (C) 2004, 2006 Free Software Foundation, Inc.
+// Copyright (C) 2004, 2006, 2007 Free Software Foundation, Inc.
 //
 // This file is part of the GNU ISO C++ Library.  This library is free
 // software; you can redistribute it and/or modify it under the
@@ -44,6 +44,9 @@ namespace __cxxabiv1
   // The generic ABI uses the first byte of a 64-bit guard variable.
 #define _GLIBCXX_GUARD_TEST(x) (*(char *) (x) != 0)
 #define _GLIBCXX_GUARD_SET(x) *(char *) (x) = 1
+#define _GLIBCXX_GUARD_BIT __guard_test_bit (0, 1)
+#define _GLIBCXX_GUARD_PENDING_BIT __guard_test_bit (1, 1)
+#define _GLIBCXX_GUARD_WAITING_BIT __guard_test_bit (1, 2)
   __extension__ typedef int __guard __attribute__((mode (__DI__)));
 
   // __cxa_vec_ctor has void return type.
--- libstdc++-v3/libsupc++/guard.cc.jj	2007-10-11 10:54:25.000000000 +0200
+++ libstdc++-v3/libsupc++/guard.cc	2007-12-31 12:54:24.000000000 +0100
@@ -35,12 +35,21 @@
 #include <new>
 #include <ext/atomicity.h>
 #include <ext/concurrence.h>
+#if defined(__GTHREADS) && defined(__GTHREAD_HAS_COND) \
+    && defined(_GLIBCXX_ATOMIC_BUILTINS) && defined(_GLIBCXX_HAVE_FUTEX)
+# include <climits>
+# include <syscall.h>
+# define _GLIBCXX_USE_FUTEX
+# define _GLIBCXX_FUTEX_WAIT 0
+# define _GLIBCXX_FUTEX_WAKE 1
+#endif
 
 // The IA64/generic ABI uses the first byte of the guard variable.
 // The ARM EABI uses the least significant bit.
 
 // Thread-safe static local initialization support.
 #ifdef __GTHREADS
+# ifndef _GLIBCXX_USE_FUTEX
 namespace
 {
   // A single mutex controlling all static initializations.
@@ -75,8 +84,9 @@ namespace
     }
   };
 }
+# endif
 
-#ifdef __GTHREAD_HAS_COND
+# if defined(__GTHREAD_HAS_COND) && !defined(_GLIBCXX_USE_FUTEX)
 namespace
 {
   // A single conditional variable controlling all static initializations.
@@ -98,9 +108,9 @@ namespace
     return *static_cond;
   }
 }
-#endif
+# endif
 
-#ifndef _GLIBCXX_GUARD_TEST_AND_ACQUIRE
+# ifndef _GLIBCXX_GUARD_TEST_AND_ACQUIRE
 inline bool
 __test_and_acquire (__cxxabiv1::__guard *g)
 {
@@ -108,24 +118,24 @@ __test_and_acquire (__cxxabiv1::__guard 
   _GLIBCXX_READ_MEM_BARRIER;
   return b;
 }
-#define _GLIBCXX_GUARD_TEST_AND_ACQUIRE(G) __test_and_acquire (G)
-#endif
+#  define _GLIBCXX_GUARD_TEST_AND_ACQUIRE(G) __test_and_acquire (G)
+# endif
 
-#ifndef _GLIBCXX_GUARD_SET_AND_RELEASE
+# ifndef _GLIBCXX_GUARD_SET_AND_RELEASE
 inline void
 __set_and_release (__cxxabiv1::__guard *g)
 {
   _GLIBCXX_WRITE_MEM_BARRIER;
   _GLIBCXX_GUARD_SET (g);
 }
-#define _GLIBCXX_GUARD_SET_AND_RELEASE(G) __set_and_release (G)
-#endif
+#  define _GLIBCXX_GUARD_SET_AND_RELEASE(G) __set_and_release (G)
+# endif
 
 #else /* !__GTHREADS */
 
-#undef _GLIBCXX_GUARD_TEST_AND_ACQUIRE
-#undef _GLIBCXX_GUARD_SET_AND_RELEASE
-#define _GLIBCXX_GUARD_SET_AND_RELEASE(G) _GLIBCXX_GUARD_SET (G)
+# undef _GLIBCXX_GUARD_TEST_AND_ACQUIRE
+# undef _GLIBCXX_GUARD_SET_AND_RELEASE
+# define _GLIBCXX_GUARD_SET_AND_RELEASE(G) _GLIBCXX_GUARD_SET (G)
 
 #endif /* __GTHREADS */
 
@@ -176,8 +186,35 @@ namespace __gnu_cxx
 // headers define a symbol __GTHREAD_HAS_COND for platforms that support POSIX
 // like conditional variables. For platforms that do not support conditional
 // variables, we need to fall back to the old code.
+
+// If _GLIBCXX_USE_FUTEX, no global mutex or conditional variable is used,
+// only atomic operations are used together with futex syscall.
+// Valid values of the first integer in guard are:
+// 0				  No thread encountered the guarded init
+//				  yet or it has been aborted.
+// _GLIBCXX_GUARD_BIT		  The guarded static var has been successfully
+//				  initialized.
+// _GLIBCXX_GUARD_PENDING_BIT	  The guarded static var is being initialized
+//				  and no other thread is waiting for its
+//				  initialization.
+// (_GLIBCXX_GUARD_PENDING_BIT    The guarded static var is being initialized
+//  | _GLIBCXX_GUARD_WAITING_BIT) and some other threads are waiting until
+//				  it is initialized.
+
 namespace __cxxabiv1 
 {
+#ifdef _GLIBCXX_USE_FUTEX
+  namespace
+  {
+    static inline int __guard_test_bit (const int __byte, const int __val)
+    {
+      union { int __i; char __c[sizeof (int)]; } __u = { 0 };
+      __u.__c[__byte] = __val;
+      return __u.__i;
+    }
+  }
+#endif
+
   static inline int
   init_in_progress_flag(__guard* g)
   { return ((char *)g)[1]; }
@@ -207,7 +244,7 @@ namespace __cxxabiv1 
       return 0;
 
     if (init_in_progress_flag(g))
-	throw_recursive_init_exception();
+      throw_recursive_init_exception();
 
     set_init_in_progress_flag(g, 1);
     return 1;
@@ -223,14 +260,46 @@ namespace __cxxabiv1 
     if (_GLIBCXX_GUARD_TEST_AND_ACQUIRE (g))
       return 0;
 
+# ifdef _GLIBCXX_USE_FUTEX
+    // If __sync_* and futex syscall are supported, don't use any global
+    // mutex.
+    if (__gthread_active_p ())
+      {
+	int *gi = (int *) (void *) g;
+	const int guard_bit = _GLIBCXX_GUARD_BIT;
+	const int pending_bit = _GLIBCXX_GUARD_PENDING_BIT;
+	const int waiting_bit = _GLIBCXX_GUARD_WAITING_BIT;
+
+	while (1)
+	  {
+	    int old = __sync_val_compare_and_swap (gi, 0, pending_bit);
+	    if (old == 0)
+	      return 1;	// This thread should do the initialization.
+
+	    if (old == guard_bit)
+	      return 0;	// Already initialized.
+
+	    if (old == pending_bit)
+	      {
+		int newv = old | waiting_bit;
+		if (__sync_val_compare_and_swap (gi, old, newv) != old)
+		  continue;
+
+		old = newv;
+	      }
+
+	    syscall (SYS_futex, gi, _GLIBCXX_FUTEX_WAIT, old, 0);
+	  }
+      }
+# else
     if (__gthread_active_p ())
       {
 	mutex_wrapper mw;
 
 	while (1)	// When this loop is executing, mutex is locked.
 	  {
-#ifdef __GTHREAD_HAS_COND
-	    // The static is allready initialized.
+#  ifdef __GTHREAD_HAS_COND
+	    // The static is already initialized.
 	    if (_GLIBCXX_GUARD_TEST(g))
 	      return 0;	// The mutex will be unlocked via wrapper
 
@@ -247,7 +316,7 @@ namespace __cxxabiv1 
 		set_init_in_progress_flag(g, 1);
 		return 1; // The mutex will be unlocked via wrapper.
 	      }
-#else
+#  else
 	    // This provides compatibility with older systems not supporting
 	    // POSIX like conditional variables.
 	    if (acquire(g))
@@ -256,9 +325,10 @@ namespace __cxxabiv1 
 		return 1; // The mutex still locked.
 	      }
 	    return 0; // The mutex will be unlocked via wrapper.
-#endif
+#  endif
 	  }
       }
+# endif
 #endif
 
     return acquire (g);
@@ -267,7 +337,35 @@ namespace __cxxabiv1 
   extern "C"
   void __cxa_guard_abort (__guard *g)
   {
-#ifdef __GTHREAD_HAS_COND
+#ifdef _GLIBCXX_USE_FUTEX
+    // If __sync_* and futex syscall are supported, don't use any global
+    // mutex.
+    if (__gthread_active_p ())
+      {
+	int *gi = (int *) (void *) g;
+	const int pending_bit = _GLIBCXX_GUARD_PENDING_BIT;
+	const int waiting_bit = _GLIBCXX_GUARD_WAITING_BIT;
+	int old;
+
+	old = __sync_val_compare_and_swap (gi, pending_bit, 0);
+	if (old == (pending_bit | waiting_bit))
+	  {
+	    while (1)
+	      {
+		int newv = old & ~(pending_bit | waiting_bit);
+		int oldv = __sync_val_compare_and_swap (gi, old, newv);
+
+		if (oldv == old)
+		  break;
+		if ((oldv & pending_bit) == 0)
+		  break;
+		old = oldv;
+	      }
+	    syscall (SYS_futex, gi, _GLIBCXX_FUTEX_WAKE, INT_MAX);
+	  }
+	return;
+      }
+#elif defined(__GTHREAD_HAS_COND)
     if (__gthread_active_p())
       {	
 	mutex_wrapper mw;
@@ -293,7 +391,33 @@ namespace __cxxabiv1 
   extern "C"
   void __cxa_guard_release (__guard *g)
   {
-#ifdef __GTHREAD_HAS_COND
+#ifdef _GLIBCXX_USE_FUTEX
+    // If __sync_* and futex syscall are supported, don't use any global
+    // mutex.
+    if (__gthread_active_p ())
+      {
+	int *gi = (int *) (void *) g;
+	const int guard_bit = _GLIBCXX_GUARD_BIT;
+	const int pending_bit = _GLIBCXX_GUARD_PENDING_BIT;
+	const int waiting_bit = _GLIBCXX_GUARD_WAITING_BIT;
+	int old;
+
+	old = __sync_val_compare_and_swap (gi, pending_bit, guard_bit);
+	if (old == (pending_bit | waiting_bit))
+	  {
+	    while (1)
+	      {
+		int oldv = __sync_val_compare_and_swap (gi, old, guard_bit);
+
+		if (oldv == old || oldv == guard_bit)
+		  break;
+		old = oldv;
+	      }
+	    syscall (SYS_futex, gi, _GLIBCXX_FUTEX_WAKE, INT_MAX);
+	  }
+	return;
+      }
+#elif defined(__GTHREAD_HAS_COND)
     if (__gthread_active_p())
       {
 	mutex_wrapper mw;

	Jakub



More information about the Libstdc++ mailing list