This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH][libgomp/gomp-3_0-branch] Thread pool for nested threads
- From: Jakob Blomer <jakob dot blomer at gmx dot net>
- To: gcc-patches at gcc dot gnu dot org
- Cc: Johannes Singler <singler at ira dot uka dot de>, Jakub Jelinek <jakub at redhat dot com>
- Date: Wed, 14 May 2008 17:42:53 +0200
- Subject: [PATCH][libgomp/gomp-3_0-branch] Thread pool for nested threads
This is a patch for gomp-3_0-branch, revision 135293.
A global thread pool for nested threads is added. The size of the pool
(i.e. how many threads may block at the nested pool dock) is controlled
by the GOMP_NESTED_POOL_SIZE environment variable, which is
gomp_available_cpus - 1 by default. Slight changes to the internal
semaphore were necessary to support wake-up of multiple threads.
This patch does not affect non-nested cases.
Tested x86_64-unknown-linux-gnu: No regressions
i386-apple-darwin9.2.2: No regressions
i686-pc-linux-gnu: No regressions
Please comment and/or approve.
2008-05-14 Jakob Blomer <jakob.blomer@ira.uka.de>
* team.c (gomp_nstd_threads_idle,
gomp_nstd_threads_dock, gomp_nstd_start_data,
gomp_nstd_threads_lock): New global variables.
(gomp_thread_start): Added pool support for nested
case. Woken up threads grab their TLS data from
gomp_nstd_start_data and proceed. They enqueue
themselves back into the pool, as long as the pool
has no more than gomp_nested_pool_size threads.
(gomp_team_start): In nested case, initialize
gomp_nstd_start_data and release threads from dock,
adjust gomp_managed_threads accordingly.
(gomp_team_end): Be aware that nested threads remain in
the pool, i.e. they are still gomp managed threads.
(initialize_team): Initialize global pool for nested
threads.
* env.c (gomp_nested_pool_size): New global variable.
(initialize_env): Parse GOMP_NESTED_POOL_SIZE
into gomp_nested_pool_size, default is
gomp_available_cpus - 1.
* libgomp.h (gomp_nested_pool_size): New extern variable.
* config/$arch/sem.h (gomp_sem_post_multiple): New
function.
* config/linux/sem.h (gomp_sem_post_slow): Added unsigned
parameter that specifies how many threads should be woken
up.
* config/linux/sem.c (gomp_sem_post_slow): Added parameter
unsigned count.
Cheers,
Jakob
Index: team.c
===================================================================
--- team.c (Revision 211)
+++ team.c (Arbeitskopie)
@@ -38,7 +38,29 @@
/* This key is for the thread destructor. */
pthread_key_t gomp_thread_destructor;
+/* Thread pool for nested threads,
+ gomp_nstd_threads_idle is the number of pooled nested threads. */
+unsigned gomp_nstd_threads_idle;
+gomp_sem_t gomp_nstd_threads_dock;
+/* This structure is used to communicate across nested threads,
+ woken up nested threads get their initialization from this struct. */
+struct
+{
+ void (*fn) (void *);
+ void *fn_data;
+ struct gomp_team_state ts;
+ struct gomp_task *task;
+ struct gomp_thread_pool *thread_pool;
+ struct gomp_task_icv *icv;
+ struct gomp_task *parent_task;
+ int id;
+#ifndef HAVE_SYNC_BUILTINS
+ gomp_mutex_t id_lock;
+#endif
+} gomp_nstd_start_data;
+gomp_mutex_t gomp_nstd_threads_lock;
+
/* This is the libgomp per-thread data structure. */
#ifdef HAVE_TLS
__thread struct gomp_thread gomp_tls_data;
@@ -93,14 +115,83 @@
/* Make thread pool local. */
pool = thr->thread_pool;
+ /* Switch to nested mode; the thread will remain in the global nested
+ thread pool, or exit if too many threads are already managed. */
if (data->nested)
{
- gomp_barrier_wait (&thr->ts.team->barrier);
- local_fn (local_data);
- gomp_barrier_wait_last (&thr->ts.team->barrier);
+ struct gomp_task_icv *icv;
+ struct gomp_task *parent_task;
+ int team_id = thr->ts.team_id;
+ do
+ {
+ gomp_barrier_wait (&thr->ts.team->barrier);
+
+ /* Do actual work. */
+ local_fn (local_data);
+ gomp_end_task ();
+ gomp_barrier_wait_last (&thr->ts.team->barrier);
+
+ /* Don't overload the system with many threads in the pool;
+ let some threads exit instead. */
+ if (gomp_nstd_threads_idle >= gomp_nested_pool_size)
+ break;
+
+ /* Add to pool. */
+#ifdef HAVE_SYNC_BUILTINS
+ __sync_fetch_and_add (&gomp_nstd_threads_idle, 1);
+#else
+ gomp_mutex_lock (&gomp_nstd_threads_lock);
+ team_id = gomp_nstd_threads_idle++;
+ gomp_mutex_unlock (&gomp_nstd_threads_lock);
+#endif
+ gomp_sem_wait (&gomp_nstd_threads_dock);
+
+
+ /* Woken up from nested thread pool, initialize. */
+
+ local_fn = gomp_nstd_start_data.fn;
+ local_data = gomp_nstd_start_data.fn_data;
+ thr->ts = gomp_nstd_start_data.ts;
+ icv = gomp_nstd_start_data.icv;
+ parent_task = gomp_nstd_start_data.parent_task;
+#ifdef HAVE_SYNC_BUILTINS
+ team_id = __sync_fetch_and_add (&gomp_nstd_start_data.id, -1);
+#else
+ gomp_mutex_lock (&gomp_nstd_start_data.id_lock);
+ team_id = gomp_nstd_start_data.id--;
+ gomp_mutex_unlock (&gomp_nstd_start_data.id_lock);
+#endif
+ /* Last thread of the worker group, release
+ gomp_nstd_start_data */
+ if (team_id == 1)
+ gomp_mutex_unlock (&gomp_nstd_threads_lock);
+
+ /* This signal allows for waking up and killing a thread.
+ Currently this is not used. */
+ if (__builtin_expect(local_fn == NULL, 0))
+ break;
+
+ thr->ts.team_id = team_id;
+ thr->ts.team->ordered_release[team_id] = &thr->release;
+ thr->task = &thr->ts.team->implicit_task[team_id];
+ gomp_init_task (thr->task, parent_task, icv);
+ } while (local_fn);
+
+ /* This thread will die soon and is no longer managed by libgomp */
+#ifdef HAVE_SYNC_BUILTINS
+ __sync_fetch_and_add (&gomp_managed_threads, -1L);
+#else
+ gomp_mutex_lock (&gomp_remaining_threads_lock);
+ gomp_managed_threads--;
+ gomp_mutex_unlock (&gomp_remaining_threads_lock);
+#endif
}
+ /* Non-nested mode, we are in the per user-created pthread
+ thread pool, which is essentially a barrier */
else
{
+ /* Make thread pool local and enqueue thread. */
+ struct gomp_thread_pool *pool = thr->thread_pool;
pool->threads[thr->ts.team_id] = thr;
gomp_barrier_wait (&pool->threads_dock);
@@ -275,7 +366,7 @@
thr = gomp_thread ();
nested = thr->ts.team != NULL;
- if (__builtin_expect (thr->thread_pool == NULL, 0))
+ if (__builtin_expect ((thr->thread_pool == NULL) && (!nested), 0))
{
thr->thread_pool = gomp_new_thread_pool ();
pthread_setspecific (gomp_thread_destructor, thr);
@@ -308,13 +399,51 @@
i = 1;
- /* We only allow the reuse of idle threads for non-nested PARALLEL
- regions. This appears to be implied by the semantics of
- threadprivate variables, but perhaps that's reading too much into
- things. Certainly it does prevent any locking problems, since
- only the initial program thread will modify gomp_threads. */
- if (!nested)
+ /* In the nested case, initialize the gomp_nstd_start_data
+ struct and release the necessary amount of threads.
+ We need to lock the structure. */
+ if (nested)
{
+ /* Will be unlocked by last worker thread, if a thread
+ from the nested pool is released at all. */
+ gomp_mutex_lock (&gomp_nstd_threads_lock);
+
+ n = gomp_nstd_threads_idle < nthreads-1 ? gomp_nstd_threads_idle
: nthreads-1;
+
+ gomp_nstd_start_data.fn = fn;
+ gomp_nstd_start_data.fn_data = data;
+ gomp_nstd_start_data.ts.team = team;
+ gomp_nstd_start_data.ts.work_share = &team->work_shares[0];
+ gomp_nstd_start_data.ts.last_work_share = NULL;
+ gomp_nstd_start_data.id = n;
+ gomp_nstd_start_data.ts.level = team->prev_ts.level + 1;
+ gomp_nstd_start_data.ts.active_level = thr->ts.active_level;
+#ifdef HAVE_SYNC_BUILTINS
+ gomp_nstd_start_data.ts.single_count = 0;
+#endif
+ gomp_nstd_start_data.ts.static_trip = 0;
+ gomp_nstd_start_data.icv = icv;
+ gomp_nstd_start_data.parent_task = task;
+
+#ifdef HAVE_SYNC_BUILTINS
+ __sync_add_and_fetch (&gomp_nstd_threads_idle, -n);
+#else
+ /* Lock is already held. */
+ gomp_nstd_threads_idle -= n;
+#endif
+ if (__builtin_expect(n > 0, 1))
+ {
+ gomp_sem_post_multiple (&gomp_nstd_threads_dock, n);
+ i += n;
+ }
+ else
+ gomp_mutex_unlock (&gomp_nstd_threads_lock);
+ }
+ /* Non-nested case, we are in the per user-created pthread
+ thread pool, which means we don't have to lock anything,
+ only the first-level master thread will run this code. */
+ else
+ {
old_threads_used = pool->threads_used;
if (nthreads <= old_threads_used)
@@ -479,17 +608,6 @@
gomp_end_task ();
thr->ts = team->prev_ts;
- if (__builtin_expect (thr->ts.team != NULL, 0))
- {
-#ifdef HAVE_SYNC_BUILTINS
- __sync_fetch_and_add (&gomp_managed_threads, 1L - team->nthreads);
-#else
- gomp_mutex_lock (&gomp_remaining_threads_lock);
- gomp_managed_threads -= team->nthreads - 1L;
- gomp_mutex_unlock (&gomp_remaining_threads_lock);
-#endif
- }
-
free_team (team);
}
@@ -511,6 +629,14 @@
if (pthread_key_create (&gomp_thread_destructor, gomp_free_thread) != 0)
gomp_fatal ("could not create thread pool destructor.");
+ /* Nested thread pool. */
+ gomp_mutex_init (&gomp_nstd_threads_lock);
+ gomp_sem_init (&gomp_nstd_threads_dock, 0);
+ gomp_nstd_threads_idle = 0;
+#ifndef HAVE_SYNC_BUILTINS
+ gomp_mutex_init (&gomp_nstd_start_data.id_lock);
+#endif
+
#ifdef HAVE_TLS
thr = &gomp_tls_data;
#else
Index: env.c
===================================================================
--- env.c (Revision 211)
+++ env.c (Arbeitskopie)
@@ -47,7 +47,6 @@
#include <limits.h>
#include <errno.h>
-
struct gomp_task_icv gomp_global_icv = {
.nthreads_var = 1,
.run_sched_var = GFS_DYNAMIC,
@@ -64,7 +63,7 @@
#ifndef HAVE_SYNC_BUILTINS
gomp_mutex_t gomp_remaining_threads_lock;
#endif
-unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1;
+unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1,
gomp_nested_pool_size = 0;
unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
/* Parse the OMP_SCHEDULE environment variable. */
@@ -469,6 +468,7 @@
return false;
}
+
static void __attribute__((constructor))
initialize_env (void)
{
@@ -519,6 +519,9 @@
if (gomp_throttled_spin_count_var > gomp_spin_count_var)
gomp_throttled_spin_count_var = gomp_spin_count_var;
+ if (!parse_unsigned_long ("GOMP_NESTED_POOL_SIZE",
&gomp_nested_pool_size))
+ gomp_nested_pool_size = gomp_available_cpus - 1;
+
/* Not strictly environment related, but ordering constructors is
tricky. */
pthread_attr_init (&gomp_thread_attr);
pthread_attr_setdetachstate (&gomp_thread_attr,
PTHREAD_CREATE_DETACHED);
Index: libgomp.h
===================================================================
--- libgomp.h (Revision 211)
+++ libgomp.h (Arbeitskopie)
@@ -214,7 +214,7 @@
#endif
extern unsigned long gomp_max_active_levels_var;
extern unsigned long long gomp_spin_count_var,
gomp_throttled_spin_count_var;
-extern unsigned long gomp_available_cpus, gomp_managed_threads;
+extern unsigned long gomp_available_cpus, gomp_managed_threads,
gomp_nested_pool_size;
/* This structure describes a "task" to be run by a thread. At present
we implement only synchronous tasks, i.e. no tasks are deferred or
Index: config/linux/sem.c
===================================================================
--- config/linux/sem.c (Revision 211)
+++ config/linux/sem.c (Arbeitskopie)
@@ -31,7 +31,6 @@
#include "wait.h"
-
void
gomp_sem_wait_slow (gomp_sem_t *sem)
{
@@ -48,17 +47,18 @@
}
void
-gomp_sem_post_slow (gomp_sem_t *sem)
+gomp_sem_post_slow (gomp_sem_t *sem, unsigned count)
{
int old, tmp = *sem, wake;
do
{
old = tmp;
- wake = old > 0 ? old + 1 : 1;
+ wake = old > 0 ? old + count : count;
tmp = __sync_val_compare_and_swap (sem, old, wake);
}
while (old != tmp);
futex_wake (sem, wake);
}
+
Index: config/linux/sem.h
===================================================================
--- config/linux/sem.h (Revision 211)
+++ config/linux/sem.h (Arbeitskopie)
@@ -46,13 +46,20 @@
gomp_sem_wait_slow (sem);
}
-extern void gomp_sem_post_slow (gomp_sem_t *);
+extern void gomp_sem_post_slow (gomp_sem_t *, unsigned);
+
static inline void gomp_sem_post (gomp_sem_t *sem)
{
if (!__sync_bool_compare_and_swap (sem, 0, 1))
- gomp_sem_post_slow (sem);
+ gomp_sem_post_slow (sem, 1);
}
+static inline void
+gomp_sem_post_multiple (gomp_sem_t *sem, unsigned count)
+{
+ gomp_sem_post_slow (sem, count);
+}
+
static inline void gomp_sem_destroy (gomp_sem_t *sem)
{
}
Index: config/posix/sem.h
===================================================================
--- config/posix/sem.h (Revision 211)
+++ config/posix/sem.h (Arbeitskopie)
@@ -87,4 +87,13 @@
sem_destroy (sem);
}
#endif /* doesn't HAVE_BROKEN_POSIX_SEMAPHORES */
+
+static inline void
+gomp_sem_post_multiple (gomp_sem_t *sem, unsigned count)
+{
+ int i;
+ for (i = 0; i < count; i++)
+ gomp_sem_post (sem);
+}
+
#endif /* GOMP_SEM_H */