This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH][libgomp/gomp-3_0-branch] Thread pool for nested threads


This is a patch for gomp-3_0-branch, revision 135293.

A global thread pool for nested threads is added. The size of the pool (i.e. how many threads may block at the nested pool dock) is controlled by the GOMP_NESTED_POOL_SIZE environment variable, which is gomp_available_cpus - 1 by default. Slight changes to the internal semaphore were necessary to support wake-up of multiple threads.

This patch does not affect non-nested cases.

Tested x86_64-unknown-linux-gnu: No regressions
       i386-apple-darwin9.2.2: No regressions
       i686-pc-linux-gnu: No regressions

Please comment and/or approve.

2008-05-14 Jakob Blomer <jakob.blomer@ira.uka.de>

             * team.c (gomp_nstd_threads_idle,
               gomp_nstd_threads_dock, gomp_nstd_start_data,
               gomp_nstd_threads_lock): New global variables.
               (gomp_thread_start): Added pool support for nested
               case. Woken up threads grab their TLS data from
               gomp_nstd_start_data and proceed. They enqueue
               themselves back into the pool, as long as the pool
               has not more than gomp_nested_pool_size threads.
               (gomp_team_start): In nested case, initialize
               gomp_nstd_start_data and release threads from dock,
               adjust gomp_managed_threads accordingly.
               (gomp_team_end): Be aware that nested threads remain in
               the pool, i.e. they are still gomp managed threads.
               (initialize_team): Initialize global pool for nested
               threads.
             * env.c (gomp_nested_pool_size): New global variable.
               (initialize_env): Parse GOMP_NESTED_POOL_SIZE
               into gomp_nested_pool_size, default is
               gomp_available_cpus - 1.
             * libgomp.h (gomp_nested_pool_size): New extern variable.
             * config/$arch/sem.h (gomp_sem_post_multiple): New
               function.
             * config/linux/sem.h (gomp_sem_post_slow): Added unsigned
               parameter that specifies how many threads should be woken
               up.
             * config/linux/sem.c (gomp_sem_post_slow): Added parameter
               unsigned count.

Cheers,
Jakob



Index: team.c
===================================================================
--- team.c	(Revision 211)
+++ team.c	(Arbeitskopie)
@@ -38,7 +38,29 @@
 /* This key is for the thread destructor.  */
 pthread_key_t gomp_thread_destructor;

+/* Thread pool for nested threads,
+   gomp_nstd_threads_idle is the number of pooled nested threads. */
+unsigned gomp_nstd_threads_idle;
+gomp_sem_t gomp_nstd_threads_dock;
+/* This structure is used to communicate across nested threads,
+   woken up nested threads get their initialization from this struct. */
+struct
+{
+  void (*fn) (void *);
+  void *fn_data;
+  struct gomp_team_state ts;
+  struct gomp_task *task;
+  struct gomp_thread_pool *thread_pool;
+  struct gomp_task_icv *icv;
+  struct gomp_task *parent_task;
+  int id;
+#ifndef HAVE_SYNC_BUILTINS
+  gomp_mutex_t id_lock;
+#endif
+} gomp_nstd_start_data;
+gomp_mutex_t gomp_nstd_threads_lock;

+
 /* This is the libgomp per-thread data structure.  */
 #ifdef HAVE_TLS
 __thread struct gomp_thread gomp_tls_data;
@@ -93,14 +115,83 @@
   /* Make thread pool local. */
   pool = thr->thread_pool;

+  /* Switches to nested mode, thread will remain in global nested
+     thread pool, or run out, if there are too many threads managed */
   if (data->nested)
     {
-      gomp_barrier_wait (&thr->ts.team->barrier);
-      local_fn (local_data);
-      gomp_barrier_wait_last (&thr->ts.team->barrier);
+      struct gomp_task_icv *icv;
+      struct gomp_task *parent_task;
+      int team_id = thr->ts.team_id;
+      do
+        {
+          gomp_barrier_wait (&thr->ts.team->barrier);
+
+          /* Do actual work. */
+          local_fn (local_data);
+          gomp_end_task ();
+          gomp_barrier_wait_last (&thr->ts.team->barrier);
+
+          /* Don't overload the system with many threads in the pool,
+             eventually let some threads running out */
+          if (gomp_nstd_threads_idle >= gomp_nested_pool_size)
+            break;
+
+          /* Add to pool. */
+#ifdef HAVE_SYNC_BUILTINS
+          __sync_fetch_and_add (&gomp_nstd_threads_idle, 1);
+#else
+          gomp_mutex_lock (&gomp_nstd_threads_lock);
+          team_id = gomp_nstd_threads_idle++;
+          gomp_mutex_unlock (&gomp_nstd_threads_lock);
+#endif
+          gomp_sem_wait (&gomp_nstd_threads_dock);
+
+
+          /* Woken up from nested thread pool, initialize. */
+
+          local_fn = gomp_nstd_start_data.fn;
+          local_data = gomp_nstd_start_data.fn_data;
+          thr->ts = gomp_nstd_start_data.ts;
+          icv = gomp_nstd_start_data.icv;
+          parent_task = gomp_nstd_start_data.parent_task;
+#ifdef HAVE_SYNC_BUILTINS
+          team_id = __sync_fetch_and_add (&gomp_nstd_start_data.id, -1);
+#else
+          gomp_mutex_lock (&gomp_nstd_start_data.id_lock);
+          team_id = gomp_nstd_start_data.id--;
+          gomp_mutex_unlock (&gomp_nstd_start_data.id_lock);
+#endif
+          /* Last thread of the worker group, release
+             gomp_nstd_start_data */
+          if (team_id == 1)
+            gomp_mutex_unlock (&gomp_nstd_threads_lock);
+
+          /* This signal allows for waking up and killing a thread.
+             Currently this is not used. */
+          if (__builtin_expect(local_fn == NULL, 0))
+            break;
+
+          thr->ts.team_id = team_id;
+          thr->ts.team->ordered_release[team_id] = &thr->release;
+          thr->task = &thr->ts.team->implicit_task[team_id];
+          gomp_init_task (thr->task, parent_task, icv);
+      } while (local_fn);
+
+      /* This thread will die soon and is no longer managed by libgomp */
+#ifdef HAVE_SYNC_BUILTINS
+      __sync_fetch_and_add (&gomp_managed_threads, -1L);
+#else
+      gomp_mutex_lock (&gomp_remaining_threads_lock);
+      gomp_managed_threads--;
+      gomp_mutex_unlock (&gomp_remaining_threads_lock);
+#endif
     }
+  /* Non-nested mode, we are in the per user-created pthread
+     thread pool, which is essentially a barrier */
   else
     {
+      /* Make thread pool local and enqueue thread. */
+      struct gomp_thread_pool *pool = thr->thread_pool;
       pool->threads[thr->ts.team_id] = thr;

       gomp_barrier_wait (&pool->threads_dock);
@@ -275,7 +366,7 @@

   thr = gomp_thread ();
   nested = thr->ts.team != NULL;
-  if (__builtin_expect (thr->thread_pool == NULL, 0))
+  if (__builtin_expect ((thr->thread_pool == NULL) && (!nested), 0))
     {
       thr->thread_pool = gomp_new_thread_pool ();
       pthread_setspecific (gomp_thread_destructor, thr);
@@ -308,13 +399,51 @@

i = 1;

- /* We only allow the reuse of idle threads for non-nested PARALLEL
- regions. This appears to be implied by the semantics of
- threadprivate variables, but perhaps that's reading too much into
- things. Certainly it does prevent any locking problems, since
- only the initial program thread will modify gomp_threads. */
- if (!nested)
+ /* In the nested case, initialize the gomp_nstd_start_data
+ struct and release the necessary amount of threads.
+ We need to lock the structure. */
+ if (nested)
{
+ /* Will be unlocked by last worker thread, if a thread
+ from the nested pool is released at all. */
+ gomp_mutex_lock (&gomp_nstd_threads_lock);
+
+ n = gomp_nstd_threads_idle < nthreads-1 ? gomp_nstd_threads_idle : nthreads-1;
+
+ gomp_nstd_start_data.fn = fn;
+ gomp_nstd_start_data.fn_data = data;
+ gomp_nstd_start_data.ts.team = team;
+ gomp_nstd_start_data.ts.work_share = &team->work_shares[0];
+ gomp_nstd_start_data.ts.last_work_share = NULL;
+ gomp_nstd_start_data.id = n;
+ gomp_nstd_start_data.ts.level = team->prev_ts.level + 1;
+ gomp_nstd_start_data.ts.active_level = thr->ts.active_level;
+#ifdef HAVE_SYNC_BUILTINS
+ gomp_nstd_start_data.ts.single_count = 0;
+#endif
+ gomp_nstd_start_data.ts.static_trip = 0;
+ gomp_nstd_start_data.icv = icv;
+ gomp_nstd_start_data.parent_task = task;
+
+#ifdef HAVE_SYNC_BUILTINS
+ __sync_add_and_fetch (&gomp_nstd_threads_idle, -n);
+#else
+ /* Lock is already hold. */
+ gomp_nstd_threads_idle -= n;
+#endif
+ if (__builtin_expect(n > 0, 1))
+ {
+ gomp_sem_post_multiple (&gomp_nstd_threads_dock, n);
+ i += n;
+ }
+ else
+ gomp_mutex_unlock (&gomp_nstd_threads_lock);
+ }
+ /* Non-nested case, we are in the per user-created pthread
+ thread pool, which means we don't have to lock anything,
+ only the first-level master thread will run this code. */
+ else
+ {
old_threads_used = pool->threads_used;


       if (nthreads <= old_threads_used)
@@ -479,17 +608,6 @@
   gomp_end_task ();
   thr->ts = team->prev_ts;

-  if (__builtin_expect (thr->ts.team != NULL, 0))
-    {
-#ifdef HAVE_SYNC_BUILTINS
-      __sync_fetch_and_add (&gomp_managed_threads, 1L - team->nthreads);
-#else
-      gomp_mutex_lock (&gomp_remaining_threads_lock);
-      gomp_managed_threads -= team->nthreads - 1L;
-      gomp_mutex_unlock (&gomp_remaining_threads_lock);
-#endif
-    }
-
   free_team (team);
 }

@@ -511,6 +629,14 @@
   if (pthread_key_create (&gomp_thread_destructor, gomp_free_thread) != 0)
     gomp_fatal ("could not create thread pool destructor.");

+  /* Nested thread pool. */
+  gomp_mutex_init (&gomp_nstd_threads_lock);
+  gomp_sem_init (&gomp_nstd_threads_dock, 0);
+  gomp_nstd_threads_idle = 0;
+#ifndef HAVE_SYNC_BUILTINS
+  gomp_mutex_init (&gomp_nstd_start_data.id_lock);
+#endif
+
 #ifdef HAVE_TLS
   thr = &gomp_tls_data;
 #else
Index: env.c
===================================================================
--- env.c	(Revision 211)
+++ env.c	(Arbeitskopie)
@@ -47,7 +47,6 @@
 #include <limits.h>
 #include <errno.h>

-
struct gomp_task_icv gomp_global_icv = {
.nthreads_var = 1,
.run_sched_var = GFS_DYNAMIC,
@@ -64,7 +63,7 @@
#ifndef HAVE_SYNC_BUILTINS
gomp_mutex_t gomp_remaining_threads_lock;
#endif
-unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1;
+unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1, gomp_nested_pool_size = 0;
unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;


 /* Parse the OMP_SCHEDULE environment variable.  */
@@ -469,6 +468,7 @@
   return false;
 }

+
 static void __attribute__((constructor))
 initialize_env (void)
 {
@@ -519,6 +519,9 @@
   if (gomp_throttled_spin_count_var > gomp_spin_count_var)
     gomp_throttled_spin_count_var = gomp_spin_count_var;

+ if (!parse_unsigned_long ("GOMP_NESTED_POOL_SIZE", &gomp_nested_pool_size))
+ gomp_nested_pool_size = gomp_available_cpus - 1;
+
/* Not strictly environment related, but ordering constructors is tricky. */
pthread_attr_init (&gomp_thread_attr);
pthread_attr_setdetachstate (&gomp_thread_attr, PTHREAD_CREATE_DETACHED);
Index: libgomp.h
===================================================================
--- libgomp.h (Revision 211)
+++ libgomp.h (Arbeitskopie)
@@ -214,7 +214,7 @@
#endif
extern unsigned long gomp_max_active_levels_var;
extern unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
-extern unsigned long gomp_available_cpus, gomp_managed_threads;
+extern unsigned long gomp_available_cpus, gomp_managed_threads, gomp_nested_pool_size;


 /* This structure describes a "task" to be run by a thread.  At present
    we implement only synchronous tasks, i.e. no tasks are deferred or
Index: config/linux/sem.c
===================================================================
--- config/linux/sem.c	(Revision 211)
+++ config/linux/sem.c	(Arbeitskopie)
@@ -31,7 +31,6 @@

#include "wait.h"

-
 void
 gomp_sem_wait_slow (gomp_sem_t *sem)
 {
@@ -48,17 +47,18 @@
 }

 void
-gomp_sem_post_slow (gomp_sem_t *sem)
+gomp_sem_post_slow (gomp_sem_t *sem, unsigned count)
 {
   int old, tmp = *sem, wake;

   do
     {
       old = tmp;
-      wake = old > 0 ? old + 1 : 1;
+      wake = old > 0 ? old + count : count;
       tmp = __sync_val_compare_and_swap (sem, old, wake);
     }
   while (old != tmp);

   futex_wake (sem, wake);
 }
+
Index: config/linux/sem.h
===================================================================
--- config/linux/sem.h	(Revision 211)
+++ config/linux/sem.h	(Arbeitskopie)
@@ -46,13 +46,20 @@
     gomp_sem_wait_slow (sem);
 }

-extern void gomp_sem_post_slow (gomp_sem_t *);
+extern void gomp_sem_post_slow (gomp_sem_t *, unsigned);
+
 static inline void gomp_sem_post (gomp_sem_t *sem)
 {
   if (!__sync_bool_compare_and_swap (sem, 0, 1))
-    gomp_sem_post_slow (sem);
+    gomp_sem_post_slow (sem, 1);
 }

+static inline void
+gomp_sem_post_multiple (gomp_sem_t *sem, unsigned count)
+{
+  gomp_sem_post_slow (sem, count);
+}
+
 static inline void gomp_sem_destroy (gomp_sem_t *sem)
 {
 }
Index: config/posix/sem.h
===================================================================
--- config/posix/sem.h	(Revision 211)
+++ config/posix/sem.h	(Arbeitskopie)
@@ -87,4 +87,13 @@
   sem_destroy (sem);
 }
 #endif /* doesn't HAVE_BROKEN_POSIX_SEMAPHORES  */
+
+static inline void
+gomp_sem_post_multiple (gomp_sem_t *sem, unsigned count)
+{
+  int i;
+  for (i = 0; i < count; i++)
+    gomp_sem_post (sem);
+}
+
 #endif /* GOMP_SEM_H  */


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]