[gomp3] Speed up #pragma omp single

Jakub Jelinek jakub@redhat.com
Tue Mar 25 14:49:00 GMT 2008


Hi!

This patch speeds up #pragma omp single if the host has sync builtins.
There is no need to create any work shares for that case; instead, each
thread keeps a count of the simple #pragma omp single directives it has
already encountered, and the team keeps one global count.
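
In isolation the idea looks like this (a minimal sketch with hypothetical
global names; the real implementation below keeps the counters in
struct gomp_team and struct gomp_team_state instead):

#include <stdbool.h>

static unsigned long team_single_count;		/* one shared count per team */
static __thread unsigned long my_single_count;	/* one private count per thread */

bool
single_start (void)
{
  unsigned long count = my_single_count++;
  /* Exactly one thread succeeds in advancing the shared counter from
     COUNT to COUNT + 1; that thread executes the single body, while
     the others see the counter already past COUNT and skip it.  */
  return __sync_bool_compare_and_swap (&team_single_count,
				       count, count + 1);
}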

On 32-bit architectures this has the drawback that at most 4294967295
#pragma omp single nowait regions without any intervening barrier region
will work reliably.  I think it is very unlikely somebody has that
many #pragma omp single nowaits without any #pragma omp barrier,
#pragma omp single, or other region with an implicit barrier at the end
in between (each #pragma omp single additionally does one atomic
operation, and some thread would need to stay unscheduled while the
other threads go through all those 2^32 singles).  On 64-bit hosts,
where the counter is 64-bit, I don't think we have to be afraid
of this; if you think it is unsafe to do this for 32-bit hosts, I can
limit that to defined __LP64__ && defined HAVE_SYNC_BUILTINS targets.
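
To make the wrap-around hazard concrete (a toy model of the 32-bit
unsigned arithmetic, not part of the patch): suppose a thread is
descheduled with private count 7 while the remaining threads complete
exactly 2^32 further singles; the shared counter wraps back to 7, so
the stalled thread's compare-and-swap would wrongly succeed again:

#include <stdio.h>
#include <stdint.h>

int
main (void)
{
  uint32_t stalled = 7;		/* stalled thread's private count */
  uint32_t team = stalled;	/* shared counter when it was descheduled */

  /* The other threads complete exactly 2^32 further singles; the
     32-bit counter wraps around back to the same value.  */
  team += UINT32_MAX;
  team += 1;

  /* The stalled thread's CAS precondition (team == stalled) holds
     again, so it would wrongly execute a second single body.  */
  printf ("team=%u stalled=%u -> spurious CAS success\n", team, stalled);
  return 0;
}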

Unfortunately, I don't know of anything similarly fast for the other
worksharing constructs, which really need bigger state shared among
all threads.

GOMP_SPINCOUNT=infinity OMP_SCHEDULE=static,1 LD_LIBRARY_PATH=~/libgomp-vanilla/ ./micro2
barrier bench min 0.99359 max 0.994976 avg 0.994299
parallel bench min 0.594554 max 0.595928 avg 0.595133
static bench min 0.0318422 max 0.031979 avg 0.0319257
dynamic bench min 0.182006 max 0.182035 avg 0.18203
guided bench min 0.000437441 max 0.000440948 avg 0.000439584
runtime bench min 0.0130935 max 0.0131951 avg 0.0131098
single bench min 0.398182 max 0.39932 avg 0.398675
single_nowait bench min 0.217672 max 0.219753 avg 0.218876
short_dynamic bench min 0.503268 max 0.504159 avg 0.503786
GOMP_SPINCOUNT=infinity OMP_SCHEDULE=static,1 LD_LIBRARY_PATH=~/libgomp-patched/ ./micro2
barrier bench min 0.990719 max 0.992127 avg 0.991435
parallel bench min 0.611676 max 0.612867 avg 0.612108
static bench min 0.0321883 max 0.0322247 avg 0.0322054
dynamic bench min 0.182147 max 0.182205 avg 0.182192
guided bench min 0.00044621 max 0.000448061 avg 0.000447119
runtime bench min 0.0130923 max 0.0132604 avg 0.013151
single bench min 0.147117 max 0.147267 avg 0.147161
single_nowait bench min 0.0733903 max 0.0734115 avg 0.0733995
short_dynamic bench min 0.502892 max 0.504253 avg 0.503378

2008-03-25  Jakub Jelinek  <jakub@redhat.com>

	* libgomp.h (struct gomp_team_state): Add single_count field.
	(struct gomp_team): Likewise.
	* team.c (gomp_new_team): Clear single_count.
	(gomp_team_start): Likewise.
	* single.c (GOMP_single_start): Rewritten if HAVE_SYNC_BUILTINS.

--- libgomp/libgomp.h	(revision 133510)
+++ libgomp/libgomp.h	(working copy)
@@ -179,6 +179,9 @@ struct gomp_team_state
   /* Active nesting level.  Only active parallel regions are counted.  */
   unsigned active_level;
 
+  /* Number of single stmts encountered.  */
+  unsigned long single_count;
+
   /* For GFS_RUNTIME loops that resolved to GFS_STATIC, this is the
      trip number through the loop.  So first time a particular loop
      is encountered this number is 0, the second time through the loop
@@ -260,6 +263,10 @@ struct gomp_team
      with alloc_work_share.  */
   struct gomp_work_share *work_share_list_free;
 
+  /* Number of simple single regions encountered by threads in this
+     team.  */
+  unsigned long single_count;
+
   /* This barrier is used for most synchronization of the team.  */
   gomp_barrier_t barrier;
 
--- libgomp/team.c	(revision 133510)
+++ libgomp/team.c	(working copy)
@@ -149,6 +149,7 @@ gomp_new_team (unsigned nthreads)
   team = gomp_malloc (size);
 
   team->work_share_chunk = 8;
+  team->single_count = 0;
   gomp_init_work_share (&team->work_shares[0], false, nthreads);
   team->work_shares[0].next_alloc = NULL;
   team->work_share_list_free = NULL;
@@ -221,6 +222,7 @@ gomp_team_start (void (*fn) (void *), vo
     ++thr->ts.active_level;
   thr->ts.work_share = &team->work_shares[0];
   thr->ts.last_work_share = NULL;
+  thr->ts.single_count = 0;
   thr->ts.static_trip = 0;
   thr->task = &team->implicit_task[0];
   gomp_init_task (thr->task, task, icv);
@@ -270,6 +272,7 @@ gomp_team_start (void (*fn) (void *), vo
 	  nthr->ts.team_id = i;
 	  nthr->ts.level = team->prev_ts.level + 1;
 	  nthr->ts.active_level = thr->ts.active_level;
+	  nthr->ts.single_count = 0;
 	  nthr->ts.static_trip = 0;
 	  nthr->task = &team->implicit_task[i];
 	  gomp_init_task (nthr->task, task, icv);
@@ -338,6 +341,7 @@ gomp_team_start (void (*fn) (void *), vo
       start_data->ts.team_id = i;
       start_data->ts.level = team->prev_ts.level + 1;
       start_data->ts.active_level = thr->ts.active_level;
+      start_data->ts.single_count = 0;
       start_data->ts.static_trip = 0;
       start_data->task = &team->implicit_task[i];
       gomp_init_task (start_data->task, task, icv);
--- libgomp/single.c	(revision 133510)
+++ libgomp/single.c	(working copy)
@@ -37,11 +37,24 @@
 bool
 GOMP_single_start (void)
 {
+#ifdef HAVE_SYNC_BUILTINS
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  unsigned long single_count;
+
+  if (__builtin_expect (team == NULL, 0))
+    return true;
+
+  single_count = thr->ts.single_count++;
+  return __sync_bool_compare_and_swap (&team->single_count, single_count,
+				       single_count + 1L);
+#else
   bool ret = gomp_work_share_start (false);
   if (ret)
     gomp_work_share_init_done ();
   gomp_work_share_end_nowait ();
   return ret;
+#endif
 }
 
 /* This routine is called when first encountering a SINGLE construct that

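For reference, roughly how the compiler consumes this entry point (an
approximation of GCC's omp lowering, not part of this patch; body () is
a placeholder): the return value of GOMP_single_start guards the body,
and the implicit barrier follows unless the nowait clause was given:

  /* Approximate lowering of:
       #pragma omp single
	 body ();  */
  if (GOMP_single_start ())
    body ();		/* only the thread that wins runs the body */
  GOMP_barrier ();	/* the implicit barrier; dropped for nowait */
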
	Jakub
