[gomp4.5] Fix up doacross with dynamic scheduling

Fri Nov 13 18:26:00 GMT 2015

Hi!

Guided scheduling doesn't ensure that all assigned chunks have sizes
of multiple of chunk_size, chunk_size for guided scheduling actually means
just minimum size of the chunk.  That means we can't divide the IV or
number of iterations by chunk size though, because the precondition for that
is that all chunks assigned to each thread are multiples of the chunk size.

Fixed thusly, tested on x86_64-linux, committed to gomp-4_5-branch.

2015-11-13  Jakub Jelinek  <jakub@redhat.com>

	* ordered.c (gomp_doacross_init, GOMP_doacross_post,
	GOMP_doacross_wait, gomp_doacross_ull_init, GOMP_doacross_ull_post,
	GOMP_doacross_ull_wait): For GFS_GUIDED don't divide number of
	iterators or IV by chunk size.
	* testsuite/libgomp.c/doacross-3.c: New test.

--- libgomp/ordered.c.jj	2015-10-14 10:24:10.000000000 +0200
+++ libgomp/ordered.c	2015-11-13 19:14:11.877005602 +0100
@@ -297,6 +297,8 @@ gomp_doacross_init (unsigned ncounts, lo
 
   if (ws->sched == GFS_STATIC)
     num_ents = team->nthreads;
+  else if (ws->sched == GFS_GUIDED)
+    num_ents = counts[0];
   else
     num_ents = (counts[0] - 1) / chunk_size + 1;
   if (num_bits <= MAX_COLLAPSED_BITS)
@@ -366,6 +368,8 @@ GOMP_doacross_post (long *counts)
 
   if (__builtin_expect (ws->sched == GFS_STATIC, 1))
     ent = thr->ts.team_id;
+  else if (ws->sched == GFS_GUIDED)
+    ent = counts[0];
   else
     ent = counts[0] / doacross->chunk_size;
   unsigned long *array = (unsigned long *) (doacross->array
@@ -426,6 +430,8 @@ GOMP_doacross_wait (long first, ...)
       else
 	ent = first / ws->chunk_size % thr->ts.team->nthreads;
     }
+  else if (ws->sched == GFS_GUIDED)
+    ent = first;
   else
     ent = first / doacross->chunk_size;
   unsigned long *array = (unsigned long *) (doacross->array
@@ -520,6 +526,8 @@ gomp_doacross_ull_init (unsigned ncounts
 
   if (ws->sched == GFS_STATIC)
     num_ents = team->nthreads;
+  else if (ws->sched == GFS_GUIDED)
+    num_ents = counts[0];
   else
     num_ents = (counts[0] - 1) / chunk_size + 1;
   if (num_bits <= MAX_COLLAPSED_BITS)
@@ -595,6 +603,8 @@ GOMP_doacross_ull_post (gomp_ull *counts
 
   if (__builtin_expect (ws->sched == GFS_STATIC, 1))
     ent = thr->ts.team_id;
+  else if (ws->sched == GFS_GUIDED)
+    ent = counts[0];
   else
     ent = counts[0] / doacross->chunk_size_ull;
 
@@ -676,6 +686,8 @@ GOMP_doacross_ull_wait (gomp_ull first,
       else
 	ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
     }
+  else if (ws->sched == GFS_GUIDED)
+    ent = first;
   else
     ent = first / doacross->chunk_size_ull;
 
--- libgomp/testsuite/libgomp.c/doacross-3.c.jj	2015-11-13 19:08:22.191960410 +0100
+++ libgomp/testsuite/libgomp.c/doacross-3.c	2015-11-13 19:09:01.626401650 +0100
@@ -0,0 +1,225 @@
+extern void abort (void);
+
+#define N 256
+int a[N], b[N / 16][8][4], c[N / 32][8][8], g[N / 16][8][6];
+volatile int d, e;
+volatile unsigned long long f;
+
+int
+main ()
+{
+  unsigned long long i;
+  int j, k, l, m;
+  #pragma omp parallel private (l)
+  {
+    #pragma omp for schedule(guided, 3) ordered (1) nowait
+    for (i = 1; i < N + f; i++)
+      {
+	#pragma omp atomic write
+	a[i] = 1;
+	#pragma omp ordered depend(sink: i - 1)
+	if (i > 1)
+	  {
+	    #pragma omp atomic read
+	    l = a[i - 1];
+	    if (l < 2)
+	      abort ();
+	  }
+	#pragma omp atomic write
+	a[i] = 2;
+	if (i < N - 1)
+	  {
+	    #pragma omp atomic read
+	    l = a[i + 1];
+	    if (l == 3)
+	      abort ();
+	  }
+	#pragma omp ordered depend(source)
+	#pragma omp atomic write
+	a[i] = 3;
+      }
+    #pragma omp for schedule(guided) ordered (3) nowait
+    for (i = 3; i < N / 16 - 1 + f; i++)
+      for (j = 0; j < 8; j += 2)
+	for (k = 1; k <= 3; k++)
+	  {
+	    #pragma omp atomic write
+	    b[i][j][k] = 1;
+	    #pragma omp ordered depend(sink: i, j - 2, k - 1) \
+				depend(sink: i - 2, j - 2, k + 1)
+	    #pragma omp ordered depend(sink: i - 3, j + 2, k - 2)
+	    if (j >= 2 && k > 1)
+	      {
+		#pragma omp atomic read
+		l = b[i][j - 2][k - 1];
+		if (l < 2)
+		  abort ();
+	      }
+	    #pragma omp atomic write
+	    b[i][j][k] = 2;
+	    if (i >= 5 && j >= 2 && k < 3)
+	      {
+		#pragma omp atomic read
+		l = b[i - 2][j - 2][k + 1];
+		if (l < 2)
+		  abort ();
+	      }
+	    if (i >= 6 && j < N / 16 - 3 && k == 3)
+	      {
+		#pragma omp atomic read
+		l = b[i - 3][j + 2][k - 2];
+		if (l < 2)
+		  abort ();
+	      }
+	    #pragma omp ordered depend(source)
+	    #pragma omp atomic write
+	    b[i][j][k] = 3;
+	  }
+#define A(n) int n;
+#define B(n) A(n##0) A(n##1) A(n##2) A(n##3)
+#define C(n) B(n##0) B(n##1) B(n##2) B(n##3)
+#define D(n) C(n##0) C(n##1) C(n##2) C(n##3)
+    D(m)
+#undef A
+    #pragma omp for collapse (2) ordered(61) schedule(guided, 15)
+    for (i = 2; i < N / 32 + f; i++)
+      for (j = 7; j > 1; j--)
+	for (k = 6; k >= 0; k -= 2)
+#define A(n) for (n = 4; n < 5; n++)
+	  D(m)
+#undef A
+	    {
+	      #pragma omp atomic write
+	      c[i][j][k] = 1;
+#define A(n) ,n
+#define E(n) C(n##0) C(n##1) C(n##2) B(n##30) B(n##31) A(n##320) A(n##321)
+	      #pragma omp ordered depend (sink: i, j, k + 2 E(m)) \
+				  depend (sink:i - 2, j + 1, k - 4 E(m)) \
+				  depend(sink: i - 1, j - 2, k - 2 E(m))
+	      if (k <= 4)
+		{
+		  l = c[i][j][k + 2];
+		  if (l < 2)
+		    abort ();
+		}
+	      #pragma omp atomic write
+	      c[i][j][k] = 2;
+	      if (i >= 4 && j < 7 && k >= 4)
+		{
+		  l = c[i - 2][j + 1][k - 4];
+		  if (l < 2)
+		    abort ();
+		}
+	      if (i >= 3 && j >= 4 && k >= 2)
+		{
+		  l = c[i - 1][j - 2][k - 2];
+		  if (l < 2)
+		    abort ();
+		}
+	      #pragma omp ordered depend (source)
+	      #pragma omp atomic write
+	      c[i][j][k] = 3;
+	    }
+    #pragma omp for schedule(guided, 5) ordered (3) nowait
+    for (j = 0; j < N / 16 - 1; j++)
+      for (k = 0; k < 8; k += 2)
+	for (i = 3; i <= 5 + f; i++)
+	  {
+	    #pragma omp atomic write
+	    g[j][k][i] = 1;
+	    #pragma omp ordered depend(sink: j, k - 2, i - 1) \
+				depend(sink: j - 2, k - 2, i + 1)
+	    #pragma omp ordered depend(sink: j - 3, k + 2, i - 2)
+	    if (k >= 2 && i > 3)
+	      {
+		#pragma omp atomic read
+		l = g[j][k - 2][i - 1];
+		if (l < 2)
+		  abort ();
+	      }
+	    #pragma omp atomic write
+	    g[j][k][i] = 2;
+	    if (j >= 2 && k >= 2 && i < 5)
+	      {
+		#pragma omp atomic read
+		l = g[j - 2][k - 2][i + 1];
+		if (l < 2)
+		  abort ();
+	      }
+	    if (j >= 3 && k < N / 16 - 3 && i == 5)
+	      {
+		#pragma omp atomic read
+		l = g[j - 3][k + 2][i - 2];
+		if (l < 2)
+		  abort ();
+	      }
+	    #pragma omp ordered depend(source)
+	    #pragma omp atomic write
+	    g[j][k][i] = 3;
+	  }
+    #pragma omp for collapse(2) ordered(4) lastprivate (i, j, k)
+    for (i = 2; i < f + 3; i++)
+      for (j = d + 1; j >= 0; j--)
+	for (k = 0; k < d; k++)
+	  for (l = 0; l < d + 2; l++)
+	    {
+	      #pragma omp ordered depend (source)
+	      #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l)
+	      if (!e)
+		abort ();
+	    }
+    #pragma omp single
+    {
+      if (i != 3 || j != -1 || k != 0)
+	abort ();
+      i = 8; j = 9; k = 10;
+    }
+    #pragma omp for collapse(2) ordered(4) lastprivate (i, j, k, m)
+    for (i = 2; i < f + 3; i++)
+      for (j = d + 1; j >= 0; j--)
+	for (k = 0; k < d + 2; k++)
+	  for (m = 0; m < d; m++)
+	    {
+	      #pragma omp ordered depend (source)
+	      #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, m)
+	      abort ();
+	    }
+    #pragma omp single
+    if (i != 3 || j != -1 || k != 2 || m != 0)
+      abort ();
+    #pragma omp for collapse(2) ordered(4) nowait
+    for (i = 2; i < f + 3; i++)
+      for (j = d; j > 0; j--)
+	for (k = 0; k < d + 2; k++)
+	  for (l = 0; l < d + 4; l++)
+	    {
+	      #pragma omp ordered depend (source)
+	      #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l)
+	      if (!e)
+		abort ();
+	    }
+    #pragma omp for nowait
+    for (i = 0; i < N; i++)
+      if (a[i] != 3 * (i >= 1))
+	abort ();
+    #pragma omp for collapse(2) private(k) nowait
+    for (i = 0; i < N / 16; i++)
+      for (j = 0; j < 8; j++)
+	for (k = 0; k < 4; k++)
+	  if (b[i][j][k] != 3 * (i >= 3 && i < N / 16 - 1 && (j & 1) == 0 && k >= 1))
+	    abort ();
+    #pragma omp for collapse(3) nowait
+    for (i = 0; i < N / 32; i++)
+      for (j = 0; j < 8; j++)
+	for (k = 0; k < 8; k++)
+	  if (c[i][j][k] != 3 * (i >= 2 && j >= 2 && (k & 1) == 0))
+	    abort ();
+    #pragma omp for collapse(2) private(k) nowait
+    for (i = 0; i < N / 16; i++)
+      for (j = 0; j < 8; j++)
+	for (k = 0; k < 6; k++)
+	  if (g[i][j][k] != 3 * (i < N / 16 - 1 && (j & 1) == 0 && k >= 3))
+	    abort ();
+  }
+  return 0;
+}

	Jakub