[gcc/devel/c++-coroutines] openmp: Adjust outer bounds of non-rect loops

Iain D Sandoe iains@gcc.gnu.org
Tue Jul 14 18:42:45 GMT 2020


https://gcc.gnu.org/g:f418bd4b92a03ee0ec0fe4cfcd896e86e11ac2cf

commit f418bd4b92a03ee0ec0fe4cfcd896e86e11ac2cf
Author: Jakub Jelinek <jakub@redhat.com>
Date:   Tue Jul 14 10:31:59 2020 +0200

    openmp: Adjust outer bounds of non-rect loops
    
    In loops like:
      #pragma omp parallel for collapse(2)
      for (i = -4; i < 8; i++)
        for (j = 3 * i; j > 2 * i; j--)
    for some outer loop iterations there are no inner loop iterations at all,
    because the inner loop condition is false from the start.  In order to use
    Summæ Potestate to count the number of iterations, or to transform the
    logical iteration number to actual iterator values by finding the root of
    a quadratic inequality, the outer iterator range needs to be adjusted such
    that the inner loop has at least one iteration for each outer loop iterator
    value in the reduced range.  Sometimes this adjustment is done at the start
    of the range, at other times at the end.
    
    This patch implements it when the number of loop iterations is computed at
    compile time (if all expressions are compile time constants).
    
    2020-07-14  Jakub Jelinek  <jakub@redhat.com>
    
            * omp-general.h (struct omp_for_data): Add adjn1 member.
            * omp-general.c (omp_extract_for_data): For non-rect loop, punt on
            count computing if n1, n2 or step are not INTEGER_CST earlier.
            Narrow the outer iterator range if needed so that non-rect loop
            has at least one iteration for each outer range iteration.  Compute
            adjn1.
            * omp-expand.c (expand_omp_for_init_vars): Use adjn1 if non-NULL
            instead of the outer loop's n1.
    
            * testsuite/libgomp.c/loop-21.c: New test.

Diff:
---
 gcc/omp-expand.c                      |   3 +-
 gcc/omp-general.c                     | 129 ++++++++++++++++++-
 gcc/omp-general.h                     |   2 +
 libgomp/testsuite/libgomp.c/loop-21.c | 230 ++++++++++++++++++++++++++++++++++
 4 files changed, 360 insertions(+), 4 deletions(-)

diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c
index c3b8820e213..a721940a617 100644
--- a/gcc/omp-expand.c
+++ b/gcc/omp-expand.c
@@ -2262,6 +2262,7 @@ expand_omp_for_init_vars (struct omp_for_data *fd, gimple_stmt_iterator *gsi,
 	      && (optab_handler (sqrt_optab, TYPE_MODE (double_type_node))
 		  != CODE_FOR_nothing))
 	    {
+	      tree outer_n1 = fd->adjn1 ? fd->adjn1 : fd->loops[i - 1].n1;
 	      tree itype = TREE_TYPE (fd->loops[i].v);
 	      tree min_inner_iterations = fd->min_inner_iterations;
 	      tree factor = fd->factor;
@@ -2384,7 +2385,7 @@ expand_omp_for_init_vars (struct omp_for_data *fd, gimple_stmt_iterator *gsi,
 	      *gsi = gsi_after_labels (e->dest);
 	      t = fold_convert (itype, c);
 	      t = fold_build2 (MULT_EXPR, itype, t, fd->loops[i - 1].step);
-	      t = fold_build2 (PLUS_EXPR, itype, fd->loops[i - 1].n1, t);
+	      t = fold_build2 (PLUS_EXPR, itype, outer_n1, t);
 	      t = force_gimple_operand_gsi (gsi, t, true, NULL_TREE, false,
 					    GSI_CONTINUE_LINKING);
 	      expand_omp_build_assign (gsi, fd->loops[i - 1].v, t, true);
diff --git a/gcc/omp-general.c b/gcc/omp-general.c
index c6878cfec66..b2ce4083b27 100644
--- a/gcc/omp-general.c
+++ b/gcc/omp-general.c
@@ -214,6 +214,7 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd,
   fd->simd_schedule = false;
   fd->min_inner_iterations = NULL_TREE;
   fd->factor = NULL_TREE;
+  fd->adjn1 = NULL_TREE;
   collapse_iter = NULL;
   collapse_count = NULL;
 
@@ -508,7 +509,10 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd,
 		continue;
 	      if (single_nonrect == -1
 		  || (loop->m1 && TREE_CODE (loop->m1) != INTEGER_CST)
-		  || (loop->m2 && TREE_CODE (loop->m2) != INTEGER_CST))
+		  || (loop->m2 && TREE_CODE (loop->m2) != INTEGER_CST)
+		  || TREE_CODE (loop->n1) != INTEGER_CST
+		  || TREE_CODE (loop->n2) != INTEGER_CST
+		  || TREE_CODE (loop->step) != INTEGER_CST)
 		{
 		  count = NULL_TREE;
 		  continue;
@@ -574,12 +578,129 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd,
 	      else if (t && t2 && integer_zerop (t) && integer_zerop (t2))
 		/* No iterations of the inner loop.  count will be set to
 		   zero cst below.  */;
-	      else
+	      else if (TYPE_UNSIGNED (itype)
+		       || t == NULL_TREE
+		       || t2 == NULL_TREE
+		       || TREE_CODE (t) != INTEGER_CST
+		       || TREE_CODE (t2) != INTEGER_CST)
 		{
 		  /* Punt (for now).  */
 		  count = NULL_TREE;
 		  continue;
 		}
+	      else
+		{
+		  /* Some iterations of the outer loop have zero iterations
+		     of the inner loop, while others have at least one.
+		     In this case, we need to adjust one of those outer
+		     loop bounds.  If ADJ_FIRST, we need to adjust outer n1
+		     (first), otherwise outer n2 (last).  */
+		  bool adj_first = integer_zerop (t);
+		  tree n1 = fold_convert (itype, loop->n1);
+		  tree n2 = fold_convert (itype, loop->n2);
+		  tree m1 = loop->m1 ? fold_convert (itype, loop->m1)
+				     : build_zero_cst (itype);
+		  tree m2 = loop->m2 ? fold_convert (itype, loop->m2)
+				     : build_zero_cst (itype);
+		  t = fold_binary (MINUS_EXPR, itype, n1, n2);
+		  t2 = fold_binary (MINUS_EXPR, itype, m2, m1);
+		  t = fold_binary (TRUNC_DIV_EXPR, itype, t, t2);
+		  t2 = fold_binary (MINUS_EXPR, itype, t, first);
+		  t2 = fold_binary (TRUNC_MOD_EXPR, itype, t2, ostep);
+		  t = fold_binary (MINUS_EXPR, itype, t, t2);
+		  tree n1cur
+		    = fold_binary (PLUS_EXPR, itype, n1,
+				   fold_binary (MULT_EXPR, itype, m1, t));
+		  tree n2cur
+		    = fold_binary (PLUS_EXPR, itype, n2,
+				   fold_binary (MULT_EXPR, itype, m2, t));
+		  t2 = fold_binary (loop->cond_code, boolean_type_node,
+				    n1cur, n2cur);
+		  tree t3 = fold_binary (MULT_EXPR, itype, m1, ostep);
+		  tree t4 = fold_binary (MULT_EXPR, itype, m2, ostep);
+		  tree diff;
+		  if (adj_first)
+		    {
+		      tree new_first;
+		      if (integer_nonzerop (t2))
+			{
+			  new_first = t;
+			  n1first = n1cur;
+			  n2first = n2cur;
+			  if (flag_checking)
+			    {
+			      t3 = fold_binary (MINUS_EXPR, itype, n1cur, t3);
+			      t4 = fold_binary (MINUS_EXPR, itype, n2cur, t4);
+			      t3 = fold_binary (loop->cond_code,
+						boolean_type_node, t3, t4);
+			      gcc_assert (integer_zerop (t3));
+			    }
+			}
+		      else
+			{
+			  t3 = fold_binary (PLUS_EXPR, itype, n1cur, t3);
+			  t4 = fold_binary (PLUS_EXPR, itype, n2cur, t4);
+			  new_first = fold_binary (PLUS_EXPR, itype, t, ostep);
+			  n1first = t3;
+			  n2first = t4;
+			  if (flag_checking)
+			    {
+			      t3 = fold_binary (loop->cond_code,
+						boolean_type_node, t3, t4);
+			      gcc_assert (integer_nonzerop (t3));
+			    }
+			}
+		      diff = fold_binary (MINUS_EXPR, itype, new_first, first);
+		      first = new_first;
+		      fd->adjn1 = first;
+		    }
+		  else
+		    {
+		      tree new_last;
+		      if (integer_zerop (t2))
+			{
+			  t3 = fold_binary (MINUS_EXPR, itype, n1cur, t3);
+			  t4 = fold_binary (MINUS_EXPR, itype, n2cur, t4);
+			  new_last = fold_binary (MINUS_EXPR, itype, t, ostep);
+			  n1last = t3;
+			  n2last = t4;
+			  if (flag_checking)
+			    {
+			      t3 = fold_binary (loop->cond_code,
+						boolean_type_node, t3, t4);
+			      gcc_assert (integer_nonzerop (t3));
+			    }
+			}
+		      else
+			{
+			  new_last = t;
+			  n1last = n1cur;
+			  n2last = n2cur;
+			  if (flag_checking)
+			    {
+			      t3 = fold_binary (PLUS_EXPR, itype, n1cur, t3);
+			      t4 = fold_binary (PLUS_EXPR, itype, n2cur, t4);
+			      t3 = fold_binary (loop->cond_code,
+						boolean_type_node, t3, t4);
+			      gcc_assert (integer_zerop (t3));
+			    }
+			}
+		      diff = fold_binary (MINUS_EXPR, itype, last, new_last);
+		    }
+		  if (TYPE_UNSIGNED (itype)
+		      && single_nonrect_cond_code == GT_EXPR)
+		    diff = fold_binary (TRUNC_DIV_EXPR, itype,
+					fold_unary (NEGATE_EXPR, itype, diff),
+					fold_unary (NEGATE_EXPR, itype,
+						    ostep));
+		  else
+		    diff = fold_binary (TRUNC_DIV_EXPR, itype, diff, ostep);
+		  diff = fold_convert (long_long_unsigned_type_node, diff);
+		  single_nonrect_count
+		    = fold_binary (MINUS_EXPR, long_long_unsigned_type_node,
+				   single_nonrect_count, diff);
+		  t = NULL_TREE;
+		}
 	    }
 	  else
 	    t = fold_binary (loop->cond_code, boolean_type_node,
@@ -715,10 +836,11 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd,
 	  *collapse_count = fold_convert_loc (loc, iter_type, count);
 	  if (fd->min_inner_iterations && fd->factor)
 	    {
-	      t = make_tree_vec (3);
+	      t = make_tree_vec (4);
 	      TREE_VEC_ELT (t, 0) = *collapse_count;
 	      TREE_VEC_ELT (t, 1) = fd->min_inner_iterations;
 	      TREE_VEC_ELT (t, 2) = fd->factor;
+	      TREE_VEC_ELT (t, 3) = fd->adjn1;
 	      *collapse_count = t;
 	    }
 	}
@@ -736,6 +858,7 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd,
 	  gcc_assert (fd->non_rect);
 	  fd->min_inner_iterations = TREE_VEC_ELT (fd->loop.n2, 1);
 	  fd->factor = TREE_VEC_ELT (fd->loop.n2, 2);
+	  fd->adjn1 = TREE_VEC_ELT (fd->loop.n2, 3);
 	  fd->loop.n2 = TREE_VEC_ELT (fd->loop.n2, 0);
 	}
       fd->loop.step = build_int_cst (TREE_TYPE (fd->loop.v), 1);
diff --git a/gcc/omp-general.h b/gcc/omp-general.h
index ec0f2a4becb..2da4d14b310 100644
--- a/gcc/omp-general.h
+++ b/gcc/omp-general.h
@@ -85,6 +85,8 @@ struct omp_for_data
 				outer iterator, depending on which
 				results in fewer iterations.  */
   tree factor; /* (m2 - m1) * outer_step / inner_step.  */
+  /* Adjusted n1 of the outer loop in such loop nests (if needed).  */
+  tree adjn1;
 };
 
 #define OACC_FN_ATTRIB "oacc function"
diff --git a/libgomp/testsuite/libgomp.c/loop-21.c b/libgomp/testsuite/libgomp.c/loop-21.c
new file mode 100644
index 00000000000..1baf13d84db
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/loop-21.c
@@ -0,0 +1,230 @@
+/* { dg-do run } */
+
+extern void abort (void);
+
+int x, i, j;
+volatile int a, b, c, d, e, f, g, h;
+int k[13][27];
+
+int
+main () /* Runs six non-rectangular collapse(2) nests, each once with literal bounds and once with bounds read from a..h.  */
+{
+  int niters; /* Number of executed inner-loop iterations, summed by the reduction clause.  */
+  for (i = -4; i < 8; i++) /* Mark every point of the first iteration space with 1.  */
+    for (j = 3 * i; j > 2 * i; j--)
+      k[i + 5][j + 5] = 1;
+  a = -4; b = 8; c = 1; d = 3; e = 0; f = 2; g = 0; h = -1;
+  niters = 0; i = -100; j = -100; x = -100; /* Sentinel values; the lastprivate copies must replace them.  */
+  #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters)
+  for (i = -4; i < 8; i++)
+    for (j = 3 * i; j > 2 * i; j--)
+      {
+	if (i < -4 || i >= 8 || j > 3 * i || j <= i * 2 || k[i + 5][j + 5] != 1)
+	  abort ();
+	k[i + 5][j + 5]++; /* Records the visit; a second visit in the same pass would fail the != check above.  */
+	x = i * 1024 + (j & 1023); /* Encodes the (i, j) of the current iteration.  */
+	niters++;
+      }
+  if (i != 8 || j != 14 || x != 7183 || niters != 28) /* Sequentially last iteration is i = 7, j = 15.  */
+    abort ();
+  niters = 0; i = -100; j = -100; x = -100;
+  #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters)
+  for (i = a; i < b; i += c) /* Same nest again, with bounds, factors and steps taken from a..h.  */
+    for (j = d * i + e; j > g + i * f; j += h)
+      {
+	if (i < -4 || i >= 8 || j > 3 * i || j <= i * 2 || k[i + 5][j + 5] != 2)
+	  abort ();
+	k[i + 5][j + 5]++;
+	x = i * 1024 + (j & 1023);
+	niters++;
+      }
+  if (i != 8 || j != 14 || x != 7183 || niters != 28)
+    abort ();
+  for (int i = -4; i < 8; i++) /* Every point must have been visited exactly twice (value 3); reset it to 0.  */
+    for (int j = 3 * i; j > 2 * i; j--)
+      if (k[i + 5][j + 5] == 3)
+	k[i + 5][j + 5] = 0;
+      else
+	abort ();
+  for (i = -2; i < 4; i++) /* Second space: inner step -2 and a constant inner bound.  */
+    for (j = -2 * i + 3; j > -3; j -= 2)
+      k[i + 5][j + 5] = 1;
+  a = -2; b = 4; c = 1; d = -2; e = 3; f = 0; g = -3; h = -2;
+  niters = 0; i = -100; j = -100; x = -100;
+  #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters)
+  for (i = -2; i < 4; i++)
+    for (j = -2 * i + 3; j > -3; j -= 2)
+      {
+	if (i < -2 || i >= 4 || j <= -3 || j > -2 * i + 3 || k[i + 5][j + 5] != 1)
+	  abort ();
+	k[i + 5][j + 5]++;
+	x = i * 1024 + (j & 1023);
+	niters++;
+      }
+  if (/* i != 4 || j != -3 || */x != 3071 || niters != 15) /* Last iteration is i = 2, j = -1; i = 3 has no inner iterations.  */
+    abort ();
+  niters = 0; i = -100; j = -100; x = -100;
+  #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters)
+  for (i = a; i < b; i += c)
+    for (j = d * i + e; j > g + i * f; j += h)
+      {
+	if (i < -2 || i >= 4 || j <= -3 || j > -2 * i + 3 || k[i + 5][j + 5] != 2)
+	  abort ();
+	k[i + 5][j + 5]++;
+	x = i * 1024 + (j & 1023);
+	niters++;
+      }
+  if (/*i != 4 || j != -3 || */x != 3071 || niters != 15)
+    abort ();
+  for (i = -2; i < 4; i++)
+    for (j = -2 * i + 3; j > -3; j -= 2)
+      if (k[i + 5][j + 5] == 3)
+	k[i + 5][j + 5] = 0;
+      else
+	abort ();
+  for (i = 3; i > -3; i--) /* Third space: descending outer loop; i = 3 and i = 2 have no inner iterations.  */
+    for (j = -2 * i + 7; j > 2 * i + 1; j--)
+      k[i + 5][j + 5] = 1;
+  a = 3; b = -3; c = -1; d = -2; e = 7; f = 2; g = 1; h = -1;
+  niters = 0; i = -100; j = -100; x = -100;
+  #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters)
+  for (i = 3; i > -3; i--)
+    for (j = -2 * i + 7; j > 2 * i + 1; j--)
+      {
+	if (i <= -3 || i > 3 || j <= 2 * i + 1 || j > -2 * i + 7 || k[i + 5][j + 5] != 1)
+	  abort ();
+	k[i + 5][j + 5]++;
+	x = i * 1024 + (j & 1023);
+	niters++;
+      }
+  if (i != -3 || j != -3 || x != -1026 || niters != 32) /* Last iteration is i = -2, j = -2.  */
+    abort ();
+  niters = 0; i = -100; j = -100; x = -100;
+  #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters)
+  for (i = a; i > b; i += c)
+    for (j = d * i + e; j > g + i * f; j += h)
+      {
+	if (i <= -3 || i > 3 || j <= 2 * i + 1 || j > -2 * i + 7 || k[i + 5][j + 5] != 2)
+	  abort ();
+	k[i + 5][j + 5]++;
+	x = i * 1024 + (j & 1023);
+	niters++;
+      }
+  if (i != -3 || j != -3 || x != -1026 || niters != 32)
+    abort ();
+  for (i = 3; i > -3; i--)
+    for (j = -2 * i + 7; j > 2 * i + 1; j--)
+      if (k[i + 5][j + 5] == 3)
+	k[i + 5][j + 5] = 0;
+      else
+	abort ();
+  for (i = 3; i > -3; i--) /* Fourth space: descending outer loop; only the final outer value i = -2 has no inner iterations.  */
+    for (j = 2 * i + 7; j > -2 * i + 1; j--)
+      k[i + 5][j + 5] = 1;
+  a = 3; b = -3; c = -1; d = 2; e = 7; f = -2; g = 1; h = -1;
+  niters = 0; i = -100; j = -100; x = -100;
+  #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters)
+  for (i = 3; i > -3; i--)
+    for (j = 2 * i + 7; j > -2 * i + 1; j--)
+      {
+	if (i <= -3 || i > 3 || j <= -2 * i + 1 || j > 2 * i + 7 || k[i + 5][j + 5] != 1)
+	  abort ();
+	k[i + 5][j + 5]++;
+	x = i * 1024 + (j & 1023);
+	niters++;
+      }
+  if (/*i != -3 || j != 3 || */x != -1020 || niters != 50) /* Last iteration is i = -1, j = 4.  */
+    abort ();
+  niters = 0; i = -100; j = -100; x = -100;
+  #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters)
+  for (i = a; i > b; i += c)
+    for (j = d * i + e; j > g + i * f; j += h)
+      {
+	if (i <= -3 || i > 3 || j <= -2 * i + 1 || j > 2 * i + 7 || k[i + 5][j + 5] != 2)
+	  abort ();
+	k[i + 5][j + 5]++;
+	x = i * 1024 + (j & 1023);
+	niters++;
+      }
+  if (/*i != -3 || j != 3 || */x != -1020 || niters != 50)
+    abort ();
+  for (i = 3; i > -3; i--)
+    for (j = 2 * i + 7; j > -2 * i + 1; j--)
+      if (k[i + 5][j + 5] == 3)
+	k[i + 5][j + 5] = 0;
+      else
+	abort ();
+  for (i = 6; i > -6; i--) /* Fifth space: ascending inner loop with a <= condition.  */
+    for (j = 2 * i + 7; j <= -2 * i + 1; j++)
+      k[i + 5][j + 5] = 1;
+  a = 6; b = -6; c = -1; d = 2; e = 7; f = -2; g = 2; h = 1;
+  niters = 0; i = -100; j = -100; x = -100;
+  #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters)
+  for (i = 6; i > -6; i--)
+    for (j = 2 * i + 7; j <= -2 * i + 1; j++)
+      {
+	if (i <= -6 || i > 6 || j < 2 * i + 7 || j >= -2 * i + 2 || k[i + 5][j + 5] != 1)
+	  abort ();
+	k[i + 5][j + 5]++;
+	x = i * 1024 + (j & 1023);
+	niters++;
+      }
+  if (i != -6 || j != 12 || x != -5109 || niters != 36) /* Last iteration is i = -5, j = 11.  */
+    abort ();
+  niters = 0; i = -100; j = -100; x = -100;
+  #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters)
+  for (i = a; i > b; i += c)
+    for (j = d * i + e; j < g + i * f; j += h) /* With g = 2, j < 2 - 2 * i is the same bound as j <= -2 * i + 1.  */
+      {
+	if (i <= -6 || i > 6 || j < 2 * i + 7 || j >= -2 * i + 2 || k[i + 5][j + 5] != 2)
+	  abort ();
+	k[i + 5][j + 5]++;
+	x = i * 1024 + (j & 1023);
+	niters++;
+      }
+  if (i != -6 || j != 12 || x != -5109 || niters != 36)
+    abort ();
+  for (i = 6; i > -6; i--)
+    for (j = 2 * i + 7; j <= -2 * i + 1; j++)
+      if (k[i + 5][j + 5] == 3)
+	k[i + 5][j + 5] = 0;
+      else
+	abort ();
+  for (i = 6; i > -6; i -= 2) /* Sixth space: outer step -2; only i = 6, 4 and 2 have inner iterations.  */
+    for (j = -2 * i + 7; j <= 2 * i + 1; j++)
+      k[i + 5][j + 5] = 1;
+  a = 6; b = -6; c = -2; d = -2; e = 7; f = 2; g = 2; h = 1;
+  niters = 0; i = -100; j = -100; x = -100;
+  #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters)
+  for (i = 6; i > -6; i -= 2)
+    for (j = -2 * i + 7; j <= 2 * i + 1; j++)
+      {
+	if (i <= -6 || i > 6 || j < -2 * i + 7 || j >= 2 * i + 2 || k[i + 5][j + 5] != 1)
+	  abort ();
+	k[i + 5][j + 5]++;
+	x = i * 1024 + (j & 1023);
+	niters++;
+      }
+  if (/*i != -6 || j != 15 || */x != 2053 || niters != 33) /* Last iteration is i = 2, j = 5.  */
+    abort ();
+  niters = 0; i = -100; j = -100; x = -100;
+  #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters)
+  for (i = a; i > b; i += c)
+    for (j = d * i + e; j < g + i * f; j += h)
+      {
+	if (i <= -6 || i > 6 || j < -2 * i + 7 || j >= 2 * i + 2 || k[i + 5][j + 5] != 2)
+	  abort ();
+	k[i + 5][j + 5]++;
+	x = i * 1024 + (j & 1023);
+	niters++;
+      }
+  if (/*i != -6 || j != 15 || */x != 2053 || niters != 33)
+    abort ();
+  for (i = 6; i > -6; i -= 2)
+    for (j = -2 * i + 7; j <= 2 * i + 1; j++)
+      if (k[i + 5][j + 5] == 3)
+	k[i + 5][j + 5] = 0;
+      else
+	abort ();
+  return 0;
+}


More information about the Gcc-cvs mailing list