This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [Fortran][PATCH][gomp4]: Transform OpenACC loop directive


Hi Tobias!

Thanks a lot for your review!

On 26.03.2014 03:16, Tobias Burnus wrote:

* And do concurrent also supports masks:

This is doable: generate mask conditions inside of the deepest for loop (see applied patch).


That will work in the most common cases but not in general. At least it is my understanding that Fortran requires that one first evaluates the mask expression before one enters the loop. That's made explicit for FORALL and DO CONCURRENT uses a forall header and does some refs to FORALL (esp. 7.2.4.2.2 and 7.2.4.2.3), but it does not state so explicitly.

I missed that, thanks!

I believe that there is no general way to support nested DO CONCURRENT loops with mask expressions if they must be collapsed.

I mean if we have example like

outer: DO CONCURRENT (i=1:5, j=1:5, i .ne. j)
  inner: DO CONCURRENT (i=1:5, b(i,j) .eq. 1)
    ! do something
  ENDDO inner
ENDDO outer

we must generate mask of inner loop inside of outer one. Hence, the loop cannot be collapsed.

Is this acceptable?

--
Ilmir.
>From b3a6435a08612951ec8a330877e002a92a9b00f4 Mon Sep 17 00:00:00 2001
From: Ilmir Usmanov <i.usmanov@samsung.com>
Date: Tue, 1 Apr 2014 21:02:50 +0400
Subject: [PATCH] Transform OpenACC loop directive to GENERIC

---
	gcc/fortran/
	* openmp.c (resolve_oacc_nested_loops): New check.
	* trans-openmp.c (gfc_trans_oacc_loop_generate_for): New helper function.
	(gfc_trans_oacc_loop): New function.
	(gfc_trans_oacc_combined_directive, gfc_trans_oacc_directive): Call it.
	gcc/
	* omp-low.c (scan_sharing_clauses): Skip OpenACC LOOP clauses.
	gcc/testsuite/
	* gfortran.dg/goacc/loop-3.f95: New check.
	* gfortran.dg/goacc/loop-4.f95: New test.
	* gfortran.dg/goacc/loop-tree.f95: Likewise.
	

diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c
index 447faf8..2be8ba6 100644
--- a/gcc/fortran/openmp.c
+++ b/gcc/fortran/openmp.c
@@ -2573,6 +2573,9 @@ resolve_oacc_nested_loops (gfc_code *code, gfc_code* do_code, int collapse,
 	  break;
 	}
       gcc_assert (do_code->op == EXEC_DO || do_code->op == EXEC_DO_CONCURRENT);
+      if (do_code->op == EXEC_DO_CONCURRENT && do_code->expr1)
+	gfc_error ("OpenACC LOOP directive doesn't support DO CONCURRENT loops "
+		   "with mask expressions at %L", &do_code->loc);
       if (do_code->ext.iterator->var->ts.type != BT_INTEGER)
 	gfc_error ("!$ACC LOOP iteration variable must be of type integer at %L",
 		   &do_code->loc);
diff --git a/gcc/fortran/trans-openmp.c b/gcc/fortran/trans-openmp.c
index 29364f4..6997625 100644
--- a/gcc/fortran/trans-openmp.c
+++ b/gcc/fortran/trans-openmp.c
@@ -1571,11 +1571,258 @@ typedef struct dovar_init_d {
   tree init;
 } dovar_init;
 
+/* Helper function to generate a single for loop.  */
+static void
+gfc_trans_oacc_loop_generate_for (stmtblock_t *pblock, gfc_se *se, 
+				  gfc_expr *var_expr, gfc_expr *start_expr,
+				  gfc_expr *end_expr, gfc_expr *step_expr,
+				  int i, tree *init, tree *cond, tree *incr,
+				  vec<dovar_init>* inits)
+{
+  int simple = 0;
+  tree dovar, from, to, step, type, tmp, count = NULL_TREE;
+
+  /* Evaluate all the expressions.  */
+  gfc_init_se (se, NULL);
+  gfc_conv_expr_lhs (se, var_expr);
+  gfc_add_block_to_block (pblock, &se->pre);
+  dovar = se->expr;
+  type = TREE_TYPE (dovar);
+  gcc_assert (TREE_CODE (type) == INTEGER_TYPE);
+
+  gfc_init_se (se, NULL);
+  gfc_conv_expr_val (se, start_expr);
+  gfc_add_block_to_block (pblock, &se->pre);
+  from = gfc_evaluate_now (se->expr, pblock);
+
+  gfc_init_se (se, NULL);
+  gfc_conv_expr_val (se, end_expr);
+  gfc_add_block_to_block (pblock, &se->pre);
+  to = gfc_evaluate_now (se->expr, pblock);
+
+  gfc_init_se (se, NULL);
+  gfc_conv_expr_val (se, step_expr);
+  gfc_add_block_to_block (pblock, &se->pre);
+  step = gfc_evaluate_now (se->expr, pblock);
+
+  /* Special case simple loops.  */
+  if (TREE_CODE (dovar) == VAR_DECL)
+    {
+      if (integer_onep (step))
+	simple = 1;
+      else if (tree_int_cst_equal (step, integer_minus_one_node))
+	simple = -1;
+    }
+
+  /* Loop body.  */
+  if (simple)
+    {
+      TREE_VEC_ELT (*init, i) = build2_v (MODIFY_EXPR, dovar, from);
+      /* The condition should not be folded.  */
+      TREE_VEC_ELT (*cond, i) = build2_loc (input_location, simple > 0
+					    ? LE_EXPR : GE_EXPR,
+					    boolean_type_node, dovar, to);
+      TREE_VEC_ELT (*incr, i) = fold_build2_loc (input_location, PLUS_EXPR,
+						 type, dovar, step);
+      TREE_VEC_ELT (*incr, i) = fold_build2_loc (input_location,
+						 MODIFY_EXPR,
+						 type, dovar,
+						 TREE_VEC_ELT (*incr, i));
+    }
+  else
+    {
+      /* STEP is not 1 or -1.  Use:
+	 for (count = 0; count < (to + step - from) / step; count++)
+	   {
+	     dovar = from + count * step;
+	     body;
+	   cycle_label:;
+	   }  */
+      tmp = fold_build2_loc (input_location, MINUS_EXPR, type, step, from);
+      tmp = fold_build2_loc (input_location, PLUS_EXPR, type, to, tmp);
+      tmp = fold_build2_loc (input_location, TRUNC_DIV_EXPR, type, tmp,
+			     step);
+      tmp = gfc_evaluate_now (tmp, pblock);
+      count = gfc_create_var (type, "count");
+      TREE_VEC_ELT (*init, i) = build2_v (MODIFY_EXPR, count,
+					 build_int_cst (type, 0));
+      /* The condition should not be folded.  */
+      TREE_VEC_ELT (*cond, i) = build2_loc (input_location, LT_EXPR,
+					    boolean_type_node,
+					    count, tmp);
+      TREE_VEC_ELT (*incr, i) = fold_build2_loc (input_location, PLUS_EXPR,
+						 type, count,
+						 build_int_cst (type, 1));
+      TREE_VEC_ELT (*incr, i) = fold_build2_loc (input_location,
+						 MODIFY_EXPR, type, count,
+						 TREE_VEC_ELT (*incr, i));
+
+      /* Initialize DOVAR.  */
+      tmp = fold_build2_loc (input_location, MULT_EXPR, type, count, step);
+      tmp = fold_build2_loc (input_location, PLUS_EXPR, type, from, tmp);
+      dovar_init e = {dovar, tmp};
+      inits->safe_push (e);
+    }
+}
+
+/* Unlike OpenMP's one, OpenACC implementation supports DO CONCURRENT loops.
+   For each dovar in DO CONCURRENT loop it generates single for loop.
+   All generated for loops must be perfectly nested (and collapsed later). 
+   However, since mask expressions must be evaluated before the loop,
+   they are not allowed.
+
+   For example, if we have loop like
+
+   !$ACC LOOP
+   DO CONCURRENT (i=1:64:2,j=1:64:2,k=1:64:2)
+     body
+   ENDDO
+
+   The result must be like
+
+   #pragma acc loop collapse(3)
+   for(count.0=0; count.0<32; count.0=count.0+1)
+     for(count.1=0; count.1<32; count.1=count.1+1)
+       for(count.2=0; count.2<32; count.2=count.2+1)
+	 {
+	   i = count.0 * 2 + 1;
+	   j = count.1 * 2 + 1;
+	   k = count.2 * 2 + 1;
+	   body;
+	   cycle_label:;
+	 }
+   */
+static tree
+gfc_trans_oacc_loop (gfc_code *code, stmtblock_t *pblock,
+		     gfc_omp_clauses *loop_clauses)
+{
+  gfc_se se;
+  tree init, cond, incr, stmt, cycle_label, tmp, omp_clauses;
+  stmtblock_t block;
+  stmtblock_t body;
+  gfc_omp_clauses *clauses = code->ext.omp_clauses;
+  int i, collapse = clauses->collapse;
+  vec<dovar_init> inits = vNULL;
+  dovar_init *di;
+  unsigned ix;
+  gfc_code *old_code;
+
+  /* DO CONCURRENT specific vars.  */
+  gfc_forall_iterator *fai;
+  int nforloops = 0;
+  int current_for = 0;
+
+  if (collapse <= 0)
+    collapse = 1;
+
+  code = code->block->next;
+  gcc_assert (code->op == EXEC_DO || code->op == EXEC_DO_CONCURRENT);
+
+  if (pblock == NULL)
+    {
+      gfc_start_block (&block);
+      pblock = &block;
+    }
+
+  /* Calculate number of required for loops.  */
+  old_code = code;
+  for (i = 0; i < collapse; i++)
+    {
+      if (code->op == EXEC_DO)
+	nforloops++;
+      else if (code->op == EXEC_DO_CONCURRENT)
+	{
+	  gcc_assert (code->expr1 == NULL);
+	  for (fai = code->ext.forall_iterator; fai; fai = fai->next)
+	    nforloops++;
+	}
+      else
+	gcc_unreachable ();
+      code = code->block->next;
+    }
+  code = old_code;
+
+  /* Set the number of required for loops for collapse.  */
+  loop_clauses->collapse = nforloops;
+
+  omp_clauses = gfc_trans_omp_clauses (pblock, loop_clauses, code->loc);
+
+  init = make_tree_vec (nforloops);
+  cond = make_tree_vec (nforloops);
+  incr = make_tree_vec (nforloops);
+
+  for (i = 0; i < collapse; i++)
+    {
+      if (code->op == EXEC_DO)
+	gfc_trans_oacc_loop_generate_for (pblock, &se, code->ext.iterator->var,
+					  code->ext.iterator->start, 
+					  code->ext.iterator->end,
+					  code->ext.iterator->step,
+					  current_for++, &init, &cond, &incr,
+					  &inits);
+      else if (code->op == EXEC_DO_CONCURRENT)
+	for (fai = code->ext.forall_iterator; fai; fai = fai->next)
+	  gfc_trans_oacc_loop_generate_for (pblock, &se, fai->var, fai->start,
+					    fai->end, fai->stride, current_for++,
+					    &init, &cond, &incr, &inits);
+      else
+	gcc_unreachable ();
+      if (i + 1 < collapse)
+	code = code->block->next;
+    }
+
+  if (pblock != &block)
+    {
+      pushlevel ();
+      gfc_start_block (&block);
+    }
+
+  gfc_start_block (&body);
+
+  /* Generate complicated dovars.  */
+  FOR_EACH_VEC_ELT (inits, ix, di)
+    gfc_add_modify (&body, di->var, di->init);
+  inits.release ();
+
+  /* Cycle statement is implemented with a goto.  Exit statement must not be
+     present for this loop.  */
+  cycle_label = gfc_build_label_decl (NULL_TREE);
+
+  /* Put these labels where they can be found later.  */
+
+  code->cycle_label = cycle_label;
+  code->exit_label = NULL_TREE;
+
+  /* Main loop body.  */
+  tmp = gfc_trans_omp_code (code->block->next, true);
+  gfc_add_expr_to_block (&body, tmp);
+
+  /* Label for cycle statements (if needed).  */
+  if (TREE_USED (cycle_label))
+    {
+      tmp = build1_v (LABEL_EXPR, cycle_label);
+      gfc_add_expr_to_block (&body, tmp);
+    }
+
+  /* End of loop body.  */
+  stmt = make_node (OACC_LOOP);
+
+  TREE_TYPE (stmt) = void_type_node;
+  OMP_FOR_BODY (stmt) = gfc_finish_block (&body);
+  OMP_FOR_CLAUSES (stmt) = omp_clauses;
+  OMP_FOR_INIT (stmt) = init;
+  OMP_FOR_COND (stmt) = cond;
+  OMP_FOR_INCR (stmt) = incr;
+  gfc_add_expr_to_block (&block, stmt);
+
+  return gfc_finish_block (&block);
+}
+
 /* parallel loop and kernels loop. */
 static tree
 gfc_trans_oacc_combined_directive (gfc_code *code)
 {
-  stmtblock_t block;
+  stmtblock_t block, *pblock = NULL;
   gfc_omp_clauses construct_clauses, loop_clauses;
   tree stmt, oacc_clauses = NULL_TREE;
   enum tree_code construct_code;
@@ -1614,11 +1861,21 @@ gfc_trans_oacc_combined_directive (gfc_code *code)
       oacc_clauses = gfc_trans_omp_clauses (&block, &construct_clauses,
 					    code->loc);
     }
-    
-  gfc_error ("!$ACC LOOP directive not implemented yet %L", &code->loc);
-  stmt = gfc_trans_omp_code (code->block->next, true);
+  if (!loop_clauses.seq)
+    pblock = &block;
+  else
+    pushlevel ();
+  stmt = gfc_trans_oacc_loop (code, pblock, &loop_clauses);
+  if (TREE_CODE (stmt) != BIND_EXPR)
+    stmt = build3_v (BIND_EXPR, NULL, stmt, poplevel (1, 0));
+  else
+    poplevel (0, 0);
   stmt = build2_loc (input_location, construct_code, void_type_node, stmt,
 		     oacc_clauses);
+  if (code->op == EXEC_OACC_KERNELS_LOOP)
+    OACC_KERNELS_COMBINED (stmt) = 1;
+  else
+    OACC_PARALLEL_COMBINED (stmt) = 1;
   gfc_add_expr_to_block (&block, stmt);
   return gfc_finish_block (&block);
 }
@@ -2258,8 +2515,7 @@ gfc_trans_oacc_directive (gfc_code *code)
     case EXEC_OACC_HOST_DATA:
       return gfc_trans_oacc_construct (code);
     case EXEC_OACC_LOOP:
-      gfc_error ("!$ACC LOOP directive not implemented yet %L", &code->loc);
-      return NULL_TREE;
+      return gfc_trans_oacc_loop (code, NULL, code->ext.omp_clauses);
     case EXEC_OACC_UPDATE:
     case EXEC_OACC_WAIT:
     case EXEC_OACC_CACHE:
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index a7b93bc..c1b35d6 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -1557,7 +1557,7 @@ scan_sharing_clauses (tree clauses, omp_context *ctx)
 	case OMP_CLAUSE_REDUCTION:
 	  if (is_gimple_omp_oacc_specifically (ctx->stmt))
 	    {
-	      sorry ("clause not supported yet");
+	      sorry ("Clause not supported yet");
 	      break;
 	    }
 	case OMP_CLAUSE_LINEAR:
@@ -1613,7 +1613,7 @@ scan_sharing_clauses (tree clauses, omp_context *ctx)
 	case OMP_CLAUSE_IF:
 	  if (is_gimple_omp_oacc_specifically (ctx->stmt))
 	    {
-	      sorry ("clause not supported yet");
+	      sorry ("Clause not supported yet");
 	      break;
 	    }
 	case OMP_CLAUSE_FINAL:
@@ -1739,9 +1739,14 @@ scan_sharing_clauses (tree clauses, omp_context *ctx)
 	    }
 	  break;
 
-	case OMP_CLAUSE_NOWAIT:
 	case OMP_CLAUSE_ORDERED:
 	case OMP_CLAUSE_COLLAPSE:
+	  if (is_gimple_omp_oacc_specifically (ctx->stmt))
+	    {
+	      sorry ("Clause not supported yet");
+	      break;
+	    }
+	case OMP_CLAUSE_NOWAIT:
 	case OMP_CLAUSE_UNTIED:
 	case OMP_CLAUSE_MERGEABLE:
 	case OMP_CLAUSE_PROC_BIND:
@@ -1795,7 +1800,7 @@ scan_sharing_clauses (tree clauses, omp_context *ctx)
 	case OMP_CLAUSE_REDUCTION:
 	  if (is_gimple_omp_oacc_specifically (ctx->stmt))
 	    {
-	      sorry ("clause not supported yet");
+	      sorry ("Clause not supported yet");
 	      break;
 	    }
 	case OMP_CLAUSE_LINEAR:
@@ -1864,9 +1869,11 @@ scan_sharing_clauses (tree clauses, omp_context *ctx)
 	  break;
 
 	case OMP_CLAUSE_IF:
+	case OMP_CLAUSE_ORDERED:
+	case OMP_CLAUSE_COLLAPSE:
 	  if (is_gimple_omp_oacc_specifically (ctx->stmt))
 	    {
-	      sorry ("clause not supported yet");
+	      sorry ("Clause not supported yet");
 	      break;
 	    }
 	case OMP_CLAUSE_COPYPRIVATE:
@@ -1879,8 +1886,6 @@ scan_sharing_clauses (tree clauses, omp_context *ctx)
 	case OMP_CLAUSE_SCHEDULE:
 	case OMP_CLAUSE_DIST_SCHEDULE:
 	case OMP_CLAUSE_NOWAIT:
-	case OMP_CLAUSE_ORDERED:
-	case OMP_CLAUSE_COLLAPSE:
 	case OMP_CLAUSE_UNTIED:
 	case OMP_CLAUSE_FINAL:
 	case OMP_CLAUSE_MERGEABLE:
diff --git a/gcc/testsuite/gfortran.dg/goacc/loop-3.f95 b/gcc/testsuite/gfortran.dg/goacc/loop-3.f95
index 2a866c7..c13542c 100644
--- a/gcc/testsuite/gfortran.dg/goacc/loop-3.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/loop-3.f95
@@ -29,6 +29,11 @@ subroutine test1
   do concurrent (i = 1:5)
   end do
 
+  ! However, mask expressions are not allowed
+  !$acc loop
+  do concurrent (i = 1:5, i .eq. 1) ! { dg-error "mask expressions" }
+  end do
+
   !$acc loop
   outer_loop: do i = 1, 5
     inner_loop: do j = 1,5
diff --git a/gcc/testsuite/gfortran.dg/goacc/loop-4.f95 b/gcc/testsuite/gfortran.dg/goacc/loop-4.f95
new file mode 100644
index 0000000..92b7fd6
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/loop-4.f95
@@ -0,0 +1,16 @@
+! { dg-do compile }
+! { dg-additional-options "-fdump-tree-original -std=f2008" } 
+
+PROGRAM test
+  IMPLICIT NONE
+  INTEGER :: a(64), b(64), c(64), i, j, k
+  ! Must be replaced by three loops.
+  !$acc loop
+  DO CONCURRENT (i=1:64, j=1:64, k=1:64)
+    a(i) = b(j)
+    c(k) = b(j)
+  END DO
+END PROGRAM test
+! { dg-prune-output "sorry, unimplemented: Clause not supported yet" }
+! { dg-final { scan-tree-dump-times "collapse\\(3\\)" 1 "original" } } 
+! { dg-final { cleanup-tree-dump "original" } } 
diff --git a/gcc/testsuite/gfortran.dg/goacc/loop-tree.f95 b/gcc/testsuite/gfortran.dg/goacc/loop-tree.f95
new file mode 100644
index 0000000..ec1fb1f
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/loop-tree.f95
@@ -0,0 +1,50 @@
+! { dg-do compile } 
+! { dg-additional-options "-fdump-tree-original -std=f2008" } 
+
+! test for tree-dump-original and spaces-commas
+
+program test
+  implicit none
+  integer :: i, j, k, m, sum
+  REAL :: a(64), b(64), c(64)
+
+  !$acc kernels 
+  !$acc loop seq collapse(2)
+  DO i = 1,10
+    DO j = 1,10
+    ENDDO
+  ENDDO
+
+  !$acc loop independent gang (3)
+  DO i = 1,10
+    !$acc loop worker(3) ! { dg-error "work-sharing region may not be closely nested inside of work-sharing, critical, ordered, master or explicit task region" }
+    DO j = 1,10
+      !$acc loop vector(5)
+      DO k = 1,10
+      ENDDO
+    ENDDO
+  ENDDO
+  !$acc end kernels
+
+  sum = 0
+  !$acc parallel
+  !$acc loop private(m) reduction(+:sum)
+  DO i = 1,10
+    sum = sum + 1
+  ENDDO
+  !$acc end parallel
+
+end program test
+! { dg-prune-output "sorry, unimplemented: Clause not supported yet" }
+! { dg-final { scan-tree-dump-times "pragma acc loop" 5 "original" } } 
+
+! { dg-final { scan-tree-dump-times "ordered" 1 "original" } }
+! { dg-final { scan-tree-dump-times "collapse\\(2\\)" 1 "original" } } 
+! { dg-final { scan-tree-dump-times "independent" 1 "original" } } 
+! { dg-final { scan-tree-dump-times "gang\\(3\\)" 1 "original" } } 
+! { dg-final { scan-tree-dump-times "worker\\(3\\)" 1 "original" } } 
+! { dg-final { scan-tree-dump-times "vector\\(5\\)" 1 "original" } } 
+
+! { dg-final { scan-tree-dump-times "private\\(m\\)" 1 "original" } } 
+! { dg-final { scan-tree-dump-times "reduction\\(\\+:sum\\)" 1 "original" } } 
+! { dg-final { cleanup-tree-dump "original" } } 
\ No newline at end of file
diff --git a/gcc/tree-pretty-print.c b/gcc/tree-pretty-print.c
index 6c311790..59632e2 100644
--- a/gcc/tree-pretty-print.c
+++ b/gcc/tree-pretty-print.c
@@ -674,13 +674,15 @@ dump_omp_clause (pretty_printer *buffer, tree clause, int spc, int flags)
 
     case OMP_CLAUSE_WORKER:
       pp_string (buffer, "worker(");
-      dump_generic_node (buffer, OMP_CLAUSE_DECL (clause), spc, flags, false);
+      dump_generic_node (buffer, OMP_CLAUSE_WORKER_EXPR (clause), spc, flags, 
+			 false);
       pp_character(buffer, ')');
       break;
 
     case OMP_CLAUSE_VECTOR:
       pp_string (buffer, "vector(");
-      dump_generic_node (buffer, OMP_CLAUSE_DECL (clause), spc, flags, false);
+      dump_generic_node (buffer, OMP_CLAUSE_VECTOR_EXPR (clause), spc, flags,
+			 false);
       pp_character(buffer, ')');
       break;
 
-- 
1.8.3.2


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]