This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[gomp4] auto partitioning
- From: Nathan Sidwell <nathan at acm dot org>
- To: GCC Patches <gcc-patches at gcc dot gnu dot org>
- Date: Mon, 19 Oct 2015 13:23:28 -0400
- Subject: [gomp4] auto partitioning
- Authentication-results: sourceware.org; auth=none
I've committed this patch to gomp4 branch.
It implements handling of the 'auto' clause on a loop. Such loops can be
implicitly partitioned, if they are (explicitly or implicitly) 'independent'.
This patch walks the loop structure after explicit partitioning has been
handled, and attempts to allocate a partitioning for such auto loops. If
there's no available partitioning, a diagnostic is emitted.
Auto partitioning caused a failure of a collapse testcase. I considered this a
latent bug and forced that testcase to retain the original behaviour of a 'seq'
loop.
nathan
2015-10-19 Nathan Sidwell <nathan@codesourcery.com>
gcc/
* omp-low.c (oacc_loop_auto_partitions): New.
(oacc_loop_partition): Call it.
gcc/testsuite/
* gfortran.dg/goacc/routine-4.f90: Add diagnostic.
* gfortran.dg/goacc/routine-5.f90: Add diagnostic.
* c-c++-common/goacc-gomp/nesting-1.c: Add diagnostic.
* c-c++-common/goacc/routine-6.c: Add diagnostic.
* c-c++-common/goacc/routine-7.c: Add diagnostic.
libgomp/
* testsuite/libgomp.oacc-c-c++-common/collapse-2.c: Force
serialization.
Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c (revision 228960)
+++ gcc/omp-low.c (working copy)
@@ -16244,6 +16244,50 @@ oacc_loop_fixed_partitions (oacc_loop *l
return has_auto;
}
+/* Walk the OpenACC loop heirarchy to assign auto-partitioned loops.
+ OUTER_MASK is the partitioning this loop is contained within.
+ Return the cumulative partitioning used by this loop, siblings and
+ children. */
+
+static unsigned
+oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask)
+{
+ unsigned inner_mask = 0;
+ bool noisy = true;
+
+#ifdef ACCEL_COMPILER
+ /* When device_type is supported, we want the device compiler to be
+ noisy, if the loop parameters are device_type-specific. */
+ noisy = false;
+#endif
+
+ if (loop->child)
+ inner_mask |= oacc_loop_auto_partitions (loop->child,
+ outer_mask | loop->mask);
+
+ if ((loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT))
+ {
+ unsigned this_mask = 0;
+
+ /* Pick the innermost free partitioning. */
+ this_mask = inner_mask | GOMP_DIM_MASK (GOMP_DIM_MAX);
+ this_mask = (this_mask & -this_mask) >> 1;
+ this_mask &= ~outer_mask;
+
+ if (!this_mask && noisy)
+ warning_at (loop->loc, 0,
+ "insufficient parallelism available to partition loop");
+
+ loop->mask = this_mask;
+ }
+ inner_mask |= loop->mask;
+
+ if (loop->sibling)
+ inner_mask |= oacc_loop_auto_partitions (loop->sibling, outer_mask);
+
+ return inner_mask;
+}
+
/* Walk the OpenACC loop heirarchy to check and assign partitioning
axes. */
@@ -16255,7 +16299,8 @@ oacc_loop_partition (oacc_loop *loop, in
if (fn_level >= 0)
outer_mask = GOMP_DIM_MASK (fn_level) - 1;
- oacc_loop_fixed_partitions (loop, outer_mask);
+ if (oacc_loop_fixed_partitions (loop, outer_mask))
+ oacc_loop_auto_partitions (loop, outer_mask);
}
/* Default launch dimension validator. Force everything to 1. A
Index: gcc/testsuite/gfortran.dg/goacc/routine-4.f90
===================================================================
--- gcc/testsuite/gfortran.dg/goacc/routine-4.f90 (revision 228960)
+++ gcc/testsuite/gfortran.dg/goacc/routine-4.f90 (working copy)
@@ -44,7 +44,7 @@ program main
!
!$acc parallel copy (a)
- !$acc loop
+ !$acc loop ! { dg-warning "insufficient parallelism" }
do i = 1, N
call gang (a)
end do
Index: gcc/testsuite/gfortran.dg/goacc/routine-5.f90
===================================================================
--- gcc/testsuite/gfortran.dg/goacc/routine-5.f90 (revision 228960)
+++ gcc/testsuite/gfortran.dg/goacc/routine-5.f90 (working copy)
@@ -87,7 +87,7 @@ subroutine seq (a)
integer, intent (inout) :: a(N)
integer :: i
- !$acc loop
+ !$acc loop ! { dg-warning "insufficient parallelism" }
do i = 1, N
a(i) = a(i) - a(i)
end do
Index: gcc/testsuite/c-c++-common/goacc-gomp/nesting-1.c
===================================================================
--- gcc/testsuite/c-c++-common/goacc-gomp/nesting-1.c (revision 228960)
+++ gcc/testsuite/c-c++-common/goacc-gomp/nesting-1.c (working copy)
@@ -20,7 +20,7 @@ f_acc_kernels (void)
}
}
-#pragma acc routine
+#pragma acc routine vector
void
f_acc_loop (void)
{
Index: gcc/testsuite/c-c++-common/goacc/routine-7.c
===================================================================
--- gcc/testsuite/c-c++-common/goacc/routine-7.c (revision 228960)
+++ gcc/testsuite/c-c++-common/goacc/routine-7.c (working copy)
@@ -74,7 +74,7 @@ vector (int red)
int
seq (int red)
{
-#pragma acc loop reduction (+:red)
+#pragma acc loop reduction (+:red) // { dg-warning "insufficient parallelism" }
for (int i = 0; i < 10; i++)
red ++;
Index: gcc/testsuite/c-c++-common/goacc/routine-6.c
===================================================================
--- gcc/testsuite/c-c++-common/goacc/routine-6.c (revision 228960)
+++ gcc/testsuite/c-c++-common/goacc/routine-6.c (working copy)
@@ -36,7 +36,7 @@ main ()
#pragma acc parallel copy (red)
{
/* Independent/seq loop tests. */
-#pragma acc loop reduction (+:red)
+#pragma acc loop reduction (+:red) // { dg-warning "insufficient parallelism" }
for (int i = 0; i < 10; i++)
red += gang ();
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/collapse-2.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/collapse-2.c (revision 228960)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/collapse-2.c (working copy)
@@ -9,7 +9,7 @@ main (void)
int m1 = 4, m2 = -5, m3 = 17;
#pragma acc parallel copy(l)
- #pragma acc loop collapse(3) reduction(+:l)
+ #pragma acc loop seq collapse(3) reduction(+:l)
for (i = -2; i < m1; i++)
for (j = m2; j < -2; j++)
{