[gomp] Move openacc vector& worker single handling to RTL
Jakub Jelinek
jakub@redhat.com
Wed Jul 8 14:58:00 GMT 2015
On Wed, Jul 08, 2015 at 10:47:56AM -0400, Nathan Sidwell wrote:
> +/* Generate loop head markers in outer->inner order. */
> +
> +static void
> +gen_oacc_fork (gimple_seq *seq, unsigned mask)
> +{
> + {
> + // TODDO: Determine this information from the parallel region itself
TODO ?
> + // and emit it once in the offload function. Currently the target
> + // geometry definition is being extracted early. For now inform
> + // the backend we're using all axes of parallelism, which is a
> + // safe default.
> + gcall *call = gimple_build_call_internal
> + (IFN_GOACC_MODES, 1,
> + build_int_cst (unsigned_type_node,
> + OACC_LOOP_MASK (OACC_gang)
> + | OACC_LOOP_MASK (OACC_vector)
> + | OACC_LOOP_MASK (OACC_worker)));
The formatting is too ugly. I'd say you just want
tree arg = build_int_cst (unsigned_type_node,
OACC_LOOP_MASK (OACC_gang)
| OACC_LOOP_MASK (OACC_vector)
| OACC_LOOP_MASK (OACC_worker));
gcall *call = gimple_build_call_internal (IFN_GOACC_MODES, 1, arg);
> + | OACC_LOOP_MASK (OACC_vector)
> + for (level = OACC_gang; level != OACC_HWM; level++)
> + if (mask & OACC_LOOP_MASK (level))
> + {
> + tree arg = build_int_cst (unsigned_type_node, level);
> + gcall *call = gimple_build_call_internal
> + (IFN_GOACC_FORK, 1, arg);
Why the line-break? That should fit into 80 columns just fine.
> + gimple_seq_add_stmt (seq, call);
> + }
> +}
> +
> +/* Generate loop tail markers in inner->outer order. */
> +
> +static void
> +gen_oacc_join (gimple_seq *seq, unsigned mask)
> +{
> + unsigned level;
> +
> + for (level = OACC_HWM; level-- != OACC_gang; )
> + if (mask & OACC_LOOP_MASK (level))
> + {
> + tree arg = build_int_cst (unsigned_type_node, level);
> + gcall *call = gimple_build_call_internal
> + (IFN_GOACC_JOIN, 1, arg);
> + gimple_seq_add_stmt (seq, call);
> + }
> +}
>
> /* Find the mapping for DECL in CTX or the immediately enclosing
> context that has a mapping for DECL.
> @@ -6777,21 +6808,6 @@ expand_omp_for_generic (struct omp_regio
> }
> }
>
> -
> -/* True if a barrier is needed after a loop partitioned over
> - gangs/workers/vectors as specified by GWV_BITS. OpenACC semantics specify
> - that a (conceptual) barrier is needed after worker and vector-partitioned
> - loops, but not after gang-partitioned loops. Currently we are relying on
> - warp reconvergence to synchronise threads within a warp after vector loops,
> - so an explicit barrier is not helpful after those. */
> -
> -static bool
> -oacc_loop_needs_threadbarrier_p (int gwv_bits)
> -{
> - return !(gwv_bits & OACC_LOOP_MASK (OACC_gang))
> - && (gwv_bits & OACC_LOOP_MASK (OACC_worker));
> -}
> -
> /* A subroutine of expand_omp_for. Generate code for a parallel
> loop with static schedule and no specified chunk size. Given
> parameters:
> @@ -6800,6 +6816,7 @@ oacc_loop_needs_threadbarrier_p (int gwv
>
> where COND is "<" or ">", we generate pseudocode
>
> + OACC_FORK
> if ((__typeof (V)) -1 > 0 && N2 cond N1) goto L2;
> if (cond is <)
> adj = STEP - 1;
> @@ -6827,6 +6844,11 @@ oacc_loop_needs_threadbarrier_p (int gwv
> V += STEP;
> if (V cond e) goto L1;
> L2:
> + OACC_JOIN
> +
> + It'd be better to place the OACC_LOOP markers just inside the outer
> + conditional, so they can be entirely eliminated if the loop is
> + unreachable.
Putting OACC_FORK/OACC_JOIN unconditionally into the comment is very
confusing. The expand_omp_for_static_nochunk routine is used for
#pragma omp for schedule(static), #pragma omp distribute etc. which
certainly don't want to emit such markers in there. So perhaps mention
somewhere that you wrap all the above sequence in between
OACC_FORK/OACC_JOIN markers.
> @@ -7220,6 +7249,7 @@ find_phi_with_arg_on_edge (tree arg, edg
>
> where COND is "<" or ">", we generate pseudocode
>
> +OACC_FORK
> if ((__typeof (V)) -1 > 0 && N2 cond N1) goto L2;
> if (cond is <)
> adj = STEP - 1;
> @@ -7230,6 +7260,7 @@ find_phi_with_arg_on_edge (tree arg, edg
> else
> n = (adj + N2 - N1) / STEP;
> trip = 0;
> +
> V = threadid * CHUNK * STEP + N1; -- this extra definition of V is
> here so that V is defined
> if the loop is not entered
> @@ -7248,6 +7279,7 @@ find_phi_with_arg_on_edge (tree arg, edg
> trip += 1;
> goto L0;
> L4:
> +OACC_JOIN
> */
Likewise.
>
> static void
> @@ -7281,10 +7313,6 @@ expand_omp_for_static_chunk (struct omp_
> gcc_assert (EDGE_COUNT (iter_part_bb->succs) == 2);
> fin_bb = BRANCH_EDGE (iter_part_bb)->dest;
>
> - /* Broadcast variables to OpenACC threads. */
> - entry_bb = oacc_broadcast (entry_bb, fin_bb, region);
> - region->entry = entry_bb;
> -
> gcc_assert (broken_loop
> || fin_bb == FALLTHRU_EDGE (cont_bb)->dest);
> seq_start_bb = split_edge (FALLTHRU_EDGE (iter_part_bb));
> @@ -7296,7 +7324,7 @@ expand_omp_for_static_chunk (struct omp_
> trip_update_bb = split_edge (FALLTHRU_EDGE (cont_bb));
> }
> exit_bb = region->exit;
> -
> +
Please avoid such whitespace changes.
In any case, as it is a gomp-4_0-branch patch, I'll defer full review to the
branch maintainers.
Jakub
More information about the Gcc-patches
mailing list