[gomp] Move openacc vector& worker single handling to RTL

Jakub Jelinek jakub@redhat.com
Wed Jul 8 14:58:00 GMT 2015


On Wed, Jul 08, 2015 at 10:47:56AM -0400, Nathan Sidwell wrote:
> +/* Generate loop head markers in outer->inner order.  */
> +
> +static void
> +gen_oacc_fork (gimple_seq *seq, unsigned mask)
> +{
> +  {
> +    // TODDO: Determine this information from the parallel region itself

TODO ?

> +    // and emit it once in the offload function.  Currently the target
> +    // geometry definition is being extracted early.  For now inform
> +    // the backend we're using all axes of parallelism, which is a
> +    // safe default.
> +    gcall *call = gimple_build_call_internal
> +      (IFN_GOACC_MODES, 1, 
> +       build_int_cst (unsigned_type_node,
> +		      OACC_LOOP_MASK (OACC_gang)
> +		      | OACC_LOOP_MASK (OACC_vector)
> +		      | OACC_LOOP_MASK (OACC_worker)));

The formatting is too ugly.  I'd say you just want

    tree arg = build_int_cst (unsigned_type_node,
			      OACC_LOOP_MASK (OACC_gang)
			      | OACC_LOOP_MASK (OACC_vector)
			      | OACC_LOOP_MASK (OACC_worker));
    gcall *call = gimple_build_call_internal (IFN_GOACC_MODES, 1, arg);
> +                   | OACC_LOOP_MASK (OACC_vector)   

> +  for (level = OACC_gang; level != OACC_HWM; level++)
> +    if (mask & OACC_LOOP_MASK (level))
> +      {
> +	tree arg = build_int_cst (unsigned_type_node, level);
> +	gcall *call = gimple_build_call_internal
> +	  (IFN_GOACC_FORK, 1, arg);

Why the line-break?  That should fit into 80 columns just fine.

> +	gimple_seq_add_stmt (seq, call);
> +      }
> +}
> +
> +/* Generate loop tail markers in inner->outer order.  */
> +
> +static void
> +gen_oacc_join (gimple_seq *seq, unsigned mask)
> +{
> +  unsigned level;
> +
> +  for (level = OACC_HWM; level-- != OACC_gang; )
> +    if (mask & OACC_LOOP_MASK (level))
> +      {
> +	tree arg = build_int_cst (unsigned_type_node, level);
> +	gcall *call = gimple_build_call_internal
> +	  (IFN_GOACC_JOIN, 1, arg);
> +	gimple_seq_add_stmt (seq, call);
> +      }
> +}
>  
>  /* Find the mapping for DECL in CTX or the immediately enclosing
>     context that has a mapping for DECL.
> @@ -6777,21 +6808,6 @@ expand_omp_for_generic (struct omp_regio
>      }
>  }
>  
> -
> -/* True if a barrier is needed after a loop partitioned over
> -   gangs/workers/vectors as specified by GWV_BITS.  OpenACC semantics specify
> -   that a (conceptual) barrier is needed after worker and vector-partitioned
> -   loops, but not after gang-partitioned loops.  Currently we are relying on
> -   warp reconvergence to synchronise threads within a warp after vector loops,
> -   so an explicit barrier is not helpful after those.  */
> -
> -static bool
> -oacc_loop_needs_threadbarrier_p (int gwv_bits)
> -{
> -  return !(gwv_bits & OACC_LOOP_MASK (OACC_gang))
> -    && (gwv_bits & OACC_LOOP_MASK (OACC_worker));
> -}
> -
>  /* A subroutine of expand_omp_for.  Generate code for a parallel
>     loop with static schedule and no specified chunk size.  Given
>     parameters:
> @@ -6800,6 +6816,7 @@ oacc_loop_needs_threadbarrier_p (int gwv
>  
>     where COND is "<" or ">", we generate pseudocode
>  
> +  OACC_FORK
>  	if ((__typeof (V)) -1 > 0 && N2 cond N1) goto L2;
>  	if (cond is <)
>  	  adj = STEP - 1;
> @@ -6827,6 +6844,11 @@ oacc_loop_needs_threadbarrier_p (int gwv
>  	V += STEP;
>  	if (V cond e) goto L1;
>      L2:
> + OACC_JOIN
> +
> + It'd be better to place the OACC_LOOP markers just inside the outer
> + conditional, so they can be entirely eliminated if the loop is
> + unreachable.

Putting OACC_FORK/OACC_JOIN unconditionally into the comment is very
confusing.  The expand_omp_for_static_nochunk routine is used for
#pragma omp for schedule(static), #pragma omp distribute etc. which
certainly don't want to emit such markers in there.  So perhaps mention
somewhere that you wrap all the above sequence in between
OACC_FORK/OACC_JOIN markers.

> @@ -7220,6 +7249,7 @@ find_phi_with_arg_on_edge (tree arg, edg
>  
>     where COND is "<" or ">", we generate pseudocode
>  
> +OACC_FORK
>  	if ((__typeof (V)) -1 > 0 && N2 cond N1) goto L2;
>  	if (cond is <)
>  	  adj = STEP - 1;
> @@ -7230,6 +7260,7 @@ find_phi_with_arg_on_edge (tree arg, edg
>  	else
>  	  n = (adj + N2 - N1) / STEP;
>  	trip = 0;
> +
>  	V = threadid * CHUNK * STEP + N1;  -- this extra definition of V is
>  					      here so that V is defined
>  					      if the loop is not entered
> @@ -7248,6 +7279,7 @@ find_phi_with_arg_on_edge (tree arg, edg
>  	trip += 1;
>  	goto L0;
>      L4:
> +OACC_JOIN
>  */

Likewise.
>  
>  static void
> @@ -7281,10 +7313,6 @@ expand_omp_for_static_chunk (struct omp_
>    gcc_assert (EDGE_COUNT (iter_part_bb->succs) == 2);
>    fin_bb = BRANCH_EDGE (iter_part_bb)->dest;
>  
> -  /* Broadcast variables to OpenACC threads.  */
> -  entry_bb = oacc_broadcast (entry_bb, fin_bb, region);
> -  region->entry = entry_bb;
> -
>    gcc_assert (broken_loop
>  	      || fin_bb == FALLTHRU_EDGE (cont_bb)->dest);
>    seq_start_bb = split_edge (FALLTHRU_EDGE (iter_part_bb));
> @@ -7296,7 +7324,7 @@ expand_omp_for_static_chunk (struct omp_
>        trip_update_bb = split_edge (FALLTHRU_EDGE (cont_bb));
>      }
>    exit_bb = region->exit;
> -
> +  

Please avoid such whitespace changes.

In any case, as it is a gomp-4_0-branch patch, I'll defer full review to the
branch maintainers.

	Jakub



More information about the Gcc-patches mailing list