OpenACC middle end changes

Thu Nov 13 18:31:00 GMT 2014

On Thu, Nov 13, 2014 at 05:59:11PM +0100, Thomas Schwinge wrote:
>   * should gcc/oacc-builtins.def just be merged into
>     gcc/omp-builtins.def;

Why not.  The reason why they aren't in gcc/builtins.def is that
the Fortran FE doesn't source those, but OpenACC supports the same
languages as OpenMP.

> --- gcc/builtins.c
> +++ gcc/builtins.c
> @@ -5751,6 +5751,49 @@ expand_stack_save (void)
>    return ret;
>  }
>  
> +
> +/* Expand OpenACC acc_on_device.
> +
> +   This has to happen late (that is, not in early folding; expand_builtin_*,
> +   rather than fold_builtin_*), as we have to act differently for host and
> +   acceleration device (ACCEL_COMPILER conditional).  */
> +
> +static rtx
> +expand_builtin_acc_on_device (tree exp, rtx target ATTRIBUTE_UNUSED)
> +{
> +  if (!validate_arglist (exp, INTEGER_TYPE, VOID_TYPE))
> +    return NULL_RTX;
> +
> +  tree arg, v1, v2, ret;
> +  location_t loc;
> +
> +  arg = CALL_EXPR_ARG (exp, 0);
> +  arg = builtin_save_expr (arg);
> +  loc = EXPR_LOCATION (exp);
> +
> +  /* Build: (arg == v1 || arg == v2) ? 1 : 0.  */
> +
> +#ifdef ACCEL_COMPILER
> +  v1 = build_int_cst (TREE_TYPE (arg), /* TODO: acc_device_not_host */ 3);
> +  v2 = build_int_cst (TREE_TYPE (arg), ACCEL_COMPILER_acc_device);
> +#else
> +  v1 = build_int_cst (TREE_TYPE (arg), /* TODO: acc_device_none */ 0);
> +  v2 = build_int_cst (TREE_TYPE (arg), /* TODO: acc_device_host */ 2);
> +#endif
> +
> +  v1 = fold_build2_loc (loc, EQ_EXPR, integer_type_node, arg, v1);
> +  v2 = fold_build2_loc (loc, EQ_EXPR, integer_type_node, arg, v2);
> +
> +  /* Can't use TRUTH_ORIF_EXPR, as that is not supported by
> +     expand_expr_real*.  */
> +  ret = fold_build3_loc (loc, COND_EXPR, integer_type_node, v1, v1, v2);
> +  ret = fold_build3_loc (loc, COND_EXPR, integer_type_node,
> +			 ret, integer_one_node, integer_zero_node);
> +
> +  return expand_normal (ret);

If you can't fold it late (which is indeed a problem for -O0),
then I'd suggest to implement this more RTL-ish.
So, avoid the builtin_save_expr, instead
  rtx op = expand_normal (arg);
Don't build v1/v2 as trees (and, please fix the TODOs), but rtxes,
just
  rtx v1 = GEN_INT (...);
  rtx v2 = GEN_INT (...);
  machine_mode mode = TYPE_MODE (TREE_TYPE (arg));
  rtx ret = gen_reg_rtx (TYPE_MODE (integer_type_node));
  emit_move_insn (ret, const0_rtx);
  rtx_code_label *done_label = gen_label_rtx ();
  emit_cmp_and_jump_insns (op, v1, NE, NULL_RTX, mode,
			   false, done_label, PROB_EVEN);
  emit_cmp_and_jump_insns (op, v2, NE, NULL_RTX, mode,
			   false, done_label, PROB_EVEN);
  emit_move_insn (ret, const1_rtx);
  emit_label (done_label);
  return ret;
or similar.

Note, it would still be worthwhile to fold the builtin, at least
when optimizing, after IPA.  Dunno if we have some property you can check,
and Richard B. could suggest where it would be most appropriate (if GIMPLE
guarded match.pd entry, or what), gimple_fold, etc.

I bet I should handle omp_is_initial_device (); similarly.

> @@ -1818,7 +1818,7 @@ There are also several varieties of complex statements.
>  * Empty Statements::
>  * Jumps::
>  * Cleanups::
> -* OpenMP::
> +* OpenACC and OpenMP::

I think it might be better just to have separate sections for each, not
put them into the same.  Start with OpenMP, and in OpenACC section put the
OACC specific stuff and say what is shared with OpenMP (clauses, etc.).

> --- gcc/doc/gimple.texi
> +++ gcc/doc/gimple.texi
> @@ -439,6 +439,8 @@ The following table briefly describes the GIMPLE instruction set.
>  @item @code{GIMPLE_GOTO}		@tab x			@tab x
>  @item @code{GIMPLE_LABEL}		@tab x			@tab x
>  @item @code{GIMPLE_NOP}			@tab x			@tab x
> +@item @code{GIMPLE_OACC_KERNELS}	@tab x			@tab x
> +@item @code{GIMPLE_OACC_PARALLEL}	@tab x			@tab x
>  @item @code{GIMPLE_OMP_ATOMIC_LOAD}	@tab x			@tab x
>  @item @code{GIMPLE_OMP_ATOMIC_STORE}	@tab x			@tab x
>  @item @code{GIMPLE_OMP_CONTINUE}	@tab x			@tab x
> @@ -1006,6 +1008,8 @@ Return a deep copy of statement @code{STMT}.
>  * @code{GIMPLE_EH_FILTER}::
>  * @code{GIMPLE_LABEL}::
>  * @code{GIMPLE_NOP}::
> +* @code{GIMPLE_OACC_KERNELS}::
> +* @code{GIMPLE_OACC_PARALLEL}::
>  * @code{GIMPLE_OMP_ATOMIC_LOAD}::
>  * @code{GIMPLE_OMP_ATOMIC_STORE}::
>  * @code{GIMPLE_OMP_CONTINUE}::

This will likely change, right?

> --- gcc/gimple-pretty-print.c
> +++ gcc/gimple-pretty-print.c
> @@ -1136,18 +1136,21 @@ dump_gimple_omp_for (pretty_printer *buffer, gimple gs, int spc, int flags)
>  	case GF_OMP_FOR_KIND_FOR:
>  	  kind = "";
>  	  break;
> -	case GF_OMP_FOR_KIND_SIMD:
> -	  kind = " simd";
> -	  break;
> -	case GF_OMP_FOR_KIND_CILKSIMD:
> -	  kind = " cilksimd";
> -	  break;
>  	case GF_OMP_FOR_KIND_DISTRIBUTE:
>  	  kind = " distribute";
>  	  break;
>  	case GF_OMP_FOR_KIND_CILKFOR:
>  	  kind = " _Cilk_for";
>  	  break;
> +	case GF_OMP_FOR_KIND_OACC_LOOP:
> +	  kind = " oacc_loop";
> +	  break;
> +	case GF_OMP_FOR_KIND_SIMD:
> +	  kind = " simd";
> +	  break;
> +	case GF_OMP_FOR_KIND_CILKSIMD:
> +	  kind = " cilksimd";
> +	  break;

Why the reshuffling?  The result isn't alphabetically sorted
anyway.  I'd just add new stuff at the end ;)

> @@ -1684,7 +1718,12 @@ gimple_copy (gimple stmt)
>  	  gimple_try_set_cleanup (copy, new_seq);
>  	  break;
>  
> +	case GIMPLE_OACC_KERNELS:
> +	case GIMPLE_OACC_PARALLEL:
> +          gcc_unreachable ();
> +
>  	case GIMPLE_OMP_FOR:
> +	  gcc_assert (!is_gimple_omp_oacc_specifically (stmt));
>  	  new_seq = gimple_seq_copy (gimple_omp_for_pre_body (stmt));
>  	  gimple_omp_for_set_pre_body (copy, new_seq);
>  	  t = unshare_expr (gimple_omp_for_clauses (stmt));
> @@ -1754,6 +1793,7 @@ gimple_copy (gimple stmt)
>  	case GIMPLE_OMP_TASKGROUP:
>  	case GIMPLE_OMP_ORDERED:
>  	copy_omp_body:
> +	  gcc_assert (!is_gimple_omp_oacc_specifically (stmt));
>  	  new_seq = gimple_seq_copy (gimple_omp_body (stmt));
>  	  gimple_omp_set_body (copy, new_seq);
>  	  break;

Why are you so afraid to support oacc in gimple_copy?

> +/* GIMPLE_OACC_PARALLEL */
> +struct GTY((tag("GSS_OMP_PARALLEL_LAYOUT")))
> +  gimple_statement_oacc_parallel : public gimple_statement_omp_parallel_layout
> +{
> +    /* No extra fields; adds invariant:
> +         stmt->code == GIMPLE_OACC_PARALLEL.  */

Formatting, there is too big indentation.  That said, this will be replaced
with GF_OMP_TARGET_KIND_*, right?

> +static inline tree
> +gimple_oacc_kernels_clauses (const_gimple gs)
> +{
> +  const gimple_statement_oacc_kernels *oacc_kernels_stmt =
> +    as_a <const gimple_statement_oacc_kernels *> (gs);

Wrong formatting (many times, = goes on the second line.
Seems you probably just copied it from preexisting bad formatting,
we should ping Dave Malcolm to fix up his scripts.

> +/* Return true if STMT is any of the OpenACC types specifically.  */
> +
> +static inline bool
> +is_gimple_omp_oacc_specifically (const_gimple stmt)

Why not is_gimple_oacc or gimple_oacc_p ?

> @@ -99,15 +103,16 @@ enum gimplify_omp_var_data
>  
>  enum omp_region_type
>  {
> -  ORT_WORKSHARE = 0,
> -  ORT_SIMD = 1,
> -  ORT_PARALLEL = 2,
> -  ORT_COMBINED_PARALLEL = 3,
> -  ORT_TASK = 4,
> -  ORT_UNTIED_TASK = 5,
> -  ORT_TEAMS = 8,
> -  ORT_TARGET_DATA = 16,
> -  ORT_TARGET = 32
> +  /* An undefined region type.  */
> +  ORT_INVALID = 0,
> +
> +  ORT_WORKSHARE,
> +  ORT_SIMD,
> +  ORT_PARALLEL,
> +  ORT_COMBINED_PARALLEL,
> +  ORT_TASK,
> +  ORT_TEAMS,
> +  ORT_TARGET
>  };

If you want to shift from bitmasks in the enum
to extra on the side bits (why?), then combined
for parallel is another thing.

> @@ -356,7 +376,7 @@ splay_tree_compare_decl_uid (splay_tree_key xa, splay_tree_key xb)
>  /* Create a new omp construct that deals with variable remapping.  */
>  
>  static struct gimplify_omp_ctx *
> -new_omp_context (enum omp_region_type region_type)
> +new_omp_context (void)

But I don't like this match, if you really want on the side
integer, just add another argument to the function?

> @@ -5938,19 +5978,17 @@ omp_check_private (struct gimplify_omp_ctx *ctx, tree decl, bool copyprivate)
>    return false;
>  }
>  
> -/* Scan the OpenMP clauses in *LIST_P, installing mappings into a new
> -   and previous omp contexts.  */
> +/* Scan the clauses in *LIST_P, installing mappings into CTX as well as outer
> +   contexts, if applicable.  Before returning, CTX will also be pushed to the
> +   top of GIMPLIFY_OMP_CTXP.  */
>  
>  static void
>  gimplify_scan_omp_clauses (tree *list_p, gimple_seq *pre_p,
> -			   enum omp_region_type region_type)
> +			   struct gimplify_omp_ctx *ctx)
>  {
> -  struct gimplify_omp_ctx *ctx, *outer_ctx;
> +  struct gimplify_omp_ctx *outer_ctx = ctx->outer_context;
>    tree c;
>  
> -  ctx = new_omp_context (region_type);
> -  outer_ctx = ctx->outer_context;
> -

Why?

> @@ -233,6 +242,90 @@ static tree scan_omp_1_op (tree *, int *, void *);
>        *handled_ops_p = false; \
>        break;
>  
> +/* Helper function to get the reduction array name */
> +static const char *
> +omp_get_id (tree node)

Be more specific in the function name what it is for?

> +{
> +  const char *id = IDENTIFIER_POINTER (DECL_NAME (node));
> +  int len = strlen ("omp$") + strlen (id);
> +  char *temp_name = (char *)alloca (len+1);
> +  snprintf (temp_name, len+1, "gfc$%s", id);

gfc$ ?
Use
  char *temp_name = XALLOCAVEC (char, len + 1);
instead?

> +  return IDENTIFIER_POINTER(get_identifier (temp_name));

Formatting (missing space before ( ).

> @@ -868,6 +981,25 @@ maybe_lookup_field (tree var, omp_context *ctx)
>    return n ? (tree) n->value : NULL_TREE;
>  }
>  
> +static inline tree
> +lookup_reduction (const char *id, omp_context *ctx)

Can't you use oacc_ in the name of OpenACC specific functions?

>  	  by_ref = use_pointer_for_field (decl, NULL);
>  	  install_var_field (decl, by_ref, 3, ctx);
>  	  break;
>  
>  	case OMP_CLAUSE_DEFAULT:
> +	  gcc_assert (!is_gimple_omp_oacc_specifically (ctx->stmt));
>  	  ctx->default_kind = OMP_CLAUSE_DEFAULT_KIND (c);
>  	  break;

For clauses which are not parsed for OpenACC directives I find
the gcc_assert (!is_gimple_omp_oacc_specifically (ctx->stmt));
completely unnecessary.

>  	case OMP_CLAUSE_FINAL:
> -	case OMP_CLAUSE_IF:
>  	case OMP_CLAUSE_NUM_THREADS:
>  	case OMP_CLAUSE_NUM_TEAMS:
>  	case OMP_CLAUSE_THREAD_LIMIT:
> @@ -1625,13 +1799,41 @@ scan_sharing_clauses (tree clauses, omp_context *ctx)
>  	case OMP_CLAUSE_DIST_SCHEDULE:
>  	case OMP_CLAUSE_DEPEND:
>  	case OMP_CLAUSE__CILK_FOR_COUNT_:
> +	  gcc_assert (!is_gimple_omp_oacc_specifically (ctx->stmt));
> +	  /* FALLTHRU */
> +	case OMP_CLAUSE_IF:

Then you can avoid reshuffling like this.  And, even if there are
some spots you want to keep them in for now, consider gcc_checking_assert
instead.

> +	  {
> +	    error_at (gimple_location (stmt),
> +		      "may not be nested");

Fits on one line?
> +	if (is_gimple_omp (ctx_->stmt)
> +	    && is_gimple_omp_oacc_specifically (ctx_->stmt))
> +	  {
> +	    error_at (gimple_location (stmt),
> +		      "may not be nested");

Likewise.

> @@ -4162,6 +4572,8 @@ static void
>  lower_copyprivate_clauses (tree clauses, gimple_seq *slist, gimple_seq *rlist,
>  			    omp_context *ctx)
>  {
> +  gcc_assert (!is_gimple_omp_oacc_specifically (ctx->stmt));
> +
>    tree c;
>  
>    for (c = clauses; c ; c = OMP_CLAUSE_CHAIN (c))
> @@ -4212,6 +4624,8 @@ static void
>  lower_send_clauses (tree clauses, gimple_seq *ilist, gimple_seq *olist,
>      		    omp_context *ctx)
>  {
> +  gcc_assert (!is_gimple_omp_oacc_specifically (ctx->stmt));
> +
>    tree c;
>  
>    for (c = clauses; c ; c = OMP_CLAUSE_CHAIN (c))

Like I said above.

> -      gcc_assert (stmt && gimple_code (stmt) == GIMPLE_OMP_TARGET
> -		  && gimple_omp_target_kind (stmt)
> -		     == GF_OMP_TARGET_KIND_REGION);
> +      gcc_assert (stmt &&
> +		  gimple_code (stmt) == gimple_code (entry_stmt));

Formatting error, && should be on the next line.

> @@ -8475,14 +9030,16 @@ expand_omp_target (struct omp_region *region)
>        tree tmp_var;
>  
>        tmp_var = create_tmp_var (TREE_TYPE (device), NULL);
> -      if (kind != GF_OMP_TARGET_KIND_REGION)
> +      if (offloaded)
> +	{
> +	  e = split_block (new_bb, NULL);
> +	}
> +      else

Formatting, no {} around single line then statement.

> @@ -8834,6 +9492,397 @@ make_pass_expand_omp (gcc::context *ctxt)
> 
>  /* Routines to lower OpenMP directives into OMP-GIMPLE.  */
>  
> +/* Helper function to preform, potentially COMPLEX_TYPE, operation and
> +   convert it to gimple.  */
> +static void
> +omp_gimple_assign_with_ops (tree_code op, tree dest, tree src, gimple_seq *seq)

Makes me wonder why don't you put the reduction code earlier into reduction
clause GENERIC and then lower into clauses' GIMPLE seq.
If there is some reason, please name it oacc at least.

> +static void
> +initialize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
> +			   omp_context *ctx)

Likewise.

> +/* Helper function to process the array of partial reductions.  Nthreads
> +   indicates the number of threads.  Unfortunately, GOACC_GET_NUM_THREADS
> +   cannot be used here, because nthreads on the host may be different than
> +   on the accelerator. */
> +
> +static void
> +finalize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
> +			 omp_context *ctx)

Likewise.

> +/* Scan through all of the gimple stmts searching for an OMP_FOR_EXPR, and
> +   scan that for reductions.  */
> +
> +static void
> +process_reduction_data (gimple_seq *body, gimple_seq *in_stmt_seqp,
> +			gimple_seq *out_stmt_seqp, omp_context *ctx)

Likewise.

> --- gcc/tree-nested.c
> +++ gcc/tree-nested.c
> @@ -627,6 +627,8 @@ walk_gimple_omp_for (gimple for_stmt,
>      		     walk_stmt_fn callback_stmt, walk_tree_fn callback_op,
>      		     struct nesting_info *info)
>  {
> +  gcc_assert (!is_gimple_omp_oacc_specifically (for_stmt));
> +

That surely can be reached and you can easily construct testcase, can't you?

> @@ -1323,6 +1325,10 @@ convert_nonlocal_reference_stmt (gimple_stmt_iterator *gsi, bool *handled_ops_p,
>  	}
>        break;
>  
> +    case GIMPLE_OACC_KERNELS:
> +    case GIMPLE_OACC_PARALLEL:
> +      gcc_unreachable ();
> +

Ditto etc.

	Jakub