This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[gomp-nvptx 9/9] adjust SIMD loop lowering for SIMT targets


This is incomplete.

This handles OpenMP SIMD for NVPTX in simple cases, partly by punting on
anything unusual such as simduid loops, partly by getting lucky, as testcases
do not expose the missing bits.

What it currently does is transform a SIMD loop

  for (V = N1; V cmp N2; V + STEP) BODY;

into

  for (V = N1 + (STEP * LANE); V cmp N2; V + (STEP * VF)) BODY;

and then folding LANE/VF to 0/1 on non-NVPTX post-ipa.

To make it proper, I'll need to handle SIMDUID loops (still thinking how to
best approach that), and SAFELEN (but that simply needs a conditional jump around
the loop, "if (LANE >= SAFELEN)").  Handling collapsed loops eventually would
be nice too.

Also, it needs something like __nvptx_{enter/exit}_simd() calls around the
loop, to switch from uniform to non-uniform SIMT execution (set bitmask in
__nvptx_uni from 0 to -1, and back on exit), and to switch from per-warp
soft-stacks to per-hwthread hard-stacks (by reserving a small area in .local
memory, and setting __nvptx_stacks[] pointer to top of that area).

Also, since SIMD regions should run on per-hwthread stacks, I'm thinking I'll
have to outline the loop into its own function.  Can I do that post-ipa
easily?
---
 gcc/internal-fn.c   |  22 +++++++++
 gcc/internal-fn.def |   2 +
 gcc/omp-low.c       | 138 +++++++++++++++++++++++++++++++++++++++++++++++++---
 gcc/passes.def      |   1 +
 gcc/tree-pass.h     |   2 +
 5 files changed, 158 insertions(+), 7 deletions(-)

diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index a3c4a90..3189e96 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -142,6 +142,28 @@ expand_ANNOTATE (gcall *)
   gcc_unreachable ();
 }
 
+/* Lane index on SIMT targets: thread index in the warp on NVPTX.  On targets
+   without SIMT execution this should be expanded in omp_device_lower pass.  */
+
+static void
+expand_GOMP_SIMT_LANE (gcall *stmt)
+{
+  tree lhs = gimple_call_lhs (stmt);
+
+  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+  /* FIXME: use a separate pattern for OpenMP?  */
+  gcc_assert (targetm.have_oacc_dim_pos ());
+  emit_insn (targetm.gen_oacc_dim_pos (target, const2_rtx));
+}
+
+/* This should get expanded in omp_device_lower pass.  */
+
+static void
+expand_GOMP_SIMT_VF (gcall *)
+{
+  gcc_unreachable ();
+}
+
 /* This should get expanded in adjust_simduid_builtins.  */
 
 static void
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 1cb14a8..66c7422 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -41,6 +41,8 @@ along with GCC; see the file COPYING3.  If not see
 
 DEF_INTERNAL_FN (LOAD_LANES, ECF_CONST | ECF_LEAF, NULL)
 DEF_INTERNAL_FN (STORE_LANES, ECF_CONST | ECF_LEAF, NULL)
+DEF_INTERNAL_FN (GOMP_SIMT_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
+DEF_INTERNAL_FN (GOMP_SIMT_VF, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
 DEF_INTERNAL_FN (GOMP_SIMD_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
 DEF_INTERNAL_FN (GOMP_SIMD_VF, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
 DEF_INTERNAL_FN (GOMP_SIMD_LAST_LANE, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index cc0435e..51ac0e5 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -10173,7 +10173,7 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
 				  OMP_CLAUSE_SAFELEN);
   tree simduid = find_omp_clause (gimple_omp_for_clauses (fd->for_stmt),
 				  OMP_CLAUSE__SIMDUID_);
-  tree n1, n2;
+  tree n1, n2, step;
 
   type = TREE_TYPE (fd->loop.v);
   entry_bb = region->entry;
@@ -10218,12 +10218,37 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
 
   n1 = fd->loop.n1;
   n2 = fd->loop.n2;
+  step = fd->loop.step;
+  bool do_simt_transform
+    = (cgraph_node::get (current_function_decl)->offloadable
+       && !broken_loop
+       && !safelen
+       && !simduid
+       && !(fd->collapse > 1));
+  if (do_simt_transform)
+    {
+      tree simt_lane
+	= build_call_expr_internal_loc (UNKNOWN_LOCATION, IFN_GOMP_SIMT_LANE,
+					integer_type_node, 0);
+      simt_lane = fold_convert (TREE_TYPE (step), simt_lane);
+      simt_lane = fold_build2 (MULT_EXPR, TREE_TYPE (step), step, simt_lane);
+      cfun->curr_properties &= ~PROP_gimple_lomp_dev;
+    }
+
   if (gimple_omp_for_combined_into_p (fd->for_stmt))
     {
       tree innerc = find_omp_clause (gimple_omp_for_clauses (fd->for_stmt),
 				     OMP_CLAUSE__LOOPTEMP_);
       gcc_assert (innerc);
       n1 = OMP_CLAUSE_DECL (innerc);
+      if (do_simt_transform)
+	{
+	  n1 = fold_convert (type, n1);
+	  if (POINTER_TYPE_P (type))
+	    n1 = fold_build_pointer_plus (n1, simt_lane);
+	  else
+	    n1 = fold_build2 (PLUS_EXPR, type, n1, fold_convert (type, simt_lane));
+	}
       innerc = find_omp_clause (OMP_CLAUSE_CHAIN (innerc),
 				OMP_CLAUSE__LOOPTEMP_);
       gcc_assert (innerc);
@@ -10239,8 +10264,15 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
     }
   else
     {
-      expand_omp_build_assign (&gsi, fd->loop.v,
-			       fold_convert (type, fd->loop.n1));
+      if (do_simt_transform)
+	{
+	  n1 = fold_convert (type, n1);
+	  if (POINTER_TYPE_P (type))
+	    n1 = fold_build_pointer_plus (n1, simt_lane);
+	  else
+	    n1 = fold_build2 (PLUS_EXPR, type, n1, fold_convert (type, simt_lane));
+	}
+      expand_omp_build_assign (&gsi, fd->loop.v, fold_convert (type, n1));
       if (fd->collapse > 1)
 	for (i = 0; i < fd->collapse; i++)
 	  {
@@ -10262,10 +10294,18 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
       stmt = gsi_stmt (gsi);
       gcc_assert (gimple_code (stmt) == GIMPLE_OMP_CONTINUE);
 
+      if (do_simt_transform)
+	{
+	  tree simt_vf
+	    = build_call_expr_internal_loc (UNKNOWN_LOCATION, IFN_GOMP_SIMT_VF,
+					    integer_type_node, 0);
+	  simt_vf = fold_convert (TREE_TYPE (step), simt_vf);
+	  step = fold_build2 (MULT_EXPR, TREE_TYPE (step), step, simt_vf);
+	}
       if (POINTER_TYPE_P (type))
-	t = fold_build_pointer_plus (fd->loop.v, fd->loop.step);
+	t = fold_build_pointer_plus (fd->loop.v, step);
       else
-	t = fold_build2 (PLUS_EXPR, type, fd->loop.v, fd->loop.step);
+	t = fold_build2 (PLUS_EXPR, type, fd->loop.v, step);
       expand_omp_build_assign (&gsi, fd->loop.v, t);
 
       if (fd->collapse > 1)
@@ -12960,7 +13000,6 @@ expand_omp (struct omp_region *region)
     }
 }
 
-
 /* Helper for build_omp_regions.  Scan the dominator tree starting at
    block BB.  PARENT is the region that contains BB.  If SINGLE_TREE is
    true, the function ends once a single tree is built (otherwise, whole
@@ -16235,7 +16274,7 @@ const pass_data pass_data_lower_omp =
   OPTGROUP_NONE, /* optinfo_flags */
   TV_NONE, /* tv_id */
   PROP_gimple_any, /* properties_required */
-  PROP_gimple_lomp, /* properties_provided */
+  PROP_gimple_lomp | PROP_gimple_lomp_dev, /* properties_provided */
   0, /* properties_destroyed */
   0, /* todo_flags_start */
   0, /* todo_flags_finish */
@@ -19470,5 +19509,90 @@ make_pass_oacc_device_lower (gcc::context *ctxt)
 {
   return new pass_oacc_device_lower (ctxt);
 }
+
+
+/* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
+   VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
+   LANE is kept to be expanded to RTL later on.  */
+
+static unsigned int
+execute_omp_device_lower ()
+{
+  int vf = 1;
+  if (targetm.simt.vf)
+    vf = targetm.simt.vf ();
+  tree vf_tree = build_int_cst (integer_type_node, vf);
+  basic_block bb;
+  gimple_stmt_iterator gsi;
+  FOR_EACH_BB_FN (bb, cfun)
+    for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+      {
+	gimple *stmt = gsi_stmt (gsi);
+	if (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt))
+	  continue;
+	tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
+	switch (gimple_call_internal_fn (stmt))
+	  {
+	  case IFN_GOMP_SIMT_LANE:
+	    rhs = vf == 1 ? integer_zero_node : NULL_TREE;
+	    break;
+	  case IFN_GOMP_SIMT_VF:
+	    rhs = vf_tree;
+	    break;
+	  default:
+	    break;
+	  }
+	if (!rhs)
+	  continue;
+	stmt = gimple_build_assign (lhs, rhs);
+	gsi_replace (&gsi, stmt, false);
+      }
+  if (vf != 1)
+    cfun->has_force_vectorize_loops = false;
+  return 0;
+}
+
+namespace {
+
+const pass_data pass_data_omp_device_lower =
+{
+  GIMPLE_PASS, /* type */
+  "ompdevlow", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  PROP_cfg, /* properties_required */
+  PROP_gimple_lomp_dev, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_update_ssa, /* todo_flags_finish */
+};
+
+class pass_omp_device_lower : public gimple_opt_pass
+{
+public:
+  pass_omp_device_lower (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *fun)
+    {
+      /* FIXME: inlining does not propagate the lomp_dev property.  */
+      return 1 || !(fun->curr_properties & PROP_gimple_lomp_dev);
+    }
+  virtual unsigned int execute (function *)
+    {
+      return execute_omp_device_lower ();
+    }
+
+}; // class pass_expand_omp_ssa
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_omp_device_lower (gcc::context *ctxt)
+{
+  return new pass_omp_device_lower (ctxt);
+}
 
 #include "gt-omp-low.h"
diff --git a/gcc/passes.def b/gcc/passes.def
index c0ab6b9..ec049f8 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -151,6 +151,7 @@ along with GCC; see the file COPYING3.  If not see
   NEXT_PASS (pass_fixup_cfg);
   NEXT_PASS (pass_lower_eh_dispatch);
   NEXT_PASS (pass_oacc_device_lower);
+  NEXT_PASS (pass_omp_device_lower);
   NEXT_PASS (pass_all_optimizations);
   PUSH_INSERT_PASSES_WITHIN (pass_all_optimizations)
       NEXT_PASS (pass_remove_cgraph_callee_edges);
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 49e22a9..71b2561 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -226,6 +226,7 @@ protected:
 						   of math functions; the
 						   current choices have
 						   been optimized.  */
+#define PROP_gimple_lomp_dev	(1 << 16)	/* done omp_device_lower */
 
 #define PROP_trees \
   (PROP_gimple_any | PROP_gimple_lcf | PROP_gimple_leh | PROP_gimple_lomp)
@@ -414,6 +415,7 @@ extern gimple_opt_pass *make_pass_diagnose_omp_blocks (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_expand_omp (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_expand_omp_ssa (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_oacc_device_lower (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_omp_device_lower (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_object_sizes (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_strlen (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_fold_builtins (gcc::context *ctxt);


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]