This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[gomp4] Worker-single predication


This extends the previous vector-single support to also handle worker-level predication. We can't use the shfl insn because workers will live across multiple warps, so we use a location in memory to broadcast the branch target. This also fixes the oversight where basic blocks inside a parallel region but outside all loops weren't being predicated.

A special case is added for worker-single vector-partitioned; we add a jump over the entire loop that is taken by the inactive workers and add no predication inside this loop.

Committed on gomp-4_0-branch.


Bernd
Index: gcc/ChangeLog.gomp
===================================================================
--- gcc/ChangeLog.gomp	(revision 223974)
+++ gcc/ChangeLog.gomp	(working copy)
@@ -1,3 +1,29 @@
+2015-06-01  Bernd Schmidt  <bernds@codesourcery.com>
+
+	* gimple.h (struct gimple_statement_omp_parallel_layout): Add a
+	broadcast_array field.
+	(gimple_omp_target_broadcast_array,
+	gimple_omp_target_set_broadcast_array): New inline functions.
+	* omp-low.c (struct omp_region): Add a broadcast_array field.
+	(find_omp_target_region_data): Initialize it.  Change STMT arg to
+	be a gomp_target *.  All callers changed.
+	(struct omp_context): Add worker_sync_elt field.
+	(alloc_var_ganglocal): Properly handle a NULL underlying_var.
+	(oacc_alloc_broadcast_storage): Likewise.
+	(required_predication_mask): New function.
+	(requires_vector_predicate): Remove.  All callers changed to use
+	required_predication_mask.
+	(generate_vector_broadcast): Return a statement suitable as a
+	block splitting point.
+	(generate_oacc_broadcast, make_predication_test): New static
+	functions.
+	(predicate_bb): New arg MASK.  All callers changed.  Use
+	generate_oacc_braodcast and use the split point it returns.
+	Handle WSVP regions by jumping across entire loops.  Use
+	make_predication_test.  Correctly handle GIMPLE_OMP_RETURN.
+	(lower_omp_target): Call oacc_alloc_broadcast_storage.  Call
+	gimple_omp_target_set_broadcast_array on the stmt.
+
 2015-06-01  Tom de Vries  <tom@codesourcery.com>
 
 	* omp-low.c (expand_omp_target): Fix GIMPLE_OMP_ENTRY_END handling for
Index: gcc/gimple.h
===================================================================
--- gcc/gimple.h	(revision 223974)
+++ gcc/gimple.h	(working copy)
@@ -580,6 +580,10 @@ struct GTY((tag("GSS_OMP_PARALLEL_LAYOUT
   /* [ WORD 11 ]
      Size of the gang-local memory to allocate.  */
   tree ganglocal_size;
+
+  /* [ WORD 12 ]
+     A pointer to the array to be used for broadcasting across threads.  */
+  tree broadcast_array;
 };
 
 /* GIMPLE_OMP_PARALLEL or GIMPLE_TASK */
@@ -5243,6 +5247,25 @@ gimple_omp_target_set_ganglocal_size (go
 }
 
 
+/* Return the pointer to the broadcast array associated with OMP_TARGET GS.  */
+
+static inline tree
+gimple_omp_target_broadcast_array (const gomp_target *omp_target_stmt)
+{
+  return omp_target_stmt->broadcast_array;
+}
+
+
+/* Set PTR to be the broadcast array associated with OMP_TARGET
+   GS.  */
+
+static inline void
+gimple_omp_target_set_broadcast_array (gomp_target *omp_target_stmt, tree ptr)
+{
+  omp_target_stmt->broadcast_array = ptr;
+}
+
+
 /* Return the clauses associated with OMP_TEAMS GS.  */
 
 static inline tree
Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c	(revision 223974)
+++ gcc/omp-low.c	(working copy)
@@ -162,6 +162,8 @@ struct omp_region
 
   /* For an OpenACC loop, the level of parallelism requested.  */
   int gwv_this;
+
+  tree broadcast_array;
 };
 
 /* Levels of parallelism as defined by OpenACC.  Increasing numbers
@@ -248,6 +250,11 @@ typedef struct omp_context
      of workers.  */
   tree worker_var;
   tree worker_count;
+
+  /* For offloaded regions, at runtime this variable holds a pointer
+     to the location that should be used for thread
+     synchronization.  */
+  tree worker_sync_elt;
 } omp_context;
 
 /* A structure holding the elements of:
@@ -1490,7 +1497,7 @@ alloc_var_ganglocal (tree underlying_var
   /* If this is a pointer mapping, then we need to create too mappings, one
      for the pointer, and another to the data.  Add the offset for the pointer
      to size.  */
-  bool pointer = is_reference (underlying_var);
+  bool pointer = underlying_var ? is_reference (underlying_var) : false;
 
   if (pointer)
     size = fold_build2 (PLUS_EXPR, TREE_TYPE (size), size,
@@ -9394,6 +9401,25 @@ expand_omp_atomic (struct omp_region *re
   expand_omp_atomic_mutex (load_bb, store_bb, addr, loaded_val, stored_val);
 }
 
+/* Allocate storage for OpenACC worker threads in CTX to broadcast
+   condition results.  CLAUSES are the clauses of the parallel construct.  */
+
+static void
+oacc_alloc_broadcast_storage (omp_context *ctx, tree clauses)
+{
+  tree vull_type_node = build_qualified_type (long_long_unsigned_type_node,
+					       TYPE_QUAL_VOLATILE);
+  tree uptr_node = build_pointer_type (vull_type_node);
+
+  tree clause = find_omp_clause (clauses, OMP_CLAUSE_NUM_WORKERS);
+  tree host_count = integer_one_node;
+  if (clause)
+    host_count = OMP_CLAUSE_NUM_WORKERS_EXPR (clause);
+
+  ctx->worker_sync_elt
+    = alloc_var_ganglocal (NULL_TREE, long_long_unsigned_type_node,
+			   ctx, TYPE_SIZE_UNIT (long_long_unsigned_type_node));
+}
 
 /* Expand the GIMPLE_OMP_TARGET starting at REGION.  */
 
@@ -10080,7 +10106,8 @@ find_omp_for_region_data (struct omp_reg
    OMP_TARGET STMT.  */
 
 static void
-find_omp_target_region_data (struct omp_region *region, gimple stmt)
+find_omp_target_region_data (struct omp_region *region,
+			     gomp_target *stmt)
 {
   if (!is_gimple_omp_oacc (stmt))
     return;
@@ -10092,6 +10119,7 @@ find_omp_target_region_data (struct omp_
     region->gwv_this |= MASK_WORKER;
   if (find_omp_clause (clauses, OMP_CLAUSE_VECTOR_LENGTH))
     region->gwv_this |= MASK_VECTOR;
+  region->broadcast_array = gimple_omp_target_broadcast_array (stmt);
 }
 
 /* Helper for build_omp_regions.  Scan the dominator tree starting at
@@ -10163,7 +10191,8 @@ build_omp_regions_1 (basic_block bb, str
 		case GF_OMP_TARGET_KIND_OACC_PARALLEL:
 		case GF_OMP_TARGET_KIND_OACC_KERNELS:
 		case GF_OMP_TARGET_KIND_OACC_DATA:
-		  find_omp_target_region_data (region, stmt);
+		  find_omp_target_region_data (region,
+					       as_a <gomp_target *> (stmt));
 		  break;
 		case GF_OMP_TARGET_KIND_UPDATE:
 		case GF_OMP_TARGET_KIND_OACC_UPDATE:
@@ -10244,34 +10273,54 @@ enclosing_target_region (omp_region *reg
   return region;
 }
 
-/* Return true if basic blocks in REGION require OpenACC vector
-   predication.  */
-static bool
-requires_vector_predicate (struct omp_region *region)
+/* Return a mask of GWV_ values indicating the kind of OpenACC
+   predication required for basic blocks in REGION.  */
+
+static int
+required_predication_mask (omp_region *region)
 {
   while (region
 	 && region->type != GIMPLE_OMP_FOR && region->type != GIMPLE_OMP_TARGET)
     region = region->outer;
   if (!region)
-    return false;
-  omp_region *outer_target = enclosing_target_region (region);
-  if (!outer_target || (outer_target->gwv_this & MASK_VECTOR) == 0)
-    return false;
-  if (region->type == GIMPLE_OMP_FOR && (region->gwv_this & MASK_VECTOR) == 0)
-    return true;
-  return false;
+    return 0;
+
+  int outer_masks = region->gwv_this;
+  omp_region *outer_target = region;
+  while (outer_target != NULL && outer_target->type != GIMPLE_OMP_TARGET)
+    {
+      if (outer_target->type == GIMPLE_OMP_FOR)
+	outer_masks |= outer_target->gwv_this;
+      outer_target = outer_target->outer;
+    }
+  if (!outer_target)
+    return 0;
+
+  int mask = 0;
+  if ((outer_target->gwv_this & MASK_WORKER) != 0
+      && (region->type == GIMPLE_OMP_TARGET
+	  || (outer_masks & MASK_WORKER) == 0))
+    mask |= MASK_WORKER;
+  if ((outer_target->gwv_this & MASK_VECTOR) != 0
+      && (region->type == GIMPLE_OMP_TARGET
+	  || (outer_masks & MASK_VECTOR) == 0))
+    mask |= MASK_VECTOR;
+  return mask;
 }
 
 /* Generate a broadcast across OpenACC vector threads (a warp on GPUs)
-   so that VAR is broadcast to DEST_VAR.  The new statements are
-   added after WHERE.  */
-static void
+   so that VAR is broadcast to DEST_VAR.  The new statements are added
+   after WHERE.  Return the stmt after which the block should be split.  */
+
+static gimple
 generate_vector_broadcast (tree dest_var, tree var,
 			   gimple_stmt_iterator &where)
 {
+  gimple retval = gsi_stmt (where);
   tree vartype = TREE_TYPE (var);
   tree call_arg_type = unsigned_type_node;
   enum built_in_function fn = BUILT_IN_GOACC_THREAD_BROADCAST;
+
   if (TYPE_PRECISION (vartype) > TYPE_PRECISION (call_arg_type))
     {
       fn = BUILT_IN_GOACC_THREAD_BROADCAST_LL;
@@ -10299,16 +10348,106 @@ generate_vector_broadcast (tree dest_var
       gsi_insert_after (&where, conv2, GSI_CONTINUE_LINKING);
     }
   gimple_call_set_lhs (call, casted_dest);
+  return retval;
+}
+
+/* Generate a broadcast across OpenACC threads in REGION so that VAR
+   is broadcast to DEST_VAR.  MASK specifies the parallelism level and
+   thereby the broadcast method.  If it is equal to MASK_VECTOR, we
+   can use a warp broadcast, otherwise we fall back to memory
+   store/load.  */
+
+static gimple
+generate_oacc_broadcast (omp_region *region, tree dest_var, tree var,
+			 gimple_stmt_iterator &where, int mask)
+{
+  if (mask == MASK_VECTOR)
+    return generate_vector_broadcast (dest_var, var, where);
+
+  omp_region *parent = enclosing_target_region (region);
+
+  tree elttype = build_qualified_type (TREE_TYPE (var), TYPE_QUAL_VOLATILE);
+  tree ptr = create_tmp_var (build_pointer_type (elttype));
+  gassign *cast1 = gimple_build_assign (ptr, NOP_EXPR,
+				       parent->broadcast_array);
+  gsi_insert_after (&where, cast1, GSI_NEW_STMT);
+  gassign *st = gimple_build_assign (build_simple_mem_ref (ptr), var);
+  gsi_insert_after (&where, st, GSI_NEW_STMT);
 
+  tree fndecl = builtin_decl_explicit (BUILT_IN_GOACC_THREADBARRIER);
+  gcall *sync_bar = gimple_build_call (fndecl, 0);
+  gsi_insert_after (&where, sync_bar, GSI_NEW_STMT);
+
+  gassign *cast2 = gimple_build_assign (ptr, NOP_EXPR,
+					parent->broadcast_array);
+  gsi_insert_after (&where, cast2, GSI_NEW_STMT);
+  gassign *ld = gimple_build_assign (dest_var, build_simple_mem_ref (ptr));
+  gsi_insert_after (&where, ld, GSI_NEW_STMT);
+
+  return st;
 }
 
-/* Apply OpenACC vector predication to basic block BB which is in
-   region PARENT.  */
+/* Build a test for OpenACC predication.  TRUE_EDGE is the edge that should be
+   taken if the block should be executed.  SKIP_DEST_BB is the destination to
+   jump to otherwise.  MASK specifies the type of predication, it can contain
+   the bits MASK_VECTOR and/or MASK_WORKER.  */
 
 static void
-predicate_bb (basic_block bb, struct omp_region *parent)
+make_predication_test (edge true_edge, basic_block skip_dest_bb, int mask)
 {
-  if (!requires_vector_predicate (parent))
+  basic_block cond_bb = true_edge->src;
+  
+  gimple_stmt_iterator tmp_gsi = gsi_last_bb (cond_bb);
+  tree decl = builtin_decl_explicit (BUILT_IN_GOACC_TID);
+
+  tree vvar = NULL_TREE, wvar = NULL_TREE;
+  tree comp_var = NULL_TREE;
+  if (mask & MASK_VECTOR)
+    {
+      gimple call = gimple_build_call (decl, 1, integer_zero_node);
+      vvar = create_tmp_var (unsigned_type_node);
+      comp_var = vvar;
+      gimple_call_set_lhs (call, vvar);
+      gsi_insert_after (&tmp_gsi, call, GSI_NEW_STMT);
+    }
+  if (mask & MASK_WORKER)
+    {
+      gimple call = gimple_build_call (decl, 1, integer_one_node);
+      wvar = create_tmp_var (unsigned_type_node);
+      comp_var = wvar;
+      gimple_call_set_lhs (call, wvar);
+      gsi_insert_after (&tmp_gsi, call, GSI_NEW_STMT);
+    }
+  if (wvar && vvar)
+    {
+      comp_var = create_tmp_var (unsigned_type_node);
+      gassign *ior = gimple_build_assign (comp_var, BIT_IOR_EXPR, wvar, vvar);
+      gsi_insert_after (&tmp_gsi, ior, GSI_NEW_STMT);
+    }
+  tree cond = build2 (EQ_EXPR, boolean_type_node, comp_var,
+		      fold_convert (unsigned_type_node, integer_zero_node));
+  gimple cond_stmt = gimple_build_cond_empty (cond);
+  gsi_insert_after (&tmp_gsi, cond_stmt, GSI_NEW_STMT);
+
+  true_edge->flags = EDGE_TRUE_VALUE;
+  make_edge (cond_bb, skip_dest_bb, EDGE_FALSE_VALUE);
+}
+
+/* Apply OpenACC predication to basic block BB which is in
+   region PARENT.  MASK has a bitmask of levels that need to be
+   applied; MASK_VECTOR and/or MASK_WORKER may be set.  */
+
+static void
+predicate_bb (basic_block bb, struct omp_region *parent, int mask)
+{
+  /* We handle worker-single vector-partitioned loops by jumping
+     around them if not in the controlling worker.  Don't insert
+     unnecessary (and incorrect) predication.  */
+  if (parent->type == GIMPLE_OMP_FOR
+      && (parent->gwv_this & MASK_VECTOR))
+    mask &= ~MASK_WORKER;
+
+  if (mask == 0)
     return;
 
   gimple_stmt_iterator gsi;
@@ -10336,9 +10475,11 @@ predicate_bb (basic_block bb, struct omp
       gsi_insert_before (&gsi, asgn, GSI_CONTINUE_LINKING);
       gimple_stmt_iterator gsi_asgn = gsi_for_stmt (asgn);
 
-      generate_vector_broadcast (broadcast_cond, cond_var, gsi_asgn);
+      gimple splitpoint = generate_oacc_broadcast (parent, broadcast_cond,
+						   cond_var, gsi_asgn,
+						   mask);
 
-      edge e = split_block (bb, asgn);
+      edge e = split_block (bb, splitpoint);
       skip_dest_bb = e->dest;
 
       gimple_cond_set_condition (as_a <gcond *> (stmt), EQ_EXPR,
@@ -10354,9 +10495,10 @@ predicate_bb (basic_block bb, struct omp
       gsi_insert_before (&gsi, asgn, GSI_CONTINUE_LINKING);
       gimple_stmt_iterator gsi_asgn = gsi_for_stmt (asgn);
 
-      generate_vector_broadcast (new_var, var, gsi_asgn);
+      gimple splitpoint = generate_oacc_broadcast (parent, new_var, var,
+						   gsi_asgn, mask);
 
-      edge e = split_block (bb, asgn);
+      edge e = split_block (bb, splitpoint);
       skip_dest_bb = e->dest;
 
       gimple_switch_set_index (sstmt, new_var);
@@ -10364,21 +10506,58 @@ predicate_bb (basic_block bb, struct omp
   else if (is_gimple_omp (stmt))
     {
       gsi_prev (&gsi);
+      gimple split_stmt = gsi_stmt (gsi);
+
+      /* First, see if we must predicate away an entire loop.  */
+      if (gimple_code (stmt) == GIMPLE_OMP_FOR)
+	{
+	  omp_region *inner;
+	  inner = *bb_region_map->get (FALLTHRU_EDGE (bb)->dest);
+	  skip_dest_bb = single_succ (inner->exit);
+	  gcc_assert (inner->entry == bb);
+	  if ((inner->gwv_this & (MASK_VECTOR | MASK_WORKER)) == MASK_VECTOR
+	      && (mask & MASK_WORKER) != 0)
+	    {
+	      gimple_stmt_iterator head_gsi = gsi_start_bb (bb);
+	      gsi_prev (&head_gsi);
+	      edge e0 = split_block (bb, gsi_stmt (head_gsi));
+
+	      if (!split_stmt)
+		{
+		  /* The simple case: nothing here except the for,
+		     so we just need to make one branch around the
+		     entire loop.  */
+		  inner->entry = e0->dest;
+		  make_predication_test (e0, skip_dest_bb,
+					 mask & ~MASK_VECTOR);
+		  return;
+		}
+	      basic_block for_block = e0->dest;
+	      /* The general case, make two conditions - a full one around the
+		 code preceding the for, and one branch around the loop.  */
+	      edge e1 = split_block (for_block, split_stmt);
+	      basic_block bb3 = e1->dest;
+	      edge e2 = split_block (for_block, split_stmt);
+	      basic_block bb2 = e2->dest;
+
+	      make_predication_test (e0, bb2, mask);
+	      make_predication_test (single_pred_edge (bb3), skip_dest_bb,
+				     mask & ~MASK_VECTOR);
+	      inner->entry = bb3;
+	      return;
+	    }
+	}
+
       /* Only a few statements need special treatment.  */
       if (gimple_code (stmt) != GIMPLE_OMP_FOR
-	  && gimple_code (stmt) != GIMPLE_OMP_CONTINUE)
+	       && gimple_code (stmt) != GIMPLE_OMP_CONTINUE
+	       && gimple_code (stmt) != GIMPLE_OMP_RETURN)
 	{
 	  edge e = single_succ_edge (bb);
 	  skip_dest_bb = e->dest;
-	  if (gimple_code (stmt) == GIMPLE_OMP_RETURN)
-	    {
-	      gcc_assert (parent->exit == bb);
-	      adjust_bb_ptr = &parent->exit;
-	    }
 	}
       else
 	{
-	  gimple split_stmt = gsi_stmt (gsi);
 	  if (!split_stmt)
 	    return;
 	  edge e = split_block (bb, split_stmt);
@@ -10388,6 +10567,11 @@ predicate_bb (basic_block bb, struct omp
 	      gcc_assert (parent->cont == bb);
 	      parent->cont = skip_dest_bb;
 	    }
+	  else if (gimple_code (stmt) == GIMPLE_OMP_RETURN)
+	    {
+	      gcc_assert (parent->exit == bb);
+	      parent->exit = skip_dest_bb;
+	    }
 	  else if (gimple_code (stmt) == GIMPLE_OMP_FOR)
 	    {
 	      omp_region *inner;
@@ -10412,37 +10596,18 @@ predicate_bb (basic_block bb, struct omp
       gimple_stmt_iterator head_gsi = gsi_start_bb (bb);
       gsi_prev (&head_gsi);
       edge e2 = split_block (bb, gsi_stmt (head_gsi));
-      basic_block cond_bb = e2->src;
-
-      if (adjust_bb_ptr)
-	*adjust_bb_ptr = e2->dest;
-
-      gimple_stmt_iterator tmp_gsi = gsi_last_bb (cond_bb);
-
-      tree decl = builtin_decl_explicit (BUILT_IN_GOACC_TID);
-      gimple call = gimple_build_call (decl, 1, integer_zero_node);
-      tree tmp_var = create_tmp_var (unsigned_type_node);
-      gimple_call_set_lhs (call, tmp_var);
-
-      gsi_insert_after (&tmp_gsi, call, GSI_NEW_STMT);
-
-      tree cond = build2 (EQ_EXPR, boolean_type_node, tmp_var,
-			  fold_convert (unsigned_type_node, integer_zero_node));
-      gimple cond_stmt = gimple_build_cond_empty (cond);
-      gsi_insert_after (&tmp_gsi, cond_stmt, GSI_CONTINUE_LINKING);
-
-      e2->flags = EDGE_TRUE_VALUE;
-      make_edge (cond_bb, skip_dest_bb, EDGE_FALSE_VALUE);
+      make_predication_test (e2, skip_dest_bb, mask);
     }
 }
 
 /* Walk the dominator tree starting at BB to collect basic blocks in
    WORKLIST which need OpenACC vector predication applied to them.  */
+
 static void
 find_predicatable_bbs (basic_block bb, vec<basic_block> &worklist)
 {
   struct omp_region *parent = *bb_region_map->get (bb);
-  if (requires_vector_predicate (parent))
+  if (required_predication_mask (parent) != 0)
     worklist.safe_push (bb);
   basic_block son;
   for (son = first_dom_son (CDI_DOMINATORS, bb);
@@ -10453,6 +10618,7 @@ find_predicatable_bbs (basic_block bb, v
 
 /* Apply OpenACC vector predication to all basic blocks.  HEAD_BB is the
    first.  */
+
 static void
 predicate_omp_regions (basic_block head_bb)
 {
@@ -10461,7 +10627,11 @@ predicate_omp_regions (basic_block head_
   int i;
   basic_block bb;
   FOR_EACH_VEC_ELT (worklist, i, bb)
-    predicate_bb (bb, *bb_region_map->get (bb));
+    {
+      omp_region *region = *bb_region_map->get (bb);
+      int mask = required_predication_mask (region);
+      predicate_bb (bb, region, mask);
+    }
 }
 
 /* Main entry point for expanding OMP-GIMPLE into runtime calls.  */
@@ -12503,7 +12673,10 @@ lower_omp_target (gimple_stmt_iterator *
   orlist = NULL;
 
   if (is_gimple_omp_oacc (stmt))
-    oacc_init_count_vars (ctx, clauses);
+    {
+      oacc_init_count_vars (ctx, clauses);
+      oacc_alloc_broadcast_storage (ctx, clauses);
+    }
 
   if (has_reduction)
     {
@@ -12790,7 +12963,7 @@ lower_omp_target (gimple_stmt_iterator *
   gsi_insert_seq_before (gsi_p, sz_ilist, GSI_SAME_STMT);
 
   gimple_omp_target_set_ganglocal_size (stmt, sz);
-
+  gimple_omp_target_set_broadcast_array (stmt, ctx->worker_sync_elt);
   pop_gimplify_context (NULL);
 }
 

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]