[PATCH 04/11] [nvptx] Make nvptx state propagation function names more generic

cesar@codesourcery.com cesar@codesourcery.com
Tue Jul 24 20:47:00 GMT 2018


From: Cesar Philippidis <cesar@codesourcery.com>

This patch renames various state propagation functions into somewhat
that reflects their usage in generic worker and vector contexts. E.g.,
whereas before nvptx_wpropagate used to be used exclusively for worker
state propagation, it will eventually be used for any state
propagation using multiple warps. Because variable length vectors will
be able use both shared memory and warp shuffles for broadcasting, the
old vector-specific functions now contain 'warp' in their name.

2018-XX-YY  Cesar Philippidis  <cesar@codesourcery.com>

	gcc/
	* config/nvptx/nvptx.c (nvptx_gen_wcast): Rename as
	nvptx_gen_warp_bcast.
	(nvptx_gen_wcast): Rename to nvptx_gen_shared_bcast, add bool
	vector argument, and update call to nvptx_gen_shared_bcast.
	(propagator_fn): Add bool argument.
	(nvptx_propagate): New bool argument, pass bool argument to fn.
	(vprop_gen): Rename to warp_prop_gen, update call to
	nvptx_gen_warp_bcast.
	(nvptx_vpropagate): Rename to nvptx_warp_propagate, update call to
	nvptx_propagate.
	(wprop_gen): Rename to shared_prop_gen, update call to
	nvptx_gen_shared_bcast.
	(nvptx_wpropagate): Rename to nvptx_shared_propagate, update call
	to nvptx_propagate.
	(nvptx_wsync): Rename to nvptx_cta_sync.
	(nvptx_single): Update calls to nvptx_gen_warp_bcast,
	nvptx_gen_shared_bcast and nvptx_cta_sync.
	(nvptx_process_pars): Likewise.
	(write_worker_buffer): Rename as write_shared_buffer.
	(nvptx_file_end): Update calls to write_shared_buffer.
	(nvptx_expand_worker_addr): Rename as nvptx_expand_shared_addr.
	(nvptx_expand_builtin): Update call to nvptx_expand_shared_addr.
	(nvptx_get_worker_red_addr): Rename as nvptx_get_shared_red_addr.
	(nvptx_goacc_reduction_setup): Update call to
	nvptx_get_shared_red_addr.
	(nvptx_goacc_reduction_fini): Likewise.
	(nvptx_goacc_reduction_teardown): Likewise.

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 1b83b3c..447425f 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -1750,7 +1750,7 @@ nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
    across the vectors of a single warp.  */
 
 static rtx
-nvptx_gen_vcast (rtx reg)
+nvptx_gen_warp_bcast (rtx reg)
 {
   return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
 }
@@ -1781,7 +1781,8 @@ enum propagate_mask
    how many loop iterations will be executed (0 for not a loop).  */
    
 static rtx
-nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, broadcast_data_t *data)
+nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
+			broadcast_data_t *data, bool vector)
 {
   rtx  res;
   machine_mode mode = GET_MODE (reg);
@@ -1795,7 +1796,7 @@ nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, broadcast_data_t *dat
 	start_sequence ();
 	if (pm & PM_read)
 	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
-	emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
+	emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
 	if (pm & PM_write)
 	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
 	res = get_insns ();
@@ -1815,6 +1816,7 @@ nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, broadcast_data_t *dat
 	      oacc_bcast_align = align;
 	    data->offset = (data->offset + align - 1) & ~(align - 1);
 	    addr = data->base;
+	    gcc_assert (data->base != NULL);
 	    if (data->offset)
 	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
 	  }
@@ -3816,11 +3818,11 @@ nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
    regions and (b) only propagating stack entries that are used.  The
    latter might be quite hard to determine.  */
 
-typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
+typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
 
 static bool
 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
-		 propagate_mask rw, propagator_fn fn, void *data)
+		 propagate_mask rw, propagator_fn fn, void *data, bool vector)
 {
   bitmap live = DF_LIVE_IN (block);
   bitmap_iterator iterator;
@@ -3855,7 +3857,7 @@ nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
 	  
 	  emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
 	  /* Allow worker function to initialize anything needed.  */
-	  rtx init = fn (tmp, PM_loop_begin, fs, data);
+	  rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
 	  if (init)
 	    emit_insn (init);
 	  emit_label (label);
@@ -3864,7 +3866,7 @@ nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
 	}
       if (rw & PM_read)
 	emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
-      emit_insn (fn (tmp, rw, fs, data));
+      emit_insn (fn (tmp, rw, fs, data, vector));
       if (rw & PM_write)
 	emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
       if (fs)
@@ -3872,7 +3874,7 @@ nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
 	  emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
 	  emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
 	  emit_insn (gen_br_true_uni (pred, label));
-	  rtx fini = fn (tmp, PM_loop_end, fs, data);
+	  rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
 	  if (fini)
 	    emit_insn (fini);
 	  emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
@@ -3892,7 +3894,7 @@ nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
 
 	if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
 	  {
-	    rtx bcast = fn (reg, rw, 0, data);
+	    rtx bcast = fn (reg, rw, 0, data, vector);
 
 	    insn = emit_insn_after (bcast, insn);
 	    empty = false;
@@ -3901,16 +3903,17 @@ nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
   return empty;
 }
 
-/* Worker for nvptx_vpropagate.  */
+/* Worker for nvptx_warp_propagate.  */
 
 static rtx
-vprop_gen (rtx reg, propagate_mask pm,
-	   unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
+warp_prop_gen (rtx reg, propagate_mask pm,
+	       unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
+	       bool ARG_UNUSED (vector))
 {
   if (!(pm & PM_read_write))
     return 0;
   
-  return nvptx_gen_vcast (reg);
+  return nvptx_gen_warp_bcast (reg);
 }
 
 /* Propagate state that is live at start of BLOCK across the vectors
@@ -3918,15 +3921,17 @@ vprop_gen (rtx reg, propagate_mask pm,
    IS_CALL and return as for nvptx_propagate.  */
 
 static bool
-nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn)
+nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
 {
-  return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0);
+  return nvptx_propagate (is_call, block, insn, PM_read_write,
+			  warp_prop_gen, 0, false);
 }
 
-/* Worker for nvptx_wpropagate.  */
+/* Worker for nvptx_shared_propagate.  */
 
 static rtx
-wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
+shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
+		 bool vector)
 {
   broadcast_data_t *data = (broadcast_data_t *)data_;
 
@@ -3950,7 +3955,7 @@ wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
       return clobber;
     }
   else
-    return nvptx_gen_wcast (reg, pm, rep, data);
+    return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
 }
 
 /* Spill or fill live state that is live at start of BLOCK.  PRE_P
@@ -3959,7 +3964,8 @@ wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
    INSN.  IS_CALL and return as for nvptx_propagate.  */
 
 static bool
-nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
+nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
+			rtx_insn *insn, bool vector)
 {
   broadcast_data_t data;
 
@@ -3968,7 +3974,8 @@ nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
   data.ptr = NULL_RTX;
 
   bool empty = nvptx_propagate (is_call, block, insn,
-				pre_p ? PM_read : PM_write, wprop_gen, &data);
+				pre_p ? PM_read : PM_write, shared_prop_gen,
+				&data, vector);
   gcc_assert (empty == !data.offset);
   if (data.offset)
     {
@@ -3982,11 +3989,11 @@ nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
   return empty;
 }
 
-/* Emit a worker-level synchronization barrier.  We use different
+/* Emit a CTA-level synchronization barrier.  We use different
    markers for before and after synchronizations.  */
 
 static rtx
-nvptx_wsync (bool after)
+nvptx_cta_sync (bool after)
 {
   return gen_nvptx_barsync (GEN_INT (after));
 }
@@ -4341,7 +4348,7 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
 	  emit_insn_before (gen_rtx_SET (tmp, pvar), label);
 	  emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
 #endif
-	  emit_insn_before (nvptx_gen_vcast (pvar), tail);
+	  emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
 	}
       else
 	{
@@ -4356,16 +4363,18 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
 	    oacc_bcast_size = GET_MODE_SIZE (SImode);
 
 	  data.offset = 0;
-	  emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
+	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
+						    false),
 			    before);
 	  /* Barrier so other workers can see the write.  */
-	  emit_insn_before (nvptx_wsync (false), tail);
+	  emit_insn_before (nvptx_cta_sync (false), tail);
 	  data.offset = 0;
-	  emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
+	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
+						    false), tail);
 	  /* This barrier is needed to avoid worker zero clobbering
 	     the broadcast buffer before all the other workers have
 	     had a chance to read this instance of it.  */
-	  emit_insn_before (nvptx_wsync (true), tail);
+	  emit_insn_before (nvptx_cta_sync (true), tail);
 	}
 
       extract_insn (tail);
@@ -4482,19 +4491,21 @@ nvptx_process_pars (parallel *par)
 
   if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
     {
-      nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn);
-      bool empty = nvptx_wpropagate (true, is_call,
-				     par->forked_block, par->fork_insn);
+      nvptx_shared_propagate (false, is_call, par->forked_block,
+			      par->forked_insn, false);
+      bool empty = nvptx_shared_propagate (true, is_call,
+					   par->forked_block, par->fork_insn,
+					   false);
 
       if (!empty || !is_call)
 	{
 	  /* Insert begin and end synchronizations.  */
-	  emit_insn_before (nvptx_wsync (false), par->forked_insn);
-	  emit_insn_before (nvptx_wsync (true), par->join_insn);
+	  emit_insn_before (nvptx_cta_sync (false), par->forked_insn);
+	  emit_insn_before (nvptx_cta_sync (true), par->join_insn);
 	}
     }
   else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
-    nvptx_vpropagate (is_call, par->forked_block, par->forked_insn);
+    nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
 
   /* Now do siblings.  */
   if (par->next)
@@ -4997,10 +5008,11 @@ nvptx_file_start (void)
   fputs ("// END PREAMBLE\n", asm_out_file);
 }
 
-/* Emit a declaration for a worker-level buffer in .shared memory.  */
+/* Emit a declaration for a worker and vector-level buffer in .shared
+   memory.  */
 
 static void
-write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
+write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
 {
   const char *name = XSTR (sym, 0);
 
@@ -5022,11 +5034,11 @@ nvptx_file_end (void)
   fputs (func_decls.str().c_str(), asm_out_file);
 
   if (oacc_bcast_size)
-    write_worker_buffer (asm_out_file, oacc_bcast_sym,
+    write_shared_buffer (asm_out_file, oacc_bcast_sym,
 			 oacc_bcast_align, oacc_bcast_size);
 
   if (worker_red_size)
-    write_worker_buffer (asm_out_file, worker_red_sym,
+    write_shared_buffer (asm_out_file, worker_red_sym,
 			 worker_red_align, worker_red_size);
 
   if (need_softstack_decl)
@@ -5077,7 +5089,7 @@ nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
 /* Worker reduction address expander.  */
 
 static rtx
-nvptx_expand_worker_addr (tree exp, rtx target,
+nvptx_expand_shared_addr (tree exp, rtx target,
 			  machine_mode ARG_UNUSED (mode), int ignore)
 {
   if (ignore)
@@ -5213,7 +5225,7 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
       return nvptx_expand_shuffle (exp, target, mode, ignore);
 
     case NVPTX_BUILTIN_WORKER_ADDR:
-      return nvptx_expand_worker_addr (exp, target, mode, ignore);
+      return nvptx_expand_shared_addr (exp, target, mode, ignore);
 
     case NVPTX_BUILTIN_CMP_SWAP:
     case NVPTX_BUILTIN_CMP_SWAPLL:
@@ -5323,7 +5335,7 @@ nvptx_goacc_fork_join (gcall *call, const int dims[],
    data at that location.  */
 
 static tree
-nvptx_get_worker_red_addr (tree type, tree offset)
+nvptx_get_shared_red_addr (tree type, tree offset)
 {
   machine_mode mode = TYPE_MODE (type);
   tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
@@ -5665,7 +5677,7 @@ nvptx_goacc_reduction_setup (gcall *call)
     {
       /* Store incoming value to worker reduction buffer.  */
       tree offset = gimple_call_arg (call, 5);
-      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
+      tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset);
       tree ptr = make_ssa_name (TREE_TYPE (call));
 
       gimplify_assign (ptr, call, &seq);
@@ -5807,7 +5819,7 @@ nvptx_goacc_reduction_fini (gcall *call)
 	{
 	  /* Get reduction buffer address.  */
 	  tree offset = gimple_call_arg (call, 5);
-	  tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
+	  tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset);
 	  tree ptr = make_ssa_name (TREE_TYPE (call));
 
 	  gimplify_assign (ptr, call, &seq);
@@ -5851,7 +5863,7 @@ nvptx_goacc_reduction_teardown (gcall *call)
     {
       /* Read the worker reduction buffer.  */
       tree offset = gimple_call_arg (call, 5);
-      tree call = nvptx_get_worker_red_addr(TREE_TYPE (var), offset);
+      tree call = nvptx_get_shared_red_addr(TREE_TYPE (var), offset);
       tree ptr = make_ssa_name (TREE_TYPE (call));
 
       gimplify_assign (ptr, call, &seq);
-- 
2.7.4



More information about the Gcc-patches mailing list