This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[nvptx, committed] Make nvptx state propagation function names more generic
- From: Tom de Vries <tdevries at suse dot de>
- To: "Schwinge, Thomas" <Thomas_Schwinge at mentor dot com>
- Cc: "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
- Date: Wed, 19 Dec 2018 11:37:41 +0100
- Subject: [nvptx, committed] Make nvptx state propagation function names more generic
- References: <d101a77b-fc5f-0396-b8d1-c13f34fd1c38@codesourcery.com> <2ece5d7b-3675-84ab-f255-3c56a2ffd7dc@suse.de> <91b927af-d854-2865-7cbd-9a9a835ab5cc@codesourcery.com> <1394d89c-896e-f6a3-5f9a-78e98b16e85c@suse.de>
[ was: Re: [nvptx] vector length patch series ]
On 14-12-18 20:58, Tom de Vries wrote:
> 0008-nvptx-make-nvptx-state-propagation-function-names-mo.patch
Committed.
Thanks,
- Tom
[nvptx] Make nvptx state propagation function names more generic
Rename state propagation functions to avoid worker/vector terminology.
Built and reg-tested on x86_64 with nvptx accelerator.
2018-12-17 Tom de Vries <tdevries@suse.de>
* config/nvptx/nvptx.c (nvptx_gen_vcast): Rename as
nvptx_gen_warp_bcast.
(nvptx_gen_wcast): Rename to nvptx_gen_shared_bcast, add bool
vector argument, and update call to nvptx_gen_shared_bcast.
(propagator_fn): Add bool argument.
(nvptx_propagate): New bool argument, pass bool argument to fn.
(vprop_gen): Rename to warp_prop_gen, update call to
nvptx_gen_warp_bcast.
(nvptx_vpropagate): Rename to nvptx_warp_propagate, update call to
nvptx_propagate.
(wprop_gen): Rename to shared_prop_gen, update call to
nvptx_gen_shared_bcast.
(nvptx_wpropagate): Rename to nvptx_shared_propagate, update call
to nvptx_propagate.
(nvptx_wsync): Rename to nvptx_cta_sync.
(nvptx_single): Update calls to nvptx_gen_warp_bcast,
nvptx_gen_shared_bcast and nvptx_cta_sync.
(nvptx_process_pars): Likewise.
(write_worker_buffer): Rename as write_shared_buffer.
(nvptx_file_end): Update calls to write_shared_buffer.
(nvptx_expand_worker_addr): Rename as nvptx_expand_shared_addr.
(nvptx_expand_builtin): Update call to nvptx_expand_shared_addr.
(nvptx_get_worker_red_addr): Rename as nvptx_get_shared_red_addr.
(nvptx_goacc_reduction_setup): Update call to
nvptx_get_shared_red_addr.
(nvptx_goacc_reduction_fini): Likewise.
(nvptx_goacc_reduction_teardown): Likewise.
---
gcc/config/nvptx/nvptx.c | 96 +++++++++++++++++++++++++++---------------------
1 file changed, 54 insertions(+), 42 deletions(-)
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 9625ac86aa1..163f2268e5f 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -1748,7 +1748,7 @@ nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
across the vectors of a single warp. */
static rtx
-nvptx_gen_vcast (rtx reg)
+nvptx_gen_warp_bcast (rtx reg)
{
return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
}
@@ -1779,7 +1779,8 @@ enum propagate_mask
how many loop iterations will be executed (0 for not a loop). */
static rtx
-nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, broadcast_data_t *data)
+nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
+ broadcast_data_t *data, bool vector)
{
rtx res;
machine_mode mode = GET_MODE (reg);
@@ -1793,7 +1794,7 @@ nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, broadcast_data_t *dat
start_sequence ();
if (pm & PM_read)
emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
- emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
+ emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
if (pm & PM_write)
emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
res = get_insns ();
@@ -1813,6 +1814,7 @@ nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, broadcast_data_t *dat
oacc_bcast_align = align;
data->offset = (data->offset + align - 1) & ~(align - 1);
addr = data->base;
+ gcc_assert (data->base != NULL);
if (data->offset)
addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
}
@@ -3803,11 +3805,11 @@ nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
regions and (b) only propagating stack entries that are used. The
latter might be quite hard to determine. */
-typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
+typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
static bool
nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
- propagate_mask rw, propagator_fn fn, void *data)
+ propagate_mask rw, propagator_fn fn, void *data, bool vector)
{
bitmap live = DF_LIVE_IN (block);
bitmap_iterator iterator;
@@ -3842,7 +3844,7 @@ nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
/* Allow worker function to initialize anything needed. */
- rtx init = fn (tmp, PM_loop_begin, fs, data);
+ rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
if (init)
emit_insn (init);
emit_label (label);
@@ -3851,7 +3853,7 @@ nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
}
if (rw & PM_read)
emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
- emit_insn (fn (tmp, rw, fs, data));
+ emit_insn (fn (tmp, rw, fs, data, vector));
if (rw & PM_write)
emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
if (fs)
@@ -3859,7 +3861,7 @@ nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
emit_insn (gen_br_true_uni (pred, label));
- rtx fini = fn (tmp, PM_loop_end, fs, data);
+ rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
if (fini)
emit_insn (fini);
emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
@@ -3879,7 +3881,7 @@ nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
{
- rtx bcast = fn (reg, rw, 0, data);
+ rtx bcast = fn (reg, rw, 0, data, vector);
insn = emit_insn_after (bcast, insn);
empty = false;
@@ -3888,16 +3890,17 @@ nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
return empty;
}
-/* Worker for nvptx_vpropagate. */
+/* Worker for nvptx_warp_propagate. */
static rtx
-vprop_gen (rtx reg, propagate_mask pm,
- unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
+warp_prop_gen (rtx reg, propagate_mask pm,
+ unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
+ bool ARG_UNUSED (vector))
{
if (!(pm & PM_read_write))
return 0;
- return nvptx_gen_vcast (reg);
+ return nvptx_gen_warp_bcast (reg);
}
/* Propagate state that is live at start of BLOCK across the vectors
@@ -3905,15 +3908,17 @@ vprop_gen (rtx reg, propagate_mask pm,
IS_CALL and return as for nvptx_propagate. */
static bool
-nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn)
+nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
{
- return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0);
+ return nvptx_propagate (is_call, block, insn, PM_read_write,
+ warp_prop_gen, 0, false);
}
-/* Worker for nvptx_wpropagate. */
+/* Worker for nvptx_shared_propagate. */
static rtx
-wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
+shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
+ bool vector)
{
broadcast_data_t *data = (broadcast_data_t *)data_;
@@ -3937,7 +3942,7 @@ wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
return clobber;
}
else
- return nvptx_gen_wcast (reg, pm, rep, data);
+ return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
}
/* Spill or fill live state that is live at start of BLOCK. PRE_P
@@ -3946,7 +3951,8 @@ wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
INSN. IS_CALL and return as for nvptx_propagate. */
static bool
-nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
+nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
+ rtx_insn *insn, bool vector)
{
broadcast_data_t data;
@@ -3955,7 +3961,8 @@ nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
data.ptr = NULL_RTX;
bool empty = nvptx_propagate (is_call, block, insn,
- pre_p ? PM_read : PM_write, wprop_gen, &data);
+ pre_p ? PM_read : PM_write, shared_prop_gen,
+ &data, vector);
gcc_assert (empty == !data.offset);
if (data.offset)
{
@@ -3973,7 +3980,7 @@ nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
markers for before and after synchronizations. */
static rtx
-nvptx_wsync (bool after)
+nvptx_cta_sync (bool after)
{
return gen_nvptx_barsync (GEN_INT (after), GEN_INT (0));
}
@@ -4328,7 +4335,7 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
emit_insn_before (gen_rtx_SET (tmp, pvar), label);
emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
#endif
- emit_insn_before (nvptx_gen_vcast (pvar), tail);
+ emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
}
else
{
@@ -4343,16 +4350,18 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
oacc_bcast_size = GET_MODE_SIZE (SImode);
data.offset = 0;
- emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
+ emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
+ false),
before);
/* Barrier so other workers can see the write. */
- emit_insn_before (nvptx_wsync (false), tail);
+ emit_insn_before (nvptx_cta_sync (false), tail);
data.offset = 0;
- emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
+ emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
+ false), tail);
/* This barrier is needed to avoid worker zero clobbering
the broadcast buffer before all the other workers have
had a chance to read this instance of it. */
- emit_insn_before (nvptx_wsync (false), tail);
+ emit_insn_before (nvptx_cta_sync (false), tail);
}
extract_insn (tail);
@@ -4469,19 +4478,21 @@ nvptx_process_pars (parallel *par)
if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
{
- nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn);
- bool empty = nvptx_wpropagate (true, is_call,
- par->forked_block, par->fork_insn);
+ nvptx_shared_propagate (false, is_call, par->forked_block,
+ par->forked_insn, false);
+ bool empty = nvptx_shared_propagate (true, is_call,
+ par->forked_block, par->fork_insn,
+ false);
if (!empty || !is_call)
{
/* Insert begin and end synchronizations. */
- emit_insn_before (nvptx_wsync (false), par->forked_insn);
- emit_insn_before (nvptx_wsync (false), par->join_insn);
+ emit_insn_before (nvptx_cta_sync (false), par->forked_insn);
+ emit_insn_before (nvptx_cta_sync (false), par->join_insn);
}
}
else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
- nvptx_vpropagate (is_call, par->forked_block, par->forked_insn);
+ nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
/* Now do siblings. */
if (par->next)
@@ -4945,10 +4956,11 @@ nvptx_file_start (void)
fputs ("// END PREAMBLE\n", asm_out_file);
}
-/* Emit a declaration for a worker-level buffer in .shared memory. */
+/* Emit a declaration for a worker and vector-level buffer in .shared
+ memory. */
static void
-write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
+write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
{
const char *name = XSTR (sym, 0);
@@ -4970,11 +4982,11 @@ nvptx_file_end (void)
fputs (func_decls.str().c_str(), asm_out_file);
if (oacc_bcast_size)
- write_worker_buffer (asm_out_file, oacc_bcast_sym,
+ write_shared_buffer (asm_out_file, oacc_bcast_sym,
oacc_bcast_align, oacc_bcast_size);
if (worker_red_size)
- write_worker_buffer (asm_out_file, worker_red_sym,
+ write_shared_buffer (asm_out_file, worker_red_sym,
worker_red_align, worker_red_size);
if (need_softstack_decl)
@@ -5025,7 +5037,7 @@ nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
/* Worker reduction address expander. */
static rtx
-nvptx_expand_worker_addr (tree exp, rtx target,
+nvptx_expand_shared_addr (tree exp, rtx target,
machine_mode ARG_UNUSED (mode), int ignore)
{
if (ignore)
@@ -5161,7 +5173,7 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
return nvptx_expand_shuffle (exp, target, mode, ignore);
case NVPTX_BUILTIN_WORKER_ADDR:
- return nvptx_expand_worker_addr (exp, target, mode, ignore);
+ return nvptx_expand_shared_addr (exp, target, mode, ignore);
case NVPTX_BUILTIN_CMP_SWAP:
case NVPTX_BUILTIN_CMP_SWAPLL:
@@ -5330,7 +5342,7 @@ nvptx_goacc_fork_join (gcall *call, const int dims[],
data at that location. */
static tree
-nvptx_get_worker_red_addr (tree type, tree offset)
+nvptx_get_shared_red_addr (tree type, tree offset)
{
machine_mode mode = TYPE_MODE (type);
tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
@@ -5672,7 +5684,7 @@ nvptx_goacc_reduction_setup (gcall *call)
{
/* Store incoming value to worker reduction buffer. */
tree offset = gimple_call_arg (call, 5);
- tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
+ tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset);
tree ptr = make_ssa_name (TREE_TYPE (call));
gimplify_assign (ptr, call, &seq);
@@ -5814,7 +5826,7 @@ nvptx_goacc_reduction_fini (gcall *call)
{
/* Get reduction buffer address. */
tree offset = gimple_call_arg (call, 5);
- tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
+ tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset);
tree ptr = make_ssa_name (TREE_TYPE (call));
gimplify_assign (ptr, call, &seq);
@@ -5858,7 +5870,7 @@ nvptx_goacc_reduction_teardown (gcall *call)
{
/* Read the worker reduction buffer. */
tree offset = gimple_call_arg (call, 5);
- tree call = nvptx_get_worker_red_addr(TREE_TYPE (var), offset);
+ tree call = nvptx_get_shared_red_addr(TREE_TYPE (var), offset);
tree ptr = make_ssa_name (TREE_TYPE (call));
gimplify_assign (ptr, call, &seq);