This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [og7] vector_length extension part 2: Generalize state propagation and synchronization
- From: Tom de Vries <Tom_deVries at mentor dot com>
- To: Cesar Philippidis <cesar at codesourcery dot com>
- Cc: "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
- Date: Tue, 3 Apr 2018 17:00:37 +0200
- Subject: Re: [og7] vector_length extension part 2: Generalize state propagation and synchronization
- References: <d6642c62-6d01-10ce-dda2-f5fa453ed971@codesourcery.com> <823cc381-8752-14df-d6e2-0203de5da2fb@codesourcery.com>
On 03/02/2018 05:55 PM, Cesar Philippidis wrote:
* config/nvptx/nvptx.c (oacc_bcast_partition): Declare.
One last thing: this variable needs to be reset to zero for every function.
Without this reset, we can generate different code for a function
depending on whether there's another function in front or not.
(populate_offload_attrs): Handle the situation where the default
runtime geometry has not been initialized yet for reductions.
I've moved this bit to "vector_length extension part 4: target hooks and
automatic parallelism".
Built on x86_64 with nvptx accelerator and tested libgomp.
Committed.
Thanks,
- Tom
[nvptx] Generalize state propagation and synchronization
2018-04-03 Cesar Philippidis <cesar@codesourcery.com>
Tom de Vries <tom@codesourcery.com>
* config/nvptx/nvptx.c (oacc_bcast_partition): Declare.
(nvptx_option_override): Init oacc_bcast_partition.
(nvptx_init_oacc_workers): New function.
(nvptx_declare_function_name): Call nvptx_init_oacc_workers.
(nvptx_needs_shared_bcast): New function.
(nvptx_find_par): Generalize to enable vectors to use shared-memory
to propagate state.
(nvptx_shared_propagate): Initialize vector bcast partition and
synchronization state.
(nvptx_single): Generalize to enable vectors to use shared-memory
to propagate state.
(nvptx_process_pars): Likewise.
* config/nvptx/nvptx.h (struct machine_function): Add
bcast_partition and sync_bar members.
---
gcc/config/nvptx/nvptx.c | 137 ++++++++++++++++++++++++++++++++++++++++++-----
gcc/config/nvptx/nvptx.h | 4 ++
2 files changed, 129 insertions(+), 12 deletions(-)
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index d4ff730..0b46e13 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -133,6 +133,7 @@ static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
memory. It'd be nice if PTX supported common blocks, because then
this could be shared across TUs (taking the largest size). */
static unsigned oacc_bcast_size;
+static unsigned oacc_bcast_partition;
static unsigned oacc_bcast_align;
static GTY(()) rtx oacc_bcast_sym;
@@ -157,6 +158,8 @@ static bool need_softstack_decl;
/* True if any function references __nvptx_uni. */
static bool need_unisimt_decl;
+static int nvptx_mach_max_workers ();
+
/* Allocate a new, cleared machine_function structure. */
static struct machine_function *
@@ -210,6 +213,7 @@ nvptx_option_override (void)
oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
+ oacc_bcast_partition = 0;
worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
@@ -1097,6 +1101,40 @@ nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
fprintf (file, "\t}\n");
}
+/* Emit code to initialize OpenACC worker broadcast and synchronization
+ registers. */
+
+static void
+nvptx_init_oacc_workers (FILE *file)
+{
+ fprintf (file, "\t{\n");
+ fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
+ if (cfun->machine->bcast_partition)
+ {
+ fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
+ fprintf (file, "\t\t.reg.u64\t%%y64;\n");
+ }
+ fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
+ if (cfun->machine->bcast_partition)
+ {
+ fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
+ fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
+ fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
+ fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
+ "// vector broadcast offset\n",
+ REGNO (cfun->machine->bcast_partition),
+ oacc_bcast_partition);
+ }
+ /* Verify oacc_bcast_size. */
+ gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
+ <= oacc_bcast_size);
+ if (cfun->machine->sync_bar)
+ fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
+ "// vector synchronization barrier\n",
+ REGNO (cfun->machine->sync_bar));
+ fprintf (file, "\t}\n");
+}
+
/* Emit code to initialize predicate and master lane index registers for
-muniform-simt code generation variant. */
@@ -1323,6 +1361,8 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
if (cfun->machine->unisimt_predicate
|| (cfun->machine->has_simtreg && !crtl->is_leaf))
nvptx_init_unisimt_predicate (file);
+ if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
+ nvptx_init_oacc_workers (file);
}
/* Output code for switching uniform-simt state. ENTERING indicates whether
@@ -3000,6 +3040,19 @@ nvptx_split_blocks (bb_insn_map_t *map)
}
}
+/* Return true if MASK contains parallelism that requires shared
+ memory to broadcast. */
+
+static bool
+nvptx_needs_shared_bcast (unsigned mask)
+{
+ bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
+ bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
+ && nvptx_mach_vector_length () != PTX_WARP_SIZE;
+
+ return worker || large_vector;
+}
+
/* BLOCK is a basic block containing a head or tail instruction.
Locate the associated prehead or pretail instruction, which must be
in the single predecessor block. */
@@ -3075,7 +3128,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
par = new parallel (par, mask);
par->forked_block = block;
par->forked_insn = end;
- if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+ if (nvptx_needs_shared_bcast (mask))
par->fork_insn
= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
}
@@ -3090,7 +3143,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
gcc_assert (par->mask == mask);
par->join_block = block;
par->join_insn = end;
- if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+ if (nvptx_needs_shared_bcast (mask))
par->joining_insn
= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
par = par->parent;
@@ -3947,11 +4000,33 @@ nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
gcc_assert (empty == !data.offset);
if (data.offset)
{
+ rtx bcast_sym = oacc_bcast_sym;
+
/* Stuff was emitted, initialize the base pointer now. */
- rtx init = gen_rtx_SET (data.base, oacc_bcast_sym);
+ if (vector && nvptx_mach_max_workers () > 1)
+ {
+ if (!cfun->machine->bcast_partition)
+ {
+ /* It would be nice to place this register in
+ DATA_AREA_SHARED. */
+ cfun->machine->bcast_partition = gen_reg_rtx (DImode);
+ }
+ if (!cfun->machine->sync_bar)
+ cfun->machine->sync_bar = gen_reg_rtx (SImode);
+
+ bcast_sym = cfun->machine->bcast_partition;
+ }
+
+ rtx init = gen_rtx_SET (data.base, bcast_sym);
emit_insn_after (init, insn);
- oacc_bcast_size = MAX (oacc_bcast_size, data.offset);
+ unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
+ unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
+ ? nvptx_mach_max_workers () + 1
+ : 1);
+
+ oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
+ oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
}
return empty;
}
@@ -4146,7 +4221,8 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
{
rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
- if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
+ if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
+ && nvptx_mach_vector_length () == PTX_WARP_SIZE)
{
/* Vector mode only, do a shuffle. */
#if WORKAROUND_PTXJIT_BUG
@@ -4213,23 +4289,51 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
/* Includes worker mode, do spill & fill. By construction
we should never have worker mode only. */
broadcast_data_t data;
+ unsigned size = GET_MODE_SIZE (SImode);
+ bool vector = true;
rtx barrier = GEN_INT (0);
int threads = 0;
+ if (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask)
+ vector = false;
+
data.base = oacc_bcast_sym;
data.ptr = 0;
- oacc_bcast_size = MAX (oacc_bcast_size, GET_MODE_SIZE (SImode));
+ if (vector
+ && nvptx_mach_max_workers () > 1
+ && cfun->machine->bcast_partition)
+ data.base = cfun->machine->bcast_partition;
+
+ gcc_assert (data.base != NULL);
+
+ unsigned int psize = ROUND_UP (size, oacc_bcast_align);
+ unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
+ ? nvptx_mach_max_workers () + 1
+ : 1);
+
+ oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
+ oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
data.offset = 0;
emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
- false),
+ vector),
before);
+
+ if (vector
+ && nvptx_mach_max_workers () > 1
+ && cfun->machine->sync_bar)
+ {
+ barrier = cfun->machine->sync_bar;
+ threads = nvptx_mach_vector_length ();
+ }
+
/* Barrier so other workers can see the write. */
emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
data.offset = 0;
emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
- false), tail);
+ vector),
+ tail);
/* This barrier is needed to avoid worker zero clobbering
the broadcast buffer before all the other workers have
had a chance to read this instance of it. */
@@ -4342,17 +4446,26 @@ nvptx_process_pars (parallel *par)
}
bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
-
- if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+ bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
+ bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
+ && nvptx_mach_vector_length () > PTX_WARP_SIZE);
+
+ if (worker || large_vector)
{
nvptx_shared_propagate (false, is_call, par->forked_block,
- par->forked_insn, false);
+ par->forked_insn, !worker);
bool empty = nvptx_shared_propagate (true, is_call,
par->forked_block, par->fork_insn,
- false);
+ !worker);
rtx barrier = GEN_INT (0);
int threads = 0;
+ if (!worker && cfun->machine->sync_bar)
+ {
+ barrier = cfun->machine->sync_bar;
+ threads = nvptx_mach_vector_length ();
+ }
+
if (!empty || !is_call)
{
/* Insert begin and end synchronizations. */
diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h
index 784628e..fb9f04b 100644
--- a/gcc/config/nvptx/nvptx.h
+++ b/gcc/config/nvptx/nvptx.h
@@ -228,6 +228,10 @@ struct GTY(()) machine_function
rtx axis_predicate[2]; /* Neutering predicates. */
int axis_dim[2]; /* Maximum number of threads on each axis, dim[0] is
vector_length, dim[1] is num_workers. */
+ rtx bcast_partition; /* Register containing the size of each
+ vector's partition of shared memory used to
+ broadcast state. */
+ rtx sync_bar; /* Synchronization barrier ID for vectors. */
rtx unisimt_master; /* 'Master lane index' for -muniform-simt. */
rtx unisimt_predicate; /* Predicate for -muniform-simt. */
rtx unisimt_location; /* Mask location for -muniform-simt. */