[gomp4] Re: [2/3] OpenACC reductions
Thomas Schwinge
thomas@codesourcery.com
Fri Nov 6 10:48:00 GMT 2015
Hi Nathan!
On Wed, 4 Nov 2015 11:59:28 -0500, Nathan Sidwell <nathan@acm.org> wrote:
> [PTX backend pieces of OpenACC reduction handling]
Merged your trunk r229768 into gomp-4_0-branch in r229836:
commit 089a0224af68e30b55f42734de48adc645eb7370
Merge: 2b76127 78a78aa
Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4>
Date: Fri Nov 6 09:38:10 2015 +0000
svn merge -r 229767:229768 svn+ssh://gcc.gnu.org/svn/gcc/trunk
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@229836 138bc75d-0d04-0410-961f-82ee72b054a4
gcc/ChangeLog | 23 +++++++
gcc/config/nvptx/nvptx.c | 169 +++++++++++++++++++++++------------------------
2 files changed, 107 insertions(+), 85 deletions(-)
I hope I did the right thing by replacing the existing code on
gomp-4_0-branch with what you committed to trunk, in particular the
nvptx_lockless_update and nvptx_goacc_reduction_init functions. That is,
in the merge commit, I effectively applied the following patch
(gomp-4_0-branch before vs. after):
--- gcc/ChangeLog
+++ gcc/ChangeLog
[...]
--- gcc/config/nvptx/nvptx.c
+++ gcc/config/nvptx/nvptx.c
@@ -57,21 +57,22 @@
[#include directives reshuffled]
@@ -104,19 +105,18 @@ struct tree_hasher : ggc_cache_ptr_hash<tree_node>
static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
-/* Size of buffer needed to broadcast across workers. This is used
- for both worker-neutering and worker broadcasting. It is shared
- by all functions emitted. The buffer is placed in shared memory.
- It'd be nice if PTX supported common blocks, because then this
- could be shared across TUs (taking the largest size). */
+/* Buffer needed to broadcast across workers. This is used for both
+ worker-neutering and worker broadcasting. It is shared by all
+ functions emitted. The buffer is placed in shared memory. It'd be
+ nice if PTX supported common blocks, because then this could be
+ shared across TUs (taking the largest size). */
static unsigned worker_bcast_size;
static unsigned worker_bcast_align;
#define worker_bcast_name "__worker_bcast"
static GTY(()) rtx worker_bcast_sym;
-/* Size of buffer needed for worker reductions. This has to be
- distinct from the worker broadcast array, as both may be live
- concurrently. */
+/* Buffer needed for worker reductions. This has to be distinct from
+ the worker broadcast array, as both may be live concurrently. */
static unsigned worker_red_size;
static unsigned worker_red_align;
#define worker_red_name "__worker_red"
@@ -3977,8 +3977,8 @@ nvptx_file_end (void)
{
/* Define the reduction buffer. */
- worker_red_size = (worker_red_size + worker_red_align - 1)
- & ~(worker_red_align - 1);
+ worker_red_size = ((worker_red_size + worker_red_align - 1)
+ & ~(worker_red_align - 1));
fprintf (asm_out_file, "// BEGIN VAR DEF: %s\n", worker_red_name);
fprintf (asm_out_file, ".shared .align %d .u8 %s[%d];\n",
@@ -3986,7 +3986,7 @@ nvptx_file_end (void)
worker_red_name, worker_red_size);
}
}
-
+
/* Expander for the shuffle builtins. */
static rtx
@@ -4046,6 +4046,10 @@ nvptx_expand_worker_addr (tree exp, rtx target,
return target;
}
+/* Expand the CMP_SWAP PTX builtins. We have our own versions that do
+ not require taking the address of any object, other than the memory
+ cell being operated on. */
+
static rtx
nvptx_expand_cmp_swap (tree exp, rtx target,
machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
@@ -4096,7 +4100,7 @@ static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
/* Return the NVPTX builtin for CODE. */
static tree
-nvptx_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
+nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
{
if (code >= NVPTX_BUILTIN_MAX)
return error_mark_node;
@@ -4110,10 +4114,10 @@ static void
nvptx_init_builtins (void)
{
#define DEF(ID, NAME, T) \
- (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] = \
- add_builtin_function ("__builtin_nvptx_" NAME, \
- build_function_type_list T, \
- NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
+ (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
+ = add_builtin_function ("__builtin_nvptx_" NAME, \
+ build_function_type_list T, \
+ NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
#define ST sizetype
#define UINT unsigned_type_node
#define LLUINT long_long_unsigned_type_node
@@ -4140,7 +4144,7 @@ nvptx_init_builtins (void)
IGNORE is nonzero if the value is to be ignored. */
static rtx
-nvptx_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
+nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
machine_mode mode, int ignore)
{
tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
@@ -4239,6 +4243,10 @@ nvptx_goacc_fork_join (gcall *call, const int dims[],
return true;
}
+/* Generate a PTX builtin function call that returns the address in
+ the worker reduction buffer at OFFSET. TYPE is the type of the
+ data at that location. */
+
static tree
nvptx_get_worker_red_addr (tree type, tree offset)
{
@@ -4263,30 +4271,19 @@ nvptx_generate_vector_shuffle (location_t loc,
unsigned fn = NVPTX_BUILTIN_SHUFFLE;
tree_code code = NOP_EXPR;
tree type = unsigned_type_node;
+ enum machine_mode mode = TYPE_MODE (TREE_TYPE (var));
- switch (TYPE_MODE (TREE_TYPE (var)))
+ if (!INTEGRAL_MODE_P (mode))
+ code = VIEW_CONVERT_EXPR;
+ if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (DImode))
{
- case SFmode:
- code = VIEW_CONVERT_EXPR;
- /* FALLTHROUGH */
- case SImode:
- break;
-
- case DFmode:
- code = VIEW_CONVERT_EXPR;
- /* FALLTHROUGH */
- case DImode:
- type = long_long_unsigned_type_node;
fn = NVPTX_BUILTIN_SHUFFLELL;
- break;
-
- default:
- gcc_unreachable ();
+ type = long_long_unsigned_type_node;
}
tree call = nvptx_builtin_decl (fn, true);
call = build_call_expr_loc
- (loc, call, 3, build1 (code, type, var),
+ (loc, call, 3, fold_build1 (code, type, var),
build_int_cst (unsigned_type_node, shift),
build_int_cst (unsigned_type_node, SHUFFLE_DOWN));
@@ -4295,6 +4292,9 @@ nvptx_generate_vector_shuffle (location_t loc,
gimplify_assign (dest_var, call, seq);
}
+/* Insert code to locklessly update *PTR with *PTR OP VAR just before
+ GSI. */
+
static tree
nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
tree ptr, tree var, tree_code op)
@@ -4303,24 +4303,14 @@ nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
tree_code code = NOP_EXPR;
tree type = unsigned_type_node;
- switch (TYPE_MODE (TREE_TYPE (var)))
- {
- case SFmode:
- code = VIEW_CONVERT_EXPR;
- /* FALLTHROUGH */
- case SImode:
- break;
+ enum machine_mode mode = TYPE_MODE (TREE_TYPE (var));
- case DFmode:
- code = VIEW_CONVERT_EXPR;
- /* FALLTHROUGH */
- case DImode:
- type = long_long_unsigned_type_node;
+ if (!INTEGRAL_MODE_P (mode))
+ code = VIEW_CONVERT_EXPR;
+ if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (DImode))
+ {
fn = NVPTX_BUILTIN_CMP_SWAPLL;
- break;
-
- default:
- gcc_unreachable ();
+ type = long_long_unsigned_type_node;
}
gimple_seq init_seq = NULL;
@@ -4354,21 +4344,26 @@ nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
/* Split the block just after the init stmts. */
basic_block pre_bb = gsi_bb (*gsi);
edge pre_edge = split_block (pre_bb, init_end);
- basic_block post_bb = pre_edge->dest;
+ basic_block loop_bb = pre_edge->dest;
+ pre_bb = pre_edge->src;
/* Reset the iterator. */
*gsi = gsi_for_stmt (gsi_stmt (*gsi));
- basic_block loop_bb = create_empty_bb (pre_bb);
- gimple_stmt_iterator loop_gsi = gsi_start_bb (loop_bb);
- gsi_insert_seq_after (&loop_gsi, loop_seq, GSI_CONTINUE_LINKING);
+ /* Insert the loop statements. */
+ gimple *loop_end = gimple_seq_last (loop_seq);
+ gsi_insert_seq_before (gsi, loop_seq, GSI_SAME_STMT);
- make_edge (loop_bb, post_bb, EDGE_TRUE_VALUE);
- redirect_edge_succ (pre_edge, loop_bb);
+ /* Split the block just after the loop stmts. */
+ edge post_edge = split_block (loop_bb, loop_end);
+ basic_block post_bb = post_edge->dest;
+ loop_bb = post_edge->src;
+ *gsi = gsi_for_stmt (gsi_stmt (*gsi));
+
+ post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
- add_bb_to_loop (loop_bb, pre_bb->loop_father);
set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
-
+
gphi *phi = create_phi_node (expect_var, loop_bb);
add_phi_arg (phi, init_var, pre_edge, loc);
add_phi_arg (phi, actual_var, loop_edge, loc);
@@ -4455,34 +4450,38 @@ nvptx_goacc_reduction_init (gcall *call)
gimple_seq_add_stmt (&seq, cond_stmt);
/* Split the block just after the call. */
- basic_block call_bb = gsi_bb (gsi);
- edge nop_edge = split_block (call_bb, call);
- basic_block dst_bb = nop_edge->dest;
+ edge init_edge = split_block (gsi_bb (gsi), call);
+ basic_block init_bb = init_edge->dest;
+ basic_block call_bb = init_edge->src;
- /* Create the initialization block. */
+ /* Fixup flags from call_bb to init_bb. */
+ init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
+
+ /* Set the initialization stmts. */
gimple_seq init_seq = NULL;
tree init_var = make_ssa_name (TREE_TYPE (var));
gimplify_assign (init_var, init, &init_seq);
- /* One would think create_basic_block is the right thing to use
- here to create a new BB and set its gimple sequence. Sadly
- that doesn't set the stmts' bb field :( */
- basic_block init_bb = create_empty_bb (call_bb);
- gimple_stmt_iterator init_gsi = gsi_start_bb (init_bb);
- gsi_insert_seq_after (&init_gsi, init_seq, GSI_CONTINUE_LINKING);
-
- /* Link the init block in between the call and dst blocks. */
- make_edge (call_bb, init_bb, EDGE_TRUE_VALUE);
- edge init_edge = make_edge (init_bb, dst_bb, EDGE_FALLTHRU);
- add_bb_to_loop (init_bb, call_bb->loop_father);
- set_immediate_dominator (CDI_DOMINATORS, init_bb, call_bb);
-
- /* Mark the edge linking call to dst to non-fallthrough false edge. */
- nop_edge->flags ^= EDGE_FALLTHRU | EDGE_FALSE_VALUE;
+ gsi = gsi_start_bb (init_bb);
+ gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
+
+ /* Split block just after the init stmt. */
+ gsi_prev (&gsi);
+ edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
+ basic_block dst_bb = inited_edge->dest;
+ /* Create false edge from call_bb to dst_bb. */
+ edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
+
/* Create phi node in dst block. */
gphi *phi = create_phi_node (lhs, dst_bb);
- add_phi_arg (phi, init_var, init_edge, gimple_location (call));
+ add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
add_phi_arg (phi, var, nop_edge, gimple_location (call));
+
+ /* Reset dominator of dst bb. */
+ set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
+
+ /* Reset the gsi. */
+ gsi = gsi_for_stmt (call);
}
else
{
Grüße
Thomas
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 472 bytes
Desc: not available
URL: <http://gcc.gnu.org/pipermail/gcc-patches/attachments/20151106/a2775c18/attachment.sig>
More information about the Gcc-patches mailing list