This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH, 4/16] Implement -foffload-alias
- From: Tom de Vries <Tom_deVries at mentor dot com>
- To: Jakub Jelinek <jakub at redhat dot com>, Richard Biener <rguenther at suse dot de>
- Cc: "gcc-patches at gnu dot org" <gcc-patches at gnu dot org>
- Date: Sat, 21 Nov 2015 12:43:44 +0100
- Subject: Re: [PATCH, 4/16] Implement -foffload-alias
- Authentication-results: sourceware.org; auth=none
- References: <5640BD31 dot 2060602 at mentor dot com> <5640C560 dot 1000007 at mentor dot com> <alpine dot LSU dot 2 dot 11 dot 1511111150020 dot 4884 at t29 dot fhfr dot qr> <20151111110034 dot GF5675 at tucnak dot redhat dot com> <5644B84D dot 6050504 at mentor dot com> <alpine dot LSU dot 2 dot 11 dot 1511130944270 dot 4884 at t29 dot fhfr dot qr> <5645C33B dot 9080802 at mentor dot com> <alpine dot LSU dot 2 dot 11 dot 1511131228450 dot 4884 at t29 dot fhfr dot qr> <20151113113938 dot GM5675 at tucnak dot redhat dot com>
On 13/11/15 12:39, Jakub Jelinek wrote:
On Fri, Nov 13, 2015 at 12:29:51PM +0100, Richard Biener wrote:
thanks for the explanation. Filed as PR68331 - '[meta-bug] fipa-pta issues'.
Any feedback on the '#pragma GCC offload-alias=<none|pointer|all>' bit above?
Is that sort of what you had in mind?
Yes. Whether that makes sense is another question of course. You can
annotate memory references with MR_DEPENDENCE_BASE/CLIQUE yourself
as well if you know dependences without the users intervention.
I really don't like even the GCC offload-alias, I just don't see anything
special on the offload code. Not to mention that the same issue is already
with other outlined functions, like OpenMP tasks or parallel regions, those
aren't offloaded, yet they can suffer from worse alias/points-to analysis
too.
AFAIU there is one aspect that is different for offloaded code: the
setup of the data on the device.
Consider this example:
...
unsigned int a[N];
unsigned int b[N];
unsigned int c[N];
int
main (void)
{
...
#pragma acc kernels copyin (a) copyin (b) copyout (c)
{
for (COUNTERTYPE ii = 0; ii < N; ii++)
c[ii] = a[ii] + b[ii];
}
...
...
At gimple level, we have:
...
#pragma omp target oacc_kernels \
map(force_from:c [len: 2097152]) \
map(force_to:b [len: 2097152]) \
map(force_to:a [len: 2097152])
...
[ The meaning of the force_from/force_to mappings is given in
include/gomp-constants.h:
...
/* Allocate. */
GOMP_MAP_FORCE_ALLOC = (GOMP_MAP_FLAG_FORCE | GOMP_MAP_ALLOC),
/* ..., and copy to device. */
GOMP_MAP_FORCE_TO = (GOMP_MAP_FLAG_FORCE | GOMP_MAP_TO),
/* ..., and copy from device. */
GOMP_MAP_FORCE_FROM = (GOMP_MAP_FLAG_FORCE | GOMP_MAP_FROM),
/* ..., and copy to and from device. */
GOMP_MAP_FORCE_TOFROM = (GOMP_MAP_FLAG_FORCE | GOMP_MAP_TOFROM),
... ]
So before calling the offloaded function, a separate alloc is done for
a, b and c, and the base pointers of the newly allocated objects are
passed to the offloaded function.
This means we can mark those base pointers as restrict in the offloaded
function.
Attached proof-of-concept patch implements that.
We simply have some compiler internal interface between the caller and
callee of the outlined regions, each interface in between those has
its own structure type used to communicate the info;
we can attach attributes on the fields, or some flags to indicate some
properties interesting from aliasing POV.
We don't really need to perform
full IPA-PTA, perhaps it would be enough to a) record somewhere in cgraph
the relationship in between such callers and callees (for offloading regions
we already have "omp target entrypoint" attribute on the callee and a
singler caller), tell LTO if possible not to split those into different
partitions if easily possible, and then just for these pairs perform
aliasing/points-to analysis in the caller and the result record using
cliques/special attributes/whatever to the callee side, so that the callee
(outlined OpenMP/OpenACC/Cilk+ region) can then improve its alias analysis.
As a start, is the approach of this patch OK?
It will allow us to commit the oacc kernels patch series with the
ability to parallelize non-trivial testcases, and work on improving the
alias bit after that.
Thanks,
- Tom
Mark pointers to allocated target vars as restricted, if possible
---
gcc/omp-low.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 62 insertions(+), 5 deletions(-)
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 268b67b..0ce822d 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -1372,7 +1372,8 @@ build_sender_ref (tree var, omp_context *ctx)
/* Add a new field for VAR inside the structure CTX->SENDER_DECL. */
static void
-install_var_field (tree var, bool by_ref, int mask, omp_context *ctx)
+install_var_field_1 (tree var, bool by_ref, int mask, omp_context *ctx,
+ bool base_pointers_restrict)
{
tree field, type, sfield = NULL_TREE;
splay_tree_key key = (splay_tree_key) var;
@@ -1396,7 +1397,11 @@ install_var_field (tree var, bool by_ref, int mask, omp_context *ctx)
type = build_pointer_type (build_pointer_type (type));
}
else if (by_ref)
- type = build_pointer_type (type);
+ {
+ type = build_pointer_type (type);
+ if (base_pointers_restrict)
+ type = build_qualified_type (type, TYPE_QUAL_RESTRICT);
+ }
else if ((mask & 3) == 1 && is_reference (var))
type = TREE_TYPE (type);
@@ -1460,6 +1465,12 @@ install_var_field (tree var, bool by_ref, int mask, omp_context *ctx)
splay_tree_insert (ctx->sfield_map, key, (splay_tree_value) sfield);
}
+static void
+install_var_field (tree var, bool by_ref, int mask, omp_context *ctx)
+{
+ install_var_field_1 (var, by_ref, mask, ctx, false);
+}
+
static tree
install_var_local (tree var, omp_context *ctx)
{
@@ -1816,7 +1827,8 @@ fixup_child_record_type (omp_context *ctx)
specified by CLAUSES. */
static void
-scan_sharing_clauses (tree clauses, omp_context *ctx)
+scan_sharing_clauses_1 (tree clauses, omp_context *ctx,
+ bool base_pointers_restrict)
{
tree c, decl;
bool scan_array_reductions = false;
@@ -2073,7 +2085,7 @@ scan_sharing_clauses (tree clauses, omp_context *ctx)
&& TREE_CODE (TREE_TYPE (decl)) == ARRAY_TYPE)
install_var_field (decl, true, 7, ctx);
else
- install_var_field (decl, true, 3, ctx);
+ install_var_field_1 (decl, true, 3, ctx, base_pointers_restrict);
if (is_gimple_omp_offloaded (ctx->stmt))
install_var_local (decl, ctx);
}
@@ -2339,6 +2351,12 @@ scan_sharing_clauses (tree clauses, omp_context *ctx)
scan_omp (&OMP_CLAUSE_LINEAR_GIMPLE_SEQ (c), ctx);
}
+static void
+scan_sharing_clauses (tree clauses, omp_context *ctx)
+{
+ scan_sharing_clauses_1 (clauses, ctx, false);
+}
+
/* Create a new name for omp child function. Returns an identifier. If
IS_CILK_FOR is true then the suffix for the child function is
"_cilk_for_fn." */
@@ -3056,13 +3074,52 @@ scan_omp_target (gomp_target *stmt, omp_context *outer_ctx)
DECL_NAMELESS (name) = 1;
TYPE_NAME (ctx->record_type) = name;
TYPE_ARTIFICIAL (ctx->record_type) = 1;
+
+ bool base_pointers_restrict = false;
if (offloaded)
{
create_omp_child_function (ctx, false);
gimple_omp_target_set_child_fn (stmt, ctx->cb.dst_fn);
+
+ /* If all the clauses force allocation, we can be certain that the objects
+ on the target are disjoint, and therefore mark the base pointers as
+ restrict. */
+ base_pointers_restrict = true;
+ tree c;
+ for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+ {
+ switch (OMP_CLAUSE_CODE (c))
+ {
+ case OMP_CLAUSE_MAP:
+ switch (OMP_CLAUSE_MAP_KIND (c))
+ {
+ case GOMP_MAP_ALLOC:
+ case GOMP_MAP_FORCE_TO:
+ case GOMP_MAP_FORCE_FROM:
+ case GOMP_MAP_FORCE_TOFROM:
+ break;
+ default:
+ base_pointers_restrict = false;
+ break;
+ }
+ break;
+
+ default:
+ base_pointers_restrict = false;
+ break;
+ }
+
+ if (!base_pointers_restrict)
+ break;
+ }
+ if (base_pointers_restrict)
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "Base pointers in offloaded function are restrict\n");
+ }
}
- scan_sharing_clauses (clauses, ctx);
+ scan_sharing_clauses_1 (clauses, ctx, base_pointers_restrict);
scan_omp (gimple_omp_body_ptr (stmt), ctx);
if (TYPE_FIELDS (ctx->record_type) == NULL)