This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH] Fix PR43023: fuse_partitions_with_similar_memory_accesses.
- From: Sebastian Pop <sebpop at gmail dot com>
- To: gcc-patches at gcc dot gnu dot org
- Cc: rguenther at suse dot de, Sebastian Pop <sebpop at gmail dot com>
- Date: Fri, 10 Dec 2010 13:45:31 -0600
- Subject: [PATCH] Fix PR43023: fuse_partitions_with_similar_memory_accesses.
- References: <AANLkTinip9ErfoARM1SaJNo_jWJ0LHTxwiWDx=U8c29g@mail.gmail.com>
Hi,
here is the backport of the same patch for the gcc4.5 branch. Please
let me know when you want to commit this patch to the branch. For now
I sent this out for test on amd64-linux.
Sebastian
2010-12-10 Sebastian Pop <sebastian.pop@amd.com>
PR tree-optimization/43023
* tree-data-ref.c (mem_write_stride_of_same_size_as_unit_type_p):
Removed.
(stores_zero_from_loop): Call stmt_stores_zero.
(stmt_with_adjacent_zero_store_dr_p): New.
* tree-data-ref.h (stmt_with_adjacent_zero_store_dr_p): Declared.
(stride_of_unit_type_p): New.
* tree-loop-distribution.c (generate_memset_zero): Do not return a
boolean. Call gcc_assert on stride_of_unit_type_p.
(generate_builtin): Call stmt_stores_zero.
(rdg_flag_all_uses): Removed.
(rdg_flag_similar_memory_accesses): Removed.
(build_rdg_partition_for_component): Removed parameter
other_stores. Removed call to rdg_flag_similar_memory_accesses.
(can_generate_builtin): New.
(similar_memory_accesses): New.
(fuse_partitions_with_similar_memory_accesses): New.
(rdg_build_partitions): Call
fuse_partitions_with_similar_memory_accesses.
* gfortran.dg/ldist-1.f90: Adjust pattern.
* gfortran.dg/ldist-pr43023.f90: New.
---
gcc/ChangeLog | 22 +++
gcc/testsuite/ChangeLog | 6 +
gcc/testsuite/gfortran.dg/ldist-1.f90 | 5 +-
gcc/testsuite/gfortran.dg/ldist-pr43023.f90 | 31 ++++
gcc/tree-data-ref.c | 34 ++++-
gcc/tree-data-ref.h | 12 ++
gcc/tree-loop-distribution.c | 223 +++++++++++----------------
7 files changed, 201 insertions(+), 132 deletions(-)
create mode 100644 gcc/testsuite/gfortran.dg/ldist-pr43023.f90
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index abecb83..dfe45ed 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,25 @@
+2010-12-10 Sebastian Pop <sebastian.pop@amd.com>
+
+ PR tree-optimization/43023
+ * tree-data-ref.c (mem_write_stride_of_same_size_as_unit_type_p):
+ Removed.
+ (stores_zero_from_loop): Call stmt_stores_zero.
+ (stmt_with_adjacent_zero_store_dr_p): New.
+ * tree-data-ref.h (stmt_with_adjacent_zero_store_dr_p): Declared.
+ (stride_of_unit_type_p): New.
+ * tree-loop-distribution.c (generate_memset_zero): Do not return a
+ boolean. Call gcc_assert on stride_of_unit_type_p.
+ (generate_builtin): Call stmt_stores_zero.
+ (rdg_flag_all_uses): Removed.
+ (rdg_flag_similar_memory_accesses): Removed.
+ (build_rdg_partition_for_component): Removed parameter
+ other_stores. Removed call to rdg_flag_similar_memory_accesses.
+ (can_generate_builtin): New.
+ (similar_memory_accesses): New.
+ (fuse_partitions_with_similar_memory_accesses): New.
+ (rdg_build_partitions): Call
+ fuse_partitions_with_similar_memory_accesses.
+
2010-12-07 Jakub Jelinek <jakub@redhat.com>
Backport from mainline
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index e3a9d24..36fd59d 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,9 @@
+2010-12-10 Sebastian Pop <sebastian.pop@amd.com>
+
+ PR tree-optimization/43023
+ * gfortran.dg/ldist-1.f90: Adjust pattern.
+ * gfortran.dg/ldist-pr43023.f90: New.
+
2010-12-07 Sebastian Pop <sebastian.pop@amd.com>
PR tree-optimization/44676
diff --git a/gcc/testsuite/gfortran.dg/ldist-1.f90 b/gcc/testsuite/gfortran.dg/ldist-1.f90
index dd1f02a..bbce2f3 100644
--- a/gcc/testsuite/gfortran.dg/ldist-1.f90
+++ b/gcc/testsuite/gfortran.dg/ldist-1.f90
@@ -29,5 +29,8 @@ Subroutine PADEC(DKS,DKDS,HVAR,WM,WG,FN,NS,AN,BN,CN,IT)
return
end Subroutine PADEC
-! { dg-final { scan-tree-dump-times "distributed: split to 4 loops" 1 "ldist" } }
+! There are 5 legal partitions in this code. Based on the data
+! locality heuristic, this loop should not be split.
+
+! { dg-final { scan-tree-dump-not "distributed: split to" "ldist" } }
! { dg-final { cleanup-tree-dump "ldist" } }
diff --git a/gcc/testsuite/gfortran.dg/ldist-pr43023.f90 b/gcc/testsuite/gfortran.dg/ldist-pr43023.f90
new file mode 100644
index 0000000..3e2d04c
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/ldist-pr43023.f90
@@ -0,0 +1,31 @@
+! { dg-do compile }
+! { dg-options "-O2 -ftree-loop-distribution" }
+
+MODULE NFT_mod
+
+implicit none
+integer :: Nangle
+real:: Z0
+real, dimension(:,:), allocatable :: Angle
+real, dimension(:), allocatable :: exth, ezth, hxth, hyth, hyphi
+
+CONTAINS
+
+SUBROUTINE NFT_Init()
+
+real :: th, fi
+integer :: n
+
+do n = 1,Nangle
+ th = Angle(n,1)
+ fi = Angle(n,2)
+
+ exth(n) = cos(fi)*cos(th)
+ ezth(n) = -sin(th)
+ hxth(n) = -sin(fi)
+ hyth(n) = cos(fi)
+ hyphi(n) = -sin(fi)
+end do
+END SUBROUTINE NFT_Init
+
+END MODULE NFT_mod
diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c
index a89d151..dab0177 100644
--- a/gcc/tree-data-ref.c
+++ b/gcc/tree-data-ref.c
@@ -4594,7 +4594,7 @@ dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
for (e = v->succ; e; e = e->succ_next)
fprintf (file, " %d", e->dest);
- fprintf (file, ") \n");
+ fprintf (file, ")\n");
print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
fprintf (file, ")\n");
}
@@ -4991,6 +4991,38 @@ stores_from_loop (struct loop *loop, VEC (gimple, heap) **stmts)
free (bbs);
}
+/* Returns true when the statement at STMT is of the form "A[i] = 0"
+ that contains a data reference on its LHS with a stride of the same
+ size as its unit type. */
+
+bool
+stmt_with_adjacent_zero_store_dr_p (gimple stmt)
+{
+ tree op0, op1;
+ bool res;
+ struct data_reference *dr;
+
+ if (!stmt
+ || !gimple_vdef (stmt)
+ || !is_gimple_assign (stmt)
+ || !gimple_assign_single_p (stmt)
+ || !(op1 = gimple_assign_rhs1 (stmt))
+ || !(integer_zerop (op1) || real_zerop (op1)))
+ return false;
+
+ dr = XCNEW (struct data_reference);
+ op0 = gimple_assign_lhs (stmt);
+
+ DR_STMT (dr) = stmt;
+ DR_REF (dr) = op0;
+
+ res = dr_analyze_innermost (dr)
+ && stride_of_unit_type_p (DR_STEP (dr), TREE_TYPE (op0));
+
+ free_data_ref (dr);
+ return res;
+}
+
/* For a data reference REF, return the declaration of its base
address or NULL_TREE if the base is not determined. */
diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h
index 678eb10..90cbca1 100644
--- a/gcc/tree-data-ref.h
+++ b/gcc/tree-data-ref.h
@@ -567,6 +567,18 @@ void stores_from_loop (struct loop *, VEC (gimple, heap) **);
void remove_similar_memory_refs (VEC (gimple, heap) **);
bool rdg_defs_used_in_other_loops_p (struct graph *, int);
bool have_similar_memory_accesses (gimple, gimple);
+bool stmt_with_adjacent_zero_store_dr_p (gimple);
+
+/* Returns true when STRIDE is equal in absolute value to the size of
+ the unit type of TYPE. */
+
+static inline bool
+stride_of_unit_type_p (tree stride, tree type)
+{
+ return tree_int_cst_equal (fold_unary (ABS_EXPR, TREE_TYPE (stride),
+ stride),
+ TYPE_SIZE_UNIT (type));
+}
/* Determines whether RDG vertices V1 and V2 access to similar memory
locations, in which case they have to be in the same partition. */
diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c
index a328860..2e420ea 100644
--- a/gcc/tree-loop-distribution.c
+++ b/gcc/tree-loop-distribution.c
@@ -251,7 +251,7 @@ build_size_arg_loc (location_t loc, tree nb_iter, tree op,
/* Generate a call to memset. Return true when the operation succeeded. */
-static bool
+static void
generate_memset_zero (gimple stmt, tree op0, tree nb_iter,
gimple_stmt_iterator bsi)
{
@@ -265,45 +265,27 @@ generate_memset_zero (gimple stmt, tree op0, tree nb_iter,
DR_STMT (dr) = stmt;
DR_REF (dr) = op0;
- if (!dr_analyze_innermost (dr))
- goto end;
+ res = dr_analyze_innermost (dr);
+ gcc_assert (res && stride_of_unit_type_p (DR_STEP (dr), TREE_TYPE (op0)));
- /* Test for a positive stride, iterating over every element. */
- if (integer_zerop (size_binop (MINUS_EXPR,
- fold_convert (sizetype, DR_STEP (dr)),
- TYPE_SIZE_UNIT (TREE_TYPE (op0)))))
- {
- addr_base = fold_convert_loc (loc, sizetype,
- size_binop_loc (loc, PLUS_EXPR,
- DR_OFFSET (dr),
- DR_INIT (dr)));
- addr_base = fold_build2_loc (loc, POINTER_PLUS_EXPR,
- TREE_TYPE (DR_BASE_ADDRESS (dr)),
- DR_BASE_ADDRESS (dr), addr_base);
-
- nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list);
- }
+ nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list);
+ addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr));
+ addr_base = fold_convert_loc (loc, sizetype, addr_base);
/* Test for a negative stride, iterating over every element. */
- else if (integer_zerop (size_binop (PLUS_EXPR,
- TYPE_SIZE_UNIT (TREE_TYPE (op0)),
- fold_convert (sizetype, DR_STEP (dr)))))
+ if (integer_zerop (size_binop (PLUS_EXPR,
+ TYPE_SIZE_UNIT (TREE_TYPE (op0)),
+ fold_convert (sizetype, DR_STEP (dr)))))
{
- nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list);
-
- addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr));
- addr_base = fold_convert_loc (loc, sizetype, addr_base);
addr_base = size_binop_loc (loc, MINUS_EXPR, addr_base,
fold_convert_loc (loc, sizetype, nb_bytes));
addr_base = size_binop_loc (loc, PLUS_EXPR, addr_base,
TYPE_SIZE_UNIT (TREE_TYPE (op0)));
- addr_base = fold_build2_loc (loc, POINTER_PLUS_EXPR,
- TREE_TYPE (DR_BASE_ADDRESS (dr)),
- DR_BASE_ADDRESS (dr), addr_base);
}
- else
- goto end;
+ addr_base = fold_build2_loc (loc, POINTER_PLUS_EXPR,
+ TREE_TYPE (DR_BASE_ADDRESS (dr)),
+ DR_BASE_ADDRESS (dr), addr_base);
mem = force_gimple_operand (addr_base, &stmts, true, NULL);
gimple_seq_add_seq (&stmt_list, stmts);
@@ -311,14 +293,11 @@ generate_memset_zero (gimple stmt, tree op0, tree nb_iter,
fn_call = gimple_build_call (fn, 3, mem, integer_zero_node, nb_bytes);
gimple_seq_add_stmt (&stmt_list, fn_call);
gsi_insert_seq_after (&bsi, stmt_list, GSI_CONTINUE_LINKING);
- res = true;
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "generated memset zero\n");
- end:
free_data_ref (dr);
- return res;
}
/* Tries to generate a builtin function for the instructions of LOOP
@@ -332,7 +311,6 @@ generate_builtin (struct loop *loop, bitmap partition, bool copy_p)
unsigned i, x = 0;
basic_block *bbs;
gimple write = NULL;
- tree op0, op1;
gimple_stmt_iterator bsi;
tree nb_iter = number_of_exit_cond_executions (loop);
@@ -368,26 +346,17 @@ generate_builtin (struct loop *loop, bitmap partition, bool copy_p)
}
}
- if (!write)
- goto end;
-
- op0 = gimple_assign_lhs (write);
- op1 = gimple_assign_rhs1 (write);
-
- if (!(TREE_CODE (op0) == ARRAY_REF
- || TREE_CODE (op0) == INDIRECT_REF))
+ if (!stmt_with_adjacent_zero_store_dr_p (write))
goto end;
/* The new statements will be placed before LOOP. */
bsi = gsi_last_bb (loop_preheader_edge (loop)->src);
-
- if (gimple_assign_rhs_code (write) == INTEGER_CST
- && (integer_zerop (op1) || real_zerop (op1)))
- res = generate_memset_zero (write, op0, nb_iter, bsi);
+ generate_memset_zero (write, gimple_assign_lhs (write), nb_iter, bsi);
+ res = true;
/* If this is the last partition for which we generate code, we have
to destroy the loop. */
- if (res && !copy_p)
+ if (!copy_p)
{
unsigned nbbs = loop->num_nodes;
edge exit = single_exit (loop);
@@ -531,24 +500,6 @@ has_upstream_mem_writes (int u)
static void rdg_flag_vertex_and_dependent (struct graph *, int, bitmap, bitmap,
bitmap, bool *);
-/* Flag all the uses of U. */
-
-static void
-rdg_flag_all_uses (struct graph *rdg, int u, bitmap partition, bitmap loops,
- bitmap processed, bool *part_has_writes)
-{
- struct graph_edge *e;
-
- for (e = rdg->vertices[u].succ; e; e = e->succ_next)
- if (!bitmap_bit_p (processed, e->dest))
- {
- rdg_flag_vertex_and_dependent (rdg, e->dest, partition, loops,
- processed, part_has_writes);
- rdg_flag_all_uses (rdg, e->dest, partition, loops, processed,
- part_has_writes);
- }
-}
-
/* Flag the uses of U stopping following the information from
upstream_mem_writes. */
@@ -720,68 +671,13 @@ rdg_flag_loop_exits (struct graph *rdg, bitmap loops, bitmap partition,
}
}
-/* Flag all the nodes of RDG containing memory accesses that could
- potentially belong to arrays already accessed in the current
- PARTITION. */
-
-static void
-rdg_flag_similar_memory_accesses (struct graph *rdg, bitmap partition,
- bitmap loops, bitmap processed,
- VEC (int, heap) **other_stores)
-{
- bool foo;
- unsigned i, n;
- int j, k, kk;
- bitmap_iterator ii;
- struct graph_edge *e;
-
- EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, ii)
- if (RDG_MEM_WRITE_STMT (rdg, i)
- || RDG_MEM_READS_STMT (rdg, i))
- {
- for (j = 0; j < rdg->n_vertices; j++)
- if (!bitmap_bit_p (processed, j)
- && (RDG_MEM_WRITE_STMT (rdg, j)
- || RDG_MEM_READS_STMT (rdg, j))
- && rdg_has_similar_memory_accesses (rdg, i, j))
- {
- /* Flag first the node J itself, and all the nodes that
- are needed to compute J. */
- rdg_flag_vertex_and_dependent (rdg, j, partition, loops,
- processed, &foo);
-
- /* When J is a read, we want to coalesce in the same
- PARTITION all the nodes that are using J: this is
- needed for better cache locality. */
- rdg_flag_all_uses (rdg, j, partition, loops, processed, &foo);
-
- /* Remove from OTHER_STORES the vertex that we flagged. */
- if (RDG_MEM_WRITE_STMT (rdg, j))
- for (k = 0; VEC_iterate (int, *other_stores, k, kk); k++)
- if (kk == j)
- {
- VEC_unordered_remove (int, *other_stores, k);
- break;
- }
- }
-
- /* If the node I has two uses, then keep these together in the
- same PARTITION. */
- for (n = 0, e = rdg->vertices[i].succ; e; e = e->succ_next, n++);
-
- if (n > 1)
- rdg_flag_all_uses (rdg, i, partition, loops, processed, &foo);
- }
-}
-
/* Returns a bitmap in which all the statements needed for computing
the strongly connected component C of the RDG are flagged, also
including the loop exit conditions. */
static bitmap
build_rdg_partition_for_component (struct graph *rdg, rdgc c,
- bool *part_has_writes,
- VEC (int, heap) **other_stores)
+ bool *part_has_writes)
{
int i, v;
bitmap partition = BITMAP_ALLOC (NULL);
@@ -793,13 +689,6 @@ build_rdg_partition_for_component (struct graph *rdg, rdgc c,
rdg_flag_vertex_and_dependent (rdg, v, partition, loops, processed,
part_has_writes);
- /* Also iterate on the array of stores not in the starting vertices,
- and determine those vertices that have some memory affinity with
- the current nodes in the component: these are stores to the same
- arrays, i.e. we're taking care of cache locality. */
- rdg_flag_similar_memory_accesses (rdg, partition, loops, processed,
- other_stores);
-
rdg_flag_loop_exits (rdg, loops, partition, processed, part_has_writes);
BITMAP_FREE (processed);
@@ -863,6 +752,79 @@ rdg_build_components (struct graph *rdg, VEC (int, heap) *starting_vertices,
BITMAP_FREE (saved_components);
}
+/* Returns true when it is possible to generate a builtin pattern for
+ the PARTITION of RDG. For the moment we detect only the memset
+ zero pattern. */
+
+static bool
+can_generate_builtin (struct graph *rdg, bitmap partition)
+{
+ unsigned i;
+ bitmap_iterator bi;
+ int nb_reads = 0;
+ int nb_writes = 0;
+ int stores_zero = 0;
+
+ EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, bi)
+ if (RDG_MEM_READS_STMT (rdg, i))
+ nb_reads++;
+ else if (RDG_MEM_WRITE_STMT (rdg, i))
+ {
+ nb_writes++;
+ if (stmt_with_adjacent_zero_store_dr_p (RDG_STMT (rdg, i)))
+ stores_zero++;
+ }
+
+ return stores_zero == 1 && nb_writes == 1 && nb_reads == 0;
+}
+
+/* Returns true when PARTITION1 and PARTITION2 have similar memory
+ accesses in RDG. */
+
+static bool
+similar_memory_accesses (struct graph *rdg, bitmap partition1,
+ bitmap partition2)
+{
+ unsigned i, j;
+ bitmap_iterator bi, bj;
+
+ EXECUTE_IF_SET_IN_BITMAP (partition1, 0, i, bi)
+ if (RDG_MEM_WRITE_STMT (rdg, i)
+ || RDG_MEM_READS_STMT (rdg, i))
+ EXECUTE_IF_SET_IN_BITMAP (partition2, 0, j, bj)
+ if (RDG_MEM_WRITE_STMT (rdg, j)
+ || RDG_MEM_READS_STMT (rdg, j))
+ if (rdg_has_similar_memory_accesses (rdg, i, j))
+ return true;
+
+ return false;
+}
+
+/* Fuse all the partitions from PARTITIONS that contain similar memory
+ references, i.e., we're taking care of cache locality. This
+ function does not fuse those partitions that contain patterns that
+ can be code generated with builtins. */
+
+static void
+fuse_partitions_with_similar_memory_accesses (struct graph *rdg,
+ VEC (bitmap, heap) **partitions)
+{
+ int p1, p2;
+ bitmap partition1, partition2;
+
+ for (p1 = 0; VEC_iterate (bitmap, *partitions, p1, partition1); p1++)
+ if (!can_generate_builtin (rdg, partition1))
+ for (p2 = 0; VEC_iterate (bitmap, *partitions, p2, partition2); p2++)
+ if (p1 != p2
+ && !can_generate_builtin (rdg, partition2)
+ && similar_memory_accesses (rdg, partition1, partition2))
+ {
+ bitmap_ior_into (partition1, partition2);
+ VEC_ordered_remove (bitmap, *partitions, p2);
+ p2--;
+ }
+}
+
/* Aggregate several components into a useful partition that is
registered in the PARTITIONS vector. Partitions will be
distributed in different loops. */
@@ -885,8 +847,7 @@ rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components,
if (bitmap_bit_p (processed, v))
continue;
- np = build_rdg_partition_for_component (rdg, x, &part_has_writes,
- other_stores);
+ np = build_rdg_partition_for_component (rdg, x, &part_has_writes);
bitmap_ior_into (partition, np);
bitmap_ior_into (processed, np);
BITMAP_FREE (np);
@@ -932,6 +893,8 @@ rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components,
VEC_safe_push (bitmap, heap, *partitions, partition);
else
BITMAP_FREE (partition);
+
+ fuse_partitions_with_similar_memory_accesses (rdg, partitions);
}
/* Dump to FILE the PARTITIONS. */
--
1.7.1