This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[graphite] Loop distribution: fix perf regression on GemsFDTD
- From: "Sebastian Pop" <sebpop at gmail dot com>
- To: "GCC Patches" <gcc-patches at gcc dot gnu dot org>
- Date: Sun, 27 Jan 2008 12:20:16 -0600
- Subject: [graphite] Loop distribution: fix perf regression on GemsFDTD
Hi,
I committed the attached patch to the graphite-branch. This fixes a
regression performance for GemsFDTD, one of the Cpu2006 benchmark by
aggregating the reads to the same array in the same partition. Before
this patch, ldist would divide one of the critical loops into 12, one
loop per write to a different array.
do ih=1,no_Huy_applies ! no_Huy_applies is equal to 1 or 2
coeff_X_Hy(ih) = dxinv*dtdmuEpol(3,ih)
coeff_X_Hz(ih) = dxinv*dtdmuEpol(2,ih)
coeff_Y_Hx(ih) = dyinv*dtdmuEpol(3,ih)
coeff_Y_Hz(ih) = dyinv*dtdmuEpol(1,ih)
coeff_Z_Hx(ih) = dzinv*dtdmuEpol(2,ih)
coeff_Z_Hy(ih) = dzinv*dtdmuEpol(1,ih)
coeff_X_Ey(ih) = dxinv*dtdepsHpol(3,ih)
coeff_X_Ez(ih) = dxinv*dtdepsHpol(2,ih)
coeff_Y_Ex(ih) = dyinv*dtdepsHpol(3,ih)
coeff_Y_Ez(ih) = dyinv*dtdepsHpol(1,ih)
coeff_Z_Ex(ih) = dzinv*dtdepsHpol(2,ih)
coeff_Z_Ey(ih) = dzinv*dtdepsHpol(1,ih)
end do
With the attached patch, this loop gets splitted into 2.
The attached patch improves the code generation for the simple loops
that initialize a memory to zero by generating a memset zero, avoiding
the duplication of the loop control. This is a quite common pattern:
on the whole Cpu2006 suite, it appears 216 times.
# grep "distributed" code/spec/result/CPU2006.016.log | wc -l
362
# grep "generated memset zero" code/spec/result/CPU2006.016.log | wc -l
216
--
Sebastian
AMD - GNU Tools
* tree-loop-distribution.c (generate_memset_zero,
generate_builtin, generate_code_for_partition, rdg_flag_all_uses): New.
(rdg_flag_uses): Gather in the same partition the statements defining
the VUSES of the current statement.
(rdg_flag_similar_stores): Renamed rdg_flag_similar_memory_accesses.
Gather in the same partition not only the stores to the same memory
access, but also the reads.
(ldist_generate_loops): Renamed ldist_gen.
Index: tree-loop-distribution.c
===================================================================
--- tree-loop-distribution.c (revision 131611)
+++ tree-loop-distribution.c (working copy)
@@ -1,24 +1,23 @@
-/* Loop Distribution
- Copyright (C) 2006, 2007 Free Software Foundation, Inc.
+/* Loop distribution.
+ Copyright (C) 2006, 2007, 2008 Free Software Foundation, Inc.
Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
and Sebastian Pop <sebastian.pop@amd.com>.
This file is part of GCC.
-
-GCC is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free
-Software Foundation; either version 2, or (at your option) any later
-version.
-
-GCC is distributed in the hope that it will be useful, but WITHOUT ANY
-WARRANTY; without even the implied warranty of MERCHANTABILITY or
+
+GCC is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
-
+
You should have received a copy of the GNU General Public License
-along with GCC; see the file COPYING. If not, write to the Free
-Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
-02110-1301, USA. */
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
/* This pass performs loop distribution: for example, the loop
@@ -171,7 +170,7 @@ create_bb_after_loop (struct loop *loop)
static bool
generate_loops_for_partition (struct loop *loop, bitmap partition, bool copy_p)
{
- unsigned i, x = 0;
+ unsigned i, x;
block_stmt_iterator bsi;
basic_block *bbs;
@@ -190,7 +189,7 @@ generate_loops_for_partition (struct loo
stmts_from_loop. */
bbs = get_loop_body_in_dom_order (loop);
- for (i = 0; i < loop->num_nodes; i++)
+ for (x = 0, i = 0; i < loop->num_nodes; i++)
{
basic_block bb = bbs[i];
tree phi, prev = NULL_TREE, next;
@@ -222,6 +221,197 @@ generate_loops_for_partition (struct loo
return true;
}
+/* Generate a call to memset. Return true when the operation succeeded. */
+
+static bool
+generate_memset_zero (tree stmt, tree op0, tree nb_iter,
+ block_stmt_iterator bsi)
+{
+ tree s, t, stmts, nb_bytes, addr_base;
+ bool res = false;
+ tree stmt_list = NULL_TREE;
+ tree args [3];
+ tree fn_call, mem, fndecl, fntype, fn;
+ tree_stmt_iterator i;
+ ssa_op_iter iter;
+ struct data_reference *dr = XCNEW (struct data_reference);
+
+ nb_bytes = fold_build2 (MULT_EXPR, TREE_TYPE (nb_iter),
+ nb_iter, TYPE_SIZE_UNIT (TREE_TYPE (op0)));
+ nb_bytes = force_gimple_operand (nb_bytes, &stmts, true, NULL);
+ append_to_statement_list_force (stmts, &stmt_list);
+
+ DR_STMT (dr) = stmt;
+ DR_REF (dr) = op0;
+ dr_analyze_innermost (dr);
+
+ /* Test for a positive stride, iterating over every element. */
+ if (integer_zerop (fold_build2 (MINUS_EXPR, integer_type_node, DR_STEP (dr),
+ TYPE_SIZE_UNIT (TREE_TYPE (op0)))))
+ addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_BASE_ADDRESS (dr)),
+ DR_BASE_ADDRESS (dr),
+ size_binop (PLUS_EXPR,
+ DR_OFFSET (dr), DR_INIT (dr)));
+
+ /* Test for a negative stride, iterating over every element. */
+ else if (integer_zerop (fold_build2 (PLUS_EXPR, integer_type_node,
+ TYPE_SIZE_UNIT (TREE_TYPE (op0)),
+ DR_STEP (dr))))
+ {
+ addr_base = size_binop (PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr));
+ addr_base = fold_build2 (MINUS_EXPR, sizetype, addr_base, nb_bytes);
+ addr_base = force_gimple_operand (addr_base, &stmts, true, NULL);
+ append_to_statement_list_force (stmts, &stmt_list);
+
+ addr_base = fold_build2 (POINTER_PLUS_EXPR,
+ TREE_TYPE (DR_BASE_ADDRESS (dr)),
+ DR_BASE_ADDRESS (dr), addr_base);
+ }
+ else
+ goto end;
+
+ mem = force_gimple_operand (addr_base, &stmts, true, NULL);
+ append_to_statement_list_force (stmts, &stmt_list);
+
+
+ fndecl = implicit_built_in_decls [BUILT_IN_MEMSET];
+ fntype = TREE_TYPE (fndecl);
+ fn = build1 (ADDR_EXPR, build_pointer_type (fntype), fndecl);
+
+ args[0] = mem;
+ args[1] = integer_zero_node;
+ args[2] = nb_bytes;
+
+ fn_call = build_call_array (fntype, fn, 3, args);
+ append_to_statement_list_force (fn_call, &stmt_list);
+
+ for (i = tsi_start (stmt_list); !tsi_end_p (i); tsi_next (&i))
+ {
+ s = tsi_stmt (i);
+ update_stmt_if_modified (s);
+
+ FOR_EACH_SSA_TREE_OPERAND (t, s, iter, SSA_OP_VIRTUAL_DEFS)
+ {
+ if (TREE_CODE (t) == SSA_NAME)
+ t = SSA_NAME_VAR (t);
+ mark_sym_for_renaming (t);
+ }
+ }
+
+ /* Mark also the uses of the VDEFS of STMT to be renamed. */
+ FOR_EACH_SSA_TREE_OPERAND (t, stmt, iter, SSA_OP_VIRTUAL_DEFS)
+ {
+ if (TREE_CODE (t) == SSA_NAME)
+ {
+ imm_use_iterator imm_iter;
+
+ FOR_EACH_IMM_USE_STMT (s, imm_iter, t)
+ update_stmt (s);
+
+ t = SSA_NAME_VAR (t);
+ }
+ mark_sym_for_renaming (t);
+ }
+
+ bsi_insert_after (&bsi, stmt_list, BSI_CONTINUE_LINKING);
+ res = true;
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "generated memset zero\n");
+
+ if (0)
+ fprintf (stdout, "generated memset zero\n");
+
+ end:
+ free_data_ref (dr);
+ return res;
+}
+
+/* Tries to generate a builtin function for the instructions of LOOP
+ pointed to by the bits set in PARTITION. Returns true when the
+ operation succeeded. */
+
+static bool
+generate_builtin (struct loop *loop, bitmap partition, bool copy_p)
+{
+ bool res = false;
+ unsigned i, x = 0;
+ basic_block *bbs;
+ tree write = NULL_TREE;
+ tree op0, op1;
+ block_stmt_iterator bsi;
+ tree nb_iter = number_of_exit_cond_executions (loop);
+
+ if (!nb_iter || nb_iter == chrec_dont_know)
+ return false;
+
+ bbs = get_loop_body_in_dom_order (loop);
+
+ for (i = 0; i < loop->num_nodes; i++)
+ {
+ basic_block bb = bbs[i];
+ tree phi;
+
+ for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
+ x++;
+
+ for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
+ {
+ tree stmt = bsi_stmt (bsi);
+
+ if (bitmap_bit_p (partition, x++)
+ && TREE_CODE (stmt) == GIMPLE_MODIFY_STMT
+ && !is_gimple_reg (GIMPLE_STMT_OPERAND (stmt, 0)))
+ {
+ /* Don't generate the builtins when there are more than
+ one memory write. */
+ if (write != NULL)
+ goto end;
+
+ write = stmt;
+ }
+ }
+ }
+
+ if (!write)
+ goto end;
+
+ op0 = GIMPLE_STMT_OPERAND (write, 0);
+ op1 = GIMPLE_STMT_OPERAND (write, 1);
+
+ if (!(TREE_CODE (op0) == ARRAY_REF
+ || TREE_CODE (op0) == INDIRECT_REF))
+ goto end;
+
+ /* The new statements will be placed before LOOP. */
+ bsi = bsi_last (loop_preheader_edge (loop)->src);
+
+ if (integer_zerop (op1) || real_zerop (op1))
+ res = generate_memset_zero (write, op0, nb_iter, bsi);
+
+ /* If this is the last partition for which we generate code, we have
+ to destroy the loop. */
+ if (res && !copy_p)
+ cancel_loop_tree (loop);
+
+ end:
+ free (bbs);
+ return res;
+}
+
+/* Generates code for PARTITION. For simple loops, this function can
+ generate a built-in. */
+
+static bool
+generate_code_for_partition (struct loop *loop, bitmap partition, bool copy_p)
+{
+ if (generate_builtin (loop, partition, copy_p))
+ return true;
+
+ return generate_loops_for_partition (loop, partition, copy_p);
+}
+
+
/* Returns true if the node V of RDG cannot be recomputed. */
static bool
@@ -335,9 +525,30 @@ static void rdg_flag_vertex_and_dependen
/* Flag all the uses of U. */
static void
+rdg_flag_all_uses (struct graph *rdg, int u, bitmap partition, bitmap loops,
+ bitmap processed, bool *part_has_writes)
+{
+ struct graph_edge *e;
+
+ for (e = rdg->vertices[u].succ; e; e = e->succ_next)
+ if (!bitmap_bit_p (processed, e->dest))
+ {
+ rdg_flag_vertex_and_dependent (rdg, e->dest, partition, loops,
+ processed, part_has_writes);
+ rdg_flag_all_uses (rdg, e->dest, partition, loops, processed,
+ part_has_writes);
+ }
+}
+
+/* Flag the uses of U stopping following the information from
+ upstream_mem_writes. */
+
+static void
rdg_flag_uses (struct graph *rdg, int u, bitmap partition, bitmap loops,
bitmap processed, bool *part_has_writes)
{
+ ssa_op_iter iter;
+ use_operand_p use_p;
struct vertex *x = &(rdg->vertices[u]);
tree stmt = RDGV_STMT (x);
struct graph_edge *anti_dep = has_anti_dependence (x);
@@ -354,6 +565,24 @@ rdg_flag_uses (struct graph *rdg, int u,
processed, part_has_writes);
}
+ if (TREE_CODE (stmt) != PHI_NODE)
+ {
+ FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VIRTUAL_USES)
+ {
+ tree use = USE_FROM_PTR (use_p);
+
+ if (TREE_CODE (use) == SSA_NAME)
+ {
+ tree def_stmt = SSA_NAME_DEF_STMT (use);
+ int v = rdg_vertex_for_stmt (rdg, def_stmt);
+
+ if (v >= 0)
+ rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
+ processed, part_has_writes);
+ }
+ }
+ }
+
if (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT
&& has_upstream_mem_writes (u))
{
@@ -494,32 +723,58 @@ typedef struct rdg_component
DEF_VEC_P (rdgc);
DEF_VEC_ALLOC_P (rdgc, heap);
-/* Remove from OTHER_STORES those vertices that are storing to the
- same arrays as the stores from C. */
+/* Flag all the nodes of RDG containing memory accesses that could
+ potentially belong arrays already accessed in the current
+ PARTITION. */
static void
-rdg_flag_similar_stores (struct graph *rdg, rdgc c, bitmap partition, bitmap loops,
- bitmap processed, VEC (int, heap) **other_stores)
-{
- int i, j, x, v;
-
- if (VEC_length (int, *other_stores) == 0)
- return;
+rdg_flag_similar_memory_accesses (struct graph *rdg, bitmap partition,
+ bitmap loops, bitmap processed,
+ VEC (int, heap) **other_stores)
+{
+ bool foo;
+ unsigned i, n;
+ int j, k, kk;
+ bitmap_iterator ii;
+ struct graph_edge *e;
- for (i = 0; VEC_iterate (int, c->vertices, i, v); i++)
- if (RDG_MEM_WRITE_STMT (rdg, v))
- for (j = 0; VEC_iterate (int, *other_stores, j, x); )
- {
- if (rdg_has_similar_memory_accesses (rdg, v, x))
+ EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, ii)
+ if (RDG_MEM_WRITE_STMT (rdg, i)
+ || RDG_MEM_READS_STMT (rdg, i))
+ {
+ for (j = 0; j < rdg->n_vertices; j++)
+ if (!bitmap_bit_p (processed, j)
+ && (RDG_MEM_WRITE_STMT (rdg, j)
+ || RDG_MEM_READS_STMT (rdg, j))
+ && rdg_has_similar_memory_accesses (rdg, i, j))
{
- bool foo;
- rdg_flag_vertex_and_dependent (rdg, x, partition, loops,
+ /* Flag first the node J itself, and all the nodes that
+ are needed to compute J. */
+ rdg_flag_vertex_and_dependent (rdg, j, partition, loops,
processed, &foo);
- VEC_unordered_remove (int, *other_stores, j);
+
+ /* When J is a read, we want to coalesce in the same
+ PARTITION all the nodes that are using J: this is
+ needed for better cache locality. */
+ rdg_flag_all_uses (rdg, j, partition, loops, processed, &foo);
+
+ /* Remove from OTHER_STORES the vertex that we flagged. */
+ if (RDG_MEM_WRITE_STMT (rdg, j))
+ for (k = 0; VEC_iterate (int, *other_stores, k, kk); k++)
+ if (kk == j)
+ {
+ VEC_unordered_remove (int, *other_stores, k);
+ break;
+ }
}
- else
- j++;
- }
+
+ /* If the node I has two uses, then keep these together in the
+ same PARTITION. */
+ for (n = 0, e = rdg->vertices[i].succ; e; e = e->succ_next, n++);
+
+ if (n > 1)
+ rdg_flag_all_uses (rdg, i, partition, loops, processed, &foo);
+ }
}
/* Returns a bitmap in which all the statements needed for computing
@@ -545,9 +800,8 @@ build_rdg_partition_for_component (struc
and determine those vertices that have some memory affinity with
the current nodes in the component: these are stores to the same
arrays, i.e. we're taking care of cache locality. */
- if (part_has_writes)
- rdg_flag_similar_stores (rdg, c, partition, loops, processed,
- other_stores);
+ rdg_flag_similar_memory_accesses (rdg, partition, loops, processed,
+ other_stores);
rdg_flag_loop_exits (rdg, loops, partition, processed, part_has_writes);
@@ -707,13 +961,12 @@ debug_rdg_partitions (VEC (bitmap, heap)
dump_rdg_partitions (stderr, partitions);
}
-/* Split the statements contained in LOOP in several loops following
- the STARTING_VERTICES from RDG. Returns the number of distributed
- loops. */
+/* Generate code from STARTING_VERTICES in RDG. Returns the number of
+ distributed loops. */
static int
-ldist_generate_loops (struct loop *loop, struct graph *rdg,
- VEC (int, heap) *starting_vertices)
+ldist_gen (struct loop *loop, struct graph *rdg,
+ VEC (int, heap) *starting_vertices)
{
int i, nbp;
VEC (rdgc, heap) *components = VEC_alloc (rdgc, heap, 3);
@@ -762,7 +1015,7 @@ ldist_generate_loops (struct loop *loop,
dump_rdg_partitions (dump_file, partitions);
for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
- if (!generate_loops_for_partition (loop, partition, i < nbp - 1))
+ if (!generate_code_for_partition (loop, partition, i < nbp - 1))
goto ldist_done;
rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
@@ -838,7 +1091,7 @@ distribute_loop (struct loop *loop, VEC
}
}
- res = ldist_generate_loops (loop, rdg, vertices);
+ res = ldist_gen (loop, rdg, vertices);
VEC_free (int, heap, vertices);
free_rdg (rdg);