This is the mail archive of the
fortran@gcc.gnu.org
mailing list for the GNU Fortran project.
Re: Polyhedron tests on Intel Darwin8/9
- From: Paul Thomas <paulthomas2 at wanadoo dot fr>
- To: Dominique Dhumieres <dominiq at lps dot ens dot fr>, Fortran List <fortran at gcc dot gnu dot org>
- Date: Wed, 21 Nov 2007 00:03:32 +0100
- Subject: Re: Polyhedron tests on Intel Darwin8/9
- References: <20071120141552.0C95B5BB6C@mailhost.lps.ens.fr>
Dominique,
I have also a optimized vaerion of induct which almost three time faster:
induct_v3 11.72 164228 34.40 5 0.0189
(the dotproducts have been replaced by mul/adds taking into account the
zero values and I moved some invariant blocks.)
Encouraged by what you described, I modified (see patch) the intrinsic
dot_product to give the following:
{
integer(kind=4) D.2133;
integer(kind=4) D.2132;
real(kind=8) tmp2.220;
real(kind=8) z2.219;
real(kind=8) tmp1.218;
real(kind=8) z1.217;
real(kind=8) val.216;
val.216 = 0.0;
z1.217 = 0.0;
z2.219 = 0.0;
D.2132 = offset.126;
D.2133 = ubound.124;
{
integer(kind=4) D.2136;
integer(kind=4) D.2135;
integer(kind=4) S.221;
D.2135 = D.2132 + NON_LVALUE_EXPR <stride.123>;
D.2136 = stride.125;
S.221 = 1;
while (1)
{
if (S.221 > 3) goto L.52;
tmp1.218 = (*rotate_coil.0)[S.221 * D.2136 + D.2135];
tmp2.220 = coil_tmp_vector[S.221 + -1];
if (tmp1.218 == z1.217 || tmp2.220 == z2.219)
{
(void) 0;
}
else
{
val.216 = val.216 + tmp1.218 * tmp2.220;
}
S.221 = S.221 + 1;
}
L.52:;
}
coil_current_vec[0] = val.216;
}
Unfortunately, it takes twice as long to execute....
What did you implement?
Cheers
Paul
Index: gcc/fortran/trans-intrinsic.c
===================================================================
*** gcc/fortran/trans-intrinsic.c (revision 130285)
--- gcc/fortran/trans-intrinsic.c (working copy)
*************** gfc_conv_intrinsic_dot_product (gfc_se *
*** 1955,1961 ****
tree type;
stmtblock_t body;
stmtblock_t block;
! tree tmp;
gfc_loopinfo loop;
gfc_actual_arglist *actual;
gfc_ss *arrayss1, *arrayss2;
--- 1955,1962 ----
tree type;
stmtblock_t body;
stmtblock_t block;
! tree tmp, tmp1, tmp2;
! tree zero1, zero2;
gfc_loopinfo loop;
gfc_actual_arglist *actual;
gfc_ss *arrayss1, *arrayss2;
*************** gfc_conv_intrinsic_dot_product (gfc_se *
*** 1978,1989 ****
--- 1979,2000 ----
arrayexpr1 = actual->expr;
arrayss1 = gfc_walk_expr (arrayexpr1);
gcc_assert (arrayss1 != gfc_ss_terminator);
+ tmp = gfc_typenode_for_spec (&arrayexpr1->ts);
+ zero1 = gfc_create_var (tmp, "z1");
+ tmp1 = gfc_create_var (tmp, "tmp1");
+ gfc_add_modify_expr (&se->pre, zero1,
+ gfc_build_const (tmp, integer_zero_node));
/* Walk argument #2. */
actual = actual->next;
arrayexpr2 = actual->expr;
arrayss2 = gfc_walk_expr (arrayexpr2);
gcc_assert (arrayss2 != gfc_ss_terminator);
+ tmp = gfc_typenode_for_spec (&arrayexpr2->ts);
+ zero2 = gfc_create_var (tmp, "z2");
+ tmp2 = gfc_create_var (tmp, "tmp2");
+ gfc_add_modify_expr (&se->pre, zero2,
+ gfc_build_const (tmp, integer_zero_node));
/* Initialize the scalarizer. */
gfc_init_loopinfo (&loop);
*************** gfc_conv_intrinsic_dot_product (gfc_se *
*** 2008,2037 ****
gfc_conv_expr_val (&arrayse1, arrayexpr1);
if (expr->ts.type == BT_COMPLEX)
arrayse1.expr = build1 (CONJ_EXPR, type, arrayse1.expr);
! gfc_add_block_to_block (&block, &arrayse1.pre);
/* Make the tree expression for array2. */
gfc_init_se (&arrayse2, NULL);
gfc_copy_loopinfo_to_se (&arrayse2, &loop);
arrayse2.ss = arrayss2;
gfc_conv_expr_val (&arrayse2, arrayexpr2);
! gfc_add_block_to_block (&block, &arrayse2.pre);
/* Do the actual product and sum. */
if (expr->ts.type == BT_LOGICAL)
{
tmp = build2 (TRUTH_AND_EXPR, type, arrayse1.expr, arrayse2.expr);
tmp = build2 (TRUTH_OR_EXPR, type, resvar, tmp);
}
else
{
! tmp = build2 (MULT_EXPR, type, arrayse1.expr, arrayse2.expr);
tmp = build2 (PLUS_EXPR, type, resvar, tmp);
}
- gfc_add_modify_expr (&block, resvar, tmp);
/* Finish up the loop block and the loop. */
- tmp = gfc_finish_block (&block);
gfc_add_expr_to_block (&body, tmp);
gfc_trans_scalarizing_loops (&loop, &body);
--- 2019,2060 ----
gfc_conv_expr_val (&arrayse1, arrayexpr1);
if (expr->ts.type == BT_COMPLEX)
arrayse1.expr = build1 (CONJ_EXPR, type, arrayse1.expr);
! gfc_add_block_to_block (&se->pre, &arrayse1.pre);
/* Make the tree expression for array2. */
gfc_init_se (&arrayse2, NULL);
gfc_copy_loopinfo_to_se (&arrayse2, &loop);
arrayse2.ss = arrayss2;
gfc_conv_expr_val (&arrayse2, arrayexpr2);
! gfc_add_block_to_block (&se->pre, &arrayse2.pre);
/* Do the actual product and sum. */
if (expr->ts.type == BT_LOGICAL)
{
tmp = build2 (TRUTH_AND_EXPR, type, arrayse1.expr, arrayse2.expr);
tmp = build2 (TRUTH_OR_EXPR, type, resvar, tmp);
+ gfc_add_modify_expr (&block, resvar, tmp);
+ tmp = gfc_finish_block (&block);
}
else
{
! tree cond, cond1, cond2;
! gfc_add_modify_expr (&body, tmp1, arrayse1.expr);
! gfc_add_modify_expr (&body, tmp2, arrayse2.expr);
! cond1 = fold_build2 (EQ_EXPR, boolean_type_node,
! tmp1, zero1);
! cond2 = fold_build2 (EQ_EXPR, boolean_type_node,
! tmp2, zero2);
! cond = build2 (TRUTH_OR_EXPR, boolean_type_node, cond1, cond2);
!
! tmp = build2 (MULT_EXPR, type, tmp1, tmp2);
tmp = build2 (PLUS_EXPR, type, resvar, tmp);
+ gfc_add_modify_expr (&block, resvar, tmp);
+ tmp = gfc_finish_block (&block);
+ tmp = build3_v (COND_EXPR, cond, build_empty_stmt (), tmp);
}
/* Finish up the loop block and the loop. */
gfc_add_expr_to_block (&body, tmp);
gfc_trans_scalarizing_loops (&loop, &body);