This is the mail archive of the fortran@gcc.gnu.org mailing list for the GNU Fortran project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: Polyhedron tests on Intel Darwin8/9


Dominique,

I have also a optimized vaerion of induct which almost three time faster:


induct_v3 11.72 164228 34.40 5 0.0189

(the dotproducts have been replaced by mul/adds taking into account the zero values and I moved some invariant blocks.)

Encouraged by what you described, I modified (see patch) the intrinsic dot_product to give the following:

             {
               integer(kind=4) D.2133;
               integer(kind=4) D.2132;
               real(kind=8) tmp2.220;
               real(kind=8) z2.219;
               real(kind=8) tmp1.218;
               real(kind=8) z1.217;
               real(kind=8) val.216;

               val.216 = 0.0;
               z1.217 = 0.0;
               z2.219 = 0.0;
               D.2132 = offset.126;
               D.2133 = ubound.124;
               {
                 integer(kind=4) D.2136;
                 integer(kind=4) D.2135;
                 integer(kind=4) S.221;

                 D.2135 = D.2132 + NON_LVALUE_EXPR <stride.123>;
                 D.2136 = stride.125;
                 S.221 = 1;
                 while (1)
                   {
                     if (S.221 > 3) goto L.52;
                     tmp1.218 = (*rotate_coil.0)[S.221 * D.2136 + D.2135];
                     tmp2.220 = coil_tmp_vector[S.221 + -1];
                     if (tmp1.218 == z1.217 || tmp2.220 == z2.219)
                       {
                         (void) 0;
                       }
                     else
                       {
                         val.216 = val.216 + tmp1.218 * tmp2.220;
                       }
                     S.221 = S.221 + 1;
                   }
                 L.52:;
               }
               coil_current_vec[0] = val.216;
             }

Unfortunately, it takes twice as long to execute....

What did you implement?

Cheers

Paul

Index: gcc/fortran/trans-intrinsic.c
===================================================================
*** gcc/fortran/trans-intrinsic.c	(revision 130285)
--- gcc/fortran/trans-intrinsic.c	(working copy)
*************** gfc_conv_intrinsic_dot_product (gfc_se *
*** 1955,1961 ****
    tree type;
    stmtblock_t body;
    stmtblock_t block;
!   tree tmp;
    gfc_loopinfo loop;
    gfc_actual_arglist *actual;
    gfc_ss *arrayss1, *arrayss2;
--- 1955,1962 ----
    tree type;
    stmtblock_t body;
    stmtblock_t block;
!   tree tmp, tmp1, tmp2;
!   tree zero1, zero2;
    gfc_loopinfo loop;
    gfc_actual_arglist *actual;
    gfc_ss *arrayss1, *arrayss2;
*************** gfc_conv_intrinsic_dot_product (gfc_se *
*** 1978,1989 ****
--- 1979,2000 ----
    arrayexpr1 = actual->expr;
    arrayss1 = gfc_walk_expr (arrayexpr1);
    gcc_assert (arrayss1 != gfc_ss_terminator);
+   tmp = gfc_typenode_for_spec (&arrayexpr1->ts);
+   zero1 = gfc_create_var (tmp, "z1");
+   tmp1 = gfc_create_var (tmp, "tmp1");
+   gfc_add_modify_expr (&se->pre, zero1, 
+ 		       gfc_build_const (tmp, integer_zero_node));
  
    /* Walk argument #2.  */
    actual = actual->next;
    arrayexpr2 = actual->expr;
    arrayss2 = gfc_walk_expr (arrayexpr2);
    gcc_assert (arrayss2 != gfc_ss_terminator);
+   tmp = gfc_typenode_for_spec (&arrayexpr2->ts);
+   zero2 = gfc_create_var (tmp, "z2");
+   tmp2 = gfc_create_var (tmp, "tmp2");
+   gfc_add_modify_expr (&se->pre, zero2, 
+ 		       gfc_build_const (tmp, integer_zero_node));
  
    /* Initialize the scalarizer.  */
    gfc_init_loopinfo (&loop);
*************** gfc_conv_intrinsic_dot_product (gfc_se *
*** 2008,2037 ****
    gfc_conv_expr_val (&arrayse1, arrayexpr1);
    if (expr->ts.type == BT_COMPLEX)
      arrayse1.expr = build1 (CONJ_EXPR, type, arrayse1.expr);
!   gfc_add_block_to_block (&block, &arrayse1.pre);
  
    /* Make the tree expression for array2.  */
    gfc_init_se (&arrayse2, NULL);
    gfc_copy_loopinfo_to_se (&arrayse2, &loop);
    arrayse2.ss = arrayss2;
    gfc_conv_expr_val (&arrayse2, arrayexpr2);
!   gfc_add_block_to_block (&block, &arrayse2.pre);
  
    /* Do the actual product and sum.  */
    if (expr->ts.type == BT_LOGICAL)
      {
        tmp = build2 (TRUTH_AND_EXPR, type, arrayse1.expr, arrayse2.expr);
        tmp = build2 (TRUTH_OR_EXPR, type, resvar, tmp);
      }
    else
      {
!       tmp = build2 (MULT_EXPR, type, arrayse1.expr, arrayse2.expr);
        tmp = build2 (PLUS_EXPR, type, resvar, tmp);
      }
-   gfc_add_modify_expr (&block, resvar, tmp);
  
    /* Finish up the loop block and the loop.  */
-   tmp = gfc_finish_block (&block);
    gfc_add_expr_to_block (&body, tmp);
  
    gfc_trans_scalarizing_loops (&loop, &body);
--- 2019,2060 ----
    gfc_conv_expr_val (&arrayse1, arrayexpr1);
    if (expr->ts.type == BT_COMPLEX)
      arrayse1.expr = build1 (CONJ_EXPR, type, arrayse1.expr);
!   gfc_add_block_to_block (&se->pre, &arrayse1.pre);
  
    /* Make the tree expression for array2.  */
    gfc_init_se (&arrayse2, NULL);
    gfc_copy_loopinfo_to_se (&arrayse2, &loop);
    arrayse2.ss = arrayss2;
    gfc_conv_expr_val (&arrayse2, arrayexpr2);
!   gfc_add_block_to_block (&se->pre, &arrayse2.pre);
  
    /* Do the actual product and sum.  */
    if (expr->ts.type == BT_LOGICAL)
      {
        tmp = build2 (TRUTH_AND_EXPR, type, arrayse1.expr, arrayse2.expr);
        tmp = build2 (TRUTH_OR_EXPR, type, resvar, tmp);
+       gfc_add_modify_expr (&block, resvar, tmp);
+       tmp = gfc_finish_block (&block);
      }
    else
      {
!       tree cond, cond1, cond2;
!       gfc_add_modify_expr (&body, tmp1, arrayse1.expr);
!       gfc_add_modify_expr (&body, tmp2, arrayse2.expr);
!       cond1 = fold_build2 (EQ_EXPR, boolean_type_node, 
! 			   tmp1, zero1);
!       cond2 = fold_build2 (EQ_EXPR, boolean_type_node, 
! 			   tmp2, zero2);
!       cond = build2 (TRUTH_OR_EXPR, boolean_type_node, cond1, cond2);
! 
!       tmp = build2 (MULT_EXPR, type, tmp1, tmp2);
        tmp = build2 (PLUS_EXPR, type, resvar, tmp);
+       gfc_add_modify_expr (&block, resvar, tmp);
+       tmp = gfc_finish_block (&block);
+       tmp = build3_v (COND_EXPR, cond, build_empty_stmt (), tmp);
      }
  
    /* Finish up the loop block and the loop.  */
    gfc_add_expr_to_block (&body, tmp);
  
    gfc_trans_scalarizing_loops (&loop, &body);

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]