This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

[Bug target/83008] [performance] Is it better to avoid extra instructions in data passing between loops?

From: "sergey.shalnov at intel dot com" <gcc-bugzilla at gcc dot gnu dot org>
To: gcc-bugs at gcc dot gnu dot org
Date: Wed, 10 Jan 2018 08:39:57 +0000
Subject: [Bug target/83008] [performance] Is it better to avoid extra instructions in data passing between loops?
Auto-submitted: auto-generated
References: <bug-83008-4@http.gcc.gnu.org/bugzilla/>

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83008

--- Comment #21 from sergey.shalnov at intel dot com ---
Thanks Richard for your comments. 
Based on our discussion I've produced the patch attached and
run it on SPEC2017intrate/fprate on skylake server (with [-Ofast -flto
-march=skylake-avx512 -mfpmath=sse -funroll-loops]).
Please note, I used your last proposed patch and changed loop trip count
calculation ("ncopies_for_cost * nunits / group_size" is always 1).

I see the following performance changes:
SPEC CPU 2017 intrate
500.perlbench_r -0.7%
525.x264_r +7.2%
Geomean: +0.8%

SPEC CPU 2017 fprate
527.cam4_r -1.1%
538.imagick_r +4.7%
544.nab_r +3.6%
Geomean: +0.6%

I believe that after appropriate cost model tweaks for other targets a gain
could be observed but I haven't checked it carefully.
It provides a good performance gain for the original case and a few other
improvements.

Can you please take a look at the patch and comment (or might propose another
method)?

Sergey

From 41e5094cbdce72d4cc5e04fc3d11c01c3c1adbb7 Mon Sep 17 00:00:00 2001
From: Sergey Shalnov <Sergey.Shalnov@intel.com>
Date: Tue, 9 Jan 2018 14:37:14 +0100
Subject: [PATCH, SLP] SLP_common_algo_changes

---
 gcc/config/i386/x86-tune-costs.h |  4 ++--
 gcc/tree-vect-slp.c              | 41 ++++++++++++++++++++++++++++++----------
 2 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/gcc/config/i386/x86-tune-costs.h
b/gcc/config/i386/x86-tune-costs.h
index 312467d..3e0f904 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -1555,7 +1555,7 @@ struct processor_costs skylake_cost = {
   {4, 4, 4},                           /* cost of loading integer registers
                                           in QImode, HImode and SImode.
                                           Relative to reg-reg move (2).  */
-  {6, 6, 6},                           /* cost of storing integer registers */
+  {6, 6, 4},                           /* cost of storing integer registers. 
*/
   2,                                   /* cost of reg,reg fld/fst */
   {6, 6, 8},                           /* cost of loading fp registers
                                           in SFmode, DFmode and XFmode */
@@ -1570,7 +1570,7 @@ struct processor_costs skylake_cost = {
   {6, 6, 6, 10, 20},                   /* cost of loading SSE registers
                                           in 32,64,128,256 and 512-bit */
   {6, 6, 6, 10, 20},                   /* cost of unaligned loads.  */
-  {8, 8, 8, 8, 16},                    /* cost of storing SSE registers
+  {8, 8, 8, 16, 32},                   /* cost of storing SSE registers
                                           in 32,64,128,256 and 512-bit */
   {8, 8, 8, 8, 16},                    /* cost of unaligned stores.  */
   2, 2,                                        /* SSE->integer and
integer->SSE moves */
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 0ca42b4..7e63a1c 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -1815,18 +1815,39 @@ vect_analyze_slp_cost_1 (slp_instance instance,
slp_tree node,
       enum vect_def_type dt;
       if (!op || op == lhs)
        continue;
-      if (vect_is_simple_use (op, stmt_info->vinfo, &def_stmt, &dt))
+      if (vect_is_simple_use (op, stmt_info->vinfo, &def_stmt, &dt)
+        && (dt == vect_constant_def || dt == vect_external_def))
        {
          /* Without looking at the actual initializer a vector of
             constants can be implemented as load from the constant pool.
-            ???  We need to pass down stmt_info for a vector type
-            even if it points to the wrong stmt.  */
-         if (dt == vect_constant_def)
-           record_stmt_cost (prologue_cost_vec, 1, vector_load,
-                             stmt_info, 0, vect_prologue);
-         else if (dt == vect_external_def)
-           record_stmt_cost (prologue_cost_vec, 1, vec_construct,
-                             stmt_info, 0, vect_prologue);
+            When all elements are the same we can use a splat.  */
+         unsigned group_size = SLP_TREE_SCALAR_STMTS (node).length ();
+         tree elt = NULL_TREE;
+         unsigned nelt = 0;
+         for (unsigned j = 0; j < ncopies_for_cost; ++j)
+           for (unsigned k = 0; k < group_size; ++k)
+             {
+               if (nelt == 0)
+                 elt = gimple_op (SLP_TREE_SCALAR_STMTS (node)[nelt], i);
+               /* ???  We're just tracking whether all operands of a single
+                  vector initializer are the same, ideally we'd check if
+                  we emitted the same one already.  */
+               else if (elt != gimple_op (SLP_TREE_SCALAR_STMTS (node)[nelt],
+                                          i))
+                 elt = NULL_TREE;
+               nelt++;
+               if (nelt == group_size)
+                 {
+                   /* ???  We need to pass down stmt_info for a vector type
+                      even if it points to the wrong stmt.  */
+                   record_stmt_cost (prologue_cost_vec, 1,
+                                     dt == vect_external_def
+                                     ? (elt ? scalar_to_vec : vec_construct)
+                                     : vector_load,
+                                     stmt_info, 0, vect_prologue);
+                   nelt = 0;
+                 }
+             }
        }
     }

@@ -2820,7 +2841,7 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
      the cost estimate is otherwise quite pessimistic (constant uses are
      free on the scalar side but cost a load on the vector side for
      example).  */
-  if (vec_outside_cost + vec_inside_cost > scalar_cost)
+  if (vec_outside_cost + vec_inside_cost >= scalar_cost)
     return false;

   return true;
--
1.8.3.1

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]