This is the mail archive of the libstdc++@gcc.gnu.org mailing list for the libstdc++ project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[Patch] Tweak for performance __gslice_to_index


Hi,

Finally I decided to refine this improvement. The implementation had various inefficiencies, particularly evident when multiple dimensions are used: in particular, a new "inner_product" was recomputed from scratch for every __j, and the final loop over __k2 didn't terminate as soon as __t[__k2] < __l[__k2]. As an example, on a 2.4 GHz Core 2 Duo I'm seeing this kind of improvement for the snippet I mean to add to the performance testsuite:

base
====

real    0m23.649s
user    0m19.393s
sys     0m4.220s

peak
====

real    0m9.927s
user    0m5.748s
sys     0m4.180s

Tested on x86-64-linux; will wait until tomorrow, Italian time.

Paolo.

PS: As part of the patch I'm also removing the __builtin_alloca call, which doesn't seem necessary performance-wise in any of my tests (another tiny bit of libstdc++/28277, if you want).

/////////////////
2006-12-10  Paolo Carlini  <pcarlini@suse.de>

	* src/valarray-inst.cc (__gslice_to_index): Optimize performance.
	* testsuite/performance/26_numerics/valarray_gslice_to_index.cc: New.
Index: src/valarray-inst.cc
===================================================================
--- src/valarray-inst.cc	(revision 119545)
+++ src/valarray-inst.cc	(working copy)
@@ -1,6 +1,7 @@
 // Explicit instantiation file.
 
-// Copyright (C) 2001, 2004, 2005 Free Software Foundation, Inc.
+// Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006
+// Free Software Foundation, Inc.
 //
 // This file is part of the GNU ISO C++ Library.  This library is free
 // software; you can redistribute it and/or modify it under the
@@ -69,40 +70,34 @@
                     const valarray<size_t>& __s, valarray<size_t>& __i)
   {
     // There are as much as dimensions as there are strides.
-    size_t __n = __l.size();
+    const size_t __n = __l.size();
 
-    // Get a buffer to hold current multi-index as we go through
-    // the gslice for the purpose of computing its linear-image.
-    size_t* const __t = static_cast<size_t*>
-      (__builtin_alloca(__n * sizeof (size_t)));
-    __valarray_fill(__t, __n, size_t(0));
+    // Holds current multi-index as we go through the gslice for the
+    // purpose of computing its linear-image.
+    valarray<size_t> __t(__l);
 
     // Note that this should match the product of all numbers appearing
     // in __l which describes the multidimensional sizes of the
-    // the generalized slice.
+    // generalized slice.
     const size_t __z = __i.size();
-    
+
     for (size_t __j = 0; __j < __z; ++__j)
       {
-        // Compute the linear-index image of (t_0, ... t_{n-1}).
-        // Normaly, we should use inner_product<>(), but we do it the
-        // the hard way here to avoid link-time can of worms.
-        size_t __a = __o;
-        for (size_t __k = 0; __k < __n; ++__k)
-          __a += __s[__k] * __t[__k];
+	// Compute the linear-index image of (t_0, ... t_{n-1}).
+	__i[__j] = __o;
 
-        __i[__j] = __a;
+	--__t[__n - 1];
+	__o += __s[__n - 1];
 
         // Process the next multi-index.  The loop ought to be
-        // backward since we're making a lexicagraphical visit.
-        ++__t[__n - 1];
-        for (size_t __k2 = __n - 1; __k2; --__k2)
+        // backward since we're making a lexicographical visit.
+        for (size_t __k2 = __n - 1; __k2 && !__t[__k2]; --__k2)
           {
-            if (__t[__k2] >= __l[__k2])
-              {
-                __t[__k2] = 0;
-                ++__t[__k2 - 1];
-              }
+	    __o -= __s[__k2] * __l[__k2];
+	    __t[__k2] = __l[__k2];
+
+	    --__t[__k2 - 1];
+	    __o += __s[__k2 - 1];
           }
       }
   }
Index: testsuite/performance/26_numerics/valarray_gslice_to_index.cc
===================================================================
--- testsuite/performance/26_numerics/valarray_gslice_to_index.cc	(revision 0)
+++ testsuite/performance/26_numerics/valarray_gslice_to_index.cc	(revision 0)
@@ -0,0 +1,57 @@
+// Copyright (C) 2006 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 2, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING.  If not, write to the Free
+// Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
+// USA.
+
+// As a special exception, you may use this file as part of a free software
+// library without restriction.  Specifically, if other files instantiate
+// templates or use macros or inline functions from this file, or you compile
+// this file and link it with other files to produce an executable, this
+// file does not by itself cause the resulting executable to be covered by
+// the GNU General Public License.  This exception does not however
+// invalidate any other reasons why the executable file might be covered by
+// the GNU General Public License.
+
+#include <valarray>
+#include <testsuite_performance.h>
+
+int main()
+{
+  using namespace std;
+  using namespace __gnu_test;
+
+  time_counter time;
+  resource_counter resource;
+
+  valarray<double> va(1000000);
+  
+  for (int i = 0; i < 1000000; ++i)
+    va[i] = i;
+
+  size_t lengthvalues[] = { 10, 10, 10, 10, 10, 10 };
+  size_t stridevalues[] = { 1, 1, 1, 1, 1, 1 };
+
+  valarray<size_t> lengths(lengthvalues, 6);
+  valarray<size_t> stride(stridevalues, 6);
+
+  start_counters(time, resource);
+  for (int j = 0; j < 1000; ++j)
+    va[gslice(0, lengths, stride)];
+  stop_counters(time, resource);
+  report_performance(__FILE__, "", time, resource);
+
+  return 0;
+}

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]