This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.



[PATCH][libstdc++-v3 parallel mode] Throw out running time measurement


This patch removes the obsolete running-time measurement code (which was
already deactivated).

Tested on x86_64-unknown-linux-gnu: no regressions.

Please approve.


2007-10-25  Johannes Singler  <singler@ira.uka.de>

      * include/parallel/multiway_merge.h: Removed Timing<inactive_tag>.
      * include/parallel/random_shuffle.h: Same.
      * include/parallel/set_operations.h: Same.
      * include/parallel/tree.h: Same.
      * include/parallel/multiway_mergesort.h: Same.
      * include/parallel/timing.h: Removed completely.

Johannes
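
For context, the hooks being deleted follow the tag-dispatch pattern used
throughout the parallel mode headers: Timing<> is specialized on a tag type,
and the specialization selected when timing is deactivated has only empty
inline members, so every tic()/print() call compiles away. A minimal,
self-contained sketch of that idea (simplified names such as SimpleTiming,
not the actual libstdc++ code; compile with -fopenmp):

// Illustrative only, not the removed libstdc++ code: the deactivated
// specialization has empty inline members, so all instrumentation
// compiles to nothing.
#include <cstdio>
#include <omp.h>

struct parallel_tag { };
struct sequential_tag { };

template<typename tag>
class SimpleTiming;                      // only specializations are defined

template<>
class SimpleTiming<parallel_tag>         // active: really takes time stamps
{
  double last;
public:
  void tic()   { last = omp_get_wtime(); }
  void print() { std::printf("thread %d: %f\n", omp_get_thread_num(), last); }
};

template<>
class SimpleTiming<sequential_tag>       // deactivated: everything is a no-op
{
public:
  void tic()   { }
  void print() { }
};

int main()
{
  SimpleTiming<sequential_tag> t;        // the variant whose uses this patch deletes
  t.tic();                               // optimized away entirely
  t.print();
}

Because the deactivated specialization is empty, deleting its uses, as this
patch does, cannot change behavior.
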
Index: include/parallel/multiway_merge.h
===================================================================
--- include/parallel/multiway_merge.h	(revision 129315)
+++ include/parallel/multiway_merge.h	(working copy)
@@ -52,7 +52,6 @@
 #include <parallel/parallel.h>
 #include <parallel/merge.h>
 #include <parallel/losertree.h>
-#include <parallel/timing.h>
 #if _GLIBCXX_ASSERTIONS
 #include <parallel/checkers.h>
 #endif
@@ -1354,11 +1353,6 @@
 
     thread_index_t num_threads = static_cast<thread_index_t>(std::min(static_cast<difference_type>(get_max_threads()), total_length));
 
-    Timing<sequential_tag>* t = new Timing<sequential_tag>[num_threads];
-
-    for (int pr = 0; pr < num_threads; pr++)
-      t[pr].tic();
-
     bool tight = (total_length == length);
 
     // Thread t will have to merge pieces[iam][0..k - 1]
@@ -1456,15 +1450,10 @@
 	delete[] offsets;
       }
 
-    for (int pr = 0; pr < num_threads; pr++)
-      t[pr].tic();
-
 #	pragma omp parallel num_threads(num_threads)
     {
       thread_index_t iam = omp_get_thread_num();
 
-      t[iam].tic();
-
       difference_type target_position = 0;
 
       for (int c = 0; c < k; c++)
@@ -1498,14 +1487,8 @@
 			(pieces[iam][0].second - pieces[iam][0].first) + (pieces[iam][1].second - pieces[iam][1].first),
 			comp);
 	}
-
-      t[iam].tic();
-
     }
 
-    for (int pr = 0; pr < num_threads; pr++)
-      t[pr].tic();
-
 #if _GLIBCXX_ASSERTIONS
     _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target + length, comp));
 #endif
@@ -1516,12 +1499,6 @@
 
     delete[] pieces;
 
-    for (int pr = 0; pr < num_threads; pr++)
-      t[pr].tic();
-    for (int pr = 0; pr < num_threads; pr++)
-      t[pr].print();
-    delete[] t;
-
     return target + length;
   }
 
Index: include/parallel/random_shuffle.h
===================================================================
--- include/parallel/random_shuffle.h	(revision 129315)
+++ include/parallel/random_shuffle.h	(working copy)
@@ -42,7 +42,6 @@
 #include <bits/stl_numeric.h>
 #include <parallel/parallel.h>
 #include <parallel/random_number.h>
-#include <parallel/timing.h>
 
 namespace __gnu_parallel
 {
@@ -136,9 +135,6 @@
     typedef typename traits_type::value_type value_type;
     typedef typename traits_type::difference_type difference_type;
 
-    Timing<sequential_tag> t;
-    t.tic();
-
     DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[omp_get_thread_num()];
     DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd;
     thread_index_t iam = d->iam;
@@ -170,12 +166,8 @@
     for (bin_index b = 0; b < sd->num_bins + 1; b++)
       sd->dist[b][iam + 1] = dist[b];
 
-    t.tic();
-
 #pragma omp barrier
 
-    t.tic();
-
 #pragma omp single
     {
       // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
@@ -188,8 +180,6 @@
 
 #pragma omp barrier
 
-    t.tic();
-
     sequence_index_t offset = 0, global_offset = 0;
     for (bin_index s = 0; s < d->bins_begin; s++)
       global_offset += sd->dist[s + 1][d->num_threads];
@@ -205,12 +195,8 @@
 
     sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * offset));
 
-    t.tic();
-
 #pragma omp barrier
 
-    t.tic();
-
     // Draw local copies to avoid false sharing.
     for (bin_index b = 0; b < sd->num_bins + 1; b++)
       dist[b] = sd->dist[b][iam];
@@ -237,12 +223,8 @@
     delete[] bin_proc;
     delete[] temporaries;
 
-    t.tic();
-
 #pragma omp barrier
 
-    t.tic();
-
     // Shuffle bins internally.
     for (bin_index b = d->bins_begin; b < d->bins_end; b++)
       {
@@ -253,10 +235,6 @@
       }
 
     delete[] sd->temporaries[iam];
-
-    t.tic();
-
-    t.print();
   }
 
   /** @brief Round up to the next greater power of 2.
@@ -453,9 +431,6 @@
 	for (int b = 0; b < num_bins + 1; b++)
 	  dist0[b] = 0;
 
-	Timing<sequential_tag> t;
-	t.tic();
-
 	random_number bitrng(rng(0xFFFFFFFF));
 
 	for (difference_type i = 0; i < n; i++)
@@ -467,16 +442,12 @@
 	    dist0[oracle + 1]++;
 	  }
 
-	t.tic();
-
 	// Sum up bins.
 	__gnu_sequential::partial_sum(dist0, dist0 + num_bins + 1, dist0);
 
 	for (int b = 0; b < num_bins + 1; b++)
 	  dist1[b] = dist0[b];
 
-	t.tic();
-
 	// Distribute according to oracles.
 	for (difference_type i = 0; i < n; i++)
 	  target[(dist0[oracles[i]])++] = *(begin + i);
@@ -485,9 +456,7 @@
 	  {
 	    sequential_random_shuffle(target + dist1[b], target + dist1[b + 1],
 				      rng);
-	    t.tic();
 	  }
-	t.print();
 
 	delete[] dist0;
 	delete[] dist1;
Index: include/parallel/set_operations.h
===================================================================
--- include/parallel/set_operations.h	(revision 129315)
+++ include/parallel/set_operations.h	(working copy)
@@ -381,10 +381,6 @@
 
 #pragma omp parallel num_threads(num_threads)
     {
-      Timing<sequential_tag> t;
-
-      t.tic();
-
       // Result from multiseq_partition.
       InputIterator offset[2];
       const int iam = omp_get_thread_num();
@@ -407,13 +403,9 @@
 
       iterator_pair block_end = block_begins[ iam + 1 ] = iterator_pair(offset[ 0 ], offset[ 1 ]);
 
-      t.tic();
-
       // Make sure all threads have their block_begin result written out.
 #pragma omp barrier
 
-      t.tic();
-
       iterator_pair block_begin = block_begins[ iam ];
 
       // Begin working for the first block, while the others except
@@ -429,12 +421,9 @@
 				   block_begin.second, block_end.second);
 	}
 
-      t.tic();
-
       // Make sure everyone wrote their lengths.
 #pragma omp barrier
 
-      t.tic();
       OutputIterator r = result;
 
       if (iam == 0)
@@ -458,9 +447,6 @@
 	  op.invoke(block_begin.first, block_end.first,
 		    block_begin.second, block_end.second, r);
 	}
-
-      t.tic();
-      t.print();
     }
     return return_value;
   }
Index: include/parallel/tree.h
===================================================================
--- include/parallel/tree.h	(revision 129315)
+++ include/parallel/tree.h	(working copy)
@@ -57,13 +57,6 @@
 
 #include <parallel/list_partition.h>
 
-//#define _GLIBCXX_TIMING
-#ifdef _GLIBCXX_TIMING
-#define _timing_tag parallel_tag
-#else
-#define _timing_tag sequential_tag
-#endif
-
 namespace std
 {
   // XXX Declaration should go to stl_tree.h.
@@ -1217,10 +1210,6 @@
     void
     _M_bulk_insertion_construction(const _InputIterator __first, const _InputIterator __last, const bool is_construction, StrictlyLessOrLessEqual strictly_less_or_less_equal)
     {
-      Timing<_timing_tag> t;
-
-      t.tic();
-
       thread_index_t num_threads = get_max_threads();
       size_type n;
       size_type beg_partition[num_threads+1];
@@ -1228,8 +1217,6 @@
       beg_partition[0] = 0;
       bool is_sorted= is_sorted_distance_accessors(__first, __last, access, beg_partition,n, num_threads, std::__iterator_category(__first));
 
-      t.tic("is_sorted");
-
       if (not is_sorted)
 	{
 	  _M_not_sorted_bulk_insertion_construction(access, beg_partition, n, num_threads, is_construction, strictly_less_or_less_equal);
@@ -1260,10 +1247,6 @@
 	    _M_sorted_bulk_insertion(access, beg_partition, n, num_threads, 
 				     strictly_less_or_less_equal);
 	}
-
-      t.tic("main work");
-
-      t.print();
     }
 
     /** @brief Bulk construction and insertion helper method on an
@@ -1349,31 +1332,19 @@
     _M_not_sorted_bulk_insertion_construction(size_type* beg_partition, ElementsToSort* v, Comparator comp, const size_type n, thread_index_t num_threads, const bool is_construction, StrictlyLessOrLessEqual strictly_less_or_less_equal)
     {
       // The accessors have been calculated for the non sorted.
-      Timing<_timing_tag> t;
-
-      t.tic();
-
       num_threads = static_cast<thread_index_t>(std::min<size_type>(num_threads, n));
 
       std::stable_sort(v, v+n, comp);
 
-      t.tic("sort");
-
       IteratorSortedElements sorted_access[num_threads+1];
       range_accessors(IteratorSortedElements(v), IteratorSortedElements(v+n), sorted_access, beg_partition, n, num_threads, std::__iterator_category(v));
 
-      t.tic("range_accessors");
-
       // Partial template specialization not available.
       if (is_construction)
 	_M_sorted_bulk_construction(sorted_access, beg_partition, n, num_threads, strictly_less_or_less_equal);
       else
 	_M_sorted_bulk_insertion(sorted_access, beg_partition, n, num_threads, strictly_less_or_less_equal);
       delete v;
-
-      t.tic("actual construction or insertion");
-
-      t.print();
     }
 
     /** @brief Construct a tree sequentially using the parallel routine
@@ -1753,17 +1724,11 @@
     void
     _M_sorted_bulk_construction(_Iterator* access, size_type* beg_partition, const size_type n, thread_index_t num_threads, StrictlyLessOrLessEqual strictly_less_or_less_equal)
     {
-      Timing<_timing_tag> t;
-
       // Dealing with repetitions (EFFICIENCY ISSUE).
       size_type rank_shift[num_threads+1];
 
-      t.tic();
-
       _Rb_tree_node_ptr* r = _M_sorted_bulk_allocation_and_initialization(access, beg_partition, rank_shift, n, num_threads, strictly_less_or_less_equal);
 
-      t.tic("bulk allocation and initialization");
-
       // Link the tree appropriately.
       // Dealing with repetitions (EFFICIENCY ISSUE).
       ranker_gaps rank(beg_partition, rank_shift, num_threads);
@@ -1818,11 +1783,7 @@
       base_type::_M_impl._M_header._M_parent = nodes_init.get_root();
       nodes_init.get_root()->_M_parent= &base_type::_M_impl._M_header;
 
-      t.tic("linking nodes");
       ::operator delete(r);
-
-      t.tic("delete array of pointers");
-      t.print();
     }
 
 
@@ -1850,10 +1811,6 @@
     _M_sorted_bulk_insertion(_Iterator* access, size_type* beg_partition, size_type k, thread_index_t num_threads, StrictlyLessOrLessEqual strictly_less_or_less_equal)
     {
       _GLIBCXX_PARALLEL_ASSERT((size_type)num_threads <= k);
-      Timing<_timing_tag> t;
-
-      t.tic();
-
       // num_thr-1 problems in the upper part of the tree
       // num_thr problems to further parallelize
       std::vector<size_type> existing(num_threads,0);
@@ -1873,7 +1830,6 @@
 	  // 1. Construct the nodes with their corresponding data
 #if _GLIBCXX_TREE_INITIAL_SPLITTING
 	  r = _M_sorted_bulk_allocation_and_initialization(access, beg_partition, rank_shift, k, num_threads, strictly_less_or_less_equal);
-	  t.tic("bulk allocation and initialization");
 #else
 	  r = _M_sorted_no_gapped_bulk_allocation_and_initialization(access, beg_partition, k, num_threads, strictly_less_or_less_equal);
 #endif
@@ -1896,8 +1852,6 @@
       repetitions (EFFICIENCY ISSUE) *****/
       size_type last = beg_partition[num_threads] - (rank_shift[num_threads] - rank_shift[num_threads - 1]);
 
-      t.tic("last element to be inserted");
-
       //2. Split the tree according to access in num_threads parts
       //Initialize upper concat_problems
       //Allocate them dynamically because they are afterwards so erased
@@ -1960,8 +1914,6 @@
       size_type last = k;
 #endif
 
-      t.tic("sorted_no_gapped...");
-
       // 3. Split the range according to tree and create
       // 3. insertion/concatenation problems to be solved in parallel
 #if _GLIBCXX_TREE_DYNAMIC_BALANCING
@@ -2018,8 +1970,6 @@
 	  } while (change);
       }
 
-      t.tic("merging");
-
       // Update root and sizes.
       base_type::_M_root() = root_problem->t;
       root_problem->t->_M_parent = &(base_type::_M_impl._M_header);
@@ -2069,9 +2019,6 @@
 
       // Delete array of pointers
       ::operator delete(r);
-
-      t.tic();
-      t.print();
     }
 
 
Index: include/parallel/multiway_mergesort.h
===================================================================
--- include/parallel/multiway_mergesort.h	(revision 129315)
+++ include/parallel/multiway_mergesort.h	(working copy)
@@ -44,7 +44,6 @@
 #include <bits/stl_algo.h>
 #include <parallel/parallel.h>
 #include <parallel/multiway_merge.h>
-#include <parallel/timing.h>
 
 namespace __gnu_parallel
 {
@@ -160,9 +159,6 @@
     typedef typename traits_type::value_type value_type;
     typedef typename traits_type::difference_type difference_type;
 
-    Timing<sequential_tag> t;
-    t.tic();
-
     PMWMSSortingData<RandomAccessIterator>* sd = d->sd;
     thread_index_t iam = d->iam;
 
@@ -196,7 +192,6 @@
 
     // Invariant: locally sorted subsequence in sd->sorting_places[iam],
     // sd->sorting_places[iam] + length_local.
-    t.tic("local sort");
 
     if (Settings::sort_splitting == Settings::SAMPLING)
       {
@@ -205,8 +200,6 @@
 
 #pragma omp barrier
 
-	t.tic("sample/wait");
-
 #pragma omp single
 	__gnu_sequential::sort(sd->samples, 
 			       sd->samples + (num_samples * d->num_threads), 
@@ -241,8 +234,6 @@
       {
 #pragma omp barrier
 
-	t.tic("wait");
-
 	std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> > seqs(d->num_threads);
 	for (int s = 0; s < d->num_threads; s++)
 	  seqs[s] = std::make_pair(sd->sorting_places[s], sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s]);
@@ -276,8 +267,6 @@
 	  }
       }
 
-    t.tic("split");
-
     // Offset from target begin, length after merging.
     difference_type offset = 0, length_am = 0;
     for (int s = 0; s < d->num_threads; s++)
@@ -308,8 +297,6 @@
 
     multiway_merge(seqs.begin(), seqs.end(), sd->merging_places[iam], comp, length_am, d->stable, false, sequential_tag());
 
-    t.tic("merge");
-
 #if _GLIBCXX_ASSERTIONS
     _GLIBCXX_PARALLEL_ASSERT(is_sorted(sd->merging_places[iam], sd->merging_places[iam] + length_am, comp));
 #endif
@@ -323,10 +310,6 @@
 #endif
 
     delete[] sd->temporaries[iam];
-
-    t.tic("copy back");
-
-    t.print();
   }
 
   /** @brief PMWMS main call.
Index: include/parallel/timing.h
===================================================================
--- include/parallel/timing.h	(revision 129315)
+++ include/parallel/timing.h	(working copy)
@@ -1,217 +0,0 @@
-// -*- C++ -*-
-
-// Copyright (C) 2007 Free Software Foundation, Inc.
-//
-// This file is part of the GNU ISO C++ Library.  This library is free
-// software; you can redistribute it and/or modify it under the terms
-// of the GNU General Public License as published by the Free Software
-// Foundation; either version 2, or (at your option) any later
-// version.
-
-// This library is distributed in the hope that it will be useful, but
-// WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-// General Public License for more details.
-
-// You should have received a copy of the GNU General Public License
-// along with this library; see the file COPYING.  If not, write to
-// the Free Software Foundation, 59 Temple Place - Suite 330, Boston,
-// MA 02111-1307, USA.
-
-// As a special exception, you may use this file as part of a free
-// software library without restriction.  Specifically, if other files
-// instantiate templates or use macros or inline functions from this
-// file, or you compile this file and link it with other files to
-// produce an executable, this file does not by itself cause the
-// resulting executable to be covered by the GNU General Public
-// License.  This exception does not however invalidate any other
-// reasons why the executable file might be covered by the GNU General
-// Public License.
-
-/** @file parallel/timing.h
- *  @brief Provides a simple tool to do performance debugging, also in
- *  parallel code.
- *  This file is a GNU parallel extension to the Standard C++ Library.
- */
-
-// Written by Johannes Singler.
-
-#ifndef _GLIBCXX_PARALLEL_TIMING_H
-#define _GLIBCXX_PARALLEL_TIMING_H 1
-
-#include <omp.h>
-#include <cstdio>
-#include <cstring>
-#include <parallel/tags.h>
-
-namespace __gnu_parallel
-{
-  // XXX integrate with existing performance testing infrastructure.
-  /** @brief Type of of point in time, used for the Timing classes. */
-  typedef double point_in_time;
-
-  template<typename tag, typename must_be_int = int>
-  class Timing;
-
-  /** @brief A class that provides simple run time measurements, also
-      for parallel code.
-   *  @param tag If parallel_tag, then the measurements are actually done.
-   *  Otherwise, no code at all is emitted by the compiler. */
-  template<typename must_be_int>
-  class Timing<parallel_tag, must_be_int>
-  {
-  private:
-    static const int max_points_in_time = 100;
-    point_in_time points_in_time[max_points_in_time];
-    point_in_time active, last_start;
-    int pos;
-    char* str;
-    const char* tags[max_points_in_time];
-
-  public:
-    Timing()
-    {
-      str = NULL;
-      pos = 0;
-      active = 0.0;
-      last_start = -1.0;
-    }
-
-    ~Timing()
-    {
-      delete[] str;
-    }
-
-    /** @brief Take a running time measurement.
-     *  @param tag Optional description that will be output again with
-     *  the timings.
-     *  It should describe the operation before the tic(). To time a
-     *  series of @c n operations, there should be @c n+1 calls to
-     *  tic(), and one call to print(). */
-    inline void
-    tic(const char* tag = NULL)
-    {
-      points_in_time[pos] = omp_get_wtime();
-      tags[pos] = tag;
-      pos++;
-    }
-
-    /** @brief Start the running time measurement.
-     *
-     *  Should be paired with stop(). */
-    inline void
-    start()
-    {
-      _GLIBCXX_PARALLEL_ASSERT(last_start == -1.0);
-      last_start = omp_get_wtime();
-    }
-
-    /** @brief Stop the running time measurement.
-     *
-     *  Should be paired with start(). */
-    inline void
-    stop()
-    {
-      _GLIBCXX_PARALLEL_ASSERT(last_start != -1.0);
-      active += (omp_get_wtime() - last_start);
-      last_start = -1.0;
-    }
-
-    /** @brief Reset running time accumulation. */
-    inline void
-    reset()
-    {
-      active = 0.0;
-      last_start = -1.0;
-    }
-
-    /** @brief Accumulate the time between all pairs of start() and
-	stop() so far */
-    inline point_in_time
-    active_time()
-    { return active; }
-
-    /** @brief Total time between first and last tic() */
-    inline point_in_time
-    total_time()
-    { return (points_in_time[pos - 1] - points_in_time[0]) * 1000.0; }
-
-  private:
-    /** @brief Construct string to print out, presenting the timings. */
-    const char*
-    c_str()
-    {
-      // Avoid stream library here, to avoid cyclic dependencies in
-      // header files.
-      char tmp[1000];
-
-      if (!str)
-	str = new char[pos * 200];
-      else
-	str[0] = '\0';
-
-      sprintf(str, "t %2d      T[ms]", omp_get_thread_num());
-      strcat(str, "\n");
-
-      for (int i = 0; i < pos; )
-	{
-	  point_in_time last = points_in_time[i];
-	  i++;
-	  if (i == pos)
-	    break;
-	  if (tags[i] == NULL)
-	    sprintf(tmp, "%2d:     ", i - 1);
-	  else
-	    sprintf(tmp, "%20s:     ", tags[i]);
-	  strcat(str, tmp);
-
-	  sprintf(tmp, "%7.2f     ", (points_in_time[i] - last) * 1000.0);
-	  strcat(str, tmp);
-	  strcat(str, "\n");
-	}
-
-      return str;
-    }
-
-  public:
-    /** @brief Print the running times between the tic()s. */
-    void
-    print()
-    {
-      printf("print\n");
-#pragma omp barrier
-#pragma omp master
-      printf("\n\n");
-#pragma omp critical
-      printf("%s\n", c_str());
-    }
-  };
-
-  /** @brief A class that provides simple run time measurements, also
-      for parallel code.
-   *  @param tag If parallel_tag, then the measurements are actually done,
-   *  otherwise, no code at all is emitted by the compiler.
-   */
-  template<typename must_be_int>
-  class Timing<sequential_tag, must_be_int>
-  {
-  private:
-    static const char* empty_string;
-
-  public:
-    inline void tic(const char* /*tag*/ = NULL) { }
-    inline void start() { }
-    inline void stop() { }
-    inline void reset() { }
-    inline point_in_time active_time() { return -1.0; }
-    inline point_in_time total_time() { return -1.0; }
-    inline const char* c_str() { return empty_string; }
-    inline void print() { }
-  };
-
-  template<typename must_be_int>
-  const char* Timing<sequential_tag, must_be_int>::empty_string = "";
-
-}
-
-#endif
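
For reference, the deleted timing.h documented its protocol as: to time a
series of n phases, make n+1 calls to tic() and one call to print(), with
each tag describing the phase that precedes that tic(). A self-contained
sketch of that protocol (PhaseTimer is a simplified stand-in, not the removed
implementation; the phase tags mirror those used in multiway_mergesort.h;
compile with -fopenmp):

// Simplified stand-in for illustration: n+1 tic() calls bracket n phases,
// one print() reports the elapsed time of each phase.
#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>
#include <omp.h>

class PhaseTimer
{
  std::vector<double>      points;   // time stamps taken by tic()
  std::vector<std::string> tags;     // description of the phase before each tic()

public:
  void tic(const char* tag = "")
  {
    points.push_back(omp_get_wtime());
    tags.push_back(tag);
  }

  void print() const
  {
    for (std::size_t i = 1; i < points.size(); ++i)
      std::printf("%20s: %7.2f ms\n", tags[i].c_str(),
                  (points[i] - points[i - 1]) * 1000.0);
  }
};

int main()
{
  PhaseTimer t;
  t.tic();                 // before the first phase
  // ... phase 1, e.g. local sort ...
  t.tic("local sort");     // between phase 1 and phase 2
  // ... phase 2, e.g. multiway merge ...
  t.tic("merge");          // after the last phase
  t.print();               // one print() for all phases
}
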
