[PATCH] add self-tuning to x86 hardware fast path in libitm


Hi,

the libitm library contains a fast path for x86 that uses Intel
Restricted Transactional Memory (RTM) when it is available. This
Hardware Transactional Memory (HTM) is best-effort, so it requires a
software-based fallback to execute the atomic blocks whenever the
hardware fails to commit them, for instance because a transaction is
too large for the hardware's capacity.
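
For context, a best-effort HTM fast path typically has the following
shape. This is a deliberately simplified sketch and not the libitm
code: run_atomic_block, fallback_locked and the fixed retry count are
placeholders for illustration (compile with -mrtm).

#include <immintrin.h>
#include <atomic>

static std::atomic<bool> fallback_locked (false);

static void
run_atomic_block (void (*critical_section) ())
{
  for (int attempts = 3; attempts > 0; attempts--)
    {
      unsigned status = _xbegin ();
      if (status == _XBEGIN_STARTED)
        {
          // Subscribe to the fallback lock so that a concurrent
          // software-mode execution aborts this hardware transaction.
          if (fallback_locked.load (std::memory_order_relaxed))
            _xabort (0xff);
          critical_section ();
          _xend ();
          return;
        }
      // A capacity abort is unlikely to succeed on a plain retry.
      if (status & _XABORT_CAPACITY)
        break;
    }
  // Software fallback: serialize behind a simple spinlock.
  while (fallback_locked.exchange (true, std::memory_order_acquire))
    ;
  critical_section ();
  fallback_locked.store (false, std::memory_order_release);
}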

In fact, the performance of this HTM (and of others that are
similarly best-effort) depends significantly on the interplay between
the hardware and the software fallback. Extensive experimentation has
shown that no single fallback configuration performs best across all
applications and workloads [2].

Hence, as part of my work I have developed a self-tuning approach
that uses lightweight reinforcement learning techniques to identify
the best RTM configuration in a workload-oblivious manner, i.e.,
without requiring any off-line sampling of the application.
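
At its core the patch uses an Upper Confidence Bounds (UCB) rule that
treats the possible reactions to capacity aborts as arms of a
multi-armed bandit. The sketch below illustrates the idea only:
pick_mode is a hypothetical helper and not code from the patch (the
patch itself avoids libm and uses fast bit-trick approximations of
log and sqrt).

#include <cmath>
#include <cstdint>

// txs[m]/cycles[m] are the transactions and cycles accounted to mode
// m so far; best_throughput is the best throughput observed to date.
static int
pick_mode (const uint64_t txs[3], const uint64_t cycles[3],
           double best_throughput)
{
  double log_sum
    = 2.0 * std::log (static_cast<double> (txs[0] + txs[1] + txs[2]));
  int best_mode = 0;
  double best_ucb = -1.0;
  for (int m = 0; m < 3; m++)
    {
      // Exploitation term: this mode's observed throughput,
      // normalized by the best ever seen...
      double ucb = (static_cast<double> (txs[m]) / cycles[m])
                   / best_throughput
                   // ...plus an exploration bonus that shrinks as
                   // the mode accumulates samples.
                   + std::sqrt (log_sum / static_cast<double> (txs[m]));
      if (ucb > best_ucb)
        {
          best_ucb = ucb;
          best_mode = m;
        }
    }
  return best_mode;
}

The exploration bonus guarantees that every mode keeps being
re-sampled from time to time, so the tuner can adapt when the
workload changes.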

The implementation in this patch closely follows the work described in:
[1] "Self-Tuning Intel Transactional Synchronization Extensions", Nuno
Diegues and Paolo Romano, Proceedings of the International Conference
on Autonomic Computing, ICAC 2014
http://homepages.gsd.inesc-id.pt/~ndiegues/files/icac14-ndiegues.pdf

[2] "Virtues and Limitations of Commodity Hardware Transactional
Memory", Nuno Diegues, Paolo Romano, and Luis Rodrigues, Proceedings
of the International Conference on Parallel Architectures and Compiler
Techniques, PACT 2014
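
Concretely, a capacity abort is handled with one of three policies,
chosen adaptively by the optimizer: keep insisting on the hardware
(STUBBORN), halve the remaining retry budget (HALVEN), or stop
retrying in hardware at once (GIVEUP). As a sketch (adjust_retries is
a hypothetical helper for illustration; the actual logic is inlined
in gtm_thread::begin_transaction in the patch below):

#include <cstdint>

enum gtm_capacity_abort_mode { STUBBORN, HALVEN, GIVEUP };

// Adjust the remaining retry budget t after a capacity abort.
static uint32_t
adjust_retries (gtm_capacity_abort_mode mode, uint32_t t)
{
  switch (mode)
    {
    case STUBBORN:
      return t;         // keep retrying in hardware
    case HALVEN:
      return t / 2 + 1; // halve the remaining attempts
    case GIVEUP:
      return 1;         // last iteration: fall back to software
    }
  return t;
}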


The copyright assignment for this contribution is still in progress.
Also, please bear in mind that this is my first attempt to contribute
to GCC, so any suggestion, however simple it may seem, will be most
welcome.


Output of: "svn diff --extensions -u"

Index: libitm/method-serial.cc
===================================================================
--- libitm/method-serial.cc (revision 219316)
+++ libitm/method-serial.cc (working copy)
@@ -223,7 +223,23 @@ struct htm_mg : public method_group
     // initially disabled.
 #ifdef USE_HTM_FASTPATH
     htm_fastpath = htm_init();
+#ifdef HAVE_AS_RTM
+    optimizer.optimized_mode = STUBBORN;
+    optimizer.optimized_attempts = htm_fastpath;
+    optimizer.last_cycles = rdtsc();
+    optimizer.last_total_txs_executed = 0;
+    optimizer.last_throughput = 0.0;
+    optimizer.last_attempts = htm_fastpath > 0 ? htm_fastpath - 1 : 1;
+    optimizer.best_ever_throughput = 0.0;
+    optimizer.best_ever_attempts = htm_fastpath;
+    optimizer.txns_while_stubborn = 1;
+    optimizer.cycles_while_stubborn = 100;
+    optimizer.txns_while_halven = 1;
+    optimizer.cycles_while_halven = 100;
+    optimizer.txns_while_giveup = 1;
+    optimizer.cycles_while_giveup = 100;
 #endif
+#endif
   }
   virtual void fini()
   {
Index: libitm/config/x86/sjlj.S
===================================================================
--- libitm/config/x86/sjlj.S (revision 219316)
+++ libitm/config/x86/sjlj.S (working copy)
@@ -59,12 +59,14 @@
 #define pr_hasNoAbort 0x08
 #define pr_HTMRetryableAbort 0x800000
 #define pr_HTMRetriedAfterAbort 0x1000000
+#define pr_HTMCapacityAbort 0x2000000
 #define a_runInstrumentedCode 0x01
 #define a_runUninstrumentedCode 0x02
 #define a_tryHTMFastPath 0x20

 #define _XABORT_EXPLICIT (1 << 0)
 #define _XABORT_RETRY (1 << 1)
+#define _XABORT_CAPACITY (1 << 3)

  .text

@@ -108,9 +110,12 @@ SYM(_ITM_beginTransaction):
 .Ltxn_abort:
  /* If it might make sense to retry the HTM fast path, let the C++
     code decide.  */
- testl $(_XABORT_RETRY|_XABORT_EXPLICIT), %eax
+ testl $(_XABORT_RETRY|_XABORT_EXPLICIT|_XABORT_CAPACITY), %eax
  jz .Lno_htm
  orl $pr_HTMRetryableAbort, %edi
+ testl   $(_XABORT_CAPACITY), %eax
+ jz      .Lno_htm
+ orl     $pr_HTMCapacityAbort, %edi
  /* Let the C++ code handle the retry policy.  */
 .Lno_htm:
 #endif
Index: libitm/config/x86/target.h
===================================================================
--- libitm/config/x86/target.h (revision 219316)
+++ libitm/config/x86/target.h (working copy)
@@ -126,12 +126,27 @@ htm_abort_should_retry (uint32_t begin_ret)
   return begin_ret & _XABORT_RETRY;
 }

+/* Returns true iff the HTM begin result indicates a capacity abort.  */
+static inline bool
+htm_is_capacity_abort (uint32_t begin_ret)
+{
+  return begin_ret & _XABORT_CAPACITY;
+}
+
 /* Returns true iff a hardware transaction is currently being executed.  */
 static inline bool
 htm_transaction_active ()
 {
   return _xtest() != 0;
 }
+
+/* Reads the CPU timestamp counter.  */
+static inline uint64_t rdtsc()
+{
+  uint32_t hi, lo;
+  __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
+  return ((uint64_t) hi << 32) | lo;
+}
 #endif


Index: libitm/libitm_i.h
===================================================================
--- libitm/libitm_i.h (revision 219316)
+++ libitm/libitm_i.h (working copy)
@@ -242,6 +242,9 @@ struct gtm_thread
   uint32_t restart_reason[NUM_RESTARTS];
   uint32_t restart_total;

+  // Keeps track of how many transactions were successfully executed.
+  uint64_t number_executed_txns;
+
   // *** The shared part of gtm_thread starts here. ***
   // Shared state is on separate cachelines to avoid false sharing with
   // thread-local parts of gtm_thread.
@@ -286,6 +289,8 @@ struct gtm_thread
   static void *operator new(size_t);
   static void operator delete(void *);

+  static void reoptimize_htm_execution();
+
   // Invoked from assembly language, thus the "asm" specifier on
   // the name, avoiding complex name mangling.
   static uint32_t begin_transaction(uint32_t, const gtm_jmpbuf *)
@@ -309,6 +314,40 @@ struct gtm_thread
   void commit_user_actions ();
 };

+// Different ways to deal with capacity aborts in HTM execution.
+enum gtm_capacity_abort_mode
+{
+  STUBBORN,
+  HALVEN,
+  GIVEUP
+};
+
+// Maintains the current values optimized for HTM execution and the
+// corresponding statistics gathered for the decision-making.
+struct gtm_global_optimizer
+{
+  // Mode chosen to currently deal with capacity aborts.
+  gtm_capacity_abort_mode optimized_mode;
+  // Number of attempts chosen to currently insist on HTM execution.
+  uint32_t optimized_attempts;
+
+  uint64_t last_cycles;
+  uint64_t last_total_txs_executed;
+
+  double last_throughput;
+  uint32_t last_attempts;
+
+  double best_ever_throughput;
+  uint32_t best_ever_attempts;
+
+  uint64_t txns_while_stubborn;
+  uint64_t cycles_while_stubborn;
+  uint64_t txns_while_halven;
+  uint64_t cycles_while_halven;
+  uint64_t txns_while_giveup;
+  uint64_t cycles_while_giveup;
+};
+
 } // namespace GTM

 #include "tls.h"
@@ -346,6 +385,9 @@ extern gtm_cacheline_mask gtm_mask_stack(gtm_cache
 // the name, avoiding complex name mangling.
 extern uint32_t htm_fastpath __asm__(UPFX "gtm_htm_fastpath");

+// Maintains the optimization for HTM execution.
+extern gtm_global_optimizer optimizer;
+
 } // namespace GTM

 #endif // LIBITM_I_H
Index: libitm/beginend.cc
===================================================================
--- libitm/beginend.cc (revision 219316)
+++ libitm/beginend.cc (working copy)
@@ -24,8 +24,8 @@

 #include "libitm_i.h"
 #include <pthread.h>
+#include <math.h>

-
 using namespace GTM;

 #if !defined(HAVE_ARCH_GTM_THREAD) || !defined(HAVE_ARCH_GTM_THREAD_DISP)
@@ -39,6 +40,9 @@ unsigned GTM::gtm_thread::number_of_threads = 0;
 gtm_stmlock GTM::gtm_stmlock_array[LOCK_ARRAY_SIZE];
 atomic<gtm_version> GTM::gtm_clock;

+// Optimization of HTM executions.
+gtm_global_optimizer GTM::optimizer;
+
 /* ??? Move elsewhere when we figure out library initialization.  */
 uint64_t GTM::gtm_spin_count_var = 1000;

@@ -149,6 +153,147 @@ choose_code_path(uint32_t prop, abi_dispatch *disp
     return a_runInstrumentedCode;
 }

+// Fast approximation of the natural logarithm, computed directly
+// from the float's bit pattern (the exponent field encodes log2(x)).
+static inline float fastLog(float x)
+{
+  union { float f; uint32_t i; } vx = { x };
+  float y = vx.i;
+  y *= 8.2629582881927490e-8f;
+  return y - 87.989971088f;
+}
+
+// Fast approximation of the square root: shifting the float's bit
+// pattern right by one roughly halves its exponent.
+static inline float fastSqrt(float x)
+{
+  union { int i; float x; } u;
+
+  u.x = x;
+  u.i = (1<<29) + (u.i >> 1) - (1<<22);
+  return u.x;
+}
+
+void
+GTM::gtm_thread::reoptimize_htm_execution()
+{
+  gtm_thread *tx = gtm_thr();
+  uint64_t total_txs_executed = 0;
+
+  // Collect the statistics obtained so far.
+  serial_lock.read_lock(tx);
+  gtm_thread *ptr = list_of_threads;
+  for (; ptr; ptr = ptr->next_thread)
+    {
+      total_txs_executed += ptr->number_executed_txns;
+    }
+  serial_lock.read_unlock(tx);
+
+  // Obtain the delta performance with respect to the last period.
+  uint64_t current_cycles = rdtsc();
+  uint64_t cycles_used = current_cycles - optimizer.last_cycles;
+  uint64_t new_txs_executed =
+    total_txs_executed - optimizer.last_total_txs_executed;
+  optimizer.last_cycles = current_cycles;
+  optimizer.last_total_txs_executed = total_txs_executed;
+  if (optimizer.optimized_mode == STUBBORN)
+    {
+      optimizer.txns_while_stubborn += new_txs_executed;
+      optimizer.cycles_while_stubborn += cycles_used;
+    }
+  else if (optimizer.optimized_mode == HALVEN)
+    {
+      optimizer.txns_while_halven += new_txs_executed;
+      optimizer.cycles_while_halven += cycles_used;
+    }
+  else
+    {
+      optimizer.txns_while_giveup += new_txs_executed;
+      optimizer.cycles_while_giveup += cycles_used;
+    }
+
+  // Compute Upper Confidence Bounds for the mode to choose next.
+  float log_sum = 2 * fastLog(static_cast<float>(
+    optimizer.txns_while_stubborn + optimizer.txns_while_halven +
+    optimizer.txns_while_giveup));
+  float ucb_stubborn = ((static_cast<float>(optimizer.txns_while_stubborn) /
+    optimizer.cycles_while_stubborn) / optimizer.best_ever_throughput) +
+    fastSqrt(log_sum / static_cast<float>(optimizer.txns_while_stubborn));
+  float ucb_halven = ((static_cast<float>(optimizer.txns_while_halven) /
+    optimizer.cycles_while_halven) / optimizer.best_ever_throughput) +
+    fastSqrt(log_sum / static_cast<float>(optimizer.txns_while_halven));
+  float ucb_giveup = ((static_cast<float>(optimizer.txns_while_giveup) /
+    optimizer.cycles_while_giveup) / optimizer.best_ever_throughput) +
+    fastSqrt(log_sum / static_cast<float>(optimizer.txns_while_giveup));
+  if (ucb_stubborn > ucb_halven && ucb_stubborn > ucb_giveup)
+    {
+      optimizer.optimized_mode = STUBBORN;
+    }
+  else if (ucb_halven > ucb_giveup)
+    {
+      optimizer.optimized_mode = HALVEN;
+    }
+  else
+    {
+      optimizer.optimized_mode = GIVEUP;
+    }
+
+  // Helps to obtain more meaningful numbers. Not necessary for correctness.
+  const double avg_cycles_per_tx = 5000.0;
+  double current_throughput =
+    (new_txs_executed * avg_cycles_per_tx) / cycles_used;
+
+  // Compute gradient descent for the number of retries.
+  double change_for_better = current_throughput / optimizer.last_throughput;
+  double change_for_worse = optimizer.last_throughput / current_throughput;
+  int32_t last_attempts = optimizer.last_attempts;
+  int32_t current_attempts = optimizer.optimized_attempts;
+  int32_t new_attempts = current_attempts;
+  if (unlikely(change_for_worse > 1.40))
+    {
+      optimizer.optimized_attempts = optimizer.best_ever_attempts;
+      optimizer.last_throughput = current_throughput;
+      optimizer.last_attempts = current_attempts;
+      return;
+    }
+
+  if (unlikely(random() % 100 < 1))
+    {
+      optimizer.last_attempts = optimizer.optimized_attempts;
+      optimizer.last_throughput = current_throughput;
+      optimizer.optimized_attempts = random() % 18 + 2;
+      return;
+    }
+
+  if (change_for_better > 1.05)
+    {
+      new_attempts += current_attempts - last_attempts;
+      if (current_attempts == last_attempts)
+        new_attempts += random() % 2 == 0 ? -1 : 1;
+    }
+  else if (change_for_worse > 1.05)
+    {
+      new_attempts -= current_attempts - last_attempts;
+      if (current_attempts == last_attempts)
+        new_attempts += random() % 2 == 0 ? -1 : 1;
+    }
+  if (unlikely(new_attempts > 20))
+    new_attempts = 20;
+  else if (unlikely(new_attempts < 2))
+    new_attempts = 2;
+  if (current_attempts != new_attempts)
+    {
+      optimizer.last_attempts = current_attempts;
+      optimizer.last_throughput = current_throughput;
+    }
+  optimizer.optimized_attempts = new_attempts;
+  if (unlikely(current_throughput > optimizer.best_ever_throughput))
+    {
+      optimizer.best_ever_throughput = current_throughput;
+      optimizer.best_ever_attempts = current_attempts;
+    }
+}
+
 uint32_t
 GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
 {
@@ -190,7 +335,7 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
 #ifndef HTM_CUSTOM_FASTPATH
   if (likely(htm_fastpath && (prop & pr_hasNoAbort)))
     {
-      for (uint32_t t = htm_fastpath; t; t--)
+      for (uint32_t t = optimizer.optimized_attempts; t; t--)
  {
    uint32_t ret = htm_begin();
    if (htm_begin_success(ret))
@@ -209,19 +354,36 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
      }
    // The transaction has aborted.  Don't retry if it's unlikely that
    // retrying the transaction will be successful.
-   if (!htm_abort_should_retry(ret))
+#ifdef HAVE_AS_RTM
+          if (htm_is_capacity_abort(ret))
+            {
+              gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode;
+              if (capacity_mode == HALVEN)
+                t = t / 2 + 1;
+              else if (capacity_mode == GIVEUP)
+                t = 1;
+            }
+          // Only one thread performs this to avoid the need for
+          // synchronization. We use the oldest thread that is active.
+          // We also reoptimize at most once per transaction.
+          if (unlikely(t == 1 && (tx = gtm_thr()) && tx->next_thread == NULL
+                       && tx->number_executed_txns % 500 == 0))
+            reoptimize_htm_execution();
+#else
+          if (!htm_abort_should_retry(ret))
      break;
+#endif
    // Wait until any concurrent serial-mode transactions have finished.
    // This is an empty critical section, but won't be elided.
    if (serial_lock.is_write_locked())
      {
-       tx = gtm_thr();
-       if (unlikely(tx == NULL))
-         {
-           // See below.
-           tx = new gtm_thread();
-           set_gtm_thr(tx);
-         }
+              tx = gtm_thr();
+              if (unlikely(tx == NULL))
+                {
+                  // See below.
+                  tx = new gtm_thread();
+                  set_gtm_thr(tx);
+                }
        // Check whether there is an enclosing serial-mode transaction;
        // if so, we just continue as a nested transaction and don't
        // try to use the HTM fastpath.  This case can happen when an
@@ -262,12 +424,26 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
       // other fallback will use serial transactions, which don't use
       // restart_total but will reset it when committing.
       if (!(prop & pr_HTMRetriedAfterAbort))
- tx->restart_total = htm_fastpath;
+        tx->restart_total = optimizer.optimized_attempts;

       if (--tx->restart_total > 0)
  {
    // Wait until any concurrent serial-mode transactions have finished.
    // Essentially the same code as above.
+#ifdef HAVE_AS_RTM
+          if (prop & pr_HTMCapacityAbort)
+            {
+              gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode;
+              if (capacity_mode == HALVEN)
+                tx->restart_total = tx->restart_total / 2 + 1;
+              else if (capacity_mode == GIVEUP)
+                goto stop_custom_htm_fastpath;
+            }
+
+          if (unlikely(tx->next_thread == NULL &&
+              tx->number_executed_txns % 500 == 0 && tx->restart_total == 1))
+            reoptimize_htm_execution();
+#endif
    if (serial_lock.is_write_locked())
      {
        if (tx->nesting > 0)
@@ -665,12 +841,21 @@ _ITM_commitTransaction(void)
   if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
     {
       htm_commit();
+      gtm_thread *tx = gtm_thr();
+      if (unlikely(tx == NULL))
+        {
+          tx = new gtm_thread();
+          set_gtm_thr(tx);
+        }
+      tx->number_executed_txns++;
       return;
     }
 #endif
   gtm_thread *tx = gtm_thr();
   if (!tx->trycommit ())
     tx->restart (RESTART_VALIDATE_COMMIT);
+
+  tx->number_executed_txns++;
 }

 void ITM_REGPARM
@@ -681,6 +866,13 @@ _ITM_commitTransactionEH(void *exc_ptr)
   if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
     {
       htm_commit();
+      gtm_thread *tx = gtm_thr();
+      if (unlikely(tx == NULL))
+        {
+          tx = new gtm_thread();
+          set_gtm_thr(tx);
+        }
+      tx->number_executed_txns++;
       return;
     }
 #endif
@@ -690,4 +882,5 @@ _ITM_commitTransactionEH(void *exc_ptr)
       tx->eh_in_flight = exc_ptr;
       tx->restart (RESTART_VALIDATE_COMMIT);
     }
+  tx->number_executed_txns++;
 }
Index: libitm/libitm.h
===================================================================
--- libitm/libitm.h (revision 219316)
+++ libitm/libitm.h (working copy)
@@ -101,7 +101,8 @@ typedef enum
    /* These are not part of the ABI but used for custom HTM fast paths.  See
       ITM_beginTransaction and gtm_thread::begin_transaction.  */
    pr_HTMRetryableAbort = 0x800000,
-   pr_HTMRetriedAfterAbort = 0x1000000
+   pr_HTMRetriedAfterAbort = 0x1000000,
+   pr_HTMCapacityAbort = 0x2000000
 } _ITM_codeProperties;

 /* Result from startTransaction that describes what actions to take.

