[PATCH] add self-tuning to x86 hardware fast path in libitm
Nuno Diegues
nmld@ist.utl.pt
Thu Apr 30 03:52:00 GMT 2015
Hello,
I have taken the chance to improve the patch by addressing the
comments above in this thread.
Namely:
- used a simple random generator managed inside the library only
- removed floating point usage and replaced by fixed arithmetic
- added some comments where relevant
Re-running the STAMP benchmarks shows similar gains (to those shown
above) with respect to an unmodified GCC 5.0.0:
benchmark: speedup
genome: 1.58
intruder: 1.78
labyrinth: 1.0
ssca2: 1.01
yada: 1.0
kmeans-high: 1.16
kmeans-low: 1.16
vacation-high: 2.05
vacation-low: 2.81
I appreciate any feedback and comments!
Index: libitm/libitm_i.h
===================================================================
--- libitm/libitm_i.h (revision 219316)
+++ libitm/libitm_i.h (working copy)
@@ -242,6 +242,9 @@ struct gtm_thread
uint32_t restart_reason[NUM_RESTARTS];
uint32_t restart_total;
+ // Keeps track of how many transactions were successfully executed.
+ uint64_t number_executed_txns;
+
// *** The shared part of gtm_thread starts here. ***
// Shared state is on separate cachelines to avoid false sharing with
// thread-local parts of gtm_thread.
@@ -286,6 +289,8 @@ struct gtm_thread
static void *operator new(size_t);
static void operator delete(void *);
+ static void reoptimize_htm_execution();
+
// Invoked from assembly language, thus the "asm" specifier on
// the name, avoiding complex name mangling.
static uint32_t begin_transaction(uint32_t, const gtm_jmpbuf *)
@@ -309,6 +314,59 @@ struct gtm_thread
void commit_user_actions ();
};
+// Different ways to deal with capacity aborts in HTM execution.
+enum gtm_capacity_abort_mode
+{
+ STUBBORN, // Keep retrying in hardware despite capacity aborts.
+ HALVEN, // Halve the remaining hardware attempts on a capacity abort.
+ GIVEUP // Fall back to the software path immediately.
+};
+
+// Definition of fixed point arithmetic types.
+// Half the bits are dedicated to the fractional type, and the rest to the
+// "whole" part.
+#define FIXED_PT_WIDTH 64
+#define FIXED_PT_INTEGER_WIDTH 32
+typedef uint64_t fixed_pt_t;
+typedef __uint128_t fixed_pt_td; // double-width type for mul/div intermediates
+
+#define FIXED_PT_FRAC_WIDTH (FIXED_PT_WIDTH - FIXED_PT_INTEGER_WIDTH)
+#define FIXED_PT_ONE ((fixed_pt_t)((fixed_pt_t)1 << FIXED_PT_FRAC_WIDTH))
+#define FIXED_PT_TWO (FIXED_PT_ONE + FIXED_PT_ONE)
+
+// Multiply/divide via the double-width type to avoid losing the high bits.
+#define fixed_mul(A,B) \
+ ((fixed_pt_t)(((fixed_pt_td)(A) * (fixed_pt_td)(B)) >> FIXED_PT_FRAC_WIDTH))
+#define fixed_div(A,B) \
+ ((fixed_pt_t)(((fixed_pt_td)(A) << FIXED_PT_FRAC_WIDTH) / (fixed_pt_td)(B)))
+// NOTE(review): fixed_const multiplies in double (the 0.5 rounding term).
+// That is fine for literal constants folded at compile time, but the calls
+// with runtime 64-bit counters (see reoptimize_htm_execution) generate FP
+// code at run time and overflow once the value reaches 2^32 -- consider a
+// separate integer-only conversion for runtime values.
+#define fixed_const(R) \
+ ((fixed_pt_t)((R) * FIXED_PT_ONE + ((R) >= 0 ? 0.5 : -0.5)))
+
+// Maintains the current values optimized for HTM execution and the
+// corresponding statistics gathered for the decision-making.
+struct gtm_global_optimizer
+{
+ // Mode chosen to currently deal with capacity aborts.
+ gtm_capacity_abort_mode optimized_mode;
+ // Number of attempts chosen to currently insist on HTM execution.
+ uint32_t optimized_attempts;
+
+ // Timestamp (rdtsc) taken at the end of the last reoptimization round.
+ uint64_t last_cycles;
+ // Total committed transactions (summed over all threads) at that round.
+ uint64_t last_total_txs_executed;
+
+ // Throughput (txs/cycle, fixed point) measured in the last round.
+ fixed_pt_t last_throughput;
+ // Attempts setting that was in effect before optimized_attempts.
+ uint32_t last_attempts;
+
+ // Best throughput seen so far and the attempts value that produced it;
+ // used to normalize the UCB terms and as a fallback on regressions.
+ fixed_pt_t best_ever_throughput;
+ uint32_t best_ever_attempts;
+
+ // Per-mode reward statistics for the UCB choice: transactions committed
+ // and cycles elapsed while each capacity-abort mode was active.
+ uint64_t txns_while_stubborn;
+ uint64_t cycles_while_stubborn;
+ uint64_t txns_while_halven;
+ uint64_t cycles_while_halven;
+ uint64_t txns_while_giveup;
+ uint64_t cycles_while_giveup;
+};
+
} // namespace GTM
#include "tls.h"
@@ -346,6 +404,9 @@ extern gtm_cacheline_mask gtm_mask_stack(gtm_cache
// the name, avoiding complex name mangling.
extern uint32_t htm_fastpath __asm__(UPFX "gtm_htm_fastpath");
+// Maintains the optimization for HTM execution.
+extern gtm_global_optimizer optimizer;
+
} // namespace GTM
#endif // LIBITM_I_H
Index: libitm/libitm.h
===================================================================
--- libitm/libitm.h (revision 219316)
+++ libitm/libitm.h (working copy)
@@ -101,7 +101,8 @@ typedef enum
/* These are not part of the ABI but used for custom HTM fast paths. See
ITM_beginTransaction and gtm_thread::begin_transaction. */
pr_HTMRetryableAbort = 0x800000,
- pr_HTMRetriedAfterAbort = 0x1000000
+ pr_HTMRetriedAfterAbort = 0x1000000,
+ pr_HTMCapacityAbort = 0x2000000
} _ITM_codeProperties;
/* Result from startTransaction that describes what actions to take.
Index: libitm/method-serial.cc
===================================================================
--- libitm/method-serial.cc (revision 219316)
+++ libitm/method-serial.cc (working copy)
@@ -223,7 +223,23 @@ struct htm_mg : public method_group
// initially disabled.
#ifdef USE_HTM_FASTPATH
htm_fastpath = htm_init();
+#ifdef HAVE_AS_RTM
+ optimizer.optimized_mode = STUBBORN;
+ optimizer.optimized_attempts = htm_fastpath;
+ optimizer.last_cycles = rdtsc();
+ optimizer.last_total_txs_executed = 0;
+ optimizer.last_throughput = fixed_const(0.0001);
+ optimizer.last_attempts = htm_fastpath > 0 ? htm_fastpath - 1 : 1;
+ optimizer.best_ever_throughput = optimizer.last_throughput;
+ optimizer.best_ever_attempts = htm_fastpath;
+ optimizer.txns_while_stubborn = 1;
+ optimizer.cycles_while_stubborn = 100;
+ optimizer.txns_while_halven = 1;
+ optimizer.cycles_while_halven = 100;
+ optimizer.txns_while_giveup = 1;
+ optimizer.cycles_while_giveup = 100;
#endif
+#endif
}
virtual void fini()
{
Index: libitm/beginend.cc
===================================================================
--- libitm/beginend.cc (revision 219316)
+++ libitm/beginend.cc (working copy)
@@ -25,7 +25,6 @@
#include "libitm_i.h"
#include <pthread.h>
-
using namespace GTM;
#if !defined(HAVE_ARCH_GTM_THREAD) || !defined(HAVE_ARCH_GTM_THREAD_DISP)
@@ -39,6 +38,9 @@ unsigned GTM::gtm_thread::number_of_threads = 0;
gtm_stmlock GTM::gtm_stmlock_array[LOCK_ARRAY_SIZE];
atomic<gtm_version> GTM::gtm_clock;
+// Optimization of HTM executions.
+gtm_global_optimizer GTM::optimizer;
+
/* ??? Move elsewhere when we figure out library initialization. */
uint64_t GTM::gtm_spin_count_var = 1000;
@@ -149,6 +151,225 @@ choose_code_path(uint32_t prop, abi_dispatch *disp
return a_runInstrumentedCode;
}
+// Square root in fixed point arithmetic (Newton-Raphson iteration,
+// adapted from the fixedptc library).
+static inline fixed_pt_t
+fixed_sqrt(fixed_pt_t n)
+{
+  int invert = 0;
+  int iter = FIXED_PT_FRAC_WIDTH;
+  int i;
+  // Use the full fixed-point width for intermediates: any argument
+  // >= 1.0 exceeds 32 bits (FIXED_PT_ONE == 2^32), so the previous
+  // "int" declarations truncated both the iterate and the scale probe.
+  fixed_pt_t l, s;
+
+  if (n == 0 || n == FIXED_PT_ONE)
+    return n;
+
+  // For small arguments, invert to improve precision and undo at the end.
+  if (n < FIXED_PT_ONE && n > 6)
+    {
+      invert = 1;
+      n = fixed_div(FIXED_PT_ONE, n);
+    }
+  // For large arguments, derive the iteration count from the magnitude.
+  if (n > FIXED_PT_ONE)
+    {
+      s = n;
+      iter = 0;
+      while (s > 0)
+        {
+          s >>= 2;
+          iter++;
+        }
+    }
+
+  // Newton-Raphson: l <- (l + n/l) / 2.
+  l = (n >> 1) + 1;
+  for (i = 0; i < iter; i++)
+    l = (l + fixed_div(n, l)) >> 1;
+
+  if (invert)
+    return fixed_div(FIXED_PT_ONE, l);
+  return l;
+}
+
+// Natural logarithm in fixed point arithmetic (adapted from the fixedptc
+// library): reduce the argument by powers of two into [1, 2], then
+// evaluate a polynomial approximation on the reduced value.
+static inline fixed_pt_t
+fixed_ln(fixed_pt_t x)
+{
+ const fixed_pt_t LN2 = fixed_const(0.69314718055994530942);
+ // Polynomial coefficients for approximating log(1+s).
+ const fixed_pt_t LG[7] = {
+ fixed_const(6.666666666666735130e-01),
+ fixed_const(3.999999999940941908e-01),
+ fixed_const(2.857142874366239149e-01),
+ fixed_const(2.222219843214978396e-01),
+ fixed_const(1.818357216161805012e-01),
+ fixed_const(1.531383769920937332e-01),
+ fixed_const(1.479819860511658591e-01)
+ };
+
+ if (x == 0)
+ {
+ // NOTE(review): sentinel for ln(0) (minus infinity); 0xffffffff is a
+ // 32-bit pattern in a 64-bit type, i.e. a small positive fixed-point
+ // value here -- confirm callers can never pass 0.
+ return 0xffffffff;
+ }
+
+ // Scale x down into [1, 2], counting the powers of two taken out.
+ fixed_pt_t log2 = 0;
+ fixed_pt_t xi = x;
+ while (xi > FIXED_PT_TWO)
+ {
+ xi >>= 1;
+ log2++;
+ }
+
+ // Polynomial evaluation on s = f / (2 + f), with f = xi - 1.
+ fixed_pt_t f = xi - FIXED_PT_ONE;
+ fixed_pt_t s = fixed_div(f, FIXED_PT_TWO + f);
+ fixed_pt_t z = fixed_mul(s, s);
+ fixed_pt_t w = fixed_mul(z, z);
+ fixed_pt_t R =
+ fixed_mul(w, LG[1] + fixed_mul(w, LG[3] + fixed_mul(w, LG[5])))
+ + fixed_mul(z, LG[0] + fixed_mul(w, LG[2]
+ + fixed_mul(w, LG[4] + fixed_mul(w, LG[6]))));
+ // ln(x) = log2 * ln(2) + ln(xi).
+ return fixed_mul(LN2, (log2 << FIXED_PT_FRAC_WIDTH))
+ + f - fixed_mul(s, f - R);
+}
+
+// Returns a pseudo-random integer in [0, max).
+// State not synchronized; assumes single thread usage at any time.
+// Invoked only by the thread reoptimizing, so assumption holds.
+static int
+obtainRandomInt(int max)
+{
+  // Unsigned arithmetic: the previous signed multiply overflowed
+  // (undefined behavior), and "% 4294967296" was a no-op for a
+  // 32-bit value -- unsigned wrap-around gives that for free.
+  static uint32_t seed = 123456789;
+  seed = 1103515245u * seed + 12345u;
+  // The previous version ignored MAX and could return negative or huge
+  // values, so callers such as "obtainRandomInt(18) + 2" could pick
+  // attempts far outside the intended [2, 20] range, and
+  // "obtainRandomInt(100) == 0" was not a ~1% event.
+  return (int) (seed % (uint32_t) max);
+}
+
+// Called by the thread at the tail of the list of threads.
+// Reoptimizes the self-tuning parameters for HTM execution: chooses the
+// capacity-abort mode with an Upper-Confidence-Bounds rule over the
+// per-mode statistics, then adjusts the number of HTM attempts with a
+// hill-climbing step on the measured throughput.
+void
+GTM::gtm_thread::reoptimize_htm_execution()
+{
+ gtm_thread *tx = gtm_thr();
+ uint64_t total_txs_executed = 0;
+
+ // Collect the statistics obtained so far.
+ serial_lock.read_lock(tx);
+ gtm_thread *ptr = list_of_threads;
+ for (; ptr; ptr = ptr->next_thread)
+ {
+ total_txs_executed += ptr->number_executed_txns;
+ }
+ serial_lock.read_unlock(tx);
+
+ // Obtain the delta performance with respect to the last period.
+ const uint64_t current_cycles = rdtsc();
+ const uint64_t cycles_used = current_cycles - optimizer.last_cycles;
+ const uint64_t new_txs_executed =
+ total_txs_executed - optimizer.last_total_txs_executed;
+ optimizer.last_cycles = current_cycles;
+ optimizer.last_total_txs_executed = total_txs_executed;
+ // Credit the elapsed period to the mode that was active during it.
+ if (optimizer.optimized_mode == STUBBORN)
+ {
+ optimizer.txns_while_stubborn += new_txs_executed;
+ optimizer.cycles_while_stubborn += cycles_used;
+ }
+ else if (optimizer.optimized_mode == HALVEN)
+ {
+ optimizer.txns_while_halven += new_txs_executed;
+ optimizer.cycles_while_halven += cycles_used;
+ }
+ else
+ {
+ optimizer.txns_while_giveup += new_txs_executed;
+ optimizer.cycles_while_giveup += cycles_used;
+ }
+
+ // Compute Upper Confidence Bounds for the mode to choose next.
+ // Use fixed point arithmetic types to spare floating point usage.
+ // Each bound is the mode's throughput normalized by the best ever
+ // throughput, plus an exploration term that grows for rarely-used modes.
+ // NOTE(review): fixed_const on these runtime 64-bit counters is
+ // evaluated in double and overflows once a counter reaches 2^32; an
+ // integer-only conversion for runtime values would be safer.
+ const fixed_pt_t log_sum =
+ fixed_mul(FIXED_PT_TWO,
+ fixed_ln(fixed_const(optimizer.txns_while_stubborn)
+ + fixed_const(optimizer.txns_while_halven)
+ + fixed_const(optimizer.txns_while_giveup)));
+ const fixed_pt_t ucb_stubborn =
+ fixed_div(fixed_div(fixed_const(optimizer.txns_while_stubborn),
+ fixed_const(optimizer.cycles_while_stubborn)),
+ optimizer.best_ever_throughput)
+ + fixed_sqrt(fixed_div(log_sum,
+ fixed_const(optimizer.txns_while_stubborn)));
+ const fixed_pt_t ucb_halven =
+ fixed_div(fixed_div(fixed_const(optimizer.txns_while_halven),
+ fixed_const(optimizer.cycles_while_halven)),
+ optimizer.best_ever_throughput)
+ + fixed_sqrt(fixed_div(log_sum, fixed_const(optimizer.txns_while_halven)));
+ const fixed_pt_t ucb_giveup =
+ fixed_div(fixed_div(fixed_const(optimizer.txns_while_giveup),
+ fixed_const(optimizer.cycles_while_giveup)),
+ optimizer.best_ever_throughput)
+ + fixed_sqrt(fixed_div(log_sum, fixed_const(optimizer.txns_while_giveup)));
+
+ if (ucb_stubborn > ucb_halven && ucb_stubborn > ucb_giveup)
+ {
+ optimizer.optimized_mode = STUBBORN;
+ }
+ else if (ucb_halven > ucb_giveup)
+ {
+ optimizer.optimized_mode = HALVEN;
+ }
+ else
+ {
+ optimizer.optimized_mode = GIVEUP;
+ }
+
+ // Compute gradient descent for the number of retries.
+ const fixed_pt_t current_throughput =
+ fixed_div(fixed_const(new_txs_executed), fixed_const(cycles_used));
+ // NOTE(review): if no transaction committed in this period then
+ // current_throughput is 0 and the change_for_worse division below
+ // divides by zero -- this needs a guard.
+ const fixed_pt_t change_for_better =
+ fixed_div(current_throughput, optimizer.last_throughput);
+ const fixed_pt_t change_for_worse =
+ fixed_div(optimizer.last_throughput, current_throughput);
+ int32_t last_attempts = optimizer.last_attempts;
+ int32_t current_attempts = optimizer.optimized_attempts;
+ int32_t new_attempts = current_attempts;
+ // Strong regression (>40% worse): revert to the best known setting.
+ if (unlikely(change_for_worse > fixed_const(1.40)))
+ {
+ optimizer.optimized_attempts = optimizer.best_ever_attempts;
+ optimizer.last_throughput = current_throughput;
+ optimizer.last_attempts = current_attempts;
+ return;
+ }
+
+ // Occasional random exploration of a setting in [2, 20]; assumes
+ // obtainRandomInt returns a value in [0, max).
+ if (unlikely(obtainRandomInt(100) == 0))
+ {
+ optimizer.last_attempts = optimizer.optimized_attempts;
+ optimizer.last_throughput = current_throughput;
+ optimizer.optimized_attempts = obtainRandomInt(18) + 2;
+ return;
+ }
+
+ // Hill climbing: keep moving the attempts in the direction that helped
+ // (>5% better), reverse it when it hurt (>5% worse); on a plateau take
+ // a random +/-1 step.
+ if (change_for_better > fixed_const(1.05))
+ {
+ new_attempts += current_attempts - last_attempts;
+ if (current_attempts == last_attempts)
+ new_attempts += obtainRandomInt(2) == 0 ? -1 : 1;
+ }
+ else if (change_for_worse > fixed_const(1.05))
+ {
+ new_attempts -= current_attempts - last_attempts;
+ if (current_attempts == last_attempts)
+ new_attempts += obtainRandomInt(2) == 0 ? -1 : 1;
+ }
+ // Clamp the attempts to [2, 20].
+ if (unlikely(new_attempts > 20))
+ new_attempts = 20;
+ else if (unlikely(new_attempts < 2))
+ new_attempts = 2;
+ // Only move the comparison baseline when the setting actually changed.
+ if (current_attempts != new_attempts)
+ {
+ optimizer.last_attempts = current_attempts;
+ optimizer.last_throughput = current_throughput;
+ }
+ optimizer.optimized_attempts = new_attempts;
+ // Remember the best configuration seen so far.
+ if (unlikely(current_throughput > optimizer.best_ever_throughput))
+ {
+ optimizer.best_ever_throughput = current_throughput;
+ optimizer.best_ever_attempts = current_attempts;
+ }
+}
+
uint32_t
GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
{
@@ -190,7 +411,7 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
#ifndef HTM_CUSTOM_FASTPATH
if (likely(htm_fastpath && (prop & pr_hasNoAbort)))
{
- for (uint32_t t = htm_fastpath; t; t--)
+ for (uint32_t t = optimizer.optimized_attempts; t; t--)
{
uint32_t ret = htm_begin();
if (htm_begin_success(ret))
@@ -209,19 +430,36 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
}
// The transaction has aborted. Don't retry if it's unlikely that
// retrying the transaction will be successful.
- if (!htm_abort_should_retry(ret))
+#ifdef HAVE_AS_RTM
+ if (htm_is_capacity_abort(ret))
+ {
+ gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode;
+ if (capacity_mode == HALVEN)
+ t = t / 2 + 1;
+ else if (capacity_mode == GIVEUP)
+ t = 1;
+ }
+ // Only one thread performs this to avoid the need for
+ // synchronization. We use the oldest thread that is active.
+ // We also reoptimize at most once per transaction.
+ if (unlikely(tx->next_thread == NULL &&
+ tx->number_executed_txns % 500 == 0 && t == 1))
+ reoptimize_htm_execution();
+#else
+ if (!htm_abort_should_retry(ret))
break;
+#endif
// Wait until any concurrent serial-mode transactions have finished.
// This is an empty critical section, but won't be elided.
if (serial_lock.is_write_locked())
{
- tx = gtm_thr();
- if (unlikely(tx == NULL))
- {
- // See below.
- tx = new gtm_thread();
- set_gtm_thr(tx);
- }
+ tx = gtm_thr();
+ if (unlikely(tx == NULL))
+ {
+ // See below.
+ tx = new gtm_thread();
+ set_gtm_thr(tx);
+ }
// Check whether there is an enclosing serial-mode transaction;
// if so, we just continue as a nested transaction and don't
// try to use the HTM fastpath. This case can happen when an
@@ -262,12 +500,26 @@ GTM::gtm_thread::begin_transaction (uint32_t prop,
// other fallback will use serial transactions, which don't use
// restart_total but will reset it when committing.
if (!(prop & pr_HTMRetriedAfterAbort))
- tx->restart_total = htm_fastpath;
+ tx->restart_total = optimizer.optimized_attempts;
if (--tx->restart_total > 0)
{
// Wait until any concurrent serial-mode transactions have finished.
// Essentially the same code as above.
+#ifdef HAVE_AS_RTM
+ if (prop & pr_HTMCapacityAbort)
+ {
+ gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode;
+ if (capacity_mode == HALVEN)
+ tx->restart_total = tx->restart_total / 2 + 1;
+ else if (capacity_mode == GIVEUP)
+ goto stop_custom_htm_fastpath;
+ }
+
+ if (unlikely(tx->next_thread == NULL &&
+ tx->number_executed_txns % 500 == 0 && tx->restart_total == 1))
+ reoptimize_htm_execution();
+#endif
if (serial_lock.is_write_locked())
{
if (tx->nesting > 0)
@@ -665,12 +917,21 @@ _ITM_commitTransaction(void)
if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
{
htm_commit();
+ gtm_thread *tx = gtm_thr();
+ if (unlikely(tx == NULL))
+ {
+ tx = new gtm_thread();
+ set_gtm_thr(tx);
+ }
+ tx->number_executed_txns++;
return;
}
#endif
gtm_thread *tx = gtm_thr();
if (!tx->trycommit ())
tx->restart (RESTART_VALIDATE_COMMIT);
+
+ tx->number_executed_txns++;
}
void ITM_REGPARM
@@ -681,6 +942,13 @@ _ITM_commitTransactionEH(void *exc_ptr)
if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
{
htm_commit();
+ gtm_thread *tx = gtm_thr();
+ if (unlikely(tx == NULL))
+ {
+ tx = new gtm_thread();
+ set_gtm_thr(tx);
+ }
+ tx->number_executed_txns++;
return;
}
#endif
@@ -690,4 +958,5 @@ _ITM_commitTransactionEH(void *exc_ptr)
tx->eh_in_flight = exc_ptr;
tx->restart (RESTART_VALIDATE_COMMIT);
}
+ tx->number_executed_txns++;
}
Index: libitm/config/x86/target.h
===================================================================
--- libitm/config/x86/target.h (revision 219316)
+++ libitm/config/x86/target.h (working copy)
@@ -126,12 +126,25 @@ htm_abort_should_retry (uint32_t begin_ret)
return begin_ret & _XABORT_RETRY;
}
+/* Returns true iff the hardware transaction aborted because it exceeded
+   the HTM capacity (e.g. the read/write set overflowed the cache).  */
+static inline bool
+htm_is_capacity_abort(uint32_t begin_ret)
+{
+ return begin_ret & _XABORT_CAPACITY;
+}
+
/* Returns true iff a hardware transaction is currently being executed. */
static inline bool
htm_transaction_active ()
{
return _xtest() != 0;
}
+
+/* Read the CPU timestamp counter.  Not a serializing instruction, which
+   is acceptable for the coarse throughput measurements done here.  */
+static inline uint64_t rdtsc()
+{
+  uint32_t hi, lo;
+  __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
+  return ((uint64_t) hi << 32) | (uint64_t) lo;
+}
#endif
Index: libitm/config/x86/sjlj.S
===================================================================
--- libitm/config/x86/sjlj.S (revision 219316)
+++ libitm/config/x86/sjlj.S (working copy)
@@ -59,12 +59,14 @@
#define pr_hasNoAbort 0x08
#define pr_HTMRetryableAbort 0x800000
#define pr_HTMRetriedAfterAbort 0x1000000
+#define pr_HTMCapacityAbort 0x2000000
#define a_runInstrumentedCode 0x01
#define a_runUninstrumentedCode 0x02
#define a_tryHTMFastPath 0x20
#define _XABORT_EXPLICIT (1 << 0)
#define _XABORT_RETRY (1 << 1)
+#define _XABORT_CAPACITY (1 << 3)
.text
@@ -108,9 +110,12 @@ SYM(_ITM_beginTransaction):
.Ltxn_abort:
/* If it might make sense to retry the HTM fast path, let the C++
code decide. */
- testl $(_XABORT_RETRY|_XABORT_EXPLICIT), %eax
+ testl $(_XABORT_RETRY|_XABORT_EXPLICIT|_XABORT_CAPACITY), %eax
jz .Lno_htm
orl $pr_HTMRetryableAbort, %edi
+ testl $(_XABORT_CAPACITY), %eax
+ jz .Lno_htm
+ orl $pr_HTMCapacityAbort, %edi
/* Let the C++ code handle the retry policy. */
.Lno_htm:
#endif
On Fri, Apr 10, 2015 at 8:24 AM, Andi Kleen <andi@firstfloor.org> wrote:
> Nuno Diegues <nmld@ist.utl.pt> writes:
>>
>> One general question on how to proceed:
>> given that I make some further changes, should I post the whole patch again?
>
> Yes please resend the patch.
>
> -Andi
More information about the Gcc-patches
mailing list