[PATCH] libitm: Add custom HTM fast path for RTM on x86_64.
Torvald Riegel
triegel@redhat.com
Wed Aug 21 14:17:00 GMT 2013
This patch adds a custom HTM fast path for RTM on x86_64, which moves
the core HTM fast path bits from gtm_thread::begin_transaction into the
x86-specific ITM_beginTransaction implementation. It extends/changes
the previous patch by Andi:
http://gcc.gnu.org/ml/gcc-patches/2013-01/msg01228.html
The custom fast path decreases the overheads of using HW transactions.
gtm_thread::begin_transaction remains responsible for handling the retry
policy after aborts of HW transactions, including when to switch to the
fallback execution method. Right now, the C++ retry code isn't aware of
the specific abort reason but just counts the number of retries for a
particular transaction; it might make sense to add this in the future.
Tested on Haswell with microbenchmarks and STAMP Vacation. OK for
trunk? (Please take a closer look at the asm pieces of this.)
(I've seen failures for STAMP Genome during my recent tests, but those
happen also with just ITM_DEFAULT_METHOD=serialirr and a single thread,
and AFAICT they don't seem to be related to the changes in
_ITM_beginTransaction. I'll have a look...)
Andreas and Peter: Is this sufficient as a proof of concept for custom
fast paths on your architectures, or would you like to see any changes?
Torvald
-------------- next part --------------
commit 9329bd4504d13d415542d93418157d588b599b4e
Author: Torvald Riegel <triegel@redhat.com>
Date: Wed Aug 21 11:40:54 2013 +0200
Add custom HTM fast path for RTM on x86_64.
* libitm_i.h (gtm_thread): Assign an asm name to serial_lock.
(htm_fastpath): Assign an asm name.
* libitm.h (_ITM_codeProperties): Add non-ABI flags used by custom
HTM fast paths.
(_ITM_actions): Likewise.
* config/x86/target.h (HTM_CUSTOM_FASTPATH): Enable custom fastpath on
x86_64.
* config/x86/sjlj.S (_ITM_beginTransaction): Add custom HTM fast path.
* config/posix/rwlock.h (gtm_rwlock): Update comments. Move summary
field to the start of the structure.
* config/linux/rwlock.h (gtm_rwlock): Update comments.
* beginend.cc (gtm_thread::begin_transaction): Add retry policy
handling for custom HTM fast paths.
diff --git a/libitm/beginend.cc b/libitm/beginend.cc
index a3bf549..bd7b19e 100644
--- a/libitm/beginend.cc
+++ b/libitm/beginend.cc
@@ -165,7 +165,7 @@ GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
if (unlikely(prop & pr_undoLogCode))
GTM_fatal("pr_undoLogCode not supported");
-#if defined(USE_HTM_FASTPATH) && !defined(HTM_CUSTOM_FASTPATH)
+#ifdef USE_HTM_FASTPATH
// HTM fastpath. Only chosen in the absence of transaction_cancel to allow
// using an uninstrumented code path.
// The fastpath is enabled only by dispatch_htm's method group, which uses
@@ -187,6 +187,7 @@ GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
// indeed in serial mode, and HW transactions should never need serial mode
// for any internal changes (e.g., they never abort visibly to the STM code
// and thus do not trigger the standard retry handling).
+#ifndef HTM_CUSTOM_FASTPATH
if (likely(htm_fastpath && (prop & pr_hasNoAbort)))
{
for (uint32_t t = htm_fastpath; t; t--)
@@ -237,6 +238,49 @@ GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
}
}
}
+#else
+ // If we have a custom HTM fastpath in ITM_beginTransaction, we implement
+ // just the retry policy here. We communicate with the custom fastpath
+ // through additional property bits and return codes, and either transfer
+ // control back to the custom fastpath or run the fallback mechanism. The
+ // fastpath synchronization algorithm itself is the same.
+ // pr_HTMRetryableAbort states that a HW transaction started by the custom
+ // HTM fastpath aborted, and that we thus have to decide whether to retry
+ // the fastpath (returning a_tryHTMFastPath) or just proceed with the
+ // fallback method.
+ if (likely(htm_fastpath && (prop & pr_HTMRetryableAbort)))
+ {
+ tx = gtm_thr();
+ if (unlikely(tx == NULL))
+ {
+ // See below.
+ tx = new gtm_thread();
+ set_gtm_thr(tx);
+ }
+ // If this is the first abort, reset the retry count. We abuse
+ // restart_total for the retry count, which is fine because our only
+ // other fallback will use serial transactions, which don't use
+ // restart_total but will reset it when committing.
+ if (!(prop & pr_HTMRetriedAfterAbort))
+ tx->restart_total = htm_fastpath;
+
+ if (--tx->restart_total > 0)
+ {
+ // Wait until any concurrent serial-mode transactions have finished.
+ // Essentially the same code as above.
+ if (serial_lock.is_write_locked())
+ {
+ if (tx->nesting > 0)
+ goto stop_custom_htm_fastpath;
+ serial_lock.read_lock(tx);
+ serial_lock.read_unlock(tx);
+ }
+ // Let ITM_beginTransaction retry the custom HTM fastpath.
+ return a_tryHTMFastPath;
+ }
+ }
+ stop_custom_htm_fastpath:
+#endif
#endif
tx = gtm_thr();
diff --git a/libitm/config/linux/rwlock.h b/libitm/config/linux/rwlock.h
index 428299f..c761edf 100644
--- a/libitm/config/linux/rwlock.h
+++ b/libitm/config/linux/rwlock.h
@@ -39,6 +39,11 @@ struct gtm_thread;
//
// In this implementation, writers are given highest priority access but
// read-to-write upgrades do not have a higher priority than writers.
+//
+// Do not change the layout of this class; it must remain a POD type with
+// standard layout, and the WRITERS field must be first (i.e., so the
+// assembler code can assume that its address is equal to the address of the
+// respective instance of the class).
class gtm_rwlock
{
diff --git a/libitm/config/posix/rwlock.h b/libitm/config/posix/rwlock.h
index 2e41528..b2fd517 100644
--- a/libitm/config/posix/rwlock.h
+++ b/libitm/config/posix/rwlock.h
@@ -44,19 +44,25 @@ struct gtm_thread;
//
// In this implementation, writers are given highest priority access but
// read-to-write upgrades do not have a higher priority than writers.
+//
+// Do not change the layout of this class; it must remain a POD type with
+// standard layout, and the SUMMARY field must be first (i.e., so the
+// assembler code can assume that its address is equal to the address of the
+// respective instance of the class).
class gtm_rwlock
{
- pthread_mutex_t mutex; // Held if manipulating any field.
- pthread_cond_t c_readers; // Readers wait here
- pthread_cond_t c_writers; // Writers wait here for writers
- pthread_cond_t c_confirmed_writers; // Writers wait here for readers
-
static const unsigned a_writer = 1; // An active writer.
static const unsigned w_writer = 2; // The w_writers field != 0
static const unsigned w_reader = 4; // The w_readers field != 0
std::atomic<unsigned int> summary; // Bitmask of the above.
+
+ pthread_mutex_t mutex; // Held if manipulating any field.
+ pthread_cond_t c_readers; // Readers wait here
+ pthread_cond_t c_writers; // Writers wait here for writers
+ pthread_cond_t c_confirmed_writers; // Writers wait here for readers
+
unsigned int a_readers; // Nr active readers as observed by a writer
unsigned int w_readers; // Nr waiting readers
unsigned int w_writers; // Nr waiting writers
diff --git a/libitm/config/x86/sjlj.S b/libitm/config/x86/sjlj.S
index 993f698..2700180 100644
--- a/libitm/config/x86/sjlj.S
+++ b/libitm/config/x86/sjlj.S
@@ -24,6 +24,7 @@
#include "asmcfi.h"
+#include "config.h"
#define CONCAT1(a, b) CONCAT2(a, b)
#define CONCAT2(a, b) a ## b
@@ -52,6 +53,18 @@
# endif
#endif
+/* These are duplicates of the canonical definitions in libitm.h. */
+#define pr_uninstrumentedCode 0x02
+#define pr_hasNoAbort 0x08
+#define pr_HTMRetryableAbort 0x800000
+#define pr_HTMRetriedAfterAbort 0x1000000
+#define a_runInstrumentedCode 0x01
+#define a_runUninstrumentedCode 0x02
+#define a_tryHTMFastPath 0x20
+
+#define _XABORT_EXPLICIT (1 << 0)
+#define _XABORT_RETRY (1 << 1)
+
.text
.align 4
@@ -60,9 +73,51 @@
SYM(_ITM_beginTransaction):
cfi_startproc
#ifdef __x86_64__
+#ifdef HAVE_AS_RTM
+ /* Custom HTM fast path. We start the HW transaction here and let
+ gtm_thread::begin_transaction (aka GTM_begin_transaction) decide
+ how to proceed on aborts: We either retry the fast path, or fall
+ back to another execution method. RTM restores all registers after
+ a HW transaction abort, so we can do the SW setjmp after aborts,
+ and we have to because we might choose a SW fall back. However,
+ we have to explicitly save/restore the first argument (edi). */
+ cmpl $0, SYM(gtm_htm_fastpath)(%rip)
+ jz no_htm
+ testl $pr_hasNoAbort, %edi
+ jz no_htm
+htm_fastpath:
+ xbegin txn_abort
+ /* Monitor the serial lock (specifically, the 32b writer/summary field
+ at its start), and only continue if there is no serial-mode
+ transaction. Note that we might be just a nested transaction and
+ our outermost transaction might be in serial mode; we check for
+ this case in the retry policy implementation. */
+ cmpl $0, SYM(gtm_serial_lock)(%rip)
+ jnz 1f
+ /* Everything is good. Run the transaction, preferably using the
+ uninstrumented code path. */
+ testl $pr_uninstrumentedCode, %edi
+ mov $a_runInstrumentedCode, %eax
+ mov $a_runUninstrumentedCode, %ecx
+ cmovnz %ecx, %eax
+ ret
+ /* There is a serial-mode transaction, so abort (see htm_abort()
+ regarding the abort code). */
+1: xabort $0xff
+txn_abort:
+ /* If it might make sense to retry the HTM fast path, let the C++
+ code decide. */
+ testl $(_XABORT_RETRY|_XABORT_EXPLICIT), %eax
+ jz no_htm
+ /* Store edi for future HTM fast path retries. */
+ movl %edi, -8(%rsp)
+ orl $pr_HTMRetryableAbort, %edi
+ /* Let the C++ code handle the retry policy. */
+no_htm:
+#endif
leaq 8(%rsp), %rax
- subq $56, %rsp
- cfi_def_cfa_offset(64)
+ subq $64, %rsp
+ cfi_def_cfa_offset(72)
movq %rax, (%rsp)
movq %rbx, 8(%rsp)
movq %rbp, 16(%rsp)
@@ -72,8 +127,21 @@ SYM(_ITM_beginTransaction):
movq %r15, 48(%rsp)
movq %rsp, %rsi
call SYM(GTM_begin_transaction)
- addq $56, %rsp
+ addq $64, %rsp
cfi_def_cfa_offset(8)
+#ifdef HAVE_AS_RTM
+ /* If a_tryHTMFastPath was returned, then we need to retry the
+ fast path. We also restore edi and set pr_HTMRetriedAfterAbort
+ to state that we have retried the fast path already (it's harmless
+ if this bit is set even if we don't retry the fast path because it
+ is checked iff pr_HTMRetryableAbort is set). */
+ cmpl $a_tryHTMFastPath, %eax
+ jnz 2f
+ movl -8(%rsp), %edi
+ orl $pr_HTMRetriedAfterAbort, %edi
+ jmp htm_fastpath
+2:
+#endif
#else
leal 4(%esp), %ecx
movl 4(%esp), %eax
diff --git a/libitm/config/x86/target.h b/libitm/config/x86/target.h
index 063c09e..65efb31 100644
--- a/libitm/config/x86/target.h
+++ b/libitm/config/x86/target.h
@@ -70,6 +70,10 @@ cpu_relax (void)
// See gtm_thread::begin_transaction for how these functions are used.
#ifdef HAVE_AS_RTM
#define USE_HTM_FASTPATH
+#ifdef __x86_64__
+// Use the custom fastpath in ITM_beginTransaction.
+#define HTM_CUSTOM_FASTPATH
+#endif
static inline bool
htm_available ()
diff --git a/libitm/libitm.h b/libitm/libitm.h
index 20ac59a..4bee0a2 100644
--- a/libitm/libitm.h
+++ b/libitm/libitm.h
@@ -72,7 +72,9 @@ typedef enum
inIrrevocableTransaction
} _ITM_howExecuting;
-/* Values to describe properties of code, passed in to beginTransaction */
+/* Values to describe properties of code, passed in to beginTransaction.
+ Some of these constants are duplicated in some of the ITM_beginTransaction
+ implementations, so update those too when applying any changes. */
typedef enum
{
pr_instrumentedCode = 0x0001,
@@ -95,10 +97,16 @@ typedef enum
pr_exceptionBlock = 0x1000,
pr_hasElse = 0x2000,
pr_readOnly = 0x4000,
- pr_hasNoSimpleReads = 0x400000
+ pr_hasNoSimpleReads = 0x400000,
+ /* These are not part of the ABI but used for custom HTM fast paths. See
+ ITM_beginTransaction and gtm_thread::begin_transaction. */
+ pr_HTMRetryableAbort = 0x800000,
+ pr_HTMRetriedAfterAbort = 0x1000000
} _ITM_codeProperties;
-/* Result from startTransaction that describes what actions to take. */
+/* Result from startTransaction that describes what actions to take.
+ Some of these constants are duplicated in some of the ITM_beginTransaction
+ implementations, so update those too when applying any changes. */
typedef enum
{
a_runInstrumentedCode = 0x01,
@@ -106,6 +114,7 @@ typedef enum
a_saveLiveVariables = 0x04,
a_restoreLiveVariables = 0x08,
a_abortTransaction = 0x10,
+ a_tryHTMFastPath = 0x20
} _ITM_actions;
typedef struct
diff --git a/libitm/libitm_i.h b/libitm/libitm_i.h
index 421b349..e4162a6 100644
--- a/libitm/libitm_i.h
+++ b/libitm/libitm_i.h
@@ -230,6 +230,7 @@ struct gtm_thread
// be used for the next iteration of the transaction.
// Only restart_total is reset to zero when the transaction commits, the
// other counters are total values for all previously executed transactions.
+ // restart_total is also used by the HTM fastpath in a different way.
uint32_t restart_reason[NUM_RESTARTS];
uint32_t restart_total;
@@ -247,7 +248,17 @@ struct gtm_thread
// The lock that provides access to serial mode. Non-serialized
// transactions acquire read locks; a serialized transaction aquires
// a write lock.
- static gtm_rwlock serial_lock;
+ // Accessed from assembly language, thus the "asm" specifier on
+ // the name, avoiding complex name mangling.
+#ifdef __USER_LABEL_PREFIX__
+#define UPFX1(t) UPFX(t)
+#define UPFX(t) #t
+ static gtm_rwlock serial_lock
+ __asm__(UPFX1(__USER_LABEL_PREFIX__) "gtm_serial_lock");
+#else
+ static gtm_rwlock serial_lock
+ __asm__("gtm_serial_lock");
+#endif
// The head of the list of all threads' transactions.
static gtm_thread *list_of_threads;
@@ -278,8 +289,6 @@ struct gtm_thread
// Invoked from assembly language, thus the "asm" specifier on
// the name, avoiding complex name mangling.
#ifdef __USER_LABEL_PREFIX__
-#define UPFX1(t) UPFX(t)
-#define UPFX(t) #t
static uint32_t begin_transaction(uint32_t, const gtm_jmpbuf *)
__asm__(UPFX1(__USER_LABEL_PREFIX__) "GTM_begin_transaction") ITM_REGPARM;
#else
@@ -338,7 +347,15 @@ extern gtm_cacheline_mask gtm_mask_stack(gtm_cacheline *, gtm_cacheline_mask);
// Control variable for the HTM fastpath that uses serial mode as fallback.
// Non-zero if the HTM fastpath is enabled. See gtm_thread::begin_transaction.
-extern uint32_t htm_fastpath;
+// Accessed from assembly language, thus the "asm" specifier on
+// the name, avoiding complex name mangling.
+#ifdef __USER_LABEL_PREFIX__
+extern uint32_t htm_fastpath
+ __asm__(UPFX1(__USER_LABEL_PREFIX__) "gtm_htm_fastpath");
+#else
+extern uint32_t htm_fastpath
+ __asm__("gtm_htm_fastpath");
+#endif
} // namespace GTM
More information about the Gcc-patches
mailing list