[cxx-mem-model 2/3] i386: Implement atomic_load/store.

Richard Henderson <rth@redhat.com>
Thu Nov 3 08:23:00 GMT 2011


        * config/i386/i386.md (UNSPEC_MOVA): New.
        * config/i386/sync.md (ATOMIC): New mode iterator.
        (atomic_load<ATOMIC>, atomic_store<ATOMIC>): New.
        (atomic_loaddi_fpu, atomic_storedi_fpu, movdi_via_fpu): New.
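
For context only (not part of the patch): these expanders back the
__atomic built-ins for naturally aligned objects up to word size, plus
aligned 64-bit objects on 32-bit targets that have cmpxchg8b and either
the x87 or SSE, per the ATOMIC iterator below.  A minimal sketch of source
that exercises the new DImode path when compiled with, e.g.,
-m32 -march=i686 (file and symbol names are illustrative only):

  /* atomic64.c -- illustrative only.  */
  #include <stdint.h>

  static uint64_t counter __attribute__ ((aligned (8)));

  uint64_t
  load_counter (void)
  {
    /* Expands via atomic_loaddi_fpu on !TARGET_64BIT: one 64-bit
       fild/fistp (or SSE) bounce through a stack slot instead of a
       cmpxchg8b loop.  */
    return __atomic_load_n (&counter, __ATOMIC_ACQUIRE);
  }

  void
  store_counter (uint64_t v)
  {
    /* Likewise expands via atomic_storedi_fpu; the seq-cst store is
       followed by a full memory barrier.  */
    __atomic_store_n (&counter, v, __ATOMIC_SEQ_CST);
  }
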
---
 gcc/ChangeLog.mm        |    7 ++
 gcc/config/i386/i386.md |    3 +
 gcc/config/i386/sync.md |  163 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 173 insertions(+), 0 deletions(-)
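
Also for illustration (not part of the patch): for the narrower modes the
expanders reduce to plain moves, with a sequentially consistent store
followed by mfence where available, or turned into an implicitly locked
xchg where it is not.  A sketch (names illustrative only):

  /* store32.c -- illustrative only.  */
  #include <stdint.h>

  static uint32_t flag;

  void
  store_relaxed (uint32_t v)
  {
    /* A plain mov suffices; x86 does not reorder a store with
       earlier stores.  */
    __atomic_store_n (&flag, v, __ATOMIC_RELAXED);
  }

  void
  store_seq_cst (uint32_t v)
  {
    /* mov followed by mfence on SSE2 targets; without SSE2 the
       expander uses atomic_exchange (xchg) instead.  */
    __atomic_store_n (&flag, v, __ATOMIC_SEQ_CST);
  }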

diff --git a/gcc/ChangeLog.mm b/gcc/ChangeLog.mm
index 4454e93..24137ab 100644
--- a/gcc/ChangeLog.mm
+++ b/gcc/ChangeLog.mm
@@ -1,5 +1,12 @@
 2011-11-02  Richard Henderson  <rth@redhat.com>
 
+	* config/i386/i386.md (UNSPEC_MOVA): New.
+	* config/i386/sync.md (ATOMIC): New mode iterator.
+	(atomic_load<ATOMIC>, atomic_store<ATOMIC>): New.
+	(atomic_loaddi_fpu, atomic_storedi_fpu, movdi_via_fpu): New.
+
+2011-11-02  Richard Henderson  <rth@redhat.com>
+
 	* optabs.c (expand_atomic_load): Don't try compare-and-swap.
 
 2011-11-02  Aldy Hernandez  <aldyh@redhat.com>
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index f11998c..d073cb9 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -248,6 +248,9 @@
   ;; For BMI2 support
   UNSPEC_PDEP
   UNSPEC_PEXT
+
+  ;; For __atomic support
+  UNSPEC_MOVA
 ])
 
 (define_c_enum "unspecv" [
diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md
index e5579b1..0ff1712 100644
--- a/gcc/config/i386/sync.md
+++ b/gcc/config/i386/sync.md
@@ -46,6 +46,169 @@
   "lock{%;} or{l}\t{$0, (%%esp)|DWORD PTR [esp], 0}"
   [(set_attr "memory" "unknown")])
 
+;; ??? From volume 3 section 7.1.1 Guaranteed Atomic Operations,
+;; Only beginning at Pentium family processors do we get any guarantee of
+;; atomicity in aligned 64-bit quantities.  Beginning at P6, we get a
+;; guarantee for 64-bit accesses that do not cross a cacheline boundary.
+;;
+;; Note that the TARGET_CMPXCHG8B test below is a stand-in for "Pentium".
+;;
+;; Importantly, *no* processor makes atomicity guarantees for larger
+;; accesses.  In particular, there's no way to perform an atomic TImode
+;; move, despite the apparent applicability of MOVDQA et al.
+
+(define_mode_iterator ATOMIC
+   [QI HI SI
+    (DI "TARGET_64BIT || (TARGET_CMPXCHG8B && (TARGET_80387 || TARGET_SSE))")
+   ])
+
+(define_expand "atomic_load<mode>"
+  [(set (match_operand:ATOMIC 0 "register_operand" "")
+	(unspec:ATOMIC [(match_operand:ATOMIC 1 "memory_operand" "")
+			(match_operand:SI 2 "const_int_operand" "")]
+		       UNSPEC_MOVA))]
+  ""
+{
+  /* For DImode on 32-bit, we can use the FPU to perform the load.  */
+  if (<MODE>mode == DImode && !TARGET_64BIT)
+    emit_insn (gen_atomic_loaddi_fpu
+	       (operands[0], operands[1],
+	        assign_386_stack_local (DImode,
+					(virtuals_instantiated
+					 ? SLOT_TEMP : SLOT_VIRTUAL))));
+  else
+    emit_move_insn (operands[0], operands[1]);
+  DONE;
+})
+
+(define_insn_and_split "atomic_loaddi_fpu"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=x,m,?r")
+	(unspec:DI [(match_operand:DI 1 "memory_operand" "m,m,m")]
+		   UNSPEC_MOVA))
+   (clobber (match_operand:DI 2 "memory_operand" "=X,X,m"))
+   (clobber (match_scratch:DF 3 "=X,xf,xf"))]
+  "!TARGET_64BIT && (TARGET_80387 || TARGET_SSE)"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rtx dst = operands[0], src = operands[1];
+  rtx mem = operands[2], tmp = operands[3];
+
+  if (SSE_REG_P (dst))
+    emit_move_insn (dst, src);
+  else
+    {
+      if (MEM_P (dst))
+	mem = dst;
+
+      if (FP_REG_P (tmp))
+	emit_insn (gen_movdi_via_fpu (mem, src, tmp));
+      else
+	{
+	  adjust_reg_mode (tmp, DImode);
+	  emit_move_insn (tmp, src);
+	  emit_move_insn (mem, tmp);
+	}
+
+      if (mem != dst)
+	emit_move_insn (dst, mem);
+    }
+  DONE;
+})
+
+(define_expand "atomic_store<mode>"
+  [(set (match_operand:ATOMIC 0 "memory_operand" "")
+	(unspec:ATOMIC [(match_operand:ATOMIC 1 "register_operand" "")
+			(match_operand:SI 2 "const_int_operand" "")]
+		       UNSPEC_MOVA))]
+  ""
+{
+  enum memmodel model = (enum memmodel) INTVAL (operands[2]);
+
+  if (<MODE>mode == DImode && !TARGET_64BIT)
+    {
+      /* For DImode on 32-bit, we can use the FPU to perform the store.  */
+      /* Note that while we could perform a cmpxchg8b loop, that turns
+	 out to be significantly larger than this plus a barrier.  */
+      emit_insn (gen_atomic_storedi_fpu
+		 (operands[0], operands[1],
+	          assign_386_stack_local (DImode,
+					  (virtuals_instantiated
+					   ? SLOT_TEMP : SLOT_VIRTUAL))));
+    }
+  else
+    {
+      /* For seq-cst stores, when we lack MFENCE, use XCHG.  */
+      if (model == MEMMODEL_SEQ_CST && !(TARGET_64BIT || TARGET_SSE2))
+	{
+	  emit_insn (gen_atomic_exchange<mode> (gen_reg_rtx (<MODE>mode),
+						operands[0], operands[1],
+						operands[2]));
+	  DONE;
+	}
+
+      /* Otherwise use a normal store.  */
+      emit_move_insn (operands[0], operands[1]);
+    }
+  /* ... followed by an MFENCE, if required.  */
+  if (model == MEMMODEL_SEQ_CST)
+    emit_insn (gen_mem_thread_fence (operands[2]));
+  DONE;
+})
+
+(define_insn_and_split "atomic_storedi_fpu"
+  [(set (match_operand:DI 0 "memory_operand" "=m,m,m")
+	(unspec:DI [(match_operand:DI 1 "register_operand" "x,m,?r")]
+		   UNSPEC_MOVA))
+   (clobber (match_operand:DI 2 "memory_operand" "=X,X,m"))
+   (clobber (match_scratch:DF 3 "=X,xf,xf"))]
+  "!TARGET_64BIT && (TARGET_80387 || TARGET_SSE)"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rtx dst = operands[0], src = operands[1];
+  rtx mem = operands[2], tmp = operands[3];
+
+  if (!SSE_REG_P (src))
+    {
+      if (REG_P (src))
+	{
+	  emit_move_insn (mem, src);
+	  src = mem;
+	}
+
+      if (FP_REG_P (tmp))
+	{
+	  emit_insn (gen_movdi_via_fpu (dst, src, tmp));
+	  DONE;
+	}
+      else
+	{
+	  adjust_reg_mode (tmp, DImode);
+	  emit_move_insn (tmp, mem);
+	  src = tmp;
+	}
+    }
+  emit_move_insn (dst, src);
+  DONE;
+})
+
+;; ??? You'd think that we'd be able to perform this via FLOAT + FIX_TRUNC
+;; operations.  But the fix_trunc patterns want way more setup than we want
+;; to provide.  Note that the scratch is DFmode instead of XFmode in order
+;; to make it easy to allocate a scratch in either SSE or FP_REGs above.
+(define_insn "movdi_via_fpu"
+  [(set (match_operand:DI 0 "memory_operand" "=m")
+	(unspec:DI [(match_operand:DI 1 "memory_operand" "m")] UNSPEC_MOVA))
+   (clobber (match_operand:DF 2 "register_operand" "=f"))]
+  "TARGET_80387"
+  "fild\t%1\;fistp\t%0"
+  [(set_attr "type" "multi")
+   ;; Worst case based on full sib+offset32 addressing modes
+   (set_attr "length" "14")])
+
 (define_expand "atomic_compare_and_swap<mode>"
   [(match_operand:QI 0 "register_operand" "")		;; bool success output
    (match_operand:SWI124 1 "register_operand" "")	;; oldval output
-- 
1.7.6.4


