[PATCH #2] PR59909, Fix powerpcle64 power8 bootstrap (quad memory support)

Michael Meissner meissner@linux.vnet.ibm.com
Thu Jan 23 23:36:00 GMT 2014


This is my fixed patch that correctly swaps the words for atomic quad memory
accesses in little endian.  It would be nice in the future to eliminate some of
the extra moves for things like compare and add.

I have tested this with a bootstrap (using --with-cpu=power8) on a little
endian power8 system, and I verified the example program works.  I'm running
the rest of the test suite right now.  Assuming the test suite doesn't show
anything wrong, is it ok to commit?

[gcc]
2014-01-23  Michael Meissner  <meissner@linux.vnet.ibm.com>

	PR target/59909
	* doc/invoke.texi (RS/6000 and PowerPC Options): Document
	-mquad-memory-atomic.  Update -mquad-memory documentation to say
	it is only used for non-atomic loads/stores.

	* config/rs6000/predicates.md (quad_int_reg_operand): Allow either
	-mquad-memory or -mquad-memory-atomic switches.

	* config/rs6000/rs6000-cpus.def (ISA_2_7_MASKS_SERVER): Add
	-mquad-memory-atomic to ISA 2.07 support.

	* config/rs6000/rs6000.opt (-mquad-memory-atomic): Add new switch
	to separate support of normal quad word memory operations (ldq,
	stq) from the atomic quad word memory operations.

	* config/rs6000/rs6000.c (rs6000_option_override_internal): Add
	support to separate non-atomic quad word operations from atomic
	quad word operations.  Disable non-atomic quad word operations in
	little endian mode so that we don't have to swap words after the
	load and before the store.
	(quad_load_store_p): Add comment about atomic quad word support.
	(rs6000_opt_masks): Add -mquad-memory-atomic to the list of
	options printed with -mdebug=reg.

	* config/rs6000/rs6000.h (TARGET_SYNC_TI): Use
	-mquad-memory-atomic as the test for whether we have quad word
	atomic instructions.
	(TARGET_SYNC_HI_QI): If either -mquad-memory-atomic,
	-mquad-memory, or -mp8-vector are used, allow byte/half-word
	atomic operations.

	* config/rs6000/sync.md (load_lockedti): Insure that the address
	is a proper indexed or indirect address for the lqarx instruction.
	On little endian systems, swap the hi/lo registers after the lqarx
	instruction.
	(load_lockedpti): Use indexed_or_indirect_operand predicate to
	insure the address is valid for the lqarx instruction.
	(store_conditionalti): Insure that the address is a proper indexed
	or indirect address for the stqcrx. instruction.  On little endian
	systems, swap the hi/lo registers before doing the stqcrx.
	instruction.
	(store_conditionalpti): Use indexed_or_indirect_operand predicate to
	insure the address is valid for the stqcrx. instruction.

	* gcc/config/rs6000/rs6000.md (UNSPEC_TI_PTI_SWAP): New UNSPEC for
	dealing with quad word atomic instructions in little endian mode.

	* gcc/config/rs6000/rs6000-c.c (rs6000_target_modify_macros):
	Define __QUAD_MEMORY__ and __QUAD_MEMORY_ATOMIC__ based on what
	type of quad memory support is available.

[gcc/testsuite]
2014-01-23  Michael Meissner  <meissner@linux.vnet.ibm.com>

	PR target/59909
	* gcc.target/powerpc/quad-atomic.c: New file to test power8 quad
	word atomic functions at runtime.


-- 
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA
email: meissner@linux.vnet.ibm.com, phone: +1 (978) 899-4797
-------------- next part --------------
Index: gcc/doc/invoke.texi
===================================================================
--- gcc/doc/invoke.texi	(revision 206895)
+++ gcc/doc/invoke.texi	(working copy)
@@ -919,6 +919,7 @@ See RS/6000 and PowerPC Options.
 -mpower8-fusion -mno-mpower8-fusion -mpower8-vector -mno-power8-vector @gol
 -mcrypto -mno-crypto -mdirect-move -mno-direct-move @gol
 -mquad-memory -mno-quad-memory @gol
+-mquad-memory-atomic -mno-quad-memory-atomic @gol
 -mcompat-align-parm -mno-compat-align-parm}
 
 @emph{RX Options}
@@ -18853,7 +18854,8 @@ following options:
 -mpopcntb -mpopcntd  -mpowerpc64 @gol
 -mpowerpc-gpopt  -mpowerpc-gfxopt  -msingle-float -mdouble-float @gol
 -msimple-fpu -mstring  -mmulhw  -mdlmzb  -mmfpgpr -mvsx @gol
--mcrypto -mdirect-move -mpower8-fusion -mpower8-vector -mquad-memory}
+-mcrypto -mdirect-move -mpower8-fusion -mpower8-vector @gol
+-mquad-memory -mquad-memory-atomic}
 
 The particular options set for any particular CPU varies between
 compiler versions, depending on what setting seems to produce optimal
@@ -19040,10 +19042,18 @@ the vector instructions.
 @itemx -mno-quad-memory
 @opindex mquad-memory
 @opindex mno-quad-memory
-Generate code that uses (does not use) the quad word memory
+Generate code that uses (does not use) the non-atomic quad word memory
 instructions.  The @option{-mquad-memory} option requires use of
 64-bit mode.
 
+@item -mquad-memory-atomic
+@itemx -mno-quad-memory-atomic
+@opindex mquad-memory-atomic
+@opindex mno-quad-memory-atomic
+Generate code that uses (does not use) the atomic quad word memory
+instructions.  The @option{-mquad-memory-atomic} option requires use of
+64-bit mode.
+
 @item -mfloat-gprs=@var{yes/single/double/no}
 @itemx -mfloat-gprs
 @opindex mfloat-gprs
Index: gcc/config/rs6000/predicates.md
===================================================================
--- gcc/config/rs6000/predicates.md	(revision 206895)
+++ gcc/config/rs6000/predicates.md	(working copy)
@@ -270,7 +270,7 @@ (define_predicate "quad_int_reg_operand"
 {
   HOST_WIDE_INT r;
 
-  if (!TARGET_QUAD_MEMORY)
+  if (!TARGET_QUAD_MEMORY && !TARGET_QUAD_MEMORY_ATOMIC)
     return 0;
 
   if (GET_CODE (op) == SUBREG)
@@ -624,6 +624,7 @@ (define_predicate "offsettable_mem_opera
        (match_test "offsettable_nonstrict_memref_p (op)")))
 
 ;; Return 1 if the operand is suitable for load/store quad memory.
+;; This predicate only checks for non-atomic loads/stores.
 (define_predicate "quad_memory_operand"
   (match_code "mem")
 {
Index: gcc/config/rs6000/rs6000-cpus.def
===================================================================
--- gcc/config/rs6000/rs6000-cpus.def	(revision 206895)
+++ gcc/config/rs6000/rs6000-cpus.def	(working copy)
@@ -53,7 +53,8 @@
 				 | OPTION_MASK_CRYPTO			\
 				 | OPTION_MASK_DIRECT_MOVE		\
 				 | OPTION_MASK_HTM			\
-				 | OPTION_MASK_QUAD_MEMORY)
+				 | OPTION_MASK_QUAD_MEMORY		\
+  				 | OPTION_MASK_QUAD_MEMORY_ATOMIC)
 
 #define POWERPC_7400_MASK	(OPTION_MASK_PPC_GFXOPT | OPTION_MASK_ALTIVEC)
 
Index: gcc/config/rs6000/rs6000.opt
===================================================================
--- gcc/config/rs6000/rs6000.opt	(revision 206895)
+++ gcc/config/rs6000/rs6000.opt	(working copy)
@@ -571,7 +571,11 @@ Use ISA 2.07 transactional memory (HTM) 
 
 mquad-memory
 Target Report Mask(QUAD_MEMORY) Var(rs6000_isa_flags)
-Generate the quad word memory instructions (lq/stq/lqarx/stqcx).
+Generate the quad word memory instructions (lq/stq).
+
+mquad-memory-atomic
+Target Report Mask(QUAD_MEMORY_ATOMIC) Var(rs6000_isa_flags)
+Generate the quad word memory atomic instructions (lqarx/stqcx).
 
 mcompat-align-parm
 Target Report Var(rs6000_compat_align_parm) Init(0) Save
Index: gcc/config/rs6000/rs6000-c.c
===================================================================
--- gcc/config/rs6000/rs6000-c.c	(revision 206895)
+++ gcc/config/rs6000/rs6000-c.c	(working copy)
@@ -339,6 +339,10 @@ rs6000_target_modify_macros (bool define
     rs6000_define_or_undefine_macro (define_p, "__HTM__");
   if ((flags & OPTION_MASK_P8_VECTOR) != 0)
     rs6000_define_or_undefine_macro (define_p, "__POWER8_VECTOR__");
+  if ((flags & OPTION_MASK_QUAD_MEMORY) != 0)
+    rs6000_define_or_undefine_macro (define_p, "__QUAD_MEMORY__");
+  if ((flags & OPTION_MASK_QUAD_MEMORY_ATOMIC) != 0)
+    rs6000_define_or_undefine_macro (define_p, "__QUAD_MEMORY_ATOMIC__");
   if ((flags & OPTION_MASK_CRYPTO) != 0)
     rs6000_define_or_undefine_macro (define_p, "__CRYPTO__");
 
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c	(revision 206895)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -3356,14 +3356,37 @@ rs6000_option_override_internal (bool gl
 
   /* The quad memory instructions only works in 64-bit mode. In 32-bit mode,
      silently turn off quad memory mode.  */
-  if (TARGET_QUAD_MEMORY && !TARGET_POWERPC64)
+  if ((TARGET_QUAD_MEMORY || TARGET_QUAD_MEMORY_ATOMIC) && !TARGET_POWERPC64)
     {
       if ((rs6000_isa_flags_explicit & OPTION_MASK_QUAD_MEMORY) != 0)
 	warning (0, N_("-mquad-memory requires 64-bit mode"));
 
+      if ((rs6000_isa_flags_explicit & OPTION_MASK_QUAD_MEMORY_ATOMIC) != 0)
+	warning (0, N_("-mquad-memory-atomic requires 64-bit mode"));
+
+      rs6000_isa_flags &= ~(OPTION_MASK_QUAD_MEMORY
+			    | OPTION_MASK_QUAD_MEMORY_ATOMIC);
+    }
+
+  /* Non-atomic quad memory load/store are disabled for little endian, since
+     the words are reversed, but atomic operations can still be done by
+     swapping the words.  */
+  if (TARGET_QUAD_MEMORY && !WORDS_BIG_ENDIAN)
+    {
+      if ((rs6000_isa_flags_explicit & OPTION_MASK_QUAD_MEMORY) != 0)
+	warning (0, N_("-mquad-memory is not available in little endian mode"));
+
       rs6000_isa_flags &= ~OPTION_MASK_QUAD_MEMORY;
     }
 
+  /* Assume if the user asked for normal quad memory instructions, they want
+     the atomic versions as well, unless they explicity told us not to use quad
+     word atomic instructions.  */
+  if (TARGET_QUAD_MEMORY
+      && !TARGET_QUAD_MEMORY_ATOMIC
+      && ((rs6000_isa_flags_explicit & OPTION_MASK_QUAD_MEMORY_ATOMIC) == 0))
+    rs6000_isa_flags |= OPTION_MASK_QUAD_MEMORY_ATOMIC;
+
   /* Enable power8 fusion if we are tuning for power8, even if we aren't
      generating power8 instructions.  */
   if (!(rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION))
@@ -5939,7 +5962,8 @@ direct_move_p (rtx op0, rtx op1)
   return false;
 }
 
-/* Return true if this is a load or store quad operation.  */
+/* Return true if this is a load or store quad operation.  This function does
+   not handle the atomic quad memory instructions.  */
 
 bool
 quad_load_store_p (rtx op0, rtx op1)
@@ -30753,6 +30777,7 @@ static struct rs6000_opt_mask const rs60
   { "powerpc-gfxopt",		OPTION_MASK_PPC_GFXOPT,		false, true  },
   { "powerpc-gpopt",		OPTION_MASK_PPC_GPOPT,		false, true  },
   { "quad-memory",		OPTION_MASK_QUAD_MEMORY,	false, true  },
+  { "quad-memory-atomic",	OPTION_MASK_QUAD_MEMORY_ATOMIC,	false, true  },
   { "recip-precision",		OPTION_MASK_RECIP_PRECISION,	false, true  },
   { "string",			OPTION_MASK_STRING,		false, true  },
   { "update",			OPTION_MASK_NO_UPDATE,		true , true  },
Index: gcc/config/rs6000/rs6000.h
===================================================================
--- gcc/config/rs6000/rs6000.h	(revision 206895)
+++ gcc/config/rs6000/rs6000.h	(working copy)
@@ -533,8 +533,11 @@ extern int rs6000_vector_align[];
 /* Byte/char syncs were added as phased in for ISA 2.06B, but are not present
    in power7, so conditionalize them on p8 features.  TImode syncs need quad
    memory support.  */
-#define TARGET_SYNC_HI_QI	(TARGET_QUAD_MEMORY || TARGET_DIRECT_MOVE)
-#define TARGET_SYNC_TI		TARGET_QUAD_MEMORY
+#define TARGET_SYNC_HI_QI	(TARGET_QUAD_MEMORY			\
+				 || TARGET_QUAD_MEMORY_ATOMIC		\
+				 || TARGET_DIRECT_MOVE)
+
+#define TARGET_SYNC_TI		TARGET_QUAD_MEMORY_ATOMIC
 
 /* Power7 has both 32-bit load and store integer for the FPRs, so we don't need
    to allocate the SDmode stack slot to get the value into the proper location
Index: gcc/config/rs6000/sync.md
===================================================================
--- gcc/config/rs6000/sync.md	(revision 206895)
+++ gcc/config/rs6000/sync.md	(working copy)
@@ -204,25 +204,46 @@ (define_insn "load_locked<QHI:mode>_si"
   "<QHI:larx> %0,%y1"
   [(set_attr "type" "load_l")])
 
-;; Use PTImode to get even/odd register pairs
+;; Use PTImode to get even/odd register pairs.
+;; Use a temporary register to force getting an even register for the
+;; lqarx/stqcrx. instructions.  Normal optimizations will eliminate this extra
+;; copy on big endian systems.
+
+;; On little endian systems where non-atomic quad word load/store instructions
+;; are not used, the address can be register+offset, so make sure the address
+;; is indexed or indirect before register allocation.
+
 (define_expand "load_lockedti"
   [(use (match_operand:TI 0 "quad_int_reg_operand" ""))
    (use (match_operand:TI 1 "memory_operand" ""))]
   "TARGET_SYNC_TI"
 {
-  /* Use a temporary register to force getting an even register for the
-     lqarx/stqcrx. instructions.  Normal optimizations will eliminate this
-     extra copy.  */
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
   rtx pti = gen_reg_rtx (PTImode);
-  emit_insn (gen_load_lockedpti (pti, operands[1]));
-  emit_move_insn (operands[0], gen_lowpart (TImode, pti));
+
+  if (!indexed_or_indirect_operand (op1, TImode))
+    {
+      rtx old_addr = XEXP (op1, 0);
+      rtx new_addr = force_reg (Pmode, old_addr);
+      operands[1] = op1 = change_address (op1, TImode, new_addr);
+    }
+
+  emit_insn (gen_load_lockedpti (pti, op1));
+  if (WORDS_BIG_ENDIAN)
+    emit_move_insn (op0, gen_lowpart (TImode, pti));
+  else
+    {
+      emit_move_insn (gen_lowpart (DImode, op0), gen_highpart (DImode, pti));
+      emit_move_insn (gen_highpart (DImode, op0), gen_lowpart (DImode, pti));
+    }
   DONE;
 })
 
 (define_insn "load_lockedpti"
   [(set (match_operand:PTI 0 "quad_int_reg_operand" "=&r")
 	(unspec_volatile:PTI
-         [(match_operand:TI 1 "memory_operand" "Z")] UNSPECV_LL))]
+         [(match_operand:TI 1 "indexed_or_indirect_operand" "Z")] UNSPECV_LL))]
   "TARGET_SYNC_TI
    && !reg_mentioned_p (operands[0], operands[1])
    && quad_int_reg_operand (operands[0], PTImode)"
@@ -238,6 +259,14 @@ (define_insn "store_conditional<mode>"
   "<stcx> %2,%y1"
   [(set_attr "type" "store_c")])
 
+;; Use a temporary register to force getting an even register for the
+;; lqarx/stqcrx. instructions.  Normal optimizations will eliminate this extra
+;; copy on big endian systems.
+
+;; On little endian systems where non-atomic quad word load/store instructions
+;; are not used, the address can be register+offset, so make sure the address
+;; is indexed or indirect before register allocation.
+
 (define_expand "store_conditionalti"
   [(use (match_operand:CC 0 "cc_reg_operand" ""))
    (use (match_operand:TI 1 "memory_operand" ""))
@@ -247,21 +276,36 @@ (define_expand "store_conditionalti"
   rtx op0 = operands[0];
   rtx op1 = operands[1];
   rtx op2 = operands[2];
-  rtx pti_op1 = change_address (op1, PTImode, XEXP (op1, 0));
-  rtx pti_op2 = gen_reg_rtx (PTImode);
+  rtx addr = XEXP (op1, 0);
+  rtx pti_mem;
+  rtx pti_reg;
+
+  if (!indexed_or_indirect_operand (op1, TImode))
+    {
+      rtx new_addr = force_reg (Pmode, addr);
+      operands[1] = op1 = change_address (op1, TImode, new_addr);
+      addr = new_addr;
+    }
+
+  pti_mem = change_address (op1, PTImode, addr);
+  pti_reg = gen_reg_rtx (PTImode);
+
+  if (WORDS_BIG_ENDIAN)
+    emit_move_insn (pti_reg, gen_lowpart (PTImode, op2));
+  else
+    {
+      emit_move_insn (gen_lowpart (DImode, pti_reg), gen_highpart (DImode, op2));
+      emit_move_insn (gen_highpart (DImode, pti_reg), gen_lowpart (DImode, op2));
+    }
 
-  /* Use a temporary register to force getting an even register for the
-     lqarx/stqcrx. instructions.  Normal optimizations will eliminate this
-     extra copy.  */
-  emit_move_insn (pti_op2, gen_lowpart (PTImode, op2));
-  emit_insn (gen_store_conditionalpti (op0, pti_op1, pti_op2));
+  emit_insn (gen_store_conditionalpti (op0, pti_mem, pti_reg));
   DONE;
 })
 
 (define_insn "store_conditionalpti"
   [(set (match_operand:CC 0 "cc_reg_operand" "=x")
 	(unspec_volatile:CC [(const_int 0)] UNSPECV_SC))
-   (set (match_operand:PTI 1 "memory_operand" "=Z")
+   (set (match_operand:PTI 1 "indexed_or_indirect_operand" "=Z")
 	(match_operand:PTI 2 "quad_int_reg_operand" "r"))]
   "TARGET_SYNC_TI && quad_int_reg_operand (operands[2], PTImode)"
   "stqcx. %2,%y1"
Index: gcc/config/rs6000/rs6000.md
===================================================================
--- gcc/config/rs6000/rs6000.md	(revision 206895)
+++ gcc/config/rs6000/rs6000.md	(working copy)
@@ -125,6 +125,7 @@ (define_c_enum "unspec"
    UNSPEC_P8V_MTVSRD
    UNSPEC_P8V_XXPERMDI
    UNSPEC_P8V_RELOAD_FROM_VSX
+   UNSPEC_TI_PTI_SWAP
   ])
 
 ;;
Index: gcc/testsuite/gcc.target/powerpc/quad-atomic.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/quad-atomic.c	(revision 0)
+++ gcc/testsuite/gcc.target/powerpc/quad-atomic.c	(revision 0)
@@ -0,0 +1,67 @@
+/* { dg-do run { target { powerpc*-*-linux* && lp64 } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-skip-if "" { powerpc*-*-*spe* } { "*" } { "" } } */
+/* { dg-require-effective-target p8vector_hw } */
+/* { dg-options "-mcpu=power8 -O2" } */
+
+/* Test whether we get the right bits for quad word atomic instructions.  */
+#include <stdlib.h>
+
+static __int128_t quad_fetch_and (__int128_t *, __int128_t value) __attribute__((__noinline__));
+static __int128_t quad_fetch_or  (__int128_t *, __int128_t value) __attribute__((__noinline__));
+static __int128_t quad_fetch_add (__int128_t *, __int128_t value) __attribute__((__noinline__));
+
+static __int128_t
+quad_fetch_and (__int128_t *ptr, __int128_t value)
+{
+  return __atomic_fetch_and (ptr, value, __ATOMIC_ACQUIRE);
+}
+
+static __int128_t
+quad_fetch_or (__int128_t *ptr, __int128_t value)
+{
+  return __atomic_fetch_or (ptr, value, __ATOMIC_ACQUIRE);
+}
+
+static __int128_t
+quad_fetch_add (__int128_t *ptr, __int128_t value)
+{
+  return __atomic_fetch_add (ptr, value, __ATOMIC_ACQUIRE);
+}
+
+int
+main (void)
+{
+  __int128_t result;
+  __int128_t value;
+  __int128_t and_input	= ((((__int128_t) 0x1234567890abcdefULL) << 64) | ((__int128_t) 0xfedcba0987654321ULL));
+  __int128_t and_value	= ((((__int128_t) 0xfffffffffffffff0ULL) << 64) | ((__int128_t) 0xfffffffffffffff0ULL));
+  __int128_t and_exp	= ((((__int128_t) 0x1234567890abcde0ULL) << 64) | ((__int128_t) 0xfedcba0987654320ULL));
+
+  __int128_t or_input	= ((((__int128_t) 0x1234567890abcdefULL) << 64) | ((__int128_t) 0xfedcba0987654321ULL));
+  __int128_t or_value	= ((((__int128_t) 0x0000000000000010ULL) << 64) | ((__int128_t) 0x000000000000000eULL));
+  __int128_t or_exp	= ((((__int128_t) 0x1234567890abcdffULL) << 64) | ((__int128_t) 0xfedcba098765432fULL));
+
+  __int128_t add_input	= ((((__int128_t) 0x1234567890abcdefULL) << 64) | ((__int128_t) 0xfedcba0987654321ULL));
+  __int128_t add_value	= ((((__int128_t) 0x0000000001000000ULL) << 64) | ((__int128_t) 0x0000001000000000ULL));
+  __int128_t add_exp	= ((((__int128_t) 0x1234567891abcdefULL) << 64) | ((__int128_t) 0xfedcba1987654321ULL));
+
+
+  value = and_input;
+  result = quad_fetch_and (&value, and_value);
+  if (result != and_input || value != and_exp)
+    abort ();
+
+  value = or_input;
+  result = quad_fetch_or (&value, or_value);
+  if (result != or_input || value != or_exp)
+    abort ();
+
+  value = add_input;
+  result = quad_fetch_add (&value, add_value);
+  if (result != add_input || value != add_exp)
+    abort ();
+
+  return 0;
+}
+
-------------- next part --------------
Index: gcc/doc/invoke.texi
===================================================================
--- gcc/doc/invoke.texi	(revision 206932)
+++ gcc/doc/invoke.texi	(working copy)
@@ -858,7 +858,8 @@ See RS/6000 and PowerPC Options.
 -msave-toc-indirect -mno-save-toc-indirect @gol
 -mpower8-fusion -mno-mpower8-fusion -mpower8-vector -mno-power8-vector @gol
 -mcrypto -mno-crypto -mdirect-move -mno-direct-move @gol
--mquad-memory -mno-quad-memory}
+-mquad-memory -mno-quad-memory @gol
+-mquad-memory-atomic -mno-quad-memory-atomic}
 
 @emph{RX Options}
 @gccoptlist{-m64bit-doubles  -m32bit-doubles  -fpu  -nofpu@gol
@@ -17397,10 +17398,18 @@ the vector instructions.
 @itemx -mno-quad-memory
 @opindex mquad-memory
 @opindex mno-quad-memory
-Generate code that uses (does not use) the quad word memory
+Generate code that uses (does not use) the non-atomic quad word memory
 instructions.  The @option{-mquad-memory} option requires use of
 64-bit mode.
 
+@item -mquad-memory-atomic
+@itemx -mno-quad-memory-atomic
+@opindex mquad-memory-atomic
+@opindex mno-quad-memory-atomic
+Generate code that uses (does not use) the atomic quad word memory
+instructions.  The @option{-mquad-memory-atomic} option requires use of
+64-bit mode.
+
 @item -mfloat-gprs=@var{yes/single/double/no}
 @itemx -mfloat-gprs
 @opindex mfloat-gprs
Index: gcc/config/rs6000/predicates.md
===================================================================
--- gcc/config/rs6000/predicates.md	(revision 206932)
+++ gcc/config/rs6000/predicates.md	(working copy)
@@ -270,7 +270,7 @@ (define_predicate "quad_int_reg_operand"
 {
   HOST_WIDE_INT r;
 
-  if (!TARGET_QUAD_MEMORY)
+  if (!TARGET_QUAD_MEMORY && !TARGET_QUAD_MEMORY_ATOMIC)
     return 0;
 
   if (GET_CODE (op) == SUBREG)
@@ -633,6 +633,7 @@ (define_predicate "offsettable_mem_opera
        (match_test "offsettable_nonstrict_memref_p (op)")))
 
 ;; Return 1 if the operand is suitable for load/store quad memory.
+;; This predicate only checks for non-atomic loads/stores.
 (define_predicate "quad_memory_operand"
   (match_code "mem")
 {
Index: gcc/config/rs6000/rs6000-cpus.def
===================================================================
--- gcc/config/rs6000/rs6000-cpus.def	(revision 206932)
+++ gcc/config/rs6000/rs6000-cpus.def	(working copy)
@@ -53,7 +53,8 @@
 				 | OPTION_MASK_CRYPTO			\
 				 | OPTION_MASK_DIRECT_MOVE		\
 				 | OPTION_MASK_HTM			\
-				 | OPTION_MASK_QUAD_MEMORY)
+				 | OPTION_MASK_QUAD_MEMORY		\
+  				 | OPTION_MASK_QUAD_MEMORY_ATOMIC)
 
 #define POWERPC_7400_MASK	(OPTION_MASK_PPC_GFXOPT | OPTION_MASK_ALTIVEC)
 
Index: gcc/config/rs6000/rs6000-c.c
===================================================================
--- gcc/config/rs6000/rs6000-c.c	(revision 206932)
+++ gcc/config/rs6000/rs6000-c.c	(working copy)
@@ -337,6 +337,10 @@ rs6000_target_modify_macros (bool define
     rs6000_define_or_undefine_macro (define_p, "__HTM__");
   if ((flags & OPTION_MASK_P8_VECTOR) != 0)
     rs6000_define_or_undefine_macro (define_p, "__POWER8_VECTOR__");
+  if ((flags & OPTION_MASK_QUAD_MEMORY) != 0)
+    rs6000_define_or_undefine_macro (define_p, "__QUAD_MEMORY__");
+  if ((flags & OPTION_MASK_QUAD_MEMORY_ATOMIC) != 0)
+    rs6000_define_or_undefine_macro (define_p, "__QUAD_MEMORY_ATOMIC__");
   if ((flags & OPTION_MASK_CRYPTO) != 0)
     rs6000_define_or_undefine_macro (define_p, "__CRYPTO__");
 
Index: gcc/config/rs6000/rs6000.opt
===================================================================
--- gcc/config/rs6000/rs6000.opt	(revision 206932)
+++ gcc/config/rs6000/rs6000.opt	(working copy)
@@ -556,7 +556,11 @@ Use ISA 2.07 transactional memory (HTM) 
 
 mquad-memory
 Target Report Mask(QUAD_MEMORY) Var(rs6000_isa_flags)
-Generate the quad word memory instructions (lq/stq/lqarx/stqcx).
+Generate the quad word memory instructions (lq/stq).
+
+mquad-memory-atomic
+Target Report Mask(QUAD_MEMORY_ATOMIC) Var(rs6000_isa_flags)
+Generate the quad word memory atomic instructions (lqarx/stqcx).
 
 mcompat-align-parm
 Target Report Var(rs6000_compat_align_parm) Init(1) Save
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c	(revision 206932)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -3317,14 +3317,37 @@ rs6000_option_override_internal (bool gl
 
   /* The quad memory instructions only works in 64-bit mode. In 32-bit mode,
      silently turn off quad memory mode.  */
-  if (TARGET_QUAD_MEMORY && !TARGET_POWERPC64)
+  if ((TARGET_QUAD_MEMORY || TARGET_QUAD_MEMORY_ATOMIC) && !TARGET_POWERPC64)
     {
       if ((rs6000_isa_flags_explicit & OPTION_MASK_QUAD_MEMORY) != 0)
 	warning (0, N_("-mquad-memory requires 64-bit mode"));
 
+      if ((rs6000_isa_flags_explicit & OPTION_MASK_QUAD_MEMORY_ATOMIC) != 0)
+	warning (0, N_("-mquad-memory-atomic requires 64-bit mode"));
+
+      rs6000_isa_flags &= ~(OPTION_MASK_QUAD_MEMORY
+			    | OPTION_MASK_QUAD_MEMORY_ATOMIC);
+    }
+
+  /* Non-atomic quad memory load/store are disabled for little endian, since
+     the words are reversed, but atomic operations can still be done by
+     swapping the words.  */
+  if (TARGET_QUAD_MEMORY && !WORDS_BIG_ENDIAN)
+    {
+      if ((rs6000_isa_flags_explicit & OPTION_MASK_QUAD_MEMORY) != 0)
+	warning (0, N_("-mquad-memory is not available in little endian mode"));
+
       rs6000_isa_flags &= ~OPTION_MASK_QUAD_MEMORY;
     }
 
+  /* Assume if the user asked for normal quad memory instructions, they want
+     the atomic versions as well, unless they explicity told us not to use quad
+     word atomic instructions.  */
+  if (TARGET_QUAD_MEMORY
+      && !TARGET_QUAD_MEMORY_ATOMIC
+      && ((rs6000_isa_flags_explicit & OPTION_MASK_QUAD_MEMORY_ATOMIC) == 0))
+    rs6000_isa_flags |= OPTION_MASK_QUAD_MEMORY_ATOMIC;
+
   /* Enable power8 fusion if we are tuning for power8, even if we aren't
      generating power8 instructions.  */
   if (!(rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION))
@@ -5875,7 +5898,8 @@ direct_move_p (rtx op0, rtx op1)
   return false;
 }
 
-/* Return true if this is a load or store quad operation.  */
+/* Return true if this is a load or store quad operation.  This function does
+   not handle the atomic quad memory instructions.  */
 
 bool
 quad_load_store_p (rtx op0, rtx op1)
@@ -30675,6 +30699,7 @@ static struct rs6000_opt_mask const rs60
   { "powerpc-gfxopt",		OPTION_MASK_PPC_GFXOPT,		false, true  },
   { "powerpc-gpopt",		OPTION_MASK_PPC_GPOPT,		false, true  },
   { "quad-memory",		OPTION_MASK_QUAD_MEMORY,	false, true  },
+  { "quad-memory-atomic",	OPTION_MASK_QUAD_MEMORY_ATOMIC,	false, true  },
   { "recip-precision",		OPTION_MASK_RECIP_PRECISION,	false, true  },
   { "string",			OPTION_MASK_STRING,		false, true  },
   { "update",			OPTION_MASK_NO_UPDATE,		true , true  },
Index: gcc/config/rs6000/rs6000.h
===================================================================
--- gcc/config/rs6000/rs6000.h	(revision 206932)
+++ gcc/config/rs6000/rs6000.h	(working copy)
@@ -524,8 +524,11 @@ extern int rs6000_vector_align[];
 /* Byte/char syncs were added as phased in for ISA 2.06B, but are not present
    in power7, so conditionalize them on p8 features.  TImode syncs need quad
    memory support.  */
-#define TARGET_SYNC_HI_QI	(TARGET_QUAD_MEMORY || TARGET_DIRECT_MOVE)
-#define TARGET_SYNC_TI		TARGET_QUAD_MEMORY
+#define TARGET_SYNC_HI_QI	(TARGET_QUAD_MEMORY			\
+				 || TARGET_QUAD_MEMORY_ATOMIC		\
+				 || TARGET_DIRECT_MOVE)
+
+#define TARGET_SYNC_TI		TARGET_QUAD_MEMORY_ATOMIC
 
 /* Power7 has both 32-bit load and store integer for the FPRs, so we don't need
    to allocate the SDmode stack slot to get the value into the proper location
Index: gcc/config/rs6000/sync.md
===================================================================
--- gcc/config/rs6000/sync.md	(revision 206932)
+++ gcc/config/rs6000/sync.md	(working copy)
@@ -1,5 +1,5 @@
 ;; Machine description for PowerPC synchronization instructions.
-;; Copyright (C) 2005-2013 Free Software Foundation, Inc.
+;; Copyright (C) 2005-2014 Free Software Foundation, Inc.
 ;; Contributed by Geoffrey Keating.
 
 ;; This file is part of GCC.
@@ -204,25 +204,46 @@ (define_insn "load_locked<QHI:mode>_si"
   "<QHI:larx> %0,%y1"
   [(set_attr "type" "load_l")])
 
-;; Use PTImode to get even/odd register pairs
+;; Use PTImode to get even/odd register pairs.
+;; Use a temporary register to force getting an even register for the
+;; lqarx/stqcrx. instructions.  Normal optimizations will eliminate this extra
+;; copy on big endian systems.
+
+;; On little endian systems where non-atomic quad word load/store instructions
+;; are not used, the address can be register+offset, so make sure the address
+;; is indexed or indirect before register allocation.
+
 (define_expand "load_lockedti"
   [(use (match_operand:TI 0 "quad_int_reg_operand" ""))
    (use (match_operand:TI 1 "memory_operand" ""))]
   "TARGET_SYNC_TI"
 {
-  /* Use a temporary register to force getting an even register for the
-     lqarx/stqcrx. instructions.  Normal optimizations will eliminate this
-     extra copy.  */
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
   rtx pti = gen_reg_rtx (PTImode);
-  emit_insn (gen_load_lockedpti (pti, operands[1]));
-  emit_move_insn (operands[0], gen_lowpart (TImode, pti));
+
+  if (!indexed_or_indirect_operand (op1, TImode))
+    {
+      rtx old_addr = XEXP (op1, 0);
+      rtx new_addr = force_reg (Pmode, old_addr);
+      operands[1] = op1 = change_address (op1, TImode, new_addr);
+    }
+
+  emit_insn (gen_load_lockedpti (pti, op1));
+  if (WORDS_BIG_ENDIAN)
+    emit_move_insn (op0, gen_lowpart (TImode, pti));
+  else
+    {
+      emit_move_insn (gen_lowpart (DImode, op0), gen_highpart (DImode, pti));
+      emit_move_insn (gen_highpart (DImode, op0), gen_lowpart (DImode, pti));
+    }
   DONE;
 })
 
 (define_insn "load_lockedpti"
   [(set (match_operand:PTI 0 "quad_int_reg_operand" "=&r")
 	(unspec_volatile:PTI
-         [(match_operand:TI 1 "memory_operand" "Z")] UNSPECV_LL))]
+         [(match_operand:TI 1 "indexed_or_indirect_operand" "Z")] UNSPECV_LL))]
   "TARGET_SYNC_TI
    && !reg_mentioned_p (operands[0], operands[1])
    && quad_int_reg_operand (operands[0], PTImode)"
@@ -238,6 +259,14 @@ (define_insn "store_conditional<mode>"
   "<stcx> %2,%y1"
   [(set_attr "type" "store_c")])
 
+;; Use a temporary register to force getting an even register for the
+;; lqarx/stqcrx. instructions.  Normal optimizations will eliminate this extra
+;; copy on big endian systems.
+
+;; On little endian systems where non-atomic quad word load/store instructions
+;; are not used, the address can be register+offset, so make sure the address
+;; is indexed or indirect before register allocation.
+
 (define_expand "store_conditionalti"
   [(use (match_operand:CC 0 "cc_reg_operand" ""))
    (use (match_operand:TI 1 "memory_operand" ""))
@@ -247,21 +276,36 @@ (define_expand "store_conditionalti"
   rtx op0 = operands[0];
   rtx op1 = operands[1];
   rtx op2 = operands[2];
-  rtx pti_op1 = change_address (op1, PTImode, XEXP (op1, 0));
-  rtx pti_op2 = gen_reg_rtx (PTImode);
+  rtx addr = XEXP (op1, 0);
+  rtx pti_mem;
+  rtx pti_reg;
+
+  if (!indexed_or_indirect_operand (op1, TImode))
+    {
+      rtx new_addr = force_reg (Pmode, addr);
+      operands[1] = op1 = change_address (op1, TImode, new_addr);
+      addr = new_addr;
+    }
+
+  pti_mem = change_address (op1, PTImode, addr);
+  pti_reg = gen_reg_rtx (PTImode);
+
+  if (WORDS_BIG_ENDIAN)
+    emit_move_insn (pti_reg, gen_lowpart (PTImode, op2));
+  else
+    {
+      emit_move_insn (gen_lowpart (DImode, pti_reg), gen_highpart (DImode, op2));
+      emit_move_insn (gen_highpart (DImode, pti_reg), gen_lowpart (DImode, op2));
+    }
 
-  /* Use a temporary register to force getting an even register for the
-     lqarx/stqcrx. instructions.  Normal optimizations will eliminate this
-     extra copy.  */
-  emit_move_insn (pti_op2, gen_lowpart (PTImode, op2));
-  emit_insn (gen_store_conditionalpti (op0, pti_op1, pti_op2));
+  emit_insn (gen_store_conditionalpti (op0, pti_mem, pti_reg));
   DONE;
 })
 
 (define_insn "store_conditionalpti"
   [(set (match_operand:CC 0 "cc_reg_operand" "=x")
 	(unspec_volatile:CC [(const_int 0)] UNSPECV_SC))
-   (set (match_operand:PTI 1 "memory_operand" "=Z")
+   (set (match_operand:PTI 1 "indexed_or_indirect_operand" "=Z")
 	(match_operand:PTI 2 "quad_int_reg_operand" "r"))]
   "TARGET_SYNC_TI && quad_int_reg_operand (operands[2], PTImode)"
   "stqcx. %2,%y1"
Index: gcc/testsuite/gcc.target/powerpc/quad-atomic.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/quad-atomic.c	(revision 0)
+++ gcc/testsuite/gcc.target/powerpc/quad-atomic.c	(revision 0)
@@ -0,0 +1,67 @@
+/* { dg-do run { target { powerpc*-*-linux* && lp64 } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-skip-if "" { powerpc*-*-*spe* } { "*" } { "" } } */
+/* { dg-require-effective-target p8vector_hw } */
+/* { dg-options "-mcpu=power8 -O2" } */
+
+/* Test whether we get the right bits for quad word atomic instructions.  */
+#include <stdlib.h>
+
+static __int128_t quad_fetch_and (__int128_t *, __int128_t value) __attribute__((__noinline__));
+static __int128_t quad_fetch_or  (__int128_t *, __int128_t value) __attribute__((__noinline__));
+static __int128_t quad_fetch_add (__int128_t *, __int128_t value) __attribute__((__noinline__));
+
+static __int128_t
+quad_fetch_and (__int128_t *ptr, __int128_t value)
+{
+  return __atomic_fetch_and (ptr, value, __ATOMIC_ACQUIRE);
+}
+
+static __int128_t
+quad_fetch_or (__int128_t *ptr, __int128_t value)
+{
+  return __atomic_fetch_or (ptr, value, __ATOMIC_ACQUIRE);
+}
+
+static __int128_t
+quad_fetch_add (__int128_t *ptr, __int128_t value)
+{
+  return __atomic_fetch_add (ptr, value, __ATOMIC_ACQUIRE);
+}
+
+int
+main (void)
+{
+  __int128_t result;
+  __int128_t value;
+  __int128_t and_input	= ((((__int128_t) 0x1234567890abcdefULL) << 64) | ((__int128_t) 0xfedcba0987654321ULL));
+  __int128_t and_value	= ((((__int128_t) 0xfffffffffffffff0ULL) << 64) | ((__int128_t) 0xfffffffffffffff0ULL));
+  __int128_t and_exp	= ((((__int128_t) 0x1234567890abcde0ULL) << 64) | ((__int128_t) 0xfedcba0987654320ULL));
+
+  __int128_t or_input	= ((((__int128_t) 0x1234567890abcdefULL) << 64) | ((__int128_t) 0xfedcba0987654321ULL));
+  __int128_t or_value	= ((((__int128_t) 0x0000000000000010ULL) << 64) | ((__int128_t) 0x000000000000000eULL));
+  __int128_t or_exp	= ((((__int128_t) 0x1234567890abcdffULL) << 64) | ((__int128_t) 0xfedcba098765432fULL));
+
+  __int128_t add_input	= ((((__int128_t) 0x1234567890abcdefULL) << 64) | ((__int128_t) 0xfedcba0987654321ULL));
+  __int128_t add_value	= ((((__int128_t) 0x0000000001000000ULL) << 64) | ((__int128_t) 0x0000001000000000ULL));
+  __int128_t add_exp	= ((((__int128_t) 0x1234567891abcdefULL) << 64) | ((__int128_t) 0xfedcba1987654321ULL));
+
+
+  value = and_input;
+  result = quad_fetch_and (&value, and_value);
+  if (result != and_input || value != and_exp)
+    abort ();
+
+  value = or_input;
+  result = quad_fetch_or (&value, or_value);
+  if (result != or_input || value != or_exp)
+    abort ();
+
+  value = add_input;
+  result = quad_fetch_add (&value, add_value);
+  if (result != add_input || value != add_exp)
+    abort ();
+
+  return 0;
+}
+


More information about the Gcc-patches mailing list