[PATCH, rs6000] power8 patches, patch #7, quad/byte/half-word atomic instructions

Michael Meissner meissner@linux.vnet.ibm.com
Tue Jun 11 23:56:00 GMT 2013


I needed to rework sync.md so that it would work correctly with no
optimization: using SUBREGs at -O0 did not give us the even registers needed
for holding PTImode values, so I created a PTImode temporary in load_lockedti
and store_conditionalti, which is normally optimized out.
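
As an aside (not part of the patch), here is a minimal C sketch of the kind of
quad-word atomic that goes through the new load_lockedti/store_conditionalti
expanders when compiled with -mcpu=power8; the function name is made up, but
the builtin usage mirrors the new testsuite files:

__int128_t
quad_compare_and_swap (__int128_t *ptr, __int128_t oldval, __int128_t newval)
{
  /* The compare-and-swap expands via load_lockedti/store_conditionalti;
     the PTImode temporary forces an even/odd GPR pair for lqarx/stqcx.,
     even at -O0 where SUBREGs alone did not.  */
  __atomic_compare_exchange_n (ptr, &oldval, newval, 0 /* strong */,
			       __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
  return oldval;
}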

[gcc]
2013-06-11  Michael Meissner  <meissner@linux.vnet.ibm.com>
	    Pat Haugen <pthaugen@us.ibm.com>
	    Peter Bergner <bergner@vnet.ibm.com>

	* config/rs6000/rs6000.c (emit_load_locked): Add support for
	power8 byte, half-word, and quad-word atomic instructions.
	(emit_store_conditional): Likewise.
	(rs6000_expand_atomic_compare_and_swap): Likewise.
	(rs6000_expand_atomic_op): Likewise.

	* config/rs6000/sync.md (larx): Add new modes for power8.
	(stcx): Likewise.
	(AINT): New mode iterator to include TImode as well as normal
	integer modes on power8.
	(fetchop_pred): Use int_reg_operand instead of gpc_reg_operand so
	that VSX registers are not considered.  Use AINT mode iterator
	instead of INT1 to allow inclusion of quad word atomic operations
	on power8.
	(load_locked<mode>): Likewise.
	(store_conditional<mode>): Likewise.
	(atomic_compare_and_swap<mode>): Likewise.
	(atomic_exchange<mode>): Likewise.
	(atomic_nand<mode>): Likewise.
	(atomic_fetch_<fetchop_name><mode>): Likewise.
	(atomic_nand_fetch<mode>): Likewise.
	(atomic_load<mode>): Use gen_loadsync_<mode> instead of enumerating
	each type.
	(ATOMIC): On power8, add QImode, HImode modes.
	(load_locked<QHI:mode>_si): Variants of load_locked for QI/HI
	modes that promote to SImode.
	(load_lockedti): Convert TImode arguments to PTImode, so that we
	get a guaranteed even/odd register pair.
	(load_lockedpti): Likewise.
	(store_conditionalti): Likewise.
	(store_conditionalpti): Likewise.

	* config/rs6000/rs6000.md (QHI): New mode iterator for power8
	atomic load/store instructions.
	(HSI): New mode iterator for HImode/SImode sign extended fusion ops.

[gcc/testsuite]
2013-06-11  Michael Meissner  <meissner@linux.vnet.ibm.com>
	    Pat Haugen <pthaugen@us.ibm.com>
	    Peter Bergner <bergner@vnet.ibm.com>

	* gcc.target/powerpc/atomic-p7.c: New file, add tests for atomic
	load/store instructions on power7, power8.
	* gcc.target/powerpc/atomic-p8.c: Likewise.

Given that these changes went beyond the original request to fix a spelling
error and improve the logic, I figured I should send these patches out again.
David, do you have any problem with the new patches?

-- 
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460, USA
email: meissner@linux.vnet.ibm.com, phone: +1 (978) 899-4797
-------------- next part --------------
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c	(revision 199965)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -17748,7 +17748,8 @@ emit_unlikely_jump (rtx cond, rtx label)
 }
 
 /* A subroutine of the atomic operation splitters.  Emit a load-locked
-   instruction in MODE.  */
+   instruction in MODE.  For QI/HImode, possibly use a pattern that includes
+   the zero_extend operation.  */
 
 static void
 emit_load_locked (enum machine_mode mode, rtx reg, rtx mem)
@@ -17757,12 +17758,26 @@ emit_load_locked (enum machine_mode mode
 
   switch (mode)
     {
+    case QImode:
+      fn = gen_load_lockedqi;
+      break;
+    case HImode:
+      fn = gen_load_lockedhi;
+      break;
     case SImode:
-      fn = gen_load_lockedsi;
+      if (GET_MODE (mem) == QImode)
+	fn = gen_load_lockedqi_si;
+      else if (GET_MODE (mem) == HImode)
+	fn = gen_load_lockedhi_si;
+      else
+	fn = gen_load_lockedsi;
       break;
     case DImode:
       fn = gen_load_lockeddi;
       break;
+    case TImode:
+      fn = gen_load_lockedti;
+      break;
     default:
       gcc_unreachable ();
     }
@@ -17779,12 +17794,21 @@ emit_store_conditional (enum machine_mod
 
   switch (mode)
     {
+    case QImode:
+      fn = gen_store_conditionalqi;
+      break;
+    case HImode:
+      fn = gen_store_conditionalhi;
+      break;
     case SImode:
       fn = gen_store_conditionalsi;
       break;
     case DImode:
       fn = gen_store_conditionaldi;
       break;
+    case TImode:
+      fn = gen_store_conditionalti;
+      break;
     default:
       gcc_unreachable ();
     }
@@ -17931,7 +17955,7 @@ rs6000_expand_atomic_compare_and_swap (r
 {
   rtx boolval, retval, mem, oldval, newval, cond;
   rtx label1, label2, x, mask, shift;
-  enum machine_mode mode;
+  enum machine_mode mode, orig_mode;
   enum memmodel mod_s, mod_f;
   bool is_weak;
 
@@ -17943,22 +17967,29 @@ rs6000_expand_atomic_compare_and_swap (r
   is_weak = (INTVAL (operands[5]) != 0);
   mod_s = (enum memmodel) INTVAL (operands[6]);
   mod_f = (enum memmodel) INTVAL (operands[7]);
-  mode = GET_MODE (mem);
+  orig_mode = mode = GET_MODE (mem);
 
   mask = shift = NULL_RTX;
   if (mode == QImode || mode == HImode)
     {
-      mem = rs6000_adjust_atomic_subword (mem, &shift, &mask);
-
-      /* Shift and mask OLDVAL into position with the word.  */
+      /* Before power8, we didn't have access to lbarx/lharx, so generate a
+	 lwarx and shift/mask operations.  With power8, we need to do the
+	 comparison in SImode, but the store is still done in QI/HImode.  */
       oldval = convert_modes (SImode, mode, oldval, 1);
-      oldval = expand_simple_binop (SImode, ASHIFT, oldval, shift,
-				    NULL_RTX, 1, OPTAB_LIB_WIDEN);
 
-      /* Shift and mask NEWVAL into position within the word.  */
-      newval = convert_modes (SImode, mode, newval, 1);
-      newval = expand_simple_binop (SImode, ASHIFT, newval, shift,
-				    NULL_RTX, 1, OPTAB_LIB_WIDEN);
+      if (!TARGET_SYNC_HI_QI)
+	{
+	  mem = rs6000_adjust_atomic_subword (mem, &shift, &mask);
+
+	  /* Shift and mask OLDVAL into position within the word.  */
+	  oldval = expand_simple_binop (SImode, ASHIFT, oldval, shift,
+					NULL_RTX, 1, OPTAB_LIB_WIDEN);
+
+	  /* Shift and mask NEWVAL into position within the word.  */
+	  newval = convert_modes (SImode, mode, newval, 1);
+	  newval = expand_simple_binop (SImode, ASHIFT, newval, shift,
+					NULL_RTX, 1, OPTAB_LIB_WIDEN);
+	}
 
       /* Prepare to adjust the return value.  */
       retval = gen_reg_rtx (SImode);
@@ -17987,7 +18018,25 @@ rs6000_expand_atomic_compare_and_swap (r
     }
 
   cond = gen_reg_rtx (CCmode);
-  x = gen_rtx_COMPARE (CCmode, x, oldval);
+  /* If we have TImode, synthesize a comparison.  */
+  if (mode != TImode)
+    x = gen_rtx_COMPARE (CCmode, x, oldval);
+  else
+    {
+      rtx xor1_result = gen_reg_rtx (DImode);
+      rtx xor2_result = gen_reg_rtx (DImode);
+      rtx or_result = gen_reg_rtx (DImode);
+      rtx new_word0 = simplify_gen_subreg (DImode, x, TImode, 0);
+      rtx new_word1 = simplify_gen_subreg (DImode, x, TImode, 8);
+      rtx old_word0 = simplify_gen_subreg (DImode, oldval, TImode, 0);
+      rtx old_word1 = simplify_gen_subreg (DImode, oldval, TImode, 8);
+
+      emit_insn (gen_xordi3 (xor1_result, new_word0, old_word0));
+      emit_insn (gen_xordi3 (xor2_result, new_word1, old_word1));
+      emit_insn (gen_iordi3 (or_result, xor1_result, xor2_result));
+      x = gen_rtx_COMPARE (CCmode, or_result, const0_rtx);
+    }
+
   emit_insn (gen_rtx_SET (VOIDmode, cond, x));
 
   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
@@ -17997,7 +18046,7 @@ rs6000_expand_atomic_compare_and_swap (r
   if (mask)
     x = rs6000_mask_atomic_subword (retval, newval, mask);
 
-  emit_store_conditional (mode, cond, mem, x);
+  emit_store_conditional (orig_mode, cond, mem, x);
 
   if (!is_weak)
     {
@@ -18015,6 +18064,8 @@ rs6000_expand_atomic_compare_and_swap (r
 
   if (shift)
     rs6000_finish_atomic_subword (operands[1], retval, shift);
+  else if (mode != GET_MODE (operands[1]))
+    convert_move (operands[1], retval, 1);
 
   /* In all cases, CR0 contains EQ on success, and NE on failure.  */
   x = gen_rtx_EQ (SImode, cond, const0_rtx);
@@ -18038,7 +18089,7 @@ rs6000_expand_atomic_exchange (rtx opera
   mode = GET_MODE (mem);
 
   mask = shift = NULL_RTX;
-  if (mode == QImode || mode == HImode)
+  if (!TARGET_SYNC_HI_QI && (mode == QImode || mode == HImode))
     {
       mem = rs6000_adjust_atomic_subword (mem, &shift, &mask);
 
@@ -18087,53 +18138,70 @@ rs6000_expand_atomic_op (enum rtx_code c
 {
   enum memmodel model = (enum memmodel) INTVAL (model_rtx);
   enum machine_mode mode = GET_MODE (mem);
+  enum machine_mode store_mode = mode;
   rtx label, x, cond, mask, shift;
   rtx before = orig_before, after = orig_after;
 
   mask = shift = NULL_RTX;
+  /* On power8, we want to use SImode for the operation.  On previous systems,
+     do the operation in a word and shift/mask to get the proper byte or
+     halfword.  */
   if (mode == QImode || mode == HImode)
     {
-      mem = rs6000_adjust_atomic_subword (mem, &shift, &mask);
-
-      /* Shift and mask VAL into position with the word.  */
-      val = convert_modes (SImode, mode, val, 1);
-      val = expand_simple_binop (SImode, ASHIFT, val, shift,
-				 NULL_RTX, 1, OPTAB_LIB_WIDEN);
+      if (TARGET_SYNC_HI_QI)
+	{
+	  val = convert_modes (SImode, mode, val, 1);
 
-      switch (code)
+	  /* Prepare to adjust the return value.  */
+	  before = gen_reg_rtx (SImode);
+	  if (after)
+	    after = gen_reg_rtx (SImode);
+	  mode = SImode;
+	}
+      else
 	{
-	case IOR:
-	case XOR:
-	  /* We've already zero-extended VAL.  That is sufficient to
-	     make certain that it does not affect other bits.  */
-	  mask = NULL;
-	  break;
+	  mem = rs6000_adjust_atomic_subword (mem, &shift, &mask);
 
-	case AND:
-	  /* If we make certain that all of the other bits in VAL are
-	     set, that will be sufficient to not affect other bits.  */
-	  x = gen_rtx_NOT (SImode, mask);
-	  x = gen_rtx_IOR (SImode, x, val);
-	  emit_insn (gen_rtx_SET (VOIDmode, val, x));
-	  mask = NULL;
-	  break;
+	  /* Shift and mask VAL into position within the word.  */
+	  val = convert_modes (SImode, mode, val, 1);
+	  val = expand_simple_binop (SImode, ASHIFT, val, shift,
+				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
 
-	case NOT:
-	case PLUS:
-	case MINUS:
-	  /* These will all affect bits outside the field and need
-	     adjustment via MASK within the loop.  */
-	  break;
+	  switch (code)
+	    {
+	    case IOR:
+	    case XOR:
+	      /* We've already zero-extended VAL.  That is sufficient to
+		 make certain that it does not affect other bits.  */
+	      mask = NULL;
+	      break;
 
-	default:
-	  gcc_unreachable ();
-	}
+	    case AND:
+	      /* If we make certain that all of the other bits in VAL are
+		 set, that will be sufficient to not affect other bits.  */
+	      x = gen_rtx_NOT (SImode, mask);
+	      x = gen_rtx_IOR (SImode, x, val);
+	      emit_insn (gen_rtx_SET (VOIDmode, val, x));
+	      mask = NULL;
+	      break;
 
-      /* Prepare to adjust the return value.  */
-      before = gen_reg_rtx (SImode);
-      if (after)
-	after = gen_reg_rtx (SImode);
-      mode = SImode;
+	    case NOT:
+	    case PLUS:
+	    case MINUS:
+	      /* These will all affect bits outside the field and need
+		 adjustment via MASK within the loop.  */
+	      break;
+
+	    default:
+	      gcc_unreachable ();
+	    }
+
+	  /* Prepare to adjust the return value.  */
+	  before = gen_reg_rtx (SImode);
+	  if (after)
+	    after = gen_reg_rtx (SImode);
+	  store_mode = mode = SImode;
+	}
     }
 
   mem = rs6000_pre_atomic_barrier (mem, model);
@@ -18166,9 +18234,11 @@ rs6000_expand_atomic_op (enum rtx_code c
 			       NULL_RTX, 1, OPTAB_LIB_WIDEN);
       x = rs6000_mask_atomic_subword (before, x, mask);
     }
+  else if (store_mode != mode)
+    x = convert_modes (store_mode, mode, x, 1);
 
   cond = gen_reg_rtx (CCmode);
-  emit_store_conditional (mode, cond, mem, x);
+  emit_store_conditional (store_mode, cond, mem, x);
 
   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
   emit_unlikely_jump (x, label);
@@ -18177,11 +18247,22 @@ rs6000_expand_atomic_op (enum rtx_code c
 
   if (shift)
     {
+      /* QImode/HImode on machines without lbarx/lharx where we do a lwarx and
+	 then do the calculations in a SImode register.  */
       if (orig_before)
 	rs6000_finish_atomic_subword (orig_before, before, shift);
       if (orig_after)
 	rs6000_finish_atomic_subword (orig_after, after, shift);
     }
+  else if (store_mode != mode)
+    {
+      /* QImode/HImode on machines with lbarx/lharx where we do the native
+	 operation and then do the calculations in a SImode register.  */
+      if (orig_before)
+	convert_move (orig_before, before, 1);
+      if (orig_after)
+	convert_move (orig_after, after, 1);
+    }
   else if (orig_after && after != orig_after)
     emit_move_insn (orig_after, after);
 }
Index: gcc/config/rs6000/sync.md
===================================================================
--- gcc/config/rs6000/sync.md	(revision 199965)
+++ gcc/config/rs6000/sync.md	(working copy)
@@ -18,14 +18,23 @@
 ;; along with GCC; see the file COPYING3.  If not see
 ;; <http://www.gnu.org/licenses/>.
 
-(define_mode_attr larx [(SI "lwarx") (DI "ldarx")])
-(define_mode_attr stcx [(SI "stwcx.") (DI "stdcx.")])
+(define_mode_attr larx [(QI "lbarx")
+			(HI "lharx")
+			(SI "lwarx")
+			(DI "ldarx")
+			(TI "lqarx")])
+
+(define_mode_attr stcx [(QI "stbcx.")
+			(HI "sthcx.")
+			(SI "stwcx.")
+			(DI "stdcx.")
+			(TI "stqcx.")])
 
 (define_code_iterator FETCHOP [plus minus ior xor and])
 (define_code_attr fetchop_name
   [(plus "add") (minus "sub") (ior "or") (xor "xor") (and "and")])
 (define_code_attr fetchop_pred
-  [(plus "add_operand") (minus "gpc_reg_operand")
+  [(plus "add_operand") (minus "int_reg_operand")
    (ior "logical_operand") (xor "logical_operand") (and "and_operand")])
 
 (define_expand "mem_thread_fence"
@@ -129,16 +138,7 @@ (define_expand "atomic_load<mode>"
     case MEMMODEL_CONSUME:
     case MEMMODEL_ACQUIRE:
     case MEMMODEL_SEQ_CST:
-      if (GET_MODE (operands[0]) == QImode)
-	emit_insn (gen_loadsync_qi (operands[0]));
-      else if (GET_MODE (operands[0]) == HImode)
-	emit_insn (gen_loadsync_hi (operands[0]));
-      else if (GET_MODE (operands[0]) == SImode)
-	emit_insn (gen_loadsync_si (operands[0]));
-      else if (GET_MODE (operands[0]) == DImode)
-	emit_insn (gen_loadsync_di (operands[0]));
-      else
-	gcc_unreachable ();
+      emit_insn (gen_loadsync_<mode> (operands[0]));
       break;
     default:
       gcc_unreachable ();
@@ -170,35 +170,109 @@ (define_expand "atomic_store<mode>"
   DONE;
 })
 
-;; ??? Power ISA 2.06B says that there *is* a load-{byte,half}-and-reserve
-;; opcode that is "phased-in".  Not implemented as of Power7, so not yet used,
-;; but let's prepare the macros anyway.
-
-(define_mode_iterator ATOMIC    [SI (DI "TARGET_POWERPC64")])
+;; Any supported integer mode that has atomic l<x>arx/st<x>cx. instructions
+;; other than the quad memory operations, which have special restrictions.
+;; Byte/halfword atomic instructions were added in ISA 2.06B, but were phased
+;; in and did not show up until power8.  TImode atomic lqarx/stqcx. require
+;; special handling due to even/odd register requirements.
+(define_mode_iterator ATOMIC [(QI "TARGET_SYNC_HI_QI")
+			      (HI "TARGET_SYNC_HI_QI")
+			      SI
+			      (DI "TARGET_POWERPC64")])
+
+;; Types that we should provide atomic instructions for.
+
+(define_mode_iterator AINT [QI
+			    HI
+			    SI
+			    (DI "TARGET_POWERPC64")
+			    (TI "TARGET_SYNC_TI")])
 
 (define_insn "load_locked<mode>"
-  [(set (match_operand:ATOMIC 0 "gpc_reg_operand" "=r")
+  [(set (match_operand:ATOMIC 0 "int_reg_operand" "=r")
 	(unspec_volatile:ATOMIC
          [(match_operand:ATOMIC 1 "memory_operand" "Z")] UNSPECV_LL))]
   ""
   "<larx> %0,%y1"
   [(set_attr "type" "load_l")])
 
+(define_insn "load_locked<QHI:mode>_si"
+  [(set (match_operand:SI 0 "int_reg_operand" "=r")
+	(unspec_volatile:SI
+	  [(match_operand:QHI 1 "memory_operand" "Z")] UNSPECV_LL))]
+  "TARGET_SYNC_HI_QI"
+  "<QHI:larx> %0,%y1"
+  [(set_attr "type" "load_l")])
+
+;; Use PTImode to get even/odd register pairs
+(define_expand "load_lockedti"
+  [(use (match_operand:TI 0 "quad_int_reg_operand" ""))
+   (use (match_operand:TI 1 "memory_operand" ""))]
+  "TARGET_SYNC_TI"
+{
+  /* Use a temporary register to force getting an even register for the
+     lqarx/stqcx. instructions.  Normal optimizations will eliminate this
+     extra copy.  */
+  rtx pti = gen_reg_rtx (PTImode);
+  emit_insn (gen_load_lockedpti (pti, operands[1]));
+  emit_move_insn (operands[0], gen_lowpart (TImode, pti));
+  DONE;
+})
+
+(define_insn "load_lockedpti"
+  [(set (match_operand:PTI 0 "quad_int_reg_operand" "=&r")
+	(unspec_volatile:PTI
+         [(match_operand:TI 1 "memory_operand" "Z")] UNSPECV_LL))]
+  "TARGET_SYNC_TI
+   && !reg_mentioned_p (operands[0], operands[1])
+   && quad_int_reg_operand (operands[0], PTImode)"
+  "lqarx %0,%y1"
+  [(set_attr "type" "load_l")])
+
 (define_insn "store_conditional<mode>"
   [(set (match_operand:CC 0 "cc_reg_operand" "=x")
 	(unspec_volatile:CC [(const_int 0)] UNSPECV_SC))
    (set (match_operand:ATOMIC 1 "memory_operand" "=Z")
-	(match_operand:ATOMIC 2 "gpc_reg_operand" "r"))]
+	(match_operand:ATOMIC 2 "int_reg_operand" "r"))]
   ""
   "<stcx> %2,%y1"
   [(set_attr "type" "store_c")])
 
+(define_expand "store_conditionalti"
+  [(use (match_operand:CC 0 "cc_reg_operand" ""))
+   (use (match_operand:TI 1 "memory_operand" ""))
+   (use (match_operand:TI 2 "quad_int_reg_operand" ""))]
+  "TARGET_SYNC_TI"
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx pti_op1 = change_address (op1, PTImode, XEXP (op1, 0));
+  rtx pti_op2 = gen_reg_rtx (PTImode);
+
+  /* Use a temporary register to force getting an even register for the
+     lqarx/stqcx. instructions.  Normal optimizations will eliminate this
+     extra copy.  */
+  emit_move_insn (pti_op2, gen_lowpart (PTImode, op2));
+  emit_insn (gen_store_conditionalpti (op0, pti_op1, pti_op2));
+  DONE;
+})
+
+(define_insn "store_conditionalpti"
+  [(set (match_operand:CC 0 "cc_reg_operand" "=x")
+	(unspec_volatile:CC [(const_int 0)] UNSPECV_SC))
+   (set (match_operand:PTI 1 "memory_operand" "=Z")
+	(match_operand:PTI 2 "quad_int_reg_operand" "r"))]
+  "TARGET_SYNC_TI && quad_int_reg_operand (operands[2], PTImode)"
+  "stqcx. %2,%y1"
+  [(set_attr "type" "store_c")])
+
 (define_expand "atomic_compare_and_swap<mode>"
-  [(match_operand:SI 0 "gpc_reg_operand" "")		;; bool out
-   (match_operand:INT1 1 "gpc_reg_operand" "")		;; val out
-   (match_operand:INT1 2 "memory_operand" "")		;; memory
-   (match_operand:INT1 3 "reg_or_short_operand" "")	;; expected
-   (match_operand:INT1 4 "gpc_reg_operand" "")		;; desired
+  [(match_operand:SI 0 "int_reg_operand" "")		;; bool out
+   (match_operand:AINT 1 "int_reg_operand" "")		;; val out
+   (match_operand:AINT 2 "memory_operand" "")		;; memory
+   (match_operand:AINT 3 "reg_or_short_operand" "")	;; expected
+   (match_operand:AINT 4 "int_reg_operand" "")		;; desired
    (match_operand:SI 5 "const_int_operand" "")		;; is_weak
    (match_operand:SI 6 "const_int_operand" "")		;; model succ
    (match_operand:SI 7 "const_int_operand" "")]		;; model fail
@@ -209,9 +283,9 @@ (define_expand "atomic_compare_and_swap<
 })
 
 (define_expand "atomic_exchange<mode>"
-  [(match_operand:INT1 0 "gpc_reg_operand" "")		;; output
-   (match_operand:INT1 1 "memory_operand" "")		;; memory
-   (match_operand:INT1 2 "gpc_reg_operand" "")		;; input
+  [(match_operand:AINT 0 "int_reg_operand" "")		;; output
+   (match_operand:AINT 1 "memory_operand" "")		;; memory
+   (match_operand:AINT 2 "int_reg_operand" "")		;; input
    (match_operand:SI 3 "const_int_operand" "")]		;; model
   ""
 {
@@ -220,9 +294,9 @@ (define_expand "atomic_exchange<mode>"
 })
 
 (define_expand "atomic_<fetchop_name><mode>"
-  [(match_operand:INT1 0 "memory_operand" "")		;; memory
-   (FETCHOP:INT1 (match_dup 0)
-     (match_operand:INT1 1 "<fetchop_pred>" ""))	;; operand
+  [(match_operand:AINT 0 "memory_operand" "")		;; memory
+   (FETCHOP:AINT (match_dup 0)
+     (match_operand:AINT 1 "<fetchop_pred>" ""))	;; operand
    (match_operand:SI 2 "const_int_operand" "")]		;; model
   ""
 {
@@ -232,8 +306,8 @@ (define_expand "atomic_<fetchop_name><mo
 })
 
 (define_expand "atomic_nand<mode>"
-  [(match_operand:INT1 0 "memory_operand" "")		;; memory
-   (match_operand:INT1 1 "gpc_reg_operand" "")		;; operand
+  [(match_operand:AINT 0 "memory_operand" "")		;; memory
+   (match_operand:AINT 1 "int_reg_operand" "")		;; operand
    (match_operand:SI 2 "const_int_operand" "")]		;; model
   ""
 {
@@ -243,10 +317,10 @@ (define_expand "atomic_nand<mode>"
 })
 
 (define_expand "atomic_fetch_<fetchop_name><mode>"
-  [(match_operand:INT1 0 "gpc_reg_operand" "")		;; output
-   (match_operand:INT1 1 "memory_operand" "")		;; memory
-   (FETCHOP:INT1 (match_dup 1)
-     (match_operand:INT1 2 "<fetchop_pred>" ""))	;; operand
+  [(match_operand:AINT 0 "int_reg_operand" "")		;; output
+   (match_operand:AINT 1 "memory_operand" "")		;; memory
+   (FETCHOP:AINT (match_dup 1)
+     (match_operand:AINT 2 "<fetchop_pred>" ""))	;; operand
    (match_operand:SI 3 "const_int_operand" "")]		;; model
   ""
 { 
@@ -256,9 +330,9 @@ (define_expand "atomic_fetch_<fetchop_na
 })
 
 (define_expand "atomic_fetch_nand<mode>"
-  [(match_operand:INT1 0 "gpc_reg_operand" "")		;; output
-   (match_operand:INT1 1 "memory_operand" "")		;; memory
-   (match_operand:INT1 2 "gpc_reg_operand" "")		;; operand
+  [(match_operand:AINT 0 "int_reg_operand" "")		;; output
+   (match_operand:AINT 1 "memory_operand" "")		;; memory
+   (match_operand:AINT 2 "int_reg_operand" "")		;; operand
    (match_operand:SI 3 "const_int_operand" "")]		;; model
   ""
 {
@@ -268,10 +342,10 @@ (define_expand "atomic_fetch_nand<mode>"
 })
 
 (define_expand "atomic_<fetchop_name>_fetch<mode>"
-  [(match_operand:INT1 0 "gpc_reg_operand" "")		;; output
-   (match_operand:INT1 1 "memory_operand" "")		;; memory
-   (FETCHOP:INT1 (match_dup 1)
-     (match_operand:INT1 2 "<fetchop_pred>" ""))	;; operand
+  [(match_operand:AINT 0 "int_reg_operand" "")		;; output
+   (match_operand:AINT 1 "memory_operand" "")		;; memory
+   (FETCHOP:AINT (match_dup 1)
+     (match_operand:AINT 2 "<fetchop_pred>" ""))	;; operand
    (match_operand:SI 3 "const_int_operand" "")]		;; model
   ""
 {
@@ -281,9 +355,9 @@ (define_expand "atomic_<fetchop_name>_fe
 })
 
 (define_expand "atomic_nand_fetch<mode>"
-  [(match_operand:INT1 0 "gpc_reg_operand" "")		;; output
-   (match_operand:INT1 1 "memory_operand" "")		;; memory
-   (match_operand:INT1 2 "gpc_reg_operand" "")		;; operand
+  [(match_operand:AINT 0 "int_reg_operand" "")		;; output
+   (match_operand:AINT 1 "memory_operand" "")		;; memory
+   (match_operand:AINT 2 "int_reg_operand" "")		;; operand
    (match_operand:SI 3 "const_int_operand" "")]		;; model
   ""
 {
Index: gcc/config/rs6000/rs6000.md
===================================================================
--- gcc/config/rs6000/rs6000.md	(revision 199965)
+++ gcc/config/rs6000/rs6000.md	(working copy)
@@ -239,6 +239,12 @@ (define_mode_iterator INT1 [QI HI SI (DI
 ; extend modes for DImode
 (define_mode_iterator QHSI [QI HI SI])
 
+; QImode or HImode for small atomic ops
+(define_mode_iterator QHI [QI HI])
+
+; HImode or SImode for sign extended fusion ops
+(define_mode_iterator HSI [HI SI])
+
 ; SImode or DImode, even if DImode doesn't fit in GPRs.
 (define_mode_iterator SDI [SI DI])
 
Index: gcc/testsuite/gcc.target/powerpc/atomic-p7.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/atomic-p7.c	(revision 0)
+++ gcc/testsuite/gcc.target/powerpc/atomic-p7.c	(revision 0)
@@ -0,0 +1,207 @@
+/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-mcpu=power7 -O2" } */
+/* { dg-final { scan-assembler-not "lbarx" } } */
+/* { dg-final { scan-assembler-not "lharx" } } */
+/* { dg-final { scan-assembler-times "lwarx" 18 } } */
+/* { dg-final { scan-assembler-times "ldarx" 6 } } */
+/* { dg-final { scan-assembler-not "lqarx" } } */
+/* { dg-final { scan-assembler-not "stbcx" } } */
+/* { dg-final { scan-assembler-not "sthcx" } } */
+/* { dg-final { scan-assembler-times "stwcx" 18 } } */
+/* { dg-final { scan-assembler-times "stdcx" 6 } } */
+/* { dg-final { scan-assembler-not "stqcx" } } */
+/* { dg-final { scan-assembler-times "bl __atomic" 6 } } */
+/* { dg-final { scan-assembler-times "isync" 12 } } */
+/* { dg-final { scan-assembler-times "lwsync" 8 } } */
+/* { dg-final { scan-assembler-not "mtvsrd" } } */
+/* { dg-final { scan-assembler-not "mtvsrwa" } } */
+/* { dg-final { scan-assembler-not "mtvsrwz" } } */
+/* { dg-final { scan-assembler-not "mfvsrd" } } */
+/* { dg-final { scan-assembler-not "mfvsrwz" } } */
+
+/* Test for the byte atomic operations on power7 (lwarx with masking).  */
+char
+char_fetch_add_relaxed (char *ptr, int value)
+{
+  return __atomic_fetch_add (ptr, value, __ATOMIC_RELAXED);
+}
+
+char
+char_fetch_sub_consume (char *ptr, int value)
+{
+  return __atomic_fetch_sub (ptr, value, __ATOMIC_CONSUME);
+}
+
+char
+char_fetch_and_acquire (char *ptr, int value)
+{
+  return __atomic_fetch_and (ptr, value, __ATOMIC_ACQUIRE);
+}
+
+char
+char_fetch_ior_release (char *ptr, int value)
+{
+  return __atomic_fetch_or (ptr, value, __ATOMIC_RELEASE);
+}
+
+char
+char_fetch_xor_acq_rel (char *ptr, int value)
+{
+  return __atomic_fetch_xor (ptr, value, __ATOMIC_ACQ_REL);
+}
+
+char
+char_fetch_nand_seq_cst (char *ptr, int value)
+{
+  return __atomic_fetch_nand (ptr, value, __ATOMIC_SEQ_CST);
+}
+
+/* Test for the half word atomic operations on power7 (lwarx with masking).  */
+short
+short_fetch_add_relaxed (short *ptr, int value)
+{
+  return __atomic_fetch_add (ptr, value, __ATOMIC_RELAXED);
+}
+
+short
+short_fetch_sub_consume (short *ptr, int value)
+{
+  return __atomic_fetch_sub (ptr, value, __ATOMIC_CONSUME);
+}
+
+short
+short_fetch_and_acquire (short *ptr, int value)
+{
+  return __atomic_fetch_and (ptr, value, __ATOMIC_ACQUIRE);
+}
+
+short
+short_fetch_ior_release (short *ptr, int value)
+{
+  return __atomic_fetch_or (ptr, value, __ATOMIC_RELEASE);
+}
+
+short
+short_fetch_xor_acq_rel (short *ptr, int value)
+{
+  return __atomic_fetch_xor (ptr, value, __ATOMIC_ACQ_REL);
+}
+
+short
+short_fetch_nand_seq_cst (short *ptr, int value)
+{
+  return __atomic_fetch_nand (ptr, value, __ATOMIC_SEQ_CST);
+}
+
+/* Test for the word atomic operations on power7 using lwarx/stwcx.  */
+int
+int_fetch_add_relaxed (int *ptr, int value)
+{
+  return __atomic_fetch_add (ptr, value, __ATOMIC_RELAXED);
+}
+
+int
+int_fetch_sub_consume (int *ptr, int value)
+{
+  return __atomic_fetch_sub (ptr, value, __ATOMIC_CONSUME);
+}
+
+int
+int_fetch_and_acquire (int *ptr, int value)
+{
+  return __atomic_fetch_and (ptr, value, __ATOMIC_ACQUIRE);
+}
+
+int
+int_fetch_ior_release (int *ptr, int value)
+{
+  return __atomic_fetch_or (ptr, value, __ATOMIC_RELEASE);
+}
+
+int
+int_fetch_xor_acq_rel (int *ptr, int value)
+{
+  return __atomic_fetch_xor (ptr, value, __ATOMIC_ACQ_REL);
+}
+
+int
+int_fetch_nand_seq_cst (int *ptr, int value)
+{
+  return __atomic_fetch_nand (ptr, value, __ATOMIC_SEQ_CST);
+}
+
+/* Test for the double word atomic operations on power7 using ldarx/stdcx.  */
+long
+long_fetch_add_relaxed (long *ptr, long value)
+{
+  return __atomic_fetch_add (ptr, value, __ATOMIC_RELAXED);
+}
+
+long
+long_fetch_sub_consume (long *ptr, long value)
+{
+  return __atomic_fetch_sub (ptr, value, __ATOMIC_CONSUME);
+}
+
+long
+long_fetch_and_acquire (long *ptr, long value)
+{
+  return __atomic_fetch_and (ptr, value, __ATOMIC_ACQUIRE);
+}
+
+long
+long_fetch_ior_release (long *ptr, long value)
+{
+  return __atomic_fetch_or (ptr, value, __ATOMIC_RELEASE);
+}
+
+long
+long_fetch_xor_acq_rel (long *ptr, long value)
+{
+  return __atomic_fetch_xor (ptr, value, __ATOMIC_ACQ_REL);
+}
+
+long
+long_fetch_nand_seq_cst (long *ptr, long value)
+{
+  return __atomic_fetch_nand (ptr, value, __ATOMIC_SEQ_CST);
+}
+
+/* Test for the quad word atomic operations on power7 (calls __atomic).  */
+__int128_t
+quad_fetch_add_relaxed (__int128_t *ptr, __int128_t value)
+{
+  return __atomic_fetch_add (ptr, value, __ATOMIC_RELAXED);
+}
+
+__int128_t
+quad_fetch_sub_consume (__int128_t *ptr, __int128_t value)
+{
+  return __atomic_fetch_sub (ptr, value, __ATOMIC_CONSUME);
+}
+
+__int128_t
+quad_fetch_and_acquire (__int128_t *ptr, __int128_t value)
+{
+  return __atomic_fetch_and (ptr, value, __ATOMIC_ACQUIRE);
+}
+
+__int128_t
+quad_fetch_ior_release (__int128_t *ptr, __int128_t value)
+{
+  return __atomic_fetch_or (ptr, value, __ATOMIC_RELEASE);
+}
+
+__int128_t
+quad_fetch_xor_acq_rel (__int128_t *ptr, __int128_t value)
+{
+  return __atomic_fetch_xor (ptr, value, __ATOMIC_ACQ_REL);
+}
+
+__int128_t
+quad_fetch_nand_seq_cst (__int128_t *ptr, __int128_t value)
+{
+  return __atomic_fetch_nand (ptr, value, __ATOMIC_SEQ_CST);
+}
Index: gcc/testsuite/gcc.target/powerpc/atomic-p8.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/atomic-p8.c	(revision 0)
+++ gcc/testsuite/gcc.target/powerpc/atomic-p8.c	(revision 0)
@@ -0,0 +1,237 @@
+/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-mcpu=power8 -O2" } */
+/* { dg-final { scan-assembler-times "lbarx" 7 } } */
+/* { dg-final { scan-assembler-times "lharx" 7 } } */
+/* { dg-final { scan-assembler-times "lwarx" 7 } } */
+/* { dg-final { scan-assembler-times "ldarx" 7 } } */
+/* { dg-final { scan-assembler-times "lqarx" 7 } } */
+/* { dg-final { scan-assembler-times "stbcx" 7 } } */
+/* { dg-final { scan-assembler-times "sthcx" 7 } } */
+/* { dg-final { scan-assembler-times "stwcx" 7 } } */
+/* { dg-final { scan-assembler-times "stdcx" 7 } } */
+/* { dg-final { scan-assembler-times "stqcx" 7 } } */
+/* { dg-final { scan-assembler-not "bl __atomic" } } */
+/* { dg-final { scan-assembler-times "isync" 20 } } */
+/* { dg-final { scan-assembler-times "lwsync" 10 } } */
+/* { dg-final { scan-assembler-not "mtvsrd" } } */
+/* { dg-final { scan-assembler-not "mtvsrwa" } } */
+/* { dg-final { scan-assembler-not "mtvsrwz" } } */
+/* { dg-final { scan-assembler-not "mfvsrd" } } */
+/* { dg-final { scan-assembler-not "mfvsrwz" } } */
+
+/* Test for the byte atomic operations on power8 using lbarx/stbcx.  */
+char
+char_fetch_add_relaxed (char *ptr, int value)
+{
+  return __atomic_fetch_add (ptr, value, __ATOMIC_RELAXED);
+}
+
+char
+char_fetch_sub_consume (char *ptr, int value)
+{
+  return __atomic_fetch_sub (ptr, value, __ATOMIC_CONSUME);
+}
+
+char
+char_fetch_and_acquire (char *ptr, int value)
+{
+  return __atomic_fetch_and (ptr, value, __ATOMIC_ACQUIRE);
+}
+
+char
+char_fetch_ior_release (char *ptr, int value)
+{
+  return __atomic_fetch_or (ptr, value, __ATOMIC_RELEASE);
+}
+
+char
+char_fetch_xor_acq_rel (char *ptr, int value)
+{
+  return __atomic_fetch_xor (ptr, value, __ATOMIC_ACQ_REL);
+}
+
+char
+char_fetch_nand_seq_cst (char *ptr, int value)
+{
+  return __atomic_fetch_nand (ptr, value, __ATOMIC_SEQ_CST);
+}
+
+void
+char_val_compare_and_swap (char *p, int i, int j, char *q)
+{
+  *q = __sync_val_compare_and_swap (p, i, j);
+}
+
+/* Test for the half word atomic operations on power8 using lharx/sthcx.  */
+short
+short_fetch_add_relaxed (short *ptr, int value)
+{
+  return __atomic_fetch_add (ptr, value, __ATOMIC_RELAXED);
+}
+
+short
+short_fetch_sub_consume (short *ptr, int value)
+{
+  return __atomic_fetch_sub (ptr, value, __ATOMIC_CONSUME);
+}
+
+short
+short_fetch_and_acquire (short *ptr, int value)
+{
+  return __atomic_fetch_and (ptr, value, __ATOMIC_ACQUIRE);
+}
+
+short
+short_fetch_ior_release (short *ptr, int value)
+{
+  return __atomic_fetch_or (ptr, value, __ATOMIC_RELEASE);
+}
+
+short
+short_fetch_xor_acq_rel (short *ptr, int value)
+{
+  return __atomic_fetch_xor (ptr, value, __ATOMIC_ACQ_REL);
+}
+
+short
+short_fetch_nand_seq_cst (short *ptr, int value)
+{
+  return __atomic_fetch_nand (ptr, value, __ATOMIC_SEQ_CST);
+}
+
+void
+short_val_compare_and_swap (short *p, int i, int j, short *q)
+{
+  *q = __sync_val_compare_and_swap (p, i, j);
+}
+
+/* Test for the word atomic operations on power8 using lwarx/stwcx.  */
+int
+int_fetch_add_relaxed (int *ptr, int value)
+{
+  return __atomic_fetch_add (ptr, value, __ATOMIC_RELAXED);
+}
+
+int
+int_fetch_sub_consume (int *ptr, int value)
+{
+  return __atomic_fetch_sub (ptr, value, __ATOMIC_CONSUME);
+}
+
+int
+int_fetch_and_acquire (int *ptr, int value)
+{
+  return __atomic_fetch_and (ptr, value, __ATOMIC_ACQUIRE);
+}
+
+int
+int_fetch_ior_release (int *ptr, int value)
+{
+  return __atomic_fetch_or (ptr, value, __ATOMIC_RELEASE);
+}
+
+int
+int_fetch_xor_acq_rel (int *ptr, int value)
+{
+  return __atomic_fetch_xor (ptr, value, __ATOMIC_ACQ_REL);
+}
+
+int
+int_fetch_nand_seq_cst (int *ptr, int value)
+{
+  return __atomic_fetch_nand (ptr, value, __ATOMIC_SEQ_CST);
+}
+
+void
+int_val_compare_and_swap (int *p, int i, int j, int *q)
+{
+  *q = __sync_val_compare_and_swap (p, i, j);
+}
+
+/* Test for the double word atomic operations on power8 using ldarx/stdcx.  */
+long
+long_fetch_add_relaxed (long *ptr, long value)
+{
+  return __atomic_fetch_add (ptr, value, __ATOMIC_RELAXED);
+}
+
+long
+long_fetch_sub_consume (long *ptr, long value)
+{
+  return __atomic_fetch_sub (ptr, value, __ATOMIC_CONSUME);
+}
+
+long
+long_fetch_and_acquire (long *ptr, long value)
+{
+  return __atomic_fetch_and (ptr, value, __ATOMIC_ACQUIRE);
+}
+
+long
+long_fetch_ior_release (long *ptr, long value)
+{
+  return __atomic_fetch_or (ptr, value, __ATOMIC_RELEASE);
+}
+
+long
+long_fetch_xor_acq_rel (long *ptr, long value)
+{
+  return __atomic_fetch_xor (ptr, value, __ATOMIC_ACQ_REL);
+}
+
+long
+long_fetch_nand_seq_cst (long *ptr, long value)
+{
+  return __atomic_fetch_nand (ptr, value, __ATOMIC_SEQ_CST);
+}
+
+void
+long_val_compare_and_swap (long *p, long i, long j, long *q)
+{
+  *q = __sync_val_compare_and_swap (p, i, j);
+}
+
+/* Test for the quad word atomic operations on power8 using lqarx/stqcx.  */
+__int128_t
+quad_fetch_add_relaxed (__int128_t *ptr, __int128_t value)
+{
+  return __atomic_fetch_add (ptr, value, __ATOMIC_RELAXED);
+}
+
+__int128_t
+quad_fetch_sub_consume (__int128_t *ptr, __int128_t value)
+{
+  return __atomic_fetch_sub (ptr, value, __ATOMIC_CONSUME);
+}
+
+__int128_t
+quad_fetch_and_acquire (__int128_t *ptr, __int128_t value)
+{
+  return __atomic_fetch_and (ptr, value, __ATOMIC_ACQUIRE);
+}
+
+__int128_t
+quad_fetch_ior_release (__int128_t *ptr, __int128_t value)
+{
+  return __atomic_fetch_or (ptr, value, __ATOMIC_RELEASE);
+}
+
+__int128_t
+quad_fetch_xor_acq_rel (__int128_t *ptr, __int128_t value)
+{
+  return __atomic_fetch_xor (ptr, value, __ATOMIC_ACQ_REL);
+}
+
+__int128_t
+quad_fetch_nand_seq_cst (__int128_t *ptr, __int128_t value)
+{
+  return __atomic_fetch_nand (ptr, value, __ATOMIC_SEQ_CST);
+}
+
+void
+quad_val_compare_and_swap (__int128_t *p, __int128_t i, __int128_t j, __int128_t *q)
+{
+  *q = __sync_val_compare_and_swap (p, i, j);
+}

