This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH][GCC][AARCH64] Use STLUR for atomic_store


On 02/08/18 17:26, matthew.malcomson@arm.com wrote:
> Use the STLUR instruction introduced in Armv8.4-a.
> This insruction has the store-release semantic like STLR but can take a
> 9-bit unscaled signed immediate offset.
> 
> Example test case:
> ```
> void
> foo ()
> {
>     int32_t *atomic_vals = calloc (4, sizeof (int32_t));
>     atomic_store_explicit (atomic_vals + 1, 2, memory_order_release);
> }
> ```
> 
> Before patch generates
> ```
> foo:
> 	stp	x29, x30, [sp, -16]!
> 	mov	x1, 4
> 	mov	x0, x1
> 	mov	x29, sp
> 	bl	calloc
> 	mov	w1, 2
> 	add	x0, x0, 4
> 	stlr	w1, [x0]
> 	ldp	x29, x30, [sp], 16
> 	ret
> ```
> 
> After patch generates
> ```
> foo:
> 	stp	x29, x30, [sp, -16]!
> 	mov	x1, 4
> 	mov	x0, x1
> 	mov	x29, sp
> 	bl	calloc
> 	mov	w1, 2
> 	stlur	w1, [x0, 4]
> 	ldp	x29, x30, [sp], 16
> 	ret
> ```
> 
> Full bootstrap and regression test done on aarch64.
> 
> Ok for trunk?
> 
> gcc/
> 2018-07-26  Matthew Malcomson  <matthew.malcomson@arm.com>
> 
>         * config/aarch64/aarch64-protos.h
>         (aarch64_offset_9bit_signed_unscaled_p): New declaration.
>         * config/aarch64/aarch64.c
>         (aarch64_offset_9bit_signed_unscaled_p): Rename from
>         offset_9bit_signed_unscaled_p.
>         * config/aarch64/aarch64.h (TARGET_ARMV8_4): Add feature macro.
>         * config/aarch64/atomics.md (atomic_store<mode>): Allow offset
>         and use stlur.
>         * config/aarch64/constraints.md (Ust): New constraint.
>         * config/aarch64/predicates.md.
>         (aarch64_sync_or_stlur_memory_operand): New predicate.

Thinking further ahead, there were several new instructions added that
have the same basic form; not all of them are STLUR.  At some point I
expect we will want to add support for those into GCC.  So I think you
should consider a more generic name, perhaps
aarch64_sync_offset_memory_operand?

R.

> 
> gcc/testsuite/
> 2018-07-26  Matthew Malcomson  <matthew.malcomson@arm.com>
> 
> 	* gcc.target/aarch64/atomic-store.c: New.
> 
> 
> use-stlur-instruction.patch
> 
> 
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index af5db9c595385f7586692258f750b6aceb3ed9c8..630a75bf776fcdc374aa9ffa4bb020fea3719320 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -393,6 +393,7 @@ void aarch64_split_add_offset (scalar_int_mode, rtx, rtx, rtx, rtx, rtx);
>  bool aarch64_mov_operand_p (rtx, machine_mode);
>  rtx aarch64_reverse_mask (machine_mode, unsigned int);
>  bool aarch64_offset_7bit_signed_scaled_p (machine_mode, poly_int64);
> +bool aarch64_offset_9bit_signed_unscaled_p (machine_mode, poly_int64);
>  char *aarch64_output_sve_cnt_immediate (const char *, const char *, rtx);
>  char *aarch64_output_sve_addvl_addpl (rtx, rtx, rtx);
>  char *aarch64_output_sve_inc_dec_immediate (const char *, rtx);
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index c1218503bab19323eee1cca8b7e4bea8fbfcf573..328512e11f4230e24223bc51e55bdca8b31f6a20 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -237,6 +237,9 @@ extern unsigned aarch64_architecture_version;
>  /* ARMv8.3-A features.  */
>  #define TARGET_ARMV8_3	(AARCH64_ISA_V8_3)
>  
> +/* ARMv8.4-A features.  */
> +#define TARGET_ARMV8_4	(AARCH64_ISA_V8_4)
> +
>  /* Make sure this is always defined so we don't have to check for ifdefs
>     but rather use normal ifs.  */
>  #ifndef TARGET_FIX_ERR_A53_835769_DEFAULT
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index fa01475aa9ee579b6a3b2526295b622157120660..8560150d552b4f830335ccd420a0749f01d4bb6a 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -4546,8 +4546,8 @@ aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
>  
>  /* Return true if OFFSET is a signed 9-bit value.  */
>  
> -static inline bool
> -offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
> +bool
> +aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
>  			       poly_int64 offset)
>  {
>    HOST_WIDE_INT const_offset;
> @@ -5823,7 +5823,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
>  	     instruction memory accesses.  */
>  	  if (mode == TImode || mode == TFmode)
>  	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
> -		    && (offset_9bit_signed_unscaled_p (mode, offset)
> +		    && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
>  			|| offset_12bit_unsigned_scaled_p (mode, offset)));
>  
>  	  /* A 7bit offset check because OImode will emit a ldp/stp
> @@ -5837,7 +5837,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
>  	     ldr/str instructions (only big endian will get here).  */
>  	  if (mode == CImode)
>  	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
> -		    && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
> +		    && (aarch64_offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
>  			|| offset_12bit_unsigned_scaled_p (V16QImode,
>  							   offset + 32)));
>  
> @@ -5877,7 +5877,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
>  		     || known_eq (GET_MODE_SIZE (mode), 16))
>  		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
>  	  else
> -	    return (offset_9bit_signed_unscaled_p (mode, offset)
> +	    return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
>  		    || offset_12bit_unsigned_scaled_p (mode, offset));
>  	}
>  
> @@ -5930,7 +5930,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
>  	   */
>  	  if (mode == TImode || mode == TFmode)
>  	    return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
> -		    && offset_9bit_signed_unscaled_p (mode, offset));
> +		    && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
>  
>  	  if (load_store_pair_p)
>  	    return ((known_eq (GET_MODE_SIZE (mode), 4)
> @@ -5938,7 +5938,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
>  		     || known_eq (GET_MODE_SIZE (mode), 16))
>  		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
>  	  else
> -	    return offset_9bit_signed_unscaled_p (mode, offset);
> +	    return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
>  	}
>        return false;
>  
> diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
> index cd8c44e20ab0900481373193ec4b7fda86aadfe5..95d96261dcb1173088ff9debd6cb5aae3da010aa 100644
> --- a/gcc/config/aarch64/atomics.md
> +++ b/gcc/config/aarch64/atomics.md
> @@ -481,9 +481,9 @@
>  )
>  
>  (define_insn "atomic_store<mode>"
> -  [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "=Q")
> +  [(set (match_operand:ALLI 0 "aarch64_sync_or_stlur_memory_operand" "=Q,Ust")
>      (unspec_volatile:ALLI
> -      [(match_operand:ALLI 1 "general_operand" "rZ")
> +      [(match_operand:ALLI 1 "general_operand" "rZ,rZ")
>         (match_operand:SI 2 "const_int_operand")]			;; model
>        UNSPECV_STL))]
>    ""
> @@ -491,11 +491,14 @@
>      enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
>      if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire (model))
>        return "str<atomic_sfx>\t%<w>1, %0";
> -    else
> +    else if (which_alternative == 0)
>        return "stlr<atomic_sfx>\t%<w>1, %0";
> +    else
> +      return "stlur<atomic_sfx>\t%<w>1, %0";
>    }
>  )
>  
> +
>  (define_insn "aarch64_load_exclusive<mode>"
>    [(set (match_operand:SI 0 "register_operand" "=r")
>      (zero_extend:SI
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index 72cacdabdac52dcb40b480f7a5bfbf4997c742d8..2b2192d8d0a15ef7157915b2c045e953067bce63 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -218,6 +218,13 @@
>   (and (match_code "mem")
>        (match_test "REG_P (XEXP (op, 0))")))
>  
> +;; STLUR instruction constraint requires Armv8.4
> +(define_special_memory_constraint "Ust"
> + "@internal
> + A memory address suitable for use with an stlur instruction."
> +  (and (match_operand 0 "aarch64_sync_or_stlur_memory_operand")
> +       (match_test "TARGET_ARMV8_4")))
> +
>  (define_memory_constraint "Ump"
>    "@internal
>    A memory address suitable for a load/store pair operation."
> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
> index d8f377b9603e76a29dd92f95e9905121eaf7b800..1df2f914ebc87841671c0ebb097d4603e3cecd41 100644
> --- a/gcc/config/aarch64/predicates.md
> +++ b/gcc/config/aarch64/predicates.md
> @@ -359,6 +359,33 @@
>    (and (match_operand 0 "memory_operand")
>         (match_code "reg" "0")))
>  
> +;; True if the operand is memory reference valid for one of a str or stlur
> +;; operation.
> +(define_predicate "aarch64_sync_or_stlur_memory_operand"
> +  (ior (match_operand 0 "aarch64_sync_memory_operand")
> +       (and (match_operand 0 "memory_operand")
> +	    (match_code "plus" "0")
> +	    (match_code "reg" "00")
> +	    (match_code "const_int" "01")))
> +{
> +  if (aarch64_sync_memory_operand (op, mode))
> +    return true;
> +
> +  if (!TARGET_ARMV8_4)
> +    return false;
> +
> +  rtx mem_op = XEXP (op, 0);
> +  rtx plus_op0 = XEXP (mem_op, 0);
> +  rtx plus_op1 = XEXP (mem_op, 1);
> +
> +  if (GET_MODE (plus_op0) != DImode)
> +    return false;
> +
> +  poly_int64 offset;
> +  poly_int_rtx_p (plus_op1, &offset);
> +  return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
> +})
> +
>  ;; Predicates for parallel expanders based on mode.
>  (define_special_predicate "vect_par_cnst_hi_half"
>    (match_code "parallel")
> diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store.c b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..0ffc88029bbfe748cd0fc3b6ff4fdd47373b0bb5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
> @@ -0,0 +1,72 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=armv8.4-a -O2" } */
> +
> +#include <stdlib.h>
> +#include <stdatomic.h>
> +#include <stdint.h>
> +
> +#define STORE_TESTS(size) \
> +  void \
> +  foo##size () \
> +{ \
> +  int##size##_t *atomic_vals = calloc (4, sizeof (int##size##_t)); \
> +  atomic_store_explicit (atomic_vals, 2, memory_order_relaxed); \
> +  atomic_store_explicit (atomic_vals, 2, memory_order_release); \
> +  atomic_store_explicit ((atomic_vals + 1), 2, memory_order_release); \
> +  atomic_store ((atomic_vals + 2), 2); \
> +  atomic_store_explicit ((atomic_vals + 3), 2, memory_order_relaxed); \
> +}
> +
> +STORE_TESTS (8);
> +/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 1\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+, 3\\\]" 1 } } */
> +
> +STORE_TESTS (16);
> +/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "stlrh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+, 6\\\]" 1 } } */
> +
> +STORE_TESTS (32);
> +/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "stlr\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+, 12\\\]" 1 } } */
> +
> +STORE_TESTS (64);
> +/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 16\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+, 24\\\]" 1 } } */
> +
> +void
> +foo_toolarge_offset ()
> +{
> +  int64_t *atomic_vals = calloc (4, sizeof (int64_t));
> +  /* 9bit signed unscaled immediate => largest representable value +255.
> +     32 * 8 == 256  */
> +  atomic_store_explicit (atomic_vals + 32, 2, memory_order_release);
> +}
> +
> +void
> +foo_negative (int8_t *atomic_vals)
> +{
> +  atomic_store_explicit (atomic_vals - 2, 2, memory_order_release);
> +}
> +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, -2\\\]" 1 } } */
> +
> +#pragma GCC target ("arch=armv8.3-a")
> +void
> +foo_older_arch ()
> +{
> +  int64_t *atomic_vals = calloc (4, sizeof (int64_t));
> +  atomic_store_explicit (atomic_vals + 2, 2, memory_order_release);
> +}
> +
> +/* Three times, one for each of the three above functions.  */
> +/* { dg-final { scan-assembler-times "stlr\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 3 } } */
> 


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]