[PATCH V2] aarch64: Model zero-high-half semantics of ADDHN/SUBHN instructions

Wed Jun 16 12:33:08 GMT 2021

Jonathan Wright <Jonathan.Wright@arm.com> writes:
> Hi,
>
> Version 2 of this patch adds tests to verify the benefit of this change.
>
> Ok for master?
>
> Thanks,
> Jonathan
>
> ---
>
> gcc/ChangeLog:
>
> 2021-06-14  Jonathan Wright  <jonathan.wright@arm.com>
>
>         * config/aarch64/aarch64-simd.md (aarch64_<sur><addsub>hn<mode>):
>         Change to an expander that emits the correct instruction
>         depending on endianness.
>         (aarch64_<sur><addsub>hn<mode>_insn_le): Define.
>         (aarch64_<sur><addsub>hn<mode>_insn_be): Define.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/narrow_zero_high_half.c: Add new tests.

OK, thanks.

> From: Gcc-patches <gcc-patches-bounces+jonathan.wright=arm.com@gcc.gnu.org> on behalf of Jonathan Wright via Gcc-patches <gcc-patches@gcc.gnu.org>
> Sent: 15 June 2021 11:02
> To: gcc-patches@gcc.gnu.org <gcc-patches@gcc.gnu.org>
> Subject: [PATCH] aarch64: Model zero-high-half semantics of ADDHN/SUBHN instructions
>
> Hi,
>
> As the subject says, this patch models the zero-high-half semantics of
> the narrowing arithmetic Neon instructions in the
> aarch64_<sur><addsub>hn<mode> RTL pattern. Modeling these
> semantics allows for better RTL combinations and also removes
> some register allocation issues, because the compiler now knows that
> the operation is totally destructive (it writes the whole destination
> register: the narrowed result in one half and zeros in the other).
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu - no
> issues.
>
> Ok for master?
>
> Thanks,
> Jonathan
>
> ---
>
> gcc/ChangeLog:
>
> 2021-06-14  Jonathan Wright  <jonathan.wright@arm.com>
>
>         * config/aarch64/aarch64-simd.md (aarch64_<sur><addsub>hn<mode>):
>         Change to an expander that emits the correct instruction
>         depending on endianness.
>         (aarch64_<sur><addsub>hn<mode>_insn_le): Define.
>         (aarch64_<sur><addsub>hn<mode>_insn_be): Define.
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 2b75e57eb77a0dea449f2c13bd77a88f48c4cea5..540244cf0a919b3ea1d6ebf5929be50fed395179 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -4661,16 +4661,53 @@
>  
>  ;; <r><addsub>hn<q>.
>  
> -(define_insn "aarch64_<sur><addsub>hn<mode>"
> -  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
> -        (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")
> -			    (match_operand:VQN 2 "register_operand" "w")]
> -                           ADDSUBHN))]
> -  "TARGET_SIMD"
> +(define_insn "aarch64_<sur><addsub>hn<mode>_insn_le"
> +  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
> +	(vec_concat:<VNARROWQ2>
> +	  (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")
> +			      (match_operand:VQN 2 "register_operand" "w")]
> +			     ADDSUBHN)
> +	  (match_operand:<VNARROWQ> 3 "aarch64_simd_or_scalar_imm_zero")))]
> +  "TARGET_SIMD && !BYTES_BIG_ENDIAN"
> +  "<sur><addsub>hn\\t%0.<Vntype>, %1.<Vtype>, %2.<Vtype>"
> +  [(set_attr "type" "neon_<addsub>_halve_narrow_q")]
> +)
> +
> +(define_insn "aarch64_<sur><addsub>hn<mode>_insn_be"
> +  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
> +	(vec_concat:<VNARROWQ2>
> +	  (match_operand:<VNARROWQ> 3 "aarch64_simd_or_scalar_imm_zero")
> +	  (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")
> +			      (match_operand:VQN 2 "register_operand" "w")]
> +			     ADDSUBHN)))]
> +  "TARGET_SIMD && BYTES_BIG_ENDIAN"
>    "<sur><addsub>hn\\t%0.<Vntype>, %1.<Vtype>, %2.<Vtype>"
>    [(set_attr "type" "neon_<addsub>_halve_narrow_q")]
>  )
>  
> +(define_expand "aarch64_<sur><addsub>hn<mode>"
> +  [(set (match_operand:<VNARROWQ> 0 "register_operand")
> +	(unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand")
> +			    (match_operand:VQN 2 "register_operand")]
> +			   ADDSUBHN))]
> +  "TARGET_SIMD"
> +  {
> +    rtx tmp = gen_reg_rtx (<VNARROWQ2>mode);
> +    if (BYTES_BIG_ENDIAN)
> +      emit_insn (gen_aarch64_<sur><addsub>hn<mode>_insn_be (tmp, operands[1],
> +				operands[2], CONST0_RTX (<VNARROWQ>mode)));
> +    else
> +      emit_insn (gen_aarch64_<sur><addsub>hn<mode>_insn_le (tmp, operands[1],
> +				operands[2], CONST0_RTX (<VNARROWQ>mode)));
> +
> +    /* The intrinsic expects a narrow result, so emit a subreg that will get
> +       optimized away as appropriate.  */
> +    emit_move_insn (operands[0], lowpart_subreg (<VNARROWQ>mode, tmp,
> +						 <VNARROWQ2>mode));
> +    DONE;
> +  }
> +)
> +
>  (define_insn "aarch64_<sur><addsub>hn2<mode>_insn_le"
>    [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
>  	(vec_concat:<VNARROWQ2>
> diff --git a/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c b/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c
> index 3061c15eb8aa6fe30a509cd46b959cf44edcdb73..97342de58bb5586a8317f1b4c92dcb9d6db83733 100644
> --- a/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c
> +++ b/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c
> @@ -74,6 +74,42 @@ TEST_UNARY (vqmovn, uint8x16_t, uint16x8_t, u16, u8)
>  TEST_UNARY (vqmovn, uint16x8_t, uint32x4_t, u32, u16)
>  TEST_UNARY (vqmovn, uint32x4_t, uint64x2_t, u64, u32)
>  
> +#define TEST_ARITH(name, rettype, intype, fs, rs) \
> +  rettype test_ ## name ## _ ## fs ## _zero_high \
> +		(intype a, intype b) \
> +	{ \
> +		return vcombine_ ## rs (name ## _ ## fs (a, b), \
> +					vdup_n_ ## rs (0)); \
> +	}
> +
> +TEST_ARITH (vaddhn, int8x16_t, int16x8_t, s16, s8)
> +TEST_ARITH (vaddhn, int16x8_t, int32x4_t, s32, s16)
> +TEST_ARITH (vaddhn, int32x4_t, int64x2_t, s64, s32)
> +TEST_ARITH (vaddhn, uint8x16_t, uint16x8_t, u16, u8)
> +TEST_ARITH (vaddhn, uint16x8_t, uint32x4_t, u32, u16)
> +TEST_ARITH (vaddhn, uint32x4_t, uint64x2_t, u64, u32)
> +
> +TEST_ARITH (vraddhn, int8x16_t, int16x8_t, s16, s8)
> +TEST_ARITH (vraddhn, int16x8_t, int32x4_t, s32, s16)
> +TEST_ARITH (vraddhn, int32x4_t, int64x2_t, s64, s32)
> +TEST_ARITH (vraddhn, uint8x16_t, uint16x8_t, u16, u8)
> +TEST_ARITH (vraddhn, uint16x8_t, uint32x4_t, u32, u16)
> +TEST_ARITH (vraddhn, uint32x4_t, uint64x2_t, u64, u32)
> +
> +TEST_ARITH (vsubhn, int8x16_t, int16x8_t, s16, s8)
> +TEST_ARITH (vsubhn, int16x8_t, int32x4_t, s32, s16)
> +TEST_ARITH (vsubhn, int32x4_t, int64x2_t, s64, s32)
> +TEST_ARITH (vsubhn, uint8x16_t, uint16x8_t, u16, u8)
> +TEST_ARITH (vsubhn, uint16x8_t, uint32x4_t, u32, u16)
> +TEST_ARITH (vsubhn, uint32x4_t, uint64x2_t, u64, u32)
> +
> +TEST_ARITH (vrsubhn, int8x16_t, int16x8_t, s16, s8)
> +TEST_ARITH (vrsubhn, int16x8_t, int32x4_t, s32, s16)
> +TEST_ARITH (vrsubhn, int32x4_t, int64x2_t, s64, s32)
> +TEST_ARITH (vrsubhn, uint8x16_t, uint16x8_t, u16, u8)
> +TEST_ARITH (vrsubhn, uint16x8_t, uint32x4_t, u32, u16)
> +TEST_ARITH (vrsubhn, uint32x4_t, uint64x2_t, u64, u32)
> +
>  /* { dg-final { scan-assembler-not "dup\\t" } } */
>  
>  /* { dg-final { scan-assembler-times "\\trshrn\\tv" 6} }  */
> @@ -88,3 +124,7 @@ TEST_UNARY (vqmovn, uint32x4_t, uint64x2_t, u64, u32)
>  /* { dg-final { scan-assembler-times "\\tsqxtun\\tv" 3} }  */
>  /* { dg-final { scan-assembler-times "\\tuqxtn\\tv" 3} }  */
>  /* { dg-final { scan-assembler-times "\\tsqxtn\\tv" 3} }  */
> +/* { dg-final { scan-assembler-times "\\taddhn\\tv" 6} }  */
> +/* { dg-final { scan-assembler-times "\\tsubhn\\tv" 6} }  */
> +/* { dg-final { scan-assembler-times "\\trsubhn\\tv" 6} }  */
> +/* { dg-final { scan-assembler-times "\\traddhn\\tv" 6} }  */