This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [AArch64, PATCH] Improve Neon store of zero
- From: "Richard Earnshaw (lists)" <Richard dot Earnshaw at arm dot com>
- To: Jackson Woodruff <jackson dot woodruff at foss dot arm dot com>, gcc-patches at gcc dot gnu dot org
- Cc: James Greenhalgh <james dot greenhalgh at arm dot com>
- Date: Fri, 11 Aug 2017 14:56:50 +0100
- Subject: Re: [AArch64, PATCH] Improve Neon store of zero
- Authentication-results: sourceware.org; auth=none
- References: <fc20ca5e-edf7-f092-f3e4-d4a61082485a@foss.arm.com>
On 10/08/17 14:12, Jackson Woodruff wrote:
> Hi all,
>
> This patch changes patterns in aarch64-simd.md to replace
>
> movi v0.4s, 0
> str q0, [x0, 16]
>
> With:
>
> stp xzr, xzr, [x0, 16]
>
> When we are storing zeros to vectors like this:
>
> void f(uint32x4_t *p) {
> uint32x4_t x = { 0, 0, 0, 0};
> p[1] = x;
> }
>
> Bootstrapped and regtested on aarch64 with no regressions.
> OK for trunk?
>
> Jackson
>
> gcc/
>
> 2017-08-09 Jackson Woodruff <jackson.woodruff@arm.com>
>
> * aarch64-simd.md (mov<mode>): No longer force zero
> immediate into register.
> (*aarch64_simd_mov<mode>): Add new case for stp
> using zero immediate.
>
>
> gcc/testsuite
>
> 2017-08-09 Jackson Woodruff <jackson.woodruff@arm.com>
>
> * gcc.target/aarch64/simd/neon_str_zero.c: New.
>
>
> patchfile
>
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 74de9b8c89dd5e4e3d87504594c969de0e0128ce..0149a742d34ae4fd5b3fd705b03c845f94aa1d59 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -23,7 +23,10 @@
> (match_operand:VALL_F16 1 "general_operand" ""))]
> "TARGET_SIMD"
> "
> - if (GET_CODE (operands[0]) == MEM)
> + if (GET_CODE (operands[0]) == MEM
> + && !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
> + && aarch64_legitimate_address_p (<MODE>mode, operands[0],
> + PARALLEL, 1)))
> operands[1] = force_reg (<MODE>mode, operands[1]);
> "
> )
> @@ -94,63 +97,70 @@
>
> (define_insn "*aarch64_simd_mov<mode>"
> [(set (match_operand:VD 0 "nonimmediate_operand"
> - "=w, m, w, ?r, ?w, ?r, w")
> + "=w, m, m, w, ?r, ?w, ?r, w")
> (match_operand:VD 1 "general_operand"
> - "m, w, w, w, r, r, Dn"))]
> + "m, Dz, w, w, w, r, r, Dn"))]
> "TARGET_SIMD
> - && (register_operand (operands[0], <MODE>mode)
> - || register_operand (operands[1], <MODE>mode))"
> + && ((register_operand (operands[0], <MODE>mode)
> + || register_operand (operands[1], <MODE>mode))
> + || (memory_operand (operands[0], <MODE>mode)
> + && immediate_operand (operands[1], <MODE>mode)))"
Allowing any immediate here seems too lax: the condition accepts every
immediate value, not just zero, which could then cause reload operations
to be inserted (and that in turn might make register pressure calculations
incorrect). Wouldn't it be better to use something like
aarch64_simd_reg_or_zero? The same applies to the VQ pattern below.
R.
> {
> switch (which_alternative)
> {
> case 0: return "ldr\\t%d0, %1";
> - case 1: return "str\\t%d1, %0";
> - case 2: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
> - case 3: return "umov\t%0, %1.d[0]";
> - case 4: return "fmov\t%d0, %1";
> - case 5: return "mov\t%0, %1";
> - case 6:
> + case 1: return "str\\txzr, %0";
> + case 2: return "str\\t%d1, %0";
> + case 3: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
> + case 4: return "umov\t%0, %1.d[0]";
> + case 5: return "fmov\t%d0, %1";
> + case 6: return "mov\t%0, %1";
> + case 7:
> return aarch64_output_simd_mov_immediate (operands[1],
> <MODE>mode, 64);
> default: gcc_unreachable ();
> }
> }
> - [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
> + [(set_attr "type" "neon_load1_1reg<q>, neon_stp, neon_store1_1reg<q>,\
> neon_logic<q>, neon_to_gp<q>, f_mcr,\
> mov_reg, neon_move<q>")]
> )
>
> (define_insn "*aarch64_simd_mov<mode>"
> [(set (match_operand:VQ 0 "nonimmediate_operand"
> - "=w, m, w, ?r, ?w, ?r, w")
> + "=w, Ump, m, w, ?r, ?w, ?r, w")
> (match_operand:VQ 1 "general_operand"
> - "m, w, w, w, r, r, Dn"))]
> + "m, Dz, w, w, w, r, r, Dn"))]
> "TARGET_SIMD
> - && (register_operand (operands[0], <MODE>mode)
> - || register_operand (operands[1], <MODE>mode))"
> + && ((register_operand (operands[0], <MODE>mode)
> + || register_operand (operands[1], <MODE>mode))
> + || (memory_operand (operands[0], <MODE>mode)
> + && immediate_operand (operands[1], <MODE>mode)))"
> {
> switch (which_alternative)
> {
> case 0:
> return "ldr\\t%q0, %1";
> case 1:
> - return "str\\t%q1, %0";
> + return "stp\\txzr, xzr, %0";
> case 2:
> - return "mov\t%0.<Vbtype>, %1.<Vbtype>";
> + return "str\\t%q1, %0";
> case 3:
> + return "mov\t%0.<Vbtype>, %1.<Vbtype>";
> case 4:
> case 5:
> - return "#";
> case 6:
> + return "#";
> + case 7:
> return aarch64_output_simd_mov_immediate (operands[1], <MODE>mode, 128);
> default:
> gcc_unreachable ();
> }
> }
> [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
> - neon_logic<q>, multiple, multiple, multiple,\
> - neon_move<q>")
> - (set_attr "length" "4,4,4,8,8,8,4")]
> + neon_stp, neon_logic<q>, multiple, multiple,\
> + multiple, neon_move<q>")
> + (set_attr "length" "4,4,4,4,8,8,8,4")]
> )
>
> ;; When storing lane zero we can use the normal STR and its more permissive
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..07198de109432b530745cc540790303ae0245efb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1" } */
> +
> +#include <arm_neon.h>
> +
> +void
> +f (uint32x4_t *p)
> +{
> + uint32x4_t x = { 0, 0, 0, 0};
> + p[1] = x;
> +
> + /* { dg-final { scan-assembler "stp\txzr, xzr," } } */
> +}
> +
> +void
> +g (float32x2_t *p)
> +{
> + float32x2_t x = {0.0, 0.0};
> + p[0] = x;
> +
> + /* { dg-final { scan-assembler "str\txzr, " } } */
> +}
>