[PATCH] AArch64 [RFC] Force complicated constant to memory when beneficial
Richard Sandiford
richard.sandiford@arm.com
Fri Oct 8 16:12:19 GMT 2021
Catching up on backlog, sorry for the very late response:
Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> Consider the following case
>
> #include <arm_neon.h>
>
> uint64_t
> test4 (uint8x16_t input)
> {
>   uint8x16_t bool_input = vshrq_n_u8(input, 7);
>   poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
>   poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
>                              vgetq_lane_p64(mask, 0));
>   poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
>   uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
>   return vget_lane_u16((uint16x4_t)res, 3);
> }
>
> which generates (after my CSE patches):
>
> test4:
>         ushr    v0.16b, v0.16b, 7
>         mov     x0, 16512
>         movk    x0, 0x1020, lsl 16
>         movk    x0, 0x408, lsl 32
>         movk    x0, 0x102, lsl 48
>         fmov    d1, x0
>         pmull   v2.1q, v0.1d, v1.1d
>         dup     v1.2d, v1.d[0]
>         pmull2  v0.1q, v0.2d, v1.2d
>         trn2    v2.8b, v2.8b, v0.8b
>         umov    w0, v2.h[3]
>         ret
>
> which is suboptimal since the constant is never needed on the genreg side and
> should have been materialized on the SIMD side, given that the constant is so
> big that it otherwise requires 5 instructions to create: 4 mov/movk and one fmov.
>
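> For comparison, the hope would be to get something closer to the following
> instead (illustrative only, assuming the constant gets placed in a literal
> pool as .LC0):
>
>         adrp    x0, .LC0
>         ldr     d1, [x0, #:lo12:.LC0]
>
> i.e. two instructions on the SIMD side rather than five via the genreg side.
>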
> The problem is that the choice of which side to materialize the constant on
> can only be made during reload.  Forcing the constant to memory may need an
> extra register (to hold the address), so it can't be done after reload.
>
> I have tried to support this with a pattern during reload, but the problem is
> that I can't seem to find a way to tell reload that it should spill a constant
> under condition x.  Instead I tried a split which reload selects when the
> condition holds.
If this is still an issue, one thing to try would be to put a "$" before
the "r" in the GPR alternative. If that doesn't work then yeah,
I think we're out of luck describing this directly. If "$" does work,
it'd be interesting to see whether "^" does too.
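
Concretely, that would mean changing the "r<-n" alternative of *movdi_aarch64
along the lines of (completely untested):

  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,$r,r, r,w, m,m, r, r, w,r,w, w")
	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M, n,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]

"$" disparages an alternative severely, but only when the operand concerned
needs a reload, so the hope is that the RA would then prefer the w/memory
alternatives for values that live on the SIMD side.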
Thanks,
Richard
>
> This has a couple of issues:
>
> 1. The pattern can be expanded late (could be fixed with !reload_completed).
> 2. Because it's split so late, we don't seem to be able to share the anchors
> for the ADRP.
> 3. Because it's split so late, reload doesn't know about the spill, and so the
> ADD lo12 isn't pushed into the addressing mode of the LDR (see the example
> after this list).
>
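> To make issue 3 concrete, the late split currently ends up with something
> along the lines of (illustrative only):
>
>         adrp    x1, .LC0
>         add     x1, x1, :lo12:.LC0
>         ldr     d1, [x1]
>
> whereas if reload did the spill itself, the lo12 offset could be pushed into
> the addressing mode of the load:
>
>         adrp    x1, .LC0
>         ldr     d1, [x1, #:lo12:.LC0]
>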
> I don't know how to fix these properly, since I think the only way is for
> reload to do the spill itself; but in this case not having the pattern makes
> it avoid the mem alternative and pick r <- n instead, followed by r -> w.
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> * config/aarch64/aarch64.md (*movdi_aarch64): Add Dx -> w alternative.
> * config/aarch64/constraints.md (Dx): New.
>
> --- inline copy of patch --
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index eb8ccd4b97bbd4f0c3ff5791e48cfcfb42ec6c2e..a18886cb65c86daa16baa1691b1718f2d3a1be6c 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1298,8 +1298,8 @@ (define_insn_and_split "*movsi_aarch64"
> )
>
> (define_insn_and_split "*movdi_aarch64"
> - [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r, r,w, m,m, r, r, w,r,w, w")
> - (match_operand:DI 1 "aarch64_mov_operand" " r,r,k,N,M,n,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
> + [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,w ,r ,r,w, m,m, r, r, w,r,w,w")
> + (match_operand:DI 1 "aarch64_mov_operand" " r,r,k,N,M,n,Dx,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
> "(register_operand (operands[0], DImode)
> || aarch64_reg_or_zero (operands[1], DImode))"
> "@
> @@ -1309,6 +1309,7 @@ (define_insn_and_split "*movdi_aarch64"
> mov\\t%x0, %1
> mov\\t%w0, %1
> #
> + #
> * return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]);
> ldr\\t%x0, %1
> ldr\\t%d0, %1
> @@ -1321,17 +1322,27 @@ (define_insn_and_split "*movdi_aarch64"
> fmov\\t%d0, %d1
> * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);"
> "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode))
> - && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
> + && REG_P (operands[0])
> + && (GP_REGNUM_P (REGNO (operands[0]))
> + || (can_create_pseudo_p ()
> + && !aarch64_can_const_movi_rtx_p (operands[1], DImode)))"
> [(const_int 0)]
> "{
> - aarch64_expand_mov_immediate (operands[0], operands[1]);
> + if (GP_REGNUM_P (REGNO (operands[0])))
> + aarch64_expand_mov_immediate (operands[0], operands[1]);
> + else
> + {
> + rtx mem = force_const_mem (DImode, operands[1]);
> + gcc_assert (mem);
> + emit_move_insn (operands[0], mem);
> + }
> DONE;
> }"
> ;; The "mov_imm" type for CNTD is just a placeholder.
> - [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,
> + [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,mov_imm,
> load_8,load_8,store_8,store_8,adr,adr,f_mcr,f_mrc,fmov,
> neon_move")
> - (set_attr "arch" "*,*,*,*,*,*,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
> + (set_attr "arch" "*,*,*,*,*,*,simd,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
> )
>
> (define_insn "insv_imm<mode>"
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index 3b49b452119c49320020fa9183314d9a25b92491..422d95b50a8e9608b57f0f39745c89d58ea1e8a4 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -474,6 +474,14 @@ (define_address_constraint "Dp"
> An address valid for a prefetch instruction."
> (match_test "aarch64_address_valid_for_prefetch_p (op, true)"))
>
> +(define_constraint "Dx"
> + "@internal
> + A constraint that matches an integer immediate operand not valid\
> + for AdvSIMD scalar operations in DImode."
> + (and (match_code "const_int")
> + (match_test "!aarch64_can_const_movi_rtx_p (op, DImode)")
> + (match_test "!aarch64_move_imm (INTVAL (op), DImode)")))
> +
> (define_constraint "vgb"
> "@internal
> A constraint that matches an immediate offset valid for SVE LD1B