This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[AArch64] Tweak handling of fp moves via int registers
- From: Richard Sandiford <richard dot sandiford at arm dot com>
- To: gcc-patches at gcc dot gnu dot org
- Cc: richard dot earnshaw at arm dot com, james dot greenhalgh at arm dot com, marcus dot shawcroft at arm dot com
- Date: Wed, 07 Aug 2019 19:12:19 +0100
- Subject: [AArch64] Tweak handling of fp moves via int registers
The AArch64 port uses define_splits to prefer moving certain float
constants via integer registers over loading them from memory. E.g.:
(set (reg:SF X) (const_double:SF C))
splits to:
(set (reg:SI tmp) (const_int C'))
(set (reg:SF X) (subreg:SF (reg:SI tmp) 0))
The problem with using splits for this -- especially when the split
instruction is a constant move -- is that the original form is still
valid and can be recreated by later pre-RA passes. (And I think that's
a valid thing for them to do, since they're folding away what appears in
rtl terms to be a redundant instruction.)
One pass that can do this is ira's combine_and_move_insns, which among
other things looks for registers that are set once and used once.
If the register is set to a rematerialisable value, the code tries
to fold that value into the single use.
We don't normally see this effect at -O2 and above because
combine_and_move_insns isn't run when -fsched-pressure is enabled
(which it is by default on AArch64). But arguably the combine part is
useful independently of -fsched-pressure, and only the move part is
suspect. So I don't think we should rely on the combination not
happening here.
The new tests demonstrate the problem by running the original tests
at -O instead of -O2.
This patch does the optimisation by splitting the moves at generation
time and rejecting the combined form while the split is still possible.
REG_EQUAL notes on the second move still give the original floating-point
value for passes that need it.
Tested on aarch64-linux-gnu (with and without SVE) and aarch64_be-elf.
OK to install?
Richard
2019-08-07 Richard Sandiford <richard.sandiford@arm.com>
gcc/
* config/aarch64/aarch64-protos.h (aarch64_move_float_via_int_p):
Declare.
* config/aarch64/aarch64.c (aarch64_move_float_via_int_p): New
function, extracted from the GPF_HF move splitter.
* config/aarch64/aarch64.md: Remove GPF_HF move splitter.
(mov<GPF_TF_F16:mode>): Move via an integer register if
aarch64_move_float_via_int_p.
(*movhf_aarch64, *movsf_aarch64, *movdf_aarch64): Check
aarch64_move_float_via_int_p.
* config/aarch64/iterators.md (fcvt_target): Handle TI and TF.
(FCVT_TARGET): Likewise.
gcc/testsuite/
* gcc.target/aarch64/dbl_mov_immediate_2.c: New test.
* gcc.target/aarch64/f16_mov_immediate_5.c: Likewise.
* gcc.target/aarch64/flt_mov_immediate_2.c: Likewise.
Index: gcc/config/aarch64/aarch64-protos.h
===================================================================
--- gcc/config/aarch64/aarch64-protos.h 2019-07-01 09:37:06.704528805 +0100
+++ gcc/config/aarch64/aarch64-protos.h 2019-08-07 19:07:38.199739765 +0100
@@ -519,6 +519,7 @@ const char * aarch64_output_probe_stack_
const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx);
void aarch64_err_no_fpadvsimd (machine_mode);
void aarch64_expand_epilogue (bool);
+bool aarch64_move_float_via_int_p (rtx);
void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);
rtx aarch64_ptrue_reg (machine_mode);
rtx aarch64_pfalse_reg (machine_mode);
Index: gcc/config/aarch64/aarch64.c
===================================================================
--- gcc/config/aarch64/aarch64.c 2019-07-16 09:11:06.449416469 +0100
+++ gcc/config/aarch64/aarch64.c 2019-08-07 19:07:38.203739735 +0100
@@ -3278,6 +3278,22 @@ aarch64_expand_sve_const_vector (rtx des
gcc_assert (vectors[0] == dest);
}
+/* Return true if floating-point value SRC should be moved into an
+ integer register first and then moved into a floating-point register.
+ This means that SRC is a constant that cannot be moved directly into
+ floating-point registers but assembling it in integer registers is
+ better than forcing it to memory. */
+bool
+aarch64_move_float_via_int_p (rtx src)
+{
+ return (GET_MODE (src) != TFmode
+ && GET_CODE (src) == CONST_DOUBLE
+ && can_create_pseudo_p ()
+ && !aarch64_can_const_movi_rtx_p (src, GET_MODE (src))
+ && !aarch64_float_const_representable_p (src)
+ && aarch64_float_const_rtx_p (src));
+}
+
/* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
is a pattern that can be used to set DEST to a replicated scalar
element. */
Index: gcc/config/aarch64/aarch64.md
===================================================================
--- gcc/config/aarch64/aarch64.md 2019-08-05 17:46:20.713723611 +0100
+++ gcc/config/aarch64/aarch64.md 2019-08-07 19:07:38.203739735 +0100
@@ -1249,14 +1249,24 @@ (define_expand "mov<mode>"
&& ! (GET_CODE (operands[1]) == CONST_DOUBLE
&& aarch64_float_const_zero_rtx_p (operands[1])))
operands[1] = force_reg (<MODE>mode, operands[1]);
+
+ if (aarch64_move_float_via_int_p (operands[1]))
+ {
+ rtx imm = simplify_gen_subreg (<FCVT_TARGET>mode, operands[1],
+ <MODE>mode, 0);
+ rtx tmp = force_reg (<FCVT_TARGET>mode, imm);
+ operands[1] = gen_lowpart (<MODE>mode, tmp);
+ }
}
)
(define_insn "*movhf_aarch64"
[(set (match_operand:HF 0 "nonimmediate_operand" "=w,w , w,?r,w,w ,w ,w,m,r,m ,r")
(match_operand:HF 1 "general_operand" "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
- "TARGET_FLOAT && (register_operand (operands[0], HFmode)
- || aarch64_reg_or_fp_zero (operands[1], HFmode))"
+ "TARGET_FLOAT
+ && (register_operand (operands[0], HFmode)
+ || aarch64_reg_or_fp_zero (operands[1], HFmode))
+ && !aarch64_move_float_via_int_p (operands[1])"
"@
movi\\t%0.4h, #0
fmov\\t%h0, %w1
@@ -1278,8 +1288,10 @@ (define_insn "*movhf_aarch64"
(define_insn "*movsf_aarch64"
[(set (match_operand:SF 0 "nonimmediate_operand" "=w,w ,?r,w,w ,w ,w,m,r,m ,r,r")
(match_operand:SF 1 "general_operand" "Y ,?rY, w,w,Ufc,Uvi,m,w,m,rY,r,M"))]
- "TARGET_FLOAT && (register_operand (operands[0], SFmode)
- || aarch64_reg_or_fp_zero (operands[1], SFmode))"
+ "TARGET_FLOAT
+ && (register_operand (operands[0], SFmode)
+ || aarch64_reg_or_fp_zero (operands[1], SFmode))
+ && !aarch64_move_float_via_int_p (operands[1])"
"@
movi\\t%0.2s, #0
fmov\\t%s0, %w1
@@ -1302,8 +1314,10 @@ (define_insn "*movsf_aarch64"
(define_insn "*movdf_aarch64"
[(set (match_operand:DF 0 "nonimmediate_operand" "=w, w ,?r,w,w ,w ,w,m,r,m ,r,r")
(match_operand:DF 1 "general_operand" "Y , ?rY, w,w,Ufc,Uvi,m,w,m,rY,r,N"))]
- "TARGET_FLOAT && (register_operand (operands[0], DFmode)
- || aarch64_reg_or_fp_zero (operands[1], DFmode))"
+ "TARGET_FLOAT
+ && (register_operand (operands[0], DFmode)
+ || aarch64_reg_or_fp_zero (operands[1], DFmode))
+ && !aarch64_move_float_via_int_p (operands[1])"
"@
movi\\t%d0, #0
fmov\\t%d0, %x1
@@ -1323,26 +1337,6 @@ (define_insn "*movdf_aarch64"
(set_attr "arch" "simd,*,*,*,*,simd,*,*,*,*,*,*")]
)
-(define_split
- [(set (match_operand:GPF_HF 0 "nonimmediate_operand")
- (match_operand:GPF_HF 1 "general_operand"))]
- "can_create_pseudo_p ()
- && !aarch64_can_const_movi_rtx_p (operands[1], <MODE>mode)
- && !aarch64_float_const_representable_p (operands[1])
- && aarch64_float_const_rtx_p (operands[1])"
- [(const_int 0)]
- {
- unsigned HOST_WIDE_INT ival;
- if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
- FAIL;
-
- rtx tmp = gen_reg_rtx (<FCVT_TARGET>mode);
- emit_move_insn (tmp, gen_int_mode (ival, <FCVT_TARGET>mode));
- emit_move_insn (operands[0], gen_lowpart (<MODE>mode, tmp));
- DONE;
- }
-)
-
(define_insn "*movtf_aarch64"
[(set (match_operand:TF 0
"nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,m,?r,m ,m")
Index: gcc/config/aarch64/iterators.md
===================================================================
--- gcc/config/aarch64/iterators.md 2019-07-29 09:39:46.658190164 +0100
+++ gcc/config/aarch64/iterators.md 2019-08-07 19:07:38.203739735 +0100
@@ -960,12 +960,14 @@ (define_mode_attr fcvt_target [(V2DF "v2
(V2DI "v2df") (V4SI "v4sf") (V2SI "v2sf")
(SF "si") (DF "di") (SI "sf") (DI "df")
(V4HF "v4hi") (V8HF "v8hi") (V4HI "v4hf")
- (V8HI "v8hf") (HF "hi") (HI "hf")])
+ (V8HI "v8hf") (HF "hi") (HI "hf")
+ (TI "tf") (TF "ti")])
(define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI")
(V2DI "V2DF") (V4SI "V4SF") (V2SI "V2SF")
(SF "SI") (DF "DI") (SI "SF") (DI "DF")
(V4HF "V4HI") (V8HF "V8HI") (V4HI "V4HF")
- (V8HI "V8HF") (HF "HI") (HI "HF")])
+ (V8HI "V8HF") (HF "HI") (HI "HF")
+ (TI "TF") (TF "TI")])
;; for the inequal width integer to fp conversions
Index: gcc/testsuite/gcc.target/aarch64/dbl_mov_immediate_2.c
===================================================================
--- /dev/null 2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/dbl_mov_immediate_2.c 2019-08-07 19:07:38.203739735 +0100
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O -mno-pc-relative-literal-loads" } */
+/* { dg-skip-if "Tiny model won't generate adrp" { *-*-* } { "-mcmodel=tiny" } { "" } } */
+
+#include "dbl_mov_immediate_1.c"
+
+/* { dg-final { scan-assembler-times "movi\td\[0-9\]+, #?0" 1 } } */
+
+/* { dg-final { scan-assembler-times "adrp\tx\[0-9\]+, \.LC\[0-9\]" 2 } } */
+/* { dg-final { scan-assembler-times "ldr\td\[0-9\]+, \\\[x\[0-9\], #:lo12:\.LC\[0-9\]\\\]" 2 } } */
+
+/* { dg-final { scan-assembler-times "fmov\td\[0-9\]+, 1\\\.5e\\\+0" 1 } } */
+
+/* { dg-final { scan-assembler-times "mov\tx\[0-9\]+, 25838523252736" 1 } } */
+/* { dg-final { scan-assembler-times "movk\tx\[0-9\]+, 0x40fe, lsl 48" 1 } } */
+/* { dg-final { scan-assembler-times "mov\tx\[0-9\]+, -9223372036854775808" 1 } } */
+/* { dg-final { scan-assembler-times "fmov\td\[0-9\]+, x\[0-9\]+" 2 } } */
+
Index: gcc/testsuite/gcc.target/aarch64/f16_mov_immediate_5.c
===================================================================
--- /dev/null 2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/f16_mov_immediate_5.c 2019-08-07 19:07:38.203739735 +0100
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+
+#include "f16_mov_immediate_2.c"
+
+/* { dg-final { scan-assembler-times "movi\tv\[0-9\]+\\\.4h, ?#0" 1 } } */
+/* { dg-final { scan-assembler-times "movi\tv\[0-9\]+\\\.4h, 0x80, lsl 8" 1 } } */
+/* { dg-final { scan-assembler-times "movi\tv\[0-9\]+\\\.4h, 0x5c, lsl 8" 1 } } */
+/* { dg-final { scan-assembler-times "movi\tv\[0-9\]+\\\.4h, 0x7c, lsl 8" 1 } } */
+
+/* { dg-final { scan-assembler-times {fmov\th[0-9]+, #?1.7e\+1} 1 } } */
Index: gcc/testsuite/gcc.target/aarch64/flt_mov_immediate_2.c
===================================================================
--- /dev/null 2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/flt_mov_immediate_2.c 2019-08-07 19:07:38.203739735 +0100
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+#include "flt_mov_immediate_1.c"
+
+/* { dg-final { scan-assembler-times "movi\tv\[0-9\]+\\\.2s, ?#0" 1 } } */
+/* { dg-final { scan-assembler-times "movi\tv\[0-9\]+\\\.2s, 0x80, lsl 24" 1 } } */
+/* { dg-final { scan-assembler-times "movi\tv\[0-9\]+\\\.2s, 0x80, lsl 24" 1 } } */
+
+/* { dg-final { scan-assembler-times "mov\tw\[0-9\]+, 48128" 1 } } */
+/* { dg-final { scan-assembler-times "movk\tw\[0-9\]+, 0x47f0, lsl 16" 1 } } */
+
+/* { dg-final { scan-assembler-times "fmov\ts\[0-9\]+, 2\\\.0e\\\+0" 1 } } */
+
+/* { dg-final { scan-assembler-times "mov\tw\[0-9\]+, 16435" 1 } } */
+/* { dg-final { scan-assembler-times "movk\tw\[0-9\]+, 0xc69c, lsl 16" 1 } } */