This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH][ARM] Implement TARGET_SCHED_MACRO_FUSION_PAIR_P
- From: Kyrill Tkachov <kyrylo dot tkachov at arm dot com>
- To: Ramana Radhakrishnan <Ramana dot Radhakrishnan at arm dot com>
- Cc: GCC Patches <gcc-patches at gcc dot gnu dot org>, Richard Earnshaw <Richard dot Earnshaw at arm dot com>
- Date: Thu, 04 Dec 2014 09:19:28 +0000
- Subject: Re: [PATCH][ARM] Implement TARGET_SCHED_MACRO_FUSION_PAIR_P
- Authentication-results: sourceware.org; auth=none
- References: <5461F922 dot 1020106 at arm dot com> <CAJA7tRbLwWOixYc6CakgCpsmUy6qqyMLbG==4WNgp_kZi1FD7w at mail dot gmail dot com>
On 02/12/14 22:58, Ramana Radhakrishnan wrote:
On Tue, Nov 11, 2014 at 11:55 AM, Kyrill Tkachov <kyrylo.tkachov@arm.com> wrote:
Hi all,
This is the arm implementation of the macro fusion hook.
It tries to fuse movw+movt operations together. It also tries to take lo_sum
RTXs into account since those generate movt instructions as well.
Bootstrapped and tested on arm-none-linux-gnueabihf.
Ok for trunk?
if (current_tune->fuseable_ops & ARM_FUSE_MOVW_MOVT)
+ {
+ /* We are trying to fuse
+ movw imm / movt imm
+ instructions as a group that gets scheduled together. */
+
A comment here about the insn structure would be useful.
Done. It's similar to the aarch64 adrp+add case. It does make it easier
to read, thanks.
2014-12-04 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* config/arm/arm-protos.h (tune_params): Add fuseable_ops field.
* config/arm/arm.c (arm_macro_fusion_p): New function.
(aarch_macro_fusion_pair_p): Likewise.
(TARGET_SCHED_MACRO_FUSION_P): Define.
(TARGET_SCHED_MACRO_FUSION_PAIR_P): Likewise.
(ARM_FUSE_NOTHING): Likewise.
(ARM_FUSE_MOVW_MOVT): Likewise.
(arm_slowmul_tune, arm_fastmul_tune, arm_strongarm_tune,
arm_xscale_tune, arm_9e_tune, arm_v6t2_tune, arm_cortex_tune,
arm_cortex_a8_tune, arm_cortex_a7_tune, arm_cortex_a15_tune,
arm_cortex_a53_tune, arm_cortex_a57_tune, arm_cortex_a9_tune,
arm_cortex_a12_tune, arm_v7m_tune, arm_cortex_m7_tune, arm_v6m_tune,
arm_fa726te_tune, arm_cortex_a5_tune): Specify fuseable_ops value.
+ set_dest = SET_DEST (curr_set);
+ if (GET_CODE (set_dest) == ZERO_EXTRACT)
+ {
+ if (CONST_INT_P (SET_SRC (curr_set))
+ && CONST_INT_P (SET_SRC (prev_set))
+ && REG_P (XEXP (set_dest, 0))
+ && REG_P (SET_DEST (prev_set))
+ && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
+ return true;
+ }
+ else if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
+ && REG_P (SET_DEST (curr_set))
+ && REG_P (SET_DEST (prev_set))
+ && GET_CODE (SET_SRC (prev_set)) == HIGH
+ && REGNO (SET_DEST (curr_set)) == REGNO (SET_DEST (prev_set)))
+ {
+ return true;
+ }
Can we add a fast-path exit such as:
if (GET_MODE (set_dest) != SImode)
return false;
Done, but if/when we extend the function to handle more fusion cases it
will need to be refactored, since we will want to just bail out of this
MOVW+MOVT case rather than the whole function.
I did think about whether we wanted to use reg_overlap_mentioned_p, as
that may simplify the logic a bit, but that's overkill here as we still
want to restrict it to the cases above.
Otherwise OK.
Here's the updated patch. I've tested on arm-none-eabi and made sure
that the fusion still happens on the benchmarks I looked at.
Ok?
Thanks,
Kyrill
Ramana
+ }
+ return false;
Thanks,
Kyrill
2014-11-11 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* config/arm/arm-protos.h (tune_params): Add fuseable_ops field.
* config/arm/arm.c (arm_macro_fusion_p): New function.
(arm_macro_fusion_pair_p): Likewise.
(TARGET_SCHED_MACRO_FUSION_P): Define.
(TARGET_SCHED_MACRO_FUSION_PAIR_P): Likewise.
(ARM_FUSE_NOTHING): Likewise.
(ARM_FUSE_MOVW_MOVT): Likewise.
(arm_slowmul_tune, arm_fastmul_tune, arm_strongarm_tune,
arm_xscale_tune, arm_9e_tune, arm_v6t2_tune, arm_cortex_tune,
arm_cortex_a8_tune, arm_cortex_a7_tune, arm_cortex_a15_tune,
arm_cortex_a53_tune, arm_cortex_a57_tune, arm_cortex_a9_tune,
arm_cortex_a12_tune, arm_v7m_tune, arm_v6m_tune, arm_fa726te_tune,
arm_cortex_a5_tune): Specify fuseable_ops value.
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 20cfa9f..19925e9 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -289,6 +289,8 @@ struct tune_params
bool string_ops_prefer_neon;
/* Maximum number of instructions to inline calls to memset. */
int max_insns_inline_memset;
+ /* Bitfield encoding the fuseable pairs of instructions. */
+ unsigned int fuseable_ops;
};
extern const struct tune_params *current_tune;
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 64494e8..6f847d6 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -251,6 +251,7 @@ static void arm_expand_builtin_va_start (tree, rtx);
static tree arm_gimplify_va_arg_expr (tree, tree, gimple_seq *, gimple_seq *);
static void arm_option_override (void);
static unsigned HOST_WIDE_INT arm_shift_truncation_mask (machine_mode);
+static bool arm_macro_fusion_p (void);
static bool arm_cannot_copy_insn_p (rtx_insn *);
static int arm_issue_rate (void);
static void arm_output_dwarf_dtprel (FILE *, int, rtx) ATTRIBUTE_UNUSED;
@@ -291,6 +292,8 @@ static int arm_cortex_m_branch_cost (bool, bool);
static bool arm_vectorize_vec_perm_const_ok (machine_mode vmode,
const unsigned char *sel);
+static bool aarch_macro_fusion_pair_p (rtx_insn*, rtx_insn*);
+
static int arm_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
tree vectype,
int misalign ATTRIBUTE_UNUSED);
@@ -398,6 +401,12 @@ static const struct attribute_spec arm_attribute_table[] =
#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES arm_comp_type_attributes
+#undef TARGET_SCHED_MACRO_FUSION_P
+#define TARGET_SCHED_MACRO_FUSION_P arm_macro_fusion_p
+
+#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
+#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
+
#undef TARGET_SET_DEFAULT_TYPE_ATTRIBUTES
#define TARGET_SET_DEFAULT_TYPE_ATTRIBUTES arm_set_default_type_attributes
@@ -1641,6 +1650,9 @@ const struct cpu_cost_table v7m_extra_costs =
}
};
+#define ARM_FUSE_NOTHING (0)
+#define ARM_FUSE_MOVW_MOVT (1 << 0)
+
const struct tune_params arm_slowmul_tune =
{
arm_slowmul_rtx_costs,
@@ -1657,7 +1669,8 @@ const struct tune_params arm_slowmul_tune =
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
- 8 /* Maximum insns to inline memset. */
+ 8, /* Maximum insns to inline memset. */
+ ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
};
const struct tune_params arm_fastmul_tune =
@@ -1676,7 +1689,8 @@ const struct tune_params arm_fastmul_tune =
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
- 8 /* Maximum insns to inline memset. */
+ 8, /* Maximum insns to inline memset. */
+ ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
};
/* StrongARM has early execution of branches, so a sequence that is worth
@@ -1698,7 +1712,8 @@ const struct tune_params arm_strongarm_tune =
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
- 8 /* Maximum insns to inline memset. */
+ 8, /* Maximum insns to inline memset. */
+ ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
};
const struct tune_params arm_xscale_tune =
@@ -1717,7 +1732,8 @@ const struct tune_params arm_xscale_tune =
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
- 8 /* Maximum insns to inline memset. */
+ 8, /* Maximum insns to inline memset. */
+ ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
};
const struct tune_params arm_9e_tune =
@@ -1736,7 +1752,8 @@ const struct tune_params arm_9e_tune =
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
- 8 /* Maximum insns to inline memset. */
+ 8, /* Maximum insns to inline memset. */
+ ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
};
const struct tune_params arm_v6t2_tune =
@@ -1755,7 +1772,8 @@ const struct tune_params arm_v6t2_tune =
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
- 8 /* Maximum insns to inline memset. */
+ 8, /* Maximum insns to inline memset. */
+ ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
};
/* Generic Cortex tuning. Use more specific tunings if appropriate. */
@@ -1775,7 +1793,8 @@ const struct tune_params arm_cortex_tune =
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
- 8 /* Maximum insns to inline memset. */
+ 8, /* Maximum insns to inline memset. */
+ ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
};
const struct tune_params arm_cortex_a8_tune =
@@ -1794,7 +1813,8 @@ const struct tune_params arm_cortex_a8_tune =
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
- 8 /* Maximum insns to inline memset. */
+ 8, /* Maximum insns to inline memset. */
+ ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
};
const struct tune_params arm_cortex_a7_tune =
@@ -1813,7 +1833,8 @@ const struct tune_params arm_cortex_a7_tune =
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
- 8 /* Maximum insns to inline memset. */
+ 8, /* Maximum insns to inline memset. */
+ ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
};
const struct tune_params arm_cortex_a15_tune =
@@ -1832,7 +1853,8 @@ const struct tune_params arm_cortex_a15_tune =
false, /* Prefer Neon for 64-bits bitops. */
true, true, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
- 8 /* Maximum insns to inline memset. */
+ 8, /* Maximum insns to inline memset. */
+ ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
};
const struct tune_params arm_cortex_a53_tune =
@@ -1851,7 +1873,8 @@ const struct tune_params arm_cortex_a53_tune =
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
- 8 /* Maximum insns to inline memset. */
+ 8, /* Maximum insns to inline memset. */
+ ARM_FUSE_MOVW_MOVT /* Fuseable pairs of instructions. */
};
const struct tune_params arm_cortex_a57_tune =
@@ -1870,7 +1893,8 @@ const struct tune_params arm_cortex_a57_tune =
false, /* Prefer Neon for 64-bits bitops. */
true, true, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
- 8 /* Maximum insns to inline memset. */
+ 8, /* Maximum insns to inline memset. */
+ ARM_FUSE_MOVW_MOVT /* Fuseable pairs of instructions. */
};
/* Branches can be dual-issued on Cortex-A5, so conditional execution is
@@ -1892,7 +1916,8 @@ const struct tune_params arm_cortex_a5_tune =
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
- 8 /* Maximum insns to inline memset. */
+ 8, /* Maximum insns to inline memset. */
+ ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
};
const struct tune_params arm_cortex_a9_tune =
@@ -1911,7 +1936,8 @@ const struct tune_params arm_cortex_a9_tune =
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
- 8 /* Maximum insns to inline memset. */
+ 8, /* Maximum insns to inline memset. */
+ ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
};
const struct tune_params arm_cortex_a12_tune =
@@ -1930,7 +1956,8 @@ const struct tune_params arm_cortex_a12_tune =
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
- 8 /* Maximum insns to inline memset. */
+ 8, /* Maximum insns to inline memset. */
+ ARM_FUSE_MOVW_MOVT /* Fuseable pairs of instructions. */
};
/* armv7m tuning. On Cortex-M4 cores for example, MOVW/MOVT take a single
@@ -1956,7 +1983,8 @@ const struct tune_params arm_v7m_tune =
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
- 8 /* Maximum insns to inline memset. */
+ 8, /* Maximum insns to inline memset. */
+ ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
};
/* Cortex-M7 tuning. */
@@ -1977,7 +2005,8 @@ const struct tune_params arm_cortex_m7_tune =
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
- 8 /* Maximum insns to inline memset. */
+ 8, /* Maximum insns to inline memset. */
+ ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
};
/* The arm_v6m_tune is duplicated from arm_cortex_tune, rather than
@@ -1998,7 +2027,8 @@ const struct tune_params arm_v6m_tune =
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
- 8 /* Maximum insns to inline memset. */
+ 8, /* Maximum insns to inline memset. */
+ ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
};
const struct tune_params arm_fa726te_tune =
@@ -2017,7 +2047,8 @@ const struct tune_params arm_fa726te_tune =
false, /* Prefer Neon for 64-bits bitops. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
- 8 /* Maximum insns to inline memset. */
+ 8, /* Maximum insns to inline memset. */
+ ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
};
@@ -29142,6 +29173,73 @@ arm_gen_setmem (rtx *operands)
return arm_block_set_aligned_non_vect (dstbase, length, value, align);
}
+
+static bool
+arm_macro_fusion_p (void)
+{
+ return current_tune->fuseable_ops != ARM_FUSE_NOTHING;
+}
+
+
+static bool
+aarch_macro_fusion_pair_p (rtx_insn* prev, rtx_insn* curr)
+{
+ rtx set_dest;
+ rtx prev_set = single_set (prev);
+ rtx curr_set = single_set (curr);
+
+ if (!prev_set
+ || !curr_set)
+ return false;
+
+ if (any_condjump_p (curr))
+ return false;
+
+ if (!arm_macro_fusion_p ())
+ return false;
+
+ if (current_tune->fuseable_ops & ARM_FUSE_MOVW_MOVT)
+ {
+ /* We are trying to fuse
+ movw imm / movt imm
+ instructions as a group that gets scheduled together. */
+
+ set_dest = SET_DEST (curr_set);
+
+ if (GET_MODE (set_dest) != SImode)
+ return false;
+
+ /* We are trying to match:
+ prev (movw) == (set (reg r0) (const_int imm16))
+ curr (movt) == (set (zero_extract (reg r0)
+ (const_int 16)
+ (const_int 16))
+ (const_int imm16_1))
+ or
+ prev (movw) == (set (reg r1)
+ (high (symbol_ref ("SYM"))))
+ curr (movt) == (set (reg r0)
+ (lo_sum (reg r1)
+ (symbol_ref ("SYM")))) */
+ if (GET_CODE (set_dest) == ZERO_EXTRACT)
+ {
+ if (CONST_INT_P (SET_SRC (curr_set))
+ && CONST_INT_P (SET_SRC (prev_set))
+ && REG_P (XEXP (set_dest, 0))
+ && REG_P (SET_DEST (prev_set))
+ && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
+ return true;
+ }
+ else if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
+ && REG_P (SET_DEST (curr_set))
+ && REG_P (SET_DEST (prev_set))
+ && GET_CODE (SET_SRC (prev_set)) == HIGH
+ && REGNO (SET_DEST (curr_set)) == REGNO (SET_DEST (prev_set)))
+ return true;
+ }
+ return false;
+}
+
/* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
static unsigned HOST_WIDE_INT