This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: Fwd: [PATCH] Scheduling result adjustment to enable macro-fusion
- From: Wei Mi <wmi at google dot com>
- To: Jan Hubicka <hubicka at ucw dot cz>
- Cc: "H.J. Lu" <hjl dot tools at gmail dot com>, Alexander Monakov <amonakov at ispras dot ru>, Steven Bosscher <stevenb dot gcc at gmail dot com>, GCC Patches <gcc-patches at gcc dot gnu dot org>, David Li <davidxl at google dot com>, Kirill Yukhin <kirill dot yukhin at gmail dot com>
- Date: Tue, 24 Sep 2013 12:06:06 -0700
- Subject: Re: Fwd: [PATCH] Scheduling result adjustment to enable macro-fusion
- Authentication-results: sourceware.org; auth=none
- References: <CAMe9rOo-dc7=ax8_pA21wuxnqphLBvf_Voi2n1OHJX7ZEab=ew at mail dot gmail dot com> <CA+4CFy4fqCRvM2Luw2_p6AEZOmucSV1KemntEO3_XU5TfzA-7A at mail dot gmail dot com> <CA+4CFy6gdxREYiJa2B70RBe2aUtLY3zQ9ShK9jGEy26Hdn9QOg at mail dot gmail dot com> <CAMe9rOp1R8XACsL=v-JZkvpPzTOFiZhZPMqQXWkmPgHW5cjC6w at mail dot gmail dot com> <CA+4CFy5nM2Dw7kv0G61N5PKHoAanmAaKm+45oS4pN22TKgSAFg at mail dot gmail dot com> <20130922095726 dot GA23006 at atrey dot karlin dot mff dot cuni dot cz> <20130922101916 dot GA31130 at atrey dot karlin dot mff dot cuni dot cz> <CA+4CFy5n=rTH+fmndNXLJkJgzLd4uCmucvPf+QfGWNsvhPQ1CQ at mail dot gmail dot com>
This is the updated patch2.
Changed:
1. For cmp/test with rip-relative addressing mem operand, don't group
insns. Bulldozer also doesn't support fusion for cmp/test with both
displacement MEM and immediate operand, while m_CORE_ALL doesn't
support fusion for cmp/test with MEM and immediate operand. I simply
chose to use the more stringent constraint here (m_CORE_ALL's
constraint).
2. Add Bulldozer back and merge TARGET_FUSE_CMP_AND_BRANCH_64 and
TARGET_FUSE_CMP_AND_BRANCH_32.
Bootstrap and regression tests pass. OK for trunk?
2013-09-24 Wei Mi <wmi@google.com>
* gcc/config/i386/i386.c (rip_relative_addr_p): New Function.
(ix86_macro_fusion_p): Ditto.
(ix86_macro_fusion_pair_p): Ditto.
* gcc/config/i386/i386.h: Add new tune features about macro-fusion.
* gcc/config/i386/x86-tune.def (DEF_TUNE): Ditto.
* gcc/doc/tm.texi: Generated.
* gcc/doc/tm.texi.in: Ditto.
* gcc/haifa-sched.c (try_group_insn): New Function.
(group_insns_for_macro_fusion): Ditto.
(sched_init): Call group_insns_for_macro_fusion.
* gcc/sched-rgn.c (add_branch_dependences): Keep insns in
a SCHED_GROUP at the end of BB so that they retain their location.
* gcc/target.def: Add two hooks: macro_fusion_p and
macro_fusion_pair_p.
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 1fd3f60..4a04778 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -24856,6 +24856,167 @@ ia32_multipass_dfa_lookahead (void)
}
}
+/* Return true if ADDR is a rip-relative address. The checks are
+ extracted from ix86_print_operand_address.
+
+ ADDR is decomposed with ix86_decompose_address (looking inside an
+ UNSPEC_VSIBADDR or UNSPEC_LEA_ADDR wrapper first). The address counts
+ as rip-relative only when it has neither base nor index and its
+ displacement is one of:
+ - for TARGET_64BIT, a LABEL_REF or a SYMBOL_REF without a TLS model,
+ optionally wrapped in (const (plus ... const_int));
+ - with flag_pic, a CONST around an UNSPEC_PCREL, UNSPEC_GOTPCREL or,
+ for TARGET_64BIT, UNSPEC_GOTNTPOFF. */
+
+static bool
+rip_relative_addr_p (rtx addr)
+{
+ struct ix86_address parts;
+ rtx base, index, disp;
+ int ok;
+
+ /* A VSIB address is a vector: decompose element 0 and take the index
+ from element 1. */
+ if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
+ {
+ ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
+ parts.index = XVECEXP (addr, 0, 1);
+ }
+ else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
+ ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
+ else
+ ok = ix86_decompose_address (addr, &parts);
+
+ /* Decomposition is asserted to succeed for every ADDR passed in. */
+ gcc_assert (ok);
+ base = parts.base;
+ index = parts.index;
+ disp = parts.disp;
+
+ /* NOTE(review): DISP is inspected below without a null check; presumably
+ an address with no base and no index always has a displacement --
+ confirm. */
+ if (TARGET_64BIT && !base && !index)
+ {
+ rtx symbol = disp;
+
+ /* Strip a constant offset: (const (plus SYMBOL const_int)). */
+ if (GET_CODE (disp) == CONST
+ && GET_CODE (XEXP (disp, 0)) == PLUS
+ && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
+ symbol = XEXP (XEXP (disp, 0), 0);
+
+ /* A label, or a symbol that has no TLS model, is rip-relative. */
+ if (GET_CODE (symbol) == LABEL_REF
+ || (GET_CODE (symbol) == SYMBOL_REF
+ && SYMBOL_REF_TLS_MODEL (symbol) == 0))
+ return true;
+ }
+ /* Under PIC, a displacement that is one of the UNSPECs tested below
+ is treated as rip-relative too. */
+ if (flag_pic && !base && !index)
+ {
+ if (GET_CODE (disp) == CONST
+ && GET_CODE (XEXP (disp, 0)) == UNSPEC
+ && (XINT (XEXP (disp, 0), 1) == UNSPEC_PCREL
+ || XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
+ || (TARGET_64BIT
+ && XINT (XEXP (disp, 0), 1) == UNSPEC_GOTNTPOFF)))
+ return true;
+ }
+ return false;
+}
+
+/* Implement TARGET_SCHED_MACRO_FUSION_P.  Report whether the tuning in
+   effect enables macro-fusion, i.e. whether TARGET_FUSE_CMP_AND_BRANCH
+   is set.  */
+
+static bool
+ix86_macro_fusion_p ()
+{
+  return TARGET_FUSE_CMP_AND_BRANCH ? true : false;
+}
+
+/* Check whether the current microarchitecture supports macro-fusion
+ for the insn pair "CONDGEN + CONDJMP". Refer to
+ "Intel Architectures Optimization Reference Manual".
+
+ CONDGEN is the insn producing the condition flags and CONDJMP is the
+ conditional jump consuming them. Return true when the pair may be
+ fused under the tuning flags tested below, false otherwise.
+
+ NOTE(review): the local variable single_set shares its name with the
+ single_set (...) used in its initializer; this presumably relies on
+ single_set being a function-like macro -- confirm. */
+
+static bool
+ix86_macro_fusion_pair_p (rtx condgen, rtx condjmp)
+{
+ rtx src, dest;
+ rtx single_set = single_set (condgen);
+ enum rtx_code ccode;
+ rtx compare_set = NULL_RTX, test_if, cond;
+ rtx alu_set = NULL_RTX, addr = NULL_RTX;
+
+ /* Only test, compare, inc/dec and alu insns are fusion candidates. */
+ if (get_attr_type (condgen) != TYPE_TEST
+ && get_attr_type (condgen) != TYPE_ICMP
+ && get_attr_type (condgen) != TYPE_INCDEC
+ && get_attr_type (condgen) != TYPE_ALU)
+ return false;
+
+ /* An insn that is not a single set is only of interest when alu +
+ branch fusion is enabled. */
+ if (single_set == NULL_RTX
+ && !TARGET_FUSE_ALU_AND_BRANCH)
+ return false;
+
+ if (single_set != NULL_RTX)
+ compare_set = single_set;
+ else
+ {
+ /* Walk the sets of the pattern, remembering the COMPARE set and the
+ other (alu) set.
+ NOTE(review): this assumes PAT is a vector (PARALLEL) whenever
+ single_set fails -- confirm. */
+ int i;
+ rtx pat = PATTERN (condgen);
+ for (i = 0; i < XVECLEN (pat, 0); i++)
+ if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
+ {
+ rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
+ if (GET_CODE (set_src) == COMPARE)
+ compare_set = XVECEXP (pat, 0, i);
+ else
+ alu_set = XVECEXP (pat, 0, i);
+ }
+ }
+ /* Without a set whose source is a COMPARE there is nothing to fuse. */
+ if (compare_set == NULL_RTX)
+ return false;
+ src = SET_SRC (compare_set);
+ if (GET_CODE (src) != COMPARE)
+ return false;
+
+ /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
+ supported. */
+ if ((MEM_P (XEXP (src, 0))
+ && CONST_INT_P (XEXP (src, 1)))
+ || (MEM_P (XEXP (src, 1))
+ && CONST_INT_P (XEXP (src, 0))))
+ return false;
+
+ /* No fusion for a RIP-relative address. */
+ if (MEM_P (XEXP (src, 0)))
+ addr = XEXP (XEXP (src, 0), 0);
+ else if (MEM_P (XEXP (src, 1)))
+ addr = XEXP (XEXP (src, 1), 0);
+ if (addr && rip_relative_addr_p (addr))
+ return false;
+
+ /* The result of pc_set is used unchecked, so CONDJMP is expected to be
+ a jump that sets the pc; the caller in haifa-sched.c tests
+ any_condjump_p before calling this hook. */
+ test_if = SET_SRC (pc_set (condjmp));
+ cond = XEXP (test_if, 0);
+ ccode = GET_CODE (cond);
+ /* Check whether the conditional jump uses the Sign or Overflow flags. */
+ if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
+ && (ccode == GE
+ || ccode == GT
+ || ccode == LE
+ || ccode == LT))
+ return false;
+
+ /* TYPE_TEST and TYPE_ICMP need no further checks. */
+ if (get_attr_type (condgen) == TYPE_TEST
+ || get_attr_type (condgen) == TYPE_ICMP)
+ return true;
+
+ /* What follows handles macro-fusion of alu + jmp. */
+ if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
+ return false;
+
+ /* No fusion for an alu op with a memory destination operand. */
+ dest = SET_DEST (alu_set);
+ if (MEM_P (dest))
+ return false;
+
+ /* Macro-fusion for inc/dec + unsigned conditional jump is not
+ supported. */
+ if (get_attr_type (condgen) == TYPE_INCDEC
+ && (ccode == GEU
+ || ccode == GTU
+ || ccode == LEU
+ || ccode == LTU))
+ return false;
+
+ return true;
+}
+
/* Try to reorder ready list to take advantage of Atom pipelined IMUL
execution. It is applied if
(1) IMUL instruction is on the top of list;
@@ -42993,6 +43154,10 @@ ix86_memmodel_check (unsigned HOST_WIDE_INT val)
#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
ia32_multipass_dfa_lookahead
+#undef TARGET_SCHED_MACRO_FUSION_P
+#define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
+#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
+#define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 788cb8a..68fabd9 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -362,8 +362,17 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_USE_VECTOR_FP_CONVERTS]
#define TARGET_USE_VECTOR_CONVERTS \
ix86_tune_features[X86_TUNE_USE_VECTOR_CONVERTS]
+#define TARGET_FUSE_CMP_AND_BRANCH_32 \
+ ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32]
+#define TARGET_FUSE_CMP_AND_BRANCH_64 \
+ ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_64]
#define TARGET_FUSE_CMP_AND_BRANCH \
- ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH]
+ (TARGET_64BIT ? TARGET_FUSE_CMP_AND_BRANCH_64 \
+ : TARGET_FUSE_CMP_AND_BRANCH_32)
+#define TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS \
+ ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS]
+#define TARGET_FUSE_ALU_AND_BRANCH \
+ ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH]
#define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
#define TARGET_VECTORIZE_DOUBLE \
ix86_tune_features[X86_TUNE_VECTORIZE_DOUBLE]
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 4ae5f70..3d395b0 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -193,10 +193,24 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS,
"use_vector_fp_converts",
/* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
from integer to FP. */
DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
-/* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
- with a subsequent conditional jump instruction into a single
- compare-and-branch uop. */
-DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch", m_BDVER)
+/* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent
+ conditional jump instruction for 32 bit TARGET. */
+DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32",
+ m_CORE_ALL | m_BDVER)
+/* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent
+ conditional jump instruction for TARGET_64BIT. */
+DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64",
+ m_COREI7 | m_COREI7_AVX | m_HASWELL | m_BDVER)
+/* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a
+ subsequent conditional jump instruction when the conditional jump
+ checks the sign flag (SF) or the overflow flag (OF). */
+DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
+ m_COREI7 | m_COREI7_AVX | m_HASWELL | m_BDVER)
+/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
+ jump instruction when the alu instruction produces the CCFLAG consumed by
+ the conditional jump instruction. */
+DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
+ m_COREI7_AVX | m_HASWELL)
/* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
will impact LEA instruction selection. */
DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM)
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index d15f53c..66b45b9 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6553,6 +6553,17 @@ scheduling one insn causes other insns to become ready in the same
cycle. These other insns can then be taken into account properly.
@end deftypefn
+@deftypefn {Target Hook} bool TARGET_SCHED_MACRO_FUSION_P (void)
+This hook is used to check whether target platform supports macro fusion.
+@end deftypefn
+
+@deftypefn {Target Hook} bool TARGET_SCHED_MACRO_FUSION_PAIR_P (rtx @var{condgen}, rtx @var{condjmp})
+This hook is used to check whether two insns could be macro fused for
+target microarchitecture. If this hook returns true for the given insn pair
+(@var{condgen} and @var{condjmp}), scheduler will put them into a sched
+group, and they will not be scheduled apart.
+@end deftypefn
+
@deftypefn {Target Hook} void TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK (rtx @var{head}, rtx @var{tail})
This hook is called after evaluation forward dependencies of insns in
chain given by two parameter values (@var{head} and @var{tail}
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index b51d7b3..361ee87 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4940,6 +4940,10 @@ them: try the first ones in this list first.
@hook TARGET_SCHED_REORDER2
+@hook TARGET_SCHED_MACRO_FUSION_P
+
+@hook TARGET_SCHED_MACRO_FUSION_PAIR_P
+
@hook TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
@hook TARGET_SCHED_INIT
diff --git a/gcc/haifa-sched.c b/gcc/haifa-sched.c
index 61eaaef..d6726a9 100644
--- a/gcc/haifa-sched.c
+++ b/gcc/haifa-sched.c
@@ -6519,6 +6519,44 @@ setup_sched_dump (void)
? stderr : dump_file);
}
+/* Mark INSN with SCHED_GROUP_P so that it stays together with the insn
+   preceding it, provided that INSN is a conditional jump referencing
+   the target's first fixed condition code register, the preceding
+   non-note, non-debug insn modifies that register, and the target's
+   macro_fusion_pair_p hook accepts the pair.  Otherwise do nothing.
+   The caller passes the last insn of a basic block.  */
+
+static void
+try_group_insn (rtx insn)
+{
+  unsigned int condreg1, condreg2;
+  rtx cc_reg_1;
+  rtx prev;
+
+  /* Do the cheap test first, so that no REG rtx is created for blocks
+     that do not end in a conditional jump.  */
+  if (!any_condjump_p (insn))
+    return;
+
+  /* Only trust CONDREG1 when the hook reports that the target has
+     fixed condition code registers.  */
+  if (!targetm.fixed_condition_code_regs (&condreg1, &condreg2))
+    return;
+
+  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
+  prev = prev_nonnote_nondebug_insn (insn);
+  if (!reg_referenced_p (cc_reg_1, PATTERN (insn))
+      || !prev
+      || !modified_in_p (cc_reg_1, prev))
+    return;
+
+  /* Different microarchitectures support macro fusions for different
+     combinations of insn pairs.  */
+  if (!targetm.sched.macro_fusion_pair_p
+      || !targetm.sched.macro_fusion_pair_p (prev, insn))
+    return;
+
+  SCHED_GROUP_P (insn) = 1;
+}
+
+/* Offer the final insn of every basic block to try_group_insn.  When the
+   closing conditional jump and the insn defining the condition register
+   are adjacent before scheduling, we want them kept in one schedule
+   group; this benefits microarchitectures supporting macro-fusion.  */
+
+static void
+group_insns_for_macro_fusion ()
+{
+  basic_block block;
+
+  FOR_EACH_BB (block)
+    {
+      rtx last_insn = BB_END (block);
+      try_group_insn (last_insn);
+    }
+}
+
/* Initialize some global state for the scheduler. This function works
with the common data shared between all the schedulers. It is called
from the scheduler specific initialization routine. */
@@ -6645,6 +6683,11 @@ sched_init (void)
}
curr_state = xmalloc (dfa_state_size);
+
+ /* Group compare and branch insns for macro-fusion. */
+ if (targetm.sched.macro_fusion_p
+ && targetm.sched.macro_fusion_p ())
+ group_insns_for_macro_fusion ();
}
static void haifa_init_only_bb (basic_block, basic_block);
diff --git a/gcc/sched-rgn.c b/gcc/sched-rgn.c
index e1a2dce..156359e 100644
--- a/gcc/sched-rgn.c
+++ b/gcc/sched-rgn.c
@@ -2443,6 +2443,8 @@ add_branch_dependences (rtx head, rtx tail)
cc0 setters remain at the end because they can't be moved away from
their cc0 user.
+ Predecessors of SCHED_GROUP_P instructions at the end remain at the end.
+
COND_EXEC insns cannot be moved past a branch (see e.g. PR17808).
Insns setting TARGET_CLASS_LIKELY_SPILLED_P registers (usually return
@@ -2465,7 +2467,8 @@ add_branch_dependences (rtx head, rtx tail)
#endif
|| (!reload_completed
&& sets_likely_spilled (PATTERN (insn)))))
- || NOTE_P (insn))
+ || NOTE_P (insn)
+ || (last != 0 && SCHED_GROUP_P (last)))
{
if (!NOTE_P (insn))
{
diff --git a/gcc/target.def b/gcc/target.def
index 6de513f..dae0378 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1041,6 +1041,19 @@ scheduling one insn causes other insns to become ready in the same\n\
cycle. These other insns can then be taken into account properly.",
int, (FILE *file, int verbose, rtx *ready, int *n_readyp, int clock), NULL)
+DEFHOOK
+(macro_fusion_p,
+ "This hook is used to check whether target platform supports macro fusion.",
+ bool, (void), NULL)
+
+DEFHOOK
+(macro_fusion_pair_p,
+ "This hook is used to check whether two insns could be macro fused for\n\
+target microarchitecture. If this hook returns true for the given insn pair\n\
+(@var{condgen} and @var{condjmp}), scheduler will put them into a sched\n\
+group, and they will not be scheduled apart.",
+ bool, (rtx condgen, rtx condjmp), NULL)
+
/* The following member value is a pointer to a function called
after evaluation forward dependencies of insns in chain given
by two parameter values (head and tail correspondingly). */