This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: Fwd: [PATCH] Scheduling result adjustment to enable macro-fusion


Ping.  Is it OK for the x86 maintainer?

Thanks,
Wei Mi.

On Wed, Oct 16, 2013 at 4:25 PM, Wei Mi <wmi@google.com> wrote:
>> Go ahead and consider that pre-approved.  Just send it to the list with a
>> note that I approved it in this thread.
>>
>> Jeff
>
> Thanks! The new patch addressed Jeff's comments.
>
> Is it ok for x86 maintainer?
>
> Thanks,
> Wei Mi.
>
> 2013-10-16  Wei Mi  <wmi@google.com>
>
>         * gcc/config/i386/i386.c (memory_address_length): Extract a part
>         of code to rip_relative_addr_p.
>         (rip_relative_addr_p): New function.
>         (ix86_macro_fusion_p): Ditto.
>         (ix86_macro_fusion_pair_p): Ditto.
>         * gcc/config/i386/i386.h: Add new tune features about macro-fusion.
>         * gcc/config/i386/x86-tune.def (DEF_TUNE): Ditto.
>         * gcc/doc/tm.texi: Generated.
>         * gcc/doc/tm.texi.in: Ditto.
>         * gcc/haifa-sched.c (try_group_insn): New function.
>         (group_insns_for_macro_fusion): Ditto.
>         (sched_init): Call group_insns_for_macro_fusion.
>         * gcc/target.def: Add two hooks: macro_fusion_p and
>         macro_fusion_pair_p.
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 1fd3f60..59b0bcf 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -24204,6 +24204,42 @@ ix86_instantiate_decls (void)
>        instantiate_decl_rtl (s->rtl);
>  }
>
> +/* Check whether x86 address PARTS is a pc-relative address.  */
> +
> +static bool
> +rip_relative_addr_p (struct ix86_address *parts)
> +{
> +  rtx base, index, disp;
> +
> +  base = parts->base;
> +  index = parts->index;
> +  disp = parts->disp;
> +
> +  if (disp && !base && !index)
> +    {
> +      if (TARGET_64BIT)
> +       {
> +         rtx symbol = disp;
> +
> +         if (GET_CODE (disp) == CONST)
> +           symbol = XEXP (disp, 0);
> +         if (GET_CODE (symbol) == PLUS
> +             && CONST_INT_P (XEXP (symbol, 1)))
> +           symbol = XEXP (symbol, 0);
> +
> +         if (GET_CODE (symbol) == LABEL_REF
> +             || (GET_CODE (symbol) == SYMBOL_REF
> +                 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
> +             || (GET_CODE (symbol) == UNSPEC
> +                 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
> +                     || XINT (symbol, 1) == UNSPEC_PCREL
> +                     || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
> +           return true;
> +       }
> +    }
> +  return false;
> +}
> +
>  /* Calculate the length of the memory address in the instruction encoding.
>     Includes addr32 prefix, does not include the one-byte modrm, opcode,
>     or other prefixes.  We never generate addr32 prefix for LEA insn.  */
> @@ -24275,25 +24311,8 @@ memory_address_length (rtx addr, bool lea)
>    else if (disp && !base && !index)
>      {
>        len += 4;
> -      if (TARGET_64BIT)
> -       {
> -         rtx symbol = disp;
> -
> -         if (GET_CODE (disp) == CONST)
> -           symbol = XEXP (disp, 0);
> -         if (GET_CODE (symbol) == PLUS
> -             && CONST_INT_P (XEXP (symbol, 1)))
> -           symbol = XEXP (symbol, 0);
> -
> -         if (GET_CODE (symbol) != LABEL_REF
> -             && (GET_CODE (symbol) != SYMBOL_REF
> -                 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
> -             && (GET_CODE (symbol) != UNSPEC
> -                 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
> -                     && XINT (symbol, 1) != UNSPEC_PCREL
> -                     && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
> -           len++;
> -       }
> +      if (rip_relative_addr_p (&parts))
> +       len++;
>      }
>    else
>      {
> @@ -24856,6 +24875,122 @@ ia32_multipass_dfa_lookahead (void)
>      }
>  }
>
> +/* Return true if target platform supports macro-fusion.  */
> +
> +static bool
> +ix86_macro_fusion_p ()
> +{
> +  if (TARGET_FUSE_CMP_AND_BRANCH)
> +    return true;
> +  else
> +    return false;
> +}
> +
> +/* Check whether current microarchitecture support macro fusion
> +   for insn pair "CONDGEN + CONDJMP". Refer to
> +   "Intel Architectures Optimization Reference Manual". */
> +
> +static bool
> +ix86_macro_fusion_pair_p (rtx condgen, rtx condjmp)
> +{
> +  rtx src, dest;
> +  rtx single_set = single_set (condgen);
> +  enum rtx_code ccode;
> +  rtx compare_set = NULL_RTX, test_if, cond;
> +  rtx alu_set = NULL_RTX, addr = NULL_RTX;
> +
> +  if (get_attr_type (condgen) != TYPE_TEST
> +      && get_attr_type (condgen) != TYPE_ICMP
> +      && get_attr_type (condgen) != TYPE_INCDEC
> +      && get_attr_type (condgen) != TYPE_ALU)
> +    return false;
> +
> +  if (single_set == NULL_RTX
> +      && !TARGET_FUSE_ALU_AND_BRANCH)
> +    return false;
> +
> +  if (single_set != NULL_RTX)
> +    compare_set = single_set;
> +  else
> +    {
> +      int i;
> +      rtx pat = PATTERN (condgen);
> +      for (i = 0; i < XVECLEN (pat, 0); i++)
> +       if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
> +         {
> +           rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
> +           if (GET_CODE (set_src) == COMPARE)
> +             compare_set = XVECEXP (pat, 0, i);
> +           else
> +             alu_set = XVECEXP (pat, 0, i);
> +         }
> +    }
> +  if (compare_set == NULL_RTX)
> +    return false;
> +  src = SET_SRC (compare_set);
> +  if (GET_CODE (src) != COMPARE)
> +    return false;
> +
> +  /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
> +     supported.  */
> +  if ((MEM_P (XEXP (src, 0))
> +       && CONST_INT_P (XEXP (src, 1)))
> +      || (MEM_P (XEXP (src, 1))
> +         && CONST_INT_P (XEXP (src, 0))))
> +    return false;
> +
> +  /* No fusion for RIP-relative address.  */
> +  if (MEM_P (XEXP (src, 0)))
> +    addr = XEXP (XEXP (src, 0), 0);
> +  else if (MEM_P (XEXP (src, 1)))
> +    addr = XEXP (XEXP (src, 1), 0);
> +
> +  if (addr) {
> +    ix86_address parts;
> +    int ok = ix86_decompose_address (addr, &parts);
> +    gcc_assert (ok);
> +
> +    if (rip_relative_addr_p (&parts))
> +      return false;
> +  }
> +
> +  test_if = SET_SRC (pc_set (condjmp));
> +  cond = XEXP (test_if, 0);
> +  ccode = GET_CODE (cond);
> +  /* Check whether conditional jump use Sign or Overflow Flags.  */
> +  if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
> +      && (ccode == GE
> +          || ccode == GT
> +         || ccode == LE
> +         || ccode == LT))
> +    return false;
> +
> +  /* Return true for TYPE_TEST and TYPE_ICMP.  */
> +  if (get_attr_type (condgen) == TYPE_TEST
> +      || get_attr_type (condgen) == TYPE_ICMP)
> +    return true;
> +
> +  /* The following is the case that macro-fusion for alu + jmp.  */
> +  if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
> +    return false;
> +
> +  /* No fusion for alu op with memory destination operand.  */
> +  dest = SET_DEST (alu_set);
> +  if (MEM_P (dest))
> +    return false;
> +
> +  /* Macro-fusion for inc/dec + unsigned conditional jump is not
> +     supported.  */
> +  if (get_attr_type (condgen) == TYPE_INCDEC
> +      && (ccode == GEU
> +         || ccode == GTU
> +         || ccode == LEU
> +         || ccode == LTU))
> +    return false;
> +
> +  return true;
> +}
> +
>  /* Try to reorder ready list to take advantage of Atom pipelined IMUL
>     execution. It is applied if
>     (1) IMUL instruction is on the top of list;
> @@ -42993,6 +43128,10 @@ ix86_memmodel_check (unsigned HOST_WIDE_INT val)
>  #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
>  #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
>    ia32_multipass_dfa_lookahead
> +#undef TARGET_SCHED_MACRO_FUSION_P
> +#define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
> +#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
> +#define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
>
>  #undef TARGET_FUNCTION_OK_FOR_SIBCALL
>  #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 788cb8a..68fabd9 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -362,8 +362,17 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
>         ix86_tune_features[X86_TUNE_USE_VECTOR_FP_CONVERTS]
>  #define TARGET_USE_VECTOR_CONVERTS \
>         ix86_tune_features[X86_TUNE_USE_VECTOR_CONVERTS]
> +#define TARGET_FUSE_CMP_AND_BRANCH_32 \
> +       ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32]
> +#define TARGET_FUSE_CMP_AND_BRANCH_64 \
> +       ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_64]
>  #define TARGET_FUSE_CMP_AND_BRANCH \
> -       ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH]
> +       (TARGET_64BIT ? TARGET_FUSE_CMP_AND_BRANCH_64 \
> +        : TARGET_FUSE_CMP_AND_BRANCH_32)
> +#define TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS \
> +       ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS]
> +#define TARGET_FUSE_ALU_AND_BRANCH \
> +       ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH]
>  #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
>  #define TARGET_VECTORIZE_DOUBLE \
>         ix86_tune_features[X86_TUNE_VECTORIZE_DOUBLE]
> diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> index 4ae5f70..3d395b0 100644
> --- a/gcc/config/i386/x86-tune.def
> +++ b/gcc/config/i386/x86-tune.def
> @@ -193,10 +193,24 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS,
> "use_vector_fp_converts",
>  /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
>     from integer to FP. */
>  DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
> -/* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
> -   with a subsequent conditional jump instruction into a single
> -   compare-and-branch uop.  */
> -DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch", m_BDVER)
> +/* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent
> +   conditional jump instruction for 32 bit TARGET.  */
> +DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32",
> +          m_CORE_ALL | m_BDVER)
> +/* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent
> +   conditional jump instruction for TARGET_64BIT.  */
> +DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64",
> +          m_COREI7 | m_COREI7_AVX | m_HASWELL | m_BDVER)
> +/* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a
> +   subsequent conditional jump instruction when the condition jump
> +   check sign flag (SF) or overflow flag (OF).  */
> +DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
> +          m_COREI7 | m_COREI7_AVX | m_HASWELL | m_BDVER)
> +/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
> +   jump instruction when the alu instruction produces the CCFLAG consumed by
> +   the conditional jump instruction. */
> +DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
> +          m_COREI7_AVX | m_HASWELL)
>  /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
>     will impact LEA instruction selection. */
>  DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM)
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index d15f53c..66b45b9 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -6553,6 +6553,17 @@ scheduling one insn causes other insns to
> become ready in the same
>  cycle.  These other insns can then be taken into account properly.
>  @end deftypefn
>
> +@deftypefn {Target Hook} bool TARGET_SCHED_MACRO_FUSION_P (void)
> +This hook is used to check whether target platform supports macro fusion.
> +@end deftypefn
> +
> +@deftypefn {Target Hook} bool TARGET_SCHED_MACRO_FUSION_PAIR_P (rtx
> @var{condgen}, rtx @var{condjmp})
> +This hook is used to check whether two insns could be macro fused for
> +target microarchitecture. If this hook returns true for the given insn pair
> +(@var{condgen} and @var{condjmp}), scheduler will put them into a sched
> +group, and they will not be scheduled apart.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} void
> TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK (rtx @var{head}, rtx
> @var{tail})
>  This hook is called after evaluation forward dependencies of insns in
>  chain given by two parameter values (@var{head} and @var{tail}
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index b51d7b3..361ee87 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -4940,6 +4940,10 @@ them: try the first ones in this list first.
>
>  @hook TARGET_SCHED_REORDER2
>
> +@hook TARGET_SCHED_MACRO_FUSION_P
> +
> +@hook TARGET_SCHED_MACRO_FUSION_PAIR_P
> +
>  @hook TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
>
>  @hook TARGET_SCHED_INIT
> diff --git a/gcc/haifa-sched.c b/gcc/haifa-sched.c
> index 61eaaef..e24009d 100644
> --- a/gcc/haifa-sched.c
> +++ b/gcc/haifa-sched.c
> @@ -6519,6 +6519,50 @@ setup_sched_dump (void)
>                 ? stderr : dump_file);
>  }
>
> +/* Try to group comparison and the following conditional jump INSN if
> +   they're already adjacent. This is to prevent scheduler from scheduling
> +   them apart.  */
> +
> +static void
> +try_group_insn (rtx insn)
> +{
> +  unsigned int condreg1, condreg2;
> +  rtx cc_reg_1;
> +  rtx prev;
> +
> +  if (!any_condjump_p (insn))
> +    return;
> +
> +  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
> +  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
> +  prev = prev_nonnote_nondebug_insn (insn);
> +  if (!reg_referenced_p (cc_reg_1, PATTERN (insn))
> +      || !prev
> +      || !modified_in_p (cc_reg_1, prev))
> +    return;
> +
> +  /* Different microarchitectures support macro fusions for different
> +     combinations of insn pairs.  */
> +  if (!targetm.sched.macro_fusion_pair_p
> +      || !targetm.sched.macro_fusion_pair_p (prev, insn))
> +    return;
> +
> +  SCHED_GROUP_P (insn) = 1;
> +}
> +
> +/* If the last cond jump and the cond register defining insn are consecutive
> +   before scheduling, we want them to be in a schedule group. This is good
> +   for performance on microarchitectures supporting macro-fusion.  */
> +
> +static void
> +group_insns_for_macro_fusion ()
> +{
> +  basic_block bb;
> +
> +  FOR_EACH_BB (bb)
> +    try_group_insn (BB_END (bb));
> +}
> +
>  /* Initialize some global state for the scheduler.  This function works
>     with the common data shared between all the schedulers.  It is called
>     from the scheduler specific initialization routine.  */
> @@ -6645,6 +6689,11 @@ sched_init (void)
>      }
>
>    curr_state = xmalloc (dfa_state_size);
> +
> +  /* Group compare and branch insns for macro-fusion.  */
> +  if (targetm.sched.macro_fusion_p
> +      && targetm.sched.macro_fusion_p ())
> +    group_insns_for_macro_fusion ();
>  }
>
>  static void haifa_init_only_bb (basic_block, basic_block);
> diff --git a/gcc/target.def b/gcc/target.def
> index 6de513f..dae0378 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -1041,6 +1041,19 @@ scheduling one insn causes other insns to
> become ready in the same\n\
>  cycle.  These other insns can then be taken into account properly.",
>   int, (FILE *file, int verbose, rtx *ready, int *n_readyp, int clock), NULL)
>
> +DEFHOOK
> +(macro_fusion_p,
> + "This hook is used to check whether target platform supports macro fusion.",
> + bool, (void), NULL)
> +
> +DEFHOOK
> +(macro_fusion_pair_p,
> + "This hook is used to check whether two insns could be macro fused for\n\
> +target microarchitecture. If this hook returns true for the given insn pair\n\
> +(@var{condgen} and @var{condjmp}), scheduler will put them into a sched\n\
> +group, and they will not be scheduled apart.",
> + bool, (rtx condgen, rtx condjmp), NULL)
> +
>  /* The following member value is a pointer to a function called
>     after evaluation forward dependencies of insns in chain given
>     by two parameter values (head and tail correspondingly).  */
>
>
>
>
> 2013-10-16  Wei Mi  <wmi@google.com>
>         * gcc/sched-rgn.c (add_branch_dependences): Keep insns in
>         a SCHED_GROUP at the end of BB to retain their location.
>
> diff --git a/gcc/sched-rgn.c b/gcc/sched-rgn.c
> index e1a2dce..156359e 100644
> --- a/gcc/sched-rgn.c
> +++ b/gcc/sched-rgn.c
> @@ -2443,6 +2443,8 @@ add_branch_dependences (rtx head, rtx tail)
>       cc0 setters remain at the end because they can't be moved away from
>       their cc0 user.
>
> +     Predecessors of SCHED_GROUP_P instructions at the end remain at the end.
> +
>       COND_EXEC insns cannot be moved past a branch (see e.g. PR17808).
>
>       Insns setting TARGET_CLASS_LIKELY_SPILLED_P registers (usually return
> @@ -2465,7 +2467,8 @@ add_branch_dependences (rtx head, rtx tail)
>  #endif
>                  || (!reload_completed
>                      && sets_likely_spilled (PATTERN (insn)))))
> -        || NOTE_P (insn))
> +        || NOTE_P (insn)
> +        || (last != 0 && SCHED_GROUP_P (last)))
>      {
>        if (!NOTE_P (insn))
>         {


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]