From 55805e5496208b399bc3faa047fc54a5af3f1a36 Mon Sep 17 00:00:00 2001
From: Yuri Rumyantsev
Date: Fri, 31 May 2013 08:52:42 -0700
Subject: [PATCH] Silvermont (SLM) architecture performance tuning

2013-05-31  Yuri Rumyantsev
	    Igor Zamyatin

	* config/i386/i386.h (enum ix86_tune_indices): Add
	X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS.
	(TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS): New define.

	* config/i386/i386.c (initial_ix86_tune_features)
	<X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS>: Initialize.
	(ix86_lea_outperforms): Handle Silvermont tuning.
	(ix86_avoid_lea_for_add): Add new argument to ix86_lea_outperforms
	call.
	(ix86_use_lea_for_mov): Likewise.
	(ix86_avoid_lea_for_addr): Likewise.
	(ix86_lea_for_add_ok): Likewise.
	(exact_dependency_1): New function.
	(exact_store_load_dependency): Likewise.
	(ix86_adjust_cost): Handle Silvermont tuning.
	(do_reorder_for_imul): Likewise.
	(swap_top_of_ready_list): New function.
	(ix86_sched_reorder): Change to handle Silvermont tuning.

	* config/i386/i386.md (peepholes that split memory operand in fp
	converts): New.

From-SVN: r199546
---
 gcc/ChangeLog           |  36 ++++-
 gcc/config/i386/i386.c  | 310 ++++++++++++++++++++++++++++++++++------
 gcc/config/i386/i386.h  |   3 +
 gcc/config/i386/i386.md |  24 ++++
 4 files changed, 323 insertions(+), 50 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 0c83918e6c56..f357e85a759d 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,29 @@
+2013-05-31  Yuri Rumyantsev
+	    Igor Zamyatin
+
+	Silvermont (SLM) architecture performance tuning.
+	* config/i386/i386.h (enum ix86_tune_indices): Add
+	X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS.
+	(TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS): New define.
+
+	* config/i386/i386.c (initial_ix86_tune_features)
+	<X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS>: Initialize.
+	(ix86_lea_outperforms): Handle Silvermont tuning.
+	(ix86_avoid_lea_for_add): Add new argument to ix86_lea_outperforms
+	call.
+	(ix86_use_lea_for_mov): Likewise.
+	(ix86_avoid_lea_for_addr): Likewise.
+	(ix86_lea_for_add_ok): Likewise.
+	(exact_dependency_1): New function.
+	(exact_store_load_dependency): Likewise.
+	(ix86_adjust_cost): Handle Silvermont tuning.
+	(do_reorder_for_imul): Likewise.
+	(swap_top_of_ready_list): New function.
+	(ix86_sched_reorder): Change to handle Silvermont tuning.
+
+	* config/i386/i386.md (peepholes that split memory operand in fp
+	converts): New.
+
 2013-05-31  Marcus Shawcroft
 
 	* config/aarch64/aarch64.c (aarch64_load_symref_appropriately):
@@ -718,11 +744,11 @@
 
 2013-05-24  Vladimir Makarov
 
-	* lra-constraints.c (emit_spill_move): Use smaller mode for
+	* lra-constraints.c (emit_spill_move): Use smaller mode for
 	mem-mem moves.
-	(check_and_process_move): Consider mem-reg moves for secondary
+	(check_and_process_move): Consider mem-reg moves for secondary
 	too.
-	(curr_insn_transform): Don't lose insns emitted before for
+	(curr_insn_transform): Don't lose insns emitted before for
 	secondary memory moves.
 	(inherit_in_ebb): Mark defined reg.  Add usage only if it is
 	not a reg set up in the current insn.
@@ -1085,8 +1111,8 @@
 
 2013-05-21  Christian Bruel
 
-	* dwarf2out.c (multiple_reg_loc_descriptor): Use dbx_reg_number for
-	spanning registers. LEAF_REG_REMAP is supported only for contiguous
+	* dwarf2out.c (multiple_reg_loc_descriptor): Use dbx_reg_number for
+	spanning registers.  LEAF_REG_REMAP is supported only for contiguous
 	registers. Set register size out of the PARALLEL loop.
 
 2013-05-20  Oleg Endo
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index e8f47c9d417d..69c8165390b6 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2108,7 +2108,12 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
 
   /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
      a conditional move.  */
-  m_ATOM
+  m_ATOM,
+
+  /* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split the memory
+     operand of fp converts into the destination register.  */
+  m_SLM
+
 };
 
 /* Feature tests against the various architecture variations.  */
@@ -17392,10 +17397,24 @@ distance_agu_use (unsigned int regno0, rtx insn)
 
 static bool
 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
-		      unsigned int regno2, int split_cost)
+		      unsigned int regno2, int split_cost, bool has_scale)
 {
   int dist_define, dist_use;
 
+  /* On Silvermont a LEA is justified if it is used non-destructively
+     (the destination differs from both sources) in its 2- or 3-source
+     form, or if the address needs a SCALE factor.  */
+  if (ix86_tune == PROCESSOR_SLM)
+    {
+      if (has_scale)
+	return true;
+      if (split_cost < 1)
+	return false;
+      if (regno0 == regno1 || regno0 == regno2)
+	return false;
+      return true;
+    }
+
   dist_define = distance_non_agu_define (regno1, regno2, insn);
   dist_use = distance_agu_use (regno0, insn);
 
@@ -17484,7 +17503,7 @@ ix86_avoid_lea_for_add (rtx insn, rtx operands[])
   if (regno0 == regno1 || regno0 == regno2)
     return false;
   else
-    return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
+    return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
 }
 
 /* Return true if we should emit lea instruction instead of mov
@@ -17506,7 +17525,7 @@ ix86_use_lea_for_mov (rtx insn, rtx operands[])
   regno0 = true_regnum (operands[0]);
   regno1 = true_regnum (operands[1]);
 
-  return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
+  return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
 }
 
 /* Return true if we need to split lea into a sequence of
@@ -17585,7 +17604,8 @@ ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
       split_cost -= 1;
     }
 
-  return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
+  return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
+				parts.scale > 1);
 }
 
 /* Emit x86 binary operand CODE in mode MODE, where the first operand
@@ -17770,7 +17790,7 @@ ix86_lea_for_add_ok (rtx insn, rtx operands[])
   if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
     return false;
 
-  return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
+  return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
 }
 
 /* Return true if destination reg of SET_BODY is shift count of
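As a minimal standalone sketch of the Silvermont branch added to
ix86_lea_outperforms above (slm_lea_profitable_p is a made-up name, and the
plain unsigned/int parameters are illustrative stand-ins for GCC's regno and
rtx arguments, not part of the patch):

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy model of the SLM decision order: a scaled address always
       justifies LEA, a split cost below 1 rules LEA out, and a
       destructive destination (equal to either source) rules it out.  */
    static bool
    slm_lea_profitable_p (unsigned regno0, unsigned regno1, unsigned regno2,
                          int split_cost, bool has_scale)
    {
      if (has_scale)
        return true;   /* Only LEA can encode base + index * scale.  */
      if (split_cost < 1)
        return false;  /* Splitting into ADD/MOV costs nothing extra.  */
      if (regno0 == regno1 || regno0 == regno2)
        return false;  /* Destructive form: a plain ADD suffices.  */
      return true;     /* Non-destructive 2/3-source LEA is kept.  */
    }

    int
    main (void)
    {
      /* r0 = r1 + r2, distinct registers, nonzero split cost: keep LEA.  */
      printf ("%d\n", slm_lea_profitable_p (0, 1, 2, 1, false)); /* 1 */
      /* r0 = r0 + r2: destructive, so the ADD form is preferred.  */
      printf ("%d\n", slm_lea_profitable_p (0, 0, 2, 1, false)); /* 0 */
      return 0;
    }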
@@ -24368,6 +24388,73 @@ ix86_agi_dependent (rtx set_insn, rtx use_insn)
   return false;
 }
 
+/* Helper function for exact_store_load_dependency.
+   Return true if ADDR appears inside INSN.  */
+static bool
+exact_dependency_1 (rtx addr, rtx insn)
+{
+  enum rtx_code code;
+  const char *format_ptr;
+  int i, j;
+
+  code = GET_CODE (insn);
+  switch (code)
+    {
+    case MEM:
+      if (rtx_equal_p (addr, insn))
+	return true;
+      break;
+    case REG:
+    CASE_CONST_ANY:
+    case SYMBOL_REF:
+    case CODE_LABEL:
+    case PC:
+    case CC0:
+    case EXPR_LIST:
+      return false;
+    default:
+      break;
+    }
+
+  format_ptr = GET_RTX_FORMAT (code);
+  for (i = 0; i < GET_RTX_LENGTH (code); i++)
+    {
+      switch (*format_ptr++)
+	{
+	case 'e':
+	  if (exact_dependency_1 (addr, XEXP (insn, i)))
+	    return true;
+	  break;
+	case 'E':
+	  for (j = 0; j < XVECLEN (insn, i); j++)
+	    if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
+	      return true;
+	  break;
+	}
+    }
+  return false;
+}
+
+/* Return true if an exact store/load dependency exists, i.e. the load
+   reads the same memory address that the store writes.  */
+static bool
+exact_store_load_dependency (rtx store, rtx load)
+{
+  rtx set1, set2;
+
+  set1 = single_set (store);
+  if (!set1)
+    return false;
+  if (!MEM_P (SET_DEST (set1)))
+    return false;
+  set2 = single_set (load);
+  if (!set2)
+    return false;
+  if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
+    return true;
+  return false;
+}
+
 static int
 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
 {
@@ -24519,6 +24606,39 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
 	  else
 	    cost = 0;
 	}
+      break;
+
+    case PROCESSOR_SLM:
+      if (!reload_completed)
+	return cost;
+
+      /* Increase cost of integer loads.  */
+      memory = get_attr_memory (dep_insn);
+      if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+	{
+	  enum attr_unit unit = get_attr_unit (dep_insn);
+	  if (unit == UNIT_INTEGER && cost == 1)
+	    {
+	      if (memory == MEMORY_LOAD)
+		cost = 3;
+	      else
+		{
+		  /* Increase the cost of ld/st for short int types only,
+		     because of the store-forwarding issue.  */
+		  rtx set = single_set (dep_insn);
+		  if (set && (GET_MODE (SET_DEST (set)) == QImode
+			      || GET_MODE (SET_DEST (set)) == HImode))
+		    {
+		      /* Increase the cost only if an exact store/load
+			 dependence exists and INSN is the load.  */
+		      enum attr_memory insn_memory = get_attr_memory (insn);
+		      if (insn_memory == MEMORY_LOAD
+			  && exact_store_load_dependency (dep_insn, insn))
+			cost = 3;
+		    }
+		}
+	    }
+	}
 
     default:
       break;
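To see what the new SLM case and exact_store_load_dependency guard against,
consider a hypothetical C fragment (names made up for illustration, not from
the patch; at higher optimization levels GCC may forward the stored value
instead of reloading it) whose RTL would contain a HImode store immediately
followed by a load from the exact same address. On Silvermont that pattern
risks a store-forwarding stall, so the dependence cost is raised from 1 to 3
and the scheduler tries to keep the two insns apart:

    /* `slot' and `store_then_load' are illustrative names only.  */
    short slot;

    short
    store_then_load (short x)
    {
      slot = x;     /* HImode store.  */
      return slot;  /* Load of the same address right after: an exact
                       store/load dependence, cost bumped to 3.  */
    }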
@@ -24565,47 +24685,32 @@ ia32_multipass_dfa_lookahead (void)
    execution.  It is applied if
    (1) IMUL instruction is on the top of list;
    (2) There exists the only producer of independent IMUL instruction in
-       ready list;
-   (3) Put found producer on the top of ready list.
-   Returns issue rate.  */
-
+       ready list.
+   Return index of the IMUL producer if found, and -1 otherwise.  */
 static int
-ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
-                   int clock_var ATTRIBUTE_UNUSED)
+do_reorder_for_imul (rtx *ready, int n_ready)
 {
-  static int issue_rate = -1;
-  int n_ready = *pn_ready;
-  rtx insn, insn1, insn2;
-  int i;
+  rtx insn, set, insn1, insn2;
   sd_iterator_def sd_it;
   dep_t dep;
   int index = -1;
+  int i;
 
-  /* Set up issue rate.  */
-  issue_rate = ix86_issue_rate();
-
-  /* Do reodering for Atom only.  */
   if (ix86_tune != PROCESSOR_ATOM)
-    return issue_rate;
+    return index;
 
+  /* Do not perform ready-list reordering in the pre-reload pass.  */
   if (!reload_completed)
-    return issue_rate;
-
-  /* Nothing to do if ready list contains only 1 instruction.  */
-  if (n_ready <= 1)
-    return issue_rate;
+    return index;
 
   /* Check that IMUL instruction is on the top of ready list.  */
   insn = ready[n_ready - 1];
-  if (!NONDEBUG_INSN_P (insn))
-    return issue_rate;
-  insn = PATTERN (insn);
-  if (GET_CODE (insn) == PARALLEL)
-    insn = XVECEXP (insn, 0, 0);
-  if (GET_CODE (insn) != SET)
-    return issue_rate;
-  if (!(GET_CODE (SET_SRC (insn)) == MULT
-      && GET_MODE (SET_SRC (insn)) == SImode))
-    return issue_rate;
+  set = single_set (insn);
+  if (!set)
+    return index;
+  if (!(GET_CODE (SET_SRC (set)) == MULT
+      && GET_MODE (SET_SRC (set)) == SImode))
+    return index;
 
   /* Search for producer of independent IMUL instruction.  */
   for (i = n_ready - 2; i>= 0; i--)
@@ -24656,19 +24761,134 @@ ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
       if (index >= 0)
 	break;
     }
-  if (index < 0)
-    return issue_rate; /* Didn't find IMUL producer.  */
+  return index;
+}
+
+/* Try to find the best candidate for the top of the ready list when two
+   insns have the same priority: the better candidate is the one whose
+   producers were scheduled earlier.  Applied for Silvermont only.
+   Return true if the top 2 insns must be interchanged.  */
+static bool
+swap_top_of_ready_list (rtx *ready, int n_ready)
+{
+  rtx top = ready[n_ready - 1];
+  rtx next = ready[n_ready - 2];
+  rtx set;
+  sd_iterator_def sd_it;
+  dep_t dep;
+  int clock1 = -1;
+  int clock2 = -1;
+  #define INSN_TICK(INSN) (HID (INSN)->tick)
 
-  if (sched_verbose > 1)
-    fprintf(dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
-	    INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
+  if (ix86_tune != PROCESSOR_SLM)
+    return false;
 
-  /* Put IMUL producer (ready[index]) at the top of ready list.  */
-  insn1= ready[index];
-  for (i = index; i < n_ready - 1; i++)
-    ready[i] = ready[i + 1];
-  ready[n_ready - 1] = insn1;
+  if (!reload_completed)
+    return false;
+
+  if (!NONDEBUG_INSN_P (top))
+    return false;
+  if (!NONJUMP_INSN_P (top))
+    return false;
+  if (!NONDEBUG_INSN_P (next))
+    return false;
+  if (!NONJUMP_INSN_P (next))
+    return false;
+  set = single_set (top);
+  if (!set)
+    return false;
+  set = single_set (next);
+  if (!set)
+    return false;
+
+  if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
+    {
+      if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
+	return false;
+      /* Determine the winner more precisely.  */
+      FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
+	{
+	  rtx pro;
+	  pro = DEP_PRO (dep);
+	  if (!NONDEBUG_INSN_P (pro))
+	    continue;
+	  if (INSN_TICK (pro) > clock1)
+	    clock1 = INSN_TICK (pro);
+	}
+      FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
+	{
+	  rtx pro;
+	  pro = DEP_PRO (dep);
+	  if (!NONDEBUG_INSN_P (pro))
+	    continue;
+	  if (INSN_TICK (pro) > clock2)
+	    clock2 = INSN_TICK (pro);
+	}
+
+      if (clock1 == clock2)
+	{
+	  /* Determine the winner: the load must win.  */
+	  enum attr_memory memory1, memory2;
+	  memory1 = get_attr_memory (top);
+	  memory2 = get_attr_memory (next);
+	  if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
+	    return true;
+	}
+      return (bool) (clock2 < clock1);
+    }
+  return false;
+  #undef INSN_TICK
+}
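The tie-breaking rule in swap_top_of_ready_list above can be read off a
standalone model (should_swap is a made-up name; the int ticks and bool
flags stand in for INSN_TICK and the "memory" insn attribute):

    #include <stdbool.h>

    /* Decide whether the two top ready insns should be interchanged,
       given the latest producer tick of each and whether each is a load.
       Mirrors the logic above: on a tick tie the load wins; otherwise
       the insn whose producers finished earlier goes first.  */
    bool
    should_swap (int tick_top, int tick_next, bool top_is_load,
                 bool next_is_load)
    {
      if (tick_top == tick_next)
        return next_is_load && !top_is_load;
      return tick_next < tick_top;
    }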
+
+/* Perform possible reordering of the ready list for Atom/Silvermont only.
+   Return issue rate.  */
+static int
+ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
+		    int clock_var)
+{
+  int issue_rate = -1;
+  int n_ready = *pn_ready;
+  int i;
+  rtx insn;
+  int index = -1;
+
+  /* Set up issue rate.  */
+  issue_rate = ix86_issue_rate ();
+
+  /* Do reordering for Atom/SLM only.  */
+  if (ix86_tune != PROCESSOR_ATOM && ix86_tune != PROCESSOR_SLM)
+    return issue_rate;
+
+  /* Nothing to do if ready list contains only 1 instruction.  */
+  if (n_ready <= 1)
+    return issue_rate;
+
+  /* Do reordering for the post-reload scheduler only.  */
+  if (!reload_completed)
+    return issue_rate;
+
+  if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
+    {
+      if (sched_verbose > 1)
+	fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
+		 INSN_UID (ready[index]));
+
+      /* Put IMUL producer (ready[index]) at the top of ready list.  */
+      insn = ready[index];
+      for (i = index; i < n_ready - 1; i++)
+	ready[i] = ready[i + 1];
+      ready[n_ready - 1] = insn;
+      return issue_rate;
+    }
+  if (clock_var != 0 && swap_top_of_ready_list (ready, n_ready))
+    {
+      if (sched_verbose > 1)
+	fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
+		 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
+      /* Swap 2 top elements of ready list.  */
+      insn = ready[n_ready - 1];
+      ready[n_ready - 1] = ready[n_ready - 2];
+      ready[n_ready - 2] = insn;
+    }
   return issue_rate;
 }

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 776582a66de6..85d1a6895808 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -333,6 +333,7 @@ enum ix86_tune_indices {
   X86_TUNE_REASSOC_FP_TO_PARALLEL,
   X86_TUNE_GENERAL_REGS_SSE_SPILL,
   X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE,
+  X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS,
 
   X86_TUNE_LAST
 };
@@ -443,6 +444,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 	ix86_tune_features[X86_TUNE_GENERAL_REGS_SSE_SPILL]
 #define TARGET_AVOID_MEM_OPND_FOR_CMOVE \
 	ix86_tune_features[X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE]
+#define TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS \
+	ix86_tune_features[X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 28b0c78093a5..a9b4dae43d8f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -3625,6 +3625,18 @@
 			      CONST0_RTX (V4SFmode), operands[1]));
 })
 
+;; It's more profitable to split and then extend in the same register.
+(define_peephole2
+  [(set (match_operand:DF 0 "register_operand")
+	(float_extend:DF
+	  (match_operand:SF 1 "memory_operand")))]
+  "TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
+   && optimize_insn_for_speed_p ()
+   && SSE_REG_P (operands[0])"
+  [(set (match_dup 2) (match_dup 1))
+   (set (match_dup 0) (float_extend:DF (match_dup 2)))]
+  "operands[2] = gen_rtx_REG (SFmode, REGNO (operands[0]));")
+
 (define_insn "*extendsfdf2_mixed"
   [(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,x")
 	(float_extend:DF
@@ -3766,6 +3778,18 @@
 			      CONST0_RTX (V2DFmode), operands[1]));
 })
 
+;; It's more profitable to split and then truncate in the same register.
+(define_peephole2
+  [(set (match_operand:SF 0 "register_operand")
+	(float_truncate:SF
+	  (match_operand:DF 1 "memory_operand")))]
+  "TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
+   && optimize_insn_for_speed_p ()
+   && SSE_REG_P (operands[0])"
+  [(set (match_dup 2) (match_dup 1))
+   (set (match_dup 0) (float_truncate:SF (match_dup 2)))]
+  "operands[2] = gen_rtx_REG (DFmode, REGNO (operands[0]));")
+
 (define_expand "truncdfsf2_with_temp"
   [(parallel [(set (match_operand:SF 0)
 		   (float_truncate:SF (match_operand:DF 1)))
-- 
2.43.5
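For reference, the intended effect of the two peephole2 patterns, sketched
on a hypothetical function (load_and_extend is a made-up example, and the
commented assembly is assumed codegen; the exact instructions depend on
options and register allocation):

    double
    load_and_extend (const float *p)
    {
      /* Without the split, the memory operand is folded into the convert:
             cvtss2sd  (%rdi), %xmm0
         With SLM tuning (-mtune=slm) the load is split out first, and the
         convert then runs register-to-register in the same destination:
             movss     (%rdi), %xmm0
             cvtss2sd  %xmm0, %xmm0
         which the new tuning flag deems more profitable on Silvermont.  */
      return (double) *p;
    }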