This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
More SSE conversion tweeks
- From: Jan Hubicka <jh at suse dot cz>
- To: gcc-patches at gcc dot gnu dot org, hongjiu dot lu at intel dot com, harsha dot jagasia at amd dot com
- Date: Mon, 10 Sep 2007 01:40:47 +0200
- Subject: More SSE conversion tweeks
Hi,
this patch adds a TARGET_INTER_UNIT_CONVERSIONS tuning flag that, if disabled,
instructs gcc to output
mov reg, mem
cvtsi2sd mem, xmmreg
instead of converting from reg to xmmreg directly. This is a win for amdfam10
in cases where the packed instruction can't be used (i.e. the source is 64-bit). I would
like to test it for generic too once we settle the other part of
the conversion changes.
I've bootstrapped it for amdfam10, am testing it on x86_64-linux, and will commit it
tomorrow if it passes and there are no complaints.
* i386.h (ix86_tune_indices): Add X86_TUNE_INTER_UNIT_CONVERSIONS.
(TARGET_INTER_UNIT_CONVERSIONS): New.
* i386.md (floatsi expanders): Remove redundant check for SImode
source; offload to memory when asked for.
(floatsisf2_mixed, floatsisf2_sse, floatsidf2_mixed, floatsidf2_sse,
floatdisf2_mixed, floatdisf2_sse, floatdidf2_mixed, floatdidf2_sse):
Update conditions.
(floatsisf2_mixed_memory, floatsisf2_sse_memory,
floatsidf2_mixed_memory, floatsidf2_sse_memory,
floatdisf2_mixed_memory, floatdisf2_sse_memory,
floatdidf2_mixed_memory, floatdidf2_sse_memory): New.
Index: config/i386/i386.h
===================================================================
*** config/i386/i386.h (revision 128301)
--- config/i386/i386.h (working copy)
*************** enum ix86_tune_indices {
*** 243,248 ****
--- 243,249 ----
X86_TUNE_SHIFT1,
X86_TUNE_USE_FFREEP,
X86_TUNE_INTER_UNIT_MOVES,
+ X86_TUNE_INTER_UNIT_CONVERSIONS,
X86_TUNE_FOUR_JUMP_LIMIT,
X86_TUNE_SCHEDULE,
X86_TUNE_USE_BT,
*************** extern unsigned int ix86_tune_features[X
*** 320,325 ****
--- 321,328 ----
#define TARGET_SHIFT1 ix86_tune_features[X86_TUNE_SHIFT1]
#define TARGET_USE_FFREEP ix86_tune_features[X86_TUNE_USE_FFREEP]
#define TARGET_INTER_UNIT_MOVES ix86_tune_features[X86_TUNE_INTER_UNIT_MOVES]
+ #define TARGET_INTER_UNIT_CONVERSIONS\
+ ix86_tune_features[X86_TUNE_INTER_UNIT_CONVERSIONS]
#define TARGET_FOUR_JUMP_LIMIT ix86_tune_features[X86_TUNE_FOUR_JUMP_LIMIT]
#define TARGET_SCHEDULE ix86_tune_features[X86_TUNE_SCHEDULE]
#define TARGET_USE_BT ix86_tune_features[X86_TUNE_USE_BT]
Index: config/i386/i386.md
===================================================================
*** config/i386/i386.md (revision 128301)
--- config/i386/i386.md (working copy)
***************
*** 4775,4788 ****
"TARGET_80387 || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
"
/* When we use vector converts, we can't have input in memory. */
! if (GET_MODE (operands[0]) == DFmode && GET_MODE (operands[1]) == SImode
&& TARGET_USE_VECTOR_CONVERTS && !optimize_size && TARGET_SSE_MATH
&& SSE_FLOAT_MODE_P (DFmode))
operands[1] = force_reg (SImode, operands[1]);
!
! if (GET_MODE (operands[0]) == SFmode && GET_MODE (operands[1]) == SImode
! && !optimize_size && TARGET_USE_VECTOR_CONVERTS && TARGET_SSE_MATH
! && SSE_FLOAT_MODE_P (SFmode))
{
/* When !flag_trapping_math, we handle SImode->SFmode vector
conversions same way as SImode->DFmode.
--- 4775,4787 ----
"TARGET_80387 || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
"
/* When we use vector converts, we can't have input in memory. */
! if (GET_MODE (operands[0]) == DFmode
&& TARGET_USE_VECTOR_CONVERTS && !optimize_size && TARGET_SSE_MATH
&& SSE_FLOAT_MODE_P (DFmode))
operands[1] = force_reg (SImode, operands[1]);
! else if (GET_MODE (operands[0]) == SFmode
! && !optimize_size && TARGET_USE_VECTOR_CONVERTS && TARGET_SSE_MATH
! && SSE_FLOAT_MODE_P (SFmode))
{
/* When !flag_trapping_math, we handle SImode->SFmode vector
conversions same way as SImode->DFmode.
***************
*** 4811,4816 ****
--- 4810,4828 ----
operands[1] = tmp;
}
}
+ /* Offload operand of cvtsi2ss and cvtsi2sd into memory for
+ !TARGET_INTER_UNIT_CONVERSIONS.
+ It is necessary for the patterns to not accept nonmemory operands,
+ as we would otherwise optimize the memory store out later. */
+ else if (!TARGET_INTER_UNIT_CONVERSIONS
+ && TARGET_SSE_MATH && SSE_FLOAT_MODE_P (GET_MODE (operands[0]))
+ && !optimize_size
+ && !MEM_P (operands[1]))
+ {
+ rtx tmp = assign_386_stack_local (GET_MODE (operands[1]), SLOT_VIRTUAL);
+ emit_move_insn (tmp, operands[1]);
+ operands[1] = tmp;
+ }
")
(define_insn "*floatsisf2_mixed_vector"
***************
*** 4833,4839 ****
[(set (match_operand:SF 0 "register_operand" "=f,?f,x,x")
(float:SF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m")))]
"TARGET_MIX_SSE_I387
! && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
"@
fild%z1\t%1
#
--- 4845,4852 ----
[(set (match_operand:SF 0 "register_operand" "=f,?f,x,x")
(float:SF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m")))]
"TARGET_MIX_SSE_I387
! && ((!TARGET_USE_VECTOR_CONVERTS && TARGET_INTER_UNIT_CONVERSIONS)
! || optimize_size)"
"@
fild%z1\t%1
#
***************
*** 4846,4851 ****
--- 4859,4878 ----
(set_attr "amdfam10_decode" "*,*,vector,double")
(set_attr "fp_int_src" "true")])
+ (define_insn "*floatsisf2_mixed_memory"
+ [(set (match_operand:SF 0 "register_operand" "=f,x")
+ (float:SF (match_operand:SI 1 "memory_operand" "m,m")))]
+ "TARGET_MIX_SSE_I387
+ && !TARGET_INTER_UNIT_CONVERSIONS && !optimize_size"
+ "@
+ fild%z1\t%1
+ cvtsi2ss\t{%1, %0|%0, %1}"
+ [(set_attr "type" "fmov,sseicvt")
+ (set_attr "mode" "SF")
+ (set_attr "athlon_decode" "*,double")
+ (set_attr "amdfam10_decode" "*,double")
+ (set_attr "fp_int_src" "true")])
+
(define_insn "*floatsisf2_sse_vector_nointernunit"
[(set (match_operand:SF 0 "register_operand" "=x")
(float:SF (match_operand:SI 1 "memory_operand" "m")))]
***************
*** 4907,4913 ****
[(set (match_operand:SF 0 "register_operand" "=x,x")
(float:SF (match_operand:SI 1 "nonimmediate_operand" "r,m")))]
"TARGET_SSE_MATH
! && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
"cvtsi2ss\t{%1, %0|%0, %1}"
[(set_attr "type" "sseicvt")
(set_attr "mode" "SF")
--- 4934,4941 ----
[(set (match_operand:SF 0 "register_operand" "=x,x")
(float:SF (match_operand:SI 1 "nonimmediate_operand" "r,m")))]
"TARGET_SSE_MATH
! && ((!TARGET_USE_VECTOR_CONVERTS && TARGET_INTER_UNIT_CONVERSIONS)
! || optimize_size)"
"cvtsi2ss\t{%1, %0|%0, %1}"
[(set_attr "type" "sseicvt")
(set_attr "mode" "SF")
***************
*** 4915,4920 ****
--- 4943,4960 ----
(set_attr "amdfam10_decode" "vector,double")
(set_attr "fp_int_src" "true")])
+ (define_insn "*floatsisf2_sse_memory"
+ [(set (match_operand:SF 0 "register_operand" "=x")
+ (float:SF (match_operand:SI 1 "memory_operand" "m")))]
+ "TARGET_SSE_MATH
+ && !TARGET_INTER_UNIT_CONVERSIONS && !optimize_size"
+ "cvtsi2ss\t{%1, %0|%0, %1}"
+ [(set_attr "type" "sseicvt")
+ (set_attr "mode" "SF")
+ (set_attr "athlon_decode" "double")
+ (set_attr "amdfam10_decode" "double")
+ (set_attr "fp_int_src" "true")])
+
(define_insn "*floatsidf2_mixed_vector"
[(set (match_operand:DF 0 "register_operand" "=x,f,f")
(float:DF (match_operand:SI 1 "nonimmediate_operand" "x,m,r")))]
***************
*** 4935,4941 ****
[(set (match_operand:DF 0 "register_operand" "=f,?f,x,x,!x")
(float:DF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m,x")))]
"TARGET_SSE2 && TARGET_MIX_SSE_I387
! && (!TARGET_USE_VECTOR_CONVERTS || !optimize_size)"
"@
fild%z1\t%1
#
--- 4975,4982 ----
[(set (match_operand:DF 0 "register_operand" "=f,?f,x,x,!x")
(float:DF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m,x")))]
"TARGET_SSE2 && TARGET_MIX_SSE_I387
! && ((!TARGET_USE_VECTOR_CONVERTS && TARGET_INTER_UNIT_CONVERSIONS)
! || optimize_size)"
"@
fild%z1\t%1
#
***************
*** 4949,4954 ****
--- 4990,5009 ----
(set_attr "amdfam10_decode" "*,*,vector,double,double")
(set_attr "fp_int_src" "true,true,true,true,false")])
+ (define_insn "*floatsidf2_mixed_memory"
+ [(set (match_operand:DF 0 "register_operand" "=f,x")
+ (float:DF (match_operand:SI 1 "memory_operand" "m,m")))]
+ "TARGET_SSE2 && TARGET_MIX_SSE_I387
+ && !TARGET_INTER_UNIT_CONVERSIONS && !optimize_size"
+ "@
+ fild%z1\t%1
+ cvtsi2sd\t{%1, %0|%0, %1}"
+ [(set_attr "type" "fmov,sseicvt")
+ (set_attr "mode" "DF")
+ (set_attr "athlon_decode" "*,direct")
+ (set_attr "amdfam10_decode" "*,double")
+ (set_attr "fp_int_src" "true")])
+
(define_insn "*floatsidf2_sse_vector"
[(set (match_operand:DF 0 "register_operand" "=x")
(float:DF (match_operand:SI 1 "register_operand" "x")))]
***************
*** 4981,4987 ****
[(set (match_operand:DF 0 "register_operand" "=x,x,!x")
(float:DF (match_operand:SI 1 "nonimmediate_operand" "r,m,x")))]
"TARGET_SSE2 && TARGET_SSE_MATH
! && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
"@
cvtsi2sd\t{%1, %0|%0, %1}
cvtsi2sd\t{%1, %0|%0, %1}
--- 5036,5043 ----
[(set (match_operand:DF 0 "register_operand" "=x,x,!x")
(float:DF (match_operand:SI 1 "nonimmediate_operand" "r,m,x")))]
"TARGET_SSE2 && TARGET_SSE_MATH
! && ((!TARGET_USE_VECTOR_CONVERTS && TARGET_INTER_UNIT_CONVERSIONS)
! || optimize_size)"
"@
cvtsi2sd\t{%1, %0|%0, %1}
cvtsi2sd\t{%1, %0|%0, %1}
***************
*** 4992,4997 ****
--- 5048,5066 ----
(set_attr "amdfam10_decode" "vector,double,double")
(set_attr "fp_int_src" "true")])
+ (define_insn "*floatsidf2_memory"
+ [(set (match_operand:DF 0 "register_operand" "=x")
+ (float:DF (match_operand:SI 1 "memory_operand" "x")))]
+ "TARGET_SSE2 && TARGET_SSE_MATH
+ && ((!TARGET_USE_VECTOR_CONVERTS && TARGET_INTER_UNIT_CONVERSIONS)
+ || optimize_size)"
+ "cvtsi2sd\t{%1, %0|%0, %1}"
+ [(set_attr "type" "sseicvt")
+ (set_attr "mode" "DF")
+ (set_attr "athlon_decode" "direct")
+ (set_attr "amdfam10_decode" "double")
+ (set_attr "fp_int_src" "true")])
+
(define_insn "*floatsi<mode>2_i387"
[(set (match_operand:MODEF 0 "register_operand" "=f,f")
(float:MODEF
***************
*** 5010,5021 ****
[(set (match_operand:SF 0 "register_operand" "")
(float:SF (match_operand:DI 1 "nonimmediate_operand" "")))]
"TARGET_80387 || (TARGET_64BIT && TARGET_SSE_MATH)"
! "")
(define_insn "*floatdisf2_mixed"
[(set (match_operand:SF 0 "register_operand" "=f,?f,x,x")
(float:SF (match_operand:DI 1 "nonimmediate_operand" "m,r,r,m")))]
! "TARGET_64BIT && TARGET_MIX_SSE_I387"
"@
fild%z1\t%1
#
--- 5079,5101 ----
[(set (match_operand:SF 0 "register_operand" "")
(float:SF (match_operand:DI 1 "nonimmediate_operand" "")))]
"TARGET_80387 || (TARGET_64BIT && TARGET_SSE_MATH)"
! {
! if (!TARGET_INTER_UNIT_CONVERSIONS && TARGET_64BIT
! && TARGET_SSE_MATH && SSE_FLOAT_MODE_P (SFmode)
! && !optimize_size
! && !MEM_P (operands[1]))
! {
! rtx tmp = assign_386_stack_local (GET_MODE (operands[1]), SLOT_VIRTUAL);
! emit_move_insn (tmp, operands[1]);
! operands[1] = tmp;
! }
! })
(define_insn "*floatdisf2_mixed"
[(set (match_operand:SF 0 "register_operand" "=f,?f,x,x")
(float:SF (match_operand:DI 1 "nonimmediate_operand" "m,r,r,m")))]
! "TARGET_64BIT && TARGET_MIX_SSE_I387
! && (TARGET_INTER_UNIT_CONVERSIONS || optimize_size)"
"@
fild%z1\t%1
#
***************
*** 5028,5037 ****
(set_attr "amdfam10_decode" "*,*,vector,double")
(set_attr "fp_int_src" "true")])
(define_insn "*floatdisf2_sse"
[(set (match_operand:SF 0 "register_operand" "=x,x")
(float:SF (match_operand:DI 1 "nonimmediate_operand" "r,m")))]
! "TARGET_64BIT && TARGET_SSE_MATH"
"cvtsi2ss{q}\t{%1, %0|%0, %1}"
[(set_attr "type" "sseicvt")
(set_attr "mode" "SF")
--- 5108,5132 ----
(set_attr "amdfam10_decode" "*,*,vector,double")
(set_attr "fp_int_src" "true")])
+ (define_insn "*floatdisf2_mixed"
+ [(set (match_operand:SF 0 "register_operand" "=f,x")
+ (float:SF (match_operand:DI 1 "memory_operand" "m,m")))]
+ "TARGET_64BIT && TARGET_MIX_SSE_I387
+ && !TARGET_INTER_UNIT_CONVERSIONS && !optimize_size"
+ "@
+ fild%z1\t%1
+ cvtsi2ss{q}\t{%1, %0|%0, %1}"
+ [(set_attr "type" "fmov,sseicvt")
+ (set_attr "mode" "SF")
+ (set_attr "athlon_decode" "*,double")
+ (set_attr "amdfam10_decode" "*,double")
+ (set_attr "fp_int_src" "true")])
+
(define_insn "*floatdisf2_sse"
[(set (match_operand:SF 0 "register_operand" "=x,x")
(float:SF (match_operand:DI 1 "nonimmediate_operand" "r,m")))]
! "TARGET_64BIT && TARGET_SSE_MATH
! && (TARGET_INTER_UNIT_CONVERSIONS || optimize_size)"
"cvtsi2ss{q}\t{%1, %0|%0, %1}"
[(set_attr "type" "sseicvt")
(set_attr "mode" "SF")
***************
*** 5039,5044 ****
--- 5134,5151 ----
(set_attr "amdfam10_decode" "vector,double")
(set_attr "fp_int_src" "true")])
+ (define_insn "*floatdisf2_memory"
+ [(set (match_operand:SF 0 "register_operand" "=x")
+ (float:SF (match_operand:DI 1 "memory_operand" "m")))]
+ "TARGET_64BIT && TARGET_SSE_MATH
+ && !TARGET_INTER_UNIT_CONVERSIONS && !optimize_size"
+ "cvtsi2ss{q}\t{%1, %0|%0, %1}"
+ [(set_attr "type" "sseicvt")
+ (set_attr "mode" "SF")
+ (set_attr "athlon_decode" "double")
+ (set_attr "amdfam10_decode" "double")
+ (set_attr "fp_int_src" "true")])
+
(define_expand "floatdidf2"
[(set (match_operand:DF 0 "register_operand" "")
(float:DF (match_operand:DI 1 "nonimmediate_operand" "")))]
***************
*** 5049,5060 ****
ix86_expand_convert_sign_didf_sse (operands[0], operands[1]);
DONE;
}
})
(define_insn "*floatdidf2_mixed"
[(set (match_operand:DF 0 "register_operand" "=f,?f,x,x")
(float:DF (match_operand:DI 1 "nonimmediate_operand" "m,r,r,m")))]
! "TARGET_64BIT && TARGET_SSE2 && TARGET_MIX_SSE_I387"
"@
fild%z1\t%1
#
--- 5156,5177 ----
ix86_expand_convert_sign_didf_sse (operands[0], operands[1]);
DONE;
}
+ if (!TARGET_INTER_UNIT_CONVERSIONS && TARGET_64BIT
+ && TARGET_SSE_MATH && SSE_FLOAT_MODE_P (DFmode)
+ && !optimize_size
+ && !MEM_P (operands[1]))
+ {
+ rtx tmp = assign_386_stack_local (GET_MODE (operands[1]), SLOT_VIRTUAL);
+ emit_move_insn (tmp, operands[1]);
+ operands[1] = tmp;
+ }
})
(define_insn "*floatdidf2_mixed"
[(set (match_operand:DF 0 "register_operand" "=f,?f,x,x")
(float:DF (match_operand:DI 1 "nonimmediate_operand" "m,r,r,m")))]
! "TARGET_64BIT && TARGET_SSE2 && TARGET_MIX_SSE_I387
! && (TARGET_INTER_UNIT_CONVERSIONS || optimize_size)"
"@
fild%z1\t%1
#
***************
*** 5067,5076 ****
(set_attr "amdfam10_decode" "*,*,vector,double")
(set_attr "fp_int_src" "true")])
(define_insn "*floatdidf2_sse"
[(set (match_operand:DF 0 "register_operand" "=x,x")
(float:DF (match_operand:DI 1 "nonimmediate_operand" "r,m")))]
! "TARGET_64BIT && TARGET_SSE2 && TARGET_SSE_MATH"
"cvtsi2sd{q}\t{%1, %0|%0, %1}"
[(set_attr "type" "sseicvt")
(set_attr "mode" "DF")
--- 5184,5208 ----
(set_attr "amdfam10_decode" "*,*,vector,double")
(set_attr "fp_int_src" "true")])
+ (define_insn "*floatdidf2_mixed_memory"
+ [(set (match_operand:DF 0 "register_operand" "=f,x")
+ (float:DF (match_operand:DI 1 "memory_operand" "m,m")))]
+ "TARGET_64BIT && TARGET_SSE2 && TARGET_MIX_SSE_I387
+ && !TARGET_INTER_UNIT_CONVERSIONS && !optimize_size"
+ "@
+ fild%z1\t%1
+ cvtsi2sd{q}\t{%1, %0|%0, %1}"
+ [(set_attr "type" "fmov,sseicvt")
+ (set_attr "mode" "DF")
+ (set_attr "athlon_decode" "*,direct")
+ (set_attr "amdfam10_decode" "*,double")
+ (set_attr "fp_int_src" "true")])
+
(define_insn "*floatdidf2_sse"
[(set (match_operand:DF 0 "register_operand" "=x,x")
(float:DF (match_operand:DI 1 "nonimmediate_operand" "r,m")))]
! "TARGET_64BIT && TARGET_SSE2 && TARGET_SSE_MATH
! && (TARGET_INTER_UNIT_CONVERSIONS || optimize_size)"
"cvtsi2sd{q}\t{%1, %0|%0, %1}"
[(set_attr "type" "sseicvt")
(set_attr "mode" "DF")
***************
*** 5078,5088 ****
(set_attr "amdfam10_decode" "vector,double")
(set_attr "fp_int_src" "true")])
(define_insn "*floatdi<mode>2_i387"
[(set (match_operand:MODEF 0 "register_operand" "=f,f")
(float:MODEF
(match_operand:DI 1 "nonimmediate_operand" "m,?r")))]
! "TARGET_80387"
"@
fild%z1\t%1
#"
--- 5210,5233 ----
(set_attr "amdfam10_decode" "vector,double")
(set_attr "fp_int_src" "true")])
+ (define_insn "*floatdidf2_sse_memory"
+ [(set (match_operand:DF 0 "register_operand" "=x")
+ (float:DF (match_operand:DI 1 "memory_operand" "m")))]
+ "TARGET_64BIT && TARGET_SSE2 && TARGET_SSE_MATH
+ && !TARGET_INTER_UNIT_CONVERSIONS && !optimize_size"
+ "cvtsi2sd{q}\t{%1, %0|%0, %1}"
+ [(set_attr "type" "sseicvt")
+ (set_attr "mode" "DF")
+ (set_attr "athlon_decode" "direct")
+ (set_attr "amdfam10_decode" "double")
+ (set_attr "fp_int_src" "true")])
+
(define_insn "*floatdi<mode>2_i387"
[(set (match_operand:MODEF 0 "register_operand" "=f,f")
(float:MODEF
(match_operand:DI 1 "nonimmediate_operand" "m,?r")))]
! "TARGET_80387
! && (!TARGET_SSE_MATH || !SSE_FLOAT_MODE_P (GET_MODE (operands[0])))"
"@
fild%z1\t%1
#"
Index: config/i386/i386.c
===================================================================
*** config/i386/i386.c (revision 128301)
--- config/i386/i386.c (working copy)
*************** unsigned int ix86_tune_features[X86_TUNE
*** 1209,1214 ****
--- 1209,1217 ----
/* X86_TUNE_INTER_UNIT_MOVES */
~(m_ATHLON_K8_AMDFAM10 | m_GENERIC),
+ /* X86_TUNE_INTER_UNIT_CONVERSIONS */
+ ~(m_AMDFAM10),
+
/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
than 4 branch instructions in the 16 byte window. */
m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,