This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: SSE conversion optimization
Hi,
this is a variant of the patch I committed. It changes only amdfam10 codegen
so far, until we figure out what the best setting for generic and
core2 would be.
For AMDFAM10 it is also better to offload the operand of
DImode->SF/DFmode conversions to memory, which I will try to do next.
Honza
Index: ChangeLog
===================================================================
--- ChangeLog (revision 128300)
+++ ChangeLog (working copy)
@@ -1,3 +1,19 @@
+2007-09-09 Jan Hubicka <jh@suse.cz>
+ Dwarakanath Rajagopal <dwarak.rajagopal@amd.com>
+
+ * i386.h (ix86_tune_indices): Add X86_USE_VECTOR_CONVERTS.
+ (TARGET_USE_VECTOR_CONVERTS): New.
+ * i386.md: New post-reload splitters for converting SF to DF and DF to
+ SF.
+ (floatsi* expander): Special case vector conversions.
+ (floatsisf2_mixed_vector, floatsisf2_sse_vector_nointernunit,
+ floatsisf2_sse_vector_internunit, floatsisf2_sse_vector,
+ floatsidf2_mixed_vector, floatsidf2_sse_vector): New.
+ (floatsisf2_mixed, floatsisf2_sse, floatsidf2_mixed, floatsidf2_sse):
+ Disable when doing vector converts.
+ (floatsi<mode>2_i387): Disable when SSE math is used for the mode.
+ * sse.md (vec_dupv2df): Export.
+ * i386.c (ix86_tune_features): Enable SSE conversions.
+
2007-09-09 Richard Guenther <rguenther@suse.de>
* tree-ssa-operands.c (add_virtual_operand): Only mark
Index: config/i386/i386.h
===================================================================
--- config/i386/i386.h (revision 128300)
+++ config/i386/i386.h (working copy)
@@ -257,6 +257,7 @@ enum ix86_tune_indices {
X86_TUNE_MOVE_M1_VIA_OR,
X86_TUNE_NOT_UNPAIRABLE,
X86_TUNE_NOT_VECTORMODE,
+ X86_USE_VECTOR_CONVERTS,
X86_TUNE_LAST
};
@@ -337,6 +338,7 @@ extern unsigned int ix86_tune_features[X
#define TARGET_MOVE_M1_VIA_OR ix86_tune_features[X86_TUNE_MOVE_M1_VIA_OR]
#define TARGET_NOT_UNPAIRABLE ix86_tune_features[X86_TUNE_NOT_UNPAIRABLE]
#define TARGET_NOT_VECTORMODE ix86_tune_features[X86_TUNE_NOT_VECTORMODE]
+#define TARGET_USE_VECTOR_CONVERTS ix86_tune_features[X86_USE_VECTOR_CONVERTS]
/* Feature tests against the various architecture variations. */
enum ix86_arch_indices {
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md (revision 128300)
+++ config/i386/i386.md (working copy)
@@ -3916,6 +3916,49 @@
}
})
+/* For converting SF(xmm2) to DF(xmm1), use the following code instead of
+ cvtss2sd:
+ unpcklps xmm2,xmm2 ; packed conversion might crash on signaling NaNs
+ cvtps2pd xmm2,xmm1
+ We do the conversion post reload to avoid producing 128-bit spills
+ that might lead to an ICE on 32-bit targets. The sequence is unlikely
+ to combine anyway.
+
+ operands[2] is the destination viewed as V2DF. operands[3] is the V4SF
+ register the value is unpacked in: the destination by default, or the
+ source register itself when it is safe to overwrite its upper part. */
+(define_split
+ [(set (match_operand:DF 0 "register_operand" "")
+ (float_extend:DF
+ (match_operand:SF 1 "nonimmediate_operand" "")))]
+ "TARGET_USE_VECTOR_CONVERTS && !optimize_size
+ && reload_completed && SSE_REG_P (operands[0])"
+ [(set (match_dup 2)
+ (float_extend:V2DF
+ (vec_select:V2SF
+ (match_dup 3)
+ (parallel [(const_int 0) (const_int 1)]))))]
+{
+ operands[2] = simplify_gen_subreg (V2DFmode, operands[0], DFmode, 0);
+ operands[3] = simplify_gen_subreg (V4SFmode, operands[0], DFmode, 0);
+ /* Use movss for loading from memory, unpcklps reg, reg for registers.
+ Try to avoid move when unpacking can be done in source. */
+ if (REG_P (operands[1]))
+ {
+ /* If it is unsafe to overwrite upper half of source, we need
+ to move to destination and unpack there. The source is treated as
+ unsafe when it was originally a hard register or a pseudo wider
+ than 4 bytes, and it is not already the destination register. */
+ if ((ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
+ || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 4)
+ && true_regnum (operands[0]) != true_regnum (operands[1]))
+ {
+ rtx tmp = gen_rtx_REG (SFmode, true_regnum (operands[0]));
+ emit_move_insn (tmp, operands[1]);
+ }
+ else
+ operands[3] = simplify_gen_subreg (V4SFmode, operands[1], SFmode, 0);
+ emit_insn (gen_sse_unpcklps (operands[3], operands[3], operands[3]));
+ }
+ else
+ emit_insn (gen_vec_setv4sf_0 (operands[3],
+ CONST0_RTX (V4SFmode), operands[1]));
+})
+
(define_insn "*extendsfdf2_mixed"
[(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,x")
(float_extend:DF
@@ -4009,6 +4052,51 @@
}
})
+/* For converting DF(xmm2) to SF(xmm1), use the following code instead of
+ cvtsd2ss:
+ unpcklpd xmm2,xmm2 ; packed conversion might crash on signaling NaNs
+ cvtpd2ps xmm2,xmm1
+ We do the conversion post reload to avoid producing 128-bit spills
+ that might lead to an ICE on 32-bit targets. The sequence is unlikely
+ to combine anyway.
+
+ operands[2] is the destination viewed as V4SF, operands[3] is the zero
+ V2SF half of the vec_concat, and operands[4] is the V2DF register that
+ holds the duplicated source (the destination by default). */
+(define_split
+ [(set (match_operand:SF 0 "register_operand" "")
+ (float_truncate:SF
+ (match_operand:DF 1 "nonimmediate_operand" "")))]
+ "TARGET_USE_VECTOR_CONVERTS && !optimize_size
+ && reload_completed && SSE_REG_P (operands[0])"
+ [(set (match_dup 2)
+ (vec_concat:V4SF
+ (float_truncate:V2SF
+ (match_dup 4))
+ (match_dup 3)))]
+{
+ operands[2] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+ operands[3] = CONST0_RTX (V2SFmode);
+ operands[4] = simplify_gen_subreg (V2DFmode, operands[0], SFmode, 0);
+ /* Use movsd for loading from memory, unpcklpd for registers.
+ Try to avoid move when unpacking can be done in source, or SSE3
+ movddup is available. */
+ if (REG_P (operands[1]))
+ {
+ /* Without SSE3, copy the source into the destination first when the
+ source differs from the destination and was originally a hard
+ register or a pseudo wider than 8 bytes. */
+ if (!TARGET_SSE3
+ && true_regnum (operands[0]) != true_regnum (operands[1])
+ && (ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
+ || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 8))
+ {
+ rtx tmp = simplify_gen_subreg (DFmode, operands[0], SFmode, 0);
+ emit_move_insn (tmp, operands[1]);
+ operands[1] = tmp;
+ }
+ /* Otherwise, still without SSE3, duplicate in the source register. */
+ else if (!TARGET_SSE3)
+ operands[4] = simplify_gen_subreg (V2DFmode, operands[1], DFmode, 0);
+ emit_insn (gen_vec_dupv2df (operands[4], operands[1]));
+ }
+ else
+ emit_insn (gen_sse2_loadlpd (operands[4],
+ CONST0_RTX (V2DFmode), operands[1]));
+})
+
(define_expand "truncdfsf2_with_temp"
[(parallel [(set (match_operand:SF 0 "" "")
(float_truncate:SF (match_operand:DF 1 "" "")))
@@ -4685,12 +4773,67 @@
[(set (match_operand:MODEF 0 "register_operand" "")
(float:MODEF (match_operand:SI 1 "nonimmediate_operand" "")))]
"TARGET_80387 || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
- "")
+ "
+ /* When we use vector converts, we can't have input in memory. */
+ if (GET_MODE (operands[0]) == DFmode && GET_MODE (operands[1]) == SImode
+ && TARGET_USE_VECTOR_CONVERTS && !optimize_size && TARGET_SSE_MATH
+ && SSE_FLOAT_MODE_P (DFmode))
+ operands[1] = force_reg (SImode, operands[1]);
+
+ if (GET_MODE (operands[0]) == SFmode && GET_MODE (operands[1]) == SImode
+ && !optimize_size && TARGET_USE_VECTOR_CONVERTS && TARGET_SSE_MATH
+ && SSE_FLOAT_MODE_P (SFmode))
+ {
+ /* When !flag_trapping_math, we handle SImode->SFmode vector
+ conversions same way as SImode->DFmode.
+
+ For flag_trapping_math we can't safely use vector conversion without
+ clearing upper half, otherwise precision exception might occur.
+ However we can still generate the common sequence converting value
+ from general register to XMM register as:
+
+ mov reg32, mem32
+ movd mem32, xmm
+ cvtdq2pd xmm,xmm
+
+ because we know that movd clears the upper half.
+
+ Sadly in this case we can't rely on reload moving the value to XMM
+ register, since we need to know if upper half is OK, so we need
+ to do reloading by hand. We force operand to memory unless target
+ supports inter unit moves. */
+ if (!flag_trapping_math)
+ operands[1] = force_reg (SImode, operands[1]);
+ else if (!MEM_P (operands[1]))
+ {
+ rtx tmp = assign_386_stack_local (SImode, SLOT_VIRTUAL);
+ emit_move_insn (tmp, operands[1]);
+ operands[1] = tmp;
+ }
+ }
+ ")
+
+;; SImode -> SFmode conversion for mixed SSE/i387 math, used when packed
+;; (vector) converts are preferred, trapping math is off and we are not
+;; optimizing for size.
+;; Alternative 0 converts within SSE registers using the packed cvtdq2ps.
+;; Alternative 1 is an x87 fild from memory.
+;; Alternative 2 (general-register source, x87 destination) is output as
+;; "#" and therefore has to be split; like the "multi" alternative of
+;; *floatsidf2_mixed_vector it is the one given the explicit i387 unit.
+(define_insn "*floatsisf2_mixed_vector"
+ [(set (match_operand:SF 0 "register_operand" "=x,f,?f")
+ (float:SF (match_operand:SI 1 "nonimmediate_operand" "x,m,r")))]
+ "TARGET_MIX_SSE_I387 && !flag_trapping_math
+ && TARGET_USE_VECTOR_CONVERTS && !optimize_size"
+ "@
+ cvtdq2ps\t{%1, %0|%0, %1}
+ fild%z1\t%1
+ #"
+ [(set_attr "type" "sseicvt,fmov,multi")
+ (set_attr "mode" "SF")
+ (set_attr "unit" "*,*,i387")
+ (set_attr "athlon_decode" "double,*,*")
+ (set_attr "amdfam10_decode" "double,*,*")
+ (set_attr "fp_int_src" "false,true,true")])
(define_insn "*floatsisf2_mixed"
[(set (match_operand:SF 0 "register_operand" "=f,?f,x,x")
(float:SF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m")))]
- "TARGET_MIX_SSE_I387"
+ "TARGET_MIX_SSE_I387
+ && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
"@
fild%z1\t%1
#
@@ -4703,10 +4846,68 @@
(set_attr "amdfam10_decode" "*,*,vector,double")
(set_attr "fp_int_src" "true")])
+;; SImode (memory) -> SFmode (SSE register) conversion under trapping math
+;; when vector converts are preferred, we are not optimizing for size and
+;; inter-unit moves are not enabled. The output template is "#", so the
+;; insn must be split before final output.
+;; NOTE(review): unlike *floatsisf2_sse, the condition does not test
+;; TARGET_SSE_MATH -- confirm this is intended.
+(define_insn "*floatsisf2_sse_vector_nointernunit"
+ [(set (match_operand:SF 0 "register_operand" "=x")
+ (float:SF (match_operand:SI 1 "memory_operand" "m")))]
+ "flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
+ && !TARGET_INTER_UNIT_MOVES"
+ "#"
+ [(set_attr "type" "multi")])
+
+;; SImode -> SFmode (SSE register) conversion under trapping math when
+;; vector converts are preferred, we are not optimizing for size and
+;; inter-unit moves are enabled. The source may be a general register or
+;; memory (alternative 0) or an SSE register (alternative 1). The output
+;; template is "#", so the insn must be split before final output.
+;; NOTE(review): unlike *floatsisf2_sse, the condition does not test
+;; TARGET_SSE_MATH -- confirm this is intended.
+(define_insn "*floatsisf2_sse_vector_internunit"
+ [(set (match_operand:SF 0 "register_operand" "=x,x")
+ (float:SF (match_operand:SI 1 "nonimmediate_operand" "rm,x")))]
+ "flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
+ && TARGET_INTER_UNIT_MOVES"
+ "#"
+ [(set_attr "type" "multi")])
+
+;; Post-reload split of an SImode -> SFmode conversion whose source is not
+;; an SSE register (memory, or a general register when inter-unit moves
+;; are enabled) and whose destination is an SSE register, under trapping
+;; math. The source is first loaded into the destination viewed as V4SI
+;; together with a zero vector, then the whole register is converted as
+;; V4SI -> V4SF.
+;; The !optimize_size test mirrors the insns this split serves; without it
+;; the split would also rewrite the scalar cvtsi2ss form that
+;; *floatsisf2_sse selects when optimizing for size.
+(define_split
+ [(set (match_operand:SF 0 "register_operand" "")
+ (float:SF (match_operand:SI 1 "nonimmediate_operand" "")))]
+ "flag_trapping_math
+ && TARGET_USE_VECTOR_CONVERTS && !optimize_size && reload_completed
+ && (TARGET_INTER_UNIT_MOVES || MEM_P (operands[1]))
+ && !SSE_REG_P (operands[1]) && SSE_REG_P (operands[0])"
+ [(set (match_dup 0)
+ (float:V4SF (match_dup 2)))]
+{
+ /* operands[2] must be computed before operands[0] is replaced by its
+ V4SF view, since both are derived from the original SFmode register. */
+ operands[2] = simplify_gen_subreg (V4SImode, operands[0], SFmode, 0);
+ operands[0] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+ emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode), operands[1]));
+})
+
+;; Post-reload split of an SImode -> SFmode conversion under trapping math
+;; when both source and destination are SSE registers. The source is
+;; duplicated across the destination viewed as V4SI, then the whole
+;; register is converted as V4SI -> V4SF.
+(define_split
+ [(set (match_operand:SF 0 "register_operand" "")
+ (float:SF (match_operand:SI 1 "register_operand" "")))]
+ "flag_trapping_math
+ && TARGET_USE_VECTOR_CONVERTS && reload_completed
+ && SSE_REG_P (operands[1]) && SSE_REG_P (operands[0])"
+ [(set (match_dup 2) (vec_duplicate:V4SI (match_dup 1)))
+ (set (match_dup 0)
+ (float:V4SF (match_dup 2)))]
+{
+ /* operands[2] must be computed before operands[0] is replaced by its
+ V4SF view, since both are derived from the original SFmode register. */
+ operands[2] = simplify_gen_subreg (V4SImode, operands[0], SFmode, 0);
+ operands[0] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+})
+
+;; SImode -> SFmode conversion within SSE registers using the packed
+;; cvtdq2ps, used when trapping math is off, vector converts are
+;; preferred, we are not optimizing for size and inter-unit moves are not
+;; enabled.
+;; NOTE(review): no SF pattern in this group covers !flag_trapping_math
+;; together with TARGET_INTER_UNIT_MOVES -- confirm that combination
+;; cannot occur for tunings that enable vector converts.
+(define_insn "*floatsisf2_sse_vector"
+ [(set (match_operand:SF 0 "register_operand" "=x")
+ (float:SF (match_operand:SI 1 "register_operand" "x")))]
+ "!flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
+ && !TARGET_INTER_UNIT_MOVES"
+ "cvtdq2ps\t{%1, %0|%0, %1}"
+ [(set_attr "type" "sseicvt")
+ (set_attr "mode" "SF")
+ (set_attr "athlon_decode" "double")
+ (set_attr "amdfam10_decode" "double")
+ (set_attr "fp_int_src" "true")])
+
(define_insn "*floatsisf2_sse"
[(set (match_operand:SF 0 "register_operand" "=x,x")
(float:SF (match_operand:SI 1 "nonimmediate_operand" "r,m")))]
- "TARGET_SSE_MATH"
+ "TARGET_SSE_MATH
+ && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
"cvtsi2ss\t{%1, %0|%0, %1}"
[(set_attr "type" "sseicvt")
(set_attr "mode" "SF")
@@ -4714,38 +4915,89 @@
(set_attr "amdfam10_decode" "vector,double")
(set_attr "fp_int_src" "true")])
+;; SImode -> DFmode conversion for mixed SSE/i387 math with SSE2, used when
+;; vector converts are preferred and we are not optimizing for size.
+;; Alternative 0 converts within SSE registers using the packed cvtdq2pd.
+;; Alternative 1 is an x87 fild from memory.
+;; Alternative 2 (general-register source, x87 destination) is output as
+;; "#" and therefore has to be split.
+(define_insn "*floatsidf2_mixed_vector"
+ [(set (match_operand:DF 0 "register_operand" "=x,f,f")
+ (float:DF (match_operand:SI 1 "nonimmediate_operand" "x,m,r")))]
+ "TARGET_SSE2 && TARGET_MIX_SSE_I387
+ && TARGET_USE_VECTOR_CONVERTS && !optimize_size"
+ "@
+ cvtdq2pd\t{%1, %0|%0, %1}
+ fild%z1\t%1
+ #"
+ [(set_attr "type" "sseicvt,fmov,multi")
+ (set_attr "mode" "V2DF,DF,DF")
+ (set_attr "unit" "*,*,i387")
+ (set_attr "athlon_decode" "double,*,*")
+ (set_attr "amdfam10_decode" "double,*,*")
+ (set_attr "fp_int_src" "false,true,true")])
+
+;; Scalar SImode -> DFmode conversion for mixed SSE/i387 math with SSE2.
+;; This is the counterpart of *floatsidf2_mixed_vector: it applies when
+;; vector converts are not preferred or when optimizing for size, so the
+;; two patterns never match at the same time.
+;; Alternatives: fild from memory; "#" for a general-register source with
+;; an x87 destination; cvtsi2sd from a general register or from memory;
+;; and a disparaged cvtdq2pd for a source already in an SSE register.
(define_insn "*floatsidf2_mixed"
- [(set (match_operand:DF 0 "register_operand" "=f,?f,x,x")
- (float:DF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m")))]
- "TARGET_SSE2 && TARGET_MIX_SSE_I387"
+ [(set (match_operand:DF 0 "register_operand" "=f,?f,x,x,!x")
+ (float:DF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m,x")))]
+ "TARGET_SSE2 && TARGET_MIX_SSE_I387
+ && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
"@
fild%z1\t%1
#
cvtsi2sd\t{%1, %0|%0, %1}
- cvtsi2sd\t{%1, %0|%0, %1}"
- [(set_attr "type" "fmov,multi,sseicvt,sseicvt")
- (set_attr "mode" "DF")
- (set_attr "unit" "*,i387,*,*")
- (set_attr "athlon_decode" "*,*,double,direct")
- (set_attr "amdfam10_decode" "*,*,vector,double")
+ cvtsi2sd\t{%1, %0|%0, %1}
+ cvtdq2pd\t{%1, %0|%0, %1}"
+ [(set_attr "type" "fmov,multi,sseicvt,sseicvt,sseicvt")
+ (set_attr "mode" "DF,DF,DF,DF,V2DF")
+ (set_attr "unit" "*,i387,*,*,*")
+ (set_attr "athlon_decode" "*,*,double,direct,double")
+ (set_attr "amdfam10_decode" "*,*,vector,double,double")
+ (set_attr "fp_int_src" "true,true,true,true,false")])
+
+;; SImode -> DFmode conversion within SSE registers using the packed
+;; cvtdq2pd, used for SSE2 math when vector converts are preferred and we
+;; are not optimizing for size. Both operands must be SSE registers.
+(define_insn "*floatsidf2_sse_vector"
+ [(set (match_operand:DF 0 "register_operand" "=x")
+ (float:DF (match_operand:SI 1 "register_operand" "x")))]
+ "TARGET_SSE2 && TARGET_SSE_MATH
+ && TARGET_USE_VECTOR_CONVERTS && !optimize_size"
+ "cvtdq2pd\t{%1, %0|%0, %1}"
+ [(set_attr "type" "sseicvt")
+ (set_attr "mode" "V2DF")
+ (set_attr "athlon_decode" "double")
+ (set_attr "amdfam10_decode" "double")
(set_attr "fp_int_src" "true")])
+;; Post-reload split of an SImode (memory) -> DFmode (SSE register)
+;; conversion. The source is first loaded into the destination viewed as
+;; V4SI together with a zero vector, then elements 0 and 1 are converted
+;; as V2SI -> V2DF.
+;; The !optimize_size test matches the other vector-convert patterns;
+;; without it the split would also rewrite the scalar cvtsi2sd-from-memory
+;; form that *floatsidf2_sse selects when optimizing for size.
+(define_split
+ [(set (match_operand:DF 0 "register_operand" "")
+ (float:DF (match_operand:SI 1 "memory_operand" "")))]
+ "TARGET_USE_VECTOR_CONVERTS && !optimize_size && reload_completed
+ && SSE_REG_P (operands[0])"
+ [(set (match_dup 0)
+ (float:V2DF
+ (vec_select:V2SI
+ (match_dup 2)
+ (parallel [(const_int 0) (const_int 1)]))))]
+{
+ /* operands[2] must be computed before operands[0] is replaced by its
+ V2DF view, since both are derived from the original DFmode register. */
+ operands[2] = simplify_gen_subreg (V4SImode, operands[0], DFmode, 0);
+ operands[0] = simplify_gen_subreg (V2DFmode, operands[0], DFmode, 0);
+ emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode), operands[1]));
+})
+
+;; Scalar SImode -> DFmode conversion for SSE2 math, used when vector
+;; converts are not preferred or when optimizing for size.
+;; Alternatives 0 and 1 use cvtsi2sd from a general register or memory;
+;; alternative 2 is a disparaged cvtdq2pd for a source already in an SSE
+;; register.
(define_insn "*floatsidf2_sse"
- [(set (match_operand:DF 0 "register_operand" "=x,x")
- (float:DF (match_operand:SI 1 "nonimmediate_operand" "r,m")))]
- "TARGET_SSE2 && TARGET_SSE_MATH"
- "cvtsi2sd\t{%1, %0|%0, %1}"
+ [(set (match_operand:DF 0 "register_operand" "=x,x,!x")
+ (float:DF (match_operand:SI 1 "nonimmediate_operand" "r,m,x")))]
+ "TARGET_SSE2 && TARGET_SSE_MATH
+ && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
+ "@
+ cvtsi2sd\t{%1, %0|%0, %1}
+ cvtsi2sd\t{%1, %0|%0, %1}
+ cvtdq2pd\t{%1, %0|%0, %1}"
[(set_attr "type" "sseicvt")
- (set_attr "mode" "DF")
- (set_attr "athlon_decode" "double,direct")
- (set_attr "amdfam10_decode" "vector,double")
+ (set_attr "mode" "DF,DF,V2DF")
+ (set_attr "athlon_decode" "double,direct,double")
+ (set_attr "amdfam10_decode" "vector,double,double")
(set_attr "fp_int_src" "true")])
(define_insn "*floatsi<mode>2_i387"
[(set (match_operand:MODEF 0 "register_operand" "=f,f")
(float:MODEF
(match_operand:SI 1 "nonimmediate_operand" "m,?r")))]
- "TARGET_80387"
+ "TARGET_80387
+ && (!TARGET_SSE_MATH || !SSE_FLOAT_MODE_P (GET_MODE (operands[0])))"
"@
fild%z1\t%1
#"
Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md (revision 128300)
+++ config/i386/sse.md (working copy)
@@ -2740,7 +2740,7 @@
[(set_attr "type" "sselog1")
(set_attr "mode" "DF")])
-(define_insn "*vec_dupv2df"
+(define_insn "vec_dupv2df"
[(set (match_operand:V2DF 0 "register_operand" "=x")
(vec_duplicate:V2DF
(match_operand:DF 1 "register_operand" "0")))]
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c (revision 128300)
+++ config/i386/i386.c (working copy)
@@ -1258,6 +1258,10 @@ unsigned int ix86_tune_features[X86_TUNE
operand that cannot be represented using a modRM byte. The XOR
replacement is long decoded, so this split helps here as well. */
m_K6,
+
+ /* X86_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion from
+ integer to FP. */
+ m_AMDFAM10,
};
/* Feature tests against the various architecture variations. */