This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
Re: SSE conversion optimization

From: Jan Hubicka <hubicka at ucw dot cz>
To: Jan Hubicka <jh at suse dot cz>
Cc: "H.J. Lu" <hjl at lucon dot org>, "Jagasia, Harsha" <harsha dot jagasia at amd dot com>, gcc-patches at gcc dot gnu dot org, "rajagopal, dwarak" <dwarak dot rajagopal at amd dot com>, hongjiu dot lu at intel dot com
Date: Sun, 9 Sep 2007 19:49:04 +0200
Subject: Re: SSE conversion optimization
References: <20070908231754.GA4425@lucon.org> <D5B24B5251882048AD03DDFA431BB790012F03A0@SAUSEXMB3.amd.com> <20070909001305.GA4886@lucon.org> <20070909122531.GF30523@kam.mff.cuni.cz>
Hi,
this is the variant of the patch I committed.  It changes only the amdfam10
codegen for now, until we figure out what the best setting for generic and
core2 would be.
For AMDFAM10 it is also better to offload the operand of DImode->SF/DFmode
conversions to memory, which I will try to do next.

Honza

Index: ChangeLog
===================================================================
--- ChangeLog	(revision 128300)
+++ ChangeLog	(working copy)
@@ -1,3 +1,19 @@
+2007-09-09  Jan Hubicka  <jh@suse.cz>
+	     Dwarakanath Rajagopal <dwarak.rajagopal@amd.com>
+
+	* i386.h (ix86_tune_indices): Add X86_USE_VECTOR_CONVERTS.
+	(TARGET_USE_VECTOR_CONVERTS): New.
+	* i386.md: New post-reload splitters for converting SF to DF and DF to
+	SF.
+	(floatsi* expander): Special case vector conversions.
+	(floatsisf2_mixed_vector, floatsisf2_sse_vector_nointernunit,
+	floatsisf2_sse_vector_internunit, floatsisf2_sse_vector,
+	floatsidf2_mixed_vector, floatsidf2_sse_vector): New.
+	(floatsisf2_mixed, floatsisf2_sse, floatsidf2_mixed, floatsidf2_sse):
+	Disable when doing vector converts.
+	(floatsi<mode>2_i387): Disable when SSE math handles the mode.
+	* sse.md (vec_dupv2df): Export.
+	* i386.c (ix86_tune_features): Enable SSE conversions.
+
 2007-09-09  Richard Guenther  <rguenther@suse.de>
 
 	* tree-ssa-operands.c (add_virtual_operand): Only mark
Index: config/i386/i386.h
===================================================================
--- config/i386/i386.h	(revision 128300)
+++ config/i386/i386.h	(working copy)
@@ -257,6 +257,7 @@ enum ix86_tune_indices {
   X86_TUNE_MOVE_M1_VIA_OR,
   X86_TUNE_NOT_UNPAIRABLE,
   X86_TUNE_NOT_VECTORMODE,
+  X86_USE_VECTOR_CONVERTS,
 
   X86_TUNE_LAST
 };
@@ -337,6 +338,7 @@ extern unsigned int ix86_tune_features[X
 #define	TARGET_MOVE_M1_VIA_OR	ix86_tune_features[X86_TUNE_MOVE_M1_VIA_OR]
 #define TARGET_NOT_UNPAIRABLE	ix86_tune_features[X86_TUNE_NOT_UNPAIRABLE]
 #define TARGET_NOT_VECTORMODE	ix86_tune_features[X86_TUNE_NOT_VECTORMODE]
+#define TARGET_USE_VECTOR_CONVERTS ix86_tune_features[X86_USE_VECTOR_CONVERTS]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 128300)
+++ config/i386/i386.md	(working copy)
@@ -3916,6 +3916,49 @@
     }
 })
 
+/* For converting SF(xmm2) to DF(xmm1), use the following code instead of
+   cvtss2sd:
+      unpcklps xmm2,xmm2   ; packed conversion might crash on signaling NaNs
+      cvtps2pd xmm2,xmm1
+   We do the conversion post reload to avoid producing 128bit spills
+   that might lead to an ICE on 32bit targets.  The sequence is unlikely
+   to combine anyway.  */
+(define_split
+  [(set (match_operand:DF 0 "register_operand" "")
+        (float_extend:DF
+	  (match_operand:SF 1 "nonimmediate_operand" "")))]
+  "TARGET_USE_VECTOR_CONVERTS && !optimize_size 
+   && reload_completed && SSE_REG_P (operands[0])"
+   [(set (match_dup 2)
+	 (float_extend:V2DF
+	   (vec_select:V2SF
+	     (match_dup 3)
+	     (parallel [(const_int 0) (const_int 1)]))))]
+{
+  operands[2] = simplify_gen_subreg (V2DFmode, operands[0], DFmode, 0);
+  operands[3] = simplify_gen_subreg (V4SFmode, operands[0], DFmode, 0);
+  /* Use movss for loading from memory, unpcklps reg, reg for registers.
+     Try to avoid move when unpacking can be done in source.  */
+  if (REG_P (operands[1]))
+    {
+      /* If it is unsafe to overwrite upper half of source, we need
+	 to move to destination and unpack there.  */
+      if ((ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
+	   || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 4)
+	  && true_regnum (operands[0]) != true_regnum (operands[1]))
+	{
+	  rtx tmp = gen_rtx_REG (SFmode, true_regnum (operands[0]));
+	  emit_move_insn (tmp, operands[1]);
+	}
+      else
+	operands[3] = simplify_gen_subreg (V4SFmode, operands[1], SFmode, 0);
+      emit_insn (gen_sse_unpcklps (operands[3], operands[3], operands[3]));
+    }
+  else
+    emit_insn (gen_vec_setv4sf_0 (operands[3], 
+				  CONST0_RTX (V4SFmode), operands[1]));
+})
+
 (define_insn "*extendsfdf2_mixed"
   [(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,x")
         (float_extend:DF
@@ -4009,6 +4052,51 @@
     }
 })
 
+/* For converting DF(xmm2) to SF(xmm1), use the following code instead of
+   cvtsd2ss:
+      unpcklpd xmm2,xmm2   ; packed conversion might crash on signaling NaNs
+      cvtpd2ps xmm2,xmm1
+   We do the conversion post reload to avoid producing 128bit spills
+   that might lead to an ICE on 32bit targets.  The sequence is unlikely
+   to combine anyway.  */
+(define_split
+  [(set (match_operand:SF 0 "register_operand" "")
+        (float_truncate:SF
+	  (match_operand:DF 1 "nonimmediate_operand" "")))]
+  "TARGET_USE_VECTOR_CONVERTS && !optimize_size 
+   && reload_completed && SSE_REG_P (operands[0])"
+   [(set (match_dup 2)
+	 (vec_concat:V4SF
+	   (float_truncate:V2SF
+	     (match_dup 4))
+	   (match_dup 3)))]
+{
+  operands[2] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+  operands[3] = CONST0_RTX (V2SFmode);
+  operands[4] = simplify_gen_subreg (V2DFmode, operands[0], SFmode, 0);
+  /* Use movsd for loading from memory, unpcklpd for registers.
+     Try to avoid move when unpacking can be done in source, or SSE3
+     movddup is available.  */
+  if (REG_P (operands[1]))
+    {
+      if (!TARGET_SSE3
+	  && true_regnum (operands[0]) != true_regnum (operands[1])
+	  && (ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
+	      || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 8))
+	{
+	  rtx tmp = simplify_gen_subreg (DFmode, operands[0], SFmode, 0);
+	  emit_move_insn (tmp, operands[1]);
+	  operands[1] = tmp;
+	}
+      else if (!TARGET_SSE3)
+	operands[4] = simplify_gen_subreg (V2DFmode, operands[1], DFmode, 0);
+      emit_insn (gen_vec_dupv2df (operands[4], operands[1]));
+    }
+  else
+    emit_insn (gen_sse2_loadlpd (operands[4],
+				 CONST0_RTX (V2DFmode), operands[1]));
+})
+
 (define_expand "truncdfsf2_with_temp"
   [(parallel [(set (match_operand:SF 0 "" "")
 		   (float_truncate:SF (match_operand:DF 1 "" "")))
@@ -4685,12 +4773,67 @@
   [(set (match_operand:MODEF 0 "register_operand" "")
 	(float:MODEF (match_operand:SI 1 "nonimmediate_operand" "")))]
   "TARGET_80387 || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
-  "")
+  "
+   /* When we use vector converts, we can't have input in memory.  */
+   if (GET_MODE (operands[0]) == DFmode && GET_MODE (operands[1]) == SImode
+       && TARGET_USE_VECTOR_CONVERTS && !optimize_size && TARGET_SSE_MATH
+       && SSE_FLOAT_MODE_P (DFmode))
+     operands[1] = force_reg (SImode, operands[1]);
+   
+   if (GET_MODE (operands[0]) == SFmode && GET_MODE (operands[1]) == SImode
+       && !optimize_size && TARGET_USE_VECTOR_CONVERTS && TARGET_SSE_MATH
+       && SSE_FLOAT_MODE_P (SFmode))
+     {
+       /* When !flag_trapping_math, we handle SImode->SFmode vector
+	  conversions same way as SImode->DFmode.
+
+	  For flag_trapping_math we can't safely use vector conversion without
+	  clearing the upper half, otherwise a precision exception might occur.
+	  However we can still generate the common sequence converting value
+	  from general register to XMM register as:
+
+	    mov 	reg32, mem32
+	    movd	mem32, xmm
+	    cvtdq2ps xmm,xmm
+
+	  because we know that movd clears the upper half.
+
+	  Sadly in this case we can't rely on reload moving the value to XMM
+	  register, since we need to know if upper half is OK, so we need
+	  to do reloading by hand.  We force operand to memory unless target
+	  supports inter unit moves.  */
+       if (!flag_trapping_math)
+         operands[1] = force_reg (SImode, operands[1]);
+       else if (!MEM_P (operands[1]))
+	 {
+	   rtx tmp = assign_386_stack_local (SImode, SLOT_VIRTUAL);
+	   emit_move_insn (tmp, operands[1]);
+	   operands[1] = tmp;
+	 }
+     }
+  ")
+
+(define_insn "*floatsisf2_mixed_vector"
+  [(set (match_operand:SF 0 "register_operand" "=x,f,?f")
+	(float:SF (match_operand:SI 1 "nonimmediate_operand" "x,m,r")))]
+  "TARGET_MIX_SSE_I387 && !flag_trapping_math 
+   && TARGET_USE_VECTOR_CONVERTS && !optimize_size"
+  "@
+   cvtdq2ps\t{%1, %0|%0, %1}
+   fild%z1\t%1
+   #"
+  [(set_attr "type" "sseicvt,fmov,multi")
+   (set_attr "mode" "SF")
+   (set_attr "unit" "*,i387,*")
+   (set_attr "athlon_decode" "double,*,*")
+   (set_attr "amdfam10_decode" "double,*,*")
+   (set_attr "fp_int_src" "false,true,true")])
 
 (define_insn "*floatsisf2_mixed"
   [(set (match_operand:SF 0 "register_operand" "=f,?f,x,x")
 	(float:SF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m")))]
-  "TARGET_MIX_SSE_I387"
+  "TARGET_MIX_SSE_I387
+   && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
   "@
    fild%z1\t%1
    #
@@ -4703,10 +4846,68 @@
    (set_attr "amdfam10_decode" "*,*,vector,double")
    (set_attr "fp_int_src" "true")])
 
+(define_insn "*floatsisf2_sse_vector_nointernunit"
+  [(set (match_operand:SF 0 "register_operand" "=x")
+	(float:SF (match_operand:SI 1 "memory_operand" "m")))]
+  "flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
+   && !TARGET_INTER_UNIT_MOVES"
+  "#"
+  [(set_attr "type" "multi")])
+
+(define_insn "*floatsisf2_sse_vector_internunit"
+  [(set (match_operand:SF 0 "register_operand" "=x,x")
+	(float:SF (match_operand:SI 1 "nonimmediate_operand" "rm,x")))]
+  "flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
+   && TARGET_INTER_UNIT_MOVES"
+  "#"
+  [(set_attr "type" "multi")])
+
+(define_split 
+  [(set (match_operand:SF 0 "register_operand" "")
+	(float:SF (match_operand:SI 1 "nonimmediate_operand" "")))]
+  "flag_trapping_math
+   && TARGET_USE_VECTOR_CONVERTS && reload_completed
+   && (TARGET_INTER_UNIT_MOVES || MEM_P (operands[1]))
+   && !SSE_REG_P (operands[1]) && SSE_REG_P (operands[0])"
+  [(set (match_dup 0)
+	(float:V4SF (match_dup 2)))]
+{
+  operands[2] = simplify_gen_subreg (V4SImode, operands[0], SFmode, 0);
+  operands[0] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+  emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode), operands[1]));
+})
+
+(define_split 
+  [(set (match_operand:SF 0 "register_operand" "")
+	(float:SF (match_operand:SI 1 "register_operand" "")))]
+  "flag_trapping_math
+   && TARGET_USE_VECTOR_CONVERTS && reload_completed
+   && SSE_REG_P (operands[1]) && SSE_REG_P (operands[0])"
+  [(set (match_dup 2) (vec_duplicate:V4SI (match_dup 1)))
+   (set (match_dup 0)
+	(float:V4SF (match_dup 2)))]
+{
+  operands[2] = simplify_gen_subreg (V4SImode, operands[0], SFmode, 0);
+  operands[0] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+})
+
+(define_insn "*floatsisf2_sse_vector"
+  [(set (match_operand:SF 0 "register_operand" "=x")
+	(float:SF (match_operand:SI 1 "register_operand" "x")))]
+  "!flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
+   && !TARGET_INTER_UNIT_MOVES"
+  "cvtdq2ps\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "mode" "SF")
+   (set_attr "athlon_decode" "double")
+   (set_attr "amdfam10_decode" "double")
+   (set_attr "fp_int_src" "true")])
+
 (define_insn "*floatsisf2_sse"
   [(set (match_operand:SF 0 "register_operand" "=x,x")
 	(float:SF (match_operand:SI 1 "nonimmediate_operand" "r,m")))]
-  "TARGET_SSE_MATH"
+  "TARGET_SSE_MATH
+   && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
   "cvtsi2ss\t{%1, %0|%0, %1}"
   [(set_attr "type" "sseicvt")
    (set_attr "mode" "SF")
@@ -4714,38 +4915,89 @@
    (set_attr "amdfam10_decode" "vector,double")
    (set_attr "fp_int_src" "true")])
 
+(define_insn "*floatsidf2_mixed_vector"
+  [(set (match_operand:DF 0 "register_operand" "=x,f,f")
+	(float:DF (match_operand:SI 1 "nonimmediate_operand" "x,m,r")))]
+  "TARGET_SSE2 && TARGET_MIX_SSE_I387
+   && TARGET_USE_VECTOR_CONVERTS && !optimize_size"
+  "@
+   cvtdq2pd\t{%1, %0|%0, %1}
+   fild%z1\t%1
+   #"
+  [(set_attr "type" "sseicvt,fmov,multi")
+   (set_attr "mode" "V2DF,DF,DF")
+   (set_attr "unit" "*,*,i387")
+   (set_attr "athlon_decode" "double,*,*")
+   (set_attr "amdfam10_decode" "double,*,*")
+   (set_attr "fp_int_src" "false,true,true")])
+
 (define_insn "*floatsidf2_mixed"
-  [(set (match_operand:DF 0 "register_operand" "=f,?f,x,x")
-	(float:DF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m")))]
-  "TARGET_SSE2 && TARGET_MIX_SSE_I387"
+  [(set (match_operand:DF 0 "register_operand" "=f,?f,x,x,!x")
+	(float:DF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m,x")))]
+  "TARGET_SSE2 && TARGET_MIX_SSE_I387
+    && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
   "@
    fild%z1\t%1
    #
    cvtsi2sd\t{%1, %0|%0, %1}
-   cvtsi2sd\t{%1, %0|%0, %1}"
-  [(set_attr "type" "fmov,multi,sseicvt,sseicvt")
-   (set_attr "mode" "DF")
-   (set_attr "unit" "*,i387,*,*")
-   (set_attr "athlon_decode" "*,*,double,direct")
-   (set_attr "amdfam10_decode" "*,*,vector,double")
+   cvtsi2sd\t{%1, %0|%0, %1}
+   cvtdq2pd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "fmov,multi,sseicvt,sseicvt,sseicvt")
+   (set_attr "mode" "DF,DF,DF,DF,V2DF")
+   (set_attr "unit" "*,i387,*,*,*")
+   (set_attr "athlon_decode" "*,*,double,direct,double")
+   (set_attr "amdfam10_decode" "*,*,vector,double,double")
+   (set_attr "fp_int_src" "true,true,true,true,false")])
+
+(define_insn "*floatsidf2_sse_vector"
+  [(set (match_operand:DF 0 "register_operand" "=x")
+	(float:DF (match_operand:SI 1 "register_operand" "x")))]
+  "TARGET_SSE2 && TARGET_SSE_MATH
+   && TARGET_USE_VECTOR_CONVERTS && !optimize_size"
+  "cvtdq2pd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "mode" "V2DF")
+   (set_attr "athlon_decode" "double")
+   (set_attr "amdfam10_decode" "double")
    (set_attr "fp_int_src" "true")])
 
+(define_split 
+  [(set (match_operand:DF 0 "register_operand" "")
+	(float:DF (match_operand:SI 1 "memory_operand" "")))]
+  "TARGET_USE_VECTOR_CONVERTS && reload_completed
+   && SSE_REG_P (operands[0])"
+  [(set (match_dup 0)
+	(float:V2DF
+	  (vec_select:V2SI
+	    (match_dup 2)
+	    (parallel [(const_int 0) (const_int 1)]))))]
+{
+  operands[2] = simplify_gen_subreg (V4SImode, operands[0], DFmode, 0);
+  operands[0] = simplify_gen_subreg (V2DFmode, operands[0], DFmode, 0);
+  emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode), operands[1]));
+})
+
 (define_insn "*floatsidf2_sse"
-  [(set (match_operand:DF 0 "register_operand" "=x,x")
-	(float:DF (match_operand:SI 1 "nonimmediate_operand" "r,m")))]
-  "TARGET_SSE2 && TARGET_SSE_MATH"
-  "cvtsi2sd\t{%1, %0|%0, %1}"
+  [(set (match_operand:DF 0 "register_operand" "=x,x,!x")
+	(float:DF (match_operand:SI 1 "nonimmediate_operand" "r,m,x")))]
+  "TARGET_SSE2 && TARGET_SSE_MATH
+   && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
+  "@
+   cvtsi2sd\t{%1, %0|%0, %1} 
+   cvtsi2sd\t{%1, %0|%0, %1} 
+   cvtdq2pd\t{%1, %0|%0, %1}"
   [(set_attr "type" "sseicvt")
-   (set_attr "mode" "DF")
-   (set_attr "athlon_decode" "double,direct")
-   (set_attr "amdfam10_decode" "vector,double")
+   (set_attr "mode" "DF,DF,V2DF")
+   (set_attr "athlon_decode" "double,direct,double")
+   (set_attr "amdfam10_decode" "vector,double,double")
    (set_attr "fp_int_src" "true")])
 
 (define_insn "*floatsi<mode>2_i387"
   [(set (match_operand:MODEF 0 "register_operand" "=f,f")
 	(float:MODEF
 	  (match_operand:SI 1 "nonimmediate_operand" "m,?r")))]
-  "TARGET_80387"
+  "TARGET_80387
+   && (!TARGET_SSE_MATH || !SSE_FLOAT_MODE_P (GET_MODE (operands[0])))"
   "@
    fild%z1\t%1
    #"
Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md	(revision 128300)
+++ config/i386/sse.md	(working copy)
@@ -2740,7 +2740,7 @@
   [(set_attr "type" "sselog1")
    (set_attr "mode" "DF")])
 
-(define_insn "*vec_dupv2df"
+(define_insn "vec_dupv2df"
   [(set (match_operand:V2DF 0 "register_operand" "=x")
 	(vec_duplicate:V2DF
 	  (match_operand:DF 1 "register_operand" "0")))]
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 128300)
+++ config/i386/i386.c	(working copy)
@@ -1258,6 +1258,10 @@ unsigned int ix86_tune_features[X86_TUNE
      operand that cannot be represented using a modRM byte.  The XOR
      replacement is long decoded, so this split helps here as well.  */
   m_K6,
+
+  /* X86_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion from
+     integer to FP.  */
+  m_AMDFAM10,
 };
 
 /* Feature tests against the various architecture variations.  */
Follow-Ups:
- Re: SSE conversion optimization
  - From: H.J. Lu
- Re: SSE conversion optimization
  - From: H.J. Lu
References:
- Re: SSE conversion optimization
  - From: H.J. Lu
- RE: SSE conversion optimization
  - From: Jagasia, Harsha
- Re: SSE conversion optimization
  - From: H.J. Lu
- Re: SSE conversion optimization
  - From: Jan Hubicka
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]