SSE conversion optimization

Jan Hubicka jh@suse.cz
Sat Sep 8 23:10:00 GMT 2007


Hi,
Amdfam10 prefers doing packed conversions targeting an SSE register rather
than scalar ones.  This basically means the following replacements (a
concrete sketch follows the list):

-      cvtss2sd (source in register) -> unpcklps + cvtps2pd
-      cvtss2sd (source in memory) -> movss + cvtps2pd
-      cvtsd2ss (source in register) -> unpcklpd + cvtpd2ps
-      cvtsd2ss (source in memory) -> movsd + cvtpd2ps
-      cvtsi2sd -> movd + cvtdq2pd
-      cvtsi2ss -> movd + cvtdq2ps
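
As a minimal sketch (the compiler flags and register names in the comments
are illustrative, not taken from the patch), the SF->DF replacement looks
like this at the instruction level:

   /* Compile with e.g. gcc -O2 -march=amdfam10 -mfpmath=sse.  */
   double
   extend (float x)
   {
     /* Scalar form:              cvtss2sd %xmm0, %xmm0
        Packed form (reg source): unpcklps %xmm0, %xmm0
                                  cvtps2pd %xmm0, %xmm0  */
     return (double) x;
   }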

I ended up implementing the first four as post-reload splitters - for the 32-bit
backend we can't expose the packed values too early without risking reload putting
them on the stack, and the code generation depends on the spilling decisions (on
the other hand, I don't see exposing the split forms early as particularly useful).

The cvtsi2sd conversion is easy - one just adds a new pattern requiring the source
to be in an XMM register and does the packed operation instead.  It is not very
likely that the source will actually be in an XMM register, but reload should
handle that case nicely.
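
For reference, a minimal sketch of the SImode->DFmode case (register names
in the comments are hypothetical):

   /* Scalar form:  cvtsi2sd %eax, %xmm0
      Packed form:  movd     %eax, %xmm0
                    cvtdq2pd %xmm0, %xmm0
      int -> double conversion is always exact, so converting garbage in
      the second lane cannot raise a precision exception; this is why the
      DFmode case needs no special care, unlike SFmode below.  */
   double
   int_to_double (int i)
   {
     return (double) i;
   }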

cvtsi2ss is trickier, since one needs to take care not to leave garbage in the
upper half, otherwise a precision exception might occur.  I use the same trick
as for cvtsi2sd when -fno-trapping-math is in effect; for trapping math I either
force the operand to memory at expansion time or do the reloading via movd by
hand on INTER_UNIT_MOVES operands.
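
To make the hazard concrete (a sketch; the garbage value is hypothetical):
cvtdq2ps converts all four 32-bit lanes, and int->float conversion can be
inexact, so stale data in an upper lane can raise a spurious precision
exception under trapping math:

   /* Safe packed form, relying on movd zeroing lanes 1-3:
          movd     mem32, %xmm0
          cvtdq2ps %xmm0, %xmm0   ; converts all four lanes
      Had a lane still held e.g. 0x7fffffff from earlier code, its
      int -> float conversion would be inexact and set the precision
      flag, i.e. a spurious trap with -ftrapping-math.  */
   float
   int_to_float (int i)
   {
     return (float) i;
   }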

We are now testing whether the patch is also good for generic tuning, but because
the benchmarks won't be done before the 10th, I am sending this variant enabled
for amdfam10 only.

The patch was bootstrapped/regtested on x86_64-linux with the conversion enabled
for generic; I am now testing it without.

I will commit it tomorrow if there are no complaints.

2007-09-08  Jan Hubicka  <jh@suse.cz>
	    Dwarakanath Rajagopal <dwarak.rajagopal@amd.com>

	* i386.h (ix86_tune_indices): Add X86_USE_VECTOR_CONVERTS.
	(TARGET_USE_VECTOR_CONVERTS): New.
	* i386.md: New post-reload splitters for converting SF to DF and DF to SF.
	(floatsi* expander): Special case vector conversions.
	(floatsisf2_mixed_vector, floatsisf2_sse_vector_nointernunit,
	floatsisf2_sse_vector_internunit, floatsisf2_sse_vector,
	floatsidf2_mixed_vector, floatsidf2_sse_vector): New.
	(floatsisf2_mixed, floatsisf2_sse, floatsidf2_mixed, floatsidf2_sse):
	Disable when doing vector converts.
	(floatsi<mode>2_i387): Disable when the mode is handled by SSE math.
	* sse.md (vec_dupv2df): Export.
	* i386.c (ix86_tune_features): Enable SSE conversions.
Index: config/i386/i386.h
===================================================================
*** config/i386/i386.h	(revision 128276)
--- config/i386/i386.h	(working copy)
*************** enum ix86_tune_indices {
*** 257,262 ****
--- 257,263 ----
    X86_TUNE_MOVE_M1_VIA_OR,
    X86_TUNE_NOT_UNPAIRABLE,
    X86_TUNE_NOT_VECTORMODE,
+   X86_USE_VECTOR_CONVERTS,
  
    X86_TUNE_LAST
  };
*************** extern unsigned int ix86_tune_features[X
*** 337,342 ****
--- 338,344 ----
  #define	TARGET_MOVE_M1_VIA_OR	ix86_tune_features[X86_TUNE_MOVE_M1_VIA_OR]
  #define TARGET_NOT_UNPAIRABLE	ix86_tune_features[X86_TUNE_NOT_UNPAIRABLE]
  #define TARGET_NOT_VECTORMODE	ix86_tune_features[X86_TUNE_NOT_VECTORMODE]
+ #define TARGET_USE_VECTOR_CONVERTS ix86_tune_features[X86_USE_VECTOR_CONVERTS]
  
  /* Feature tests against the various architecture variations.  */
  enum ix86_arch_indices {
Index: config/i386/i386.md
===================================================================
*** config/i386/i386.md	(revision 128276)
--- config/i386/i386.md	(working copy)
***************
*** 3919,3924 ****
--- 3919,3956 ----
      }
  })
  
+ /* For converting SF(xmm2) to DF(xmm1), use the following code instead of
+    cvtss2sd:
+       unpcklps xmm2,xmm2   ; packed conversion may trap on signaling NaNs
+       cvtps2pd xmm2,xmm1
+    We do the conversion post-reload to avoid producing 128-bit spills,
+    which might lead to an ICE on the 32-bit target.  The sequence is
+    unlikely to combine anyway.  */
+ (define_split
+   [(set (match_operand:DF 0 "register_operand" "")
+         (float_extend:DF
+ 	  (match_operand:SF 1 "nonimmediate_operand" "")))]
+   "TARGET_USE_VECTOR_CONVERTS && !optimize_size 
+    && reload_completed && SSE_REG_P (operands[0])"
+    [(set (match_dup 2)
+ 	 (float_extend:V2DF
+ 	   (vec_select:V2SF
+ 	     (match_dup 0)
+ 	     (parallel [(const_int 0) (const_int 1)]))))]
+ {
+   operands[2] = simplify_gen_subreg (V2DFmode, operands[0], DFmode, 0);
+   operands[0] = simplify_gen_subreg (V4SFmode, operands[0], DFmode, 0);
+   /* Use movss for loading from memory, unpcklps for registers.  */
+   if (REG_P (operands[1]))
+     {
+       operands[1] = simplify_gen_subreg (V4SFmode, operands[1], SFmode, 0);
+       emit_insn (gen_sse_unpcklps (operands[0], operands[0], operands[1]));
+     }
+   else
+     emit_insn (gen_vec_setv4sf_0 (operands[0], 
+ 				  CONST0_RTX (V4SFmode), operands[1]));
+ })
+ 
  (define_insn "*extendsfdf2_mixed"
    [(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,x")
          (float_extend:DF
***************
*** 4012,4017 ****
--- 4044,4079 ----
      }
  })
  
+ /* For converting DF(xmm2) to SF(xmm1), use the following code instead of
+    cvtsd2ss:
+       unpcklpd xmm2,xmm2   ; packed conversion may trap on signaling NaNs
+       cvtpd2ps xmm2,xmm1
+    We do the conversion post-reload to avoid producing 128-bit spills,
+    which might lead to an ICE on the 32-bit target.  The sequence is
+    unlikely to combine anyway.  */
+ (define_split
+   [(set (match_operand:SF 0 "register_operand" "")
+         (float_truncate:SF
+ 	  (match_operand:DF 1 "nonimmediate_operand" "")))]
+   "TARGET_USE_VECTOR_CONVERTS && !optimize_size 
+    && reload_completed && SSE_REG_P (operands[0])"
+    [(set (match_dup 2)
+ 	 (vec_concat:V4SF
+ 	   (float_truncate:V2SF
+ 	     (match_dup 0))
+ 	   (match_dup 3)))]
+ {
+   operands[2] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+   operands[0] = simplify_gen_subreg (V2DFmode, operands[0], SFmode, 0);
+   operands[3] = CONST0_RTX (V2SFmode);
+   /* Use movsd for loading from memory, unpcklpd for registers.  */
+   if (REG_P (operands[1]))
+     emit_insn (gen_vec_dupv2df (operands[0], operands[1]));
+   else
+     emit_insn (gen_sse2_loadlpd (operands[0],
+ 				 CONST0_RTX (V2DFmode), operands[1]));
+ })
+ 
  (define_expand "truncdfsf2_with_temp"
    [(parallel [(set (match_operand:SF 0 "" "")
  		   (float_truncate:SF (match_operand:DF 1 "" "")))
***************
*** 4688,4699 ****
    [(set (match_operand:SSEMODEF 0 "register_operand" "")
  	(float:SSEMODEF (match_operand:SI 1 "nonimmediate_operand" "")))]
    "TARGET_80387 || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
!   "")
  
  (define_insn "*floatsisf2_mixed"
    [(set (match_operand:SF 0 "register_operand" "=f,?f,x,x")
  	(float:SF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m")))]
!   "TARGET_MIX_SSE_I387"
    "@
     fild%z1\t%1
     #
--- 4750,4816 ----
    [(set (match_operand:SSEMODEF 0 "register_operand" "")
  	(float:SSEMODEF (match_operand:SI 1 "nonimmediate_operand" "")))]
    "TARGET_80387 || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
!   "
!    /* When we use vector converts, we can't have the input in memory.  */
!    if (GET_MODE (operands[0]) == DFmode && GET_MODE (operands[1]) == SImode
!        && TARGET_USE_VECTOR_CONVERTS && !optimize_size && TARGET_SSE_MATH
!        && SSE_FLOAT_MODE_P (DFmode))
!      operands[1] = force_reg (SImode, operands[1]);
!    
!    if (GET_MODE (operands[0]) == SFmode && GET_MODE (operands[1]) == SImode
!        && !optimize_size && TARGET_USE_VECTOR_CONVERTS && TARGET_SSE_MATH
!        && SSE_FLOAT_MODE_P (SFmode))
!      {
!        /* When !flag_trapping_math, we handle SImode->SFmode vector
! 	  conversions the same way as SImode->DFmode.
! 
! 	  For flag_trapping_math we can't safely use the vector conversion
! 	  without clearing the upper half, otherwise a precision exception
! 	  might occur.  However we can still generate the common sequence
! 	  converting the value from a general register to an XMM register as:
! 
! 	    mov 	reg32, mem32
! 	    movd	mem32, xmm
! 	    cvtdq2ps	xmm, xmm
! 
! 	  because we know that movd clears the upper half.
! 
! 	  Sadly, in this case we can't rely on reload moving the value to an
! 	  XMM register, since we need to know whether the upper half is OK,
! 	  so we do the reloading by hand.  We force the operand to memory
! 	  unless the target supports inter-unit moves.  */
!        if (!flag_trapping_math)
!          operands[1] = force_reg (SImode, operands[1]);
!        else if (!MEM_P (operands[1]))
! 	 {
! 	   rtx tmp = assign_386_stack_local (SImode, SLOT_VIRTUAL);
! 	   emit_move_insn (tmp, operands[1]);
! 	   operands[1] = tmp;
! 	 }
!      }
!   ")
! 
! (define_insn "*floatsisf2_mixed_vector"
!   [(set (match_operand:SF 0 "register_operand" "=x,f,?f")
! 	(float:SF (match_operand:SI 1 "nonimmediate_operand" "x,m,r")))]
!   "TARGET_MIX_SSE_I387 && !flag_trapping_math
!    && TARGET_USE_VECTOR_CONVERTS && !optimize_size"
!   "@
!    cvtdq2ps\t{%1, %0|%0, %1}
!    fild%z1\t%1
!    #"
!   [(set_attr "type" "sseicvt,fmov,multi")
!    (set_attr "mode" "SF")
!    (set_attr "unit" "*,i387,*")
!    (set_attr "athlon_decode" "double,*,*")
!    (set_attr "amdfam10_decode" "double,*,*")
!    (set_attr "fp_int_src" "false,true,true")])
  
  (define_insn "*floatsisf2_mixed"
    [(set (match_operand:SF 0 "register_operand" "=f,?f,x,x")
  	(float:SF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m")))]
!   "TARGET_MIX_SSE_I387
!    && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
    "@
     fild%z1\t%1
     #
***************
*** 4706,4715 ****
     (set_attr "amdfam10_decode" "*,*,vector,double")
     (set_attr "fp_int_src" "true")])
  
  (define_insn "*floatsisf2_sse"
    [(set (match_operand:SF 0 "register_operand" "=x,x")
  	(float:SF (match_operand:SI 1 "nonimmediate_operand" "r,m")))]
!   "TARGET_SSE_MATH"
    "cvtsi2ss\t{%1, %0|%0, %1}"
    [(set_attr "type" "sseicvt")
     (set_attr "mode" "SF")
--- 4823,4888 ----
     (set_attr "amdfam10_decode" "*,*,vector,double")
     (set_attr "fp_int_src" "true")])
  
+ (define_insn "*floatsisf2_sse_vector_nointernunit"
+   [(set (match_operand:SF 0 "register_operand" "=x")
+ 	(float:SF (match_operand:SI 1 "memory_operand" "m")))]
+   "flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
+    && !TARGET_INTER_UNIT_MOVES"
+   "#"
+   [(set_attr "type" "multi")])
+ 
+ (define_insn "*floatsisf2_sse_vector_internunit"
+   [(set (match_operand:SF 0 "register_operand" "=x,x")
+ 	(float:SF (match_operand:SI 1 "nonimmediate_operand" "rm,x")))]
+   "flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
+    && TARGET_INTER_UNIT_MOVES"
+   "#"
+   [(set_attr "type" "multi")])
+ 
+ (define_split 
+   [(set (match_operand:SF 0 "register_operand" "")
+ 	(float:SF (match_operand:SI 1 "nonimmediate_operand" "")))]
+   "flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && reload_completed
+    && (TARGET_INTER_UNIT_MOVES || MEM_P (operands[1]))
+    && !SSE_REG_P (operands[1]) && SSE_REG_P (operands[0])"
+   [(set (match_dup 0)
+ 	(float:V4SF (match_dup 2)))]
+ {
+   operands[2] = simplify_gen_subreg (V4SImode, operands[0], SFmode, 0);
+   operands[0] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+   emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode), operands[1]));
+ })
+ 
+ (define_split 
+   [(set (match_operand:SF 0 "register_operand" "")
+ 	(float:SF (match_operand:SI 1 "register_operand" "")))]
+   "flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && reload_completed
+    && SSE_REG_P (operands[1]) && SSE_REG_P (operands[0])"
+   [(set (match_dup 2) (vec_duplicate:V4SI (match_dup 1)))
+    (set (match_dup 0)
+ 	(float:V4SF (match_dup 2)))]
+ {
+   operands[2] = simplify_gen_subreg (V4SImode, operands[0], SFmode, 0);
+   operands[0] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+ })
+ 
+ (define_insn "*floatsisf2_sse_vector"
+   [(set (match_operand:SF 0 "register_operand" "=x")
+ 	(float:SF (match_operand:SI 1 "register_operand" "x")))]
+   "!flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
+    && !TARGET_INTER_UNIT_MOVES"
+   "cvtdq2ps\t{%1, %0|%0, %1}"
+   [(set_attr "type" "sseicvt")
+    (set_attr "mode" "SF")
+    (set_attr "athlon_decode" "double")
+    (set_attr "amdfam10_decode" "double")
+    (set_attr "fp_int_src" "true")])
+ 
  (define_insn "*floatsisf2_sse"
    [(set (match_operand:SF 0 "register_operand" "=x,x")
  	(float:SF (match_operand:SI 1 "nonimmediate_operand" "r,m")))]
!   "TARGET_SSE_MATH
!    && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
    "cvtsi2ss\t{%1, %0|%0, %1}"
    [(set_attr "type" "sseicvt")
     (set_attr "mode" "SF")
***************
*** 4717,4754 ****
     (set_attr "amdfam10_decode" "vector,double")
     (set_attr "fp_int_src" "true")])
  
  (define_insn "*floatsidf2_mixed"
!   [(set (match_operand:DF 0 "register_operand" "=f,?f,x,x")
! 	(float:DF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m")))]
!   "TARGET_SSE2 && TARGET_MIX_SSE_I387"
    "@
     fild%z1\t%1
     #
     cvtsi2sd\t{%1, %0|%0, %1}
!    cvtsi2sd\t{%1, %0|%0, %1}"
!   [(set_attr "type" "fmov,multi,sseicvt,sseicvt")
!    (set_attr "mode" "DF")
!    (set_attr "unit" "*,i387,*,*")
!    (set_attr "athlon_decode" "*,*,double,direct")
!    (set_attr "amdfam10_decode" "*,*,vector,double")
     (set_attr "fp_int_src" "true")])
  
  (define_insn "*floatsidf2_sse"
!   [(set (match_operand:DF 0 "register_operand" "=x,x")
! 	(float:DF (match_operand:SI 1 "nonimmediate_operand" "r,m")))]
!   "TARGET_SSE2 && TARGET_SSE_MATH"
!   "cvtsi2sd\t{%1, %0|%0, %1}"
    [(set_attr "type" "sseicvt")
!    (set_attr "mode" "DF")
!    (set_attr "athlon_decode" "double,direct")
!    (set_attr "amdfam10_decode" "vector,double")
     (set_attr "fp_int_src" "true")])
  
  (define_insn "*floatsi<mode>2_i387"
    [(set (match_operand:X87MODEF12 0 "register_operand" "=f,f")
  	(float:X87MODEF12
  	  (match_operand:SI 1 "nonimmediate_operand" "m,?r")))]
!   "TARGET_80387"
    "@
     fild%z1\t%1
     #"
--- 4890,4962 ----
     (set_attr "amdfam10_decode" "vector,double")
     (set_attr "fp_int_src" "true")])
  
+ (define_insn "*floatsidf2_mixed_vector"
+   [(set (match_operand:DF 0 "register_operand" "=x,f,f")
+ 	(float:DF (match_operand:SI 1 "nonimmediate_operand" "x,m,r")))]
+   "TARGET_SSE2 && TARGET_MIX_SSE_I387
+     && TARGET_USE_VECTOR_CONVERTS && !optimize_size"
+   "@
+    cvtdq2pd\t{%1, %0|%0, %1}
+    fild%z1\t%1
+    #"
+   [(set_attr "type" "sseicvt,fmov,multi")
+    (set_attr "mode" "V2DF,DF,DF")
+    (set_attr "unit" "*,*,i387")
+    (set_attr "athlon_decode" "double,*,*")
+    (set_attr "amdfam10_decode" "double,*,*")
+    (set_attr "fp_int_src" "false,true,true")])
+ 
  (define_insn "*floatsidf2_mixed"
!   [(set (match_operand:DF 0 "register_operand" "=f,?f,x,x,!x")
! 	(float:DF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m,x")))]
!   "TARGET_SSE2 && TARGET_MIX_SSE_I387
!     && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
    "@
     fild%z1\t%1
     #
     cvtsi2sd\t{%1, %0|%0, %1}
!    cvtsi2sd\t{%1, %0|%0, %1}
!    cvtdq2pd\t{%1, %0|%0, %1}"
!   [(set_attr "type" "fmov,multi,sseicvt,sseicvt,sseicvt")
!    (set_attr "mode" "DF,DF,DF,DF,V2DF")
!    (set_attr "unit" "*,i387,*,*,*")
!    (set_attr "athlon_decode" "*,*,double,direct,double")
!    (set_attr "amdfam10_decode" "*,*,vector,double,double")
!    (set_attr "fp_int_src" "true,true,true,true,false")])
! 
! (define_insn "*floatsidf2_sse_vector"
!   [(set (match_operand:DF 0 "register_operand" "=!x")
! 	(float:DF (match_operand:SI 1 "register_operand" "x")))]
!   "TARGET_SSE2 && TARGET_SSE_MATH
!    && TARGET_USE_VECTOR_CONVERTS && !optimize_size"
!   "cvtdq2pd\t{%1, %0|%0, %1}"
!   [(set_attr "type" "sseicvt")
!    (set_attr "mode" "V2DF")
!    (set_attr "athlon_decode" "double")
!    (set_attr "amdfam10_decode" "double")
     (set_attr "fp_int_src" "true")])
  
  (define_insn "*floatsidf2_sse"
!   [(set (match_operand:DF 0 "register_operand" "=x,x,!x")
! 	(float:DF (match_operand:SI 1 "nonimmediate_operand" "r,m,x")))]
!   "TARGET_SSE2 && TARGET_SSE_MATH
!    && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
!   "@
!    cvtsi2sd\t{%1, %0|%0, %1} 
!    cvtsi2sd\t{%1, %0|%0, %1} 
!    cvtdq2pd\t{%1, %0|%0, %1}"
    [(set_attr "type" "sseicvt")
!    (set_attr "mode" "DF,DF,V2DF")
!    (set_attr "athlon_decode" "double,direct,double")
!    (set_attr "amdfam10_decode" "vector,double,double")
     (set_attr "fp_int_src" "true")])
  
  (define_insn "*floatsi<mode>2_i387"
    [(set (match_operand:X87MODEF12 0 "register_operand" "=f,f")
  	(float:X87MODEF12
  	  (match_operand:SI 1 "nonimmediate_operand" "m,?r")))]
!   "TARGET_80387
!    && (!TARGET_SSE_MATH || !SSE_FLOAT_MODE_P (GET_MODE (operands[0])))"
    "@
     fild%z1\t%1
     #"
Index: config/i386/sse.md
===================================================================
*** config/i386/sse.md	(revision 128276)
--- config/i386/sse.md	(working copy)
***************
*** 2740,2746 ****
    [(set_attr "type" "sselog1")
     (set_attr "mode" "DF")])
  
! (define_insn "*vec_dupv2df"
    [(set (match_operand:V2DF 0 "register_operand" "=x")
  	(vec_duplicate:V2DF
  	  (match_operand:DF 1 "register_operand" "0")))]
--- 2740,2746 ----
    [(set_attr "type" "sselog1")
     (set_attr "mode" "DF")])
  
! (define_insn "vec_dupv2df"
    [(set (match_operand:V2DF 0 "register_operand" "=x")
  	(vec_duplicate:V2DF
  	  (match_operand:DF 1 "register_operand" "0")))]
Index: config/i386/i386.c
===================================================================
*** config/i386/i386.c	(revision 128276)
--- config/i386/i386.c	(working copy)
*************** unsigned int ix86_tune_features[X86_TUNE
*** 1258,1263 ****
--- 1258,1267 ----
       operand that cannot be represented using a modRM byte.  The XOR
       replacement is long decoded, so this split helps here as well.  */
    m_K6,
+ 
+   /* X86_USE_VECTOR_CONVERTS: Prefer packed SSE conversions from
+      integer to FP.  */
+   m_AMDFAM10,
  };
  
  /* Feature tests against the various architecture variations.  */


