This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
SSE fix 4
- From: Jan Hubicka <jh at suse dot cz>
- To: gcc-patches at gcc dot gnu dot org, rth at cygnus dot com
- Date: Mon, 14 Oct 2002 17:22:59 +0200
- Subject: SSE fix 4
Hi,
We generate somewhat lousy code for SSE absolute values, since previously
loading the negation of negative zero into a register didn't work (converting
to LONG_DOUBLE and back threw away the unneeded parts of the NaN, resulting in
a different number). I assume that Richard's rewrite made it work, as it just works now.
This patch makes SSE absolute values much faster by using a vector operand and
an AND instead of a NAND, which required killing the constant operand.
If this is OK, I will update the other splitters to use vector operands too,
as it avoids one reformatting on Athlon, and add a splitter to optimize loads
of vectors having 0 in the upper half into loads of short vectors.
OK for 3.4-BIB branch?
Honza
Mon Oct 14 18:41:50 CEST 2002 Jan Hubicka <jh@suse.cz>
* i386.md (abssf,absdf): Use vector operands for SSE
(abssf2_ifs, absdf2_ifs, absdf2_ifs_rex64 and splitters): Update for
vector operand.
*** i386.md Mon Oct 14 18:15:53 2002
--- /p1/ssediv/egcs/gcc/config/i386/i386.md Mon Oct 14 18:24:01 2002
***************
*** 10068,10081 ****
{
/* Using SSE is tricky, since we need bitwise negation of -0
in register. */
! rtx reg = gen_reg_rtx (SFmode);
rtx dest = operands[0];
operands[1] = force_reg (SFmode, operands[1]);
operands[0] = force_reg (SFmode, operands[0]);
emit_move_insn (reg,
! gen_lowpart (SFmode,
! gen_int_mode (0x80000000, SImode)));
emit_insn (gen_abssf2_ifs (operands[0], operands[1], reg));
if (dest != operands[0])
emit_move_insn (dest, operands[0]);
--- 10068,10084 ----
{
/* Using SSE is tricky, since we need bitwise negation of -0
in register. */
! rtx reg = gen_reg_rtx (V4SFmode);
rtx dest = operands[0];
+ rtx imm;
operands[1] = force_reg (SFmode, operands[1]);
operands[0] = force_reg (SFmode, operands[0]);
+ imm = gen_lowpart (SFmode, gen_int_mode(~0x80000000, SImode));
emit_move_insn (reg,
! gen_rtx_CONST_VECTOR (V4SFmode,
! gen_rtvec (4, imm, CONST0_RTX (SFmode),
! CONST0_RTX (SFmode), CONST0_RTX (SFmode))));
emit_insn (gen_abssf2_ifs (operands[0], operands[1], reg));
if (dest != operands[0])
emit_move_insn (dest, operands[0]);
***************
*** 10092,10100 ****
"#")
(define_insn "abssf2_ifs"
! [(set (match_operand:SF 0 "nonimmediate_operand" "=x#fr,f#xr,rm#xf")
! (abs:SF (match_operand:SF 1 "nonimmediate_operand" "x,0,0")))
! (use (match_operand:SF 2 "nonmemory_operand" "*0#x,*g#x,*g#x"))
(clobber (reg:CC 17))]
"TARGET_SSE
&& (reload_in_progress || reload_completed
--- 10095,10103 ----
"#")
(define_insn "abssf2_ifs"
! [(set (match_operand:SF 0 "nonimmediate_operand" "=x#fr,x#fr,f#xr,rm#xf")
! (abs:SF (match_operand:SF 1 "nonimmediate_operand" "0,x,0,0")))
! (use (match_operand:V4SF 2 "nonimmediate_operand" "xm,0,x*rm,x*rm"))
(clobber (reg:CC 17))]
"TARGET_SSE
&& (reload_in_progress || reload_completed
***************
*** 10105,10111 ****
(define_split
[(set (match_operand:SF 0 "memory_operand" "")
(abs:SF (match_operand:SF 1 "memory_operand" "")))
! (use (match_operand:SF 2 "" ""))
(clobber (reg:CC 17))]
""
[(parallel [(set (match_dup 0)
--- 10108,10114 ----
(define_split
[(set (match_operand:SF 0 "memory_operand" "")
(abs:SF (match_operand:SF 1 "memory_operand" "")))
! (use (match_operand:V4SF 2 "" ""))
(clobber (reg:CC 17))]
""
[(parallel [(set (match_dup 0)
***************
*** 10115,10121 ****
(define_split
[(set (match_operand:SF 0 "register_operand" "")
(abs:SF (match_operand:SF 1 "register_operand" "")))
! (use (match_operand:SF 2 "" ""))
(clobber (reg:CC 17))]
"reload_completed && !SSE_REG_P (operands[0])"
[(parallel [(set (match_dup 0)
--- 10118,10124 ----
(define_split
[(set (match_operand:SF 0 "register_operand" "")
(abs:SF (match_operand:SF 1 "register_operand" "")))
! (use (match_operand:V4SF 2 "" ""))
(clobber (reg:CC 17))]
"reload_completed && !SSE_REG_P (operands[0])"
[(parallel [(set (match_dup 0)
***************
*** 10124,10136 ****
(define_split
[(set (match_operand:SF 0 "register_operand" "")
(abs:SF (match_operand:SF 1 "register_operand" "")))
! (use (match_operand:SF 2 "register_operand" ""))
(clobber (reg:CC 17))]
"reload_completed && SSE_REG_P (operands[0])"
[(set (subreg:TI (match_dup 0) 0)
! (and:TI (not:TI (subreg:TI (match_dup 2) 0))
! (subreg:TI (match_dup 1) 0)))])
;; Keep 'f' and 'r' in separate alternatives to avoid reload problems
;; because of secondary memory needed to reload from class FLOAT_INT_REGS
--- 10127,10158 ----
(define_split
[(set (match_operand:SF 0 "register_operand" "")
+ (abs:SF (match_dup 0)))
+ (use (match_operand:V4SF 1 "nonmemory_operand" ""))
+ (clobber (reg:CC 17))]
+ "reload_completed && SSE_REG_P (operands[0])"
+ [(set (subreg:TI (match_dup 0) 0)
+ (and:TI (subreg:TI (match_dup 0) 0)
+ (subreg:TI (match_dup 1) 0)))]
+ {
+ operands[0] = gen_rtx_SUBREG (V4SFmode, operands[0], 0);
+ })
+
+ (define_split
+ [(set (match_operand:SF 0 "register_operand" "")
(abs:SF (match_operand:SF 1 "register_operand" "")))
! (use (match_operand:V4SF 2 "register_operand" ""))
(clobber (reg:CC 17))]
"reload_completed && SSE_REG_P (operands[0])"
[(set (subreg:TI (match_dup 0) 0)
! (and:TI (subreg:TI (match_dup 0) 0)
! (subreg:TI (match_dup 1) 0)))]
! {
! /* Operand2 should match operand0, as the opposite case is handled above. */
! if (REGNO (operands[2]) != REGNO (operands[0]))
! abort ();
! operands[0] = gen_rtx_SUBREG (V4SFmode, operands[0], 0);
! })
;; Keep 'f' and 'r' in separate alternatives to avoid reload problems
;; because of secondary memory needed to reload from class FLOAT_INT_REGS
***************
*** 10193,10209 ****
{
/* Using SSE is tricky, since we need bitwise negation of -0
in register. */
! rtx reg = gen_reg_rtx (DFmode);
#if HOST_BITS_PER_WIDE_INT >= 64
! rtx imm = gen_int_mode (((HOST_WIDE_INT)1) << 63, DImode);
#else
! rtx imm = immed_double_const (0, 0x80000000, DImode);
#endif
rtx dest = operands[0];
operands[1] = force_reg (DFmode, operands[1]);
operands[0] = force_reg (DFmode, operands[0]);
! emit_move_insn (reg, gen_lowpart (DFmode, imm));
emit_insn (gen_absdf2_ifs (operands[0], operands[1], reg));
if (dest != operands[0])
emit_move_insn (dest, operands[0]);
--- 10215,10236 ----
{
/* Using SSE is tricky, since we need bitwise negation of -0
in register. */
! rtx reg = gen_reg_rtx (V2DFmode);
#if HOST_BITS_PER_WIDE_INT >= 64
! rtx imm = gen_int_mode (~(((HOST_WIDE_INT)1) << 63), DImode);
#else
! rtx imm = immed_double_const (~0, ~0x80000000, DImode);
#endif
rtx dest = operands[0];
operands[1] = force_reg (DFmode, operands[1]);
operands[0] = force_reg (DFmode, operands[0]);
!
! /* Produce LONG_DOUBLE with the proper immediate argument. */
! imm = gen_lowpart (DFmode, imm);
! emit_move_insn (reg,
! gen_rtx_CONST_VECTOR (V2DFmode,
! gen_rtvec (2, imm, CONST0_RTX (DFmode))));
emit_insn (gen_absdf2_ifs (operands[0], operands[1], reg));
if (dest != operands[0])
emit_move_insn (dest, operands[0]);
***************
*** 10220,10228 ****
"#")
(define_insn "absdf2_ifs"
! [(set (match_operand:DF 0 "nonimmediate_operand" "=Y#fr,mf#Yr,mr#Yf")
! (abs:DF (match_operand:DF 1 "nonimmediate_operand" "Y,0,0")))
! (use (match_operand:DF 2 "nonmemory_operand" "*0#Y,*g#Y,*g#Y"))
(clobber (reg:CC 17))]
"!TARGET_64BIT && TARGET_SSE2
&& (reload_in_progress || reload_completed
--- 10247,10255 ----
"#")
(define_insn "absdf2_ifs"
! [(set (match_operand:DF 0 "nonimmediate_operand" "=Y#fr,Y#fr,mf#Yr,mr#Yf")
! (abs:DF (match_operand:DF 1 "nonimmediate_operand" "0,Y,0,0")))
! (use (match_operand:V2DF 2 "nonimmediate_operand" "Ym,0,Y*rm,Y*rm"))
(clobber (reg:CC 17))]
"!TARGET_64BIT && TARGET_SSE2
&& (reload_in_progress || reload_completed
***************
*** 10231,10239 ****
"#")
(define_insn "*absdf2_ifs_rex64"
! [(set (match_operand:DF 0 "nonimmediate_operand" "=Y#fr,mf#Yr")
! (abs:DF (match_operand:DF 1 "nonimmediate_operand" "Y,0")))
! (use (match_operand:DF 2 "nonmemory_operand" "*0#Y,*g#Y"))
(clobber (reg:CC 17))]
"TARGET_64BIT && TARGET_SSE2
&& (reload_in_progress || reload_completed
--- 10258,10266 ----
"#")
(define_insn "*absdf2_ifs_rex64"
! [(set (match_operand:DF 0 "nonimmediate_operand" "=Y#fr,Y#fr,mf#Yr")
! (abs:DF (match_operand:DF 1 "nonimmediate_operand" "0,Y,0")))
! (use (match_operand:V2DF 2 "nonimmediate_operand" "Ym,*0,*Y*rm"))
(clobber (reg:CC 17))]
"TARGET_64BIT && TARGET_SSE2
&& (reload_in_progress || reload_completed
***************
*** 10244,10250 ****
(define_split
[(set (match_operand:DF 0 "memory_operand" "")
(abs:DF (match_operand:DF 1 "memory_operand" "")))
! (use (match_operand:DF 2 "" ""))
(clobber (reg:CC 17))]
""
[(parallel [(set (match_dup 0)
--- 10271,10277 ----
(define_split
[(set (match_operand:DF 0 "memory_operand" "")
(abs:DF (match_operand:DF 1 "memory_operand" "")))
! (use (match_operand:V2DF 2 "" ""))
(clobber (reg:CC 17))]
""
[(parallel [(set (match_dup 0)
***************
*** 10254,10260 ****
(define_split
[(set (match_operand:DF 0 "register_operand" "")
(abs:DF (match_operand:DF 1 "register_operand" "")))
! (use (match_operand:DF 2 "" ""))
(clobber (reg:CC 17))]
"reload_completed && !SSE_REG_P (operands[0])"
[(parallel [(set (match_dup 0)
--- 10281,10287 ----
(define_split
[(set (match_operand:DF 0 "register_operand" "")
(abs:DF (match_operand:DF 1 "register_operand" "")))
! (use (match_operand:V2DF 2 "" ""))
(clobber (reg:CC 17))]
"reload_completed && !SSE_REG_P (operands[0])"
[(parallel [(set (match_dup 0)
***************
*** 10263,10284 ****
(define_split
[(set (match_operand:DF 0 "register_operand" "")
(abs:DF (match_operand:DF 1 "register_operand" "")))
! (use (match_operand:DF 2 "register_operand" ""))
(clobber (reg:CC 17))]
"reload_completed && SSE_REG_P (operands[0])"
[(set (subreg:TI (match_dup 0) 0)
! (and:TI (not:TI (subreg:TI (match_dup 2) 0))
(subreg:TI (match_dup 1) 0)))]
{
/* Avoid possible reformating on the operands. */
if (TARGET_SSE_PARTIAL_REGS && !optimize_size)
! {
! rtx op = gen_rtx_SUBREG (V2DFmode, operands[1], 0);
! emit_insn (gen_sse2_unpcklpd (op, op, op));
! op = gen_rtx_SUBREG (V2DFmode, operands[2], 0);
! emit_insn (gen_sse2_unpcklpd (op, op, op));
! }
})
--- 10290,10326 ----
(define_split
[(set (match_operand:DF 0 "register_operand" "")
+ (abs:DF (match_dup:DF 0)))
+ (use (match_operand:V2DF 1 "nonimmediate_operand" ""))
+ (clobber (reg:CC 17))]
+ "reload_completed && SSE_REG_P (operands[0])"
+ [(set (subreg:TI (match_dup 0) 0)
+ (and:TI (subreg:TI (match_dup 0) 0)
+ (subreg:TI (match_dup 1) 0)))]
+ {
+ operands[0] = gen_rtx_SUBREG (V2DFmode, operands[0], 0);
+ /* Avoid possible reformating on the operands. */
+ if (TARGET_SSE_PARTIAL_REGS && !optimize_size)
+ emit_insn (gen_sse2_unpcklpd (operands[0], operands[0], operands[0]));
+ })
+
+ (define_split
+ [(set (match_operand:DF 0 "register_operand" "")
(abs:DF (match_operand:DF 1 "register_operand" "")))
! (use (match_operand:V2DF 2 "register_operand" ""))
(clobber (reg:CC 17))]
"reload_completed && SSE_REG_P (operands[0])"
[(set (subreg:TI (match_dup 0) 0)
! (and:TI (subreg:TI (match_dup 0) 0)
(subreg:TI (match_dup 1) 0)))]
{
+ operands[0] = gen_rtx_SUBREG (V2DFmode, operands[0], 0);
+ /* Operand2 should match operand0, as the opposite case is handled above. */
+ if (REGNO (operands[2]) != REGNO (operands[0]))
+ abort ();
/* Avoid possible reformating on the operands. */
if (TARGET_SSE_PARTIAL_REGS && !optimize_size)
! emit_insn (gen_sse2_unpcklpd (operands[0], operands[0], operands[0]));
})