This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

patch for sse vector misalignment


And here are the bits for x86 to implement the SSE misaligned
vector loads.

Tested on i686 and x86_64.


r~


        * config/i386/i386.c (ix86_expand_vector_move): Tidy.
        (ix86_expand_vector_move_misalign): New.
        (ix86_misaligned_mem_ok): Remove.
        (TARGET_VECTORIZE_MISALIGNED_MEM_OK): Remove.
        * config/i386/i386-protos.h: Update.
        * config/i386/i386.md (SSEMODEI): Rename from SSEINT16.
        (MMXMODEI): Rename from MMXINT8.
        (SSEMODE, MMXMODE, movmisalign<mode>): New.

Index: i386-protos.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386-protos.h,v
retrieving revision 1.122
diff -c -p -d -r1.122 i386-protos.h
*** i386-protos.h	14 Dec 2004 22:45:23 -0000	1.122
--- i386-protos.h	23 Dec 2004 10:15:22 -0000
*************** extern void i386_output_dwarf_dtprel (FI
*** 125,130 ****
--- 125,131 ----
  extern void ix86_expand_clear (rtx);
  extern void ix86_expand_move (enum machine_mode, rtx[]);
  extern void ix86_expand_vector_move (enum machine_mode, rtx[]);
+ extern void ix86_expand_vector_move_misalign (enum machine_mode, rtx[]);
  extern void ix86_expand_binary_operator (enum rtx_code,
  					 enum machine_mode, rtx[]);
  extern int ix86_binary_operator_ok (enum rtx_code, enum machine_mode, rtx[]);
Index: i386.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.c,v
retrieving revision 1.762
diff -c -p -d -r1.762 i386.c
*** i386.c	23 Dec 2004 07:49:24 -0000	1.762
--- i386.c	23 Dec 2004 10:15:25 -0000
*************** static void ix86_expand_strlensi_unroll_
*** 867,873 ****
  static int ix86_issue_rate (void);
  static int ix86_adjust_cost (rtx, rtx, rtx, int);
  static int ia32_multipass_dfa_lookahead (void);
- static bool ix86_misaligned_mem_ok (enum machine_mode);
  static void ix86_init_mmx_sse_builtins (void);
  static rtx x86_this_parameter (tree);
  static void x86_output_mi_thunk (FILE *, tree, HOST_WIDE_INT,
--- 867,872 ----
*************** static void init_ext_80387_constants (vo
*** 1010,1018 ****
  #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
    ia32_multipass_dfa_lookahead
  
- #undef TARGET_VECTORIZE_MISALIGNED_MEM_OK
- #define TARGET_VECTORIZE_MISALIGNED_MEM_OK ix86_misaligned_mem_ok
- 
  #undef TARGET_FUNCTION_OK_FOR_SIBCALL
  #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
  
--- 1009,1014 ----
*************** ix86_expand_move (enum machine_mode mode
*** 7556,7583 ****
  void
  ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
  {
    /* Force constants other than zero into memory.  We do not know how
       the instructions used to build constants modify the upper 64 bits
       of the register, once we have that information we may be able
       to handle some of them more efficiently.  */
    if ((reload_in_progress | reload_completed) == 0
!       && register_operand (operands[0], mode)
!       && CONSTANT_P (operands[1]) && operands[1] != CONST0_RTX (mode))
!     operands[1] = validize_mem (force_const_mem (mode, operands[1]));
  
    /* Make operand1 a register if it isn't already.  */
    if (!no_new_pseudos
!       && !register_operand (operands[0], mode)
!       && !register_operand (operands[1], mode))
      {
!       rtx temp = force_reg (GET_MODE (operands[1]), operands[1]);
!       emit_move_insn (operands[0], temp);
        return;
      }
  
!   emit_insn (gen_rtx_SET (VOIDmode, operands[0], operands[1]));
  }
  
  /* Attempt to expand a binary operator.  Make the expansion closer to the
     actual machine, then just general_operand, which will allow 3 separate
     memory references (one output, two input) in a single insn.  */
--- 7552,7700 ----
  void
  ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
  {
+   rtx op0 = operands[0], op1 = operands[1];
+ 
    /* Force constants other than zero into memory.  We do not know how
       the instructions used to build constants modify the upper 64 bits
       of the register, once we have that information we may be able
       to handle some of them more efficiently.  */
    if ((reload_in_progress | reload_completed) == 0
!       && register_operand (op0, mode)
!       && CONSTANT_P (op1) && op1 != CONST0_RTX (mode))
!     op1 = validize_mem (force_const_mem (mode, op1));
  
    /* Make operand1 a register if it isn't already.  */
    if (!no_new_pseudos
!       && !register_operand (op0, mode)
!       && !register_operand (op1, mode))
      {
!       emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
        return;
      }
  
!   emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
! }
! 
! /* Implement the movmisalign patterns for SSE.  Non-SSE modes go 
!    straight to ix86_expand_vector_move.  */
! 
! void
! ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
! {
!   rtx op0, op1, m;
! 
!   op0 = operands[0];
!   op1 = operands[1];
! 
!   if (MEM_P (op1))
!     {
!       /* If we're optimizing for size, movups is the smallest.  */
!       if (optimize_size)
! 	{
! 	  op0 = gen_lowpart (V4SFmode, op0);
! 	  op1 = gen_lowpart (V4SFmode, op1);
! 	  emit_insn (gen_sse_movups (op0, op1));
! 	  return;
! 	}
! 
!       /* ??? If we have typed data, then it would appear that using
! 	 movdqu is the only way to get unaligned data loaded with
! 	 integer type.  */
!       if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
! 	{
! 	  op0 = gen_lowpart (V16QImode, op0);
! 	  op1 = gen_lowpart (V16QImode, op1);
! 	  emit_insn (gen_sse2_movdqu (op0, op1));
! 	  return;
! 	}
! 
!       if (TARGET_SSE2 && mode == V2DFmode)
! 	{
! 	  /* When SSE registers are split into halves, we can avoid
! 	     writing to the top half twice.  */
! 	  if (TARGET_SSE_SPLIT_REGS)
! 	    {
! 	      emit_insn (gen_rtx_CLOBBER (VOIDmode, op0));
! 	      m = adjust_address (op1, DFmode, 0);
! 	      emit_insn (gen_sse2_loadlpd (op0, op0, m));
! 	      m = adjust_address (op1, DFmode, 8);
! 	      emit_insn (gen_sse2_loadhpd (op0, op0, m));
! 	    }
! 	  else
! 	    {
! 	      /* ??? Not sure about the best option for the Intel chips.
! 		 The following would seem to satisfy; the register is
! 		 entirely cleared, breaking the dependency chain.  We
! 		 then store to the upper half, with a dependency depth
! 		 of one.  A rumor has it that Intel recommends two movsd
! 		 followed by an unpacklpd, but this is unconfirmed.  And
! 		 given that the dependency depth of the unpacklpd would
! 		 still be one, I'm not sure why this would be better.  */
! 	      m = adjust_address (op1, DFmode, 0);
! 	      emit_insn (gen_sse2_loadsd (op0, m));
! 	      m = adjust_address (op1, DFmode, 8);
! 	      emit_insn (gen_sse2_loadhpd (op0, op0, m));
! 	    }
! 	}
!       else
! 	{
! 	  if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
! 	    emit_move_insn (op0, CONST0_RTX (mode));
! 	  else
! 	    emit_insn (gen_rtx_CLOBBER (VOIDmode, op0));
! 
! 	  op0 = gen_lowpart (V4SFmode, op0);
! 	  m = adjust_address (op1, V4SFmode, 0);
! 	  emit_insn (gen_sse_movlps (op0, op0, m));
! 	  m = adjust_address (op1, V4SFmode, 8);
! 	  emit_insn (gen_sse_movhps (op0, op0, m));
! 	}
!     }
!   else if (MEM_P (op0))
!     {
!       /* If we're optimizing for size, movups is the smallest.  */
!       if (optimize_size)
! 	{
! 	  op0 = gen_lowpart (V4SFmode, op0);
! 	  op1 = gen_lowpart (V4SFmode, op1);
! 	  emit_insn (gen_sse_movups (op0, op1));
! 	  return;
! 	}
! 
!       /* ??? Similar to above, only less clear because of quote
! 	 typeless stores unquote.  */
!       if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
! 	  && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
!         {
! 	  op0 = gen_lowpart (V16QImode, op0);
! 	  op1 = gen_lowpart (V16QImode, op1);
! 	  emit_insn (gen_sse2_movdqu (op0, op1));
! 	  return;
! 	}
! 
!       if (TARGET_SSE2 && mode == V2DFmode)
! 	{
! 	  m = adjust_address (op0, DFmode, 0);
! 	  emit_insn (gen_sse2_storelpd (m, op1));
! 	  m = adjust_address (op0, DFmode, 8);
! 	  emit_insn (gen_sse2_storehpd (m, op1));
! 	  return;
! 	}
!       else
! 	{
! 	  op1 = gen_lowpart (V4SFmode, op1);
! 	  m = adjust_address (op0, V4SFmode, 0);
! 	  emit_insn (gen_sse_movlps (m, m, op1));
! 	  m = adjust_address (op0, V4SFmode, 8);
! 	  emit_insn (gen_sse_movhps (m, m, op1));
! 	  return;
! 	}
!     }
!   else
!     gcc_unreachable ();
  }
  
+ 
  /* Attempt to expand a binary operator.  Make the expansion closer to the
     actual machine, then just general_operand, which will allow 3 separate
     memory references (one output, two input) in a single insn.  */
*************** ia32_multipass_dfa_lookahead (void)
*** 11727,11743 ****
  }
  
  
- /* Implement the target hook targetm.vectorize.misaligned_mem_ok.  */
- 
- static bool
- ix86_misaligned_mem_ok (enum machine_mode mode)
- {
-   if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
-     return true;
-   else
-     return false;
- }
- 
  /* Compute the alignment given to a constant that is being placed in memory.
     EXP is the constant and ALIGN is the alignment that the object would
     ordinarily have.
--- 11844,11849 ----
Index: i386.md
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.md,v
retrieving revision 1.591
diff -c -p -d -r1.591 i386.md
*** i386.md	23 Dec 2004 07:49:24 -0000	1.591
--- i386.md	23 Dec 2004 10:15:29 -0000
***************
*** 19789,19799 ****
  
  ;; 16 byte integral modes handled by SSE, minus TImode, which gets
  ;; special-cased for TARGET_64BIT.
! (define_mode_macro SSEINT16 [V16QI V8HI V4SI V2DI])
  
  (define_expand "mov<mode>"
!   [(set (match_operand:SSEINT16 0 "nonimmediate_operand" "")
! 	(match_operand:SSEINT16 1 "nonimmediate_operand" ""))]
    "TARGET_SSE"
  {
    ix86_expand_vector_move (<MODE>mode, operands);
--- 19789,19799 ----
  
  ;; 16 byte integral modes handled by SSE, minus TImode, which gets
  ;; special-cased for TARGET_64BIT.
! (define_mode_macro SSEMODEI [V16QI V8HI V4SI V2DI])
  
  (define_expand "mov<mode>"
!   [(set (match_operand:SSEMODEI 0 "nonimmediate_operand" "")
! 	(match_operand:SSEMODEI 1 "nonimmediate_operand" ""))]
    "TARGET_SSE"
  {
    ix86_expand_vector_move (<MODE>mode, operands);
***************
*** 19801,19808 ****
  })
  
  (define_insn "*mov<mode>_internal"
!   [(set (match_operand:SSEINT16 0 "nonimmediate_operand" "=x,x ,m")
! 	(match_operand:SSEINT16 1 "vector_move_operand"  "C ,xm,x"))]
    "TARGET_SSE
     && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)"
  {
--- 19801,19808 ----
  })
  
  (define_insn "*mov<mode>_internal"
!   [(set (match_operand:SSEMODEI 0 "nonimmediate_operand" "=x,x ,m")
! 	(match_operand:SSEMODEI 1 "vector_move_operand"  "C ,xm,x"))]
    "TARGET_SSE
     && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)"
  {
***************
*** 19842,19852 ****
  	       (const_string "TI")))])
  
  ;; 8 byte integral modes handled by MMX (and by extension, SSE)
! (define_mode_macro MMXINT8 [V8QI V4HI V2SI])
  
  (define_expand "mov<mode>"
!   [(set (match_operand:MMXINT8 0 "nonimmediate_operand" "")
! 	(match_operand:MMXINT8 1 "nonimmediate_operand" ""))]
    "TARGET_MMX"
  {
    ix86_expand_vector_move (<MODE>mode, operands);
--- 19842,19852 ----
  	       (const_string "TI")))])
  
  ;; 8 byte integral modes handled by MMX (and by extension, SSE)
! (define_mode_macro MMXMODEI [V8QI V4HI V2SI])
  
  (define_expand "mov<mode>"
!   [(set (match_operand:MMXMODEI 0 "nonimmediate_operand" "")
! 	(match_operand:MMXMODEI 1 "nonimmediate_operand" ""))]
    "TARGET_MMX"
  {
    ix86_expand_vector_move (<MODE>mode, operands);
***************
*** 19854,19862 ****
  })
  
  (define_insn "*mov<mode>_internal"
!   [(set (match_operand:MMXINT8 0 "nonimmediate_operand"
  					"=y,y ,m,!y,!*Y,*x,?*x,?m")
! 	(match_operand:MMXINT8 1 "vector_move_operand"
  					"C ,ym,y,*Y,y  ,C ,*xm,*x"))]
    "TARGET_MMX
     && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)"
--- 19854,19862 ----
  })
  
  (define_insn "*mov<mode>_internal"
!   [(set (match_operand:MMXMODEI 0 "nonimmediate_operand"
  					"=y,y ,m,!y,!*Y,*x,?*x,?m")
! 	(match_operand:MMXMODEI 1 "vector_move_operand"
  					"C ,ym,y,*Y,y  ,C ,*xm,*x"))]
    "TARGET_MMX
     && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)"
***************
*** 20103,20108 ****
--- 20103,20132 ----
    [(const_int 0)]
    "ix86_split_long_move (operands); DONE;")
  
+ ;; All 16-byte vector modes handled by SSE
+ (define_mode_macro SSEMODE [V16QI V8HI V4SI V2DI V4SF V2DF])
+ 
+ (define_expand "movmisalign<mode>"
+   [(set (match_operand:SSEMODE 0 "nonimmediate_operand" "")
+ 	(match_operand:SSEMODE 1 "nonimmediate_operand" ""))]
+   "TARGET_SSE"
+ {
+   ix86_expand_vector_move_misalign (<MODE>mode, operands);
+   DONE;
+ })
+ 
+ ;; All 8-byte vector modes handled by MMX
+ (define_mode_macro MMXMODE [V8QI V4HI V2SI V2SF])
+ 
+ (define_expand "movmisalign<mode>"
+   [(set (match_operand:MMXMODE 0 "nonimmediate_operand" "")
+ 	(match_operand:MMXMODE 1 "nonimmediate_operand" ""))]
+   "TARGET_MMX"
+ {
+   ix86_expand_vector_move (<MODE>mode, operands);
+   DONE;
+ })
+ 
  ;; These two patterns are useful for specifying exactly whether to use
  ;; movaps or movups
  (define_expand "sse_movaps"


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]