This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Better SSE vector composition/decomposition code


Hi,
we are still hitting problems with the vector extensions on SSE: they both
produce lousy code and cause compiler ICEs by using impossible subregs to
access individual scalar fields of the vectors.  This is obviously not the way
to go, so here is an attempt to do something about it.  It does not implement
all vector modes, just packed floats and doubles.

I've added new named patterns to set/extract individual fields and to
initialize a whole vector at once (as this can be done more effectively than
setting it part by part).  The same is true of vector operations expanded to
scalars, but I don't see a good way to describe this to the middle end, so it
probably makes most sense to simply define machine-specific patterns for each
such operation rather than adding hooks.

The patch has been bootstrapped/regtested on i686-pc-linux-gnu and
x86_64-pc-linux-gnu, and if I add the rest of the modes, it would solve some of
the open SSE PRs.  Does this look like a sane approach?  Would something like
this be acceptable for mainline?

Honza
/* Double check that the vector initializers and basic operations work
   properly.  */
extern void abort (void);

typedef int v2df __attribute__ ((mode (V2DF)));
union a
{
  v2df v;
  double s[2];
};
double a1 = 1;
double b1 = 3;

/* Kept in a separate noinline function to work around stack alignment
   issues on i386.  */
__attribute__ ((noinline)) void
notmain (void)
{
  union a r;
  v2df a = { a1, 2 };
  v2df b = { b1, 4 };

  /* Element-wise sum; the expected result is { 1 + 3, 2 + 4 }.  */
  r.v = a + b;
  if (r.s[0] != 4)
    abort ();
  if (r.s[1] != 6)
    abort ();
}

int
main (void)
{
  notmain ();
  return 0;
}


/* Double check that the vector initializers and basic operations work
   properly.  */
extern void abort (void);

typedef int v4sf __attribute__ ((mode (V4SF)));
union a
{
  v4sf v;
  float s[4];
};
float a1 = 1;
float b1 = 5;

/* Kept in a separate noinline function to work around stack alignment
   issues on i386.  */
__attribute__ ((noinline)) void
notmain (void)
{
  union a r;
  v4sf a = { a1, 2, 3, 4 };
  v4sf b = { b1, 6, 7, 8 };

  /* Element-wise sum; the expected result is { 6, 8, 10, 12 }.  */
  r.v = a + b;
  if (r.s[0] != 6)
    abort ();
  if (r.s[1] != 8)
    abort ();
  if (r.s[2] != 10)
    abort ();
  if (r.s[3] != 12)
    abort ();
}

int
main (void)
{
  notmain ();
  return 0;
}
	* expmed.c (store_bit_field, extract_bit_field): Use new named patterns.
	* expr.c (store_constructor): Use vec_init pattern.
	* genopinit.c (optabs): Initialize vec_set/vec_extract/vec_init.
	* optabs.c (init_optabs): Likewise.
	* optabs.h (optab_index): Add OTI_vec_set/OTI_vec_extract/OTI_vec_init.
	(vec_set_optab, vec_extract_optab, vec_init_optab): New.
	* i386.md (vec_setv2df, vec_extractv2df, vec_initv2df, vec_setv4sf,
	vec_extractv4sf, vec_initv4sf): New patterns.
	(sse2_unpc?pd): Fix pattern.
	(sse2_movlpd): Kill.
	(sse2_movsd): Deal with movlpd too.
	* i386.c (ix86_expand_builtin): Use sse2_movsd instead of sse2_movlpd.
	(ix86_expand_vector_init): New.
	* i386-protos.h (ix86_expand_vector_init): Declare.
	* emmintrin.h (_mm_set_pd): Use vector extensions.
	* xmmintrin.h (_mm_set_ps): Likewise.
	(__v4si): Remove.
	* md.texi (vec_set, vec_extract): Document.
Index: expmed.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/expmed.c,v
retrieving revision 1.147
diff -c -3 -p -r1.147 expmed.c
*** expmed.c	8 Dec 2003 22:39:51 -0000	1.147
--- expmed.c	29 Dec 2003 23:39:19 -0000
*************** store_bit_field (rtx str_rtx, unsigned H
*** 313,318 ****
--- 313,365 ----
  
    value = protect_from_queue (value, 0);
  
+   /* Use vec_set patterns for setting parts of vectors whenever
+      available.  */
+   if (VECTOR_MODE_P (GET_MODE (op0))
+       && GET_CODE (op0) != MEM
+       && (vec_set_optab->handlers[(int)GET_MODE (op0)].insn_code
+ 	  != CODE_FOR_nothing)
+       && fieldmode == GET_MODE_INNER (GET_MODE (op0))
+       && bitsize == GET_MODE_BITSIZE (GET_MODE_INNER (GET_MODE (op0)))
+       && !(bitnum % GET_MODE_BITSIZE (GET_MODE_INNER (GET_MODE (op0)))))
+     {
+       enum machine_mode outermode = GET_MODE (op0);
+       enum machine_mode innermode = GET_MODE_INNER (outermode);
+       int icode = (int) vec_set_optab->handlers[(int) outermode].insn_code;
+       int pos = bitnum / GET_MODE_BITSIZE (innermode);
+       rtx rtxpos = GEN_INT (pos);
+       rtx src = value;
+       rtx dest = op0;
+       rtx pat, seq;
+       enum machine_mode mode0 = insn_data[icode].operand[0].mode;
+       enum machine_mode mode1 = insn_data[icode].operand[1].mode;
+       enum machine_mode mode2 = insn_data[icode].operand[2].mode;
+ 
+       start_sequence ();
+ 
+       if (! (*insn_data[icode].operand[1].predicate) (src, mode1))
+ 	src = copy_to_mode_reg (mode1, src);
+ 
+       if (! (*insn_data[icode].operand[2].predicate) (rtxpos, mode2))
+ 	rtxpos = copy_to_mode_reg (mode2, rtxpos);
+ 
+       /* We could handle this, but we should always be called with a pseudo
+ 	 for our targets and all insns should take them as outputs.  */
+       if (! (*insn_data[icode].operand[0].predicate) (dest, mode0)
+ 	  || ! (*insn_data[icode].operand[1].predicate) (src, mode1)
+ 	  || ! (*insn_data[icode].operand[2].predicate) (rtxpos, mode2))
+ 	abort ();
+       pat = GEN_FCN (icode) (dest, src, rtxpos);
+       seq = get_insns ();
+       end_sequence ();
+       if (pat)
+ 	{
+ 	  emit_insn (seq);
+ 	  emit_insn (pat);
+ 	  return dest;
+ 	}
+     }
+ 
    if (flag_force_mem)
      {
        int old_generating_concat_p = generating_concat_p;
*************** extract_bit_field (rtx str_rtx, unsigned
*** 1033,1038 ****
--- 1080,1141 ----
      {
        /* We're trying to extract a full register from itself.  */
        return op0;
+     }
+ 
+   /* Use vec_extract patterns for extracting parts of vectors whenever
+      available and the field lies entirely within a single element.  */
+   if (VECTOR_MODE_P (GET_MODE (op0))
+       && GET_CODE (op0) != MEM
+       && (vec_extract_optab->handlers[(int)GET_MODE (op0)].insn_code
+ 	  != CODE_FOR_nothing)
+       && ((bitnum + bitsize - 1) / GET_MODE_BITSIZE (GET_MODE_INNER (GET_MODE (op0)))
+ 	  == bitnum / GET_MODE_BITSIZE (GET_MODE_INNER (GET_MODE (op0)))))
+     {
+       enum machine_mode outermode = GET_MODE (op0);
+       enum machine_mode innermode = GET_MODE_INNER (outermode);
+       int icode = (int) vec_extract_optab->handlers[(int) outermode].insn_code;
+       int pos = bitnum / GET_MODE_BITSIZE (innermode);
+       rtx rtxpos = GEN_INT (pos);
+       rtx src = op0;
+       rtx dest = NULL, pat, seq;
+       enum machine_mode mode0 = insn_data[icode].operand[0].mode;
+       enum machine_mode mode1 = insn_data[icode].operand[1].mode;
+       enum machine_mode mode2 = insn_data[icode].operand[2].mode;
+ 
+       if (innermode == tmode || innermode == mode)
+ 	dest = target;
+ 
+       if (!dest)
+ 	dest = gen_reg_rtx (innermode);
+ 
+       start_sequence ();
+ 
+       if (! (*insn_data[icode].operand[0].predicate) (dest, mode0))
+ 	dest = copy_to_mode_reg (mode0, dest);
+ 
+       if (! (*insn_data[icode].operand[1].predicate) (src, mode1))
+ 	src = copy_to_mode_reg (mode1, src);
+ 
+       if (! (*insn_data[icode].operand[2].predicate) (rtxpos, mode2))
+ 	rtxpos = copy_to_mode_reg (mode2, rtxpos);
+ 
+       /* We could handle this, but we should always be called with a pseudo
+ 	 for our targets and all insns should take them as outputs.  */
+       if (! (*insn_data[icode].operand[0].predicate) (dest, mode0)
+ 	  || ! (*insn_data[icode].operand[1].predicate) (src, mode1)
+ 	  || ! (*insn_data[icode].operand[2].predicate) (rtxpos, mode2))
+ 	abort ();
+       pat = GEN_FCN (icode) (dest, src, rtxpos);
+       seq = get_insns ();
+       end_sequence ();
+       if (pat)
+ 	{
+ 	  emit_insn (seq);
+ 	  emit_insn (pat);
+ 	  return extract_bit_field (dest, bitsize,
+ 				    bitnum - pos * GET_MODE_BITSIZE (innermode),
+ 				    unsignedp, target, mode, tmode, total_size);
+ 	}
      }
  
    /* Make sure we are playing with integral modes.  Pun with subregs
Index: expr.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/expr.c,v
retrieving revision 1.608
diff -c -3 -p -r1.608 expr.c
*** expr.c	20 Dec 2003 01:40:40 -0000	1.608
--- expr.c	29 Dec 2003 23:39:27 -0000
*************** store_constructor (tree exp, rtx target,
*** 4708,4713 ****
--- 4708,4717 ----
        int const_bounds_p;
        HOST_WIDE_INT minelt = 0;
        HOST_WIDE_INT maxelt = 0;
+       int icode = 0;
+       rtx *vector = NULL;
+       int elt_size = 0;
+       unsigned n_elts = 0;
  
        /* Vectors are like arrays, but the domain is stored via an array
  	 type indirectly.  */
*************** store_constructor (tree exp, rtx target,
*** 4718,4723 ****
--- 4722,4743 ----
  	     it always will.  */
  	  domain = TYPE_DEBUG_REPRESENTATION_TYPE (type);
  	  domain = TYPE_DOMAIN (TREE_TYPE (TYPE_FIELDS (domain)));
+ 	  if (REG_P (target) && VECTOR_MODE_P (GET_MODE (target)))
+ 	    {
+ 	      enum machine_mode mode = GET_MODE (target);
+ 
+ 	      icode = (int) vec_init_optab->handlers[mode].insn_code;
+ 	      if (icode != CODE_FOR_nothing)
+ 		{
+ 		  unsigned int i;
+ 		  /* One rtx slot per vector element, preset to zero.  */
+ 		  elt_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
+ 		  n_elts = (GET_MODE_SIZE (mode) / elt_size);
+ 		  vector = alloca (n_elts * sizeof (rtx));
+ 		  for (i = 0; i < n_elts; i++)
+ 		    vector [i] = CONST0_RTX (GET_MODE_INNER (mode));
+ 		}
+ 	    }
  	}
  
        const_bounds_p = (TYPE_MIN_VALUE (domain)
*************** store_constructor (tree exp, rtx target,
*** 4782,4788 ****
  	    need_to_clear = 1;
  	}
  
!       if (need_to_clear && size > 0)
  	{
  	  if (! cleared)
  	    {
--- 4802,4808 ----
  	    need_to_clear = 1;
  	}
  
!       if (need_to_clear && size > 0 && !vector)
  	{
  	  if (! cleared)
  	    {
*************** store_constructor (tree exp, rtx target,
*** 4833,4838 ****
--- 4853,4861 ----
  	      HOST_WIDE_INT lo, hi, count;
  	      tree position;
  
+ 	      if (vector)
+ 		abort ();
+ 
  	      /* If the range is constant and "small", unroll the loop.  */
  	      if (const_bounds_p
  		  && host_integerp (lo_index, 0)
*************** store_constructor (tree exp, rtx target,
*** 4924,4929 ****
--- 4947,4955 ----
  	    {
  	      tree position;
  
+ 	      if (vector)
+ 		abort ();
+ 
  	      if (index == 0)
  		index = ssize_int (1);
  
*************** store_constructor (tree exp, rtx target,
*** 4941,4946 ****
--- 4967,4982 ----
  	      xtarget = adjust_address (xtarget, mode, 0);
  	      store_expr (value, xtarget, 0);
  	    }
+ 	  else if (vector)
+ 	    {
+ 	      int pos;
+ 
+ 	      if (index != 0)
+ 		pos = tree_low_cst (index, 0) - minelt;
+ 	      else
+ 		pos = i;
+ 	      vector[pos] = expand_expr (value, NULL_RTX, VOIDmode, 0);
+ 	    }
  	  else
  	    {
  	      if (index != 0)
*************** store_constructor (tree exp, rtx target,
*** 4956,4966 ****
  		  target = copy_rtx (target);
  		  MEM_KEEP_ALIAS_SET_P (target) = 1;
  		}
! 
! 	      store_constructor_field (target, bitsize, bitpos, mode, value,
! 				       type, cleared, get_alias_set (elttype));
! 
  	    }
  	}
      }
  
--- 4992,5007 ----
  		  target = copy_rtx (target);
  		  MEM_KEEP_ALIAS_SET_P (target) = 1;
  		}
! 	      /* Store unconditionally; the test above only adjusts TARGET.  */
! 	      store_constructor_field (target, bitsize, bitpos, mode, value,
! 				       type, cleared, get_alias_set (elttype));
  	    }
+ 	}
+       if (vector)
+ 	{
+ 	  emit_insn (GEN_FCN (icode) (target,
+ 				      gen_rtx_PARALLEL (GET_MODE (target),
+ 						        gen_rtvec_v (n_elts, vector))));
  	}
      }
  
Index: genopinit.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/genopinit.c,v
retrieving revision 1.64
diff -c -3 -p -r1.64 genopinit.c
*** genopinit.c	13 Oct 2003 21:16:18 -0000	1.64
--- genopinit.c	29 Dec 2003 23:39:27 -0000
*************** static const char * const optabs[] =
*** 152,158 ****
    "movstr_optab[$A] = CODE_FOR_$(movstr$a$)",
    "clrstr_optab[$A] = CODE_FOR_$(clrstr$a$)",
    "cmpstr_optab[$A] = CODE_FOR_$(cmpstr$a$)",
!   "cmpmem_optab[$A] = CODE_FOR_$(cmpmem$a$)" };
  
  static void gen_insn (rtx);
  
--- 152,161 ----
    "movstr_optab[$A] = CODE_FOR_$(movstr$a$)",
    "clrstr_optab[$A] = CODE_FOR_$(clrstr$a$)",
    "cmpstr_optab[$A] = CODE_FOR_$(cmpstr$a$)",
!   "cmpmem_optab[$A] = CODE_FOR_$(cmpmem$a$)",
!   "vec_set_optab->handlers[$A].insn_code = CODE_FOR_$(vec_set$a$)",
!   "vec_extract_optab->handlers[$A].insn_code = CODE_FOR_$(vec_extract$a$)",
!   "vec_init_optab->handlers[$A].insn_code = CODE_FOR_$(vec_init$a$)" };
  
  static void gen_insn (rtx);
  
Index: optabs.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/optabs.c,v
retrieving revision 1.203
diff -c -3 -p -r1.203 optabs.c
*** optabs.c	21 Nov 2003 06:52:23 -0000	1.203
--- optabs.c	29 Dec 2003 23:39:29 -0000
*************** init_optabs (void)
*** 5271,5276 ****
--- 5307,5315 ----
    cstore_optab = init_optab (UNKNOWN);
    push_optab = init_optab (UNKNOWN);
  
+   vec_extract_optab = init_optab (UNKNOWN);
+   vec_set_optab = init_optab (UNKNOWN);
+   vec_init_optab = init_optab (UNKNOWN);
    /* Conversions.  */
    sext_optab = init_convert_optab (SIGN_EXTEND);
    zext_optab = init_convert_optab (ZERO_EXTEND);
Index: optabs.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/optabs.h,v
retrieving revision 1.19
diff -c -3 -p -r1.19 optabs.h
*** optabs.h	7 Oct 2003 07:25:32 -0000	1.19
--- optabs.h	29 Dec 2003 23:39:29 -0000
*************** enum optab_index
*** 197,202 ****
--- 197,209 ----
    /* Conditional add instruction.  */
    OTI_addcc,
  
+   /* Set specified field of vector operand.  */
+   OTI_vec_set,
+   /* Extract specified field of vector operand.  */
+   OTI_vec_extract,
+   /* Initialize vector operand.  */
+   OTI_vec_init,
+ 
    OTI_MAX
  };
  
*************** extern GTY(()) optab optab_table[OTI_MAX
*** 280,285 ****
--- 287,296 ----
  #define cstore_optab (optab_table[OTI_cstore])
  #define push_optab (optab_table[OTI_push])
  #define addcc_optab (optab_table[OTI_addcc])
+ 
+ #define vec_set_optab (optab_table[OTI_vec_set])
+ #define vec_extract_optab (optab_table[OTI_vec_extract])
+ #define vec_init_optab (optab_table[OTI_vec_init])
  
  /* Conversion optabs have their own table and indexes.  */
  enum convert_optab_index
Index: config/i386/emmintrin.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/emmintrin.h,v
retrieving revision 1.3
diff -c -3 -p -r1.3 emmintrin.h
*** config/i386/emmintrin.h	26 Sep 2003 03:28:27 -0000	1.3
--- config/i386/emmintrin.h	29 Dec 2003 23:39:42 -0000
*************** _mm_set_pd1 (double __F)
*** 115,129 ****
  static __inline __m128d
  _mm_set_pd (double __Z, double __Y)
  {
!   union {
!     double __a[2];
!     __m128d __v;
!   } __u;
! 
!   __u.__a[0] = __Y;
!   __u.__a[1] = __Z;
! 
!   return __u.__v;
  }
  
  /* Create the vector [Y Z].  */
--- 115,121 ----
  static __inline __m128d
  _mm_set_pd (double __Z, double __Y)
  {
!   return (__v2df) {__Y, __Z};
  }
  
  /* Create the vector [Y Z].  */
Index: config/i386/i386-protos.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386-protos.h,v
retrieving revision 1.102
diff -c -3 -p -r1.102 i386-protos.h
*** config/i386/i386-protos.h	30 Oct 2003 02:02:38 -0000	1.102
--- config/i386/i386-protos.h	29 Dec 2003 23:39:42 -0000
*************** extern int x86_field_alignment (tree, in
*** 218,223 ****
--- 218,224 ----
  extern rtx ix86_tls_get_addr (void);
  extern bool ix86_must_pass_in_stack (enum machine_mode mode, tree);
  
+ extern void ix86_expand_vector_init (rtx, rtx);
  /* In winnt.c  */
  extern int i386_pe_dllexport_name_p (const char *);
  extern int i386_pe_dllimport_name_p (const char *);
Index: config/i386/i386.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.c,v
retrieving revision 1.626
diff -c -3 -p -r1.626 i386.c
*** config/i386/i386.c	27 Dec 2003 05:56:15 -0000	1.626
--- config/i386/i386.c	29 Dec 2003 23:39:50 -0000
*************** ix86_expand_builtin (tree exp, rtx targe
*** 14081,14087 ****
        icode = (fcode == IX86_BUILTIN_LOADHPS ? CODE_FOR_sse_movhps
  	       : fcode == IX86_BUILTIN_LOADLPS ? CODE_FOR_sse_movlps
  	       : fcode == IX86_BUILTIN_LOADHPD ? CODE_FOR_sse2_movhpd
! 	       : CODE_FOR_sse2_movlpd);
        arg0 = TREE_VALUE (arglist);
        arg1 = TREE_VALUE (TREE_CHAIN (arglist));
        op0 = expand_expr (arg0, NULL_RTX, VOIDmode, 0);
--- 14085,14091 ----
        icode = (fcode == IX86_BUILTIN_LOADHPS ? CODE_FOR_sse_movhps
  	       : fcode == IX86_BUILTIN_LOADLPS ? CODE_FOR_sse_movlps
  	       : fcode == IX86_BUILTIN_LOADHPD ? CODE_FOR_sse2_movhpd
! 	       : CODE_FOR_sse2_movsd);
        arg0 = TREE_VALUE (arglist);
        arg1 = TREE_VALUE (TREE_CHAIN (arglist));
        op0 = expand_expr (arg0, NULL_RTX, VOIDmode, 0);
*************** ix86_expand_builtin (tree exp, rtx targe
*** 14110,14116 ****
        icode = (fcode == IX86_BUILTIN_STOREHPS ? CODE_FOR_sse_movhps
  	       : fcode == IX86_BUILTIN_STORELPS ? CODE_FOR_sse_movlps
  	       : fcode == IX86_BUILTIN_STOREHPD ? CODE_FOR_sse2_movhpd
! 	       : CODE_FOR_sse2_movlpd);
        arg0 = TREE_VALUE (arglist);
        arg1 = TREE_VALUE (TREE_CHAIN (arglist));
        op0 = expand_expr (arg0, NULL_RTX, VOIDmode, 0);
--- 14114,14120 ----
        icode = (fcode == IX86_BUILTIN_STOREHPS ? CODE_FOR_sse_movhps
  	       : fcode == IX86_BUILTIN_STORELPS ? CODE_FOR_sse_movlps
  	       : fcode == IX86_BUILTIN_STOREHPD ? CODE_FOR_sse2_movhpd
! 	       : CODE_FOR_sse2_movsd);
        arg0 = TREE_VALUE (arglist);
        arg1 = TREE_VALUE (TREE_CHAIN (arglist));
        op0 = expand_expr (arg0, NULL_RTX, VOIDmode, 0);
*************** ix86_must_pass_in_stack (enum machine_mo
*** 15788,15793 ****
--- 15792,15885 ----
     if (default_must_pass_in_stack (mode, type))
       return true;
     return (!TARGET_64BIT && type && mode == TImode);
+ }
+ 
+ /* Initialize vector TARGET via VALS.  */
+ void
+ ix86_expand_vector_init (rtx target, rtx vals)
+ {
+   enum machine_mode mode = GET_MODE (target);
+   int elt_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
+   int n_elts = (GET_MODE_SIZE (mode) / elt_size);
+   int i;
+   
+   for (i = n_elts - 1; i >= 0; i--)
+     if (GET_CODE (XVECEXP (vals, 0, i)) != CONST_INT
+ 	&& GET_CODE (XVECEXP (vals, 0, i)) != CONST_DOUBLE)
+       break;
+ 
+   /* Few special cases first...  
+      ... constants are best loaded from constant pool.  */
+   if (i < 0)
+     {
+       emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
+       return;
+     }
+ 
+   /* ... values where only first field is non-constant are best loaded
+      from the pool and overwriten via move later.  */
+   if (!i)
+     {
+       rtx op = simplify_gen_subreg (mode, XVECEXP (vals, 0, 0),
+ 				    GET_MODE_INNER (mode), 0);
+ 
+       op = force_reg (mode, op);
+       XVECEXP (vals, 0, 0) = CONST0_RTX (GET_MODE_INNER (mode));
+       emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
+       switch (GET_MODE (target))
+ 	{
+ 	  case V2DFmode:
+ 	    emit_insn (gen_sse2_movsd (target, target, op));
+ 	    break;
+ 	  case V4SFmode:
+ 	    emit_insn (gen_sse_movss (target, target, op));
+ 	    break;
+ 	  default:
+ 	    break;
+ 	}
+       return;
+     }
+ 
+   /* And the busy sequence doing rotations.  */
+   switch (GET_MODE (target))
+     {
+       case V2DFmode:
+ 	{
+ 	  rtx vecop0 =
+ 	    simplify_gen_subreg (V2DFmode, XVECEXP (vals, 0, 0), DFmode, 0);
+ 	  rtx vecop1 =
+ 	    simplify_gen_subreg (V2DFmode, XVECEXP (vals, 0, 1), DFmode, 0);
+ 
+ 	  vecop0 = force_reg (V2DFmode, vecop0);
+ 	  vecop1 = force_reg (V2DFmode, vecop1);
+ 	  emit_insn (gen_sse2_unpcklpd (target, vecop0, vecop1));
+ 	}
+ 	break;
+       case V4SFmode:
+ 	{
+ 	  rtx vecop0 =
+ 	    simplify_gen_subreg (V4SFmode, XVECEXP (vals, 0, 0), SFmode, 0);
+ 	  rtx vecop1 =
+ 	    simplify_gen_subreg (V4SFmode, XVECEXP (vals, 0, 1), SFmode, 0);
+ 	  rtx vecop2 =
+ 	    simplify_gen_subreg (V4SFmode, XVECEXP (vals, 0, 2), SFmode, 0);
+ 	  rtx vecop3 =
+ 	    simplify_gen_subreg (V4SFmode, XVECEXP (vals, 0, 3), SFmode, 0);
+ 	  rtx tmp1 = gen_reg_rtx (V4SFmode);
+ 	  rtx tmp2 = gen_reg_rtx (V4SFmode);
+ 
+ 	  vecop0 = force_reg (V4SFmode, vecop0);
+ 	  vecop1 = force_reg (V4SFmode, vecop1);
+ 	  vecop2 = force_reg (V4SFmode, vecop2);
+ 	  vecop3 = force_reg (V4SFmode, vecop3);
+ 	  emit_insn (gen_sse_unpcklps (tmp1, vecop1, vecop3));
+ 	  emit_insn (gen_sse_unpcklps (tmp2, vecop0, vecop2));
+ 	  emit_insn (gen_sse_unpcklps (target, tmp2, tmp1));
+ 	}
+ 	break;
+       default:
+ 	abort ();
+     }
  }
  
  #include "gt-i386.h"
Index: config/i386/i386.md
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.md,v
retrieving revision 1.494
diff -c -3 -p -r1.494 i386.md
*** config/i386/i386.md	13 Dec 2003 04:44:05 -0000	1.494
--- config/i386/i386.md	29 Dec 2003 23:39:57 -0000
***************
*** 4729,4734 ****
--- 4729,4899 ----
    "TARGET_SSE2 && TARGET_SSE_MATH && TARGET_64BIT"
    "x86_emit_floatuns (operands); DONE;")
  
+ ;; SSE extract/set expanders
+ 
+ (define_expand "vec_setv2df"
+   [(match_operand:V2DF 0 "register_operand" "")
+    (match_operand:DF 1 "register_operand" "")
+    (match_operand 2 "const_int_operand" "")]
+   "TARGET_SSE2"
+ {
+   switch (INTVAL (operands[2]))
+     {
+     case 0:
+       emit_insn (gen_sse2_movsd (operands[0], operands[0],
+ 				 simplify_gen_subreg (V2DFmode, operands[1],
+ 						      DFmode, 0)));
+       break;
+     case 1:
+       {
+ 	rtx op1 = simplify_gen_subreg (V2DFmode, operands[1], DFmode, 0);
+ 
+ 	emit_insn (gen_sse2_unpcklpd (operands[0], operands[0], op1));
+       }
+       break;
+     default:
+       abort ();
+     }
+   DONE;
+ })
+ 
+ (define_expand "vec_extractv2df"
+   [(match_operand:DF 0 "register_operand" "")
+    (match_operand:V2DF 1 "register_operand" "")
+    (match_operand 2 "const_int_operand" "")]
+   "TARGET_SSE2"
+ {
+   switch (INTVAL (operands[2]))
+     {
+     case 0:
+       emit_move_insn (operands[0], gen_lowpart (DFmode, operands[1]));
+       break;
+     case 1:
+       {
+ 	rtx dest = simplify_gen_subreg (V2DFmode, operands[0], DFmode, 0);
+ 
+ 	emit_insn (gen_sse2_unpckhpd (dest, operands[1], operands[1]));
+       }
+       break;
+     default:
+       abort ();
+     }
+   DONE;
+ })
+ 
+ (define_expand "vec_initv2df"
+   [(match_operand:V2DF 0 "register_operand" "")
+    (match_operand 1 "" "")]
+   "TARGET_SSE2"
+ {
+   ix86_expand_vector_init (operands[0], operands[1]);
+   DONE;
+ })
+ 
+ (define_expand "vec_setv4sf"
+   [(match_operand:V4SF 0 "register_operand" "")
+    (match_operand:SF 1 "register_operand" "")
+    (match_operand 2 "const_int_operand" "")]
+   "TARGET_SSE"
+ {
+   switch (INTVAL (operands[2]))
+     {
+     case 0:
+       emit_insn (gen_sse_movss (operands[0], operands[0],
+ 				simplify_gen_subreg (V4SFmode, operands[1],
+ 						     SFmode, 0)));
+       break;
+     case 1:
+       {
+ 	rtx op1 = simplify_gen_subreg (V4SFmode, operands[1], SFmode, 0);
+ 	rtx tmp = gen_reg_rtx (V4SFmode);
+         emit_move_insn (tmp, operands[0]);
+ 	emit_insn (gen_sse_unpcklps (operands[0], operands[0], operands[0]));
+ 	emit_insn (gen_sse_movss (operands[0], operands[0], op1));
+         emit_insn (gen_sse_shufps (operands[0], operands[0], tmp,
+                                    GEN_INT (1 + (0<<2) + (2<<4) + (3<<6))));
+       }
+       break;
+     case 2:
+       {
+         rtx op1 = simplify_gen_subreg (V4SFmode, operands[1], SFmode, 0);
+         rtx tmp = gen_reg_rtx (V4SFmode);
+ 
+         emit_move_insn (tmp, operands[0]);
+         emit_insn (gen_sse_movss (tmp, tmp, op1));
+         emit_insn (gen_sse_shufps (operands[0], operands[0], tmp,
+                                    GEN_INT (0 + (1<<2) + (0<<4) + (3<<6))));
+       }
+       break;
+     case 3:
+       {
+         rtx op1 = simplify_gen_subreg (V4SFmode, operands[1], SFmode, 0);
+         rtx tmp = gen_reg_rtx (V4SFmode);
+ 
+         emit_move_insn (tmp, operands[0]);
+         emit_insn (gen_sse_movss (tmp, tmp, op1));
+         emit_insn (gen_sse_shufps (operands[0], operands[0], tmp,
+                                    GEN_INT (0 + (1<<2) + (2<<4) + (0<<6))));
+       }
+       break;
+     default:
+       abort ();
+     }
+   DONE;
+ })
+ 
+ (define_expand "vec_extractv4sf"
+   [(match_operand:SF 0 "register_operand" "")
+    (match_operand:V4SF 1 "register_operand" "")
+    (match_operand 2 "const_int_operand" "")]
+   "TARGET_SSE"
+ {
+   switch (INTVAL (operands[2]))
+     {
+     case 0:
+       emit_move_insn (operands[0], gen_lowpart (SFmode, operands[1]));
+       break;
+     case 1:
+       {
+ 	rtx op0 = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+ 	rtx tmp = gen_reg_rtx (V4SFmode);
+         emit_move_insn (tmp, operands[1]);
+         emit_insn (gen_sse_shufps (op0, tmp, tmp,
+                                    GEN_INT (1)));
+       }
+       break;
+     case 2:
+       {
+ 	rtx op0 = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+ 	rtx tmp = gen_reg_rtx (V4SFmode);
+         emit_move_insn (tmp, operands[1]);
+         emit_insn (gen_sse_unpckhps (op0, tmp, tmp));
+       }
+       break;
+     case 3:
+       {
+ 	rtx op0 = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+ 	rtx tmp = gen_reg_rtx (V4SFmode);
+         emit_move_insn (tmp, operands[1]);
+         emit_insn (gen_sse_shufps (op0, tmp, tmp,
+                                    GEN_INT (3)));
+       }
+       break;
+     default:
+       abort ();
+     }
+   DONE;
+ })
+ 
+ (define_expand "vec_initv4sf"
+   [(match_operand:V4SF 0 "register_operand" "")
+    (match_operand 1 "" "")]
+   "TARGET_SSE"
+ {
+   ix86_expand_vector_init (operands[0], operands[1]);
+   DONE;
+ })
+ 
  ;; Add instructions
  
  ;; %%% splits for addsidi3
***************
*** 22359,22369 ****
  	 (vec_select:DF (match_operand:V2DF 1 "register_operand" "0")
  			(parallel [(const_int 1)]))
  	 (vec_select:DF (match_operand:V2DF 2 "register_operand" "x")
! 			(parallel [(const_int 0)]))))]
    "TARGET_SSE2"
    "unpckhpd\t{%2, %0|%0, %2}"
    [(set_attr "type" "ssecvt")
!    (set_attr "mode" "TI")])
  
  (define_insn "sse2_unpcklpd"
    [(set (match_operand:V2DF 0 "register_operand" "=x")
--- 22524,22534 ----
  	 (vec_select:DF (match_operand:V2DF 1 "register_operand" "0")
  			(parallel [(const_int 1)]))
  	 (vec_select:DF (match_operand:V2DF 2 "register_operand" "x")
! 			(parallel [(const_int 1)]))))]
    "TARGET_SSE2"
    "unpckhpd\t{%2, %0|%0, %2}"
    [(set_attr "type" "ssecvt")
!    (set_attr "mode" "V2DF")])
  
  (define_insn "sse2_unpcklpd"
    [(set (match_operand:V2DF 0 "register_operand" "=x")
***************
*** 22371,22381 ****
  	 (vec_select:DF (match_operand:V2DF 1 "register_operand" "0")
  			(parallel [(const_int 0)]))
  	 (vec_select:DF (match_operand:V2DF 2 "register_operand" "x")
! 			(parallel [(const_int 1)]))))]
    "TARGET_SSE2"
    "unpcklpd\t{%2, %0|%0, %2}"
    [(set_attr "type" "ssecvt")
!    (set_attr "mode" "TI")])
  
  ;; MMX pack/unpack insns.
  
--- 22536,22546 ----
  	 (vec_select:DF (match_operand:V2DF 1 "register_operand" "0")
  			(parallel [(const_int 0)]))
  	 (vec_select:DF (match_operand:V2DF 2 "register_operand" "x")
! 			(parallel [(const_int 0)]))))]
    "TARGET_SSE2"
    "unpcklpd\t{%2, %0|%0, %2}"
    [(set_attr "type" "ssecvt")
!    (set_attr "mode" "V2DF")])
  
  ;; MMX pack/unpack insns.
  
***************
*** 22691,22707 ****
    [(set_attr "type" "ssecvt")
     (set_attr "mode" "V2DF")])
  
- (define_insn "sse2_movlpd"
-   [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,m")
- 	(vec_merge:V2DF
- 	 (match_operand:V2DF 1 "nonimmediate_operand" "0,0")
- 	 (match_operand:V2DF 2 "nonimmediate_operand" "m,x")
- 	 (const_int 1)))]
-   "TARGET_SSE2 && (GET_CODE (operands[1]) == MEM || GET_CODE (operands[2]) == MEM)"
-   "movlpd\t{%2, %0|%0, %2}"
-   [(set_attr "type" "ssecvt")
-    (set_attr "mode" "V2DF")])
- 
  (define_expand "sse2_loadsd"
    [(match_operand:V2DF 0 "register_operand" "")
     (match_operand:DF 1 "memory_operand" "")]
--- 22856,22861 ----
***************
*** 22724,22738 ****
     (set_attr "mode" "DF")])
  
  (define_insn "sse2_movsd"
!   [(set (match_operand:V2DF 0 "register_operand" "=x")
  	(vec_merge:V2DF
! 	 (match_operand:V2DF 1 "register_operand" "0")
! 	 (match_operand:V2DF 2 "register_operand" "x")
  	 (const_int 1)))]
!   "TARGET_SSE2"
!   "movsd\t{%2, %0|%0, %2}"
    [(set_attr "type" "ssecvt")
!    (set_attr "mode" "DF")])
  
  (define_insn "sse2_storesd"
    [(set (match_operand:DF 0 "memory_operand" "=m")
--- 22878,22894 ----
     (set_attr "mode" "DF")])
  
  (define_insn "sse2_movsd"
!   [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,m")
  	(vec_merge:V2DF
! 	 (match_operand:V2DF 1 "nonimmediate_operand" "0,0,0")
! 	 (match_operand:V2DF 2 "nonimmediate_operand" "x,m,x")
  	 (const_int 1)))]
!   "TARGET_SSE2 && ix86_binary_operator_ok (UNKNOWN, V2DFmode, operands)"
!   "@movsd\t{%2, %0|%0, %2}
!     movlpd\t{%2, %0|%0, %2}
!     movlpd\t{%2, %0|%0, %2}"
    [(set_attr "type" "ssecvt")
!    (set_attr "mode" "DF,V2DF,V2DF")])
  
  (define_insn "sse2_storesd"
    [(set (match_operand:DF 0 "memory_operand" "=m")
Index: config/i386/xmmintrin.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/xmmintrin.h,v
retrieving revision 1.26
diff -c -3 -p -r1.26 xmmintrin.h
*** config/i386/xmmintrin.h	26 Sep 2003 04:07:46 -0000	1.26
--- config/i386/xmmintrin.h	29 Dec 2003 23:39:58 -0000
*************** typedef int __m128 __attribute__ ((__mod
*** 42,48 ****
  
  /* Internal data types for implementing the intrinsics.  */
  typedef int __v4sf __attribute__ ((__mode__(__V4SF__)));
- typedef int __v4si __attribute__ ((__mode__(__V4SI__)));
  
  /* Create a selector for use with the SHUFPS instruction.  */
  #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
--- 42,47 ----
*************** _mm_set_ps1 (float __F)
*** 890,908 ****
  
  /* Create the vector [Z Y X W].  */
  static __inline __m128
! _mm_set_ps (float __Z, float __Y, float __X, float __W)
  {
!   union {
!     float __a[4];
!     __m128 __v;
!   } __u;
! 
!   __u.__a[0] = __W;
!   __u.__a[1] = __X;
!   __u.__a[2] = __Y;
!   __u.__a[3] = __Z;
! 
!   return __u.__v;
  }
  
  /* Create the vector [W X Y Z].  */
--- 889,897 ----
  
  /* Create the vector [Z Y X W].  */
  static __inline __m128
! _mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
  {
!   return (__v4sf) {__W, __X, __Y, __Z};
  }
  
  /* Create the vector [W X Y Z].  */


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]