[PATCH 5/6] mips: Implement vec_perm_const.

Sun Dec 11 12:51:00 GMT 2011

[Mingjie, please could you help with the Loongson question near the end?]

Richard Henderson <rth@redhat.com> writes:
> @@ -89,61 +89,102 @@
>    DONE;
>  })
>  
> -; pul.ps - Pair Upper Lower
> -(define_insn "mips_pul_ps"
> +(define_insn "vec_perm_const_ps"
>    [(set (match_operand:V2SF 0 "register_operand" "=f")
> -	(vec_merge:V2SF
> -	 (match_operand:V2SF 1 "register_operand" "f")
> -	 (match_operand:V2SF 2 "register_operand" "f")
> -	 (const_int 2)))]
> +	(vec_select:V2SF
> +	  (vec_concat:V4SF
> +	    (match_operand:V2SF 1 "register_operand" "f")
> +	    (match_operand:V2SF 2 "register_operand" "f"))
> +	  (parallel [(match_operand:SI 3 "const_0_or_1_operand" "")
> +		     (match_operand:SI 4 "const_2_or_3_operand" "")])))]
>    "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
> -  "pul.ps\t%0,%1,%2"
> +{
> +  static const int * const mnemonics[2][4] = {
> +    /* LE */ { "pll.ps\t%0,%2,%1", "pul.ps\t%0,%2,%1",
> +	       "plu.ps\t%0,%2,%1", "puu.ps\t%0,%2,%1" },
> +    /* BE */ { "puu.ps\t%0,%1,%2", "plu.ps\t%0,%1,%2",
> +	       "pul.ps\t%0,%1,%2", "pll.ps\t%0,%1,%2" },
> +  };
> +
> +  unsigned mask = INTVAL (operands[3]) * 2 + (INTVAL (operands[4]) - 2);
> +  return mnemonics[WORDS_BIG_ENDIAN][mask];
> +}

So I stared at this for fully an hour trying to work out all the
various orderings (vec_concat operands always in memory order,
parallel selector always in memory order, GCC vector element 0
being "upper" on big-endian and "lower" on little-endian,
P??.PS always specifying the upper part of the result first, etc.).
I ended up with:

  /* Let <op>L be the lower part of operand <op> and <op>U be the upper part.
     The P[UL][UL].PS instruction always specifies the upper part of the
     result first, so the instruction is:

     	P<aUL><bUL>.PS %0,<aop>,<bop>

     where 0U == <aop><aUL> and 0L == <bop><bUL>.

     GCC's vector indices are specified in memory order, which means
     that vector element 0 is the lower part (L) on little-endian targets
     and the upper part (U) on big-endian targets.  vec_concat likewise
     concatenates in memory order, which means that operand 3 (being
     0 or 1) selects part of operand 1 and operand 4 (being 2 or 3)
     selects part of operand 2.

     Let:

	I3 = INTVAL (operands[3])
	I4 = INTVAL (operands[4]) - 2

     Taking the two endiannesses in turn:

     Little-endian:

        The semantics of the RTL pattern are:

	{ 0L, 0U } = { X[I3], X[I4 + 2] }, where X = { 1L, 1U, 2L, 2U }

	so: 0L = { 1L, 1U }[I3] (= <bop><bUL>)
	    0U = { 2L, 2U }[I4] (= <aop><aUL>)

	    <aop> = 2, <aUL> = I4 ? U : L
	    <bop> = 1, <bUL> = I3 ? U : L

	    [LL] !I4 && !I3   [UL] I4 && !I3
	    [LU] !I4 && I3    [UU] I4 && I3

     Big-endian:

        The semantics of the RTL pattern are:

	{ 0U, 0L } = { X[I3], X[I4 + 2] }, where X = { 1U, 1L, 2U, 2L }

	so: 0U = { 1U, 1L }[I3] (= <aop><aUL>)
	    0L = { 2U, 2L }[I4] (= <bop><bUL>)

	    <aop> = 1, <aUL> = I3 ? L : U
	    <bop> = 2, <bUL> = I4 ? L : U

	    [UU] !I3 && !I4   [UL] !I3 && I4
	    [LU] I3 && !I4    [LL] I3 && I4.  */

which suggests that the PUL and PLU entries for big-endian should be
the other way around.  Does that sound right, or have I misunderstood?

(Also, "const char *" rather than "const int *".)

The same confusion hit me with the expanders:

> +(define_expand "mips_pul_ps"
> +  [(match_operand:V2SF 0 "register_operand" "")
> +   (match_operand:V2SF 1 "register_operand" "")
> +   (match_operand:V2SF 2 "register_operand" "")]
> +  "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
> +{
> +  if (WORDS_BIG_ENDIAN)
> +    emit_insn (gen_vec_perm_const_ps (operands[0], operands[1], operands[2],
> +				      const0_rtx, const2_rtx));
> +  else
> +    emit_insn (gen_vec_perm_const_ps (operands[0], operands[2], operands[1],
> +				      const1_rtx, GEN_INT (3)));
> +  DONE;
> +})

This one looks like a pasto (copy-and-paste error): the operands given
here are the same as for mips_puu_ps.  But...

> +(define_expand "mips_plu_ps"
> +  [(match_operand:V2SF 0 "register_operand" "")
> +   (match_operand:V2SF 1 "register_operand" "")
> +   (match_operand:V2SF 2 "register_operand" "")]
> +  "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
> +{
> +  if (WORDS_BIG_ENDIAN)
> +    emit_insn (gen_vec_perm_const_ps (operands[0], operands[1], operands[2],
> +				      const1_rtx, const2_rtx));
> +  else
> +    emit_insn (gen_vec_perm_const_ps (operands[0], operands[2], operands[1],
> +				      const0_rtx, GEN_INT (3)));
> +  DONE;
> +})

...for little-endian, we need to pass the "U" and "L" components of the
mnemonic in the reverse order: the MIPS instruction specifies the upper
part first, whereas the rtl pattern specifies the lower part first.
And for little-endian, U refers to memory element 1 and L to memory
element 0.  So I think this should be:

  if (WORDS_BIG_ENDIAN)
    emit_insn (gen_vec_perm_const_ps (operands[0], operands[1], operands[2],
				      const1_rtx, const2_rtx));
  else
    emit_insn (gen_vec_perm_const_ps (operands[0], operands[2], operands[1],
				      const1_rtx, const2_rtx));

Again, please correct me if this isn't right :-)

> +/* Recognize patterns for the MIPS3D P[UL][UL].PS instructions.  */
> +
> +static bool
> +mips_expand_vpc_ps (struct expand_vec_perm_d *d)
> +{
> +  unsigned perm0, perm1;
> +
> +  if (!(TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT))
> +    return false;
> +  if (d->vmode != V2SFmode)
> +    return false;
> +
> +  perm0 = d->perm[0];
> +  perm1 = d->perm[1];
> +
> +  /* If the first selector comes from the second operand, swap.  */
> +  if (perm0 & 2)
> +    {
> +      rtx x;
> +      if (perm1 & 2)
> +	return false;
> +      perm0 -= 2;
> +      perm1 += 2;
> +      x = d->op0, d->op0 = d->op1, d->op1 = x;
> +    }
> +  /* If the second selector does not come from the second operand, fail.  */
> +  else if ((perm1 & 2) == 0)
> +    return false;
> +
> +  /* Success! */
> +  if (!d->testing_p)
> +    emit_insn (gen_vec_perm_const_ps (d->target, d->op0, d->op1,
> +				      GEN_INT (perm0), GEN_INT (perm1)));
> +
> +  return true;
> +}

AIUI, the caller has already detected the case where both parts of the
result come from the same input operand, and treated it as one_vector_p.
It has also adjusted the indices in that case so that they all come from
the first operand.  It looks like this function unnecessarily fails
after that: neither selector has bit 1 set any more, so the
"(perm1 & 2) == 0" check returns false.

With all the work the caller has done, we should be able to return true
unconditionally after:

  if (d->vmode != V2SFmode)
    return false;

Once we've swapped and fiddled the masks to cope with one_vector_p,
we should be able to assert that perm0 comes from the first operand and
perm1 comes from the second.

> +/* Recognize patterns for the Loongson PUNPCK* instructions.  */
> +
> +static bool
> +mips_expand_vpc_loongson_interleave (struct expand_vec_perm_d *d)
> +{
> +  unsigned int i, low, swap, nelt = d->nelt, mask;
> +  rtx x;
> +
> +  if (!(TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS))
> +    return false;
> +  if (GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT)
> +    return false;
> +  if (GET_MODE_SIZE (d->vmode) != 16)
> +    return false;
> +
> +  /* Note that these are big-endian tests.  Adjust for little-endian later. */
> +  low = nelt / 2;
> +  swap = nelt;
> +  if (d->perm[0] == swap + low)
> +    ;
> +  else if (d->perm[0] == swap)
> +    low = 0;
> +  else if (d->perm[0] == low)
> +    swap = 0;
> +  else if (d->perm[0] == 0)
> +    low = 0, swap = 0;
> +  else
> +    return false;
> +  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
> +
> +  for (i = 0; i < nelt / 2; i++)
> +    {
> +      unsigned elt;
> +      elt = i + low + swap;
> +      if (d->perm[i * 2] != elt)
> +	return false;
> +      elt = (elt + nelt) & mask;
> +      if (d->perm[i * 2 + 1] != elt)
> +	return false;
> +    }
> +
> +  /* Success!  */
> +  if (d->testing_p)
> +    return true;
> +
> +  /* Adjust for little-endian.  */
> +  if (!WORDS_BIG_ENDIAN)
> +    swap = !swap, low = !low;
> +  /* Adjust for matched swapped operand pattern.  */
> +  if (swap)
> +    x = d->op0, d->op0 = d->op1, d->op1 = x;
> +
> +  /* Generate one of the loongson_punpck* instructions.  */
> +  /* ??? We should consider using standard (vec_select (vec_concat)) form. */
> +  x = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, d->op0, d->op1),
> +		      low ? UNSPEC_LOONGSON_PUNPCKL : UNSPEC_LOONGSON_PUNPCKH);
> +  emit_insn (gen_rtx_SET (VOIDmode, d->target, x));
> +  return true;
> +}

Looks right to me too FWIW.

> +/* Recognize patterns for the Loongson PSHUFH instruction.  */
> +
> +static bool
> +mips_expand_vpc_loongson_pshufh (struct expand_vec_perm_d *d)
> +{
> +  unsigned i, mask;
> +
> +  if (!(TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS))
> +    return false;
> +  if (d->vmode != V4HImode)
> +    return false;
> +  if (!d->one_vector_p)
> +    return false;
> +  if (d->testing_p)
> +    return true;
> +
> +  /* Convert the selector into the packed 8-bit form for pshufh.  */
> +  for (i = mask = 0; i < 4; i++)
> +    mask |= (d->perm[i] & 3) << (i * 2);

I think this is endian-dependent.  For little-endian, the bottom two bits
of the mask determine element 0; for big-endian, the top two bits of the
mask do.  For little-endian, the value in each two-bit field numbers the
source elements in memory order (0 first, 3 last); for big-endian, the
numbering is reversed (index ^ 3).
So something like:

  /* Convert the selector into the packed 8-bit form for PSHUFH.
     The bottom two bits of the mask always control the bottom
     16 bits of the result; this is element 3 on big-endian targets
     and element 0 on little-endian targets.  Each pair of bits X
     specifies a right shift by X*16; again, this means that X==0
     refers to element 3 on big-endian targets and element 0
     on little-endian targets.  */
  ec = TARGET_BIG_ENDIAN ? 3 : 0;
  for (i = mask = 0; i < 4; i++)
    mask |= ((d->perm[i ^ ec] ^ ec) & 3) << (i * 2);

Does that sound right?  Like you, I haven't tested this on Loongson yet.
(There's a machine in the farm, but bootstrapping on it is rather slow.)

> +/* Implement TARGET_VECTORIZE_VEC_PERM_CONST_OK.  */
> +
> +static bool
> +mips_vectorize_vec_perm_const_ok (enum machine_mode vmode,
> +				  const unsigned char *sel)
> +{
> +  struct expand_vec_perm_d d;
> +  unsigned int i, nelt, which;
> +  bool ret;
> +
> +  d.vmode = vmode;
> +  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
> +  d.testing_p = true;
> +  memcpy (d.perm, sel, nelt);
> +
> +  /* Categorize the set of elements in the selector.  */
> +  for (i = which = 0; i < nelt; ++i)
> +    {
> +      unsigned char e = d.perm[i];
> +      gcc_assert (e < 2 * nelt);
> +      which |= (e < nelt ? 1 : 2);
> +    }
> +
> +  /* For all elements from second vector, fold the elements to first.  */
> +  if (which == 2)
> +    for (i = 0; i < nelt; ++i)
> +      d.perm[i] -= nelt;
> +
> +  /* Check whether the mask can be applied to the vector type.  */
> +  d.one_vector_p = (which != 3);
> +
> +  d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
> +  d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
> +  if (!d.one_vector_p)
> +    d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);

I don't think we need these registers, at least not if mips_expand_vpc_ps
is adjusted to handle testing_p before the swap.

I think a lot of the endianness stuff in the patch is dependent on byte
endianness rather than word endianness.  Since we only support two out
of the four combinations, it seems better not to worry about which one
applies and simply use TARGET_{BIG,LITTLE}_ENDIAN instead of
{WORDS,BYTES}_{BIG,LITTLE}_ENDIAN.

As H-P mentioned, this changes the __builtin_* interface for the PSHUFH
intrinsics.  These intrinsics are supposed to be used via the inline
wrappers in loongson.h, so we can either keep the unused argument in
the pshufh_{u,s} wrappers or, as H-P suggests, remove the argument from
both.  I don't know which is better.  loongson.h needs to change either
way, so in the patch below, I went for the former.  The latter would need
testsuite changes too.  Mingjie, which do you think is best?

The revised patch below seems to pass spot-testing on vect.exp
(from all languages) and mips.exp on little-endian and big-endian
targets.

Richard


Index: gcc/config/mips/loongson.h
===================================================================

--- gcc/config/mips/loongson.h	2011-12-11 12:16:27.000000000 +0000
+++ gcc/config/mips/loongson.h	2011-12-11 12:22:00.000000000 +0000
@@ -447,15 +447,17 @@ psadbh (uint8x8_t s, uint8x8_t t)
 
 /* Shuffle halfwords.  */
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-pshufh_u (uint16x4_t dest, uint16x4_t s, uint8_t order)
+pshufh_u (uint16x4_t dest __attribute__((__unused__)),
+	  uint16x4_t s, uint8_t order)
 {
-  return __builtin_loongson_pshufh_u (dest, s, order);
+  return __builtin_loongson_pshufh_u (s, order);
 }
 
 __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-pshufh_s (int16x4_t dest, int16x4_t s, uint8_t order)
+pshufh_s (int16x4_t dest __attribute__((__unused__)),
+	  int16x4_t s, uint8_t order)
 {
-  return __builtin_loongson_pshufh_s (dest, s, order);
+  return __builtin_loongson_pshufh_s (s, order);
 }
 
 /* Shift left logical.  */
Index: gcc/config/mips/loongson.md
===================================================================
--- gcc/config/mips/loongson.md	2011-12-10 15:12:45.000000000 +0000
+++ gcc/config/mips/loongson.md	2011-12-10 15:13:07.000000000 +0000
@@ -403,12 +403,11 @@ (define_insn "loongson_psadbh"
 ;; Shuffle halfwords.
 (define_insn "loongson_pshufh"
   [(set (match_operand:VH 0 "register_operand" "=f")
-        (unspec:VH [(match_operand:VH 1 "register_operand" "0")
-		    (match_operand:VH 2 "register_operand" "f")
-		    (match_operand:SI 3 "register_operand" "f")]
+        (unspec:VH [(match_operand:VH 1 "register_operand" "f")
+		    (match_operand:SI 2 "register_operand" "f")]
 		   UNSPEC_LOONGSON_PSHUFH))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-  "pshufh\t%0,%2,%3"
+  "pshufh\t%0,%1,%2"
   [(set_attr "type" "fmul")])
 
 ;; Shift left logical.
@@ -479,7 +478,7 @@ (define_insn "ussub<mode>3"
   [(set_attr "type" "fadd")])
 
 ;; Unpack high data.
-(define_insn "vec_interleave_high<mode>"
+(define_insn "loongson_punpckh<V_stretch_half_suffix>"
   [(set (match_operand:VWHB 0 "register_operand" "=f")
         (unspec:VWHB [(match_operand:VWHB 1 "register_operand" "f")
 		      (match_operand:VWHB 2 "register_operand" "f")]
@@ -489,7 +488,7 @@ (define_insn "vec_interleave_high<mode>"
   [(set_attr "type" "fdiv")])
 
 ;; Unpack low data.
-(define_insn "vec_interleave_low<mode>"
+(define_insn "loongson_punpckl<V_stretch_half_suffix>"
   [(set (match_operand:VWHB 0 "register_operand" "=f")
         (unspec:VWHB [(match_operand:VWHB 1 "register_operand" "f")
 		      (match_operand:VWHB 2 "register_operand" "f")]
@@ -498,6 +497,19 @@ (define_insn "vec_interleave_low<mode>"
   "punpckl<V_stretch_half_suffix>\t%0,%1,%2"
   [(set_attr "type" "fdiv")])
 
+(define_expand "vec_perm_const<mode>"
+  [(match_operand:VWHB 0 "register_operand" "")
+   (match_operand:VWHB 1 "register_operand" "")
+   (match_operand:VWHB 2 "register_operand" "")
+   (match_operand:VWHB 3 "" "")]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+{
+  if (mips_expand_vec_perm_const (operands))
+    DONE;
+  else
+    FAIL;
+})
+
 ;; Integer division and modulus.  For integer multiplication, see mips.md.
 
 (define_insn "<u>div<mode>3"
Index: gcc/config/mips/mips-modes.def
===================================================================
--- gcc/config/mips/mips-modes.def	2011-12-10 15:12:45.000000000 +0000
+++ gcc/config/mips/mips-modes.def	2011-12-10 15:13:07.000000000 +0000
@@ -29,6 +29,7 @@ FLOAT_MODE (TF, 16, mips_quad_format);
 VECTOR_MODES (INT, 8);        /*       V8QI V4HI V2SI */
 VECTOR_MODES (FLOAT, 8);      /*            V4HF V2SF */
 VECTOR_MODES (INT, 4);        /*            V4QI V2HI */
+VECTOR_MODES (FLOAT, 16);
 
 VECTOR_MODES (FRACT, 4);	/* V4QQ  V2HQ */
 VECTOR_MODES (UFRACT, 4);	/* V4UQQ V2UHQ */
Index: gcc/config/mips/mips-protos.h
===================================================================
--- gcc/config/mips/mips-protos.h	2011-12-10 15:12:45.000000000 +0000
+++ gcc/config/mips/mips-protos.h	2011-12-10 15:13:07.000000000 +0000
@@ -328,6 +328,7 @@ extern void mips_expand_atomic_qihi (uni
 				     rtx, rtx, rtx, rtx);
 
 extern void mips_expand_vector_init (rtx, rtx);
+extern bool mips_expand_vec_perm_const (rtx op[4]);
 
 extern bool mips_eh_uses (unsigned int);
 extern bool mips_epilogue_uses (unsigned int);
Index: gcc/config/mips/mips-ps-3d.md
===================================================================
--- gcc/config/mips/mips-ps-3d.md	2011-12-10 15:12:45.000000000 +0000
+++ gcc/config/mips/mips-ps-3d.md	2011-12-11 12:43:03.000000000 +0000
@@ -89,61 +89,167 @@ (define_expand "movv2sfcc"
   DONE;
 })
 
-; pul.ps - Pair Upper Lower
-(define_insn "mips_pul_ps"
+(define_insn "vec_perm_const_ps"
   [(set (match_operand:V2SF 0 "register_operand" "=f")
-	(vec_merge:V2SF
-	 (match_operand:V2SF 1 "register_operand" "f")
-	 (match_operand:V2SF 2 "register_operand" "f")
-	 (const_int 2)))]
+	(vec_select:V2SF
+	  (vec_concat:V4SF
+	    (match_operand:V2SF 1 "register_operand" "f")
+	    (match_operand:V2SF 2 "register_operand" "f"))
+	  (parallel [(match_operand:SI 3 "const_0_or_1_operand" "")
+		     (match_operand:SI 4 "const_2_or_3_operand" "")])))]
   "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
-  "pul.ps\t%0,%1,%2"
+{
+  /* Let <op>L be the lower part of operand <op> and <op>U be the upper part.
+     The P[UL][UL].PS instruction always specifies the upper part of the
+     result first, so the instruction is:
+
+     	P<aUL><bUL>.PS %0,<aop>,<bop>
+
+     where 0U == <aop><aUL> and 0L == <bop><bUL>.
+
+     GCC's vector indices are specified in memory order, which means
+     that vector element 0 is the lower part (L) on little-endian targets
+     and the upper part (U) on big-endian targets.  vec_concat likewise
+     concatenates in memory order, which means that operand 3 (being
+     0 or 1) selects part of operand 1 and operand 4 (being 2 or 3)
+     selects part of operand 2.
+
+     Let:
+
+	I3 = INTVAL (operands[3])
+	I4 = INTVAL (operands[4]) - 2
+
+     Taking the two endiannesses in turn:
+
+     Little-endian:
+
+        The semantics of the RTL pattern are:
+
+	{ 0L, 0U } = { X[I3], X[I4 + 2] }, where X = { 1L, 1U, 2L, 2U }
+
+	so: 0L = { 1L, 1U }[I3] (= <bop><bUL>)
+	    0U = { 2L, 2U }[I4] (= <aop><aUL>)
+
+	    <aop> = 2, <aUL> = I4 ? U : L
+	    <bop> = 1, <bUL> = I3 ? U : L
+
+	    [LL] !I4 && !I3   [UL] I4 && !I3
+	    [LU] !I4 && I3    [UU] I4 && I3
+
+     Big-endian:
+
+        The semantics of the RTL pattern are:
+
+	{ 0U, 0L } = { X[I3], X[I4 + 2] }, where X = { 1U, 1L, 2U, 2L }
+
+	so: 0U = { 1U, 1L }[I3] (= <aop><aUL>)
+	    0L = { 2U, 2L }[I4] (= <bop><bUL>)
+
+	    <aop> = 1, <aUL> = I3 ? L : U
+	    <bop> = 2, <bUL> = I4 ? L : U
+
+	    [UU] !I3 && !I4   [UL] !I3 && I4
+	    [LU] I3 && !I4    [LL] I3 && I4.  */
+  static const char *const mnemonics[2][4] = {
+    /* LE */ { "pll.ps\t%0,%2,%1", "pul.ps\t%0,%2,%1",
+	       "plu.ps\t%0,%2,%1", "puu.ps\t%0,%2,%1" },
+    /* BE */ { "puu.ps\t%0,%1,%2", "pul.ps\t%0,%1,%2",
+	       "plu.ps\t%0,%1,%2", "pll.ps\t%0,%1,%2" },
+  };
+
+  unsigned mask = INTVAL (operands[3]) * 2 + (INTVAL (operands[4]) - 2);
+  return mnemonics[TARGET_BIG_ENDIAN][mask];
+}
   [(set_attr "type" "fmove")
    (set_attr "mode" "SF")])
 
-; puu.ps - Pair upper upper
-(define_insn "mips_puu_ps"
-  [(set (match_operand:V2SF 0 "register_operand" "=f")
-	(vec_merge:V2SF
-	 (match_operand:V2SF 1 "register_operand" "f")
-	 (vec_select:V2SF (match_operand:V2SF 2 "register_operand" "f")
-			  (parallel [(const_int 1)
-				     (const_int 0)]))
-	 (const_int 2)))]
+(define_expand "vec_perm_constv2sf"
+  [(match_operand:V2SF 0 "register_operand" "")
+   (match_operand:V2SF 1 "register_operand" "")
+   (match_operand:V2SF 2 "register_operand" "")
+   (match_operand:V2SI 3 "" "")]
   "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
-  "puu.ps\t%0,%1,%2"
-  [(set_attr "type" "fmove")
-   (set_attr "mode" "SF")])
+{
+  if (mips_expand_vec_perm_const (operands))
+    DONE;
+  else
+    FAIL;
+})
 
-; pll.ps - Pair Lower Lower
-(define_insn "mips_pll_ps"
-  [(set (match_operand:V2SF 0 "register_operand" "=f")
-	(vec_merge:V2SF
-	 (vec_select:V2SF (match_operand:V2SF 1 "register_operand" "f")
-			  (parallel [(const_int 1)
-				     (const_int 0)]))
-	 (match_operand:V2SF 2 "register_operand" "f")
-	 (const_int 2)))]
+;; Expanders for builtins.  The instruction:
+;;
+;;     P[UL][UL].PS <result>, <a>, <b>
+;;
+;; says that the upper part of <result> is taken from half of <a> and
+;; the lower part of <result> is taken from half of <b>.  This means
+;; that the P[UL][UL].PS operand order matches memory order on big-endian
+;; targets; <a> is element 0 of the V2SF result while <b> is element 1.
+;; However, the P[UL][UL].PS operand order is the reverse of memory order
+;; on little-endian targets; <a> is element 1 of the V2SF result while
+;; <b> is element 0.  The arguments to vec_perm_const_ps are always in
+;; memory order.
+;;
+;; Similarly, "U" corresponds to element 0 on big-endian targets but
+;; to element 1 on little-endian targets.
+(define_expand "mips_puu_ps"
+  [(match_operand:V2SF 0 "register_operand" "")
+   (match_operand:V2SF 1 "register_operand" "")
+   (match_operand:V2SF 2 "register_operand" "")]
   "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
-  "pll.ps\t%0,%1,%2"
-  [(set_attr "type" "fmove")
-   (set_attr "mode" "SF")])
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_vec_perm_const_ps (operands[0], operands[1], operands[2],
+				      /* U */ const0_rtx, /* U */ const2_rtx));
+  else
+    emit_insn (gen_vec_perm_const_ps (operands[0], operands[2], operands[1],
+				      /* U */ const1_rtx, /* U */ GEN_INT (3)));
+  DONE;
+})
 
-; plu.ps - Pair Lower Upper
-(define_insn "mips_plu_ps"
-  [(set (match_operand:V2SF 0 "register_operand" "=f")
-	(vec_merge:V2SF
-	 (vec_select:V2SF (match_operand:V2SF 1 "register_operand" "f")
-			  (parallel [(const_int 1)
-				     (const_int 0)]))
-	 (vec_select:V2SF (match_operand:V2SF 2 "register_operand" "f")
-			  (parallel [(const_int 1)
-				     (const_int 0)]))
-	 (const_int 2)))]
+(define_expand "mips_pul_ps"
+  [(match_operand:V2SF 0 "register_operand" "")
+   (match_operand:V2SF 1 "register_operand" "")
+   (match_operand:V2SF 2 "register_operand" "")]
   "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
-  "plu.ps\t%0,%1,%2"
-  [(set_attr "type" "fmove")
-   (set_attr "mode" "SF")])
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_vec_perm_const_ps (operands[0], operands[1], operands[2],
+				      /* U */ const0_rtx, /* L */ GEN_INT (3)));
+  else
+    emit_insn (gen_vec_perm_const_ps (operands[0], operands[2], operands[1],
+				      /* L */ const0_rtx, /* U */ GEN_INT (3)));
+  DONE;
+})
+
+(define_expand "mips_plu_ps"
+  [(match_operand:V2SF 0 "register_operand" "")
+   (match_operand:V2SF 1 "register_operand" "")
+   (match_operand:V2SF 2 "register_operand" "")]
+  "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_vec_perm_const_ps (operands[0], operands[1], operands[2],
+				      /* L */ const1_rtx, /* U */ const2_rtx));
+  else
+    emit_insn (gen_vec_perm_const_ps (operands[0], operands[2], operands[1],
+				      /* U */ const1_rtx, /* L */ const2_rtx));
+  DONE;
+})
+
+(define_expand "mips_pll_ps"
+  [(match_operand:V2SF 0 "register_operand" "")
+   (match_operand:V2SF 1 "register_operand" "")
+   (match_operand:V2SF 2 "register_operand" "")]
+  "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_vec_perm_const_ps (operands[0], operands[1], operands[2],
+				      /* L */ const1_rtx, /* L */ GEN_INT (3)));
+  else
+    emit_insn (gen_vec_perm_const_ps (operands[0], operands[2], operands[1],
+				      /* L */ const0_rtx, /* L */ const2_rtx));
+  DONE;
+})
 
 ; vec_init
 (define_expand "vec_initv2sf"
@@ -206,10 +312,10 @@ (define_expand "vec_setv2sf"
      then use a PUL instruction.  */
   temp = gen_reg_rtx (V2SFmode);
   emit_insn (gen_mips_cvt_ps_s (temp, operands[1], operands[1]));
-  if (INTVAL (operands[2]) == !BYTES_BIG_ENDIAN)
-    emit_insn (gen_mips_pul_ps (operands[0], temp, operands[0]));
-  else
-    emit_insn (gen_mips_pul_ps (operands[0], operands[0], temp));
+
+  emit_insn (gen_vec_perm_const_ps (operands[0], temp, operands[0],
+				    operands[2],
+				    GEN_INT (1 - INTVAL (operands[2]) + 2)));
   DONE;
 })
 
Index: gcc/config/mips/mips.c
===================================================================
--- gcc/config/mips/mips.c	2011-12-10 15:12:45.000000000 +0000
+++ gcc/config/mips/mips.c	2011-12-11 12:42:59.000000000 +0000
@@ -12774,12 +12774,6 @@ #define CODE_FOR_loongson_psubsh CODE_FO
 #define CODE_FOR_loongson_psubsb CODE_FOR_sssubv8qi3
 #define CODE_FOR_loongson_psubush CODE_FOR_ussubv4hi3
 #define CODE_FOR_loongson_psubusb CODE_FOR_ussubv8qi3
-#define CODE_FOR_loongson_punpckhbh CODE_FOR_vec_interleave_highv8qi
-#define CODE_FOR_loongson_punpckhhw CODE_FOR_vec_interleave_highv4hi
-#define CODE_FOR_loongson_punpckhwd CODE_FOR_vec_interleave_highv2si
-#define CODE_FOR_loongson_punpcklbh CODE_FOR_vec_interleave_lowv8qi
-#define CODE_FOR_loongson_punpcklhw CODE_FOR_vec_interleave_lowv4hi
-#define CODE_FOR_loongson_punpcklwd CODE_FOR_vec_interleave_lowv2si
 
 static const struct mips_builtin_description mips_builtins[] = {
   DIRECT_BUILTIN (pll_ps, MIPS_V2SF_FTYPE_V2SF_V2SF, paired_single),
@@ -13021,8 +13015,8 @@ static const struct mips_builtin_descrip
   LOONGSON_BUILTIN (pasubub, MIPS_UV8QI_FTYPE_UV8QI_UV8QI),
   LOONGSON_BUILTIN (biadd, MIPS_UV4HI_FTYPE_UV8QI),
   LOONGSON_BUILTIN (psadbh, MIPS_UV4HI_FTYPE_UV8QI_UV8QI),
-  LOONGSON_BUILTIN_SUFFIX (pshufh, u, MIPS_UV4HI_FTYPE_UV4HI_UV4HI_UQI),
-  LOONGSON_BUILTIN_SUFFIX (pshufh, s, MIPS_V4HI_FTYPE_V4HI_V4HI_UQI),
+  LOONGSON_BUILTIN_SUFFIX (pshufh, u, MIPS_UV4HI_FTYPE_UV4HI_UQI),
+  LOONGSON_BUILTIN_SUFFIX (pshufh, s, MIPS_V4HI_FTYPE_V4HI_UQI),
   LOONGSON_BUILTIN_SUFFIX (psllh, u, MIPS_UV4HI_FTYPE_UV4HI_UQI),
   LOONGSON_BUILTIN_SUFFIX (psllh, s, MIPS_V4HI_FTYPE_V4HI_UQI),
   LOONGSON_BUILTIN_SUFFIX (psllw, u, MIPS_UV2SI_FTYPE_UV2SI_UQI),
@@ -16326,6 +16320,262 @@ mips_shift_truncation_mask (enum machine
 }
 
 
+/* Generate or test for an insn that supports a constant permutation.  */
+
+#define MAX_VECT_LEN 8
+
+struct expand_vec_perm_d
+{
+  rtx target, op0, op1;
+  unsigned char perm[MAX_VECT_LEN];
+  enum machine_mode vmode;
+  unsigned char nelt;
+  bool one_vector_p;
+  bool testing_p;
+};
+
+/* Recognize patterns for the MIPS3D P[UL][UL].PS instructions.  */
+
+static bool
+mips_expand_vpc_ps (struct expand_vec_perm_d *d)
+{
+  unsigned perm0, perm1;
+
+  if (!(TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT))
+    return false;
+  if (d->vmode != V2SFmode)
+    return false;
+  if (d->testing_p)
+    return true;
+
+  perm0 = d->perm[0];
+  perm1 = d->perm[1];
+
+  if (d->one_vector_p)
+    perm1 += 2;
+  else if (perm0 & 2)
+    {
+      rtx x;
+      perm0 -= 2;
+      perm1 += 2;
+      x = d->op0, d->op0 = d->op1, d->op1 = x;
+    }
+  gcc_assert ((perm0 & 2) == 0);
+  gcc_assert (perm1 & 2);
+
+  emit_insn (gen_vec_perm_const_ps (d->target, d->op0, d->op1,
+				    GEN_INT (perm0), GEN_INT (perm1)));
+
+  return true;
+}
+
+/* Recognize patterns for the Loongson PUNPCK* instructions.  */
+
+static bool
+mips_expand_vpc_loongson_interleave (struct expand_vec_perm_d *d)
+{
+  unsigned int i, low, swap, nelt = d->nelt, mask;
+  rtx x;
+
+  if (!(TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS))
+    return false;
+  if (GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT)
+    return false;
+  if (GET_MODE_SIZE (d->vmode) != 16)
+    return false;
+
+  /* Note that these are big-endian tests.  Adjust for little-endian later. */
+  low = nelt / 2;
+  swap = nelt;
+  if (d->perm[0] == swap + low)
+    ;
+  else if (d->perm[0] == swap)
+    low = 0;
+  else if (d->perm[0] == low)
+    swap = 0;
+  else if (d->perm[0] == 0)
+    low = 0, swap = 0;
+  else
+    return false;
+  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
+
+  for (i = 0; i < nelt / 2; i++)
+    {
+      unsigned elt;
+      elt = i + low + swap;
+      if (d->perm[i * 2] != elt)
+	return false;
+      elt = (elt + nelt) & mask;
+      if (d->perm[i * 2 + 1] != elt)
+	return false;
+    }
+
+  /* Success!  */
+  if (d->testing_p)
+    return true;
+
+  /* Adjust for little-endian.  */
+  if (TARGET_LITTLE_ENDIAN)
+    swap = !swap, low = !low;
+  /* Adjust for matched swapped operand pattern.  */
+  if (swap)
+    x = d->op0, d->op0 = d->op1, d->op1 = x;
+
+  /* Generate one of the loongson_punpck* instructions.  */
+  /* ??? We should consider using standard (vec_select (vec_concat)) form. */
+  x = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, d->op0, d->op1),
+		      low ? UNSPEC_LOONGSON_PUNPCKL : UNSPEC_LOONGSON_PUNPCKH);
+  emit_insn (gen_rtx_SET (VOIDmode, d->target, x));
+  return true;
+}
+
+/* Recognize patterns for the Loongson PSHUFH instruction.  */
+
+static bool
+mips_expand_vpc_loongson_pshufh (struct expand_vec_perm_d *d)
+{
+  unsigned i, mask, ec;
+
+  if (!(TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS))
+    return false;
+  if (d->vmode != V4HImode)
+    return false;
+  if (!d->one_vector_p)
+    return false;
+  if (d->testing_p)
+    return true;
+
+  /* Convert the selector into the packed 8-bit form for PSHUFH.
+     The bottom two bits of the mask always control the bottom
+     16 bits of the result; this is element 3 on big-endian targets
+     and element 0 on little-endian targets.  Each pair of bits X
+     specifies a right shift by X*16; again, this means that X==0
+     refers to element 3 on big-endian targets and element 0
+     on little-endian targets.  */
+  ec = TARGET_BIG_ENDIAN ? 3 : 0;
+  for (i = mask = 0; i < 4; i++)
+    mask |= ((d->perm[i ^ ec] ^ ec) & 3) << (i * 2);
+
+  emit_insn (gen_loongson_pshufh (d->target, d->op0,
+				  force_reg (SImode, GEN_INT (mask))));
+  return true;
+}
+
+static bool
+mips_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
+{
+  if (mips_expand_vpc_ps (d))
+    return true;
+  if (mips_expand_vpc_loongson_interleave (d))
+    return true;
+  if (mips_expand_vpc_loongson_pshufh (d))
+    return true;
+  return false;
+}
+
+/* Expand a vec_perm_const pattern.  */
+
+bool
+mips_expand_vec_perm_const (rtx operands[4])
+{
+  struct expand_vec_perm_d d;
+  int i, nelt, which;
+  rtx sel;
+
+  d.target = operands[0];
+  d.op0 = operands[1];
+  d.op1 = operands[2];
+  sel = operands[3];
+
+  d.vmode = GET_MODE (d.target);
+  gcc_assert (VECTOR_MODE_P (d.vmode));
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = false;
+
+  for (i = which = 0; i < nelt; ++i)
+    {
+      rtx e = XVECEXP (sel, 0, i);
+      int ei = INTVAL (e) & (2 * nelt - 1);
+      which |= (ei < nelt ? 1 : 2);
+      d.perm[i] = ei;
+    }
+
+  switch (which)
+    {
+    default:
+      gcc_unreachable();
+
+    case 3:
+      d.one_vector_p = false;
+      if (!rtx_equal_p (d.op0, d.op1))
+	break;
+
+      /* The elements of PERM do not suggest that only the first operand
+	 is used, but both operands are identical.  Allow easier matching
+	 of the permutation by folding the permutation into the single
+	 input vector.  */
+      for (i = 0; i < nelt; ++i)
+	if (d.perm[i] >= nelt)
+	  d.perm[i] -= nelt;
+      /* FALLTHRU */
+
+    case 1:
+      d.op1 = d.op0;
+      d.one_vector_p = true;
+      break;
+
+    case 2:
+      for (i = 0; i < nelt; ++i)
+        d.perm[i] -= nelt;
+      d.op0 = d.op1;
+      d.one_vector_p = true;
+      break;
+    }
+
+  return mips_expand_vec_perm_const_1 (&d);
+}
+
+/* Implement TARGET_VECTORIZE_VEC_PERM_CONST_OK.  */
+
+static bool
+mips_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+				  const unsigned char *sel)
+{
+  struct expand_vec_perm_d d;
+  unsigned int i, nelt, which;
+  bool ret;
+
+  d.target = NULL_RTX;
+  d.op0 = NULL_RTX;
+  d.op1 = NULL_RTX;
+  d.vmode = vmode;
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = true;
+  memcpy (d.perm, sel, nelt);
+
+  /* Categorize the set of elements in the selector.  */
+  for (i = which = 0; i < nelt; ++i)
+    {
+      unsigned char e = d.perm[i];
+      gcc_assert (e < 2 * nelt);
+      which |= (e < nelt ? 1 : 2);
+    }
+
+  /* For all elements from second vector, fold the elements to first.  */
+  if (which == 2)
+    for (i = 0; i < nelt; ++i)
+      d.perm[i] -= nelt;
+
+  /* Check whether the mask can be applied to the vector type.  */
+  d.one_vector_p = (which != 3);
+
+  start_sequence ();
+  ret = mips_expand_vec_perm_const_1 (&d);
+  end_sequence ();
+
+  return ret;
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
 #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -16544,6 +16794,9 @@ #define TARGET_ASM_OUTPUT_SOURCE_FILENAM
 #undef TARGET_SHIFT_TRUNCATION_MASK
 #define TARGET_SHIFT_TRUNCATION_MASK mips_shift_truncation_mask
 
+#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
+#define TARGET_VECTORIZE_VEC_PERM_CONST_OK mips_vectorize_vec_perm_const_ok
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-mips.h"
Index: gcc/config/mips/predicates.md
===================================================================
--- gcc/config/mips/predicates.md	2011-12-10 15:12:45.000000000 +0000
+++ gcc/config/mips/predicates.md	2011-12-10 15:13:07.000000000 +0000
@@ -73,8 +73,11 @@ (define_predicate "reg_or_1_operand"
 ;; This is used for indexing into vectors, and hence only accepts const_int.
 (define_predicate "const_0_or_1_operand"
   (and (match_code "const_int")
-       (ior (match_test "op == CONST0_RTX (GET_MODE (op))")
-	    (match_test "op == CONST1_RTX (GET_MODE (op))"))))
+       (match_test "IN_RANGE (INTVAL (op), 0, 1)")))
+
+(define_predicate "const_2_or_3_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 2, 3)")))
 
 (define_predicate "qi_mask_operand"
   (and (match_code "const_int")