[PATCH v2] LoongArch: Merge constant vector permuatation implementations.

Thu Dec 28 12:26:46 GMT 2023

There are currently two versions of the implementations of constant
vector permutation: loongarch_expand_vec_perm_const_1 and
loongarch_expand_vec_perm_const_2.  The implementations of the two
versions are different. Currently, only the implementation of
loongarch_expand_vec_perm_const_1 is used for 256-bit vectors.  We
hope to streamline the code as much as possible while retaining the
better-performing implementation of the two.  By repeatedly testing
spec2006 and spec2017, we got the following Merged version.
Compared with the pre-merger version, the number of lines of code
in loongarch.cc has been reduced by 888 lines.  At the same time,
the performance of SPECint2006 under Ofast has been improved by 0.97%,
and the performance of SPEC2017 fprate has been improved by 0.27%.

gcc/ChangeLog:

	* config/loongarch/loongarch.cc (loongarch_is_odd_extraction):
	Remove useless forward declaration.
	(loongarch_is_even_extraction): Remove useless forward declaration.
	(loongarch_try_expand_lsx_vshuf_const): Removed.
	(loongarch_expand_vec_perm_const_1): Merged.
	(loongarch_is_double_duplicate): Removed.
	(loongarch_is_center_extraction): Ditto.
	(loongarch_is_reversing_permutation): Ditto.
	(loongarch_is_di_misalign_extract): Ditto.
	(loongarch_is_si_misalign_extract): Ditto.
	(loongarch_is_lasx_lowpart_extract): Ditto.
	(loongarch_is_op_reverse_perm): Ditto.
	(loongarch_is_single_op_perm): Ditto.
	(loongarch_is_divisible_perm): Ditto.
	(loongarch_is_triple_stride_extract): Ditto.
	(loongarch_expand_vec_perm_const_2): Merged.
	(loongarch_expand_vec_perm_const): New.
	(loongarch_vectorize_vec_perm_const): Adjust.
---
 gcc/config/loongarch/loongarch.cc | 1308 +++++------------------------
 1 file changed, 210 insertions(+), 1098 deletions(-)

diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index 1d4d8f0b256..d5bf6a02a12 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -8769,143 +8769,6 @@ loongarch_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
     }
 }
 
-static bool
-loongarch_is_odd_extraction (struct expand_vec_perm_d *);
-
-static bool
-loongarch_is_even_extraction (struct expand_vec_perm_d *);
-
-static bool
-loongarch_try_expand_lsx_vshuf_const (struct expand_vec_perm_d *d)
-{
-  int i;
-  rtx target, op0, op1, sel, tmp;
-  rtx rperm[MAX_VECT_LEN];
-
-  if (d->vmode == E_V2DImode || d->vmode == E_V2DFmode
-	|| d->vmode == E_V4SImode || d->vmode == E_V4SFmode
-	|| d->vmode == E_V8HImode || d->vmode == E_V16QImode)
-    {
-      target = d->target;
-      op0 = d->op0;
-      op1 = d->one_vector_p ? d->op0 : d->op1;
-
-      if (GET_MODE (op0) != GET_MODE (op1)
-	  || GET_MODE (op0) != GET_MODE (target))
-	return false;
-
-      if (d->testing_p)
-	return true;
-
-      /* If match extract-even and extract-odd permutations pattern, use
-       * vselect much better than vshuf.  */
-      if (loongarch_is_odd_extraction (d)
-	  || loongarch_is_even_extraction (d))
-	{
-	  if (loongarch_expand_vselect_vconcat (d->target, d->op0, d->op1,
-						d->perm, d->nelt))
-	    return true;
-
-	  unsigned char perm2[MAX_VECT_LEN];
-	  for (i = 0; i < d->nelt; ++i)
-	    perm2[i] = (d->perm[i] + d->nelt) & (2 * d->nelt - 1);
-
-	  if (loongarch_expand_vselect_vconcat (d->target, d->op1, d->op0,
-						perm2, d->nelt))
-	    return true;
-	}
-
-      for (i = 0; i < d->nelt; i += 1)
-	{
-	  rperm[i] = GEN_INT (d->perm[i]);
-	}
-
-      if (d->vmode == E_V2DFmode)
-	{
-	  sel = gen_rtx_CONST_VECTOR (E_V2DImode, gen_rtvec_v (d->nelt, rperm));
-	  tmp = simplify_gen_subreg (E_V2DImode, d->target, d->vmode, 0);
-	  emit_move_insn (tmp, sel);
-	}
-      else if (d->vmode == E_V4SFmode)
-	{
-	  sel = gen_rtx_CONST_VECTOR (E_V4SImode, gen_rtvec_v (d->nelt, rperm));
-	  tmp = simplify_gen_subreg (E_V4SImode, d->target, d->vmode, 0);
-	  emit_move_insn (tmp, sel);
-	}
-      else
-	{
-	  sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, rperm));
-	  emit_move_insn (d->target, sel);
-	}
-
-      switch (d->vmode)
-	{
-	case E_V2DFmode:
-	  emit_insn (gen_lsx_vshuf_d_f (target, target, op1, op0));
-	  break;
-	case E_V2DImode:
-	  emit_insn (gen_lsx_vshuf_d (target, target, op1, op0));
-	  break;
-	case E_V4SFmode:
-	  emit_insn (gen_lsx_vshuf_w_f (target, target, op1, op0));
-	  break;
-	case E_V4SImode:
-	  emit_insn (gen_lsx_vshuf_w (target, target, op1, op0));
-	  break;
-	case E_V8HImode:
-	  emit_insn (gen_lsx_vshuf_h (target, target, op1, op0));
-	  break;
-	case E_V16QImode:
-	  emit_insn (gen_lsx_vshuf_b (target, op1, op0, target));
-	  break;
-	default:
-	  break;
-	}
-
-      return true;
-    }
-  return false;
-}
-
-static bool
-loongarch_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
-{
-  unsigned int i, nelt = d->nelt;
-  unsigned char perm2[MAX_VECT_LEN];
-
-  if (d->one_vector_p)
-    {
-      /* Try interleave with alternating operands.  */
-      memcpy (perm2, d->perm, sizeof (perm2));
-      for (i = 1; i < nelt; i += 2)
-	perm2[i] += nelt;
-      if (loongarch_expand_vselect_vconcat (d->target, d->op0, d->op1, perm2,
-					    nelt))
-	return true;
-    }
-  else
-    {
-      if (loongarch_expand_vselect_vconcat (d->target, d->op0, d->op1,
-					    d->perm, nelt))
-	return true;
-
-      /* Try again with swapped operands.  */
-      for (i = 0; i < nelt; ++i)
-	perm2[i] = (d->perm[i] + nelt) & (2 * nelt - 1);
-      if (loongarch_expand_vselect_vconcat (d->target, d->op1, d->op0, perm2,
-					    nelt))
-	return true;
-    }
-
-  if (loongarch_expand_lsx_shuffle (d))
-    return true;
-  if (loongarch_expand_vec_perm_even_odd (d))
-    return true;
-  if (loongarch_expand_vec_perm_interleave (d))
-    return true;
-  return false;
-}
-
 /* Following are the assist function for const vector permutation support.  */
 static bool
 loongarch_is_quad_duplicate (struct expand_vec_perm_d *d)
@@ -8937,36 +8800,6 @@ loongarch_is_quad_duplicate (struct expand_vec_perm_d *d)
   return result;
 }
 
-static bool
-loongarch_is_double_duplicate (struct expand_vec_perm_d *d)
-{
-  if (!d->one_vector_p)
-    return false;
-
-  if (d->nelt < 8)
-    return false;
-
-  bool result = true;
-  unsigned char buf = d->perm[0];
-
-  for (int i = 1; i < d->nelt; i += 2)
-    {
-      if (d->perm[i] != buf)
-	{
-	  result = false;
-	  break;
-	}
-      if (d->perm[i - 1] != d->perm[i])
-	{
-	  result = false;
-	  break;
-	}
-      buf += d->nelt / 4;
-    }
-
-  return result;
-}
-
 static bool
 loongarch_is_odd_extraction (struct expand_vec_perm_d *d)
 {
@@ -9027,110 +8860,6 @@ loongarch_is_extraction_permutation (struct expand_vec_perm_d *d)
   return result;
 }
 
-static bool
-loongarch_is_center_extraction (struct expand_vec_perm_d *d)
-{
-  bool result = true;
-  unsigned buf = d->nelt / 2;
-
-  for (int i = 0; i < d->nelt; i += 1)
-    {
-      if (buf != d->perm[i])
-	{
-	  result = false;
-	  break;
-	}
-      buf += 1;
-    }
-
-  return result;
-}
-
-static bool
-loongarch_is_reversing_permutation (struct expand_vec_perm_d *d)
-{
-  if (!d->one_vector_p)
-    return false;
-
-  bool result = true;
-  unsigned char buf = d->nelt - 1;
-
-  for (int i = 0; i < d->nelt; i += 1)
-    {
-      if (d->perm[i] != buf)
-	{
-	  result = false;
-	  break;
-	}
-
-      buf -= 1;
-    }
-
-  return result;
-}
-
-static bool
-loongarch_is_di_misalign_extract (struct expand_vec_perm_d *d)
-{
-  if (d->nelt != 4 && d->nelt != 8)
-    return false;
-
-  bool result = true;
-  unsigned char buf;
-
-  if (d->nelt == 4)
-    {
-      buf = 1;
-      for (int i = 0; i < d->nelt; i += 1)
-	{
-	  if (buf != d->perm[i])
-	    {
-	      result = false;
-	      break;
-	    }
-
-	  buf += 1;
-	}
-    }
-  else if (d->nelt == 8)
-    {
-      buf = 2;
-      for (int i = 0; i < d->nelt; i += 1)
-	{
-	  if (buf != d->perm[i])
-	    {
-	      result = false;
-	      break;
-	    }
-
-	  buf += 1;
-	}
-    }
-
-  return result;
-}
-
-static bool
-loongarch_is_si_misalign_extract (struct expand_vec_perm_d *d)
-{
-  if (d->vmode != E_V8SImode && d->vmode != E_V8SFmode)
-    return false;
-  bool result = true;
-  unsigned char buf = 1;
-
-  for (int i = 0; i < d->nelt; i += 1)
-    {
-      if (buf != d->perm[i])
-	{
-	  result = false;
-	  break;
-	}
-      buf += 1;
-    }
-
-  return result;
-}
-
 static bool
 loongarch_is_lasx_lowpart_interleave (struct expand_vec_perm_d *d)
 {
@@ -9193,39 +8922,6 @@ loongarch_is_lasx_lowpart_interleave_2 (struct expand_vec_perm_d *d)
   return result;
 }
 
-static bool
-loongarch_is_lasx_lowpart_extract (struct expand_vec_perm_d *d)
-{
-  bool result = true;
-  unsigned char buf = 0;
-
-  for (int i = 0; i < d->nelt / 2; i += 1)
-    {
-      if (buf != d->perm[i])
-	{
-	  result = false;
-	  break;
-	}
-      buf += 1;
-    }
-
-  if (result)
-    {
-      buf = d->nelt;
-      for (int i = d->nelt / 2; i < d->nelt; i += 1)
-	{
-	  if (buf != d->perm[i])
-	    {
-	      result = false;
-	      break;
-	    }
-	  buf += 1;
-	}
-    }
-
-  return result;
-}
-
 static bool
 loongarch_is_lasx_highpart_interleave (expand_vec_perm_d *d)
 {
@@ -9307,538 +9003,195 @@ loongarch_is_elem_duplicate (struct expand_vec_perm_d *d)
   return result;
 }
 
-inline bool
-loongarch_is_op_reverse_perm (struct expand_vec_perm_d *d)
-{
-  return (d->vmode == E_V4DFmode)
-    && d->perm[0] == 2 && d->perm[1] == 3
-    && d->perm[2] == 0 && d->perm[3] == 1;
-}
+/* In LASX, some permutation insn does not have the behavior that gcc expects
+   when compiler wants to emit a vector permutation.
+
+   1.  What GCC provides via vectorize_vec_perm_const ()'s paramater:
+   When GCC wants to performs a vector permutation, it provides two op
+   reigster, one target register, and a selector.
+   In const vector permutation case, GCC provides selector as a char array
+   that contains original value; in variable vector permuatation
+   (performs via vec_perm<mode> insn template), it provides a vector register.
+   We assume that nelt is the elements numbers inside single vector in current
+   256bit vector mode.
+
+   2.  What GCC expects to perform:
+   Two op registers (op0, op1) will "combine" into a 512bit temp vector storage
+   that has 2*nelt elements inside it; the low 256bit is op0, and high 256bit
+   is op1, then the elements are indexed as below:
+   0 ~ nelt - 1		nelt ~ 2 * nelt - 1
+   |-------------------------|-------------------------|
+   Low 256bit (op0)	High 256bit (op1)
+   For example, the second element in op1 (V8SImode) will be indexed with 9.
+   Selector is a vector that has the same mode and number of elements  with
+   op0,op1 and target, it's look like this:
+   0 ~ nelt - 1
+   |-------------------------|
+   256bit (selector)
+   It describes which element from 512bit temp vector storage will fit into
+   target's every element slot.
+   GCC expects that every element in selector can be ANY indices of 512bit
+   vector storage (Selector can pick literally any element from op0 and op1, and
+   then fits into any place of target register). This is also what LSX 128bit
+   vshuf.* instruction do similarly, so we can handle 128bit vector permutation
+   by single instruction easily.
+
+   3.  What LASX permutation instruction does:
+   In short, it just execute two independent 128bit vector permuatation, and
+   it's the reason that we need to do the jobs below.  We will explain it.
+   op0, op1, target, and selector will be separate into high 128bit and low
+   128bit, and do permutation as the description below:
+
+   a) op0's low 128bit and op1's low 128bit "combines" into a 256bit temp
+   vector storage (TVS1), elements are indexed as below:
+   0 ~ nelt / 2 - 1	  nelt / 2 ~ nelt - 1
+   |---------------------|---------------------| TVS1
+   op0's low 128bit      op1's low 128bit
+   op0's high 128bit and op1's high 128bit are "combined" into TVS2 in the
+   same way.
+   0 ~ nelt / 2 - 1	  nelt / 2 ~ nelt - 1
+   |---------------------|---------------------| TVS2
+   op0's high 128bit	op1's high 128bit
+   b) Selector's low 128bit describes which elements from TVS1 will fit into
+   target vector's low 128bit.  No TVS2 elements are allowed.
+   c) Selector's high 128bit describes which elements from TVS2 will fit into
+   target vector's high 128bit.  No TVS1 elements are allowed.
+
+   As we can see, if we want to handle vector permutation correctly, we can
+   achieve it in three ways:
+   a) Modify selector's elements, to make sure that every elements can inform
+   correct value that will put into target vector.
+   b) Generate extra instruction before/after permutation instruction, for
+   adjusting op vector or target vector, to make sure target vector's value is
+   what GCC expects.
+   c) Use other instructions to process op and put correct result into target.
+   */
+
+/* Implementation of constant vector permuatation.  This function identifies
+   recognized pattern of permuation selector argument, and use one or more
+   instruction (s) to finish the permutation job correctly.  For unsupported
+   patterns, it will return false.  */
 
 static bool
-loongarch_is_single_op_perm (struct expand_vec_perm_d *d)
+loongarch_expand_vec_perm_const (struct expand_vec_perm_d *d)
 {
-  bool result = true;
+  bool flag = false;
+  unsigned int i;
+  unsigned char idx;
+  rtx target, op0, op1, sel, tmp;
+  rtx rperm[MAX_VECT_LEN];
+  unsigned int remapped[MAX_VECT_LEN];
+  unsigned char perm2[MAX_VECT_LEN];
 
-  for (int i = 0; i < d->nelt; i += 1)
+  if (GET_MODE_SIZE (d->vmode) == 16)
+    return loongarch_expand_lsx_shuffle (d);
+  else
     {
-      if (d->perm[i] >= d->nelt)
+      if (d->one_vector_p)
 	{
-	  result = false;
-	  break;
+	  /* Try interleave with alternating operands.  */
+	  memcpy (perm2, d->perm, sizeof (perm2));
+	  for (i = 1; i < d->nelt; i += 2)
+	    perm2[i] += d->nelt;
+	  if (loongarch_expand_vselect_vconcat (d->target, d->op0, d->op1,
+						perm2, d->nelt))
+	    return true;
 	}
-    }
-
-  return result;
-}
-
-static bool
-loongarch_is_divisible_perm (struct expand_vec_perm_d *d)
-{
-  bool result = true;
-
-  for (int i = 0; i < d->nelt / 2; i += 1)
-    {
-      if (d->perm[i] >= d->nelt)
+      else
 	{
-	  result = false;
-	  break;
-	}
-    }
-
-  if (result)
-    {
-      for (int i = d->nelt / 2; i < d->nelt; i += 1)
-	{
-	  if (d->perm[i] < d->nelt)
-	    {
-	      result = false;
-	      break;
-	    }
-	}
-    }
-
-  return result;
-}
-
-inline bool
-loongarch_is_triple_stride_extract (struct expand_vec_perm_d *d)
-{
-  return (d->vmode == E_V4DImode || d->vmode == E_V4DFmode)
-    && d->perm[0] == 1 && d->perm[1] == 4
-    && d->perm[2] == 7 && d->perm[3] == 0;
-}
-
-/* In LASX, some permutation insn does not have the behavior that gcc expects
- * when compiler wants to emit a vector permutation.
- *
- * 1. What GCC provides via vectorize_vec_perm_const ()'s paramater:
- * When GCC wants to performs a vector permutation, it provides two op
- * reigster, one target register, and a selector.
- * In const vector permutation case, GCC provides selector as a char array
- * that contains original value; in variable vector permuatation
- * (performs via vec_perm<mode> insn template), it provides a vector register.
- * We assume that nelt is the elements numbers inside single vector in current
- * 256bit vector mode.
- *
- * 2. What GCC expects to perform:
- * Two op registers (op0, op1) will "combine" into a 512bit temp vector storage
- * that has 2*nelt elements inside it; the low 256bit is op0, and high 256bit
- * is op1, then the elements are indexed as below:
- *		  0 ~ nelt - 1		nelt ~ 2 * nelt - 1
- *	  |-------------------------|-------------------------|
- *		Low 256bit (op0)	High 256bit (op1)
- * For example, the second element in op1 (V8SImode) will be indexed with 9.
- * Selector is a vector that has the same mode and number of elements  with
- * op0,op1 and target, it's look like this:
- *	      0 ~ nelt - 1
- *	  |-------------------------|
- *	      256bit (selector)
- * It describes which element from 512bit temp vector storage will fit into
- * target's every element slot.
- * GCC expects that every element in selector can be ANY indices of 512bit
- * vector storage (Selector can pick literally any element from op0 and op1, and
- * then fits into any place of target register). This is also what LSX 128bit
- * vshuf.* instruction do similarly, so we can handle 128bit vector permutation
- * by single instruction easily.
- *
- * 3. What LASX permutation instruction does:
- * In short, it just execute two independent 128bit vector permuatation, and
- * it's the reason that we need to do the jobs below.  We will explain it.
- * op0, op1, target, and selector will be separate into high 128bit and low
- * 128bit, and do permutation as the description below:
- *
- *  a) op0's low 128bit and op1's low 128bit "combines" into a 256bit temp
- * vector storage (TVS1), elements are indexed as below:
- *	    0 ~ nelt / 2 - 1	  nelt / 2 ~ nelt - 1
- *	|---------------------|---------------------| TVS1
- *	    op0's low 128bit      op1's low 128bit
- *    op0's high 128bit and op1's high 128bit are "combined" into TVS2 in the
- *    same way.
- *	    0 ~ nelt / 2 - 1	  nelt / 2 ~ nelt - 1
- *	|---------------------|---------------------| TVS2
- *	    op0's high 128bit	op1's high 128bit
- *  b) Selector's low 128bit describes which elements from TVS1 will fit into
- *  target vector's low 128bit.  No TVS2 elements are allowed.
- *  c) Selector's high 128bit describes which elements from TVS2 will fit into
- *  target vector's high 128bit.  No TVS1 elements are allowed.
- *
- * As we can see, if we want to handle vector permutation correctly, we can
- * achieve it in three ways:
- *  a) Modify selector's elements, to make sure that every elements can inform
- *  correct value that will put into target vector.
-    b) Generate extra instruction before/after permutation instruction, for
-    adjusting op vector or target vector, to make sure target vector's value is
-    what GCC expects.
-    c) Use other instructions to process op and put correct result into target.
- */
-
-/* Implementation of constant vector permuatation.  This function identifies
- * recognized pattern of permuation selector argument, and use one or more
- * instruction(s) to finish the permutation job correctly.  For unsupported
- * patterns, it will return false.  */
-
-static bool
-loongarch_expand_vec_perm_const_2 (struct expand_vec_perm_d *d)
-{
-  /* Although we have the LSX vec_perm<mode> template, there's still some
-     128bit vector permuatation operations send to vectorize_vec_perm_const.
-     In this case, we just simpliy wrap them by single vshuf.* instruction,
-     because LSX vshuf.* instruction just have the same behavior that GCC
-     expects.  */
-  if (GET_MODE_SIZE (d->vmode) == 16)
-    return loongarch_try_expand_lsx_vshuf_const (d);
-  else
-    return false;
-
-  bool ok = false, reverse_hi_lo = false, extract_ev_od = false,
-       use_alt_op = false;
-  unsigned char idx;
-  int i;
-  rtx target, op0, op1, sel, tmp;
-  rtx op0_alt = NULL_RTX, op1_alt = NULL_RTX;
-  rtx rperm[MAX_VECT_LEN];
-  unsigned int remapped[MAX_VECT_LEN];
-
-  /* Try to figure out whether is a recognized permutation selector pattern, if
-     yes, we will reassign some elements with new value in selector argument,
-     and in some cases we will generate some assist insn to complete the
-     permutation. (Even in some cases, we use other insn to impl permutation
-     instead of xvshuf!)
+	  if (loongarch_expand_vselect_vconcat (d->target, d->op0, d->op1,
+						d->perm, d->nelt))
+	    return true;
 
-     Make sure to check d->testing_p is false everytime if you want to emit new
-     insn, unless you want to crash into ICE directly.  */
-  if (loongarch_is_quad_duplicate (d))
-    {
-      /* Selector example: E_V8SImode, { 0, 0, 0, 0, 4, 4, 4, 4 }
-	 copy first elem from original selector to all elem in new selector.  */
-      idx = d->perm[0];
-      for (i = 0; i < d->nelt; i += 1)
-	{
-	  remapped[i] = idx;
-	}
-      /* Selector after: { 0, 0, 0, 0, 0, 0, 0, 0 }.  */
-    }
-  else if (loongarch_is_double_duplicate (d))
-    {
-      /* Selector example: E_V8SImode, { 1, 1, 3, 3, 5, 5, 7, 7 }
-	 one_vector_p == true.  */
-      for (i = 0; i < d->nelt / 2; i += 1)
-	{
-	  idx = d->perm[i];
-	  remapped[i] = idx;
-	  remapped[i + d->nelt / 2] = idx;
+	  /* Try again with swapped operands.  */
+	  for (i = 0; i < d->nelt; ++i)
+	    perm2[i] = (d->perm[i] + d->nelt) & (2 * d->nelt - 1);
+	  if (loongarch_expand_vselect_vconcat (d->target, d->op1, d->op0,
+						perm2, d->nelt))
+	    return true;
 	}
-      /* Selector after: { 1, 1, 3, 3, 1, 1, 3, 3 }.  */
-    }
-  else if (loongarch_is_odd_extraction (d)
-	   || loongarch_is_even_extraction (d))
-    {
-      /* Odd extraction selector sample: E_V4DImode, { 1, 3, 5, 7 }
-	 Selector after: { 1, 3, 1, 3 }.
-	 Even extraction selector sample: E_V4DImode, { 0, 2, 4, 6 }
-	 Selector after: { 0, 2, 0, 2 }.  */
 
-      /* Better implement of extract-even and extract-odd permutations.  */
-      if (loongarch_expand_vec_perm_even_odd (d))
+      if (loongarch_expand_lsx_shuffle (d))
 	return true;
 
-      for (i = 0; i < d->nelt / 2; i += 1)
-	{
-	  idx = d->perm[i];
-	  remapped[i] = idx;
-	  remapped[i + d->nelt / 2] = idx;
-	}
-      /* Additional insn is required for correct result.  See codes below.  */
-      extract_ev_od = true;
-    }
-  else if (loongarch_is_extraction_permutation (d))
-    {
-      /* Selector sample: E_V8SImode, { 0, 1, 2, 3, 4, 5, 6, 7 }.  */
-      if (d->perm[0] == 0)
+      if (loongarch_is_odd_extraction (d)
+	  || loongarch_is_even_extraction (d))
 	{
-	  for (i = 0; i < d->nelt / 2; i += 1)
-	    {
-	      remapped[i] = i;
-	      remapped[i + d->nelt / 2] = i;
-	    }
+	  if (loongarch_expand_vec_perm_even_odd (d))
+	    return true;
 	}
-      else
+
+      if (loongarch_is_lasx_lowpart_interleave (d)
+	  || loongarch_is_lasx_lowpart_interleave_2 (d)
+	  || loongarch_is_lasx_highpart_interleave (d)
+	  || loongarch_is_lasx_highpart_interleave_2 (d))
 	{
-	  /* { 8, 9, 10, 11, 12, 13, 14, 15 }.  */
-	  for (i = 0; i < d->nelt / 2; i += 1)
-	    {
-	      idx = i + d->nelt / 2;
-	      remapped[i] = idx;
-	      remapped[i + d->nelt / 2] = idx;
-	    }
+	  if (loongarch_expand_vec_perm_interleave (d))
+	    return true;
 	}
-      /* Selector after: { 0, 1, 2, 3, 0, 1, 2, 3 }
-	 { 8, 9, 10, 11, 8, 9, 10, 11 }  */
-    }
-  else if (loongarch_is_center_extraction (d))
-    {
-      /* sample: E_V4DImode, { 2, 3, 4, 5 }
-	 In this condition, we can just copy high 128bit of op0 and low 128bit
-	 of op1 to the target register by using xvpermi.q insn.  */
-      if (!d->testing_p)
+
+      if (loongarch_is_quad_duplicate (d))
 	{
-	  emit_move_insn (d->target, d->op1);
-	  switch (d->vmode)
+	  if (d->testing_p)
+	    return true;
+	  /* Selector example: E_V8SImode, { 0, 0, 0, 0, 4, 4, 4, 4 }.  */
+	  for (i = 0; i < d->nelt; i += 1)
 	    {
-	      case E_V4DImode:
-		emit_insn (gen_lasx_xvpermi_q_v4di (d->target, d->target,
-						    d->op0, GEN_INT (0x21)));
-		break;
-	      case E_V4DFmode:
-		emit_insn (gen_lasx_xvpermi_q_v4df (d->target, d->target,
-						    d->op0, GEN_INT (0x21)));
-		break;
-	      case E_V8SImode:
-		emit_insn (gen_lasx_xvpermi_q_v8si (d->target, d->target,
-						    d->op0, GEN_INT (0x21)));
-		break;
-	      case E_V8SFmode:
-		emit_insn (gen_lasx_xvpermi_q_v8sf (d->target, d->target,
-						    d->op0, GEN_INT (0x21)));
-		break;
-	      case E_V16HImode:
-		emit_insn (gen_lasx_xvpermi_q_v16hi (d->target, d->target,
-						     d->op0, GEN_INT (0x21)));
-		break;
-	      case E_V32QImode:
-		emit_insn (gen_lasx_xvpermi_q_v32qi (d->target, d->target,
-						     d->op0, GEN_INT (0x21)));
-		break;
-	      default:
-		break;
+	      rperm[i] = GEN_INT (d->perm[0]);
 	    }
+	  /* Selector after: { 0, 0, 0, 0, 0, 0, 0, 0 }.  */
+	  flag = true;
+	  goto expand_perm_const_end;
 	}
-      ok = true;
-      /* Finish the funtion directly.  */
-      goto expand_perm_const_2_end;
-    }
-  else if (loongarch_is_reversing_permutation (d))
-    {
-      /* Selector sample: E_V8SImode, { 7, 6, 5, 4, 3, 2, 1, 0 }
-	 one_vector_p == true  */
-      idx = d->nelt / 2 - 1;
-      for (i = 0; i < d->nelt / 2; i += 1)
-	{
-	  remapped[i] = idx;
-	  remapped[i + d->nelt / 2] = idx;
-	  idx -= 1;
-	}
-      /* Selector after: { 3, 2, 1, 0, 3, 2, 1, 0 }
-	 Additional insn will be generated to swap hi and lo 128bit of target
-	 register.  */
-      reverse_hi_lo = true;
-    }
-  else if (loongarch_is_di_misalign_extract (d)
-	   || loongarch_is_si_misalign_extract (d))
-    {
-      /* Selector Sample:
-	 DI misalign: E_V4DImode, { 1, 2, 3, 4 }
-	 SI misalign: E_V8SImode, { 1, 2, 3, 4, 5, 6, 7, 8 }  */
-      if (!d->testing_p)
-	{
-	  /* Copy original op0/op1 value to new temp register.
-	     In some cases, operand register may be used in multiple place, so
-	     we need new regiter instead modify original one, to avoid runtime
-	     crashing or wrong value after execution.  */
-	  use_alt_op = true;
-	  op1_alt = gen_reg_rtx (d->vmode);
-	  emit_move_insn (op1_alt, d->op1);
-
-	  /* Adjust op1 for selecting correct value in high 128bit of target
-	     register.
-	     op1: E_V4DImode, { 4, 5, 6, 7 } -> { 2, 3, 4, 5 }.  */
-	  rtx conv_op1 = simplify_gen_subreg (E_V4DImode, op1_alt, d->vmode, 0);
-	  rtx conv_op0 = simplify_gen_subreg (E_V4DImode, d->op0, d->vmode, 0);
-	  emit_insn (gen_lasx_xvpermi_q_v4di (conv_op1, conv_op1,
-					      conv_op0, GEN_INT (0x21)));
 
-	  for (i = 0; i < d->nelt / 2; i += 1)
-	    {
-	      remapped[i] = d->perm[i];
-	      remapped[i + d->nelt / 2] = d->perm[i];
-	    }
-	  /* Selector after:
-	     DI misalign: { 1, 2, 1, 2 }
-	     SI misalign: { 1, 2, 3, 4, 1, 2, 3, 4 }  */
-	}
-    }
-  else if (loongarch_is_lasx_lowpart_interleave (d))
-    {
-      /* Elements from op0's low 18bit and op1's 128bit are inserted into
-	 target register alternately.
-	 sample: E_V4DImode, { 0, 4, 1, 5 }  */
-      if (!d->testing_p)
-	{
-	  /* Prepare temp register instead of modify original op.  */
-	  use_alt_op = true;
-	  op1_alt = gen_reg_rtx (d->vmode);
-	  op0_alt = gen_reg_rtx (d->vmode);
-	  emit_move_insn (op1_alt, d->op1);
-	  emit_move_insn (op0_alt, d->op0);
-
-	  /* Generate subreg for fitting into insn gen function.  */
-	  rtx conv_op1 = simplify_gen_subreg (E_V4DImode, op1_alt, d->vmode, 0);
-	  rtx conv_op0 = simplify_gen_subreg (E_V4DImode, op0_alt, d->vmode, 0);
-
-	  /* Adjust op value in temp register.
-	     op0 = {0,1,2,3}, op1 = {4,5,0,1}  */
-	  emit_insn (gen_lasx_xvpermi_q_v4di (conv_op1, conv_op1,
-					      conv_op0, GEN_INT (0x02)));
-	  /* op0 = {0,1,4,5}, op1 = {4,5,0,1}  */
-	  emit_insn (gen_lasx_xvpermi_q_v4di (conv_op0, conv_op0,
-					      conv_op1, GEN_INT (0x01)));
-
-	  /* Remap indices in selector based on the location of index inside
-	     selector, and vector element numbers in current vector mode.  */
-
-	  /* Filling low 128bit of new selector.  */
-	  for (i = 0; i < d->nelt / 2; i += 1)
-	    {
-	      /* value in odd-indexed slot of low 128bit part of selector
-		 vector.  */
-	      remapped[i] = i % 2 != 0 ? d->perm[i] - d->nelt / 2 : d->perm[i];
-	    }
-	  /* Then filling the high 128bit.  */
-	  for (i = d->nelt / 2; i < d->nelt; i += 1)
+      if (loongarch_is_extraction_permutation (d))
+	{
+	  if (d->testing_p)
+	    return true;
+	  /* Selector sample: E_V8SImode, { 0, 1, 2, 3, 4, 5, 6, 7 }.  */
+	  if (d->perm[0] == 0)
 	    {
-	      /* value in even-indexed slot of high 128bit part of
-		 selector vector.  */
-	      remapped[i] = i % 2 == 0
-		? d->perm[i] + (d->nelt / 2) * 3 : d->perm[i];
+	      for (i = 0; i < d->nelt / 2; i += 1)
+		{
+		  remapped[i] = i;
+		  remapped[i + d->nelt / 2] = i;
+		}
 	    }
-	}
-    }
-  else if (loongarch_is_lasx_lowpart_interleave_2 (d))
-    {
-      /* Special lowpart interleave case in V32QI vector mode.  It does the same
-	 thing as we can see in if branch that above this line.
-	 Selector sample: E_V32QImode,
-	 {0, 1, 2, 3, 4, 5, 6, 7, 32, 33, 34, 35, 36, 37, 38, 39, 8,
-	 9, 10, 11, 12, 13, 14, 15, 40, 41, 42, 43, 44, 45, 46, 47}  */
-      if (!d->testing_p)
-	{
-	  /* Solution for this case in very simple - covert op into V4DI mode,
-	     and do same thing as previous if branch.  */
-	  op1_alt = gen_reg_rtx (d->vmode);
-	  op0_alt = gen_reg_rtx (d->vmode);
-	  emit_move_insn (op1_alt, d->op1);
-	  emit_move_insn (op0_alt, d->op0);
-
-	  rtx conv_op1 = simplify_gen_subreg (E_V4DImode, op1_alt, d->vmode, 0);
-	  rtx conv_op0 = simplify_gen_subreg (E_V4DImode, op0_alt, d->vmode, 0);
-	  rtx conv_target = simplify_gen_subreg (E_V4DImode, d->target,
-						 d->vmode, 0);
-
-	  emit_insn (gen_lasx_xvpermi_q_v4di (conv_op1, conv_op1,
-					      conv_op0, GEN_INT (0x02)));
-	  emit_insn (gen_lasx_xvpermi_q_v4di (conv_op0, conv_op0,
-					      conv_op1, GEN_INT (0x01)));
-	  remapped[0] = 0;
-	  remapped[1] = 4;
-	  remapped[2] = 1;
-	  remapped[3] = 5;
-
-	  for (i = 0; i < d->nelt; i += 1)
+	  else
 	    {
-	      rperm[i] = GEN_INT (remapped[i]);
+	      /* { 8, 9, 10, 11, 12, 13, 14, 15 }.  */
+	      for (i = 0; i < d->nelt / 2; i += 1)
+		{
+		  idx = i + d->nelt / 2;
+		  remapped[i] = idx;
+		  remapped[i + d->nelt / 2] = idx;
+		}
 	    }
+	  /* Selector after: { 0, 1, 2, 3, 0, 1, 2, 3 }
+	     { 8, 9, 10, 11, 8, 9, 10, 11 }  */
 
-	  sel = gen_rtx_CONST_VECTOR (E_V4DImode, gen_rtvec_v (4, rperm));
-	  sel = force_reg (E_V4DImode, sel);
-	  emit_insn (gen_lasx_xvshuf_d (conv_target, sel,
-					conv_op1, conv_op0));
-	}
-
-      ok = true;
-      goto expand_perm_const_2_end;
-    }
-  else if (loongarch_is_lasx_lowpart_extract (d))
-    {
-      /* Copy op0's low 128bit to target's low 128bit, and copy op1's low
-	 128bit to target's high 128bit.
-	 Selector sample: E_V4DImode, { 0, 1, 4 ,5 }  */
-      if (!d->testing_p)
-	{
-	  rtx conv_op1 = simplify_gen_subreg (E_V4DImode, d->op1, d->vmode, 0);
-	  rtx conv_op0 = simplify_gen_subreg (E_V4DImode, d->op0, d->vmode, 0);
-	  rtx conv_target = simplify_gen_subreg (E_V4DImode, d->target,
-						 d->vmode, 0);
-
-	  /* We can achieve the expectation by using sinple xvpermi.q insn.  */
-	  emit_move_insn (conv_target, conv_op1);
-	  emit_insn (gen_lasx_xvpermi_q_v4di (conv_target, conv_target,
-					      conv_op0, GEN_INT (0x20)));
-	}
-
-      ok = true;
-      goto expand_perm_const_2_end;
-    }
-  else if (loongarch_is_lasx_highpart_interleave (d))
-    {
-      /* Similar to lowpart interleave, elements from op0's high 128bit and
-	 op1's high 128bit are inserted into target regiter alternately.
-	 Selector sample: E_V8SImode, { 4, 12, 5, 13, 6, 14, 7, 15 }  */
-      if (!d->testing_p)
-	{
-	  /* Prepare temp op register.  */
-	  use_alt_op = true;
-	  op1_alt = gen_reg_rtx (d->vmode);
-	  op0_alt = gen_reg_rtx (d->vmode);
-	  emit_move_insn (op1_alt, d->op1);
-	  emit_move_insn (op0_alt, d->op0);
-
-	  rtx conv_op1 = simplify_gen_subreg (E_V4DImode, op1_alt, d->vmode, 0);
-	  rtx conv_op0 = simplify_gen_subreg (E_V4DImode, op0_alt, d->vmode, 0);
-	  /* Adjust op value in temp regiter.
-	     op0 = { 0, 1, 2, 3 }, op1 = { 6, 7, 2, 3 }  */
-	  emit_insn (gen_lasx_xvpermi_q_v4di (conv_op1, conv_op1,
-					      conv_op0, GEN_INT (0x13)));
-	  /* op0 = { 2, 3, 6, 7 }, op1 = { 6, 7, 2, 3 }  */
-	  emit_insn (gen_lasx_xvpermi_q_v4di (conv_op0, conv_op0,
-					      conv_op1, GEN_INT (0x01)));
-	  /* Remap indices in selector based on the location of index inside
-	     selector, and vector element numbers in current vector mode.  */
-
-	  /* Filling low 128bit of new selector.  */
-	 for (i = 0; i < d->nelt / 2; i += 1)
-	   {
-	     /* value in even-indexed slot of low 128bit part of selector
-		vector.  */
-	     remapped[i] = i % 2 == 0 ? d->perm[i] - d->nelt / 2 : d->perm[i];
-	   }
-	  /* Then filling the high 128bit.  */
-	 for (i = d->nelt / 2; i < d->nelt; i += 1)
-	   {
-	     /* value in odd-indexed slot of high 128bit part of selector
-		vector.  */
-	      remapped[i] = i % 2 != 0
-		? d->perm[i] - (d->nelt / 2) * 3 : d->perm[i];
-	   }
-	}
-    }
-  else if (loongarch_is_lasx_highpart_interleave_2 (d))
-    {
-      /* Special highpart interleave case in V32QI vector mode.  It does the
-	 same thing as the normal version above.
-	 Selector sample: E_V32QImode,
-	 {16, 17, 18, 19, 20, 21, 22, 23, 48, 49, 50, 51, 52, 53, 54, 55,
-	 24, 25, 26, 27, 28, 29, 30, 31, 56, 57, 58, 59, 60, 61, 62, 63}
-      */
-      if (!d->testing_p)
-	{
-	  /* Convert op into V4DImode and do the things.  */
-	  op1_alt = gen_reg_rtx (d->vmode);
-	  op0_alt = gen_reg_rtx (d->vmode);
-	  emit_move_insn (op1_alt, d->op1);
-	  emit_move_insn (op0_alt, d->op0);
-
-	  rtx conv_op1 = simplify_gen_subreg (E_V4DImode, op1_alt, d->vmode, 0);
-	  rtx conv_op0 = simplify_gen_subreg (E_V4DImode, op0_alt, d->vmode, 0);
-	  rtx conv_target = simplify_gen_subreg (E_V4DImode, d->target,
-						 d->vmode, 0);
-
-	  emit_insn (gen_lasx_xvpermi_q_v4di (conv_op1, conv_op1,
-					      conv_op0, GEN_INT (0x13)));
-	  emit_insn (gen_lasx_xvpermi_q_v4di (conv_op0, conv_op0,
-					      conv_op1, GEN_INT (0x01)));
-	  remapped[0] = 2;
-	  remapped[1] = 6;
-	  remapped[2] = 3;
-	  remapped[3] = 7;
-
+	  /* Convert remapped selector array to RTL array.  */
 	  for (i = 0; i < d->nelt; i += 1)
 	    {
 	      rperm[i] = GEN_INT (remapped[i]);
 	    }
 
-	  sel = gen_rtx_CONST_VECTOR (E_V4DImode, gen_rtvec_v (4, rperm));
-	  sel = force_reg (E_V4DImode, sel);
-	  emit_insn (gen_lasx_xvshuf_d (conv_target, sel,
-					conv_op1, conv_op0));
+	  flag = true;
+	  goto expand_perm_const_end;
 	}
 
-	ok = true;
-	goto expand_perm_const_2_end;
-    }
-  else if (loongarch_is_elem_duplicate (d))
-    {
-      /* Brocast single element (from op0 or op1) to all slot of target
-	 register.
-	 Selector sample:E_V8SImode, { 2, 2, 2, 2, 2, 2, 2, 2 }  */
-      if (!d->testing_p)
+      if (loongarch_is_elem_duplicate (d))
 	{
+	  if (d->testing_p)
+	    return true;
+	  /* Brocast single element (from op0 or op1) to all slot of target
+	     register.
+	     Selector sample:E_V8SImode, { 2, 2, 2, 2, 2, 2, 2, 2 }  */
 	  rtx conv_op1 = simplify_gen_subreg (E_V4DImode, d->op1, d->vmode, 0);
 	  rtx conv_op0 = simplify_gen_subreg (E_V4DImode, d->op0, d->vmode, 0);
 	  rtx temp_reg = gen_reg_rtx (d->vmode);
 	  rtx conv_temp = simplify_gen_subreg (E_V4DImode, temp_reg,
 					       d->vmode, 0);
-
 	  emit_move_insn (temp_reg, d->op0);
 
 	  idx = d->perm[0];
@@ -9847,7 +9200,7 @@ loongarch_expand_vec_perm_const_2 (struct expand_vec_perm_d *d)
 	     value that we need to broardcast, because xvrepl128vei does the
 	     broardcast job from every 128bit of source register to
 	     corresponded part of target register! (A deep sigh.)  */
-	  if (/*idx >= 0 &&*/ idx < d->nelt / 2)
+	  if (idx < d->nelt / 2)
 	    {
 	      emit_insn (gen_lasx_xvpermi_q_v4di (conv_temp, conv_temp,
 						  conv_op0, GEN_INT (0x0)));
@@ -9902,310 +9255,75 @@ loongarch_expand_vec_perm_const_2 (struct expand_vec_perm_d *d)
 	      break;
 	    }
 
-	  /* finish func directly.  */
-	  ok = true;
-	  goto expand_perm_const_2_end;
-	}
-    }
-  else if (loongarch_is_op_reverse_perm (d))
-    {
-      /* reverse high 128bit and low 128bit in op0.
-	 Selector sample: E_V4DFmode, { 2, 3, 0, 1 }
-	 Use xvpermi.q for doing this job.  */
-      if (!d->testing_p)
-	{
-	  if (d->vmode == E_V4DImode)
-	    {
-	      emit_insn (gen_lasx_xvpermi_q_v4di (d->target, d->target, d->op0,
-						  GEN_INT (0x01)));
-	    }
-	  else if (d->vmode == E_V4DFmode)
-	    {
-	      emit_insn (gen_lasx_xvpermi_q_v4df (d->target, d->target, d->op0,
-						  GEN_INT (0x01)));
-	    }
-	  else
-	    {
-	      gcc_unreachable ();
-	    }
-	}
-
-      ok = true;
-      goto expand_perm_const_2_end;
-    }
-  else if (loongarch_is_single_op_perm (d))
-    {
-      /* Permutation that only select elements from op0.  */
-      if (!d->testing_p)
-	{
-	  /* Prepare temp register instead of modify original op.  */
-	  use_alt_op = true;
-	  op0_alt = gen_reg_rtx (d->vmode);
-	  op1_alt = gen_reg_rtx (d->vmode);
-
-	  emit_move_insn (op0_alt, d->op0);
-	  emit_move_insn (op1_alt, d->op1);
-
-	  rtx conv_op0 = simplify_gen_subreg (E_V4DImode, d->op0, d->vmode, 0);
-	  rtx conv_op0a = simplify_gen_subreg (E_V4DImode, op0_alt,
-					       d->vmode, 0);
-	  rtx conv_op1a = simplify_gen_subreg (E_V4DImode, op1_alt,
-					       d->vmode, 0);
-
-	  /* Duplicate op0's low 128bit in op0, then duplicate high 128bit
-	     in op1.  After this, xvshuf.* insn's selector argument can
-	     access all elements we need for correct permutation result.  */
-	  emit_insn (gen_lasx_xvpermi_q_v4di (conv_op0a, conv_op0a, conv_op0,
-					      GEN_INT (0x00)));
-	  emit_insn (gen_lasx_xvpermi_q_v4di (conv_op1a, conv_op1a, conv_op0,
-					      GEN_INT (0x11)));
-
-	  /* In this case, there's no need to remap selector's indices.  */
-	  for (i = 0; i < d->nelt; i += 1)
-	    {
-	      remapped[i] = d->perm[i];
-	    }
+	  return true;
 	}
-    }
-  else if (loongarch_is_divisible_perm (d))
-    {
-      /* Divisible perm:
-	 Low 128bit of selector only selects elements of op0,
-	 and high 128bit of selector only selects elements of op1.  */
 
-      if (!d->testing_p)
+expand_perm_const_end:
+      if (flag)
 	{
-	  /* Prepare temp register instead of modify original op.  */
-	  use_alt_op = true;
-	  op0_alt = gen_reg_rtx (d->vmode);
-	  op1_alt = gen_reg_rtx (d->vmode);
-
-	  emit_move_insn (op0_alt, d->op0);
-	  emit_move_insn (op1_alt, d->op1);
-
-	  rtx conv_op0a = simplify_gen_subreg (E_V4DImode, op0_alt,
-					       d->vmode, 0);
-	  rtx conv_op1a = simplify_gen_subreg (E_V4DImode, op1_alt,
-					       d->vmode, 0);
-	  rtx conv_op0 = simplify_gen_subreg (E_V4DImode, d->op0, d->vmode, 0);
-	  rtx conv_op1 = simplify_gen_subreg (E_V4DImode, d->op1, d->vmode, 0);
-
-	  /* Reorganize op0's hi/lo 128bit and op1's hi/lo 128bit, to make sure
-	     that selector's low 128bit can access all op0's elements, and
-	     selector's high 128bit can access all op1's elements.  */
-	  emit_insn (gen_lasx_xvpermi_q_v4di (conv_op0a, conv_op0a, conv_op1,
-					      GEN_INT (0x02)));
-	  emit_insn (gen_lasx_xvpermi_q_v4di (conv_op1a, conv_op1a, conv_op0,
-					      GEN_INT (0x31)));
-
-	  /* No need to modify indices.  */
-	  for (i = 0; i < d->nelt;i += 1)
+	  /* Copy selector vector from memory to vector register for later insn
+	     gen function.
+	     If vector's element in floating point value, we cannot fit
+	     selector argument into insn gen function directly, because of the
+	     insn template definition.  As a solution, generate a integral mode
+	     subreg of target, then copy selector vector (that is in integral
+	     mode) to this subreg.  */
+	  switch (d->vmode)
 	    {
-	      remapped[i] = d->perm[i];
+	    case E_V4DFmode:
+	      sel = gen_rtx_CONST_VECTOR (E_V4DImode, gen_rtvec_v (d->nelt,
+								   rperm));
+	      tmp = simplify_gen_subreg (E_V4DImode, d->target, d->vmode, 0);
+	      emit_move_insn (tmp, sel);
+	      break;
+	    case E_V8SFmode:
+	      sel = gen_rtx_CONST_VECTOR (E_V8SImode, gen_rtvec_v (d->nelt,
+								   rperm));
+	      tmp = simplify_gen_subreg (E_V8SImode, d->target, d->vmode, 0);
+	      emit_move_insn (tmp, sel);
+	      break;
+	    default:
+	      sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt,
+								 rperm));
+	      emit_move_insn (d->target, sel);
+	      break;
 	    }
-	}
-    }
-  else if (loongarch_is_triple_stride_extract (d))
-    {
-      /* Selector sample: E_V4DFmode, { 1, 4, 7, 0 }.  */
-      if (!d->testing_p)
-	{
-	  /* Resolve it with brute force modification.  */
-	  remapped[0] = 1;
-	  remapped[1] = 2;
-	  remapped[2] = 3;
-	  remapped[3] = 0;
-	}
-    }
-  else
-    {
-      /* When all of the detections above are failed, we will try last
-	 strategy.
-	 The for loop tries to detect following rules based on indices' value,
-	 its position inside of selector vector ,and strange behavior of
-	 xvshuf.* insn; Then we take corresponding action. (Replace with new
-	 value, or give up whole permutation expansion.)  */
-      for (i = 0; i < d->nelt; i += 1)
-	{
-	  /* % (2 * d->nelt)  */
-	  idx = d->perm[i];
 
-	  /* if index is located in low 128bit of selector vector.  */
-	  if (i < d->nelt / 2)
-	    {
-	      /* Fail case 1: index tries to reach element that located in op0's
-		 high 128bit.  */
-	      if (idx >= d->nelt / 2 && idx < d->nelt)
-		{
-		  goto expand_perm_const_2_end;
-		}
-	      /* Fail case 2: index tries to reach element that located in
-		 op1's high 128bit.  */
-	      if (idx >= (d->nelt + d->nelt / 2))
-		{
-		  goto expand_perm_const_2_end;
-		}
+	  target = d->target;
+	  op0 = d->op0;
+	  op1 = d->one_vector_p ? d->op0 : d->op1;
 
-	      /* Success case: index tries to reach elements that located in
-		 op1's low 128bit.  Apply - (nelt / 2) offset to original
-		 value.  */
-	      if (idx >= d->nelt && idx < (d->nelt + d->nelt / 2))
-		{
-		  idx -= d->nelt / 2;
-		}
-	    }
-	  /* if index is located in high 128bit of selector vector.  */
-	  else
+	  /* We FINALLY can generate xvshuf.* insn.  */
+	  switch (d->vmode)
 	    {
-	      /* Fail case 1: index tries to reach element that located in
-		 op1's low 128bit.  */
-	      if (idx >= d->nelt && idx < (d->nelt + d->nelt / 2))
-		{
-		  goto expand_perm_const_2_end;
-		}
-	      /* Fail case 2: index tries to reach element that located in
-		 op0's low 128bit.  */
-	      if (idx < (d->nelt / 2))
-		{
-		  goto expand_perm_const_2_end;
-		}
-	      /* Success case: index tries to reach element that located in
-		 op0's high 128bit.  */
-	      if (idx >= d->nelt / 2 && idx < d->nelt)
-		{
-		  idx -= d->nelt / 2;
-		}
+	    case E_V4DFmode:
+	      emit_insn (gen_lasx_xvshuf_d_f (target, target, op1, op0));
+	      break;
+	    case E_V4DImode:
+	      emit_insn (gen_lasx_xvshuf_d (target, target, op1, op0));
+	      break;
+	    case E_V8SFmode:
+	      emit_insn (gen_lasx_xvshuf_w_f (target, target, op1, op0));
+	      break;
+	    case E_V8SImode:
+	      emit_insn (gen_lasx_xvshuf_w (target, target, op1, op0));
+	      break;
+	    case E_V16HImode:
+	      emit_insn (gen_lasx_xvshuf_h (target, target, op1, op0));
+	      break;
+	    case E_V32QImode:
+	      emit_insn (gen_lasx_xvshuf_b (target, op1, op0, target));
+	      break;
+	    default:
+	      gcc_unreachable ();
+	      break;
 	    }
-	  /* No need to process other case that we did not mentioned.  */
-
-	  /* Assign with original or processed value.  */
-	  remapped[i] = idx;
-	}
-    }
-
-  ok = true;
-  /* If testing_p is true, compiler is trying to figure out that backend can
-     handle this permutation, but doesn't want to generate actual insn.  So
-     if true, exit directly.  */
-  if (d->testing_p)
-    {
-      goto expand_perm_const_2_end;
-    }
-
-  /* Convert remapped selector array to RTL array.  */
-  for (i = 0; i < d->nelt; i += 1)
-    {
-      rperm[i] = GEN_INT (remapped[i]);
-    }
-
-  /* Copy selector vector from memory to vector regiter for later insn gen
-     function.
-     If vector's element in floating point value, we cannot fit selector
-     argument into insn gen function directly, because of the insn template
-     definition.  As a solution, generate a integral mode subreg of target,
-     then copy selector vector (that is in integral mode) to this subreg.  */
-  switch (d->vmode)
-    {
-    case E_V4DFmode:
-      sel = gen_rtx_CONST_VECTOR (E_V4DImode, gen_rtvec_v (d->nelt, rperm));
-      tmp = simplify_gen_subreg (E_V4DImode, d->target, d->vmode, 0);
-      emit_move_insn (tmp, sel);
-      break;
-    case E_V8SFmode:
-      sel = gen_rtx_CONST_VECTOR (E_V8SImode, gen_rtvec_v (d->nelt, rperm));
-      tmp = simplify_gen_subreg (E_V8SImode, d->target, d->vmode, 0);
-      emit_move_insn (tmp, sel);
-      break;
-    default:
-      sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, rperm));
-      emit_move_insn (d->target, sel);
-      break;
-    }
-
-  target = d->target;
-  /* If temp op registers are requested in previous if branch, then use temp
-     register intead of original one.  */
-  if (use_alt_op)
-    {
-      op0 = op0_alt != NULL_RTX ? op0_alt : d->op0;
-      op1 = op1_alt != NULL_RTX ? op1_alt : d->op1;
-    }
-  else
-    {
-      op0 = d->op0;
-      op1 = d->one_vector_p ? d->op0 : d->op1;
-    }
-
-  /* We FINALLY can generate xvshuf.* insn.  */
-  switch (d->vmode)
-    {
-    case E_V4DFmode:
-      emit_insn (gen_lasx_xvshuf_d_f (target, target, op1, op0));
-      break;
-    case E_V4DImode:
-      emit_insn (gen_lasx_xvshuf_d (target, target, op1, op0));
-      break;
-    case E_V8SFmode:
-      emit_insn (gen_lasx_xvshuf_w_f (target, target, op1, op0));
-      break;
-    case E_V8SImode:
-      emit_insn (gen_lasx_xvshuf_w (target, target, op1, op0));
-      break;
-    case E_V16HImode:
-      emit_insn (gen_lasx_xvshuf_h (target, target, op1, op0));
-      break;
-    case E_V32QImode:
-      emit_insn (gen_lasx_xvshuf_b (target, op1, op0, target));
-      break;
-    default:
-      gcc_unreachable ();
-      break;
-    }
 
-  /* Extra insn for swapping the hi/lo 128bit of target vector register.  */
-  if (reverse_hi_lo)
-    {
-      switch (d->vmode)
-	{
-	case E_V4DFmode:
-	  emit_insn (gen_lasx_xvpermi_q_v4df (d->target, d->target,
-					      d->target, GEN_INT (0x1)));
-	  break;
-	case E_V4DImode:
-	  emit_insn (gen_lasx_xvpermi_q_v4di (d->target, d->target,
-					      d->target, GEN_INT (0x1)));
-	  break;
-	case E_V8SFmode:
-	  emit_insn (gen_lasx_xvpermi_q_v8sf (d->target, d->target,
-					      d->target, GEN_INT (0x1)));
-	  break;
-	case E_V8SImode:
-	  emit_insn (gen_lasx_xvpermi_q_v8si (d->target, d->target,
-					      d->target, GEN_INT (0x1)));
-	  break;
-	case E_V16HImode:
-	  emit_insn (gen_lasx_xvpermi_q_v16hi (d->target, d->target,
-					       d->target, GEN_INT (0x1)));
-	  break;
-	case E_V32QImode:
-	  emit_insn (gen_lasx_xvpermi_q_v32qi (d->target, d->target,
-					       d->target, GEN_INT (0x1)));
-	  break;
-	default:
-	  break;
+	  return true;
 	}
     }
-  /* Extra insn required by odd/even extraction.  Swapping the second and third
-     64bit in target vector register.  */
-  else if (extract_ev_od)
-    {
-      rtx converted = simplify_gen_subreg (E_V4DImode, d->target, d->vmode, 0);
-      emit_insn (gen_lasx_xvpermi_d_v4di (converted, converted,
-					  GEN_INT (0xD8)));
-    }
 
-expand_perm_const_2_end:
-  return ok;
+  return false;
 }
 
 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
@@ -10289,25 +9407,19 @@ loongarch_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
       if (!d.one_vector_p)
 	d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
 
-      ok = loongarch_expand_vec_perm_const_2 (&d);
-      if (ok)
-	return ok;
-
       start_sequence ();
-      ok = loongarch_expand_vec_perm_const_1 (&d);
+      ok = loongarch_expand_vec_perm_const (&d);
       end_sequence ();
       return ok;
     }
 
-  ok = loongarch_expand_vec_perm_const_2 (&d);
-  if (!ok)
-    ok = loongarch_expand_vec_perm_const_1 (&d);
+  ok = loongarch_expand_vec_perm_const (&d);
 
   /* If we were given a two-vector permutation which just happened to
      have both input vectors equal, we folded this into a one-vector
      permutation.  There are several loongson patterns that are matched
      via direct vec_select+vec_concat expansion, but we do not have
-     support in loongarch_expand_vec_perm_const_1 to guess the adjustment
+     support in loongarch_expand_vec_perm_const to guess the adjustment
      that should be made for a single operand.  Just try again with
      the original permutation.  */
   if (!ok && which == 3)
@@ -10316,7 +9428,7 @@ loongarch_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
       d.op1 = op1;
       d.one_vector_p = false;
       memcpy (d.perm, orig_perm, MAX_VECT_LEN);
-      ok = loongarch_expand_vec_perm_const_1 (&d);
+      ok = loongarch_expand_vec_perm_const (&d);
     }
 
   return ok;
-- 
2.39.3